aboutsummaryrefslogtreecommitdiffstats
path: root/lib/lib8tion
diff options
context:
space:
mode:
authorXScorpion2 <rcalt2vt@gmail.com>2019-04-02 19:24:14 -0500
committerDrashna Jaelre <drashna@live.com>2019-04-02 17:24:14 -0700
commitc98247e3dd2958bd2d8969dc75170e7e2757b895 (patch)
treea566de223a9501809e1059c522b52adf7d37fe74 /lib/lib8tion
parent68d8bb2b3fb8a35fda164539d27754b3f74e0819 (diff)
downloadfirmware-c98247e3dd2958bd2d8969dc75170e7e2757b895.tar.gz
firmware-c98247e3dd2958bd2d8969dc75170e7e2757b895.tar.bz2
firmware-c98247e3dd2958bd2d8969dc75170e7e2757b895.zip
RGB Matrix Overhaul (#5372)
* RGB Matrix overhaul Breakout of animations to separate files Integration of optimized int based math lib Overhaul of rgb_matrix.c and animations for performance * Updating effect function api for future extensions * Combined the keypresses || keyreleases define checks into a single define so I stop forgetting it where necessary * Moving define RGB_MATRIX_KEYREACTIVE_ENABLED earlier in the include chain
Diffstat (limited to 'lib/lib8tion')
-rw-r--r--lib/lib8tion/LICENSE20
-rw-r--r--lib/lib8tion/lib8tion.c242
-rw-r--r--lib/lib8tion/lib8tion.h934
-rw-r--r--lib/lib8tion/math8.h552
-rw-r--r--lib/lib8tion/random8.h94
-rw-r--r--lib/lib8tion/scale8.h542
-rw-r--r--lib/lib8tion/trig8.h259
7 files changed, 2643 insertions, 0 deletions
diff --git a/lib/lib8tion/LICENSE b/lib/lib8tion/LICENSE
new file mode 100644
index 000000000..ebe476330
--- /dev/null
+++ b/lib/lib8tion/LICENSE
@@ -0,0 +1,20 @@
+The MIT License (MIT)
+
+Copyright (c) 2013 FastLED
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/lib/lib8tion/lib8tion.c b/lib/lib8tion/lib8tion.c
new file mode 100644
index 000000000..84b3e9c61
--- /dev/null
+++ b/lib/lib8tion/lib8tion.c
@@ -0,0 +1,242 @@
+#define FASTLED_INTERNAL
+#include <stdint.h>
+
+#define RAND16_SEED 1337
+uint16_t rand16seed = RAND16_SEED;
+
+
+// memset8, memcpy8, memmove8:
+// optimized avr replacements for the standard "C" library
+// routines memset, memcpy, and memmove.
+//
+// There are two techniques that make these routines
+// faster than the standard avr-libc routines.
+// First, the loops are unrolled 2X, meaning that
+// the average loop overhead is cut in half.
+// And second, the compare-and-branch at the bottom
+// of each loop decrements the low byte of the
+// counter, and if the carry is clear, it branches
+// back up immediately. Only if the low byte math
+// causes carry do we bother to decrement the high
+// byte and check that result for carry as well.
+// Results for a 100-byte buffer are 20-40% faster
+// than standard avr-libc, at a cost of a few extra
+// bytes of code.
+
+#if defined(__AVR__)
+//__attribute__ ((noinline))
+void * memset8 ( void * ptr, uint8_t val, uint16_t num )
+{
+ asm volatile(
+ " movw r26, %[ptr] \n\t"
+ " sbrs %A[num], 0 \n\t"
+ " rjmp Lseteven_%= \n\t"
+ " rjmp Lsetodd_%= \n\t"
+ "Lsetloop_%=: \n\t"
+ " st X+, %[val] \n\t"
+ "Lsetodd_%=: \n\t"
+ " st X+, %[val] \n\t"
+ "Lseteven_%=: \n\t"
+ " subi %A[num], 2 \n\t"
+ " brcc Lsetloop_%= \n\t"
+ " sbci %B[num], 0 \n\t"
+ " brcc Lsetloop_%= \n\t"
+ : [num] "+r" (num)
+ : [ptr] "r" (ptr),
+ [val] "r" (val)
+ : "memory"
+ );
+ return ptr;
+}
+
+
+
+//__attribute__ ((noinline))
+void * memcpy8 ( void * dst, const void* src, uint16_t num )
+{
+ asm volatile(
+ " movw r30, %[src] \n\t"
+ " movw r26, %[dst] \n\t"
+ " sbrs %A[num], 0 \n\t"
+ " rjmp Lcpyeven_%= \n\t"
+ " rjmp Lcpyodd_%= \n\t"
+ "Lcpyloop_%=: \n\t"
+ " ld __tmp_reg__, Z+ \n\t"
+ " st X+, __tmp_reg__ \n\t"
+ "Lcpyodd_%=: \n\t"
+ " ld __tmp_reg__, Z+ \n\t"
+ " st X+, __tmp_reg__ \n\t"
+ "Lcpyeven_%=: \n\t"
+ " subi %A[num], 2 \n\t"
+ " brcc Lcpyloop_%= \n\t"
+ " sbci %B[num], 0 \n\t"
+ " brcc Lcpyloop_%= \n\t"
+ : [num] "+r" (num)
+ : [src] "r" (src),
+ [dst] "r" (dst)
+ : "memory"
+ );
+ return dst;
+}
+
+//__attribute__ ((noinline))
+void * memmove8 ( void * dst, const void* src, uint16_t num )
+{
+ if( src > dst) {
+ // if src > dst then we can use the forward-stepping memcpy8
+ return memcpy8( dst, src, num);
+ } else {
+ // if src < dst then we have to step backward:
+ dst = (char*)dst + num;
+ src = (char*)src + num;
+ asm volatile(
+ " movw r30, %[src] \n\t"
+ " movw r26, %[dst] \n\t"
+ " sbrs %A[num], 0 \n\t"
+ " rjmp Lmoveven_%= \n\t"
+ " rjmp Lmovodd_%= \n\t"
+ "Lmovloop_%=: \n\t"
+ " ld __tmp_reg__, -Z \n\t"
+ " st -X, __tmp_reg__ \n\t"
+ "Lmovodd_%=: \n\t"
+ " ld __tmp_reg__, -Z \n\t"
+ " st -X, __tmp_reg__ \n\t"
+ "Lmoveven_%=: \n\t"
+ " subi %A[num], 2 \n\t"
+ " brcc Lmovloop_%= \n\t"
+ " sbci %B[num], 0 \n\t"
+ " brcc Lmovloop_%= \n\t"
+ : [num] "+r" (num)
+ : [src] "r" (src),
+ [dst] "r" (dst)
+ : "memory"
+ );
+ return dst;
+ }
+}
+
+#endif /* AVR */
+
+
+
+
+#if 0
+// TEST / VERIFICATION CODE ONLY BELOW THIS POINT
+#include <Arduino.h>
+#include "lib8tion.h"
+
+void test1abs( int8_t i)
+{
+ Serial.print("abs("); Serial.print(i); Serial.print(") = ");
+ int8_t j = abs8(i);
+ Serial.print(j); Serial.println(" ");
+}
+
+void testabs()
+{
+ delay(5000);
+ for( int8_t q = -128; q != 127; q++) {
+ test1abs(q);
+ }
+ for(;;){};
+}
+
+
+void testmul8()
+{
+ delay(5000);
+ byte r, c;
+
+ Serial.println("mul8:");
+ for( r = 0; r <= 20; r += 1) {
+ Serial.print(r); Serial.print(" : ");
+ for( c = 0; c <= 20; c += 1) {
+ byte t;
+ t = mul8( r, c);
+ Serial.print(t); Serial.print(' ');
+ }
+ Serial.println(' ');
+ }
+ Serial.println("done.");
+ for(;;){};
+}
+
+
+void testscale8()
+{
+ delay(5000);
+ byte r, c;
+
+ Serial.println("scale8:");
+ for( r = 0; r <= 240; r += 10) {
+ Serial.print(r); Serial.print(" : ");
+ for( c = 0; c <= 240; c += 10) {
+ byte t;
+ t = scale8( r, c);
+ Serial.print(t); Serial.print(' ');
+ }
+ Serial.println(' ');
+ }
+
+ Serial.println(' ');
+ Serial.println("scale8_video:");
+
+ for( r = 0; r <= 100; r += 4) {
+ Serial.print(r); Serial.print(" : ");
+ for( c = 0; c <= 100; c += 4) {
+ byte t;
+ t = scale8_video( r, c);
+ Serial.print(t); Serial.print(' ');
+ }
+ Serial.println(' ');
+ }
+
+ Serial.println("done.");
+ for(;;){};
+}
+
+
+
+void testqadd8()
+{
+ delay(5000);
+ byte r, c;
+ for( r = 0; r <= 240; r += 10) {
+ Serial.print(r); Serial.print(" : ");
+ for( c = 0; c <= 240; c += 10) {
+ byte t;
+ t = qadd8( r, c);
+ Serial.print(t); Serial.print(' ');
+ }
+ Serial.println(' ');
+ }
+ Serial.println("done.");
+ for(;;){};
+}
+
+void testnscale8x3()
+{
+ delay(5000);
+ byte r, g, b, sc;
+ for( byte z = 0; z < 10; z++) {
+ r = random8(); g = random8(); b = random8(); sc = random8();
+
+ Serial.print("nscale8x3_video( ");
+ Serial.print(r); Serial.print(", ");
+ Serial.print(g); Serial.print(", ");
+ Serial.print(b); Serial.print(", ");
+ Serial.print(sc); Serial.print(") = [ ");
+
+ nscale8x3_video( r, g, b, sc);
+
+ Serial.print(r); Serial.print(", ");
+ Serial.print(g); Serial.print(", ");
+ Serial.print(b); Serial.print("]");
+
+ Serial.println(' ');
+ }
+ Serial.println("done.");
+ for(;;){};
+}
+
+#endif
diff --git a/lib/lib8tion/lib8tion.h b/lib/lib8tion/lib8tion.h
new file mode 100644
index 000000000..d93c748e6
--- /dev/null
+++ b/lib/lib8tion/lib8tion.h
@@ -0,0 +1,934 @@
+#ifndef __INC_LIB8TION_H
+#define __INC_LIB8TION_H
+
+/*
+
+ Fast, efficient 8-bit math functions specifically
+ designed for high-performance LED programming.
+
+ Because of the AVR(Arduino) and ARM assembly language
+ implementations provided, using these functions often
+ results in smaller and faster code than the equivalent
+ program using plain "C" arithmetic and logic.
+
+
+ Included are:
+
+
+ - Saturating unsigned 8-bit add and subtract.
+ Instead of wrapping around if an overflow occurs,
+ these routines just 'clamp' the output at a maxumum
+ of 255, or a minimum of 0. Useful for adding pixel
+ values. E.g., qadd8( 200, 100) = 255.
+
+ qadd8( i, j) == MIN( (i + j), 0xFF )
+ qsub8( i, j) == MAX( (i - j), 0 )
+
+ - Saturating signed 8-bit ("7-bit") add.
+ qadd7( i, j) == MIN( (i + j), 0x7F)
+
+
+ - Scaling (down) of unsigned 8- and 16- bit values.
+ Scaledown value is specified in 1/256ths.
+ scale8( i, sc) == (i * sc) / 256
+ scale16by8( i, sc) == (i * sc) / 256
+
+ Example: scaling a 0-255 value down into a
+ range from 0-99:
+ downscaled = scale8( originalnumber, 100);
+
+ A special version of scale8 is provided for scaling
+ LED brightness values, to make sure that they don't
+ accidentally scale down to total black at low
+ dimming levels, since that would look wrong:
+ scale8_video( i, sc) = ((i * sc) / 256) +? 1
+
+ Example: reducing an LED brightness by a
+ dimming factor:
+ new_bright = scale8_video( orig_bright, dimming);
+
+
+ - Fast 8- and 16- bit unsigned random numbers.
+ Significantly faster than Arduino random(), but
+ also somewhat less random. You can add entropy.
+ random8() == random from 0..255
+ random8( n) == random from 0..(N-1)
+ random8( n, m) == random from N..(M-1)
+
+ random16() == random from 0..65535
+ random16( n) == random from 0..(N-1)
+ random16( n, m) == random from N..(M-1)
+
+ random16_set_seed( k) == seed = k
+ random16_add_entropy( k) == seed += k
+
+
+ - Absolute value of a signed 8-bit value.
+ abs8( i) == abs( i)
+
+
+ - 8-bit math operations which return 8-bit values.
+ These are provided mostly for completeness,
+ not particularly for performance.
+ mul8( i, j) == (i * j) & 0xFF
+ add8( i, j) == (i + j) & 0xFF
+ sub8( i, j) == (i - j) & 0xFF
+
+
+ - Fast 16-bit approximations of sin and cos.
+ Input angle is a uint16_t from 0-65535.
+ Output is a signed int16_t from -32767 to 32767.
+ sin16( x) == sin( (x/32768.0) * pi) * 32767
+ cos16( x) == cos( (x/32768.0) * pi) * 32767
+ Accurate to more than 99% in all cases.
+
+ - Fast 8-bit approximations of sin and cos.
+ Input angle is a uint8_t from 0-255.
+ Output is an UNsigned uint8_t from 0 to 255.
+ sin8( x) == (sin( (x/128.0) * pi) * 128) + 128
+ cos8( x) == (cos( (x/128.0) * pi) * 128) + 128
+ Accurate to within about 2%.
+
+
+ - Fast 8-bit "easing in/out" function.
+ ease8InOutCubic(x) == 3(x^i) - 2(x^3)
+ ease8InOutApprox(x) ==
+ faster, rougher, approximation of cubic easing
+ ease8InOutQuad(x) == quadratic (vs cubic) easing
+
+ - Cubic, Quadratic, and Triangle wave functions.
+ Input is a uint8_t representing phase withing the wave,
+ similar to how sin8 takes an angle 'theta'.
+ Output is a uint8_t representing the amplitude of
+ the wave at that point.
+ cubicwave8( x)
+ quadwave8( x)
+ triwave8( x)
+
+ - Square root for 16-bit integers. About three times
+ faster and five times smaller than Arduino's built-in
+ generic 32-bit sqrt routine.
+ sqrt16( uint16_t x ) == sqrt( x)
+
+ - Dimming and brightening functions for 8-bit
+ light values.
+ dim8_video( x) == scale8_video( x, x)
+ dim8_raw( x) == scale8( x, x)
+ dim8_lin( x) == (x<128) ? ((x+1)/2) : scale8(x,x)
+ brighten8_video( x) == 255 - dim8_video( 255 - x)
+ brighten8_raw( x) == 255 - dim8_raw( 255 - x)
+ brighten8_lin( x) == 255 - dim8_lin( 255 - x)
+ The dimming functions in particular are suitable
+ for making LED light output appear more 'linear'.
+
+
+ - Linear interpolation between two values, with the
+ fraction between them expressed as an 8- or 16-bit
+ fixed point fraction (fract8 or fract16).
+ lerp8by8( fromU8, toU8, fract8 )
+ lerp16by8( fromU16, toU16, fract8 )
+ lerp15by8( fromS16, toS16, fract8 )
+ == from + (( to - from ) * fract8) / 256)
+ lerp16by16( fromU16, toU16, fract16 )
+ == from + (( to - from ) * fract16) / 65536)
+ map8( in, rangeStart, rangeEnd)
+ == map( in, 0, 255, rangeStart, rangeEnd);
+
+ - Optimized memmove, memcpy, and memset, that are
+ faster than standard avr-libc 1.8.
+ memmove8( dest, src, bytecount)
+ memcpy8( dest, src, bytecount)
+ memset8( buf, value, bytecount)
+
+ - Beat generators which return sine or sawtooth
+ waves in a specified number of Beats Per Minute.
+ Sine wave beat generators can specify a low and
+ high range for the output. Sawtooth wave beat
+ generators always range 0-255 or 0-65535.
+ beatsin8( BPM, low8, high8)
+ = (sine(beatphase) * (high8-low8)) + low8
+ beatsin16( BPM, low16, high16)
+ = (sine(beatphase) * (high16-low16)) + low16
+ beatsin88( BPM88, low16, high16)
+ = (sine(beatphase) * (high16-low16)) + low16
+ beat8( BPM) = 8-bit repeating sawtooth wave
+ beat16( BPM) = 16-bit repeating sawtooth wave
+ beat88( BPM88) = 16-bit repeating sawtooth wave
+ BPM is beats per minute in either simple form
+ e.g. 120, or Q8.8 fixed-point form.
+ BPM88 is beats per minute in ONLY Q8.8 fixed-point
+ form.
+
+Lib8tion is pronounced like 'libation': lie-BAY-shun
+
+*/
+
+
+
+#include <stdint.h>
+
+#define LIB8STATIC __attribute__ ((unused)) static inline
+#define LIB8STATIC_ALWAYS_INLINE __attribute__ ((always_inline)) static inline
+
+#if !defined(__AVR__)
+#include <string.h>
+// for memmove, memcpy, and memset if not defined here
+#endif
+
+#if defined(__arm__)
+
+#if defined(FASTLED_TEENSY3)
+// Can use Cortex M4 DSP instructions
+#define QADD8_C 0
+#define QADD7_C 0
+#define QADD8_ARM_DSP_ASM 1
+#define QADD7_ARM_DSP_ASM 1
+#else
+// Generic ARM
+#define QADD8_C 1
+#define QADD7_C 1
+#endif
+
+#define QSUB8_C 1
+#define SCALE8_C 1
+#define SCALE16BY8_C 1
+#define SCALE16_C 1
+#define ABS8_C 1
+#define MUL8_C 1
+#define QMUL8_C 1
+#define ADD8_C 1
+#define SUB8_C 1
+#define EASE8_C 1
+#define AVG8_C 1
+#define AVG7_C 1
+#define AVG16_C 1
+#define AVG15_C 1
+#define BLEND8_C 1
+
+
+#elif defined(__AVR__)
+
+// AVR ATmega and friends Arduino
+
+#define QADD8_C 0
+#define QADD7_C 0
+#define QSUB8_C 0
+#define ABS8_C 0
+#define ADD8_C 0
+#define SUB8_C 0
+#define AVG8_C 0
+#define AVG7_C 0
+#define AVG16_C 0
+#define AVG15_C 0
+
+#define QADD8_AVRASM 1
+#define QADD7_AVRASM 1
+#define QSUB8_AVRASM 1
+#define ABS8_AVRASM 1
+#define ADD8_AVRASM 1
+#define SUB8_AVRASM 1
+#define AVG8_AVRASM 1
+#define AVG7_AVRASM 1
+#define AVG16_AVRASM 1
+#define AVG15_AVRASM 1
+
+// Note: these require hardware MUL instruction
+// -- sorry, ATtiny!
+#if !defined(LIB8_ATTINY)
+#define SCALE8_C 0
+#define SCALE16BY8_C 0
+#define SCALE16_C 0
+#define MUL8_C 0
+#define QMUL8_C 0
+#define EASE8_C 0
+#define BLEND8_C 0
+#define SCALE8_AVRASM 1
+#define SCALE16BY8_AVRASM 1
+#define SCALE16_AVRASM 1
+#define MUL8_AVRASM 1
+#define QMUL8_AVRASM 1
+#define EASE8_AVRASM 1
+#define CLEANUP_R1_AVRASM 1
+#define BLEND8_AVRASM 1
+#else
+// On ATtiny, we just use C implementations
+#define SCALE8_C 1
+#define SCALE16BY8_C 1
+#define SCALE16_C 1
+#define MUL8_C 1
+#define QMUL8_C 1
+#define EASE8_C 1
+#define BLEND8_C 1
+#define SCALE8_AVRASM 0
+#define SCALE16BY8_AVRASM 0
+#define SCALE16_AVRASM 0
+#define MUL8_AVRASM 0
+#define QMUL8_AVRASM 0
+#define EASE8_AVRASM 0
+#define BLEND8_AVRASM 0
+#endif
+
+#else
+
+// unspecified architecture, so
+// no ASM, everything in C
+#define QADD8_C 1
+#define QADD7_C 1
+#define QSUB8_C 1
+#define SCALE8_C 1
+#define SCALE16BY8_C 1
+#define SCALE16_C 1
+#define ABS8_C 1
+#define MUL8_C 1
+#define QMUL8_C 1
+#define ADD8_C 1
+#define SUB8_C 1
+#define EASE8_C 1
+#define AVG8_C 1
+#define AVG7_C 1
+#define AVG16_C 1
+#define AVG15_C 1
+#define BLEND8_C 1
+
+#endif
+
+///@defgroup lib8tion Fast math functions
+///A variety of functions for working with numbers.
+///@{
+
+
+///////////////////////////////////////////////////////////////////////
+//
+// typdefs for fixed-point fractional types.
+//
+// sfract7 should be interpreted as signed 128ths.
+// fract8 should be interpreted as unsigned 256ths.
+// sfract15 should be interpreted as signed 32768ths.
+// fract16 should be interpreted as unsigned 65536ths.
+//
+// Example: if a fract8 has the value "64", that should be interpreted
+// as 64/256ths, or one-quarter.
+//
+//
+// fract8 range is 0 to 0.99609375
+// in steps of 0.00390625
+//
+// sfract7 range is -0.9921875 to 0.9921875
+// in steps of 0.0078125
+//
+// fract16 range is 0 to 0.99998474121
+// in steps of 0.00001525878
+//
+// sfract15 range is -0.99996948242 to 0.99996948242
+// in steps of 0.00003051757
+//
+
+/// ANSI unsigned short _Fract. range is 0 to 0.99609375
+/// in steps of 0.00390625
+typedef uint8_t fract8; ///< ANSI: unsigned short _Fract
+
+/// ANSI: signed short _Fract. range is -0.9921875 to 0.9921875
+/// in steps of 0.0078125
+typedef int8_t sfract7; ///< ANSI: signed short _Fract
+
+/// ANSI: unsigned _Fract. range is 0 to 0.99998474121
+/// in steps of 0.00001525878
+typedef uint16_t fract16; ///< ANSI: unsigned _Fract
+
+/// ANSI: signed _Fract. range is -0.99996948242 to 0.99996948242
+/// in steps of 0.00003051757
+typedef int16_t sfract15; ///< ANSI: signed _Fract
+
+
+// accumXY types should be interpreted as X bits of integer,
+// and Y bits of fraction.
+// E.g., accum88 has 8 bits of int, 8 bits of fraction
+
+typedef uint16_t accum88; ///< ANSI: unsigned short _Accum. 8 bits int, 8 bits fraction
+typedef int16_t saccum78; ///< ANSI: signed short _Accum. 7 bits int, 8 bits fraction
+typedef uint32_t accum1616;///< ANSI: signed _Accum. 16 bits int, 16 bits fraction
+typedef int32_t saccum1516;///< ANSI: signed _Accum. 15 bits int, 16 bits fraction
+typedef uint16_t accum124; ///< no direct ANSI counterpart. 12 bits int, 4 bits fraction
+typedef int32_t saccum114;///< no direct ANSI counterpart. 1 bit int, 14 bits fraction
+
+
+
+#include "math8.h"
+#include "scale8.h"
+#include "random8.h"
+#include "trig8.h"
+
+///////////////////////////////////////////////////////////////////////
+
+
+
+
+
+
+
+///////////////////////////////////////////////////////////////////////
+//
+// float-to-fixed and fixed-to-float conversions
+//
+// Note that anything involving a 'float' on AVR will be slower.
+
+/// sfract15ToFloat: conversion from sfract15 fixed point to
+/// IEEE754 32-bit float.
+LIB8STATIC float sfract15ToFloat( sfract15 y)
+{
+ return y / 32768.0;
+}
+
+/// conversion from IEEE754 float in the range (-1,1)
+/// to 16-bit fixed point. Note that the extremes of
+/// one and negative one are NOT representable. The
+/// representable range is basically
+LIB8STATIC sfract15 floatToSfract15( float f)
+{
+ return f * 32768.0;
+}
+
+
+
+///////////////////////////////////////////////////////////////////////
+//
+// memmove8, memcpy8, and memset8:
+// alternatives to memmove, memcpy, and memset that are
+// faster on AVR than standard avr-libc 1.8
+
+#if defined(__AVR__)
+void * memmove8( void * dst, const void * src, uint16_t num );
+void * memcpy8 ( void * dst, const void * src, uint16_t num ) __attribute__ ((noinline));
+void * memset8 ( void * ptr, uint8_t value, uint16_t num ) __attribute__ ((noinline)) ;
+#else
+// on non-AVR platforms, these names just call standard libc.
+#define memmove8 memmove
+#define memcpy8 memcpy
+#define memset8 memset
+#endif
+
+
+///////////////////////////////////////////////////////////////////////
+//
+// linear interpolation, such as could be used for Perlin noise, etc.
+//
+
+// A note on the structure of the lerp functions:
+// The cases for b>a and b<=a are handled separately for
+// speed: without knowing the relative order of a and b,
+// the value (a-b) might be overflow the width of a or b,
+// and have to be promoted to a wider, slower type.
+// To avoid that, we separate the two cases, and are able
+// to do all the math in the same width as the arguments,
+// which is much faster and smaller on AVR.
+
+/// linear interpolation between two unsigned 8-bit values,
+/// with 8-bit fraction
+LIB8STATIC uint8_t lerp8by8( uint8_t a, uint8_t b, fract8 frac)
+{
+ uint8_t result;
+ if( b > a) {
+ uint8_t delta = b - a;
+ uint8_t scaled = scale8( delta, frac);
+ result = a + scaled;
+ } else {
+ uint8_t delta = a - b;
+ uint8_t scaled = scale8( delta, frac);
+ result = a - scaled;
+ }
+ return result;
+}
+
+/// linear interpolation between two unsigned 16-bit values,
+/// with 16-bit fraction
+LIB8STATIC uint16_t lerp16by16( uint16_t a, uint16_t b, fract16 frac)
+{
+ uint16_t result;
+ if( b > a ) {
+ uint16_t delta = b - a;
+ uint16_t scaled = scale16(delta, frac);
+ result = a + scaled;
+ } else {
+ uint16_t delta = a - b;
+ uint16_t scaled = scale16( delta, frac);
+ result = a - scaled;
+ }
+ return result;
+}
+
+/// linear interpolation between two unsigned 16-bit values,
+/// with 8-bit fraction
+LIB8STATIC uint16_t lerp16by8( uint16_t a, uint16_t b, fract8 frac)
+{
+ uint16_t result;
+ if( b > a) {
+ uint16_t delta = b - a;
+ uint16_t scaled = scale16by8( delta, frac);
+ result = a + scaled;
+ } else {
+ uint16_t delta = a - b;
+ uint16_t scaled = scale16by8( delta, frac);
+ result = a - scaled;
+ }
+ return result;
+}
+
+/// linear interpolation between two signed 15-bit values,
+/// with 8-bit fraction
+LIB8STATIC int16_t lerp15by8( int16_t a, int16_t b, fract8 frac)
+{
+ int16_t result;
+ if( b > a) {
+ uint16_t delta = b - a;
+ uint16_t scaled = scale16by8( delta, frac);
+ result = a + scaled;
+ } else {
+ uint16_t delta = a - b;
+ uint16_t scaled = scale16by8( delta, frac);
+ result = a - scaled;
+ }
+ return result;
+}
+
+/// linear interpolation between two signed 15-bit values,
+/// with 8-bit fraction
+LIB8STATIC int16_t lerp15by16( int16_t a, int16_t b, fract16 frac)
+{
+ int16_t result;
+ if( b > a) {
+ uint16_t delta = b - a;
+ uint16_t scaled = scale16( delta, frac);
+ result = a + scaled;
+ } else {
+ uint16_t delta = a - b;
+ uint16_t scaled = scale16( delta, frac);
+ result = a - scaled;
+ }
+ return result;
+}
+
+/// map8: map from one full-range 8-bit value into a narrower
+/// range of 8-bit values, possibly a range of hues.
+///
+/// E.g. map myValue into a hue in the range blue..purple..pink..red
+/// hue = map8( myValue, HUE_BLUE, HUE_RED);
+///
+/// Combines nicely with the waveform functions (like sin8, etc)
+/// to produce continuous hue gradients back and forth:
+///
+/// hue = map8( sin8( myValue), HUE_BLUE, HUE_RED);
+///
+/// Mathematically simiar to lerp8by8, but arguments are more
+/// like Arduino's "map"; this function is similar to
+///
+/// map( in, 0, 255, rangeStart, rangeEnd)
+///
+/// but faster and specifically designed for 8-bit values.
+LIB8STATIC uint8_t map8( uint8_t in, uint8_t rangeStart, uint8_t rangeEnd)
+{
+ uint8_t rangeWidth = rangeEnd - rangeStart;
+ uint8_t out = scale8( in, rangeWidth);
+ out += rangeStart;
+ return out;
+}
+
+
+///////////////////////////////////////////////////////////////////////
+//
+// easing functions; see http://easings.net
+//
+
+/// ease8InOutQuad: 8-bit quadratic ease-in / ease-out function
+/// Takes around 13 cycles on AVR
+#if EASE8_C == 1
+LIB8STATIC uint8_t ease8InOutQuad( uint8_t i)
+{
+ uint8_t j = i;
+ if( j & 0x80 ) {
+ j = 255 - j;
+ }
+ uint8_t jj = scale8( j, j);
+ uint8_t jj2 = jj << 1;
+ if( i & 0x80 ) {
+ jj2 = 255 - jj2;
+ }
+ return jj2;
+}
+
+#elif EASE8_AVRASM == 1
+// This AVR asm version of ease8InOutQuad preserves one more
+// low-bit of precision than the C version, and is also slightly
+// smaller and faster.
+LIB8STATIC uint8_t ease8InOutQuad(uint8_t val) {
+ uint8_t j=val;
+ asm volatile (
+ "sbrc %[val], 7 \n"
+ "com %[j] \n"
+ "mul %[j], %[j] \n"
+ "add r0, %[j] \n"
+ "ldi %[j], 0 \n"
+ "adc %[j], r1 \n"
+ "lsl r0 \n" // carry = high bit of low byte of mul product
+ "rol %[j] \n" // j = (j * 2) + carry // preserve add'l bit of precision
+ "sbrc %[val], 7 \n"
+ "com %[j] \n"
+ "clr __zero_reg__ \n"
+ : [j] "+&a" (j)
+ : [val] "a" (val)
+ : "r0", "r1"
+ );
+ return j;
+}
+
+#else
+#error "No implementation for ease8InOutQuad available."
+#endif
+
+/// ease16InOutQuad: 16-bit quadratic ease-in / ease-out function
+// C implementation at this point
+LIB8STATIC uint16_t ease16InOutQuad( uint16_t i)
+{
+ uint16_t j = i;
+ if( j & 0x8000 ) {
+ j = 65535 - j;
+ }
+ uint16_t jj = scale16( j, j);
+ uint16_t jj2 = jj << 1;
+ if( i & 0x8000 ) {
+ jj2 = 65535 - jj2;
+ }
+ return jj2;
+}
+
+
+/// ease8InOutCubic: 8-bit cubic ease-in / ease-out function
+/// Takes around 18 cycles on AVR
+LIB8STATIC fract8 ease8InOutCubic( fract8 i)
+{
+ uint8_t ii = scale8_LEAVING_R1_DIRTY( i, i);
+ uint8_t iii = scale8_LEAVING_R1_DIRTY( ii, i);
+
+ uint16_t r1 = (3 * (uint16_t)(ii)) - ( 2 * (uint16_t)(iii));
+
+ /* the code generated for the above *'s automatically
+ cleans up R1, so there's no need to explicitily call
+ cleanup_R1(); */
+
+ uint8_t result = r1;
+
+ // if we got "256", return 255:
+ if( r1 & 0x100 ) {
+ result = 255;
+ }
+ return result;
+}
+
+/// ease8InOutApprox: fast, rough 8-bit ease-in/ease-out function
+/// shaped approximately like 'ease8InOutCubic',
+/// it's never off by more than a couple of percent
+/// from the actual cubic S-curve, and it executes
+/// more than twice as fast. Use when the cycles
+/// are more important than visual smoothness.
+/// Asm version takes around 7 cycles on AVR.
+
+#if EASE8_C == 1
+LIB8STATIC fract8 ease8InOutApprox( fract8 i)
+{
+ if( i < 64) {
+ // start with slope 0.5
+ i /= 2;
+ } else if( i > (255 - 64)) {
+ // end with slope 0.5
+ i = 255 - i;
+ i /= 2;
+ i = 255 - i;
+ } else {
+ // in the middle, use slope 192/128 = 1.5
+ i -= 64;
+ i += (i / 2);
+ i += 32;
+ }
+
+ return i;
+}
+
+#elif EASE8_AVRASM == 1
+LIB8STATIC uint8_t ease8InOutApprox( fract8 i)
+{
+ // takes around 7 cycles on AVR
+ asm volatile (
+ " subi %[i], 64 \n\t"
+ " cpi %[i], 128 \n\t"
+ " brcc Lshift_%= \n\t"
+
+ // middle case
+ " mov __tmp_reg__, %[i] \n\t"
+ " lsr __tmp_reg__ \n\t"
+ " add %[i], __tmp_reg__ \n\t"
+ " subi %[i], 224 \n\t"
+ " rjmp Ldone_%= \n\t"
+
+ // start or end case
+ "Lshift_%=: \n\t"
+ " lsr %[i] \n\t"
+ " subi %[i], 96 \n\t"
+
+ "Ldone_%=: \n\t"
+
+ : [i] "+&a" (i)
+ :
+ : "r0", "r1"
+ );
+ return i;
+}
+#else
+#error "No implementation for ease8 available."
+#endif
+
+
+
+/// triwave8: triangle (sawtooth) wave generator. Useful for
+/// turning a one-byte ever-increasing value into a
+/// one-byte value that oscillates up and down.
+///
+/// input output
+/// 0..127 0..254 (positive slope)
+/// 128..255 254..0 (negative slope)
+///
+/// On AVR this function takes just three cycles.
+///
+LIB8STATIC uint8_t triwave8(uint8_t in)
+{
+ if( in & 0x80) {
+ in = 255 - in;
+ }
+ uint8_t out = in << 1;
+ return out;
+}
+
+
+// quadwave8 and cubicwave8: S-shaped wave generators (like 'sine').
+// Useful for turning a one-byte 'counter' value into a
+// one-byte oscillating value that moves smoothly up and down,
+// with an 'acceleration' and 'deceleration' curve.
+//
+// These are even faster than 'sin8', and have
+// slightly different curve shapes.
+//
+
+/// quadwave8: quadratic waveform generator. Spends just a little more
+/// time at the limits than 'sine' does.
+LIB8STATIC uint8_t quadwave8(uint8_t in)
+{
+ return ease8InOutQuad( triwave8( in));
+}
+
+/// cubicwave8: cubic waveform generator. Spends visibly more time
+/// at the limits than 'sine' does.
+LIB8STATIC uint8_t cubicwave8(uint8_t in)
+{
+ return ease8InOutCubic( triwave8( in));
+}
+
+/// squarewave8: square wave generator. Useful for
+/// turning a one-byte ever-increasing value
+/// into a one-byte value that is either 0 or 255.
+/// The width of the output 'pulse' is
+/// determined by the pulsewidth argument:
+///
+///~~~
+/// If pulsewidth is 255, output is always 255.
+/// If pulsewidth < 255, then
+/// if input < pulsewidth then output is 255
+/// if input >= pulsewidth then output is 0
+///~~~
+///
+/// the output looking like:
+///
+///~~~
+/// 255 +--pulsewidth--+
+/// . | |
+/// 0 0 +--------(256-pulsewidth)--------
+///~~~
+///
+/// @param in
+/// @param pulsewidth
+/// @returns square wave output
+LIB8STATIC uint8_t squarewave8( uint8_t in, uint8_t pulsewidth)
+{
+ if( in < pulsewidth || (pulsewidth == 255)) {
+ return 255;
+ } else {
+ return 0;
+ }
+}
+
+
+// Beat generators - These functions produce waves at a given
+// number of 'beats per minute'. Internally, they use
+// the Arduino function 'millis' to track elapsed time.
+// Accuracy is a bit better than one part in a thousand.
+//
+// beat8( BPM ) returns an 8-bit value that cycles 'BPM' times
+// per minute, rising from 0 to 255, resetting to zero,
+// rising up again, etc.. The output of this function
+// is suitable for feeding directly into sin8, and cos8,
+// triwave8, quadwave8, and cubicwave8.
+// beat16( BPM ) returns a 16-bit value that cycles 'BPM' times
+// per minute, rising from 0 to 65535, resetting to zero,
+// rising up again, etc. The output of this function is
+// suitable for feeding directly into sin16 and cos16.
+// beat88( BPM88) is the same as beat16, except that the BPM88 argument
+// MUST be in Q8.8 fixed point format, e.g. 120BPM must
+// be specified as 120*256 = 30720.
+// beatsin8( BPM, uint8_t low, uint8_t high) returns an 8-bit value that
+// rises and falls in a sine wave, 'BPM' times per minute,
+// between the values of 'low' and 'high'.
+// beatsin16( BPM, uint16_t low, uint16_t high) returns a 16-bit value
+// that rises and falls in a sine wave, 'BPM' times per
+// minute, between the values of 'low' and 'high'.
+// beatsin88( BPM88, ...) is the same as beatsin16, except that the
+// BPM88 argument MUST be in Q8.8 fixed point format,
+// e.g. 120BPM must be specified as 120*256 = 30720.
+//
+// BPM can be supplied two ways. The simpler way of specifying BPM is as
+// a simple 8-bit integer from 1-255, (e.g., "120").
+// The more sophisticated way of specifying BPM allows for fractional
+// "Q8.8" fixed point number (an 'accum88') with an 8-bit integer part and
+// an 8-bit fractional part. The easiest way to construct this is to multiply
+// a floating point BPM value (e.g. 120.3) by 256, (e.g. resulting in 30796
+// in this case), and pass that as the 16-bit BPM argument.
+// "BPM88" MUST always be specified in Q8.8 format.
+//
+// Originally designed to make an entire animation project pulse with brightness.
+// For that effect, add this line just above your existing call to "FastLED.show()":
+//
+// uint8_t bright = beatsin8( 60 /*BPM*/, 192 /*dimmest*/, 255 /*brightest*/ ));
+// FastLED.setBrightness( bright );
+// FastLED.show();
+//
+// The entire animation will now pulse between brightness 192 and 255 once per second.
+
+
+// The beat generators need access to a millisecond counter.
+// On Arduino, this is "millis()". On other platforms, you'll
+// need to provide a function with this signature:
+// uint32_t get_millisecond_timer();
+// that provides similar functionality.
+// You can also force use of the get_millisecond_timer function
+// by #defining USE_GET_MILLISECOND_TIMER.
+#if (defined(ARDUINO) || defined(SPARK) || defined(FASTLED_HAS_MILLIS)) && !defined(USE_GET_MILLISECOND_TIMER)
+// Forward declaration of Arduino function 'millis'.
+//uint32_t millis();
+#define GET_MILLIS millis
+#else
+uint32_t get_millisecond_timer(void);
+#define GET_MILLIS get_millisecond_timer
+#endif
+
+// beat16 generates a 16-bit 'sawtooth' wave at a given BPM,
+/// with BPM specified in Q8.8 fixed-point format; e.g.
+/// for this function, 120 BPM MUST BE specified as
+/// 120*256 = 30720.
+/// If you just want to specify "120", use beat16 or beat8.
+LIB8STATIC uint16_t beat88( accum88 beats_per_minute_88, uint32_t timebase)
+{
+ // BPM is 'beats per minute', or 'beats per 60000ms'.
+ // To avoid using the (slower) division operator, we
+ // want to convert 'beats per 60000ms' to 'beats per 65536ms',
+ // and then use a simple, fast bit-shift to divide by 65536.
+ //
+ // The ratio 65536:60000 is 279.620266667:256; we'll call it 280:256.
+ // The conversion is accurate to about 0.05%, more or less,
+ // e.g. if you ask for "120 BPM", you'll get about "119.93".
+ return (((GET_MILLIS()) - timebase) * beats_per_minute_88 * 280) >> 16;
+}
+
+/// beat16 generates a 16-bit 'sawtooth' wave at a given BPM
+LIB8STATIC uint16_t beat16( accum88 beats_per_minute, uint32_t timebase)
+{
+ // Convert simple 8-bit BPM's to full Q8.8 accum88's if needed
+ if( beats_per_minute < 256) beats_per_minute <<= 8;
+ return beat88(beats_per_minute, timebase);
+}
+
+/// beat8 generates an 8-bit 'sawtooth' wave at a given BPM
+LIB8STATIC uint8_t beat8( accum88 beats_per_minute, uint32_t timebase)
+{
+ return beat16( beats_per_minute, timebase) >> 8;
+}
+
+/// beatsin88 generates a 16-bit sine wave at a given BPM,
+/// that oscillates within a given range.
+/// For this function, BPM MUST BE SPECIFIED as
+/// a Q8.8 fixed-point value; e.g. 120BPM must be
+/// specified as 120*256 = 30720.
+/// If you just want to specify "120", use beatsin16 or beatsin8.
+LIB8STATIC uint16_t beatsin88( accum88 beats_per_minute_88, uint16_t lowest, uint16_t highest, uint32_t timebase, uint16_t phase_offset)
+{
+ uint16_t beat = beat88( beats_per_minute_88, timebase);
+ uint16_t beatsin = (sin16( beat + phase_offset) + 32768);
+ uint16_t rangewidth = highest - lowest;
+ uint16_t scaledbeat = scale16( beatsin, rangewidth);
+ uint16_t result = lowest + scaledbeat;
+ return result;
+}
+
+/// beatsin16 generates a 16-bit sine wave at a given BPM,
+/// that oscillates within a given range.
+LIB8STATIC uint16_t beatsin16(accum88 beats_per_minute, uint16_t lowest, uint16_t highest, uint32_t timebase, uint16_t phase_offset)
+{
+ uint16_t beat = beat16( beats_per_minute, timebase);
+ uint16_t beatsin = (sin16( beat + phase_offset) + 32768);
+ uint16_t rangewidth = highest - lowest;
+ uint16_t scaledbeat = scale16( beatsin, rangewidth);
+ uint16_t result = lowest + scaledbeat;
+ return result;
+}
+
+/// beatsin8 generates an 8-bit sine wave at a given BPM,
+/// that oscillates within a given range.
+LIB8STATIC uint8_t beatsin8( accum88 beats_per_minute, uint8_t lowest, uint8_t highest, uint32_t timebase, uint8_t phase_offset)
+{
+ uint8_t beat = beat8( beats_per_minute, timebase);
+ uint8_t beatsin = sin8( beat + phase_offset);
+ uint8_t rangewidth = highest - lowest;
+ uint8_t scaledbeat = scale8( beatsin, rangewidth);
+ uint8_t result = lowest + scaledbeat;
+ return result;
+}
+
+
+/// Return the current seconds since boot in a 16-bit value. Used as part of the
+/// "every N time-periods" mechanism
+LIB8STATIC uint16_t seconds16(void)
+{
+ uint32_t ms = GET_MILLIS();
+ uint16_t s16;
+ s16 = ms / 1000;
+ return s16;
+}
+
+/// Return the current minutes since boot in a 16-bit value. Used as part of the
+/// "every N time-periods" mechanism
+LIB8STATIC uint16_t minutes16(void)
+{
+ uint32_t ms = GET_MILLIS();
+ uint16_t m16;
+ m16 = (ms / (60000L)) & 0xFFFF;
+ return m16;
+}
+
+/// Return the current hours since boot in an 8-bit value. Used as part of the
+/// "every N time-periods" mechanism
+LIB8STATIC uint8_t hours8(void)
+{
+ uint32_t ms = GET_MILLIS();
+ uint8_t h8;
+ h8 = (ms / (3600000L)) & 0xFF;
+ return h8;
+}
+
+///@}
+
+#endif
diff --git a/lib/lib8tion/math8.h b/lib/lib8tion/math8.h
new file mode 100644
index 000000000..8c6b6c227
--- /dev/null
+++ b/lib/lib8tion/math8.h
@@ -0,0 +1,552 @@
+#ifndef __INC_LIB8TION_MATH_H
+#define __INC_LIB8TION_MATH_H
+
+#include "scale8.h"
+
+///@ingroup lib8tion
+
+///@defgroup Math Basic math operations
+/// Fast, efficient 8-bit math functions specifically
+/// designed for high-performance LED programming.
+///
+/// Because of the AVR(Arduino) and ARM assembly language
+/// implementations provided, using these functions often
+/// results in smaller and faster code than the equivalent
+/// program using plain "C" arithmetic and logic.
+///@{
+
+
+/// add one byte to another, saturating at 0xFF
+/// @param i - first byte to add
+/// @param j - second byte to add
+/// @returns the sum of i & j, capped at 0xFF
+LIB8STATIC_ALWAYS_INLINE uint8_t qadd8( uint8_t i, uint8_t j)
+{
+#if QADD8_C == 1
+ uint16_t t = i + j;
+ if (t > 255) t = 255;
+ return t;
+#elif QADD8_AVRASM == 1
+ asm volatile(
+ /* First, add j to i, conditioning the C flag */
+ "add %0, %1 \n\t"
+
+ /* Now test the C flag.
+ If C is clear, we branch around a load of 0xFF into i.
+ If C is set, we go ahead and load 0xFF into i.
+ */
+ "brcc L_%= \n\t"
+ "ldi %0, 0xFF \n\t"
+ "L_%=: "
+ : "+a" (i)
+ : "a" (j) );
+ return i;
+#elif QADD8_ARM_DSP_ASM == 1
+ asm volatile( "uqadd8 %0, %0, %1" : "+r" (i) : "r" (j));
+ return i;
+#else
+#error "No implementation for qadd8 available."
+#endif
+}
+
+/// Add one byte to another, saturating at 0x7F
+/// @param i - first byte to add
+/// @param j - second byte to add
+/// @returns the sum of i & j, capped at 0xFF
+LIB8STATIC_ALWAYS_INLINE int8_t qadd7( int8_t i, int8_t j)
+{
+#if QADD7_C == 1
+ int16_t t = i + j;
+ if (t > 127) t = 127;
+ return t;
+#elif QADD7_AVRASM == 1
+ asm volatile(
+ /* First, add j to i, conditioning the V flag */
+ "add %0, %1 \n\t"
+
+ /* Now test the V flag.
+ If V is clear, we branch around a load of 0x7F into i.
+ If V is set, we go ahead and load 0x7F into i.
+ */
+ "brvc L_%= \n\t"
+ "ldi %0, 0x7F \n\t"
+ "L_%=: "
+ : "+a" (i)
+ : "a" (j) );
+
+ return i;
+#elif QADD7_ARM_DSP_ASM == 1
+ asm volatile( "qadd8 %0, %0, %1" : "+r" (i) : "r" (j));
+ return i;
+#else
+#error "No implementation for qadd7 available."
+#endif
+}
+
+/// subtract one byte from another, saturating at 0x00
+/// @returns i - j with a floor of 0
+LIB8STATIC_ALWAYS_INLINE uint8_t qsub8( uint8_t i, uint8_t j)
+{
+#if QSUB8_C == 1
+ int16_t t = i - j;
+ if (t < 0) t = 0;
+ return t;
+#elif QSUB8_AVRASM == 1
+
+ asm volatile(
+ /* First, subtract j from i, conditioning the C flag */
+ "sub %0, %1 \n\t"
+
+ /* Now test the C flag.
+ If C is clear, we branch around a load of 0x00 into i.
+ If C is set, we go ahead and load 0x00 into i.
+ */
+ "brcc L_%= \n\t"
+ "ldi %0, 0x00 \n\t"
+ "L_%=: "
+ : "+a" (i)
+ : "a" (j) );
+
+ return i;
+#else
+#error "No implementation for qsub8 available."
+#endif
+}
+
+/// add one byte to another, with one byte result
+LIB8STATIC_ALWAYS_INLINE uint8_t add8( uint8_t i, uint8_t j)
+{
+#if ADD8_C == 1
+ uint16_t t = i + j;
+ return t;
+#elif ADD8_AVRASM == 1
+ // Add j to i, period.
+ asm volatile( "add %0, %1" : "+a" (i) : "a" (j));
+ return i;
+#else
+#error "No implementation for add8 available."
+#endif
+}
+
+/// add one byte to another, with one byte result
+LIB8STATIC_ALWAYS_INLINE uint16_t add8to16( uint8_t i, uint16_t j)
+{
+#if ADD8_C == 1
+ uint16_t t = i + j;
+ return t;
+#elif ADD8_AVRASM == 1
+ // Add i(one byte) to j(two bytes)
+ asm volatile( "add %A[j], %[i] \n\t"
+ "adc %B[j], __zero_reg__ \n\t"
+ : [j] "+a" (j)
+ : [i] "a" (i)
+ );
+ return i;
+#else
+#error "No implementation for add8to16 available."
+#endif
+}
+
+
+/// subtract one byte from another, 8-bit result
+LIB8STATIC_ALWAYS_INLINE uint8_t sub8( uint8_t i, uint8_t j)
+{
+#if SUB8_C == 1
+ int16_t t = i - j;
+ return t;
+#elif SUB8_AVRASM == 1
+ // Subtract j from i, period.
+ asm volatile( "sub %0, %1" : "+a" (i) : "a" (j));
+ return i;
+#else
+#error "No implementation for sub8 available."
+#endif
+}
+
+/// Calculate an integer average of two unsigned
+/// 8-bit integer values (uint8_t).
+/// Fractional results are rounded down, e.g. avg8(20,41) = 30
+LIB8STATIC_ALWAYS_INLINE uint8_t avg8( uint8_t i, uint8_t j)
+{
+#if AVG8_C == 1
+ return (i + j) >> 1;
+#elif AVG8_AVRASM == 1
+ asm volatile(
+ /* First, add j to i, 9th bit overflows into C flag */
+ "add %0, %1 \n\t"
+ /* Divide by two, moving C flag into high 8th bit */
+ "ror %0 \n\t"
+ : "+a" (i)
+ : "a" (j) );
+ return i;
+#else
+#error "No implementation for avg8 available."
+#endif
+}
+
+/// Calculate an integer average of two unsigned
+/// 16-bit integer values (uint16_t).
+/// Fractional results are rounded down, e.g. avg16(20,41) = 30
+LIB8STATIC_ALWAYS_INLINE uint16_t avg16( uint16_t i, uint16_t j)
+{
+#if AVG16_C == 1
+ return (uint32_t)((uint32_t)(i) + (uint32_t)(j)) >> 1;
+#elif AVG16_AVRASM == 1
+ asm volatile(
+ /* First, add jLo (heh) to iLo, 9th bit overflows into C flag */
+ "add %A[i], %A[j] \n\t"
+ /* Now, add C + jHi to iHi, 17th bit overflows into C flag */
+ "adc %B[i], %B[j] \n\t"
+ /* Divide iHi by two, moving C flag into high 16th bit, old 9th bit now in C */
+ "ror %B[i] \n\t"
+ /* Divide iLo by two, moving C flag into high 8th bit */
+ "ror %A[i] \n\t"
+ : [i] "+a" (i)
+ : [j] "a" (j) );
+ return i;
+#else
+#error "No implementation for avg16 available."
+#endif
+}
+
+
+/// Calculate an integer average of two signed 7-bit
+/// integers (int8_t)
+/// If the first argument is even, result is rounded down.
+/// If the first argument is odd, result is result up.
+LIB8STATIC_ALWAYS_INLINE int8_t avg7( int8_t i, int8_t j)
+{
+#if AVG7_C == 1
+ return ((i + j) >> 1) + (i & 0x1);
+#elif AVG7_AVRASM == 1
+ asm volatile(
+ "asr %1 \n\t"
+ "asr %0 \n\t"
+ "adc %0, %1 \n\t"
+ : "+a" (i)
+ : "a" (j) );
+ return i;
+#else
+#error "No implementation for avg7 available."
+#endif
+}
+
+/// Calculate an integer average of two signed 15-bit
+/// integers (int16_t)
+/// If the first argument is even, result is rounded down.
+/// If the first argument is odd, result is result up.
+LIB8STATIC_ALWAYS_INLINE int16_t avg15( int16_t i, int16_t j)
+{
+#if AVG15_C == 1
+ return ((int32_t)((int32_t)(i) + (int32_t)(j)) >> 1) + (i & 0x1);
+#elif AVG15_AVRASM == 1
+ asm volatile(
+ /* first divide j by 2, throwing away lowest bit */
+ "asr %B[j] \n\t"
+ "ror %A[j] \n\t"
+ /* now divide i by 2, with lowest bit going into C */
+ "asr %B[i] \n\t"
+ "ror %A[i] \n\t"
+ /* add j + C to i */
+ "adc %A[i], %A[j] \n\t"
+ "adc %B[i], %B[j] \n\t"
+ : [i] "+a" (i)
+ : [j] "a" (j) );
+ return i;
+#else
+#error "No implementation for avg15 available."
+#endif
+}
+
+
+/// Calculate the remainder of one unsigned 8-bit
+/// value divided by anoter, aka A % M.
+/// Implemented by repeated subtraction, which is
+/// very compact, and very fast if A is 'probably'
+/// less than M. If A is a large multiple of M,
+/// the loop has to execute multiple times. However,
+/// even in that case, the loop is only two
+/// instructions long on AVR, i.e., quick.
+LIB8STATIC_ALWAYS_INLINE uint8_t mod8( uint8_t a, uint8_t m)
+{
+#if defined(__AVR__)
+ asm volatile (
+ "L_%=: sub %[a],%[m] \n\t"
+ " brcc L_%= \n\t"
+ " add %[a],%[m] \n\t"
+ : [a] "+r" (a)
+ : [m] "r" (m)
+ );
+#else
+ while( a >= m) a -= m;
+#endif
+ return a;
+}
+
+/// Add two numbers, and calculate the modulo
+/// of the sum and a third number, M.
+/// In other words, it returns (A+B) % M.
+/// It is designed as a compact mechanism for
+/// incrementing a 'mode' switch and wrapping
+/// around back to 'mode 0' when the switch
+/// goes past the end of the available range.
+/// e.g. if you have seven modes, this switches
+/// to the next one and wraps around if needed:
+/// mode = addmod8( mode, 1, 7);
+///LIB8STATIC_ALWAYS_INLINESee 'mod8' for notes on performance.
+LIB8STATIC uint8_t addmod8( uint8_t a, uint8_t b, uint8_t m)
+{
+#if defined(__AVR__)
+ asm volatile (
+ " add %[a],%[b] \n\t"
+ "L_%=: sub %[a],%[m] \n\t"
+ " brcc L_%= \n\t"
+ " add %[a],%[m] \n\t"
+ : [a] "+r" (a)
+ : [b] "r" (b), [m] "r" (m)
+ );
+#else
+ a += b;
+ while( a >= m) a -= m;
+#endif
+ return a;
+}
+
+/// Subtract two numbers, and calculate the modulo
+/// of the difference and a third number, M.
+/// In other words, it returns (A-B) % M.
+/// It is designed as a compact mechanism for
+/// incrementing a 'mode' switch and wrapping
+/// around back to 'mode 0' when the switch
+/// goes past the end of the available range.
+/// e.g. if you have seven modes, this switches
+/// to the next one and wraps around if needed:
+/// mode = addmod8( mode, 1, 7);
+///LIB8STATIC_ALWAYS_INLINESee 'mod8' for notes on performance.
+LIB8STATIC uint8_t submod8( uint8_t a, uint8_t b, uint8_t m)
+{
+#if defined(__AVR__)
+ asm volatile (
+ " sub %[a],%[b] \n\t"
+ "L_%=: sub %[a],%[m] \n\t"
+ " brcc L_%= \n\t"
+ " add %[a],%[m] \n\t"
+ : [a] "+r" (a)
+ : [b] "r" (b), [m] "r" (m)
+ );
+#else
+ a -= b;
+ while( a >= m) a -= m;
+#endif
+ return a;
+}
+
+/// 8x8 bit multiplication, with 8 bit result
+LIB8STATIC_ALWAYS_INLINE uint8_t mul8( uint8_t i, uint8_t j)
+{
+#if MUL8_C == 1
+ return ((uint16_t)i * (uint16_t)(j) ) & 0xFF;
+#elif MUL8_AVRASM == 1
+ asm volatile(
+ /* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */
+ "mul %0, %1 \n\t"
+ /* Extract the LOW 8-bits (r0) */
+ "mov %0, r0 \n\t"
+ /* Restore r1 to "0"; it's expected to always be that */
+ "clr __zero_reg__ \n\t"
+ : "+a" (i)
+ : "a" (j)
+ : "r0", "r1");
+
+ return i;
+#else
+#error "No implementation for mul8 available."
+#endif
+}
+
+
+/// saturating 8x8 bit multiplication, with 8 bit result
+/// @returns the product of i * j, capping at 0xFF
+LIB8STATIC_ALWAYS_INLINE uint8_t qmul8( uint8_t i, uint8_t j)
+{
+#if QMUL8_C == 1
+ int p = ((uint16_t)i * (uint16_t)(j) );
+ if( p > 255) p = 255;
+ return p;
+#elif QMUL8_AVRASM == 1
+ asm volatile(
+ /* Multiply 8-bit i * 8-bit j, giving 16-bit r1,r0 */
+ " mul %0, %1 \n\t"
+ /* If high byte of result is zero, all is well. */
+ " tst r1 \n\t"
+ " breq Lnospill_%= \n\t"
+ /* If high byte of result > 0, saturate low byte to 0xFF */
+ " ldi %0,0xFF \n\t"
+ " rjmp Ldone_%= \n\t"
+ "Lnospill_%=: \n\t"
+ /* Extract the LOW 8-bits (r0) */
+ " mov %0, r0 \n\t"
+ "Ldone_%=: \n\t"
+ /* Restore r1 to "0"; it's expected to always be that */
+ " clr __zero_reg__ \n\t"
+ : "+a" (i)
+ : "a" (j)
+ : "r0", "r1");
+
+ return i;
+#else
+#error "No implementation for qmul8 available."
+#endif
+}
+
+
+/// take abs() of a signed 8-bit uint8_t
+LIB8STATIC_ALWAYS_INLINE int8_t abs8( int8_t i)
+{
+#if ABS8_C == 1
+ if( i < 0) i = -i;
+ return i;
+#elif ABS8_AVRASM == 1
+
+
+ asm volatile(
+ /* First, check the high bit, and prepare to skip if it's clear */
+ "sbrc %0, 7 \n"
+
+ /* Negate the value */
+ "neg %0 \n"
+
+ : "+r" (i) : "r" (i) );
+ return i;
+#else
+#error "No implementation for abs8 available."
+#endif
+}
+
+/// square root for 16-bit integers
+/// About three times faster and five times smaller
+/// than Arduino's general sqrt on AVR.
+LIB8STATIC uint8_t sqrt16(uint16_t x)
+{
+ if( x <= 1) {
+ return x;
+ }
+
+ uint8_t low = 1; // lower bound
+ uint8_t hi, mid;
+
+ if( x > 7904) {
+ hi = 255;
+ } else {
+ hi = (x >> 5) + 8; // initial estimate for upper bound
+ }
+
+ do {
+ mid = (low + hi) >> 1;
+ if ((uint16_t)(mid * mid) > x) {
+ hi = mid - 1;
+ } else {
+ if( mid == 255) {
+ return 255;
+ }
+ low = mid + 1;
+ }
+ } while (hi >= low);
+
+ return low - 1;
+}
+
+/// blend a variable proproportion(0-255) of one byte to another
+/// @param a - the starting byte value
+/// @param b - the byte value to blend toward
+/// @param amountOfB - the proportion (0-255) of b to blend
+/// @returns a byte value between a and b, inclusive
+#if (FASTLED_BLEND_FIXED == 1)
+LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)
+{
+#if BLEND8_C == 1
+ uint16_t partial;
+ uint8_t result;
+
+ uint8_t amountOfA = 255 - amountOfB;
+
+ partial = (a * amountOfA);
+#if (FASTLED_SCALE8_FIXED == 1)
+ partial += a;
+ //partial = add8to16( a, partial);
+#endif
+
+ partial += (b * amountOfB);
+#if (FASTLED_SCALE8_FIXED == 1)
+ partial += b;
+ //partial = add8to16( b, partial);
+#endif
+
+ result = partial >> 8;
+
+ return result;
+
+#elif BLEND8_AVRASM == 1
+ uint16_t partial;
+ uint8_t result;
+
+ asm volatile (
+ /* partial = b * amountOfB */
+ " mul %[b], %[amountOfB] \n\t"
+ " movw %A[partial], r0 \n\t"
+
+ /* amountOfB (aka amountOfA) = 255 - amountOfB */
+ " com %[amountOfB] \n\t"
+
+ /* partial += a * amountOfB (aka amountOfA) */
+ " mul %[a], %[amountOfB] \n\t"
+
+ " add %A[partial], r0 \n\t"
+ " adc %B[partial], r1 \n\t"
+
+ " clr __zero_reg__ \n\t"
+
+#if (FASTLED_SCALE8_FIXED == 1)
+ /* partial += a */
+ " add %A[partial], %[a] \n\t"
+ " adc %B[partial], __zero_reg__ \n\t"
+
+ // partial += b
+ " add %A[partial], %[b] \n\t"
+ " adc %B[partial], __zero_reg__ \n\t"
+#endif
+
+ : [partial] "=r" (partial),
+ [amountOfB] "+a" (amountOfB)
+ : [a] "a" (a),
+ [b] "a" (b)
+ : "r0", "r1"
+ );
+
+ result = partial >> 8;
+
+ return result;
+
+#else
+#error "No implementation for blend8 available."
+#endif
+}
+
+#else
+LIB8STATIC uint8_t blend8( uint8_t a, uint8_t b, uint8_t amountOfB)
+{
+ // This version loses precision in the integer math
+ // and can actually return results outside of the range
+ // from a to b. Its use is not recommended.
+ uint8_t result;
+ uint8_t amountOfA = 255 - amountOfB;
+ result = scale8_LEAVING_R1_DIRTY( a, amountOfA)
+ + scale8_LEAVING_R1_DIRTY( b, amountOfB);
+ cleanup_R1();
+ return result;
+}
+#endif
+
+
+///@}
+#endif
diff --git a/lib/lib8tion/random8.h b/lib/lib8tion/random8.h
new file mode 100644
index 000000000..7ee67cbb3
--- /dev/null
+++ b/lib/lib8tion/random8.h
@@ -0,0 +1,94 @@
+#ifndef __INC_LIB8TION_RANDOM_H
+#define __INC_LIB8TION_RANDOM_H
+///@ingroup lib8tion
+
+///@defgroup Random Fast random number generators
+/// Fast 8- and 16- bit unsigned random numbers.
+/// Significantly faster than Arduino random(), but
+/// also somewhat less random. You can add entropy.
+///@{
+
+// X(n+1) = (2053 * X(n)) + 13849)
+#define FASTLED_RAND16_2053 ((uint16_t)(2053))
+#define FASTLED_RAND16_13849 ((uint16_t)(13849))
+
+/// random number seed
+extern uint16_t rand16seed;// = RAND16_SEED;
+
+/// Generate an 8-bit random number
+LIB8STATIC uint8_t random8(void)
+{
+ rand16seed = (rand16seed * FASTLED_RAND16_2053) + FASTLED_RAND16_13849;
+ // return the sum of the high and low bytes, for better
+ // mixing and non-sequential correlation
+ return (uint8_t)(((uint8_t)(rand16seed & 0xFF)) +
+ ((uint8_t)(rand16seed >> 8)));
+}
+
+/// Generate a 16 bit random number
+LIB8STATIC uint16_t random16(void)
+{
+ rand16seed = (rand16seed * FASTLED_RAND16_2053) + FASTLED_RAND16_13849;
+ return rand16seed;
+}
+
+/// Generate an 8-bit random number between 0 and lim
+/// @param lim the upper bound for the result
+LIB8STATIC uint8_t random8_max(uint8_t lim)
+{
+ uint8_t r = random8();
+ r = (r*lim) >> 8;
+ return r;
+}
+
+/// Generate an 8-bit random number in the given range
+/// @param min the lower bound for the random number
+/// @param lim the upper bound for the random number
+LIB8STATIC uint8_t random8_min_max(uint8_t min, uint8_t lim)
+{
+ uint8_t delta = lim - min;
+ uint8_t r = random8_max(delta) + min;
+ return r;
+}
+
+/// Generate an 16-bit random number between 0 and lim
+/// @param lim the upper bound for the result
+LIB8STATIC uint16_t random16_max(uint16_t lim)
+{
+ uint16_t r = random16();
+ uint32_t p = (uint32_t)lim * (uint32_t)r;
+ r = p >> 16;
+ return r;
+}
+
+/// Generate an 16-bit random number in the given range
+/// @param min the lower bound for the random number
+/// @param lim the upper bound for the random number
+LIB8STATIC uint16_t random16_min_max( uint16_t min, uint16_t lim)
+{
+ uint16_t delta = lim - min;
+ uint16_t r = random16_max(delta) + min;
+ return r;
+}
+
+/// Set the 16-bit seed used for the random number generator
+LIB8STATIC void random16_set_seed(uint16_t seed)
+{
+ rand16seed = seed;
+}
+
+/// Get the current seed value for the random number generator
+LIB8STATIC uint16_t random16_get_seed(void)
+{
+ return rand16seed;
+}
+
+/// Add entropy into the random number generator
+LIB8STATIC void random16_add_entropy(uint16_t entropy)
+{
+ rand16seed += entropy;
+}
+
+///@}
+
+#endif
diff --git a/lib/lib8tion/scale8.h b/lib/lib8tion/scale8.h
new file mode 100644
index 000000000..9895fd4d7
--- /dev/null
+++ b/lib/lib8tion/scale8.h
@@ -0,0 +1,542 @@
+#ifndef __INC_LIB8TION_SCALE_H
+#define __INC_LIB8TION_SCALE_H
+
+///@ingroup lib8tion
+
+///@defgroup Scaling Scaling functions
+/// Fast, efficient 8-bit scaling functions specifically
+/// designed for high-performance LED programming.
+///
+/// Because of the AVR(Arduino) and ARM assembly language
+/// implementations provided, using these functions often
+/// results in smaller and faster code than the equivalent
+/// program using plain "C" arithmetic and logic.
+///@{
+
+/// scale one byte by a second one, which is treated as
+/// the numerator of a fraction whose denominator is 256
+/// In other words, it computes i * (scale / 256)
+/// 4 clocks AVR with MUL, 2 clocks ARM
+LIB8STATIC_ALWAYS_INLINE uint8_t scale8( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1
+#if (FASTLED_SCALE8_FIXED == 1)
+ return (((uint16_t)i) * (1+(uint16_t)(scale))) >> 8;
+#else
+ return ((uint16_t)i * (uint16_t)(scale) ) >> 8;
+#endif
+#elif SCALE8_AVRASM == 1
+#if defined(LIB8_ATTINY)
+#if (FASTLED_SCALE8_FIXED == 1)
+ uint8_t work=i;
+#else
+ uint8_t work=0;
+#endif
+ uint8_t cnt=0x80;
+ asm volatile(
+#if (FASTLED_SCALE8_FIXED == 1)
+ " inc %[scale] \n\t"
+ " breq DONE_%= \n\t"
+ " clr %[work] \n\t"
+#endif
+ "LOOP_%=: \n\t"
+ /*" sbrc %[scale], 0 \n\t"
+ " add %[work], %[i] \n\t"
+ " ror %[work] \n\t"
+ " lsr %[scale] \n\t"
+ " clc \n\t"*/
+ " sbrc %[scale], 0 \n\t"
+ " add %[work], %[i] \n\t"
+ " ror %[work] \n\t"
+ " lsr %[scale] \n\t"
+ " lsr %[cnt] \n\t"
+ "brcc LOOP_%= \n\t"
+ "DONE_%=: \n\t"
+ : [work] "+r" (work), [cnt] "+r" (cnt)
+ : [scale] "r" (scale), [i] "r" (i)
+ :
+ );
+ return work;
+#else
+ asm volatile(
+#if (FASTLED_SCALE8_FIXED==1)
+ // Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0
+ "mul %0, %1 \n\t"
+ // Add i to r0, possibly setting the carry flag
+ "add r0, %0 \n\t"
+ // load the immediate 0 into i (note, this does _not_ touch any flags)
+ "ldi %0, 0x00 \n\t"
+ // walk and chew gum at the same time
+ "adc %0, r1 \n\t"
+#else
+ /* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */
+ "mul %0, %1 \n\t"
+ /* Move the high 8-bits of the product (r1) back to i */
+ "mov %0, r1 \n\t"
+ /* Restore r1 to "0"; it's expected to always be that */
+#endif
+ "clr __zero_reg__ \n\t"
+
+ : "+a" (i) /* writes to i */
+ : "a" (scale) /* uses scale */
+ : "r0", "r1" /* clobbers r0, r1 */ );
+
+ /* Return the result */
+ return i;
+#endif
+#else
+#error "No implementation for scale8 available."
+#endif
+}
+
+
+/// The "video" version of scale8 guarantees that the output will
+/// be only be zero if one or both of the inputs are zero. If both
+/// inputs are non-zero, the output is guaranteed to be non-zero.
+/// This makes for better 'video'/LED dimming, at the cost of
+/// several additional cycles.
+LIB8STATIC_ALWAYS_INLINE uint8_t scale8_video( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1 || defined(LIB8_ATTINY)
+ uint8_t j = (((int)i * (int)scale) >> 8) + ((i&&scale)?1:0);
+ // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+ // uint8_t j = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;
+ return j;
+#elif SCALE8_AVRASM == 1
+ uint8_t j=0;
+ asm volatile(
+ " tst %[i]\n\t"
+ " breq L_%=\n\t"
+ " mul %[i], %[scale]\n\t"
+ " mov %[j], r1\n\t"
+ " clr __zero_reg__\n\t"
+ " cpse %[scale], r1\n\t"
+ " subi %[j], 0xFF\n\t"
+ "L_%=: \n\t"
+ : [j] "+a" (j)
+ : [i] "a" (i), [scale] "a" (scale)
+ : "r0", "r1");
+
+ return j;
+ // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+ // asm volatile(
+ // " tst %0 \n"
+ // " breq L_%= \n"
+ // " mul %0, %1 \n"
+ // " mov %0, r1 \n"
+ // " add %0, %2 \n"
+ // " clr __zero_reg__ \n"
+ // "L_%=: \n"
+
+ // : "+a" (i)
+ // : "a" (scale), "a" (nonzeroscale)
+ // : "r0", "r1");
+
+ // // Return the result
+ // return i;
+#else
+#error "No implementation for scale8_video available."
+#endif
+}
+
+
+/// This version of scale8 does not clean up the R1 register on AVR
+/// If you are doing several 'scale8's in a row, use this, and
+/// then explicitly call cleanup_R1.
+LIB8STATIC_ALWAYS_INLINE uint8_t scale8_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1
+#if (FASTLED_SCALE8_FIXED == 1)
+ return (((uint16_t)i) * ((uint16_t)(scale)+1)) >> 8;
+#else
+ return ((int)i * (int)(scale) ) >> 8;
+#endif
+#elif SCALE8_AVRASM == 1
+ asm volatile(
+ #if (FASTLED_SCALE8_FIXED==1)
+ // Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0
+ "mul %0, %1 \n\t"
+ // Add i to r0, possibly setting the carry flag
+ "add r0, %0 \n\t"
+ // load the immediate 0 into i (note, this does _not_ touch any flags)
+ "ldi %0, 0x00 \n\t"
+ // walk and chew gum at the same time
+ "adc %0, r1 \n\t"
+ #else
+ /* Multiply 8-bit i * 8-bit scale, giving 16-bit r1,r0 */
+ "mul %0, %1 \n\t"
+ /* Move the high 8-bits of the product (r1) back to i */
+ "mov %0, r1 \n\t"
+ #endif
+ /* R1 IS LEFT DIRTY HERE; YOU MUST ZERO IT OUT YOURSELF */
+ /* "clr __zero_reg__ \n\t" */
+
+ : "+a" (i) /* writes to i */
+ : "a" (scale) /* uses scale */
+ : "r0", "r1" /* clobbers r0, r1 */ );
+
+ // Return the result
+ return i;
+#else
+#error "No implementation for scale8_LEAVING_R1_DIRTY available."
+#endif
+}
+
+
+/// This version of scale8_video does not clean up the R1 register on AVR
+/// If you are doing several 'scale8_video's in a row, use this, and
+/// then explicitly call cleanup_R1.
+LIB8STATIC_ALWAYS_INLINE uint8_t scale8_video_LEAVING_R1_DIRTY( uint8_t i, fract8 scale)
+{
+#if SCALE8_C == 1 || defined(LIB8_ATTINY)
+ uint8_t j = (((int)i * (int)scale) >> 8) + ((i&&scale)?1:0);
+ // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+ // uint8_t j = (i == 0) ? 0 : (((int)i * (int)(scale) ) >> 8) + nonzeroscale;
+ return j;
+#elif SCALE8_AVRASM == 1
+ uint8_t j=0;
+ asm volatile(
+ " tst %[i]\n\t"
+ " breq L_%=\n\t"
+ " mul %[i], %[scale]\n\t"
+ " mov %[j], r1\n\t"
+ " breq L_%=\n\t"
+ " subi %[j], 0xFF\n\t"
+ "L_%=: \n\t"
+ : [j] "+a" (j)
+ : [i] "a" (i), [scale] "a" (scale)
+ : "r0", "r1");
+
+ return j;
+ // uint8_t nonzeroscale = (scale != 0) ? 1 : 0;
+ // asm volatile(
+ // " tst %0 \n"
+ // " breq L_%= \n"
+ // " mul %0, %1 \n"
+ // " mov %0, r1 \n"
+ // " add %0, %2 \n"
+ // " clr __zero_reg__ \n"
+ // "L_%=: \n"
+
+ // : "+a" (i)
+ // : "a" (scale), "a" (nonzeroscale)
+ // : "r0", "r1");
+
+ // // Return the result
+ // return i;
+#else
+#error "No implementation for scale8_video_LEAVING_R1_DIRTY available."
+#endif
+}
+
+/// Clean up the r1 register after a series of *LEAVING_R1_DIRTY calls
+LIB8STATIC_ALWAYS_INLINE void cleanup_R1(void)
+{
+#if CLEANUP_R1_AVRASM == 1
+ // Restore r1 to "0"; it's expected to always be that
+ asm volatile( "clr __zero_reg__ \n\t" : : : "r1" );
+#endif
+}
+
+
+/// scale a 16-bit unsigned value by an 8-bit value,
+/// considered as numerator of a fraction whose denominator
+/// is 256. In other words, it computes i * (scale / 256)
+
+LIB8STATIC_ALWAYS_INLINE uint16_t scale16by8( uint16_t i, fract8 scale )
+{
+#if SCALE16BY8_C == 1
+ uint16_t result;
+#if FASTLED_SCALE8_FIXED == 1
+ result = (i * (1+((uint16_t)scale))) >> 8;
+#else
+ result = (i * scale) / 256;
+#endif
+ return result;
+#elif SCALE16BY8_AVRASM == 1
+#if FASTLED_SCALE8_FIXED == 1
+ uint16_t result = 0;
+ asm volatile(
+ // result.A = HighByte( (i.A x scale) + i.A )
+ " mul %A[i], %[scale] \n\t"
+ " add r0, %A[i] \n\t"
+ // " adc r1, [zero] \n\t"
+ // " mov %A[result], r1 \n\t"
+ " adc %A[result], r1 \n\t"
+
+ // result.A-B += i.B x scale
+ " mul %B[i], %[scale] \n\t"
+ " add %A[result], r0 \n\t"
+ " adc %B[result], r1 \n\t"
+
+ // cleanup r1
+ " clr __zero_reg__ \n\t"
+
+ // result.A-B += i.B
+ " add %A[result], %B[i] \n\t"
+ " adc %B[result], __zero_reg__ \n\t"
+
+ : [result] "+r" (result)
+ : [i] "r" (i), [scale] "r" (scale)
+ : "r0", "r1"
+ );
+ return result;
+#else
+ uint16_t result = 0;
+ asm volatile(
+ // result.A = HighByte(i.A x j )
+ " mul %A[i], %[scale] \n\t"
+ " mov %A[result], r1 \n\t"
+ //" clr %B[result] \n\t"
+
+ // result.A-B += i.B x j
+ " mul %B[i], %[scale] \n\t"
+ " add %A[result], r0 \n\t"
+ " adc %B[result], r1 \n\t"
+
+ // cleanup r1
+ " clr __zero_reg__ \n\t"
+
+ : [result] "+r" (result)
+ : [i] "r" (i), [scale] "r" (scale)
+ : "r0", "r1"
+ );
+ return result;
+#endif
+#else
+ #error "No implementation for scale16by8 available."
+#endif
+}
+
+/// scale a 16-bit unsigned value by a 16-bit value,
+/// considered as numerator of a fraction whose denominator
+/// is 65536. In other words, it computes i * (scale / 65536)
+
+LIB8STATIC uint16_t scale16( uint16_t i, fract16 scale )
+{
+ #if SCALE16_C == 1
+ uint16_t result;
+#if FASTLED_SCALE8_FIXED == 1
+ result = ((uint32_t)(i) * (1+(uint32_t)(scale))) / 65536;
+#else
+ result = ((uint32_t)(i) * (uint32_t)(scale)) / 65536;
+#endif
+ return result;
+#elif SCALE16_AVRASM == 1
+#if FASTLED_SCALE8_FIXED == 1
+ // implemented sort of like
+ // result = ((i * scale) + i ) / 65536
+ //
+ // why not like this, you may ask?
+ // result = (i * (scale+1)) / 65536
+ // the answer is that if scale is 65535, then scale+1
+ // will be zero, which is not what we want.
+ uint32_t result;
+ asm volatile(
+ // result.A-B = i.A x scale.A
+ " mul %A[i], %A[scale] \n\t"
+ // save results...
+ // basic idea:
+ //" mov %A[result], r0 \n\t"
+ //" mov %B[result], r1 \n\t"
+ // which can be written as...
+ " movw %A[result], r0 \n\t"
+ // Because we're going to add i.A-B to
+ // result.A-D, we DO need to keep both
+ // the r0 and r1 portions of the product
+ // UNlike in the 'unfixed scale8' version.
+ // So the movw here is needed.
+ : [result] "=r" (result)
+ : [i] "r" (i),
+ [scale] "r" (scale)
+ : "r0", "r1"
+ );
+
+ asm volatile(
+ // result.C-D = i.B x scale.B
+ " mul %B[i], %B[scale] \n\t"
+ //" mov %C[result], r0 \n\t"
+ //" mov %D[result], r1 \n\t"
+ " movw %C[result], r0 \n\t"
+ : [result] "+r" (result)
+ : [i] "r" (i),
+ [scale] "r" (scale)
+ : "r0", "r1"
+ );
+
+ const uint8_t zero = 0;
+ asm volatile(
+ // result.B-D += i.B x scale.A
+ " mul %B[i], %A[scale] \n\t"
+
+ " add %B[result], r0 \n\t"
+ " adc %C[result], r1 \n\t"
+ " adc %D[result], %[zero] \n\t"
+
+ // result.B-D += i.A x scale.B
+ " mul %A[i], %B[scale] \n\t"
+
+ " add %B[result], r0 \n\t"
+ " adc %C[result], r1 \n\t"
+ " adc %D[result], %[zero] \n\t"
+
+ // cleanup r1
+ " clr r1 \n\t"
+
+ : [result] "+r" (result)
+ : [i] "r" (i),
+ [scale] "r" (scale),
+ [zero] "r" (zero)
+ : "r0", "r1"
+ );
+
+ asm volatile(
+ // result.A-D += i.A-B
+ " add %A[result], %A[i] \n\t"
+ " adc %B[result], %B[i] \n\t"
+ " adc %C[result], %[zero] \n\t"
+ " adc %D[result], %[zero] \n\t"
+ : [result] "+r" (result)
+ : [i] "r" (i),
+ [zero] "r" (zero)
+ );
+
+ result = result >> 16;
+ return result;
+#else
+ uint32_t result;
+ asm volatile(
+ // result.A-B = i.A x scale.A
+ " mul %A[i], %A[scale] \n\t"
+ // save results...
+ // basic idea:
+ //" mov %A[result], r0 \n\t"
+ //" mov %B[result], r1 \n\t"
+ // which can be written as...
+ " movw %A[result], r0 \n\t"
+ // We actually don't need to do anything with r0,
+ // as result.A is never used again here, so we
+ // could just move the high byte, but movw is
+ // one clock cycle, just like mov, so might as
+ // well, in case we want to use this code for
+ // a generic 16x16 multiply somewhere.
+
+ : [result] "=r" (result)
+ : [i] "r" (i),
+ [scale] "r" (scale)
+ : "r0", "r1"
+ );
+
+ asm volatile(
+ // result.C-D = i.B x scale.B
+ " mul %B[i], %B[scale] \n\t"
+ //" mov %C[result], r0 \n\t"
+ //" mov %D[result], r1 \n\t"
+ " movw %C[result], r0 \n\t"
+ : [result] "+r" (result)
+ : [i] "r" (i),
+ [scale] "r" (scale)
+ : "r0", "r1"
+ );
+
+ const uint8_t zero = 0;
+ asm volatile(
+ // result.B-D += i.B x scale.A
+ " mul %B[i], %A[scale] \n\t"
+
+ " add %B[result], r0 \n\t"
+ " adc %C[result], r1 \n\t"
+ " adc %D[result], %[zero] \n\t"
+
+ // result.B-D += i.A x scale.B
+ " mul %A[i], %B[scale] \n\t"
+
+ " add %B[result], r0 \n\t"
+ " adc %C[result], r1 \n\t"
+ " adc %D[result], %[zero] \n\t"
+
+ // cleanup r1
+ " clr r1 \n\t"
+
+ : [result] "+r" (result)
+ : [i] "r" (i),
+ [scale] "r" (scale),
+ [zero] "r" (zero)
+ : "r0", "r1"
+ );
+
+ result = result >> 16;
+ return result;
+#endif
+#else
+ #error "No implementation for scale16 available."
+#endif
+}
+///@}
+
+///@defgroup Dimming Dimming and brightening functions
+///
+/// Dimming and brightening functions
+///
+/// The eye does not respond in a linear way to light.
+/// High speed PWM'd LEDs at 50% duty cycle appear far
+/// brighter then the 'half as bright' you might expect.
+///
+/// If you want your midpoint brightness leve (128) to
+/// appear half as bright as 'full' brightness (255), you
+/// have to apply a 'dimming function'.
+///@{
+
+/// Adjust a scaling value for dimming
+LIB8STATIC uint8_t dim8_raw( uint8_t x)
+{
+ return scale8( x, x);
+}
+
+/// Adjust a scaling value for dimming for video (value will never go below 1)
+LIB8STATIC uint8_t dim8_video( uint8_t x)
+{
+ return scale8_video( x, x);
+}
+
+/// Linear version of the dimming function that halves for values < 128
+LIB8STATIC uint8_t dim8_lin( uint8_t x )
+{
+ if( x & 0x80 ) {
+ x = scale8( x, x);
+ } else {
+ x += 1;
+ x /= 2;
+ }
+ return x;
+}
+
+/// inverse of the dimming function, brighten a value
+LIB8STATIC uint8_t brighten8_raw( uint8_t x)
+{
+ uint8_t ix = 255 - x;
+ return 255 - scale8( ix, ix);
+}
+
+/// inverse of the dimming function, brighten a value
+LIB8STATIC uint8_t brighten8_video( uint8_t x)
+{
+ uint8_t ix = 255 - x;
+ return 255 - scale8_video( ix, ix);
+}
+
+/// inverse of the dimming function, brighten a value
+LIB8STATIC uint8_t brighten8_lin( uint8_t x )
+{
+ uint8_t ix = 255 - x;
+ if( ix & 0x80 ) {
+ ix = scale8( ix, ix);
+ } else {
+ ix += 1;
+ ix /= 2;
+ }
+ return 255 - ix;
+}
+
+///@}
+#endif
diff --git a/lib/lib8tion/trig8.h b/lib/lib8tion/trig8.h
new file mode 100644
index 000000000..4907c6ff3
--- /dev/null
+++ b/lib/lib8tion/trig8.h
@@ -0,0 +1,259 @@
+#ifndef __INC_LIB8TION_TRIG_H
+#define __INC_LIB8TION_TRIG_H
+
+///@ingroup lib8tion
+
+///@defgroup Trig Fast trig functions
+/// Fast 8 and 16-bit approximations of sin(x) and cos(x).
+/// Don't use these approximations for calculating the
+/// trajectory of a rocket to Mars, but they're great
+/// for art projects and LED displays.
+///
+/// On Arduino/AVR, the 16-bit approximation is more than
+/// 10X faster than floating point sin(x) and cos(x), while
+/// the 8-bit approximation is more than 20X faster.
+///@{
+
+#if defined(__AVR__)
+#define sin16 sin16_avr
+#else
+#define sin16 sin16_C
+#endif
+
+/// Fast 16-bit approximation of sin(x). This approximation never varies more than
+/// 0.69% from the floating point value you'd get by doing
+///
+/// float s = sin(x) * 32767.0;
+///
+/// @param theta input angle from 0-65535
+/// @returns sin of theta, value between -32767 to 32767.
+LIB8STATIC int16_t sin16_avr( uint16_t theta )
+{
+ static const uint8_t data[] =
+ { 0, 0, 49, 0, 6393%256, 6393/256, 48, 0,
+ 12539%256, 12539/256, 44, 0, 18204%256, 18204/256, 38, 0,
+ 23170%256, 23170/256, 31, 0, 27245%256, 27245/256, 23, 0,
+ 30273%256, 30273/256, 14, 0, 32137%256, 32137/256, 4 /*,0*/ };
+
+ uint16_t offset = (theta & 0x3FFF);
+
+ // AVR doesn't have a multi-bit shift instruction,
+ // so if we say "offset >>= 3", gcc makes a tiny loop.
+ // Inserting empty volatile statements between each
+ // bit shift forces gcc to unroll the loop.
+ offset >>= 1; // 0..8191
+ asm volatile("");
+ offset >>= 1; // 0..4095
+ asm volatile("");
+ offset >>= 1; // 0..2047
+
+ if( theta & 0x4000 ) offset = 2047 - offset;
+
+ uint8_t sectionX4;
+ sectionX4 = offset / 256;
+ sectionX4 *= 4;
+
+ uint8_t m;
+
+ union {
+ uint16_t b;
+ struct {
+ uint8_t blo;
+ uint8_t bhi;
+ };
+ } u;
+
+ //in effect u.b = blo + (256 * bhi);
+ u.blo = data[ sectionX4 ];
+ u.bhi = data[ sectionX4 + 1];
+ m = data[ sectionX4 + 2];
+
+ uint8_t secoffset8 = (uint8_t)(offset) / 2;
+
+ uint16_t mx = m * secoffset8;
+
+ int16_t y = mx + u.b;
+ if( theta & 0x8000 ) y = -y;
+
+ return y;
+}
+
+/// Fast 16-bit approximation of sin(x). This approximation never varies more than
+/// 0.69% from the floating point value you'd get by doing
+///
+/// float s = sin(x) * 32767.0;
+///
+/// @param theta input angle from 0-65535
+/// @returns sin of theta, value between -32767 to 32767.
+LIB8STATIC int16_t sin16_C( uint16_t theta )
+{
+ static const uint16_t base[] =
+ { 0, 6393, 12539, 18204, 23170, 27245, 30273, 32137 };
+ static const uint8_t slope[] =
+ { 49, 48, 44, 38, 31, 23, 14, 4 };
+
+ uint16_t offset = (theta & 0x3FFF) >> 3; // 0..2047
+ if( theta & 0x4000 ) offset = 2047 - offset;
+
+ uint8_t section = offset / 256; // 0..7
+ uint16_t b = base[section];
+ uint8_t m = slope[section];
+
+ uint8_t secoffset8 = (uint8_t)(offset) / 2;
+
+ uint16_t mx = m * secoffset8;
+ int16_t y = mx + b;
+
+ if( theta & 0x8000 ) y = -y;
+
+ return y;
+}
+
+
+/// Fast 16-bit approximation of cos(x). This approximation never varies more than
+/// 0.69% from the floating point value you'd get by doing
+///
+/// float s = cos(x) * 32767.0;
+///
+/// @param theta input angle from 0-65535
+/// @returns sin of theta, value between -32767 to 32767.
+LIB8STATIC int16_t cos16( uint16_t theta)
+{
+ return sin16( theta + 16384);
+}
+
+///////////////////////////////////////////////////////////////////////
+
+// sin8 & cos8
+// Fast 8-bit approximations of sin(x) & cos(x).
+// Input angle is an unsigned int from 0-255.
+// Output is an unsigned int from 0 to 255.
+//
+// This approximation can vary to to 2%
+// from the floating point value you'd get by doing
+// float s = (sin( x ) * 128.0) + 128;
+//
+// Don't use this approximation for calculating the
+// "real" trigonometric calculations, but it's great
+// for art projects and LED displays.
+//
+// On Arduino/AVR, this approximation is more than
+// 20X faster than floating point sin(x) and cos(x)
+
+#if defined(__AVR__) && !defined(LIB8_ATTINY)
+#define sin8 sin8_avr
+#else
+#define sin8 sin8_C
+#endif
+
+
+const uint8_t b_m16_interleave[] = { 0, 49, 49, 41, 90, 27, 117, 10 };
+
+/// Fast 8-bit approximation of sin(x). This approximation never varies more than
+/// 2% from the floating point value you'd get by doing
+///
+/// float s = (sin(x) * 128.0) + 128;
+///
+/// @param theta input angle from 0-255
+/// @returns sin of theta, value between 0 and 255
+LIB8STATIC uint8_t sin8_avr( uint8_t theta)
+{
+ uint8_t offset = theta;
+
+ asm volatile(
+ "sbrc %[theta],6 \n\t"
+ "com %[offset] \n\t"
+ : [theta] "+r" (theta), [offset] "+r" (offset)
+ );
+
+ offset &= 0x3F; // 0..63
+
+ uint8_t secoffset = offset & 0x0F; // 0..15
+ if( theta & 0x40) secoffset++;
+
+ uint8_t m16; uint8_t b;
+
+ uint8_t section = offset >> 4; // 0..3
+ uint8_t s2 = section * 2;
+
+ const uint8_t* p = b_m16_interleave;
+ p += s2;
+ b = *p;
+ p++;
+ m16 = *p;
+
+ uint8_t mx;
+ uint8_t xr1;
+ asm volatile(
+ "mul %[m16],%[secoffset] \n\t"
+ "mov %[mx],r0 \n\t"
+ "mov %[xr1],r1 \n\t"
+ "eor r1, r1 \n\t"
+ "swap %[mx] \n\t"
+ "andi %[mx],0x0F \n\t"
+ "swap %[xr1] \n\t"
+ "andi %[xr1], 0xF0 \n\t"
+ "or %[mx], %[xr1] \n\t"
+ : [mx] "=d" (mx), [xr1] "=d" (xr1)
+ : [m16] "d" (m16), [secoffset] "d" (secoffset)
+ );
+
+ int8_t y = mx + b;
+ if( theta & 0x80 ) y = -y;
+
+ y += 128;
+
+ return y;
+}
+
+
+/// Fast 8-bit approximation of sin(x). This approximation never varies more than
+/// 2% from the floating point value you'd get by doing
+///
+/// float s = (sin(x) * 128.0) + 128;
+///
+/// @param theta input angle from 0-255
+/// @returns sin of theta, value between 0 and 255
+LIB8STATIC uint8_t sin8_C( uint8_t theta)
+{
+ uint8_t offset = theta;
+ if( theta & 0x40 ) {
+ offset = (uint8_t)255 - offset;
+ }
+ offset &= 0x3F; // 0..63
+
+ uint8_t secoffset = offset & 0x0F; // 0..15
+ if( theta & 0x40) secoffset++;
+
+ uint8_t section = offset >> 4; // 0..3
+ uint8_t s2 = section * 2;
+ const uint8_t* p = b_m16_interleave;
+ p += s2;
+ uint8_t b = *p;
+ p++;
+ uint8_t m16 = *p;
+
+ uint8_t mx = (m16 * secoffset) >> 4;
+
+ int8_t y = mx + b;
+ if( theta & 0x80 ) y = -y;
+
+ y += 128;
+
+ return y;
+}
+
+/// Fast 8-bit approximation of cos(x). This approximation never varies more than
+/// 2% from the floating point value you'd get by doing
+///
+/// float s = (cos(x) * 128.0) + 128;
+///
+/// @param theta input angle from 0-255
+/// @returns sin of theta, value between 0 and 255
+LIB8STATIC uint8_t cos8( uint8_t theta)
+{
+ return sin8( theta + 64);
+}
+
+///@}
+#endif