diff --git a/colorchord2/Makefile b/colorchord2/Makefile
index f393fc3..1299852 100644
--- a/colorchord2/Makefile
+++ b/colorchord2/Makefile
@@ -17,7 +17,7 @@ LDLIBS:=-lpthread -lasound -lm -lpulse-simple -lpulse -ludev -lrt
 CFLAGS:=-g -O0 -flto -Wall -ffast-math -I../embeddedcommon -I. -DICACHE_FLASH_ATTR=
 EXTRALIBS:=-lusb-1.0
 
-colorchord : os_generic.o main.o  dft.o decompose.o filter.o color.o notefinder.o util.o outdrivers.o $(RAWDRAW) $(SOUND) $(OUTS) parameters.o chash.o hook.o ../embeddedcommon/DFT32.o configs.o
+colorchord : os_generic.o main.o  dft.o decompose.o filter.o color.o notefinder.o util.o outdrivers.o $(RAWDRAW) $(SOUND) $(OUTS) parameters.o chash.o hook.o ../embeddedcommon/DFT32.o configs.o ../embeddedcommon/DFT8Turbo.o ../embeddedcommon/DFT8Padauk.o
 	gcc -o $@ $^ $(CFLAGS) $(LDLIBS) $(EXTRALIBS) $(RAWDRAWLIBS)
 
 
@@ -26,4 +26,4 @@ colorchord.exe : os_generic.c main.c  dft.c decompose.c filter.c color.c notefin
 
 
 clean :
-	rm -rf *.o *~ colorchord colorchord.exe embeddedcc
+	rm -rf *.o *~ ../embeddedcommon/*.o colorchord colorchord.exe embeddedcc
diff --git a/colorchord2/colorchord.exe b/colorchord2/colorchord.exe
deleted file mode 100644
index 7afcc43..0000000
Binary files a/colorchord2/colorchord.exe and /dev/null differ
diff --git a/colorchord2/default.conf b/colorchord2/default.conf
index c8d2a9d..297c676 100644
--- a/colorchord2/default.conf
+++ b/colorchord2/default.conf
@@ -58,8 +58,9 @@ octaves = 5
 # 1 = DFT Progressive
 # 2 = DFT Progressive Integer	
 # 3 = DFT Progressive Integer Skippy
-# 4 = Integer, 32-Bit, Progressive, Skippy.
-do_progressive_dft = 4
+# 4 = Integer, 32-Bit, Progressive, Skippy. (wow, this actually works)
+# 5 = 8-bit turbo test.
+do_progressive_dft = 5
 
 
 filter_iter = 2
diff --git a/colorchord2/main.c b/colorchord2/main.c
index adc00c6..02f3a5c 100644
--- a/colorchord2/main.c
+++ b/colorchord2/main.c
@@ -54,6 +54,7 @@ float cpu_autolimit_interval = 0.016; 	REGISTER_PARAM( cpu_autolimit_interval, P
 int sample_channel = -1;REGISTER_PARAM( sample_channel, PAINT );
 int showfps = 0;        REGISTER_PARAM( showfps, PAINT );
 float in_amplitude = 1; REGISTER_PARAM( in_amplitude, PAFLOAT );
+int shim_sinewave = 0;  REGISTER_PARAM( shim_sinewave, PAINT );
 
 struct NoteFinder * nf;
 
@@ -96,6 +97,9 @@ void HandleMotion( int x, int y, int mask )
 
 void SoundCB( float * out, float * in, int samplesr, int * samplesp, struct SoundDriver * sd )
 {
+	static og_sema_t tss;
+	if( !tss ) tss = OGCreateSema();
+	else OGLockSema( tss );
 	int channelin = sd->channelsRec;
 //	int channelout = sd->channelsPlay;
 	//*samplesp = 0;
@@ -106,53 +110,90 @@ void SoundCB( float * out, float * in, int samplesr, int * samplesp, struct Soun
 	int i;
 	int j;
 
-	for( i = 0; i < samplesr; i++ )
+	if( out )
 	{
-		if( out )
+		for( i = 0; i < samplesr; i++ )
 		{
 			for( j = 0; j < channelin; j++ )
 			{
 				out[i*channelin+j] = 0;
 			}
 		}
+	}
 
-		if( sample_channel < 0 )
-		{
-			float fo = 0;
-			for( j = 0; j < channelin; j++ )
-			{
-				float f = in[i*channelin+j];
-				if( f >= -1 && f <= 1 )
-				{
-					fo += f;
-				}
-				else
-				{
-					fo += (f>0)?1:-1;
-//					printf( "Sound fault A %d/%d %d/%d %f\n", j, channelin, i, samplesr, f );
-				}
-			}
+	if( shim_sinewave )
+	{
+		static double sinplace;
+		static double sinfreq = 0;
+		static int msp;
 
-			fo /= channelin;
-			sound[soundhead] = fo*in_amplitude;
-			soundhead = (soundhead+1)%SOUNDCBSIZE;
-		}
-		else
+		for( i = 0; i < samplesr; i++ )
 		{
-			float f = in[i*channelin+sample_channel];
+			sinfreq = 3.14159 * 2 * 110 * pow( 2, 5.0/12 ) / 16000;
+//			sinfreq += .000001;
+//			if( sinfreq > .2 ) sinfreq = 0;
+			sinplace += sinfreq;
+			if( sinplace > (3.14159*2) ) sinplace -= 3.14159 * 2;
+
+			msp++;
+			float f = sin( sinplace );
+			//if( msp % 20000 > 10000 ) f = 0;
 
 			if( f > 1 || f < -1 )
 			{ 	
 				f = (f>0)?1:-1;
 			}
 
-
 			//printf( "Sound fault B %d/%d\n", i, samplesr );
 			sound[soundhead] = f*in_amplitude;
 			soundhead = (soundhead+1)%SOUNDCBSIZE;
-
 		}
 	}
+	else
+	{
+		if( sample_channel < 0 )
+		{
+			for( i = 0; i < samplesr; i++ )
+			{
+				float fo = 0;
+				for( j = 0; j < channelin; j++ )
+				{
+					float f = in[i*channelin+j];
+					if( f >= -1 && f <= 1 )
+					{
+						fo += f;
+					}
+					else
+					{
+						fo += (f>0)?1:-1;
+	//					printf( "Sound fault A %d/%d %d/%d %f\n", j, channelin, i, samplesr, f );
+					}
+				}
+
+				fo /= channelin;
+				sound[soundhead] = fo*in_amplitude;
+				soundhead = (soundhead+1)%SOUNDCBSIZE;
+			}
+		}
+		else
+		{
+			for( i = 0; i < samplesr; i++ )
+			{
+				float f = in[i*channelin+sample_channel];
+
+				if( f > 1 || f < -1 )
+				{
+					f = (f>0)?1:-1;
+				}
+
+
+				//printf( "Sound fault B %d/%d\n", i, samplesr );
+				sound[soundhead] = f*in_amplitude;
+				soundhead = (soundhead+1)%SOUNDCBSIZE;
+			}
+		}
+	}
+
 
 	SoundEventHappened( samplesr, in, 0, channelin );
 	if( out )
@@ -160,6 +201,8 @@ void SoundCB( float * out, float * in, int samplesr, int * samplesp, struct Soun
 		SoundEventHappened( samplesr, out, 1, sd->channelsPlay );
 	}
 	*samplesp = samplesr;
+	OGUnlockSema( tss );
+
 }
 
 int main(int argc, char ** argv)
diff --git a/colorchord2/notefinder.c b/colorchord2/notefinder.c
index f0e1178..02bf0b7 100644
--- a/colorchord2/notefinder.c
+++ b/colorchord2/notefinder.c
@@ -11,6 +11,8 @@
 #include "filter.h"
 #include "decompose.h"
 #include "DFT32.h"
+#include "DFT8Turbo.h"
+#include "DFT8Padauk.h"
 
 struct NoteFinder * CreateNoteFinder( int spsRec )
 {
@@ -199,6 +201,12 @@ void RunNoteFinder( struct NoteFinder * nf, const float * audio_stream, int head
 	case 4:
 		DoDFTProgressive32( dftbins, nf->frequencies, freqs, audio_stream, head, buffersize, nf->dft_q, nf->dft_speedup );
 		break;
+	case 5:
+		DoDFT8BitTurbo( dftbins, nf->frequencies, freqs, audio_stream, head, buffersize, nf->dft_q, nf->dft_speedup );
+		break;
+	case 6:
+		DoDFT8BitPadauk( dftbins, nf->frequencies, freqs, audio_stream, head, buffersize, nf->dft_q, nf->dft_speedup );
+		break;
 	default:
 		fprintf( stderr, "Error: No DFT Seleced\n" );
 	}
diff --git a/colorchord2/turbo8bit.conf b/colorchord2/turbo8bit.conf
new file mode 100644
index 0000000..039af99
--- /dev/null
+++ b/colorchord2/turbo8bit.conf
@@ -0,0 +1,95 @@
+# This is the configuration file for colorchord.
+# Most values are already defaulted in the software.
+# This file is constantly checked for new versions.
+# \r, and ; are used as terminators, so you can put
+# multiple entries on the same line.
+
+#Whether to limit the control loop to ~60ish FPS.
+cpu_autolimit = 1
+
+#General GUI properties.
+title = PA Test
+set_screenx = 720
+set_screeny = 480
+
+#Sound properties.
+buffer = 384
+play = 0
+rec = 1
+channels = 2
+
+
+
+# This matters for CC Turbo8
+# What is the base note?  I.e. the lowest note. 
+# Note that it won't have very much impact until an octave up though!
+
+#These two are carefully selected.  You should pick a base note such that it fully saturates the sample frequency.
+#10000 / 2^4{octaves} / 8
+base_hz = 82.41
+samplerate = 10000
+freqbins = 12
+octaves = 4
+do_progressive_dft=6
+
+
+slope = 0
+wininput = -1
+
+#Compiled version will default this.
+#sound_source = ALSA
+#-1 indicates left and right, 0 left, 1 right.
+
+sample_channel = -1
+sourcename = default
+#alsa_output.pci-0000_00_1f.3.analog-stereo.monitor
+#default
+# alsa_output.pci-0000_00_1b.0.analog-stereo.monitor
+#alsa_output.pci-0000_00_1f.3.analog-stereo.monitor << New laptop
+#use pactl list | grep pci- | grep monitor
+
+##################################
+# General ColorChord properties. #
+##################################
+
+# How much to amplify the incoming signal.
+amplify = 2.0
+
+
+# This is only used when dealing with the slow decompose (now defunct)
+# decompose_iterations = 1000
+# default_sigma = 1.4000
+
+
+# For the final note information... How much to slack everything?
+note_attach_amp_iir = 0.3500
+note_attach_amp_iir2 = 0.250
+note_attach_freq_iir = 0.3000
+
+#How many bins a note can jump from frame to frame to be considered a slide.
+#this is used to prevent notes from popping in and out a lot.
+note_combine_distance = 0.5000
+note_jumpability = 1.8000
+note_minimum_new_distribution_value = 0.0200
+note_out_chop = 0.05000
+
+#compress_coefficient = 4.0
+#compress_exponent = .5
+
+
+#=======================================================================
+#Outputs
+
+
+shim_sinewave = 0
+
+#This is a Voronoi thing:
+outdrivers = OutputVoronoi, DisplayArray
+lightx = 64
+lighty = 32
+fromsides = 1
+shape_cutoff = 0.03
+satamp = 5.000
+amppow = 2.510
+distpow = 1.500
+
diff --git a/embedded8266/esp82xx b/embedded8266/esp82xx
index a08b471..113e0d1 160000
--- a/embedded8266/esp82xx
+++ b/embedded8266/esp82xx
@@ -1 +1 @@
-Subproject commit a08b47184b3fcf04172ecc0b6a1aee9c90e5d92d
+Subproject commit 113e0d1a182cd138510f748abf2854c0e84cfa23
diff --git a/embeddedcommon/DFT12Small.c b/embeddedcommon/DFT12Small.c
new file mode 100644
index 0000000..6d04241
--- /dev/null
+++ b/embeddedcommon/DFT12Small.c
@@ -0,0 +1,346 @@
+//NOTE DO NOT EDIT THIS FILE WITHOUT ALSO EDITING DFT8TURBO!!!
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "DFT12Small.h"
+#include <math.h>
+
+#include <stdio.h>
+
+
+#define MAX_FREQS (12)
+#define OCTAVES   (4)
+
+/*
+	General procedure - use this code, with uint16_t or uint32_t buffers, and make sure none of the alarms go off.
+		All of the paths still require no more than an 8-bit multiply.
+		You should test with extreme cases, like square wave sweeps in, etc.
+*/
+
+//#define TWELVEBIT
+#define EIGHTBIT
+
+#ifdef TWELVEBIT
+//No larger than 12-bit signed values for integration or sincos 
+#define FRONTEND_AMPLITUDE (0)
+#define INITIAL_DECIMATE (2)
+#define INTEGRATOR_DECIMATE (8)
+#define FINAL_DECIMATE (4)
+#elif defined( EIGHTBIT )
+//No larger than 8-bit signed values for integration or sincos
+#define FRONTEND_AMPLITUDE (2)
+#define INITIAL_DECIMATE (5) //Yurgh... only 3 bits of ADC data.  That's 8 unique levels :(
+#define INTEGRATOR_DECIMATE (8)
+#define FINAL_DECIMATE (1)
+#endif
+
+
+//4x the hits (sin/cos and we need to do it once for each edge)
+//8x for selecting a higher octave.
+#define FREQREBASE 8.0 
+#define TARGFREQ 10000.0
+
+/* Tradeoff guide:
+
+	* We will optimize for RAM size here.
+
+	* INITIAL_DECIMATE; A larger decimation: {NOTE 1}
+		+) Reduces the bit depth needed for the integral map.
+			If you use "1" and a fully saturated map (highest note is every sample), it will not overflow a signed 12-bit number.
+		-) Increases noise.  
+			With full-scale: 0->1 minimal 1->2 minimal 2->3 significantly noticeable, 3->4 major.
+			If sound is quieter, it matters more.  Not sure with other changes in system. (2) seems ok.
+		-) If you make it (1) or (0) You can't do an 8-bit multiply and keep the output in a signed range.
+	Also, other things, like frequency of hits can manipulate the maximum bit depth needed for integral map.
+
+	* If you weight the bins in advance see "mulmux", you can:	{NOTE 2}
+		+) potentially use shallower bit depth but
+		-) have to compute the multiply every time you update the bin.
+
+	* You can use a modified-square-wave which only integrates for 1/2 of the duty cycle. {NOTE 3}
+		+) uses 1/2 the integral memory.
+		-) Not as pretty of an output.  See "integral_at"
+
+	*TODO: Investigate using all unsigned (to make multiply and/or 12-bit storage easier)
+	*TODO: Consider a mode which has 16-bit integrals, but still 8-bit cossin data.
+
+	So, the idea here is we would keep a running total of the current ADC value, kept away in a int16_t.
+	It is constantly summing, so we can take an integral of it.  Or rather an integral range.
+
+	Over time, we perform operations like adding or subtracting from a current place.  It basically is
+	a DFT where the kernel is computed using square waves (or modified square waves)
+*/
+
+//These live in RAM.
+static int16_t running_integral; //Running sum of the (decimated) input samples.  Realistically treat as 12-bits on ramjet8
+static int16_t integral_at[MAX_FREQS*OCTAVES];	//running_integral as of the last op on each bin.  For ramjet8, make 12-bits
+static int32_t cossindata[MAX_FREQS*OCTAVES*2]; //Contains COS and SIN data, two slots per bin.  (32-bit for now, will be 16-bit, potentially even 8.)
+static uint8_t which_octave_for_op[MAX_FREQS]; //Per-note position in optable; counts up and wraps at NR_OF_OPS.  PUT IN RAM.
+static uint8_t actiontableplace; //Position in actiontable; wraps naturally at 256.
+
+#define NR_OF_OPS (4<<OCTAVES)
+//Format is:
+//  255 = DO NOT OPERATE
+// bits 0..3 unfolded octave, i.e. sin/cos are offset by one.
+// bit 4 = add or subtract; bit 5 = first hit of this octave/phase; bit 6 = odd phase (selects the second cossindata slot).
+static uint8_t  optable[NR_OF_OPS]; //PUT IN FLASH
+
+#define ACTIONTABLESIZE 256
+static uint16_t actiontable[ACTIONTABLESIZE]; //PUT IN FLASH // If there are more than 8 freqbins, this must be a uint16_t, otherwise if more than 16, 32.
+//Format is: bit n set = note n (of MAX_FREQS) performs an op on this sample.
+
+static uint8_t mulmux[MAX_FREQS];	//Per-note weight, 255 for the highest note.  PUT IN FLASH
+
+//Build mulmux, actiontable and optable from the per-bin frequencies.  Called once, on first use.
+//  frequencies: one entry per bin.  The top MAX_FREQS entries lay out actiontable; the first MAX_FREQS set the mulmux weights.
+//  bins: number of entries in frequencies; must be at least MAX_FREQS.  Exits the process on unusable input, otherwise returns 0.
+static int Setup( float * frequencies, int bins )
+{
+	int i;
+	printf( "BINS: %d\n", bins );
+	if( bins < MAX_FREQS ) { fprintf( stderr, "Error: Need at least %d bins.\n", MAX_FREQS ); exit( 1 ); } //Else bins-MAX_FREQS below is a negative index.
+	float highestf = frequencies[MAX_FREQS-1];
+	for( i = 0; i < MAX_FREQS; i++ )
+	{
+		mulmux[i] = (uint8_t)( highestf / frequencies[i] * 255 + 0.5 );
+		printf( "MM: %d  %f / %f\n", mulmux[i], frequencies[i], highestf );
+	}
+
+	for( i = bins-MAX_FREQS; i < bins; i++ )
+	{
+		int topbin = i - (bins-MAX_FREQS);
+		float f = frequencies[i]/FREQREBASE; 
+		float hits_per_table = (float)ACTIONTABLESIZE/f;
+		int dhrpertable = (int)(hits_per_table+.5);//TRICKY: You might think you need to have even number of hits (sin/cos), but you don't!  It can flip sin/cos each time through the table!
+		float err = (TARGFREQ/((float)ACTIONTABLESIZE/dhrpertable) - (float)TARGFREQ/f)/((float)TARGFREQ/f);
+		//Perform an op every X samples.  How well does this map into units of ACTIONTABLESIZE?
+		printf( "%d %f -> hits per %d: %f %d (%.2f%% error)\n", topbin, f, ACTIONTABLESIZE, (float)ACTIONTABLESIZE/f, dhrpertable, err * 100.0 );
+		if( dhrpertable >= ACTIONTABLESIZE )
+		{
+			fprintf( stderr, "Error: Too many hits.\n" );
+			exit(1);	//Nonzero: this is a failure exit.
+		}
+
+		float advance_per_step = dhrpertable/(float)ACTIONTABLESIZE;
+		float fvadv = 0.5;
+		int j;
+		int countset = 0;
+
+		//Tricky: We need to start fvadv off at such a place that there won't be a hiccup when going back around to 0.
+		//	I believe this is done by setting fvadv to 0.5 initially.  Unsure.
+
+		for( j = 0; j < ACTIONTABLESIZE; j++ )
+		{
+			if( fvadv >= 0.5 )
+			{
+				actiontable[j] |= 1<<topbin;
+				fvadv -= 1.0;
+				countset++;
+			}
+			fvadv += advance_per_step;
+		}
+		printf( "   countset: %d\n", countset );
+	}
+
+	int phaseinop[OCTAVES] = { 0 };
+	int already_hit_octaveplace[OCTAVES*2] = { 0 };
+	for( i = 0; i < NR_OF_OPS; i++ )
+	{
+		int longestzeroes = 255;	//255 = no set bit found (val == 0): the nop slot.
+		int val = i & ((1<<OCTAVES)-1);
+		if( val ) for( longestzeroes = 0; ((val >> longestzeroes) & 1) == 0; longestzeroes++ );	//Count trailing zero bits; skipped for 0 so the shift count stays in range.
+		//longestzeroes goes: 255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, ...
+		//This isn't great, because we need to also know whether we are attacking the SIN side or the COS side, and if it's + or -.
+		//We can actually decide that out.
+
+		if( longestzeroes == 255 )
+		{
+			//This is a nop.  Emit a nop.
+			optable[i] = 255;
+		}
+		else
+		{
+			longestzeroes = OCTAVES-1-longestzeroes;	//Actually do octave 0 least often.
+			int iop = phaseinop[longestzeroes]++;
+			int toop = longestzeroes;
+			int toopmon = (longestzeroes<<1) | (iop & 1);
+
+			//if it's the first time an octave happened this set, flag it. This may be used later in the process.
+			if( !already_hit_octaveplace[toopmon] )
+			{
+				already_hit_octaveplace[toopmon] = 1;
+				toop |= 1<<5;
+			}
+			if( iop & 1 )
+			{
+				toop |= 1<<6;
+			}
+
+			//Handle add/subtract bit.
+			if( iop & 2 ) toop |= 1<<4;
+
+			optable[i] = toop;
+
+			//printf( "  %d %d %d\n", iop, val, longestzeroes );
+		}
+		//printf( "HBT: %d = %d\n", i, optable[i] );
+	}
+
+	return 0;
+}
+
+
+void Small12BitRun( int8_t adcval )
+{
+	int16_t adcv = adcval;
+	adcv *= FRONTEND_AMPLITUDE;
+	if( adcv > 127 ) adcv = 127;
+	if( adcv < -128 ) adcv = -128;
+	running_integral += adcv>>INITIAL_DECIMATE;
+
+	uint32_t action = actiontable[actiontableplace++];
+	int n;
+	for( n = 0; n < MAX_FREQS; n++, action>>=1 )
+	{
+		if( !( action & 1 ) ) continue;
+
+		int ao = which_octave_for_op[n];
+		ao++;
+		if( ao >= NR_OF_OPS ) ao = 0;
+		which_octave_for_op[n] = ao;
+
+		int op = optable[ao];
+
+		if( op == 255 )
+			continue;
+
+		//int octaveplace = op & 0xf;
+
+		//Tricky: We share the integral with SIN and COS.
+		//We don't need to. It would produce a slightly cleaner signal. See: NOTE 3
+		uint8_t octave = op & 0xf;
+		uint8_t intindex = octave * MAX_FREQS + n;
+
+		//int invoct = OCTAVES-1-octaveplace;
+		int16_t diff;
+
+		if( op & 0x10 )	//ADD
+		{
+			diff = integral_at[intindex] - running_integral;
+		}
+		else	//SUBTRACT
+		{
+			diff = running_integral - integral_at[intindex];
+		}
+
+		integral_at[intindex] = running_integral;
+
+#ifdef TWELVEBIT
+		if( diff > 2000 || diff < -2000 ) printf( "!!!!!!!!!!!! %d !!!!!!!!!!!\n", diff );
+#elif defined( EIGHTBIT )
+		if( diff > 124 || diff < -124 ) printf( "!!!!!!!!!!!! %d !!!!!!!!!!!\n", diff );
+#endif
+
+		//uint8_t idx = ( intindex << 1 );
+		intindex<<=1;
+
+		if( op&(1<<6) )
+		{
+			intindex |= 1;
+		}
+
+		//printf( "%d: %d + %d * %d >> 8 - %d\n", intindex, cossindata[intindex], diff, mulmux[intindex/2], cossindata[intindex]>>4 );
+
+		uint8_t mulmuxval = mulmux[n];
+
+
+		//Do you live on a super lame processor? {NOTE 4}
+		//If you do, you might not have good signed multiply operations. So, an alternative mechanism is found here.
+		//	+) Able to more cleanly crush to an 8-bit multiply.
+		//	+) Gets extra bit of precision back, i.e. the sign bit is now used as a data bit.
+		//	-) More than 1 line of C code.  Requires possible double invert.
+#if 1
+		//Terrible processor, i.e. PMS133
+		if( 0 && diff < 0 )
+		{
+			diff *= -1;
+			diff >>= (OCTAVES-1-octave);
+
+			if( diff > 250 ) printf( "!!!!!!!**** %d ****!!!!!!!\n", diff );
+
+			diff = (uint16_t)diff * (uint16_t)mulmuxval;
+			diff >>= INTEGRATOR_DECIMATE;
+
+			diff *= -1;
+		}
+		else
+		{
+			diff >>= (OCTAVES-1-octave);
+
+			if( diff > 250 ) printf( "!!!!!!!**** %d ****!!!!!!!\n", diff );
+
+			diff = (uint16_t)diff * (uint16_t)mulmuxval;
+			diff >>= INTEGRATOR_DECIMATE;
+		}	
+#else
+		//Decent processor, i.e. ATTiny85.
+		diff = ((diff>>(OCTAVES-1-octave)) * mulmuxval ) >> 6;
+#endif
+		cossindata[intindex] = cossindata[intindex] 
+			+ diff
+			- (cossindata[intindex]>>4)
+			;
+
+#ifdef EIGHTBIT
+		if( cossindata[intindex] > 0 ) cossindata[intindex]--;
+		if( cossindata[intindex] < 0 ) cossindata[intindex]++;
+#endif
+	}
+
+}
+
+
+//Run every sample that arrived since the previous call through Small12BitRun, then write one magnitude per bin to outbins.
+//  databuffer is a ring of size_of_data_buffer floats; processing resumes where the last call stopped and ends just before place_in_data_buffer.
+//  q and speedup are not used by this implementation.
+void DoDFT12BitSmall( float * outbins, float * frequencies, int bins, const float * databuffer, int place_in_data_buffer, int size_of_data_buffer, float q, float speedup )
+{
+	static int is_setup;
+	if( !is_setup ) { is_setup = 1; Setup( frequencies, bins ); }
+	static int last_place;
+	int i;
+	for( i = last_place; i != place_in_data_buffer; i = (i+1)%size_of_data_buffer )
+	{
+		float s = databuffer[i]; if( s > 1 ) s = 1; if( s < -1 ) s = -1; //Clamp: a sample outside +/-1 would otherwise wrap when narrowed to the int8_t argument.
+		Small12BitRun( ((int16_t)( s * 4095 ))>>5 ); //5 = Actually only feed algorithm numbers from -128 to 127.
+	}
+	last_place = place_in_data_buffer;
+	//cossindata holds MAX_FREQS*OCTAVES sin/cos pairs; bins beyond that are reported as 0 instead of reading past the array.
+	const int validbins = ( bins < MAX_FREQS*OCTAVES ) ? bins : MAX_FREQS*OCTAVES;
+#if 1
+	for( i = 0; i < bins; i++ )
+	{
+		int iss = ( i < validbins ) ? ( cossindata[i*2+0]>>FINAL_DECIMATE ) : 0;
+		int isc = ( i < validbins ) ? ( cossindata[i*2+1]>>FINAL_DECIMATE ) : 0;
+		int mux = iss * iss + isc * isc;
+
+		if( mux <= 0 ) 
+		{
+			outbins[i] = 0;
+		}
+		else
+		{
+			outbins[i] = sqrt((float)mux)/50.0;
+#ifdef TWELVEBIT
+		if( abs( cossindata[i*2+0] ) > 1000 || abs( cossindata[i*2+1] ) > 1000 )
+			printf( "CS OVF %d/%d/%d/%f\n", i, cossindata[i*2+0], cossindata[i*2+1],outbins[i] );
+#elif defined( EIGHTBIT )
+		if( abs( cossindata[i*2+0] ) > 120 || abs( cossindata[i*2+1] ) > 120 )
+			printf( "CS OVF %d/%d/%d/%f\n", i, cossindata[i*2+0], cossindata[i*2+1],outbins[i] );
+#endif
+		}
+	} 
+#endif
+}
+
+
diff --git a/embeddedcommon/DFT12Small.h b/embeddedcommon/DFT12Small.h
new file mode 100644
index 0000000..13506e6
--- /dev/null
+++ b/embeddedcommon/DFT12Small.h
@@ -0,0 +1,9 @@
+#ifndef _DFT12SMALL_H
+#define _DFT12SMALL_H
+
+/* DFT computed with a square-wave kernel (see DFT12Small.c).  Note: Frequencies must be precompiled. */
+
+void DoDFT12BitSmall( float * outbins, float * frequencies, int bins, const float * databuffer, int place_in_data_buffer, int size_of_data_buffer, float q, float speedup );
+
+#endif
+
diff --git a/embeddedcommon/DFT32.c b/embeddedcommon/DFT32.c
index 21df4dd..b587c6c 100644
--- a/embeddedcommon/DFT32.c
+++ b/embeddedcommon/DFT32.c
@@ -353,6 +353,3 @@ void DoDFTProgressive32( float * outbins, float * frequencies, int bins, const f
 
 #endif
 
-
-
-
diff --git a/embeddedcommon/DFT8Padauk.c b/embeddedcommon/DFT8Padauk.c
new file mode 100644
index 0000000..0194799
--- /dev/null
+++ b/embeddedcommon/DFT8Padauk.c
@@ -0,0 +1,360 @@
+//NOTE DO NOT EDIT THIS FILE WITHOUT ALSO EDITING DFT12SMALL!!!
+//WARNING: DFT8Turbo and DFT12Small are currently the only ones that actually work.
+//THIS FILE DOES NOT CURRENTLY WORK.
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "DFT8Turbo.h"
+#include "DFT8Padauk.h"
+#include <math.h>
+#include <stdio.h>
+
+#define MAX_FREQS (12)
+#define OCTAVES   (4)
+/* Backporting notes:
+	* Change loop to only check if the output table says it's complete.
+	* Pre-multiply octaves in optable.
+*/
+
+/*
+	General procedure - use this code, with uint16_t or uint32_t buffers, and make sure none of the alarms go off.
+		All of the paths still require no more than an 8-bit multiply.
+		You should test with extreme cases, like square wave sweeps in, etc.
+*/
+
+//No larger than 8-bit signed values for integration or sincos
+#define FRONTEND_AMPLITUDE (2)
+#define INITIAL_DECIMATE (5) //Yurgh... only 3 bits of ADC data.  That's 8 unique levels :(
+#define INTEGRATOR_DECIMATE (8)
+#define FINAL_DECIMATE (1)
+
+
+#define OPTABLETYPE uint16_t	//Make uint8_t if on attiny.
+
+//4x the hits (sin/cos and we need to do it once for each edge)
+//8x for selecting a higher octave.
+#define FREQREBASE 8.0 
+#define TARGFREQ 10000.0
+
+//These live in RAM.
+static int8_t running_integral; //Running sum of the (decimated) input samples.  8-bit in this variant.
+static int8_t integral_at[MAX_FREQS*OCTAVES];	//running_integral as of the last op on each bin.
+static int8_t cossindata[MAX_FREQS*OCTAVES*2]; //Contains COS and SIN data, two slots per bin.  (8-bit in this variant.)
+static uint8_t which_octave_for_op[MAX_FREQS]; //Per-note position in optable; counts up.  PUT IN RAM.
+static uint8_t actiontableplace; //Position in actiontable; wraps naturally at 256.
+
+#define NR_OF_OPS (4<<OCTAVES) /*64*/
+//Format, as written by Setup (NOTE(review): Padauk8BitRun tests bit 7 for add and masks 0x1f for the octave - reconcile):
+//  65535 = DO NOT OPERATE
+// bits 0..3 = which octave
+// bit 4 = add or subtract.
+// bit 5 = first hit of this octave/phase
+// bit 6 = odd phase (iop & 1)
+// bits 8..15 = octave base offset (octave*MAX_FREQS*2 + odd-phase bit).
+static OPTABLETYPE  optable[NR_OF_OPS]; //PUT IN FLASH
+
+#define ACTIONTABLESIZE 256
+static uint16_t actiontable[ACTIONTABLESIZE]; //PUT IN FLASH // If there are more than 8 freqbins, this must be a uint16_t, otherwise if more than 16, 32.
+//Format is: bit (MAX_FREQS-1-n) set = note n (of MAX_FREQS) performs an op on this sample.
+
+static OPTABLETYPE mulmux[MAX_FREQS];	//Per-note weight, 255 for the highest note.  PUT IN FLASH
+
+//Build mulmux, actiontable and optable from the per-bin frequencies.  Called once, on first use.
+//  frequencies: one entry per bin.  The top MAX_FREQS entries lay out actiontable; the first MAX_FREQS set the mulmux weights.
+//  bins: number of entries in frequencies; must be at least MAX_FREQS.  Exits the process on unusable input, otherwise returns 0.
+static int Setup( float * frequencies, int bins )
+{
+	int i;
+	printf( "BINS: %d\n", bins );
+	if( bins < MAX_FREQS ) { fprintf( stderr, "Error: Need at least %d bins.\n", MAX_FREQS ); exit( 1 ); } //Else bins-MAX_FREQS below is a negative index.
+	float highestf = frequencies[MAX_FREQS-1];
+	for( i = 0; i < MAX_FREQS; i++ )
+	{
+		mulmux[i] = (uint8_t)( highestf / frequencies[i] * 255 + 0.5 );
+		printf( "MM: %d  %f / %f\n", mulmux[i], frequencies[i], highestf );
+	}
+
+	for( i = bins-MAX_FREQS; i < bins; i++ )
+	{
+		int topbin = i - (bins-MAX_FREQS);
+		float f = frequencies[i]/FREQREBASE; 
+		float hits_per_table = (float)ACTIONTABLESIZE/f;
+		int dhrpertable = (int)(hits_per_table+.5);//TRICKY: You might think you need to have even number of hits (sin/cos), but you don't!  It can flip sin/cos each time through the table!
+		float err = (TARGFREQ/((float)ACTIONTABLESIZE/dhrpertable) - (float)TARGFREQ/f)/((float)TARGFREQ/f);
+		//Perform an op every X samples.  How well does this map into units of ACTIONTABLESIZE?
+		printf( "%d %f -> hits per %d: %f %d (%.2f%% error)\n", topbin, f, ACTIONTABLESIZE, (float)ACTIONTABLESIZE/f, dhrpertable, err * 100.0 );
+		if( dhrpertable >= ACTIONTABLESIZE )
+		{
+			fprintf( stderr, "Error: Too many hits.\n" );
+			exit(1);	//Nonzero: this is a failure exit.
+		}
+
+		float advance_per_step = dhrpertable/(float)ACTIONTABLESIZE;
+		float fvadv = 0.5;
+		int j;
+		int countset = 0;
+
+		//Tricky: We need to start fvadv off at such a place that there won't be a hiccup when going back around to 0.
+		//	I believe this is done by setting fvadv to 0.5 initially.  Unsure.
+
+		for( j = 0; j < ACTIONTABLESIZE; j++ )
+		{
+			if( fvadv >= 0.5 )
+			{
+				actiontable[j] |= 1<<(MAX_FREQS-1-topbin);	//XXX-DEPARTURE (reversing the table symbols)
+				fvadv -= 1.0;
+				countset++;
+			}
+			fvadv += advance_per_step;
+		}
+		printf( "   countset: %d\n", countset );
+	}
+
+	int phaseinop[OCTAVES] = { 0 };
+	int already_hit_octaveplace[OCTAVES*2] = { 0 };
+	for( i = 0; i < NR_OF_OPS; i++ )
+	{
+		int longestzeroes = 255;	//255 = no set bit found (val == 0): the nop slot.
+		int val = i & ((1<<OCTAVES)-1);
+		if( val ) for( longestzeroes = 0; ((val >> longestzeroes) & 1) == 0; longestzeroes++ );	//Count trailing zero bits; skipped for 0 so the shift count stays in range.
+		//longestzeroes goes: 255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, ...
+		//This isn't great, because we need to also know whether we are attacking the SIN side or the COS side, and if it's + or -.
+		//We can actually decide that out.
+
+		if( longestzeroes == 255 )
+		{
+			//This is a nop.  Emit a nop.
+			optable[i] = 65535;
+		}
+		else
+		{
+			longestzeroes = OCTAVES-1-longestzeroes;	//Actually do octave 0 least often.
+			int iop = phaseinop[longestzeroes]++;
+			int toop = longestzeroes;
+			int toopmon = (longestzeroes<<1) | (iop & 1);
+
+			//if it's the first time an octave happened this set, flag it. This may be used later in the process.
+			if( !already_hit_octaveplace[toopmon] )
+			{
+				already_hit_octaveplace[toopmon] = 1;
+				toop |= 1<<5;
+			}
+
+			if( iop & 1 )
+			{
+				toop |= 1<<6;
+			}
+
+			//Handle add/subtract bit.
+			if( iop & 2 ) toop |= 1<<4;
+
+			optable[i] = toop | ((longestzeroes*MAX_FREQS*2+(iop & 1))<<8);
+
+			//printf( "  %d %d %d\n", iop, val, longestzeroes );
+		}
+		//printf( "HBT: %d = %d\n", i, optable[i] );
+	}
+
+	return 0;
+}
+
+
+static uint16_t action;
+static uint8_t note;
+static uint8_t * memptr;
+static uint16_t * romptr;
+static uint8_t op;
+static uint8_t note_offset; //Offset of current note.
+static uint8_t octave;
+static uint8_t intindex;
+static int8_t diff;
+static uint8_t tmp;
+
+void Padauk8BitRun( int8_t adcval )
+{
+	int16_t adcv = adcval;
+	adcv *= FRONTEND_AMPLITUDE;
+	if( adcv > 127 ) adcv = 127;
+	if( adcv < -128 ) adcv = -128;
+	running_integral += adcv>>INITIAL_DECIMATE;
+
+	uint8_t acc;
+	uint8_t * accM;
+	uint8_t mul2;
+
+	action = actiontable[actiontableplace++];
+
+	//Counts are approximate counts for PMS133
+
+	for( note = MAX_FREQS; 	
+		 note; 				//1CYC/PAIRED
+		 note--, 			//1CYC/PAIRED (dzsn)
+			action>>=1 		//2CYC (slc x2)
+		)
+	{
+		//Everything inside this loop is executed ~3/4 * MAX_FREQS per audio sample. so.. ~9x.
+		//If op @ 4MHz, we get 44 cycles in here.  I don't think we can do it.
+
+		//If no operation is scheduled, continue.
+		if( !( action & 1 ) ) continue;		//1CYC
+
+		accM = which_octave_for_op - 1;			//1CYC
+		accM = accM + note;						//1CYC
+		//accM now points to the memory address containing which step we're on.
+		//We can use that to figure out which octave we should operate with.
+		memptr = accM;							//1CYC
+		acc = *memptr;							//2CYC (idxm)
+		acc++;									//1CYC
+		//acc now contains the actual place we are indexing off of.
+		//If it overflows, be sure to reset it.
+		if( acc == NR_OF_OPS+1 )
+		{
+			acc = 1;
+			continue;
+		}
+		//We then update the memory with the new data.
+		*memptr = acc;							//2CYC (idxm)
+
+		//Now, we look up in optable what we're supposed to do.
+		accM = ((uint8_t*)optable) + acc*2;		//1CYC -> ROM dad is stored in word pairs.
+		romptr = (uint16_t*)accM;				//1CYC
+		acc = *romptr;							//2CYC (ldtabl) 
+
+		//If we are on the one operation we aren't supposed to operate within, we should cancel and loop around.
+		//XXX XXX XXX XXX XXX This is wrong.  We should probably handle this logic above.
+		//XXX XXX XXX XXX XXX Logic handled above. XXX PICK UP HERE!!!
+		printf( "+ %d %d %d\n", note, acc, *memptr );
+		//if( acc == 255 )						//2CYC
+		//{
+		//	//This way, when we loop back around, it will be at index 0, and everything should flow gracefully.
+		//	*memptr = 255;
+		//	continue;			
+		//}
+		if( acc == 255 )
+		{
+			//We dun goofed.
+			fprintf( stderr, "Goofed.\n" );
+			exit( 0 );
+		}
+
+		//This actually reads the current octave specifier into "op"
+		//BIT7: add or subtract
+		//BIT6: reset
+		//BIT5: Even or odd?
+		//BITS 0..4 = Which octave.
+		op = acc;								//1CYC	
+
+		acc = (*romptr)>>8;						//2CYC (ldtabh)  -> Contains memory offset of which note to use.
+		note_offset = acc;
+		acc = acc + note;						//1CYC
+		accM = (uint8_t*)integral_at-1 + acc;	//1CYC
+		memptr = accM;							//1CYC
+		acc = *memptr;							//2CYC idxm
+
+		//acc now contains the running integral of the last time we were on this cell.
+		if( op & (1<<7) )	//ADD				//2CYC
+		{
+			acc = acc - running_integral;		//1CYC
+		}
+		else	//SUBTRACT
+		{
+			tmp = acc;							//1CYC
+			acc = running_integral;				//1CYC
+			acc = acc - tmp;					//1CYC
+		}
+
+		diff = acc;								//1CYC
+
+		//Assume 2 extra cycles of overhead for if/else.	//2 CYC
+
+		acc = running_integral;					//1CYC
+		//Store the current running integral back into this note's running integral for next time.
+		*memptr = acc;							//2CYC
+
+		// op = info about what op we're on. WARNING: Bitfield.
+		// diff = how much to add to current value.
+		// note_offset = index of current operative note position.
+		octave = op & 0x1f; //XXX TODO
+
+		printf( "%d %d %d %d\n", op, diff, note_offset, octave );
+		accM = (uint8_t*)(mulmux - 1);			//1CYC
+		accM = accM + note*2;					//1CYC
+		romptr = accM;							//1CYC
+		acc = *romptr;							//2CYC
+		mul2 = acc;								//1CYC
+
+		if( diff < 0 )							//[2CYC] (t0sn on MSB)
+		{
+			diff *= -1;							//[1CYC] (neg M)
+			diff >>= (OCTAVES-1-octave); // ???TRICKY???  Should this be a multiply?
+
+			//if( diff > 250 ) printf( "!!!!!!!**** %d ****!!!!!!!\n", diff );
+
+			diff = ((uint16_t)diff * (uint16_t)mul2)>>INTEGRATOR_DECIMATE; //[3CYC]
+			diff *= -1; //[1CYC]
+		}
+		else
+		{
+			diff >>= (OCTAVES-1-octave);
+			//if( diff > 250 ) printf( "!!!!!!!**** %d ****!!!!!!!\n", diff );
+			diff = ((uint16_t)diff * (uint16_t)mul2)>>INTEGRATOR_DECIMATE;
+		}	
+
+		//@48 cycles :( :( :(
+
+		//printf( "%d\n", diff );
+
+		int8_t tmp = 
+			cossindata[intindex] 	//[3CYC]
+			+ diff					//[1CYC]
+			- (cossindata[intindex]>>4)	//[2CYC]
+			;
+
+		if( tmp > 0 ) tmp--;	//2CYC
+		if( tmp < 0 ) tmp++;	//2CYC
+		cossindata[intindex] = tmp;	//2CYC
+		//60ish cycles :( :( :(
+	}
+}
+
+
+//Run every sample that arrived since the previous call through Padauk8BitRun, then write one magnitude per bin to outbins.
+//  databuffer is a ring of size_of_data_buffer floats; processing resumes where the last call stopped and ends just before place_in_data_buffer.
+//  q and speedup are not used by this implementation.
+void DoDFT8BitPadauk( float * outbins, float * frequencies, int bins, const float * databuffer, int place_in_data_buffer, int size_of_data_buffer, float q, float speedup )
+{
+	static int is_setup;
+	if( !is_setup ) { is_setup = 1; Setup( frequencies, bins ); }
+	static int last_place;
+	int i;
+	for( i = last_place; i != place_in_data_buffer; i = (i+1)%size_of_data_buffer )
+	{
+		float s = databuffer[i]; if( s > 1 ) s = 1; if( s < -1 ) s = -1; //Clamp: a sample outside +/-1 would otherwise wrap when narrowed to the int8_t argument.
+		Padauk8BitRun( ((int16_t)( s * 4095 ))>>5 ); //5 = Actually only feed algorithm numbers from -128 to 127.
+	}
+	last_place = place_in_data_buffer;
+	//cossindata holds MAX_FREQS*OCTAVES sin/cos pairs; bins beyond that are reported as 0 instead of reading past the array.
+	const int validbins = ( bins < MAX_FREQS*OCTAVES ) ? bins : MAX_FREQS*OCTAVES;
+#if 1
+	for( i = 0; i < bins; i++ )
+	{
+		int iss = ( i < validbins ) ? ( cossindata[i*2+0]>>FINAL_DECIMATE ) : 0;
+		int isc = ( i < validbins ) ? ( cossindata[i*2+1]>>FINAL_DECIMATE ) : 0;
+		int mux = iss * iss + isc * isc;
+
+		if( mux <= 0 ) 
+		{
+			outbins[i] = 0;
+		}
+		else
+		{
+			outbins[i] = sqrt((float)mux)/50.0;
+			if( abs( cossindata[i*2+0] ) > 120 || abs( cossindata[i*2+1] ) > 120 )
+				printf( "CS OVF %d/%d/%d/%f\n", i, cossindata[i*2+0], cossindata[i*2+1],outbins[i] );
+
+		}
+	} 
+#endif
+}
+
+
diff --git a/embeddedcommon/DFT8Padauk.h b/embeddedcommon/DFT8Padauk.h
new file mode 100644
index 0000000..cb6387a
--- /dev/null
+++ b/embeddedcommon/DFT8Padauk.h
@@ -0,0 +1,9 @@
+#ifndef _DFT8PADAUK_H
+#define _DFT8PADAUK_H
+//Entry point of the 8-bit Padauk DFT (implemented in DFT8Padauk.c).
+/* Note: Frequencies must be precompiled. */
+//Consumes databuffer (a ring of size_of_data_buffer floats) up to place_in_data_buffer and writes `bins` magnitudes to outbins.  q and speedup are not referenced by the implementation.
+void DoDFT8BitPadauk( float * outbins, float * frequencies, int bins, const float * databuffer, int place_in_data_buffer, int size_of_data_buffer, float q, float speedup );
+
+#endif
+
diff --git a/embeddedcommon/DFT8Turbo.c b/embeddedcommon/DFT8Turbo.c
new file mode 100644
index 0000000..6645a41
--- /dev/null
+++ b/embeddedcommon/DFT8Turbo.c
@@ -0,0 +1,312 @@
+//NOTE DO NOT EDIT THIS FILE WITHOUT ALSO EDITING DFT12SMALL!!!
+
+#include <stdint.h>
+#include <stdlib.h>
+#include "DFT8Turbo.h"
+#include <math.h>
+
+#include <stdio.h>
+
+#define MAX_FREQS (12)	//Frequency bins serviced per sample (loop bound in Turbo8BitRun, one bit each in an actiontable entry).
+#define OCTAVES   (4)	//Octaves stacked on every bin; the RAM arrays below are sized MAX_FREQS*OCTAVES.
+
+/*
+	General procedure - use this code, with uint16_t or uint32_t buffers, and make sure none of the alarms go off.
+		All of the paths still require no more than an 8-bit multiply.
+		You should test with extreme cases, like square wave sweeps in, etc.
+*/
+
+//No larger than 8-bit signed values for integration or sincos
+#define FRONTEND_AMPLITUDE (2)	//Gain applied to each incoming sample before it is clamped to -128..127.
+#define INITIAL_DECIMATE (5) //Yurgh... only 3 bits of ADC data.  That's 8 unique levels :(
+#define INTEGRATOR_DECIMATE (8)	//Right shift applied after multiplying an integral difference by its mulmux[] gain.
+#define FINAL_DECIMATE (1)	//Right shift applied to each accumulator before the output magnitude is computed.
+
+
+#define OPTABLETYPE uint16_t	//Make uint8_t if on attiny.
+
+//4x the hits (sin/cos and we need to do it once for each edge)
+//8x for selecting a higher octave.
+#define FREQREBASE 8.0 
+#define TARGFREQ 10000.0	//In this file only used by Setup() when printing the scheduling error.
+
+//These live in RAM.
+int8_t running_integral; //Realistically treat as 12-bits on ramjet8
+int8_t integral_at[MAX_FREQS*OCTAVES];	//For ramjet8, make 12-bits
+int8_t cossindata[MAX_FREQS*OCTAVES*2]; //Contains COS and SIN data as two interleaved int8_t accumulators per (octave, bin): index = 2*(octave*MAX_FREQS+bin) + 0/1.
+uint8_t which_octave_for_op[MAX_FREQS]; //counts up, per bin; it is the current index into optable[], which tells you which octave you are operating on.  PUT IN RAM.
+uint8_t actiontableplace;	//Current index into actiontable[]; post-incremented once per sample.
+
+#define NR_OF_OPS (4<<OCTAVES) /*64*/
+//Format is:
+//  255 = DO NOT OPERATE
+// bits 0..3 = octave to operate on.
+// bit 4 = add or subtract.  bit 5 = first op of its octave/phase slot (written by Setup, not read by Turbo8BitRun).  bit 6 = selects the second accumulator of the pair.
+OPTABLETYPE  optable[NR_OF_OPS]; //PUT IN FLASH
+
+#define ACTIONTABLESIZE 256
+uint16_t actiontable[ACTIONTABLESIZE]; //PUT IN FLASH // If there are more than 8 freqbins, this must be a uint16_t, otherwise if more than 16, 32.
+//Format is: bit n set in actiontable[t] = frequency bin n gets serviced on table step t.
+
+OPTABLETYPE mulmux[MAX_FREQS];	//PUT IN FLASH.  Per-bin gain, filled by Setup() with values 0..255.
+
+static int Setup( float * frequencies, int bins )
+{	//One-time table builder: fills mulmux[], actiontable[] and optable[] from frequencies[0..bins-1].  Always returns 0; exits the process with a failure status on unusable input.
+	int i;
+	printf( "BINS: %d\n", bins );
+	if( bins < MAX_FREQS ) { fprintf( stderr, "Error: Need at least %d bins.\n", MAX_FREQS ); exit( EXIT_FAILURE ); }	//Guards the frequencies[] reads below against negative / out-of-range indices.
+	float highestf = frequencies[MAX_FREQS-1];
+	for( i = 0; i < MAX_FREQS; i++ )
+	{
+		mulmux[i] = (uint8_t)( highestf / frequencies[i] * 255 + 0.5 );	//Gain relative to entry MAX_FREQS-1, which gets 255.
+		printf( "MM: %d  %f / %f\n", mulmux[i], frequencies[i], highestf );
+	}
+	//Build the schedule from the last MAX_FREQS entries of frequencies[]; bit `topbin` of actiontable[j] marks the steps on which that bin is serviced.
+	for( i = bins-MAX_FREQS; i < bins; i++ )
+	{
+		int topbin = i - (bins-MAX_FREQS);
+		float f = frequencies[i]/FREQREBASE; 
+		float hits_per_table = (float)ACTIONTABLESIZE/f;
+		int dhrpertable = (int)(hits_per_table+.5);//TRICKY: You might think you need to have even number of hits (sin/cos), but you don't!  It can flip sin/cos each time through the table!
+		float err = (TARGFREQ/((float)ACTIONTABLESIZE/dhrpertable) - (float)TARGFREQ/f)/((float)TARGFREQ/f);
+		//Perform an op every X samples.  How well does this map into units of ACTIONTABLESIZE?
+		printf( "%d %f -> hits per %d: %f %d (%.2f%% error)\n", topbin, f, ACTIONTABLESIZE, (float)ACTIONTABLESIZE/f, dhrpertable, err * 100.0 );
+		if( dhrpertable >= ACTIONTABLESIZE )
+		{
+			fprintf( stderr, "Error: Too many hits.\n" );
+			exit( EXIT_FAILURE );	//This is a fatal configuration error, so do not report success to the shell.
+		}
+
+		float advance_per_step = dhrpertable/(float)ACTIONTABLESIZE;
+		float fvadv = 0.5;
+		int j;
+		int countset = 0;
+
+		//Tricky: We need to start fvadv off at such a place that there won't be a hiccup when going back around to 0.
+		//	I believe this is done by setting fvadv to 0.5 initially.  Unsure.
+		//Error-accumulator spreading: emit a hit whenever the accumulated fraction reaches 0.5, then take one whole hit back off.
+		for( j = 0; j < ACTIONTABLESIZE; j++ )
+		{
+			if( fvadv >= 0.5 )
+			{
+				actiontable[j] |= 1<<topbin;
+				fvadv -= 1.0;
+				countset++;
+			}
+			fvadv += advance_per_step;
+		}
+		printf( "   countset: %d\n", countset );
+	}
+	//Second table: optable[] tells each serviced bin which octave / accumulator / sign to use on its i-th visit.
+
+
+	int phaseinop[OCTAVES] = { 0 };	//How many ops have been emitted so far for each octave.
+	int already_hit_octaveplace[OCTAVES*2] = { 0 };
+	for( i = 0; i < NR_OF_OPS; i++ )
+	{
+		int longestzeroes = 255;	//255 = sentinel for "val has no set bit".
+		int val = i & ((1<<OCTAVES)-1);
+		if( val ) for( longestzeroes = 0; ((val >> longestzeroes) & 1) == 0; longestzeroes++ );	//Count trailing zero bits; never shifts by more than OCTAVES-1 (shifting an int by >= its width is undefined).
+		//longestzeroes goes: 255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, ...
+		//This isn't great, because we need to also know whether we are attacking the SIN side or the COS side, and if it's + or -.
+		//We can actually decide that out.
+
+		if( longestzeroes == 255 )
+		{
+			//This is a nop.  Emit a nop.
+			optable[i] = 255;
+		}
+		else
+		{
+			longestzeroes = OCTAVES-1-longestzeroes;	//Actually do octave 0 least often.
+			int iop = phaseinop[longestzeroes]++;
+			int toop = longestzeroes;
+			int toopmon = (longestzeroes<<1) | (iop & 1);
+
+			//if it's the first time an octave happened this set, flag it. This may be used later in the process.
+			if( !already_hit_octaveplace[toopmon] )
+			{
+				already_hit_octaveplace[toopmon] = 1;
+				toop |= 1<<5;
+			}
+			if( iop & 1 )
+			{
+				toop |= 1<<6;	//Odd visits go to the second accumulator of the pair.
+			}
+
+			//Handle add/subtract bit.
+			if( iop & 2 ) toop |= 1<<4;
+
+			optable[i] = toop;
+
+			//printf( "  %d %d %d\n", iop, val, longestzeroes );
+		}
+		//printf( "HBT: %d = %d\n", i, optable[i] );
+	}
+	//exit(1);
+
+	return 0;
+}
+
+
+void Turbo8BitRun( int8_t adcval )
+{	//Consumes one sample: updates running_integral, then for every bin flagged in the current actiontable entry advances that bin's op sequence and folds the integral delta into one int8_t accumulator in cossindata[].
+	int16_t adcv = adcval;
+	adcv *= FRONTEND_AMPLITUDE;
+	if( adcv > 127 ) adcv = 127;
+	if( adcv < -128 ) adcv = -128;
+	running_integral += adcv>>INITIAL_DECIMATE;
+	//actiontableplace is a uint8_t and ACTIONTABLESIZE is 256, so the index wraps around the table by itself.
+	uint16_t action = actiontable[actiontableplace++];
+	uint8_t n;
+
+	//Counts are approximate counts for PMS133
+
+	for( n = 0; 			//1CYC
+		 n < MAX_FREQS; 	//2CYC
+		 n++, 				//1CYC
+			action>>=1 		//2CYC
+		)
+	{
+		//Everything inside this loop is executed ~3/4 * MAX_FREQS per audio sample. so.. ~9x.
+		//If op @ 4MHz, we get 44 cycles in here.
+
+		//If no operation is scheduled, continue.
+		if( !( action & 1 ) ) continue;		//1CYC
+		//Step this bin's private position in optable[], wrapping at NR_OF_OPS.
+		uint8_t ao = which_octave_for_op[n];	//4CYC
+		ao++;									//1CYC
+		if( ao >= NR_OF_OPS ) ao = 0;			//2CYC
+		which_octave_for_op[n] = ao;			//2CYC (idxm)
+
+		uint8_t op = optable[ao];				//"theoretically" 3CYC (if you align things right)
+												//1CYC (Put A into specific RAM location)
+
+		//If we are on the one thing we aren't supposed to operate within, cancel.
+		if( op == 255 )	continue;				//2CYC (if op is in A)
+
+		//Tricky: We share the integral with SIN and COS.
+		//We don't need to. It would produce a slightly cleaner signal. See: NOTE 3
+		uint8_t octave = op & 0xf;				//1CYC (if op is in A)
+
+
+		uint8_t intindex = octave * MAX_FREQS //Load mulop with 12 [2CYC]; mul [1CYC]
+			 + n;								//Add [1CYC]
+												//[1CYC] more cycle to write A into RAM[(intindex)
+		//int invoct = OCTAVES-1-octaveplace;
+		int8_t diff;
+		//diff = change of the running integral since this (octave, bin) was last visited, with the sign chosen by bit 4 of op.
+		if( op & 0x10 )	//ADD		//2CYC
+		{
+			diff = integral_at[intindex]		//Assume "IntIndex" is in A, add integral_at to A [1], move A to an index [1]. [2] to read into acc. [4CYC]
+				 - running_integral;			//1CYC to subtract.
+												//1CYC to write diff into a memory location.
+		}
+		else	//SUBTRACT
+		{
+			diff = running_integral - integral_at[intindex];
+		}
+
+		//30 cycles so far.
+		//Remember the integral at this visit so the next visit only sees what accumulated since.
+		integral_at[intindex] = running_integral;	//[3CYC]
+
+		//if( diff > 124 || diff < -124 ) printf( "!!!!!!!!!!!! %d !!!!!!!!!!!\n", diff );
+		
+		//uint8_t idx = ( intindex << 1 );	//Overwrite intindex.
+		intindex <<= 1; //1CYC
+		//From here on intindex addresses cossindata[]; bit 6 of op selects the second accumulator of the pair.
+		if( op&(1<<6) )	//2CYC
+		{
+			intindex |= 1; //1CYC
+		}
+
+		uint8_t mulmuxval = mulmux[n];	//[4CYC]
+
+
+		//Do you live on a super lame processor? {NOTE 4}
+		//If you do, you might not have good signed multiply operations. So, an alternative mechanism is found here.
+		//	+) Able to more cleanly crush to an 8-bit multiply.
+		//	+) Gets extra bit of precision back, i.e. the sign bit is now used as a data bit.
+		//	-) More than 1 line of C code.  Requires possible double invert.
+#if 1
+		//rough processor, i.e. PMS133
+		if( diff < 0 )		//[2CYC]
+		{
+			diff *= -1;		//[1CYC]
+			diff >>= (OCTAVES-1-octave); // ???TRICKY???  Should this be a multiply?
+
+			//if( diff > 250 ) printf( "!!!!!!!**** %d ****!!!!!!!\n", diff );
+
+			diff = ((uint16_t)diff * (uint16_t)mulmuxval)>>INTEGRATOR_DECIMATE; //[3CYC]
+			diff *= -1; //[1CYC]
+		}
+		else
+		{
+			diff >>= (OCTAVES-1-octave);
+			//if( diff > 250 ) printf( "!!!!!!!**** %d ****!!!!!!!\n", diff );
+			diff = ((uint16_t)diff * (uint16_t)mulmuxval)>>INTEGRATOR_DECIMATE;
+		}	
+
+		//@48 cycles :( :( :(
+
+#else
+		//Decent processor, i.e. ATTiny85.  NOTE(review): this disabled path shifts by 6, not by INTEGRATOR_DECIMATE (8) like the path above -- confirm before enabling.
+		diff = ((diff>>(OCTAVES-1-octave)) * mulmuxval ) >> 6;
+#endif
+		//printf( "%d\n", diff );
+		//Leaky integration: add the scaled diff and bleed off 1/16 of the stored value.
+		int8_t tmp = 
+			cossindata[intindex] 	//[3CYC]
+			+ diff					//[1CYC]
+			- (cossindata[intindex]>>4)	//[2CYC]
+			;
+		//Then pull a non-zero result one extra count towards zero.
+		if( tmp > 0 ) tmp--;	//2CYC
+		if( tmp < 0 ) tmp++;	//2CYC
+		cossindata[intindex] = tmp;	//2CYC
+		//60ish cycles :( :( :(
+	}
+}
+
+
+void DoDFT8BitTurbo( float * outbins, float * frequencies, int bins, const float * databuffer, int place_in_data_buffer, int size_of_data_buffer, float q, float speedup )
+{	//Driver: pushes newly arrived samples through Turbo8BitRun(), then converts the accumulators into outbins[0..bins-1].  q and speedup are not referenced.
+	static int is_setup;
+	if( !is_setup ) { is_setup = 1; Setup( frequencies, bins ); }	//Tables are built once, on the first call only.
+	static int last_place;	//Ring-buffer position where the previous call stopped.
+	int i;
+	//Walk the ring buffer from where we left off up to (not including) place_in_data_buffer, wrapping at size_of_data_buffer.
+	for( i = last_place; i != place_in_data_buffer; i = (i+1)%size_of_data_buffer )
+	{
+		int16_t ifr1 = (int16_t)( ((databuffer[i]) ) * 4095 );	//Scale by 4095 and truncate; assumes samples lie in about -1..1 -- TODO confirm.
+		Turbo8BitRun( ifr1>>5 ); //5 = Actually only feed algorithm numbers from -128 to 127.
+	}
+	last_place = place_in_data_buffer;
+
+	//cossindata[] only holds MAX_FREQS*OCTAVES accumulator pairs.  The caller may ask for more bins than that,
+	//so bins past the end of the table are reported as 0 instead of being read from beyond the array.
+#if 1
+	for( i = 0; i < bins; i++ )
+	{
+		int iss = ( i < MAX_FREQS*OCTAVES ) ? cossindata[i*2+0]>>FINAL_DECIMATE : 0;
+		int isc = ( i < MAX_FREQS*OCTAVES ) ? cossindata[i*2+1]>>FINAL_DECIMATE : 0;
+		int mux = iss * iss + isc * isc;	//Squared magnitude of the accumulator pair for bin i.
+
+		if( mux <= 0 ) 
+		{
+			outbins[i] = 0;
+		}
+		else
+		{
+			outbins[i] = sqrt((float)mux)/50.0;
+			//mux > 0 implies i is inside the table, so these reads are in range.  Report accumulators close to the int8_t limit.
+			if( abs( cossindata[i*2+0] ) > 120 || abs( cossindata[i*2+1] ) > 120 )
+				printf( "CS OVF %d/%d/%d/%f\n", i, cossindata[i*2+0], cossindata[i*2+1],outbins[i] );
+
+		}
+	} 
+#endif
+}
+
+
diff --git a/embeddedcommon/DFT8Turbo.c.attic b/embeddedcommon/DFT8Turbo.c.attic
new file mode 100644
index 0000000..9c7b7b4
--- /dev/null
+++ b/embeddedcommon/DFT8Turbo.c.attic
@@ -0,0 +1,295 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include "DFT8Turbo.h"
+#include <math.h>
+
+#include <stdio.h>
+
+#define MAX_FREQS (24)
+#define OCTAVES   (5)
+
+
+/*
+	* The first thought was using an integration map and only operating when we need to, to pull the data out.
+	* Now we're doing the thing below this block comment
+		int16_t accumulated_total;							//2 bytes
+		int16_t last_accumulated_total_at_bin[MAX_FREQS*2];  //24 * 2 * sizeof(int16_t) = 96 bytes.
+		uint8_t current_time;								//1 byte
+		uint8_t placecode[MAX_FREQS];
+*/
+//OK... We don't have enough ram to sum everything... can we do something wacky with multiple octaves to sum everything better?
+//i.e.
+//
+// 4332322132212210
+//
+// ++++++++++++++++-----------------
+// ++++++++--------
+// ++++----++++----
+// ++--++--++--++--
+// +-+-+-+-+-+-+-+-
+//
+// Don't forget we need to do this for sin and cos.
+// Can we instead of making this plusses, make it a multiplier?
+// How can we handle sin+cos?
+//
+// Is it possible to do this for every frame?  I.e. for each of the 24 notes, multiply with their current place in table?
+//  That's interesting.  It's not like a sin table.
+// There is no "multiply" in the attiny instruction set for attiny85.
+// There is, however for attiny402
+
+//Question:  Can we do five octaves, or does this need to be balanced?
+//Question2: Should we weight higher octaves?
+
+
+//ATTiny402: 256x8 RAM, 4096x8 FLASH  LPM: 3 cycles + FMUL: 2 cycles  << Do stacked sin waves?
+//ATtiny85:  512x8 RAM, 8192x8 FLASH  LPM: 3 cycles + NO MULTIPLY     << Do square waves?
+
+
+/* Approaches:
+
+  on ATtiny402:  Stacked sin approach.
+   Say 16 MHz, though 12 MHz is interesting...
+   16k SPS: 1k cycles per; say 24 bins per; 41 cycles per bin = hard.  But is it too hard?
+   20 cycles per s/c.
+		read place in stacked table (8? bits) 3 cycles
+
+		//Inner loop = 17 cycles.
+		read stacked table (8 bits), 3 cycles
+		fractional multiply table with current value. 2 cycles
+		read current running for note 2 cycles  (LDS = 3 cycles)
+		subtract a shifted version, to make it into an IIR. (4 cycles)
+		add in current values. (2 cycles)
+		store data back to ram (2 cycles)
+		advance place in stacked table (8?bits) 1 cycle
+
+		store place in stacked table (8? bits) 3 cycles?
+
+	//What if we chunk ADC updates into groups of 4 or 8?
+	//This is looking barely possible.
+
+	on attiny85: scheduled adds/subtracts (like a stacked-square-wave-table)
+		//XXX TODO!
+
+*/
+
+/* Ok... Let's think about the ATTiny402.  256x8 RAM + 4096x8 FLASH.
+
+	* We can create a table which has all octaves overlaid.
+	* We would need to keep track of:
+		* 12 x 2 x 2 = 48 bytes = Current sin/cos values.
+		* 12 x 2 = 24 bytes = Current place in table.  = 72 bytes
+	* We would need to store:
+		* The layered lookup table.  If possible, keep @ 256 bytes to simplify math ops.
+		* The speed by which each note needs to advance.
+	* We would need to:
+		* Read current running place. X                8 cycles
+		* Use that place to look up into sin table.    3 cycles
+		* Read running val  4 cycles best case
+		* Multiply out the sin + IIR                   5 cycles
+		* Store running val 4 cycles best case
+		* Cos-advance that place to look up into sin table.    4 cycles
+		* Read running val 4 cycles best case
+		* Multiply out the sin + IIR                   5 cycles
+		* Store running val 4 cycles best case.
+		* Read how much to advance X by.               4 cycles
+        * (Cos^2+Sin^2)                                8?
+		* Store it.                                    4 cycles best case.
+        *                                                  = 48 x 12 = 576 cycles.  Assume 10 MHz @ 16k SPS.  We're OK (625 cycles available per sample)
+*/
+
+// Observation: The two tables are actually mirror images of each other, well diagonally mirrored.  That's odd.  But, would take CPU to exploit.
+
+#define SSTABLESIZE 256	//Entries in the combined waveform table; Turbo8BitRun indexes it with the top 8 bits of a 32-bit phase.
+int8_t  spikysin_interleved_cos[SSTABLESIZE][2];	//[i][0] = octave-averaged sine, [i][1] = octave-averaged cosine; filled by CompTableWithPhase().
+uint32_t advancespeed[MAX_FREQS];	//Per-note phase increment per sample, computed in Setup().
+
+static int CompTableWithPhase( int nelements, float phase, int scaling )
+{	//Fills spikysin_interleved_cos[0..nelements-1] with the sine/cosine averaged over OCTAVES octaves at the given phase, scaled by `scaling`; returns the largest magnitude seen before clamping.
+	int highest = 0;
+	int i;
+	for( i = 0; i < nelements; i++ )
+	{
+		float taued = i * 3.141592 * 2.0 / nelements;	//Table position expressed as an angle in 0..2*pi.
+		int o;
+		float combsin = 0;
+		for( o = 0; o < OCTAVES; o++ )
+		{
+			combsin += sin( taued * (1<<o) + phase);
+		}
+		combsin /= OCTAVES;
+		int csadapt =  combsin * scaling - 0.5;	//No value is higher with five octaves.  XXX TODO Lookout.  If you change # of octaves, need to change this, too.
+		//Track the peak before clamping so the caller can pick a scaling that fits in int8_t.
+		if( csadapt > highest ) highest = csadapt;
+		if( -csadapt > highest ) highest = -csadapt;
+
+		if( csadapt > 127 ) csadapt = 127;
+		if( csadapt < -128 ) csadapt = -128;  //tricky: Keep balanced.
+		spikysin_interleved_cos[i][0] = csadapt;
+		//Same again for the cosine half of the entry.
+		float combcos = 0;
+		for( o = 0; o < OCTAVES; o++ )
+		{
+			combcos += cos( taued * (1<<o) + phase );
+		}
+		combcos /= OCTAVES;
+		csadapt = combcos * scaling - 0.5;	//No value is higher with five octaves.  XXX TODO Lookout.  If you change # of octaves, need to change this, too.
+
+		if( csadapt > highest ) highest = csadapt;
+		if( -csadapt > highest ) highest = -csadapt;
+
+		if( csadapt > 127 ) csadapt = 127;
+		if( csadapt < -128 ) csadapt = -128;  //tricky: Keep balanced.
+		spikysin_interleved_cos[i][1] = csadapt;
+	}
+	return highest;
+}
+
+
+static int Setup( float * frequencies, int bins )
+{	//Picks the table phase with the smallest peak, regenerates the table at that phase, and derives advancespeed[] from frequencies[0..MAX_FREQS-1].  `bins` is not referenced.  Always returns 0.
+	int i;
+
+	//Since start position/phase is arbitrary, we should try several to see which gives us the best dynamic range.
+	float tryphase = 0;
+
+	float bestphase = 0;
+	int highest_val_at_best_phase = 1000000;
+	//Sweep 0..pi in steps of 0.001 at a fixed trial scaling of 65536 and keep the phase with the lowest peak.
+	for( tryphase = 0; tryphase < 3.14159; tryphase += 0.001 )
+	{
+		int highest = CompTableWithPhase( SSTABLESIZE, tryphase, 65536 );
+		if( highest < highest_val_at_best_phase )
+		{
+			highest_val_at_best_phase = highest;
+			bestphase = tryphase;
+		}
+	}
+	printf( "Best comp: %f : %d\n", bestphase, highest_val_at_best_phase );
+
+	//Set this because we would overflow the sinm and cosm regs if we don't.  This is sort of like a master volume.
+	//use this as that input volume knob thing.
+	float further_reduce = 1.0;
+	//Final table: rescale so that the peak measured at 65536 lands at about 128*further_reduce.
+	CompTableWithPhase( SSTABLESIZE, bestphase, (65536*128*further_reduce)/highest_val_at_best_phase );
+
+//	for( i = 0; i < SSTABLESIZE; i++ )
+//	{
+//		printf( "%d %d\n", spikysin_interleved_cos[i*2+0], spikysin_interleved_cos[i*2+1] );
+//	}
+
+	for( i = 0; i < MAX_FREQS; i++ )
+	{
+		//frequencies[i] = SPS / Freq
+		// Need to decide how quickly we sweep through the table.
+		advancespeed[i] = 65536 * 256.0 /* fixed point */ * 256.0 /* size of table */ / frequencies[i];
+		//printf( "%f\n", frequencies[i] );
+	}
+	return 0;
+}
+
+
+/*
+uint8_t  spikysin_interleved_cos[256*2];
+uint16_t advancespeed[MAX_FREQS];
+*/
+
+float toutbins[MAX_FREQS];	//Per-note squared magnitude, rewritten by Turbo8BitRun() on every sample.
+//Running state of one note.
+struct notedat
+{
+	uint32_t time;	//32-bit phase; the top 8 bits index spikysin_interleved_cos[].
+	int32_t sinm;	//Leaky accumulator fed from table column [1].
+	int32_t cosm;	//Leaky accumulator fed from table column [0].
+};
+
+static struct notedat nd[MAX_FREQS];
+
+void Turbo8BitRun( int8_t adcval )
+{	//Consumes one sample: for each of the MAX_FREQS notes, multiplies the sample by both table columns at the note's phase, leaky-integrates the products, advances the phase and refreshes toutbins[].
+	int i;
+	for( i = 0; i < MAX_FREQS; i++ )
+	{
+		uint32_t ct = nd[i].time;
+		int32_t muxres;
+		int32_t running;
+		int32_t rdesc, rdess;
+		uint8_t * spikysintable = &spikysin_interleved_cos[(ct>>24)][0];	//NOTE(review): the table is int8_t; this takes its address as uint8_t* (incompatible pointer type).
+
+		int8_t  ss = *(spikysintable++);
+
+		#define DECIR 8
+		//Rounded fixed-point product: add half an LSB before shifting right by DECIR.
+		muxres = ((int16_t)adcval * ss + (1<<(DECIR-1)) ) >> (DECIR);
+		running = nd[i].cosm;
+		running += muxres;
+		rdesc = running >> 8;
+		running -= rdesc >> 3;	//Leak: bleed off 1/2048 of the accumulator per sample.
+
+		nd[i].cosm = running;
+if( i == 0) printf( "MRX %5d  %9d %9d  %9d %9d\n", muxres, adcval, ss, running, nd[i].sinm );
+		int8_t  sc = *(spikysintable++);
+		muxres = ((int16_t)adcval * sc + (1<<(DECIR-1)) ) >> (DECIR);
+		running = nd[i].sinm;
+		running += muxres;
+
+		rdess = running>>8;
+		running -= rdess >> 3;
+
+		nd[i].sinm = running;
+		//Advance this note's phase; the uint32_t wraps once per pass through the table.
+		nd[i].time = ct + advancespeed[i];
+
+		toutbins[i] = rdess * rdess + rdesc * rdesc;
+		//printf( "%d %d = %f %p\n", rdess, rdesc, toutbins[i], &toutbins[i] );
+	}
+
+	static uint8_t stater;	//Only used by the disabled block below.
+/*	stater++;
+	if( stater == 16 )
+	{
+		stater = 0;
+		for( i = 0; i < MAX_FREQS; i++ )
+		{
+			nd[i].sinm -= nd[i].sinm >> 12;
+			nd[i].cosm -= nd[i].cosm >> 12;
+			nd[i].sinm += 8;
+			nd[i].cosm += 8;
+		}
+	}*/
+}
+
+
+void DoDFT8BitTurbo( float * outbins, float * frequencies, int bins, const float * databuffer, int place_in_data_buffer, int size_of_data_buffer, float q, float speedup )
+{	//Driver: pushes newly arrived samples through Turbo8BitRun(), zeroes outbins[0..bins-1], then writes MAX_FREQS magnitudes starting at outbins[MAX_FREQS].  q and speedup are not referenced.
+	static int is_setup;
+	if( !is_setup ) { is_setup = 1; Setup( frequencies, bins ); }
+	static int last_place;
+	int i;
+	//Walk the ring buffer from where the previous call stopped up to (not including) place_in_data_buffer.
+	for( i = last_place; i != place_in_data_buffer; i = (i+1)%size_of_data_buffer )
+	{
+		int16_t ifr1 = (int16_t)( ((databuffer[i]) ) * 4095 );
+		//ifr1 += 4095;
+		//ifr1 += 512;
+		Turbo8BitRun( ifr1>>5 ); //5 = Actually only feed algorithm numbers from -128 to 127.
+	}
+	last_place = place_in_data_buffer;
+
+	for( i = 0; i < bins; i++ )
+	{
+		outbins[i] = 0;
+	}
+	for( i = 0; i < MAX_FREQS; i++ )
+	{
+		int iss = nd[i].sinm>>8;
+		int isc = nd[i].cosm>>8;
+		int mux = iss * iss + isc * isc;
+		if( mux == 0 ) mux = 1;
+		if( i == 0 )
+		printf( "MUX: %d %d\n", isc, iss );
+		outbins[i+MAX_FREQS] = sqrt(mux)/200.0;	//NOTE(review): writes outbins[MAX_FREQS..2*MAX_FREQS-1]; this needs bins >= 2*MAX_FREQS -- confirm with the caller.
+	} 
+
+}
+
+
diff --git a/embeddedcommon/DFT8Turbo.h b/embeddedcommon/DFT8Turbo.h
new file mode 100644
index 0000000..257cf89
--- /dev/null
+++ b/embeddedcommon/DFT8Turbo.h
@@ -0,0 +1,9 @@
+#ifndef _DFT8TURBO_H
+#define _DFT8TURBO_H
+//Entry point of the 8-bit "turbo" DFT (implemented in DFT8Turbo.c).
+/* Note: Frequencies must be precompiled. */
+//Consumes databuffer (a ring of size_of_data_buffer floats) up to place_in_data_buffer and writes `bins` magnitudes to outbins.  q and speedup are not referenced by the implementation.
+void DoDFT8BitTurbo( float * outbins, float * frequencies, int bins, const float * databuffer, int place_in_data_buffer, int size_of_data_buffer, float q, float speedup );
+
+#endif
+
diff --git a/embeddedcommon/DFT8Turbo.h.attic b/embeddedcommon/DFT8Turbo.h.attic
new file mode 100644
index 0000000..257cf89
--- /dev/null
+++ b/embeddedcommon/DFT8Turbo.h.attic
@@ -0,0 +1,9 @@
+#ifndef _DFT8TURBO_H
+#define _DFT8TURBO_H
+//Entry point of the archived 8-bit "turbo" DFT (implemented in DFT8Turbo.c.attic).
+/* Note: Frequencies must be precompiled. */
+//Consumes databuffer (a ring of size_of_data_buffer floats) up to place_in_data_buffer and fills outbins.  q and speedup are not referenced by the implementation.
+void DoDFT8BitTurbo( float * outbins, float * frequencies, int bins, const float * databuffer, int place_in_data_buffer, int size_of_data_buffer, float q, float speedup );
+
+#endif
+
diff --git a/embeddedcommon/embeddednf.h b/embeddedcommon/embeddednf.h
index d360020..51d3672 100644
--- a/embeddedcommon/embeddednf.h
+++ b/embeddedcommon/embeddednf.h
@@ -32,7 +32,7 @@
 
 //We take the raw signal off of the 
 #ifndef FILTER_BLUR_PASSES
-#define FILTER_BLUR_PASSES 2
+#define FILTER_BLUR_PASSES 1
 #endif
 
 //Determines bit shifts for where notes lie.  We represent notes with an