diff --git a/embeddedcommon/DFT8Turbo.c b/embeddedcommon/DFT8Turbo.c index 9c7b7b4..02791c4 100644 --- a/embeddedcommon/DFT8Turbo.c +++ b/embeddedcommon/DFT8Turbo.c @@ -17,245 +17,120 @@ uint8_t current_time; //1 byte uint8_t placecode[MAX_FREQS]; */ -//OK... We don't have enough ram to sum everything... can we do something wacky with multiple ocatives to sum everything better? -//i.e. -// -// 4332322132212210 -// -// ++++++++++++++++----------------- -// ++++++++-------- -// ++++----++++---- -// ++--++--++--++-- -// +-+-+-+-+-+-+-+- -// -// Don't forget we need to do this for sin and cos. -// Can we instead of making this plusses, make it a multiplier? -// How can we handle sin+cos? -// -// Is it possible to do this for every frame? I.e. for each of the 24 notes, multiply with their current place in table? -// That's interesting. It's not like a sin table. -// There is no "multiply" in the attiny instruction set for attiny85. -// There is, however for attiny402 +/* + So, the idea here is we would keep a running total of the current ADC value, kept away in a int16_t. + It is constantly summing, so we can take an integral of it. Or rather an integral range. -//Question: Can we do five octaves, or does this need to be balanced? -//Question2: Should we weight higher octaves? + Over time, we perform operations like adding or subtracting from a current place. -//ATTiny402: 256x8 RAM, 4096x8 FLASH LPM: 3 cycles + FMUL: 2 cycles << Do stacked sin waves? -//ATtiny85: 512x8 RAM, 8192x8 FLASH LPM: 3 cycles + NO MULTIPLY << Do square waves? - - -/* Approaches: - - on ATtiny402: Stacked sin approach. - Say 16 MHz, though 12 MHz is interesting... - 16k SPS: 1k cycles per; say 24 bins per; 41 cycles per bin = hard. But is it too hard? - 20 cycles per s/c. - read place in stacked table (8? bits) 3 cycles - - //Inner loop = 17 cycles. - read stacked table (8 bits), 3 cycles - fractional multiply table with current value. 2 cycles - read current running for note 2 cycles (LDS = 3 cycles) - subtract a shifted version, to make it into an IIR. (4 cycles) - add in current values. (2 cycles) - store data back to ram (2 cycles) - advance place in stacked table (8?bits) 1 cycle - - store place in stacked table (8? bits) 3 cycles? - - //What if we chunk ADC updates into groups of 4 or 8? - //This is looking barely possible. - - on attiny85: scheduled adds/subtracts (like a stacked-square-wave-table) - //XXX TODO! - +NOTE: + Optimizations: + Only use 16 bins, lets action table be 16-bits wide. */ -/* Ok... Let's think about the ATTiny402. 256x8 RAM + 4096x8 FLASH. +int16_t running_integral; +int16_t cossindata[MAX_FREQS*OCTAVES*2]; //Contains COS and SIN data. - * We can create a table which has all octaves overlaid. - * We would need to keep track of: - * 12 x 2 x 2 = 48 bytes = Current sin/cos values. - * 12 x 2 = 24 bytes = Current place in table. = 72 bytes - * We would need to store: - * The layered lookup table. If possible, keep @ 256 bytes to simplify math ops. - * The speed by which each note needs to advance. - * We would need to: - * Read current running place. X 8 cycles - * Use that place to look up into sin table. 3 cycles - * Read running val 4 cycles best case - * Multiply out the sin + IIR 5 cycles - * Store running val 4 cycles best case - * Cos-advance that place to look up into sin table. 4 cycles - * Read running val 4 cycles best case - * Multiply out the sin + IIR 5 cycles - * Store running val 4 cycles best case. - * Read how much to advance X by. 4 cycles - * (Cos^2+Sin^2) 8? - * Store it. 4 cycles best case. - * = 48 x 12 = 576 cycles. Assume 10 MHz @ 16k SPS. We're OK (625 samples) -*/ -// Observation: The two tables are actually mirror images of each other, well diagonally mirrored. That's odd. But, would take CPU to exploit. -#define SSTABLESIZE 256 -int8_t spikysin_interleved_cos[SSTABLESIZE][2]; -uint32_t advancespeed[MAX_FREQS]; +uint8_t which_octave_for_op[MAX_FREQS]; //counts up, tells you which ocative you are operating on. +uint8_t highbit_table[2< highest ) highest = csadapt; - if( -csadapt > highest ) highest = -csadapt; +#define ACTIONTABLESIZE 512 - if( csadapt > 127 ) csadapt = 127; - if( csadapt < -128 ) csadapt = -128; //tricky: Keep balanced. - spikysin_interleved_cos[i][0] = csadapt; - - float combcos = 0; - for( o = 0; o < OCTAVES; o++ ) - { - combcos += cos( taued * (1< highest ) highest = csadapt; - if( -csadapt > highest ) highest = -csadapt; - - if( csadapt > 127 ) csadapt = 127; - if( csadapt < -128 ) csadapt = -128; //tricky: Keep balanced. - spikysin_interleved_cos[i][1] = csadapt; - } - return highest; -} +uint16_t * placeintable; +//Put this in flash. +uint32_t actiontable[ACTIONTABLESIZE]; static int Setup( float * frequencies, int bins ) { int i; - - //Since start position/phase is arbitrary, we should try several to see which gives us the best dynamic range. - float tryphase = 0; - - float bestphase = 0; - int highest_val_at_best_phase = 1000000; - - for( tryphase = 0; tryphase < 3.14159; tryphase += 0.001 ) + printf( "BINS: %d\n", bins ); + for( i = bins-MAX_FREQS; i < bins; i++ ) { - int highest = CompTableWithPhase( SSTABLESIZE, tryphase, 65536 ); - if( highest < highest_val_at_best_phase ) + int topbin = i - (bins-MAX_FREQS); + float f = frequencies[i]/2.0; //2x the hits (sin/cos) + float hits_per_table = (float)ACTIONTABLESIZE/f; + int dhrpertable = (int)(hits_per_table+.5);//TRICKY: You might think you need to have even number of hits (sin/cos), but you don't! It can flip sin/cos each time through the table! + float err = (8000./((float)ACTIONTABLESIZE/dhrpertable) - 8000./f)/(8000./f); + //Perform an op every X samples. How well does this map into units of 1024? + printf( "%d %f -> hits per 1024: %f %d (%f error)\n", topbin, f, (float)ACTIONTABLESIZE/f, dhrpertable, err * 100.0 ); + + float advance_per_step = dhrpertable/(float)ACTIONTABLESIZE; + float fvadv = 0.0; + int j; + int actions = 0; + int countset = 0; + + //XXX TODO Tricky: We need to start fadv off at such a place that there won't be a hicchup when going back around to 0. + + for( j = 0; j < ACTIONTABLESIZE; j++ ) { - highest_val_at_best_phase = highest; - bestphase = tryphase; + if( fvadv >= 0.5 ) + { + actiontable[j] |= 1<> longestzeroes) & 1) == 0 ); longestzeroes++ ); + //longestzeroes goes: 255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, ... + //This isn't great, because we need to also know whether we are attacking the SIN side or the COS side. + highbit_table[i] = longestzeroes; } - return 0; + //Repeat the highbit table in the second half. + //XXX PICK UP HERE + //Encode into highbit_table which cell is being operated on + //Also, do the * MAX_FREQS here. That will + + + + placeintable = actiontable; + // for( i = 0; i < ACTIONTABLESIZE; i++ ) printf( "%08x\n", actiontable[i] ); } -/* -uint8_t spikysin_interleved_cos[256*2]; -uint16_t advancespeed[MAX_FREQS]; -*/ -float toutbins[MAX_FREQS]; +int16_t running_integral; +int16_t cossindata[MAX_FREQS*OCTAVES*2]; +uint8_t which_octave_for_op[MAX_FREQS]; //counts up, tells you which ocative you are operating on. +uint16_t * placeintable; -struct notedat -{ - uint32_t time; - int32_t sinm; - int32_t cosm; -}; +//Put this in flash. +uint32_t actiontable[ACTIONTABLESIZE]; -static struct notedat nd[MAX_FREQS]; void Turbo8BitRun( int8_t adcval ) { - int i; - for( i = 0; i < MAX_FREQS; i++ ) + uint32_t actions = *(placeintable++); + if( placeintable == &actiontable[ACTIONTABLESIZE] ) placeintable = actiontable; + int b; + for( b = 0; b < MAX_FREQS; b++ ) { - uint32_t ct = nd[i].time; - int32_t muxres; - int32_t running; - int32_t rdesc, rdess; - uint8_t * spikysintable = &spikysin_interleved_cos[(ct>>24)][0]; + if( ! ((1<>= 1; + int octavebit = op & ((1<> (DECIR); - running = nd[i].cosm; - running += muxres; - rdesc = running >> 8; - running -= rdesc >> 3; - - nd[i].cosm = running; -if( i == 0) printf( "MRX %5d %9d %9d %9d %9d\n", muxres, adcval, ss, running, nd[i].sinm ); - int8_t sc = *(spikysintable++); - muxres = ((int16_t)adcval * sc + (1<<(DECIR-1)) ) >> (DECIR); - running = nd[i].sinm; - running += muxres; - - rdess = running>>8; - running -= rdess >> 3; - - nd[i].sinm = running; - - nd[i].time = ct + advancespeed[i]; - - toutbins[i] = rdess * rdess + rdesc * rdesc; - //printf( "%d %d = %f %p\n", rdess, rdesc, toutbins[i], &toutbins[i] ); + //if( b == 0 ) printf( "%d\n", whichoctave ); + //XXX TODO Optimization: Use a table, since octavebit can only be 0...31. } - - static uint8_t stater; -/* stater++; - if( stater == 16 ) - { - stater = 0; - for( i = 0; i < MAX_FREQS; i++ ) - { - nd[i].sinm -= nd[i].sinm >> 12; - nd[i].cosm -= nd[i].cosm >> 12; - nd[i].sinm += 8; - nd[i].cosm += 8; - } - }*/ } @@ -269,12 +144,11 @@ void DoDFT8BitTurbo( float * outbins, float * frequencies, int bins, const float for( i = last_place; i != place_in_data_buffer; i = (i+1)%size_of_data_buffer ) { int16_t ifr1 = (int16_t)( ((databuffer[i]) ) * 4095 ); - //ifr1 += 4095; - //ifr1 += 512; Turbo8BitRun( ifr1>>5 ); //6 = Actually only feed algorithm numbers from -64 to 63. } last_place = place_in_data_buffer; +#if 0 for( i = 0; i < bins; i++ ) { outbins[i] = 0; @@ -289,7 +163,7 @@ void DoDFT8BitTurbo( float * outbins, float * frequencies, int bins, const float printf( "MUX: %d %d\n", isc, iss ); outbins[i+MAX_FREQS] = sqrt(mux)/200.0; } - +#endif } diff --git a/embeddedcommon/DFT8Turbo.c.attic b/embeddedcommon/DFT8Turbo.c.attic new file mode 100644 index 0000000..9c7b7b4 --- /dev/null +++ b/embeddedcommon/DFT8Turbo.c.attic @@ -0,0 +1,295 @@ +#include +#include +#include "DFT8Turbo.h" +#include + +#include + +#define MAX_FREQS (24) +#define OCTAVES (5) + + +/* + * The first thought was using an integration map and only operating when we need to, to pull the data out. + * Now we're doing the thing below this block comment + int16_t accumulated_total; //2 bytes + int16_t last_accumulated_total_at_bin[MAX_FREQS*2]; //24 * 2 * sizeof(int16_t) = 96 bytes. + uint8_t current_time; //1 byte + uint8_t placecode[MAX_FREQS]; +*/ +//OK... We don't have enough ram to sum everything... can we do something wacky with multiple ocatives to sum everything better? +//i.e. +// +// 4332322132212210 +// +// ++++++++++++++++----------------- +// ++++++++-------- +// ++++----++++---- +// ++--++--++--++-- +// +-+-+-+-+-+-+-+- +// +// Don't forget we need to do this for sin and cos. +// Can we instead of making this plusses, make it a multiplier? +// How can we handle sin+cos? +// +// Is it possible to do this for every frame? I.e. for each of the 24 notes, multiply with their current place in table? +// That's interesting. It's not like a sin table. +// There is no "multiply" in the attiny instruction set for attiny85. +// There is, however for attiny402 + +//Question: Can we do five octaves, or does this need to be balanced? +//Question2: Should we weight higher octaves? + + +//ATTiny402: 256x8 RAM, 4096x8 FLASH LPM: 3 cycles + FMUL: 2 cycles << Do stacked sin waves? +//ATtiny85: 512x8 RAM, 8192x8 FLASH LPM: 3 cycles + NO MULTIPLY << Do square waves? + + +/* Approaches: + + on ATtiny402: Stacked sin approach. + Say 16 MHz, though 12 MHz is interesting... + 16k SPS: 1k cycles per; say 24 bins per; 41 cycles per bin = hard. But is it too hard? + 20 cycles per s/c. + read place in stacked table (8? bits) 3 cycles + + //Inner loop = 17 cycles. + read stacked table (8 bits), 3 cycles + fractional multiply table with current value. 2 cycles + read current running for note 2 cycles (LDS = 3 cycles) + subtract a shifted version, to make it into an IIR. (4 cycles) + add in current values. (2 cycles) + store data back to ram (2 cycles) + advance place in stacked table (8?bits) 1 cycle + + store place in stacked table (8? bits) 3 cycles? + + //What if we chunk ADC updates into groups of 4 or 8? + //This is looking barely possible. + + on attiny85: scheduled adds/subtracts (like a stacked-square-wave-table) + //XXX TODO! + +*/ + +/* Ok... Let's think about the ATTiny402. 256x8 RAM + 4096x8 FLASH. + + * We can create a table which has all octaves overlaid. + * We would need to keep track of: + * 12 x 2 x 2 = 48 bytes = Current sin/cos values. + * 12 x 2 = 24 bytes = Current place in table. = 72 bytes + * We would need to store: + * The layered lookup table. If possible, keep @ 256 bytes to simplify math ops. + * The speed by which each note needs to advance. + * We would need to: + * Read current running place. X 8 cycles + * Use that place to look up into sin table. 3 cycles + * Read running val 4 cycles best case + * Multiply out the sin + IIR 5 cycles + * Store running val 4 cycles best case + * Cos-advance that place to look up into sin table. 4 cycles + * Read running val 4 cycles best case + * Multiply out the sin + IIR 5 cycles + * Store running val 4 cycles best case. + * Read how much to advance X by. 4 cycles + * (Cos^2+Sin^2) 8? + * Store it. 4 cycles best case. + * = 48 x 12 = 576 cycles. Assume 10 MHz @ 16k SPS. We're OK (625 samples) +*/ + +// Observation: The two tables are actually mirror images of each other, well diagonally mirrored. That's odd. But, would take CPU to exploit. + +#define SSTABLESIZE 256 +int8_t spikysin_interleved_cos[SSTABLESIZE][2]; +uint32_t advancespeed[MAX_FREQS]; + +static int CompTableWithPhase( int nelements, float phase, int scaling ) +{ + int highest = 0; + int i; + for( i = 0; i < nelements; i++ ) + { + float taued = i * 3.141592 * 2.0 / nelements; + int o; + float combsin = 0; + for( o = 0; o < OCTAVES; o++ ) + { + combsin += sin( taued * (1< highest ) highest = csadapt; + if( -csadapt > highest ) highest = -csadapt; + + if( csadapt > 127 ) csadapt = 127; + if( csadapt < -128 ) csadapt = -128; //tricky: Keep balanced. + spikysin_interleved_cos[i][0] = csadapt; + + float combcos = 0; + for( o = 0; o < OCTAVES; o++ ) + { + combcos += cos( taued * (1< highest ) highest = csadapt; + if( -csadapt > highest ) highest = -csadapt; + + if( csadapt > 127 ) csadapt = 127; + if( csadapt < -128 ) csadapt = -128; //tricky: Keep balanced. + spikysin_interleved_cos[i][1] = csadapt; + } + return highest; +} + + +static int Setup( float * frequencies, int bins ) +{ + int i; + + //Since start position/phase is arbitrary, we should try several to see which gives us the best dynamic range. + float tryphase = 0; + + float bestphase = 0; + int highest_val_at_best_phase = 1000000; + + for( tryphase = 0; tryphase < 3.14159; tryphase += 0.001 ) + { + int highest = CompTableWithPhase( SSTABLESIZE, tryphase, 65536 ); + if( highest < highest_val_at_best_phase ) + { + highest_val_at_best_phase = highest; + bestphase = tryphase; + } + } + printf( "Best comp: %f : %d\n", bestphase, highest_val_at_best_phase ); + + //Set this because we would overflow the sinm and cosm regs if we don't. This is sort of like a master volume. + //use this as that input volume knob thing. + float further_reduce = 1.0; + + CompTableWithPhase( SSTABLESIZE, bestphase, (65536*128*further_reduce)/highest_val_at_best_phase ); + +// for( i = 0; i < SSTABLESIZE; i++ ) +// { +// printf( "%d %d\n", spikysin_interleved_cos[i*2+0], spikysin_interleved_cos[i*2+1] ); +// } + + for( i = 0; i < MAX_FREQS; i++ ) + { + //frequencies[i] = SPS / Freq + // Need to decide how quickly we sweep through the table. + advancespeed[i] = 65536 * 256.0 /* fixed point */ * 256.0 /* size of table */ / frequencies[i]; + //printf( "%f\n", frequencies[i] ); + } + return 0; +} + + +/* +uint8_t spikysin_interleved_cos[256*2]; +uint16_t advancespeed[MAX_FREQS]; +*/ + +float toutbins[MAX_FREQS]; + +struct notedat +{ + uint32_t time; + int32_t sinm; + int32_t cosm; +}; + +static struct notedat nd[MAX_FREQS]; + +void Turbo8BitRun( int8_t adcval ) +{ + int i; + for( i = 0; i < MAX_FREQS; i++ ) + { + uint32_t ct = nd[i].time; + int32_t muxres; + int32_t running; + int32_t rdesc, rdess; + uint8_t * spikysintable = &spikysin_interleved_cos[(ct>>24)][0]; + + int8_t ss = *(spikysintable++); + + #define DECIR 8 + + muxres = ((int16_t)adcval * ss + (1<<(DECIR-1)) ) >> (DECIR); + running = nd[i].cosm; + running += muxres; + rdesc = running >> 8; + running -= rdesc >> 3; + + nd[i].cosm = running; +if( i == 0) printf( "MRX %5d %9d %9d %9d %9d\n", muxres, adcval, ss, running, nd[i].sinm ); + int8_t sc = *(spikysintable++); + muxres = ((int16_t)adcval * sc + (1<<(DECIR-1)) ) >> (DECIR); + running = nd[i].sinm; + running += muxres; + + rdess = running>>8; + running -= rdess >> 3; + + nd[i].sinm = running; + + nd[i].time = ct + advancespeed[i]; + + toutbins[i] = rdess * rdess + rdesc * rdesc; + //printf( "%d %d = %f %p\n", rdess, rdesc, toutbins[i], &toutbins[i] ); + } + + static uint8_t stater; +/* stater++; + if( stater == 16 ) + { + stater = 0; + for( i = 0; i < MAX_FREQS; i++ ) + { + nd[i].sinm -= nd[i].sinm >> 12; + nd[i].cosm -= nd[i].cosm >> 12; + nd[i].sinm += 8; + nd[i].cosm += 8; + } + }*/ +} + + +void DoDFT8BitTurbo( float * outbins, float * frequencies, int bins, const float * databuffer, int place_in_data_buffer, int size_of_data_buffer, float q, float speedup ) +{ + static int is_setup; + if( !is_setup ) { is_setup = 1; Setup( frequencies, bins ); } + static int last_place; + int i; + + for( i = last_place; i != place_in_data_buffer; i = (i+1)%size_of_data_buffer ) + { + int16_t ifr1 = (int16_t)( ((databuffer[i]) ) * 4095 ); + //ifr1 += 4095; + //ifr1 += 512; + Turbo8BitRun( ifr1>>5 ); //6 = Actually only feed algorithm numbers from -64 to 63. + } + last_place = place_in_data_buffer; + + for( i = 0; i < bins; i++ ) + { + outbins[i] = 0; + } + for( i = 0; i < MAX_FREQS; i++ ) + { + int iss = nd[i].sinm>>8; + int isc = nd[i].cosm>>8; + int mux = iss * iss + isc * isc; + if( mux == 0 ) mux = 1; + if( i == 0 ) + printf( "MUX: %d %d\n", isc, iss ); + outbins[i+MAX_FREQS] = sqrt(mux)/200.0; + } + +} + + diff --git a/embeddedcommon/DFT8Turbo.h.attic b/embeddedcommon/DFT8Turbo.h.attic new file mode 100644 index 0000000..257cf89 --- /dev/null +++ b/embeddedcommon/DFT8Turbo.h.attic @@ -0,0 +1,9 @@ +#ifndef _DFT8TURBO_H +#define _DFT8TURBO_H + +/* Note: Frequencies must be precompiled. */ + +void DoDFT8BitTurbo( float * outbins, float * frequencies, int bins, const float * databuffer, int place_in_data_buffer, int size_of_data_buffer, float q, float speedup ); + +#endif +