diff --git a/colorchord2/Makefile b/colorchord2/Makefile index f393fc3..8620946 100644 --- a/colorchord2/Makefile +++ b/colorchord2/Makefile @@ -17,7 +17,7 @@ LDLIBS:=-lpthread -lasound -lm -lpulse-simple -lpulse -ludev -lrt CFLAGS:=-g -O0 -flto -Wall -ffast-math -I../embeddedcommon -I. -DICACHE_FLASH_ATTR= EXTRALIBS:=-lusb-1.0 -colorchord : os_generic.o main.o dft.o decompose.o filter.o color.o notefinder.o util.o outdrivers.o $(RAWDRAW) $(SOUND) $(OUTS) parameters.o chash.o hook.o ../embeddedcommon/DFT32.o configs.o +colorchord : os_generic.o main.o dft.o decompose.o filter.o color.o notefinder.o util.o outdrivers.o $(RAWDRAW) $(SOUND) $(OUTS) parameters.o chash.o hook.o ../embeddedcommon/DFT32.o configs.o ../embeddedcommon/DFT8Turbo.o gcc -o $@ $^ $(CFLAGS) $(LDLIBS) $(EXTRALIBS) $(RAWDRAWLIBS) diff --git a/colorchord2/default.conf b/colorchord2/default.conf index c8d2a9d..297c676 100644 --- a/colorchord2/default.conf +++ b/colorchord2/default.conf @@ -58,8 +58,9 @@ octaves = 5 # 1 = DFT Progressive # 2 = DFT Progressive Integer # 3 = DFT Progressive Integer Skippy -# 4 = Integer, 32-Bit, Progressive, Skippy. -do_progressive_dft = 4 +# 4 = Integer, 32-Bit, Progressive, Skippy. (wow, this actually works) +# 5 = 8-bit turbo test. 
+do_progressive_dft = 5 filter_iter = 2 diff --git a/colorchord2/notefinder.c b/colorchord2/notefinder.c index f0e1178..1f0a01c 100644 --- a/colorchord2/notefinder.c +++ b/colorchord2/notefinder.c @@ -11,6 +11,7 @@ #include "filter.h" #include "decompose.h" #include "DFT32.h" +#include "DFT8Turbo.h" struct NoteFinder * CreateNoteFinder( int spsRec ) { @@ -199,6 +200,9 @@ void RunNoteFinder( struct NoteFinder * nf, const float * audio_stream, int head case 4: DoDFTProgressive32( dftbins, nf->frequencies, freqs, audio_stream, head, buffersize, nf->dft_q, nf->dft_speedup ); break; + case 5: + DoDFT8BitTurbo( dftbins, nf->frequencies, freqs, audio_stream, head, buffersize, nf->dft_q, nf->dft_speedup ); + break; default: fprintf( stderr, "Error: No DFT Seleced\n" ); } diff --git a/colorchord2/turbo8bit.conf b/colorchord2/turbo8bit.conf new file mode 100644 index 0000000..9dd6b04 --- /dev/null +++ b/colorchord2/turbo8bit.conf @@ -0,0 +1,103 @@ +# This is the configuration file for colorchord. +# Most values are already defaulted in the software. +# This file is constantly checked for new versions. +# \r, and ; are used as terminators, so you can put +# multiple entries on the same line. + +#Whether to limit the control loop to ~60ish FPS. +cpu_autolimit = 1 + +#General GUI properties. +title = PA Test +set_screenx = 720 +set_screeny = 480 + +#Sound properties. +buffer = 384 +play = 0 +rec = 1 +channels = 2 +samplerate = 16000 +wininput = -1 + +#Compiled version will default this. +#sound_source = ALSA +#-1 indicates left and right, 0 left, 1 right. + +sample_channel = -1 +sourcename = default +#alsa_output.pci-0000_00_1f.3.analog-stereo.monitor +#default +# alsa_output.pci-0000_00_1b.0.analog-stereo.monitor +#alsa_output.pci-0000_00_1f.3.analog-stereo.monitor << New laptop +#use pactl list | grep pci- | grep monitor + +################################## +# General ColorChord properties. # +################################## + +# How much to amplify the incoming signal. 
+amplify = 2.0 + +# What is the base note? I.e. the lowest note. +# Note that it won't have very much impact until an octave up though! +base_hz = 110 + +# This is only used when dealing with the slow decompose (now defunct) +# decompose_iterations = 1000 +# default_sigma = 1.4000 + +# DFT properties for the DFT up top. +dft_iir = 0.6 +dft_q = 20.0000 +dft_speedup = 1000.0000 +octaves = 5 + +# Should we use a progressive DFT? +# 0 = DFT Quick +# 1 = DFT Progressive +# 2 = DFT Progressive Integer +# 3 = DFT Progressive Integer Skippy +# 4 = Integer, 32-Bit, Progressive, Skippy. (wow, this actually works) +# 5 = 8-bit turbo test. +do_progressive_dft = 5 + + +filter_iter = 2 +filter_strength = .5 + +# How many bins per octave to use? +freqbins = 24 + +# For the final note information... How much to slack everything? +note_attach_amp_iir = 0.3500 +note_attach_amp_iir2 = 0.250 +note_attach_freq_iir = 0.3000 + +#How many bins a note can jump from frame to frame to be considered a slide. +#this is used to prevent notes from popping in and out a lot. 
+note_combine_distance = 0.5000 +note_jumpability = 1.8000 +note_minimum_new_distribution_value = 0.0200 +note_out_chop = 0.05000 + +#compress_coefficient = 4.0 +#compress_exponent = .5 + + +#======================================================================= +#Outputs + + +This is a vornoi thing: +outdrivers = OutputVoronoi, DisplayArray +lightx = 64 +lighty = 32 +fromsides = 1 +shape_cutoff = 0.03 +satamp = 5.000 +amppow = 2.510 +distpow = 1.500 + + + diff --git a/embeddedcommon/DFT32.c b/embeddedcommon/DFT32.c index 21df4dd..b587c6c 100644 --- a/embeddedcommon/DFT32.c +++ b/embeddedcommon/DFT32.c @@ -353,6 +353,3 @@ void DoDFTProgressive32( float * outbins, float * frequencies, int bins, const f #endif - - - diff --git a/embeddedcommon/DFT8Turbo.c b/embeddedcommon/DFT8Turbo.c new file mode 100644 index 0000000..5c91e34 --- /dev/null +++ b/embeddedcommon/DFT8Turbo.c @@ -0,0 +1,264 @@ +#include +#include +#include "DFT8Turbo.h" +#include + +#include + +#define MAX_FREQS (24) +#define OCTAVES (5) + + +/* + * The first thought was using an integration map and only operating when we need to, to pull the data out. + * Now we're doing the thing below this block comment + int16_t accumulated_total; //2 bytes + int16_t last_accumulated_total_at_bin[MAX_FREQS*2]; //24 * 2 * sizeof(int16_t) = 96 bytes. + uint8_t current_time; //1 byte + uint8_t placecode[MAX_FREQS]; +*/ +//OK... We don't have enough ram to sum everything... can we do something wacky with multiple octaves to sum everything better? +//i.e. +// +// 4332322132212210 +// +// ++++++++++++++++----------------- +// ++++++++-------- +// ++++----++++---- +// ++--++--++--++-- +// +-+-+-+-+-+-+-+- +// +// Don't forget we need to do this for sin and cos. +// Can we instead of making this plusses, make it a multiplier? +// How can we handle sin+cos? +// +// Is it possible to do this for every frame? I.e. for each of the 24 notes, multiply with their current place in table? +// That's interesting. 
It's not like a sin table. +// There is no "multiply" in the attiny instruction set for attiny85. +// There is, however for attiny402 + +//Question: Can we do five octaves, or does this need to be balanced? +//Question2: Should we weight higher octaves? + + +//ATTiny402: 256x8 RAM, 4096x8 FLASH LPM: 3 cycles + FMUL: 2 cycles << Do stacked sin waves? +//ATtiny85: 512x8 RAM, 8192x8 FLASH LPM: 3 cycles + NO MULTIPLY << Do square waves? + + +/* Approaches: + + on ATtiny402: Stacked sin approach. + Say 16 MHz, though 12 MHz is interesting... + 16k SPS: 1k cycles per; say 24 bins per; 41 cycles per bin = hard. But is it too hard? + 20 cycles per s/c. + read place in stacked table (8? bits) 3 cycles + + //Inner loop = 17 cycles. + read stacked table (8 bits), 3 cycles + fractional multiply table with current value. 2 cycles + read current running for note 2 cycles (LDS = 3 cycles) + subtract a shifted version, to make it into an IIR. (4 cycles) + add in current values. (2 cycles) + store data back to ram (2 cycles) + advance place in stacked table (8?bits) 1 cycle + + store place in stacked table (8? bits) 3 cycles? + + //What if we chunk ADC updates into groups of 4 or 8? + //This is looking barely possible. + + on attiny85: scheduled adds/subtracts (like a stacked-square-wave-table) + //XXX TODO! + +*/ + +/* Ok... Let's think about the ATTiny402. 256x8 RAM + 4096x8 FLASH. + + * We can create a table which has all octaves overlaid. + * We would need to keep track of: + * 12 x 2 x 2 = 48 bytes = Current sin/cos values. + * 12 x 2 = 24 bytes = Current place in table. = 72 bytes + * We would need to store: + * The layered lookup table. If possible, keep @ 256 bytes to simplify math ops. + * The speed by which each note needs to advance. + * We would need to: + * Read current running place. X 8 cycles + * Use that place to look up into sin table. 
3 cycles + * Read running val 4 cycles best case + * Multiply out the sin + IIR 5 cycles + * Store running val 4 cycles best case + * Cos-advance that place to look up into sin table. 4 cycles + * Read running val 4 cycles best case + * Multiply out the sin + IIR 5 cycles + * Store running val 4 cycles best case. + * Read how much to advance X by. 4 cycles + * (Cos^2+Sin^2) 8? + * Store it. 4 cycles best case. + * = 48 x 12 = 576 cycles. Assume 10 MHz @ 16k SPS. We're OK (625 samples) +*/ + +// Observation: The two tables are actually mirror images of each other, well diagonally mirrored. That's odd. But, would take CPU to exploit. + +#define SSTABLESIZE 256 +int8_t spikysin_interleved_cos[SSTABLESIZE*2]; +uint16_t advancespeed[MAX_FREQS]; + +static int CompTableWithPhase( int nelements, float phase, int scaling ) +{ + int highest = 0; + int i; + for( i = 0; i < nelements; i++ ) + { + float taued = i * 3.141592 * 2.0 / nelements; + int o; + float combsin = 0; + for( o = 0; o < OCTAVES; o++ ) + { + combsin += sin( taued * (1< highest ) highest = csadapt; + if( -csadapt > highest ) highest = -csadapt; + + if( csadapt > 127 ) csadapt = 127; + if( csadapt < -127 ) csadapt = -127; //tricky: Keep balanced. + spikysin_interleved_cos[i*2+0] = csadapt; + + float combcos = 0; + for( o = 0; o < OCTAVES; o++ ) + { + combcos += cos( taued * (1< highest ) highest = csadapt; + if( -csadapt > highest ) highest = -csadapt; + + if( csadapt > 127 ) csadapt = 127; + if( csadapt < -127 ) csadapt = -127; //tricky: Keep balanced. + spikysin_interleved_cos[i*2+1] = csadapt; + } + return highest; +} + + +static int Setup( float * frequencies, int bins ) +{ + int i; + + //Since start position/phase is arbitrary, we should try several to see which gives us the best dynamic range. 
+ float tryphase = 0; + + float bestphase = 0; + int highest_val_at_best_phase = 1000000; + + for( tryphase = 0; tryphase < 3.14159; tryphase += 0.001 ) + { + int highest = CompTableWithPhase( SSTABLESIZE, tryphase, 65536 ); + if( highest < highest_val_at_best_phase ) + { + highest_val_at_best_phase = highest; + bestphase = tryphase; + } + } + printf( "Best comp: %f : %d\n", bestphase, highest_val_at_best_phase ); + + CompTableWithPhase( SSTABLESIZE, bestphase, (65536*128)/highest_val_at_best_phase ); + +// for( i = 0; i < SSTABLESIZE; i++ ) +// { +// printf( "%d %d\n", spikysin_interleved_cos[i*2+0], spikysin_interleved_cos[i*2+1] ); +// } + + for( i = 0; i < MAX_FREQS; i++ ) + { + //frequencies[i] = SPS / Freq + // Need to decide how quickly we sweep through the table. + advancespeed[i] = 256.0 /* fixed point */ * 256.0 /* size of table */ / frequencies[i]; + //printf( "%f\n", frequencies[i] ); + } + return 0; +} + + +/* +uint8_t spikysin_interleved_cos[256*2]; +uint16_t advancespeed[MAX_FREQS]; +*/ + +float toutbins[MAX_FREQS]; + +struct notedat +{ + uint16_t time; + int32_t sinm; + int32_t cosm; +}; + +static struct notedat nd[MAX_FREQS]; + +void Turbo8BitRun( int8_t adcval ) +{ + int i; + for( i = 0; i < MAX_FREQS; i++ ) + { + uint16_t ct = nd[i].time; + int32_t muxres; + int32_t running; + int32_t rdesc, rdess; + int8_t ss = spikysin_interleved_cos[(ct>>8) + 0]; + muxres = ((int16_t)adcval * ss) >> 8; + running = nd[i].cosm; + running += muxres; + rdesc = running >> 8; + running -= rdesc>>6; + nd[i].cosm = running; + + int8_t sc = spikysin_interleved_cos[(ct>>8) + 1]; + muxres = ((int16_t)adcval * sc) >> 8; + running = nd[i].sinm; + running += muxres; + rdess = running>>8; + running -= rdess>>6; + nd[i].sinm = running; + + nd[i].time = ct + advancespeed[i]; + toutbins[i] = rdess * rdess + rdesc * rdesc; + //printf( "%d %d = %f %p\n", rdess, rdesc, toutbins[i], &toutbins[i] ); + } +} + + +void DoDFT8BitTurbo( float * outbins, float * frequencies, int bins, 
const float * databuffer, int place_in_data_buffer, int size_of_data_buffer, float q, float speedup ) +{ + static int is_setup; + if( !is_setup ) { is_setup = 1; Setup( frequencies, bins ); } + static int last_place; + int i; + + for( i = last_place; i != place_in_data_buffer; i = (i+1)%size_of_data_buffer ) + { + int16_t ifr1 = (int16_t)( ((databuffer[i]) ) * 4095 ); + //ifr1 += 4095; + Turbo8BitRun( ifr1>>5 ); + } + + for( i = 0; i < bins; i++ ) + { + outbins[i] = 0; + } + for( i = 0; i < MAX_FREQS; i++ ) + { + int iss = nd[i].sinm; + int isc = nd[i].cosm; + int mux = iss * iss + isc * isc; + if( mux == 0 ) mux = 1; + outbins[i+MAX_FREQS] = sqrt(mux)/1000.0; + } + +} + + diff --git a/embeddedcommon/DFT8Turbo.h b/embeddedcommon/DFT8Turbo.h new file mode 100644 index 0000000..257cf89 --- /dev/null +++ b/embeddedcommon/DFT8Turbo.h @@ -0,0 +1,9 @@ +#ifndef _DFT8TURBO_H +#define _DFT8TURBO_H + +/* Note: Frequencies must be precompiled. */ + +void DoDFT8BitTurbo( float * outbins, float * frequencies, int bins, const float * databuffer, int place_in_data_buffer, int size_of_data_buffer, float q, float speedup ); + +#endif +