First inroads to turbo8

2019-03-28 06:29:48 -04:00 · 2019-03-28 06:29:48 -04:00 · b9dc46c701
commit b9dc46c701
parent 8e628ab602
7 changed files with 384 additions and 6 deletions
--- a/colorchord2/Makefile
+++ b/colorchord2/Makefile
@ -17,7 +17,7 @@ LDLIBS:=-lpthread -lasound -lm -lpulse-simple -lpulse -ludev -lrt
 CFLAGS:=-g -O0 -flto -Wall -ffast-math -I../embeddedcommon -I. -DICACHE_FLASH_ATTR=
 EXTRALIBS:=-lusb-1.0

-colorchord : os_generic.o main.o  dft.o decompose.o filter.o color.o notefinder.o util.o outdrivers.o $(RAWDRAW) $(SOUND) $(OUTS) parameters.o chash.o hook.o ../embeddedcommon/DFT32.o configs.o
+colorchord : os_generic.o main.o  dft.o decompose.o filter.o color.o notefinder.o util.o outdrivers.o $(RAWDRAW) $(SOUND) $(OUTS) parameters.o chash.o hook.o ../embeddedcommon/DFT32.o configs.o ../embeddedcommon/DFT8Turbo.o
 	gcc -o $@ $^ $(CFLAGS) $(LDLIBS) $(EXTRALIBS) $(RAWDRAWLIBS)


--- a/colorchord2/default.conf
+++ b/colorchord2/default.conf
@ -58,8 +58,9 @@ octaves = 5
 # 1 = DFT Progressive
 # 2 = DFT Progressive Integer	
 # 3 = DFT Progressive Integer Skippy
-# 4 = Integer, 32-Bit, Progressive, Skippy.
-do_progressive_dft = 4
+# 4 = Integer, 32-Bit, Progressive, Skippy. (wow, this actually works)
+# 5 = 8-bit turbo test.
+do_progressive_dft = 5


 filter_iter = 2
--- a/colorchord2/notefinder.c
+++ b/colorchord2/notefinder.c
@ -11,6 +11,7 @@
 #include "filter.h"
 #include "decompose.h"
 #include "DFT32.h"
+#include "DFT8Turbo.h"

 struct NoteFinder * CreateNoteFinder( int spsRec )
 {
@ -199,6 +200,9 @@ void RunNoteFinder( struct NoteFinder * nf, const float * audio_stream, int head
 	case 4:
 		DoDFTProgressive32( dftbins, nf->frequencies, freqs, audio_stream, head, buffersize, nf->dft_q, nf->dft_speedup );
 		break;
+	case 5:
+		DoDFT8BitTurbo( dftbins, nf->frequencies, freqs, audio_stream, head, buffersize, nf->dft_q, nf->dft_speedup );
+		break;
 	default:
 		fprintf( stderr, "Error: No DFT Seleced\n" );
 	}
--- a/colorchord2/turbo8bit.conf
+++ b/colorchord2/turbo8bit.conf
@ -0,0 +1,103 @@
+# This is the configuration file for colorchord.
+# Most values are already defaulted in the software.
+# This file is constantly checked for new versions.
+# \r, and ; are used as terminators, so you can put
+# multiple entries on the same line.
+
+#Whether to limit the control loop to ~60ish FPS.
+cpu_autolimit = 1
+
+#General GUI properties.
+title = PA Test
+set_screenx = 720
+set_screeny = 480
+
+#Sound properties.
+buffer = 384
+play = 0
+rec = 1
+channels = 2
+samplerate = 16000
+wininput = -1
+
+#Compiled version will default this.
+#sound_source = ALSA
+#-1 indicates left and right, 0 left, 1 right.
+
+sample_channel = -1
+sourcename = default
+#alsa_output.pci-0000_00_1f.3.analog-stereo.monitor
+#default
+# alsa_output.pci-0000_00_1b.0.analog-stereo.monitor
+#alsa_output.pci-0000_00_1f.3.analog-stereo.monitor << New laptop
+#use pactl list | grep pci- | grep monitor
+
+##################################
+# General ColorChord properties. #
+##################################
+
+# How much to amplify the incoming signal.
+amplify = 2.0
+
+# What is the base note?  I.e. the lowest note. 
+# Note that it won't have very much impact until an octave up though!
+base_hz = 110
+
+# This is only used when dealing with the slow decompose (now defunct)
+# decompose_iterations = 1000
+# default_sigma = 1.4000
+
+# DFT properties for the DFT up top.
+dft_iir = 0.6
+dft_q = 20.0000
+dft_speedup = 1000.0000
+octaves = 5
+
+# Should we use a progressive DFT?
+# 0 = DFT Quick
+# 1 = DFT Progressive
+# 2 = DFT Progressive Integer	
+# 3 = DFT Progressive Integer Skippy
+# 4 = Integer, 32-Bit, Progressive, Skippy. (wow, this actually works)
+# 5 = 8-bit turbo test.
+do_progressive_dft = 5
+
+
+filter_iter = 2
+filter_strength = .5
+
+# How many bins per octave to use?
+freqbins = 24
+
+# For the final note information... How much to slack everything?
+note_attach_amp_iir = 0.3500
+note_attach_amp_iir2 = 0.250
+note_attach_freq_iir = 0.3000
+
+#How many bins a note can jump from frame to frame to be considered a slide.
+#this is used to prevent notes from popping in and out a lot.
+note_combine_distance = 0.5000
+note_jumpability = 1.8000
+note_minimum_new_distribution_value = 0.0200
+note_out_chop = 0.05000
+
+#compress_coefficient = 4.0
+#compress_exponent = .5
+
+
+#=======================================================================
+#Outputs
+
+
+This is a Voronoi thing:
+outdrivers = OutputVoronoi, DisplayArray
+lightx = 64
+lighty = 32
+fromsides = 1
+shape_cutoff = 0.03
+satamp = 5.000
+amppow = 2.510
+distpow = 1.500
+
+
+
--- a/embeddedcommon/DFT32.c
+++ b/embeddedcommon/DFT32.c
@ -353,6 +353,3 @@ void DoDFTProgressive32( float * outbins, float * frequencies, int bins, const f

 #endif

-
-
-
--- a/embeddedcommon/DFT8Turbo.c
+++ b/embeddedcommon/DFT8Turbo.c
@ -0,0 +1,264 @@
+#include <stdint.h>
+#include <stdlib.h>
+#include "DFT8Turbo.h"
+#include <math.h>
+
+#include <stdio.h>
+
+#define MAX_FREQS (24)
+#define OCTAVES   (5)
+
+
+/*
+	* The first thought was using an integration map and only operating when we need to, to pull the data out.
+	* Now we're doing the thing below this block comment
+		int16_t accumulated_total;							//2 bytes
+		int16_t last_accumulated_total_at_bin[MAX_FREQS*2];  //24 * 2 * sizeof(int16_t) = 96 bytes.
+		uint8_t current_time;								//1 byte
+		uint8_t placecode[MAX_FREQS];
+*/
+//OK... We don't have enough RAM to sum everything... can we do something wacky with multiple octaves to sum everything better?
+//i.e.
+//
+// 4332322132212210
+//
+// ++++++++++++++++-----------------
+// ++++++++--------
+// ++++----++++----
+// ++--++--++--++--
+// +-+-+-+-+-+-+-+-
+//
+// Don't forget we need to do this for sin and cos.
+// Can we instead of making this plusses, make it a multiplier?
+// How can we handle sin+cos?
+//
+// Is it possible to do this for every frame?  I.e. for each of the 24 notes, multiply with their current place in table?
+//  That's interesting.  It's not like a sin table.
+// There is no "multiply" in the attiny instruction set for attiny85.
+// There is, however for attiny402
+
+//Question:  Can we do five octaves, or does this need to be balanced?
+//Question2: Should we weight higher octaves?
+
+
+//ATTiny402: 256x8 RAM, 4096x8 FLASH  LPM: 3 cycles + FMUL: 2 cycles  << Do stacked sin waves?
+//ATtiny85:  512x8 RAM, 8192x8 FLASH  LPM: 3 cycles + NO MULTIPLY     << Do square waves?
+
+
+/* Approaches:
+
+  on ATtiny402:  Stacked sin approach.
+   Say 16 MHz, though 12 MHz is interesting...
+   16k SPS: 1k cycles per; say 24 bins per; 41 cycles per bin = hard.  But is it too hard?
+   20 cycles per s/c.
+		read place in stacked table (8? bits) 3 cycles
+
+		//Inner loop = 17 cycles.
+		read stacked table (8 bits), 3 cycles
+		fractional multiply table with current value. 2 cycles
+		read current running for note 2 cycles  (LDS = 3 cycles)
+		subtract a shifted version, to make it into an IIR. (4 cycles)
+		add in current values. (2 cycles)
+		store data back to ram (2 cycles)
+		advance place in stacked table (8?bits) 1 cycle
+
+		store place in stacked table (8? bits) 3 cycles?
+
+	//What if we chunk ADC updates into groups of 4 or 8?
+	//This is looking barely possible.
+
+	on attiny85: scheduled adds/subtracts (like a stacked-square-wave-table)
+		//XXX TODO!
+
+*/
+
+/* Ok... Let's think about the ATTiny402.  256x8 RAM + 4096x8 FLASH.
+
+	* We can create a table which has all octaves overlaid.
+	* We would need to keep track of:
+		* 12 x 2 x 2 = 48 bytes = Current sin/cos values.
+		* 12 x 2 = 24 bytes = Current place in table.  = 72 bytes
+	* We would need to store:
+		* The layered lookup table.  If possible, keep @ 256 bytes to simplify math ops.
+		* The speed by which each note needs to advance.
+	* We would need to:
+		* Read current running place. X                8 cycles
+		* Use that place to look up into sin table.    3 cycles
+		* Read running val  4 cycles best case
+		* Multiply out the sin + IIR                   5 cycles
+		* Store running val 4 cycles best case
+		* Cos-advance that place to look up into sin table.    4 cycles
+		* Read running val 4 cycles best case
+		* Multiply out the sin + IIR                   5 cycles
+		* Store running val 4 cycles best case.
+		* Read how much to advance X by.               4 cycles
+        * (Cos^2+Sin^2)                                8?
+		* Store it.                                    4 cycles best case.
+        *                                                  = 48 x 12 = 576 cycles.  Assume 10 MHz @ 16k SPS.  We're OK (625 cycles available per sample)
+*/
+
+// Observation: The two tables are actually mirror images of each other, well diagonally mirrored.  That's odd.  But, would take CPU to exploit.
+
+#define SSTABLESIZE 256	//Number of rows in the combined lookup table; Setup() always builds the table with this many rows.
+int8_t  spikysin_interleved_cos[SSTABLESIZE*2];	//Two entries per row: [row*2+0] is written from the octave-summed sin, [row*2+1] from the octave-summed cos (see CompTableWithPhase).
+uint16_t advancespeed[MAX_FREQS];	//Per-bin increment added to the 16-bit running phase for every sample; filled in by Setup().
+
+/*
+ * Fill spikysin_interleved_cos with one period of the octave-stacked waveform.
+ *
+ * For every table row the sin (and, separately, the cos) of OCTAVES successive
+ * octaves is summed, averaged, multiplied by `scaling` and truncated to an int.
+ * The result is clamped to +/-127 and stored at [row*2+0] (sin) and
+ * [row*2+1] (cos).
+ *
+ *   nelements - number of rows to generate (the table must hold 2*nelements entries).
+ *   phase     - phase offset, in radians, added to every octave.
+ *   scaling   - multiplier applied to the averaged value before truncation.
+ *
+ * Returns the largest absolute value seen *before* clamping, so a caller can
+ * probe with a large scaling and work out how much headroom a phase leaves.
+ */
+static int CompTableWithPhase( int nelements, float phase, int scaling )
+{
+	int peak = 0;
+	int row;
+	for( row = 0; row < nelements; row++ )
+	{
+		float angle = row * 3.141592 * 2.0 / nelements;
+		float sumsin = 0;
+		float sumcos = 0;
+		int oct;
+
+		//The two sums are independent, so they can share one pass over the octaves.
+		for( oct = 0; oct < OCTAVES; oct++ )
+		{
+			sumsin += sin( angle * (1<<oct) + phase );
+			sumcos += cos( angle * (1<<oct) + phase );
+		}
+		sumsin /= OCTAVES;
+		sumcos /= OCTAVES;
+
+		//slot 0 = sin entry, slot 1 = cos entry of this row.
+		int slot;
+		for( slot = 0; slot < 2; slot++ )
+		{
+			//XXX TODO Lookout: the choice of scaling by the caller assumes five octaves.  If OCTAVES changes, revisit it.
+			int quantized = ( slot ? sumcos : sumsin ) * scaling;
+			int magnitude = ( quantized < 0 ) ? -quantized : quantized;
+
+			if( magnitude > peak ) peak = magnitude;
+
+			//tricky: clamp symmetrically (never -128) to keep the table balanced.
+			if( quantized > 127 )
+			{
+				quantized = 127;
+			}
+			else if( quantized < -127 )
+			{
+				quantized = -127;
+			}
+			spikysin_interleved_cos[row*2+slot] = quantized;
+		}
+	}
+	return peak;
+}
+
+
+/*
+ * One-time initialisation of the lookup table and the per-bin advance speeds.
+ *
+ * 1. Sweeps the start phase from 0 to pi in 0.001 rad steps, building the
+ *    table each time with a probe scaling of 65536, and keeps the phase whose
+ *    peak absolute value is smallest (best dynamic range once rescaled).
+ * 2. Rebuilds the table at that phase, scaled so the peak lands near 128
+ *    (CompTableWithPhase clamps to +/-127).
+ * 3. Derives advancespeed[] from frequencies[].
+ *
+ *   frequencies - per-bin period; per the note below this is SPS / Freq.
+ *   bins        - number of valid entries in frequencies[].  At most MAX_FREQS
+ *                 are consumed; if fewer are supplied the remaining
+ *                 advancespeed[] entries are left untouched.
+ *
+ * Always returns 0.
+ */
+static int Setup( float * frequencies, int bins )
+{
+	int i;
+
+	//Never read past what the caller actually handed us.
+	int usable = ( bins < MAX_FREQS ) ? bins : MAX_FREQS;
+
+	//Since start position/phase is arbitrary, we should try several to see which gives us the best dynamic range.
+	float tryphase = 0;
+
+	float bestphase = 0;
+	int highest_val_at_best_phase = 1000000;
+
+	for( tryphase = 0; tryphase < 3.14159; tryphase += 0.001 )
+	{
+		int highest = CompTableWithPhase( SSTABLESIZE, tryphase, 65536 );
+		if( highest < highest_val_at_best_phase )
+		{
+			highest_val_at_best_phase = highest;
+			bestphase = tryphase;
+		}
+	}
+	printf( "Best comp: %f : %d\n", bestphase, highest_val_at_best_phase );
+
+	CompTableWithPhase( SSTABLESIZE, bestphase, (65536*128)/highest_val_at_best_phase );
+
+	for( i = 0; i < usable; i++ )
+	{
+		//frequencies[i] = SPS / Freq
+		// Need to decide how quickly we sweep through the table.
+		double speed = 256.0 /* fixed point */ * 256.0 /* size of table */ / frequencies[i];
+
+		//Converting an out-of-range double to uint16_t is undefined, so saturate first.
+		//The negated comparison also catches NaN.
+		if( !( speed > 0 ) ) speed = 0;
+		if( speed > UINT16_MAX ) speed = UINT16_MAX;
+
+		advancespeed[i] = (uint16_t)speed;
+	}
+	return 0;
+}
+
+
+/*
+uint8_t  spikysin_interleved_cos[256*2];
+uint16_t advancespeed[MAX_FREQS];
+*/
+
+float toutbins[MAX_FREQS];	//Written by Turbo8BitRun() on every sample with the squared magnitude of each bin's scaled-down accumulators.
+
+struct notedat	//Running state for one note bin.
+{
+	uint16_t time;	//16-bit running phase; Turbo8BitRun() uses the upper 8 bits as the table position and adds advancespeed[] every sample.
+	int32_t sinm;	//Leaky accumulator fed by the product of the sample and one table entry (see Turbo8BitRun).
+	int32_t cosm;	//Leaky accumulator fed by the product of the sample and the other table entry.
+};
+
+static struct notedat nd[MAX_FREQS];	//One state record per bin.
+
+/*
+ * Feed one signed 8-bit sample through every note bin.
+ *
+ * For each of the MAX_FREQS bins:
+ *   - the upper 8 bits of the 16-bit phase nd[i].time select a table row;
+ *   - the sample is multiplied by that row's two entries (product >> 8);
+ *   - each product is added to a running accumulator, from which
+ *     (accumulator >> 8) >> 6 is subtracted, making it a leaky integrator;
+ *   - the phase advances by advancespeed[i], wrapping at 16 bits;
+ *   - toutbins[i] receives the sum of squares of the two (accumulator >> 8) values.
+ *
+ * Naming note: the row's first entry (generated from sin) feeds .cosm and the
+ * second (generated from cos) feeds .sinm.  Only the sum of squares is ever
+ * consumed, so the swap does not change the result.
+ */
+void Turbo8BitRun( int8_t adcval )
+{
+	int i;
+	for( i = 0; i < MAX_FREQS; i++ )
+	{
+		uint16_t ct = nd[i].time;
+		int32_t muxres;
+		int32_t running;
+		int32_t rdesc, rdess;
+
+		//The table stores two entries per row ([row*2+0], [row*2+1]), so the row
+		//number must be doubled.  Indexing with the bare row number would only
+		//ever touch the first half of the table and mix sin and cos entries.
+		uint16_t rowbase = (uint16_t)( ( ct >> 8 ) * 2 );
+
+		int8_t  ss = spikysin_interleved_cos[rowbase + 0];
+		muxres = ((int16_t)adcval * ss) >> 8;
+		running = nd[i].cosm;
+		running += muxres;
+		rdesc = running >> 8;
+		running -= rdesc>>6;
+		nd[i].cosm = running;
+
+		int8_t  sc = spikysin_interleved_cos[rowbase + 1];
+		muxres = ((int16_t)adcval * sc) >> 8;
+		running = nd[i].sinm;
+		running += muxres;
+		rdess = running>>8;
+		running -= rdess>>6;
+		nd[i].sinm = running;
+
+		nd[i].time = ct + advancespeed[i];
+		toutbins[i] = rdess * rdess + rdesc * rdesc;
+	}
+}
+
+
+/*
+ * Run the 8-bit turbo DFT over the samples that arrived since the previous
+ * call and publish the per-bin magnitudes.
+ *
+ *   outbins              - output array of `bins` floats.  All entries are cleared,
+ *                          then the MAX_FREQS magnitudes are stored starting at
+ *                          outbins[MAX_FREQS] (as many as fit below `bins`).
+ *   frequencies          - per-bin periods, consumed once by Setup() on the first call.
+ *   bins                 - number of entries in outbins / frequencies.
+ *   databuffer           - circular buffer of float samples.
+ *   place_in_data_buffer - index one past the newest sample; must lie in
+ *                          [0, size_of_data_buffer) or the sample loop cannot terminate.
+ *   size_of_data_buffer  - number of entries in databuffer.
+ *   q, speedup           - unused by this implementation.
+ *
+ * Samples are scaled by 4095 and reduced to 8 bits (>>5) before being handed
+ * to Turbo8BitRun().
+ */
+void DoDFT8BitTurbo( float * outbins, float * frequencies, int bins, const float * databuffer, int place_in_data_buffer, int size_of_data_buffer, float q, float speedup )
+{
+	static int is_setup;
+	static int last_place;
+	int i;
+
+	if( !is_setup ) { is_setup = 1; Setup( frequencies, bins ); }
+
+	//Guard the modulo below against an empty buffer.
+	if( size_of_data_buffer > 0 )
+	{
+		for( i = last_place; i != place_in_data_buffer; i = (i+1)%size_of_data_buffer )
+		{
+			float scaled = databuffer[i] * 4095;
+
+			//Saturate instead of wrapping: anything outside this window would not
+			//fit in int8_t after the >>5 and would flip sign when narrowed.
+			if( scaled > 4095.0f ) scaled = 4095.0f;
+			if( scaled < -4096.0f ) scaled = -4096.0f;
+
+			int16_t ifr1 = (int16_t)scaled;
+			Turbo8BitRun( ifr1>>5 );
+		}
+
+		//Remember where we stopped so the next call only consumes new samples.
+		last_place = place_in_data_buffer;
+	}
+
+	for( i = 0; i < bins; i++ )
+	{
+		outbins[i] = 0;
+	}
+
+	//Results go into the second block of MAX_FREQS entries; stop early if outbins is too short.
+	for( i = 0; i < MAX_FREQS && i + MAX_FREQS < bins; i++ )
+	{
+		//The accumulators are 32-bit, so their squares need 64 bits to avoid signed overflow.
+		int64_t iss = nd[i].sinm;
+		int64_t isc = nd[i].cosm;
+		int64_t mux = iss * iss + isc * isc;
+		if( mux == 0 ) mux = 1;
+		outbins[i+MAX_FREQS] = sqrt( (double)mux )/1000.0;
+	}
+
+}
+
+
--- a/embeddedcommon/DFT8Turbo.h
+++ b/embeddedcommon/DFT8Turbo.h
@ -0,0 +1,9 @@
+#ifndef _DFT8TURBO_H
+#define _DFT8TURBO_H
+
+/* Note: Frequencies must be precompiled.
+ *
+ * DoDFT8BitTurbo - experimental 8-bit DFT back end.
+ *
+ *   outbins              - output array of `bins` floats.  Entries [0, bins) are cleared
+ *                          on every call; the implementation in DFT8Turbo.c then stores
+ *                          its magnitudes starting at outbins[24] (its MAX_FREQS).
+ *   frequencies          - per-bin values, read only on the first call to derive how fast
+ *                          each bin steps through the lookup table.  A comment in
+ *                          DFT8Turbo.c describes them as SPS / Freq.
+ *   bins                 - number of entries in outbins.
+ *   databuffer           - circular buffer of float samples.
+ *   place_in_data_buffer - index at which processing stops (the buffer head).
+ *   size_of_data_buffer  - number of entries in databuffer; used to wrap the read index.
+ *   q, speedup           - not referenced by the implementation in DFT8Turbo.c.
+ */
+
+void DoDFT8BitTurbo( float * outbins, float * frequencies, int bins, const float * databuffer, int place_in_data_buffer, int size_of_data_buffer, float q, float speedup );
+
+#endif
+
				`@ -353,6 +353,3 @@ void DoDFTProgressive32( float * outbins, float * frequencies, int bins, const f`

				`#endif`