Added NEON optimization for DDC.

Buffer size can now automatically adjust to sampling rate changes between csdr processes.
2015-09-30 13:52:43 +00:00 · 2015-09-30 13:52:43 +00:00 · 9cec3782e2
commit 9cec3782e2
parent 6ee80ea7ce
5 changed files with 582 additions and 109 deletions
--- a/10
+++ b/10
@ -32,7 +32,7 @@ LIBSOURCES =  fft_fftw.c libcsdr_wrapper.c
 #SOURCES = csdr.c $(LIBSOURCES)
 cpufeature = $(if $(findstring $(1),$(shell cat /proc/cpuinfo)),$(2))
 PARAMS_SSE = $(call cpufeature,sse,-msse) $(call cpufeature,sse2,-msse2) $(call cpufeature,sse3,-msse3) $(call cpufeature,sse4,-msse4) $(call cpufeature,sse4_1,-msse4.1) $(call cpufeature,sse4_2,-msse4.2) -mfpmath=sse 
-PARAMS_NEON = -mfloat-abi=hard -march=armv7-a -mtune=cortex-a8 -mfpu=neon -mvectorize-with-neon-quad -funsafe-math-optimizations -Wformat=0
+PARAMS_NEON = -mfloat-abi=hard -march=armv7-a -mtune=cortex-a8 -mfpu=neon -mvectorize-with-neon-quad -funsafe-math-optimizations -Wformat=0 -DNEON_OPTS
 #tnx Jan Szumiec for the Raspberry Pi support
 PARAMS_RASPI = -mfloat-abi=hard -mcpu=arm1176jzf-s -mfpu=vfp -funsafe-math-optimizations -Wformat=0
 PARAMS_ARM = $(if $(call cpufeature,BCM2708,dummy-text),$(PARAMS_RASPI),$(PARAMS_NEON))
@ -47,12 +47,12 @@ all: clean-vect
 	@echo NOTE: you may have to manually edit Makefile to optimize for your CPU \(especially if you compile on ARM, please edit PARAMS_NEON\).
 	@echo Auto-detected optimization parameters: $(PARAMS_SIMD)
 	@echo
-	c99 $(PARAMS_LOOPVECT) $(PARAMS_SIMD) $(LIBSOURCES) $(PARAMS_LIBS) $(PARAMS_MISC) -fpic -shared -o libcsdr.so
+	gcc -std=gnu99 $(PARAMS_LOOPVECT) $(PARAMS_SIMD) $(LIBSOURCES) $(PARAMS_LIBS) $(PARAMS_MISC) -fpic -shared -o libcsdr.so
 	-./parsevect dumpvect*.vect
-	c99 $(PARAMS_LOOPVECT) $(PARAMS_SIMD) csdr.c $(PARAMS_LIBS) -L. -lcsdr $(PARAMS_MISC) -o csdr
+	gcc -std=gnu99 $(PARAMS_LOOPVECT) $(PARAMS_SIMD) csdr.c $(PARAMS_LIBS) -L. -lcsdr $(PARAMS_MISC) -o csdr
 arm-cross: clean-vect
 	#note: this doesn't work since having added FFTW
-	arm-linux-gnueabihf-gcc -std=c99 -O3 -fshort-double -ffast-math -dumpbase dumpvect-arm -fdump-tree-vect-details -mfloat-abi=softfp -march=armv7-a -mtune=cortex-a9 -mfpu=neon -mvectorize-with-neon-quad -Wno-unused-result -Wformat=0 $(SOURCES) -lm -o ./csdr
+	arm-linux-gnueabihf-gcc -std=gnu99 -O3 -fshort-double -ffast-math -dumpbase dumpvect-arm -fdump-tree-vect-details -mfloat-abi=softfp -march=armv7-a -mtune=cortex-a9 -mfpu=neon -mvectorize-with-neon-quad -Wno-unused-result -Wformat=0 $(SOURCES) -lm -o ./csdr
 clean-vect:
 	rm -f dumpvect*.vect
 clean: clean-vect
@ -65,6 +65,8 @@ install:
 uninstall:
 	rm /usr/lib/libcsdr.so /usr/bin/csdr /usr/bin/csdr-fm
 	ldconfig
+disasm:
+	objdump -S libcsdr.so > libcsdr.disasm
 emcc-clean:
 	-rm sdr.js/sdr.js
 	-rm sdr.js/sdrjs-compiled.js
--- a/csdr.c
+++ b/csdr.c
--- a/libcsdr.c
+++ b/libcsdr.c
@ -263,6 +263,75 @@ float shift_table_cc(complexf* input, complexf* output, int input_size, float ra
 	return phase;
 }

+#ifdef NEON_OPTS
+#pragma message "We have a faster fir_decimate_cc now."
+
+//max help: http://community.arm.com/groups/android-community/blog/2015/03/27/arm-neon-programming-quick-reference
+
+int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length)
+{
+	//Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation
+	//It uses real taps. It returns the number of output samples actually written.
+	//It needs overlapping input based on its returned value:
+	//number of processed input samples = returned value * decimation factor
+	//The output buffer should be at least input_length / 3.
+	// i: input index | ti: tap index | oi: output index
+	int oi=0;
+	for(int i=0; i<input_size; i+=decimation) //@fir_decimate_cc: outer loop
+	{
+		if(i+taps_length>input_size) break;
+		register float acci=0; 
+		register float accq=0;		
+
+		register int ti=0;
+		register float* pinput=(float*)&(input[i+ti]);
+		register float* ptaps=taps;
+		register float* ptaps_end=taps+taps_length;
+		float quad_acciq [8];
+
+	
+/*
+q0, q1:	input signal I sample and Q sample
+q2:		taps 
+q4, q5: accumulator for I branch and Q branch (will be the output)
+*/
+
+		//fprintf(stderr, "macska\n");
+
+		asm volatile(
+			//"		vorr.f32 q4, #0\n\t" //null the accumulators
+			//"		vorr.f32 q5, #0\n\t"
+			"		vmov.f32 q4, #0.0\n\t" //another way to null the accumulators
+			"		vmov.f32 q5, #0.0\n\t"
+			"for_fdccasm: vld2.32	{q0-q1}, [%[pinput]]!\n\t" //load q0 and q1 directly from the memory address stored in pinput, with interleaving (so that we get the I samples in q0 and the Q samples in q1), also increment the memory address in pinput (hence the "!" mark) //http://community.arm.com/groups/processors/blog/2010/03/17/coding-for-neon--part-1-load-and-stores
+			"		vld1.32	{q2}, [%[ptaps]]!\n\t" 
+			"		vmla.f32 q4, q0, q2\n\t" //quad_acc_i += quad_input_i * quad_taps_1 //http://stackoverflow.com/questions/3240440/how-to-use-the-multiply-and-accumulate-intrinsics-in-arm-cortex-a8 //http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0489e/CIHEJBIE.html
+			"		vmla.f32 q5, q1, q2\n\t" //quad_acc_q += quad_input_q * quad_taps_1
+			"		cmp %[ptaps], %[ptaps_end]\n\t" //if(ptaps == ptaps_end)
+			"		bcc for_fdccasm\n\t"			//	then goto for_fdcasm
+			"		vst1.32 {q4}, [%[quad_acci]]\n\t" //if the loop is finished, store the two accumulators in memory
+			"		vst1.32 {q5}, [%[quad_accq]]\n\t"
+		:
+			[pinput]"+r"(pinput), [ptaps]"+r"(ptaps) //output operand list
+		:
+			[ptaps_end]"r"(ptaps_end), [quad_acci]"r"(quad_acciq), [quad_accq]"r"(quad_acciq+4) //input operand list
+		: 
+			"memory", "q0", "q1", "q2", "q4", "q5", "cc" //clobber list
+		);
+		//original for loops for reference:
+		//for(int ti=0; ti<taps_length; ti++) acci += (iof(input,i+ti)) * taps[ti]; //@fir_decimate_cc: i loop		
+		//for(int ti=0; ti<taps_length; ti++) accq += (qof(input,i+ti)) * taps[ti]; //@fir_decimate_cc: q loop
+		
+		//for(int n=0;n<8;n++) fprintf(stderr, "\n>> [%d] %g \n", n, quad_acciq[n]);
+		iof(output,oi)=quad_acciq[0]+quad_acciq[1]+quad_acciq[2]+quad_acciq[3]; //we're still not ready, as we have to add up the contents of a quad accumulator register to get a single accumulated value
+		qof(output,oi)=quad_acciq[4]+quad_acciq[5]+quad_acciq[6]+quad_acciq[7];
+		oi++;
+	}
+	return oi;
+}
+
+#else
+
 int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length)
 {
 	//Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation
@ -286,6 +355,34 @@ int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decim
 	return oi;
 }

+#endif
+
+/*
+int fir_decimate_cc(complexf *input, complexf *output, int input_size, int decimation, float *taps, int taps_length)
+{
+	//Theory: http://www.dspguru.com/dsp/faqs/multirate/decimation
+	//It uses real taps. It returns the number of output samples actually written.
+	//It needs overlapping input based on its returned value:
+	//number of processed input samples = returned value * decimation factor
+	//The output buffer should be at least input_length / 3.
+	// i: input index | ti: tap index | oi: output index
+	int oi=0;
+	for(int i=0; i<input_size; i+=decimation) //@fir_decimate_cc: outer loop
+	{
+		if(i+taps_length>input_size) break;
+		float acci=0;
+		int taps_halflength = taps_length/2;
+		for(int ti=0; ti<taps_halflength; ti++) acci += (iof(input,i+ti)+iof(input,i+taps_length-ti)) * taps[ti]; //@fir_decimate_cc: i loop
+		float accq=0;
+		for(int ti=0; ti<taps_halflength; ti++) accq += (qof(input,i+ti)+qof(input,i+taps_length-ti)) * taps[ti]; //@fir_decimate_cc: q loop
+		iof(output,oi)=acci+iof(input,i+taps_halflength)*taps[taps_halflength];
+		qof(output,oi)=accq+qof(input,i+taps_halflength)*taps[taps_halflength];
+		oi++;
+	}
+	return oi;
+}
+*/
+
 rational_resampler_ff_t rational_resampler_ff(float *input, float *output, int input_size, int interpolation, int decimation, float *taps, int taps_length, int last_taps_delay)
 {
 	
@ -524,7 +621,8 @@ float fastdcblock_ff(float* input, float* output, int input_size, float last_dc_
 	return avg;
 }

-#define FASTAGC_MAX_GAIN (65e3)
+//#define FASTAGC_MAX_GAIN (65e3)
+#define FASTAGC_MAX_GAIN 50

 void fastagc_ff(fastagc_ff_t* input, float* output)
 {
@ -553,6 +651,7 @@ void fastagc_ff(fastagc_ff_t* input, float* output)
 	//we change the gain linearly on the apply_block from the last_gain to target_gain.
 	float target_gain=input->reference/target_peak;
 	if(target_gain>FASTAGC_MAX_GAIN) target_gain=FASTAGC_MAX_GAIN;
+	//fprintf(stderr, "target_gain: %g\n",target_gain);

 	for(int i=0;i<input->input_size;i++) //@fastagc_ff: apply gain
 	{
@ -572,7 +671,6 @@ void fastagc_ff(fastagc_ff_t* input, float* output)
 	//fprintf(stderr,"target_gain=%g\n", target_gain);
 }

-
 /*
  ______ __  __        _                          _       _       _                 
 |  ____|  \/  |      | |                        | |     | |     | |                
--- a/libcsdr_gpl.c
+++ b/libcsdr_gpl.c
@ -189,12 +189,14 @@ float agc_ff(float* input, float* output, int input_size, float reference, float
 			{
 				if(last_peak<input_abs) 
 				{
+					
 					attack_wait_counter=attack_wait_time;
 					last_peak=input_abs;
 				}
 				if(attack_wait_counter>0)
 				{
 					attack_wait_counter--;
+					//fprintf(stderr,"A");
 					dgain=0;
 				}
 				else
@ -203,6 +205,7 @@ float agc_ff(float* input, float* output, int input_size, float reference, float
 					dgain=error*attack_rate; 
 					//Before starting to increase the gain next time, we will be waiting until hang_time for sure.
 					hang_counter=hang_time;
+					
 				}
 			}
 			else //DECREASE IN SIGNAL LEVEL
@ -222,7 +225,7 @@ float agc_ff(float* input, float* output, int input_size, float reference, float
 		}
 		//output[i]=gain*input[i]; //Here we do the actual scaling of the samples.
 		//Here we do the actual scaling of the samples, but we run an IIR filter on the gain values:
-		output[i]=(gain+last_gain-gain_filter_alpha*last_gain)*input[i]; //dc-pass-filter: freqz([1 -1],[1 -0.99]) y[i]=x[i]+y[i-1]-alpha*x[i-1]
+		output[i]=(gain=gain+last_gain-gain_filter_alpha*last_gain)*input[i]; //dc-pass-filter: freqz([1 -1],[1 -0.99]) y[i]=x[i]+y[i-1]-alpha*x[i-1]
 		//output[i]=input[i]*(last_gain+gain_filter_alpha*(gain-last_gain)); //LPF

 		last_gain=gain;
--- a/sdr.js/sdrjs-test.html
+++ b/sdr.js/sdrjs-test.html
@ -0,0 +1,38 @@
+<!doctype html>
+<!--
+This software is part of libcsdr, a set of simple DSP routines for 
+Software Defined Radio.
+
+Copyright (c) 2014, Andras Retzler <randras@sdr.hu>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holder nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANDRAS RETZLER BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-->
+<html lang="en-us">
+  <head>
+	<script type="text/javascript" src="sdr.js"></script>
+  </head>
+  <body style="font-family: monospace">
+    <script type="text/javascript">test_firdes_lowpass_f_new();</script>
+  </body>
+</html>