diff --git a/csdr.c b/csdr.c
index a1330e3..e74fadc 100644
--- a/csdr.c
+++ b/csdr.c
@@ -535,6 +535,56 @@ int main(int argc, char *argv[])
return 0;
}
+
+ if(!strcmp(argv[1],"shift_unroll_cc"))
+ {
+     // Frequency-shifts the complex sample stream using the sin/cos tables built by
+     // shift_unroll_init(). The rate is taken from the control FIFO when init_fifo()
+     // returns a descriptor, otherwise from argv[2].
+     bigbufs=1;
+
+     float starting_phase=0;
+     float rate;
+
+     int fd;
+     if((fd=init_fifo(argc,argv)))
+     {
+         // Poll every 10 ms until read_fifo_ctl() delivers the first rate value.
+         while(!read_fifo_ctl(fd,"%g\n",&rate)) usleep(10000);
+     }
+     else
+     {
+         if(argc<=2) return badsyntax("need required parameter (rate)");
+         // Refuse unparsable input instead of running with an uninitialized rate.
+         if(sscanf(argv[2],"%g",&rate)!=1) return badsyntax("rate should be a number");
+     }
+
+     if(!sendbufsize(initialize_buffers())) return -2;
+     for(;;)
+     {
+         // (Re)build the lookup tables for the current rate.
+         shift_unroll_data_t data=shift_unroll_init(rate, 1024);
+         fprintf(stderr,"shift_unroll_cc: reinitialized to %g\n",rate);
+         int remain, current_size;
+         float* ibufptr;
+         float* obufptr;
+         for(;;)
+         {
+             FEOF_CHECK;
+             if(!FREAD_C) break;
+             remain=the_bufsize;
+             ibufptr=input_buffer;
+             obufptr=output_buffer;
+             while(remain)
+             {
+                 // shift_unroll_cc() indexes the tables by sample number, so a chunk
+                 // must never be longer than the tables themselves.
+                 current_size=(remain>data.size)?data.size:remain;
+                 starting_phase=shift_unroll_cc((complexf*)ibufptr, (complexf*)obufptr, current_size, &data, starting_phase);
+                 // The buffers are float arrays viewed as complexf: two floats per sample.
+                 ibufptr+=current_size*2;
+                 obufptr+=current_size*2;
+                 remain-=current_size;
+             }
+             FWRITE_C;
+             // A new rate on the control FIFO ends this pass so the tables get rebuilt.
+             if(read_fifo_ctl(fd,"%g\n",&rate)) break;
+             TRY_YIELD;
+         }
+         // Release the tables before the outer loop allocates a fresh pair;
+         // without this every reinitialization leaks both arrays.
+         free(data.dsin);
+         free(data.dcos);
+     }
+     return 0;
+ }
+
#ifdef LIBCSDR_GPL
if(!strcmp(argv[1],"decimating_shift_addition_cc"))
{
diff --git a/grc_tests/test_shift_remote.grc b/grc_tests/test_shift_remote.grc
index 9b4f589..59620bd 100644
--- a/grc_tests/test_shift_remote.grc
+++ b/grc_tests/test_shift_remote.grc
@@ -355,7 +355,7 @@
commandline
- ncat -vv raspberrypi.local 5321
+ ncat -v raspberrypi.local 5321
comment
diff --git a/grc_tests/test_shift_remote.sh b/grc_tests/test_shift_remote.sh
index 14f061d..65c7192 100755
--- a/grc_tests/test_shift_remote.sh
+++ b/grc_tests/test_shift_remote.sh
@@ -2,7 +2,7 @@
# Run this script on a Raspberry Pi 2, while running test_shift_remote.grc on your PC.
# It allows you to debug the NEON-accelerated version of specific DSP algorithms on the target hardware.
TEMPSCRIPT="/tmp/test_shift_remote_exec.sh"
-echo '#!/bin/sh\ncsdr shift_addfast_cc -0' > $TEMPSCRIPT
+echo '#!/bin/sh\ncsdr shift_addfast_cc -0.1' > $TEMPSCRIPT
cat $TEMPSCRIPT
chmod +x $TEMPSCRIPT
ncat -vvl 5321 -e $TEMPSCRIPT
diff --git a/libcsdr.c b/libcsdr.c
index d0c3b2d..df87266 100644
--- a/libcsdr.c
+++ b/libcsdr.c
@@ -264,6 +264,44 @@ float shift_table_cc(complexf* input, complexf* output, int input_size, float ra
}
+/*
+ * Build the sin/cos lookup tables consumed by shift_unroll_cc().
+ *
+ * rate: the phase advances by 2*PI*rate for every table entry.
+ * size: number of entries allocated in each table.
+ *
+ * Entry i holds the sine/cosine of the phase accumulated after i+1 increments,
+ * kept wrapped into [-PI, PI].
+ * Returns the descriptor by value; dsin and dcos are heap arrays that the caller
+ * has to free().
+ * NOTE(review): the malloc() results are not checked before they are written to.
+ */
+shift_unroll_data_t shift_unroll_init(float rate, int size)
+{
+    shift_unroll_data_t output;
+    output.phase_increment=2*rate*PI;
+    output.size = size;
+    output.dsin=(float*)malloc(sizeof(float)*size);
+    output.dcos=(float*)malloc(sizeof(float)*size);
+    float myphase = 0;
+    for(int i=0;i<size;i++)
+    {
+        myphase += output.phase_increment;
+        // Wrap after every step so the argument passed to sin()/cos() stays small.
+        while(myphase>PI) myphase-=2*PI;
+        while(myphase<-PI) myphase+=2*PI;
+        output.dsin[i]=sin(myphase);
+        output.dcos[i]=cos(myphase);
+    }
+    return output;
+}
+
+/*
+ * Rotate input_size complex samples using the tables in d.
+ *
+ * For sample i the rotation's cosine and sine are obtained by combining
+ * cos/sin(starting_phase) with d->dcos[i] / d->dsin[i] through the angle-addition
+ * formulas, so no trigonometric call is made inside the loop.
+ *
+ * input_size must not exceed the length of d->dsin / d->dcos, because the tables
+ * are indexed directly by the sample number.
+ *
+ * Returns starting_phase + input_size * d->phase_increment wrapped into [-PI, PI],
+ * i.e. the value to pass as starting_phase on the next call.
+ */
+float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase)
+{
+    float cos_start=cos(starting_phase);
+    float sin_start=sin(starting_phase);
+    register float cos_val, sin_val;
+    for(int i=0;i<input_size;i++)
+    {
+        // cos(a+b) and sin(a+b) with a = starting phase, b = table phase for sample i
+        cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
+        sin_val = sin_start * d->dcos[i] + cos_start * d->dsin[i];
+        // complex multiplication of the input sample by (cos_val + j*sin_val)
+        iof(output,i)=cos_val*iof(input,i)-sin_val*qof(input,i);
+        qof(output,i)=sin_val*iof(input,i)+cos_val*qof(input,i);
+    }
+    starting_phase+=input_size*d->phase_increment;
+    while(starting_phase>PI) starting_phase-=2*PI;
+    while(starting_phase<-PI) starting_phase+=2*PI;
+    return starting_phase;
+}
shift_addfast_data_t shift_addfast_init(float rate)
{
@@ -283,7 +321,6 @@ shift_addfast_data_t shift_addfast_init(float rate)
float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
{
//input_size should be multiple of 4
- float phase=starting_phase;
float cos_start[4], sin_start[4];
float cos_vals[4], sin_vals[4];
for(int i=0;i<4;i++)
@@ -316,7 +353,7 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
" vld1.32 {" RDSIN "}, [%[pdsin]]\n\t"
" vld1.32 {" RCOSST "}, [%[cos_start]]\n\t"
" vld1.32 {" RSINST "}, [%[sin_start]]\n\t"
- "for_addfast: vld2.32 {" RINPI "-" RINPQ "}, [%[pinput]]!\n\t" //load q0 and q1 directly from the memory address stored in pinput, with interleaving (so that we get the I samples in rinpi and the Q samples in rinpq), also increment the memory address in pinput (hence the "!" mark)
+ "for_addfast: vld2.32 {" RINPI "-" RINPQ "}, [%[pinput]]!\n\t" //load q0 and q1 directly from the memory address stored in pinput, with interleaving (so that we get the I samples in RINPI and the Q samples in RINPQ), also increment the memory address in pinput (hence the "!" mark)
//C version:
//cos_vals[j] = cos_start * d->dcos[j] - sin_start * d->dsin[j];
@@ -330,18 +367,18 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
//C version:
//iof(output,4*i+j)=cos_vals[j]*iof(input,4*i+j)-sin_vals[j]*qof(input,4*i+j);
//qof(output,4*i+j)=sin_vals[j]*iof(input,4*i+j)+cos_vals[j]*qof(input,4*i+j);
- " vmul.f32 " R3(ROUTI, RCOSV, RINPI) //output = cos_vals * input
- " vmls.f32 " R3(ROUTI, RSINV, RINPQ) //output -= sin_vals * input
- " vmul.f32 " R3(ROUTQ, RSINV, RINPI) //sin_vals[i] = sin_start * d->dcos[i]
- " vmla.f32 " R3(ROUTQ, RCOSV, RINPQ) //sin_vals[i] += cos_start * d->dsin[i]
+ " vmul.f32 " R3(ROUTI, RCOSV, RINPI) //output_i = cos_vals * input_i
+ " vmls.f32 " R3(ROUTI, RSINV, RINPQ) //output_i -= sin_vals * input_q
+ " vmul.f32 " R3(ROUTQ, RSINV, RINPI) //output_q = sin_vals * input_i
+ " vmla.f32 " R3(ROUTQ, RCOSV, RINPQ) //output_q += cos_vals * input_q
- " vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]\n\t" //store the outputs in memory
- " add %[poutput],%[poutput],#32\n\t"
- " vdup.32 " RCOSST ", d5[1]\n\t" // cos_start[0-3] = cos_vals[3]
- " vdup.32 " RSINST ", d7[1]\n\t" // sin_start[0-3] = sin_vals[3]
+ " vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]!\n\t" //store the outputs in memory
+ //" add %[poutput],%[poutput],#32\n\t"
+ " vdup.32 " RCOSST ", d9[1]\n\t" // cos_start[0-3] = cos_vals[3]
+ " vdup.32 " RSINST ", d11[1]\n\t" // sin_start[0-3] = sin_vals[3]
- " cmp %[pinput], %[pinput_end]\n\t" //if(pinput == pinput_end)
- " bcc for_addfast\n\t" // then goto for_fdcasm
+ " cmp %[pinput], %[pinput_end]\n\t" //if(pinput < pinput_end)
+ " bcc for_addfast\n\t" // then goto for_addfast
:
[pinput]"+r"(pinput), [poutput]"+r"(poutput) //output operand list -> C variables that we will change from ASM
:
@@ -349,7 +386,10 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
:
"memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", "cc" //clobber list
);
- return phase+input_size*d->phase_increment;
+ starting_phase+=input_size*d->phase_increment;
+ while(starting_phase>PI) starting_phase-=2*PI;
+ while(starting_phase<-PI) starting_phase+=2*PI;
+ return starting_phase;
}
#else
@@ -358,7 +398,6 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
{
//input_size should be multiple of 4
//fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
- float phase=starting_phase;
float cos_start=cos(starting_phase);
float sin_start=sin(starting_phase);
float cos_vals[4], sin_vals[4];
@@ -377,7 +416,10 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
cos_start = cos_vals[3];
sin_start = sin_vals[3];
}
- return phase+input_size*d->phase_increment;
+ starting_phase+=input_size*d->phase_increment;
+ while(starting_phase>PI) starting_phase-=2*PI;
+ while(starting_phase<-PI) starting_phase+=2*PI;
+ return starting_phase;
}
#endif
@@ -422,7 +464,7 @@ q4, q5: accumulator for I branch and Q branch (will be the output)
" vld1.32 {q2}, [%[ptaps]]!\n\t"
" vmla.f32 q4, q0, q2\n\t" //quad_acc_i += quad_input_i * quad_taps_1 //http://stackoverflow.com/questions/3240440/how-to-use-the-multiply-and-accumulate-intrinsics-in-arm-cortex-a8 //http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0489e/CIHEJBIE.html
" vmla.f32 q5, q1, q2\n\t" //quad_acc_q += quad_input_q * quad_taps_1
- " cmp %[ptaps], %[ptaps_end]\n\t" //if(ptaps == ptaps_end)
+ " cmp %[ptaps], %[ptaps_end]\n\t" //if(ptaps < ptaps_end)
" bcc for_fdccasm\n\t" // then goto for_fdcasm
" vst1.32 {q4}, [%[quad_acci]]\n\t" //if the loop is finished, store the two accumulators in memory
" vst1.32 {q5}, [%[quad_accq]]\n\t"
diff --git a/libcsdr.h b/libcsdr.h
index 5ccb370..334ba6f 100644
--- a/libcsdr.h
+++ b/libcsdr.h
@@ -165,8 +165,18 @@ typedef struct shift_addfast_data_s
float phase_increment;
} shift_addfast_data_t;
shift_addfast_data_t shift_addfast_init(float rate);
float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase);
+// Lookup tables and parameters shared between shift_unroll_init() and shift_unroll_cc().
+typedef struct shift_unroll_data_s
+{
+    float* dsin; // sine table, heap-allocated by shift_unroll_init()
+    float* dcos; // cosine table, heap-allocated by shift_unroll_init()
+    float phase_increment; // phase step per sample, set to 2*rate*PI by shift_unroll_init()
+    int size; // number of entries in dsin and dcos
+} shift_unroll_data_t;
+// Rotates input_size samples from input into output; returns the phase to pass as starting_phase on the next call.
+float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase);
+// Allocates and fills the tables for the given rate; the caller is responsible for freeing dsin and dcos.
+shift_unroll_data_t shift_unroll_init(float rate, int size);
int log2n(int x);
int next_pow2(int x);
diff --git a/test200.c b/test200.c
index f233d1a..9feb457 100644
--- a/test200.c
+++ b/test200.c
@@ -62,9 +62,18 @@ int main()
fprintf(stderr,"Starting tests of processing %d samples...\n", T_BUFSIZE*T_N);
+ //shift_math_cc
+ float starting_phase = 0;
+
+ clock_gettime(CLOCK_MONOTONIC_RAW, &start_time);
+ for(int i=0;i