diff --git a/csdr.c b/csdr.c
index a1330e3..e74fadc 100644
--- a/csdr.c
+++ b/csdr.c
@@ -535,6 +535,56 @@ int main(int argc, char *argv[])
 		return 0;
 	}
 
+
+	if(!strcmp(argv[1],"shift_unroll_cc"))
+	{
+		bigbufs=1;
+
+		float starting_phase=0;
+		float rate;
+
+		int fd;
+		if(fd=init_fifo(argc,argv))
+		{
+			while(!read_fifo_ctl(fd,"%g\n",&rate)) usleep(10000);
+		}
+		else
+		{
+			if(argc<=2) return badsyntax("need required parameter (rate)"); 
+			sscanf(argv[2],"%g",&rate);
+		}
+
+		if(!sendbufsize(initialize_buffers())) return -2;
+		for(;;)
+		{
+			shift_unroll_data_t data=shift_unroll_init(rate, 1024);
+			fprintf(stderr,"shift_unroll_cc: reinitialized to %g\n",rate);
+			int remain, current_size;
+			float* ibufptr;
+			float* obufptr;
+			for(;;)
+			{
+				FEOF_CHECK;
+				if(!FREAD_C) break;
+				remain=the_bufsize;
+				ibufptr=input_buffer;
+				obufptr=output_buffer;
+				while(remain)
+				{
+					current_size=(remain>1024)?1024:remain;
+					starting_phase=shift_unroll_cc((complexf*)ibufptr, (complexf*)obufptr, current_size, &data, starting_phase);
+					ibufptr+=current_size*2;
+					obufptr+=current_size*2;
+					remain-=current_size;
+				}
+				FWRITE_C;
+				if(read_fifo_ctl(fd,"%g\n",&rate)) break;
+				TRY_YIELD;
+			}
+		}
+		return 0;
+	}
+
 #ifdef LIBCSDR_GPL
 	if(!strcmp(argv[1],"decimating_shift_addition_cc"))
 	{
diff --git a/grc_tests/test_shift_remote.grc b/grc_tests/test_shift_remote.grc
index 9b4f589..59620bd 100644
--- a/grc_tests/test_shift_remote.grc
+++ b/grc_tests/test_shift_remote.grc
@@ -355,7 +355,7 @@
     </param>
     <param>
       <key>commandline</key>
-      <value>ncat -vv raspberrypi.local 5321</value>
+      <value>ncat -v raspberrypi.local 5321</value>
     </param>
     <param>
       <key>comment</key>
diff --git a/grc_tests/test_shift_remote.sh b/grc_tests/test_shift_remote.sh
index 14f061d..65c7192 100755
--- a/grc_tests/test_shift_remote.sh
+++ b/grc_tests/test_shift_remote.sh
@@ -2,7 +2,7 @@
 # Run this script on a Raspberry Pi 2, while running test_shift_remote.grc on your PC. 
 # It allows you to debug the NEON-accelerated version of specific DSP algorithms on the target hardware.
 TEMPSCRIPT="/tmp/test_shift_remote_exec.sh"
-echo '#!/bin/sh\ncsdr shift_addfast_cc -0' > $TEMPSCRIPT
+echo '#!/bin/sh\ncsdr shift_addfast_cc -0.1' > $TEMPSCRIPT
 cat $TEMPSCRIPT
 chmod +x $TEMPSCRIPT
 ncat -vvl 5321 -e $TEMPSCRIPT
diff --git a/libcsdr.c b/libcsdr.c
index d0c3b2d..df87266 100644
--- a/libcsdr.c
+++ b/libcsdr.c
@@ -264,6 +264,44 @@ float shift_table_cc(complexf* input, complexf* output, int input_size, float ra
 }
 
 
+shift_unroll_data_t shift_unroll_init(float rate, int size)
+{
+	shift_unroll_data_t output;
+	output.phase_increment=2*rate*PI;
+	output.size = size;
+	output.dsin=(float*)malloc(sizeof(float)*size);
+	output.dcos=(float*)malloc(sizeof(float)*size);
+	float myphase = 0;
+	for(int i=0;i<size;i++)
+	{
+		myphase += output.phase_increment;
+		while(myphase>PI) myphase-=2*PI;
+		while(myphase<-PI) myphase+=2*PI;		
+		output.dsin[i]=sin(myphase);
+		output.dcos[i]=cos(myphase);
+	}
+	return output;	
+}
+
+float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase)
+{
+	//input_size should be multiple of 4
+	//fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
+	float cos_start=cos(starting_phase);
+	float sin_start=sin(starting_phase);
+	register float cos_val, sin_val;
+	for(int i=0;i<input_size; i++) //@shift_unroll_cc
+	{
+		cos_val = cos_start * d->dcos[i] - sin_start * d->dsin[i];
+		sin_val  = sin_start * d->dcos[i] + cos_start * d->dsin[i];
+		iof(output,i)=cos_val*iof(input,i)-sin_val*qof(input,i);
+		qof(output,i)=sin_val*iof(input,i)+cos_val*qof(input,i);
+	}
+	starting_phase+=input_size*d->phase_increment;
+	while(starting_phase>PI) starting_phase-=2*PI;
+	while(starting_phase<-PI) starting_phase+=2*PI;
+	return starting_phase;
+}
 
 shift_addfast_data_t shift_addfast_init(float rate)
 {
@@ -283,7 +321,6 @@ shift_addfast_data_t shift_addfast_init(float rate)
 float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase)
 {
 	//input_size should be multiple of 4
-	float phase=starting_phase;
 	float cos_start[4], sin_start[4];
 	float cos_vals[4], sin_vals[4];
 	for(int i=0;i<4;i++) 
@@ -316,7 +353,7 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
 		"		vld1.32	{" RDSIN "}, [%[pdsin]]\n\t"
 		"		vld1.32	{" RCOSST "}, [%[cos_start]]\n\t"
 		"		vld1.32	{" RSINST "}, [%[sin_start]]\n\t"
-		"for_addfast: vld2.32 {" RINPI "-" RINPQ "}, [%[pinput]]!\n\t" //load q0 and q1 directly from the memory address stored in pinput, with interleaving (so that we get the I samples in rinpi and the Q samples in rinpq), also increment the memory address in pinput (hence the "!" mark) 
+		"for_addfast: vld2.32 {" RINPI "-" RINPQ "}, [%[pinput]]!\n\t" //load q0 and q1 directly from the memory address stored in pinput, with interleaving (so that we get the I samples in RINPI and the Q samples in RINPQ), also increment the memory address in pinput (hence the "!" mark) 
 
 		//C version:
 		//cos_vals[j] = cos_start * d->dcos[j] - sin_start * d->dsin[j];
@@ -330,18 +367,18 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
 		//C version:
 		//iof(output,4*i+j)=cos_vals[j]*iof(input,4*i+j)-sin_vals[j]*qof(input,4*i+j);
 		//qof(output,4*i+j)=sin_vals[j]*iof(input,4*i+j)+cos_vals[j]*qof(input,4*i+j);	
-		"		vmul.f32 " R3(ROUTI, RCOSV, RINPI) //output = cos_vals * input
-		"		vmls.f32 " R3(ROUTI, RSINV, RINPQ) //output -= sin_vals * input
-		"		vmul.f32 " R3(ROUTQ, RSINV, RINPI) //sin_vals[i] = sin_start * d->dcos[i]
-		"		vmla.f32 " R3(ROUTQ, RCOSV, RINPQ) //sin_vals[i] += cos_start * d->dsin[i]
+		"		vmul.f32 " R3(ROUTI, RCOSV, RINPI) //output_i =  cos_vals * input_i
+		"		vmls.f32 " R3(ROUTI, RSINV, RINPQ) //output_i -= sin_vals * input_q
+		"		vmul.f32 " R3(ROUTQ, RSINV, RINPI) //output_q =  sin_vals * input_i
+		"		vmla.f32 " R3(ROUTQ, RCOSV, RINPQ) //output_i += cos_vals * input_q
 
-		"		vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]\n\t" //store the outputs in memory
-		"		add %[poutput],%[poutput],#32\n\t"
-		"		vdup.32 " RCOSST ", d5[1]\n\t" // cos_start[0-3] = cos_vals[3]
-		"		vdup.32 " RSINST ", d7[1]\n\t" // sin_start[0-3] = sin_vals[3]
+		"		vst2.32 {" ROUTI "-" ROUTQ "}, [%[poutput]]!\n\t" //store the outputs in memory
+		//"		add %[poutput],%[poutput],#32\n\t"
+		"		vdup.32 " RCOSST ", d9[1]\n\t" // cos_start[0-3] = cos_vals[3]
+		"		vdup.32 " RSINST ", d11[1]\n\t" // sin_start[0-3] = sin_vals[3]
 
-		"		cmp %[pinput], %[pinput_end]\n\t" //if(pinput == pinput_end)
-		"		bcc for_addfast\n\t"			  //	then goto for_fdcasm
+		"		cmp %[pinput], %[pinput_end]\n\t" //if(pinput != pinput_end)
+		"		bcc for_addfast\n\t"			  //	then goto for_addfast
 	:
 		[pinput]"+r"(pinput), [poutput]"+r"(poutput) //output operand list -> C variables that we will change from ASM
 	:
@@ -349,7 +386,10 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
 	: 
 		"memory", "q0", "q1", "q2", "q4", "q5", "q6", "q7", "q8", "q9", "cc" //clobber list
 	);
-	return phase+input_size*d->phase_increment;
+	starting_phase+=input_size*d->phase_increment;
+	while(starting_phase>PI) starting_phase-=2*PI;
+	while(starting_phase<-PI) starting_phase+=2*PI;
+	return starting_phase;
 }
 
 #else
@@ -358,7 +398,6 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
 {
 	//input_size should be multiple of 4
 	//fprintf(stderr, "shift_addfast_cc: input_size = %d\n", input_size);
-	float phase=starting_phase;
 	float cos_start=cos(starting_phase);
 	float sin_start=sin(starting_phase);
 	float cos_vals[4], sin_vals[4];
@@ -377,7 +416,10 @@ float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_
 		cos_start = cos_vals[3];
 		sin_start = sin_vals[3];
 	}
-	return phase+input_size*d->phase_increment;
+	starting_phase+=input_size*d->phase_increment;
+	while(starting_phase>PI) starting_phase-=2*PI;
+	while(starting_phase<-PI) starting_phase+=2*PI;
+	return starting_phase;
 }
 
 #endif
@@ -422,7 +464,7 @@ q4, q5: accumulator for I branch and Q branch (will be the output)
 			"		vld1.32	{q2}, [%[ptaps]]!\n\t" 
 			"		vmla.f32 q4, q0, q2\n\t" //quad_acc_i += quad_input_i * quad_taps_1 //http://stackoverflow.com/questions/3240440/how-to-use-the-multiply-and-accumulate-intrinsics-in-arm-cortex-a8 //http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0489e/CIHEJBIE.html
 			"		vmla.f32 q5, q1, q2\n\t" //quad_acc_q += quad_input_q * quad_taps_1
-			"		cmp %[ptaps], %[ptaps_end]\n\t" //if(ptaps == ptaps_end)
+			"		cmp %[ptaps], %[ptaps_end]\n\t" //if(ptaps != ptaps_end)
 			"		bcc for_fdccasm\n\t"			//	then goto for_fdcasm
 			"		vst1.32 {q4}, [%[quad_acci]]\n\t" //if the loop is finished, store the two accumulators in memory
 			"		vst1.32 {q5}, [%[quad_accq]]\n\t"
diff --git a/libcsdr.h b/libcsdr.h
index 5ccb370..334ba6f 100644
--- a/libcsdr.h
+++ b/libcsdr.h
@@ -165,8 +165,18 @@ typedef struct shift_addfast_data_s
 	float phase_increment;
 } shift_addfast_data_t;
 shift_addfast_data_t shift_addfast_init(float rate);
+shift_addfast_data_t shift_addfast_init(float rate);
 float shift_addfast_cc(complexf *input, complexf* output, int input_size, shift_addfast_data_t* d, float starting_phase);
 
+typedef struct shift_unroll_data_s
+{
+	float* dsin;
+	float* dcos;
+	float phase_increment;
+	int size;
+} shift_unroll_data_t;
+float shift_unroll_cc(complexf *input, complexf* output, int input_size, shift_unroll_data_t* d, float starting_phase);
+shift_unroll_data_t shift_unroll_init(float rate, int size);
 
 int log2n(int x);
 int next_pow2(int x);
diff --git a/test200.c b/test200.c
index f233d1a..9feb457 100644
--- a/test200.c
+++ b/test200.c
@@ -62,9 +62,18 @@ int main()
 
 	fprintf(stderr,"Starting tests of processing %d samples...\n", T_BUFSIZE*T_N);
 
+	//shift_math_cc
+	float starting_phase = 0;
+
+	clock_gettime(CLOCK_MONOTONIC_RAW, &start_time);
+	for(int i=0;i<T_N;i++) starting_phase = shift_math_cc(buf_c, outbuf_c, T_BUFSIZE, 0.1, starting_phase);
+	clock_gettime(CLOCK_MONOTONIC_RAW, &end_time);
+	fprintf(stderr,"shift_math_cc done in %g seconds.\n",TIME_TAKEN(start_time,end_time));
+
+
 	//shift_addition_cc	
 	shift_addition_data_t data_addition = shift_addition_init(0.1);
-	float starting_phase = 0;
+	starting_phase = 0;
 
 	clock_gettime(CLOCK_MONOTONIC_RAW, &start_time);
 	for(int i=0;i<T_N;i++) starting_phase = shift_addition_cc(buf_c, outbuf_c, T_BUFSIZE, data_addition, starting_phase);
@@ -80,5 +89,14 @@ int main()
 	clock_gettime(CLOCK_MONOTONIC_RAW, &end_time);
 	fprintf(stderr,"shift_addfast_cc done in %g seconds.\n",TIME_TAKEN(start_time,end_time));
 
+	//shift_unroll_cc
+	shift_unroll_data_t data_unroll = shift_unroll_init(0.1, T_BUFSIZE);
+	starting_phase = 0;
+
+	clock_gettime(CLOCK_MONOTONIC_RAW, &start_time);
+	for(int i=0;i<T_N;i++) starting_phase = shift_unroll_cc(buf_c, outbuf_c, T_BUFSIZE, &data_unroll, starting_phase);
+	clock_gettime(CLOCK_MONOTONIC_RAW, &end_time);
+	fprintf(stderr,"shift_unroll_cc done in %g seconds.\n",TIME_TAKEN(start_time,end_time));
+
 
 }