ft8_lib/fft/kiss_fft.c

/*
 *  Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
 *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
 *
 *  SPDX-License-Identifier: BSD-3-Clause
 *  See COPYING file for more information.
 */


#include "_kiss_fft_guts.h"
/* The guts header contains all the multiplication and addition macros that are defined for
 fixed or floating point complex numbers.  It also delares the kf_ internal functions.
 */

static void kf_bfly2(
        kiss_fft_cpx * Fout,
        const size_t fstride,
        const kiss_fft_cfg st,
        int m
        )
{
    kiss_fft_cpx * Fout2;
    kiss_fft_cpx * tw1 = st->twiddles;
    kiss_fft_cpx t;
    Fout2 = Fout + m;
    do{
        C_FIXDIV(*Fout,2); C_FIXDIV(*Fout2,2);

        C_MUL (t,  *Fout2 , *tw1);
        tw1 += fstride;
        C_SUB( *Fout2 ,  *Fout , t );
        C_ADDTO( *Fout ,  t );
        ++Fout2;
        ++Fout;
    }while (--m);
}

static void kf_bfly4(
        kiss_fft_cpx * Fout,
        const size_t fstride,
        const kiss_fft_cfg st,
        const size_t m
        )
{
    kiss_fft_cpx *tw1,*tw2,*tw3;
    kiss_fft_cpx scratch[6];
    size_t k=m;
    const size_t m2=2*m;
    const size_t m3=3*m;


    tw3 = tw2 = tw1 = st->twiddles;

    do {
        C_FIXDIV(*Fout,4); C_FIXDIV(Fout[m],4); C_FIXDIV(Fout[m2],4); C_FIXDIV(Fout[m3],4);

        C_MUL(scratch[0],Fout[m] , *tw1 );
        C_MUL(scratch[1],Fout[m2] , *tw2 );
        C_MUL(scratch[2],Fout[m3] , *tw3 );

        C_SUB( scratch[5] , *Fout, scratch[1] );
        C_ADDTO(*Fout, scratch[1]);
        C_ADD( scratch[3] , scratch[0] , scratch[2] );
        C_SUB( scratch[4] , scratch[0] , scratch[2] );
        C_SUB( Fout[m2], *Fout, scratch[3] );
        tw1 += fstride;
        tw2 += fstride*2;
        tw3 += fstride*3;
        C_ADDTO( *Fout , scratch[3] );

        if(st->inverse) {
            Fout[m].r = scratch[5].r - scratch[4].i;
            Fout[m].i = scratch[5].i + scratch[4].r;
            Fout[m3].r = scratch[5].r + scratch[4].i;
            Fout[m3].i = scratch[5].i - scratch[4].r;
        }else{
            Fout[m].r = scratch[5].r + scratch[4].i;
            Fout[m].i = scratch[5].i - scratch[4].r;
            Fout[m3].r = scratch[5].r - scratch[4].i;
            Fout[m3].i = scratch[5].i + scratch[4].r;
        }
        ++Fout;
    }while(--k);
}

static void kf_bfly3(
         kiss_fft_cpx * Fout,
         const size_t fstride,
         const kiss_fft_cfg st,
         size_t m
         )
{
     size_t k=m;
     const size_t m2 = 2*m;
     kiss_fft_cpx *tw1,*tw2;
     kiss_fft_cpx scratch[5];
     kiss_fft_cpx epi3;
     epi3 = st->twiddles[fstride*m];

     tw1=tw2=st->twiddles;

     do{
         C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3);

         C_MUL(scratch[1],Fout[m] , *tw1);
         C_MUL(scratch[2],Fout[m2] , *tw2);

         C_ADD(scratch[3],scratch[1],scratch[2]);
         C_SUB(scratch[0],scratch[1],scratch[2]);
         tw1 += fstride;
         tw2 += fstride*2;

         Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
         Fout[m].i = Fout->i - HALF_OF(scratch[3].i);

         C_MULBYSCALAR( scratch[0] , epi3.i );

         C_ADDTO(*Fout,scratch[3]);

         Fout[m2].r = Fout[m].r + scratch[0].i;
         Fout[m2].i = Fout[m].i - scratch[0].r;

         Fout[m].r -= scratch[0].i;
         Fout[m].i += scratch[0].r;

         ++Fout;
     }while(--k);
}

static void kf_bfly5(
        kiss_fft_cpx * Fout,
        const size_t fstride,
        const kiss_fft_cfg st,
        int m
        )
{
    kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
    int u;
    kiss_fft_cpx scratch[13];
    kiss_fft_cpx * twiddles = st->twiddles;
    kiss_fft_cpx *tw;
    kiss_fft_cpx ya,yb;
    ya = twiddles[fstride*m];
    yb = twiddles[fstride*2*m];

    Fout0=Fout;
    Fout1=Fout0+m;
    Fout2=Fout0+2*m;
    Fout3=Fout0+3*m;
    Fout4=Fout0+4*m;

    tw=st->twiddles;
    for ( u=0; u<m; ++u ) {
        C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5);
        scratch[0] = *Fout0;

        C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
        C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
        C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
        C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);

        C_ADD( scratch[7],scratch[1],scratch[4]);
        C_SUB( scratch[10],scratch[1],scratch[4]);
        C_ADD( scratch[8],scratch[2],scratch[3]);
        C_SUB( scratch[9],scratch[2],scratch[3]);

        Fout0->r += scratch[7].r + scratch[8].r;
        Fout0->i += scratch[7].i + scratch[8].i;

        scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
        scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);

        scratch[6].r =  S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);
        scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);

        C_SUB(*Fout1,scratch[5],scratch[6]);
        C_ADD(*Fout4,scratch[5],scratch[6]);

        scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
        scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
        scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);
        scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);

        C_ADD(*Fout2,scratch[11],scratch[12]);
        C_SUB(*Fout3,scratch[11],scratch[12]);

        ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
    }
}

/* perform the butterfly for one stage of a mixed radix FFT */
static void kf_bfly_generic(
        kiss_fft_cpx * Fout,
        const size_t fstride,
        const kiss_fft_cfg st,
        int m,
        int p
        )
{
    int u,k,q1,q;
    kiss_fft_cpx * twiddles = st->twiddles;
    kiss_fft_cpx t;
    int Norig = st->nfft;

    kiss_fft_cpx * scratch = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC(sizeof(kiss_fft_cpx)*p);

    for ( u=0; u<m; ++u ) {
        k=u;
        for ( q1=0 ; q1<p ; ++q1 ) {
            scratch[q1] = Fout[ k  ];
            C_FIXDIV(scratch[q1],p);
            k += m;
        }

        k=u;
        for ( q1=0 ; q1<p ; ++q1 ) {
            int twidx=0;
            Fout[ k ] = scratch[0];
            for (q=1;q<p;++q ) {
                twidx += fstride * k;
                if (twidx>=Norig) twidx-=Norig;
                C_MUL(t,scratch[q] , twiddles[twidx] );
                C_ADDTO( Fout[ k ] ,t);
            }
            k += m;
        }
    }
    KISS_FFT_TMP_FREE(scratch);
}

static
void kf_work(
        kiss_fft_cpx * Fout,
        const kiss_fft_cpx * f,
        const size_t fstride,
        int in_stride,
        int * factors,
        const kiss_fft_cfg st
        )
{
    kiss_fft_cpx * Fout_beg=Fout;
    const int p=*factors++; /* the radix  */
    const int m=*factors++; /* stage's fft length/p */
    const kiss_fft_cpx * Fout_end = Fout + p*m;

#ifdef _OPENMP
    // use openmp extensions at the 
    // top-level (not recursive)
    if (fstride==1 && p<=5)
    {
        int k;

        // execute the p different work units in different threads
#       pragma omp parallel for
        for (k=0;k<p;++k) 
            kf_work( Fout +k*m, f+ fstride*in_stride*k,fstride*p,in_stride,factors,st);
        // all threads have joined by this point

        switch (p) {
            case 2: kf_bfly2(Fout,fstride,st,m); break;
            case 3: kf_bfly3(Fout,fstride,st,m); break; 
            case 4: kf_bfly4(Fout,fstride,st,m); break;
            case 5: kf_bfly5(Fout,fstride,st,m); break; 
            default: kf_bfly_generic(Fout,fstride,st,m,p); break;
        }
        return;
    }
#endif

    if (m==1) {
        do{
            *Fout = *f;
            f += fstride*in_stride;
        }while(++Fout != Fout_end );
    }else{
        do{
            // recursive call:
            // DFT of size m*p performed by doing
            // p instances of smaller DFTs of size m, 
            // each one takes a decimated version of the input
            kf_work( Fout , f, fstride*p, in_stride, factors,st);
            f += fstride*in_stride;
        }while( (Fout += m) != Fout_end );
    }

    Fout=Fout_beg;

    // recombine the p smaller DFTs 
    switch (p) {
        case 2: kf_bfly2(Fout,fstride,st,m); break;
        case 3: kf_bfly3(Fout,fstride,st,m); break; 
        case 4: kf_bfly4(Fout,fstride,st,m); break;
        case 5: kf_bfly5(Fout,fstride,st,m); break; 
        default: kf_bfly_generic(Fout,fstride,st,m,p); break;
    }
}

/*  facbuf is populated by p1,m1,p2,m2, ...
    where 
    p[i] * m[i] = m[i-1]
    m0 = n                  */
static 
void kf_factor(int n,int * facbuf)
{
    int p=4;
    double floor_sqrt;
    floor_sqrt = floor( sqrt((double)n) );

    /*factor out powers of 4, powers of 2, then any remaining primes */
    do {
        while (n % p) {
            switch (p) {
                case 4: p = 2; break;
                case 2: p = 3; break;
                default: p += 2; break;
            }
            if (p > floor_sqrt)
                p = n;          /* no more factors, skip to end */
        }
        n /= p;
        *facbuf++ = p;
        *facbuf++ = n;
    } while (n > 1);
}

/*
 *
 * User-callable function to allocate all necessary storage space for the fft.
 *
 * The return value is a contiguous block of memory, allocated with malloc.  As such,
 * It can be freed with free(), rather than a kiss_fft-specific function.
 * */
kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem )
{
    kiss_fft_cfg st=NULL;
    size_t memneeded = sizeof(struct kiss_fft_state)
        + sizeof(kiss_fft_cpx)*(nfft-1); /* twiddle factors*/

    if ( lenmem==NULL ) {
        st = ( kiss_fft_cfg)KISS_FFT_MALLOC( memneeded );
    }else{
        if (mem != NULL && *lenmem >= memneeded)
            st = (kiss_fft_cfg)mem;
        *lenmem = memneeded;
    }
    if (st) {
        int i;
        st->nfft=nfft;
        st->inverse = inverse_fft;

        for (i=0;i<nfft;++i) {
            const double pi=3.141592653589793238462643383279502884197169399375105820974944;
            double phase = -2*pi*i / nfft;
            if (st->inverse)
                phase *= -1;
            kf_cexp(st->twiddles+i, phase );
        }

        kf_factor(nfft,st->factors);
    }
    return st;
}


void kiss_fft_stride(kiss_fft_cfg st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int in_stride)
{
    if (fin == fout) {
        //NOTE: this is not really an in-place FFT algorithm.
        //It just performs an out-of-place FFT into a temp buffer
        kiss_fft_cpx * tmpbuf = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC( sizeof(kiss_fft_cpx)*st->nfft);
        kf_work(tmpbuf,fin,1,in_stride, st->factors,st);
        memcpy(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft);
        KISS_FFT_TMP_FREE(tmpbuf);
    }else{
        kf_work( fout, fin, 1,in_stride, st->factors,st );
    }
}

void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
{
    kiss_fft_stride(cfg,fin,fout,1);
}


void kiss_fft_cleanup(void)
{
    // nothing needed any more
}

int kiss_fft_next_fast_size(int n)
{
    while(1) {
        int m=n;
        while ( (m%2) == 0 ) m/=2;
        while ( (m%3) == 0 ) m/=3;
        while ( (m%5) == 0 ) m/=5;
        if (m<=1)
            break; /* n is completely factorable by twos, threes, and fives */
        n++;
    }
    return n;
}
Working towards decoding, added FFT routines and a LDPC decoder 2018-11-07 21:50:55 +08:00			`/*`
			`* Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.`
			`* This file is part of KISS FFT - https://github.com/mborgerding/kissfft`
			`*`
			`* SPDX-License-Identifier: BSD-3-Clause`
			`* See COPYING file for more information.`
			`*/`


			`#include "_kiss_fft_guts.h"`
			`/* The guts header contains all the multiplication and addition macros that are defined for`
			`fixed or floating point complex numbers. It also delares the kf_ internal functions.`
			`*/`

			`static void kf_bfly2(`
			`kiss_fft_cpx * Fout,`
			`const size_t fstride,`
			`const kiss_fft_cfg st,`
			`int m`
			`)`
			`{`
			`kiss_fft_cpx * Fout2;`
			`kiss_fft_cpx * tw1 = st->twiddles;`
			`kiss_fft_cpx t;`
			`Fout2 = Fout + m;`
			`do{`
			`C_FIXDIV(Fout,2); C_FIXDIV(Fout2,2);`

			`C_MUL (t, Fout2 , tw1);`
			`tw1 += fstride;`
			`C_SUB( Fout2 , Fout , t );`
			`C_ADDTO( *Fout , t );`
			`++Fout2;`
			`++Fout;`
			`}while (--m);`
			`}`

			`static void kf_bfly4(`
			`kiss_fft_cpx * Fout,`
			`const size_t fstride,`
			`const kiss_fft_cfg st,`
			`const size_t m`
			`)`
			`{`
			`kiss_fft_cpx tw1,tw2,*tw3;`
			`kiss_fft_cpx scratch[6];`
			`size_t k=m;`
			`const size_t m2=2*m;`
			`const size_t m3=3*m;`


			`tw3 = tw2 = tw1 = st->twiddles;`

			`do {`
			`C_FIXDIV(*Fout,4); C_FIXDIV(Fout[m],4); C_FIXDIV(Fout[m2],4); C_FIXDIV(Fout[m3],4);`

			`C_MUL(scratch[0],Fout[m] , *tw1 );`
			`C_MUL(scratch[1],Fout[m2] , *tw2 );`
			`C_MUL(scratch[2],Fout[m3] , *tw3 );`

			`C_SUB( scratch[5] , *Fout, scratch[1] );`
			`C_ADDTO(*Fout, scratch[1]);`
			`C_ADD( scratch[3] , scratch[0] , scratch[2] );`
			`C_SUB( scratch[4] , scratch[0] , scratch[2] );`
			`C_SUB( Fout[m2], *Fout, scratch[3] );`
			`tw1 += fstride;`
			`tw2 += fstride*2;`
			`tw3 += fstride*3;`
			`C_ADDTO( *Fout , scratch[3] );`

			`if(st->inverse) {`
			`Fout[m].r = scratch[5].r - scratch[4].i;`
			`Fout[m].i = scratch[5].i + scratch[4].r;`
			`Fout[m3].r = scratch[5].r + scratch[4].i;`
			`Fout[m3].i = scratch[5].i - scratch[4].r;`
			`}else{`
			`Fout[m].r = scratch[5].r + scratch[4].i;`
			`Fout[m].i = scratch[5].i - scratch[4].r;`
			`Fout[m3].r = scratch[5].r - scratch[4].i;`
			`Fout[m3].i = scratch[5].i + scratch[4].r;`
			`}`
			`++Fout;`
			`}while(--k);`
			`}`

			`static void kf_bfly3(`
			`kiss_fft_cpx * Fout,`
			`const size_t fstride,`
			`const kiss_fft_cfg st,`
			`size_t m`
			`)`
			`{`
			`size_t k=m;`
			`const size_t m2 = 2*m;`
			`kiss_fft_cpx tw1,tw2;`
			`kiss_fft_cpx scratch[5];`
			`kiss_fft_cpx epi3;`
			`epi3 = st->twiddles[fstride*m];`

			`tw1=tw2=st->twiddles;`

			`do{`
			`C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3);`

			`C_MUL(scratch[1],Fout[m] , *tw1);`
			`C_MUL(scratch[2],Fout[m2] , *tw2);`

			`C_ADD(scratch[3],scratch[1],scratch[2]);`
			`C_SUB(scratch[0],scratch[1],scratch[2]);`
			`tw1 += fstride;`
			`tw2 += fstride*2;`

			`Fout[m].r = Fout->r - HALF_OF(scratch[3].r);`
			`Fout[m].i = Fout->i - HALF_OF(scratch[3].i);`

			`C_MULBYSCALAR( scratch[0] , epi3.i );`

			`C_ADDTO(*Fout,scratch[3]);`

			`Fout[m2].r = Fout[m].r + scratch[0].i;`
			`Fout[m2].i = Fout[m].i - scratch[0].r;`

			`Fout[m].r -= scratch[0].i;`
			`Fout[m].i += scratch[0].r;`

			`++Fout;`
			`}while(--k);`
			`}`

			`static void kf_bfly5(`
			`kiss_fft_cpx * Fout,`
			`const size_t fstride,`
			`const kiss_fft_cfg st,`
			`int m`
			`)`
			`{`
			`kiss_fft_cpx Fout0,Fout1,Fout2,Fout3,*Fout4;`
			`int u;`
			`kiss_fft_cpx scratch[13];`
			`kiss_fft_cpx * twiddles = st->twiddles;`
			`kiss_fft_cpx *tw;`
			`kiss_fft_cpx ya,yb;`
			`ya = twiddles[fstride*m];`
			`yb = twiddles[fstride2m];`

			`Fout0=Fout;`
			`Fout1=Fout0+m;`
			`Fout2=Fout0+2*m;`
			`Fout3=Fout0+3*m;`
			`Fout4=Fout0+4*m;`

			`tw=st->twiddles;`
			`for ( u=0; u<m; ++u ) {`
			`C_FIXDIV( Fout0,5); C_FIXDIV( Fout1,5); C_FIXDIV( Fout2,5); C_FIXDIV( Fout3,5); C_FIXDIV( *Fout4,5);`
			`scratch[0] = *Fout0;`

			`C_MUL(scratch[1] ,Fout1, tw[ufstride]);`
			`C_MUL(scratch[2] ,Fout2, tw[2u*fstride]);`
			`C_MUL(scratch[3] ,Fout3, tw[3u*fstride]);`
			`C_MUL(scratch[4] ,Fout4, tw[4u*fstride]);`

			`C_ADD( scratch[7],scratch[1],scratch[4]);`
			`C_SUB( scratch[10],scratch[1],scratch[4]);`
			`C_ADD( scratch[8],scratch[2],scratch[3]);`
			`C_SUB( scratch[9],scratch[2],scratch[3]);`

			`Fout0->r += scratch[7].r + scratch[8].r;`
			`Fout0->i += scratch[7].i + scratch[8].i;`

			`scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);`
			`scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);`

			`scratch[6].r = S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);`
			`scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);`

			`C_SUB(*Fout1,scratch[5],scratch[6]);`
			`C_ADD(*Fout4,scratch[5],scratch[6]);`

			`scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);`
			`scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);`
			`scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);`
			`scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);`

			`C_ADD(*Fout2,scratch[11],scratch[12]);`
			`C_SUB(*Fout3,scratch[11],scratch[12]);`

			`++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;`
			`}`
			`}`

			`/* perform the butterfly for one stage of a mixed radix FFT */`
			`static void kf_bfly_generic(`
			`kiss_fft_cpx * Fout,`
			`const size_t fstride,`
			`const kiss_fft_cfg st,`
			`int m,`
			`int p`
			`)`
			`{`
			`int u,k,q1,q;`
			`kiss_fft_cpx * twiddles = st->twiddles;`
			`kiss_fft_cpx t;`
			`int Norig = st->nfft;`

			`kiss_fft_cpx * scratch = (kiss_fft_cpx)KISS_FFT_TMP_ALLOC(sizeof(kiss_fft_cpx)p);`

			`for ( u=0; u<m; ++u ) {`
			`k=u;`
			`for ( q1=0 ; q1<p ; ++q1 ) {`
			`scratch[q1] = Fout[ k ];`
			`C_FIXDIV(scratch[q1],p);`
			`k += m;`
			`}`

			`k=u;`
			`for ( q1=0 ; q1<p ; ++q1 ) {`
			`int twidx=0;`
			`Fout[ k ] = scratch[0];`
			`for (q=1;q<p;++q ) {`
			`twidx += fstride * k;`
			`if (twidx>=Norig) twidx-=Norig;`
			`C_MUL(t,scratch[q] , twiddles[twidx] );`
			`C_ADDTO( Fout[ k ] ,t);`
			`}`
			`k += m;`
			`}`
			`}`
			`KISS_FFT_TMP_FREE(scratch);`
			`}`

			`static`
			`void kf_work(`
			`kiss_fft_cpx * Fout,`
			`const kiss_fft_cpx * f,`
			`const size_t fstride,`
			`int in_stride,`
			`int * factors,`
			`const kiss_fft_cfg st`
			`)`
			`{`
			`kiss_fft_cpx * Fout_beg=Fout;`
			`const int p=factors++; / the radix */`
			`const int m=factors++; / stage's fft length/p */`
			`const kiss_fft_cpx * Fout_end = Fout + p*m;`

			`#ifdef _OPENMP`
			`// use openmp extensions at the`
			`// top-level (not recursive)`
			`if (fstride==1 && p<=5)`
			`{`
			`int k;`

			`// execute the p different work units in different threads`
			`# pragma omp parallel for`
			`for (k=0;k<p;++k)`
			`kf_work( Fout +km, f+ fstridein_stridek,fstridep,in_stride,factors,st);`
			`// all threads have joined by this point`

			`switch (p) {`
			`case 2: kf_bfly2(Fout,fstride,st,m); break;`
			`case 3: kf_bfly3(Fout,fstride,st,m); break;`
			`case 4: kf_bfly4(Fout,fstride,st,m); break;`
			`case 5: kf_bfly5(Fout,fstride,st,m); break;`
			`default: kf_bfly_generic(Fout,fstride,st,m,p); break;`
			`}`
			`return;`
			`}`
			`#endif`

			`if (m==1) {`
			`do{`
			`Fout = f;`
			`f += fstride*in_stride;`
			`}while(++Fout != Fout_end );`
			`}else{`
			`do{`
			`// recursive call:`
			`// DFT of size m*p performed by doing`
			`// p instances of smaller DFTs of size m,`
			`// each one takes a decimated version of the input`
			`kf_work( Fout , f, fstride*p, in_stride, factors,st);`
			`f += fstride*in_stride;`
			`}while( (Fout += m) != Fout_end );`
			`}`

			`Fout=Fout_beg;`

			`// recombine the p smaller DFTs`
			`switch (p) {`
			`case 2: kf_bfly2(Fout,fstride,st,m); break;`
			`case 3: kf_bfly3(Fout,fstride,st,m); break;`
			`case 4: kf_bfly4(Fout,fstride,st,m); break;`
			`case 5: kf_bfly5(Fout,fstride,st,m); break;`
			`default: kf_bfly_generic(Fout,fstride,st,m,p); break;`
			`}`
			`}`

			`/* facbuf is populated by p1,m1,p2,m2, ...`
			`where`
			`p[i] * m[i] = m[i-1]`
			`m0 = n */`
			`static`
			`void kf_factor(int n,int * facbuf)`
			`{`
			`int p=4;`
			`double floor_sqrt;`
			`floor_sqrt = floor( sqrt((double)n) );`

			`/factor out powers of 4, powers of 2, then any remaining primes /`
			`do {`
			`while (n % p) {`
			`switch (p) {`
			`case 4: p = 2; break;`
			`case 2: p = 3; break;`
			`default: p += 2; break;`
			`}`
			`if (p > floor_sqrt)`
			`p = n; /* no more factors, skip to end */`
			`}`
			`n /= p;`
			`*facbuf++ = p;`
			`*facbuf++ = n;`
			`} while (n > 1);`
			`}`

			`/*`
			`*`
			`* User-callable function to allocate all necessary storage space for the fft.`
			`*`
			`* The return value is a contiguous block of memory, allocated with malloc. As such,`
			`* It can be freed with free(), rather than a kiss_fft-specific function.`
			`* */`
			`kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem )`
			`{`
			`kiss_fft_cfg st=NULL;`
			`size_t memneeded = sizeof(struct kiss_fft_state)`
			`+ sizeof(kiss_fft_cpx)(nfft-1); / twiddle factors*/`

			`if ( lenmem==NULL ) {`
			`st = ( kiss_fft_cfg)KISS_FFT_MALLOC( memneeded );`
			`}else{`
			`if (mem != NULL && *lenmem >= memneeded)`
			`st = (kiss_fft_cfg)mem;`
			`*lenmem = memneeded;`
			`}`
			`if (st) {`
			`int i;`
			`st->nfft=nfft;`
			`st->inverse = inverse_fft;`

			`for (i=0;i<nfft;++i) {`
			`const double pi=3.141592653589793238462643383279502884197169399375105820974944;`
			`double phase = -2pii / nfft;`
			`if (st->inverse)`
			`phase *= -1;`
			`kf_cexp(st->twiddles+i, phase );`
			`}`

			`kf_factor(nfft,st->factors);`
			`}`
			`return st;`
			`}`


			`void kiss_fft_stride(kiss_fft_cfg st,const kiss_fft_cpx fin,kiss_fft_cpx fout,int in_stride)`
			`{`
			`if (fin == fout) {`
			`//NOTE: this is not really an in-place FFT algorithm.`
			`//It just performs an out-of-place FFT into a temp buffer`
			`kiss_fft_cpx * tmpbuf = (kiss_fft_cpx)KISS_FFT_TMP_ALLOC( sizeof(kiss_fft_cpx)st->nfft);`
			`kf_work(tmpbuf,fin,1,in_stride, st->factors,st);`
			`memcpy(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft);`
			`KISS_FFT_TMP_FREE(tmpbuf);`
			`}else{`
			`kf_work( fout, fin, 1,in_stride, st->factors,st );`
			`}`
			`}`

			`void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx fin,kiss_fft_cpx fout)`
			`{`
			`kiss_fft_stride(cfg,fin,fout,1);`
			`}`


			`void kiss_fft_cleanup(void)`
			`{`
			`// nothing needed any more`
			`}`

			`int kiss_fft_next_fast_size(int n)`
			`{`
			`while(1) {`
			`int m=n;`
			`while ( (m%2) == 0 ) m/=2;`
			`while ( (m%3) == 0 ) m/=3;`
			`while ( (m%5) == 0 ) m/=5;`
			`if (m<=1)`
			`break; /* n is completely factorable by twos, threes, and fives */`
			`n++;`
			`}`
			`return n;`
			`}`