diff options
| author | Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl> | 2016-04-22 17:46:23 +0200 | 
|---|---|---|
| committer | Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl> | 2016-04-22 17:46:23 +0200 | 
| commit | e38a4dc774d3f7ca78cec7f16710afd583709b10 (patch) | |
| tree | c41b7ad10765f98602b09f09f3901c6ca3f4afab | |
| parent | c366f2b07ce16c4ccdafc7cc4199fdac2d3ffef2 (diff) | |
| parent | 8b67986464daae799d0171aed70a0d2cd96fd8d1 (diff) | |
| download | astra-e38a4dc774d3f7ca78cec7f16710afd583709b10.tar.gz astra-e38a4dc774d3f7ca78cec7f16710afd583709b10.tar.bz2 astra-e38a4dc774d3f7ca78cec7f16710afd583709b10.tar.xz astra-e38a4dc774d3f7ca78cec7f16710afd583709b10.zip | |
Merge branch 'master' into aniso
| -rw-r--r-- | README.md | 2 | ||||
| -rw-r--r-- | include/astra/AstraObjectManager.h | 2 | ||||
| -rw-r--r-- | include/astra/Fourier.h | 132 | ||||
| -rw-r--r-- | src/FilteredBackProjectionAlgorithm.cpp | 45 | ||||
| -rw-r--r-- | src/Fourier.cpp | 3460 | 
5 files changed, 3330 insertions, 311 deletions
| @@ -30,7 +30,7 @@ cd build/linux  ./autogen.sh   # when building a git version  ./configure --with-cuda=/usr/local/cuda \              --with-matlab=/usr/local/MATLAB/R2012a \ -            --with-python +            --with-python \              --prefix=/usr/local/astra  make  make install diff --git a/include/astra/AstraObjectManager.h b/include/astra/AstraObjectManager.h index ad89c2a..9faecbe 100644 --- a/include/astra/AstraObjectManager.h +++ b/include/astra/AstraObjectManager.h @@ -60,7 +60,7 @@ public:  }; -class CAstraIndexManager : public Singleton<CAstraIndexManager> { +class _AstraExport CAstraIndexManager : public Singleton<CAstraIndexManager> {  public:  	CAstraIndexManager() : m_iLastIndex(0) { } diff --git a/include/astra/Fourier.h b/include/astra/Fourier.h index b515dc6..68f9f38 100644 --- a/include/astra/Fourier.h +++ b/include/astra/Fourier.h @@ -33,94 +33,50 @@ $Id$  namespace astra { - -/** - * Perform a 1D DFT or inverse DFT. - * - * @param iLength number of elements - * @param pfRealIn real part of input - * @param pfImaginaryIn imaginary part of input - * @param pfRealOut real part of output - * @param pfImaginaryOut imaginary part of output - * @param iStrideIn distance between elements in pf*In - * @param iStrideOut distance between elements in pf*Out - * @param bInverse if true, perform an inverse DFT - */ - -void _AstraExport discreteFourierTransform1D(unsigned int iLength, -                                const float32* pfRealIn, -                                const float32* pfImaginaryIn, -                                float32* pfRealOut, -                                float32* pfImaginaryOut, -                                unsigned int iStrideIn, -                                unsigned int iStrideOut, -                                bool bInverse); - -/** - * Perform a 2D DFT or inverse DFT. 
- * - * @param iHeight number of rows - * @param iWidth number of columns - * @param pfRealIn real part of input - * @param pfImaginaryIn imaginary part of input - * @param pfRealOut real part of output - * @param pfImaginaryOut imaginary part of output - * @param bInverse if true, perform an inverse DFT - */ - -void _AstraExport discreteFourierTransform2D(unsigned int iHeight, unsigned int iWidth, -                                const float32* pfRealIn, -                                const float32* pfImaginaryIn, -                                float32* pfRealOut, -                                float32* pfImaginaryOut, -                                bool bInverse); - -/** - * Perform a 1D FFT or inverse FFT. The size must be a power of two. - * This transform can be done in-place, so the input and output pointers - * may point to the same data. - * - * @param iLength number of elements, must be a power of two - * @param pfRealIn real part of input - * @param pfImaginaryIn imaginary part of input - * @param pfRealOut real part of output - * @param pfImaginaryOut imaginary part of output - * @param iStrideIn distance between elements in pf*In - * @param iStrideOut distance between elements in pf*Out - * @param bInverse if true, perform an inverse DFT - */ - -void _AstraExport fastTwoPowerFourierTransform1D(unsigned int iLength, -                                    const float32* pfRealIn, -                                    const float32* pfImaginaryIn, -                                    float32* pfRealOut, -                                    float32* pfImaginaryOut, -                                    unsigned int iStrideIn, -                                    unsigned int iStrideOut, -                                    bool bInverse); - -/** - * Perform a 2D FFT or inverse FFT. The size must be a power of two. - * This transform can be done in-place, so the input and output pointers - * may point to the same data. 
- * - * @param iHeight number of rows, must be a power of two - * @param iWidth number of columns, must be a power of two - * @param pfRealIn real part of input - * @param pfImaginaryIn imaginary part of input - * @param pfRealOut real part of output - * @param pfImaginaryOut imaginary part of output - * @param bInverse if true, perform an inverse DFT - */ - -void _AstraExport fastTwoPowerFourierTransform2D(unsigned int iHeight, -                                    unsigned int iWidth, -                                    const float32* pfRealIn, -                                    const float32* pfImaginaryIn, -                                    float32* pfRealOut, -                                    float32* pfImaginaryOut, -                                    bool bInverse); - +/* +-------- Complex DFT (Discrete Fourier Transform) -------- +    [definition] +        <case1> +            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n +        <case2> +            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n +        (notes: sum_j=0^n-1 is a summation from j=0 to n-1) +    [usage] +        <case1> +            ip[0] = 0; // first time only +            cdft(2*n, 1, a, ip, w); +        <case2> +            ip[0] = 0; // first time only +            cdft(2*n, -1, a, ip, w); +    [parameters] +        2*n            :data length (int) +                        n >= 1, n = power of 2 +        a[0...2*n-1]   :input/output data (float32 *) +                        input data +                            a[2*j] = Re(x[j]),  +                            a[2*j+1] = Im(x[j]), 0<=j<n +                        output data +                            a[2*k] = Re(X[k]),  +                            a[2*k+1] = Im(X[k]), 0<=k<n +        ip[0...*]      :work area for bit reversal (int *) +                        length of ip >= 2+sqrt(n) +                        strictly,  +                        length of ip >=  +                            
2+(1<<(int)(log(n+0.5)/log(2))/2). +                        ip[0],ip[1] are pointers of the cos/sin table. +        w[0...n/2-1]   :cos/sin table (float32 *) +                        w[],ip[] are initialized if ip[0] == 0. +    [remark] +        Inverse of  +            cdft(2*n, -1, a, ip, w); +        is  +            cdft(2*n, 1, a, ip, w); +            for (j = 0; j <= 2 * n - 1; j++) { +                a[j] *= 1.0 / n; +            } +        . +*/ +_AstraExport void cdft(int n, int isgn, float32 *a, int *ip, float32 *w);  } diff --git a/src/FilteredBackProjectionAlgorithm.cpp b/src/FilteredBackProjectionAlgorithm.cpp index ccbfec6..70462f7 100644 --- a/src/FilteredBackProjectionAlgorithm.cpp +++ b/src/FilteredBackProjectionAlgorithm.cpp @@ -274,60 +274,57 @@ void CFilteredBackProjectionAlgorithm::performFiltering(CFloat32ProjectionData2D  		filter[iDetector] = (2.0f * (zpDetector - iDetector)) / zpDetector; -	float32* pfRe = new float32[iAngleCount * zpDetector]; -	float32* pfIm = new float32[iAngleCount * zpDetector]; +	float32* pf = new float32[2 * iAngleCount * zpDetector]; +	int *ip = new int[int(2+sqrt((float)zpDetector)+1)]; +	ip[0]=0; +	float32 *w = new float32[zpDetector/2];  	// Copy and zero-pad data  	for (int iAngle = 0; iAngle < iAngleCount; ++iAngle) { -		float32* pfReRow = pfRe + iAngle * zpDetector; -		float32* pfImRow = pfIm + iAngle * zpDetector; +		float32* pfRow = pf + iAngle * 2 * zpDetector;  		float32* pfDataRow = _pFilteredSinogram->getData() + iAngle * iDetectorCount;  		for (int iDetector = 0; iDetector < iDetectorCount; ++iDetector) { -			pfReRow[iDetector] = pfDataRow[iDetector]; -			pfImRow[iDetector] = 0.0f; +			pfRow[2*iDetector] = pfDataRow[iDetector]; +			pfRow[2*iDetector+1] = 0.0f;  		}  		for (int iDetector = iDetectorCount; iDetector < zpDetector; ++iDetector) { -			pfReRow[iDetector] = 0.0f; -			pfImRow[iDetector] = 0.0f; +			pfRow[2*iDetector] = 0.0f; +			pfRow[2*iDetector+1] = 0.0f;  		}  	}  	// in-place FFT  	for (int 
iAngle = 0; iAngle < iAngleCount; ++iAngle) { -		float32* pfReRow = pfRe + iAngle * zpDetector; -		float32* pfImRow = pfIm + iAngle * zpDetector; - -		fastTwoPowerFourierTransform1D(zpDetector, pfReRow, pfImRow, pfReRow, pfImRow, 1, 1, false); +		float32* pfRow = pf + iAngle * 2 * zpDetector; +		cdft(2*zpDetector, -1, pfRow, ip, w);  	}  	// Filter  	for (int iAngle = 0; iAngle < iAngleCount; ++iAngle) { -		float32* pfReRow = pfRe + iAngle * zpDetector; -		float32* pfImRow = pfIm + iAngle * zpDetector; +		float32* pfRow = pf + iAngle * 2 * zpDetector;  		for (int iDetector = 0; iDetector < zpDetector; ++iDetector) { -			pfReRow[iDetector] *= filter[iDetector]; -			pfImRow[iDetector] *= filter[iDetector]; +			pfRow[2*iDetector] *= filter[iDetector]; +			pfRow[2*iDetector+1] *= filter[iDetector];  		}  	}  	// in-place inverse FFT  	for (int iAngle = 0; iAngle < iAngleCount; ++iAngle) { -		float32* pfReRow = pfRe + iAngle * zpDetector; -		float32* pfImRow = pfIm + iAngle * zpDetector; - -		fastTwoPowerFourierTransform1D(zpDetector, pfReRow, pfImRow, pfReRow, pfImRow, 1, 1, true); +		float32* pfRow = pf + iAngle * 2 * zpDetector; +		cdft(2*zpDetector, 1, pfRow, ip, w);  	}  	// Copy data back  	for (int iAngle = 0; iAngle < iAngleCount; ++iAngle) { -		float32* pfReRow = pfRe + iAngle * zpDetector; +		float32* pfRow = pf + iAngle * 2 * zpDetector;  		float32* pfDataRow = _pFilteredSinogram->getData() + iAngle * iDetectorCount;  		for (int iDetector = 0; iDetector < iDetectorCount; ++iDetector) -			pfDataRow[iDetector] = pfReRow[iDetector]; +			pfDataRow[iDetector] = pfRow[2*iDetector] / zpDetector;  	} -	delete[] pfRe; -	delete[] pfIm; +	delete[] pf; +	delete[] w; +	delete[] ip;  	delete[] filter;  } diff --git a/src/Fourier.cpp b/src/Fourier.cpp index 584b633..c33f7bd 100644 --- a/src/Fourier.cpp +++ b/src/Fourier.cpp @@ -27,207 +27,3273 @@ $Id$  */  #include "astra/Fourier.h" +#include <cmath>  namespace astra { +    /* +Copyright Takuya OOURA, 1996-2001 -void 
discreteFourierTransform1D(unsigned int iLength, -                                const float32* pfRealIn, -                                const float32* pfImaginaryIn, -                                float32* pfRealOut, -                                float32* pfImaginaryOut, -                                unsigned int iStrideIn, -                                unsigned int iStrideOut, -                                bool inverse) -{ -	for (unsigned int w = 0; w < iLength; w++) -	{ -		pfRealOut[iStrideOut*w] = pfImaginaryOut[iStrideOut*w] = 0; -		for (unsigned int y = 0; y < iLength; y++) -		{ -			float32 a = 2 * PI * w * y / float32(iLength); -			if (!inverse) -				a = -a; -			float32 ca = cos(a); -			float32 sa = sin(a); -			pfRealOut[iStrideOut*w] += pfRealIn[iStrideIn*y] * ca - pfImaginaryIn[iStrideIn*y] * sa; -			pfImaginaryOut[iStrideOut*w] += pfRealIn[iStrideIn*y] * sa + pfImaginaryIn[iStrideIn*y] * ca;    -		} -	} - -	if (inverse) { -		for (unsigned int x = 0; x < iLength; ++x) { -			pfRealOut[iStrideOut*x] /= iLength; -			pfImaginaryOut[iStrideOut*x] /= iLength; -		} -	} -} - -void discreteFourierTransform2D(unsigned int iHeight, unsigned int iWidth, -                                const float32* pfRealIn, -                                const float32* pfImaginaryIn, -                                float32* pfRealOut, -                                float32* pfImaginaryOut, -                                bool inverse) -{ -	float32* reTemp = new float32[iWidth * iHeight]; -	float32* imTemp = new float32[iWidth * iHeight]; - -	//calculate the fourier transform of the columns -	for (unsigned int x = 0; x < iWidth; x++) -	{ -		discreteFourierTransform1D(iHeight, pfRealIn+x, pfImaginaryIn+x, -		                           reTemp+x, imTemp+x, -		                           iWidth, iWidth, inverse); -	} - -	//calculate the fourier transform of the rows -	for(unsigned int y = 0; y < iHeight; y++) -	{ -		discreteFourierTransform1D(iWidth, -		            
               reTemp+y*iWidth, -		                           imTemp+y*iWidth, -		                           pfRealOut+y*iWidth, -		                           pfImaginaryOut+y*iWidth, -		                           1, 1, inverse); -	} - -	delete[] reTemp; -	delete[] imTemp; -} - -/** permute the entries from pfDataIn into pfDataOut to prepare for an - *  in-place FFT. pfDataIn may be equal to pfDataOut. - */ -static void bitReverse(unsigned int iLength, -                       const float32* pfDataIn, float32* pfDataOut, -                       unsigned int iStrideShiftIn, -                       unsigned int iStrideShiftOut) -{ -	if (pfDataIn == pfDataOut) { -		assert(iStrideShiftIn == iStrideShiftOut); -		float32 t; -		unsigned int j = 0; -		for(unsigned int i = 0; i < iLength - 1; i++) { -			if (i < j) { -				t = pfDataOut[i<<iStrideShiftOut]; -				pfDataOut[i<<iStrideShiftOut] = pfDataOut[j<<iStrideShiftOut]; -				pfDataOut[j<<iStrideShiftOut] = t; -			} -			unsigned int k = iLength / 2; -			while (k <= j) { -				j -= k; -				k /= 2; -			} -			j += k; -		} -	} else { -		unsigned int j = 0; -		for(unsigned int i = 0; i < iLength - 1; i++) { -			pfDataOut[i<<iStrideShiftOut] = pfDataIn[j<<iStrideShiftIn]; -			unsigned int k = iLength / 2; -			while (k <= j) { -				j -= k; -				k /= 2; -			} -			j += k; -		} -		pfDataOut[(iLength-1)<<iStrideShiftOut] = pfDataIn[(iLength-1)<<iStrideShiftOut]; -	} -} - -static unsigned int log2(unsigned int n) -{ -	unsigned int l = 0; -	while (n > 1) { -		n /= 2; -		++l; -	} -	return l; -} - -/** perform 1D FFT. iLength, iStrideIn, iStrideOut must be powers of two. 
*/ -void fastTwoPowerFourierTransform1D(unsigned int iLength, -                                    const float32* pfRealIn, -                                    const float32* pfImaginaryIn, -                                    float32* pfRealOut, -                                    float32* pfImaginaryOut, -                                    unsigned int iStrideIn, -                                    unsigned int iStrideOut, -                                    bool inverse) -{ -	unsigned int iStrideShiftIn = log2(iStrideIn); -	unsigned int iStrideShiftOut = log2(iStrideOut); -	unsigned int iLogLength = log2(iLength); - -	bitReverse(iLength, pfRealIn, pfRealOut, iStrideShiftIn, iStrideShiftOut); -	bitReverse(iLength, pfImaginaryIn, pfImaginaryOut, iStrideShiftIn, iStrideShiftOut); - -	float32 ca = -1.0; -	float32 sa = 0.0; -	unsigned int l1 = 1, l2 = 1; -	for(unsigned int l=0; l < iLogLength; ++l) -	{ -		l1 = l2; -		l2 *= 2; -		float32 u1 = 1.0; -		float32 u2 = 0.0; -		for(unsigned int j = 0; j < l1; j++) -		{ -			for(unsigned int i = j; i < iLength; i += l2) -			{ -				unsigned int i1 = i + l1; -				float32 t1 = u1 * pfRealOut[i1<<iStrideShiftOut] - u2 * pfImaginaryOut[i1<<iStrideShiftOut]; -				float32 t2 = u1 * pfImaginaryOut[i1<<iStrideShiftOut] + u2 * pfRealOut[i1<<iStrideShiftOut]; -				pfRealOut[i1<<iStrideShiftOut] = pfRealOut[i<<iStrideShiftOut] - t1; -				pfImaginaryOut[i1<<iStrideShiftOut] = pfImaginaryOut[i<<iStrideShiftOut] - t2; -				pfRealOut[i<<iStrideShiftOut] += t1; -				pfImaginaryOut[i<<iStrideShiftOut] += t2; -			} -			float32 z =  u1 * ca - u2 * sa; -			u2 = u1 * sa + u2 * ca; -			u1 = z; -		} -		sa = sqrt((1.0 - ca) / 2.0); -		if (!inverse)  -			sa = -sa; -		ca = sqrt((1.0 + ca) / 2.0); -	} - -	if (inverse) { -		for (unsigned int i = 0; i < iLength; ++i) { -			pfRealOut[i<<iStrideShiftOut] /= iLength; -			pfImaginaryOut[i<<iStrideShiftOut] /= iLength; -		} -	} -} - -void fastTwoPowerFourierTransform2D(unsigned int iHeight, -               
                     unsigned int iWidth, -                                    const float32* pfRealIn, -                                    const float32* pfImaginaryIn, -                                    float32* pfRealOut, -                                    float32* pfImaginaryOut, -                                    bool inverse) -{ -	//calculate the fourier transform of the columns -	for (unsigned int x = 0; x < iWidth; x++) -	{ -		fastTwoPowerFourierTransform1D(iHeight, pfRealIn+x, pfImaginaryIn+x, -		                               pfRealOut+x, pfImaginaryOut+x, -		                               iWidth, iWidth, inverse); -	} - -	//calculate the fourier transform of the rows -	for (unsigned int y = 0; y < iHeight; y++) -	{ -		fastTwoPowerFourierTransform1D(iWidth, -		                               pfRealOut+y*iWidth, -		                               pfImaginaryOut+y*iWidth, -		                               pfRealOut+y*iWidth, -		                               pfImaginaryOut+y*iWidth, -		                               1, 1, inverse); -	} +You may use, copy, modify and distribute this code for any purpose (include commercial use) and without fee. 
+ +Source: http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html + +Fast Fourier/Cosine/Sine Transform +    dimension   :one +    data length :power of 2 +    decimation  :frequency +    radix       :split-radix +    data        :inplace +    table       :use +functions +    cdft: Complex Discrete Fourier Transform +    rdft: Real Discrete Fourier Transform +    ddct: Discrete Cosine Transform +    ddst: Discrete Sine Transform +    dfct: Cosine Transform of RDFT (Real Symmetric DFT) +    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT) +function prototypes +    void cdft(int, int, float32 *, int *, float32 *); +    void rdft(int, int, float32 *, int *, float32 *); +    void ddct(int, int, float32 *, int *, float32 *); +    void ddst(int, int, float32 *, int *, float32 *); +    void dfct(int, float32 *, float32 *, int *, float32 *); +    void dfst(int, float32 *, float32 *, int *, float32 *); +macro definitions +    USE_CDFT_PTHREADS : default=not defined +        CDFT_THREADS_BEGIN_N  : must be >= 512, default=8192 +        CDFT_4THREADS_BEGIN_N : must be >= 512, default=65536 +    USE_CDFT_WINTHREADS : default=not defined +        CDFT_THREADS_BEGIN_N  : must be >= 512, default=32768 +        CDFT_4THREADS_BEGIN_N : must be >= 512, default=524288 + + +-------- Complex DFT (Discrete Fourier Transform) -------- +    [definition] +        <case1> +            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n +        <case2> +            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n +        (notes: sum_j=0^n-1 is a summation from j=0 to n-1) +    [usage] +        <case1> +            ip[0] = 0; // first time only +            cdft(2*n, 1, a, ip, w); +        <case2> +            ip[0] = 0; // first time only +            cdft(2*n, -1, a, ip, w); +    [parameters] +        2*n            :data length (int) +                        n >= 1, n = power of 2 +        a[0...2*n-1]   :input/output data (float32 *) +                        input data +                  
          a[2*j] = Re(x[j]),  +                            a[2*j+1] = Im(x[j]), 0<=j<n +                        output data +                            a[2*k] = Re(X[k]),  +                            a[2*k+1] = Im(X[k]), 0<=k<n +        ip[0...*]      :work area for bit reversal (int *) +                        length of ip >= 2+sqrt(n) +                        strictly,  +                        length of ip >=  +                            2+(1<<(int)(log(n+0.5)/log(2))/2). +                        ip[0],ip[1] are pointers of the cos/sin table. +        w[0...n/2-1]   :cos/sin table (float32 *) +                        w[],ip[] are initialized if ip[0] == 0. +    [remark] +        Inverse of  +            cdft(2*n, -1, a, ip, w); +        is  +            cdft(2*n, 1, a, ip, w); +            for (j = 0; j <= 2 * n - 1; j++) { +                a[j] *= 1.0 / n; +            } +        . + + +-------- Real DFT / Inverse of Real DFT -------- +    [definition] +        <case1> RDFT +            R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2 +            I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2 +        <case2> IRDFT (excluding scale) +            a[k] = (R[0] + R[n/2]*cos(pi*k))/2 +  +                   sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) +  +                   sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n +    [usage] +        <case1> +            ip[0] = 0; // first time only +            rdft(n, 1, a, ip, w); +        <case2> +            ip[0] = 0; // first time only +            rdft(n, -1, a, ip, w); +    [parameters] +        n              :data length (int) +                        n >= 2, n = power of 2 +        a[0...n-1]     :input/output data (float32 *) +                        <case1> +                            output data +                                a[2*k] = R[k], 0<=k<n/2 +                                a[2*k+1] = I[k], 0<k<n/2 +                                a[1] = R[n/2] +                        <case2> +                           
 input data +                                a[2*j] = R[j], 0<=j<n/2 +                                a[2*j+1] = I[j], 0<j<n/2 +                                a[1] = R[n/2] +        ip[0...*]      :work area for bit reversal (int *) +                        length of ip >= 2+sqrt(n/2) +                        strictly,  +                        length of ip >=  +                            2+(1<<(int)(log(n/2+0.5)/log(2))/2). +                        ip[0],ip[1] are pointers of the cos/sin table. +        w[0...n/2-1]   :cos/sin table (float32 *) +                        w[],ip[] are initialized if ip[0] == 0. +    [remark] +        Inverse of  +            rdft(n, 1, a, ip, w); +        is  +            rdft(n, -1, a, ip, w); +            for (j = 0; j <= n - 1; j++) { +                a[j] *= 2.0 / n; +            } +        . + + +-------- DCT (Discrete Cosine Transform) / Inverse of DCT -------- +    [definition] +        <case1> IDCT (excluding scale) +            C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n +        <case2> DCT +            C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n +    [usage] +        <case1> +            ip[0] = 0; // first time only +            ddct(n, 1, a, ip, w); +        <case2> +            ip[0] = 0; // first time only +            ddct(n, -1, a, ip, w); +    [parameters] +        n              :data length (int) +                        n >= 2, n = power of 2 +        a[0...n-1]     :input/output data (float32 *) +                        output data +                            a[k] = C[k], 0<=k<n +        ip[0...*]      :work area for bit reversal (int *) +                        length of ip >= 2+sqrt(n/2) +                        strictly,  +                        length of ip >=  +                            2+(1<<(int)(log(n/2+0.5)/log(2))/2). +                        ip[0],ip[1] are pointers of the cos/sin table. 
+        w[0...n*5/4-1] :cos/sin table (float32 *) +                        w[],ip[] are initialized if ip[0] == 0. +    [remark] +        Inverse of  +            ddct(n, -1, a, ip, w); +        is  +            a[0] *= 0.5; +            ddct(n, 1, a, ip, w); +            for (j = 0; j <= n - 1; j++) { +                a[j] *= 2.0 / n; +            } +        . + + +-------- DST (Discrete Sine Transform) / Inverse of DST -------- +    [definition] +        <case1> IDST (excluding scale) +            S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n +        <case2> DST +            S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n +    [usage] +        <case1> +            ip[0] = 0; // first time only +            ddst(n, 1, a, ip, w); +        <case2> +            ip[0] = 0; // first time only +            ddst(n, -1, a, ip, w); +    [parameters] +        n              :data length (int) +                        n >= 2, n = power of 2 +        a[0...n-1]     :input/output data (float32 *) +                        <case1> +                            input data +                                a[j] = A[j], 0<j<n +                                a[0] = A[n] +                            output data +                                a[k] = S[k], 0<=k<n +                        <case2> +                            output data +                                a[k] = S[k], 0<k<n +                                a[0] = S[n] +        ip[0...*]      :work area for bit reversal (int *) +                        length of ip >= 2+sqrt(n/2) +                        strictly,  +                        length of ip >=  +                            2+(1<<(int)(log(n/2+0.5)/log(2))/2). +                        ip[0],ip[1] are pointers of the cos/sin table. +        w[0...n*5/4-1] :cos/sin table (float32 *) +                        w[],ip[] are initialized if ip[0] == 0. 
+    [remark] +        Inverse of  +            ddst(n, -1, a, ip, w); +        is  +            a[0] *= 0.5; +            ddst(n, 1, a, ip, w); +            for (j = 0; j <= n - 1; j++) { +                a[j] *= 2.0 / n; +            } +        . + + +-------- Cosine Transform of RDFT (Real Symmetric DFT) -------- +    [definition] +        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n +    [usage] +        ip[0] = 0; // first time only +        dfct(n, a, t, ip, w); +    [parameters] +        n              :data length - 1 (int) +                        n >= 2, n = power of 2 +        a[0...n]       :input/output data (float32 *) +                        output data +                            a[k] = C[k], 0<=k<=n +        t[0...n/2]     :work area (float32 *) +        ip[0...*]      :work area for bit reversal (int *) +                        length of ip >= 2+sqrt(n/4) +                        strictly,  +                        length of ip >=  +                            2+(1<<(int)(log(n/4+0.5)/log(2))/2). +                        ip[0],ip[1] are pointers of the cos/sin table. +        w[0...n*5/8-1] :cos/sin table (float32 *) +                        w[],ip[] are initialized if ip[0] == 0. +    [remark] +        Inverse of  +            a[0] *= 0.5; +            a[n] *= 0.5; +            dfct(n, a, t, ip, w); +        is  +            a[0] *= 0.5; +            a[n] *= 0.5; +            dfct(n, a, t, ip, w); +            for (j = 0; j <= n; j++) { +                a[j] *= 2.0 / n; +            } +        . 
+ + +-------- Sine Transform of RDFT (Real Anti-symmetric DFT) -------- +    [definition] +        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n +    [usage] +        ip[0] = 0; // first time only +        dfst(n, a, t, ip, w); +    [parameters] +        n              :data length + 1 (int) +                        n >= 2, n = power of 2 +        a[0...n-1]     :input/output data (float32 *) +                        output data +                            a[k] = S[k], 0<k<n +                        (a[0] is used for work area) +        t[0...n/2-1]   :work area (float32 *) +        ip[0...*]      :work area for bit reversal (int *) +                        length of ip >= 2+sqrt(n/4) +                        strictly,  +                        length of ip >=  +                            2+(1<<(int)(log(n/4+0.5)/log(2))/2). +                        ip[0],ip[1] are pointers of the cos/sin table. +        w[0...n*5/8-1] :cos/sin table (float32 *) +                        w[],ip[] are initialized if ip[0] == 0. +    [remark] +        Inverse of  +            dfst(n, a, t, ip, w); +        is  +            dfst(n, a, t, ip, w); +            for (j = 1; j <= n - 1; j++) { +                a[j] *= 2.0 / n; +            } +        . + + +Appendix : +    The cos/sin table is recalculated when the larger table required. +    w[] and ip[] are compatible with all routines. 
+*/ + + +static int cfttree(int n, int j, int k, float32 *a, int nw, float32 *w); +static void bitrv208(float32 *a); +static void bitrv208neg(float32 *a); +static void bitrv216(float32 *a); +static void bitrv216neg(float32 *a); +static void bitrv2conj(int n, int *ip, float32 *a); +static void bitrv2(int n, int *ip, float32 *a); +static void cftb040(float32 *a); +static void cftb1st(int n, float32 *a, float32 *w); +static void cftbsub(int n, float32 *a, int *ip, int nw, float32 *w); +static void cftf040(float32 *a); +static void cftf081(float32 *a, float32 *w); +static void cftf082(float32 *a, float32 *w); +static void cftf161(float32 *a, float32 *w); +static void cftf162(float32 *a, float32 *w); +static void cftf1st(int n, float32 *a, float32 *w); +static void cftfsub(int n, float32 *a, int *ip, int nw, float32 *w); +static void cftfx41(int n, float32 *a, int nw, float32 *w); +static void cftleaf(int n, int isplt, float32 *a, int nw, float32 *w); +static void cftmdl1(int n, float32 *a, float32 *w); +static void cftmdl2(int n, float32 *a, float32 *w); +static void *cftrec1_th(void *p); +static void *cftrec2_th(void *p); +static void cftrec4(int n, float32 *a, int nw, float32 *w); +static void cftx020(float32 *a); +static void dctsub(int n, float32 *a, int nc, float32 *c); +static void dstsub(int n, float32 *a, int nc, float32 *c); +static void makect(int nc, int *ip, float32 *c); +static void makeipt(int nw, int *ip); +static void makewt(int nw, int *ip, float32 *w); +static void rftbsub(int n, float32 *a, int nc, float32 *c); +static void rftfsub(int n, float32 *a, int nc, float32 *c); +#ifdef USE_CDFT_THREADS +static void cftrec4_th(int n, float32 *a, int nw, float32 *w); +#endif /* USE_CDFT_THREADS */ +     +   +_AstraExport void cdft(int n, int isgn, float32 *a, int *ip, float32 *w) +{ +    int nw; +     +    nw = ip[0]; +    if (n > (nw << 2)) { +        nw = n >> 2; +        makewt(nw, ip, w); +    } +    if (isgn >= 0) { +        cftfsub(n, a, ip, nw, w); +   
 } else { +        cftbsub(n, a, ip, nw, w); +    } +} + + +_AstraExport void rdft(int n, int isgn, float32 *a, int *ip, float32 *w) +{ +    int nw, nc; +    float32 xi; +     +    nw = ip[0]; +    if (n > (nw << 2)) { +        nw = n >> 2; +        makewt(nw, ip, w); +    } +    nc = ip[1]; +    if (n > (nc << 2)) { +        nc = n >> 2; +        makect(nc, ip, w + nw); +    } +    if (isgn >= 0) { +        if (n > 4) { +            cftfsub(n, a, ip, nw, w); +            rftfsub(n, a, nc, w + nw); +        } else if (n == 4) { +            cftfsub(n, a, ip, nw, w); +        } +        xi = a[0] - a[1]; +        a[0] += a[1]; +        a[1] = xi; +    } else { +        a[1] = 0.5 * (a[0] - a[1]); +        a[0] -= a[1]; +        if (n > 4) { +            rftbsub(n, a, nc, w + nw); +            cftbsub(n, a, ip, nw, w); +        } else if (n == 4) { +            cftbsub(n, a, ip, nw, w); +        } +    } +} + + +_AstraExport void ddct(int n, int isgn, float32 *a, int *ip, float32 *w) +{ +    int j, nw, nc; +    float32 xr; +     +    nw = ip[0]; +    if (n > (nw << 2)) { +        nw = n >> 2; +        makewt(nw, ip, w); +    } +    nc = ip[1]; +    if (n > nc) { +        nc = n; +        makect(nc, ip, w + nw); +    } +    if (isgn < 0) { +        xr = a[n - 1]; +        for (j = n - 2; j >= 2; j -= 2) { +            a[j + 1] = a[j] - a[j - 1]; +            a[j] += a[j - 1]; +        } +        a[1] = a[0] - xr; +        a[0] += xr; +        if (n > 4) { +            rftbsub(n, a, nc, w + nw); +            cftbsub(n, a, ip, nw, w); +        } else if (n == 4) { +            cftbsub(n, a, ip, nw, w); +        } +    } +    dctsub(n, a, nc, w + nw); +    if (isgn >= 0) { +        if (n > 4) { +            cftfsub(n, a, ip, nw, w); +            rftfsub(n, a, nc, w + nw); +        } else if (n == 4) { +            cftfsub(n, a, ip, nw, w); +        } +        xr = a[0] - a[1]; +        a[0] += a[1]; +        for (j = 2; j < n; j += 2) { +            a[j - 1] = a[j] - a[j 
+ 1]; +            a[j] += a[j + 1]; +        } +        a[n - 1] = xr; +    } +} + + +_AstraExport void ddst(int n, int isgn, float32 *a, int *ip, float32 *w) +{ +    int j, nw, nc; +    float32 xr; +     +    nw = ip[0]; +    if (n > (nw << 2)) { +        nw = n >> 2; +        makewt(nw, ip, w); +    } +    nc = ip[1]; +    if (n > nc) { +        nc = n; +        makect(nc, ip, w + nw); +    } +    if (isgn < 0) { +        xr = a[n - 1]; +        for (j = n - 2; j >= 2; j -= 2) { +            a[j + 1] = -a[j] - a[j - 1]; +            a[j] -= a[j - 1]; +        } +        a[1] = a[0] + xr; +        a[0] -= xr; +        if (n > 4) { +            rftbsub(n, a, nc, w + nw); +            cftbsub(n, a, ip, nw, w); +        } else if (n == 4) { +            cftbsub(n, a, ip, nw, w); +        } +    } +    dstsub(n, a, nc, w + nw); +    if (isgn >= 0) { +        if (n > 4) { +            cftfsub(n, a, ip, nw, w); +            rftfsub(n, a, nc, w + nw); +        } else if (n == 4) { +            cftfsub(n, a, ip, nw, w); +        } +        xr = a[0] - a[1]; +        a[0] += a[1]; +        for (j = 2; j < n; j += 2) { +            a[j - 1] = -a[j] - a[j + 1]; +            a[j] -= a[j + 1]; +        } +        a[n - 1] = -xr; +    } +} + + +_AstraExport void dfct(int n, float32 *a, float32 *t, int *ip, float32 *w) +{ +    int j, k, l, m, mh, nw, nc; +    float32 xr, xi, yr, yi; +     +    nw = ip[0]; +    if (n > (nw << 3)) { +        nw = n >> 3; +        makewt(nw, ip, w); +    } +    nc = ip[1]; +    if (n > (nc << 1)) { +        nc = n >> 1; +        makect(nc, ip, w + nw); +    } +    m = n >> 1; +    yi = a[m]; +    xi = a[0] + a[n]; +    a[0] -= a[n]; +    t[0] = xi - yi; +    t[m] = xi + yi; +    if (n > 2) { +        mh = m >> 1; +        for (j = 1; j < mh; j++) { +            k = m - j; +            xr = a[j] - a[n - j]; +            xi = a[j] + a[n - j]; +            yr = a[k] - a[n - k]; +            yi = a[k] + a[n - k]; +            a[j] = xr; +            
a[k] = yr; +            t[j] = xi - yi; +            t[k] = xi + yi; +        } +        t[mh] = a[mh] + a[n - mh]; +        a[mh] -= a[n - mh]; +        dctsub(m, a, nc, w + nw); +        if (m > 4) { +            cftfsub(m, a, ip, nw, w); +            rftfsub(m, a, nc, w + nw); +        } else if (m == 4) { +            cftfsub(m, a, ip, nw, w); +        } +        a[n - 1] = a[0] - a[1]; +        a[1] = a[0] + a[1]; +        for (j = m - 2; j >= 2; j -= 2) { +            a[2 * j + 1] = a[j] + a[j + 1]; +            a[2 * j - 1] = a[j] - a[j + 1]; +        } +        l = 2; +        m = mh; +        while (m >= 2) { +            dctsub(m, t, nc, w + nw); +            if (m > 4) { +                cftfsub(m, t, ip, nw, w); +                rftfsub(m, t, nc, w + nw); +            } else if (m == 4) { +                cftfsub(m, t, ip, nw, w); +            } +            a[n - l] = t[0] - t[1]; +            a[l] = t[0] + t[1]; +            k = 0; +            for (j = 2; j < m; j += 2) { +                k += l << 2; +                a[k - l] = t[j] - t[j + 1]; +                a[k + l] = t[j] + t[j + 1]; +            } +            l <<= 1; +            mh = m >> 1; +            for (j = 0; j < mh; j++) { +                k = m - j; +                t[j] = t[m + k] - t[m + j]; +                t[k] = t[m + k] + t[m + j]; +            } +            t[mh] = t[m + mh]; +            m = mh; +        } +        a[l] = t[0]; +        a[n] = t[2] - t[1]; +        a[0] = t[2] + t[1]; +    } else { +        a[1] = a[0]; +        a[2] = t[0]; +        a[0] = t[1]; +    } +} + + +_AstraExport void dfst(int n, float32 *a, float32 *t, int *ip, float32 *w) +{ +    int j, k, l, m, mh, nw, nc; +    float32 xr, xi, yr, yi; +     +    nw = ip[0]; +    if (n > (nw << 3)) { +        nw = n >> 3; +        makewt(nw, ip, w); +    } +    nc = ip[1]; +    if (n > (nc << 1)) { +        nc = n >> 1; +        makect(nc, ip, w + nw); +    } +    if (n > 2) { +        m = n >> 1; +        mh 
= m >> 1; +        for (j = 1; j < mh; j++) { +            k = m - j; +            xr = a[j] + a[n - j]; +            xi = a[j] - a[n - j]; +            yr = a[k] + a[n - k]; +            yi = a[k] - a[n - k]; +            a[j] = xr; +            a[k] = yr; +            t[j] = xi + yi; +            t[k] = xi - yi; +        } +        t[0] = a[mh] - a[n - mh]; +        a[mh] += a[n - mh]; +        a[0] = a[m]; +        dstsub(m, a, nc, w + nw); +        if (m > 4) { +            cftfsub(m, a, ip, nw, w); +            rftfsub(m, a, nc, w + nw); +        } else if (m == 4) { +            cftfsub(m, a, ip, nw, w); +        } +        a[n - 1] = a[1] - a[0]; +        a[1] = a[0] + a[1]; +        for (j = m - 2; j >= 2; j -= 2) { +            a[2 * j + 1] = a[j] - a[j + 1]; +            a[2 * j - 1] = -a[j] - a[j + 1]; +        } +        l = 2; +        m = mh; +        while (m >= 2) { +            dstsub(m, t, nc, w + nw); +            if (m > 4) { +                cftfsub(m, t, ip, nw, w); +                rftfsub(m, t, nc, w + nw); +            } else if (m == 4) { +                cftfsub(m, t, ip, nw, w); +            } +            a[n - l] = t[1] - t[0]; +            a[l] = t[0] + t[1]; +            k = 0; +            for (j = 2; j < m; j += 2) { +                k += l << 2; +                a[k - l] = -t[j] - t[j + 1]; +                a[k + l] = t[j] - t[j + 1]; +            } +            l <<= 1; +            mh = m >> 1; +            for (j = 1; j < mh; j++) { +                k = m - j; +                t[j] = t[m + k] + t[m + j]; +                t[k] = t[m + k] - t[m + j]; +            } +            t[0] = t[m + mh]; +            m = mh; +        } +        a[l] = t[0]; +    } +    a[0] = 0; +} + + +/* -------- initializing routines -------- */ + +static void makewt(int nw, int *ip, float32 *w) +{ +    int j, nwh, nw0, nw1; +    float32 delta, wn4r, wk1r, wk1i, wk3r, wk3i; +     +    ip[0] = nw; +    ip[1] = 1; +    if (nw > 2) { +        nwh = nw >> 
1; +        delta = atan(1.0) / nwh; +        wn4r = cos(delta * nwh); +        w[0] = 1; +        w[1] = wn4r; +        if (nwh == 4) { +            w[2] = cos(delta * 2); +            w[3] = sin(delta * 2); +        } else if (nwh > 4) { +            makeipt(nw, ip); +            w[2] = 0.5 / cos(delta * 2); +            w[3] = 0.5 / cos(delta * 6); +            for (j = 4; j < nwh; j += 4) { +                w[j] = cos(delta * j); +                w[j + 1] = sin(delta * j); +                w[j + 2] = cos(3 * delta * j); +                w[j + 3] = -sin(3 * delta * j); +            } +        } +        nw0 = 0; +        while (nwh > 2) { +            nw1 = nw0 + nwh; +            nwh >>= 1; +            w[nw1] = 1; +            w[nw1 + 1] = wn4r; +            if (nwh == 4) { +                wk1r = w[nw0 + 4]; +                wk1i = w[nw0 + 5]; +                w[nw1 + 2] = wk1r; +                w[nw1 + 3] = wk1i; +            } else if (nwh > 4) { +                wk1r = w[nw0 + 4]; +                wk3r = w[nw0 + 6]; +                w[nw1 + 2] = 0.5 / wk1r; +                w[nw1 + 3] = 0.5 / wk3r; +                for (j = 4; j < nwh; j += 4) { +                    wk1r = w[nw0 + 2 * j]; +                    wk1i = w[nw0 + 2 * j + 1]; +                    wk3r = w[nw0 + 2 * j + 2]; +                    wk3i = w[nw0 + 2 * j + 3]; +                    w[nw1 + j] = wk1r; +                    w[nw1 + j + 1] = wk1i; +                    w[nw1 + j + 2] = wk3r; +                    w[nw1 + j + 3] = wk3i; +                } +            } +            nw0 = nw1; +        } +    } +} + + +static void makeipt(int nw, int *ip) +{ +    int j, l, m, m2, p, q; +     +    ip[2] = 0; +    ip[3] = 16; +    m = 2; +    for (l = nw; l > 32; l >>= 2) { +        m2 = m << 1; +        q = m2 << 3; +        for (j = m; j < m2; j++) { +            p = ip[j] << 2; +            ip[m + j] = p; +            ip[m2 + j] = p + q; +        } +        m = m2; +    } +} + + +static void 
makect(int nc, int *ip, float32 *c) +{ +    int j, nch; +    float32 delta; +     +    ip[1] = nc; +    if (nc > 1) { +        nch = nc >> 1; +        delta = atan(1.0) / nch; +        c[0] = cos(delta * nch); +        c[nch] = 0.5 * c[0]; +        for (j = 1; j < nch; j++) { +            c[j] = 0.5 * cos(delta * j); +            c[nc - j] = 0.5 * sin(delta * j); +        } +    } +} + + +/* -------- child routines -------- */ + + +#ifdef USE_CDFT_PTHREADS +#define USE_CDFT_THREADS +#ifndef CDFT_THREADS_BEGIN_N +#define CDFT_THREADS_BEGIN_N 8192 +#endif +#ifndef CDFT_4THREADS_BEGIN_N +#define CDFT_4THREADS_BEGIN_N 65536 +#endif +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#define cdft_thread_t pthread_t +#define cdft_thread_create(thp,func,argp) { \ +    if (pthread_create(thp, NULL, func, (void *) argp) != 0) { \ +        fprintf(stderr, "cdft thread error\n"); \ +        exit(1); \ +    } \ +} +#define cdft_thread_wait(th) { \ +    if (pthread_join(th, NULL) != 0) { \ +        fprintf(stderr, "cdft thread error\n"); \ +        exit(1); \ +    } \ +} +#endif /* USE_CDFT_PTHREADS */ + + +#ifdef USE_CDFT_WINTHREADS +#define USE_CDFT_THREADS +#ifndef CDFT_THREADS_BEGIN_N +#define CDFT_THREADS_BEGIN_N 32768 +#endif +#ifndef CDFT_4THREADS_BEGIN_N +#define CDFT_4THREADS_BEGIN_N 524288 +#endif +#include <windows.h> +#include <stdio.h> +#include <stdlib.h> +#define cdft_thread_t HANDLE +#define cdft_thread_create(thp,func,argp) { \ +    DWORD thid; \ +    *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, (LPVOID) argp, 0, &thid); \ +    if (*(thp) == 0) { \ +        fprintf(stderr, "cdft thread error\n"); \ +        exit(1); \ +    } \ +} +#define cdft_thread_wait(th) { \ +    WaitForSingleObject(th, INFINITE); \ +    CloseHandle(th); \ +} +#endif /* USE_CDFT_WINTHREADS */ + + +static void cftfsub(int n, float32 *a, int *ip, int nw, float32 *w) +{ +    if (n > 8) { +        if (n > 32) { +            cftf1st(n, a, &w[nw - (n >> 2)]); +#ifdef 
USE_CDFT_THREADS +            if (n > CDFT_THREADS_BEGIN_N) { +                cftrec4_th(n, a, nw, w); +            } else  +#endif /* USE_CDFT_THREADS */ +            if (n > 512) { +                cftrec4(n, a, nw, w); +            } else if (n > 128) { +                cftleaf(n, 1, a, nw, w); +            } else { +                cftfx41(n, a, nw, w); +            } +            bitrv2(n, ip, a); +        } else if (n == 32) { +            cftf161(a, &w[nw - 8]); +            bitrv216(a); +        } else { +            cftf081(a, w); +            bitrv208(a); +        } +    } else if (n == 8) { +        cftf040(a); +    } else if (n == 4) { +        cftx020(a); +    } +} + + +static void cftbsub(int n, float32 *a, int *ip, int nw, float32 *w) +{ +    if (n > 8) { +        if (n > 32) { +            cftb1st(n, a, &w[nw - (n >> 2)]); +#ifdef USE_CDFT_THREADS +            if (n > CDFT_THREADS_BEGIN_N) { +                cftrec4_th(n, a, nw, w); +            } else  +#endif /* USE_CDFT_THREADS */ +            if (n > 512) { +                cftrec4(n, a, nw, w); +            } else if (n > 128) { +                cftleaf(n, 1, a, nw, w); +            } else { +                cftfx41(n, a, nw, w); +            } +            bitrv2conj(n, ip, a); +        } else if (n == 32) { +            cftf161(a, &w[nw - 8]); +            bitrv216neg(a); +        } else { +            cftf081(a, w); +            bitrv208neg(a); +        } +    } else if (n == 8) { +        cftb040(a); +    } else if (n == 4) { +        cftx020(a); +    } +} + + +static void bitrv2(int n, int *ip, float32 *a) +{ +    int j, j1, k, k1, l, m, nh, nm; +    float32 xr, xi, yr, yi; +     +    m = 1; +    for (l = n >> 2; l > 8; l >>= 2) { +        m <<= 1; +    } +    nh = n >> 1; +    nm = 4 * m; +    if (l == 8) { +        for (k = 0; k < m; k++) { +            for (j = 0; j < k; j++) { +                j1 = 4 * j + 2 * ip[m + k]; +                k1 = 4 * k + 2 * ip[m + j]; +                xr 
= a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 += 2 * nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 -= nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 += 2 * nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nh; +                k1 += 2; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 -= 2 * nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 += nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +          
      a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 -= 2 * nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += 2; +                k1 += nh; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 += 2 * nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 -= nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 += 2 * nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nh; +                k1 -= 2; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 -= 2 * nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +      
          yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 += nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 -= 2 * nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +            } +            k1 = 4 * k + 2 * ip[m + k]; +            j1 = k1 + 2; +            k1 += nh; +            xr = a[j1]; +            xi = a[j1 + 1]; +            yr = a[k1]; +            yi = a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +            a[k1] = xr; +            a[k1 + 1] = xi; +            j1 += nm; +            k1 += 2 * nm; +            xr = a[j1]; +            xi = a[j1 + 1]; +            yr = a[k1]; +            yi = a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +            a[k1] = xr; +            a[k1 + 1] = xi; +            j1 += nm; +            k1 -= nm; +            xr = a[j1]; +            xi = a[j1 + 1]; +            yr = a[k1]; +            yi = a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +            a[k1] = xr; +            a[k1 + 1] = xi; +            j1 -= 2; +            k1 -= nh; +            xr = a[j1]; +            xi = a[j1 + 1]; +            yr = a[k1]; +            yi = a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +            a[k1] = xr; +            a[k1 + 1] = xi; +            j1 += nh + 2; +            k1 += nh + 2; +            xr = a[j1]; +            xi = a[j1 
+ 1]; +            yr = a[k1]; +            yi = a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +            a[k1] = xr; +            a[k1 + 1] = xi; +            j1 -= nh - nm; +            k1 += 2 * nm - 2; +            xr = a[j1]; +            xi = a[j1 + 1]; +            yr = a[k1]; +            yi = a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +            a[k1] = xr; +            a[k1 + 1] = xi; +        } +    } else { +        for (k = 0; k < m; k++) { +            for (j = 0; j < k; j++) { +                j1 = 4 * j + ip[m + k]; +                k1 = 4 * k + ip[m + j]; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 += nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nh; +                k1 += 2; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 -= nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += 2; +                k1 += nh; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                
a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 += nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nh; +                k1 -= 2; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 -= nm; +                xr = a[j1]; +                xi = a[j1 + 1]; +                yr = a[k1]; +                yi = a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +            } +            k1 = 4 * k + ip[m + k]; +            j1 = k1 + 2; +            k1 += nh; +            xr = a[j1]; +            xi = a[j1 + 1]; +            yr = a[k1]; +            yi = a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +            a[k1] = xr; +            a[k1 + 1] = xi; +            j1 += nm; +            k1 += nm; +            xr = a[j1]; +            xi = a[j1 + 1]; +            yr = a[k1]; +            yi = a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +            a[k1] = xr; +            a[k1 + 1] = xi; +        } +    } +} + + +static void bitrv2conj(int n, int *ip, float32 *a) +{ +    int j, j1, k, k1, l, m, nh, nm; +    float32 xr, xi, yr, yi; +     +    m = 1; +    for (l = n >> 2; l > 8; l >>= 2) { +        m <<= 1; +    } +    nh = n >> 1; +    nm = 4 * m; +    if (l == 8) { +        for (k = 0; k < m; k++) { +            for (j = 0; j < k; j++) { +                j1 = 4 * j + 2 * ip[m + k]; +                k1 = 4 * k + 2 * 
ip[m + j]; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 += 2 * nm; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 -= nm; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 += 2 * nm; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nh; +                k1 += 2; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 -= 2 * nm; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 += nm; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = 
yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 -= 2 * nm; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += 2; +                k1 += nh; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 += 2 * nm; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 -= nm; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 += 2 * nm; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nh; +                k1 -= 2; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 -= 2 * nm; +        
        xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 += nm; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 -= 2 * nm; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +            } +            k1 = 4 * k + 2 * ip[m + k]; +            j1 = k1 + 2; +            k1 += nh; +            a[j1 - 1] = -a[j1 - 1]; +            xr = a[j1]; +            xi = -a[j1 + 1]; +            yr = a[k1]; +            yi = -a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +            a[k1] = xr; +            a[k1 + 1] = xi; +            a[k1 + 3] = -a[k1 + 3]; +            j1 += nm; +            k1 += 2 * nm; +            xr = a[j1]; +            xi = -a[j1 + 1]; +            yr = a[k1]; +            yi = -a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +            a[k1] = xr; +            a[k1 + 1] = xi; +            j1 += nm; +            k1 -= nm; +            xr = a[j1]; +            xi = -a[j1 + 1]; +            yr = a[k1]; +            yi = -a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +            a[k1] = xr; +            a[k1 + 1] = xi; +            j1 -= 2; +            k1 -= nh; +            xr = a[j1]; +            xi = -a[j1 + 1]; +            yr = a[k1]; +            yi = -a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +        
    a[k1] = xr; +            a[k1 + 1] = xi; +            j1 += nh + 2; +            k1 += nh + 2; +            xr = a[j1]; +            xi = -a[j1 + 1]; +            yr = a[k1]; +            yi = -a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +            a[k1] = xr; +            a[k1 + 1] = xi; +            j1 -= nh - nm; +            k1 += 2 * nm - 2; +            a[j1 - 1] = -a[j1 - 1]; +            xr = a[j1]; +            xi = -a[j1 + 1]; +            yr = a[k1]; +            yi = -a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +            a[k1] = xr; +            a[k1 + 1] = xi; +            a[k1 + 3] = -a[k1 + 3]; +        } +    } else { +        for (k = 0; k < m; k++) { +            for (j = 0; j < k; j++) { +                j1 = 4 * j + ip[m + k]; +                k1 = 4 * k + ip[m + j]; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 += nm; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nh; +                k1 += 2; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 -= nm; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 
+ 1] = xi; +                j1 += 2; +                k1 += nh; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 += nm; +                k1 += nm; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nh; +                k1 -= 2; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +                j1 -= nm; +                k1 -= nm; +                xr = a[j1]; +                xi = -a[j1 + 1]; +                yr = a[k1]; +                yi = -a[k1 + 1]; +                a[j1] = yr; +                a[j1 + 1] = yi; +                a[k1] = xr; +                a[k1 + 1] = xi; +            } +            k1 = 4 * k + ip[m + k]; +            j1 = k1 + 2; +            k1 += nh; +            a[j1 - 1] = -a[j1 - 1]; +            xr = a[j1]; +            xi = -a[j1 + 1]; +            yr = a[k1]; +            yi = -a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +            a[k1] = xr; +            a[k1 + 1] = xi; +            a[k1 + 3] = -a[k1 + 3]; +            j1 += nm; +            k1 += nm; +            a[j1 - 1] = -a[j1 - 1]; +            xr = a[j1]; +            xi = -a[j1 + 1]; +            yr = a[k1]; +            yi = -a[k1 + 1]; +            a[j1] = yr; +            a[j1 + 1] = yi; +            a[k1] = xr; +            a[k1 + 1] = xi; +            a[k1 + 3] = -a[k1 + 3]; +        } +    } +} + + +static void 
bitrv216(float32 *a) +{ +    float32 x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i,  +        x5r, x5i, x7r, x7i, x8r, x8i, x10r, x10i,  +        x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i; +     +    x1r = a[2]; +    x1i = a[3]; +    x2r = a[4]; +    x2i = a[5]; +    x3r = a[6]; +    x3i = a[7]; +    x4r = a[8]; +    x4i = a[9]; +    x5r = a[10]; +    x5i = a[11]; +    x7r = a[14]; +    x7i = a[15]; +    x8r = a[16]; +    x8i = a[17]; +    x10r = a[20]; +    x10i = a[21]; +    x11r = a[22]; +    x11i = a[23]; +    x12r = a[24]; +    x12i = a[25]; +    x13r = a[26]; +    x13i = a[27]; +    x14r = a[28]; +    x14i = a[29]; +    a[2] = x8r; +    a[3] = x8i; +    a[4] = x4r; +    a[5] = x4i; +    a[6] = x12r; +    a[7] = x12i; +    a[8] = x2r; +    a[9] = x2i; +    a[10] = x10r; +    a[11] = x10i; +    a[14] = x14r; +    a[15] = x14i; +    a[16] = x1r; +    a[17] = x1i; +    a[20] = x5r; +    a[21] = x5i; +    a[22] = x13r; +    a[23] = x13i; +    a[24] = x3r; +    a[25] = x3i; +    a[26] = x11r; +    a[27] = x11i; +    a[28] = x7r; +    a[29] = x7i; +} + + +static void bitrv216neg(float32 *a) +{ +    float32 x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i,  +        x5r, x5i, x6r, x6i, x7r, x7i, x8r, x8i,  +        x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i,  +        x13r, x13i, x14r, x14i, x15r, x15i; +     +    x1r = a[2]; +    x1i = a[3]; +    x2r = a[4]; +    x2i = a[5]; +    x3r = a[6]; +    x3i = a[7]; +    x4r = a[8]; +    x4i = a[9]; +    x5r = a[10]; +    x5i = a[11]; +    x6r = a[12]; +    x6i = a[13]; +    x7r = a[14]; +    x7i = a[15]; +    x8r = a[16]; +    x8i = a[17]; +    x9r = a[18]; +    x9i = a[19]; +    x10r = a[20]; +    x10i = a[21]; +    x11r = a[22]; +    x11i = a[23]; +    x12r = a[24]; +    x12i = a[25]; +    x13r = a[26]; +    x13i = a[27]; +    x14r = a[28]; +    x14i = a[29]; +    x15r = a[30]; +    x15i = a[31]; +    a[2] = x15r; +    a[3] = x15i; +    a[4] = x7r; +    a[5] = x7i; +    a[6] = x11r; +    a[7] = x11i; +    a[8] = x3r; +    a[9] = 
x3i; +    a[10] = x13r; +    a[11] = x13i; +    a[12] = x5r; +    a[13] = x5i; +    a[14] = x9r; +    a[15] = x9i; +    a[16] = x1r; +    a[17] = x1i; +    a[18] = x14r; +    a[19] = x14i; +    a[20] = x6r; +    a[21] = x6i; +    a[22] = x10r; +    a[23] = x10i; +    a[24] = x2r; +    a[25] = x2i; +    a[26] = x12r; +    a[27] = x12i; +    a[28] = x4r; +    a[29] = x4i; +    a[30] = x8r; +    a[31] = x8i; +} + + +static void bitrv208(float32 *a) +{ +    float32 x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i; +     +    x1r = a[2]; +    x1i = a[3]; +    x3r = a[6]; +    x3i = a[7]; +    x4r = a[8]; +    x4i = a[9]; +    x6r = a[12]; +    x6i = a[13]; +    a[2] = x4r; +    a[3] = x4i; +    a[6] = x6r; +    a[7] = x6i; +    a[8] = x1r; +    a[9] = x1i; +    a[12] = x3r; +    a[13] = x3i; +} + + +static void bitrv208neg(float32 *a) +{ +    float32 x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i,  +        x5r, x5i, x6r, x6i, x7r, x7i; +     +    x1r = a[2]; +    x1i = a[3]; +    x2r = a[4]; +    x2i = a[5]; +    x3r = a[6]; +    x3i = a[7]; +    x4r = a[8]; +    x4i = a[9]; +    x5r = a[10]; +    x5i = a[11]; +    x6r = a[12]; +    x6i = a[13]; +    x7r = a[14]; +    x7i = a[15]; +    a[2] = x7r; +    a[3] = x7i; +    a[4] = x3r; +    a[5] = x3i; +    a[6] = x5r; +    a[7] = x5i; +    a[8] = x1r; +    a[9] = x1i; +    a[10] = x6r; +    a[11] = x6i; +    a[12] = x2r; +    a[13] = x2i; +    a[14] = x4r; +    a[15] = x4i; +} + + +static void cftf1st(int n, float32 *a, float32 *w) +{ +    int j, j0, j1, j2, j3, k, m, mh; +    float32 wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i,  +        wd1r, wd1i, wd3r, wd3i; +    float32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i,  +        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i; +     +    mh = n >> 3; +    m = 2 * mh; +    j1 = m; +    j2 = j1 + m; +    j3 = j2 + m; +    x0r = a[0] + a[j2]; +    x0i = a[1] + a[j2 + 1]; +    x1r = a[0] - a[j2]; +    x1i = a[1] - a[j2 + 1]; +    x2r = a[j1] + a[j3]; +    x2i = a[j1 + 1] + a[j3 + 1]; +    x3r = a[j1] - a[j3]; + 
   x3i = a[j1 + 1] - a[j3 + 1]; +    a[0] = x0r + x2r; +    a[1] = x0i + x2i; +    a[j1] = x0r - x2r; +    a[j1 + 1] = x0i - x2i; +    a[j2] = x1r - x3i; +    a[j2 + 1] = x1i + x3r; +    a[j3] = x1r + x3i; +    a[j3 + 1] = x1i - x3r; +    wn4r = w[1]; +    csc1 = w[2]; +    csc3 = w[3]; +    wd1r = 1; +    wd1i = 0; +    wd3r = 1; +    wd3i = 0; +    k = 0; +    for (j = 2; j < mh - 2; j += 4) { +        k += 4; +        wk1r = csc1 * (wd1r + w[k]); +        wk1i = csc1 * (wd1i + w[k + 1]); +        wk3r = csc3 * (wd3r + w[k + 2]); +        wk3i = csc3 * (wd3i + w[k + 3]); +        wd1r = w[k]; +        wd1i = w[k + 1]; +        wd3r = w[k + 2]; +        wd3i = w[k + 3]; +        j1 = j + m; +        j2 = j1 + m; +        j3 = j2 + m; +        x0r = a[j] + a[j2]; +        x0i = a[j + 1] + a[j2 + 1]; +        x1r = a[j] - a[j2]; +        x1i = a[j + 1] - a[j2 + 1]; +        y0r = a[j + 2] + a[j2 + 2]; +        y0i = a[j + 3] + a[j2 + 3]; +        y1r = a[j + 2] - a[j2 + 2]; +        y1i = a[j + 3] - a[j2 + 3]; +        x2r = a[j1] + a[j3]; +        x2i = a[j1 + 1] + a[j3 + 1]; +        x3r = a[j1] - a[j3]; +        x3i = a[j1 + 1] - a[j3 + 1]; +        y2r = a[j1 + 2] + a[j3 + 2]; +        y2i = a[j1 + 3] + a[j3 + 3]; +        y3r = a[j1 + 2] - a[j3 + 2]; +        y3i = a[j1 + 3] - a[j3 + 3]; +        a[j] = x0r + x2r; +        a[j + 1] = x0i + x2i; +        a[j + 2] = y0r + y2r; +        a[j + 3] = y0i + y2i; +        a[j1] = x0r - x2r; +        a[j1 + 1] = x0i - x2i; +        a[j1 + 2] = y0r - y2r; +        a[j1 + 3] = y0i - y2i; +        x0r = x1r - x3i; +        x0i = x1i + x3r; +        a[j2] = wk1r * x0r - wk1i * x0i; +        a[j2 + 1] = wk1r * x0i + wk1i * x0r; +        x0r = y1r - y3i; +        x0i = y1i + y3r; +        a[j2 + 2] = wd1r * x0r - wd1i * x0i; +        a[j2 + 3] = wd1r * x0i + wd1i * x0r; +        x0r = x1r + x3i; +        x0i = x1i - x3r; +        a[j3] = wk3r * x0r + wk3i * x0i; +        a[j3 + 1] = wk3r * x0i - wk3i * x0r; +        x0r = y1r 
+ y3i; +        x0i = y1i - y3r; +        a[j3 + 2] = wd3r * x0r + wd3i * x0i; +        a[j3 + 3] = wd3r * x0i - wd3i * x0r; +        j0 = m - j; +        j1 = j0 + m; +        j2 = j1 + m; +        j3 = j2 + m; +        x0r = a[j0] + a[j2]; +        x0i = a[j0 + 1] + a[j2 + 1]; +        x1r = a[j0] - a[j2]; +        x1i = a[j0 + 1] - a[j2 + 1]; +        y0r = a[j0 - 2] + a[j2 - 2]; +        y0i = a[j0 - 1] + a[j2 - 1]; +        y1r = a[j0 - 2] - a[j2 - 2]; +        y1i = a[j0 - 1] - a[j2 - 1]; +        x2r = a[j1] + a[j3]; +        x2i = a[j1 + 1] + a[j3 + 1]; +        x3r = a[j1] - a[j3]; +        x3i = a[j1 + 1] - a[j3 + 1]; +        y2r = a[j1 - 2] + a[j3 - 2]; +        y2i = a[j1 - 1] + a[j3 - 1]; +        y3r = a[j1 - 2] - a[j3 - 2]; +        y3i = a[j1 - 1] - a[j3 - 1]; +        a[j0] = x0r + x2r; +        a[j0 + 1] = x0i + x2i; +        a[j0 - 2] = y0r + y2r; +        a[j0 - 1] = y0i + y2i; +        a[j1] = x0r - x2r; +        a[j1 + 1] = x0i - x2i; +        a[j1 - 2] = y0r - y2r; +        a[j1 - 1] = y0i - y2i; +        x0r = x1r - x3i; +        x0i = x1i + x3r; +        a[j2] = wk1i * x0r - wk1r * x0i; +        a[j2 + 1] = wk1i * x0i + wk1r * x0r; +        x0r = y1r - y3i; +        x0i = y1i + y3r; +        a[j2 - 2] = wd1i * x0r - wd1r * x0i; +        a[j2 - 1] = wd1i * x0i + wd1r * x0r; +        x0r = x1r + x3i; +        x0i = x1i - x3r; +        a[j3] = wk3i * x0r + wk3r * x0i; +        a[j3 + 1] = wk3i * x0i - wk3r * x0r; +        x0r = y1r + y3i; +        x0i = y1i - y3r; +        a[j3 - 2] = wd3i * x0r + wd3r * x0i; +        a[j3 - 1] = wd3i * x0i - wd3r * x0r; +    } +    wk1r = csc1 * (wd1r + wn4r); +    wk1i = csc1 * (wd1i + wn4r); +    wk3r = csc3 * (wd3r - wn4r); +    wk3i = csc3 * (wd3i - wn4r); +    j0 = mh; +    j1 = j0 + m; +    j2 = j1 + m; +    j3 = j2 + m; +    x0r = a[j0 - 2] + a[j2 - 2]; +    x0i = a[j0 - 1] + a[j2 - 1]; +    x1r = a[j0 - 2] - a[j2 - 2]; +    x1i = a[j0 - 1] - a[j2 - 1]; +    x2r = a[j1 - 2] + a[j3 - 2]; +    x2i = 
a[j1 - 1] + a[j3 - 1]; +    x3r = a[j1 - 2] - a[j3 - 2]; +    x3i = a[j1 - 1] - a[j3 - 1]; +    a[j0 - 2] = x0r + x2r; +    a[j0 - 1] = x0i + x2i; +    a[j1 - 2] = x0r - x2r; +    a[j1 - 1] = x0i - x2i; +    x0r = x1r - x3i; +    x0i = x1i + x3r; +    a[j2 - 2] = wk1r * x0r - wk1i * x0i; +    a[j2 - 1] = wk1r * x0i + wk1i * x0r; +    x0r = x1r + x3i; +    x0i = x1i - x3r; +    a[j3 - 2] = wk3r * x0r + wk3i * x0i; +    a[j3 - 1] = wk3r * x0i - wk3i * x0r; +    x0r = a[j0] + a[j2]; +    x0i = a[j0 + 1] + a[j2 + 1]; +    x1r = a[j0] - a[j2]; +    x1i = a[j0 + 1] - a[j2 + 1]; +    x2r = a[j1] + a[j3]; +    x2i = a[j1 + 1] + a[j3 + 1]; +    x3r = a[j1] - a[j3]; +    x3i = a[j1 + 1] - a[j3 + 1]; +    a[j0] = x0r + x2r; +    a[j0 + 1] = x0i + x2i; +    a[j1] = x0r - x2r; +    a[j1 + 1] = x0i - x2i; +    x0r = x1r - x3i; +    x0i = x1i + x3r; +    a[j2] = wn4r * (x0r - x0i); +    a[j2 + 1] = wn4r * (x0i + x0r); +    x0r = x1r + x3i; +    x0i = x1i - x3r; +    a[j3] = -wn4r * (x0r + x0i); +    a[j3 + 1] = -wn4r * (x0i - x0r); +    x0r = a[j0 + 2] + a[j2 + 2]; +    x0i = a[j0 + 3] + a[j2 + 3]; +    x1r = a[j0 + 2] - a[j2 + 2]; +    x1i = a[j0 + 3] - a[j2 + 3]; +    x2r = a[j1 + 2] + a[j3 + 2]; +    x2i = a[j1 + 3] + a[j3 + 3]; +    x3r = a[j1 + 2] - a[j3 + 2]; +    x3i = a[j1 + 3] - a[j3 + 3]; +    a[j0 + 2] = x0r + x2r; +    a[j0 + 3] = x0i + x2i; +    a[j1 + 2] = x0r - x2r; +    a[j1 + 3] = x0i - x2i; +    x0r = x1r - x3i; +    x0i = x1i + x3r; +    a[j2 + 2] = wk1i * x0r - wk1r * x0i; +    a[j2 + 3] = wk1i * x0i + wk1r * x0r; +    x0r = x1r + x3i; +    x0i = x1i - x3r; +    a[j3 + 2] = wk3i * x0r + wk3r * x0i; +    a[j3 + 3] = wk3i * x0i - wk3r * x0r; +} + + +static void cftb1st(int n, float32 *a, float32 *w) +{ +    int j, j0, j1, j2, j3, k, m, mh; +    float32 wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i,  +        wd1r, wd1i, wd3r, wd3i; +    float32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i,  +        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i; +     +    mh = n >> 3; +    m = 2 
* mh; +    j1 = m; +    j2 = j1 + m; +    j3 = j2 + m; +    x0r = a[0] + a[j2]; +    x0i = -a[1] - a[j2 + 1]; +    x1r = a[0] - a[j2]; +    x1i = -a[1] + a[j2 + 1]; +    x2r = a[j1] + a[j3]; +    x2i = a[j1 + 1] + a[j3 + 1]; +    x3r = a[j1] - a[j3]; +    x3i = a[j1 + 1] - a[j3 + 1]; +    a[0] = x0r + x2r; +    a[1] = x0i - x2i; +    a[j1] = x0r - x2r; +    a[j1 + 1] = x0i + x2i; +    a[j2] = x1r + x3i; +    a[j2 + 1] = x1i + x3r; +    a[j3] = x1r - x3i; +    a[j3 + 1] = x1i - x3r; +    wn4r = w[1]; +    csc1 = w[2]; +    csc3 = w[3]; +    wd1r = 1; +    wd1i = 0; +    wd3r = 1; +    wd3i = 0; +    k = 0; +    for (j = 2; j < mh - 2; j += 4) { +        k += 4; +        wk1r = csc1 * (wd1r + w[k]); +        wk1i = csc1 * (wd1i + w[k + 1]); +        wk3r = csc3 * (wd3r + w[k + 2]); +        wk3i = csc3 * (wd3i + w[k + 3]); +        wd1r = w[k]; +        wd1i = w[k + 1]; +        wd3r = w[k + 2]; +        wd3i = w[k + 3]; +        j1 = j + m; +        j2 = j1 + m; +        j3 = j2 + m; +        x0r = a[j] + a[j2]; +        x0i = -a[j + 1] - a[j2 + 1]; +        x1r = a[j] - a[j2]; +        x1i = -a[j + 1] + a[j2 + 1]; +        y0r = a[j + 2] + a[j2 + 2]; +        y0i = -a[j + 3] - a[j2 + 3]; +        y1r = a[j + 2] - a[j2 + 2]; +        y1i = -a[j + 3] + a[j2 + 3]; +        x2r = a[j1] + a[j3]; +        x2i = a[j1 + 1] + a[j3 + 1]; +        x3r = a[j1] - a[j3]; +        x3i = a[j1 + 1] - a[j3 + 1]; +        y2r = a[j1 + 2] + a[j3 + 2]; +        y2i = a[j1 + 3] + a[j3 + 3]; +        y3r = a[j1 + 2] - a[j3 + 2]; +        y3i = a[j1 + 3] - a[j3 + 3]; +        a[j] = x0r + x2r; +        a[j + 1] = x0i - x2i; +        a[j + 2] = y0r + y2r; +        a[j + 3] = y0i - y2i; +        a[j1] = x0r - x2r; +        a[j1 + 1] = x0i + x2i; +        a[j1 + 2] = y0r - y2r; +        a[j1 + 3] = y0i + y2i; +        x0r = x1r + x3i; +        x0i = x1i + x3r; +        a[j2] = wk1r * x0r - wk1i * x0i; +        a[j2 + 1] = wk1r * x0i + wk1i * x0r; +        x0r = y1r + y3i; +        x0i = y1i 
+ y3r; +        a[j2 + 2] = wd1r * x0r - wd1i * x0i; +        a[j2 + 3] = wd1r * x0i + wd1i * x0r; +        x0r = x1r - x3i; +        x0i = x1i - x3r; +        a[j3] = wk3r * x0r + wk3i * x0i; +        a[j3 + 1] = wk3r * x0i - wk3i * x0r; +        x0r = y1r - y3i; +        x0i = y1i - y3r; +        a[j3 + 2] = wd3r * x0r + wd3i * x0i; +        a[j3 + 3] = wd3r * x0i - wd3i * x0r; +        j0 = m - j; +        j1 = j0 + m; +        j2 = j1 + m; +        j3 = j2 + m; +        x0r = a[j0] + a[j2]; +        x0i = -a[j0 + 1] - a[j2 + 1]; +        x1r = a[j0] - a[j2]; +        x1i = -a[j0 + 1] + a[j2 + 1]; +        y0r = a[j0 - 2] + a[j2 - 2]; +        y0i = -a[j0 - 1] - a[j2 - 1]; +        y1r = a[j0 - 2] - a[j2 - 2]; +        y1i = -a[j0 - 1] + a[j2 - 1]; +        x2r = a[j1] + a[j3]; +        x2i = a[j1 + 1] + a[j3 + 1]; +        x3r = a[j1] - a[j3]; +        x3i = a[j1 + 1] - a[j3 + 1]; +        y2r = a[j1 - 2] + a[j3 - 2]; +        y2i = a[j1 - 1] + a[j3 - 1]; +        y3r = a[j1 - 2] - a[j3 - 2]; +        y3i = a[j1 - 1] - a[j3 - 1]; +        a[j0] = x0r + x2r; +        a[j0 + 1] = x0i - x2i; +        a[j0 - 2] = y0r + y2r; +        a[j0 - 1] = y0i - y2i; +        a[j1] = x0r - x2r; +        a[j1 + 1] = x0i + x2i; +        a[j1 - 2] = y0r - y2r; +        a[j1 - 1] = y0i + y2i; +        x0r = x1r + x3i; +        x0i = x1i + x3r; +        a[j2] = wk1i * x0r - wk1r * x0i; +        a[j2 + 1] = wk1i * x0i + wk1r * x0r; +        x0r = y1r + y3i; +        x0i = y1i + y3r; +        a[j2 - 2] = wd1i * x0r - wd1r * x0i; +        a[j2 - 1] = wd1i * x0i + wd1r * x0r; +        x0r = x1r - x3i; +        x0i = x1i - x3r; +        a[j3] = wk3i * x0r + wk3r * x0i; +        a[j3 + 1] = wk3i * x0i - wk3r * x0r; +        x0r = y1r - y3i; +        x0i = y1i - y3r; +        a[j3 - 2] = wd3i * x0r + wd3r * x0i; +        a[j3 - 1] = wd3i * x0i - wd3r * x0r; +    } +    wk1r = csc1 * (wd1r + wn4r); +    wk1i = csc1 * (wd1i + wn4r); +    wk3r = csc3 * (wd3r - wn4r); +    wk3i = csc3 * (wd3i 
- wn4r); +    j0 = mh; +    j1 = j0 + m; +    j2 = j1 + m; +    j3 = j2 + m; +    x0r = a[j0 - 2] + a[j2 - 2]; +    x0i = -a[j0 - 1] - a[j2 - 1]; +    x1r = a[j0 - 2] - a[j2 - 2]; +    x1i = -a[j0 - 1] + a[j2 - 1]; +    x2r = a[j1 - 2] + a[j3 - 2]; +    x2i = a[j1 - 1] + a[j3 - 1]; +    x3r = a[j1 - 2] - a[j3 - 2]; +    x3i = a[j1 - 1] - a[j3 - 1]; +    a[j0 - 2] = x0r + x2r; +    a[j0 - 1] = x0i - x2i; +    a[j1 - 2] = x0r - x2r; +    a[j1 - 1] = x0i + x2i; +    x0r = x1r + x3i; +    x0i = x1i + x3r; +    a[j2 - 2] = wk1r * x0r - wk1i * x0i; +    a[j2 - 1] = wk1r * x0i + wk1i * x0r; +    x0r = x1r - x3i; +    x0i = x1i - x3r; +    a[j3 - 2] = wk3r * x0r + wk3i * x0i; +    a[j3 - 1] = wk3r * x0i - wk3i * x0r; +    x0r = a[j0] + a[j2]; +    x0i = -a[j0 + 1] - a[j2 + 1]; +    x1r = a[j0] - a[j2]; +    x1i = -a[j0 + 1] + a[j2 + 1]; +    x2r = a[j1] + a[j3]; +    x2i = a[j1 + 1] + a[j3 + 1]; +    x3r = a[j1] - a[j3]; +    x3i = a[j1 + 1] - a[j3 + 1]; +    a[j0] = x0r + x2r; +    a[j0 + 1] = x0i - x2i; +    a[j1] = x0r - x2r; +    a[j1 + 1] = x0i + x2i; +    x0r = x1r + x3i; +    x0i = x1i + x3r; +    a[j2] = wn4r * (x0r - x0i); +    a[j2 + 1] = wn4r * (x0i + x0r); +    x0r = x1r - x3i; +    x0i = x1i - x3r; +    a[j3] = -wn4r * (x0r + x0i); +    a[j3 + 1] = -wn4r * (x0i - x0r); +    x0r = a[j0 + 2] + a[j2 + 2]; +    x0i = -a[j0 + 3] - a[j2 + 3]; +    x1r = a[j0 + 2] - a[j2 + 2]; +    x1i = -a[j0 + 3] + a[j2 + 3]; +    x2r = a[j1 + 2] + a[j3 + 2]; +    x2i = a[j1 + 3] + a[j3 + 3]; +    x3r = a[j1 + 2] - a[j3 + 2]; +    x3i = a[j1 + 3] - a[j3 + 3]; +    a[j0 + 2] = x0r + x2r; +    a[j0 + 3] = x0i - x2i; +    a[j1 + 2] = x0r - x2r; +    a[j1 + 3] = x0i + x2i; +    x0r = x1r + x3i; +    x0i = x1i + x3r; +    a[j2 + 2] = wk1i * x0r - wk1r * x0i; +    a[j2 + 3] = wk1i * x0i + wk1r * x0r; +    x0r = x1r - x3i; +    x0i = x1i - x3r; +    a[j3 + 2] = wk3i * x0r + wk3r * x0i; +    a[j3 + 3] = wk3i * x0i - wk3r * x0r; +} + + +#ifdef USE_CDFT_THREADS +struct cdft_arg_st { +    int 
n0; +    int n; +    float32 *a; +    int nw; +    float32 *w; +}; +typedef struct cdft_arg_st cdft_arg_t; + + +static void cftrec4_th(int n, float32 *a, int nw, float32 *w) +{ +    int i, idiv4, m, nthread; +    cdft_thread_t th[4]; +    cdft_arg_t ag[4]; +     +    nthread = 2; +    idiv4 = 0; +    m = n >> 1; +    if (n > CDFT_4THREADS_BEGIN_N) { +        nthread = 4; +        idiv4 = 1; +        m >>= 1; +    } +    for (i = 0; i < nthread; i++) { +        ag[i].n0 = n; +        ag[i].n = m; +        ag[i].a = &a[i * m]; +        ag[i].nw = nw; +        ag[i].w = w; +        if (i != idiv4) { +            cdft_thread_create(&th[i], cftrec1_th, &ag[i]); +        } else { +            cdft_thread_create(&th[i], cftrec2_th, &ag[i]); +        } +    } +    for (i = 0; i < nthread; i++) { +        cdft_thread_wait(th[i]); +    } +} + + +static void *cftrec1_th(void *p) +{ +    int isplt, j, k, m, n, n0, nw; +    float32 *a, *w; +     +    n0 = ((cdft_arg_t *) p)->n0; +    n = ((cdft_arg_t *) p)->n; +    a = ((cdft_arg_t *) p)->a; +    nw = ((cdft_arg_t *) p)->nw; +    w = ((cdft_arg_t *) p)->w; +    m = n0; +    while (m > 512) { +        m >>= 2; +        cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]); +    } +    cftleaf(m, 1, &a[n - m], nw, w); +    k = 0; +    for (j = n - m; j > 0; j -= m) { +        k++; +        isplt = cfttree(m, j, k, a, nw, w); +        cftleaf(m, isplt, &a[j - m], nw, w); +    } +    return (void *) 0; +} + + +static void *cftrec2_th(void *p) +{ +    int isplt, j, k, m, n, n0, nw; +    float32 *a, *w; +     +    n0 = ((cdft_arg_t *) p)->n0; +    n = ((cdft_arg_t *) p)->n; +    a = ((cdft_arg_t *) p)->a; +    nw = ((cdft_arg_t *) p)->nw; +    w = ((cdft_arg_t *) p)->w; +    k = 1; +    m = n0; +    while (m > 512) { +        m >>= 2; +        k <<= 2; +        cftmdl2(m, &a[n - m], &w[nw - m]); +    } +    cftleaf(m, 0, &a[n - m], nw, w); +    k >>= 1; +    for (j = n - m; j > 0; j -= m) { +        k++; +        isplt = cfttree(m, j, k, a, nw, 
w); +        cftleaf(m, isplt, &a[j - m], nw, w); +    } +    return (void *) 0; +} +#endif /* USE_CDFT_THREADS */ + + +static void cftrec4(int n, float32 *a, int nw, float32 *w) +{ +    int isplt, j, k, m; +     +    m = n; +    while (m > 512) { +        m >>= 2; +        cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]); +    } +    cftleaf(m, 1, &a[n - m], nw, w); +    k = 0; +    for (j = n - m; j > 0; j -= m) { +        k++; +        isplt = cfttree(m, j, k, a, nw, w); +        cftleaf(m, isplt, &a[j - m], nw, w); +    } +} + + +int cfttree(int n, int j, int k, float32 *a, int nw, float32 *w) +{ +    int i, isplt, m; +     +    if ((k & 3) != 0) { +        isplt = k & 1; +        if (isplt != 0) { +            cftmdl1(n, &a[j - n], &w[nw - (n >> 1)]); +        } else { +            cftmdl2(n, &a[j - n], &w[nw - n]); +        } +    } else { +        m = n; +        for (i = k; (i & 3) == 0; i >>= 2) { +            m <<= 2; +        } +        isplt = i & 1; +        if (isplt != 0) { +            while (m > 128) { +                cftmdl1(m, &a[j - m], &w[nw - (m >> 1)]); +                m >>= 2; +            } +        } else { +            while (m > 128) { +                cftmdl2(m, &a[j - m], &w[nw - m]); +                m >>= 2; +            } +        } +    } +    return isplt; +} + + +static void cftleaf(int n, int isplt, float32 *a, int nw, float32 *w) +{ +    if (n == 512) { +        cftmdl1(128, a, &w[nw - 64]); +        cftf161(a, &w[nw - 8]); +        cftf162(&a[32], &w[nw - 32]); +        cftf161(&a[64], &w[nw - 8]); +        cftf161(&a[96], &w[nw - 8]); +        cftmdl2(128, &a[128], &w[nw - 128]); +        cftf161(&a[128], &w[nw - 8]); +        cftf162(&a[160], &w[nw - 32]); +        cftf161(&a[192], &w[nw - 8]); +        cftf162(&a[224], &w[nw - 32]); +        cftmdl1(128, &a[256], &w[nw - 64]); +        cftf161(&a[256], &w[nw - 8]); +        cftf162(&a[288], &w[nw - 32]); +        cftf161(&a[320], &w[nw - 8]); +        cftf161(&a[352], &w[nw - 
8]); +        if (isplt != 0) { +            cftmdl1(128, &a[384], &w[nw - 64]); +            cftf161(&a[480], &w[nw - 8]); +        } else { +            cftmdl2(128, &a[384], &w[nw - 128]); +            cftf162(&a[480], &w[nw - 32]); +        } +        cftf161(&a[384], &w[nw - 8]); +        cftf162(&a[416], &w[nw - 32]); +        cftf161(&a[448], &w[nw - 8]); +    } else { +        cftmdl1(64, a, &w[nw - 32]); +        cftf081(a, &w[nw - 8]); +        cftf082(&a[16], &w[nw - 8]); +        cftf081(&a[32], &w[nw - 8]); +        cftf081(&a[48], &w[nw - 8]); +        cftmdl2(64, &a[64], &w[nw - 64]); +        cftf081(&a[64], &w[nw - 8]); +        cftf082(&a[80], &w[nw - 8]); +        cftf081(&a[96], &w[nw - 8]); +        cftf082(&a[112], &w[nw - 8]); +        cftmdl1(64, &a[128], &w[nw - 32]); +        cftf081(&a[128], &w[nw - 8]); +        cftf082(&a[144], &w[nw - 8]); +        cftf081(&a[160], &w[nw - 8]); +        cftf081(&a[176], &w[nw - 8]); +        if (isplt != 0) { +            cftmdl1(64, &a[192], &w[nw - 32]); +            cftf081(&a[240], &w[nw - 8]); +        } else { +            cftmdl2(64, &a[192], &w[nw - 64]); +            cftf082(&a[240], &w[nw - 8]); +        } +        cftf081(&a[192], &w[nw - 8]); +        cftf082(&a[208], &w[nw - 8]); +        cftf081(&a[224], &w[nw - 8]); +    } +} + + +static void cftmdl1(int n, float32 *a, float32 *w) +{ +    int j, j0, j1, j2, j3, k, m, mh; +    float32 wn4r, wk1r, wk1i, wk3r, wk3i; +    float32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; +     +    mh = n >> 3; +    m = 2 * mh; +    j1 = m; +    j2 = j1 + m; +    j3 = j2 + m; +    x0r = a[0] + a[j2]; +    x0i = a[1] + a[j2 + 1]; +    x1r = a[0] - a[j2]; +    x1i = a[1] - a[j2 + 1]; +    x2r = a[j1] + a[j3]; +    x2i = a[j1 + 1] + a[j3 + 1]; +    x3r = a[j1] - a[j3]; +    x3i = a[j1 + 1] - a[j3 + 1]; +    a[0] = x0r + x2r; +    a[1] = x0i + x2i; +    a[j1] = x0r - x2r; +    a[j1 + 1] = x0i - x2i; +    a[j2] = x1r - x3i; +    a[j2 + 1] = x1i + x3r; +    a[j3] = 
x1r + x3i; +    a[j3 + 1] = x1i - x3r; +    wn4r = w[1]; +    k = 0; +    for (j = 2; j < mh; j += 2) { +        k += 4; +        wk1r = w[k]; +        wk1i = w[k + 1]; +        wk3r = w[k + 2]; +        wk3i = w[k + 3]; +        j1 = j + m; +        j2 = j1 + m; +        j3 = j2 + m; +        x0r = a[j] + a[j2]; +        x0i = a[j + 1] + a[j2 + 1]; +        x1r = a[j] - a[j2]; +        x1i = a[j + 1] - a[j2 + 1]; +        x2r = a[j1] + a[j3]; +        x2i = a[j1 + 1] + a[j3 + 1]; +        x3r = a[j1] - a[j3]; +        x3i = a[j1 + 1] - a[j3 + 1]; +        a[j] = x0r + x2r; +        a[j + 1] = x0i + x2i; +        a[j1] = x0r - x2r; +        a[j1 + 1] = x0i - x2i; +        x0r = x1r - x3i; +        x0i = x1i + x3r; +        a[j2] = wk1r * x0r - wk1i * x0i; +        a[j2 + 1] = wk1r * x0i + wk1i * x0r; +        x0r = x1r + x3i; +        x0i = x1i - x3r; +        a[j3] = wk3r * x0r + wk3i * x0i; +        a[j3 + 1] = wk3r * x0i - wk3i * x0r; +        j0 = m - j; +        j1 = j0 + m; +        j2 = j1 + m; +        j3 = j2 + m; +        x0r = a[j0] + a[j2]; +        x0i = a[j0 + 1] + a[j2 + 1]; +        x1r = a[j0] - a[j2]; +        x1i = a[j0 + 1] - a[j2 + 1]; +        x2r = a[j1] + a[j3]; +        x2i = a[j1 + 1] + a[j3 + 1]; +        x3r = a[j1] - a[j3]; +        x3i = a[j1 + 1] - a[j3 + 1]; +        a[j0] = x0r + x2r; +        a[j0 + 1] = x0i + x2i; +        a[j1] = x0r - x2r; +        a[j1 + 1] = x0i - x2i; +        x0r = x1r - x3i; +        x0i = x1i + x3r; +        a[j2] = wk1i * x0r - wk1r * x0i; +        a[j2 + 1] = wk1i * x0i + wk1r * x0r; +        x0r = x1r + x3i; +        x0i = x1i - x3r; +        a[j3] = wk3i * x0r + wk3r * x0i; +        a[j3 + 1] = wk3i * x0i - wk3r * x0r; +    } +    j0 = mh; +    j1 = j0 + m; +    j2 = j1 + m; +    j3 = j2 + m; +    x0r = a[j0] + a[j2]; +    x0i = a[j0 + 1] + a[j2 + 1]; +    x1r = a[j0] - a[j2]; +    x1i = a[j0 + 1] - a[j2 + 1]; +    x2r = a[j1] + a[j3]; +    x2i = a[j1 + 1] + a[j3 + 1]; +    x3r = a[j1] - a[j3]; +    
x3i = a[j1 + 1] - a[j3 + 1]; +    a[j0] = x0r + x2r; +    a[j0 + 1] = x0i + x2i; +    a[j1] = x0r - x2r; +    a[j1 + 1] = x0i - x2i; +    x0r = x1r - x3i; +    x0i = x1i + x3r; +    a[j2] = wn4r * (x0r - x0i); +    a[j2 + 1] = wn4r * (x0i + x0r); +    x0r = x1r + x3i; +    x0i = x1i - x3r; +    a[j3] = -wn4r * (x0r + x0i); +    a[j3 + 1] = -wn4r * (x0i - x0r); +} + + +static void cftmdl2(int n, float32 *a, float32 *w) +{ +    int j, j0, j1, j2, j3, k, kr, m, mh; +    float32 wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i; +    float32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i; +     +    mh = n >> 3; +    m = 2 * mh; +    wn4r = w[1]; +    j1 = m; +    j2 = j1 + m; +    j3 = j2 + m; +    x0r = a[0] - a[j2 + 1]; +    x0i = a[1] + a[j2]; +    x1r = a[0] + a[j2 + 1]; +    x1i = a[1] - a[j2]; +    x2r = a[j1] - a[j3 + 1]; +    x2i = a[j1 + 1] + a[j3]; +    x3r = a[j1] + a[j3 + 1]; +    x3i = a[j1 + 1] - a[j3]; +    y0r = wn4r * (x2r - x2i); +    y0i = wn4r * (x2i + x2r); +    a[0] = x0r + y0r; +    a[1] = x0i + y0i; +    a[j1] = x0r - y0r; +    a[j1 + 1] = x0i - y0i; +    y0r = wn4r * (x3r - x3i); +    y0i = wn4r * (x3i + x3r); +    a[j2] = x1r - y0i; +    a[j2 + 1] = x1i + y0r; +    a[j3] = x1r + y0i; +    a[j3 + 1] = x1i - y0r; +    k = 0; +    kr = 2 * m; +    for (j = 2; j < mh; j += 2) { +        k += 4; +        wk1r = w[k]; +        wk1i = w[k + 1]; +        wk3r = w[k + 2]; +        wk3i = w[k + 3]; +        kr -= 4; +        wd1i = w[kr]; +        wd1r = w[kr + 1]; +        wd3i = w[kr + 2]; +        wd3r = w[kr + 3]; +        j1 = j + m; +        j2 = j1 + m; +        j3 = j2 + m; +        x0r = a[j] - a[j2 + 1]; +        x0i = a[j + 1] + a[j2]; +        x1r = a[j] + a[j2 + 1]; +        x1i = a[j + 1] - a[j2]; +        x2r = a[j1] - a[j3 + 1]; +        x2i = a[j1 + 1] + a[j3]; +        x3r = a[j1] + a[j3 + 1]; +        x3i = a[j1 + 1] - a[j3]; +        y0r = wk1r * x0r - wk1i * x0i; +        y0i = wk1r * x0i + wk1i * x0r; +        y2r = 
wd1r * x2r - wd1i * x2i; +        y2i = wd1r * x2i + wd1i * x2r; +        a[j] = y0r + y2r; +        a[j + 1] = y0i + y2i; +        a[j1] = y0r - y2r; +        a[j1 + 1] = y0i - y2i; +        y0r = wk3r * x1r + wk3i * x1i; +        y0i = wk3r * x1i - wk3i * x1r; +        y2r = wd3r * x3r + wd3i * x3i; +        y2i = wd3r * x3i - wd3i * x3r; +        a[j2] = y0r + y2r; +        a[j2 + 1] = y0i + y2i; +        a[j3] = y0r - y2r; +        a[j3 + 1] = y0i - y2i; +        j0 = m - j; +        j1 = j0 + m; +        j2 = j1 + m; +        j3 = j2 + m; +        x0r = a[j0] - a[j2 + 1]; +        x0i = a[j0 + 1] + a[j2]; +        x1r = a[j0] + a[j2 + 1]; +        x1i = a[j0 + 1] - a[j2]; +        x2r = a[j1] - a[j3 + 1]; +        x2i = a[j1 + 1] + a[j3]; +        x3r = a[j1] + a[j3 + 1]; +        x3i = a[j1 + 1] - a[j3]; +        y0r = wd1i * x0r - wd1r * x0i; +        y0i = wd1i * x0i + wd1r * x0r; +        y2r = wk1i * x2r - wk1r * x2i; +        y2i = wk1i * x2i + wk1r * x2r; +        a[j0] = y0r + y2r; +        a[j0 + 1] = y0i + y2i; +        a[j1] = y0r - y2r; +        a[j1 + 1] = y0i - y2i; +        y0r = wd3i * x1r + wd3r * x1i; +        y0i = wd3i * x1i - wd3r * x1r; +        y2r = wk3i * x3r + wk3r * x3i; +        y2i = wk3i * x3i - wk3r * x3r; +        a[j2] = y0r + y2r; +        a[j2 + 1] = y0i + y2i; +        a[j3] = y0r - y2r; +        a[j3 + 1] = y0i - y2i; +    } +    wk1r = w[m]; +    wk1i = w[m + 1]; +    j0 = mh; +    j1 = j0 + m; +    j2 = j1 + m; +    j3 = j2 + m; +    x0r = a[j0] - a[j2 + 1]; +    x0i = a[j0 + 1] + a[j2]; +    x1r = a[j0] + a[j2 + 1]; +    x1i = a[j0 + 1] - a[j2]; +    x2r = a[j1] - a[j3 + 1]; +    x2i = a[j1 + 1] + a[j3]; +    x3r = a[j1] + a[j3 + 1]; +    x3i = a[j1 + 1] - a[j3]; +    y0r = wk1r * x0r - wk1i * x0i; +    y0i = wk1r * x0i + wk1i * x0r; +    y2r = wk1i * x2r - wk1r * x2i; +    y2i = wk1i * x2i + wk1r * x2r; +    a[j0] = y0r + y2r; +    a[j0 + 1] = y0i + y2i; +    a[j1] = y0r - y2r; +    a[j1 + 1] = y0i - y2i; +    y0r = 
wk1i * x1r - wk1r * x1i; +    y0i = wk1i * x1i + wk1r * x1r; +    y2r = wk1r * x3r - wk1i * x3i; +    y2i = wk1r * x3i + wk1i * x3r; +    a[j2] = y0r - y2r; +    a[j2 + 1] = y0i - y2i; +    a[j3] = y0r + y2r; +    a[j3 + 1] = y0i + y2i; +} + + +static void cftfx41(int n, float32 *a, int nw, float32 *w) +{ +    if (n == 128) { +        cftf161(a, &w[nw - 8]); +        cftf162(&a[32], &w[nw - 32]); +        cftf161(&a[64], &w[nw - 8]); +        cftf161(&a[96], &w[nw - 8]); +    } else { +        cftf081(a, &w[nw - 8]); +        cftf082(&a[16], &w[nw - 8]); +        cftf081(&a[32], &w[nw - 8]); +        cftf081(&a[48], &w[nw - 8]); +    } +} + + +static void cftf161(float32 *a, float32 *w) +{ +    float32 wn4r, wk1r, wk1i,  +        x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i,  +        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i,  +        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i,  +        y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i,  +        y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i; +     +    wn4r = w[1]; +    wk1r = w[2]; +    wk1i = w[3]; +    x0r = a[0] + a[16]; +    x0i = a[1] + a[17]; +    x1r = a[0] - a[16]; +    x1i = a[1] - a[17]; +    x2r = a[8] + a[24]; +    x2i = a[9] + a[25]; +    x3r = a[8] - a[24]; +    x3i = a[9] - a[25]; +    y0r = x0r + x2r; +    y0i = x0i + x2i; +    y4r = x0r - x2r; +    y4i = x0i - x2i; +    y8r = x1r - x3i; +    y8i = x1i + x3r; +    y12r = x1r + x3i; +    y12i = x1i - x3r; +    x0r = a[2] + a[18]; +    x0i = a[3] + a[19]; +    x1r = a[2] - a[18]; +    x1i = a[3] - a[19]; +    x2r = a[10] + a[26]; +    x2i = a[11] + a[27]; +    x3r = a[10] - a[26]; +    x3i = a[11] - a[27]; +    y1r = x0r + x2r; +    y1i = x0i + x2i; +    y5r = x0r - x2r; +    y5i = x0i - x2i; +    x0r = x1r - x3i; +    x0i = x1i + x3r; +    y9r = wk1r * x0r - wk1i * x0i; +    y9i = wk1r * x0i + wk1i * x0r; +    x0r = x1r + x3i; +    x0i = x1i - x3r; +    y13r = wk1i * x0r - wk1r * x0i; +    y13i = wk1i * x0i + wk1r * x0r; +    x0r = a[4] + a[20]; +    x0i = a[5] + 
a[21]; +    x1r = a[4] - a[20]; +    x1i = a[5] - a[21]; +    x2r = a[12] + a[28]; +    x2i = a[13] + a[29]; +    x3r = a[12] - a[28]; +    x3i = a[13] - a[29]; +    y2r = x0r + x2r; +    y2i = x0i + x2i; +    y6r = x0r - x2r; +    y6i = x0i - x2i; +    x0r = x1r - x3i; +    x0i = x1i + x3r; +    y10r = wn4r * (x0r - x0i); +    y10i = wn4r * (x0i + x0r); +    x0r = x1r + x3i; +    x0i = x1i - x3r; +    y14r = wn4r * (x0r + x0i); +    y14i = wn4r * (x0i - x0r); +    x0r = a[6] + a[22]; +    x0i = a[7] + a[23]; +    x1r = a[6] - a[22]; +    x1i = a[7] - a[23]; +    x2r = a[14] + a[30]; +    x2i = a[15] + a[31]; +    x3r = a[14] - a[30]; +    x3i = a[15] - a[31]; +    y3r = x0r + x2r; +    y3i = x0i + x2i; +    y7r = x0r - x2r; +    y7i = x0i - x2i; +    x0r = x1r - x3i; +    x0i = x1i + x3r; +    y11r = wk1i * x0r - wk1r * x0i; +    y11i = wk1i * x0i + wk1r * x0r; +    x0r = x1r + x3i; +    x0i = x1i - x3r; +    y15r = wk1r * x0r - wk1i * x0i; +    y15i = wk1r * x0i + wk1i * x0r; +    x0r = y12r - y14r; +    x0i = y12i - y14i; +    x1r = y12r + y14r; +    x1i = y12i + y14i; +    x2r = y13r - y15r; +    x2i = y13i - y15i; +    x3r = y13r + y15r; +    x3i = y13i + y15i; +    a[24] = x0r + x2r; +    a[25] = x0i + x2i; +    a[26] = x0r - x2r; +    a[27] = x0i - x2i; +    a[28] = x1r - x3i; +    a[29] = x1i + x3r; +    a[30] = x1r + x3i; +    a[31] = x1i - x3r; +    x0r = y8r + y10r; +    x0i = y8i + y10i; +    x1r = y8r - y10r; +    x1i = y8i - y10i; +    x2r = y9r + y11r; +    x2i = y9i + y11i; +    x3r = y9r - y11r; +    x3i = y9i - y11i; +    a[16] = x0r + x2r; +    a[17] = x0i + x2i; +    a[18] = x0r - x2r; +    a[19] = x0i - x2i; +    a[20] = x1r - x3i; +    a[21] = x1i + x3r; +    a[22] = x1r + x3i; +    a[23] = x1i - x3r; +    x0r = y5r - y7i; +    x0i = y5i + y7r; +    x2r = wn4r * (x0r - x0i); +    x2i = wn4r * (x0i + x0r); +    x0r = y5r + y7i; +    x0i = y5i - y7r; +    x3r = wn4r * (x0r - x0i); +    x3i = wn4r * (x0i + x0r); +    x0r = y4r - y6i; +    x0i = 
y4i + y6r; +    x1r = y4r + y6i; +    x1i = y4i - y6r; +    a[8] = x0r + x2r; +    a[9] = x0i + x2i; +    a[10] = x0r - x2r; +    a[11] = x0i - x2i; +    a[12] = x1r - x3i; +    a[13] = x1i + x3r; +    a[14] = x1r + x3i; +    a[15] = x1i - x3r; +    x0r = y0r + y2r; +    x0i = y0i + y2i; +    x1r = y0r - y2r; +    x1i = y0i - y2i; +    x2r = y1r + y3r; +    x2i = y1i + y3i; +    x3r = y1r - y3r; +    x3i = y1i - y3i; +    a[0] = x0r + x2r; +    a[1] = x0i + x2i; +    a[2] = x0r - x2r; +    a[3] = x0i - x2i; +    a[4] = x1r - x3i; +    a[5] = x1i + x3r; +    a[6] = x1r + x3i; +    a[7] = x1i - x3r; +} + +/* cftf162: fixed-size in-place kernel over a[0..31]. Every element of a[0..31] is read into the temporaries y0..y15 before any element is written back, so updating in place is safe. Coefficients are read from w[1] and w[4]..w[9]; w[7] is negated when it is loaded into wk3i. The r/i suffixes suggest a[] holds 16 interleaved (real, imaginary) pairs -- NOTE(review): confirm against the caller. Returns nothing. */ +static void cftf162(float32 *a, float32 *w) +{ +    float32 wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i,  +        x0r, x0i, x1r, x1i, x2r, x2i,  +        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i,  +        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i,  +        y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i,  +        y12r, y12i, y13r, y13i, y14r, y14i, y15r, y15i; +     +    wn4r = w[1]; +    wk1r = w[4]; +    wk1i = w[5]; +    wk3r = w[6]; +    wk3i = -w[7]; +    wk2r = w[8]; +    wk2i = w[9]; +    x1r = a[0] - a[17]; +    x1i = a[1] + a[16]; +    x0r = a[8] - a[25]; +    x0i = a[9] + a[24]; +    x2r = wn4r * (x0r - x0i); +    x2i = wn4r * (x0i + x0r); +    y0r = x1r + x2r; +    y0i = x1i + x2i; +    y4r = x1r - x2r; +    y4i = x1i - x2i; +    x1r = a[0] + a[17]; +    x1i = a[1] - a[16]; +    x0r = a[8] + a[25]; +    x0i = a[9] - a[24]; +    x2r = wn4r * (x0r - x0i); +    x2i = wn4r * (x0i + x0r); +    y8r = x1r - x2i; +    y8i = x1i + x2r; +    y12r = x1r + x2i; +    y12i = x1i - x2r; +    x0r = a[2] - a[19]; +    x0i = a[3] + a[18]; +    x1r = wk1r * x0r - wk1i * x0i; +    x1i = wk1r * x0i + wk1i * x0r; +    x0r = a[10] - a[27]; +    x0i = a[11] + a[26]; +    x2r = wk3i * x0r - wk3r * x0i; +    x2i = wk3i * x0i + wk3r * x0r; +    y1r = x1r + x2r; +    y1i = x1i + x2i; +    y5r = x1r - x2r; +    y5i = x1i - x2i; +    x0r = a[2] + a[19]; +    x0i = a[3] - a[18]; +    x1r = 
wk3r * x0r - wk3i * x0i; +    x1i = wk3r * x0i + wk3i * x0r; +    x0r = a[10] + a[27]; +    x0i = a[11] - a[26]; +    x2r = wk1r * x0r + wk1i * x0i; +    x2i = wk1r * x0i - wk1i * x0r; +    y9r = x1r - x2r; +    y9i = x1i - x2i; +    y13r = x1r + x2r; +    y13i = x1i + x2i; +    x0r = a[4] - a[21]; +    x0i = a[5] + a[20]; +    x1r = wk2r * x0r - wk2i * x0i; +    x1i = wk2r * x0i + wk2i * x0r; +    x0r = a[12] - a[29]; +    x0i = a[13] + a[28]; +    x2r = wk2i * x0r - wk2r * x0i; +    x2i = wk2i * x0i + wk2r * x0r; +    y2r = x1r + x2r; +    y2i = x1i + x2i; +    y6r = x1r - x2r; +    y6i = x1i - x2i; +    x0r = a[4] + a[21]; +    x0i = a[5] - a[20]; +    x1r = wk2i * x0r - wk2r * x0i; +    x1i = wk2i * x0i + wk2r * x0r; +    x0r = a[12] + a[29]; +    x0i = a[13] - a[28]; +    x2r = wk2r * x0r - wk2i * x0i; +    x2i = wk2r * x0i + wk2i * x0r; +    y10r = x1r - x2r; +    y10i = x1i - x2i; +    y14r = x1r + x2r; +    y14i = x1i + x2i; +    x0r = a[6] - a[23]; +    x0i = a[7] + a[22]; +    x1r = wk3r * x0r - wk3i * x0i; +    x1i = wk3r * x0i + wk3i * x0r; +    x0r = a[14] - a[31]; +    x0i = a[15] + a[30]; +    x2r = wk1i * x0r - wk1r * x0i; +    x2i = wk1i * x0i + wk1r * x0r; +    y3r = x1r + x2r; +    y3i = x1i + x2i; +    y7r = x1r - x2r; +    y7i = x1i - x2i; +    x0r = a[6] + a[23]; +    x0i = a[7] - a[22]; +    x1r = wk1i * x0r + wk1r * x0i; +    x1i = wk1i * x0i - wk1r * x0r; +    x0r = a[14] + a[31]; +    x0i = a[15] - a[30]; +    x2r = wk3i * x0r - wk3r * x0i; +    x2i = wk3i * x0i + wk3r * x0r; +    y11r = x1r + x2r; +    y11i = x1i + x2i; +    y15r = x1r - x2r; +    y15i = x1i - x2i; +    x1r = y0r + y2r; +    x1i = y0i + y2i; +    x2r = y1r + y3r; +    x2i = y1i + y3i; +    a[0] = x1r + x2r; +    a[1] = x1i + x2i; +    a[2] = x1r - x2r; +    a[3] = x1i - x2i; +    x1r = y0r - y2r; +    x1i = y0i - y2i; +    x2r = y1r - y3r; +    x2i = y1i - y3i; +    a[4] = x1r - x2i; +    a[5] = x1i + x2r; +    a[6] = x1r + x2i; +    a[7] = x1i - x2r; +    x1r = y4r - 
y6i; +    x1i = y4i + y6r; +    x0r = y5r - y7i; +    x0i = y5i + y7r; +    x2r = wn4r * (x0r - x0i); +    x2i = wn4r * (x0i + x0r); +    a[8] = x1r + x2r; +    a[9] = x1i + x2i; +    a[10] = x1r - x2r; +    a[11] = x1i - x2i; +    x1r = y4r + y6i; +    x1i = y4i - y6r; +    x0r = y5r + y7i; +    x0i = y5i - y7r; +    x2r = wn4r * (x0r - x0i); +    x2i = wn4r * (x0i + x0r); +    a[12] = x1r - x2i; +    a[13] = x1i + x2r; +    a[14] = x1r + x2i; +    a[15] = x1i - x2r; +    x1r = y8r + y10r; +    x1i = y8i + y10i; +    x2r = y9r - y11r; +    x2i = y9i - y11i; +    a[16] = x1r + x2r; +    a[17] = x1i + x2i; +    a[18] = x1r - x2r; +    a[19] = x1i - x2i; +    x1r = y8r - y10r; +    x1i = y8i - y10i; +    x2r = y9r + y11r; +    x2i = y9i + y11i; +    a[20] = x1r - x2i; +    a[21] = x1i + x2r; +    a[22] = x1r + x2i; +    a[23] = x1i - x2r; +    x1r = y12r - y14i; +    x1i = y12i + y14r; +    x0r = y13r + y15i; +    x0i = y13i - y15r; +    x2r = wn4r * (x0r - x0i); +    x2i = wn4r * (x0i + x0r); +    a[24] = x1r + x2r; +    a[25] = x1i + x2i; +    a[26] = x1r - x2r; +    a[27] = x1i - x2i; +    x1r = y12r + y14i; +    x1i = y12i - y14r; +    x0r = y13r - y15i; +    x0i = y13i + y15r; +    x2r = wn4r * (x0r - x0i); +    x2i = wn4r * (x0i + x0r); +    a[28] = x1r - x2i; +    a[29] = x1i + x2r; +    a[30] = x1r + x2i; +    a[31] = x1i - x2r; +} + +/* cftf081: fixed-size in-place kernel over a[0..15]. The only coefficient read from w is w[1] (as wn4r). All sixteen inputs are folded into y0..y7 before a[] is overwritten. Presumably an 8-point complex butterfly on interleaved (real, imaginary) data -- NOTE(review): confirm against the caller. Returns nothing. */ +static void cftf081(float32 *a, float32 *w) +{ +    float32 wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i,  +        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i,  +        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i; +     +    wn4r = w[1]; +    x0r = a[0] + a[8]; +    x0i = a[1] + a[9]; +    x1r = a[0] - a[8]; +    x1i = a[1] - a[9]; +    x2r = a[4] + a[12]; +    x2i = a[5] + a[13]; +    x3r = a[4] - a[12]; +    x3i = a[5] - a[13]; +    y0r = x0r + x2r; +    y0i = x0i + x2i; +    y2r = x0r - x2r; +    y2i = x0i - x2i; +    y1r = x1r - x3i; +    y1i = x1i + x3r; +    y3r = x1r + x3i; +    y3i = x1i - x3r; +    x0r = a[2] + a[10]; +    
x0i = a[3] + a[11]; +    x1r = a[2] - a[10]; +    x1i = a[3] - a[11]; +    x2r = a[6] + a[14]; +    x2i = a[7] + a[15]; +    x3r = a[6] - a[14]; +    x3i = a[7] - a[15]; +    y4r = x0r + x2r; +    y4i = x0i + x2i; +    y6r = x0r - x2r; +    y6i = x0i - x2i; +    x0r = x1r - x3i; +    x0i = x1i + x3r; +    x2r = x1r + x3i; +    x2i = x1i - x3r; +    y5r = wn4r * (x0r - x0i); +    y5i = wn4r * (x0r + x0i); +    y7r = wn4r * (x2r - x2i); +    y7i = wn4r * (x2r + x2i); +    a[8] = y1r + y5r; +    a[9] = y1i + y5i; +    a[10] = y1r - y5r; +    a[11] = y1i - y5i; +    a[12] = y3r - y7i; +    a[13] = y3i + y7r; +    a[14] = y3r + y7i; +    a[15] = y3i - y7r; +    a[0] = y0r + y4r; +    a[1] = y0i + y4i; +    a[2] = y0r - y4r; +    a[3] = y0i - y4i; +    a[4] = y2r - y6i; +    a[5] = y2i + y6r; +    a[6] = y2r + y6i; +    a[7] = y2i - y6r; +} + +/* cftf082: fixed-size in-place kernel over a[0..15], the same footprint as cftf081. Its first stage combines a[0] with a[9] and a[1] with a[8] (and likewise for the other index pairs), and it reads w[2] and w[3] (wk1r, wk1i) in addition to w[1] (wn4r). a[] is overwritten only after y0..y7 have been computed. Returns nothing. */ +static void cftf082(float32 *a, float32 *w) +{ +    float32 wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i,  +        y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i,  +        y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i; +     +    wn4r = w[1]; +    wk1r = w[2]; +    wk1i = w[3]; +    y0r = a[0] - a[9]; +    y0i = a[1] + a[8]; +    y1r = a[0] + a[9]; +    y1i = a[1] - a[8]; +    x0r = a[4] - a[13]; +    x0i = a[5] + a[12]; +    y2r = wn4r * (x0r - x0i); +    y2i = wn4r * (x0i + x0r); +    x0r = a[4] + a[13]; +    x0i = a[5] - a[12]; +    y3r = wn4r * (x0r - x0i); +    y3i = wn4r * (x0i + x0r); +    x0r = a[2] - a[11]; +    x0i = a[3] + a[10]; +    y4r = wk1r * x0r - wk1i * x0i; +    y4i = wk1r * x0i + wk1i * x0r; +    x0r = a[2] + a[11]; +    x0i = a[3] - a[10]; +    y5r = wk1i * x0r - wk1r * x0i; +    y5i = wk1i * x0i + wk1r * x0r; +    x0r = a[6] - a[15]; +    x0i = a[7] + a[14]; +    y6r = wk1i * x0r - wk1r * x0i; +    y6i = wk1i * x0i + wk1r * x0r; +    x0r = a[6] + a[15]; +    x0i = a[7] - a[14]; +    y7r = wk1r * x0r - wk1i * x0i; +    y7i = wk1r * x0i + wk1i * x0r; +    x0r = y0r + y2r; +    x0i = y0i + y2i; +    x1r = y4r + y6r; +    x1i = y4i 
+ y6i; +    a[0] = x0r + x1r; +    a[1] = x0i + x1i; +    a[2] = x0r - x1r; +    a[3] = x0i - x1i; +    x0r = y0r - y2r; +    x0i = y0i - y2i; +    x1r = y4r - y6r; +    x1i = y4i - y6i; +    a[4] = x0r - x1i; +    a[5] = x0i + x1r; +    a[6] = x0r + x1i; +    a[7] = x0i - x1r; +    x0r = y1r - y3i; +    x0i = y1i + y3r; +    x1r = y5r - y7r; +    x1i = y5i - y7i; +    a[8] = x0r + x1r; +    a[9] = x0i + x1i; +    a[10] = x0r - x1r; +    a[11] = x0i - x1i; +    x0r = y1r + y3i; +    x0i = y1i - y3r; +    x1r = y5r + y7r; +    x1i = y5i + y7i; +    a[12] = x0r - x1i; +    a[13] = x0i + x1r; +    a[14] = x0r + x1i; +    a[15] = x0i - x1r; +} + +/* cftf040: in-place sum/difference network over a[0..7]; no coefficient table is used. It differs from cftb040 only in the sign given to the x3 terms written to a[2], a[3], a[6] and a[7]. The f/b letters in the two names presumably stand for forward/backward -- NOTE(review): confirm. */ +static void cftf040(float32 *a) +{ +    float32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; +     +    x0r = a[0] + a[4]; +    x0i = a[1] + a[5]; +    x1r = a[0] - a[4]; +    x1i = a[1] - a[5]; +    x2r = a[2] + a[6]; +    x2i = a[3] + a[7]; +    x3r = a[2] - a[6]; +    x3i = a[3] - a[7]; +    a[0] = x0r + x2r; +    a[1] = x0i + x2i; +    a[2] = x1r - x3i; +    a[3] = x1i + x3r; +    a[4] = x0r - x2r; +    a[5] = x0i - x2i; +    a[6] = x1r + x3i; +    a[7] = x1i - x3r; +} + +/* cftb040: same reads and same a[0], a[1], a[4], a[5] results as cftf040; the x3 terms written to a[2], a[3], a[6] and a[7] carry the opposite sign. Operates in place on a[0..7] and returns nothing. */ +static void cftb040(float32 *a) +{ +    float32 x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; +     +    x0r = a[0] + a[4]; +    x0i = a[1] + a[5]; +    x1r = a[0] - a[4]; +    x1i = a[1] - a[5]; +    x2r = a[2] + a[6]; +    x2i = a[3] + a[7]; +    x3r = a[2] - a[6]; +    x3i = a[3] - a[7]; +    a[0] = x0r + x2r; +    a[1] = x0i + x2i; +    a[2] = x1r + x3i; +    a[3] = x1i - x3r; +    a[4] = x0r - x2r; +    a[5] = x0i - x2i; +    a[6] = x1r - x3i; +    a[7] = x1i + x3r; +} + +/* cftx020: in place, replaces (a[0], a[1]) by their sums with (a[2], a[3]) and replaces (a[2], a[3]) by the corresponding differences. The differences are saved in x0r/x0i first because a[0] and a[1] are overwritten before a[2] and a[3]. */ +static void cftx020(float32 *a) +{ +    float32 x0r, x0i; +     +    x0r = a[0] - a[2]; +    x0i = a[1] - a[3]; +    a[0] += a[2]; +    a[1] += a[3]; +    a[2] = x0r; +    a[3] = x0i; +} + +/* rftfsub: in-place pass that walks j = 2, 4, ... while j is below m = n >> 1 and, with k = n - j, updates a[j], a[j + 1], a[k] and a[k + 1]. Weights come from c: wkr = 0.5 - c[nc - kk] and wki = c[kk], where kk advances by ks = 2 * nc / m on every iteration. Computing ks divides by m, so n must be at least 2; nothing here checks that, so callers are presumably responsible. Returns nothing. */ +static void rftfsub(int n, float32 *a, int nc, float32 *c) +{ +    int j, k, kk, ks, m; +    float32 wkr, wki, xr, xi, yr, yi; +     +    m = n >> 1; +    ks = 2 * nc / m; +    kk = 0; +    for (j = 
2; j < m; j += 2) { +        k = n - j; +        kk += ks; +        wkr = 0.5 - c[nc - kk]; +        wki = c[kk]; +        xr = a[j] - a[k]; +        xi = a[j + 1] + a[k + 1]; +        yr = wkr * xr - wki * xi; +        yi = wkr * xi + wki * xr; +        a[j] -= yr; +        a[j + 1] -= yi; +        a[k] += yr; +        a[k + 1] -= yi; +    } +} + +/* rftbsub: same traversal, weights and write-back as rftfsub; the only difference is the sign of the wki terms: here yr adds wki * xi and yi subtracts wki * xr. The same precondition applies: ks divides by m = n >> 1, so n must be at least 2. */ +static void rftbsub(int n, float32 *a, int nc, float32 *c) +{ +    int j, k, kk, ks, m; +    float32 wkr, wki, xr, xi, yr, yi; +     +    m = n >> 1; +    ks = 2 * nc / m; +    kk = 0; +    for (j = 2; j < m; j += 2) { +        k = n - j; +        kk += ks; +        wkr = 0.5 - c[nc - kk]; +        wki = c[kk]; +        xr = a[j] - a[k]; +        xi = a[j + 1] + a[k + 1]; +        yr = wkr * xr + wki * xi; +        yi = wkr * xi - wki * xr; +        a[j] -= yr; +        a[j + 1] -= yi; +        a[k] += yr; +        a[k + 1] -= yi; +    } +} + +/* dctsub: for j = 1 .. m - 1 (m = n >> 1) and k = n - j, replaces a[j] by wkr * a[j] plus wki * a[k] and a[k] by wki * a[j] minus wkr * a[k], using the old values of both. The weights are wkr = c[kk] - c[nc - kk] and wki = c[kk] plus c[nc - kk], with kk advancing by ks = nc / n. After the loop a[m] is scaled by c[0]. Computing ks divides by n, so n must be nonzero; this is not checked here. */ +static void dctsub(int n, float32 *a, int nc, float32 *c) +{ +    int j, k, kk, ks, m; +    float32 wkr, wki, xr; +     +    m = n >> 1; +    ks = nc / n; +    kk = 0; +    for (j = 1; j < m; j++) { +        k = n - j; +        kk += ks; +        wkr = c[kk] - c[nc - kk]; +        wki = c[kk] + c[nc - kk]; +        xr = wki * a[j] - wkr * a[k]; +        a[j] = wkr * a[j] + wki * a[k]; +        a[k] = xr; +    } +    a[m] *= c[0]; +} + +/* dstsub: same loop bounds and weights as dctsub with the roles of a[j] and a[k] exchanged: a[k] becomes wkr * a[k] plus wki * a[j] and a[j] becomes wki * a[k] minus wkr * a[j] (old values). a[m] is again scaled by c[0], and n must be nonzero because ks = nc / n. */ +static void dstsub(int n, float32 *a, int nc, float32 *c) +{ +    int j, k, kk, ks, m; +    float32 wkr, wki, xr; +     +    m = n >> 1; +    ks = nc / n; +    kk = 0; +    for (j = 1; j < m; j++) { +        k = n - j; +        kk += ks; +        wkr = c[kk] - c[nc - kk]; +        wki = c[kk] + c[nc - kk]; +        xr = wki * a[k] - wkr * a[j]; +        a[k] = wkr * a[k] + wki * a[j]; +        a[j] = xr; +    } +    a[m] *= c[0];  }  } | 
