/* ----------------------------------------------------------------------- Copyright: 2010-2018, imec Vision Lab, University of Antwerp 2014-2018, CWI, Amsterdam Contact: astra@astra-toolbox.com Website: http://www.astra-toolbox.com/ This file is part of the ASTRA Toolbox. The ASTRA Toolbox is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. The ASTRA Toolbox is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with the ASTRA Toolbox. If not, see . ----------------------------------------------------------------------- */ #include "astra/cuda/2d/fft.h" #include "astra/cuda/2d/util.h" #include "astra/Logging.h" #include "astra/Fourier.h" #include #include #include #include using namespace astra; // TODO: evaluate what we want to do in these situations: #define CHECK_ERROR(errorMessage) do { \ cudaError_t err = cudaThreadSynchronize(); \ if( cudaSuccess != err) { \ ASTRA_ERROR("Cuda error %s : %s", \ errorMessage,cudaGetErrorString( err)); \ exit(EXIT_FAILURE); \ } } while (0) #define SAFE_CALL( call) do { \ cudaError err = call; \ if( cudaSuccess != err) { \ ASTRA_ERROR("Cuda error: %s ", \ cudaGetErrorString( err)); \ exit(EXIT_FAILURE); \ } \ err = cudaThreadSynchronize(); \ if( cudaSuccess != err) { \ ASTRA_ERROR("Cuda error: %s : ", \ cudaGetErrorString( err)); \ exit(EXIT_FAILURE); \ } } while (0) namespace astraCUDA { __global__ static void applyFilter_kernel(int _iProjectionCount, int _iFreqBinCount, cufftComplex * _pSinogram, cufftComplex * _pFilter) { int iIndex = threadIdx.x + blockIdx.x * blockDim.x; int iProjectionIndex = iIndex / _iFreqBinCount; if(iProjectionIndex >= _iProjectionCount) { return; } float fA = _pSinogram[iIndex].x; float fB = _pSinogram[iIndex].y; float fC = _pFilter[iIndex].x; float fD = _pFilter[iIndex].y; _pSinogram[iIndex].x = fA * fC - fB * fD; _pSinogram[iIndex].y = fA * fD + fC * fB; } __global__ static void rescaleInverseFourier_kernel(int _iProjectionCount, int _iDetectorCount, float* _pfInFourierOutput) { int iIndex = threadIdx.x + blockIdx.x * blockDim.x; int iProjectionIndex = iIndex / _iDetectorCount; int iDetectorIndex = iIndex % _iDetectorCount; if(iProjectionIndex >= _iProjectionCount) { return; } _pfInFourierOutput[iProjectionIndex * _iDetectorCount + iDetectorIndex] /= (float)_iDetectorCount; } static void rescaleInverseFourier(int _iProjectionCount, int _iDetectorCount, float * _pfInFourierOutput) { const int iBlockSize = 256; int iElementCount = _iProjectionCount * _iDetectorCount; int iBlockCount = (iElementCount + iBlockSize - 1) / iBlockSize; rescaleInverseFourier_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount, _iDetectorCount, _pfInFourierOutput); CHECK_ERROR("rescaleInverseFourier_kernel failed"); } void applyFilter(int _iProjectionCount, int _iFreqBinCount, cufftComplex * _pSinogram, cufftComplex * _pFilter) { const int iBlockSize = 256; int iElementCount = _iProjectionCount * _iFreqBinCount; int iBlockCount = (iElementCount + iBlockSize - 1) / iBlockSize; applyFilter_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount, _iFreqBinCount, _pSinogram, _pFilter); CHECK_ERROR("applyFilter_kernel failed"); } static bool invokeCudaFFT(int _iProjectionCount, int _iDetectorCount, const float * _pfDevSource, cufftComplex * _pDevTargetComplex) { cufftHandle plan; cufftResult result; result = cufftPlan1d(&plan, _iDetectorCount, CUFFT_R2C, _iProjectionCount); if(result != CUFFT_SUCCESS) { ASTRA_ERROR("Failed to plan 1d r2c fft"); return false; } result = cufftExecR2C(plan, (cufftReal *)_pfDevSource, _pDevTargetComplex); cufftDestroy(plan); if(result != CUFFT_SUCCESS) { ASTRA_ERROR("Failed to exec 1d r2c fft"); return false; } return true; } static bool invokeCudaIFFT(int _iProjectionCount, int _iDetectorCount, const cufftComplex * _pDevSourceComplex, float * _pfDevTarget) { cufftHandle plan; cufftResult result; result = cufftPlan1d(&plan, _iDetectorCount, CUFFT_C2R, _iProjectionCount); if(result != CUFFT_SUCCESS) { ASTRA_ERROR("Failed to plan 1d c2r fft"); return false; } // todo: why do we have to get rid of the const qualifier? result = cufftExecC2R(plan, (cufftComplex *)_pDevSourceComplex, (cufftReal *)_pfDevTarget); cufftDestroy(plan); if(result != CUFFT_SUCCESS) { ASTRA_ERROR("Failed to exec 1d c2r fft"); return false; } return true; } bool allocateComplexOnDevice(int _iProjectionCount, int _iDetectorCount, cufftComplex ** _ppDevComplex) { size_t bufferSize = sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount; SAFE_CALL(cudaMalloc((void **)_ppDevComplex, bufferSize)); return true; } bool freeComplexOnDevice(cufftComplex * _pDevComplex) { SAFE_CALL(cudaFree(_pDevComplex)); return true; } bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount, cufftComplex * _pHostComplexSource, cufftComplex * _pDevComplexTarget) { size_t memSize = sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount; SAFE_CALL(cudaMemcpy(_pDevComplexTarget, _pHostComplexSource, memSize, cudaMemcpyHostToDevice)); return true; } bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource, int _iSourcePitch, int _iProjDets, int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount, cufftComplex * _pDevTargetComplex) { float * pfDevRealFFTSource = NULL; size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount; SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTSource, bufferMemSize)); SAFE_CALL(cudaMemset(pfDevRealFFTSource, 0, bufferMemSize)); for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) { const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch; float * pfTargetLocation = pfDevRealFFTSource + iProjectionIndex * _iFFTRealDetectorCount; SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice)); } bool bResult = invokeCudaFFT(_iProjectionCount, _iFFTRealDetectorCount, pfDevRealFFTSource, _pDevTargetComplex); if(!bResult) { return false; } SAFE_CALL(cudaFree(pfDevRealFFTSource)); return true; } bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex, float * _pfRealTarget, int _iTargetPitch, int _iProjDets, int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount) { float * pfDevRealFFTTarget = NULL; size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount; SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTTarget, bufferMemSize)); bool bResult = invokeCudaIFFT(_iProjectionCount, _iFFTRealDetectorCount, _pDevSourceComplex, pfDevRealFFTTarget); if(!bResult) { return false; } rescaleInverseFourier(_iProjectionCount, _iFFTRealDetectorCount, pfDevRealFFTTarget); SAFE_CALL(cudaMemset(_pfRealTarget, 0, sizeof(float) * _iProjectionCount * _iTargetPitch)); for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) { const float * pfSourceLocation = pfDevRealFFTTarget + iProjectionIndex * _iFFTRealDetectorCount; float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch; SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice)); } SAFE_CALL(cudaFree(pfDevRealFFTTarget)); return true; } // Because the input is real, the Fourier transform is symmetric. // CUFFT only outputs the first half (ignoring the redundant second half), // and expects the same as input for the IFFT. int calcFFTFourierSize(int _iFFTRealSize) { int iFFTFourierSize = _iFFTRealSize / 2 + 1; return iFFTFourierSize; } void genIdenFilter(int _iProjectionCount, cufftComplex * _pFilter, int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount) { for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) { for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) { int iIndex = iDetectorIndex + iProjectionIndex * _iFFTFourierDetectorCount; _pFilter[iIndex].x = 1.0f; _pFilter[iIndex].y = 0.0f; } } } void genCuFFTFilter(E_FBPFILTER _eFilter, float _fD, int _iProjectionCount, cufftComplex * _pFilter, int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount, float _fParameter /* = -1.0f */) { float * pfFilt = astra::genFilter(_eFilter, _fD, _iProjectionCount, _iFFTRealDetectorCount, _iFFTFourierDetectorCount, _fParameter); for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) { float fFilterValue = pfFilt[iDetectorIndex]; for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) { int iIndex = iDetectorIndex + iProjectionIndex * _iFFTFourierDetectorCount; _pFilter[iIndex].x = fFilterValue; _pFilter[iIndex].y = 0.0f; } } delete[] pfFilt; } } #ifdef STANDALONE __global__ static void doubleFourierOutput_kernel(int _iProjectionCount, int _iDetectorCount, cufftComplex* _pFourierOutput) { int iIndex = threadIdx.x + blockIdx.x * blockDim.x; int iProjectionIndex = iIndex / _iDetectorCount; int iDetectorIndex = iIndex % _iDetectorCount; if(iProjectionIndex >= _iProjectionCount) { return; } if(iDetectorIndex <= (_iDetectorCount / 2)) { return; } int iOtherDetectorIndex = _iDetectorCount - iDetectorIndex; _pFourierOutput[iProjectionIndex * _iDetectorCount + iDetectorIndex].x = _pFourierOutput[iProjectionIndex * _iDetectorCount + iOtherDetectorIndex].x; _pFourierOutput[iProjectionIndex * _iDetectorCount + iDetectorIndex].y = -_pFourierOutput[iProjectionIndex * _iDetectorCount + iOtherDetectorIndex].y; } static void doubleFourierOutput(int _iProjectionCount, int _iDetectorCount, cufftComplex * _pFourierOutput) { const int iBlockSize = 256; int iElementCount = _iProjectionCount * _iDetectorCount; int iBlockCount = (iElementCount + iBlockSize - 1) / iBlockSize; doubleFourierOutput_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount, _iDetectorCount, _pFourierOutput); CHECK_ERROR("doubleFourierOutput_kernel failed"); } static void writeToMatlabFile(const char * _fileName, const float * _pfData, int _iRowCount, int _iColumnCount) { std::ofstream out(_fileName); for(int iRowIndex = 0; iRowIndex < _iRowCount; iRowIndex++) { for(int iColumnIndex = 0; iColumnIndex < _iColumnCount; iColumnIndex++) { out << _pfData[iColumnIndex + iRowIndex * _iColumnCount] << " "; } out << std::endl; } } static void convertComplexToRealImg(const cufftComplex * _pComplex, int _iElementCount, float * _pfReal, float * _pfImaginary) { for(int iIndex = 0; iIndex < _iElementCount; iIndex++) { _pfReal[iIndex] = _pComplex[iIndex].x; _pfImaginary[iIndex] = _pComplex[iIndex].y; } } void testCudaFFT() { const int iProjectionCount = 2; const int iDetectorCount = 1024; const int iTotalElementCount = iProjectionCount * iDetectorCount; float * pfHostProj = new float[iTotalElementCount]; memset(pfHostProj, 0, sizeof(float) * iTotalElementCount); for(int iProjectionIndex = 0; iProjectionIndex < iProjectionCount; iProjectionIndex++) { for(int iDetectorIndex = 0; iDetectorIndex < iDetectorCount; iDetectorIndex++) { // int // pfHostProj[iIndex] = (float)rand() / (float)RAND_MAX; } } writeToMatlabFile("proj.mat", pfHostProj, iProjectionCount, iDetectorCount); float * pfDevProj = NULL; SAFE_CALL(cudaMalloc((void **)&pfDevProj, sizeof(float) * iTotalElementCount)); SAFE_CALL(cudaMemcpy(pfDevProj, pfHostProj, sizeof(float) * iTotalElementCount, cudaMemcpyHostToDevice)); cufftComplex * pDevFourProj = NULL; SAFE_CALL(cudaMalloc((void **)&pDevFourProj, sizeof(cufftComplex) * iTotalElementCount)); cufftHandle plan; cufftResult result; result = cufftPlan1d(&plan, iDetectorCount, CUFFT_R2C, iProjectionCount); if(result != CUFFT_SUCCESS) { ASTRA_ERROR("Failed to plan 1d r2c fft"); } result = cufftExecR2C(plan, pfDevProj, pDevFourProj); if(result != CUFFT_SUCCESS) { ASTRA_ERROR("Failed to exec 1d r2c fft"); } cufftDestroy(plan); doubleFourierOutput(iProjectionCount, iDetectorCount, pDevFourProj); cufftComplex * pHostFourProj = new cufftComplex[iTotalElementCount]; SAFE_CALL(cudaMemcpy(pHostFourProj, pDevFourProj, sizeof(cufftComplex) * iTotalElementCount, cudaMemcpyDeviceToHost)); float * pfHostFourProjReal = new float[iTotalElementCount]; float * pfHostFourProjImaginary = new float[iTotalElementCount]; convertComplexToRealImg(pHostFourProj, iTotalElementCount, pfHostFourProjReal, pfHostFourProjImaginary); writeToMatlabFile("proj_four_real.mat", pfHostFourProjReal, iProjectionCount, iDetectorCount); writeToMatlabFile("proj_four_imaginary.mat", pfHostFourProjImaginary, iProjectionCount, iDetectorCount); float * pfDevInFourProj = NULL; SAFE_CALL(cudaMalloc((void **)&pfDevInFourProj, sizeof(float) * iTotalElementCount)); result = cufftPlan1d(&plan, iDetectorCount, CUFFT_C2R, iProjectionCount); if(result != CUFFT_SUCCESS) { ASTRA_ERROR("Failed to plan 1d c2r fft"); } result = cufftExecC2R(plan, pDevFourProj, pfDevInFourProj); if(result != CUFFT_SUCCESS) { ASTRA_ERROR("Failed to exec 1d c2r fft"); } cufftDestroy(plan); rescaleInverseFourier(iProjectionCount, iDetectorCount, pfDevInFourProj); float * pfHostInFourProj = new float[iTotalElementCount]; SAFE_CALL(cudaMemcpy(pfHostInFourProj, pfDevInFourProj, sizeof(float) * iTotalElementCount, cudaMemcpyDeviceToHost)); writeToMatlabFile("in_four.mat", pfHostInFourProj, iProjectionCount, iDetectorCount); SAFE_CALL(cudaFree(pDevFourProj)); SAFE_CALL(cudaFree(pfDevProj)); delete [] pfHostInFourProj; delete [] pfHostFourProjReal; delete [] pfHostFourProjImaginary; delete [] pfHostProj; delete [] pHostFourProj; } void downloadDebugFilterComplex(float * _pfHostSinogram, int _iProjectionCount, int _iDetectorCount, cufftComplex * _pDevFilter, int _iFilterDetCount) { cufftComplex * pHostFilter = NULL; size_t complMemSize = sizeof(cufftComplex) * _iFilterDetCount * _iProjectionCount; pHostFilter = (cufftComplex *)malloc(complMemSize); SAFE_CALL(cudaMemcpy(pHostFilter, _pDevFilter, complMemSize, cudaMemcpyDeviceToHost)); for(int iTargetProjIndex = 0; iTargetProjIndex < _iProjectionCount; iTargetProjIndex++) { for(int iTargetDetIndex = 0; iTargetDetIndex < min(_iDetectorCount, _iFilterDetCount); iTargetDetIndex++) { cufftComplex source = pHostFilter[iTargetDetIndex + iTargetProjIndex * _iFilterDetCount]; float fReadValue = sqrtf(source.x * source.x + source.y * source.y); _pfHostSinogram[iTargetDetIndex + iTargetProjIndex * _iDetectorCount] = fReadValue; } } free(pHostFilter); } void downloadDebugFilterReal(float * _pfHostSinogram, int _iProjectionCount, int _iDetectorCount, float * _pfDevFilter, int _iFilterDetCount) { float * pfHostFilter = NULL; size_t memSize = sizeof(float) * _iFilterDetCount * _iProjectionCount; pfHostFilter = (float *)malloc(memSize); SAFE_CALL(cudaMemcpy(pfHostFilter, _pfDevFilter, memSize, cudaMemcpyDeviceToHost)); for(int iTargetProjIndex = 0; iTargetProjIndex < _iProjectionCount; iTargetProjIndex++) { for(int iTargetDetIndex = 0; iTargetDetIndex < min(_iDetectorCount, _iFilterDetCount); iTargetDetIndex++) { float fSource = pfHostFilter[iTargetDetIndex + iTargetProjIndex * _iFilterDetCount]; _pfHostSinogram[iTargetDetIndex + iTargetProjIndex * _iDetectorCount] = fSource; } } free(pfHostFilter); } #endif