diff options
-rw-r--r-- | cuda/3d/arith3d.cu | 6 | ||||
-rw-r--r-- | cuda/3d/arith3d.h | 1 | ||||
-rw-r--r-- | cuda/3d/astra3d.cu | 142 | ||||
-rw-r--r-- | cuda/3d/astra3d.h | 22 | ||||
-rw-r--r-- | cuda/3d/par3d_bp.cu | 18 | ||||
-rw-r--r-- | include/astra/CudaBackProjectionAlgorithm3D.h | 7 | ||||
-rw-r--r-- | matlab/mex/astra_mex_data3d_c.cpp | 4 | ||||
-rw-r--r-- | src/CudaBackProjectionAlgorithm3D.cpp | 80 |
8 files changed, 245 insertions, 35 deletions
diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu index 7cb56f6..cc96da5 100644 --- a/cuda/3d/arith3d.cu +++ b/cuda/3d/arith3d.cu @@ -52,6 +52,11 @@ struct opAddMul { out += in1 * in2; } }; +struct opAdd { + __device__ void operator()(float& out, const float in) { + out += in; + } +}; struct opMul { __device__ void operator()(float& out, const float in) { out *= in; @@ -594,6 +599,7 @@ INST_DDFtoD(opAddMulScaled) INST_DDtoD(opAddMul) INST_DDtoD(opMul2) INST_DtoD(opMul) +INST_DtoD(opAdd) INST_DtoD(opDividedBy) INST_toD(opInvert) INST_FtoD(opSet) diff --git a/cuda/3d/arith3d.h b/cuda/3d/arith3d.h index 53c9b79..5635ffd 100644 --- a/cuda/3d/arith3d.h +++ b/cuda/3d/arith3d.h @@ -37,6 +37,7 @@ struct opAddScaled; struct opScaleAndAdd; struct opAddMulScaled; struct opAddMul; +struct opAdd; struct opMul; struct opMul2; struct opDividedBy; diff --git a/cuda/3d/astra3d.cu b/cuda/3d/astra3d.cu index 4447775..2efac98 100644 --- a/cuda/3d/astra3d.cu +++ b/cuda/3d/astra3d.cu @@ -1463,6 +1463,40 @@ bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections, return ok; } +// This computes the column weights, divides by them, and adds the +// result to the current volume. This is both more expensive and more +// GPU memory intensive than the regular BP, but allows saving system RAM. +bool astraCudaPar3DBP_SIRTWeighted(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iVoxelSuperSampling) +{ + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SPar3DProjection* p = genPar3DProjections(iProjAngles, + iProjU, iProjV, + fDetUSize, fDetVSize, + pfAngles); + + bool ok; + ok = astraCudaPar3DBP_SIRTWeighted(pfVolume, pfProjections, iVolX, iVolY, iVolZ, + iProjAngles, iProjU, iProjV, p, iGPUIndex, iVoxelSuperSampling); + + delete[] p; + + return ok; +} + bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections, unsigned int iVolX, @@ -1491,9 +1525,6 @@ bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections, dims.iRaysPerVoxelDim = iVoxelSuperSampling; - if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) - return false; - if (iGPUIndex != -1) { cudaSetDevice(iGPUIndex); cudaError_t err = cudaGetLastError(); @@ -1540,6 +1571,111 @@ bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections, } +// This computes the column weights, divides by them, and adds the +// result to the current volume. This is both more expensive and more +// GPU memory intensive than the regular BP, but allows saving system RAM. +bool astraCudaPar3DBP_SIRTWeighted(float* pfVolume, + const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection *pfAngles, + int iGPUIndex, int iVoxelSuperSampling) +{ + SDimensions3D dims; + + dims.iVolX = iVolX; + dims.iVolY = iVolY; + dims.iVolZ = iVolZ; + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjU = iProjU; + dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + dims.iRaysPerVoxelDim = iVoxelSuperSampling; + + if (iGPUIndex != -1) { + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + } + + + cudaPitchedPtr D_pixelWeight = allocateVolumeData(dims); + bool ok = D_pixelWeight.ptr; + if (!ok) + return false; + + cudaPitchedPtr D_volumeData = allocateVolumeData(dims); + ok = D_volumeData.ptr; + if (!ok) { + cudaFree(D_pixelWeight.ptr); + return false; + } + + cudaPitchedPtr D_projData = allocateProjectionData(dims); + ok = D_projData.ptr; + if (!ok) { + cudaFree(D_pixelWeight.ptr); + cudaFree(D_volumeData.ptr); + return false; + } + + // Compute weights + ok &= zeroVolumeData(D_pixelWeight, dims); + processSino3D<opSet>(D_projData, 1.0f, dims); + ok &= Par3DBP(D_pixelWeight, D_projData, dims, pfAngles); + processVol3D<opInvert>(D_pixelWeight, dims); + if (!ok) { + cudaFree(D_pixelWeight.ptr); + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + return false; + } + + ok &= copyProjectionsToDevice(pfProjections, D_projData, + dims, dims.iProjU); + ok &= zeroVolumeData(D_volumeData, dims); + // Do BP into D_volumeData + ok &= Par3DBP(D_volumeData, D_projData, dims, pfAngles); + // Multiply with weights + processVol3D<opMul>(D_volumeData, D_pixelWeight, dims); + + // Upload previous iterate to D_pixelWeight... + ok &= copyVolumeToDevice(pfVolume, D_pixelWeight, dims, dims.iVolX); + if (!ok) { + cudaFree(D_pixelWeight.ptr); + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + return false; + } + // ...and add it to the weighted BP + processVol3D<opAdd>(D_volumeData, D_pixelWeight, dims); + + // Then copy the result back + ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX); + + + cudaFree(D_pixelWeight.ptr); + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + + return ok; + +} + + bool astraCudaFDK(float* pfVolume, const float* pfProjections, unsigned int iVolX, diff --git a/cuda/3d/astra3d.h b/cuda/3d/astra3d.h index 5712f89..9d87e33 100644 --- a/cuda/3d/astra3d.h +++ b/cuda/3d/astra3d.h @@ -429,6 +429,28 @@ _AstraExport bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections, const SPar3DProjection *pfAngles, int iGPUIndex, int iVoxelSuperSampling); +_AstraExport bool astraCudaPar3DBP_SIRTWeighted(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iVoxelSuperSampling); + +_AstraExport bool astraCudaPar3DBP_SIRTWeighted(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection *pfAngles, + int iGPUIndex, int iVoxelSuperSampling); + _AstraExport bool astraCudaFDK(float* pfVolume, const float* pfProjections, unsigned int iVolX, unsigned int iVolY, diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu index 9bf9a9f..8cb285c 100644 --- a/cuda/3d/par3d_bp.cu +++ b/cuda/3d/par3d_bp.cu @@ -136,13 +136,10 @@ __global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAn const float fCvz = gC_Cvz[angle]; const float fCvc = gC_Cvc[angle]; - const float fUNum = fCuc + fX * fCux + fY * fCuy + fZ * fCuz; - const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz; + const float fU = fCuc + fX * fCux + fY * fCuy + fZ * fCuz; + const float fV = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz; - const float fU = fUNum; - const float fV = fVNum; - - fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order + fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); } @@ -213,13 +210,10 @@ __global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, int star float fZs = fZ; for (int iSubZ = 0; iSubZ < dims.iRaysPerVoxelDim; ++iSubZ) { - const float fUNum = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz; - const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz; - - const float fU = fUNum; - const float fV = fVNum; + const float fU = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz; + const float fV = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz; - fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order + fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); fZs += fSubStep; } fYs += fSubStep; diff --git a/include/astra/CudaBackProjectionAlgorithm3D.h b/include/astra/CudaBackProjectionAlgorithm3D.h index a9cc559..90a085d 100644 --- a/include/astra/CudaBackProjectionAlgorithm3D.h +++ b/include/astra/CudaBackProjectionAlgorithm3D.h @@ -140,6 +140,13 @@ protected: int m_iGPUIndex; int m_iVoxelSuperSampling; + /** Option to compute the column weights on the fly, divide by + * them, and add the result to the current volume. This is both + * more expensive and more GPU memory intensive than the regular + * BP, but allows saving system RAM. + */ + bool m_bSIRTWeighting; + }; // inline functions diff --git a/matlab/mex/astra_mex_data3d_c.cpp b/matlab/mex/astra_mex_data3d_c.cpp index dfb3003..8492695 100644 --- a/matlab/mex/astra_mex_data3d_c.cpp +++ b/matlab/mex/astra_mex_data3d_c.cpp @@ -277,6 +277,7 @@ extern "C" { mxArray *mxCreateSharedDataCopy(const mxArray *pr); bool mxUnshareArray(const mxArray *pr, const bool noDeepCopy); mxArray *mxUnreference(const mxArray *pr); +bool mxIsSharedArray(const mxArray *pr); } class CFloat32CustomMemoryMatlab3D : public CFloat32CustomMemory { @@ -287,6 +288,9 @@ public: //fprintf(stderr, "Passed:\narray: %p\tdata: %p\n", (void*)_pArray, (void*)mxGetData(_pArray)); // First unshare the input array, so that we may modify it. if (bUnshare) { + if (mxIsSharedArray(_pArray)) { + fprintf(stderr, "Performance note: unsharing shared array in link\n"); + } mxUnshareArray(_pArray, false); //fprintf(stderr, "Unshared:\narray: %p\tdata: %p\n", (void*)_pArray, (void*)mxGetData(_pArray)); } diff --git a/src/CudaBackProjectionAlgorithm3D.cpp b/src/CudaBackProjectionAlgorithm3D.cpp index b096756..891d7fe 100644 --- a/src/CudaBackProjectionAlgorithm3D.cpp +++ b/src/CudaBackProjectionAlgorithm3D.cpp @@ -53,6 +53,7 @@ CCudaBackProjectionAlgorithm3D::CCudaBackProjectionAlgorithm3D() m_bIsInitialized = false; m_iGPUIndex = -1; m_iVoxelSuperSampling = 1; + m_bSIRTWeighting = false; } //---------------------------------------------------------------------------------------- @@ -106,6 +107,17 @@ bool CCudaBackProjectionAlgorithm3D::initialize(const Config& _cfg) m_iVoxelSuperSampling = (int)_cfg.self->getOptionNumerical("VoxelSuperSampling", 1); CC.markOptionParsed("VoxelSuperSampling"); + CFloat32ProjectionData3DMemory* pSinoMem = dynamic_cast<CFloat32ProjectionData3DMemory*>(m_pSinogram); + ASTRA_ASSERT(pSinoMem); + const CProjectionGeometry3D* projgeom = pSinoMem->getGeometry(); +const CParallelProjectionGeometry3D* par3dgeom = dynamic_cast<const CParallelProjectionGeometry3D*>(projgeom); + const CParallelVecProjectionGeometry3D* parvec3dgeom = dynamic_cast<const CParallelVecProjectionGeometry3D*>(projgeom); + if (parvec3dgeom || par3dgeom) { + // This option is only supported for Par3D currently + m_bSIRTWeighting = _cfg.self->getOptionBool("SIRTWeighting", false); + CC.markOptionParsed("SIRTWeighting"); + } + // success m_bIsInitialized = _check(); return m_bIsInitialized; @@ -181,27 +193,55 @@ void CCudaBackProjectionAlgorithm3D::run(int _iNrIterations) conegeom->getProjectionAngles(), m_iGPUIndex, m_iVoxelSuperSampling); } else if (par3dgeom) { - astraCudaPar3DBP(pReconMem->getData(), pSinoMem->getDataConst(), - volgeom.getGridColCount(), - volgeom.getGridRowCount(), - volgeom.getGridSliceCount(), - par3dgeom->getProjectionCount(), - par3dgeom->getDetectorColCount(), - par3dgeom->getDetectorRowCount(), - par3dgeom->getDetectorSpacingX(), - par3dgeom->getDetectorSpacingY(), - par3dgeom->getProjectionAngles(), - m_iGPUIndex, m_iVoxelSuperSampling); + if (!m_bSIRTWeighting) { + astraCudaPar3DBP(pReconMem->getData(), pSinoMem->getDataConst(), + volgeom.getGridColCount(), + volgeom.getGridRowCount(), + volgeom.getGridSliceCount(), + par3dgeom->getProjectionCount(), + par3dgeom->getDetectorColCount(), + par3dgeom->getDetectorRowCount(), + par3dgeom->getDetectorSpacingX(), + par3dgeom->getDetectorSpacingY(), + par3dgeom->getProjectionAngles(), + m_iGPUIndex, m_iVoxelSuperSampling); + } else { + astraCudaPar3DBP_SIRTWeighted(pReconMem->getData(), + pSinoMem->getDataConst(), + volgeom.getGridColCount(), + volgeom.getGridRowCount(), + volgeom.getGridSliceCount(), + par3dgeom->getProjectionCount(), + par3dgeom->getDetectorColCount(), + par3dgeom->getDetectorRowCount(), + par3dgeom->getDetectorSpacingX(), + par3dgeom->getDetectorSpacingY(), + par3dgeom->getProjectionAngles(), + m_iGPUIndex, m_iVoxelSuperSampling); + } } else if (parvec3dgeom) { - astraCudaPar3DBP(pReconMem->getData(), pSinoMem->getDataConst(), - volgeom.getGridColCount(), - volgeom.getGridRowCount(), - volgeom.getGridSliceCount(), - parvec3dgeom->getProjectionCount(), - parvec3dgeom->getDetectorColCount(), - parvec3dgeom->getDetectorRowCount(), - parvec3dgeom->getProjectionVectors(), - m_iGPUIndex, m_iVoxelSuperSampling); + if (!m_bSIRTWeighting) { + astraCudaPar3DBP(pReconMem->getData(), pSinoMem->getDataConst(), + volgeom.getGridColCount(), + volgeom.getGridRowCount(), + volgeom.getGridSliceCount(), + parvec3dgeom->getProjectionCount(), + parvec3dgeom->getDetectorColCount(), + parvec3dgeom->getDetectorRowCount(), + parvec3dgeom->getProjectionVectors(), + m_iGPUIndex, m_iVoxelSuperSampling); + } else { + astraCudaPar3DBP_SIRTWeighted(pReconMem->getData(), + pSinoMem->getDataConst(), + volgeom.getGridColCount(), + volgeom.getGridRowCount(), + volgeom.getGridSliceCount(), + parvec3dgeom->getProjectionCount(), + parvec3dgeom->getDetectorColCount(), + parvec3dgeom->getDetectorRowCount(), + parvec3dgeom->getProjectionVectors(), + m_iGPUIndex, m_iVoxelSuperSampling); + } } else if (conevecgeom) { astraCudaConeBP(pReconMem->getData(), pSinoMem->getDataConst(), volgeom.getGridColCount(), |