diff options
| -rw-r--r-- | cuda/3d/arith3d.cu | 6 | ||||
| -rw-r--r-- | cuda/3d/arith3d.h | 1 | ||||
| -rw-r--r-- | cuda/3d/astra3d.cu | 142 | ||||
| -rw-r--r-- | cuda/3d/astra3d.h | 22 | ||||
| -rw-r--r-- | cuda/3d/par3d_bp.cu | 18 | ||||
| -rw-r--r-- | include/astra/CudaBackProjectionAlgorithm3D.h | 7 | ||||
| -rw-r--r-- | matlab/mex/astra_mex_data3d_c.cpp | 4 | ||||
| -rw-r--r-- | src/CudaBackProjectionAlgorithm3D.cpp | 80 | 
8 files changed, 245 insertions, 35 deletions
diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu index 7cb56f6..cc96da5 100644 --- a/cuda/3d/arith3d.cu +++ b/cuda/3d/arith3d.cu @@ -52,6 +52,11 @@ struct opAddMul {  		out += in1 * in2;  	}  }; +struct opAdd { +	__device__ void operator()(float& out, const float in) { +		out += in; +	} +};  struct opMul {  	__device__ void operator()(float& out, const float in) {  		out *= in; @@ -594,6 +599,7 @@ INST_DDFtoD(opAddMulScaled)  INST_DDtoD(opAddMul)  INST_DDtoD(opMul2)  INST_DtoD(opMul) +INST_DtoD(opAdd)  INST_DtoD(opDividedBy)  INST_toD(opInvert)  INST_FtoD(opSet) diff --git a/cuda/3d/arith3d.h b/cuda/3d/arith3d.h index 53c9b79..5635ffd 100644 --- a/cuda/3d/arith3d.h +++ b/cuda/3d/arith3d.h @@ -37,6 +37,7 @@ struct opAddScaled;  struct opScaleAndAdd;  struct opAddMulScaled;  struct opAddMul; +struct opAdd;  struct opMul;  struct opMul2;  struct opDividedBy; diff --git a/cuda/3d/astra3d.cu b/cuda/3d/astra3d.cu index 4447775..2efac98 100644 --- a/cuda/3d/astra3d.cu +++ b/cuda/3d/astra3d.cu @@ -1463,6 +1463,40 @@ bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,  	return ok;  } +// This computes the column weights, divides by them, and adds the +// result to the current volume. This is both more expensive and more +// GPU memory intensive than the regular BP, but allows saving system RAM. +bool astraCudaPar3DBP_SIRTWeighted(float* pfVolume, const float* pfProjections, +                      unsigned int iVolX, +                      unsigned int iVolY, +                      unsigned int iVolZ, +                      unsigned int iProjAngles, +                      unsigned int iProjU, +                      unsigned int iProjV, +                      float fDetUSize, +                      float fDetVSize, +                      const float *pfAngles, +                      int iGPUIndex, int iVoxelSuperSampling) +{ +	if (iVolX == 0 || iVolY == 0 || iVolZ == 0) +		return false; +	if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) +		return false; + +	SPar3DProjection* p = genPar3DProjections(iProjAngles, +                                             iProjU, iProjV, +                                             fDetUSize, fDetVSize, +                                             pfAngles); + +	bool ok; +	ok = astraCudaPar3DBP_SIRTWeighted(pfVolume, pfProjections, iVolX, iVolY, iVolZ, +	                      iProjAngles, iProjU, iProjV, p, iGPUIndex, iVoxelSuperSampling); + +	delete[] p; + +	return ok; +} +  bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,                        unsigned int iVolX, @@ -1491,9 +1525,6 @@ bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,  	dims.iRaysPerVoxelDim = iVoxelSuperSampling; -	if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) -		return false; -  	if (iGPUIndex != -1) {  		cudaSetDevice(iGPUIndex);  		cudaError_t err = cudaGetLastError(); @@ -1540,6 +1571,111 @@ bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,  } +// This computes the column weights, divides by them, and adds the +// result to the current volume. This is both more expensive and more +// GPU memory intensive than the regular BP, but allows saving system RAM. +bool astraCudaPar3DBP_SIRTWeighted(float* pfVolume, +                      const float* pfProjections, +                      unsigned int iVolX, +                      unsigned int iVolY, +                      unsigned int iVolZ, +                      unsigned int iProjAngles, +                      unsigned int iProjU, +                      unsigned int iProjV, +                      const SPar3DProjection *pfAngles, +                      int iGPUIndex, int iVoxelSuperSampling) +{ +	SDimensions3D dims; + +	dims.iVolX = iVolX; +	dims.iVolY = iVolY; +	dims.iVolZ = iVolZ; +	if (iVolX == 0 || iVolY == 0 || iVolZ == 0) +		return false; + +	dims.iProjAngles = iProjAngles; +	dims.iProjU = iProjU; +	dims.iProjV = iProjV; + +	if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) +		return false; + +	dims.iRaysPerVoxelDim = iVoxelSuperSampling; + +	if (iGPUIndex != -1) { +		cudaSetDevice(iGPUIndex); +		cudaError_t err = cudaGetLastError(); + +		// Ignore errors caused by calling cudaSetDevice multiple times +		if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) +			return false; +	} + + +	cudaPitchedPtr D_pixelWeight = allocateVolumeData(dims); +	bool ok = D_pixelWeight.ptr; +	if (!ok) +		return false; + +	cudaPitchedPtr D_volumeData = allocateVolumeData(dims); +	ok = D_volumeData.ptr; +	if (!ok) { +		cudaFree(D_pixelWeight.ptr); +		return false; +	} + +	cudaPitchedPtr D_projData = allocateProjectionData(dims); +	ok = D_projData.ptr; +	if (!ok) { +		cudaFree(D_pixelWeight.ptr); +		cudaFree(D_volumeData.ptr); +		return false; +	} + +	// Compute weights +	ok &= zeroVolumeData(D_pixelWeight, dims); +	processSino3D<opSet>(D_projData, 1.0f, dims); +	ok &= Par3DBP(D_pixelWeight, D_projData, dims, pfAngles); +	processVol3D<opInvert>(D_pixelWeight, dims); +	if (!ok) { +		cudaFree(D_pixelWeight.ptr); +		cudaFree(D_volumeData.ptr); +		cudaFree(D_projData.ptr); +		return false; +	} + +	ok &= copyProjectionsToDevice(pfProjections, D_projData, +	                              dims, dims.iProjU); +	ok &= zeroVolumeData(D_volumeData, dims); +	// Do BP into D_volumeData +	ok &= Par3DBP(D_volumeData, D_projData, dims, pfAngles); +	// Multiply with weights +	processVol3D<opMul>(D_volumeData, D_pixelWeight, dims); + +	// Upload previous iterate to D_pixelWeight... +	ok &= copyVolumeToDevice(pfVolume, D_pixelWeight, dims, dims.iVolX); +	if (!ok) { +		cudaFree(D_pixelWeight.ptr); +		cudaFree(D_volumeData.ptr); +		cudaFree(D_projData.ptr); +		return false; +	} +	// ...and add it to the weighted BP +	processVol3D<opAdd>(D_volumeData, D_pixelWeight, dims); + +	// Then copy the result back +	ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX); + + +	cudaFree(D_pixelWeight.ptr); +	cudaFree(D_volumeData.ptr); +	cudaFree(D_projData.ptr); + +	return ok; + +} + +  bool astraCudaFDK(float* pfVolume, const float* pfProjections,                    unsigned int iVolX, diff --git a/cuda/3d/astra3d.h b/cuda/3d/astra3d.h index 5712f89..9d87e33 100644 --- a/cuda/3d/astra3d.h +++ b/cuda/3d/astra3d.h @@ -429,6 +429,28 @@ _AstraExport bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,                        const SPar3DProjection *pfAngles,                        int iGPUIndex, int iVoxelSuperSampling); +_AstraExport bool astraCudaPar3DBP_SIRTWeighted(float* pfVolume, const float* pfProjections, +                      unsigned int iVolX, +                      unsigned int iVolY, +                      unsigned int iVolZ, +                      unsigned int iProjAngles, +                      unsigned int iProjU, +                      unsigned int iProjV, +                      float fDetUSize, +                      float fDetVSize, +                      const float *pfAngles, +                      int iGPUIndex, int iVoxelSuperSampling); + +_AstraExport bool astraCudaPar3DBP_SIRTWeighted(float* pfVolume, const float* pfProjections, +                      unsigned int iVolX, +                      unsigned int iVolY, +                      unsigned int iVolZ, +                      unsigned int iProjAngles, +                      unsigned int iProjU, +                      unsigned int iProjV, +                      const SPar3DProjection *pfAngles, +                      int iGPUIndex, int iVoxelSuperSampling); +  _AstraExport bool astraCudaFDK(float* pfVolume, const float* pfProjections,                    unsigned int iVolX,                    unsigned int iVolY, diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu index 9bf9a9f..8cb285c 100644 --- a/cuda/3d/par3d_bp.cu +++ b/cuda/3d/par3d_bp.cu @@ -136,13 +136,10 @@ __global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAn  			const float fCvz = gC_Cvz[angle];  			const float fCvc = gC_Cvc[angle]; -			const float fUNum = fCuc + fX * fCux + fY * fCuy + fZ * fCuz; -			const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz; +			const float fU = fCuc + fX * fCux + fY * fCuy + fZ * fCuz; +			const float fV = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz; -			const float fU = fUNum; -			const float fV = fVNum; - -			fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order +			fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV);  		} @@ -213,13 +210,10 @@ __global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, int star  			float fZs = fZ;  			for (int iSubZ = 0; iSubZ < dims.iRaysPerVoxelDim; ++iSubZ) { -				const float fUNum = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz; -				const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz; - -				const float fU = fUNum; -				const float fV = fVNum; +				const float fU = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz; +				const float fV = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz; -				fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order +				fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV);  				fZs += fSubStep;  			}  			fYs += fSubStep; diff --git a/include/astra/CudaBackProjectionAlgorithm3D.h b/include/astra/CudaBackProjectionAlgorithm3D.h index a9cc559..90a085d 100644 --- a/include/astra/CudaBackProjectionAlgorithm3D.h +++ b/include/astra/CudaBackProjectionAlgorithm3D.h @@ -140,6 +140,13 @@ protected:  	int m_iGPUIndex;  	int m_iVoxelSuperSampling; +	/** Option to compute the column weights on the fly, divide by +	 *  them, and add the result to the current volume. This is both +	 *  more expensive and more GPU memory intensive than the regular +	 *  BP, but allows saving system RAM. +	 */ +	bool m_bSIRTWeighting; +  };  // inline functions diff --git a/matlab/mex/astra_mex_data3d_c.cpp b/matlab/mex/astra_mex_data3d_c.cpp index dfb3003..8492695 100644 --- a/matlab/mex/astra_mex_data3d_c.cpp +++ b/matlab/mex/astra_mex_data3d_c.cpp @@ -277,6 +277,7 @@ extern "C" {  mxArray *mxCreateSharedDataCopy(const mxArray *pr);  bool mxUnshareArray(const mxArray *pr, const bool noDeepCopy);  mxArray *mxUnreference(const mxArray *pr); +bool mxIsSharedArray(const mxArray *pr);  }  class CFloat32CustomMemoryMatlab3D : public CFloat32CustomMemory { @@ -287,6 +288,9 @@ public:  		//fprintf(stderr, "Passed:\narray: %p\tdata: %p\n", (void*)_pArray, (void*)mxGetData(_pArray));  		// First unshare the input array, so that we may modify it.  		if (bUnshare) { +			if (mxIsSharedArray(_pArray)) { +				fprintf(stderr, "Performance note: unsharing shared array in link\n"); +			}  			mxUnshareArray(_pArray, false);  			//fprintf(stderr, "Unshared:\narray: %p\tdata: %p\n", (void*)_pArray, (void*)mxGetData(_pArray));  		} diff --git a/src/CudaBackProjectionAlgorithm3D.cpp b/src/CudaBackProjectionAlgorithm3D.cpp index b096756..891d7fe 100644 --- a/src/CudaBackProjectionAlgorithm3D.cpp +++ b/src/CudaBackProjectionAlgorithm3D.cpp @@ -53,6 +53,7 @@ CCudaBackProjectionAlgorithm3D::CCudaBackProjectionAlgorithm3D()  	m_bIsInitialized = false;  	m_iGPUIndex = -1;  	m_iVoxelSuperSampling = 1; +	m_bSIRTWeighting = false;  }  //---------------------------------------------------------------------------------------- @@ -106,6 +107,17 @@ bool CCudaBackProjectionAlgorithm3D::initialize(const Config& _cfg)  	m_iVoxelSuperSampling = (int)_cfg.self->getOptionNumerical("VoxelSuperSampling", 1);  	CC.markOptionParsed("VoxelSuperSampling"); +	CFloat32ProjectionData3DMemory* pSinoMem = dynamic_cast<CFloat32ProjectionData3DMemory*>(m_pSinogram); +	ASTRA_ASSERT(pSinoMem); +	const CProjectionGeometry3D* projgeom = pSinoMem->getGeometry(); +const CParallelProjectionGeometry3D* par3dgeom = dynamic_cast<const CParallelProjectionGeometry3D*>(projgeom); +	const CParallelVecProjectionGeometry3D* parvec3dgeom = dynamic_cast<const CParallelVecProjectionGeometry3D*>(projgeom); +	if (parvec3dgeom || par3dgeom) { +		// This option is only supported for Par3D currently +		m_bSIRTWeighting = _cfg.self->getOptionBool("SIRTWeighting", false); +		CC.markOptionParsed("SIRTWeighting"); +	} +  	// success  	m_bIsInitialized = _check();  	return m_bIsInitialized; @@ -181,27 +193,55 @@ void CCudaBackProjectionAlgorithm3D::run(int _iNrIterations)  		                conegeom->getProjectionAngles(),  		                m_iGPUIndex, m_iVoxelSuperSampling);  	} else if (par3dgeom) { -		astraCudaPar3DBP(pReconMem->getData(), pSinoMem->getDataConst(), -		                 volgeom.getGridColCount(), -		                 volgeom.getGridRowCount(), -		                 volgeom.getGridSliceCount(), -		                 par3dgeom->getProjectionCount(), -		                 par3dgeom->getDetectorColCount(), -		                 par3dgeom->getDetectorRowCount(), -		                 par3dgeom->getDetectorSpacingX(), -		                 par3dgeom->getDetectorSpacingY(), -		                 par3dgeom->getProjectionAngles(), -		                 m_iGPUIndex, m_iVoxelSuperSampling); +		if (!m_bSIRTWeighting) { +			astraCudaPar3DBP(pReconMem->getData(), pSinoMem->getDataConst(), +			                 volgeom.getGridColCount(), +			                 volgeom.getGridRowCount(), +			                 volgeom.getGridSliceCount(), +			                 par3dgeom->getProjectionCount(), +			                 par3dgeom->getDetectorColCount(), +			                 par3dgeom->getDetectorRowCount(), +			                 par3dgeom->getDetectorSpacingX(), +			                 par3dgeom->getDetectorSpacingY(), +			                 par3dgeom->getProjectionAngles(), +			                 m_iGPUIndex, m_iVoxelSuperSampling); +		} else { +			astraCudaPar3DBP_SIRTWeighted(pReconMem->getData(), +			                 pSinoMem->getDataConst(), +			                 volgeom.getGridColCount(), +			                 volgeom.getGridRowCount(), +			                 volgeom.getGridSliceCount(), +			                 par3dgeom->getProjectionCount(), +			                 par3dgeom->getDetectorColCount(), +			                 par3dgeom->getDetectorRowCount(), +			                 par3dgeom->getDetectorSpacingX(), +			                 par3dgeom->getDetectorSpacingY(), +			                 par3dgeom->getProjectionAngles(), +			                 m_iGPUIndex, m_iVoxelSuperSampling); +		}  	} else if (parvec3dgeom) { -		astraCudaPar3DBP(pReconMem->getData(), pSinoMem->getDataConst(), -		                 volgeom.getGridColCount(), -		                 volgeom.getGridRowCount(), -		                 volgeom.getGridSliceCount(), -		                 parvec3dgeom->getProjectionCount(), -		                 parvec3dgeom->getDetectorColCount(), -		                 parvec3dgeom->getDetectorRowCount(), -		                 parvec3dgeom->getProjectionVectors(), -		                 m_iGPUIndex, m_iVoxelSuperSampling); +		if (!m_bSIRTWeighting) { +			astraCudaPar3DBP(pReconMem->getData(), pSinoMem->getDataConst(), +			                 volgeom.getGridColCount(), +			                 volgeom.getGridRowCount(), +			                 volgeom.getGridSliceCount(), +			                 parvec3dgeom->getProjectionCount(), +			                 parvec3dgeom->getDetectorColCount(), +			                 parvec3dgeom->getDetectorRowCount(), +			                 parvec3dgeom->getProjectionVectors(), +			                 m_iGPUIndex, m_iVoxelSuperSampling); +		} else { +			astraCudaPar3DBP_SIRTWeighted(pReconMem->getData(), +			                 pSinoMem->getDataConst(), +			                 volgeom.getGridColCount(), +			                 volgeom.getGridRowCount(), +			                 volgeom.getGridSliceCount(), +			                 parvec3dgeom->getProjectionCount(), +			                 parvec3dgeom->getDetectorColCount(), +			                 parvec3dgeom->getDetectorRowCount(), +			                 parvec3dgeom->getProjectionVectors(), +			                 m_iGPUIndex, m_iVoxelSuperSampling); +		}  	} else if (conevecgeom) {  		astraCudaConeBP(pReconMem->getData(), pSinoMem->getDataConst(),  		                volgeom.getGridColCount(),  | 
