summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--cuda/3d/arith3d.cu6
-rw-r--r--cuda/3d/arith3d.h1
-rw-r--r--cuda/3d/astra3d.cu142
-rw-r--r--cuda/3d/astra3d.h22
-rw-r--r--cuda/3d/par3d_bp.cu18
-rw-r--r--include/astra/CudaBackProjectionAlgorithm3D.h7
-rw-r--r--matlab/mex/astra_mex_data3d_c.cpp4
-rw-r--r--src/CudaBackProjectionAlgorithm3D.cpp80
8 files changed, 245 insertions, 35 deletions
diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu
index 7cb56f6..cc96da5 100644
--- a/cuda/3d/arith3d.cu
+++ b/cuda/3d/arith3d.cu
@@ -52,6 +52,11 @@ struct opAddMul {
out += in1 * in2;
}
};
+struct opAdd {
+ __device__ void operator()(float& out, const float in) {
+ out += in;
+ }
+};
struct opMul {
__device__ void operator()(float& out, const float in) {
out *= in;
@@ -594,6 +599,7 @@ INST_DDFtoD(opAddMulScaled)
INST_DDtoD(opAddMul)
INST_DDtoD(opMul2)
INST_DtoD(opMul)
+INST_DtoD(opAdd)
INST_DtoD(opDividedBy)
INST_toD(opInvert)
INST_FtoD(opSet)
diff --git a/cuda/3d/arith3d.h b/cuda/3d/arith3d.h
index 53c9b79..5635ffd 100644
--- a/cuda/3d/arith3d.h
+++ b/cuda/3d/arith3d.h
@@ -37,6 +37,7 @@ struct opAddScaled;
struct opScaleAndAdd;
struct opAddMulScaled;
struct opAddMul;
+struct opAdd;
struct opMul;
struct opMul2;
struct opDividedBy;
diff --git a/cuda/3d/astra3d.cu b/cuda/3d/astra3d.cu
index 4447775..2efac98 100644
--- a/cuda/3d/astra3d.cu
+++ b/cuda/3d/astra3d.cu
@@ -1463,6 +1463,40 @@ bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,
return ok;
}
+// This computes the column weights, divides by them, and adds the
+// result to the current volume. This is both more expensive and more
+// GPU memory intensive than the regular BP, but allows saving system RAM.
+bool astraCudaPar3DBP_SIRTWeighted(float* pfVolume, const float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles,
+ int iGPUIndex, int iVoxelSuperSampling)
+{
+ if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+ return false;
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ SPar3DProjection* p = genPar3DProjections(iProjAngles,
+ iProjU, iProjV,
+ fDetUSize, fDetVSize,
+ pfAngles);
+
+ bool ok;
+ ok = astraCudaPar3DBP_SIRTWeighted(pfVolume, pfProjections, iVolX, iVolY, iVolZ,
+ iProjAngles, iProjU, iProjV, p, iGPUIndex, iVoxelSuperSampling);
+
+ delete[] p;
+
+ return ok;
+}
+
bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,
unsigned int iVolX,
@@ -1491,9 +1525,6 @@ bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,
dims.iRaysPerVoxelDim = iVoxelSuperSampling;
- if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
- return false;
-
if (iGPUIndex != -1) {
cudaSetDevice(iGPUIndex);
cudaError_t err = cudaGetLastError();
@@ -1540,6 +1571,111 @@ bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,
}
+// This computes the column weights, divides by them, and adds the
+// result to the current volume. This is both more expensive and more
+// GPU memory intensive than the regular BP, but allows saving system RAM.
+bool astraCudaPar3DBP_SIRTWeighted(float* pfVolume,
+ const float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SPar3DProjection *pfAngles,
+ int iGPUIndex, int iVoxelSuperSampling)
+{
+ SDimensions3D dims;
+
+ dims.iVolX = iVolX;
+ dims.iVolY = iVolY;
+ dims.iVolZ = iVolZ;
+ if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+ return false;
+
+ dims.iProjAngles = iProjAngles;
+ dims.iProjU = iProjU;
+ dims.iProjV = iProjV;
+
+ if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+ return false;
+
+ dims.iRaysPerVoxelDim = iVoxelSuperSampling;
+
+ if (iGPUIndex != -1) {
+ cudaSetDevice(iGPUIndex);
+ cudaError_t err = cudaGetLastError();
+
+ // Ignore errors caused by calling cudaSetDevice multiple times
+ if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+ return false;
+ }
+
+
+ cudaPitchedPtr D_pixelWeight = allocateVolumeData(dims);
+ bool ok = D_pixelWeight.ptr;
+ if (!ok)
+ return false;
+
+ cudaPitchedPtr D_volumeData = allocateVolumeData(dims);
+ ok = D_volumeData.ptr;
+ if (!ok) {
+ cudaFree(D_pixelWeight.ptr);
+ return false;
+ }
+
+ cudaPitchedPtr D_projData = allocateProjectionData(dims);
+ ok = D_projData.ptr;
+ if (!ok) {
+ cudaFree(D_pixelWeight.ptr);
+ cudaFree(D_volumeData.ptr);
+ return false;
+ }
+
+ // Compute weights
+ ok &= zeroVolumeData(D_pixelWeight, dims);
+ processSino3D<opSet>(D_projData, 1.0f, dims);
+ ok &= Par3DBP(D_pixelWeight, D_projData, dims, pfAngles);
+ processVol3D<opInvert>(D_pixelWeight, dims);
+ if (!ok) {
+ cudaFree(D_pixelWeight.ptr);
+ cudaFree(D_volumeData.ptr);
+ cudaFree(D_projData.ptr);
+ return false;
+ }
+
+ ok &= copyProjectionsToDevice(pfProjections, D_projData,
+ dims, dims.iProjU);
+ ok &= zeroVolumeData(D_volumeData, dims);
+ // Do BP into D_volumeData
+ ok &= Par3DBP(D_volumeData, D_projData, dims, pfAngles);
+ // Multiply with weights
+ processVol3D<opMul>(D_volumeData, D_pixelWeight, dims);
+
+ // Upload previous iterate to D_pixelWeight...
+ ok &= copyVolumeToDevice(pfVolume, D_pixelWeight, dims, dims.iVolX);
+ if (!ok) {
+ cudaFree(D_pixelWeight.ptr);
+ cudaFree(D_volumeData.ptr);
+ cudaFree(D_projData.ptr);
+ return false;
+ }
+ // ...and add it to the weighted BP
+ processVol3D<opAdd>(D_volumeData, D_pixelWeight, dims);
+
+ // Then copy the result back
+ ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX);
+
+
+ cudaFree(D_pixelWeight.ptr);
+ cudaFree(D_volumeData.ptr);
+ cudaFree(D_projData.ptr);
+
+ return ok;
+
+}
+
+
bool astraCudaFDK(float* pfVolume, const float* pfProjections,
unsigned int iVolX,
diff --git a/cuda/3d/astra3d.h b/cuda/3d/astra3d.h
index 5712f89..9d87e33 100644
--- a/cuda/3d/astra3d.h
+++ b/cuda/3d/astra3d.h
@@ -429,6 +429,28 @@ _AstraExport bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,
const SPar3DProjection *pfAngles,
int iGPUIndex, int iVoxelSuperSampling);
+_AstraExport bool astraCudaPar3DBP_SIRTWeighted(float* pfVolume, const float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ float fDetUSize,
+ float fDetVSize,
+ const float *pfAngles,
+ int iGPUIndex, int iVoxelSuperSampling);
+
+_AstraExport bool astraCudaPar3DBP_SIRTWeighted(float* pfVolume, const float* pfProjections,
+ unsigned int iVolX,
+ unsigned int iVolY,
+ unsigned int iVolZ,
+ unsigned int iProjAngles,
+ unsigned int iProjU,
+ unsigned int iProjV,
+ const SPar3DProjection *pfAngles,
+ int iGPUIndex, int iVoxelSuperSampling);
+
_AstraExport bool astraCudaFDK(float* pfVolume, const float* pfProjections,
unsigned int iVolX,
unsigned int iVolY,
diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu
index 9bf9a9f..8cb285c 100644
--- a/cuda/3d/par3d_bp.cu
+++ b/cuda/3d/par3d_bp.cu
@@ -136,13 +136,10 @@ __global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAn
const float fCvz = gC_Cvz[angle];
const float fCvc = gC_Cvc[angle];
- const float fUNum = fCuc + fX * fCux + fY * fCuy + fZ * fCuz;
- const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz;
+ const float fU = fCuc + fX * fCux + fY * fCuy + fZ * fCuz;
+ const float fV = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz;
- const float fU = fUNum;
- const float fV = fVNum;
-
- fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order
+ fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV);
}
@@ -213,13 +210,10 @@ __global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, int star
float fZs = fZ;
for (int iSubZ = 0; iSubZ < dims.iRaysPerVoxelDim; ++iSubZ) {
- const float fUNum = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz;
- const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz;
-
- const float fU = fUNum;
- const float fV = fVNum;
+ const float fU = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz;
+ const float fV = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz;
- fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order
+ fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV);
fZs += fSubStep;
}
fYs += fSubStep;
diff --git a/include/astra/CudaBackProjectionAlgorithm3D.h b/include/astra/CudaBackProjectionAlgorithm3D.h
index a9cc559..90a085d 100644
--- a/include/astra/CudaBackProjectionAlgorithm3D.h
+++ b/include/astra/CudaBackProjectionAlgorithm3D.h
@@ -140,6 +140,13 @@ protected:
int m_iGPUIndex;
int m_iVoxelSuperSampling;
+ /** Option to compute the column weights on the fly, divide by
+ * them, and add the result to the current volume. This is both
+ * more expensive and more GPU memory intensive than the regular
+ * BP, but allows saving system RAM.
+ */
+ bool m_bSIRTWeighting;
+
};
// inline functions
diff --git a/matlab/mex/astra_mex_data3d_c.cpp b/matlab/mex/astra_mex_data3d_c.cpp
index dfb3003..8492695 100644
--- a/matlab/mex/astra_mex_data3d_c.cpp
+++ b/matlab/mex/astra_mex_data3d_c.cpp
@@ -277,6 +277,7 @@ extern "C" {
mxArray *mxCreateSharedDataCopy(const mxArray *pr);
bool mxUnshareArray(const mxArray *pr, const bool noDeepCopy);
mxArray *mxUnreference(const mxArray *pr);
+bool mxIsSharedArray(const mxArray *pr);
}
class CFloat32CustomMemoryMatlab3D : public CFloat32CustomMemory {
@@ -287,6 +288,9 @@ public:
//fprintf(stderr, "Passed:\narray: %p\tdata: %p\n", (void*)_pArray, (void*)mxGetData(_pArray));
// First unshare the input array, so that we may modify it.
if (bUnshare) {
+ if (mxIsSharedArray(_pArray)) {
+ fprintf(stderr, "Performance note: unsharing shared array in link\n");
+ }
mxUnshareArray(_pArray, false);
//fprintf(stderr, "Unshared:\narray: %p\tdata: %p\n", (void*)_pArray, (void*)mxGetData(_pArray));
}
diff --git a/src/CudaBackProjectionAlgorithm3D.cpp b/src/CudaBackProjectionAlgorithm3D.cpp
index b096756..891d7fe 100644
--- a/src/CudaBackProjectionAlgorithm3D.cpp
+++ b/src/CudaBackProjectionAlgorithm3D.cpp
@@ -53,6 +53,7 @@ CCudaBackProjectionAlgorithm3D::CCudaBackProjectionAlgorithm3D()
m_bIsInitialized = false;
m_iGPUIndex = -1;
m_iVoxelSuperSampling = 1;
+ m_bSIRTWeighting = false;
}
//----------------------------------------------------------------------------------------
@@ -106,6 +107,17 @@ bool CCudaBackProjectionAlgorithm3D::initialize(const Config& _cfg)
m_iVoxelSuperSampling = (int)_cfg.self->getOptionNumerical("VoxelSuperSampling", 1);
CC.markOptionParsed("VoxelSuperSampling");
+ CFloat32ProjectionData3DMemory* pSinoMem = dynamic_cast<CFloat32ProjectionData3DMemory*>(m_pSinogram);
+ ASTRA_ASSERT(pSinoMem);
+ const CProjectionGeometry3D* projgeom = pSinoMem->getGeometry();
+const CParallelProjectionGeometry3D* par3dgeom = dynamic_cast<const CParallelProjectionGeometry3D*>(projgeom);
+ const CParallelVecProjectionGeometry3D* parvec3dgeom = dynamic_cast<const CParallelVecProjectionGeometry3D*>(projgeom);
+ if (parvec3dgeom || par3dgeom) {
+ // This option is only supported for Par3D currently
+ m_bSIRTWeighting = _cfg.self->getOptionBool("SIRTWeighting", false);
+ CC.markOptionParsed("SIRTWeighting");
+ }
+
// success
m_bIsInitialized = _check();
return m_bIsInitialized;
@@ -181,27 +193,55 @@ void CCudaBackProjectionAlgorithm3D::run(int _iNrIterations)
conegeom->getProjectionAngles(),
m_iGPUIndex, m_iVoxelSuperSampling);
} else if (par3dgeom) {
- astraCudaPar3DBP(pReconMem->getData(), pSinoMem->getDataConst(),
- volgeom.getGridColCount(),
- volgeom.getGridRowCount(),
- volgeom.getGridSliceCount(),
- par3dgeom->getProjectionCount(),
- par3dgeom->getDetectorColCount(),
- par3dgeom->getDetectorRowCount(),
- par3dgeom->getDetectorSpacingX(),
- par3dgeom->getDetectorSpacingY(),
- par3dgeom->getProjectionAngles(),
- m_iGPUIndex, m_iVoxelSuperSampling);
+ if (!m_bSIRTWeighting) {
+ astraCudaPar3DBP(pReconMem->getData(), pSinoMem->getDataConst(),
+ volgeom.getGridColCount(),
+ volgeom.getGridRowCount(),
+ volgeom.getGridSliceCount(),
+ par3dgeom->getProjectionCount(),
+ par3dgeom->getDetectorColCount(),
+ par3dgeom->getDetectorRowCount(),
+ par3dgeom->getDetectorSpacingX(),
+ par3dgeom->getDetectorSpacingY(),
+ par3dgeom->getProjectionAngles(),
+ m_iGPUIndex, m_iVoxelSuperSampling);
+ } else {
+ astraCudaPar3DBP_SIRTWeighted(pReconMem->getData(),
+ pSinoMem->getDataConst(),
+ volgeom.getGridColCount(),
+ volgeom.getGridRowCount(),
+ volgeom.getGridSliceCount(),
+ par3dgeom->getProjectionCount(),
+ par3dgeom->getDetectorColCount(),
+ par3dgeom->getDetectorRowCount(),
+ par3dgeom->getDetectorSpacingX(),
+ par3dgeom->getDetectorSpacingY(),
+ par3dgeom->getProjectionAngles(),
+ m_iGPUIndex, m_iVoxelSuperSampling);
+ }
} else if (parvec3dgeom) {
- astraCudaPar3DBP(pReconMem->getData(), pSinoMem->getDataConst(),
- volgeom.getGridColCount(),
- volgeom.getGridRowCount(),
- volgeom.getGridSliceCount(),
- parvec3dgeom->getProjectionCount(),
- parvec3dgeom->getDetectorColCount(),
- parvec3dgeom->getDetectorRowCount(),
- parvec3dgeom->getProjectionVectors(),
- m_iGPUIndex, m_iVoxelSuperSampling);
+ if (!m_bSIRTWeighting) {
+ astraCudaPar3DBP(pReconMem->getData(), pSinoMem->getDataConst(),
+ volgeom.getGridColCount(),
+ volgeom.getGridRowCount(),
+ volgeom.getGridSliceCount(),
+ parvec3dgeom->getProjectionCount(),
+ parvec3dgeom->getDetectorColCount(),
+ parvec3dgeom->getDetectorRowCount(),
+ parvec3dgeom->getProjectionVectors(),
+ m_iGPUIndex, m_iVoxelSuperSampling);
+ } else {
+ astraCudaPar3DBP_SIRTWeighted(pReconMem->getData(),
+ pSinoMem->getDataConst(),
+ volgeom.getGridColCount(),
+ volgeom.getGridRowCount(),
+ volgeom.getGridSliceCount(),
+ parvec3dgeom->getProjectionCount(),
+ parvec3dgeom->getDetectorColCount(),
+ parvec3dgeom->getDetectorRowCount(),
+ parvec3dgeom->getProjectionVectors(),
+ m_iGPUIndex, m_iVoxelSuperSampling);
+ }
} else if (conevecgeom) {
astraCudaConeBP(pReconMem->getData(), pSinoMem->getDataConst(),
volgeom.getGridColCount(),