From a70ad8df8fc2a3da63fc91dd18bbfd55be7a89dd Mon Sep 17 00:00:00 2001
From: Willem Jan Palenstijn <Willem.Jan.Palenstijn@cwi.nl>
Date: Thu, 12 Mar 2015 12:30:47 +0100
Subject: Add outputScale argument to 3D CUDA BP

---
 cuda/3d/algo3d.cu   |  7 ++++---
 cuda/3d/algo3d.h    |  3 ++-
 cuda/3d/astra3d.cu  | 12 ++++++------
 cuda/3d/cgls3d.cu   |  4 ++--
 cuda/3d/cone_bp.cu  | 24 +++++++++++++++---------
 cuda/3d/cone_bp.h   |  7 ++++---
 cuda/3d/par3d_bp.cu | 23 ++++++++++++++---------
 cuda/3d/par3d_bp.h  |  6 ++++--
 cuda/3d/sirt3d.cu   |  6 +++---
 9 files changed, 54 insertions(+), 38 deletions(-)

(limited to 'cuda')

diff --git a/cuda/3d/algo3d.cu b/cuda/3d/algo3d.cu
index 7f61280..b775438 100644
--- a/cuda/3d/algo3d.cu
+++ b/cuda/3d/algo3d.cu
@@ -94,12 +94,13 @@ bool ReconAlgo3D::callFP(cudaPitchedPtr& D_volumeData,
 }
 
 bool ReconAlgo3D::callBP(cudaPitchedPtr& D_volumeData,
-                       cudaPitchedPtr& D_projData)
+                       cudaPitchedPtr& D_projData,
+                       float outputScale)
 {
 	if (coneProjs) {
-		return ConeBP(D_volumeData, D_projData, dims, coneProjs);
+		return ConeBP(D_volumeData, D_projData, dims, coneProjs, outputScale);
 	} else {
-		return Par3DBP(D_volumeData, D_projData, dims, par3DProjs);
+		return Par3DBP(D_volumeData, D_projData, dims, par3DProjs, outputScale);
 	}
 }
 
diff --git a/cuda/3d/algo3d.h b/cuda/3d/algo3d.h
index f4c6a87..35ffc49 100644
--- a/cuda/3d/algo3d.h
+++ b/cuda/3d/algo3d.h
@@ -51,7 +51,8 @@ protected:
 	            cudaPitchedPtr& D_projData, 
 	            float outputScale);
 	bool callBP(cudaPitchedPtr& D_volumeData, 
-	            cudaPitchedPtr& D_projData);
+	            cudaPitchedPtr& D_projData,
+	            float outputScale);
 
 	SDimensions3D dims;
 	SConeProjection* coneProjs;
diff --git a/cuda/3d/astra3d.cu b/cuda/3d/astra3d.cu
index b2375f3..7589416 100644
--- a/cuda/3d/astra3d.cu
+++ b/cuda/3d/astra3d.cu
@@ -1252,9 +1252,9 @@ bool astraCudaBP(float* pfVolume, const float* pfProjections,
 	}
 
 	if (pParProjs)
-		ok &= Par3DBP(D_volumeData, D_projData, dims, pParProjs);
+		ok &= Par3DBP(D_volumeData, D_projData, dims, pParProjs, 1.0f);
 	else
-		ok &= ConeBP(D_volumeData, D_projData, dims, pConeProjs);
+		ok &= ConeBP(D_volumeData, D_projData, dims, pConeProjs, 1.0f);
 
 	ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX);
 
@@ -1330,9 +1330,9 @@ bool astraCudaBP_SIRTWeighted(float* pfVolume,
 	processSino3D<opSet>(D_projData, 1.0f, dims);
 
 	if (pParProjs)
-		ok &= Par3DBP(D_pixelWeight, D_projData, dims, pParProjs);
+		ok &= Par3DBP(D_pixelWeight, D_projData, dims, pParProjs, 1.0f);
 	else
-		ok &= ConeBP(D_pixelWeight, D_projData, dims, pConeProjs);
+		ok &= ConeBP(D_pixelWeight, D_projData, dims, pConeProjs, 1.0f);
 
 	processVol3D<opInvert>(D_pixelWeight, dims);
 	if (!ok) {
@@ -1347,9 +1347,9 @@ bool astraCudaBP_SIRTWeighted(float* pfVolume,
 	ok &= zeroVolumeData(D_volumeData, dims);
 	// Do BP into D_volumeData
 	if (pParProjs)
-		ok &= Par3DBP(D_volumeData, D_projData, dims, pParProjs);
+		ok &= Par3DBP(D_volumeData, D_projData, dims, pParProjs, 1.0f);
 	else
-		ok &= ConeBP(D_volumeData, D_projData, dims, pConeProjs);
+		ok &= ConeBP(D_volumeData, D_projData, dims, pConeProjs, 1.0f);
 
 	// Multiply with weights
 	processVol3D<opMul>(D_volumeData, D_pixelWeight, dims);
diff --git a/cuda/3d/cgls3d.cu b/cuda/3d/cgls3d.cu
index 5071a9b..4f632f3 100644
--- a/cuda/3d/cgls3d.cu
+++ b/cuda/3d/cgls3d.cu
@@ -165,7 +165,7 @@ bool CGLS::iterate(unsigned int iterations)
 
 		// p = A'*r
 		zeroVolumeData(D_p, dims);
-		callBP(D_p, D_r);
+		callBP(D_p, D_r, 1.0f);
 		if (useVolumeMask)
 			processVol3D<opMul>(D_p, D_maskData, dims);
 
@@ -195,7 +195,7 @@ bool CGLS::iterate(unsigned int iterations)
 
 		// z = A'*r
 		zeroVolumeData(D_z, dims);
-		callBP(D_z, D_r);
+		callBP(D_z, D_r, 1.0f);
 		if (useVolumeMask)
 			processVol3D<opMul>(D_z, D_maskData, dims);
 
diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu
index 5648d6f..5e67980 100644
--- a/cuda/3d/cone_bp.cu
+++ b/cuda/3d/cone_bp.cu
@@ -78,7 +78,8 @@ bool bindProjDataTexture(const cudaArray* array)
 
 //__launch_bounds__(32*16, 4)
 __global__ void dev_cone_BP(void* D_volData, unsigned int volPitch, int startAngle,
-                            int angleOffset, const astraCUDA3d::SDimensions3D dims)
+                            int angleOffset, const astraCUDA3d::SDimensions3D dims,
+                            float fOutputScale)
 {
 	float* volData = (float*)D_volData;
 
@@ -147,13 +148,13 @@ __global__ void dev_cone_BP(void* D_volData, unsigned int volPitch, int startAng
 		endZ = dims.iVolZ - startZ;
 
 	for(int i=0; i < endZ; i++)
-		volData[((startZ+i)*dims.iVolY+Y)*volPitch+X] += Z[i];
+		volData[((startZ+i)*dims.iVolY+Y)*volPitch+X] += Z[i] * fOutputScale;
 } //End kernel
 
 
 
 // supersampling version
-__global__ void dev_cone_BP_SS(void* D_volData, unsigned int volPitch, int startAngle, int angleOffset, const SDimensions3D dims)
+__global__ void dev_cone_BP_SS(void* D_volData, unsigned int volPitch, int startAngle, int angleOffset, const SDimensions3D dims, float fOutputScale)
 {
 	float* volData = (float*)D_volData;
 
@@ -189,6 +190,9 @@ __global__ void dev_cone_BP_SS(void* D_volData, unsigned int volPitch, int start
 	float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim;
 	const float fSubStep = 1.0f/dims.iRaysPerVoxelDim;
 
+	fOutputScale /= (dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim);
+
+
 	for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f)
 	{
 
@@ -236,14 +240,15 @@ __global__ void dev_cone_BP_SS(void* D_volData, unsigned int volPitch, int start
 
 		}
 
-		volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal / (dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim);
+		volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal * fOutputScale;
 	}
 }
 
 
 bool ConeBP_Array(cudaPitchedPtr D_volumeData,
                   cudaArray *D_projArray,
-                  const SDimensions3D& dims, const SConeProjection* angles)
+                  const SDimensions3D& dims, const SConeProjection* angles,
+                  float fOutputScale)
 {
 	bindProjDataTexture(D_projArray);
 
@@ -291,9 +296,9 @@ bool ConeBP_Array(cudaPitchedPtr D_volumeData,
 		for (unsigned int i = 0; i < angleCount; i += g_anglesPerBlock) {
 		// printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr); 
 			if (dims.iRaysPerVoxelDim == 1)
-				dev_cone_BP<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims);
+				dev_cone_BP<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
 			else
-				dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims);
+				dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
 		}
 
 		cudaTextForceKernelsCompletion();
@@ -309,14 +314,15 @@ bool ConeBP_Array(cudaPitchedPtr D_volumeData,
 
 bool ConeBP(cudaPitchedPtr D_volumeData,
             cudaPitchedPtr D_projData,
-            const SDimensions3D& dims, const SConeProjection* angles)
+            const SDimensions3D& dims, const SConeProjection* angles,
+            float fOutputScale)
 {
 	// transfer projections to array
 
 	cudaArray* cuArray = allocateProjectionArray(dims);
 	transferProjectionsToArray(D_projData, cuArray, dims);
 
-	bool ret = ConeBP_Array(D_volumeData, cuArray, dims, angles);
+	bool ret = ConeBP_Array(D_volumeData, cuArray, dims, angles, fOutputScale);
 
 	cudaFreeArray(cuArray);
 
diff --git a/cuda/3d/cone_bp.h b/cuda/3d/cone_bp.h
index cba6d9f..4d3d2dd 100644
--- a/cuda/3d/cone_bp.h
+++ b/cuda/3d/cone_bp.h
@@ -33,13 +33,14 @@ namespace astraCUDA3d {
 
 _AstraExport bool ConeBP_Array(cudaPitchedPtr D_volumeData,
                   cudaArray *D_projArray,
-                  const SDimensions3D& dims, const SConeProjection* angles);
+                  const SDimensions3D& dims, const SConeProjection* angles,
+                  float fOutputScale);
 
 _AstraExport bool ConeBP(cudaPitchedPtr D_volumeData,
             cudaPitchedPtr D_projData,
-            const SDimensions3D& dims, const SConeProjection* angles);
+            const SDimensions3D& dims, const SConeProjection* angles,
+            float fOutputScale);
          
-
 }
 
 #endif
diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu
index 0c33280..1217949 100644
--- a/cuda/3d/par3d_bp.cu
+++ b/cuda/3d/par3d_bp.cu
@@ -77,7 +77,7 @@ static bool bindProjDataTexture(const cudaArray* array)
 }
 
 
-__global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAngle, int angleOffset, const SDimensions3D dims)
+__global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAngle, int angleOffset, const SDimensions3D dims, float fOutputScale)
 {
 	float* volData = (float*)D_volData;
 
@@ -139,11 +139,11 @@ __global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAn
 		endZ = dims.iVolZ - startZ;
 
 	for(int i=0; i < endZ; i++)
-		volData[((startZ+i)*dims.iVolY+Y)*volPitch+X] += Z[i];
+		volData[((startZ+i)*dims.iVolY+Y)*volPitch+X] += Z[i] * fOutputScale;
 }
 
 // supersampling version
-__global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, int startAngle, int angleOffset, const SDimensions3D dims)
+__global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, int startAngle, int angleOffset, const SDimensions3D dims, float fOutputScale)
 {
 	float* volData = (float*)D_volData;
 
@@ -180,6 +180,9 @@ __global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, int star
 
 	const float fSubStep = 1.0f/dims.iRaysPerVoxelDim;
 
+	fOutputScale /= (dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim);
+
+
 	for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f)
 	{
 
@@ -217,14 +220,15 @@ __global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, int star
 
 		}
 
-		volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal / (dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim);
+		volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal * fOutputScale;
 	}
 
 }
 
 bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
                    cudaArray *D_projArray,
-                   const SDimensions3D& dims, const SPar3DProjection* angles)
+                   const SDimensions3D& dims, const SPar3DProjection* angles,
+                   float fOutputScale)
 {
 	bindProjDataTexture(D_projArray);
 
@@ -271,9 +275,9 @@ bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
 		for (unsigned int i = 0; i < angleCount; i += g_anglesPerBlock) {
 			// printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr); 
 			if (dims.iRaysPerVoxelDim == 1)
-				dev_par3D_BP<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims);
+				dev_par3D_BP<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
 			else
-				dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims);
+				dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
 		}
 
 		cudaTextForceKernelsCompletion();
@@ -288,14 +292,15 @@ bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
 
 bool Par3DBP(cudaPitchedPtr D_volumeData,
             cudaPitchedPtr D_projData,
-            const SDimensions3D& dims, const SPar3DProjection* angles)
+            const SDimensions3D& dims, const SPar3DProjection* angles,
+            float fOutputScale)
 {
 	// transfer projections to array
 
 	cudaArray* cuArray = allocateProjectionArray(dims);
 	transferProjectionsToArray(D_projData, cuArray, dims);
 
-	bool ret = Par3DBP_Array(D_volumeData, cuArray, dims, angles);
+	bool ret = Par3DBP_Array(D_volumeData, cuArray, dims, angles, fOutputScale);
 
 	cudaFreeArray(cuArray);
 
diff --git a/cuda/3d/par3d_bp.h b/cuda/3d/par3d_bp.h
index ece37d1..f1fc62d 100644
--- a/cuda/3d/par3d_bp.h
+++ b/cuda/3d/par3d_bp.h
@@ -33,11 +33,13 @@ namespace astraCUDA3d {
 
 _AstraExport bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
                    cudaArray *D_projArray,
-                   const SDimensions3D& dims, const SPar3DProjection* angles);
+                   const SDimensions3D& dims, const SPar3DProjection* angles,
+                   float fOutputScale);
 
 _AstraExport bool Par3DBP(cudaPitchedPtr D_volumeData,
              cudaPitchedPtr D_projData,
-             const SDimensions3D& dims, const SPar3DProjection* angles);
+             const SDimensions3D& dims, const SPar3DProjection* angles,
+             float fOutputScale);
          
 
 }
diff --git a/cuda/3d/sirt3d.cu b/cuda/3d/sirt3d.cu
index 389ee6b..0e6630a 100644
--- a/cuda/3d/sirt3d.cu
+++ b/cuda/3d/sirt3d.cu
@@ -160,10 +160,10 @@ bool SIRT::precomputeWeights()
 	zeroVolumeData(D_pixelWeight, dims);
 
 	if (useSinogramMask) {
-		callBP(D_pixelWeight, D_smaskData);
+		callBP(D_pixelWeight, D_smaskData, 1.0f);
 	} else {
 		processSino3D<opSet>(D_projData, 1.0f, dims);
-		callBP(D_pixelWeight, D_projData);
+		callBP(D_pixelWeight, D_projData, 1.0f);
 	}
 #if 0
 	float* bufp = new float[512*512];
@@ -293,7 +293,7 @@ bool SIRT::iterate(unsigned int iterations)
 #endif
 
 
-		callBP(D_tmpData, D_projData);
+		callBP(D_tmpData, D_projData, 1.0f);
 #if 0
 	printf("Dumping tmpData: %p\n", (void*)D_tmpData.ptr);
 	float* buf = new float[256*256];
-- 
cgit v1.2.3