28 files changed, 288 insertions, 374 deletions
diff --git a/cuda/2d/algo.cu b/cuda/2d/algo.cu
index be15b25..c211729 100644
--- a/cuda/2d/algo.cu
+++ b/cuda/2d/algo.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu
index 62eb8c2..45622d0 100644
--- a/cuda/2d/arith.cu
+++ b/cuda/2d/arith.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -451,7 +451,7 @@ void processData(float* pfOut, unsigned int pitch, unsigned int width, unsigned
 
 	devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -462,7 +462,7 @@ void processData(float* pfOut, float fParam, unsigned int pitch, unsigned int wi
 
 	devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -473,7 +473,7 @@ void processData(float* pfOut1, float* pfOut2, float fParam1, float fParam2, uns
 
 	devFFtoDD<op, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 
@@ -485,7 +485,7 @@ void processData(float* pfOut, const float* pfIn, unsigned int pitch, unsigned i
 
 	devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -496,7 +496,7 @@ void processData(float* pfOut, const float* pfIn, float fParam, unsigned int pit
 
 	devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -507,7 +507,7 @@ void processData(float* pfOut, const float* pfIn1, const float* pfIn2, float fPa
 
 	devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -518,7 +518,7 @@ void processData(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned
 
 	devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 
diff --git a/cuda/2d/astra.cu b/cuda/2d/astra.cu
index 7ff1c95..4752b5f 100644
--- a/cuda/2d/astra.cu
+++ b/cuda/2d/astra.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
diff --git a/cuda/2d/cgls.cu b/cuda/2d/cgls.cu
index e7238b9..9c2df68 100644
--- a/cuda/2d/cgls.cu
+++ b/cuda/2d/cgls.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
diff --git a/cuda/2d/darthelper.cu b/cuda/2d/darthelper.cu
index b466840..48ec4b0 100644
--- a/cuda/2d/darthelper.cu
+++ b/cuda/2d/darthelper.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
diff --git a/cuda/2d/em.cu b/cuda/2d/em.cu
index df140ec..203032c 100644
--- a/cuda/2d/em.cu
+++ b/cuda/2d/em.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu
index 7bba302..2068d03 100644
--- a/cuda/2d/fan_bp.cu
+++ b/cuda/2d/fan_bp.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -322,13 +322,12 @@ bool FanBP_internal(float* D_volumeData, unsigned int volumePitch,
 		else
 			devFanBP<false><<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 	}
-	cudaThreadSynchronize();
 
-	cudaTextForceKernelsCompletion();
+	ok = checkCuda(cudaStreamSynchronize(stream), "FanBP");
 
 	cudaStreamDestroy(stream);
 
-	return true;
+	return ok;
 }
 
 bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch,
@@ -354,13 +353,12 @@ bool FanBP_FBPWeighted_internal(float* D_volumeData, unsigned int volumePitch,
 	for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
 		devFanBP<true><<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 	}
-	cudaThreadSynchronize();
 
-	cudaTextForceKernelsCompletion();
+	ok = checkCuda(cudaStreamSynchronize(stream), "FanBP_FBPWeighted");
 
 	cudaStreamDestroy(stream);
 
-	return true;
+	return ok;
 }
 
 // D_projData is a pointer to one padded sinogram line
@@ -382,11 +380,8 @@ bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,
 	             (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
 
 	devFanBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, dims, fOutputScale);
-	cudaThreadSynchronize();
 
-	cudaTextForceKernelsCompletion();
-
-	return true;
+	return checkCuda(cudaThreadSynchronize(), "FanBP_SART");
 }
 
 bool FanBP(float* D_volumeData, unsigned int volumePitch,
diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu
index 60c02f8..342ca4c 100644
--- a/cuda/2d/fan_fp.cu
+++ b/cuda/2d/fan_fp.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -268,16 +268,17 @@ bool FanFP_internal(float* D_volumeData, unsigned int volumePitch,
 	for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
 		FanFPvertical<<<dimGrid, dimBlock, 0, stream2>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
 
-	cudaStreamDestroy(stream1);
-	cudaStreamDestroy(stream2);
+	bool ok = true;
 
-	cudaThreadSynchronize();
+	ok &= checkCuda(cudaStreamSynchronize(stream1), "fan_fp hor");
+	cudaStreamDestroy(stream1);
 
-	cudaTextForceKernelsCompletion();
+	ok &= checkCuda(cudaStreamSynchronize(stream2), "fan_fp ver");
+	cudaStreamDestroy(stream2);
 
 	cudaFreeArray(D_dataArray);
 
-	return true;
+	return ok;
 }
 
 bool FanFP(float* D_volumeData, unsigned int volumePitch,
diff --git a/cuda/2d/fbp.cu b/cuda/2d/fbp.cu
index 4fc3983..7acbf6d 100644
--- a/cuda/2d/fbp.cu
+++ b/cuda/2d/fbp.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
diff --git a/cuda/2d/fft.cu b/cuda/2d/fft.cu
index 8361ad2..08acfd4 100644
--- a/cuda/2d/fft.cu
+++ b/cuda/2d/fft.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -40,33 +40,18 @@ along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
 
 using namespace astra;
 
-// TODO: evaluate what we want to do in these situations:
-
-#define CHECK_ERROR(errorMessage) do {                                     \
-  cudaError_t err = cudaThreadSynchronize();                               \
-  if( cudaSuccess != err) {                                                \
-      ASTRA_ERROR("Cuda error %s : %s",                                    \
-              errorMessage,cudaGetErrorString( err));                      \
-      exit(EXIT_FAILURE);                                                  \
-  } } while (0)
-
-#define SAFE_CALL( call) do {                                              \
-  cudaError err = call;                                                    \
-  if( cudaSuccess != err) {                                                \
-      ASTRA_ERROR("Cuda error: %s ",                                       \
-              cudaGetErrorString( err));                                   \
-      exit(EXIT_FAILURE);                                                  \
-  }                                                                        \
-  err = cudaThreadSynchronize();                                           \
-  if( cudaSuccess != err) {                                                \
-      ASTRA_ERROR("Cuda error: %s : ",                                     \
-              cudaGetErrorString( err));                                   \
-      exit(EXIT_FAILURE);                                                  \
-  } } while (0)
-
-
 namespace astraCUDA {
 
+bool checkCufft(cufftResult err, const char *msg)
+{
+	if (err != CUFFT_SUCCESS) {
+		ASTRA_ERROR("%s: CUFFT error %d.", msg, err);
+		return false;
+	} else {
+		return true;
+	}
+}
+
 __global__ static void applyFilter_kernel(int _iProjectionCount,
                                           int _iFreqBinCount,
                                           cufftComplex * _pSinogram,
@@ -115,7 +100,8 @@ static void rescaleInverseFourier(int _iProjectionCount, int _iDetectorCount,
 	rescaleInverseFourier_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount,
 	                                                            _iDetectorCount,
 	                                                            _pfInFourierOutput);
-	CHECK_ERROR("rescaleInverseFourier_kernel failed");
+
+	checkCuda(cudaThreadSynchronize(), "rescaleInverseFourier");
 }
 
 void applyFilter(int _iProjectionCount, int _iFreqBinCount,
@@ -128,7 +114,8 @@ void applyFilter(int _iProjectionCount, int _iFreqBinCount,
 	applyFilter_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount,
 	                                                  _iFreqBinCount,
 	                                                  _pSinogram, _pFilter);
-	CHECK_ERROR("applyFilter_kernel failed");
+
+	checkCuda(cudaThreadSynchronize(), "applyFilter");
 }
 
 static bool invokeCudaFFT(int _iProjectionCount, int _iDetectorCount,
@@ -136,24 +123,22 @@ static bool invokeCudaFFT(int _iProjectionCount, int _iDetectorCount,
                           cufftComplex * _pDevTargetComplex)
 {
 	cufftHandle plan;
-	cufftResult result;
 
-	result = cufftPlan1d(&plan, _iDetectorCount, CUFFT_R2C, _iProjectionCount);
-	if(result != CUFFT_SUCCESS)
-	{
-		ASTRA_ERROR("Failed to plan 1d r2c fft");
+	if (!checkCufft(cufftPlan1d(&plan, _iDetectorCount, CUFFT_R2C, _iProjectionCount), "invokeCudaFFT plan")) {
 		return false;
 	}
 
-	result = cufftExecR2C(plan, (cufftReal *)_pfDevSource, _pDevTargetComplex);
-	cufftDestroy(plan);
+	if (!checkCufft(cufftExecR2C(plan, (cufftReal *)_pfDevSource, _pDevTargetComplex), "invokeCudaFFT exec")) {
+		cufftDestroy(plan);
+		return false;
+	}
 
-	if(result != CUFFT_SUCCESS)
-	{
-		ASTRA_ERROR("Failed to exec 1d r2c fft");
+	if (!checkCuda(cudaDeviceSynchronize(), "invokeCudaFFT sync")) {
+		cufftDestroy(plan);
 		return false;
 	}
 
+	cufftDestroy(plan);
 	return true;
 }
 
@@ -162,26 +147,25 @@ static bool invokeCudaIFFT(int _iProjectionCount, int _iDetectorCount,
                            float * _pfDevTarget)
 {
 	cufftHandle plan;
-	cufftResult result;
 
-	result = cufftPlan1d(&plan, _iDetectorCount, CUFFT_C2R, _iProjectionCount);
-	if(result != CUFFT_SUCCESS)
-	{
-		ASTRA_ERROR("Failed to plan 1d c2r fft");
+	if (!checkCufft(cufftPlan1d(&plan, _iDetectorCount, CUFFT_C2R, _iProjectionCount), "invokeCudaIFFT plan")) {
 		return false;
 	}
 
-	// todo: why do we have to get rid of the const qualifier?
-	result = cufftExecC2R(plan, (cufftComplex *)_pDevSourceComplex,
-	                      (cufftReal *)_pfDevTarget);
-	cufftDestroy(plan);
-
-	if(result != CUFFT_SUCCESS)
+	// Getting rid of the const qualifier is due to cufft API issue?
+	if (!checkCufft(cufftExecC2R(plan, (cufftComplex *)_pDevSourceComplex,
+	                      (cufftReal *)_pfDevTarget), "invokeCudaIFFT exec"))
 	{
-		ASTRA_ERROR("Failed to exec 1d c2r fft");
+		cufftDestroy(plan);
 		return false;
 	}
 
+	if (!checkCuda(cudaDeviceSynchronize(), "invokeCudaIFFT sync")) {
+		cufftDestroy(plan);
+		return false;
+	}
+
+	cufftDestroy(plan);
 	return true;
 }
 
@@ -189,14 +173,12 @@ bool allocateComplexOnDevice(int _iProjectionCount, int _iDetectorCount,
                              cufftComplex ** _ppDevComplex)
 {
 	size_t bufferSize = sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount;
-	SAFE_CALL(cudaMalloc((void **)_ppDevComplex, bufferSize));
-	return true;
+	return checkCuda(cudaMalloc((void **)_ppDevComplex, bufferSize), "fft allocateComplexOnDevice");
 }
 
 bool freeComplexOnDevice(cufftComplex * _pDevComplex)
 {
-	SAFE_CALL(cudaFree(_pDevComplex));
-	return true;
+	return checkCuda(cudaFree(_pDevComplex), "fft freeComplexOnDevice");
 }
 
 bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount,
@@ -204,9 +186,7 @@ bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount,
                                 cufftComplex * _pDevComplexTarget)
 {
 	size_t memSize = sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount;
-	SAFE_CALL(cudaMemcpy(_pDevComplexTarget, _pHostComplexSource, memSize, cudaMemcpyHostToDevice));
-
-	return true;
+	return checkCuda(cudaMemcpy(_pDevComplexTarget, _pHostComplexSource, memSize, cudaMemcpyHostToDevice), "fft uploadComplexArrayToDevice");
 }
 
 bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
@@ -217,25 +197,30 @@ bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
 	float * pfDevRealFFTSource = NULL;
 	size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount;
 
-	SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTSource, bufferMemSize));
-	SAFE_CALL(cudaMemset(pfDevRealFFTSource, 0, bufferMemSize));
+	if (!checkCuda(cudaMalloc((void **)&pfDevRealFFTSource, bufferMemSize), "runCudaFFT malloc"))
+		return false;
+	if (!checkCuda(cudaMemset(pfDevRealFFTSource, 0, bufferMemSize), "runCudaFFT memset")) {
+		cudaFree(pfDevRealFFTSource);
+		return false;
+	}
 
 	for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)
 	{
 		const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch;
 		float * pfTargetLocation = pfDevRealFFTSource + iProjectionIndex * _iFFTRealDetectorCount;
 
-		SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice));
+		if (!checkCuda(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice), "runCudaFFT memcpy")) {
+			cudaFree(pfDevRealFFTSource);
+			return false;
+		}
 	}
 
 	bool bResult = invokeCudaFFT(_iProjectionCount, _iFFTRealDetectorCount,
 	                             pfDevRealFFTSource, _pDevTargetComplex);
 	if(!bResult)
-	{
 		return false;
-	}
 
-	SAFE_CALL(cudaFree(pfDevRealFFTSource));
+	cudaFree(pfDevRealFFTSource);
 
 	return true;
 }
@@ -248,7 +233,8 @@ bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
 	float * pfDevRealFFTTarget = NULL;
 	size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount;
 
-	SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTTarget, bufferMemSize));
+	if (!checkCuda(cudaMalloc((void **)&pfDevRealFFTTarget, bufferMemSize), "runCudaIFFT malloc"))
+		return false;
 
 	bool bResult = invokeCudaIFFT(_iProjectionCount, _iFFTRealDetectorCount,
 	                              _pDevSourceComplex, pfDevRealFFTTarget);
@@ -260,17 +246,23 @@ bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
 	rescaleInverseFourier(_iProjectionCount, _iFFTRealDetectorCount,
 	                      pfDevRealFFTTarget);
 
-	SAFE_CALL(cudaMemset(_pfRealTarget, 0, sizeof(float) * _iProjectionCount * _iTargetPitch));
+	if (!checkCuda(cudaMemset(_pfRealTarget, 0, sizeof(float) * _iProjectionCount * _iTargetPitch), "runCudaIFFT memset")) {
+		cudaFree(pfDevRealFFTTarget);
+		return false;
+	}
 
 	for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)
 	{
 		const float * pfSourceLocation = pfDevRealFFTTarget + iProjectionIndex * _iFFTRealDetectorCount;
 		float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch;
 
-		SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice));
+		if (!checkCuda(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice), "runCudaIFFT memcpy")) {
+			cudaFree(pfDevRealFFTTarget);
+			return false;
+		}
 	}
 
-	SAFE_CALL(cudaFree(pfDevRealFFTTarget));
+	cudaFree(pfDevRealFFTTarget);
 
 	return true;
 }
diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu
index f080abb..d7c3ab0 100644
--- a/cuda/2d/par_bp.cu
+++ b/cuda/2d/par_bp.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -231,13 +231,12 @@ bool BP_internal(float* D_volumeData, unsigned int volumePitch,
 		else
 			devBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims, fOutputScale);
 	}
-	cudaThreadSynchronize();
 
-	cudaTextForceKernelsCompletion();
+	bool ok = checkCuda(cudaStreamSynchronize(stream), "par_bp");
 
 	cudaStreamDestroy(stream);
 
-	return true;
+	return ok;
 }
 
 bool BP(float* D_volumeData, unsigned int volumePitch,
@@ -284,11 +283,8 @@ bool BP_SART(float* D_volumeData, unsigned int volumePitch,
 	             (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
 
 	devBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, angle_offset, angle_scaled_sin, angle_scaled_cos, dims, fOutputScale);
-	cudaThreadSynchronize();
-
-	cudaTextForceKernelsCompletion();
 
-	return true;
+	return checkCuda(cudaThreadSynchronize(), "BP_SART");
 }
 
 
diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu
index aac6cc3..e947428 100644
--- a/cuda/2d/par_fp.cu
+++ b/cuda/2d/par_fp.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -305,8 +305,8 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,
 				dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
 				             (dims.iProjDets+g_detBlockSize-1)/g_detBlockSize); // angle blocks, detector blocks
 
-				// TODO: check if we can't immediately
-				//       destroy the stream after use
+				// TODO: consider limiting number of handle (chaotic) geoms
+				//       with many alternating directions
 				cudaStream_t stream;
 				cudaStreamCreate(&stream);
 				streams.push_back(stream);
@@ -323,19 +323,16 @@ bool FP_simple_internal(float* D_volumeData, unsigned int volumePitch,
 		}
 	}
 
-	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
-		cudaStreamDestroy(*iter);
-
-	streams.clear();
-
-	cudaThreadSynchronize();
+	bool ok = true;
 
-	cudaTextForceKernelsCompletion();
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+		ok &= checkCuda(cudaStreamSynchronize(*iter), "par_fp");
+		cudaStreamDestroy(*iter);
+	}
 
 	cudaFreeArray(D_dataArray);
-		
 
-	return true;
+	return ok;
 }
 
 bool FP_simple(float* D_volumeData, unsigned int volumePitch,
diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu
index 12ad6df..89d58c2 100644
--- a/cuda/2d/sart.cu
+++ b/cuda/2d/sart.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -54,7 +54,7 @@ void MUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int
 
 	devMUL_SART<<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), "MUL_SART");
 }
 
 
diff --git a/cuda/2d/sirt.cu b/cuda/2d/sirt.cu
index 2c5fdc9..b251734 100644
--- a/cuda/2d/sirt.cu
+++ b/cuda/2d/sirt.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu
index 533b86c..ac360f0 100644
--- a/cuda/2d/util.cu
+++ b/cuda/2d/util.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -40,12 +40,8 @@ bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch,
 {
 	size_t width = dims.iVolWidth;
 	size_t height = dims.iVolHeight;
-	// TODO: memory order
-	cudaError_t err;
-	err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
-	ASTRA_CUDA_ASSERT(err);
-	assert(err == cudaSuccess);
-	return true;
+
+	return checkCuda(cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice), "copyVolumeToDevice");
 }
 
 bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch,
@@ -54,10 +50,8 @@ bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch,
 {
 	size_t width = dims.iVolWidth;
 	size_t height = dims.iVolHeight;
-	// TODO: memory order
-	cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
-	ASTRA_CUDA_ASSERT(err);
-	return true;
+
+	return checkCuda(cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost), "copyVolumeFromDevice");
 }
 
 
@@ -67,10 +61,8 @@ bool copySinogramFromDevice(float* out_data, unsigned int out_pitch,
 {
 	size_t width = dims.iProjDets;
 	size_t height = dims.iProjAngles;
-	// TODO: memory order
-	cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
-	ASTRA_CUDA_ASSERT(err);
-	return true;
+
+	return checkCuda(cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost), "copySinogramFromDevice");
 }
 
 bool copySinogramToDevice(const float* in_data, unsigned int in_pitch,
@@ -79,20 +71,15 @@ bool copySinogramToDevice(const float* in_data, unsigned int in_pitch,
 {
 	size_t width = dims.iProjDets;
 	size_t height = dims.iProjAngles;
-	// TODO: memory order
-	cudaError_t err;
-	err = cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
-	ASTRA_CUDA_ASSERT(err);
-	return true;
+
+	return checkCuda(cudaMemcpy2D(outD_data, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice), "copySinogramToDevice");
 }
 
 
 bool allocateVolume(float*& ptr, unsigned int width, unsigned int height, unsigned int& pitch)
 {
 	size_t p;
-	cudaError_t ret = cudaMallocPitch((void**)&ptr, &p, sizeof(float)*width, height);
-	if (ret != cudaSuccess) {
-		reportCudaError(ret);
+	if (!checkCuda(cudaMallocPitch((void**)&ptr, &p, sizeof(float)*width, height), "allocateVolume")) {
 		ASTRA_ERROR("Failed to allocate %dx%d GPU buffer", width, height);
 		return false;
 	}
@@ -104,11 +91,9 @@ bool allocateVolume(float*& ptr, unsigned int width, unsigned int height, unsign
 	return true;
 }
 
-void zeroVolume(float* data, unsigned int pitch, unsigned int width, unsigned int height)
+bool zeroVolume(float* data, unsigned int pitch, unsigned int width, unsigned int height)
 {
-	cudaError_t err;
-	err = cudaMemset2D(data, sizeof(float)*pitch, 0, sizeof(float)*width, height);
-	ASTRA_CUDA_ASSERT(err);
+	return checkCuda(cudaMemset2D(data, sizeof(float)*pitch, 0, sizeof(float)*width, height), "zeroVolume");
 }
 
 bool allocateVolumeData(float*& D_ptr, unsigned int& pitch, const SDimensions& dims)
@@ -121,14 +106,14 @@ bool allocateProjectionData(float*& D_ptr, unsigned int& pitch, const SDimension
 	return allocateVolume(D_ptr, dims.iProjDets, dims.iProjAngles, pitch);
 }
 
-void zeroVolumeData(float* D_ptr, unsigned int pitch, const SDimensions& dims)
+bool zeroVolumeData(float* D_ptr, unsigned int pitch, const SDimensions& dims)
 {
-	zeroVolume(D_ptr, pitch, dims.iVolWidth, dims.iVolHeight);
+	return zeroVolume(D_ptr, pitch, dims.iVolWidth, dims.iVolHeight);
 }
 
-void zeroProjectionData(float* D_ptr, unsigned int pitch, const SDimensions& dims)
+bool zeroProjectionData(float* D_ptr, unsigned int pitch, const SDimensions& dims)
 {
-	zeroVolume(D_ptr, pitch, dims.iProjDets, dims.iProjAngles);
+	return zeroVolume(D_ptr, pitch, dims.iProjDets, dims.iProjAngles);
 }
 
 void duplicateVolumeData(float* D_dst, float* D_src, unsigned int pitch, const SDimensions& dims)
@@ -231,7 +216,7 @@ float dotProduct2D(float* D_data, unsigned int pitch,
 	// Step 1: reduce 2D from image to a single vector, taking sum of squares
 
 	reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height);
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), "dotProduct2D reduce2D");
 
 	// Step 2: reduce 1D: add up elements in vector
 	if (bx * by > 512)
@@ -248,31 +233,21 @@ float dotProduct2D(float* D_data, unsigned int pitch,
 	float x;
 	cudaMemcpy(&x, D_res, 4, cudaMemcpyDeviceToHost);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), "dotProduct2D");
 
 	cudaFree(D_buf);
 
 	return x;
 }
 
-
-bool cudaTextForceKernelsCompletion()
+bool checkCuda(cudaError_t err, const char *msg)
 {
-	cudaError_t returnedCudaError = cudaThreadSynchronize();
-
-	if(returnedCudaError != cudaSuccess) {
-		ASTRA_ERROR("Failed to force completion of cuda kernels: %d: %s.", returnedCudaError, cudaGetErrorString(returnedCudaError));
+	if (err != cudaSuccess) {
+		ASTRA_ERROR("%s: CUDA error %d: %s.", msg, err, cudaGetErrorString(err));
 		return false;
+	} else {
+		return true;
 	}
-
-	return true;
 }
 
-void reportCudaError(cudaError_t err)
-{
-	if(err != cudaSuccess)
-		ASTRA_ERROR("CUDA error %d: %s.", err, cudaGetErrorString(err));
-}
-
-
 }
diff --git a/cuda/3d/algo3d.cu b/cuda/3d/algo3d.cu
index 3a83194..4ef2052 100644
--- a/cuda/3d/algo3d.cu
+++ b/cuda/3d/algo3d.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu
index 2f4054e..b495f22 100644
--- a/cuda/3d/arith3d.cu
+++ b/cuda/3d/arith3d.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -225,7 +225,7 @@ void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsign
 
 	devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -238,7 +238,7 @@ void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int
 
 	devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -252,7 +252,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, uns
 
 	devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -266,7 +266,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned
 
 	devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -281,7 +281,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
 
 	devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -296,7 +296,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
 
 	devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 
@@ -328,7 +328,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
 		pfOut += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -344,7 +344,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
 		pfOut += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -362,7 +362,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio
 		pfIn += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -380,7 +380,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c
 		pfIn += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -400,7 +400,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
 		pfIn2 += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -420,7 +420,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
 		pfIn2 += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 
@@ -448,7 +448,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
 		pfOut += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -464,7 +464,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
 		pfOut += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -482,7 +482,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi
 		pfIn += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -500,7 +500,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam,
 		pfIn += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -520,7 +520,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
 		pfIn2 += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 template<typename op>
@@ -540,7 +540,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
 		pfIn2 += step;
 	}
 
-	cudaTextForceKernelsCompletion();
+	checkCuda(cudaThreadSynchronize(), __FUNCTION__);
 }
 
 
diff --git a/cuda/3d/astra3d.cu b/cuda/3d/astra3d.cu
index 51e76cd..3df52c8 100644
--- a/cuda/3d/astra3d.cu
+++ b/cuda/3d/astra3d.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
diff --git a/cuda/3d/cgls3d.cu b/cuda/3d/cgls3d.cu
index 4829574..cbfb422 100644
--- a/cuda/3d/cgls3d.cu
+++ b/cuda/3d/cgls3d.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu
index 3525eb4..e265304 100644
--- a/cuda/3d/cone_bp.cu
+++ b/cuda/3d/cone_bp.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -41,8 +41,7 @@ static texture3D gT_coneProjTexture;
 
 namespace astraCUDA3d {
 
-#define ZSIZE 6
-static const unsigned int g_volBlockZ = ZSIZE;
+static const unsigned int g_volBlockZ = 6;
 
 static const unsigned int g_anglesPerBlock = 32;
 static const unsigned int g_volBlockX = 16;
@@ -77,7 +76,7 @@ bool bindProjDataTexture(const cudaArray* array)
 
 
 //__launch_bounds__(32*16, 4)
-template<bool FDKWEIGHT>
+template<bool FDKWEIGHT, unsigned int ZSIZE>
 __global__ void dev_cone_BP(void* D_volData, unsigned int volPitch, int startAngle,
                             int angleOffset, const astraCUDA3d::SDimensions3D dims,
                             float fOutputScale)
@@ -342,15 +341,25 @@ bool ConeBP_Array(cudaPitchedPtr D_volumeData,
 
 		for (unsigned int i = 0; i < angleCount; i += g_anglesPerBlock) {
 		// printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr); 
-			if (params.bFDKWeighting)
-				dev_cone_BP<true><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
-			else if (params.iRaysPerVoxelDim == 1)
-				dev_cone_BP<false><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
-			else
+			if (params.bFDKWeighting) {
+				if (dims.iVolZ == 1) {
+					dev_cone_BP<true, 1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
+				} else {
+					dev_cone_BP<true, g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
+				}
+			} else if (params.iRaysPerVoxelDim == 1) {
+				if (dims.iVolZ == 1) {
+					dev_cone_BP<false, 1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
+				} else {
+					dev_cone_BP<false, g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
+				}
+			} else
 				dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
 		}
 
-		cudaTextForceKernelsCompletion();
+		// TODO: Consider not synchronizing here, if possible.
+		if (!checkCuda(cudaThreadSynchronize(), "cone_bp"))
+			return false;
 
 		angles = angles + angleCount;
 		// printf("%f\n", toc(t));
diff --git a/cuda/3d/cone_fp.cu b/cuda/3d/cone_fp.cu
index bd607fa..2c3d1f6 100644
--- a/cuda/3d/cone_fp.cu
+++ b/cuda/3d/cone_fp.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -169,6 +169,8 @@ __global__ void cone_FP_t(float* D_projData, unsigned int projPitch,
 	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;
 
 	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+	if (detectorU >= dims.iProjU)
+		return;
 	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
 	int endDetectorV = startDetectorV + g_detBlockV;
 	if (endDetectorV > dims.iProjV)
@@ -245,6 +247,8 @@ __global__ void cone_FP_SS_t(float* D_projData, unsigned int projPitch,
 	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ;
 
 	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+	if (detectorU >= dims.iProjU)
+		return;
 	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
 	int endDetectorV = startDetectorV + g_detBlockV;
 	if (endDetectorV > dims.iProjV)
@@ -402,8 +406,9 @@ bool ConeFP_Array_internal(cudaPitchedPtr D_projData,
 				dim3 dimGrid(
 				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
 (blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
-				// TODO: check if we can't immediately
-				//       destroy the stream after use
+
+				// TODO: consider limiting number of handle (chaotic) geoms
+				//       with many alternating directions
 				cudaStream_t stream;
 				cudaStreamCreate(&stream);
 				streams.push_back(stream);
@@ -446,16 +451,16 @@ bool ConeFP_Array_internal(cudaPitchedPtr D_projData,
 		}
 	}
 
-	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
-		cudaStreamDestroy(*iter);
-
-	streams.clear();
+	bool ok = true;
 
-	cudaTextForceKernelsCompletion();
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+		ok &= checkCuda(cudaStreamSynchronize(*iter), "cone_fp");
+		cudaStreamDestroy(*iter);
+	}
 
 	// printf("%f\n", toc(t));
 
-	return true;
+	return ok;
 }
 
 
diff --git a/cuda/3d/darthelper3d.cu b/cuda/3d/darthelper3d.cu
index d8ccfa6..c3b93c6 100644
--- a/cuda/3d/darthelper3d.cu
+++ b/cuda/3d/darthelper3d.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu
index 456694f..0b8d2ab 100644
--- a/cuda/3d/fdk.cu
+++ b/cuda/3d/fdk.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -176,7 +176,8 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData,
 
 	devFDK_preweight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fZShift, fDetUSize, fDetVSize, dims);
 
-	cudaTextForceKernelsCompletion();
+	if (!checkCuda(cudaThreadSynchronize(), "FDK_PreWeight"))
+		return false;
 
 	if (bShortScan && dims.iProjAngles > 1) {
 		ASTRA_DEBUG("Doing Parker weighting");
@@ -225,9 +226,10 @@ bool FDK_PreWeight(cudaPitchedPtr D_projData,
 
 		devFDK_ParkerWeight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fDetUSize, fCentralFanAngle, dims);
 
+		if (!checkCuda(cudaThreadSynchronize(), "FDK_PreWeight ParkerWeight"))
+			return false;
 	}
 
-	cudaTextForceKernelsCompletion();
 	return true;
 }
 
diff --git a/cuda/3d/mem3d.cu b/cuda/3d/mem3d.cu
index 50cfe75..ad2a0f3 100644
--- a/cuda/3d/mem3d.cu
+++ b/cuda/3d/mem3d.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -58,15 +58,13 @@ struct SMemHandle3D_internal
 int maxBlockDimension()
 {
 	int dev;
-	cudaError_t err = cudaGetDevice(&dev);
-	if (err != cudaSuccess) {
+	if (!checkCuda(cudaGetDevice(&dev), "maxBlockDimension getDevice")) {
 		ASTRA_WARN("Error querying device");
 		return 0;
 	}
 
 	cudaDeviceProp props;
-	err = cudaGetDeviceProperties(&props, dev);
-	if (err != cudaSuccess) {
+	if (!checkCuda(cudaGetDeviceProperties(&props, dev), "maxBlockDimension getDviceProps")) {
 		ASTRA_WARN("Error querying device %d properties", dev);
 		return 0;
 	}
@@ -84,10 +82,7 @@ MemHandle3D allocateGPUMemory(unsigned int x, unsigned int y, unsigned int z, Me
 
 	size_t free = astraCUDA::availableGPUMemory();
 
-	cudaError_t err;
-	err = cudaMalloc3D(&hnd.ptr, make_cudaExtent(sizeof(float)*x, y, z));
-
-	if (err != cudaSuccess) {
+	if (!checkCuda(cudaMalloc3D(&hnd.ptr, make_cudaExtent(sizeof(float)*x, y, z)), "allocateGPUMemory malloc3d")) {
 		return MemHandle3D();
 	}
 
@@ -98,8 +93,7 @@ MemHandle3D allocateGPUMemory(unsigned int x, unsigned int y, unsigned int z, Me
 
 
 	if (zero == INIT_ZERO) {
-		err = cudaMemset3D(hnd.ptr, 0, make_cudaExtent(sizeof(float)*x, y, z));
-		if (err != cudaSuccess) {
+		if (!checkCuda(cudaMemset3D(hnd.ptr, 0, make_cudaExtent(sizeof(float)*x, y, z)), "allocateGPUMemory memset3d")) {
 			cudaFree(hnd.ptr.ptr);
 			return MemHandle3D();
 		}
@@ -116,23 +110,22 @@ bool zeroGPUMemory(MemHandle3D handle, unsigned int x, unsigned int y, unsigned
 {
 	SMemHandle3D_internal& hnd = *handle.d.get();
 	assert(!hnd.arr);
-	cudaError_t err = cudaMemset3D(hnd.ptr, 0, make_cudaExtent(sizeof(float)*x, y, z));
-	return err == cudaSuccess;
+	return checkCuda(cudaMemset3D(hnd.ptr, 0, make_cudaExtent(sizeof(float)*x, y, z)), "zeroGPUMemory");
 }
 
 bool freeGPUMemory(MemHandle3D handle)
 {
 	size_t free = astraCUDA::availableGPUMemory();
-	cudaError_t err;
+	bool ok;
 	if (handle.d->arr)
-		err = cudaFreeArray(handle.d->arr);
+		ok = checkCuda(cudaFreeArray(handle.d->arr), "freeGPUMemory array");
 	else
-		err = cudaFree(handle.d->ptr.ptr);
+		ok = checkCuda(cudaFree(handle.d->ptr.ptr), "freeGPUMemory");
 	size_t free2 = astraCUDA::availableGPUMemory();
 
 	ASTRA_DEBUG("Freeing memory. (Pre: %lu, post: %lu)", free, free2);
 
-	return err == cudaSuccess;
+	return ok;
 }
 
 bool copyToGPUMemory(const float *src, MemHandle3D dst, const SSubDimensions3D &pos)
@@ -160,9 +153,7 @@ bool copyToGPUMemory(const float *src, MemHandle3D dst, const SSubDimensions3D &
 
 	p.kind = cudaMemcpyHostToDevice;
 
-	cudaError_t err = cudaMemcpy3D(&p);
-
-	return err == cudaSuccess;
+	return checkCuda(cudaMemcpy3D(&p), "copyToGPUMemory");
 }
 
 
@@ -197,10 +188,7 @@ bool copyFromGPUMemory(float *dst, MemHandle3D src, const SSubDimensions3D &pos)
 
 	p.kind = cudaMemcpyDeviceToHost;
 
-	cudaError_t err = cudaMemcpy3D(&p);
-
-	return err == cudaSuccess;
-
+	return checkCuda(cudaMemcpy3D(&p), "copyFromGPUMemory");
 }
 
 
@@ -409,9 +397,7 @@ bool copyIntoArray(MemHandle3D handle, MemHandle3D subdata, const SSubDimensions
 
 	p.kind = cudaMemcpyHostToDevice;
 
-	cudaError_t err = cudaMemcpy3D(&p);
-
-	return err == cudaSuccess;
+	return checkCuda(cudaMemcpy3D(&p), "copyIntoArray");
 
 }
 
diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu
index 857a314..1dc75ce 100644
--- a/cuda/3d/par3d_bp.cu
+++ b/cuda/3d/par3d_bp.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -41,8 +41,7 @@ static texture3D gT_par3DProjTexture;
 
 namespace astraCUDA3d {
 
-#define ZSIZE 6
-static const unsigned int g_volBlockZ = ZSIZE;
+static const unsigned int g_volBlockZ = 6;
 
 static const unsigned int g_anglesPerBlock = 32;
 static const unsigned int g_volBlockX = 16;
@@ -77,6 +76,7 @@ static bool bindProjDataTexture(const cudaArray* array)
 }
 
 
+template<unsigned int ZSIZE>
 __global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAngle, int angleOffset, const SDimensions3D dims, float fOutputScale)
 {
 	float* volData = (float*)D_volData;
@@ -281,13 +281,19 @@ bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
 
 		for (unsigned int i = 0; i < angleCount; i += g_anglesPerBlock) {
 			// printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr); 
-			if (params.iRaysPerVoxelDim == 1)
-				dev_par3D_BP<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
-			else
+			if (params.iRaysPerVoxelDim == 1) {
+				if (dims.iVolZ == 1) {
+					dev_par3D_BP<1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
+				} else {
+					dev_par3D_BP<g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, fOutputScale);
+				}
+			} else
 				dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, th, dims, params.iRaysPerVoxelDim, fOutputScale);
 		}
 
-		cudaTextForceKernelsCompletion();
+		// TODO: Consider not synchronizing here, if possible.
+		if (!checkCuda(cudaThreadSynchronize(), "cone_bp"))
+			return false;
 
 		angles = angles + angleCount;
 		// printf("%f\n", toc(t));
diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu
index 0a4a5cc..5daddc1 100644
--- a/cuda/3d/par3d_fp.cu
+++ b/cuda/3d/par3d_fp.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -175,6 +175,8 @@ __global__ void par3D_FP_t(float* D_projData, unsigned int projPitch,
 
 
 	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+	if (detectorU >= dims.iProjU)
+		return;
 	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
 	int endDetectorV = startDetectorV + g_detBlockV;
 	if (endDetectorV > dims.iProjV)
@@ -251,7 +253,10 @@ __global__ void par3D_FP_SS_t(float* D_projData, unsigned int projPitch,
 	const float a2 = c.c2(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ);
 	const float fDistCorr = sc.scale(a1, a2);
 
+
 	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+	if (detectorU >= dims.iProjU)
+		return;
 	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
 	int endDetectorV = startDetectorV + g_detBlockV;
 	if (endDetectorV > dims.iProjV)
@@ -359,6 +364,8 @@ __global__ void par3D_FP_SumSqW_t(float* D_projData, unsigned int projPitch,
 
 
 	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+	if (detectorU >= dims.iProjU)
+		return;
 	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
 	int endDetectorV = startDetectorV + g_detBlockV;
 	if (endDetectorV > dims.iProjV)
@@ -501,8 +508,8 @@ bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,
 				dim3 dimGrid(
 				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
 (blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
-				// TODO: check if we can't immediately
-				//       destroy the stream after use
+				// TODO: consider limiting number of handle (chaotic) geoms
+				//       with many alternating directions
 				cudaStream_t stream;
 				cudaStreamCreate(&stream);
 				streams.push_back(stream);
@@ -545,17 +552,16 @@ bool Par3DFP_Array_internal(cudaPitchedPtr D_projData,
 		}
 	}
 
-	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
-		cudaStreamDestroy(*iter);
-
-	streams.clear();
-
-	cudaTextForceKernelsCompletion();
+	bool ok = true;
 
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+		ok &= checkCuda(cudaStreamSynchronize(*iter), "par3d_fp");
+		cudaStreamDestroy(*iter);
+	}
 
 	// printf("%f\n", toc(t));
 
-	return true;
+	return ok;
 }
 
 bool Par3DFP(cudaPitchedPtr D_volumeData,
@@ -726,17 +732,16 @@ bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData,
 		}
 	}
 
-	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
-		cudaStreamDestroy(*iter);
-
-	streams.clear();
-
-	cudaTextForceKernelsCompletion();
+	bool ok = true;
 
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) {
+		ok &= checkCuda(cudaStreamSynchronize(*iter), "Par3DFP_SumSqW");
+		cudaStreamDestroy(*iter);
+	}
 
 	// printf("%f\n", toc(t));
 
-	return true;
+	return ok;
 }
 
 
diff --git a/cuda/3d/sirt3d.cu b/cuda/3d/sirt3d.cu
index e68bde8..746a96b 100644
--- a/cuda/3d/sirt3d.cu
+++ b/cuda/3d/sirt3d.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
diff --git a/cuda/3d/util3d.cu b/cuda/3d/util3d.cu
index 41eb9d2..71b5668 100644
--- a/cuda/3d/util3d.cu
+++ b/cuda/3d/util3d.cu
@@ -1,7 +1,7 @@
 /*
 -----------------------------------------------------------------------
-Copyright: 2010-2018, imec Vision Lab, University of Antwerp
-           2014-2018, CWI, Amsterdam
+Copyright: 2010-2021, imec Vision Lab, University of Antwerp
+           2014-2021, CWI, Amsterdam
 
 Contact: astra@astra-toolbox.com
 Website: http://www.astra-toolbox.com/
@@ -46,12 +46,9 @@ cudaPitchedPtr allocateVolumeData(const SDimensions3D& dims)
 
 	cudaPitchedPtr volData;
 
-	cudaError err = cudaMalloc3D(&volData, extentV);
-	if (err != cudaSuccess) {
-		astraCUDA::reportCudaError(err);
+	if (!checkCuda(cudaMalloc3D(&volData, extentV), "allocateVolumeData 3D")) {
 		ASTRA_ERROR("Failed to allocate %dx%dx%d GPU buffer", dims.iVolX, dims.iVolY, dims.iVolZ);
 		volData.ptr = 0;
-		// TODO: return 0 somehow?
 	}
 
 	return volData;
@@ -65,12 +62,9 @@ cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims)
 
 	cudaPitchedPtr projData;
 
-	cudaError err = cudaMalloc3D(&projData, extentP);
-	if (err != cudaSuccess) {
-		astraCUDA::reportCudaError(err);
+	if (!checkCuda(cudaMalloc3D(&projData, extentP), "allocateProjectionData 3D")) {
 		ASTRA_ERROR("Failed to allocate %dx%dx%d GPU buffer", dims.iProjU, dims.iProjAngles, dims.iProjV);
 		projData.ptr = 0;
-		// TODO: return 0 somehow?
 	}
 
 	return projData;
@@ -78,11 +72,11 @@ cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims)
 bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims)
 {
 	char* t = (char*)D_data.ptr;
-	cudaError err;
 
 	for (unsigned int z = 0; z < dims.iVolZ; ++z) {
-		err = cudaMemset2D(t, D_data.pitch, 0, dims.iVolX*sizeof(float), dims.iVolY);
-		ASTRA_CUDA_ASSERT(err);
+		if (!checkCuda(cudaMemset2D(t, D_data.pitch, 0, dims.iVolX*sizeof(float), dims.iVolY), "zeroVolumeData 3D")) {
+			return false;
+		}
 		t += D_data.pitch * dims.iVolY;
 	}
 	return true;
@@ -90,11 +84,11 @@ bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims)
 bool zeroProjectionData(cudaPitchedPtr& D_data, const SDimensions3D& dims)
 {
 	char* t = (char*)D_data.ptr;
-	cudaError err;
 
 	for (unsigned int z = 0; z < dims.iProjV; ++z) {
-		err = cudaMemset2D(t, D_data.pitch, 0, dims.iProjU*sizeof(float), dims.iProjAngles);
-		ASTRA_CUDA_ASSERT(err);
+		if (!checkCuda(cudaMemset2D(t, D_data.pitch, 0, dims.iProjU*sizeof(float), dims.iProjAngles), "zeroProjectionData 3D")) {
+			return false;
+		}
 		t += D_data.pitch * dims.iProjAngles;
 	}
 
@@ -128,11 +122,7 @@ bool copyVolumeToDevice(const float* data, cudaPitchedPtr& D_data, const SDimens
 	p.extent = extentV;
 	p.kind = cudaMemcpyHostToDevice;
 
-	cudaError err;
-	err = cudaMemcpy3D(&p);
-	ASTRA_CUDA_ASSERT(err);
-
-	return err == cudaSuccess;
+	return checkCuda(cudaMemcpy3D(&p), "copyVolumeToDevice 3D");
 }
 
 bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
@@ -163,11 +153,7 @@ bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SD
 	p.extent = extentV;
 	p.kind = cudaMemcpyHostToDevice;
 
-	cudaError err;
-	err = cudaMemcpy3D(&p);
-	ASTRA_CUDA_ASSERT(err);
-
-	return err == cudaSuccess;
+	return checkCuda(cudaMemcpy3D(&p), "copyProjectionsToDevice 3D");
 }
 
 bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
@@ -198,12 +184,9 @@ bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDime
 	p.extent = extentV;
 	p.kind = cudaMemcpyDeviceToHost;
 
-	cudaError err;
-	err = cudaMemcpy3D(&p);
-	ASTRA_CUDA_ASSERT(err);
-
-	return err == cudaSuccess;
+	return checkCuda(cudaMemcpy3D(&p), "copyVolumeFromDevice 3D");
 }
+
 bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
 {
 	if (!pitch)
@@ -232,11 +215,7 @@ bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const
 	p.extent = extentV;
 	p.kind = cudaMemcpyDeviceToHost;
 
-	cudaError err;
-	err = cudaMemcpy3D(&p);
-	ASTRA_CUDA_ASSERT(err);
-
-	return err == cudaSuccess;
+	return checkCuda(cudaMemcpy3D(&p), "copyProjectionsFromDevice 3D");
 }
 
 bool duplicateVolumeData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims)
@@ -258,12 +237,9 @@ bool duplicateVolumeData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, con
 	p.extent = extentV;
 	p.kind = cudaMemcpyDeviceToDevice;
 
-	cudaError err;
-	err = cudaMemcpy3D(&p);
-	ASTRA_CUDA_ASSERT(err);
-
-	return err == cudaSuccess;
+	return checkCuda(cudaMemcpy3D(&p), "duplicateVolumeData 3D");
 }
+
 bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims)
 {
 	cudaExtent extentV;
@@ -283,11 +259,7 @@ bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src,
 	p.extent = extentV;
 	p.kind = cudaMemcpyDeviceToDevice;
 
-	cudaError err;
-	err = cudaMemcpy3D(&p);
-	ASTRA_CUDA_ASSERT(err);
-
-	return err == cudaSuccess;
+	return checkCuda(cudaMemcpy3D(&p), "duplicateProjectionData 3D");
 }
 
 
@@ -303,9 +275,8 @@ cudaArray* allocateVolumeArray(const SDimensions3D& dims)
 	extentA.width = dims.iVolX;
 	extentA.height = dims.iVolY;
 	extentA.depth = dims.iVolZ;
-	cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
-	if (err != cudaSuccess) {
-		astraCUDA::reportCudaError(err);
+
+	if (!checkCuda(cudaMalloc3DArray(&cuArray, &channelDesc, extentA), "allocateVolumeArray 3D")) {
 		ASTRA_ERROR("Failed to allocate %dx%dx%d GPU array", dims.iVolX, dims.iVolY, dims.iVolZ);
 		return 0;
 	}
@@ -320,10 +291,8 @@ cudaArray* allocateProjectionArray(const SDimensions3D& dims)
 	extentA.width = dims.iProjU;
 	extentA.height = dims.iProjAngles;
 	extentA.depth = dims.iProjV;
-	cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
 
-	if (err != cudaSuccess) {
-		astraCUDA::reportCudaError(err);
+	if (!checkCuda(cudaMalloc3DArray(&cuArray, &channelDesc, extentA), "allocateProjectionArray 3D")) {
 		ASTRA_ERROR("Failed to allocate %dx%dx%d GPU array", dims.iProjU, dims.iProjAngles, dims.iProjV);
 		return 0;
 	}
@@ -352,12 +321,9 @@ bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const
 	p.extent = extentA;
 	p.kind = cudaMemcpyDeviceToDevice;
 
-	cudaError err = cudaMemcpy3D(&p);
-	ASTRA_CUDA_ASSERT(err);
-	// TODO: check errors
-
-	return true;
+	return checkCuda(cudaMemcpy3D(&p), "transferVolumeToArray 3D");
 }
+
 bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, const SDimensions3D& dims)
 {
 	cudaExtent extentA;
@@ -379,13 +345,9 @@ bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, con
 	p.extent = extentA;
 	p.kind = cudaMemcpyDeviceToDevice;
 
-	cudaError err = cudaMemcpy3D(&p);
-	ASTRA_CUDA_ASSERT(err);
-
-	// TODO: check errors
-
-	return true;
+	return checkCuda(cudaMemcpy3D(&p), "transferProjectionsToArray 3D");
 }
+
 bool transferHostProjectionsToArray(const float *projData, cudaArray* array, const SDimensions3D& dims)
 {
 	cudaExtent extentA;
@@ -413,12 +375,7 @@ bool transferHostProjectionsToArray(const float *projData, cudaArray* array, con
 	p.extent = extentA;
 	p.kind = cudaMemcpyHostToDevice;
 
-	cudaError err = cudaMemcpy3D(&p);
-	ASTRA_CUDA_ASSERT(err);
-
-	// TODO: check errors
-
-	return true;
+	return checkCuda(cudaMemcpy3D(&p), "transferHostProjectionsToArray 3D");
 }
 
 
@@ -430,18 +387,6 @@ float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y,
 }
 
 
-bool cudaTextForceKernelsCompletion()
-{
-	cudaError_t returnedCudaError = cudaThreadSynchronize();
-
-	if(returnedCudaError != cudaSuccess) {
-		ASTRA_ERROR("Failed to force completion of cuda kernels: %d: %s.", returnedCudaError, cudaGetErrorString(returnedCudaError));
-		return false;
-	}
-
-	return true;
-}
-
 int calcNextPowerOfTwo(int _iValue)
 {
 	int iOutput = 1;