7 files changed, 96 insertions, 201 deletions
diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu
index 9a19be0..7cb56f6 100644
--- a/cuda/3d/arith3d.cu
+++ b/cuda/3d/arith3d.cu
@@ -99,14 +99,14 @@ struct opClampMax {
 
 
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -116,14 +116,14 @@ __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, uns
 	}
 }
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -134,14 +134,14 @@ __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned
 }
 
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -151,14 +151,14 @@ __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, uns
 	}
 }
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -168,14 +168,14 @@ __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned
 	}
 }
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -185,14 +185,14 @@ __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, u
 	}
 }
 
-template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+template<class op, unsigned int repeat>
 __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	unsigned int x = threadIdx.x + 16*blockIdx.x;
 	if (x >= width) return;
 
 	unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat;
-	unsigned int off = (y+padY)*pitch+x+padX;
+	unsigned int off = y*pitch+x;
 	for (unsigned int i = 0; i < repeat; ++i) {
 		if (y >= height)
 			break;
@@ -210,7 +210,7 @@ __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2,
 
 
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
@@ -218,12 +218,12 @@ void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsign
 
 	float *pfOut = (float*)out;
 
-	devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
+	devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
@@ -231,12 +231,12 @@ void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int
 
 	float *pfOut = (float*)out;
 
-	devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
+	devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
@@ -245,12 +245,12 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, uns
 	float *pfOut = (float*)out;
 	const float *pfIn = (const float*)in;
 
-	devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
+	devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
@@ -259,12 +259,12 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned
 	float *pfOut = (float*)out;
 	const float *pfIn = (const float*)in;
 
-	devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
+	devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
@@ -274,12 +274,12 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
 	const float *pfIn1 = (const float*)in1;
 	const float *pfIn2 = (const float*)in2;
 
-	devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
+	devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
 
-template<typename op, VolType t>
+template<typename op>
 void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height)
 {
 	dim3 blockSize(16,16);
@@ -289,7 +289,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2
 	const float *pfIn1 = (const float*)in1;
 	const float *pfIn2 = (const float*)in2;
 
-	devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
+	devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
 
 	cudaTextForceKernelsCompletion();
 }
@@ -319,7 +319,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 	}
 
@@ -335,7 +335,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 	}
 
@@ -352,7 +352,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 		pfIn += step;
 	}
@@ -370,7 +370,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 		pfIn += step;
 	}
@@ -389,7 +389,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 		pfIn1 += step;
 		pfIn2 += step;
@@ -409,7 +409,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc
 	unsigned int step = out.pitch/sizeof(float) * dims.iVolY;
 
 	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
-		devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
+		devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY);
 		pfOut += step;
 		pfIn1 += step;
 		pfIn2 += step;
@@ -439,7 +439,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 	}
 
@@ -455,7 +455,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 	}
 
@@ -472,7 +472,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 		pfIn += step;
 	}
@@ -490,7 +490,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam,
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 		pfIn += step;
 	}
@@ -509,7 +509,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 		pfIn1 += step;
 		pfIn2 += step;
@@ -529,7 +529,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
 	unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles;
 
 	for (unsigned int i = 0; i < dims.iProjV; ++i) {
-		devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
+		devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles);
 		pfOut += step;
 		pfIn1 += step;
 		pfIn2 += step;
@@ -556,39 +556,33 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit
 
 
 #define INST_DFtoD(name) \
-  template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
 
 #define INST_DtoD(name) \
-  template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
 
 #define INST_DDtoD(name) \
-  template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
 
 #define INST_DDFtoD(name) \
-  template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
 
 
 #define INST_toD(name) \
-  template void processVol<name, VOL>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims);
 
 #define INST_FtoD(name) \
-  template void processVol<name, VOL>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
-  template void processVol<name, SINO>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
   template void processVol3D<name>(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); \
   template void processSino3D<name>(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims);
 
diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu
index 7f8e320..1b0154d 100644
--- a/cuda/3d/cone_bp.cu
+++ b/cuda/3d/cone_bp.cu
@@ -73,9 +73,9 @@ bool bindProjDataTexture(const cudaArray* array)
 {
 	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 
-	gT_coneProjTexture.addressMode[0] = cudaAddressModeClamp;
-	gT_coneProjTexture.addressMode[1] = cudaAddressModeClamp;
-	gT_coneProjTexture.addressMode[2] = cudaAddressModeClamp;
+	gT_coneProjTexture.addressMode[0] = cudaAddressModeBorder;
+	gT_coneProjTexture.addressMode[1] = cudaAddressModeBorder;
+	gT_coneProjTexture.addressMode[2] = cudaAddressModeBorder;
 	gT_coneProjTexture.filterMode = cudaFilterModeLinear;
 	gT_coneProjTexture.normalized = false;
 
@@ -148,8 +148,8 @@ __global__ void dev_cone_BP(void* D_volData, unsigned int volPitch, int startAng
 			const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz;
 			const float fDen = fCdc + fX * fCdx + fY * fCdy + fZ * fCdz;
 
-			const float fU = fUNum / fDen + 1.0f;
-			const float fV = fVNum / fDen + 1.0f;
+			const float fU = fUNum / fDen;
+			const float fV = fVNum / fDen;
 
 			fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV);
 
@@ -230,8 +230,8 @@ __global__ void dev_cone_BP_SS(void* D_volData, unsigned int volPitch, int start
 				const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz;
 				const float fDen = fCdc + fXs * fCdx + fYs * fCdy + fZs * fCdz;
 
-				const float fU = fUNum / fDen + 1.0f;
-				const float fV = fVNum / fDen + 1.0f;
+				const float fU = fUNum / fDen;
+				const float fV = fVNum / fDen;
 
 				fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV);
 
diff --git a/cuda/3d/cone_fp.cu b/cuda/3d/cone_fp.cu
index 40dca4f..fa308b9 100644
--- a/cuda/3d/cone_fp.cu
+++ b/cuda/3d/cone_fp.cu
@@ -72,9 +72,9 @@ bool bindVolumeDataTexture(const cudaArray* array)
 {
 	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 
-	gT_coneVolumeTexture.addressMode[0] = cudaAddressModeClamp;
-	gT_coneVolumeTexture.addressMode[1] = cudaAddressModeClamp;
-	gT_coneVolumeTexture.addressMode[2] = cudaAddressModeClamp;
+	gT_coneVolumeTexture.addressMode[0] = cudaAddressModeBorder;
+	gT_coneVolumeTexture.addressMode[1] = cudaAddressModeBorder;
+	gT_coneVolumeTexture.addressMode[2] = cudaAddressModeBorder;
 	gT_coneVolumeTexture.filterMode = cudaFilterModeLinear;
 	gT_coneVolumeTexture.normalized = false;
 
@@ -142,9 +142,9 @@ bool bindVolumeDataTexture(const cudaArray* array)
                                                                                                                    \
 		float fVal = 0.0f;                                                                                         \
                                                                                                                    \
-		float f##c0 = startSlice + 1.5f;                                                                           \
-		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \
-		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \
+		float f##c0 = startSlice + 0.5f;                                                                           \
+		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 0.5f; \
+		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 0.5f; \
                                                                                                                    \
 		for (int s = startSlice; s < endSlice; ++s)                                                                \
 		{                                                                                                          \
@@ -218,9 +218,9 @@ bool bindVolumeDataTexture(const cudaArray* array)
                                                                                                                    \
 		float fVal = 0.0f;                                                                                         \
                                                                                                                    \
-		float f##c0 = startSlice + 1.5f;                                                                           \
-		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \
-		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \
+		float f##c0 = startSlice + 0.5f;                                                                           \
+		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 0.5f; \
+		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 0.5f; \
                                                                                                                    \
 		for (int s = startSlice; s < endSlice; ++s)                                                                \
 		{                                                                                                          \
diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu
index d439d9b..883187d 100644
--- a/cuda/3d/fdk.cu
+++ b/cuda/3d/fdk.cu
@@ -71,9 +71,9 @@ static bool bindProjDataTexture(const cudaArray* array)
 {
 	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 
-	gT_coneProjTexture.addressMode[0] = cudaAddressModeClamp;
-	gT_coneProjTexture.addressMode[1] = cudaAddressModeClamp;
-	gT_coneProjTexture.addressMode[2] = cudaAddressModeClamp;
+	gT_coneProjTexture.addressMode[0] = cudaAddressModeBorder;
+	gT_coneProjTexture.addressMode[1] = cudaAddressModeBorder;
+	gT_coneProjTexture.addressMode[2] = cudaAddressModeBorder;
 	gT_coneProjTexture.filterMode = cudaFilterModeLinear;
 	gT_coneProjTexture.normalized = false;
 
@@ -120,8 +120,8 @@ __global__ void devBP_FDK(void* D_volData, unsigned int volPitch, int startAngle
 	float fY = Y - 0.5f*dims.iVolY + 0.5f;
 	float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - fSrcZ;
 
-	const float fU_base = 0.5f*dims.iProjU - 0.5f + 1.5f;
-	const float fV_base = 0.5f*dims.iProjV - 0.5f + 1.5f + (fDetZ-fSrcZ);
+	const float fU_base = 0.5f*dims.iProjU - 0.5f + 0.5f;
+	const float fV_base = 0.5f*dims.iProjV - 0.5f + 0.5f + (fDetZ-fSrcZ);
 
 	// Note re. fZ/rV_base: the computations below are all relative to the
 	// optical axis, so we do the Z-adjustments beforehand.
diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu
index 872b1eb..9bf9a9f 100644
--- a/cuda/3d/par3d_bp.cu
+++ b/cuda/3d/par3d_bp.cu
@@ -69,9 +69,9 @@ static bool bindProjDataTexture(const cudaArray* array)
 {
 	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 
-	gT_par3DProjTexture.addressMode[0] = cudaAddressModeClamp;
-	gT_par3DProjTexture.addressMode[1] = cudaAddressModeClamp;
-	gT_par3DProjTexture.addressMode[2] = cudaAddressModeClamp;
+	gT_par3DProjTexture.addressMode[0] = cudaAddressModeBorder;
+	gT_par3DProjTexture.addressMode[1] = cudaAddressModeBorder;
+	gT_par3DProjTexture.addressMode[2] = cudaAddressModeBorder;
 	gT_par3DProjTexture.filterMode = cudaFilterModeLinear;
 	gT_par3DProjTexture.normalized = false;
 
@@ -139,8 +139,8 @@ __global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAn
 			const float fUNum = fCuc + fX * fCux + fY * fCuy + fZ * fCuz;
 			const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz;
 
-			const float fU = fUNum + 1.0f;
-			const float fV = fVNum + 1.0f;
+			const float fU = fUNum;
+			const float fV = fVNum;
 
 			fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order
 
@@ -216,8 +216,8 @@ __global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, int star
 				const float fUNum = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz;
 				const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz;
 
-				const float fU = fUNum + 1.0f;
-				const float fV = fVNum + 1.0f;
+				const float fU = fUNum;
+				const float fV = fVNum;
 
 				fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order
 				fZs += fSubStep;
diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu
index 6bf9037..e0f743c 100644
--- a/cuda/3d/par3d_fp.cu
+++ b/cuda/3d/par3d_fp.cu
@@ -72,9 +72,9 @@ static bool bindVolumeDataTexture(const cudaArray* array)
 {
 	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 
-	gT_par3DVolumeTexture.addressMode[0] = cudaAddressModeClamp;
-	gT_par3DVolumeTexture.addressMode[1] = cudaAddressModeClamp;
-	gT_par3DVolumeTexture.addressMode[2] = cudaAddressModeClamp;
+	gT_par3DVolumeTexture.addressMode[0] = cudaAddressModeBorder;
+	gT_par3DVolumeTexture.addressMode[1] = cudaAddressModeBorder;
+	gT_par3DVolumeTexture.addressMode[2] = cudaAddressModeBorder;
 	gT_par3DVolumeTexture.filterMode = cudaFilterModeLinear;
 	gT_par3DVolumeTexture.normalized = false;
 
@@ -144,9 +144,9 @@ static bool bindVolumeDataTexture(const cudaArray* array)
                                                                                                                  \
 		float fVal = 0.0f;                                                                                       \
                                                                                                                  \
-		float f##c0 = startSlice + 1.5f;                                                                         \
-		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\
-		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\
+		float f##c0 = startSlice + 0.5f;                                                                         \
+		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 0.5f;\
+		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 0.5f;\
                                                                                                                  \
 		for (int s = startSlice; s < endSlice; ++s)                                                              \
 		{                                                                                                        \
@@ -226,9 +226,9 @@ static bool bindVolumeDataTexture(const cudaArray* array)
                                                                                                                  \
 		float fVal = 0.0f;                                                                                       \
                                                                                                                  \
-		float f##c0 = startSlice + 1.5f;                                                                         \
-		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\
-		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\
+		float f##c0 = startSlice + 0.5f;                                                                         \
+		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 0.5f;\
+		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 0.5f;\
                                                                                                                  \
 		for (int s = startSlice; s < endSlice; ++s)                                                              \
 		{                                                                                                        \
@@ -281,16 +281,16 @@ PAR3D_FP_SS_BODY(Z,X,Y)
 
 
 __device__ float dirWeights(float fX, float fN) {
-	if (fX <= 0.5f) // outside image on left
+	if (fX <= -0.5f) // outside image on left
 		return 0.0f;
-	if (fX <= 1.5f) // half outside image on left
-		return (fX - 0.5f) * (fX - 0.5f);
-	if (fX <= fN + 0.5f) { // inside image
-		float t = fX - 0.5f - floorf(fX - 0.5f);
+	if (fX <= 0.5f) // half outside image on left
+		return (fX + 0.5f) * (fX + 0.5f);
+	if (fX <= fN - 0.5f) { // inside image
+		float t = fX + 0.5f - floorf(fX + 0.5f);
 		return t*t + (1-t)*(1-t);
 	}
-	if (fX <= fN + 1.5f) // half outside image on right
-		return (fN + 1.5f - fX) * (fN + 1.5f - fX);
+	if (fX <= fN + 0.5f) // half outside image on right
+		return (fN + 0.5f - fX) * (fN + 0.5f - fX);
 	return 0.0f; // outside image on right
 }
 
@@ -346,9 +346,9 @@ __device__ float dirWeights(float fX, float fN) {
                                                                                                                  \
 		float fVal = 0.0f;                                                                                       \
                                                                                                                  \
-		float f##c0 = startSlice + 1.5f;                                                                         \
-		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\
-		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\
+		float f##c0 = startSlice + 0.5f;                                                                         \
+		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 0.5f;\
+		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 0.5f;\
                                                                                                                  \
 		for (int s = startSlice; s < endSlice; ++s)                                                              \
 		{                                                                                                        \
diff --git a/cuda/3d/util3d.cu b/cuda/3d/util3d.cu
index 6dc79c7..cf40fdc 100644
--- a/cuda/3d/util3d.cu
+++ b/cuda/3d/util3d.cu
@@ -292,16 +292,14 @@ bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src,
 // TODO: Consider using a single array of size max(proj,volume) (per dim)
 //       instead of allocating a new one each time
 
-// TODO: Figure out a faster way of zeroing the padding?
-
 cudaArray* allocateVolumeArray(const SDimensions3D& dims)
 {
 	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 	cudaArray* cuArray;
 	cudaExtent extentA;
-	extentA.width = dims.iVolX+2;
-	extentA.height = dims.iVolY+2;
-	extentA.depth = dims.iVolZ+2;
+	extentA.width = dims.iVolX;
+	extentA.height = dims.iVolY;
+	extentA.depth = dims.iVolZ;
 	cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
 	if (err != cudaSuccess) {
 		astraCUDA::reportCudaError(err);
@@ -309,8 +307,6 @@ cudaArray* allocateVolumeArray(const SDimensions3D& dims)
 		return 0;
 	}
 
-	zeroVolumeArray(cuArray, dims);
-
 	return cuArray;
 }
 cudaArray* allocateProjectionArray(const SDimensions3D& dims)
@@ -318,9 +314,9 @@ cudaArray* allocateProjectionArray(const SDimensions3D& dims)
 	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
 	cudaArray* cuArray;
 	cudaExtent extentA;
-	extentA.width = dims.iProjU+2;
+	extentA.width = dims.iProjU;
 	extentA.height = dims.iProjAngles;
-	extentA.depth = dims.iProjV+2;
+	extentA.depth = dims.iProjV;
 	cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
 
 	if (err != cudaSuccess) {
@@ -329,101 +325,8 @@ cudaArray* allocateProjectionArray(const SDimensions3D& dims)
 		return 0;
 	}
 
-	zeroProjectionArray(cuArray, dims);
-
 	return cuArray;
 }
-bool zeroVolumeArray(cudaArray* array, const SDimensions3D& dims)
-{
-	cudaPitchedPtr zeroBuf;
-	cudaExtent extentS;
-	extentS.width = sizeof(float)*(dims.iVolX+2);
-	extentS.height = dims.iVolY+2;
-	extentS.depth = 1;
-
-	cudaExtent extentA;
-	extentA.width = dims.iVolX+2;
-	extentA.height = dims.iVolY+2;
-	extentA.depth = 1;
-
-
-
-	cudaError err;
-	err = cudaMalloc3D(&zeroBuf, extentS);
-	ASTRA_CUDA_ASSERT(err);
-	err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iVolX+2), dims.iVolY+2);
-	ASTRA_CUDA_ASSERT(err);
-	
-	// zero array
-	for (unsigned int i = 0; i < dims.iVolZ+2; ++i) {
-		cudaMemcpy3DParms p;
-		cudaPos zp = {0, 0, 0};
-		cudaPos dp = {0, 0, i};
-		p.srcArray = 0;
-		p.srcPos = zp;
-		p.srcPtr = zeroBuf;
-		p.dstArray = array;
-		p.dstPtr.ptr = 0;
-		p.dstPtr.pitch = 0;
-		p.dstPtr.xsize = 0;
-		p.dstPtr.ysize = 0;
-		p.dstPos = dp;
-		p.extent = extentA;
-		p.kind = cudaMemcpyDeviceToDevice;
-
-		err = cudaMemcpy3D(&p);
-		ASTRA_CUDA_ASSERT(err);
-	}
-	cudaFree(zeroBuf.ptr);
-
-	// TODO: check errors
-
-	return true;
-}
-bool zeroProjectionArray(cudaArray* array, const SDimensions3D& dims)
-{
-	cudaPitchedPtr zeroBuf;
-	cudaExtent extentS;
-	extentS.width = sizeof(float)*(dims.iProjU+2);
-	extentS.height = dims.iProjAngles;
-	extentS.depth = 1;
-	cudaExtent extentA;
-	extentA.width = dims.iProjU+2;
-	extentA.height = dims.iProjAngles;
-	extentA.depth = 1;
-
-
-	cudaError err;
-	err = cudaMalloc3D(&zeroBuf, extentS);
-	ASTRA_CUDA_ASSERT(err);
-	err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iProjU+2), dims.iProjAngles);
-	ASTRA_CUDA_ASSERT(err);
-
-	for (unsigned int i = 0; i < dims.iProjV+2; ++i) {
-		cudaMemcpy3DParms p;
-		cudaPos zp = {0, 0, 0};
-		cudaPos dp = {0, 0, i};
-		p.srcArray = 0;
-		p.srcPos = zp;
-		p.srcPtr = zeroBuf;
-		p.dstArray = array;
-		p.dstPtr.ptr = 0;
-		p.dstPtr.pitch = 0;
-		p.dstPtr.xsize = 0;
-		p.dstPtr.ysize = 0;
-		p.dstPos = dp;
-		p.extent = extentA;
-		p.kind = cudaMemcpyDeviceToDevice;
-
-		err = cudaMemcpy3D(&p);
-		ASTRA_CUDA_ASSERT(err);
-	}
-	cudaFree(zeroBuf.ptr);
-
-	// TODO: check errors
-	return true;
-}
-
 
 bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const SDimensions3D& dims)
 {
@@ -434,7 +337,6 @@ bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const
 
 	cudaMemcpy3DParms p;
 	cudaPos zp = {0, 0, 0};
-	cudaPos dp = {1, 1, 1};
 	p.srcArray = 0;
 	p.srcPos = zp;
 	p.srcPtr = D_volumeData;
@@ -443,7 +345,7 @@ bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const
 	p.dstPtr.pitch = 0;
 	p.dstPtr.xsize = 0;
 	p.dstPtr.ysize = 0;
-	p.dstPos = dp;
+	p.dstPos = zp;
 	p.extent = extentA;
 	p.kind = cudaMemcpyDeviceToDevice;
 
@@ -462,7 +364,6 @@ bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, con
 
 	cudaMemcpy3DParms p;
 	cudaPos zp = {0, 0, 0};
-	cudaPos dp = {1, 0, 1};
 	p.srcArray = 0;
 	p.srcPos = zp;
 	p.srcPtr = D_projData;
@@ -471,7 +372,7 @@ bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, con
 	p.dstPtr.pitch = 0;
 	p.dstPtr.xsize = 0;
 	p.dstPtr.ysize = 0;
-	p.dstPos = dp;
+	p.dstPos = zp;
 	p.extent = extentA;
 	p.kind = cudaMemcpyDeviceToDevice;