57 files changed, 16248 insertions, 0 deletions
diff --git a/cuda/2d/algo.cu b/cuda/2d/algo.cu
new file mode 100644
index 0000000..5ae5d08
--- /dev/null
+++ b/cuda/2d/algo.cu
@@ -0,0 +1,356 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cassert>
+
+#include "algo.h"
+#include "par_fp.h"
+#include "fan_fp.h"
+#include "par_bp.h"
+#include "fan_bp.h"
+#include "util.h"
+#include "arith.h"
+
+namespace astraCUDA {
+
+// Creates an algorithm object with no geometry, no buffers and every
+// optional feature (masks, min/max constraints) switched off.
+// Pitch values and constraint bounds are left unset until the
+// corresponding setter is called.
+ReconAlgo::ReconAlgo()
+	: angles(0),
+	  TOffsets(0),
+	  fanProjs(0),
+	  shouldAbort(false),
+	  freeGPUMemory(false),
+	  D_sinoData(0),
+	  D_volumeData(0),
+	  useVolumeMask(false),
+	  useSinogramMask(false),
+	  D_maskData(0),
+	  D_smaskData(0),
+	  useMinConstraint(false),
+	  useMaxConstraint(false)
+{
+}
+
+// Releases everything this object owns; all cleanup lives in reset().
+ReconAlgo::~ReconAlgo()
+{
+	this->reset();
+}
+
+// Returns the object to its freshly-constructed state.
+// Host-side geometry arrays are always deleted. Device buffers are only
+// freed when freeGPUMemory is set (i.e. after a successful
+// allocateBuffers()); otherwise the pointers are simply forgotten.
+void ReconAlgo::reset()
+{
+	// Host-side geometry
+	delete[] angles;
+	angles = 0;
+	delete[] TOffsets;
+	TOffsets = 0;
+	delete[] fanProjs;
+	fanProjs = 0;
+
+	// Device-side buffers, only if we allocated them ourselves
+	if (freeGPUMemory) {
+		cudaFree(D_maskData);
+		cudaFree(D_smaskData);
+		cudaFree(D_sinoData);
+		cudaFree(D_volumeData);
+	}
+	freeGPUMemory = false;
+
+	D_maskData = D_smaskData = 0;
+	D_sinoData = D_volumeData = 0;
+
+	// Flags
+	shouldAbort = false;
+	useVolumeMask = useSinogramMask = false;
+	useMinConstraint = useMaxConstraint = false;
+}
+
+// Selects the CUDA device with index iGPUIndex.
+// Returns false if the CUDA runtime reports an error afterwards, except
+// for cudaErrorSetOnActiveProcess, which is tolerated.
+bool ReconAlgo::setGPUIndex(int iGPUIndex)
+{
+	cudaSetDevice(iGPUIndex);
+
+	// Fetch (and thereby clear) the runtime's last error.
+	const cudaError_t status = cudaGetLastError();
+
+	// Ignore errors caused by calling cudaSetDevice multiple times
+	const bool acceptable = (status == cudaSuccess) ||
+	                        (status == cudaErrorSetOnActiveProcess);
+	return acceptable;
+}
+
+// Turns on use of a volume mask. This only sets the flag; the mask data
+// itself is supplied later (setVolumeMask or copyDataToGPU).
+// Always returns true.
+bool ReconAlgo::enableVolumeMask()
+{
+	this->useVolumeMask = true;
+
+	return true;
+}
+
+// Turns on use of a sinogram mask. This only sets the flag; the mask
+// data itself is supplied later (setSinogramMask or copyDataToGPU).
+// Always returns true.
+bool ReconAlgo::enableSinogramMask()
+{
+	this->useSinogramMask = true;
+
+	return true;
+}
+
+
+// Configures a geometry described by one angle per projection.
+//   _dims   : problem dimensions; copied into dims.
+//   _angles : array of _dims.iProjAngles floats; copied, the caller keeps
+//             ownership of its own array.
+// Any previously stored fan-beam projections are discarded, so that
+// callFP/callBP take the angle-based path. Always returns true.
+bool ReconAlgo::setGeometry(const SDimensions& _dims, const float* _angles)
+{
+	dims = _dims;
+
+	// Build the new copy before dropping the old one, then release the
+	// old array: calling setGeometry twice used to leak the first array.
+	float* newAngles = new float[dims.iProjAngles];
+	memcpy(newAngles, _angles, sizeof(newAngles[0]) * dims.iProjAngles);
+
+	delete[] angles;
+	angles = newAngles;
+
+	// Angle-based and fan-beam geometry are mutually exclusive.
+	delete[] fanProjs;
+	fanProjs = 0;
+
+	return true;
+}
+
+// Configures a fan-beam geometry.
+//   _dims  : problem dimensions; copied into dims.
+//   _projs : array of _dims.iProjAngles SFanProjection entries; copied,
+//            the caller keeps ownership of its own array.
+// Any previously stored angle array is discarded, so that callFP/callBP
+// take the fan-beam path. Always returns true.
+bool ReconAlgo::setFanGeometry(const SDimensions& _dims,
+                               const SFanProjection* _projs)
+{
+	dims = _dims;
+
+	// Build the new copy before dropping the old one, then release the
+	// old array: calling setFanGeometry twice used to leak the first one.
+	SFanProjection* newProjs = new SFanProjection[dims.iProjAngles];
+	memcpy(newProjs, _projs, sizeof(newProjs[0]) * dims.iProjAngles);
+
+	delete[] fanProjs;
+	fanProjs = newProjs;
+
+	// Angle-based and fan-beam geometry are mutually exclusive.
+	delete[] angles;
+	angles = 0;
+
+	return true;
+}
+
+
+// Stores a private copy of the per-projection offsets.
+//   _TOffsets : array of dims.iProjAngles floats. dims must already be
+//               set, so this has to be called after setGeometry.
+// Always returns true.
+bool ReconAlgo::setTOffsets(const float* _TOffsets)
+{
+	// TODO: determine if they're all zero?
+
+	// Build the new copy first, then release the old array: calling
+	// setTOffsets twice used to leak the first array.
+	float* newOffsets = new float[dims.iProjAngles];
+	// Size the copy by the destination's element type (previously this
+	// was taken from the unrelated 'angles' pointer).
+	memcpy(newOffsets, _TOffsets, sizeof(newOffsets[0]) * dims.iProjAngles);
+
+	delete[] TOffsets;
+	TOffsets = newOffsets;
+
+	return true;
+}
+
+
+
+// Registers an externally provided volume mask buffer and its pitch.
+// Only the pointer is stored; nothing is copied.
+// Asserts that enableVolumeMask() has been called. Always returns true.
+bool ReconAlgo::setVolumeMask(float* _D_maskData, unsigned int _maskPitch)
+{
+	assert(useVolumeMask);
+
+	this->maskPitch = _maskPitch;
+	this->D_maskData = _D_maskData;
+
+	return true;
+}
+
+// Registers an externally provided sinogram mask buffer and its pitch.
+// Only the pointer is stored; nothing is copied.
+// Asserts that enableSinogramMask() has been called. Always returns true.
+bool ReconAlgo::setSinogramMask(float* _D_smaskData, unsigned int _smaskPitch)
+{
+	assert(useSinogramMask);
+
+	this->smaskPitch = _smaskPitch;
+	this->D_smaskData = _D_smaskData;
+
+	return true;
+}
+
+// Registers externally provided volume and projection buffers together
+// with their pitches. Only the pointers are stored; nothing is copied.
+// Always returns true.
+bool ReconAlgo::setBuffers(float* _D_volumeData, unsigned int _volumePitch,
+                      float* _D_projData, unsigned int _projPitch)
+{
+	// Volume
+	this->D_volumeData = _D_volumeData;
+	this->volumePitch = _volumePitch;
+
+	// Projection data (sinogram)
+	this->D_sinoData = _D_projData;
+	this->sinoPitch = _projPitch;
+
+	return true;
+}
+
+// Stores fMin as the minimum constraint and marks it as active.
+// Always returns true.
+bool ReconAlgo::setMinConstraint(float fMin)
+{
+	this->useMinConstraint = true;
+	this->fMinConstraint = fMin;
+
+	return true;
+}
+
+// Stores fMax as the maximum constraint and marks it as active.
+// Always returns true.
+bool ReconAlgo::setMaxConstraint(float fMax)
+{
+	this->useMaxConstraint = true;
+	this->fMaxConstraint = fMax;
+
+	return true;
+}
+
+
+
+// Allocates the device buffers this object needs: the volume, the
+// sinogram and, if enabled, the volume mask and the sinogram mask.
+// Volume-shaped buffers are (iVolWidth+2) x (iVolHeight+2); sinogram-shaped
+// buffers are (iProjDets+2) x iProjAngles.
+// If any allocation fails, the buffers allocated so far are freed, their
+// pointers are reset to 0 and false is returned. On success the object
+// takes ownership (freeGPUMemory is set) and true is returned.
+bool ReconAlgo::allocateBuffers()
+{
+	if (!allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch))
+		return false;
+
+	if (!allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch)) {
+		cudaFree(D_volumeData);
+		D_volumeData = 0;
+		return false;
+	}
+
+	if (useVolumeMask &&
+	    !allocateVolume(D_maskData, dims.iVolWidth+2, dims.iVolHeight+2, maskPitch))
+	{
+		cudaFree(D_volumeData);
+		cudaFree(D_sinoData);
+		D_volumeData = 0;
+		D_sinoData = 0;
+		return false;
+	}
+
+	if (useSinogramMask &&
+	    !allocateVolume(D_smaskData, dims.iProjDets+2, dims.iProjAngles, smaskPitch))
+	{
+		cudaFree(D_volumeData);
+		cudaFree(D_sinoData);
+		cudaFree(D_maskData);
+		D_volumeData = 0;
+		D_sinoData = 0;
+		D_maskData = 0;
+		return false;
+	}
+
+	// From here on reset() is responsible for freeing these buffers.
+	freeGPUMemory = true;
+	return true;
+}
+
+// Uploads the host-side input data into the device buffers.
+//   pfSinogram / iSinogramPitch             : sinogram (required)
+//   fSinogramScale                          : factor the uploaded sinogram
+//                                             is multiplied by on the device
+//   pfReconstruction / iReconstructionPitch : initial volume (required)
+//   pfVolMask / iVolMaskPitch               : required iff the volume mask
+//                                             is enabled
+//   pfSinoMask / iSinoMaskPitch             : required iff the sinogram mask
+//                                             is enabled
+// Returns false if a required pointer is null or a copy fails. Copies
+// that completed before the failure are not undone.
+bool ReconAlgo::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale,
+                              const float* pfReconstruction, unsigned int iReconstructionPitch,
+                              const float* pfVolMask, unsigned int iVolMaskPitch,
+                              const float* pfSinoMask, unsigned int iSinoMaskPitch)
+{
+	// Both the sinogram and the initial reconstruction are mandatory.
+	if (!pfSinogram || !pfReconstruction)
+		return false;
+
+	if (!copySinogramToDevice(pfSinogram, iSinogramPitch,
+	                          dims.iProjDets, dims.iProjAngles,
+	                          D_sinoData, sinoPitch))
+		return false;
+
+	// rescale sinogram to adjust for pixel size
+	// (was once 1.0f/(fPixelSize*fPixelSize); now supplied by the caller)
+	processVol<opMul,SINO>(D_sinoData, fSinogramScale, sinoPitch,
+	                       dims.iProjDets, dims.iProjAngles);
+
+	if (!copyVolumeToDevice(pfReconstruction, iReconstructionPitch,
+	                        dims.iVolWidth, dims.iVolHeight,
+	                        D_volumeData, volumePitch))
+		return false;
+
+	if (useVolumeMask) {
+		if (!pfVolMask ||
+		    !copyVolumeToDevice(pfVolMask, iVolMaskPitch,
+		                        dims.iVolWidth, dims.iVolHeight,
+		                        D_maskData, maskPitch))
+			return false;
+	}
+
+	if (useSinogramMask) {
+		if (!pfSinoMask ||
+		    !copySinogramToDevice(pfSinoMask, iSinoMaskPitch,
+		                          dims.iProjDets, dims.iProjAngles,
+		                          D_smaskData, smaskPitch))
+			return false;
+	}
+
+	return true;
+}
+
+// Downloads the current volume (iVolWidth x iVolHeight) from the device
+// into pfReconstruction, whose row pitch is iReconstructionPitch.
+// Returns whether the copy succeeded.
+bool ReconAlgo::getReconstruction(float* pfReconstruction,
+                                  unsigned int iReconstructionPitch) const
+{
+	return copyVolumeFromDevice(pfReconstruction, iReconstructionPitch,
+	                            dims.iVolWidth, dims.iVolHeight,
+	                            D_volumeData, volumePitch) ? true : false;
+}
+
+
+// Forward-projects the given volume into the given projection buffer,
+// dispatching on the configured geometry: FP() when an angle array is
+// set, FanFP() otherwise. Exactly one of angles/fanProjs is expected to
+// be set (asserted). Note that the parameters shadow the members of the
+// same name. Returns the result of the projector call.
+bool ReconAlgo::callFP(float* D_volumeData, unsigned int volumePitch,
+                       float* D_projData, unsigned int projPitch,
+                       float outputScale)
+{
+	if (!angles) {
+		// fan-beam geometry
+		assert(fanProjs);
+		return FanFP(D_volumeData, volumePitch, D_projData, projPitch,
+		             dims, fanProjs, outputScale);
+	}
+
+	// angle-based geometry
+	assert(!fanProjs);
+	return FP(D_volumeData, volumePitch, D_projData, projPitch,
+	          dims, angles, TOffsets, outputScale);
+}
+
+// Backprojects the given projection buffer into the given volume,
+// dispatching on the configured geometry: BP() when an angle array is
+// set, FanBP() otherwise. Exactly one of angles/fanProjs is expected to
+// be set (asserted). Note that the parameters shadow the members of the
+// same name. Returns the result of the backprojector call.
+bool ReconAlgo::callBP(float* D_volumeData, unsigned int volumePitch,
+                       float* D_projData, unsigned int projPitch)
+{
+	if (!angles) {
+		// fan-beam geometry
+		assert(fanProjs);
+		return FanBP(D_volumeData, volumePitch, D_projData, projPitch,
+		             dims, fanProjs);
+	}
+
+	// angle-based geometry
+	assert(!fanProjs);
+	return BP(D_volumeData, volumePitch, D_projData, projPitch,
+	          dims, angles, TOffsets);
+}
+
+
+
+}
diff --git a/cuda/2d/algo.h b/cuda/2d/algo.h
new file mode 100644
index 0000000..96195a3
--- /dev/null
+++ b/cuda/2d/algo.h
@@ -0,0 +1,155 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_ALGO_H
+#define _CUDA_ALGO_H
+
+#include "util.h"
+
+namespace astraCUDA {
+
+// Abstract base class for the 2D CUDA reconstruction algorithms.
+// It stores the geometry, the device buffer pointers, optional masks and
+// optional min/max constraints, and offers helpers (callFP/callBP) that
+// dispatch to the projector matching the configured geometry.
+// Derived classes implement init(), iterate() and computeDiffNorm().
+//
+// NOTE(review): the class owns raw host arrays (and possibly device
+// buffers) but does not declare copy construction/assignment; copying an
+// instance of a derived class would presumably lead to double frees.
+class _AstraExport ReconAlgo {
+public:
+	ReconAlgo();
+	virtual ~ReconAlgo();
+
+	// Select the CUDA device to use. Returns false on a CUDA error.
+	bool setGPUIndex(int iGPUIndex);
+
+	// Set the geometry, either from one angle per projection or from
+	// explicit fan-beam projections. The arrays are copied. The two
+	// variants are mutually exclusive; the last call wins.
+	bool setGeometry(const SDimensions& dims, const float* angles);
+	bool setFanGeometry(const SDimensions& dims, const SFanProjection* projs);
+
+	// setTOffsets should (optionally) be called after setGeometry
+	bool setTOffsets(const float* TOffsets);
+
+	// Raise the abort flag. It is declared volatile, presumably so it can
+	// be set while iterate() is running -- confirm in the derived classes.
+	void signalAbort() { shouldAbort = true; }
+
+	// Enable use of a volume / sinogram mask. These only set a flag; the
+	// mask data is supplied separately.
+	virtual bool enableVolumeMask();
+	virtual bool enableSinogramMask();
+
+	// init should be called after setting all geometry
+	virtual bool init() = 0;
+
+	// setVolumeMask should be called after init and before iterate,
+	// but only if enableVolumeMask was called before init.
+	// It may be called again after iterate.
+	bool setVolumeMask(float* D_maskData, unsigned int maskPitch);
+
+	// setSinogramMask should be called after init and before iterate,
+	// but only if enableSinogramMask was called before init.
+	// It may be called again after iterate.
+	bool setSinogramMask(float* D_smaskData, unsigned int smaskPitch);
+
+
+	// setBuffers should be called after init and before iterate.
+	// It may be called again after iterate.
+	// Only the pointers are stored; the caller keeps ownership.
+	virtual bool setBuffers(float* D_volumeData, unsigned int volumePitch,
+	                        float* D_projData, unsigned int projPitch);
+
+
+	// instead of calling setBuffers, you can also call allocateBuffers
+	// to let ReconAlgo manage its own GPU memory
+	virtual bool allocateBuffers();
+	// Upload host data into the buffers. The sinogram is multiplied by
+	// fSinogramScale after upload. The mask pointers are only used when
+	// the corresponding mask has been enabled.
+	virtual bool copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale,
+	                           const float* pfReconstruction, unsigned int iReconstructionPitch,
+	                           const float* pfVolMask, unsigned int iVolMaskPitch,
+	                           const float* pfSinoMask, unsigned int iSinoMaskPitch);
+
+
+
+	// set Min/Max constraints. They may be called at any time, and will affect
+	// any iterate() calls afterwards.
+	virtual bool setMinConstraint(float fMin);
+	virtual bool setMaxConstraint(float fMax);
+
+
+	// iterate should be called after init and setBuffers.
+	// It may be called multiple times.
+	virtual bool iterate(unsigned int iterations) = 0;
+
+	// Compute the norm of the difference of the FP of the current
+	// reconstruction and the sinogram. (This performs one FP.)
+	// It can be called after iterate.
+	virtual float computeDiffNorm() = 0;
+	// TODO: computeDiffNorm shouldn't be virtual, but for it to be
+	// implemented in ReconAlgo, it needs a way to get a suitable
+	// temporary sinogram buffer.
+
+	// Download the current volume from the device into pfReconstruction.
+	bool getReconstruction(float* pfReconstruction,
+                           unsigned int iReconstructionPitch) const;
+
+
+
+protected:
+	// Free owned memory and return to the freshly-constructed state.
+	void reset();
+
+	// Forward projection / backprojection using whichever geometry
+	// (angle-based or fan-beam) is currently set.
+	bool callFP(float* D_volumeData, unsigned int volumePitch,
+	            float* D_projData, unsigned int projPitch,
+	            float outputScale);
+	bool callBP(float* D_volumeData, unsigned int volumePitch,
+	            float* D_projData, unsigned int projPitch);
+
+
+	// Geometry. angles and fanProjs are owned host arrays of
+	// dims.iProjAngles entries; at most one of them is non-null.
+	// TOffsets is an optional owned host array of the same length.
+	SDimensions dims;
+	float* angles;
+	float* TOffsets;
+	SFanProjection* fanProjs;
+
+	// Set by signalAbort(), cleared by reset().
+	volatile bool shouldAbort;
+
+	// True when the device buffers below were allocated by
+	// allocateBuffers() and must therefore be freed by reset().
+	bool freeGPUMemory;
+
+	// Input/output
+	float* D_sinoData;
+	unsigned int sinoPitch;
+
+	float* D_volumeData;
+	unsigned int volumePitch;
+
+	// Masks
+	bool useVolumeMask;
+	bool useSinogramMask;
+
+	float* D_maskData;
+	unsigned int maskPitch;
+	float* D_smaskData;
+	unsigned int smaskPitch;
+
+	// Min/max
+	// The bounds are only meaningful while the matching use* flag is set.
+	bool useMinConstraint;
+	bool useMaxConstraint;
+	float fMinConstraint;
+	float fMaxConstraint;
+
+
+};
+
+
+}
+
+#endif
+
diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu
new file mode 100644
index 0000000..1ee02ca
--- /dev/null
+++ b/cuda/2d/arith.cu
@@ -0,0 +1,893 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include "util.h"
+#include "arith.h"
+#include <cassert>
+
+namespace astraCUDA {
+
+
+// out += src * scale
+struct opAddScaled {
+	__device__ void operator()(float& out, const float src, const float scale) {
+		out += src * scale;
+	}
+};
+// out = src + out * scale  (scales the destination, then adds the source)
+struct opScaleAndAdd {
+	__device__ void operator()(float& out, const float src, const float scale) {
+		out = src + out * scale;
+	}
+};
+// out += a * b * scale
+struct opAddMulScaled {
+	__device__ void operator()(float& out, const float a, const float b, const float scale) {
+		out += a * b * scale;
+	}
+};
+// out += a * b
+struct opAddMul {
+	__device__ void operator()(float& out, const float a, const float b) {
+		out += a * b;
+	}
+};
+// out += src
+struct opAdd {
+	__device__ void operator()(float& out, const float src) {
+		out += src;
+	}
+};
+// out += (a + b)
+struct opAdd2 {
+	__device__ void operator()(float& out, const float a, const float b) {
+		out += a + b;
+	}
+};
+// out *= factor
+struct opMul {
+	__device__ void operator()(float& out, const float factor) {
+		out *= factor;
+	}
+};
+// out *= (a * b)
+struct opMul2 {
+	__device__ void operator()(float& out, const float a, const float b) {
+		out *= a * b;
+	}
+};
+// out = num / out, with a guard: when out is not larger than 1e-6 the
+// result is 0 instead of a division by (nearly) zero.
+struct opDividedBy {
+	__device__ void operator()(float& out, const float num) {
+		// out is assumed to be positive
+		out = (out > 0.000001f) ? (num / out) : 0.0f;
+	}
+};
+// out = 1 / out, with a guard: when out is not larger than 1e-6 the
+// result is 0 instead of a division by (nearly) zero.
+struct opInvert {
+	__device__ void operator()(float& out) {
+		// out is assumed to be positive
+		out = (out > 0.000001f) ? (1 / out) : 0.0f;
+	}
+};
+// out = value
+struct opSet {
+	__device__ void operator()(float& out, const float value) {
+		out = value;
+	}
+};
+// Raises out to lower if it is below it (scalar lower bound).
+struct opClampMin {
+	__device__ void operator()(float& out, const float lower) {
+		out = (out < lower) ? lower : out;
+	}
+};
+// Lowers out to upper if it is above it (scalar upper bound).
+struct opClampMax {
+	__device__ void operator()(float& out, const float upper) {
+		out = (out > upper) ? upper : out;
+	}
+};
+// Raises out to lower if it is below it. Same comparison as opClampMin,
+// but the bound is the per-element value of a second buffer.
+struct opClampMinMask {
+	__device__ void operator()(float& out, const float lower) {
+		out = (out < lower) ? lower : out;
+	}
+};
+// Lowers out to upper if it is above it. Same comparison as opClampMax,
+// but the bound is the per-element value of a second buffer.
+struct opClampMaxMask {
+	__device__ void operator()(float& out, const float upper) {
+		out = (out > upper) ? upper : out;
+	}
+};
+// Where the mask is zero, overwrite out with value; elsewhere leave out
+// untouched.
+struct opSetMaskedValues {
+	__device__ void operator()(float& out, const float mask, const float value) {
+		if (mask == 0.0f)
+			out = value;
+	}
+};
+// Where out1 reaches the threshold, replace it with value and clear the
+// matching element of out2 to 0; other elements are left untouched.
+struct opSegmentAndMask {
+	__device__ void operator()(float& out1, float& out2, const float threshold, const float value) {
+		if (!(out1 >= threshold))
+			return;
+
+		out1 = value;
+		out2 = 0.0f;
+	}
+};
+// out *= factor, but only where mask is strictly positive.
+struct opMulMask {
+	__device__ void operator()(float& out, const float mask, const float factor) {
+		if (!(mask > 0.0f))
+			return;
+
+		out *= factor;
+	}
+};
+
+
+
+// Element-wise kernel: op()(out) on a width x height region of pfOut.
+// Written for 16x16 thread blocks (the factor 16 is hard-coded). Each
+// thread owns one column and `repeat` consecutive rows starting at
+// (threadIdx.y + 16*blockIdx.y)*repeat. pitch is in floats; padX/padY
+// shift the accessed elements by that many columns/rows in the buffer.
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	const unsigned int col = threadIdx.x + 16*blockIdx.x;
+	if (col >= width)
+		return;
+
+	unsigned int row = (threadIdx.y + 16*blockIdx.y)*repeat;
+	unsigned int idx = (row+padY)*pitch+col+padX;
+
+	// Stop after `repeat` rows or at the bottom edge, whichever is first.
+	for (unsigned int n = 0; n < repeat && row < height; ++n, ++row, idx += pitch)
+		op()(pfOut[idx]);
+}
+
+// Element-wise kernel: op()(out, fParam) on a width x height region.
+// Written for 16x16 thread blocks (the factor 16 is hard-coded). Each
+// thread owns one column and `repeat` consecutive rows starting at
+// (threadIdx.y + 16*blockIdx.y)*repeat. pitch is in floats; padX/padY
+// shift the accessed elements by that many columns/rows in the buffer.
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	const unsigned int col = threadIdx.x + 16*blockIdx.x;
+	if (col >= width)
+		return;
+
+	unsigned int row = (threadIdx.y + 16*blockIdx.y)*repeat;
+	unsigned int idx = (row+padY)*pitch+col+padX;
+
+	// Stop after `repeat` rows or at the bottom edge, whichever is first.
+	for (unsigned int n = 0; n < repeat && row < height; ++n, ++row, idx += pitch)
+		op()(pfOut[idx], fParam);
+}
+
+// Element-wise kernel with two outputs:
+// op()(out1, out2, fParam1, fParam2) on a width x height region.
+// Both buffers are indexed with the same pitch and offsets.
+// Written for 16x16 thread blocks (the factor 16 is hard-coded). Each
+// thread owns one column and `repeat` consecutive rows starting at
+// (threadIdx.y + 16*blockIdx.y)*repeat. pitch is in floats; padX/padY
+// shift the accessed elements by that many columns/rows in the buffers.
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	const unsigned int col = threadIdx.x + 16*blockIdx.x;
+	if (col >= width)
+		return;
+
+	unsigned int row = (threadIdx.y + 16*blockIdx.y)*repeat;
+	unsigned int idx = (row+padY)*pitch+col+padX;
+
+	// Stop after `repeat` rows or at the bottom edge, whichever is first.
+	for (unsigned int n = 0; n < repeat && row < height; ++n, ++row, idx += pitch)
+		op()(pfOut1[idx], pfOut2[idx], fParam1, fParam2);
+}
+
+
+
+// Element-wise kernel: op()(out, in) on a width x height region.
+// pfOut and pfIn are indexed with the same pitch and offsets.
+// Written for 16x16 thread blocks (the factor 16 is hard-coded). Each
+// thread owns one column and `repeat` consecutive rows starting at
+// (threadIdx.y + 16*blockIdx.y)*repeat. pitch is in floats; padX/padY
+// shift the accessed elements by that many columns/rows in the buffers.
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	const unsigned int col = threadIdx.x + 16*blockIdx.x;
+	if (col >= width)
+		return;
+
+	unsigned int row = (threadIdx.y + 16*blockIdx.y)*repeat;
+	unsigned int idx = (row+padY)*pitch+col+padX;
+
+	// Stop after `repeat` rows or at the bottom edge, whichever is first.
+	for (unsigned int n = 0; n < repeat && row < height; ++n, ++row, idx += pitch)
+		op()(pfOut[idx], pfIn[idx]);
+}
+
+// Element-wise kernel: op()(out, in, fParam) on a width x height region.
+// pfOut and pfIn are indexed with the same pitch and offsets.
+// Written for 16x16 thread blocks (the factor 16 is hard-coded). Each
+// thread owns one column and `repeat` consecutive rows starting at
+// (threadIdx.y + 16*blockIdx.y)*repeat. pitch is in floats; padX/padY
+// shift the accessed elements by that many columns/rows in the buffers.
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	const unsigned int col = threadIdx.x + 16*blockIdx.x;
+	if (col >= width)
+		return;
+
+	unsigned int row = (threadIdx.y + 16*blockIdx.y)*repeat;
+	unsigned int idx = (row+padY)*pitch+col+padX;
+
+	// Stop after `repeat` rows or at the bottom edge, whichever is first.
+	for (unsigned int n = 0; n < repeat && row < height; ++n, ++row, idx += pitch)
+		op()(pfOut[idx], pfIn[idx], fParam);
+}
+
+// Element-wise kernel: op()(out, in1, in2) on a width x height region.
+// All three buffers are indexed with the same pitch and offsets.
+// Written for 16x16 thread blocks (the factor 16 is hard-coded). Each
+// thread owns one column and `repeat` consecutive rows starting at
+// (threadIdx.y + 16*blockIdx.y)*repeat. pitch is in floats; padX/padY
+// shift the accessed elements by that many columns/rows in the buffers.
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	const unsigned int col = threadIdx.x + 16*blockIdx.x;
+	if (col >= width)
+		return;
+
+	unsigned int row = (threadIdx.y + 16*blockIdx.y)*repeat;
+	unsigned int idx = (row+padY)*pitch+col+padX;
+
+	// Stop after `repeat` rows or at the bottom edge, whichever is first.
+	for (unsigned int n = 0; n < repeat && row < height; ++n, ++row, idx += pitch)
+		op()(pfOut[idx], pfIn1[idx], pfIn2[idx]);
+}
+
+// Element-wise kernel: op()(out, in1, in2, fParam) on a width x height
+// region. All three buffers are indexed with the same pitch and offsets.
+// Written for 16x16 thread blocks (the factor 16 is hard-coded). Each
+// thread owns one column and `repeat` consecutive rows starting at
+// (threadIdx.y + 16*blockIdx.y)*repeat. pitch is in floats; padX/padY
+// shift the accessed elements by that many columns/rows in the buffers.
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	const unsigned int col = threadIdx.x + 16*blockIdx.x;
+	if (col >= width)
+		return;
+
+	unsigned int row = (threadIdx.y + 16*blockIdx.y)*repeat;
+	unsigned int idx = (row+padY)*pitch+col+padX;
+
+	// Stop after `repeat` rows or at the bottom edge, whichever is first.
+	for (unsigned int n = 0; n < repeat && row < height; ++n, ++row, idx += pitch)
+		op()(pfOut[idx], pfIn1[idx], pfIn2[idx], fParam);
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+// Host convenience wrapper: uploads `out` (width x height floats, host
+// row pitch = width) to a temporary device buffer, applies op in place
+// via processVol<op, t>, and downloads the result back into `out`.
+// If the device allocation or the upload fails, `out` is left unchanged
+// (previously the failure was ignored and an invalid device pointer was
+// used and freed). The function returns void, so failures are silent.
+template<typename op, VolType t>
+void processVolCopy(float* out, unsigned int width, unsigned int height)
+{
+	float* D_out = 0;
+	unsigned int pitch;
+
+	if (!allocateVolume(D_out, width+2, height+2, pitch))
+		return;
+
+	if (copyVolumeToDevice(out, width, width, height, D_out, pitch)) {
+		processVol<op, t>(D_out, pitch, width, height);
+		copyVolumeFromDevice(out, width, width, height, D_out, pitch);
+	}
+
+	cudaFree(D_out);
+}
+
+// Host convenience wrapper: uploads `out` (width x height floats, host
+// row pitch = width) to a temporary device buffer, applies op with the
+// scalar `param` in place via processVol<op, t>, and downloads the
+// result back into `out`.
+// If the device allocation or the upload fails, `out` is left unchanged
+// (previously the failure was ignored and an invalid device pointer was
+// used and freed). The function returns void, so failures are silent.
+template<typename op, VolType t>
+void processVolCopy(float* out, float param, unsigned int width, unsigned int height)
+{
+	float* D_out = 0;
+	unsigned int pitch;
+
+	if (!allocateVolume(D_out, width+2, height+2, pitch))
+		return;
+
+	if (copyVolumeToDevice(out, width, width, height, D_out, pitch)) {
+		processVol<op, t>(D_out, param, pitch, width, height);
+		copyVolumeFromDevice(out, width, width, height, D_out, pitch);
+	}
+
+	cudaFree(D_out);
+}
+
+// Host convenience wrapper for two-output operations: uploads `out1` and
+// `out2` (each width x height floats, host row pitch = width) to
+// temporary device buffers, applies op with the scalars param1/param2
+// via processVol<op, t>, and downloads both results.
+// Both device buffers share one `pitch` variable, i.e. equally sized
+// allocations are assumed to get the same pitch (as in the original).
+// If an allocation or upload fails, the host buffers are left unchanged
+// and whatever was allocated is freed (previously failures were ignored
+// and invalid device pointers were used). Failures are silent.
+template<typename op, VolType t>
+void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height)
+{
+	float* D_out1 = 0;
+	float* D_out2 = 0;
+	unsigned int pitch;
+
+	if (!allocateVolume(D_out1, width+2, height+2, pitch))
+		return;
+	if (!allocateVolume(D_out2, width+2, height+2, pitch)) {
+		cudaFree(D_out1);
+		return;
+	}
+
+	if (copyVolumeToDevice(out1, width, width, height, D_out1, pitch) &&
+	    copyVolumeToDevice(out2, width, width, height, D_out2, pitch))
+	{
+		processVol<op, t>(D_out1, D_out2, param1, param2, pitch, width, height);
+
+		copyVolumeFromDevice(out1, width, width, height, D_out1, pitch);
+		copyVolumeFromDevice(out2, width, width, height, D_out2, pitch);
+	}
+
+	cudaFree(D_out1);
+	cudaFree(D_out2);
+}
+
+
+// Host convenience wrapper: uploads `out` and `in` (each width x height
+// floats, host row pitch = width) to temporary device buffers, applies
+// op(out, in) via processVol<op, t>, and downloads `out`.
+// Both device buffers share one `pitch` variable, i.e. equally sized
+// allocations are assumed to get the same pitch (as in the original).
+// If an allocation or upload fails, `out` is left unchanged and whatever
+// was allocated is freed (previously failures were ignored and invalid
+// device pointers were used). Failures are silent.
+template<typename op, VolType t>
+void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height)
+{
+	float* D_out = 0;
+	float* D_in = 0;
+	unsigned int pitch;
+
+	if (!allocateVolume(D_out, width+2, height+2, pitch))
+		return;
+	if (!allocateVolume(D_in, width+2, height+2, pitch)) {
+		cudaFree(D_out);
+		return;
+	}
+
+	if (copyVolumeToDevice(out, width, width, height, D_out, pitch) &&
+	    copyVolumeToDevice(in, width, width, height, D_in, pitch))
+	{
+		processVol<op, t>(D_out, D_in, pitch, width, height);
+
+		copyVolumeFromDevice(out, width, width, height, D_out, pitch);
+	}
+
+	cudaFree(D_out);
+	cudaFree(D_in);
+}
+
+// Host convenience wrapper: uploads `out` and `in` (each width x height
+// floats, host row pitch = width) to temporary device buffers, applies
+// op(out, in, param) via processVol<op, t>, and downloads `out`.
+// Both device buffers share one `pitch` variable, i.e. equally sized
+// allocations are assumed to get the same pitch (as in the original).
+// If an allocation or upload fails, `out` is left unchanged and whatever
+// was allocated is freed (previously failures were ignored and invalid
+// device pointers were used). Failures are silent.
+template<typename op, VolType t>
+void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height)
+{
+	float* D_out = 0;
+	float* D_in = 0;
+	unsigned int pitch;
+
+	if (!allocateVolume(D_out, width+2, height+2, pitch))
+		return;
+	if (!allocateVolume(D_in, width+2, height+2, pitch)) {
+		cudaFree(D_out);
+		return;
+	}
+
+	if (copyVolumeToDevice(out, width, width, height, D_out, pitch) &&
+	    copyVolumeToDevice(in, width, width, height, D_in, pitch))
+	{
+		processVol<op, t>(D_out, D_in, param, pitch, width, height);
+
+		copyVolumeFromDevice(out, width, width, height, D_out, pitch);
+	}
+
+	cudaFree(D_out);
+	cudaFree(D_in);
+}
+
+// Host convenience wrapper: uploads `out`, `in1` and `in2` (each
+// width x height floats, host row pitch = width) to temporary device
+// buffers, applies op(out, in1, in2) via processVol<op, t>, and
+// downloads `out`.
+// All device buffers share one `pitch` variable, i.e. equally sized
+// allocations are assumed to get the same pitch (as in the original).
+// If an allocation or upload fails, `out` is left unchanged and whatever
+// was allocated is freed (previously failures were ignored and invalid
+// device pointers were used). Failures are silent.
+template<typename op, VolType t>
+void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height)
+{
+	float* D_out = 0;
+	float* D_in1 = 0;
+	float* D_in2 = 0;
+	unsigned int pitch;
+
+	if (!allocateVolume(D_out, width+2, height+2, pitch))
+		return;
+	if (!allocateVolume(D_in1, width+2, height+2, pitch)) {
+		cudaFree(D_out);
+		return;
+	}
+	if (!allocateVolume(D_in2, width+2, height+2, pitch)) {
+		cudaFree(D_out);
+		cudaFree(D_in1);
+		return;
+	}
+
+	if (copyVolumeToDevice(out, width, width, height, D_out, pitch) &&
+	    copyVolumeToDevice(in1, width, width, height, D_in1, pitch) &&
+	    copyVolumeToDevice(in2, width, width, height, D_in2, pitch))
+	{
+		processVol<op, t>(D_out, D_in1, D_in2, pitch, width, height);
+
+		copyVolumeFromDevice(out, width, width, height, D_out, pitch);
+	}
+
+	cudaFree(D_out);
+	cudaFree(D_in1);
+	cudaFree(D_in2);
+}
+
+// Host convenience wrapper: uploads `out`, `in1` and `in2` (each
+// width x height floats, host row pitch = width) to temporary device
+// buffers, applies op(out, in1, in2, param) via processVol<op, t>, and
+// downloads `out`.
+// All device buffers share one `pitch` variable, i.e. equally sized
+// allocations are assumed to get the same pitch (as in the original).
+// If an allocation or upload fails, `out` is left unchanged and whatever
+// was allocated is freed (previously failures were ignored and invalid
+// device pointers were used). Failures are silent.
+template<typename op, VolType t>
+void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height)
+{
+	float* D_out = 0;
+	float* D_in1 = 0;
+	float* D_in2 = 0;
+	unsigned int pitch;
+
+	if (!allocateVolume(D_out, width+2, height+2, pitch))
+		return;
+	if (!allocateVolume(D_in1, width+2, height+2, pitch)) {
+		cudaFree(D_out);
+		return;
+	}
+	if (!allocateVolume(D_in2, width+2, height+2, pitch)) {
+		cudaFree(D_out);
+		cudaFree(D_in1);
+		return;
+	}
+
+	if (copyVolumeToDevice(out, width, width, height, D_out, pitch) &&
+	    copyVolumeToDevice(in1, width, width, height, D_in1, pitch) &&
+	    copyVolumeToDevice(in2, width, width, height, D_in2, pitch))
+	{
+		processVol<op, t>(D_out, D_in1, D_in2, param, pitch, width, height);
+
+		copyVolumeFromDevice(out, width, width, height, D_out, pitch);
+	}
+
+	cudaFree(D_out);
+	cudaFree(D_in1);
+	cudaFree(D_in2);
+}
+
+
+
+
+
+
+
+
+
+// Applies op (no extra arguments) in place to a width x height region of
+// the pitched device buffer pfOut (pitch in floats).
+// The kernel is instantiated with padX = 1, padY = t and 32 rows per
+// thread, so one 16x16 block covers 16 columns by 16*32 = 512 rows.
+// cudaTextForceKernelsCompletion() is called after the launch
+// (presumably to wait for the kernel and report errors).
+template<typename op, VolType t>
+void processVol(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	const dim3 threads(16,16);
+	const dim3 blocks((width+15)/16, (height+511)/512);
+
+	devtoD<op, 1, t, 32><<<blocks, threads>>>(pfOut, pitch, width, height);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, fParam) in place to a width x height region of the
+// pitched device buffer pfOut (pitch in floats).
+// The kernel is instantiated with padX = 1, padY = t and 32 rows per
+// thread, so one 16x16 block covers 16 columns by 16*32 = 512 rows.
+// cudaTextForceKernelsCompletion() is called after the launch
+// (presumably to wait for the kernel and report errors).
+template<typename op, VolType t>
+void processVol(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	dim3 blockSize(16,16);
+	// A block spans 512 rows, so ceil(height/512) block rows suffice.
+	// The former (height+15)/16 launched ~32x more blocks, all but the
+	// first few of which exited immediately without doing any work.
+	dim3 gridSize((width+15)/16, (height+511)/512);
+
+	devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out1, out2, fParam1, fParam2) in place to a width x height
+// region of two pitched device buffers sharing the same pitch (floats).
+// The kernel is instantiated with padX = 1, padY = t and 32 rows per
+// thread, so one 16x16 block covers 16 columns by 16*32 = 512 rows.
+// cudaTextForceKernelsCompletion() is called after the launch
+// (presumably to wait for the kernel and report errors).
+template<typename op, VolType t>
+void processVol(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	dim3 blockSize(16,16);
+	// A block spans 512 rows, so ceil(height/512) block rows suffice.
+	// The former (height+15)/16 launched ~32x more blocks, all but the
+	// first few of which exited immediately without doing any work.
+	dim3 gridSize((width+15)/16, (height+511)/512);
+
+	devFFtoDD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height);
+
+	cudaTextForceKernelsCompletion();
+}
+
+
+// Applies op(out, in) element-wise to a width x height region; pfOut and
+// pfIn are pitched device buffers sharing the same pitch (in floats).
+// The kernel is instantiated with padX = 1, padY = t and 32 rows per
+// thread, so one 16x16 block covers 16 columns by 16*32 = 512 rows.
+// cudaTextForceKernelsCompletion() is called after the launch
+// (presumably to wait for the kernel and report errors).
+template<typename op, VolType t>
+void processVol(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	dim3 blockSize(16,16);
+	// A block spans 512 rows, so ceil(height/512) block rows suffice.
+	// The former (height+15)/16 launched ~32x more blocks, all but the
+	// first few of which exited immediately without doing any work.
+	dim3 gridSize((width+15)/16, (height+511)/512);
+
+	devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in, fParam) element-wise to a width x height region;
+// pfOut and pfIn are pitched device buffers sharing the same pitch
+// (in floats).
+// The kernel is instantiated with padX = 1, padY = t and 32 rows per
+// thread, so one 16x16 block covers 16 columns by 16*32 = 512 rows.
+// cudaTextForceKernelsCompletion() is called after the launch
+// (presumably to wait for the kernel and report errors).
+template<typename op, VolType t>
+void processVol(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	dim3 blockSize(16,16);
+	// A block spans 512 rows, so ceil(height/512) block rows suffice.
+	// The former (height+15)/16 launched ~32x more blocks, all but the
+	// first few of which exited immediately without doing any work.
+	dim3 gridSize((width+15)/16, (height+511)/512);
+
+	devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in1, in2, fParam) element-wise to a width x height
+// region; all three are pitched device buffers sharing the same pitch
+// (in floats).
+// The kernel is instantiated with padX = 1, padY = t and 32 rows per
+// thread, so one 16x16 block covers 16 columns by 16*32 = 512 rows.
+// cudaTextForceKernelsCompletion() is called after the launch
+// (presumably to wait for the kernel and report errors).
+template<typename op, VolType t>
+void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	dim3 blockSize(16,16);
+	// A block spans 512 rows, so ceil(height/512) block rows suffice.
+	// The former (height+15)/16 launched ~32x more blocks, all but the
+	// first few of which exited immediately without doing any work.
+	dim3 gridSize((width+15)/16, (height+511)/512);
+
+	devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in1, in2) element-wise to a width x height region;
+// all three are pitched device buffers sharing the same pitch
+// (in floats).
+// The kernel is instantiated with padX = 1, padY = t and 32 rows per
+// thread, so one 16x16 block covers 16 columns by 16*32 = 512 rows.
+// cudaTextForceKernelsCompletion() is called after the launch
+// (presumably to wait for the kernel and report errors).
+template<typename op, VolType t>
+void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	dim3 blockSize(16,16);
+	// A block spans 512 rows, so ceil(height/512) block rows suffice.
+	// The former (height+15)/16 launched ~32x more blocks, all but the
+	// first few of which exited immediately without doing any work.
+	dim3 gridSize((width+15)/16, (height+511)/512);
+
+	devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
+
+	cudaTextForceKernelsCompletion();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+// Applies op (no extra arguments) in place to every element of a 3D
+// volume. One 2D kernel launch (iVolX x iVolY, no padding) is issued per
+// Z slice; consecutive slices are pitch * iVolY floats apart.
+// One 16x16 block covers 16 columns by 16*32 = 512 rows.
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
+{
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+
+	// row pitch in floats, and distance between slices in floats
+	const size_t rowPitch = out.pitch/sizeof(float);
+	const unsigned int sliceStride = rowPitch * dims.iVolY;
+
+	float* slice = (float*)out.ptr;
+	for (unsigned int z = 0; z < dims.iVolZ; ++z, slice += sliceStride)
+		devtoD<op, 0, 0, 32><<<blocks, threads>>>(slice, rowPitch, dims.iVolX, dims.iVolY);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, fParam) in place to every element of a 3D volume.
+// One 2D kernel launch (iVolX x iVolY, no padding) is issued per Z
+// slice; consecutive slices are pitch * iVolY floats apart.
+// One 16x16 block covers 16 columns by 16*32 = 512 rows.
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
+{
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+
+	// row pitch in floats, and distance between slices in floats
+	const size_t rowPitch = out.pitch/sizeof(float);
+	const unsigned int sliceStride = rowPitch * dims.iVolY;
+
+	float* slice = (float*)out.ptr;
+	for (unsigned int z = 0; z < dims.iVolZ; ++z, slice += sliceStride)
+		devFtoD<op, 0, 0, 32><<<blocks, threads>>>(slice, fParam, rowPitch, dims.iVolX, dims.iVolY);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out1, out2, fParam1, fParam2) in place to every element of
+// two 3D volumes. One 2D kernel launch (iVolX x iVolY, no padding) is
+// issued per Z slice. The pitch of out1 is used for both volumes, so
+// they are expected to share the same pitch.
+// One 16x16 block covers 16 columns by 16*32 = 512 rows.
+template<typename op>
+void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims)
+{
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+
+	// row pitch in floats, and distance between slices in floats
+	const size_t rowPitch = out1.pitch/sizeof(float);
+	const unsigned int sliceStride = rowPitch * dims.iVolY;
+
+	float* slice1 = (float*)out1.ptr;
+	float* slice2 = (float*)out2.ptr;
+	for (unsigned int z = 0; z < dims.iVolZ; ++z, slice1 += sliceStride, slice2 += sliceStride)
+		devFFtoDD<op, 0, 0, 32><<<blocks, threads>>>(slice1, slice2, fParam1, fParam2, rowPitch, dims.iVolX, dims.iVolY);
+
+	cudaTextForceKernelsCompletion();
+}
+
+
+// Applies op(out, in) element-wise over a 3D volume. One 2D kernel
+// launch (iVolX x iVolY, no padding) is issued per Z slice. The pitch of
+// `out` is used for both volumes, so they are expected to share the
+// same pitch.
+// One 16x16 block covers 16 columns by 16*32 = 512 rows.
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims)
+{
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+
+	// row pitch in floats, and distance between slices in floats
+	const size_t rowPitch = out.pitch/sizeof(float);
+	const unsigned int sliceStride = rowPitch * dims.iVolY;
+
+	float* dst = (float*)out.ptr;
+	float* src = (float*)in.ptr;
+	for (unsigned int z = 0; z < dims.iVolZ; ++z, dst += sliceStride, src += sliceStride)
+		devDtoD<op, 0, 0, 32><<<blocks, threads>>>(dst, src, rowPitch, dims.iVolX, dims.iVolY);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in, fParam) element-wise over a 3D volume. One 2D
+// kernel launch (iVolX x iVolY, no padding) is issued per Z slice. The
+// pitch of `out` is used for both volumes, so they are expected to
+// share the same pitch.
+// One 16x16 block covers 16 columns by 16*32 = 512 rows.
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims)
+{
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+
+	// row pitch in floats, and distance between slices in floats
+	const size_t rowPitch = out.pitch/sizeof(float);
+	const unsigned int sliceStride = rowPitch * dims.iVolY;
+
+	float* dst = (float*)out.ptr;
+	float* src = (float*)in.ptr;
+	for (unsigned int z = 0; z < dims.iVolZ; ++z, dst += sliceStride, src += sliceStride)
+		devDFtoD<op, 0, 0, 32><<<blocks, threads>>>(dst, src, fParam, rowPitch, dims.iVolX, dims.iVolY);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in1, in2, fParam) element-wise over a 3D volume. One
+// 2D kernel launch (iVolX x iVolY, no padding) is issued per Z slice.
+// The pitch of `out` is used for all three volumes, so they are
+// expected to share the same pitch.
+// One 16x16 block covers 16 columns by 16*32 = 512 rows.
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims)
+{
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+
+	// row pitch in floats, and distance between slices in floats
+	const size_t rowPitch = out.pitch/sizeof(float);
+	const unsigned int sliceStride = rowPitch * dims.iVolY;
+
+	float* dst = (float*)out.ptr;
+	float* srcA = (float*)in1.ptr;
+	float* srcB = (float*)in2.ptr;
+	for (unsigned int z = 0; z < dims.iVolZ; ++z) {
+		devDDFtoD<op, 0, 0, 32><<<blocks, threads>>>(dst, srcA, srcB, fParam, rowPitch, dims.iVolX, dims.iVolY);
+
+		// advance all three pointers to the next slice
+		dst += sliceStride;
+		srcA += sliceStride;
+		srcB += sliceStride;
+	}
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in1, in2) element-wise over a 3D volume. One 2D
+// kernel launch (iVolX x iVolY, no padding) is issued per Z slice. The
+// pitch of `out` is used for all three volumes, so they are expected to
+// share the same pitch.
+// One 16x16 block covers 16 columns by 16*32 = 512 rows.
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims)
+{
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+
+	// row pitch in floats, and distance between slices in floats
+	const size_t rowPitch = out.pitch/sizeof(float);
+	const unsigned int sliceStride = rowPitch * dims.iVolY;
+
+	float* dst = (float*)out.ptr;
+	float* srcA = (float*)in1.ptr;
+	float* srcB = (float*)in2.ptr;
+	for (unsigned int z = 0; z < dims.iVolZ; ++z) {
+		devDDtoD<op, 0, 0, 32><<<blocks, threads>>>(dst, srcA, srcB, rowPitch, dims.iVolX, dims.iVolY);
+
+		// advance all three pointers to the next slice
+		dst += sliceStride;
+		srcA += sliceStride;
+		srcB += sliceStride;
+	}
+
+	cudaTextForceKernelsCompletion();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+// Applies op (no extra arguments) in place to every element of a 3D
+// projection buffer. One 2D kernel launch (iProjU x iProjAngles, no
+// padding) is issued per iProjV slice; consecutive slices are
+// pitch * iProjAngles floats apart.
+// One 16x16 block covers 16 columns by 16*32 = 512 rows.
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
+{
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+
+	// row pitch in floats, and distance between slices in floats
+	const size_t rowPitch = out.pitch/sizeof(float);
+	const unsigned int sliceStride = rowPitch * dims.iProjAngles;
+
+	float* slice = (float*)out.ptr;
+	for (unsigned int v = 0; v < dims.iProjV; ++v, slice += sliceStride)
+		devtoD<op, 0, 0, 32><<<blocks, threads>>>(slice, rowPitch, dims.iProjU, dims.iProjAngles);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Runs devFtoD<op> over pitched 3D projection data, one kernel launch per
+// value of the iProjV index.
+//
+// out     buffer modified in place
+// fParam  scalar forwarded unchanged to every launch
+// dims    supplies iProjU / iProjAngles (extent of one launch) and iProjV
+//         (number of launches)
+//
+// cudaTextForceKernelsCompletion() is called once after all launches.
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
+{
+	// Row length in floats, and the distance between consecutive launches.
+	const size_t floatsPerRow = out.pitch/sizeof(float);
+	unsigned int sliceStride = floatsPerRow * dims.iProjAngles;
+
+	float *pfDst = (float*)out.ptr;
+
+	dim3 blockSize(16,16);
+	dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+
+	for (unsigned int iRow = 0; iRow < dims.iProjV; ++iRow, pfDst += sliceStride) {
+		devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfDst, fParam, floatsPerRow, dims.iProjU, dims.iProjAngles);
+	}
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Runs devFFtoDD<op> over two pitched 3D projection buffers, one kernel
+// launch per value of the iProjV index.
+//
+// out1, out2        buffers modified in place; both are stepped with the
+//                   stride derived from out1.pitch, so out2 is expected to
+//                   share out1's pitch
+// fParam1, fParam2  scalars forwarded unchanged to every launch
+// dims              supplies iProjU / iProjAngles (extent of one launch)
+//                   and iProjV (number of launches)
+//
+// cudaTextForceKernelsCompletion() is called once after all launches.
+template<typename op>
+void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims)
+{
+	// Row length in floats, and the distance between consecutive launches.
+	const size_t floatsPerRow = out1.pitch/sizeof(float);
+	unsigned int sliceStride = floatsPerRow * dims.iProjAngles;
+
+	float *pfDstA = (float*)out1.ptr;
+	float *pfDstB = (float*)out2.ptr;
+
+	dim3 blockSize(16,16);
+	dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+
+	for (unsigned int iRow = 0; iRow < dims.iProjV; ++iRow, pfDstA += sliceStride, pfDstB += sliceStride) {
+		devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfDstA, pfDstB, fParam1, fParam2, floatsPerRow, dims.iProjU, dims.iProjAngles);
+	}
+
+	cudaTextForceKernelsCompletion();
+}
+
+
+// Runs devDtoD<op> over pitched 3D projection data, one kernel launch per
+// value of the iProjV index.
+//
+// out   destination buffer
+// in    source buffer; it is stepped with the stride derived from
+//       out.pitch, so it is expected to share out's pitch
+// dims  supplies iProjU / iProjAngles (extent of one launch) and iProjV
+//       (number of launches)
+//
+// cudaTextForceKernelsCompletion() is called once after all launches.
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims)
+{
+	// Row length in floats, and the distance between consecutive launches.
+	const size_t floatsPerRow = out.pitch/sizeof(float);
+	unsigned int sliceStride = floatsPerRow * dims.iProjAngles;
+
+	float *pfDst = (float*)out.ptr;
+	float *pfSrc = (float*)in.ptr;
+
+	dim3 blockSize(16,16);
+	dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+
+	for (unsigned int iRow = 0; iRow < dims.iProjV; ++iRow, pfDst += sliceStride, pfSrc += sliceStride) {
+		devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfDst, pfSrc, floatsPerRow, dims.iProjU, dims.iProjAngles);
+	}
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Runs devDFtoD<op> over pitched 3D projection data, one kernel launch per
+// value of the iProjV index.
+//
+// out     destination buffer
+// in      source buffer; it is stepped with the stride derived from
+//         out.pitch, so it is expected to share out's pitch
+// fParam  scalar forwarded unchanged to every launch
+// dims    supplies iProjU / iProjAngles (extent of one launch) and iProjV
+//         (number of launches)
+//
+// cudaTextForceKernelsCompletion() is called once after all launches.
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims)
+{
+	// Row length in floats, and the distance between consecutive launches.
+	const size_t floatsPerRow = out.pitch/sizeof(float);
+	unsigned int sliceStride = floatsPerRow * dims.iProjAngles;
+
+	float *pfDst = (float*)out.ptr;
+	float *pfSrc = (float*)in.ptr;
+
+	dim3 blockSize(16,16);
+	dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+
+	for (unsigned int iRow = 0; iRow < dims.iProjV; ++iRow, pfDst += sliceStride, pfSrc += sliceStride) {
+		devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfDst, pfSrc, fParam, floatsPerRow, dims.iProjU, dims.iProjAngles);
+	}
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Runs devDDFtoD<op> over pitched 3D projection data, one kernel launch per
+// value of the iProjV index.
+//
+// out       destination buffer
+// in1, in2  source buffers; they are stepped with the stride derived from
+//           out.pitch, so they are expected to share out's pitch
+// fParam    scalar forwarded unchanged to every launch
+// dims      supplies iProjU / iProjAngles (extent of one launch) and iProjV
+//           (number of launches)
+//
+// cudaTextForceKernelsCompletion() is called once after all launches.
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims)
+{
+	// Row length in floats, and the distance between consecutive launches.
+	const size_t floatsPerRow = out.pitch/sizeof(float);
+	unsigned int sliceStride = floatsPerRow * dims.iProjAngles;
+
+	float *pfDst = (float*)out.ptr;
+	float *pfSrcA = (float*)in1.ptr;
+	float *pfSrcB = (float*)in2.ptr;
+
+	dim3 blockSize(16,16);
+	dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+
+	for (unsigned int iRow = 0; iRow < dims.iProjV; ++iRow, pfDst += sliceStride, pfSrcA += sliceStride, pfSrcB += sliceStride) {
+		devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfDst, pfSrcA, pfSrcB, fParam, floatsPerRow, dims.iProjU, dims.iProjAngles);
+	}
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Runs devDDtoD<op> over pitched 3D projection data, one kernel launch per
+// value of the iProjV index.
+//
+// out       destination buffer
+// in1, in2  source buffers; they are stepped with the stride derived from
+//           out.pitch, so they are expected to share out's pitch
+// dims      supplies iProjU / iProjAngles (extent of one launch) and iProjV
+//           (number of launches)
+//
+// cudaTextForceKernelsCompletion() is called once after all launches.
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims)
+{
+	// Row length in floats, and the distance between consecutive launches.
+	const size_t floatsPerRow = out.pitch/sizeof(float);
+	unsigned int sliceStride = floatsPerRow * dims.iProjAngles;
+
+	float *pfDst = (float*)out.ptr;
+	float *pfSrcA = (float*)in1.ptr;
+	float *pfSrcB = (float*)in2.ptr;
+
+	dim3 blockSize(16,16);
+	dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+
+	for (unsigned int iRow = 0; iRow < dims.iProjV; ++iRow, pfDst += sliceStride, pfSrcA += sliceStride, pfSrcB += sliceStride) {
+		devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfDst, pfSrcA, pfSrcB, floatsPerRow, dims.iProjU, dims.iProjAngles);
+	}
+
+	cudaTextForceKernelsCompletion();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+// Explicitly instantiates, for the operation type `name`, every overload
+// that takes (out, in, float): processVolCopy and processVol for both VOL
+// and SINO, plus processVol3D and processSino3D.
+#define INST_DFtoD(name) \
+  template void processVolCopy<name, VOL>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
+  template void processVolCopy<name, SINO>(float* out, const float* in, float param, unsigned int width, unsigned int height); \
+  template void processVol<name, VOL>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name, SINO>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \
+  template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
+
+// Explicitly instantiates, for the operation type `name`, every overload
+// that takes (out, in): processVolCopy and processVol for both VOL and
+// SINO, plus processVol3D and processSino3D.
+#define INST_DtoD(name) \
+  template void processVolCopy<name, VOL>(float* out, const float* in, unsigned int width, unsigned int height); \
+  template void processVolCopy<name, SINO>(float* out, const float* in, unsigned int width, unsigned int height); \
+  template void processVol<name, VOL>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name, SINO>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \
+  template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
+
+// Explicitly instantiates, for the operation type `name`, every overload
+// that takes (out, in1, in2): processVolCopy and processVol for both VOL
+// and SINO, plus processVol3D and processSino3D.
+#define INST_DDtoD(name) \
+  template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
+  template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \
+  template void processVol<name, VOL>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name, SINO>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \
+  template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
+
+// Explicitly instantiates, for the operation type `name`, every overload
+// that takes (out, in1, in2, float): processVolCopy and processVol for both
+// VOL and SINO, plus processVol3D and processSino3D.
+#define INST_DDFtoD(name) \
+  template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
+  template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \
+  template void processVol<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \
+  template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
+
+
+// Explicitly instantiates, for the operation type `name`, every overload
+// that takes only (out): processVolCopy and processVol for both VOL and
+// SINO, plus processVol3D and processSino3D.
+#define INST_toD(name) \
+  template void processVolCopy<name, VOL>(float* out, unsigned int width, unsigned int height); \
+  template void processVolCopy<name, SINO>(float* out, unsigned int width, unsigned int height); \
+  template void processVol<name, VOL>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name, SINO>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \
+  template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims);
+
+// Explicitly instantiates, for the operation type `name`, every overload
+// that takes (out, float): processVolCopy and processVol for both VOL and
+// SINO, plus processVol3D and processSino3D.
+#define INST_FtoD(name) \
+  template void processVolCopy<name, VOL>(float* out, float param, unsigned int width, unsigned int height); \
+  template void processVolCopy<name, SINO>(float* out, float param, unsigned int width, unsigned int height); \
+  template void processVol<name, VOL>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name, SINO>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims); \
+  template void processSino3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims);
+
+// Explicitly instantiates, for the operation type `name`, every overload
+// that takes (out1, out2, float, float): processVolCopy and processVol for
+// both VOL and SINO, plus processVol3D and processSino3D.
+#define INST_FFtoDD(name) \
+  template void processVolCopy<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
+  template void processVolCopy<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \
+  template void processVol<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); \
+  template void processSino3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims);
+
+
+
+// Explicit instantiations: one line per (operation, signature family) pair.
+// The processing templates are only declared in arith.h, so an operation
+// used with a signature family that is not listed here is expected to fail
+// at link time rather than at compile time.
+// Note that opMul and opAdd are instantiated for both the DtoD and FtoD
+// families, and opMulMask for both DDtoD and DFtoD.
+INST_DFtoD(opAddScaled)
+INST_DFtoD(opScaleAndAdd)
+INST_DDFtoD(opAddMulScaled)
+INST_DDtoD(opAddMul)
+INST_DDtoD(opMul2)
+INST_DDtoD(opAdd2)
+INST_DtoD(opMul)
+INST_DDtoD(opMulMask)
+INST_DtoD(opAdd)
+INST_DtoD(opDividedBy)
+INST_toD(opInvert)
+INST_FtoD(opSet)
+INST_FtoD(opMul)
+INST_DFtoD(opMulMask)
+INST_FtoD(opAdd)
+INST_FtoD(opClampMin)
+INST_FtoD(opClampMax)
+INST_DtoD(opClampMinMask)
+INST_DtoD(opClampMaxMask)
+
+// PDART-specific:
+INST_DFtoD(opSetMaskedValues)
+INST_FFtoDD(opSegmentAndMask)
+
+}
diff --git a/cuda/2d/arith.h b/cuda/2d/arith.h
new file mode 100644
index 0000000..c8c7b41
--- /dev/null
+++ b/cuda/2d/arith.h
@@ -0,0 +1,101 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_ARITH_H
+#define _CUDA_ARITH_H
+
+#include <cuda.h>
+
+// Declarations of the element-wise arithmetic helpers.
+//
+// NOTE(review): this header includes only <cuda.h>, yet the declarations
+// below name cudaPitchedPtr and SDimensions3D. Both must already be visible
+// wherever this header is included -- confirm which headers provide them.
+namespace astraCUDA {
+
+
+// Operation tags used as the `op` template argument of the functions below.
+// They are only forward-declared here; their definitions live elsewhere.
+struct opAddScaled;
+struct opScaleAndAdd;
+struct opAddMulScaled;
+struct opAddMul;
+struct opAdd;
+struct opAdd2;
+struct opMul;
+struct opMul2;
+struct opDividedBy;
+struct opInvert;
+struct opSet;
+struct opClampMin;
+struct opClampMax;
+struct opClampMinMask;
+struct opClampMaxMask;
+struct opSegmentAndMask;
+struct opSetMaskedValues;
+
+struct opMulMask;
+
+
+
+// Second template argument of processVolCopy / processVol.
+// Presumably selects between sinogram-shaped and volume-shaped buffers;
+// the exact effect is defined by the implementations, not visible here.
+enum VolType {
+  SINO = 0,
+  VOL = 1
+};
+
+
+// processVolCopy: overloads taking width and height but no pitch argument.
+template<typename op, VolType t> void processVolCopy(float* out, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVolCopy(float* out, float param, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height);
+
+// processVol: the same seven signature families with an explicit pitch.
+template<typename op, VolType t> void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(float* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(float* out, const float* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height);
+
+// processVol3D: the seven families operating on cudaPitchedPtr buffers,
+// with the extents taken from an SDimensions3D.
+template<typename op> void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
+
+// processSino3D: same parameter lists as processVol3D, declared separately.
+template<typename op> void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
+
+
+
+}
+
+#endif
diff --git a/cuda/2d/astra.cu b/cuda/2d/astra.cu
new file mode 100644
index 0000000..71ed025
--- /dev/null
+++ b/cuda/2d/astra.cu
@@ -0,0 +1,824 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "util.h"
+#include "par_fp.h"
+#include "fan_fp.h"
+#include "par_bp.h"
+#include "arith.h"
+#include "astra.h"
+
+#include "fft.h"
+
+#include <fstream>
+#include <cuda.h>
+
+#include "../../include/astra/Logger.h"
+
+using namespace astraCUDA;
+using namespace std;
+
+
+namespace astra {
+
+// Kind of projection geometry.
+// NOTE(review): presumably parallel-beam versus fan-beam, going by the
+// enumerator names; no use of this enum is visible in this part of the file.
+enum CUDAProjectionType {
+	PROJ_PARALLEL,
+	PROJ_FAN
+};
+
+
+// Private state of AstraFBP, reached through AstraFBP::pData.
+class AstraFBP_internal {
+public:
+	// Volume / projection sizes and sampling settings.
+	SDimensions dims;
+	// Host array of dims.iProjAngles projection angles (new[]-allocated by
+	// setProjectionGeometry, released in ~AstraFBP).
+	float* angles;
+	// Host array of dims.iProjAngles offsets (new[]-allocated by
+	// setTOffsets, released in ~AstraFBP); handed to BP() by run().
+	float* TOffsets;
+
+	// Pixel size given to setReconstructionGeometry; used to derive
+	// dims.fDetScale and to rescale uploaded sinograms.
+	float fPixelSize;
+
+	// Set by a successful init(); the setters refuse to run once it is true.
+	bool initialized;
+	// Cleared by the constructor and by setSinogram; it is not read anywhere
+	// in the portion of the file visible here.
+	bool setStartReconstruction;
+
+	// Device sinogram buffer and its pitch, both produced by allocateVolume.
+	float* D_sinoData;
+	unsigned int sinoPitch;
+
+	// Device volume buffer and its pitch, both produced by allocateVolume.
+	float* D_volumeData;
+	unsigned int volumePitch;
+
+	// Device-side filter installed by setFilter; NULL means run() skips
+	// the filtering step.
+	cufftComplex * m_pDevFilter;
+};
+
+// Creates the private state and puts every member into a defined "not yet
+// configured" state.
+//
+// All pointers that the destructor releases (angles, TOffsets, the device
+// buffers and the filter) are set to null explicitly rather than relying on
+// value-initialisation. fPixelSize defaults to 1.0f so that calling
+// setProjectionGeometry before setReconstructionGeometry does not divide by
+// zero when computing dims.fDetScale.
+AstraFBP::AstraFBP()
+{
+	pData = new AstraFBP_internal();
+
+	pData->angles = 0;
+	pData->TOffsets = 0;
+
+	pData->D_sinoData = 0;
+	pData->sinoPitch = 0;
+	pData->D_volumeData = 0;
+	pData->volumePitch = 0;
+
+	pData->dims.iVolWidth = 0;
+	pData->dims.iVolHeight = 0;
+	pData->dims.iProjAngles = 0;
+	pData->dims.iProjDets = 0;
+	pData->dims.fDetScale = 1.0f;
+	pData->dims.iRaysPerDet = 1;
+	pData->dims.iRaysPerPixelDim = 1;
+
+	pData->fPixelSize = 1.0f;
+
+	pData->initialized = false;
+	pData->setStartReconstruction = false;
+
+	pData->m_pDevFilter = NULL;
+}
+
+// Releases the host arrays, the device buffers, the device filter (if one
+// is installed) and finally the private state object itself.
+AstraFBP::~AstraFBP()
+{
+	// Host-side arrays.
+	delete[] pData->angles;
+	delete[] pData->TOffsets;
+	pData->angles = 0;
+	pData->TOffsets = 0;
+
+	// Device-side buffers.
+	cudaFree(pData->D_sinoData);
+	cudaFree(pData->D_volumeData);
+	pData->D_sinoData = 0;
+	pData->D_volumeData = 0;
+
+	// The filter is only freed when one was actually installed.
+	if (pData->m_pDevFilter)
+	{
+		freeComplexOnDevice(pData->m_pDevFilter);
+		pData->m_pDevFilter = NULL;
+	}
+
+	delete pData;
+	pData = 0;
+}
+
+// Stores the volume size and pixel size.
+//
+// Returns false without storing anything once init() has succeeded.
+// Otherwise the values are stored unconditionally and the return value
+// reports whether all three are strictly positive.
+bool AstraFBP::setReconstructionGeometry(unsigned int iVolWidth,
+                                          unsigned int iVolHeight,
+                                          float fPixelSize)
+{
+	if (pData->initialized)
+		return false;
+
+	pData->fPixelSize = fPixelSize;
+	pData->dims.iVolWidth = iVolWidth;
+	pData->dims.iVolHeight = iVolHeight;
+
+	if (iVolWidth == 0 || iVolHeight == 0)
+		return false;
+
+	return fPixelSize > 0.0f;
+}
+
+// Stores the projection geometry and a private copy of the angle array.
+//
+// iProjAngles  number of projection angles (must be non-zero)
+// iProjDets    number of detector elements (must be non-zero)
+// pfAngles     iProjAngles angles to copy (must be non-null)
+// fDetSize     detector size; stored as fDetSize / fPixelSize, so
+//              setReconstructionGeometry should be called first
+//
+// Returns false, leaving the stored geometry untouched, when init() has
+// already succeeded or when an argument is invalid. Arguments are validated
+// before anything is stored so that a rejected call cannot leave a non-zero
+// angle count paired with a null angle array.
+// A previously stored angle array is released, so the function may be
+// called repeatedly. Offsets set through setTOffsets are not resized here;
+// call setTOffsets again after changing the number of angles.
+bool AstraFBP::setProjectionGeometry(unsigned int iProjAngles,
+                                      unsigned int iProjDets,
+                                      const float* pfAngles,
+                                      float fDetSize)
+{
+	if (pData->initialized)
+		return false;
+
+	if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0)
+		return false;
+
+	// Copy first, then swap in, so the old array is never leaked.
+	float* pfNewAngles = new float[iProjAngles];
+	memcpy(pfNewAngles, pfAngles, iProjAngles * sizeof(pfAngles[0]));
+
+	delete[] pData->angles;
+	pData->angles = pfNewAngles;
+
+	pData->dims.iProjAngles = iProjAngles;
+	pData->dims.iProjDets = iProjDets;
+	pData->dims.fDetScale = fDetSize / pData->fPixelSize;
+
+	return true;
+}
+
+// Stores the pixel super-sampling factor in dims.iRaysPerPixelDim.
+// Returns false, storing nothing, after init() has succeeded or when the
+// factor is zero.
+bool AstraFBP::setPixelSuperSampling(unsigned int iPixelSuperSampling)
+{
+	const bool bRejected = pData->initialized || iPixelSuperSampling == 0;
+	if (bRejected)
+		return false;
+
+	pData->dims.iRaysPerPixelDim = iPixelSuperSampling;
+	return true;
+}
+
+
+// Stores a private copy of the per-angle offsets.
+//
+// pfTOffsets  array of dims.iProjAngles offsets (must be non-null)
+//
+// The array length is taken from dims.iProjAngles, so the projection
+// geometry has to be set first; the call is rejected while the angle count
+// is still zero. Also returns false after init() has succeeded.
+// A previously stored offset array is released, so the function may be
+// called repeatedly.
+bool AstraFBP::setTOffsets(const float* pfTOffsets)
+{
+	if (pData->initialized)
+		return false;
+
+	if (pfTOffsets == 0)
+		return false;
+
+	const unsigned int iCount = pData->dims.iProjAngles;
+	if (iCount == 0)
+		return false;
+
+	// Copy first, then swap in, so the old array is never leaked.
+	float* pfNewOffsets = new float[iCount];
+	memcpy(pfNewOffsets, pfTOffsets, iCount * sizeof(pfTOffsets[0]));
+
+	delete[] pData->TOffsets;
+	pData->TOffsets = pfNewOffsets;
+
+	return true;
+}
+
+// Selects the GPU and allocates the device volume and sinogram buffers.
+//
+// iGPUIndex  device index passed to cudaSetDevice
+//
+// Returns false when already initialised, when the geometry is incomplete
+// (any of the four sizes is zero or no angle array has been stored), when
+// the device cannot be selected, or when an allocation fails. On failure no
+// device memory stays allocated and the object remains uninitialised.
+bool AstraFBP::init(int iGPUIndex)
+{
+	if (pData->initialized)
+	{
+		return false;
+	}
+
+	// Both setReconstructionGeometry and setProjectionGeometry must have
+	// supplied usable values; run() hands pData->angles straight to BP().
+	if (pData->dims.iProjAngles == 0 || pData->dims.iProjDets == 0 ||
+	    pData->dims.iVolWidth == 0 || pData->dims.iVolHeight == 0 ||
+	    pData->angles == 0)
+	{
+		return false;
+	}
+
+	cudaSetDevice(iGPUIndex);
+	cudaError_t err = cudaGetLastError();
+
+	// Ignore errors caused by calling cudaSetDevice multiple times
+	if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+	{
+		return false;
+	}
+
+	// The volume is allocated with two extra columns and rows.
+	bool ok = allocateVolume(pData->D_volumeData, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2, pData->volumePitch);
+	if (!ok)
+	{
+		return false;
+	}
+
+	// The sinogram is allocated with two extra columns.
+	ok = allocateVolume(pData->D_sinoData, pData->dims.iProjDets+2, pData->dims.iProjAngles, pData->sinoPitch);
+	if (!ok)
+	{
+		cudaFree(pData->D_volumeData);
+		pData->D_volumeData = 0;
+		return false;
+	}
+
+	pData->initialized = true;
+
+	return true;
+}
+
+// Uploads a host sinogram to the device and rescales it for the pixel size.
+//
+// pfSinogram      host sinogram data (must be non-null)
+// iSinogramPitch  pitch of the host data, forwarded to copySinogramToDevice
+//
+// Returns false when init() has not succeeded, when pfSinogram is null, or
+// when the upload fails.
+bool AstraFBP::setSinogram(const float* pfSinogram,
+                            unsigned int iSinogramPitch)
+{
+	if (!pData->initialized || !pfSinogram)
+		return false;
+
+	if (!copySinogramToDevice(pfSinogram, iSinogramPitch,
+	                          pData->dims.iProjDets,
+	                          pData->dims.iProjAngles,
+	                          pData->D_sinoData, pData->sinoPitch))
+		return false;
+
+	// rescale sinogram to adjust for pixel size
+	const float fInvPixelArea = 1.0f/(pData->fPixelSize*pData->fPixelSize);
+	processVol<opMul,SINO>(pData->D_sinoData, fInvPixelArea,
+	                       pData->sinoPitch,
+	                       pData->dims.iProjDets, pData->dims.iProjAngles);
+
+	pData->setStartReconstruction = false;
+	return true;
+}
+
+// Returns the smallest power of two that is >= _iValue.
+//
+// Values <= 1 (including negative values) yield 1. When _iValue exceeds the
+// largest power of two an int can represent, that largest power is returned
+// instead: doubling past it would overflow a signed int, which is undefined
+// behaviour and in practice can keep the loop from terminating.
+static int calcNextPowerOfTwo(int _iValue)
+{
+	// Largest power of two representable in an int (sign bit excluded).
+	const int iMaxPowerOfTwo = 1 << (sizeof(int) * 8 - 2);
+
+	int iOutput = 1;
+
+	while(iOutput < _iValue && iOutput < iMaxPowerOfTwo)
+	{
+		iOutput *= 2;
+	}
+
+	return iOutput;
+}
+
+// Performs the reconstruction into the device volume buffer.
+//
+// Steps: clear the volume, filter the sinogram in place when a filter has
+// been installed (forward FFT, applyFilter, inverse FFT), backproject with
+// BP(), then scale the volume by (pi/2) / number of angles.
+//
+// Returns false when init() has not succeeded or when BP() reports failure.
+// The results of the allocation and FFT helpers are not inspected.
+bool AstraFBP::run()
+{
+	if (!pData->initialized)
+		return false;
+
+	zeroVolume(pData->D_volumeData, pData->volumePitch, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2);
+
+	if (pData->m_pDevFilter != NULL) {
+		// Same size computation as in setFilter(), which sized the filter.
+		const int iPaddedDetCount = calcNextPowerOfTwo(2 * pData->dims.iProjDets);
+		const int iFreqBinCount = calcFFTFourSize(iPaddedDetCount);
+
+		cufftComplex * pDevSpectrum = NULL;
+		allocateComplexOnDevice(pData->dims.iProjAngles, iFreqBinCount, &pDevSpectrum);
+
+		runCudaFFT(pData->dims.iProjAngles, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iPaddedDetCount, iFreqBinCount, pDevSpectrum);
+		applyFilter(pData->dims.iProjAngles, iFreqBinCount, pDevSpectrum, pData->m_pDevFilter);
+		runCudaIFFT(pData->dims.iProjAngles, pDevSpectrum, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iPaddedDetCount, iFreqBinCount);
+
+		freeComplexOnDevice(pDevSpectrum);
+	}
+
+	if (!BP(pData->D_volumeData, pData->volumePitch, pData->D_sinoData, pData->sinoPitch, pData->dims, pData->angles, pData->TOffsets))
+		return false;
+
+	const float fScale = (M_PI / 2.0f) / (float)pData->dims.iProjAngles;
+	processVol<opMul,VOL>(pData->D_volumeData, fScale,
+	                      pData->volumePitch,
+	                      pData->dims.iVolWidth, pData->dims.iVolHeight);
+
+	return true;
+}
+
+// Copies the reconstructed volume from the device to host memory.
+//
+// pfReconstruction      destination host buffer (must be non-null)
+// iReconstructionPitch  pitch of the host buffer, forwarded to
+//                       copyVolumeFromDevice
+//
+// Returns false when init() has not succeeded, when pfReconstruction is
+// null (mirroring the check setSinogram performs on its input), or when the
+// copy fails.
+bool AstraFBP::getReconstruction(float* pfReconstruction, unsigned int iReconstructionPitch) const
+{
+	if (!pData->initialized)
+		return false;
+	if (!pfReconstruction)
+		return false;
+
+	bool ok = copyVolumeFromDevice(pfReconstruction, iReconstructionPitch,
+	                               pData->dims.iVolWidth,
+	                               pData->dims.iVolHeight,
+	                               pData->D_volumeData, pData->volumePitch);
+	if (!ok)
+		return false;
+
+	return true;
+}
+
+// Returns the filter width that setFilter() expects for FILTER_PROJECTION /
+// FILTER_SINOGRAM, given a detector count: calcFFTFourSize applied to the
+// next power of two >= twice the detector count.
+int AstraFBP::calcFourierFilterSize(int _iDetectorCount)
+{
+	// CHECKME: Matlab makes this at least 64. Do we also need to?
+	const int iPaddedDetCount = calcNextPowerOfTwo(2 * _iDetectorCount);
+	return calcFFTFourSize(iPaddedDetCount);
+}
+
+// Installs the filter that run() applies to the sinogram.
+//
+// _eFilter           filter kind; FILTER_NONE removes any installed filter
+// _pfHostFilter      host filter data, required (non-null) for
+//                    FILTER_PROJECTION, FILTER_SINOGRAM, FILTER_RPROJECTION
+//                    and FILTER_RSINOGRAM; unused otherwise
+// _iFilterWidth      width of one row of _pfHostFilter; for
+//                    FILTER_PROJECTION / FILTER_SINOGRAM it must equal the
+//                    value returned by calcFourierFilterSize()
+// _fD, _fFilterParameter  forwarded to genFilter for the generated kinds
+//
+// The filter is sized from dims.iProjDets and dims.iProjAngles as they are
+// at the time of the call, so the projection geometry should be set first.
+//
+// Returns true on success. On any failure (null filter data, wrong width,
+// device allocation or copy failure, unknown filter kind) the function
+// returns false and leaves no filter installed, so run() never applies a
+// partially built filter.
+// NOTE(review): the results of allocateComplexOnDevice, genFilter,
+// uploadComplexArrayToDevice and runCudaFFT are not inspected here; their
+// return types are not visible from this file.
+bool AstraFBP::setFilter(E_FBPFILTER _eFilter, const float * _pfHostFilter /* = NULL */, int _iFilterWidth /* = 0 */, float _fD /* = 1.0f */, float _fFilterParameter /* = -1.0f */)
+{
+	// Always discard the previously installed filter first.
+	if(pData->m_pDevFilter != 0)
+	{
+		freeComplexOnDevice(pData->m_pDevFilter);
+		pData->m_pDevFilter = 0;
+	}
+
+	if (_eFilter == FILTER_NONE)
+		return true; // leave pData->m_pDevFilter set to 0
+
+	// The user-supplied kinds read from _pfHostFilter; refuse a null pointer
+	// before anything is allocated.
+	const bool bUserSupplied = (_eFilter == FILTER_PROJECTION || _eFilter == FILTER_SINOGRAM ||
+	                            _eFilter == FILTER_RPROJECTION || _eFilter == FILTER_RSINOGRAM);
+	if (bUserSupplied && _pfHostFilter == NULL)
+		return false;
+
+	int iFFTRealDetCount = calcNextPowerOfTwo(2 * pData->dims.iProjDets);
+	int iFreqBinCount = calcFFTFourSize(iFFTRealDetCount);
+	int iProjectionCount = pData->dims.iProjAngles;
+
+	cufftComplex * pHostFilter = new cufftComplex[pData->dims.iProjAngles * iFreqBinCount];
+	memset(pHostFilter, 0, sizeof(cufftComplex) * pData->dims.iProjAngles * iFreqBinCount);
+
+	allocateComplexOnDevice(pData->dims.iProjAngles, iFreqBinCount, &(pData->m_pDevFilter));
+
+	bool bOk = true;
+
+	switch(_eFilter)
+	{
+		case FILTER_NONE:
+			// handled above
+			break;
+		case FILTER_RAMLAK:
+		case FILTER_SHEPPLOGAN:
+		case FILTER_COSINE:
+		case FILTER_HAMMING:
+		case FILTER_HANN:
+		case FILTER_TUKEY:
+		case FILTER_LANCZOS:
+		case FILTER_TRIANGULAR:
+		case FILTER_GAUSSIAN:
+		case FILTER_BARTLETTHANN:
+		case FILTER_BLACKMAN:
+		case FILTER_NUTTALL:
+		case FILTER_BLACKMANHARRIS:
+		case FILTER_BLACKMANNUTTALL:
+		case FILTER_FLATTOP:
+		{
+			// Generated filters: built on the host, then uploaded.
+			genFilter(_eFilter, _fD, pData->dims.iProjAngles, pHostFilter, iFFTRealDetCount, iFreqBinCount, _fFilterParameter);
+			uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter);
+
+			break;
+		}
+		case FILTER_PROJECTION:
+		case FILTER_SINOGRAM:
+		{
+			// make sure the offered filter has the correct size
+			assert(_iFilterWidth == iFreqBinCount);
+			// The assert vanishes in release builds; a wrong width would
+			// make the loop below read outside the caller's buffer.
+			if (_iFilterWidth != iFreqBinCount)
+			{
+				bOk = false;
+				break;
+			}
+
+			// FILTER_PROJECTION reuses one row for every projection;
+			// FILTER_SINOGRAM supplies a separate row per projection.
+			const int iSourceRowStride = (_eFilter == FILTER_SINOGRAM) ? _iFilterWidth : 0;
+
+			for(int iFreqBinIndex = 0; iFreqBinIndex < iFreqBinCount; iFreqBinIndex++)
+			{
+				for(int iProjectionIndex = 0; iProjectionIndex < iProjectionCount; iProjectionIndex++)
+				{
+					float fValue = _pfHostFilter[iFreqBinIndex + iProjectionIndex * iSourceRowStride];
+
+					pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].x = fValue;
+					pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].y = 0.0f;
+				}
+			}
+			uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter);
+			break;
+		}
+		case FILTER_RPROJECTION:
+		case FILTER_RSINOGRAM:
+		{
+			// Real-space filters: lay the filter out in a zero-padded real
+			// buffer, upload it, and transform it on the device.
+			int iRealFilterElementCount = iProjectionCount * iFFTRealDetCount;
+			float* pfHostRealFilter = new float[iRealFilterElementCount];
+			memset(pfHostRealFilter, 0, sizeof(float) * iRealFilterElementCount);
+
+			// Use the central part of the filter if it is wider than the
+			// padded detector count.
+			int iUsedFilterWidth = min(_iFilterWidth, iFFTRealDetCount);
+			int iStartFilterIndex = (_iFilterWidth - iUsedFilterWidth) / 2;
+			int iMaxFilterIndex = iStartFilterIndex + iUsedFilterWidth;
+
+			int iFilterShiftSize = _iFilterWidth / 2;
+
+			// FILTER_RPROJECTION reuses one row for every projection;
+			// FILTER_RSINOGRAM supplies a separate row per projection.
+			const int iSourceRowStride = (_eFilter == FILTER_RSINOGRAM) ? _iFilterWidth : 0;
+
+			for(int iDetectorIndex = iStartFilterIndex; iDetectorIndex < iMaxFilterIndex; iDetectorIndex++)
+			{
+				// Shift by half the filter width, wrapping around the
+				// padded length.
+				int iFFTInFilterIndex = (iDetectorIndex + iFFTRealDetCount - iFilterShiftSize) % iFFTRealDetCount;
+
+				for(int iProjectionIndex = 0; iProjectionIndex < iProjectionCount; iProjectionIndex++)
+				{
+					float fValue = _pfHostFilter[iDetectorIndex + iProjectionIndex * iSourceRowStride];
+					pfHostRealFilter[iFFTInFilterIndex + iProjectionIndex * iFFTRealDetCount] = fValue;
+				}
+			}
+
+			float* pfDevRealFilter = NULL;
+			if (cudaMalloc((void **)&pfDevRealFilter, sizeof(float) * iRealFilterElementCount) != cudaSuccess)
+			{
+				delete[] pfHostRealFilter;
+				bOk = false;
+				break;
+			}
+
+			cudaError_t copyErr = cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice);
+			delete[] pfHostRealFilter;
+			if (copyErr != cudaSuccess)
+			{
+				cudaFree(pfDevRealFilter);
+				bOk = false;
+				break;
+			}
+
+			runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, 0, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter);
+
+			cudaFree(pfDevRealFilter);
+
+			break;
+		}
+		default:
+		{
+			fprintf(stderr, "AstraFBP::setFilter: Weird filter type requested");
+			bOk = false;
+			break;
+		}
+	}
+
+	delete [] pHostFilter;
+
+	// Never leave a filter installed whose contents were not fully written.
+	if (!bOk && pData->m_pDevFilter != 0)
+	{
+		freeComplexOnDevice(pData->m_pDevFilter);
+		pData->m_pDevFilter = 0;
+	}
+
+	return bOk;
+}
+
+// Constructor; the body is intentionally empty.
+BPalgo::BPalgo()
+{
+
+}
+
+// Destructor; the body is intentionally empty.
+BPalgo::~BPalgo()
+{
+
+}
+
+// Nothing to set up for this algorithm; always returns true.
+bool BPalgo::init()
+{
+	return true;
+}
+
+// Clears the volume buffer and then calls callBP on the volume and sinogram
+// buffers. The iteration-count argument is unnamed and ignored.
+// Always returns true; the result of callBP is not inspected.
+bool BPalgo::iterate(unsigned int)
+{
+	// TODO: This zeroVolume makes an earlier memcpy of D_volumeData redundant
+	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	callBP(D_volumeData, volumePitch, D_sinoData, sinoPitch);
+	return true;
+}
+
+// Returns the square root of dotProduct2D over a temporary buffer that
+// starts as a copy of the sinogram and is then passed to callFP together
+// with the current volume and a factor of -1.0f.
+// NOTE(review): presumably this is the norm of (sinogram - FP(volume));
+// confirm against callFP and dotProduct2D.
+//
+// Returns -1.0f (a value no norm can take) when the temporary buffer cannot
+// be allocated, instead of continuing with an uninitialised device pointer.
+float BPalgo::computeDiffNorm()
+{
+	float *D_projData = 0;
+	unsigned int projPitch = 0;
+
+	if (!allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch))
+		return -1.0f;
+
+	// Pitches are expressed in floats, hence the sizeof(float) factors.
+	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+	callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
+
+	float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+
+	cudaFree(D_projData);
+
+	return sqrt(s);
+}
+
+
+// One-shot forward projection: uploads a host volume, runs FP() on the
+// selected GPU, and downloads the resulting sinogram.
+//
+// pfVolume           host volume, iVolWidth x iVolHeight, row pitch
+//                    iVolWidth (must be non-null)
+// pfSinogram         host output, iProjDets x iProjAngles, row pitch
+//                    iProjDets (must be non-null)
+// iVolWidth/Height   volume size (both non-zero)
+// iProjAngles/Dets   sinogram size (both non-zero)
+// pfAngles           iProjAngles projection angles (must be non-null)
+// pfOffsets          forwarded unchanged to FP(); not validated here
+// fDetSize           stored as dims.fDetScale
+// iDetSuperSampling  rays per detector (non-zero)
+// iGPUIndex          device index passed to cudaSetDevice
+//
+// Returns true on success. Returns false on invalid arguments, when the
+// device cannot be selected, or when an allocation, copy or projection step
+// fails. All device memory allocated here is released on every path.
+bool astraCudaFP(const float* pfVolume, float* pfSinogram,
+                 unsigned int iVolWidth, unsigned int iVolHeight,
+                 unsigned int iProjAngles, unsigned int iProjDets,
+                 const float *pfAngles, const float *pfOffsets,
+                 float fDetSize, unsigned int iDetSuperSampling,
+                 int iGPUIndex)
+{
+	// Reject bad arguments before touching the device.
+	if (pfVolume == 0 || pfSinogram == 0)
+		return false;
+
+	if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0)
+		return false;
+
+	if (iDetSuperSampling == 0)
+		return false;
+
+	if (iVolWidth == 0 || iVolHeight == 0)
+		return false;
+
+	SDimensions dims;
+
+	dims.iProjAngles = iProjAngles;
+	dims.iProjDets = iProjDets;
+	dims.fDetScale = fDetSize;
+	dims.iRaysPerDet = iDetSuperSampling;
+	// Given a defined value for consistency with the AstraFBP constructor.
+	dims.iRaysPerPixelDim = 1;
+	dims.iVolWidth = iVolWidth;
+	dims.iVolHeight = iVolHeight;
+
+	cudaSetDevice(iGPUIndex);
+	cudaError_t err = cudaGetLastError();
+
+	// Ignore errors caused by calling cudaSetDevice multiple times
+	if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+		return false;
+
+	float* D_volumeData = 0;
+	unsigned int volumePitch = 0;
+
+	bool ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	if (!ok)
+		return false;
+
+	float* D_sinoData = 0;
+	unsigned int sinoPitch = 0;
+
+	ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+	if (!ok) {
+		cudaFree(D_volumeData);
+		return false;
+	}
+
+	// From here on both buffers exist; every step runs only if the previous
+	// one succeeded, and a single cleanup at the end covers all outcomes.
+	ok = copyVolumeToDevice(pfVolume, dims.iVolWidth,
+	                        dims.iVolWidth, dims.iVolHeight,
+	                        D_volumeData, volumePitch);
+
+	if (ok) {
+		zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+		ok = FP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pfAngles, pfOffsets, 1.0f);
+	}
+
+	if (ok) {
+		ok = copySinogramFromDevice(pfSinogram, dims.iProjDets,
+		                            dims.iProjDets,
+		                            dims.iProjAngles,
+		                            D_sinoData, sinoPitch);
+	}
+
+	cudaFree(D_volumeData);
+	cudaFree(D_sinoData);
+	return ok;
+}
+
+// Single fan-beam forward projection, host memory in and host memory out,
+// with the geometry given as one angle per projection plus source/detector
+// distances.
+//
+// For every angle an SFanProjection is built by rotating a reference
+// geometry (source at (0, -fOriginSourceDistance), detector row starting at
+// (-iProjDets*fDetSize/2, fOriginDetectorDistance) with step (fDetSize, 0)),
+// where all lengths are first divided by fPixelSize.
+//
+// Returns false when an argument is rejected (zero sizes, null pfAngles,
+// zero supersampling), when selecting the device fails, or when any of the
+// allocate/copy/project steps reports failure. Device buffers that were
+// successfully allocated are freed before returning.
+//
+// NOTE(review): fPixelSize == 0 is not rejected and would yield non-finite
+// geometry; confirm whether callers can pass it.
+bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
+                    unsigned int iVolWidth, unsigned int iVolHeight,
+                    unsigned int iProjAngles, unsigned int iProjDets,
+                    const float *pfAngles, float fOriginSourceDistance,
+                    float fOriginDetectorDistance, float fPixelSize,
+                    float fDetSize,
+                    unsigned int iDetSuperSampling,
+                    int iGPUIndex)
+{
+	SDimensions dims;
+
+	if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0)
+		return false;
+
+	dims.iProjAngles = iProjAngles;
+	dims.iProjDets = iProjDets;
+	// The detector size is encoded in the per-projection DetU vectors below,
+	// so use a neutral scale here (same value as the SFanProjection overload)
+	// instead of leaving the field uninitialized.
+	dims.fDetScale = 1.0f;
+
+	if (iDetSuperSampling == 0)
+		return false;
+
+	dims.iRaysPerDet = iDetSuperSampling;
+
+	if (iVolWidth <= 0 || iVolHeight <= 0)
+		return false;
+
+	dims.iVolWidth = iVolWidth;
+	dims.iVolHeight = iVolHeight;
+
+	cudaSetDevice(iGPUIndex);
+	cudaError_t err = cudaGetLastError();
+
+	// Ignore errors caused by calling cudaSetDevice multiple times
+	if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+		return false;
+
+
+	bool ok;
+
+	float* D_volumeData;
+	unsigned int volumePitch;
+
+	ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	if (!ok)
+		return false;
+
+	float* D_sinoData;
+	unsigned int sinoPitch;
+
+	ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+	if (!ok) {
+		cudaFree(D_volumeData);
+		return false;
+	}
+
+	ok = copyVolumeToDevice(pfVolume, dims.iVolWidth,
+	                        dims.iVolWidth, dims.iVolHeight,
+	                        D_volumeData, volumePitch);
+
+	if (ok) {
+		zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+
+		// TODO: Turn this geometry conversion into a util function
+		SFanProjection* projs = new SFanProjection[dims.iProjAngles];
+
+		// Reference geometry at angle 0, in units of volume pixels.
+		float fSrcX0 = 0.0f;
+		float fSrcY0 = -fOriginSourceDistance / fPixelSize;
+		float fDetUX0 = fDetSize / fPixelSize;
+		float fDetUY0 = 0.0f;
+		float fDetSX0 = dims.iProjDets * fDetUX0 / -2.0f;
+		float fDetSY0 = fOriginDetectorDistance / fPixelSize;
+
+		// Rotate the reference vector f<name>{X,Y}0 by alpha into projs[i].
+#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = f##name##X0 * cos(alpha) - f##name##Y0 * sin(alpha); projs[i].f##name##Y = f##name##X0 * sin(alpha) + f##name##Y0 * cos(alpha); } while(0)
+		// Unsigned index: iProjAngles is unsigned, avoid a mixed-sign compare.
+		for (unsigned int i = 0; i < iProjAngles; ++i) {
+			ROTATE0(Src, i, pfAngles[i]);
+			ROTATE0(DetS, i, pfAngles[i]);
+			ROTATE0(DetU, i, pfAngles[i]);
+		}
+
+#undef ROTATE0
+
+		ok = FanFP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, projs, 1.0f);
+		delete[] projs;
+	}
+
+	if (ok) {
+		ok = copySinogramFromDevice(pfSinogram, dims.iProjDets,
+		                            dims.iProjDets,
+		                            dims.iProjAngles,
+		                            D_sinoData, sinoPitch);
+	}
+
+	// Single release point for both device buffers, on success and failure.
+	cudaFree(D_volumeData);
+	cudaFree(D_sinoData);
+
+	return ok;
+
+}
+
+
+// Single fan-beam forward projection, host memory in and host memory out,
+// with the geometry supplied directly as an array of SFanProjection
+// (pAngles is handed to FanFP() unchanged).
+//
+// Returns false when an argument is rejected (zero sizes, null pAngles,
+// zero supersampling), when selecting the device fails, or when any of the
+// allocate/copy/project steps reports failure. Device buffers that were
+// successfully allocated are freed before returning.
+bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
+                    unsigned int iVolWidth, unsigned int iVolHeight,
+                    unsigned int iProjAngles, unsigned int iProjDets,
+                    const SFanProjection *pAngles,
+                    unsigned int iDetSuperSampling,
+                    int iGPUIndex)
+{
+	// Argument checks first; nothing below touches the GPU until they pass.
+	if (iProjAngles == 0 || iProjDets == 0 || pAngles == 0)
+		return false;
+	if (iDetSuperSampling == 0)
+		return false;
+	if (iVolWidth <= 0 || iVolHeight <= 0)
+		return false;
+
+	SDimensions dims;
+	dims.iProjAngles = iProjAngles;
+	dims.iProjDets = iProjDets;
+	dims.fDetScale = 1.0f; // TODO?
+	dims.iRaysPerDet = iDetSuperSampling;
+	dims.iVolWidth = iVolWidth;
+	dims.iVolHeight = iVolHeight;
+
+	cudaSetDevice(iGPUIndex);
+	const cudaError_t setDeviceErr = cudaGetLastError();
+
+	// Ignore errors caused by calling cudaSetDevice multiple times
+	if (setDeviceErr != cudaSuccess && setDeviceErr != cudaErrorSetOnActiveProcess)
+		return false;
+
+	float* D_volumeData;
+	unsigned int volumePitch;
+	if (!allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch))
+		return false;
+
+	float* D_sinoData;
+	unsigned int sinoPitch;
+	if (!allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch)) {
+		cudaFree(D_volumeData);
+		return false;
+	}
+
+	// Upload -> project -> download. Each stage only runs when the one
+	// before it succeeded; the buffers are released on every path.
+	bool ok = copyVolumeToDevice(pfVolume, dims.iVolWidth,
+	                             dims.iVolWidth, dims.iVolHeight,
+	                             D_volumeData, volumePitch);
+
+	if (ok) {
+		zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+		ok = FanFP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pAngles, 1.0f);
+	}
+
+	if (ok) {
+		ok = copySinogramFromDevice(pfSinogram, dims.iProjDets,
+		                            dims.iProjDets,
+		                            dims.iProjAngles,
+		                            D_sinoData, sinoPitch);
+	}
+
+	cudaFree(D_volumeData);
+	cudaFree(D_sinoData);
+
+	return ok;
+
+}
+
+
+}
diff --git a/cuda/2d/astra.h b/cuda/2d/astra.h
new file mode 100644
index 0000000..9e58301
--- /dev/null
+++ b/cuda/2d/astra.h
@@ -0,0 +1,205 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_ASTRA_H
+#define _CUDA_ASTRA_H
+
+#include "fft.h"
+#include "fbp_filters.h"
+#include "dims.h"
+#include "algo.h"
+
+using astraCUDA::SFanProjection;
+
+namespace astra {
+
+// Identifies a 2D projection kernel. ker2d_default is the only value
+// defined here.
+enum Cuda2DProjectionKernel {
+	ker2d_default = 0
+};
+
+class AstraFBP_internal;
+
+// Host-side interface to a GPU FBP reconstruction.
+//
+// Call order, as documented on the individual methods below:
+//   setReconstructionGeometry(), setProjectionGeometry(),
+//   [setPixelSuperSampling()], init(), setSinogram(), run(),
+//   getReconstruction().
+// All implementation state lives behind the opaque pData pointer.
+class _AstraExport AstraFBP {
+public:
+	// Constructor
+	AstraFBP();
+
+	// Destructor
+	~AstraFBP();
+
+	// Set the size of the reconstruction rectangle.
+	// Volume pixels are currently assumed to be 1x1 squares.
+	// NOTE(review): the fPixelSize parameter suggests other pixel sizes are
+	// accepted - confirm against the implementation.
+	bool setReconstructionGeometry(unsigned int iVolWidth,
+	                               unsigned int iVolHeight,
+	                               float fPixelSize = 1.0f);
+
+	// Set the projection angles and number of detector pixels per angle.
+	// pfAngles must be a float array of length iProjAngles.
+	// fDetSize indicates the size of a detector pixel compared to a
+	// volume pixel edge.
+	//
+	// pfAngles will only be read from during this call.
+	bool setProjectionGeometry(unsigned int iProjAngles,
+	                           unsigned int iProjDets,
+	                           const float *pfAngles,
+	                           float fDetSize = 1.0f);
+
+	// Set linear supersampling factor for the BP.
+	// (The number of rays is the square of this)
+	//
+	// This may optionally be called before init().
+	bool setPixelSuperSampling(unsigned int iPixelSuperSampling);
+
+	// Set per-detector shifts.
+	//
+	// pfTOffsets will only be read from during this call.
+	bool setTOffsets(const float *pfTOffsets);
+
+	// Returns the required size of a filter in the fourier domain
+	// when multiplying it with the fft of the projection data.
+	// Its value is equal to the smallest power of two larger than
+	// or equal to twice the number of detectors in the spatial domain.
+	//
+	// _iDetectorCount is the number of detectors in the spatial domain.
+	static int calcFourierFilterSize(int _iDetectorCount);
+
+	// Sets the filter type. Some filter types require the user to supply an
+	// array containing the filter.
+	// The number of elements in a filter in the fourier domain should be equal
+	// to the value returned by calcFourierFilterSize().
+	// The following types require a filter:
+	//
+	// - FILTER_PROJECTION:
+	// The filter size should be equal to the output of
+	// calcFourierFilterSize(). The filtered sinogram is
+	// multiplied with the supplied filter.
+	//
+	// - FILTER_SINOGRAM:
+	// Same as FILTER_PROJECTION, but now the filter should contain a row for
+	// every projection direction.
+	//
+	// - FILTER_RPROJECTION:
+	// The filter should now contain one kernel (= ifft of filter), with the
+	// peak in the center. The filter width
+	// can be any value. If odd, the peak is assumed to be in the center, if
+	// even, it is assumed to be at floor(filter-width/2).
+	//
+	// - FILTER_RSINOGRAM
+	// Same as FILTER_RPROJECTION, but now the supplied filter should contain a
+	// row for every projection direction.
+	//
+	// A large number of other filters (FILTER_RAMLAK, FILTER_SHEPPLOGAN,
+	// FILTER_COSINE, FILTER_HAMMING, and FILTER_HANN)
+	// have a D variable (the _fD argument), which gives the cutoff point in
+	// the frequency domain.
+	// Setting this value to 1.0 will include the whole filter
+	//
+	// NOTE(review): the meaning of _fFilterParameter is not documented here;
+	// presumably an extra per-filter-type parameter - confirm against the
+	// implementation.
+	bool setFilter(E_FBPFILTER _eFilter,
+                   const float * _pfHostFilter = NULL,
+                   int _iFilterWidth = 0, float _fD = 1.0f, float _fFilterParameter = -1.0f);
+
+	// Initialize CUDA, allocate GPU buffers and
+	// precompute geometry-specific data.
+	//
+	// CUDA is set up to use GPU number iGPUIndex.
+	//
+	// This must be called after calling setReconstructionGeometry() and
+	// setProjectionGeometry().
+	bool init(int iGPUIndex = 0);
+
+	// Setup input sinogram for a slice.
+	// pfSinogram must be a float array of size iProjAngles*iSinogramPitch.
+	// NB: iSinogramPitch is measured in floats, not in bytes.
+	//
+	// This must be called after init(), and before run(). It may be
+	// called again after run()/getReconstruction() to start a new slice.
+	//
+	// pfSinogram will only be read from during this call.
+	bool setSinogram(const float* pfSinogram, unsigned int iSinogramPitch);
+
+	// Runs an FBP reconstruction.
+	// This must be called after setSinogram().
+	//
+	// run can be called before setFilter, but will then use the default Ram-Lak filter
+	bool run();
+
+	// Get the reconstructed slice.
+	// pfReconstruction must be a float array of size
+	// iVolHeight*iReconstructionPitch.
+	// NB: iReconstructionPitch is measured in floats, not in bytes.
+	//
+	// This may be called after run().
+	bool getReconstruction(float* pfReconstruction,
+	                       unsigned int iReconstructionPitch) const;
+
+private:
+	// Opaque implementation state (defined in the .cu file).
+	AstraFBP_internal* pData;
+};
+
+// Plain (unfiltered) backprojection exposed through the ReconAlgo
+// interface.
+class _AstraExport BPalgo : public astraCUDA::ReconAlgo {
+public:
+	BPalgo();
+	~BPalgo();
+
+	// No algorithm-specific set-up is needed; always returns true.
+	virtual bool init();
+
+	// Zeroes the volume buffer and backprojects the sinogram into it once.
+	// The iterations argument is ignored.
+	virtual bool iterate(unsigned int iterations);
+
+	// Norm of (sinogram - forward projection of the current volume).
+	virtual float computeDiffNorm();
+};
+
+
+
+
+// TODO: Clean up this interface to FP
+
+// Do a single forward projection
+//
+// pfVolume is the input volume and pfSinogram receives the result; both are
+// host arrays, copied using a row length of iVolWidth resp. iProjDets floats.
+// pfAngles must be non-null; pfOffsets is forwarded to the projector as-is.
+// Returns false if iVolWidth, iVolHeight, iProjAngles, iProjDets or
+// iDetSuperSampling is zero, or if any GPU step fails.
+_AstraExport bool astraCudaFP(const float* pfVolume, float* pfSinogram,
+                 unsigned int iVolWidth, unsigned int iVolHeight,
+                 unsigned int iProjAngles, unsigned int iProjDets,
+                 const float *pfAngles, const float *pfOffsets,
+                 float fDetSize = 1.0f, unsigned int iDetSuperSampling = 1,
+                 int iGPUIndex = 0);
+
+// Do a single forward projection, fan beam
+//
+// The geometry is given as one angle per projection (pfAngles, non-null)
+// plus the origin-source and origin-detector distances. These distances and
+// fDetSize are divided by fPixelSize before use.
+// Returns false if iVolWidth, iVolHeight, iProjAngles, iProjDets or
+// iDetSuperSampling is zero, or if any GPU step fails.
+_AstraExport bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
+                    unsigned int iVolWidth, unsigned int iVolHeight,
+                    unsigned int iProjAngles, unsigned int iProjDets,
+                    const float *pfAngles, float fOriginSourceDistance,
+                    float fOriginDetectorDistance, float fPixelSize = 1.0f,
+                    float fDetSize = 1.0f,
+                    unsigned int iDetSuperSampling = 1,
+                    int iGPUIndex = 0);
+
+// Do a single forward projection, fan beam, with the geometry given
+// directly as an array of iProjAngles SFanProjection entries (pAngles,
+// non-null).
+// Returns false if iVolWidth, iVolHeight, iProjAngles, iProjDets or
+// iDetSuperSampling is zero, or if any GPU step fails.
+_AstraExport bool astraCudaFanFP(const float* pfVolume, float* pfSinogram,
+                    unsigned int iVolWidth, unsigned int iVolHeight,
+                    unsigned int iProjAngles, unsigned int iProjDets,
+                    const SFanProjection *pAngles,
+                    unsigned int iDetSuperSampling = 1,
+                    int iGPUIndex = 0);
+
+}
+#endif
diff --git a/cuda/2d/cgls.cu b/cuda/2d/cgls.cu
new file mode 100644
index 0000000..5b1cf46
--- /dev/null
+++ b/cuda/2d/cgls.cu
@@ -0,0 +1,304 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "cgls.h"
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+namespace astraCUDA {
+
+// Start out with no slice set up and no work buffers allocated; the
+// buffers are created later by init(). (gamma is deliberately left
+// unset here, exactly as before.)
+CGLS::CGLS()
+	: ReconAlgo(),
+	  sliceInitialized(false),
+	  D_r(0),
+	  D_w(0),
+	  D_z(0),
+	  D_p(0)
+{
+}
+
+
+// Releases the work buffers (and the base-class state) through reset().
+CGLS::~CGLS()
+{
+	this->reset();
+}
+
+// Free the four CGLS work buffers, null their pointers so a repeated
+// reset() is harmless, then reset the base class.
+void CGLS::reset()
+{
+	cudaFree(D_z);
+	D_z = 0;
+
+	cudaFree(D_p);
+	D_p = 0;
+
+	cudaFree(D_r);
+	D_r = 0;
+
+	cudaFree(D_w);
+	D_w = 0;
+
+	ReconAlgo::reset();
+}
+
+// Allocate the four CGLS work buffers: z and p have the (padded) volume
+// size, r and w the (padded) sinogram size.
+//
+// Returns false as soon as one allocation fails; the remaining buffers are
+// then not allocated. Buffers that were allocated are released by reset(),
+// which the destructor calls.
+bool CGLS::init()
+{
+	// Lifetime of z: within an iteration
+	if (!allocateVolume(D_z, dims.iVolWidth+2, dims.iVolHeight+2, zPitch))
+		return false;
+
+	// Lifetime of p: full algorithm
+	if (!allocateVolume(D_p, dims.iVolWidth+2, dims.iVolHeight+2, pPitch))
+		return false;
+
+	// Lifetime of r: full algorithm
+	if (!allocateVolume(D_r, dims.iProjDets+2, dims.iProjAngles, rPitch))
+		return false;
+
+	// Lifetime of w: within an iteration
+	if (!allocateVolume(D_w, dims.iProjDets+2, dims.iProjAngles, wPitch))
+		return false;
+
+	return true;
+}
+
+
+// Forward the externally owned buffers to ReconAlgo and, if that succeeds,
+// mark the slice as not yet initialized so the next iterate() redoes its
+// set-up step.
+bool CGLS::setBuffers(float* _D_volumeData, unsigned int _volumePitch,
+                      float* _D_projData, unsigned int _projPitch)
+{
+	if (!ReconAlgo::setBuffers(_D_volumeData, _volumePitch,
+	                           _D_projData, _projPitch))
+		return false;
+
+	sliceInitialized = false;
+	return true;
+}
+
+// Mark the slice as not yet initialized (new data invalidates the CGLS
+// state), then let ReconAlgo perform the actual upload and return its
+// result.
+bool CGLS::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale,
+                         const float* pfReconstruction, unsigned int iReconstructionPitch,
+                         const float* pfVolMask, unsigned int iVolMaskPitch,
+                         const float* pfSinoMask, unsigned int iSinoMaskPitch)
+{
+	sliceInitialized = false;
+
+	const bool uploaded = ReconAlgo::copyDataToGPU(pfSinogram, iSinogramPitch, fSinogramScale,
+	                                               pfReconstruction, iReconstructionPitch,
+	                                               pfVolMask, iVolMaskPitch,
+	                                               pfSinoMask, iSinoMaskPitch);
+	return uploaded;
+}
+
+// Run up to `iterations` CGLS iterations on the current volume/sinogram
+// buffers.
+//
+// On the first call after new data was set (sliceInitialized == false) the
+// residual r, the search direction p and gamma = <p,p> are set up first.
+// The loop stops early when shouldAbort becomes true, or when <w,w> or
+// gamma is exactly zero: in that case the step sizes gamma/<w,w> and
+// 1/gamma are undefined and continuing would write NaN/Inf into the volume.
+// Always returns true.
+bool CGLS::iterate(unsigned int iterations)
+{
+	shouldAbort = false;
+
+	if (!sliceInitialized) {
+
+		// copy sinogram
+		cudaMemcpy2D(D_r, sizeof(float)*rPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+
+		// r = sino - A*x
+		if (useVolumeMask) {
+			// Use z as temporary storage here since it is unused
+			cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
+			processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
+			callFP(D_z, zPitch, D_r, rPitch, -1.0f);
+		} else {
+			callFP(D_volumeData, volumePitch, D_r, rPitch, -1.0f);
+		}
+
+
+		// p = A'*r
+		zeroVolume(D_p, pPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+		callBP(D_p, pPitch, D_r, rPitch);
+		if (useVolumeMask)
+			processVol<opMul, VOL>(D_p, D_maskData, pPitch, dims.iVolWidth, dims.iVolHeight);
+
+
+		gamma = dotProduct2D(D_p, pPitch, dims.iVolWidth, dims.iVolHeight, 1, 1);
+
+		sliceInitialized = true;
+	}
+
+
+	// iteration
+	for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
+
+		// w = A*p
+		zeroVolume(D_w, wPitch, dims.iProjDets+2, dims.iProjAngles);
+		callFP(D_p, pPitch, D_w, wPitch, 1.0f);
+
+		// alpha = gamma / <w,w>
+		float ww = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+
+		// A vanished search direction (e.g. an all-zero residual) makes both
+		// divisions below undefined; nothing is left to improve, so stop
+		// before any buffer is modified.
+		if (ww == 0.0f || gamma == 0.0f)
+			break;
+
+		float alpha = gamma / ww;
+
+		// x += alpha*p
+		processVol<opAddScaled, VOL>(D_volumeData, D_p, alpha, volumePitch, dims.iVolWidth, dims.iVolHeight);
+
+		// r -= alpha*w
+		processVol<opAddScaled, SINO>(D_r, D_w, -alpha, rPitch, dims.iProjDets, dims.iProjAngles);
+
+
+		// z = A'*r
+		zeroVolume(D_z, zPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+		callBP(D_z, zPitch, D_r, rPitch);
+		if (useVolumeMask)
+			processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
+
+		// beta = gamma_new / gamma_old
+		float beta = 1.0f / gamma;
+		gamma = dotProduct2D(D_z, zPitch, dims.iVolWidth, dims.iVolHeight, 1, 1);
+		beta *= gamma;
+
+		// p = z + beta*p
+		processVol<opScaleAndAdd, VOL>(D_p, D_z, beta, pPitch, dims.iVolWidth, dims.iVolHeight);
+
+	}
+
+	return true;
+}
+
+
+// Return the norm of (sinogram - forward projection of the current,
+// optionally masked, volume).
+float CGLS::computeDiffNorm()
+{
+	// w and z are only live inside iterate(), so both may serve as
+	// scratch space here.
+
+	// w = sinogram
+	cudaMemcpy2D(D_w, sizeof(float)*wPitch,
+	             D_sinoData, sizeof(float)*sinoPitch,
+	             sizeof(float)*(dims.iProjDets+2), dims.iProjAngles,
+	             cudaMemcpyDeviceToDevice);
+
+	// Choose what to project: the volume itself, or a masked copy of it
+	// held in z when a volume mask is active.
+	float* D_source = D_volumeData;
+	unsigned int sourcePitch = volumePitch;
+
+	if (useVolumeMask) {
+		cudaMemcpy2D(D_z, sizeof(float)*zPitch,
+		             D_volumeData, sizeof(float)*volumePitch,
+		             sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2,
+		             cudaMemcpyDeviceToDevice);
+		processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight);
+
+		D_source = D_z;
+		sourcePitch = zPitch;
+	}
+
+	// do FP, subtracting projection from sinogram
+	callFP(D_source, sourcePitch, D_w, wPitch, -1.0f);
+
+	// norm of what is left in w
+	const float fSumSquares = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+
+	return sqrt(fSumSquares);
+}
+
+// Convenience wrapper: configure a temporary CGLS object for the given
+// geometry, attach the caller's device buffers and run `iterations`
+// iterations. Returns false as soon as any set-up step fails, otherwise
+// the result of iterate().
+bool doCGLS(float* D_volumeData, unsigned int volumePitch,
+            float* D_sinoData, unsigned int sinoPitch,
+            const SDimensions& dims, /*const SAugmentedData& augs,*/
+            const float* angles, const float* TOffsets, unsigned int iterations)
+{
+	CGLS cgls;
+
+	// Geometry set-up. The offsets are applied whenever they are given,
+	// even if setGeometry() already failed; the combined result is tested
+	// afterwards.
+	bool ok = cgls.setGeometry(dims, angles);
+#if 0
+	if (D_maskData)
+		ok &= cgls.enableVolumeMask();
+#endif
+	if (TOffsets && !cgls.setTOffsets(TOffsets))
+		ok = false;
+
+	if (!ok)
+		return false;
+
+	if (!cgls.init())
+		return false;
+
+#if 0
+	if (D_maskData)
+		ok &= cgls.setVolumeMask(D_maskData, maskPitch);
+#endif
+
+	if (!cgls.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch))
+		return false;
+
+	return cgls.iterate(iterations);
+}
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+// Stand-alone test driver: loads "sino.png", runs 25 CGLS iterations on a
+// 1024x1024 volume with 512 angles over [0, pi) and 1536 detectors, and
+// writes the result to "vol.png".
+int main()
+{
+	// Fixed test geometry.
+	SDimensions dims;
+	dims.iVolWidth = 1024;
+	dims.iVolHeight = 1024;
+	dims.iProjAngles = 512;
+	dims.iProjDets = 1536;
+	dims.fDetScale = 1.0f;
+	dims.iRaysPerDet = 1;
+
+	// Device buffers, zero-initialized.
+	float* D_volumeData;
+	unsigned int volumePitch;
+	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	printf("pitch: %u\n", volumePitch);
+
+	float* D_sinoData;
+	unsigned int sinoPitch;
+	allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+	printf("pitch: %u\n", sinoPitch);
+
+	// Input sinogram; the image dimensions reported by loadImage are not
+	// used any further.
+	unsigned int imgY, imgX;
+	float* pfSino = loadImage("sino.png", imgY, imgX);
+
+	float* pfImg = new float[dims.iVolWidth*dims.iVolHeight];
+
+	copySinogramToDevice(pfSino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_sinoData, sinoPitch);
+
+	// Equally spaced angles covering half a turn.
+	float* pfAngles = new float[dims.iProjAngles];
+	for (unsigned int a = 0; a < dims.iProjAngles; ++a)
+		pfAngles[a] = a*(M_PI/dims.iProjAngles);
+
+	CGLS cgls;
+	cgls.setGeometry(dims, pfAngles);
+	cgls.init();
+	cgls.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
+	cgls.iterate(25);
+
+	delete[] pfAngles;
+
+	// Fetch and store the reconstruction.
+	copyVolumeFromDevice(pfImg, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+	saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,pfImg);
+
+	return 0;
+}
+#endif
diff --git a/cuda/2d/cgls.h b/cuda/2d/cgls.h
new file mode 100644
index 0000000..1013bf8
--- /dev/null
+++ b/cuda/2d/cgls.h
@@ -0,0 +1,92 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_CGLS_H
+#define _CUDA_CGLS_H
+
+#include "util.h"
+#include "algo.h"
+
+namespace astraCUDA {
+
+// CGLS reconstruction algorithm built on the ReconAlgo interface.
+//
+// Sinogram masks and min/max constraints are not supported: the
+// corresponding setters are overridden to return false.
+class _AstraExport CGLS : public ReconAlgo {
+public:
+	CGLS();
+	virtual ~CGLS();
+
+	// disable some features
+	virtual bool enableSinogramMask() { return false; }
+	virtual bool setMinConstraint(float) { return false; }
+	virtual bool setMaxConstraint(float) { return false; }
+
+	// Allocates the work buffers D_r, D_w, D_z and D_p.
+	virtual bool init();
+
+	// Same as ReconAlgo::setBuffers(), and additionally marks the slice as
+	// not yet initialized.
+	virtual bool setBuffers(float* D_volumeData, unsigned int volumePitch,
+	                        float* D_projData, unsigned int projPitch);
+
+	// Same as ReconAlgo::copyDataToGPU(), and additionally marks the slice
+	// as not yet initialized.
+	virtual bool copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale,
+	                           const float* pfReconstruction, unsigned int iReconstructionPitch,
+	                           const float* pfVolMask, unsigned int iVolMaskPitch,
+	                           const float* pfSinoMask, unsigned int iSinoMaskPitch);
+
+
+	// Runs the given number of CGLS iterations, performing the per-slice
+	// set-up first if it has not been done since the data last changed.
+	virtual bool iterate(unsigned int iterations);
+
+	// Norm of (sinogram - forward projection of the current volume).
+	// Uses D_w and D_z as scratch space.
+	virtual float computeDiffNorm();
+
+protected:
+	// Frees the work buffers and resets the base class.
+	void reset();
+
+	// False until iterate() has set up r, p and gamma for the current data.
+	bool sliceInitialized;
+
+ 	// Buffers
+	// r: sinogram-sized, lives for the full algorithm
+	float* D_r;
+	unsigned int rPitch;
+
+	// w: sinogram-sized, only used within an iteration
+	float* D_w;
+	unsigned int wPitch;
+
+	// z: volume-sized, only used within an iteration
+	float* D_z;
+	unsigned int zPitch;
+
+	// p: volume-sized, lives for the full algorithm
+	float* D_p;
+	unsigned int pPitch;
+
+
+	// Value of dotProduct2D() over p (at set-up) or z (after each
+	// iteration); carried from one iteration to the next.
+	float gamma;
+};
+
+
+// Run `iterations` CGLS iterations on caller-owned device buffers using a
+// temporary CGLS object. TOffsets may be null, in which case no offsets
+// are set. Returns false if any set-up step fails.
+_AstraExport bool doCGLS(float* D_volumeData, unsigned int volumePitch,
+            float* D_projData, unsigned int projPitch,
+            const SDimensions& dims, const float* angles,
+            const float* TOffsets, unsigned int iterations);
+
+}
+
+#endif
diff --git a/cuda/2d/darthelper.cu b/cuda/2d/darthelper.cu
new file mode 100644
index 0000000..db0036e
--- /dev/null
+++ b/cuda/2d/darthelper.cu
@@ -0,0 +1,358 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include "util.h"
+#include "darthelper.h"
+#include <cassert>
+
+namespace astraCUDA {
+
+// CUDA function for the selection of ROI
+//
+// Zeroes every pixel of a width x height image whose distance to the image
+// centre ((width-1)/2, (height-1)/2) is larger than radius/2. The image is
+// stored with a row stride of `pitch` floats and starts at (padX, padY)
+// inside the buffer.
+//
+// The index computation assumes 16x16 thread blocks. The grid may be
+// rounded up to whole blocks: threads outside the image do nothing.
+__global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+{
+	const unsigned int ix = threadIdx.x + 16*blockIdx.x;
+	const unsigned int iy = threadIdx.y + 16*blockIdx.y;
+
+	// Threads of the partially filled blocks at the right/bottom edge would
+	// otherwise write outside the allocated buffer.
+	if (ix >= width || iy >= height)
+		return;
+
+	const float x = (float)ix;
+	const float y = (float)iy;
+
+	const float w = (width-1.0f)*0.5f;
+	const float h = (height-1.0f)*0.5f;
+
+	if ((x-w)*(x-w) + (y-h)*(y-h) > radius * radius * 0.25f)
+	{
+		// Compute the offset with integers: a float cannot represent
+		// every index above 2^24 exactly.
+		const unsigned int o = (iy+padY)*pitch+ix+padX;
+		in[o] = 0.0f;
+	}
+}
+
+// Apply the circular ROI selection of devRoiSelect to a host image in
+// place: `out` (width x height floats, rows stored contiguously) is
+// uploaded, pixels further than radius/2 from the centre are zeroed on the
+// GPU, and the result is copied back.
+//
+// If the device buffer cannot be allocated or the upload fails, `out` is
+// left unmodified.
+void roiSelect(float* out, float radius, unsigned int width, unsigned int height)
+{
+	float* D_data;
+	unsigned int pitch;
+
+	// Without a valid buffer there is nothing that can safely be done.
+	if (!allocateVolume(D_data, width+2, height+2, pitch))
+		return;
+
+	if (copyVolumeToDevice(out, width, width, height, D_data, pitch)) {
+		// One thread per pixel, grid rounded up to whole 16x16 blocks.
+		dim3 blockSize(16,16);
+		dim3 gridSize((width+15)/16, (height+15)/16);
+		devRoiSelect<<<gridSize, blockSize>>>(D_data, radius, pitch, width, height, 1, 1);
+
+		copyVolumeFromDevice(out, width, width, height, D_data, pitch);
+	}
+
+	cudaFree(D_data);
+}
+
+
+
+
+// CUDA function for the masking of DART with a radius == 1
+//
+// Sets mask to 1.0f for every non-border pixel of `in` that differs from at
+// least one of its neighbours (the 8 surrounding pixels for conn == 8, the
+// 4 edge-adjacent pixels for conn == 4). Other mask entries are left
+// untouched, so the caller is expected to clear the mask beforehand.
+//
+// Both images share the row stride `pitch` (in floats) and start at
+// (padX, padY) inside their buffers. The index computation assumes 16x16
+// thread blocks.
+__global__ void devDartMask(float* mask, const float* in, unsigned int conn, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+{
+	unsigned int x = threadIdx.x + 16*blockIdx.x;
+	unsigned int y = threadIdx.y + 16*blockIdx.y;
+
+	// Sacrifice the border pixels to simplify the implementation.
+	// (x + 1 < width rather than x < width - 1, so the unsigned
+	// arithmetic cannot wrap around for a zero-sized image.)
+	if (x > 0 && x + 1 < width && y > 0 && y + 1 < height) {
+		const float* d = in;
+		float* m = mask;
+
+		unsigned int o2 = (y+padY)*pitch+x+padX; // On this row.
+		unsigned int o1 = o2 - pitch; // On previous row.
+		unsigned int o3 = o2 + pitch; // On next row.
+
+		if ((conn == 8 && // 8-connected
+		        (d[o1 - 1] != d[o2] || d[o1] != d[o2] || d[o1 + 1] != d[o2] ||
+		         d[o2 - 1] != d[o2] ||                   d[o2 + 1] != d[o2] ||
+		         d[o3 - 1] != d[o2] || d[o3] != d[o2] || d[o3 + 1] != d[o2] ))
+			||
+			(conn == 4 && // 4-connected: up, left, right, down
+		        (                      d[o1] != d[o2] ||
+		         d[o2 - 1] != d[o2] ||                   d[o2 + 1] != d[o2] ||
+		                               d[o3] != d[o2]                      )))
+		{
+			m[o2] = 1.0f;
+		}
+	}
+}
+
+
+// CUDA function for the masking of DART with a radius > 1
+//
+// Sets mask to 1.0f for every pixel, at least `radius` pixels away from the
+// image border, that differs from some pixel in its neighbourhood: the full
+// (2*radius+1)^2 square for conn == 8, or the horizontal and vertical lines
+// of length 2*radius+1 through the pixel for conn == 4. Other mask entries
+// are left untouched.
+//
+// Both images share the row stride `pitch` (in floats) and start at
+// (padX, padY) inside their buffers. The index computation assumes 16x16
+// thread blocks.
+__global__ void devDartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+{
+	unsigned int x = threadIdx.x + 16*blockIdx.x;
+	unsigned int y = threadIdx.y + 16*blockIdx.y;
+
+	// width - radius and height - radius below are unsigned: if the radius
+	// exceeded the image size they would wrap around and let threads read
+	// far outside the image.
+	if (radius > width || radius > height)
+		return;
+
+	// Sacrifice the border pixels to simplify the implementation.
+	if (x > radius-1 && x < width - radius && y > radius-1 && y < height - radius)
+	{
+		const float* d = in;
+		float* m = mask;
+
+		int r = radius;
+
+		// o2: index of the current center pixel
+		int o2 = (y+padY)*pitch+x+padX;
+
+		if (conn == 8) // 8-connected
+		{
+			for (int row = -r; row <= r; row++)
+			{
+				// o1: index of the pixel in the same column, `row` rows away
+				int o1 = (y+padY+row)*pitch+x+padX;
+				for (int col = -r; col <= r; col++)
+				{
+					if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;}
+				}
+			}
+		}
+		else if (conn == 4) // 4-connected
+		{
+			// horizontal
+			unsigned int o1 = (y+padY)*pitch+x+padX;
+			for (int col = -r; col <= r; col++)
+			{
+				if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;}
+			}
+
+			// vertical
+			for (int row = -r; row <= r; row++)
+			{
+				unsigned int o1 = (y+padY+row)*pitch+x+padX;
+				if (d[o1] != d[o2]) {m[o2] = 1.0f; return;}
+			}
+		}
+	}
+}
+
+
+// CUDA function for the masking of ADART with a radius == 1
+//
+// For every non-border pixel, walks over its neighbours (8 surrounding
+// pixels for conn == 8, the 4 edge-adjacent ones for conn == 4) and
+// decrements `threshold` for each neighbour that differs from the pixel.
+// When the counter reaches zero the mask entry is set to 1.0f. Other mask
+// entries are left untouched.
+//
+// Both images share the row stride `pitch` (in floats) and start at
+// (padX, padY) inside their buffers. The index computation assumes 16x16
+// thread blocks.
+__global__ void devADartMask(float* mask, const float* in, unsigned int conn, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+{
+	const unsigned int x = threadIdx.x + 16*blockIdx.x;
+	const unsigned int y = threadIdx.y + 16*blockIdx.y;
+
+	// Sacrifice the border pixels to simplify the implementation.
+	if (!(x > 0 && x < width - 1 && y > 0 && y < height - 1))
+		return;
+
+	const unsigned int here  = (y+padY)*pitch+x+padX; // this pixel
+	const unsigned int above = here - pitch;          // same column, previous row
+	const unsigned int below = here + pitch;          // same column, next row
+
+	if (conn == 8)
+	{
+		// Row by row, left to right, skipping the centre pixel.
+		const unsigned int nb[8] = { above - 1, above, above + 1,
+		                             here - 1,         here + 1,
+		                             below - 1, below, below + 1 };
+		for (int k = 0; k < 8; ++k)
+		{
+			if (in[nb[k]] != in[here] && --threshold == 0) {mask[here] = 1.0f; return;}
+		}
+	}
+	else if (conn == 4)
+	{
+		// Up, left, right, down.
+		const unsigned int nb[4] = { above, here - 1, here + 1, below };
+		for (int k = 0; k < 4; ++k)
+		{
+			if (in[nb[k]] != in[here] && --threshold == 0) {mask[here] = 1.0f; return;}
+		}
+	}
+}
+
+
+// CUDA function for the masking of ADART with a radius > 1
+//
+// For every pixel at least `radius` pixels away from the image border,
+// walks over its neighbourhood (the full (2*radius+1)^2 square for
+// conn == 8, the horizontal and vertical lines through the pixel for
+// conn == 4) and decrements `threshold` for each pixel that differs from
+// the centre. When the counter reaches zero the mask entry is set to 1.0f.
+// Other mask entries are left untouched.
+//
+// Both images share the row stride `pitch` (in floats) and start at
+// (padX, padY) inside their buffers. The index computation assumes 16x16
+// thread blocks.
+__global__ void devADartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+{
+	unsigned int x = threadIdx.x + 16*blockIdx.x;
+	unsigned int y = threadIdx.y + 16*blockIdx.y;
+
+	// width - radius and height - radius below are unsigned: if the radius
+	// exceeded the image size they would wrap around and let threads read
+	// far outside the image.
+	if (radius > width || radius > height)
+		return;
+
+	// Sacrifice the border pixels to simplify the implementation.
+	if (x > radius-1 && x < width - radius && y > radius-1 && y < height - radius)
+	{
+		const float* d = in;
+		float* m = mask;
+
+		int r = radius;
+
+		unsigned int o2 = (y+padY)*pitch+x+padX; // On this row.
+
+		if (conn == 8)
+		{
+			for (int row = -r; row <= r; row++)
+			{
+				// o1: index of the pixel in the same column, `row` rows away
+				unsigned int o1 = (y+padY+row)*pitch+x+padX;
+				for (int col = -r; col <= r; col++)
+				{
+					if (d[o1+col] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+				}
+			}
+		}
+		else if (conn == 4)
+		{
+			// horizontal
+			for (int col = -r; col <= r; col++)
+			{
+				if (d[o2+col] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+			}
+
+			// vertical
+			for (int row = -r; row <= r; row++)
+			{
+				unsigned int o1 = (y+padY+row)*pitch+x+padX;
+				if (d[o1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;}
+			}
+		}
+	}
+}
+
+
+// Compute a DART boundary mask for a host-side segmentation image.
+//
+// The segmentation is uploaded into a device buffer with a one-pixel padding
+// on each side, one of four mask kernels is chosen from threshold and radius,
+// and the resulting mask is copied back into 'mask'. Both host images are
+// width x height floats with a row stride of 'width'.
+//
+// conn      : connectivity passed on to the kernel (the kernels act on 4 and 8)
+// radius    : neighbourhood radius
+// threshold : number of differing neighbours needed to mark a pixel
+void dartMask(float* mask, const float* segmentation, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int width, unsigned int height)
+{
+	const unsigned int paddedWidth = width+2;
+	const unsigned int paddedHeight = height+2;
+	unsigned int pitch;
+
+	float* D_segmentationData;
+	allocateVolume(D_segmentationData, paddedWidth, paddedHeight, pitch);
+	copyVolumeToDevice(segmentation, width, width, height, D_segmentationData, pitch);
+
+	float* D_maskData;
+	allocateVolume(D_maskData, paddedWidth, paddedHeight, pitch);
+	zeroVolume(D_maskData, pitch, paddedWidth, paddedHeight);
+
+	const dim3 threads(16,16);
+	const dim3 blocks((width+15)/16, (height+15)/16);
+
+	// Specialised kernels exist for threshold == 1 and/or radius == 1;
+	// every other combination goes to the general kernel.
+	const bool singleHit = (threshold == 1);
+	if (radius == 1 && singleHit) {
+		devDartMask<<<blocks, threads>>>(D_maskData, D_segmentationData, conn, pitch, width, height, 1, 1);
+	} else if (radius == 1 && threshold > 1) {
+		devADartMask<<<blocks, threads>>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height, 1, 1);
+	} else if (radius > 1 && singleHit) {
+		devDartMaskRadius<<<blocks, threads>>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height, 1, 1);
+	} else {
+		devADartMaskRadius<<<blocks, threads>>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height, 1, 1);
+	}
+
+	copyVolumeFromDevice(mask, width, width, height, D_maskData, pitch);
+
+	cudaFree(D_maskData);
+	cudaFree(D_segmentationData);
+
+}
+
+
+// DART smoothing kernel for a neighbourhood radius other than 1.
+//
+// Each thread handles one pixel and writes
+//     out = (1-b) * in + b * mean(neighbours)
+// where the neighbours are all pixels of the (2*radius+1) x (2*radius+1)
+// window around the pixel except the pixel itself, i.e. 4*radius*(radius+1)
+// values. For radius == 1 this is the same 1/8 weighting that
+// devDartSmoothing applies. Pixels closer than 'radius' to an image edge are
+// not written. The index arithmetic assumes blocks with an edge of 16 threads.
+//
+// out, in : pitched images addressed as (y+padY)*pitch + x+padX
+// b       : blending weight between the pixel and its neighbourhood mean
+__global__ void devDartSmoothingRadius(float* out, const float* in, float b, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+{
+	unsigned int x = threadIdx.x + 16*blockIdx.x;
+	unsigned int y = threadIdx.y + 16*blockIdx.y;
+
+	// Sacrifice the border pixels to simplify the implementation. 
+	if (x > radius-1 && x < width - radius && y > radius-1 && y < height - radius)
+	{
+		const float* d = in;
+		float* m = out;
+
+		unsigned int o2 = (y+padY)*pitch+x+padX;
+		int r = radius;
+
+		// Start at -centre so that summing the complete window below leaves
+		// exactly the sum over the neighbours.
+		float res = -d[o2];
+
+		// Rows -r..r inclusive, so the window is symmetric around the pixel.
+		for (int row = -r; row <= r; row++) 
+		{
+			unsigned int o1 = (y+padY+row)*pitch+x+padX; 
+			for (int col = -r; col <= r; col++) 
+			{
+				res += d[o1+col];
+			}
+		}
+
+		// Divide by the neighbour count. The parentheses are essential:
+		// without them the expression parses as (b/4)*r*(r+1).
+		res *= b / (4.0f*r*(r+1));
+		res += (1.0f-b) * d[o2];
+
+		m[o2] = res;
+	}
+}
+
+
+// DART smoothing kernel for a neighbourhood radius of exactly 1.
+//
+// Each thread blends one pixel with the average of its eight direct
+// neighbours: out = (1-b)*in + b/8 * sum(neighbours). Pixels on the outermost
+// ring of the image are not written. The index arithmetic assumes blocks with
+// an edge of 16 threads.
+__global__ void devDartSmoothing(float* out, const float* in, float b, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY)
+{
+	const unsigned int x = threadIdx.x + 16*blockIdx.x;
+	const unsigned int y = threadIdx.y + 16*blockIdx.y;
+
+	// Border pixels are skipped so that all eight neighbours exist.
+	if (!(x > 0 && x < width - 1 && y > 0 && y < height - 1))
+		return;
+
+	const unsigned int mid = (y+padY)*pitch+x+padX; // this row
+	const unsigned int above = mid - pitch;         // previous row
+	const unsigned int below = mid + pitch;         // next row
+
+	out[mid] = (1.0f-b) * in[mid] + b * 0.125f * (in[above - 1] + in[above] + in[above + 1] + in[mid - 1] + in[mid + 1] + in[below - 1] + in[below] + in[below + 1]);
+}
+
+
+// Smooth a host-side image on the GPU.
+//
+// 'in' is uploaded into a device buffer with a one-pixel padding on each
+// side, a smoothing kernel is run, and the result is copied back into 'out'.
+// radius == 1 uses devDartSmoothing; every other radius uses
+// devDartSmoothingRadius. Both host images are width x height floats with a
+// row stride of 'width'.
+//
+// b : blending weight handed to the kernel
+void dartSmoothing(float* out, const float* in, float b, unsigned int radius, unsigned int width, unsigned int height)
+{
+	const unsigned int paddedWidth = width+2;
+	const unsigned int paddedHeight = height+2;
+	unsigned int pitch;
+
+	float* D_inData;
+	allocateVolume(D_inData, paddedWidth, paddedHeight, pitch);
+	copyVolumeToDevice(in, width, width, height, D_inData, pitch);
+
+	float* D_outData;
+	allocateVolume(D_outData, paddedWidth, paddedHeight, pitch);
+	zeroVolume(D_outData, pitch, paddedWidth, paddedHeight);
+
+	const dim3 threads(16,16);
+	const dim3 blocks((width+15)/16, (height+15)/16);
+	if (radius != 1) {
+		devDartSmoothingRadius<<<blocks, threads>>>(D_outData, D_inData, b, radius, pitch, width, height, 1, 1);
+	} else {
+		devDartSmoothing<<<blocks, threads>>>(D_outData, D_inData, b, pitch, width, height, 1, 1);
+	}
+
+	copyVolumeFromDevice(out, width, width, height, D_outData, pitch);
+
+	cudaFree(D_outData);
+	cudaFree(D_inData);
+
+}
+
+
+
+// Select the CUDA device with the given index for the calling host thread.
+//
+// Returns true when the last CUDA error after the call is either
+// cudaSuccess or cudaErrorSetOnActiveProcess, false for any other error.
+bool setGPUIndex(int iGPUIndex)
+{
+	cudaSetDevice(iGPUIndex);
+
+	// Errors caused by calling cudaSetDevice multiple times are tolerated.
+	const cudaError_t status = cudaGetLastError();
+	return status == cudaSuccess || status == cudaErrorSetOnActiveProcess;
+}
+
+
+}
diff --git a/cuda/2d/darthelper.h b/cuda/2d/darthelper.h
new file mode 100644
index 0000000..e05f01e
--- /dev/null
+++ b/cuda/2d/darthelper.h
@@ -0,0 +1,44 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_ARITH2_H
+#define _CUDA_ARITH2_H
+
+#include <cuda.h>
+
+namespace astraCUDA {
+
+	// NOTE(review): presumably writes a circular region-of-interest mask of
+	// the given radius into 'out' -- the definition is not shown here; confirm.
+	void roiSelect(float* out, float radius, unsigned int width, unsigned int height);
+
+	// Compute a DART boundary mask on the GPU. 'in' and 'out' are host images
+	// of width x height floats. A pixel is marked with 1.0f once 'threshold'
+	// values inside its neighbourhood (connectivity 'conn', radius 'radius')
+	// differ from it.
+	void dartMask(float* out, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int width, unsigned int height);
+
+	// Blend each pixel of the host image 'in' with the mean of its
+	// neighbourhood on the GPU and store the result in 'out'. 'b' is the
+	// blending weight of the neighbourhood mean.
+	void dartSmoothing(float* out, const float* in, float b, unsigned int radius, unsigned int width, unsigned int height);
+
+	// Select the CUDA device to use. Returns false if the device could not
+	// be selected.
+	bool setGPUIndex(int index);
+
+}
+
+#endif
diff --git a/cuda/2d/dataop.cu b/cuda/2d/dataop.cu
new file mode 100644
index 0000000..68573b2
--- /dev/null
+++ b/cuda/2d/dataop.cu
@@ -0,0 +1,130 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include "util.h"
+#include "dataop.h"
+#include "arith.h"
+#include <cassert>
+
+namespace astraCUDA {
+
+// Combine two host images with processVol<opMul, VOL> on the GPU and store
+// the result back into data1 (presumably an element-wise product -- the
+// operator is defined elsewhere).
+//
+// Both images are width x height floats with a row stride of 'width'; they
+// are uploaded into device buffers padded by one pixel on each side.
+// data2 is not modified.
+void operationVolumeMult(float* data1, float* data2, unsigned int width, unsigned int height)
+{
+	unsigned int pitch;
+
+	float* D_lhs;
+	allocateVolume(D_lhs, width+2, height+2, pitch);
+	copyVolumeToDevice(data1, width, width, height, D_lhs, pitch);
+
+	float* D_rhs;
+	allocateVolume(D_rhs, width+2, height+2, pitch);
+	copyVolumeToDevice(data2, width, width, height, D_rhs, pitch);
+
+	processVol<opMul, VOL>(D_lhs, D_rhs, pitch, width, height);
+
+	// Only the first operand is read back; it holds the result.
+	copyVolumeFromDevice(data1, width, width, height, D_lhs, pitch);
+
+	cudaFree(D_rhs);
+	cudaFree(D_lhs);
+}
+
+// Apply processVol<opMulMask, VOL> to a host image on the GPU and store the
+// result back into 'data' (presumably a multiplication by 'scalar' restricted
+// to the pixels selected by 'mask' -- the operator is defined elsewhere).
+//
+// Both images are width x height floats with a row stride of 'width'; they
+// are uploaded into device buffers padded by one pixel on each side.
+// 'mask' is not modified.
+void operationVolumeMultScalarMask(float* data, float* mask, float scalar, unsigned int width, unsigned int height)
+{
+	unsigned int pitch;
+
+	float* D_image;
+	allocateVolume(D_image, width+2, height+2, pitch);
+	copyVolumeToDevice(data, width, width, height, D_image, pitch);
+
+	float* D_selection;
+	allocateVolume(D_selection, width+2, height+2, pitch);
+	copyVolumeToDevice(mask, width, width, height, D_selection, pitch);
+
+	processVol<opMulMask, VOL>(D_image, D_selection, scalar, pitch, width, height);
+
+	copyVolumeFromDevice(data, width, width, height, D_image, pitch);
+
+	cudaFree(D_selection);
+	cudaFree(D_image);
+}
+
+
+// Apply processVol<opMul, VOL> with a scalar operand to a host image on the
+// GPU and store the result back into 'data' (presumably a multiplication of
+// every pixel by 'scalar' -- the operator is defined elsewhere).
+//
+// The image is width x height floats with a row stride of 'width'; it is
+// uploaded into a device buffer padded by one pixel on each side.
+void operationVolumeMultScalar(float* data, float scalar, unsigned int width, unsigned int height)
+{
+	unsigned int pitch;
+	float* D_image;
+
+	// Upload.
+	allocateVolume(D_image, width+2, height+2, pitch);
+	copyVolumeToDevice(data, width, width, height, D_image, pitch);
+
+	// Process in place on the device.
+	processVol<opMul, VOL>(D_image, scalar, pitch, width, height);
+
+	// Download and release.
+	copyVolumeFromDevice(data, width, width, height, D_image, pitch);
+	cudaFree(D_image);
+}
+
+
+// Combine two host images with processVol<opAdd, VOL> on the GPU and store
+// the result back into data1 (presumably an element-wise sum -- the operator
+// is defined elsewhere).
+//
+// Both images are width x height floats with a row stride of 'width'; they
+// are uploaded into device buffers padded by one pixel on each side.
+// data2 is not modified.
+void operationVolumeAdd(float* data1, float* data2, unsigned int width, unsigned int height)
+{
+	unsigned int pitch;
+
+	float* D_lhs;
+	allocateVolume(D_lhs, width+2, height+2, pitch);
+	copyVolumeToDevice(data1, width, width, height, D_lhs, pitch);
+
+	float* D_rhs;
+	allocateVolume(D_rhs, width+2, height+2, pitch);
+	copyVolumeToDevice(data2, width, width, height, D_rhs, pitch);
+
+	processVol<opAdd, VOL>(D_lhs, D_rhs, pitch, width, height);
+
+	// Only the first operand is read back; it holds the result.
+	copyVolumeFromDevice(data1, width, width, height, D_lhs, pitch);
+
+	cudaFree(D_rhs);
+	cudaFree(D_lhs);
+}
+
+
+// Apply processVol<opAdd, VOL> with a scalar operand to a host image on the
+// GPU and store the result back into 'data' (presumably adding 'scalar' to
+// every pixel -- the operator is defined elsewhere).
+//
+// The image is width x height floats with a row stride of 'width'; it is
+// uploaded into a device buffer padded by one pixel on each side.
+void operationVolumeAddScalar(float* data, float scalar, unsigned int width, unsigned int height)
+{
+	unsigned int pitch;
+	float* D_image;
+
+	// Upload.
+	allocateVolume(D_image, width+2, height+2, pitch);
+	copyVolumeToDevice(data, width, width, height, D_image, pitch);
+
+	// Process in place on the device.
+	processVol<opAdd, VOL>(D_image, scalar, pitch, width, height);
+
+	// Download and release.
+	copyVolumeFromDevice(data, width, width, height, D_image, pitch);
+	cudaFree(D_image);
+}
+
+
+}
diff --git a/cuda/2d/dataop.h b/cuda/2d/dataop.h
new file mode 100644
index 0000000..3e9c7e2
--- /dev/null
+++ b/cuda/2d/dataop.h
@@ -0,0 +1,47 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_DATAOP_H
+#define _CUDA_DATAOP_H
+
+#include <cuda.h>
+
+// Host-side convenience wrappers: each call uploads its width x height host
+// image(s) to the GPU, applies one processVol operation and copies the
+// result back into the first image argument.
+namespace astraCUDA {
+
+	// data1 <- processVol<opMul>(data1, data2); data2 is left unchanged.
+	void operationVolumeMult(float* data1, float* data2, unsigned int width, unsigned int height);
+
+	// data <- processVol<opMul>(data, scalar).
+	void operationVolumeMultScalar(float* data, float scalar, unsigned int width, unsigned int height);
+	// data <- processVol<opMulMask>(data, mask, scalar); mask is left unchanged.
+	void operationVolumeMultScalarMask(float* data, float* mask, float scalar, unsigned int width, unsigned int height);
+
+	// data1 <- processVol<opAdd>(data1, data2); data2 is left unchanged.
+	void operationVolumeAdd(float* data1, float* data2, unsigned int width, unsigned int height);
+
+	// data <- processVol<opAdd>(data, scalar).
+	void operationVolumeAddScalar(float* data, float scalar, unsigned int width, unsigned int height);
+
+}
+
+#endif
diff --git a/cuda/2d/dims.h b/cuda/2d/dims.h
new file mode 100644
index 0000000..21ccb31
--- /dev/null
+++ b/cuda/2d/dims.h
@@ -0,0 +1,68 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_DIMS_H
+#define _CUDA_DIMS_H
+
+namespace astraCUDA {
+
+// Geometry of a single fan-beam projection, given as 2D vectors
+// (X and Y component each).
+struct SFanProjection {
+        // the source
+        float fSrcX, fSrcY;
+
+        // the start of the (linear) detector
+        float fDetSX, fDetSY;
+
+        // the length of a single detector pixel
+        // (stored as a vector, i.e. presumably the step from one detector
+        // pixel to the next)
+        float fDetUX, fDetUY;
+};
+
+
+// Sizes of a 2D reconstruction problem.
+struct SDimensions {
+	unsigned int iVolWidth;        // volume width in pixels
+	unsigned int iVolHeight;       // volume height in pixels
+	unsigned int iProjAngles;      // number of projection angles
+	unsigned int iProjDets;        // number of detector pixels per projection
+	float fDetScale; // size of detector compared to volume pixels
+	unsigned int iRaysPerDet;      // presumably rays traced per detector pixel -- TODO confirm
+	unsigned int iRaysPerPixelDim; // sub-samples per pixel in each dimension when backprojecting
+};
+
+// Sizes of a 3D reconstruction problem.
+struct SDimensions3D {
+        unsigned int iVolX;       // volume size along X
+        unsigned int iVolY;       // volume size along Y
+        unsigned int iVolZ;       // volume size along Z
+        unsigned int iProjAngles; // number of projection angles
+        unsigned int iProjU; // number of detectors in the U direction
+        unsigned int iProjV; // number of detectors in the V direction
+};
+
+}
+
+#endif
+
diff --git a/cuda/2d/em.cu b/cuda/2d/em.cu
new file mode 100644
index 0000000..74d1bbf
--- /dev/null
+++ b/cuda/2d/em.cu
@@ -0,0 +1,262 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "em.h"
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+namespace astraCUDA {
+
+
+// TODO: ensure non-negativity somewhere??
+
+
+// Start without any device buffers; init() allocates them.
+EM::EM()
+{
+	D_pixelWeight = 0;
+	D_tmpData = 0;
+	D_projData = 0;
+}
+
+
+// Release all device buffers owned by this object via reset().
+EM::~EM()
+{
+	reset();
+}
+
+// Free the temporary device buffers owned by EM, clear their pointers and
+// then reset the base-class state.
+void EM::reset()
+{
+	cudaFree(D_projData);
+	D_projData = 0;
+
+	cudaFree(D_tmpData);
+	D_tmpData = 0;
+
+	cudaFree(D_pixelWeight);
+	D_pixelWeight = 0;
+
+	ReconAlgo::reset();
+}
+
+
+// Allocate and clear the working buffers and precompute the pixel weights.
+//
+// The two volume-sized buffers (pixel weights, temporary volume) are padded
+// by one pixel on each side; the projection-sized buffer is padded by one
+// detector pixel on the left and right. Always returns true: allocation
+// results are not inspected (see the TODO below).
+bool EM::init()
+{
+	allocateVolume(D_pixelWeight, dims.iVolWidth+2, dims.iVolHeight+2, pixelPitch);
+	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+	allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch);
+	zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+	zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+
+	// We can't precompute pixelWeights when using a volume mask
+	// (the mask check is compiled out, so the weights are always precomputed;
+	// EM disables volume masks in its class declaration)
+#if 0 
+	if (!useVolumeMask)
+#endif
+		precomputeWeights();
+
+	// TODO: check if allocations succeeded
+	return true;
+}
+
+// Compute the per-pixel normalisation weights used by iterate().
+//
+// The projection buffer is filled through processVol<opSet, SINO> with 1.0f,
+// backprojected into D_pixelWeight, and the result is passed through
+// processVol<opInvert, VOL> (presumably an element-wise reciprocal -- the
+// operator is defined elsewhere). D_projData is overwritten in the process.
+// The sinogram-mask and volume-mask variants are compiled out.
+// Always returns true.
+bool EM::precomputeWeights()
+{
+	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+#if 0
+	if (useSinogramMask) {
+		callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch);
+	} else
+#endif
+	{
+		// Backproject an all-ones sinogram.
+		processVol<opSet, SINO>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles);
+		callBP(D_pixelWeight, pixelPitch, D_projData, projPitch);
+	}
+	processVol<opInvert, VOL>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+
+#if 0
+	if (useVolumeMask) {
+		// scale pixel weights with mask to zero out masked pixels
+		processVol<opMul, VOL>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+	}
+#endif
+
+	return true;
+}
+
+// Run up to 'iterations' EM update steps on D_volumeData.
+//
+// Each step forward-projects the current volume, combines the result with
+// the measured sinogram, backprojects that into a temporary volume and
+// updates the volume from the temporary volume and the precomputed pixel
+// weights. The loop stops early when shouldAbort becomes true (it is cleared
+// on entry). Always returns true.
+bool EM::iterate(unsigned int iterations)
+{
+	shouldAbort = false;
+
+#if 0
+	if (useVolumeMask)
+		precomputeWeights();
+#endif
+
+	// iteration
+	for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
+
+		// Do FP of volumeData 
+		zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+		callFP(D_volumeData, volumePitch, D_projData, projPitch, 1.0f);
+
+		// Divide sinogram by FP (into projData)
+		processVol<opDividedBy, SINO>(D_projData, D_sinoData, projPitch, dims.iProjDets, dims.iProjAngles);
+
+		// Do BP of projData into tmpData
+		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+		callBP(D_tmpData, tmpPitch, D_projData, projPitch);
+
+		// Multiply volumeData with tmpData divided by pixel weights
+		// NOTE(review): D_pixelWeight was already passed through opInvert in
+		// precomputeWeights(), so opMul2 presumably multiplies by it -- confirm.
+		processVol<opMul2, VOL>(D_volumeData, D_tmpData, D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+
+	}
+
+	return true;
+}
+
+// Return the square root of dotProduct2D over (sinogram - FP(volume)),
+// i.e. presumably the 2-norm of the projection residual.
+//
+// D_projData is overwritten with the residual; when a volume mask is in use
+// D_tmpData is overwritten with the masked volume as well.
+float EM::computeDiffNorm()
+{
+	// copy sinogram to projection data
+	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+
+	// do FP, subtracting projection from sinogram
+	// (the final -1.0f argument is the scale applied to the forward projection)
+	if (useVolumeMask) {
+			// Project a masked copy of the volume so the volume itself stays intact.
+			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
+			processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+			callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);
+	} else {
+			callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
+	}
+
+
+	// compute norm of D_projData
+
+	float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+
+	return sqrt(s);
+}
+
+
+// One-shot helper: configure a temporary EM object for the given geometry,
+// attach the caller's device buffers and run 'iterations' EM steps on them.
+//
+// D_volumeData / D_sinoData : device buffers with pitches given in the
+//                             accompanying *Pitch arguments
+// TOffsets                  : optional; only applied when non-null
+// Returns false as soon as any configuration step fails, otherwise the
+// result of iterate().
+bool doEM(float* D_volumeData, unsigned int volumePitch,
+          float* D_sinoData, unsigned int sinoPitch,
+          const SDimensions& dims, const float* angles,
+          const float* TOffsets, unsigned int iterations)
+{
+	EM solver;
+
+	// Geometry and offsets are both applied before the result is checked.
+	bool success = true;
+	success &= solver.setGeometry(dims, angles);
+	if (TOffsets)
+		success &= solver.setTOffsets(TOffsets);
+	if (!success)
+		return false;
+
+	if (!solver.init())
+		return false;
+
+	success &= solver.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
+	if (!success)
+		return false;
+
+	return solver.iterate(iterations);
+}
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+// Standalone test driver: reconstruct "sino.png" with 25 EM iterations and
+// write the result to "vol.png".
+int main()
+{
+	float* D_volumeData;
+	float* D_sinoData;
+
+	SDimensions dims;
+	dims.iVolWidth = 1024;
+	dims.iVolHeight = 1024;
+	dims.iProjAngles = 512;
+	dims.iProjDets = 1536;
+	dims.fDetScale = 1.0f;
+	dims.iRaysPerDet = 1;
+	// Every field must be set: dims is handed to the projection code, which
+	// may branch on iRaysPerPixelDim.
+	dims.iRaysPerPixelDim = 1;
+	unsigned int volumePitch, sinoPitch;
+
+	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	printf("pitch: %u\n", volumePitch);
+
+	allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch);
+	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+	printf("pitch: %u\n", sinoPitch);
+	
+	unsigned int y, x;
+	float* sino = loadImage("sino.png", y, x);
+
+	float* img = new float[dims.iVolWidth*dims.iVolHeight];
+
+	copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_sinoData, sinoPitch);
+
+	float* angle = new float[dims.iProjAngles];
+
+	// Angles evenly spaced over [0, pi).
+	for (unsigned int i = 0; i < dims.iProjAngles; ++i)
+		angle[i] = i*(M_PI/dims.iProjAngles);
+
+	EM em;
+
+	em.setGeometry(dims, angle);
+	em.init();
+
+	// TODO: Initialize D_volumeData with an unfiltered backprojection
+
+	em.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
+
+	em.iterate(25);
+
+
+	delete[] angle;
+
+	copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+
+	saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img);
+
+	// NOTE(review): 'sino' and the device buffers are left to process exit;
+	// how loadImage allocates and who owns the buffers after setBuffers() is
+	// not visible here.
+	delete[] img;
+
+	return 0;
+}
+
+#endif
diff --git a/cuda/2d/em.h b/cuda/2d/em.h
new file mode 100644
index 0000000..5a9ffed
--- /dev/null
+++ b/cuda/2d/em.h
@@ -0,0 +1,77 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_EM_H
+#define _CUDA_EM_H
+
+#include "util.h"
+#include "algo.h"
+
+namespace astraCUDA {
+
+// GPU implementation of an EM reconstruction built on ReconAlgo.
+// Masks and min/max constraints are not supported by this algorithm.
+class _AstraExport EM : public ReconAlgo {
+public:
+	EM();
+	virtual ~EM();
+
+	// disable some features
+	// (each of these reports failure by returning false)
+	virtual bool enableSinogramMask() { return false; }
+	virtual bool enableVolumeMask() { return false; }
+	virtual bool setMinConstraint(float) { return false; }
+	virtual bool setMaxConstraint(float) { return false; }
+
+	// Allocate the working buffers and precompute the pixel weights.
+	virtual bool init();
+
+	// Run up to 'iterations' EM update steps.
+	virtual bool iterate(unsigned int iterations);
+
+	// Square root of the dot product of the projection residual with itself.
+	virtual float computeDiffNorm();
+
+protected:
+	// Free the buffers below and reset the base class.
+	void reset();
+	// Fill D_pixelWeight; called from init().
+	bool precomputeWeights();
+
+ 	// Temporary buffers
+	// (device memory; each pitch is a row stride counted in floats)
+	float* D_projData;
+	unsigned int projPitch;
+
+	float* D_tmpData;
+	unsigned int tmpPitch;
+
+	// Geometry-specific precomputed data
+	float* D_pixelWeight;
+	unsigned int pixelPitch;
+};
+
+// Convenience wrapper: run 'iterations' EM steps on caller-owned device
+// buffers using a temporary EM object. D_projData is the measured sinogram.
+// TOffsets may be null. Returns false if setup or iteration fails.
+_AstraExport bool doEM(float* D_volumeData, unsigned int volumePitch,
+          float* D_projData, unsigned int projPitch,
+          const SDimensions& dims, const float* angles,
+          const float* TOffsets, unsigned int iterations);
+
+}
+
+#endif
diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu
new file mode 100644
index 0000000..1edc6d9
--- /dev/null
+++ b/cuda/2d/fan_bp.cu
@@ -0,0 +1,374 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+#define PIXELTRACE
+
+
+// 2D float texture type whose reads return the stored element unchanged.
+typedef texture<float, 2, cudaReadModeElementType> texture2D;
+
+// File-local texture through which the fan-beam backprojection kernels read
+// the projection data; bound by bindProjDataTexture().
+static texture2D gT_FanProjTexture;
+
+
+namespace astraCUDA {
+
+// Number of projection angles handled by a single kernel launch.
+const unsigned int g_anglesPerBlock = 16;
+// Thread-block extent in Y (volume rows per block).
+const unsigned int g_blockSliceSize = 32;
+// Thread-block extent in X (volume columns per block).
+const unsigned int g_blockSlices = 16;
+
+// Capacity of the constant-memory geometry tables below.
+const unsigned int g_MaxAngles = 2560;
+
+// Per-angle fan-beam geometry in constant memory, one array per
+// SFanProjection field; filled by FanBP() and FanBP_SART().
+__constant__ float gC_SrcX[g_MaxAngles];
+__constant__ float gC_SrcY[g_MaxAngles];
+__constant__ float gC_DetSX[g_MaxAngles];
+__constant__ float gC_DetSY[g_MaxAngles];
+__constant__ float gC_DetUX[g_MaxAngles];
+__constant__ float gC_DetUY[g_MaxAngles];
+
+
+// Bind a pitched device buffer to the file-level fan-beam projection texture.
+//
+// data          : device pointer to the first row
+// pitch         : row stride in floats (converted to bytes for the bind call)
+// width, height : extent of the bound region in texels
+//
+// Texture reads clamp at the edges, interpolate linearly and use
+// unnormalised coordinates. Always returns true; the result of the bind call
+// is not inspected.
+static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	// Sampling behaviour of the texture.
+	gT_FanProjTexture.normalized = false;
+	gT_FanProjTexture.filterMode = cudaFilterModeLinear;
+	gT_FanProjTexture.addressMode[0] = cudaAddressModeClamp;
+	gT_FanProjTexture.addressMode[1] = cudaAddressModeClamp;
+
+	const cudaChannelFormatDesc floatDesc = cudaCreateChannelDesc<float>();
+	const size_t pitchInBytes = sizeof(float)*pitch;
+	cudaBindTexture2D(0, gT_FanProjTexture, (const void*)data, floatDesc, width, height, pitchInBytes);
+
+	// TODO: error value?
+
+	return true;
+}
+
+// Fan-beam backprojection kernel: one thread per volume pixel.
+//
+// Accumulates, for the angles [startAngle, startAngle + g_anglesPerBlock)
+// clipped to dims.iProjAngles, the projection value seen by the pixel and
+// adds the sum to the volume. Geometry comes from the gC_* constant tables,
+// projection data from gT_FanProjTexture. The volume is addressed with a
+// one-pixel offset in both directions (padded layout).
+__global__ void devFanBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims)
+{
+	const int relX = threadIdx.x;
+	const int relY = threadIdx.y;
+
+	int endAngle = startAngle + g_anglesPerBlock;
+	if (endAngle > dims.iProjAngles)
+		endAngle = dims.iProjAngles;
+	const int X = blockIdx.x * g_blockSlices + relX;
+	const int Y = blockIdx.y * g_blockSliceSize + relY;
+
+	// Threads outside the volume (partial blocks at the edges) do nothing.
+	if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
+		return;
+
+	// Pixel centre relative to the volume centre; Y is flipped.
+	const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f );
+	const float fY = - ( Y - 0.5f*dims.iVolHeight + 0.5f );
+
+	float* volData = (float*)D_volData;
+
+	float fVal = 0.0f;
+	// Texture row of the current angle, sampled at the texel centre.
+	float fA = startAngle + 0.5f;
+
+	// TODO: Distance correction?
+
+	for (int angle = startAngle; angle < endAngle; ++angle)
+	{
+		const float fSrcX = gC_SrcX[angle];
+		const float fSrcY = gC_SrcY[angle];
+		const float fDetSX = gC_DetSX[angle];
+		const float fDetSY = gC_DetSY[angle];
+		const float fDetUX = gC_DetUX[angle];
+		const float fDetUY = gC_DetUY[angle];
+
+		const float fXD = fSrcX - fX;
+		const float fYD = fSrcY - fY;
+
+		// fNum / fDen is the detector coordinate hit by the ray from the
+		// source through this pixel.
+		const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;
+		const float fDen = fDetUX * fYD - fDetUY * fXD;
+		
+		// +1.0f: presumably skips the left padding column of the sinogram.
+		const float fT = fNum / fDen + 1.0f;
+		fVal += tex2D(gT_FanProjTexture, fT, fA);
+		fA += 1.0f;
+	}
+
+	volData[(Y+1)*volPitch+X+1] += fVal;
+}
+
+// supersampling version
+//
+// Same as devFanBP, but every pixel is sampled on a regular
+// iRaysPerPixelDim x iRaysPerPixelDim grid of sub-positions and the
+// accumulated value is divided by the number of sub-samples before it is
+// added to the volume.
+__global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims)
+{
+	const int relX = threadIdx.x;
+	const int relY = threadIdx.y;
+
+	int endAngle = startAngle + g_anglesPerBlock;
+	if (endAngle > dims.iProjAngles)
+		endAngle = dims.iProjAngles;
+	const int X = blockIdx.x * g_blockSlices + relX;
+	const int Y = blockIdx.y * g_blockSliceSize + relY;
+
+	// Threads outside the volume (partial blocks at the edges) do nothing.
+	if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
+		return;
+
+	// Position of the first sub-sample: the pixel corner shifted inwards by
+	// half a sub-step. Y is flipped.
+	const float fXb = ( X - 0.5f*dims.iVolWidth + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim);
+	const float fYb = - ( Y - 0.5f*dims.iVolHeight + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim);
+
+	// Distance between neighbouring sub-samples.
+	const float fSubStep = 1.0f/dims.iRaysPerPixelDim;
+
+	float* volData = (float*)D_volData;
+
+	float fVal = 0.0f;
+	// Texture row of the current angle, sampled at the texel centre.
+	float fA = startAngle + 0.5f;
+
+	// TODO: Distance correction?
+
+	for (int angle = startAngle; angle < endAngle; ++angle)
+	{
+		const float fSrcX = gC_SrcX[angle];
+		const float fSrcY = gC_SrcY[angle];
+		const float fDetSX = gC_DetSX[angle];
+		const float fDetSY = gC_DetSY[angle];
+		const float fDetUX = gC_DetUX[angle];
+		const float fDetUY = gC_DetUY[angle];
+
+		// TODO: Optimize these loops...
+		float fX = fXb;
+		for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) {
+			float fY = fYb;
+			for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) {
+				const float fXD = fSrcX - fX;
+				const float fYD = fSrcY - fY;
+
+				// fNum / fDen is the detector coordinate hit by the ray
+				// from the source through this sub-sample.
+				const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;
+				const float fDen = fDetUX * fYD - fDetUY * fXD;
+		
+				// +1.0f: presumably skips the left padding column of the sinogram.
+				const float fT = fNum / fDen + 1.0f;
+				fVal += tex2D(gT_FanProjTexture, fT, fA);
+				fY -= fSubStep;
+			}
+			fX += fSubStep;
+		}
+		fA += 1.0f;
+	}
+
+	// Average over all sub-samples of the pixel.
+	volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
+}
+
+
+// BP specifically for SART.
+// It includes (free) weighting with voxel weight.
+// NOTE(review): no explicit weighting is visible in this kernel -- confirm
+// what the line above refers to.
+// It assumes the proj texture is set up _without_ padding, unlike regular BP.
+//
+// One thread per volume pixel; backprojects a single projection line whose
+// geometry is stored at index 0 of the gC_* constant tables and whose data
+// is row 0 of gT_FanProjTexture.
+__global__ void devFanBP_SART(float* D_volData, unsigned int volPitch, const SDimensions dims)
+{
+	const int relX = threadIdx.x;
+	const int relY = threadIdx.y;
+
+	const int X = blockIdx.x * g_blockSlices + relX;
+	const int Y = blockIdx.y * g_blockSliceSize + relY;
+
+	// Threads outside the volume (partial blocks at the edges) do nothing.
+	if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
+		return;
+
+	// Pixel centre relative to the volume centre; Y is flipped.
+	const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f );
+	const float fY = - ( Y - 0.5f*dims.iVolHeight + 0.5f );
+
+	float* volData = (float*)D_volData;
+
+	// TODO: Distance correction?
+
+	// TODO: Constant memory vs parameters.
+	const float fSrcX = gC_SrcX[0];
+	const float fSrcY = gC_SrcY[0];
+	const float fDetSX = gC_DetSX[0];
+	const float fDetSY = gC_DetSY[0];
+	const float fDetUX = gC_DetUX[0];
+	const float fDetUY = gC_DetUY[0];
+
+	const float fXD = fSrcX - fX;
+	const float fYD = fSrcY - fY;
+
+	// fNum / fDen is the detector coordinate hit by the ray from the source
+	// through this pixel.
+	const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX;
+	const float fDen = fDetUX * fYD - fDetUY * fXD;
+		
+	// No +1.0f here, unlike devFanBP: the texture is bound without padding.
+	const float fT = fNum / fDen;
+	const float fVal = tex2D(gT_FanProjTexture, fT, 0.5f);
+
+	volData[(Y+1)*volPitch+X+1] += fVal;
+}
+
+
+// Fan-beam backprojection of a padded sinogram into a padded volume.
+//
+// D_volumeData : device volume, (iVolWidth+2) x (iVolHeight+2), pitch in floats;
+//                the backprojection is added to its current contents
+// D_projData   : device sinogram, (iProjDets+2) x iProjAngles, pitch in floats
+// angles       : host array of dims.iProjAngles projection geometries
+//
+// Returns false when dims.iProjAngles exceeds the capacity of the
+// constant-memory geometry tables (g_MaxAngles), true otherwise.
+bool FanBP(float* D_volumeData, unsigned int volumePitch,
+           float* D_projData, unsigned int projPitch,
+           const SDimensions& dims, const SFanProjection* angles)
+{
+	// TODO: process angles block by block
+	assert(dims.iProjAngles <= g_MaxAngles);
+
+	// The assert disappears in NDEBUG builds; refuse geometries that do not
+	// fit instead of letting the kernels index past the end of the tables.
+	if (dims.iProjAngles > g_MaxAngles)
+		return false;
+
+	bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+
+	// transfer angles to constant memory
+	// (the array-of-structs input is split into one array per field)
+	float* tmp = new float[dims.iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+	TRANSFER_TO_CONSTANT(SrcX);
+	TRANSFER_TO_CONSTANT(SrcY);
+	TRANSFER_TO_CONSTANT(DetSX);
+	TRANSFER_TO_CONSTANT(DetSY);
+	TRANSFER_TO_CONSTANT(DetUX);
+	TRANSFER_TO_CONSTANT(DetUY);
+
+#undef TRANSFER_TO_CONSTANT
+
+	delete[] tmp;
+
+	// One thread per volume pixel; partial blocks are masked in the kernels.
+	dim3 dimBlock(g_blockSlices, g_blockSliceSize);
+	dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
+	             (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
+
+	// All launches go to one stream, so the per-angle-block updates of the
+	// volume are applied one after another.
+	cudaStream_t stream;
+	cudaStreamCreate(&stream);
+
+	for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
+		if (dims.iRaysPerPixelDim > 1)
+			devFanBP_SS<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims);
+		else
+			devFanBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims);
+	}
+	cudaThreadSynchronize();
+
+	cudaTextForceKernelsCompletion();
+
+	cudaStreamDestroy(stream);
+
+	return true;
+}
+
+
+// D_projData is a pointer to one padded sinogram line
+// NOTE(review): the texture is bound with width dims.iProjDets and the
+// kernel comment says "without padding" -- confirm which one callers pass.
+//
+// Backproject the single projection 'angle' into the padded device volume.
+// Only the geometry of that projection is uploaded, to index 0 of the
+// constant-memory tables. Always returns true.
+bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,
+                float* D_projData, unsigned int projPitch,
+                unsigned int angle,
+                const SDimensions& dims, const SFanProjection* angles)
+{
+	// only one angle
+	bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1);
+
+	// transfer angle to constant memory
+#define TRANSFER_TO_CONSTANT(name) do { cudaMemcpyToSymbol(gC_##name, &(angles[angle].f##name), sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+	TRANSFER_TO_CONSTANT(SrcX);
+	TRANSFER_TO_CONSTANT(SrcY);
+	TRANSFER_TO_CONSTANT(DetSX);
+	TRANSFER_TO_CONSTANT(DetSY);
+	TRANSFER_TO_CONSTANT(DetUX);
+	TRANSFER_TO_CONSTANT(DetUY);
+
+#undef TRANSFER_TO_CONSTANT
+
+	// One thread per volume pixel; partial blocks are masked in the kernel.
+	dim3 dimBlock(g_blockSlices, g_blockSliceSize);
+	dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
+	             (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
+
+	devFanBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, dims);
+	cudaThreadSynchronize();
+
+	cudaTextForceKernelsCompletion();
+
+	return true;
+}
+
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+// Standalone test driver: backproject "sino.png" with a circular fan-beam
+// geometry of 180 projections and write the result to "vol.png".
+int main()
+{
+	float* D_volumeData;
+	float* D_projData;
+
+	SDimensions dims;
+	dims.iVolWidth = 128;
+	dims.iVolHeight = 128;
+	dims.iProjAngles = 180;
+	dims.iProjDets = 256;
+	dims.fDetScale = 1.0f;
+	dims.iRaysPerDet = 1;
+	// FanBP() selects its kernel based on this field, so it must be set.
+	dims.iRaysPerPixelDim = 1;
+	unsigned int volumePitch, projPitch;
+
+	SFanProjection projs[180];
+
+	// Reference projection; all others are rotations of it about the origin.
+	projs[0].fSrcX = 0.0f;
+	projs[0].fSrcY = 1536.0f;
+	projs[0].fDetSX = 128.0f;
+	projs[0].fDetSY = -512.0f;
+	projs[0].fDetUX = -1.0f;
+	projs[0].fDetUY = 0.0f;
+
+#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = projs[0].f##name##X * cos(alpha) - projs[0].f##name##Y * sin(alpha); projs[i].f##name##Y = projs[0].f##name##X * sin(alpha) + projs[0].f##name##Y * cos(alpha); } while(0)
+
+	for (int i = 1; i < 180; ++i) {
+		ROTATE0(Src, i, i*2*M_PI/180);
+		ROTATE0(DetS, i, i*2*M_PI/180);
+		ROTATE0(DetU, i, i*2*M_PI/180);
+	}
+
+#undef ROTATE0
+
+	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	printf("pitch: %u\n", volumePitch);
+
+	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+	// Clear the buffer including its padding columns: FanBP binds all
+	// iProjDets+2 columns to a linearly filtered texture, so rays near the
+	// detector edges sample the padding.
+	zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+	printf("pitch: %u\n", projPitch);
+
+	unsigned int y, x;
+	float* sino = loadImage("sino.png", y, x);
+
+	float* img = new float[dims.iVolWidth*dims.iVolHeight];
+
+	memset(img, 0, dims.iVolWidth*dims.iVolHeight*sizeof(float));
+
+	copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+	copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+	FanBP(D_volumeData, volumePitch, D_projData, projPitch, dims, projs);
+
+	copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+
+	saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img);
+
+	// NOTE(review): 'sino' is left to process exit; how loadImage allocates
+	// it is not visible here.
+	delete[] img;
+	cudaFree(D_projData);
+	cudaFree(D_volumeData);
+
+	return 0;
+}
+#endif
diff --git a/cuda/2d/fan_bp.h b/cuda/2d/fan_bp.h
new file mode 100644
index 0000000..a4e62be
--- /dev/null
+++ b/cuda/2d/fan_bp.h
@@ -0,0 +1,45 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_FAN_BP_H
+#define _CUDA_FAN_BP_H
+
+namespace astraCUDA {
+// Presumably the fan-beam backprojection entry point (defined in fan_bp.cu); the standalone test passes one SFanProjection per projection angle.
+_AstraExport bool FanBP(float* D_volumeData, unsigned int volumePitch,
+           float* D_projData, unsigned int projPitch,
+           const SDimensions& dims, const SFanProjection* angles);
+// Variant with an extra `angle` argument - presumably the index of the single view to backproject for SART; TODO confirm against fan_bp.cu.
+_AstraExport bool FanBP_SART(float* D_volumeData, unsigned int volumePitch,
+                float* D_projData, unsigned int projPitch,
+                unsigned int angle,
+                const SDimensions& dims, const SFanProjection* angles);
+
+}
+
+#endif
diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu
new file mode 100644
index 0000000..cf9f352
--- /dev/null
+++ b/cuda/2d/fan_fp.cu
@@ -0,0 +1,370 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+
+typedef texture<float, 2, cudaReadModeElementType> texture2D;
+// Texture reference the FP kernels sample the volume through; bindVolumeDataTexture binds it to a cudaArray.
+static texture2D gT_FanVolumeTexture;
+
+
+namespace astraCUDA {
+
+static const unsigned g_MaxAngles = 2560; // number of entries in each per-angle constant array below
+__constant__ float gC_SrcX[g_MaxAngles]; // SFanProjection::fSrcX of every angle, uploaded by FanFP
+__constant__ float gC_SrcY[g_MaxAngles]; // fSrcY
+__constant__ float gC_DetSX[g_MaxAngles]; // fDetSX
+__constant__ float gC_DetSY[g_MaxAngles]; // fDetSY
+__constant__ float gC_DetUX[g_MaxAngles]; // fDetUX
+__constant__ float gC_DetUY[g_MaxAngles]; // fDetUY
+
+
+// optimization parameters
+static const unsigned int g_anglesPerBlock = 16; // blockDim.y: angles handled per thread block
+static const unsigned int g_detBlockSize = 32; // blockDim.x: detectors handled per thread block
+static const unsigned int g_blockSlices = 64; // volume columns/rows traversed per kernel launch
+
+// Copy the pitched device buffer `data` (width x height floats, `pitch` floats per row) into a newly allocated
+// cudaArray and bind gT_FanVolumeTexture to it. Returns false, with dataArray left at 0, if a CUDA call fails.
+static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+	dataArray = 0;
+	if (cudaMallocArray(&dataArray, &channelDesc, width, height) != cudaSuccess) { dataArray = 0; return false; }
+	cudaError_t err = cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice);
+	// clamp reads outside the array to its edge, interpolate linearly, address in texel units
+	gT_FanVolumeTexture.addressMode[0] = cudaAddressModeClamp;
+	gT_FanVolumeTexture.addressMode[1] = cudaAddressModeClamp;
+	gT_FanVolumeTexture.filterMode = cudaFilterModeLinear;
+	gT_FanVolumeTexture.normalized = false;
+
+	// TODO: For very small sizes (roughly <=512x128) with few angles (<=180)
+	// not using an array is more efficient.
+	//cudaBindTexture2D(0, gT_FanVolumeTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch);
+	if (err == cudaSuccess) err = cudaBindTextureToArray(gT_FanVolumeTexture, dataArray, channelDesc);
+	if (err != cudaSuccess) { cudaFreeArray(dataArray); dataArray = 0; return false; } // do not leak the array on failure
+	return true;
+}
+
+// Forward projection for rays that run mostly along the x axis
+// (detector roughly vertical); all other rays are handled by FanFPvertical.
+__global__ void FanFPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
+{
+	// threadIdx.y picks the angle within this block, threadIdx.x the detector
+	const int angle = startAngle + blockIdx.x * g_anglesPerBlock + threadIdx.y;
+	const int detector = blockIdx.y * g_detBlockSize + threadIdx.x;
+
+	// threads outside the requested angle range or the detector range have no work
+	if (angle >= endAngle || detector < 0 || detector >= dims.iProjDets)
+		return;
+
+	// geometry of this view, as uploaded to constant memory by FanFP
+	const float srcX = gC_SrcX[angle];
+	const float srcY = gC_SrcY[angle];
+	const float detBaseX = gC_DetSX[angle];
+	const float detBaseY = gC_DetSY[angle];
+	const float detStepX = gC_DetUX[angle];
+	const float detStepY = gC_DetUY[angle];
+
+	// leave rays whose |dy| exceeds |dx| to FanFPvertical
+	const float absDx = fabsf(detBaseX + detector*detStepX + 0.5f - srcX);
+	const float absDy = fabsf(detBaseY + detector*detStepY + 0.5f - srcY);
+
+	if (absDy > absDx)
+		return;
+
+	// this launch covers the columns [startSlice, lastSlice),
+	// clipped to the volume width
+	int lastSlice = startSlice + g_blockSlices;
+	if (lastSlice > dims.iVolWidth)
+		lastSlice = dims.iVolWidth;
+
+	float total = 0.0f;
+
+	// one ray per sub-sample of the detector pixel
+	for (int sub = 0; sub < dims.iRaysPerDet; ++sub) {
+		const float detPos = detector + (0.5f + sub) / dims.iRaysPerDet;
+		const float hitX = detBaseX + detPos * detStepX;
+		const float hitY = detBaseY + detPos * detStepY;
+
+		// ray: y = slope * x + offset
+		const float slope = (srcY - hitY) / (srcX - hitX);
+		const float offset = srcY - slope * srcX;
+
+		// ray length per unit step in x, scaled by outputScale and averaged over the sub-rays
+		const float weight = sqrt(slope*slope+1.0f) * outputScale / dims.iRaysPerDet;
+
+		// texture coordinates of the ray at the first column of this launch
+		float texY = -slope * (startSlice - 0.5f*dims.iVolWidth + 0.5f) - offset + 0.5f*dims.iVolHeight - 0.5f + 1.5f;
+		float texX = startSlice + 1.5f;
+
+		// step one column at a time, summing the interpolated texture samples
+		float raySum = 0.0f;
+		int slice = startSlice;
+		while (slice < lastSlice) {
+			raySum += tex2D(gT_FanVolumeTexture, texX, texY);
+			texY -= slope;
+			texX += 1.0f;
+			++slice;
+		}
+
+		total += raySum * weight;
+	}
+
+	// accumulate into the sinogram; the +1 skips the first column of each row
+	D_projData[angle*projPitch+detector+1] += total;
+}
+
+
+// Forward projection for rays that run mostly along the y axis
+// (detector roughly horizontal); all other rays are handled by FanFPhorizontal.
+__global__ void FanFPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
+{
+	// threadIdx.y picks the angle within this block, threadIdx.x the detector
+	const int angle = startAngle + blockIdx.x * g_anglesPerBlock + threadIdx.y;
+	const int detector = blockIdx.y * g_detBlockSize + threadIdx.x;
+
+	// threads outside the requested angle range or the detector range have no work
+	if (angle >= endAngle || detector < 0 || detector >= dims.iProjDets)
+		return;
+
+	// geometry of this view, as uploaded to constant memory by FanFP
+	const float srcX = gC_SrcX[angle];
+	const float srcY = gC_SrcY[angle];
+	const float detBaseX = gC_DetSX[angle];
+	const float detBaseY = gC_DetSY[angle];
+	const float detStepX = gC_DetUX[angle];
+	const float detStepY = gC_DetUY[angle];
+
+	// same direction test as in FanFPhorizontal with the opposite outcome:
+	// rays whose |dy| does not exceed |dx| are left to that kernel
+	const float absDx = fabsf(detBaseX + detector*detStepX + 0.5f - srcX);
+	const float absDy = fabsf(detBaseY + detector*detStepY + 0.5f - srcY);
+
+	if (absDy <= absDx)
+		return;
+
+	// this launch covers the rows [startSlice, lastSlice),
+	// clipped to the volume height
+	int lastSlice = startSlice + g_blockSlices;
+	if (lastSlice > dims.iVolHeight)
+		lastSlice = dims.iVolHeight;
+
+	float total = 0.0f;
+
+	// one ray per sub-sample of the detector pixel
+	for (int sub = 0; sub < dims.iRaysPerDet; ++sub) {
+		const float detPos = detector + (0.5f + sub) / dims.iRaysPerDet;
+		const float hitX = detBaseX + detPos * detStepX;
+		const float hitY = detBaseY + detPos * detStepY;
+
+		// ray: x = slope * y + offset
+		const float slope = (srcX - hitX) / (srcY - hitY);
+		const float offset = srcX - slope * srcY;
+
+		// ray length per unit step in y, scaled by outputScale
+		// and averaged over the sub-rays
+		const float weight = sqrt(slope*slope+1) * outputScale / dims.iRaysPerDet;
+
+		// texture coordinates of the ray at the first row of this launch
+		float texX = -slope * (startSlice - 0.5f*dims.iVolHeight + 0.5f) + offset + 0.5f*dims.iVolWidth - 0.5f + 1.5f;
+		float texY = startSlice + 1.5f;
+
+		// step one row at a time, summing the interpolated texture samples
+		float raySum = 0.0f;
+		int slice = startSlice;
+		while (slice < lastSlice) {
+			raySum += tex2D(gT_FanVolumeTexture, texX, texY);
+			texX -= slope;
+			texY += 1.0f;
+			++slice;
+		}
+
+		total += raySum * weight;
+	}
+
+	// accumulate into the sinogram;
+	// the +1 skips the first column of each row
+	D_projData[angle*projPitch+detector+1] += total;
+}
+
+// Fan-beam forward projection of the volume, accumulated (+=) into D_projData and scaled by outputScale.
+// Returns false when dims.iProjAngles exceeds g_MaxAngles or when binding, uploading or the final sync reports an error.
+bool FanFP(float* D_volumeData, unsigned int volumePitch,
+           float* D_projData, unsigned int projPitch,
+           const SDimensions& dims, const SFanProjection* angles,
+           float outputScale)
+{
+	// TODO: load angles into constant memory in smaller blocks
+	assert(dims.iProjAngles <= g_MaxAngles);
+	if (dims.iProjAngles > g_MaxAngles)
+		return false; // assert() is compiled out under NDEBUG; never overrun the constant arrays
+	if (dims.iProjAngles == 0)
+		return true; // nothing to project, and a grid with zero blocks cannot be launched
+
+	cudaArray* D_dataArray = 0;
+	if (!bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2))
+		return false;
+
+	// transfer angles to constant memory
+	float* tmp = new float[dims.iProjAngles];
+	cudaError_t err = cudaSuccess;
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; if (err == cudaSuccess) err = cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+	TRANSFER_TO_CONSTANT(SrcX);
+	TRANSFER_TO_CONSTANT(SrcY);
+	TRANSFER_TO_CONSTANT(DetSX);
+	TRANSFER_TO_CONSTANT(DetSY);
+	TRANSFER_TO_CONSTANT(DetUX);
+	TRANSFER_TO_CONSTANT(DetUY);
+
+#undef TRANSFER_TO_CONSTANT
+
+	delete[] tmp;
+	if (err != cudaSuccess) { cudaFreeArray(D_dataArray); return false; }
+
+	dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles
+	const unsigned int g_blockSliceSize = g_detBlockSize;
+	unsigned int blockStart = 0;
+	unsigned int blockEnd = dims.iProjAngles;
+
+	dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
+	             (dims.iProjDets+g_blockSliceSize-1)/g_blockSliceSize); // angle blocks, regions
+	// Each sinogram entry is written by only one of the two kernels (their direction tests are complementary), so the streams may overlap.
+	cudaStream_t stream1;
+	cudaStreamCreate(&stream1);
+	for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices)
+		FanFPhorizontal<<<dimGrid, dimBlock, 0, stream1>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
+
+	cudaStream_t stream2;
+	cudaStreamCreate(&stream2);
+	for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
+		FanFPvertical<<<dimGrid, dimBlock, 0, stream2>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
+
+	cudaStreamDestroy(stream1);
+	cudaStreamDestroy(stream2);
+
+	err = cudaThreadSynchronize(); // reports errors raised while the kernels ran
+	cudaTextForceKernelsCompletion();
+	cudaFreeArray(D_dataArray);
+	return err == cudaSuccess;
+}
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+int main()
+{
+	float* D_volumeData;
+	float* D_projData;
+	// Standalone smoke test: forward project phantom128.png with a fixed fan-beam geometry and write sino.png.
+	SDimensions dims;
+	dims.iVolWidth = 128;
+	dims.iVolHeight = 128;
+	dims.iProjAngles = 180;
+	dims.iProjDets = 256;
+	dims.fDetScale = 1.0f;
+	dims.iRaysPerDet = 1;
+	unsigned int volumePitch, projPitch;
+
+	SFanProjection projs[180];
+	// View 0 is specified by hand; views 1..179 are copies of it rotated in 2-degree steps.
+	projs[0].fSrcX = 0.0f;
+	projs[0].fSrcY = 1536.0f;
+	projs[0].fDetSX = 128.0f;
+	projs[0].fDetSY = -512.0f;
+	projs[0].fDetUX = -1.0f;
+	projs[0].fDetUY = 0.0f;
+
+	// ROTATE0(name, i, alpha): view i's `name` vector is view 0's rotated by alpha radians.
+#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = projs[0].f##name##X * cos(alpha) - projs[0].f##name##Y * sin(alpha); projs[i].f##name##Y = projs[0].f##name##X * sin(alpha) + projs[0].f##name##Y * cos(alpha); } while(0)
+
+	for (int i = 1; i < 180; ++i) {
+		ROTATE0(Src, i, i*2*M_PI/180);
+		ROTATE0(DetS, i, i*2*M_PI/180);
+		ROTATE0(DetU, i, i*2*M_PI/180);
+	}
+
+#undef ROTATE0
+	// Device buffers are allocated larger than the data: +2 in both volume dimensions, +2 detectors per sinogram row.
+	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	printf("pitch: %u\n", volumePitch);
+
+	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+	printf("pitch: %u\n", projPitch);
+
+	unsigned int y, x;
+	float* img = loadImage("phantom128.png", y, x);
+	if (!img) { printf("could not load phantom128.png\n"); return 1; } // assumes loadImage returns null on failure - TODO confirm
+	float* sino = new float[dims.iProjAngles * dims.iProjDets];
+
+	// FanFP accumulates into the sinogram, so start from zeros.
+	memset(sino, 0, dims.iProjAngles * dims.iProjDets * sizeof(float));
+
+	copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+	copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+	// Forward project with an output scale of 1.
+	FanFP(D_volumeData, volumePitch, D_projData, projPitch, dims, projs, 1.0f);
+
+	copySinogramFromDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+	// Print the sum of squares of the sinogram computed on the host, then the value reported by the GPU reduction.
+	float s = 0.0f;
+	for (unsigned int y = 0; y < dims.iProjAngles; ++y)
+		for (unsigned int x = 0; x < dims.iProjDets; ++x)
+			s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x];
+	printf("cpu norm: %f\n", s);
+
+	//zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+	// NOTE(review): the meaning of the trailing (1, 0) arguments of dotProduct2D is not visible here - confirm against its declaration.
+	s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+	printf("gpu norm: %f\n", s);
+
+	saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino);
+
+	// `sino` is allocated with new[] above and has to be released here.
+	delete[] sino;
+
+	return 0;
+}
+#endif
diff --git a/cuda/2d/fan_fp.h b/cuda/2d/fan_fp.h
new file mode 100644
index 0000000..0734f40
--- /dev/null
+++ b/cuda/2d/fan_fp.h
@@ -0,0 +1,41 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_FAN_FP_H
+#define _CUDA_FAN_FP_H
+
+namespace astraCUDA {
+// Fan-beam forward projection of D_volumeData, added to D_projData and scaled by outputScale (defined in fan_fp.cu).
+_AstraExport bool FanFP(float* D_volumeData, unsigned int volumePitch,
+           float* D_projData, unsigned int projPitch,
+           const SDimensions& dims, const SFanProjection* angles,
+           float outputScale);
+
+}
+
+#endif
diff --git a/cuda/2d/fbp_filters.h b/cuda/2d/fbp_filters.h
new file mode 100644
index 0000000..1232f8e
--- /dev/null
+++ b/cuda/2d/fbp_filters.h
@@ -0,0 +1,58 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef FBP_FILTERS_H
+#define FBP_FILTERS_H
+/// Filter selection for FBP; genFilter() in fft.cu takes a value of this type.
+enum E_FBPFILTER
+{
+	FILTER_NONE,			///< no filter (regular BP)
+	FILTER_RAMLAK,			///< default FBP filter
+	FILTER_SHEPPLOGAN,		///< Shepp-Logan
+	FILTER_COSINE,			///< Cosine
+	FILTER_HAMMING,			///< Hamming filter
+	FILTER_HANN,			///< Hann filter
+	FILTER_TUKEY,			///< Tukey filter
+	FILTER_LANCZOS,			///< Lanczos filter
+	FILTER_TRIANGULAR,		///< Triangular filter
+	FILTER_GAUSSIAN,		///< Gaussian filter
+	FILTER_BARTLETTHANN,	///< Bartlett-Hann filter
+	FILTER_BLACKMAN,		///< Blackman filter
+	FILTER_NUTTALL,			///< Nuttall filter, continuous first derivative
+	FILTER_BLACKMANHARRIS,	///< Blackman-Harris filter
+	FILTER_BLACKMANNUTTALL,	///< Blackman-Nuttall filter
+	FILTER_FLATTOP,			///< Flat top filter
+	FILTER_KAISER,			///< Kaiser filter
+	FILTER_PARZEN,			///< Parzen filter
+	FILTER_PROJECTION,		///< all projection directions share one filter
+	FILTER_SINOGRAM,		///< every projection direction has its own filter
+	FILTER_RPROJECTION,		///< projection filter in real space (as opposed to fourier space)
+	FILTER_RSINOGRAM,		///< sinogram filter in real space
+};
+
+#endif /* FBP_FILTERS_H */
diff --git a/cuda/2d/fft.cu b/cuda/2d/fft.cu
new file mode 100644
index 0000000..79e4be7
--- /dev/null
+++ b/cuda/2d/fft.cu
@@ -0,0 +1,873 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include "fft.h"
+#include "util.h"
+
+#include <cufft.h>
+#include <iostream>
+#include <cuda.h>
+#include <fstream>
+
+#include "../../include/astra/Logger.h"
+
+using namespace astra;
+
+// TODO: evaluate what we want to do in these situations:
+// CHECK_ERROR synchronizes with the device; if that reports an error it prints and logs the error, then exits the process.
+#define CHECK_ERROR(errorMessage) do {                                     \
+  cudaError_t err = cudaThreadSynchronize();                               \
+  if( cudaSuccess != err) {                                                \
+      fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n",    \
+              errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\
+			  CLogger::writeTerminalCUDAError(__FILE__, __LINE__, cudaGetErrorString( err)); \
+      exit(EXIT_FAILURE);                                                  \
+  } } while (0)
+// SAFE_CALL evaluates `call`, then synchronizes with the device; if either step reports an error it prints and logs it, then exits the process.
+#define SAFE_CALL( call) do {                                              \
+  cudaError err = call;                                                    \
+  if( cudaSuccess != err) {                                                \
+      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
+              __FILE__, __LINE__, cudaGetErrorString( err) );              \
+	  CLogger::writeTerminalCUDAError(__FILE__, __LINE__, cudaGetErrorString( err)); \
+      exit(EXIT_FAILURE);                                                  \
+  }                                                                        \
+  err = cudaThreadSynchronize();                                           \
+  if( cudaSuccess != err) {                                                \
+      fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",        \
+              __FILE__, __LINE__, cudaGetErrorString( err) );              \
+	  CLogger::writeTerminalCUDAError(__FILE__, __LINE__, cudaGetErrorString( err)); \
+      exit(EXIT_FAILURE);                                                  \
+  } } while (0)
+
+
+// Multiply every sinogram frequency bin by the matching filter bin (complex product), in place.
+// Indexed in 1D with one thread per (projection, bin) element; threads past the last projection do nothing.
+__global__ static void applyFilter_kernel(int _iProjectionCount,
+                                          int _iFreqBinCount,
+                                          cufftComplex * _pSinogram,
+                                          cufftComplex * _pFilter)
+{
+	const int iElement = threadIdx.x + blockIdx.x * blockDim.x;
+	// only elements whose projection row exists are processed
+	if(iElement / _iFreqBinCount < _iProjectionCount)
+	{
+		const float fSinoRe = _pSinogram[iElement].x;
+		const float fSinoIm = _pSinogram[iElement].y;
+		const float fFiltRe = _pFilter[iElement].x;
+		const float fFiltIm = _pFilter[iElement].y;
+
+		// (a + bi)(c + di) = (ac - bd) + (ad + cb)i
+		_pSinogram[iElement].x = fSinoRe * fFiltRe - fSinoIm * fFiltIm;
+		_pSinogram[iElement].y = fSinoRe * fFiltIm + fFiltRe * fSinoIm;
+	}
+}
+
+// Divide each element of the (projection x detector) array by the detector count, in place.
+// Indexed in 1D with one thread per element; threads past the last projection do nothing.
+__global__ static void rescaleInverseFourier_kernel(int _iProjectionCount,
+                                                    int _iDetectorCount,
+                                                    float* _pfInFourierOutput)
+{
+	const int iElement = threadIdx.x + blockIdx.x * blockDim.x;
+
+	// the quotient is the projection row this element belongs to
+	if(iElement / _iDetectorCount >= _iProjectionCount)
+		return;
+
+	// iElement already equals row * _iDetectorCount + column, so it indexes the array directly
+	_pfInFourierOutput[iElement] /= (float)_iDetectorCount;
+}
+
+// Launch rescaleInverseFourier_kernel over all elements; CHECK_ERROR exits the process if the device reports an error.
+static void rescaleInverseFourier(int _iProjectionCount, int _iDetectorCount,
+                                  float * _pfInFourierOutput)
+{
+	const int iThreadsPerBlock = 256;
+	const int iTotal = _iProjectionCount * _iDetectorCount;
+	// round up so that a partially filled last block still covers the tail
+	const int iBlocks = (iTotal + iThreadsPerBlock - 1) / iThreadsPerBlock;
+	rescaleInverseFourier_kernel<<< iBlocks, iThreadsPerBlock >>>(_iProjectionCount, _iDetectorCount,
+	                                                              _pfInFourierOutput);
+	CHECK_ERROR("rescaleInverseFourier_kernel failed");
+}
+
+// Launch applyFilter_kernel over all (projection, bin) elements; CHECK_ERROR exits the process if the device reports an error.
+void applyFilter(int _iProjectionCount, int _iFreqBinCount,
+                 cufftComplex * _pSinogram, cufftComplex * _pFilter)
+{
+	const int iThreadsPerBlock = 256;
+	const int iTotal = _iProjectionCount * _iFreqBinCount;
+	// round up so that a partially filled last block still covers the tail
+	const int iBlocks = (iTotal + iThreadsPerBlock - 1) / iThreadsPerBlock;
+	applyFilter_kernel<<< iBlocks, iThreadsPerBlock >>>(_iProjectionCount, _iFreqBinCount,
+	                                                    _pSinogram, _pFilter);
+	CHECK_ERROR("applyFilter_kernel failed");
+}
+
+// Run a batch of _iProjectionCount real-to-complex 1D FFTs of length _iDetectorCount on device data.
+// Returns false (after printing to std::cerr) if planning or execution fails.
+static bool invokeCudaFFT(int _iProjectionCount, int _iDetectorCount,
+                          const float * _pfDevSource,
+                          cufftComplex * _pDevTargetComplex)
+{
+	cufftHandle hPlan;
+	if(cufftPlan1d(&hPlan, _iDetectorCount, CUFFT_R2C, _iProjectionCount) != CUFFT_SUCCESS)
+	{
+		std::cerr << "Failed to plan 1d r2c fft" << std::endl;
+		return false;
+	}
+
+	// the plan is only needed for this one execution, so release it before looking at the result
+	const cufftResult eExec = cufftExecR2C(hPlan, (cufftReal *)_pfDevSource, _pDevTargetComplex);
+	cufftDestroy(hPlan);
+
+	if(eExec == CUFFT_SUCCESS)
+	{
+		return true;
+	}
+
+	std::cerr << "Failed to exec 1d r2c fft" << std::endl;
+	return false;
+}
+
+// Run a batch of _iProjectionCount complex-to-real 1D inverse FFTs of real length _iDetectorCount on device data.
+// Returns false (after printing to std::cerr) if planning or execution fails.
+static bool invokeCudaIFFT(int _iProjectionCount, int _iDetectorCount,
+                           const cufftComplex * _pDevSourceComplex,
+                           float * _pfDevTarget)
+{
+	cufftHandle hPlan;
+
+	if(cufftPlan1d(&hPlan, _iDetectorCount, CUFFT_C2R, _iProjectionCount) != CUFFT_SUCCESS)
+	{
+		std::cerr << "Failed to plan 1d c2r fft" << std::endl;
+		return false;
+	}
+
+	// cufftExecC2R declares its input as a non-const pointer, hence the cast that drops const
+	const cufftResult eExec = cufftExecC2R(hPlan, (cufftComplex *)_pDevSourceComplex,
+	                                       (cufftReal *)_pfDevTarget);
+	cufftDestroy(hPlan);
+
+	if(eExec == CUFFT_SUCCESS)
+	{
+		return true;
+	}
+
+	std::cerr << "Failed to exec 1d c2r fft" << std::endl;
+	return false;
+}
+
+// Allocate device storage for _iProjectionCount * _iDetectorCount cufftComplex values; SAFE_CALL exits the process on failure.
+bool allocateComplexOnDevice(int _iProjectionCount, int _iDetectorCount,
+                             cufftComplex ** _ppDevComplex)
+{
+	SAFE_CALL(cudaMalloc((void **)_ppDevComplex, sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount));
+	return true;
+}
+
+bool freeComplexOnDevice(cufftComplex * _pDevComplex) // releases the buffer with cudaFree
+{
+	SAFE_CALL(cudaFree(_pDevComplex)); // SAFE_CALL prints the error and exits the process on failure
+	return true; // so the only value ever returned is true
+}
+
+// Host-to-device copy of _iProjectionCount * _iDetectorCount cufftComplex values; SAFE_CALL exits the process on failure.
+bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount,
+                                cufftComplex * _pHostComplexSource,
+                                cufftComplex * _pDevComplexTarget)
+{
+	const size_t nBytes = sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount;
+	SAFE_CALL(cudaMemcpy(_pDevComplexTarget, _pHostComplexSource, nBytes, cudaMemcpyHostToDevice));
+	return true;
+}
+
+// Copy _iProjDets values of each sinogram row into a zero-padded row of _iFFTRealDetectorCount floats and run
+// the batched real-to-complex FFT into _pDevTargetComplex. Returns false if the FFT fails.
+bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
+                int _iSourcePitch, int _iSourcePadX, int _iProjDets,
+                int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount,
+                cufftComplex * _pDevTargetComplex)
+{
+	float * pfDevRealFFTSource = NULL;
+	size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount;
+	SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTSource, bufferMemSize));
+	SAFE_CALL(cudaMemset(pfDevRealFFTSource, 0, bufferMemSize)); // the tail of every row stays zero
+
+	for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)
+	{
+		const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch + _iSourcePadX;
+		float * pfTargetLocation = pfDevRealFFTSource + iProjectionIndex * _iFFTRealDetectorCount;
+		SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice));
+	}
+
+	bool bResult = invokeCudaFFT(_iProjectionCount, _iFFTRealDetectorCount,
+	                             pfDevRealFFTSource, _pDevTargetComplex);
+	if(!bResult)
+	{
+		cudaFree(pfDevRealFFTSource); // release the staging buffer; plain call so a failed FFT still returns false instead of exiting
+		return false;
+	}
+
+	SAFE_CALL(cudaFree(pfDevRealFFTSource));
+	return true;
+}
+
+// Run the batched complex-to-real inverse FFT, divide the result by the FFT length, zero _pfRealTarget and copy
+// the first _iProjDets values of every row into it at column _iTargetPadX. Returns false if the FFT fails.
+bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
+                 float * _pfRealTarget,
+                 int _iTargetPitch, int _iTargetPadX, int _iProjDets,
+                 int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount)
+{
+	float * pfDevRealFFTTarget = NULL;
+	size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount;
+	SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTTarget, bufferMemSize));
+
+	bool bResult = invokeCudaIFFT(_iProjectionCount, _iFFTRealDetectorCount,
+	                              _pDevSourceComplex, pfDevRealFFTTarget);
+	if(!bResult)
+	{
+		cudaFree(pfDevRealFFTTarget); // release the temporary buffer; plain call so a failed FFT still returns false instead of exiting
+		return false;
+	}
+
+	rescaleInverseFourier(_iProjectionCount, _iFFTRealDetectorCount,
+                          pfDevRealFFTTarget);
+
+	SAFE_CALL(cudaMemset(_pfRealTarget, 0, sizeof(float) * _iProjectionCount * _iTargetPitch));
+
+	for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)
+	{
+		const float * pfSourceLocation = pfDevRealFFTTarget + iProjectionIndex * _iFFTRealDetectorCount;
+		float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch + _iTargetPadX;
+		SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice));
+	}
+
+	SAFE_CALL(cudaFree(pfDevRealFFTTarget));
+	return true;
+}
+
+
+// For real input the Fourier transform is symmetric, so CUFFT outputs only
+// the first half and expects the same layout as input for the IFFT.
+// Returns the number of complex values per transformed row.
+int calcFFTFourSize(int _iFFTRealSize)
+{
+	// half of the real length (integer division), plus one
+	const int iHalf = _iFFTRealSize / 2;
+	return iHalf + 1;
+}
+
+void genIdenFilter(int _iProjectionCount, cufftComplex * _pFilter,
+                   int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount)
+{
+	for(int iRow = 0; iRow < _iProjectionCount; iRow++) // one row of _iFFTFourierDetectorCount bins per projection
+	{
+		for(int iBin = 0; iBin < _iFFTFourierDetectorCount; iBin++)
+		{
+			cufftComplex & rBin = _pFilter[iBin + iRow * _iFFTFourierDetectorCount]; // rows are stored back to back
+			rBin.x = 1.0f; // 1 + 0i: multiplying by it leaves a complex value unchanged
+			rBin.y = 0.0f;
+		}
+	}
+}
+
+void genFilter(E_FBPFILTER _eFilter, float _fD, int _iProjectionCount,
+               cufftComplex * _pFilter, int _iFFTRealDetectorCount,
+               int _iFFTFourierDetectorCount, float _fParameter /* = -1.0f */)
+{
+	float * pfFilt = new float[_iFFTFourierDetectorCount];
+	float * pfW = new float[_iFFTFourierDetectorCount];
+
+	for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+	{
+		float fRelIndex = (float)iDetectorIndex / (float)_iFFTRealDetectorCount;
+
+		// filt = 2*( 0:(order/2) )./order;
+		pfFilt[iDetectorIndex] = 2.0f * fRelIndex;
+		//pfFilt[iDetectorIndex] = 1.0f;
+
+		// w = 2*pi*(0:size(filt,2)-1)/order
+		pfW[iDetectorIndex] = 3.1415f * 2.0f * fRelIndex;
+	}
+
+	switch(_eFilter)
+	{
+		case FILTER_RAMLAK:
+		{
+			// do nothing
+			break;
+		}
+		case FILTER_SHEPPLOGAN:
+		{
+			// filt(2:end) = filt(2:end) .* (sin(w(2:end)/(2*d))./(w(2:end)/(2*d)))
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * (sinf(pfW[iDetectorIndex] / 2.0f / _fD) / (pfW[iDetectorIndex] / 2.0f / _fD));
+			}
+			break;
+		}
+		case FILTER_COSINE:
+		{
+			// filt(2:end) = filt(2:end) .* cos(w(2:end)/(2*d))
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * cosf(pfW[iDetectorIndex] / 2.0f / _fD);
+			}
+			break;
+		}
+		case FILTER_HAMMING:
+		{
+			// filt(2:end) = filt(2:end) .* (.54 + .46 * cos(w(2:end)/d))
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * ( 0.54f + 0.46f * cosf(pfW[iDetectorIndex] / _fD));
+			}
+			break;
+		}
+		case FILTER_HANN:
+		{
+			// filt(2:end) = filt(2:end) .*(1+cos(w(2:end)./d)) / 2
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * (1.0f + cosf(pfW[iDetectorIndex] / _fD)) / 2.0f;
+			}
+			break;
+		}
+		case FILTER_TUKEY:
+		{
+			float fAlpha = _fParameter;
+			if(_fParameter < 0.0f) fAlpha = 0.5f;
+			float fN = (float)_iFFTFourierDetectorCount;
+			float fHalfN = fN / 2.0f;
+			float fEnumTerm = fAlpha * fHalfN;
+			float fDenom = (1.0f - fAlpha) * fHalfN;
+			float fBlockStart = fHalfN - fEnumTerm;
+			float fBlockEnd = fHalfN + fEnumTerm;
+
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				float fAbsSmallN = fabs((float)iDetectorIndex);
+				float fStoredValue = 0.0f;
+
+				if((fBlockStart <= fAbsSmallN) && (fAbsSmallN <= fBlockEnd))
+				{
+					fStoredValue = 1.0f;
+				}
+				else
+				{
+					float fEnum = fAbsSmallN - fEnumTerm;
+					float fCosInput = M_PI * fEnum / fDenom;
+					fStoredValue = 0.5f * (1.0f + cosf(fCosInput));
+				}
+
+				pfFilt[iDetectorIndex] *= fStoredValue;
+			}
+
+			break;
+		}
+		case FILTER_LANCZOS:
+		{
+			float fDenum = (float)(_iFFTFourierDetectorCount - 1);
+
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				float fSmallN = (float)iDetectorIndex;
+				float fX = 2.0f * fSmallN / fDenum - 1.0f;
+				float fSinInput = M_PI * fX;
+				float fStoredValue = 0.0f;
+
+				if(fabsf(fSinInput) > 0.001f)
+				{
+					fStoredValue = sin(fSinInput)/fSinInput;
+				}
+				else
+				{
+					fStoredValue = 1.0f;
+				}
+
+				pfFilt[iDetectorIndex] *= fStoredValue;
+			}
+
+			break;
+		}
+		case FILTER_TRIANGULAR:
+		{
+			float fNMinusOne = (float)(_iFFTFourierDetectorCount - 1);
+
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				float fSmallN = (float)iDetectorIndex;
+				float fAbsInput = fSmallN - fNMinusOne / 2.0f;
+				float fParenInput = fNMinusOne / 2.0f - fabsf(fAbsInput);
+				float fStoredValue = 2.0f / fNMinusOne * fParenInput;
+
+				pfFilt[iDetectorIndex] *= fStoredValue;
+			}
+
+			break;
+		}
+		case FILTER_GAUSSIAN:
+		{
+			float fSigma = _fParameter;
+			if(_fParameter < 0.0f) fSigma = 0.4f;
+			float fN = (float)_iFFTFourierDetectorCount;
+			float fQuotient = (fN - 1.0f) / 2.0f;
+
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				float fSmallN = (float)iDetectorIndex;
+				float fEnum = fSmallN - fQuotient;
+				float fDenom = fSigma * fQuotient;
+				float fPower = -0.5f * (fEnum / fDenom) * (fEnum / fDenom);
+				float fStoredValue = expf(fPower);
+
+				pfFilt[iDetectorIndex] *= fStoredValue;
+			}
+
+			break;
+		}
+		case FILTER_BARTLETTHANN:
+		{
+			const float fA0 = 0.62f;
+			const float fA1 = 0.48f;
+			const float fA2 = 0.38f;
+			float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f;
+			
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				float fSmallN = (float)iDetectorIndex;
+				float fAbsInput = fSmallN / fNMinusOne - 0.5f;
+				float fFirstTerm = fA1 * fabsf(fAbsInput);
+				float fCosInput = 2.0f * M_PI * fSmallN / fNMinusOne;
+				float fSecondTerm = fA2 * cosf(fCosInput);
+				float fStoredValue = fA0 - fFirstTerm - fSecondTerm;
+
+				pfFilt[iDetectorIndex] *= fStoredValue;
+			}
+
+			break;
+		}
+		case FILTER_BLACKMAN:
+		{
+			float fAlpha = _fParameter;
+			if(_fParameter < 0.0f) fAlpha = 0.16f;
+			float fA0 = (1.0f - fAlpha) / 2.0f;
+			float fA1 = 0.5f;
+			float fA2 = fAlpha / 2.0f;
+			float fNMinusOne = (float)(_iFFTFourierDetectorCount - 1);
+
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				float fSmallN = (float)iDetectorIndex;
+				float fCosInput1 = 2.0f * M_PI * 0.5f * fSmallN / fNMinusOne;
+				float fCosInput2 = 4.0f * M_PI * 0.5f * fSmallN / fNMinusOne;
+				float fStoredValue = fA0 - fA1 * cosf(fCosInput1) + fA2 * cosf(fCosInput2);
+
+				pfFilt[iDetectorIndex] *= fStoredValue;
+			}
+
+			break;
+		}
+		case FILTER_NUTTALL:
+		{
+			const float fA0 = 0.355768f;
+			const float fA1 = 0.487396f;
+			const float fA2 = 0.144232f;
+			const float fA3 = 0.012604f;
+			float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f;
+
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				float fSmallN = (float)iDetectorIndex;
+				float fBaseCosInput = M_PI * fSmallN / fNMinusOne;
+				float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput);
+				float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput);
+				float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput);
+				float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm;
+
+				pfFilt[iDetectorIndex] *= fStoredValue;
+			}
+
+			break;
+		}
+		case FILTER_BLACKMANHARRIS:
+		{
+			const float fA0 = 0.35875f;
+			const float fA1 = 0.48829f;
+			const float fA2 = 0.14128f;
+			const float fA3 = 0.01168f;
+			float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f;
+
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				float fSmallN = (float)iDetectorIndex;
+				float fBaseCosInput = M_PI * fSmallN / fNMinusOne;
+				float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput);
+				float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput);
+				float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput);
+				float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm;
+
+				pfFilt[iDetectorIndex] *= fStoredValue;
+			}
+
+			break;
+		}
+		case FILTER_BLACKMANNUTTALL:
+		{
+			const float fA0 = 0.3635819f;
+			const float fA1 = 0.4891775f;
+			const float fA2 = 0.1365995f;
+			const float fA3 = 0.0106411f;
+			float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f;
+
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				float fSmallN = (float)iDetectorIndex;
+				float fBaseCosInput = M_PI * fSmallN / fNMinusOne;
+				float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput);
+				float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput);
+				float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput);
+				float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm;
+
+				pfFilt[iDetectorIndex] *= fStoredValue;
+			}
+
+			break;
+		}
+		case FILTER_FLATTOP:
+		{
+			const float fA0 = 1.0f;
+			const float fA1 = 1.93f;
+			const float fA2 = 1.29f;
+			const float fA3 = 0.388f;
+			const float fA4 = 0.032f;
+			float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f;
+
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				float fSmallN = (float)iDetectorIndex;
+				float fBaseCosInput = M_PI * fSmallN / fNMinusOne;
+				float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput);
+				float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput);
+				float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput);
+				float fFourthTerm = fA4 * cosf(8.0f * fBaseCosInput);
+				float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm + fFourthTerm;
+
+				pfFilt[iDetectorIndex] *= fStoredValue;
+			}
+
+			break;
+		}
+		case FILTER_KAISER:
+		{
+			float fAlpha = _fParameter;
+			if(_fParameter < 0.0f) fAlpha = 3.0f;
+			float fPiTimesAlpha = M_PI * fAlpha;
+			float fNMinusOne = (float)(_iFFTFourierDetectorCount - 1);
+			float fDenom = (float)j0((double)fPiTimesAlpha);
+
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				float fSmallN = (float)iDetectorIndex;
+				float fSquareInput = 2.0f * fSmallN / fNMinusOne - 1;
+				float fSqrtInput = 1.0f - fSquareInput * fSquareInput;
+				float fBesselInput = fPiTimesAlpha * sqrt(fSqrtInput);
+				float fEnum = (float)j0((double)fBesselInput);
+				float fStoredValue = fEnum / fDenom;
+
+				pfFilt[iDetectorIndex] *= fStoredValue;
+			}
+
+			break;
+		}
+		case FILTER_PARZEN:
+		{
+			for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+			{
+				float fSmallN = (float)iDetectorIndex;
+				float fQ = fSmallN / (float)(_iFFTFourierDetectorCount - 1);
+				float fStoredValue = 0.0f;
+
+				if(fQ <= 0.5f)
+				{
+					fStoredValue = 1.0f - 6.0f * fQ * fQ * (1.0f - fQ);
+				}
+				else
+				{
+					float fCubedValue = 1.0f - fQ;
+					fStoredValue = 2.0f * fCubedValue * fCubedValue * fCubedValue;
+				}
+
+				pfFilt[iDetectorIndex] *= fStoredValue;
+			}
+
+			break;
+		}
+		default:
+		{
+			std::cerr << "Cannot serve requested filter" << std::endl;
+		}
+	}
+
+	// filt(w>pi*d) = 0;
+	float fPiTimesD = M_PI * _fD;
+	for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+	{
+		float fWValue = pfW[iDetectorIndex];
+
+		if(fWValue > fPiTimesD)
+		{
+			pfFilt[iDetectorIndex] = 0.0f;
+		}
+	}
+
+	for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++)
+	{
+		float fFilterValue = pfFilt[iDetectorIndex];
+
+		for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++)
+		{
+			int iIndex = iDetectorIndex + iProjectionIndex * _iFFTFourierDetectorCount;
+			_pFilter[iIndex].x = fFilterValue;
+			_pFilter[iIndex].y = 0.0f;
+		}
+	}
+
+	delete[] pfFilt;
+	delete[] pfW;
+}
+
+#ifdef STANDALONE
+
+// Fills the upper half of each row's spectrum from its lower half: every bin
+// above _iDetectorCount / 2 is overwritten with the complex conjugate of the
+// bin at (_iDetectorCount - bin). Rows are _iDetectorCount elements apart and
+// each thread handles one (projection, detector) pair.
+__global__ static void doubleFourierOutput_kernel(int _iProjectionCount,
+                                                  int _iDetectorCount,
+                                                  cufftComplex* _pFourierOutput)
+{
+	const int iFlat = threadIdx.x + blockIdx.x * blockDim.x;
+	const int iRow = iFlat / _iDetectorCount;
+	const int iCol = iFlat % _iDetectorCount;
+
+	// Threads past the last row, or in the lower half of a row, do nothing.
+	if(iRow >= _iProjectionCount || iCol <= (_iDetectorCount / 2))
+		return;
+
+	cufftComplex* pRow = _pFourierOutput + iRow * _iDetectorCount;
+	const int iMirror = _iDetectorCount - iCol;
+
+	// Conjugate copy from the mirrored bin.
+	pRow[iCol].x = pRow[iMirror].x;
+	pRow[iCol].y = -pRow[iMirror].y;
+}
+
+// Launches doubleFourierOutput_kernel with one thread per element of the
+// _iProjectionCount x _iDetectorCount array, in blocks of 256 threads.
+static void doubleFourierOutput(int _iProjectionCount, int _iDetectorCount,
+                                cufftComplex * _pFourierOutput)
+{
+	const int iThreadsPerBlock = 256;
+	const int iTotal = _iProjectionCount * _iDetectorCount;
+	// Round up so that a partially filled last block is still launched.
+	const int iBlocks = (iTotal + iThreadsPerBlock - 1) / iThreadsPerBlock;
+	doubleFourierOutput_kernel<<< iBlocks, iThreadsPerBlock >>>(_iProjectionCount, _iDetectorCount, _pFourierOutput);
+	CHECK_ERROR("doubleFourierOutput_kernel failed");
+}
+
+
+
+// Writes a row-major _iRowCount x _iColumnCount float array to _fileName as
+// text: one line per row, every value followed by a single space.
+static void writeToMatlabFile(const char * _fileName, const float * _pfData,
+                              int _iRowCount, int _iColumnCount)
+{
+	std::ofstream out(_fileName);
+
+	for(int iRow = 0; iRow < _iRowCount; ++iRow)
+	{
+		const float * pfRow = _pfData + iRow * _iColumnCount;
+		for(int iCol = 0; iCol < _iColumnCount; ++iCol)
+			out << pfRow[iCol] << " ";
+		out << std::endl;
+	}
+}
+
+// Splits complex values into separate real (.x) and imaginary (.y) arrays.
+static void convertComplexToRealImg(const cufftComplex * _pComplex, int _iElementCount,
+                                    float * _pfReal, float * _pfImaginary)
+{
+	for(int i = 0; i < _iElementCount; ++i) {
+		const cufftComplex & value = _pComplex[i];
+		_pfReal[i] = value.x;
+		_pfImaginary[i] = value.y;
+	}
+}
+
+// Debug round trip through cuFFT: runs a batched forward R2C transform on an
+// all-zero 2 x 1024 input, mirrors the spectrum, runs the inverse C2R
+// transform and rescales it, writing each stage to a text file in the current
+// directory. cuFFT failures are only reported on cerr; execution continues.
+void testCudaFFT()
+{
+	const int iProjectionCount = 2;
+	const int iDetectorCount = 1024;
+	const int iTotalElementCount = iProjectionCount * iDetectorCount;
+
+	// No fill is performed, so the input stays at the zeroes set by memset.
+	float * pfHostProj = new float[iTotalElementCount];
+	memset(pfHostProj, 0, sizeof(float) * iTotalElementCount);
+
+	writeToMatlabFile("proj.mat", pfHostProj, iProjectionCount, iDetectorCount);
+
+	float * pfDevProj = NULL;
+	SAFE_CALL(cudaMalloc((void **)&pfDevProj, sizeof(float) * iTotalElementCount));
+	SAFE_CALL(cudaMemcpy(pfDevProj, pfHostProj, sizeof(float) * iTotalElementCount, cudaMemcpyHostToDevice));
+
+	cufftComplex * pDevFourProj = NULL;
+	SAFE_CALL(cudaMalloc((void **)&pDevFourProj, sizeof(cufftComplex) * iTotalElementCount));
+
+	cufftHandle plan;
+	cufftResult result;
+
+	// Forward transform: one 1D real-to-complex FFT per projection.
+	result = cufftPlan1d(&plan, iDetectorCount, CUFFT_R2C, iProjectionCount);
+	if(result != CUFFT_SUCCESS)
+	{
+		cerr << "Failed to plan 1d r2c fft" << endl;
+	}
+
+	result = cufftExecR2C(plan, pfDevProj, pDevFourProj);
+	if(result != CUFFT_SUCCESS)
+	{
+		cerr << "Failed to exec 1d r2c fft" << endl;
+	}
+
+	cufftDestroy(plan);
+
+	// NOTE(review): doubleFourierOutput addresses rows with a stride of
+	// iDetectorCount; confirm that matches the batched R2C output layout.
+	doubleFourierOutput(iProjectionCount, iDetectorCount, pDevFourProj);
+
+	cufftComplex * pHostFourProj = new cufftComplex[iTotalElementCount];
+	SAFE_CALL(cudaMemcpy(pHostFourProj, pDevFourProj, sizeof(cufftComplex) * iTotalElementCount, cudaMemcpyDeviceToHost));
+
+	float * pfHostFourProjReal = new float[iTotalElementCount];
+	float * pfHostFourProjImaginary = new float[iTotalElementCount];
+
+	convertComplexToRealImg(pHostFourProj, iTotalElementCount, pfHostFourProjReal, pfHostFourProjImaginary);
+
+	writeToMatlabFile("proj_four_real.mat", pfHostFourProjReal, iProjectionCount, iDetectorCount);
+	writeToMatlabFile("proj_four_imaginary.mat", pfHostFourProjImaginary, iProjectionCount, iDetectorCount);
+
+	float * pfDevInFourProj = NULL;
+	SAFE_CALL(cudaMalloc((void **)&pfDevInFourProj, sizeof(float) * iTotalElementCount));
+
+	// Inverse transform: one 1D complex-to-real FFT per projection.
+	result = cufftPlan1d(&plan, iDetectorCount, CUFFT_C2R, iProjectionCount);
+	if(result != CUFFT_SUCCESS)
+	{
+		cerr << "Failed to plan 1d c2r fft" << endl;
+	}
+
+	result = cufftExecC2R(plan, pDevFourProj, pfDevInFourProj);
+	if(result != CUFFT_SUCCESS)
+	{
+		cerr << "Failed to exec 1d c2r fft" << endl;
+	}
+
+	cufftDestroy(plan);
+	rescaleInverseFourier(iProjectionCount, iDetectorCount, pfDevInFourProj);
+
+	float * pfHostInFourProj = new float[iTotalElementCount];
+	SAFE_CALL(cudaMemcpy(pfHostInFourProj, pfDevInFourProj, sizeof(float) * iTotalElementCount, cudaMemcpyDeviceToHost));
+
+	writeToMatlabFile("in_four.mat", pfHostInFourProj, iProjectionCount, iDetectorCount);
+
+	// Free every device buffer, including pfDevInFourProj (previously leaked).
+	SAFE_CALL(cudaFree(pfDevInFourProj));
+	SAFE_CALL(cudaFree(pDevFourProj));
+	SAFE_CALL(cudaFree(pfDevProj));
+
+	delete [] pfHostInFourProj;
+	delete [] pfHostFourProjReal;
+	delete [] pfHostFourProjImaginary;
+	delete [] pfHostProj;
+	delete [] pHostFourProj;
+}
+
+// Copies a complex filter from the device and stores the magnitude of each
+// entry in _pfHostSinogram. Source rows are _iFilterDetCount wide, target rows
+// _iDetectorCount wide; only the columns common to both are written.
+void downloadDebugFilterComplex(float * _pfHostSinogram, int _iProjectionCount,
+                                int _iDetectorCount,
+                                cufftComplex * _pDevFilter,
+                                int _iFilterDetCount)
+{
+	const size_t complMemSize = sizeof(cufftComplex) * _iFilterDetCount * _iProjectionCount;
+	cufftComplex * pHostFilter = (cufftComplex *)malloc(complMemSize);
+	SAFE_CALL(cudaMemcpy(pHostFilter, _pDevFilter, complMemSize, cudaMemcpyDeviceToHost));
+
+	const int iCopyWidth = min(_iDetectorCount, _iFilterDetCount);
+	for(int iProj = 0; iProj < _iProjectionCount; iProj++)
+		for(int iDet = 0; iDet < iCopyWidth; iDet++)
+		{
+			const cufftComplex source = pHostFilter[iDet + iProj * _iFilterDetCount];
+			_pfHostSinogram[iDet + iProj * _iDetectorCount] = sqrtf(source.x * source.x + source.y * source.y);
+		}
+
+	free(pHostFilter);
+}
+
+// Copies a real filter from the device into _pfHostSinogram. Source rows are
+// _iFilterDetCount wide, target rows _iDetectorCount wide; only the columns
+// common to both are written.
+void downloadDebugFilterReal(float * _pfHostSinogram, int _iProjectionCount,
+                             int _iDetectorCount, float * _pfDevFilter,
+                             int _iFilterDetCount)
+{
+	const size_t memSize = sizeof(float) * _iFilterDetCount * _iProjectionCount;
+	float * pfHostFilter = (float *)malloc(memSize);
+	SAFE_CALL(cudaMemcpy(pfHostFilter, _pfDevFilter, memSize, cudaMemcpyDeviceToHost));
+
+	const int iCopyWidth = min(_iDetectorCount, _iFilterDetCount);
+	for(int iProj = 0; iProj < _iProjectionCount; iProj++)
+	{
+		for(int iDet = 0; iDet < iCopyWidth; iDet++)
+			_pfHostSinogram[iDet + iProj * _iDetectorCount] = pfHostFilter[iDet + iProj * _iFilterDetCount];
+	}
+
+	free(pfHostFilter);
+}
+
+
+#endif
diff --git a/cuda/2d/fft.h b/cuda/2d/fft.h
new file mode 100644
index 0000000..55324e5
--- /dev/null
+++ b/cuda/2d/fft.h
@@ -0,0 +1,69 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef FFT_H
+#define FFT_H
+
+#include <cufft.h>
+#include <cuda.h>
+
+#include "fbp_filters.h"
+// NOTE(review): going by the name, allocates a _iProjectionCount x _iDetectorCount complex device buffer into *_ppDevComplex - confirm in fft.cu.
+bool allocateComplexOnDevice(int _iProjectionCount,
+                             int _iDetectorCount,
+                             cufftComplex ** _ppDevComplex);
+// NOTE(review): presumably the counterpart of allocateComplexOnDevice - confirm in fft.cu.
+bool freeComplexOnDevice(cufftComplex * _pDevComplex);
+// NOTE(review): host-to-device copy of a complex array, going by the parameter names - confirm in fft.cu.
+bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount,
+                                cufftComplex * _pHostComplexSource,
+                                cufftComplex * _pDevComplexTarget);
+// NOTE(review): presumably the forward (real-to-complex) transform of the projection rows - confirm in fft.cu.
+bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource,
+                int _iSourcePitch, int _iSourcePadX, int _iProjDets,
+                int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount,
+                cufftComplex * _pDevTargetComplex);
+// NOTE(review): presumably the inverse (complex-to-real) counterpart of runCudaFFT - confirm in fft.cu.
+bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex,
+                 float * _pfRealTarget,
+                 int _iTargetPitch, int _iTargetPadX, int _iProjDets,
+                 int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount);
+// NOTE(review): presumably applies _pFilter to _pSinogram per frequency bin - confirm in fft.cu.
+void applyFilter(int _iProjectionCount, int _iFreqBinCount,
+                 cufftComplex * _pSinogram, cufftComplex * _pFilter);
+// NOTE(review): presumably the number of complex bins for a real FFT of the given size - confirm in fft.cu.
+int calcFFTFourSize(int _iFFTRealSize);
+// Fills the host array _pFilter with the real-valued filter chosen by _eFilter (imaginary parts 0), repeated for every projection; a negative _fParameter selects the built-in default of parameterised windows.
+void genFilter(E_FBPFILTER _eFilter, float _fD, int _iProjectionCount,
+               cufftComplex * _pFilter, int _iFFTRealDetectorCount,
+               int _iFFTFourierDetectorCount, float _fParameter = -1.0f);
+// NOTE(review): presumably generates an identity (all-pass) filter in the same layout as genFilter - confirm in fft.cu.
+void genIdenFilter(int _iProjectionCount, cufftComplex * _pFilter,
+                   int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount);
+
+#endif /* FFT_H */
diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu
new file mode 100644
index 0000000..1057879
--- /dev/null
+++ b/cuda/2d/par_bp.cu
@@ -0,0 +1,357 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+#define PIXELTRACE
+// NOTE(review): PIXELTRACE is not referenced anywhere else in this file - confirm it is still needed.
+// 2D float texture whose reads return the stored element type unchanged.
+typedef texture<float, 2, cudaReadModeElementType> texture2D;
+// Projection data texture: configured and bound by bindProjDataTexture, sampled by the devBP* kernels.
+static texture2D gT_projTexture;
+
+
+namespace astraCUDA {
+
+const unsigned int g_anglesPerBlock = 16; // angles handled by one devBP / devBP_SS launch
+const unsigned int g_blockSliceSize = 32; // thread block extent in y (volume rows)
+const unsigned int g_blockSlices = 16; // thread block extent in x (volume columns)
+// Capacity of the per-angle constant tables below; BP requires dims.iProjAngles to stay within it.
+const unsigned int g_MaxAngles = 2560;
+// Per-angle sine, cosine and detector offset, uploaded by BP and read by devBP / devBP_SS.
+__constant__ float gC_angle_sin[g_MaxAngles];
+__constant__ float gC_angle_cos[g_MaxAngles];
+__constant__ float gC_angle_offset[g_MaxAngles];
+
+// Binds the pitched device buffer `data` (pitch counted in floats) to
+// gT_projTexture as a width x height float texture with clamped addressing,
+// linear filtering and unnormalized coordinates.
+// Returns false if cudaBindTexture2D reports an error.
+static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+
+	gT_projTexture.addressMode[0] = cudaAddressModeClamp;
+	gT_projTexture.addressMode[1] = cudaAddressModeClamp;
+	gT_projTexture.filterMode = cudaFilterModeLinear;
+	gT_projTexture.normalized = false;
+
+	return cudaBindTexture2D(0, gT_projTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch) == cudaSuccess;
+}
+// Back-projects up to g_anglesPerBlock angles, starting at startAngle, into the volume; each thread accumulates one pixel.
+__global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims)
+{
+	const int relX = threadIdx.x;
+	const int relY = threadIdx.y;
+	// Angle range handled by this launch, clamped to the number of projection angles.
+	int endAngle = startAngle + g_anglesPerBlock;
+	if (endAngle > dims.iProjAngles)
+		endAngle = dims.iProjAngles;
+	const int X = blockIdx.x * g_blockSlices + relX;
+	const int Y = blockIdx.y * g_blockSliceSize + relY;
+	// Threads that fall outside the volume do nothing.
+	if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
+		return;
+	// Pixel centre measured from the volume centre, divided by fDetScale.
+	const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ) / dims.fDetScale;
+	const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f ) / dims.fDetScale;
+
+	float* volData = (float*)D_volData;
+
+	float fVal = 0.0f;
+	float fA = startAngle + 0.5f;
+	const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f;
+	// fA is the texture row coordinate of the current angle (index + 0.5f). NOTE(review): the 1.5f in fT_base presumably accounts for a padding column of the (iProjDets+2)-wide texture bound in BP - confirm.
+	if (offsets) {
+		// Variant with a per-angle detector offset added to fT.
+		for (int angle = startAngle; angle < endAngle; ++angle)
+		{
+			const float cos_theta = gC_angle_cos[angle];
+			const float sin_theta = gC_angle_sin[angle];
+			const float TOffset = gC_angle_offset[angle];
+
+			const float fT = fT_base + fX * cos_theta - fY * sin_theta + TOffset;
+			fVal += tex2D(gT_projTexture, fT, fA);
+			fA += 1.0f;
+		}
+
+	} else {
+		// Variant without offsets.
+		for (int angle = startAngle; angle < endAngle; ++angle)
+		{
+			const float cos_theta = gC_angle_cos[angle];
+			const float sin_theta = gC_angle_sin[angle];
+
+			const float fT = fT_base + fX * cos_theta - fY * sin_theta;
+			fVal += tex2D(gT_projTexture, fT, fA);
+			fA += 1.0f;
+		}
+
+	}
+	// Accumulate into the volume; the destination is shifted by one row and one column (NOTE(review): presumably a one-pixel border - confirm).
+	volData[(Y+1)*volPitch+X+1] += fVal;
+}
+
+// Supersampling version of devBP: sums iRaysPerPixelDim x iRaysPerPixelDim sub-samples per pixel and angle, then divides the total by the sub-sample count.
+__global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims)
+{
+	const int relX = threadIdx.x;
+	const int relY = threadIdx.y;
+	// Angle range handled by this launch, clamped to the number of projection angles.
+	int endAngle = startAngle + g_anglesPerBlock;
+	if (endAngle > dims.iProjAngles)
+		endAngle = dims.iProjAngles;
+	const int X = blockIdx.x * g_blockSlices + relX;
+	const int Y = blockIdx.y * g_blockSliceSize + relY;
+	// Threads that fall outside the volume do nothing.
+	if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
+		return;
+	// Position of the first sub-sample: half a sub-step inside the pixel's lower edge, divided by fDetScale.
+	const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim) / dims.fDetScale;
+	const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim) / dims.fDetScale;
+	// Spacing between neighbouring sub-samples, in the same scaled units as fX and fY.
+	const float fSubStep = 1.0f/(dims.iRaysPerPixelDim * dims.fDetScale);
+
+	float* volData = (float*)D_volData;
+
+	float fVal = 0.0f;
+	float fA = startAngle + 0.5f;
+	const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f;
+	// fA is the texture row coordinate of the current angle (index + 0.5f). NOTE(review): the 1.5f in fT_base presumably accounts for a padding column of the (iProjDets+2)-wide texture bound in BP - confirm.
+	if (offsets) {
+		// Variant with a per-angle detector offset added to fT.
+		for (int angle = startAngle; angle < endAngle; ++angle)
+		{
+			const float cos_theta = gC_angle_cos[angle];
+			const float sin_theta = gC_angle_sin[angle];
+			const float TOffset = gC_angle_offset[angle];
+
+			float fT = fT_base + fX * cos_theta - fY * sin_theta + TOffset;
+			// Walk the sub-sample grid: fT advances by one sub-step in x per iSubX, fTy by one sub-step in y per iSubY.
+			for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) {
+				float fTy = fT;
+				fT += fSubStep * cos_theta;
+				for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) {
+					fVal += tex2D(gT_projTexture, fTy, fA);
+					fTy -= fSubStep * sin_theta;
+				}
+			}
+			fA += 1.0f;
+		}
+
+	} else {
+		// Variant without offsets.
+		for (int angle = startAngle; angle < endAngle; ++angle)
+		{
+			const float cos_theta = gC_angle_cos[angle];
+			const float sin_theta = gC_angle_sin[angle];
+
+			float fT = fT_base + fX * cos_theta - fY * sin_theta;
+
+			for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) {
+				float fTy = fT;
+				fT += fSubStep * cos_theta;
+				for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) {
+					fVal += tex2D(gT_projTexture, fTy, fA);
+					fTy -= fSubStep * sin_theta;
+				}
+			}
+			fA += 1.0f;
+
+		}
+
+	}
+	// Divide by the number of sub-samples per pixel and accumulate; the destination is shifted by one row and one column.
+	volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim);
+}
+
+// Back-projects one projection into the volume, one thread per pixel: row 0
+// of gT_projTexture (v = 0.5f) is sampled at the pixel's detector coordinate
+// and the sample is added to the volume, shifted by one row and one column.
+__global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset, float angle_sin, float angle_cos, const SDimensions dims)
+{
+	const int X = blockIdx.x * g_blockSlices + threadIdx.x;
+	const int Y = blockIdx.y * g_blockSliceSize + threadIdx.y;
+	if (X >= dims.iVolWidth || Y >= dims.iVolHeight)
+		return;
+
+	// Pixel centre measured from the volume centre, divided by fDetScale.
+	const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ) / dims.fDetScale;
+	const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f ) / dims.fDetScale;
+
+	// Detector coordinate of this pixel for the given angle and offset.
+	const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f;
+	const float fT = fT_base + fX * angle_cos - fY * angle_sin + offset;
+
+	float* pfPixel = &D_volData[(Y+1)*volPitch+X+1];
+	*pfPixel += tex2D(gT_projTexture, fT, 0.5f);
+}
+
+
+// Back-projects all dims.iProjAngles projections in D_projData into
+// D_volumeData, adding to the existing volume contents. TOffsets may be null
+// (no per-angle detector offsets). Returns false when there are more than
+// g_MaxAngles angles or a CUDA call fails, instead of relying on assert.
+bool BP(float* D_volumeData, unsigned int volumePitch,
+        float* D_projData, unsigned int projPitch,
+        const SDimensions& dims, const float* angles, const float* TOffsets)
+{
+	// TODO: process angles block by block
+	if (dims.iProjAngles > g_MaxAngles)
+		return false;
+	if (!bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles))
+		return false;
+	float* angle_sin = new float[dims.iProjAngles];
+	float* angle_cos = new float[dims.iProjAngles];
+	for (unsigned int i = 0; i < dims.iProjAngles; ++i) {
+		angle_sin[i] = sinf(angles[i]);
+		angle_cos[i] = cosf(angles[i]);
+	}
+	// Upload the per-angle tables to constant memory; stop at the first failure.
+	const size_t tableBytes = dims.iProjAngles*sizeof(float);
+	bool ok = cudaMemcpyToSymbol(gC_angle_sin, angle_sin, tableBytes, 0, cudaMemcpyHostToDevice) == cudaSuccess;
+	ok = ok && cudaMemcpyToSymbol(gC_angle_cos, angle_cos, tableBytes, 0, cudaMemcpyHostToDevice) == cudaSuccess;
+	if (ok && TOffsets)
+		ok = cudaMemcpyToSymbol(gC_angle_offset, TOffsets, tableBytes, 0, cudaMemcpyHostToDevice) == cudaSuccess;
+
+	delete[] angle_sin;
+	delete[] angle_cos;
+	if (!ok)
+		return false;
+
+	dim3 dimBlock(g_blockSlices, g_blockSliceSize);
+	dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
+	             (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
+
+	cudaStream_t stream;
+	if (cudaStreamCreate(&stream) != cudaSuccess)
+		return false;
+
+	// Each launch back-projects g_anglesPerBlock angles starting at angle i.
+	for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
+		if (dims.iRaysPerPixelDim > 1)
+			devBP_SS<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims);
+		else
+			devBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims);
+	}
+	ok = (cudaThreadSynchronize() == cudaSuccess);
+	cudaTextForceKernelsCompletion();
+	cudaStreamDestroy(stream);
+	return ok;
+}
+
+// Back-projects the single projection row D_projData, taken at angles[angle],
+// into D_volumeData (accumulating). TOffsets may be null, meaning no detector
+// offset. Returns false if the texture cannot be bound or the kernel fails.
+bool BP_SART(float* D_volumeData, unsigned int volumePitch,
+             float* D_projData, unsigned int projPitch,
+             unsigned int angle, const SDimensions& dims,
+             const float* angles, const float* TOffsets)
+{
+	// only one angle
+	if (!bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1))
+		return false;
+
+	const float angle_sin = sinf(angles[angle]);
+	const float angle_cos = cosf(angles[angle]);
+	const float offset = TOffsets ? TOffsets[angle] : 0.0f;
+
+	dim3 dimBlock(g_blockSlices, g_blockSliceSize);
+	dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices,
+	             (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize);
+
+	devBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, offset, angle_sin, angle_cos, dims);
+	const bool ok = (cudaThreadSynchronize() == cudaSuccess);
+
+	cudaTextForceKernelsCompletion();
+	return ok;
+}
+
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+// Standalone test driver: back-projects the sinogram loaded from sino.png over
+// 512 angles spanning [0, pi) into a zeroed 1024 x 1024 volume, saved as vol.png.
+int main()
+{
+	float* D_volumeData;
+	float* D_projData;
+
+	SDimensions dims;
+	dims.iVolWidth = 1024;
+	dims.iVolHeight = 1024;
+	dims.iProjAngles = 512;
+	dims.iProjDets = 1536;
+	dims.fDetScale = 1.0f;
+	dims.iRaysPerDet = 1;
+	// BP reads iRaysPerPixelDim to choose between devBP and devBP_SS; set it explicitly.
+	dims.iRaysPerPixelDim = 1;
+
+	unsigned int volumePitch, projPitch;
+
+	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	printf("pitch: %u\n", volumePitch);
+
+	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+	printf("pitch: %u\n", projPitch);
+
+	unsigned int y, x;
+	float* sino = loadImage("sino.png", y, x);
+
+	float* img = new float[dims.iVolWidth*dims.iVolHeight];
+	memset(img, 0, dims.iVolWidth*dims.iVolHeight*sizeof(float));
+
+	copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+	copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+	float* angle = new float[dims.iProjAngles];
+	for (unsigned int i = 0; i < dims.iProjAngles; ++i)
+		angle[i] = i*(M_PI/dims.iProjAngles);
+	BP(D_volumeData, volumePitch, D_projData, projPitch, dims, angle, 0);
+	delete[] angle;
+
+	copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+	saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img);
+	delete[] img;
+
+	return 0;
+}
+#endif
diff --git a/cuda/2d/par_bp.h b/cuda/2d/par_bp.h
new file mode 100644
index 0000000..c6dbd59
--- /dev/null
+++ b/cuda/2d/par_bp.h
@@ -0,0 +1,48 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_PAR_BP_H
+#define _CUDA_PAR_BP_H
+
+#include "dims.h"
+
+namespace astraCUDA {
+// Back-projects all dims.iProjAngles projections in D_projData into D_volumeData (accumulating); TOffsets may be null.
+_AstraExport bool BP(float* D_volumeData, unsigned int volumePitch,
+        float* D_projData, unsigned int projPitch,
+        const SDimensions& dims, const float* angles,
+        const float* TOffsets);
+// Back-projects the single projection row in D_projData using angles[angle], plus TOffsets[angle] when TOffsets is non-null.
+_AstraExport bool BP_SART(float* D_volumeData, unsigned int volumePitch,
+             float* D_projData, unsigned int projPitch,
+             unsigned int angle, const SDimensions& dims,
+             const float* angles, const float* TOffsets);
+
+}
+
+#endif
diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu
new file mode 100644
index 0000000..585cb06
--- /dev/null
+++ b/cuda/2d/par_fp.cu
@@ -0,0 +1,704 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+#define PIXELTRACE
+
+
+// 2D float texture reference type (element reads, no normalisation to
+// [0,1] of the fetched value).
+typedef texture<float, 2, cudaReadModeElementType> texture2D;
+
+// File-scope texture reference through which all FP kernels in this file
+// sample the volume. It is configured and bound in bindVolumeDataTexture().
+static texture2D gT_volumeTexture;
+
+
+namespace astraCUDA {
+
+// Capacity of the two constant-memory tables below. FP_simple() asserts that
+// dims.iProjAngles does not exceed it.
+static const unsigned g_MaxAngles = 2560;
+// Projection angles, uploaded from the host 'angles' array before each FP.
+__constant__ float gC_angle[g_MaxAngles];
+// Per-angle detector offsets, uploaded from 'TOffsets' (or from a buffer of
+// zeroes when no offsets are given).
+__constant__ float gC_angle_offset[g_MaxAngles];
+
+
+// optimization parameters
+// Thread block shape used by the launches in this file is
+// (g_detBlockSize, g_anglesPerBlock): threadIdx.x walks detectors,
+// threadIdx.y walks angles.
+static const unsigned int g_anglesPerBlock = 16;
+static const unsigned int g_detBlockSize = 32;
+// Number of volume slices (columns or rows, depending on the kernel) that a
+// single kernel launch integrates over.
+static const unsigned int g_blockSlices = 64;
+
+// fixed point scaling factor
+// Used by FPhorizontal/FPvertical to do the detector-region arithmetic in
+// integers with 1/16 detector resolution.
+#define fPREC_FACTOR 16.0f
+#define iPREC_FACTOR 16
+
+
+// if necessary, a buffer of zeroes of size g_MaxAngles
+// Host-side; allocated lazily on the first FP call without TOffsets and
+// never released in the code visible here.
+static float* g_pfZeroes = 0;
+
+
+// Copy a pitched device image into a freshly allocated cudaArray and bind
+// gT_volumeTexture to it (clamped addressing, linear filtering, unnormalised
+// coordinates).
+//
+//   data      : device pointer to the source image
+//   dataArray : receives the new array; the caller releases it with
+//               cudaFreeArray(). It is set to 0 when the function fails, so
+//               an unconditional cudaFreeArray() by the caller stays safe.
+//   pitch     : row pitch of 'data' in floats
+//   width     : number of floats per row to copy
+//   height    : number of rows to copy
+//
+// Returns true on success. On any CUDA error the partially created array is
+// released again and false is returned.
+static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+	dataArray = 0;
+	if (cudaMallocArray(&dataArray, &channelDesc, width, height) != cudaSuccess) {
+		dataArray = 0;
+		return false;
+	}
+
+	if (cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice) != cudaSuccess) {
+		cudaFreeArray(dataArray);
+		dataArray = 0;
+		return false;
+	}
+
+	gT_volumeTexture.addressMode[0] = cudaAddressModeClamp;
+	gT_volumeTexture.addressMode[1] = cudaAddressModeClamp;
+	gT_volumeTexture.filterMode = cudaFilterModeLinear;
+	gT_volumeTexture.normalized = false;
+
+	// TODO: For very small sizes (roughly <=512x128) with few angles (<=180)
+	// not using an array is more efficient.
+//	cudaBindTexture2D(0, gT_volumeTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch);
+	if (cudaBindTextureToArray(gT_volumeTexture, dataArray, channelDesc) != cudaSuccess) {
+		cudaFreeArray(dataArray);
+		dataArray = 0;
+		return false;
+	}
+
+	return true;
+}
+
+// projection for angles that are roughly horizontal
+// theta between 45 and 135 degrees
+//
+// Region-skipping variant of the parallel-beam forward projection.
+//
+// Launch layout, as read from the indexing below:
+//   threadIdx.x : detector within a block of g_detBlockSize detectors
+//   threadIdx.y : angle within a block of g_anglesPerBlock angles
+//   blockIdx.x  : angle block, counted from startAngle
+//   blockIdx.y  : region index; regionOffset is subtracted from it before
+//                 it is mapped onto a detector region
+//
+// Each thread samples gT_volumeTexture along its ray through the volume
+// columns startSlice .. min(startSlice+g_blockSlices, dims.iVolWidth)-1 and
+// adds (+=) the scaled sum to D_projData[angle*projPitch + detector + 1],
+// i.e. it accumulates on top of whatever D_projData already contains.
+//
+// Angles and per-angle detector offsets come from gC_angle and
+// gC_angle_offset, indexed by the absolute angle number.
+//
+// NOTE(review): the only launch of this kernel in this file sits inside the
+// "#if 0" section of FP(), so it is compiled but currently not launched.
+__global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale)
+{
+	const int relDet = threadIdx.x;
+	const int relAngle = threadIdx.y;
+
+	int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+
+	// The last angle block may be only partially filled.
+	if (angle >= endAngle)
+		return;
+
+	// __cosf/__sinf are the reduced-accuracy hardware intrinsics.
+	const float theta = gC_angle[angle];
+	const float cos_theta = __cosf(theta);
+	const float sin_theta = __sinf(theta);
+
+	// compute start detector for this block/angle:
+	// (The same for all threadIdx.x)
+	// -------------------------------------
+
+	const int midSlice = startSlice + g_blockSlices / 2;
+
+	// ASSUMPTION: fDetScale >= 1.0f
+	// problem: detector regions get skipped because slice blocks aren't large
+	// enough
+	const unsigned int g_blockSliceSize = g_detBlockSize;
+
+	// project (midSlice,midRegion) on this thread's detector
+
+	const float fBase = 0.5f*dims.iProjDets - 0.5f +
+		(
+		    (midSlice - 0.5f*dims.iVolWidth + 0.5f) * cos_theta
+		  - (g_blockSliceSize/2 - 0.5f*dims.iVolHeight + 0.5f) * sin_theta
+		  + gC_angle_offset[angle]
+		) / dims.fDetScale;
+	// Fixed-point (1/iPREC_FACTOR detector) base position and per-region
+	// increment.
+	int iBase = (int)floorf(fBase * fPREC_FACTOR);
+	int iInc = (int)floorf(g_blockSliceSize * sin_theta / dims.fDetScale * -fPREC_FACTOR);
+
+	// ASSUMPTION: 16 > regionOffset / fDetScale
+	// The numerator is biased by 16 regions before the division and the 16 is
+	// subtracted again afterwards; presumably this keeps the dividend
+	// non-negative so the truncating division acts as a floor.
+	// NOTE(review): blockIdx.y is unsigned, so these expressions are
+	// evaluated in unsigned arithmetic -- confirm that is intended.
+	const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
+	const int detPrevRegion = (iBase + (blockIdx.y - regionOffset - 1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
+
+	// Two consecutive blockIdx.y values can land on the same detector region;
+	// only the first of them does the work.
+	if (blockIdx.y > 0 && detRegion == detPrevRegion)
+		return;
+
+	const int detector = detRegion * g_detBlockSize + relDet;
+
+	// Now project the part of the ray to angle,detector through
+	// slices startSlice to startSlice+g_blockSlices-1
+
+	if (detector < 0 || detector >= dims.iProjDets)
+		return;
+
+	// fDetStep: change of the in-slice coordinate per detector.
+	// fSliceStep: change of the in-slice coordinate per slice.
+	const float fDetStep = -dims.fDetScale / sin_theta;
+	float fSliceStep = cos_theta / sin_theta;
+	// fDistCorr ends up as |fDetStep| * outputScale.
+	float fDistCorr;
+	if (sin_theta > 0.0f)
+		fDistCorr = -fDetStep;
+	else
+		fDistCorr = fDetStep;
+	fDistCorr *= outputScale;
+
+	float fVal = 0.0f;
+	// project detector on slice
+	// The trailing +1.5f on both coordinates presumably accounts for the
+	// one-texel border of the (width+2 x height+2) texture plus the half-texel
+	// centre offset of unnormalised texture coordinates -- TODO confirm.
+	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f;
+	float fS = startSlice + 1.5f;
+	int endSlice = startSlice + g_blockSlices;
+	if (endSlice > dims.iVolWidth)
+		endSlice = dims.iVolWidth;
+
+	if (dims.iRaysPerDet > 1) {
+
+		// Move to the first sub-ray of this detector and average over the
+		// sub-rays by dividing the weight.
+		fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
+		const float fSubDetStep = fDetStep / dims.iRaysPerDet;
+		fDistCorr /= dims.iRaysPerDet;
+
+		// The inner loop advances fP by iRaysPerDet*fSubDetStep per slice;
+		// remove that here so the net advance per slice is the original
+		// fSliceStep.
+		fSliceStep -= dims.iRaysPerDet * fSubDetStep;
+
+		for (int slice = startSlice; slice < endSlice; ++slice)
+		{
+			for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+				// x = slice (column), y = position within the slice
+				fVal += tex2D(gT_volumeTexture, fS, fP);
+				fP += fSubDetStep;
+			}
+			fP += fSliceStep;
+			fS += 1.0f;
+		}
+
+	} else {
+
+		for (int slice = startSlice; slice < endSlice; ++slice)
+		{
+			// x = slice (column), y = position within the slice
+			fVal += tex2D(gT_volumeTexture, fS, fP);
+			fP += fSliceStep;
+			fS += 1.0f;
+		}
+
+
+	}
+
+	// +1 skips the first (padding) column of the projection row.
+	D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+}
+
+// projection for angles that are roughly vertical
+// theta between 0 and 45, or 135 and 180 degrees
+//
+// Region-skipping variant of the parallel-beam forward projection; mirror
+// image of FPhorizontal with the roles of sin/cos and of the volume
+// width/height swapped.
+//
+// Launch layout, as read from the indexing below:
+//   threadIdx.x : detector within a block of g_detBlockSize detectors
+//   threadIdx.y : angle within a block of g_anglesPerBlock angles
+//   blockIdx.x  : angle block, counted from startAngle
+//   blockIdx.y  : region index; regionOffset is subtracted from it before
+//                 it is mapped onto a detector region
+//
+// Each thread samples gT_volumeTexture along its ray through the volume rows
+// startSlice .. min(startSlice+g_blockSlices, dims.iVolHeight)-1 and adds
+// (+=) the scaled sum to D_projData[angle*projPitch + detector + 1].
+//
+// NOTE(review): the only launch of this kernel in this file sits inside the
+// "#if 0" section of FP(), so it is compiled but currently not launched.
+__global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale)
+{
+	const int relDet = threadIdx.x;
+	const int relAngle = threadIdx.y;
+
+	int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+
+	// The last angle block may be only partially filled.
+	if (angle >= endAngle)
+		return;
+
+	// __cosf/__sinf are the reduced-accuracy hardware intrinsics.
+	const float theta = gC_angle[angle];
+	const float cos_theta = __cosf(theta);
+	const float sin_theta = __sinf(theta);
+
+	// compute start detector for this block/angle:
+	// (The same for all threadIdx.x)
+	// -------------------------------------
+
+	const int midSlice = startSlice + g_blockSlices / 2;
+
+	// project (midSlice,midRegion) on this thread's detector
+
+	// ASSUMPTION: fDetScale >= 1.0f
+	// problem: detector regions get skipped because slice blocks aren't large
+	// enough
+	const unsigned int g_blockSliceSize = g_detBlockSize;
+
+	const float fBase = 0.5f*dims.iProjDets - 0.5f +
+		(
+		    (g_blockSliceSize/2 - 0.5f*dims.iVolWidth + 0.5f) * cos_theta
+		  - (midSlice - 0.5f*dims.iVolHeight + 0.5f) * sin_theta
+		  + gC_angle_offset[angle]
+		) / dims.fDetScale;
+	// Fixed-point (1/iPREC_FACTOR detector) base position and per-region
+	// increment.
+	int iBase = (int)floorf(fBase * fPREC_FACTOR);
+	int iInc = (int)floorf(g_blockSliceSize * cos_theta / dims.fDetScale * fPREC_FACTOR);
+
+	// ASSUMPTION: 16 > regionOffset / fDetScale
+	// Biased by 16 regions before the division, un-biased afterwards;
+	// presumably to keep the dividend non-negative.
+	// NOTE(review): blockIdx.y is unsigned, so these expressions are
+	// evaluated in unsigned arithmetic -- confirm that is intended.
+	const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
+	const int detPrevRegion = (iBase + (blockIdx.y - regionOffset-1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16;
+
+	// Two consecutive blockIdx.y values can land on the same detector region;
+	// only the first of them does the work.
+	if (blockIdx.y > 0 && detRegion == detPrevRegion)
+		return;
+
+	const int detector = detRegion * g_detBlockSize + relDet;
+
+	// Now project the part of the ray to angle,detector through
+	// slices startSlice to startSlice+g_blockSlices-1
+
+	if (detector < 0 || detector >= dims.iProjDets)
+		return;
+
+	// fDetStep: change of the in-slice coordinate per detector.
+	// fSliceStep: change of the in-slice coordinate per slice.
+	const float fDetStep = dims.fDetScale / cos_theta;
+	float fSliceStep = sin_theta / cos_theta;
+	// fDistCorr ends up as |fDetStep| * outputScale.
+	float fDistCorr;
+	if (cos_theta < 0.0f)
+		fDistCorr = -fDetStep;
+	else
+		fDistCorr = fDetStep;
+	fDistCorr *= outputScale;
+
+	float fVal = 0.0f;
+	// The trailing +1.5f on both coordinates presumably accounts for the
+	// one-texel border of the texture plus the half-texel centre offset --
+	// TODO confirm.
+	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f;
+	float fS = startSlice+1.5f;
+	int endSlice = startSlice + g_blockSlices;
+	if (endSlice > dims.iVolHeight)
+		endSlice = dims.iVolHeight;
+
+	if (dims.iRaysPerDet > 1) {
+
+		// Move to the first sub-ray of this detector and average over the
+		// sub-rays by dividing the weight.
+		fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
+		const float fSubDetStep = fDetStep / dims.iRaysPerDet;
+		fDistCorr /= dims.iRaysPerDet;
+
+		// Compensate for the iRaysPerDet*fSubDetStep the inner loop adds to
+		// fP on every slice.
+		fSliceStep -= dims.iRaysPerDet * fSubDetStep;
+
+		for (int slice = startSlice; slice < endSlice; ++slice)
+		{
+			for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+				// x = position within the slice, y = slice (row)
+				fVal += tex2D(gT_volumeTexture, fP, fS);
+				fP += fSubDetStep;
+			}
+			fP += fSliceStep;
+			fS += 1.0f;
+		}
+
+	} else {
+
+		for (int slice = startSlice; slice < endSlice; ++slice)
+		{
+			// x = position within the slice, y = slice (row)
+			fVal += tex2D(gT_volumeTexture, fP, fS);
+			fP += fSliceStep;
+			fS += 1.0f;
+		}
+
+	}
+
+	// +1 skips the first (padding) column of the projection row.
+	D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+}
+
+// projection for angles that are roughly horizontal
+// (detector roughly vertical)
+//
+// Straightforward variant: blockIdx.y directly selects a block of
+// g_detBlockSize detectors, so every detector is visited for every launch.
+//
+// Launch layout (see FP_simple):
+//   block : (g_detBlockSize, g_anglesPerBlock) -> threadIdx.x = detector
+//           within the block, threadIdx.y = angle within the block
+//   grid  : (angle blocks counted from startAngle, detector blocks)
+//
+// Each thread samples gT_volumeTexture along its ray through the volume
+// columns startSlice .. min(startSlice+g_blockSlices, dims.iVolWidth)-1 and
+// adds (+=) the scaled sum to D_projData[angle*projPitch + detector + 1].
+// Successive launches for different startSlice values therefore accumulate
+// into the same output element.
+__global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
+{
+	const int relDet = threadIdx.x;
+	const int relAngle = threadIdx.y;
+
+	int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+
+	// The last angle block may be only partially filled.
+	if (angle >= endAngle)
+		return;
+
+	// __cosf/__sinf are the reduced-accuracy hardware intrinsics.
+	const float theta = gC_angle[angle];
+	const float cos_theta = __cosf(theta);
+	const float sin_theta = __sinf(theta);
+
+	// compute start detector for this block/angle:
+	const int detRegion = blockIdx.y;
+
+	const int detector = detRegion * g_detBlockSize + relDet;
+
+	// Now project the part of the ray to angle,detector through
+	// slices startSlice to startSlice+g_blockSlices-1
+
+	// The last detector block may extend past the detector array.
+	if (detector < 0 || detector >= dims.iProjDets)
+		return;
+
+	// fDetStep: change of the in-slice coordinate per detector.
+	// fSliceStep: change of the in-slice coordinate per slice.
+	const float fDetStep = -dims.fDetScale / sin_theta;
+	float fSliceStep = cos_theta / sin_theta;
+	// fDistCorr ends up as |fDetStep| * outputScale.
+	float fDistCorr;
+	if (sin_theta > 0.0f)
+		fDistCorr = -fDetStep;
+	else
+		fDistCorr = fDetStep;
+	fDistCorr *= outputScale;
+
+	float fVal = 0.0f;
+	// project detector on slice
+	// The trailing +1.5f on both coordinates presumably accounts for the
+	// one-texel border of the texture plus the half-texel centre offset --
+	// TODO confirm.
+	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f;
+	float fS = startSlice + 1.5f;
+	int endSlice = startSlice + g_blockSlices;
+	if (endSlice > dims.iVolWidth)
+		endSlice = dims.iVolWidth;
+
+	if (dims.iRaysPerDet > 1) {
+
+		// Move to the first sub-ray of this detector and average over the
+		// sub-rays by dividing the weight.
+		fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
+		const float fSubDetStep = fDetStep / dims.iRaysPerDet;
+		fDistCorr /= dims.iRaysPerDet;
+
+		// Compensate for the iRaysPerDet*fSubDetStep the inner loop adds to
+		// fP on every slice.
+		fSliceStep -= dims.iRaysPerDet * fSubDetStep;
+
+		for (int slice = startSlice; slice < endSlice; ++slice)
+		{
+			for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+				// x = slice (column), y = position within the slice
+				fVal += tex2D(gT_volumeTexture, fS, fP);
+				fP += fSubDetStep;
+			}
+			fP += fSliceStep;
+			fS += 1.0f;
+		}
+
+	} else {
+
+		for (int slice = startSlice; slice < endSlice; ++slice)
+		{
+			// x = slice (column), y = position within the slice
+			fVal += tex2D(gT_volumeTexture, fS, fP);
+			fP += fSliceStep;
+			fS += 1.0f;
+		}
+
+
+	}
+
+	// +1 skips the first (padding) column of the projection row.
+	D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+}
+
+
+// projection for angles that are roughly vertical
+// (detector roughly horizontal)
+//
+// Straightforward variant: blockIdx.y directly selects a block of
+// g_detBlockSize detectors, so every detector is visited for every launch.
+//
+// Launch layout (see FP_simple):
+//   block : (g_detBlockSize, g_anglesPerBlock) -> threadIdx.x = detector
+//           within the block, threadIdx.y = angle within the block
+//   grid  : (angle blocks counted from startAngle, detector blocks)
+//
+// Each thread samples gT_volumeTexture along its ray through the volume rows
+// startSlice .. min(startSlice+g_blockSlices, dims.iVolHeight)-1 and adds
+// (+=) the scaled sum to D_projData[angle*projPitch + detector + 1].
+// Successive launches for different startSlice values therefore accumulate
+// into the same output element.
+__global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale)
+{
+	const int relDet = threadIdx.x;
+	const int relAngle = threadIdx.y;
+
+	int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle;
+
+	// The last angle block may be only partially filled.
+	if (angle >= endAngle)
+		return;
+
+	// __cosf/__sinf are the reduced-accuracy hardware intrinsics.
+	const float theta = gC_angle[angle];
+	const float cos_theta = __cosf(theta);
+	const float sin_theta = __sinf(theta);
+
+	// compute start detector for this block/angle:
+	const int detRegion = blockIdx.y;
+
+	const int detector = detRegion * g_detBlockSize + relDet;
+
+	// Now project the part of the ray to angle,detector through
+	// slices startSlice to startSlice+g_blockSlices-1
+
+	// The last detector block may extend past the detector array.
+	if (detector < 0 || detector >= dims.iProjDets)
+		return;
+
+	// fDetStep: change of the in-slice coordinate per detector.
+	// fSliceStep: change of the in-slice coordinate per slice.
+	const float fDetStep = dims.fDetScale / cos_theta;
+	float fSliceStep = sin_theta / cos_theta;
+	// fDistCorr ends up as |fDetStep| * outputScale.
+	float fDistCorr;
+	if (cos_theta < 0.0f)
+		fDistCorr = -fDetStep;
+	else
+		fDistCorr = fDetStep;
+	fDistCorr *= outputScale;
+
+	float fVal = 0.0f;
+	// The trailing +1.5f on both coordinates presumably accounts for the
+	// one-texel border of the texture plus the half-texel centre offset --
+	// TODO confirm.
+	float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f;
+	float fS = startSlice+1.5f;
+	int endSlice = startSlice + g_blockSlices;
+	if (endSlice > dims.iVolHeight)
+		endSlice = dims.iVolHeight;
+
+	if (dims.iRaysPerDet > 1) {
+
+		// Move to the first sub-ray of this detector and average over the
+		// sub-rays by dividing the weight.
+		fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep;
+		const float fSubDetStep = fDetStep / dims.iRaysPerDet;
+		fDistCorr /= dims.iRaysPerDet;
+
+		// Compensate for the iRaysPerDet*fSubDetStep the inner loop adds to
+		// fP on every slice.
+		fSliceStep -= dims.iRaysPerDet * fSubDetStep;
+
+		for (int slice = startSlice; slice < endSlice; ++slice)
+		{
+			for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) {
+				// x = position within the slice, y = slice (row)
+				fVal += tex2D(gT_volumeTexture, fP, fS);
+				fP += fSubDetStep;
+			}
+			fP += fSliceStep;
+			fS += 1.0f;
+		}
+
+	} else {
+
+		for (int slice = startSlice; slice < endSlice; ++slice)
+		{
+			// x = position within the slice, y = slice (row)
+			fVal += tex2D(gT_volumeTexture, fP, fS);
+			fP += fSliceStep;
+			fS += 1.0f;
+		}
+
+	}
+
+	// +1 skips the first (padding) column of the projection row.
+	D_projData[angle*projPitch+detector+1] += fVal * fDistCorr;
+}
+
+
+
+// Parallel-beam forward projection using the *_simple kernels.
+//
+// The projection is ADDED to D_projData, scaled by outputScale; callers that
+// want a plain projection must zero D_projData first.
+//
+//   D_volumeData / volumePitch : padded volume of (iVolWidth+2) x
+//                                (iVolHeight+2) floats, pitch in floats
+//   D_projData   / projPitch   : projection data, one row per angle, pitch
+//                                in floats; element 0 of each row is skipped
+//   dims                       : geometry; dims.iProjAngles must not exceed
+//                                g_MaxAngles
+//   angles                     : host array of dims.iProjAngles angles
+//   TOffsets                   : host array of per-angle detector offsets,
+//                                or 0 for "all zero"
+//   outputScale                : factor applied to every projected value
+//
+// Returns false, without launching any kernel, if there are too many
+// angles, the volume texture cannot be set up, or the angle tables cannot be
+// uploaded. Returns true otherwise.
+bool FP_simple(float* D_volumeData, unsigned int volumePitch,
+               float* D_projData, unsigned int projPitch,
+               const SDimensions& dims, const float* angles,
+               const float* TOffsets, float outputScale)
+{
+	// TODO: load angles into constant memory in smaller blocks
+	assert(dims.iProjAngles <= g_MaxAngles);
+	// The assert disappears in release builds; keep a real check as well.
+	if (dims.iProjAngles > g_MaxAngles)
+		return false;
+
+	cudaArray* D_dataArray = 0;
+	if (!bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2))
+		return false;
+
+	// Without explicit offsets, upload zeroes from a lazily created,
+	// value-initialised host buffer.
+	const float* offsets = TOffsets;
+	if (!offsets) {
+		if (!g_pfZeroes)
+			g_pfZeroes = new float[g_MaxAngles]();
+		offsets = g_pfZeroes;
+	}
+
+	if (cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice) != cudaSuccess ||
+	    cudaMemcpyToSymbol(gC_angle_offset, offsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice) != cudaSuccess) {
+		cudaFreeArray(D_dataArray);
+		return false;
+	}
+
+	dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // detector block size, angles
+
+	std::list<cudaStream_t> streams;
+
+
+	// Run over all angles, grouping them into groups of the same
+	// orientation (roughly horizontal vs. roughly vertical).
+	// Start a stream of grids for each such group.
+
+	// TODO: Check if it's worth it to store this info instead
+	// of recomputing it every FP.
+
+	unsigned int blockStart = 0;
+	unsigned int blockEnd = 0;
+	bool blockVertical = false;
+	// The loop runs one step past the last angle so that the final group is
+	// flushed too.
+	for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+		// Initialised because it is copied into blockVertical on the final
+		// (a == iProjAngles) pass, where no angle is classified.
+		bool vertical = false;
+		// TODO: Having <= instead of < below causes a 5% speedup.
+		// Maybe we should detect corner cases and put them in the optimal
+		// group of angles.
+		if (a != dims.iProjAngles)
+			vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a])));
+		if (a == dims.iProjAngles || vertical != blockVertical) {
+			// block done
+
+			blockEnd = a;
+			if (blockStart != blockEnd) {
+				dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
+				             (dims.iProjDets+g_detBlockSize-1)/g_detBlockSize); // angle blocks, detector blocks
+
+				// TODO: check if we can't immediately
+				//       destroy the stream after use
+				cudaStream_t stream;
+				cudaStreamCreate(&stream);
+				streams.push_back(stream);
+				//printf("angle block: %d to %d, %d\n", blockStart, blockEnd, blockVertical);
+				// All slice blocks of one angle group go into the same
+				// stream: they accumulate into the same output elements and
+				// must not overlap.
+				if (!blockVertical)
+					for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices)
+						FPhorizontal_simple<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
+				else
+					for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
+						FPvertical_simple<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale);
+			}
+			blockVertical = vertical;
+			blockStart = a;
+		}
+	}
+
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
+		cudaStreamDestroy(*iter);
+
+	streams.clear();
+
+	cudaThreadSynchronize();
+
+	cudaTextForceKernelsCompletion();
+
+	cudaFreeArray(D_dataArray);
+
+
+	return true;
+}
+
+// Public parallel-beam forward projection entry point.
+//
+// At present this forwards unconditionally to FP_simple() with the same
+// arguments and returns its result. Everything after that return statement
+// is inside "#if 0" and therefore not compiled: it is the launcher for the
+// region-skipping kernels FPhorizontal/FPvertical, kept for reference until
+// the bug mentioned in the TODO below is fixed.
+bool FP(float* D_volumeData, unsigned int volumePitch,
+        float* D_projData, unsigned int projPitch,
+        const SDimensions& dims, const float* angles,
+        const float* TOffsets, float outputScale)
+{
+	return FP_simple(D_volumeData, volumePitch, D_projData, projPitch,
+	                 dims, angles, TOffsets, outputScale);
+
+	// TODO: Fix bug in this non-simple FP with large detscale and TOffsets
+#if 0
+
+	// TODO: load angles into constant memory in smaller blocks
+	assert(dims.iProjAngles <= g_MaxAngles);
+
+	// TODO: compute region size dynamically to resolve these two assumptions
+	// ASSUMPTION: 16 > regionOffset / fDetScale
+	const unsigned int g_blockSliceSize = g_detBlockSize;
+	assert(16 > (g_blockSlices / g_blockSliceSize) / dims.fDetScale);
+	// ASSUMPTION: fDetScale >= 1.0f
+	assert(dims.fDetScale > 0.9999f);
+
+	cudaArray* D_dataArray;
+	bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+	cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+
+	if (TOffsets) {
+		cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+	} else {
+		if (!g_pfZeroes) {
+			g_pfZeroes = new float[g_MaxAngles];
+			memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float));
+		}
+		cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+	}
+
+	// Number of extra regions added on each side of the grid's y extent.
+	int regionOffset = g_blockSlices / g_blockSliceSize;
+
+	dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles
+
+	std::list<cudaStream_t> streams;
+
+
+	// Run over all angles, grouping them into groups of the same
+	// orientation (roughly horizontal vs. roughly vertical).
+	// Start a stream of grids for each such group.
+
+	// TODO: Check if it's worth it to store this info instead
+	// of recomputing it every FP.
+
+	unsigned int blockStart = 0;
+	unsigned int blockEnd = 0;
+	bool blockVertical = false;
+	for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+		bool vertical;
+		// TODO: Having <= instead of < below causes a 5% speedup.
+		// Maybe we should detect corner cases and put them in the optimal
+		// group of angles.
+		if (a != dims.iProjAngles)
+			vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a])));
+		if (a == dims.iProjAngles || vertical != blockVertical) {
+			// block done
+
+			blockEnd = a;
+			if (blockStart != blockEnd) {
+				unsigned int length = dims.iVolHeight;
+				if (blockVertical)
+					length = dims.iVolWidth;
+				dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock,
+				             (length+g_blockSliceSize-1)/g_blockSliceSize+2*regionOffset); // angle blocks, regions
+				// TODO: check if we can't immediately
+				//       destroy the stream after use
+				cudaStream_t stream;
+				cudaStreamCreate(&stream);
+				streams.push_back(stream);
+				//printf("angle block: %d to %d, %d\n", blockStart, blockEnd, blockVertical);
+				if (!blockVertical)
+					for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices)
+						FPhorizontal<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale);
+				else
+					for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices)
+						FPvertical<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale);
+			}
+			blockVertical = vertical;
+			blockStart = a;
+		}
+	}
+
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
+		cudaStreamDestroy(*iter);
+
+	streams.clear();
+
+	cudaThreadSynchronize();
+
+	cudaTextForceKernelsCompletion();
+
+	cudaFreeArray(D_dataArray);
+		
+
+	return true;
+#endif
+}
+
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+// Stand-alone smoke test (STANDALONE builds only): forward-project
+// "phantom.png" over 512 angles, print the squared norm of the sinogram as
+// computed on the host and on the device, and write the result to
+// "sino.png".
+int main()
+{
+	SDimensions dims;
+	dims.iVolWidth = 1024;
+	dims.iVolHeight = 1024;
+	dims.iProjAngles = 512;
+	dims.iProjDets = 1536;
+	dims.fDetScale = 1.0f;
+	dims.iRaysPerDet = 1;
+
+	// Device buffers; both carry one padding column on each side.
+	float* D_volumeData;
+	float* D_projData;
+	unsigned int volumePitch, projPitch;
+
+	allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch);
+	printf("pitch: %u\n", volumePitch);
+
+	allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch);
+	printf("pitch: %u\n", projPitch);
+
+	// Input image; the dimensions reported by the loader are not used.
+	unsigned int loadedY, loadedX;
+	float* img = loadImage("phantom.png", loadedY, loadedX);
+
+	// Host sinogram, cleared so the accumulating FP starts from zero.
+	float* sino = new float[dims.iProjAngles * dims.iProjDets];
+	memset(sino, 0, dims.iProjAngles * dims.iProjDets * sizeof(float));
+
+	copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+	copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+	// Equally spaced angles over [0, pi).
+	float* angle = new float[dims.iProjAngles];
+	for (unsigned int k = 0; k < dims.iProjAngles; ++k)
+		angle[k] = k*(M_PI/dims.iProjAngles);
+
+	FP(D_volumeData, volumePitch, D_projData, projPitch, dims, angle, 0, 1.0f);
+
+	delete[] angle;
+
+	copySinogramFromDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch);
+
+	// Host-side sum of squares, row by row.
+	float s = 0.0f;
+	for (unsigned int row = 0; row < dims.iProjAngles; ++row) {
+		for (unsigned int col = 0; col < dims.iProjDets; ++col) {
+			const unsigned int idx = row*dims.iProjDets+col;
+			s += sino[idx] * sino[idx];
+		}
+	}
+	printf("cpu norm: %f\n", s);
+
+	// Same quantity computed on the device for comparison.
+	//zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles);
+	s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+	printf("gpu norm: %f\n", s);
+
+	saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino);
+
+	return 0;
+}
+#endif
diff --git a/cuda/2d/par_fp.h b/cuda/2d/par_fp.h
new file mode 100644
index 0000000..3213b14
--- /dev/null
+++ b/cuda/2d/par_fp.h
@@ -0,0 +1,41 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_PAR_FP_H
+#define _CUDA_PAR_FP_H
+
+namespace astraCUDA {
+
+// Parallel-beam forward projection of a volume onto a sinogram.
+//
+//   D_volumeData / volumePitch : volume buffer and its row pitch
+//   D_projData   / projPitch   : projection buffer and its row pitch
+//   dims                       : geometry description
+//   angles                     : one projection angle per entry
+//   TOffsets                   : per-angle detector offsets, may be 0
+//   fOutputScale               : factor applied to the projected values
+//
+// NOTE(review): this header uses SDimensions and _AstraExport without
+// including anything itself, whereas par_bp.h includes "dims.h" for the
+// same names -- confirm every includer pulls dims.h in first.
+_AstraExport bool FP(float* D_volumeData, unsigned int volumePitch,
+        float* D_projData, unsigned int projPitch,
+        const SDimensions& dims, const float* angles,
+        const float* TOffsets, float fOutputScale);
+
+}
+
+#endif
diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu
new file mode 100644
index 0000000..a40176d
--- /dev/null
+++ b/cuda/2d/sart.cu
@@ -0,0 +1,283 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "sart.h"
+#include "util.h"
+#include "arith.h"
+#include "fan_fp.h"
+#include "fan_bp.h"
+#include "par_fp.h"
+#include "par_bp.h"
+
+namespace astraCUDA {
+
+
+// Multiply the first row of pfOut element-wise with pfIn and store the
+// product in the second row of pfOut, shifted one element to the left:
+//   pfOut[pitch + x] = pfOut[x + 1] * pfIn[x + 1]   for 0 <= x < width
+// Only threadIdx.x/blockIdx.x are used; the factor 16 is the block width
+// expected from the launcher.
+__global__ void devMUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width)
+{
+	const unsigned int col = 16*blockIdx.x + threadIdx.x;
+
+	// Row 0 is read with a one-element offset; row 1 (at +pitch) is written
+	// without it.
+	if (col < width)
+		pfOut[pitch + col] = pfOut[col + 1] * pfIn[col + 1];
+}
+
+// Launch devMUL_SART over 'width' elements: the first row of pfOut is
+// multiplied with pfIn and written, shifted left by one element, to the
+// second row of pfOut (at offset 'pitch').
+//
+//   pfOut : device buffer of at least two rows of 'pitch' floats
+//   pfIn  : device row of weights, read at indices 1 .. width
+//   pitch : row pitch of pfOut in floats
+//   width : number of elements to process
+void MUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width)
+{
+	// The kernel indexes with threadIdx.x only, so the block is one thread
+	// high: a 16x16 block would repeat every multiply and store 16 times.
+	dim3 blockSize(16,1);
+	dim3 gridSize((width+15)/16, 1);
+
+	devMUL_SART<<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width);
+
+	cudaTextForceKernelsCompletion();
+}
+
+
+
+// Start with no device buffers and the default projection order.
+SART::SART() : ReconAlgo()
+{
+	// projection order bookkeeping
+	customOrder = false;
+	projectionOrder = 0;
+	projectionCount = 0;
+	iteration = 0;
+
+	// device buffers, allocated later by init()
+	D_projData = 0;
+	D_tmpData = 0;
+	D_lineWeight = 0;
+}
+
+
+// Release everything SART owns; SART::reset() also resets the base class.
+SART::~SART()
+{
+	this->reset();
+}
+
+// Free all SART-owned buffers, return the object to its freshly constructed
+// state and then reset the ReconAlgo base.
+void SART::reset()
+{
+	// Device buffers: free each one and clear its pointer.
+	cudaFree(D_projData);
+	D_projData = 0;
+
+	cudaFree(D_tmpData);
+	D_tmpData = 0;
+
+	cudaFree(D_lineWeight);
+	D_lineWeight = 0;
+
+	// Mask flags.
+	useSinogramMask = false;
+	useVolumeMask = false;
+
+	// Projection order; delete[] on a null pointer is a no-op.
+	delete[] projectionOrder;
+	projectionOrder = 0;
+	projectionCount = 0;
+	customOrder = false;
+	iteration = 0;
+
+	ReconAlgo::reset();
+}
+
+// Allocate and clear the device buffers SART needs, and precompute the line
+// weights when no volume mask is in use. Always returns true; allocation
+// failures are not detected (see the TODO at the end).
+bool SART::init()
+{
+	// Scratch volume, only kept around permanently when masking is enabled.
+	if (useVolumeMask) {
+		allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch);
+		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	}
+
+	// HACK: D_projData consists of two lines. The first is used padded,
+	// the second unpadded. This is to satisfy the alignment requirements
+	// of resp. FP and BP_SART.
+	// Note that only the first of the two lines is zeroed here.
+	allocateVolume(D_projData, dims.iProjDets+2, 2, projPitch);
+	zeroVolume(D_projData, projPitch, dims.iProjDets+2, 1);
+	
+	// One padded row of weights per projection angle.
+	allocateVolume(D_lineWeight, dims.iProjDets+2, dims.iProjAngles, linePitch);
+	zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles);
+
+	// We can't precompute lineWeights when using a mask
+	// (with a mask, iterate() recomputes them on every call instead)
+	if (!useVolumeMask)
+		precomputeWeights();
+
+	// TODO: check if allocations succeeded
+	return true;
+}
+
+// Install a custom order in which iterate() visits the projections.
+//
+//   _projectionOrder : array of _projectionCount projection indices; it is
+//                      copied, the caller keeps ownership of its array
+//   _projectionCount : number of entries
+//
+// May be called repeatedly; the previously installed order is released.
+// Always returns true.
+bool SART::setProjectionOrder(int* _projectionOrder, int _projectionCount)
+{
+	// Allocate and fill the new copy first, so passing the currently
+	// installed array back in keeps working.
+	int* newOrder = new int[_projectionCount];
+	for (int i = 0; i < _projectionCount; i++) {
+		newOrder[i] = _projectionOrder[i];
+	}
+
+	// Drop the order installed by an earlier call so it is not leaked.
+	delete[] projectionOrder;
+
+	projectionOrder = newOrder;
+	projectionCount = _projectionCount;
+	customOrder = true;
+
+	return true;
+}
+
+
+// Fill D_lineWeight: forward-project either the volume mask or, without a
+// mask, a temporary volume filled via opSet with 1.0f, then apply opInvert
+// to the result. Always returns true.
+bool SART::precomputeWeights()
+{
+	// FP accumulates, so start from a cleared weight buffer.
+	zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles);
+
+	if (!useVolumeMask) {
+		// No mask: D_tmpData exists only for the duration of this call.
+		allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch);
+		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+		processVol<opSet, VOL>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+
+		callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f);
+
+		cudaFree(D_tmpData);
+		D_tmpData = 0;
+	} else {
+		// With a mask the mask itself is projected.
+		callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f);
+	}
+
+	processVol<opInvert, SINO>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles);
+
+	return true;
+}
+
+// Run up to 'iterations' SART updates, one projection per update. The
+// member 'iteration' keeps counting across calls, so a later call continues
+// with the next projection. Stops early when shouldAbort becomes true.
+// Always returns true.
+bool SART::iterate(unsigned int iterations)
+{
+	shouldAbort = false;
+
+	// With a mask the weights are not precomputed in init().
+	if (useVolumeMask)
+		precomputeWeights();
+
+	unsigned int done = 0;
+	while (done < iterations && !shouldAbort) {
+
+		// Pick the projection for this update.
+		int angle;
+		if (!customOrder) {
+			angle = iteration % dims.iProjAngles;
+		} else {
+			angle = projectionOrder[iteration % projectionCount];
+		}
+
+		// Line 0 of D_projData <- measured sinogram row (padding included).
+		cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData + angle*sinoPitch, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), 1, cudaMemcpyDeviceToDevice);
+
+		// Choose what gets forward-projected: the volume itself, or a masked
+		// copy of it held in D_tmpData.
+		float* D_fpSource = D_volumeData;
+		unsigned int fpSourcePitch = volumePitch;
+		if (useVolumeMask) {
+			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
+			processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+			D_fpSource = D_tmpData;
+			fpSourcePitch = tmpPitch;
+		}
+
+		// Scale -1: the projection is subtracted from the sinogram row.
+		callFP_SART(D_fpSource, fpSourcePitch, D_projData, projPitch, angle, -1.0f);
+
+		// Weight the residual; the result lands in line 1 of D_projData.
+		MUL_SART(D_projData, D_lineWeight + angle*linePitch, projPitch, dims.iProjDets);
+
+		if (!useVolumeMask) {
+			callBP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle);
+		} else {
+			// BP, mask, and add back
+			// TODO: Try putting the masking directly in the BP
+			zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+			callBP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle);
+			processVol<opAddMul, VOL>(D_volumeData, D_maskData, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+		}
+
+		// Optional value constraints on the updated volume.
+		if (useMinConstraint)
+			processVol<opClampMin, VOL>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+		if (useMaxConstraint)
+			processVol<opClampMax, VOL>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+
+		iteration++;
+		++done;
+	}
+
+	return true;
+}
+
+// Return the 2-norm of (sinogram - forward projection of the current
+// volume). With a volume mask, the masked volume is projected.
+//
+// A temporary full-size projection buffer is allocated for the residual and
+// released before returning. When a mask is in use, D_tmpData is
+// overwritten.
+float SART::computeDiffNorm()
+{
+	unsigned int pPitch;
+	float *D_p;
+	allocateVolume(D_p, dims.iProjDets+2, dims.iProjAngles, pPitch);
+	zeroVolume(D_p, pPitch, dims.iProjDets+2, dims.iProjAngles);
+
+	// copy sinogram to D_p
+	cudaMemcpy2D(D_p, sizeof(float)*pPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+
+	// do FP, subtracting projection from sinogram
+	// The target must be D_p: D_projData holds only two rows and is not the
+	// buffer whose norm is taken below.
+	if (useVolumeMask) {
+			cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
+			processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+			callFP(D_tmpData, tmpPitch, D_p, pPitch, -1.0f);
+	} else {
+			callFP(D_volumeData, volumePitch, D_p, pPitch, -1.0f);
+	}
+
+
+	// compute norm of D_p
+	float s = dotProduct2D(D_p, pPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+
+	cudaFree(D_p);
+
+	return sqrt(s);
+}
+
+// Forward-project a single projection ('angle') of the given volume into
+// row 0 of D_projData, scaled by outputScale, using the parallel-beam FP
+// when 'angles' is set and the fan-beam FP otherwise.
+//
+// The projector is handed a one-angle geometry together with pointers that
+// start at the selected entry, so it sees that entry as its angle 0.
+// Returns the projector's result.
+bool SART::callFP_SART(float* D_volumeData, unsigned int volumePitch,
+                       float* D_projData, unsigned int projPitch,
+                       unsigned int angle, float outputScale)
+{
+	SDimensions d = dims;
+	d.iProjAngles = 1;
+	if (angles) {
+		assert(!fanProjs);
+		// The detector offsets must be advanced exactly like the angles;
+		// otherwise every projection would use the offset of angle 0.
+		// A null TOffsets (no offsets) is passed through unchanged.
+		const float* angleOffset = TOffsets ? &TOffsets[angle] : 0;
+		return FP(D_volumeData, volumePitch, D_projData, projPitch,
+		          d, &angles[angle], angleOffset, outputScale);
+	} else {
+		assert(fanProjs);
+		return FanFP(D_volumeData, volumePitch, D_projData, projPitch,
+		             d, &fanProjs[angle], outputScale);
+	}
+}
+
+// Back-project the single projection 'angle' into the given volume, using
+// the parallel-beam BP when 'angles' is set and the fan-beam BP otherwise.
+// The projection is taken from the second row of D_projData (offset by one
+// pitch). Returns the projector's result.
+bool SART::callBP_SART(float* D_volumeData, unsigned int volumePitch,
+                       float* D_projData, unsigned int projPitch,
+                       unsigned int angle)
+{
+	if (!angles) {
+		// fan-beam geometry
+		assert(fanProjs);
+		return FanBP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch,
+		                  angle, dims, fanProjs);
+	}
+
+	// parallel-beam geometry
+	assert(!fanProjs);
+	return BP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch,
+	               angle, dims, angles, TOffsets);
+}
+
+
+}
+
+
diff --git a/cuda/2d/sart.h b/cuda/2d/sart.h
new file mode 100644
index 0000000..ad80259
--- /dev/null
+++ b/cuda/2d/sart.h
@@ -0,0 +1,85 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_SART_H
+#define _CUDA_SART_H
+
+#include "util.h"
+#include "algo.h"
+
+namespace astraCUDA {
+
+class _AstraExport SART : public ReconAlgo { // SART reconstruction object built on the ReconAlgo base
+public:
+	SART();
+	~SART();
+
+	// disable some features: this override always returns false
+	virtual bool enableSinogramMask() { return false; }
+
+	virtual bool init();
+
+	virtual bool setProjectionOrder(int* projectionOrder, int projectionCount);
+
+	virtual bool iterate(unsigned int iterations);
+
+	virtual float computeDiffNorm();
+
+protected:
+	void reset();
+	bool precomputeWeights();
+	// Single-angle projection wrappers: `angle` indexes into the stored geometry arrays.
+	bool callFP_SART(float* D_volumeData, unsigned int volumePitch,
+	                 float* D_projData, unsigned int projPitch,
+	                 unsigned int angle, float outputScale);
+	bool callBP_SART(float* D_volumeData, unsigned int volumePitch,
+	                 float* D_projData, unsigned int projPitch,
+	                 unsigned int angle);
+
+
+	// projection angle variables
+	bool customOrder; // presumably set once setProjectionOrder() supplied an order - confirm in sart.cu
+	int* projectionOrder;
+	int projectionCount;
+	int iteration;
+
+ 	// Temporary buffers
+	float* D_projData; // device pointer
+	unsigned int projPitch; // row stride of D_projData in floats (pitches are multiplied by sizeof(float) at use)
+
+	float* D_tmpData; // Only used when there's a volume mask
+	unsigned int tmpPitch;
+
+	// Geometry-specific precomputed data
+	float* D_lineWeight;
+	unsigned int linePitch;
+};
+
+}
+
+#endif
diff --git a/cuda/2d/sirt.cu b/cuda/2d/sirt.cu
new file mode 100644
index 0000000..31954e4
--- /dev/null
+++ b/cuda/2d/sirt.cu
@@ -0,0 +1,342 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "sirt.h"
+#include "util.h"
+#include "arith.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+namespace astraCUDA {
+
+// Start with no device buffers attached; the pitch members stay unset until buffers are assigned.
+SIRT::SIRT()
+	: ReconAlgo(),
+	  D_projData(0),
+	  D_tmpData(0),
+	  D_lineWeight(0),
+	  D_pixelWeight(0),
+	  freeMinMaxMasks(false),
+	  D_minMaskData(0),
+	  D_maxMaskData(0)
+{
+	// Everything is set in the initializer list, which follows the member declaration order.
+}
+
+
+SIRT::~SIRT()
+{
+	reset(); // SIRT::reset(): frees the device buffers held here, then calls ReconAlgo::reset()
+}
+
+// Release SIRT's own device buffers, clear the mask flags, then reset the ReconAlgo base state.
+void SIRT::reset()
+{
+	// Work buffers and precomputed weights (cudaFree of a null pointer performs no operation).
+	cudaFree(D_projData);
+	D_projData = 0;
+	cudaFree(D_tmpData);
+	D_tmpData = 0;
+	cudaFree(D_lineWeight);
+	D_lineWeight = 0;
+	cudaFree(D_pixelWeight);
+	D_pixelWeight = 0;
+
+	// The min/max masks are freed only while the ownership flag set by uploadMinMaxMasks() is on.
+	if (freeMinMaxMasks) {
+		cudaFree(D_minMaskData);
+		cudaFree(D_maxMaskData);
+		freeMinMaxMasks = false;
+	}
+	D_minMaskData = 0;
+	D_maxMaskData = 0;
+
+	useSinogramMask = false;
+	useVolumeMask = false;
+	ReconAlgo::reset();
+}
+
+// Allocate and zero the padded work and weight buffers; returns false if a device allocation fails.
+bool SIRT::init()
+{
+	const unsigned int volW = dims.iVolWidth+2, volH = dims.iVolHeight+2;
+	const unsigned int sinoW = dims.iProjDets+2, sinoH = dims.iProjAngles;
+
+	// Stop at the first failed allocation instead of zeroing and later using an invalid buffer.
+	if (!allocateVolume(D_pixelWeight, volW, volH, pixelPitch) ||
+	    !allocateVolume(D_tmpData, volW, volH, tmpPitch) ||
+	    !allocateVolume(D_projData, sinoW, sinoH, projPitch) ||
+	    !allocateVolume(D_lineWeight, sinoW, sinoH, linePitch))
+		return false;
+
+	zeroVolume(D_pixelWeight, pixelPitch, volW, volH);
+	zeroVolume(D_tmpData, tmpPitch, volW, volH);
+	zeroVolume(D_projData, projPitch, sinoW, sinoH);
+	zeroVolume(D_lineWeight, linePitch, sinoW, sinoH);
+
+	// We can't precompute lineWeights and pixelWeights when using a mask (iterate() does it then).
+	return useVolumeMask || useSinogramMask || precomputeWeights();
+}
+
+bool SIRT::precomputeWeights()
+{ // Fills D_lineWeight and D_pixelWeight from the current geometry and whichever masks are enabled.
+	zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); // line weights: FP of the volume mask, or of a constant-1 volume
+	if (useVolumeMask) {
+		callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f);
+	} else {
+		processVol<opSet, VOL>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight); // overwrites D_tmpData
+		callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f);
+	}
+	processVol<opInvert, SINO>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); // opInvert: presumably the element-wise reciprocal - confirm in arith.h
+
+	if (useSinogramMask) {
+		// scale line weights with sinogram mask to zero out masked sinogram pixels
+		processVol<opMul, SINO>(D_lineWeight, D_smaskData, linePitch, dims.iProjDets, dims.iProjAngles);
+	}
+
+	// Pixel weights follow the same scheme: BP of the sinogram mask, or of a constant-1 sinogram.
+	zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	if (useSinogramMask) {
+		callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch);
+	} else {
+		processVol<opSet, SINO>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles); // overwrites D_projData
+		callBP(D_pixelWeight, pixelPitch, D_projData, projPitch);
+	}
+	processVol<opInvert, VOL>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+
+	if (useVolumeMask) {
+		// scale pixel weights with mask to zero out masked pixels
+		processVol<opMul, VOL>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight);
+	}
+
+	return true; // no failure path: the results of callFP/callBP are not checked
+}
+
+bool SIRT::setMinMaxMasks(float* D_minMaskData_, float* D_maxMaskData_,
+	                      unsigned int iPitch)
+{
+	// Adopt caller-provided device buffers; both masks use the same pitch.
+	freeMinMaxMasks = false; // so that reset() does not cudaFree these pointers
+	minMaskPitch = maxMaskPitch = iPitch;
+	D_minMaskData = D_minMaskData_;
+	D_maxMaskData = D_maxMaskData_;
+
+	return true;
+}
+
+// Copy host-side min/max masks into newly allocated, padded device buffers that reset() will
+// free. Either host pointer may be null to skip that mask. Returns false when a device
+// allocation or an upload reports failure.
+bool SIRT::uploadMinMaxMasks(const float* pfMinMaskData, const float* pfMaxMaskData,
+	                         unsigned int iPitch)
+{
+	freeMinMaxMasks = true;
+	if (pfMinMaskData) {
+		// Never upload into a buffer whose allocation failed.
+		if (!allocateVolume(D_minMaskData, dims.iVolWidth+2, dims.iVolHeight+2, minMaskPitch))
+			return false;
+		if (!copyVolumeToDevice(pfMinMaskData, iPitch, dims.iVolWidth, dims.iVolHeight,
+		                        D_minMaskData, minMaskPitch))
+			return false;
+	}
+
+	if (pfMaxMaskData) {
+		if (!allocateVolume(D_maxMaskData, dims.iVolWidth+2, dims.iVolHeight+2, maxMaskPitch))
+			return false;
+		if (!copyVolumeToDevice(pfMaxMaskData, iPitch, dims.iVolWidth, dims.iVolHeight,
+		                        D_maxMaskData, maxMaskPitch))
+			return false;
+	}
+	return true;
+}
+
+bool SIRT::iterate(unsigned int iterations)
+{
+	shouldAbort = false; // cleared on entry; the loop condition below polls it before every iteration
+
+	if (useVolumeMask || useSinogramMask) // init() skips the weights when a mask is enabled, so compute them here
+		precomputeWeights();
+
+	// iteration
+	for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
+
+		// copy sinogram to projection data
+		cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+
+		// do FP, subtracting projection from sinogram
+		if (useVolumeMask) {
+				cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
+				processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+				callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f);
+		} else {
+				callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f);
+		}
+		// D_projData now holds the difference; multiply it element-wise by the line weights
+		processVol<opMul, SINO>(D_projData, D_lineWeight, projPitch, dims.iProjDets, dims.iProjAngles);
+		// back-project the weighted difference into a freshly zeroed D_tmpData
+		zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2);
+
+		callBP(D_tmpData, tmpPitch, D_projData, projPitch);
+		// update step: opAddMul combines D_volumeData with D_pixelWeight and D_tmpData (see arith.h)
+		processVol<opAddMul, VOL>(D_volumeData, D_pixelWeight, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+		// optional scalar and per-pixel clamping of the updated volume
+		if (useMinConstraint)
+			processVol<opClampMin, VOL>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+		if (useMaxConstraint)
+			processVol<opClampMax, VOL>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight);
+		if (D_minMaskData)
+			processVol<opClampMinMask, VOL>(D_volumeData, D_minMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+		if (D_maxMaskData)
+			processVol<opClampMaxMask, VOL>(D_volumeData, D_maxMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight);
+	}
+
+	return true; // always true: copy, projection and processVol results are not checked
+}
+
+// Returns sqrt(dotProduct2D(...)) of what is left in D_projData after the sinogram is copied
+// in and the forward projection of the (optionally masked) volume is applied with scale -1.
+float SIRT::computeDiffNorm()
+{
+	// copy sinogram to projection data
+	cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice);
+
+	// Choose what to project: the volume itself, or a masked copy staged in D_tmpData.
+	float* D_src = D_volumeData;
+	unsigned int srcPitch = volumePitch;
+	if (useVolumeMask) {
+		cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice);
+		processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight);
+		D_src = D_tmpData;
+		srcPitch = tmpPitch;
+	}
+	callFP(D_src, srcPitch, D_projData, projPitch, -1.0f);
+
+	const float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0);
+	return sqrt(s);
+}
+
+
+// One-shot SIRT driver: configure a temporary SIRT object on caller-provided device buffers and
+// run `iterations` iterations. D_maskData and TOffsets are optional (pass 0 to leave them out).
+bool doSIRT(float* D_volumeData, unsigned int volumePitch,
+            float* D_sinoData, unsigned int sinoPitch,
+            float* D_maskData, unsigned int maskPitch,
+            const SDimensions& dims, const float* angles,
+            const float* TOffsets, unsigned int iterations)
+{
+	SIRT sirt;
+	const bool haveMask = (D_maskData != 0);
+
+	// Configuration: every applicable setter runs even if an earlier one failed.
+	bool configured = sirt.setGeometry(dims, angles);
+	if (haveMask && !sirt.enableVolumeMask())
+		configured = false;
+	if (TOffsets && !sirt.setTOffsets(TOffsets))
+		configured = false;
+	if (!configured)
+		return false;
+
+	if (!sirt.init())
+		return false;
+
+	// Attach the buffers; both calls are made before the combined result is checked.
+	bool attached = true;
+	if (haveMask)
+		attached = sirt.setVolumeMask(D_maskData, maskPitch);
+	if (!sirt.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch))
+		attached = false;
+	if (!attached)
+		return false;
+	return sirt.iterate(iterations);
+}
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA;
+
+// Standalone smoke test: run 25 SIRT iterations on sino.png and write the volume to vol.png.
+// Returns 1 if a device buffer cannot be allocated, 0 otherwise.
+int main()
+{
+	SDimensions dims;
+	dims.iVolWidth = 1024;
+	dims.iVolHeight = 1024;
+	dims.iProjAngles = 512;
+	dims.iProjDets = 1536;
+	dims.fDetScale = 1.0f;
+	dims.iRaysPerDet = 1;
+
+	// Device buffers are padded: the volume by 2 in both directions, the sinogram by 2 in width.
+	float* D_volumeData = 0;
+	float* D_sinoData = 0;
+	unsigned int volumePitch, sinoPitch;
+
+	if (!allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch))
+		return 1; // zeroing or uploading into a failed allocation would be invalid
+	zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2);
+	printf("pitch: %u\n", volumePitch);
+
+	if (!allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch))
+		return 1;
+	zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles);
+	printf("pitch: %u\n", sinoPitch);
+
+	unsigned int y, x;
+	float* sino = loadImage("sino.png", y, x); // NOTE(review): the result and the returned size are not validated
+	copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_sinoData, sinoPitch);
+
+	// Equally spaced angles: i * pi / iProjAngles.
+	float* angle = new float[dims.iProjAngles];
+	for (unsigned int i = 0; i < dims.iProjAngles; ++i)
+		angle[i] = i*(M_PI/dims.iProjAngles);
+
+	SIRT sirt;
+	sirt.setGeometry(dims, angle);
+	sirt.init();
+	sirt.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch);
+	sirt.iterate(25);
+
+	delete[] angle;
+
+	// Download the unpadded interior of the volume and write it out.
+	float* img = new float[dims.iVolWidth*dims.iVolHeight];
+	copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch);
+	saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img);
+	delete[] img; // was leaked before
+
+	return 0;
+}
+#endif
+
diff --git a/cuda/2d/sirt.h b/cuda/2d/sirt.h
new file mode 100644
index 0000000..5592616
--- /dev/null
+++ b/cuda/2d/sirt.h
@@ -0,0 +1,90 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_SIRT_H
+#define _CUDA_SIRT_H
+
+#include "util.h"
+#include "algo.h"
+
+namespace astraCUDA {
+
+class _AstraExport SIRT : public ReconAlgo { // SIRT reconstruction object built on the ReconAlgo base
+public:
+	SIRT();
+	~SIRT();
+
+	virtual bool init(); // allocates and zeroes the four device buffers declared below
+
+	// Set min/max masks to existing GPU memory buffers
+	bool setMinMaxMasks(float* D_minMaskData, float* D_maxMaskData,
+	                    unsigned int pitch);
+
+	// Set min/max masks from RAM buffers
+	bool uploadMinMaxMasks(const float* minMaskData, const float* maxMaskData,
+	                       unsigned int pitch);
+
+	virtual bool iterate(unsigned int iterations); // runs up to `iterations` updates, stopping early when shouldAbort is set
+
+	virtual float computeDiffNorm(); // square root of the dotProduct2D result over the projection difference
+
+protected:
+	void reset();
+	bool precomputeWeights();
+
+ 	// Temporary buffers
+	float* D_projData; // allocated as (iProjDets+2) x iProjAngles
+	unsigned int projPitch; // pitches are row strides in floats
+
+	float* D_tmpData; // allocated as (iVolWidth+2) x (iVolHeight+2)
+	unsigned int tmpPitch;
+
+	// Geometry-specific precomputed data
+	float* D_lineWeight; // same size as D_projData
+	unsigned int linePitch;
+
+	float* D_pixelWeight; // same size as D_tmpData
+	unsigned int pixelPitch;
+
+	// Masks
+	bool freeMinMaxMasks; // true once uploadMinMaxMasks() ran: reset() then cudaFree's both mask pointers
+	float* D_minMaskData; // 0 when unset; iterate() applies the clamp only for non-null pointers
+	unsigned int minMaskPitch;
+	float* D_maxMaskData;
+	unsigned int maxMaskPitch;
+};
+
+bool doSIRT(float* D_volumeData, unsigned int volumePitch,
+            float* D_projData, unsigned int projPitch,
+            float* D_maskData, unsigned int maskPitch,
+            const SDimensions& dims, const float* angles,
+            const float* TOffsets, unsigned int iterations);
+
+}
+
+#endif
diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu
new file mode 100644
index 0000000..06f6714
--- /dev/null
+++ b/cuda/2d/util.cu
@@ -0,0 +1,244 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include "util.h"
+
+namespace astraCUDA {
+
+// Zero the padded device buffer, then upload the host image into its interior; false on a CUDA error.
+bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch,
+		unsigned int width, unsigned int height,
+		float* outD_data, unsigned int out_pitch)
+{
+	// TODO: a full memset isn't necessary. Only the edges.
+	cudaError_t err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, sizeof(float)*(width+2), height+2);
+	ASTRA_CUDA_ASSERT(err); // reports the error; aborts only when assert() is enabled
+	if (err != cudaSuccess) return false;
+	err = cudaMemcpy2D(outD_data + out_pitch + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
+	ASTRA_CUDA_ASSERT(err);
+	return err == cudaSuccess; // was an unconditional `true`
+}
+
+// Download the interior (from row 1, column 1) of a padded device volume; false on a CUDA error.
+bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch, unsigned int width,
+		unsigned int height, float* inD_data, unsigned int in_pitch)
+{
+	cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + (in_pitch + 1), sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
+	ASTRA_CUDA_ASSERT(err); // reports the error; aborts only when assert() is enabled
+	return err == cudaSuccess; // was an unconditional `true`
+}
+
+
+// Download a padded device sinogram, skipping the first column of every row; false on a CUDA error.
+bool copySinogramFromDevice(float* out_data, unsigned int out_pitch, unsigned int width,
+		unsigned int height, float* inD_data, unsigned int in_pitch)
+{
+	cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + 1, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost);
+	ASTRA_CUDA_ASSERT(err); // reports the error; aborts only when assert() is enabled
+	return err == cudaSuccess; // was an unconditional `true`
+}
+
+// Zero the padded device sinogram, then upload the host rows starting at column 1; false on a CUDA error.
+bool copySinogramToDevice(const float* in_data, unsigned int in_pitch, unsigned int width,
+		unsigned int height, float* outD_data, unsigned int out_pitch)
+{
+	// TODO: a full memset isn't necessary. Only the edges.
+	cudaError_t err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, (width+2)*sizeof(float), height);
+	ASTRA_CUDA_ASSERT(err); // reports the error; aborts only when assert() is enabled
+	if (err != cudaSuccess) return false;
+	err = cudaMemcpy2D(outD_data + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice);
+	ASTRA_CUDA_ASSERT(err);
+	return err == cudaSuccess; // was an unconditional `true`
+}
+
+
+// Allocate a pitched width x height float buffer on the device. On success `pitch` receives the
+// row stride in floats; on failure the error is reported, `ptr` is nulled and false is returned.
+bool allocateVolume(float*& ptr, unsigned int width, unsigned int height, unsigned int& pitch)
+{
+	size_t p = 0;
+	cudaError_t ret = cudaMallocPitch((void**)&ptr, &p, sizeof(float)*width, height);
+	if (ret != cudaSuccess) {
+		reportCudaError(ret);
+		fprintf(stderr, "Failed to allocate %ux%u GPU buffer\n", width, height); // %u: both arguments are unsigned
+		ptr = 0; // keeps a later cudaFree() of this pointer harmless
+		return false;
+	}
+	assert(p % sizeof(float) == 0);
+	pitch = p / sizeof(float);
+	return true;
+}
+
+void zeroVolume(float* data, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	// Clear `height` rows of `width` floats each; `pitch` is the row stride in floats, not bytes.
+	const cudaError_t status = cudaMemset2D(data, sizeof(float)*pitch, 0, sizeof(float)*width, height);
+	ASTRA_CUDA_ASSERT(status);
+}
+
+
+template <unsigned int blockSize> // sums n floats from g_idata; dotProduct2D launches it with blockSize threads in one block
+__global__ void reduce1D(float *g_idata, float *g_odata, unsigned int n)
+{
+	extern __shared__ float sdata[]; // dynamic shared memory, sized by the third launch parameter
+	unsigned int tid = threadIdx.x;
+
+	unsigned int i = blockIdx.x*(blockSize*2) + tid; // NOTE(review): start uses blockSize*2 but the stride below uses blockSize; they only agree for a single block
+	unsigned int gridSize = blockSize*gridDim.x;
+	sdata[tid] = 0;
+	while (i < n) { sdata[tid] += g_idata[i]; i += gridSize; } // per-thread partial sum over elements i, i+gridSize, ...
+	__syncthreads();
+	if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); } // tree reduction in shared memory, one barrier per halving
+	if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
+	if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
+	if (tid < 32) { // NOTE(review): no barrier between the steps below; relies on the threads of a warp running in lockstep - confirm for GPUs with independent thread scheduling
+		volatile float* smem = sdata; // volatile access, so every step re-reads shared memory
+		if (blockSize >= 64) smem[tid] += smem[tid + 32];
+		if (blockSize >= 32) smem[tid] += smem[tid + 16]; // may read past index blockSize-1; dotProduct2D passes 2*blockSize floats for its small instantiations
+		if (blockSize >= 16) smem[tid] += smem[tid + 8];
+		if (blockSize >= 8) smem[tid] += smem[tid + 4];
+		if (blockSize >= 4) smem[tid] += smem[tid + 2];
+		if (blockSize >= 2) smem[tid] += smem[tid + 1];
+	}
+	if (tid == 0) g_odata[blockIdx.x] = sdata[0]; // one result per block
+}
+
+__global__ void reduce2D(float *g_idata, float *g_odata,
+                         unsigned int pitch,
+                         unsigned int nx, unsigned int ny,
+                         unsigned int padX, unsigned int padY)
+{ // Per-block sum of squares over columns [padX, padX+nx) and rows [padY, padY+ny); pitch is in floats.
+	extern __shared__ float sdata[]; // one float per thread; dotProduct2D passes 16*16 floats
+	const unsigned int tidx = threadIdx.x;
+	const unsigned int tidy = threadIdx.y;
+	const unsigned int tid = tidy * 16 + tidx; // flat index; the hard-coded 16 matches the 16x16 block used by dotProduct2D
+
+	unsigned int x = blockIdx.x*16 + tidx;
+	unsigned int y = blockIdx.y*16 + tidy;
+	// each thread handles column x and rows y, y + 16*gridDim.y, ...
+	sdata[tid] = 0;
+
+	if (x >= padX && x < padX + nx) {
+		// accumulate the squares of the in-range elements only
+		while (y < padY + ny) {
+			if (y >= padY)
+				sdata[tid] += (g_idata[pitch*y+x] * g_idata[pitch*y+x]);
+			y += 16 * gridDim.y;
+		}
+
+	}
+
+	__syncthreads();
+	// tree reduction of the 256 per-thread sums down to sdata[0]
+	if (tid < 128)
+		sdata[tid] += sdata[tid + 128];
+	__syncthreads();
+
+	if (tid < 64)
+		sdata[tid] += sdata[tid + 64];
+	__syncthreads();
+
+	if (tid < 32) { // 32 is warp size
+		volatile float* smem = sdata; // NOTE(review): the steps below have no barrier and rely on lockstep execution within the warp - confirm for GPUs with independent thread scheduling
+		smem[tid] += smem[tid + 32];
+		smem[tid] += smem[tid + 16];
+		smem[tid] += smem[tid + 8];
+		smem[tid] += smem[tid + 4];
+		smem[tid] += smem[tid + 2];
+		smem[tid] += smem[tid + 1];
+	} 
+
+	if (tid == 0)
+		g_odata[blockIdx.y * gridDim.x + blockIdx.x] = sdata[0]; // one partial sum per block, row-major over the grid
+}
+
+// Sum of squares of a pitched device image over columns [padX, padX+width) and rows
+// [padY, padY+height). Returns 0 (after reporting) if the scratch buffer cannot be allocated.
+float dotProduct2D(float* D_data, unsigned int pitch,
+                   unsigned int width, unsigned int height,
+                   unsigned int padX, unsigned int padY)
+{
+	unsigned int bx = ((width+padX) + 15) / 16;
+	unsigned int by = ((height+padY) + 127) / 128;
+	unsigned int shared_mem2 = sizeof(float) * 16 * 16;
+	dim3 dimBlock2(16, 16), dimGrid2(bx, by);
+
+	float* D_buf = 0; // bx*by per-block partial sums plus one trailing slot for the final result
+	cudaError_t err = cudaMalloc(&D_buf, sizeof(float) * (bx * by + 1) );
+	if (err != cudaSuccess) {
+		reportCudaError(err); // the kernels below must not run on an unallocated buffer
+		return 0.0f;
+	}
+
+	// Step 1: reduce 2D from image to a single vector, taking sum of squares
+	reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height, padX, padY);
+	cudaTextForceKernelsCompletion();
+
+	// Step 2: reduce 1D: add up elements in vector
+	if (bx * by > 512)
+		reduce1D<512><<< 1, 512, sizeof(float)*512>>>(D_buf, D_buf+(bx*by), bx*by);
+	else if (bx * by > 128)
+		reduce1D<128><<< 1, 128, sizeof(float)*128>>>(D_buf, D_buf+(bx*by), bx*by);
+	else if (bx * by > 32)
+		reduce1D<32><<< 1, 32, sizeof(float)*32*2>>>(D_buf, D_buf+(bx*by), bx*by);
+	else if (bx * by > 8)
+		reduce1D<8><<< 1, 8, sizeof(float)*8*2>>>(D_buf, D_buf+(bx*by), bx*by);
+	else
+		reduce1D<1><<< 1, 1, sizeof(float)*1*2>>>(D_buf, D_buf+(bx*by), bx*by);
+
+	float x = 0.0f; // defined result even if the copy below fails
+	cudaMemcpy(&x, D_buf+(bx*by), sizeof(float), cudaMemcpyDeviceToHost);
+	cudaTextForceKernelsCompletion();
+	cudaFree(D_buf);
+	return x;
+}
+
+
+// Wait for cudaThreadSynchronize() to return; logs to stderr and returns false if it reports an error.
+bool cudaTextForceKernelsCompletion()
+{
+	const cudaError_t syncStatus = cudaThreadSynchronize();
+	if (syncStatus == cudaSuccess)
+		return true;
+
+	// NOTE(review): newer CUDA toolkits deprecate cudaThreadSynchronize in favour of cudaDeviceSynchronize.
+	fprintf(stderr, "Failed to force completion of cuda kernels: %d: %s.\n", syncStatus, cudaGetErrorString(syncStatus));
+	return false;
+}
+
+void reportCudaError(cudaError_t err)
+{
+	if (err == cudaSuccess) return; // nothing to print for a successful call
+	fprintf(stderr, "CUDA error %d: %s.\n", err, cudaGetErrorString(err));
+}
+
+
+
+}
diff --git a/cuda/2d/util.h b/cuda/2d/util.h
new file mode 100644
index 0000000..d31e2eb
--- /dev/null
+++ b/cuda/2d/util.h
@@ -0,0 +1,90 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_UTIL_H
+#define _CUDA_UTIL_H
+
+#include <cuda.h>
+#include <driver_types.h>
+
+#ifdef _MSC_VER
+
+#ifdef DLL_EXPORTS
+#define _AstraExport __declspec(dllexport)
+#define EXPIMP_TEMPLATE
+#else
+#define _AstraExport __declspec(dllimport)
+#define EXPIMP_TEMPLATE extern
+#endif
+
+#else
+
+#define _AstraExport
+
+#endif
+
+#include "dims.h"
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+// Report and assert on a failed CUDA call. `err` is evaluated exactly once; assert() must be declared at the use site.
+#define ASTRA_CUDA_ASSERT(err) do { cudaError_t astra_cuda_err_ = (err); if (astra_cuda_err_ != cudaSuccess) { astraCUDA::reportCudaError(astra_cuda_err_); assert(astra_cuda_err_ == cudaSuccess); } } while(0)
+
+
+namespace astraCUDA {
+// Host<->device copies. All pitches are row strides in floats; width/height describe the host image.
+bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch,
+		unsigned int width, unsigned int height,
+		float* outD_data, unsigned int out_pitch); // device buffer is (width+2) x (height+2); data lands at row 1, column 1
+bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch,
+		unsigned int width, unsigned int height,
+		float* inD_data, unsigned int in_pitch); // reads starting at row 1, column 1 of the device buffer
+bool copySinogramFromDevice(float* out_data, unsigned int out_pitch,
+		unsigned int width, unsigned int height,
+		float* inD_data, unsigned int in_pitch); // reads starting at column 1 of every device row
+bool copySinogramToDevice(const float* in_data, unsigned int in_pitch,
+		unsigned int width, unsigned int height,
+		float* outD_data, unsigned int out_pitch); // device rows are width+2 wide; data lands at column 1
+// Pitched allocation via cudaMallocPitch; `pitch` receives the row stride in floats. False on failure.
+bool allocateVolume(float*& D_ptr, unsigned int width, unsigned int height, unsigned int& pitch);
+// Sets `height` rows of `width` floats to zero.
+void zeroVolume(float* D_data, unsigned int pitch, unsigned int width, unsigned int height);
+// Synchronisation and error-printing helpers; both write their messages to stderr.
+bool cudaTextForceKernelsCompletion();
+void reportCudaError(cudaError_t err);
+
+// Sum of squares of the elements in columns [padX, padX+width) and rows [padY, padY+height);
+// despite the name, only a single image is involved.
+float dotProduct2D(float* D_data, unsigned int pitch,
+                   unsigned int width, unsigned int height,
+                   unsigned int padX, unsigned int padY);
+
+}
+
+#endif
diff --git a/cuda/3d/algo3d.cu b/cuda/3d/algo3d.cu
new file mode 100644
index 0000000..20e7381
--- /dev/null
+++ b/cuda/3d/algo3d.cu
@@ -0,0 +1,108 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cassert>
+
+#include "algo3d.h"
+#include "cone_fp.h"
+#include "cone_bp.h"
+#include "par3d_fp.h"
+#include "par3d_bp.h"
+
+namespace astraCUDA3d {
+
+// Start with no projection geometry stored and the abort flag cleared.
+ReconAlgo3D::ReconAlgo3D()
+	: coneProjs(0), par3DProjs(0), shouldAbort(false)
+{
+	// `dims` is not touched here; the set*Geometry() calls assign it.
+}
+
+ReconAlgo3D::~ReconAlgo3D()
+{
+	reset(); // deletes the stored projection arrays
+}
+
+void ReconAlgo3D::reset()
+{ // Drop whichever projection array is held and clear the abort flag.
+	delete[] par3DProjs; // delete[] of a null pointer does nothing
+	par3DProjs = 0;
+	delete[] coneProjs;
+	coneProjs = 0;
+	shouldAbort = false;
+}
+
+bool ReconAlgo3D::setConeGeometry(const SDimensions3D& _dims, const SConeProjection* _angles)
+{ // Stores _dims and a private copy of its iProjAngles cone projections; always returns true.
+	delete[] coneProjs; // release any earlier geometry: it was leaked when geometry was set more than once
+	delete[] par3DProjs;
+	coneProjs = 0;
+	par3DProjs = 0;
+	dims = _dims;
+	coneProjs = new SConeProjection[dims.iProjAngles];
+	memcpy(coneProjs, _angles, sizeof(coneProjs[0]) * dims.iProjAngles);
+	return true;
+}
+
+bool ReconAlgo3D::setPar3DGeometry(const SDimensions3D& _dims, const SPar3DProjection* _angles)
+{ // Stores _dims and a private copy of its iProjAngles parallel projections; always returns true.
+	delete[] par3DProjs; // release any earlier geometry: it was leaked when geometry was set more than once
+	delete[] coneProjs;
+	par3DProjs = 0;
+	coneProjs = 0;
+	dims = _dims;
+	par3DProjs = new SPar3DProjection[dims.iProjAngles];
+	memcpy(par3DProjs, _angles, sizeof(par3DProjs[0]) * dims.iProjAngles);
+	return true;
+}
+
+
+// Forward-projection dispatch: ConeFP when cone projections are stored, Par3DFP otherwise.
+bool ReconAlgo3D::callFP(cudaPitchedPtr& D_volumeData,
+                       cudaPitchedPtr& D_projData,
+                       float outputScale)
+{
+	if (!coneProjs)
+		return Par3DFP(D_volumeData, D_projData, dims, par3DProjs, outputScale);
+
+	return ConeFP(D_volumeData, D_projData, dims, coneProjs, outputScale);
+}
+
+// Backprojection dispatch: ConeBP when cone projections are stored, Par3DBP otherwise.
+bool ReconAlgo3D::callBP(cudaPitchedPtr& D_volumeData,
+                       cudaPitchedPtr& D_projData)
+{
+	if (!coneProjs)
+		return Par3DBP(D_volumeData, D_projData, dims, par3DProjs);
+
+	return ConeBP(D_volumeData, D_projData, dims, coneProjs);
+}
+
+
+
+}
diff --git a/cuda/3d/algo3d.h b/cuda/3d/algo3d.h
new file mode 100644
index 0000000..2b44f6f
--- /dev/null
+++ b/cuda/3d/algo3d.h
@@ -0,0 +1,68 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_ALGO3D_H // guard named after this header; NOTE(review): the old _CUDA_ALGO_H probably clashed with cuda/2d/algo.h - confirm
+#define _CUDA_ALGO3D_H
+
+#include "dims3d.h"
+#include "util3d.h"
+
+namespace astraCUDA3d {
+
+class _AstraExport ReconAlgo3D { // holds a 3D projection geometry and dispatches FP/BP calls on it
+public:
+	ReconAlgo3D();
+	~ReconAlgo3D(); // not virtual
+
+	bool setConeGeometry(const SDimensions3D& dims, const SConeProjection* projs); // copies dims.iProjAngles entries from projs
+	bool setPar3DGeometry(const SDimensions3D& dims, const SPar3DProjection* projs); // copies dims.iProjAngles entries from projs
+
+	void signalAbort() { shouldAbort = true; } // only sets the flag; nothing in this class reads it
+
+protected:
+	void reset();
+
+	bool callFP(cudaPitchedPtr& D_volumeData, 
+	            cudaPitchedPtr& D_projData, 
+	            float outputScale); // ConeFP if coneProjs is set, otherwise Par3DFP
+	bool callBP(cudaPitchedPtr& D_volumeData, 
+	            cudaPitchedPtr& D_projData); // ConeBP if coneProjs is set, otherwise Par3DBP
+
+	SDimensions3D dims;
+	SConeProjection* coneProjs; // array allocated by setConeGeometry(), released with delete[] in reset(); 0 when unused
+	SPar3DProjection* par3DProjs; // array allocated by setPar3DGeometry(), released with delete[] in reset(); 0 when unused
+
+	volatile bool shouldAbort;
+
+};
+
+
+}
+
+#endif
+
diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu
new file mode 100644
index 0000000..9a19be0
--- /dev/null
+++ b/cuda/3d/arith3d.cu
@@ -0,0 +1,610 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include "util3d.h"
+#include "arith3d.h"
+#include <cassert>
+
+namespace astraCUDA3d {
+
+// out += in * inp
+struct opAddScaled {
+	__device__ void operator()(float& out, const float in, const float inp)
+	{ out = out + in * inp; }
+};
+// out = in + out * inp
+struct opScaleAndAdd {
+	__device__ void operator()(float& out, const float in, const float inp)
+	{ out = in + out * inp; }
+};
+// out += in1 * in2 * inp
+struct opAddMulScaled {
+	__device__ void operator()(float& out, const float in1, const float in2, const float inp)
+	{ out = out + in1 * in2 * inp; }
+};
+// out += in1 * in2
+struct opAddMul {
+	__device__ void operator()(float& out, const float in1, const float in2)
+	{ out = out + in1 * in2; }
+};
+// out *= in
+struct opMul {
+	__device__ void operator()(float& out, const float in)
+	{ out = out * in; }
+};
+// out *= (in1 * in2)
+struct opMul2 {
+	__device__ void operator()(float& out, const float in1, const float in2)
+	{ out = out * (in1 * in2); }
+};
+// out = in / out, or 0 when out is not above the 1e-6 threshold
+struct opDividedBy {
+	__device__ void operator()(float& out, const float in) {
+		// out is assumed to be positive
+		const bool usable = out > 0.000001f;
+		out = usable ? in / out : 0.0f;
+	}
+};
+// out = 1 / out, or 0 when out is not above the 1e-6 threshold
+struct opInvert {
+	__device__ void operator()(float& out) {
+		// out is assumed to be positive
+		const bool usable = out > 0.000001f;
+		out = usable ? 1 / out : 0.0f;
+	}
+};
+// out = inp
+struct opSet {
+	__device__ void operator()(float& out, const float inp)
+	{ out = inp; }
+};
+// raises out to inp wherever out < inp
+struct opClampMin {
+	__device__ void operator()(float& out, const float inp) {
+		if (inp > out) out = inp;
+	}
+};
+// lowers out to inp wherever out > inp
+struct opClampMax {
+	__device__ void operator()(float& out, const float inp) {
+		if (inp < out) out = inp;
+	}
+};
+
+
+
+
+// Applies op(element) in place to a width x height region of pfOut (row stride `pitch`, in floats).
+// Expects 16x16 thread blocks; each thread handles up to `repeat` consecutive rows.
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	const unsigned int col = threadIdx.x + 16*blockIdx.x;
+	if (col >= width) return;
+
+	const unsigned int row0 = (threadIdx.y + 16*blockIdx.y)*repeat;
+	unsigned int idx = (row0+padY)*pitch+col+padX; // padX/padY shift the start inside the buffer
+	// walk down the column, stopping at the bottom edge
+	for (unsigned int k = 0; k < repeat && row0 + k < height; ++k) {
+		op()(pfOut[idx]);
+		idx += pitch;
+	}
+}
+
+// Applies op(element, fParam) in place to a width x height region of pfOut (row stride `pitch`, in floats).
+// Expects 16x16 thread blocks; each thread handles up to `repeat` consecutive rows.
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	const unsigned int col = threadIdx.x + 16*blockIdx.x;
+	if (col >= width) return;
+
+	const unsigned int row0 = (threadIdx.y + 16*blockIdx.y)*repeat;
+	unsigned int idx = (row0+padY)*pitch+col+padX; // padX/padY shift the start inside the buffer
+	// walk down the column, stopping at the bottom edge
+	for (unsigned int k = 0; k < repeat && row0 + k < height; ++k) {
+		op()(pfOut[idx], fParam);
+		idx += pitch;
+	}
+}
+
+
+// Applies op(out element, in element) over a width x height region; pfIn uses the same offsets as pfOut.
+// Expects 16x16 thread blocks; each thread handles up to `repeat` consecutive rows.
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	const unsigned int col = threadIdx.x + 16*blockIdx.x;
+	if (col >= width) return;
+
+	const unsigned int row0 = (threadIdx.y + 16*blockIdx.y)*repeat;
+	unsigned int idx = (row0+padY)*pitch+col+padX; // padX/padY shift the start inside the buffers
+	// walk down the column, stopping at the bottom edge
+	for (unsigned int k = 0; k < repeat && row0 + k < height; ++k) {
+		op()(pfOut[idx], pfIn[idx]);
+		idx += pitch;
+	}
+}
+
+// Applies op(out element, in element, fParam) over a width x height region; pfIn uses pfOut's offsets.
+// Expects 16x16 thread blocks; each thread handles up to `repeat` consecutive rows.
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	const unsigned int col = threadIdx.x + 16*blockIdx.x;
+	if (col >= width) return;
+
+	const unsigned int row0 = (threadIdx.y + 16*blockIdx.y)*repeat;
+	unsigned int idx = (row0+padY)*pitch+col+padX; // padX/padY shift the start inside the buffers
+	// walk down the column, stopping at the bottom edge
+	for (unsigned int k = 0; k < repeat && row0 + k < height; ++k) {
+		op()(pfOut[idx], pfIn[idx], fParam);
+		idx += pitch;
+	}
+}
+
+// Applies op(out element, in1 element, in2 element) over a width x height region; all buffers share offsets.
+// Expects 16x16 thread blocks; each thread handles up to `repeat` consecutive rows.
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	const unsigned int col = threadIdx.x + 16*blockIdx.x;
+	if (col >= width) return;
+
+	const unsigned int row0 = (threadIdx.y + 16*blockIdx.y)*repeat;
+	unsigned int idx = (row0+padY)*pitch+col+padX; // padX/padY shift the start inside the buffers
+	// walk down the column, stopping at the bottom edge
+	for (unsigned int k = 0; k < repeat && row0 + k < height; ++k) {
+		op()(pfOut[idx], pfIn1[idx], pfIn2[idx]);
+		idx += pitch;
+	}
+}
+
+// Applies op(out element, in1 element, in2 element, fParam) over a width x height region; buffers share offsets.
+// Expects 16x16 thread blocks; each thread handles up to `repeat` consecutive rows.
+template<class op, unsigned int padX, unsigned int padY, unsigned int repeat>
+__global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	const unsigned int col = threadIdx.x + 16*blockIdx.x;
+	if (col >= width) return;
+
+	const unsigned int row0 = (threadIdx.y + 16*blockIdx.y)*repeat;
+	unsigned int idx = (row0+padY)*pitch+col+padX; // padX/padY shift the start inside the buffers
+	// walk down the column, stopping at the bottom edge
+	for (unsigned int k = 0; k < repeat && row0 + k < height; ++k) {
+		op()(pfOut[idx], pfIn1[idx], pfIn2[idx], fParam);
+		idx += pitch;
+	}
+}
+
+
+
+
+
+
+
+
+
+// Applies op in place to a pitched 2D buffer; t (SINO=0, VOL=1) is passed as the row padding.
+template<typename op, VolType t>
+void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	// one 16x16 block covers 16 columns x 512 rows (32 rows per thread)
+	const dim3 threads(16,16);
+	const dim3 blocks((width+15)/16, (height+511)/512);
+
+	devtoD<op, 1, t, 32><<<blocks, threads>>>((float*)out, pitch, width, height);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, fParam) element-wise to a pitched 2D buffer; t is passed as the row padding.
+template<typename op, VolType t>
+void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	dim3 blockSize(16,16);
+	// Each thread covers 32 rows (repeat), so one block spans 512 rows, not 16.
+	dim3 gridSize((width+15)/16, (height+511)/512);
+	float *pfOut = (float*)out;
+	devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in) element-wise; both buffers share pitch/width/height. t is the row padding.
+template<typename op, VolType t>
+void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	dim3 blockSize(16,16);
+	// Each thread covers 32 rows (repeat), so one block spans 512 rows, not 16.
+	dim3 gridSize((width+15)/16, (height+511)/512);
+	float *pfOut = (float*)out;
+	const float *pfIn = (const float*)in;
+	devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in, fParam) element-wise; both buffers share pitch/width/height. t is the row padding.
+template<typename op, VolType t>
+void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	dim3 blockSize(16,16);
+	// Each thread covers 32 rows (repeat), so one block spans 512 rows, not 16.
+	dim3 gridSize((width+15)/16, (height+511)/512);
+	float *pfOut = (float*)out;
+	const float *pfIn = (const float*)in;
+	devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in1, in2, fParam) element-wise; all buffers share pitch/width/height. t is the row padding.
+template<typename op, VolType t>
+void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	dim3 blockSize(16,16);
+	// Each thread covers 32 rows (repeat), so one block spans 512 rows, not 16.
+	dim3 gridSize((width+15)/16, (height+511)/512);
+	float *pfOut = (float*)out;
+	const float *pfIn1 = (const float*)in1;
+	const float *pfIn2 = (const float*)in2;
+	devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in1, in2) element-wise; all buffers share pitch/width/height. t is the row padding.
+template<typename op, VolType t>
+void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height)
+{
+	dim3 blockSize(16,16);
+	// Each thread covers 32 rows (repeat), so one block spans 512 rows, not 16.
+	dim3 gridSize((width+15)/16, (height+511)/512);
+	float *pfOut = (float*)out;
+	const float *pfIn1 = (const float*)in1;
+	const float *pfIn2 = (const float*)in2;
+	devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height);
+
+	cudaTextForceKernelsCompletion();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+// Applies op in place to every voxel of `out`, launching one kernel per z-slice.
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims)
+{
+	const unsigned int rowElems = out.pitch/sizeof(float);  // row pitch in floats
+	const unsigned int sliceElems = rowElems * dims.iVolY;  // floats per z-slice
+	// one 16x16 block covers 16 columns x 512 rows (32 rows per thread)
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+	float *slice = (float*)out.ptr;
+	for (unsigned int z = 0; z < dims.iVolZ; ++z, slice += sliceElems)
+		devtoD<op, 0, 0, 32><<<blocks, threads>>>(slice, rowElems, dims.iVolX, dims.iVolY);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(voxel, fParam) in place to every voxel of `out`, launching one kernel per z-slice.
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
+{
+	const unsigned int rowElems = out.pitch/sizeof(float);  // row pitch in floats
+	const unsigned int sliceElems = rowElems * dims.iVolY;  // floats per z-slice
+	// one 16x16 block covers 16 columns x 512 rows (32 rows per thread)
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+	float *slice = (float*)out.ptr;
+	for (unsigned int z = 0; z < dims.iVolZ; ++z, slice += sliceElems)
+		devFtoD<op, 0, 0, 32><<<blocks, threads>>>(slice, fParam, rowElems, dims.iVolX, dims.iVolY);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in) voxel-wise, one kernel launch per z-slice.
+// `in` is stepped with strides derived from out.pitch, so both must share a pitch.
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims)
+{
+	const unsigned int rowElems = out.pitch/sizeof(float);  // row pitch in floats
+	const unsigned int sliceElems = rowElems * dims.iVolY;  // floats per z-slice
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+
+	float *dst = (float*)out.ptr;
+	const float *src = (const float*)in.ptr;
+	for (unsigned int z = 0; z < dims.iVolZ; ++z, dst += sliceElems, src += sliceElems)
+		devDtoD<op, 0, 0, 32><<<blocks, threads>>>(dst, src, rowElems, dims.iVolX, dims.iVolY);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in, fParam) voxel-wise, one kernel launch per z-slice.
+// `in` is stepped with strides derived from out.pitch, so both must share a pitch.
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims)
+{
+	const unsigned int rowElems = out.pitch/sizeof(float);  // row pitch in floats
+	const unsigned int sliceElems = rowElems * dims.iVolY;  // floats per z-slice
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+
+	float *dst = (float*)out.ptr;
+	const float *src = (const float*)in.ptr;
+	for (unsigned int z = 0; z < dims.iVolZ; ++z, dst += sliceElems, src += sliceElems)
+		devDFtoD<op, 0, 0, 32><<<blocks, threads>>>(dst, src, fParam, rowElems, dims.iVolX, dims.iVolY);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in1, in2, fParam) voxel-wise, one kernel launch per z-slice.
+// Both inputs are stepped with strides derived from out.pitch, so all three must share a pitch.
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims)
+{
+	const unsigned int rowElems = out.pitch/sizeof(float);  // row pitch in floats
+	const unsigned int sliceElems = rowElems * dims.iVolY;  // floats per z-slice
+	// one 16x16 block covers 16 columns x 512 rows (32 rows per thread)
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+
+	float *dst = (float*)out.ptr;
+	const float *srcA = (const float*)in1.ptr;
+	const float *srcB = (const float*)in2.ptr;
+	for (unsigned int z = 0; z < dims.iVolZ; ++z, dst += sliceElems, srcA += sliceElems, srcB += sliceElems)
+		devDDFtoD<op, 0, 0, 32><<<blocks, threads>>>(dst, srcA, srcB, fParam, rowElems, dims.iVolX, dims.iVolY);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in1, in2) voxel-wise, one kernel launch per z-slice.
+// Both inputs are stepped with strides derived from out.pitch, so all three must share a pitch.
+template<typename op>
+void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims)
+{
+	const unsigned int rowElems = out.pitch/sizeof(float);  // row pitch in floats
+	const unsigned int sliceElems = rowElems * dims.iVolY;  // floats per z-slice
+	// one 16x16 block covers 16 columns x 512 rows (32 rows per thread)
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iVolX+15)/16, (dims.iVolY+511)/512);
+
+	float *dst = (float*)out.ptr;
+	const float *srcA = (const float*)in1.ptr;
+	const float *srcB = (const float*)in2.ptr;
+	for (unsigned int z = 0; z < dims.iVolZ; ++z, dst += sliceElems, srcA += sliceElems, srcB += sliceElems)
+		devDDtoD<op, 0, 0, 32><<<blocks, threads>>>(dst, srcA, srcB, rowElems, dims.iVolX, dims.iVolY);
+
+	cudaTextForceKernelsCompletion();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+// Applies op in place to all projection data in `out`, launching one kernel per iProjV slice.
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims)
+{
+	const unsigned int rowElems = out.pitch/sizeof(float);        // row pitch in floats
+	const unsigned int sliceElems = rowElems * dims.iProjAngles;  // floats per iProjV slice
+	// one 16x16 block covers 16 columns x 512 rows (32 rows per thread)
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+	float *slice = (float*)out.ptr;
+	for (unsigned int v = 0; v < dims.iProjV; ++v, slice += sliceElems)
+		devtoD<op, 0, 0, 32><<<blocks, threads>>>(slice, rowElems, dims.iProjU, dims.iProjAngles);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(element, fParam) in place to all projection data, launching one kernel per iProjV slice.
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims)
+{
+	const unsigned int rowElems = out.pitch/sizeof(float);        // row pitch in floats
+	const unsigned int sliceElems = rowElems * dims.iProjAngles;  // floats per iProjV slice
+	// one 16x16 block covers 16 columns x 512 rows (32 rows per thread)
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+	float *slice = (float*)out.ptr;
+	for (unsigned int v = 0; v < dims.iProjV; ++v, slice += sliceElems)
+		devFtoD<op, 0, 0, 32><<<blocks, threads>>>(slice, fParam, rowElems, dims.iProjU, dims.iProjAngles);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in) element-wise to projection data, one kernel launch per iProjV slice.
+// `in` is stepped with strides derived from out.pitch, so both must share a pitch.
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims)
+{
+	const unsigned int rowElems = out.pitch/sizeof(float);        // row pitch in floats
+	const unsigned int sliceElems = rowElems * dims.iProjAngles;  // floats per iProjV slice
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+
+	float *dst = (float*)out.ptr;
+	const float *src = (const float*)in.ptr;
+	for (unsigned int v = 0; v < dims.iProjV; ++v, dst += sliceElems, src += sliceElems)
+		devDtoD<op, 0, 0, 32><<<blocks, threads>>>(dst, src, rowElems, dims.iProjU, dims.iProjAngles);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in, fParam) element-wise to projection data, one kernel launch per iProjV slice.
+// `in` is stepped with strides derived from out.pitch, so both must share a pitch.
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims)
+{
+	const unsigned int rowElems = out.pitch/sizeof(float);        // row pitch in floats
+	const unsigned int sliceElems = rowElems * dims.iProjAngles;  // floats per iProjV slice
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+
+	float *dst = (float*)out.ptr;
+	const float *src = (const float*)in.ptr;
+	for (unsigned int v = 0; v < dims.iProjV; ++v, dst += sliceElems, src += sliceElems)
+		devDFtoD<op, 0, 0, 32><<<blocks, threads>>>(dst, src, fParam, rowElems, dims.iProjU, dims.iProjAngles);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in1, in2, fParam) element-wise to projection data, one kernel launch per iProjV slice.
+// Both inputs are stepped with strides derived from out.pitch, so all three must share a pitch.
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims)
+{
+	const unsigned int rowElems = out.pitch/sizeof(float);        // row pitch in floats
+	const unsigned int sliceElems = rowElems * dims.iProjAngles;  // floats per iProjV slice
+	// one 16x16 block covers 16 columns x 512 rows (32 rows per thread)
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+
+	float *dst = (float*)out.ptr;
+	const float *srcA = (const float*)in1.ptr;
+	const float *srcB = (const float*)in2.ptr;
+	for (unsigned int v = 0; v < dims.iProjV; ++v, dst += sliceElems, srcA += sliceElems, srcB += sliceElems)
+		devDDFtoD<op, 0, 0, 32><<<blocks, threads>>>(dst, srcA, srcB, fParam, rowElems, dims.iProjU, dims.iProjAngles);
+
+	cudaTextForceKernelsCompletion();
+}
+
+// Applies op(out, in1, in2) element-wise to projection data, one kernel launch per iProjV slice.
+// Both inputs are stepped with strides derived from out.pitch, so all three must share a pitch.
+template<typename op>
+void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims)
+{
+	const unsigned int rowElems = out.pitch/sizeof(float);        // row pitch in floats
+	const unsigned int sliceElems = rowElems * dims.iProjAngles;  // floats per iProjV slice
+	// one 16x16 block covers 16 columns x 512 rows (32 rows per thread)
+	const dim3 threads(16,16);
+	const dim3 blocks((dims.iProjU+15)/16, (dims.iProjAngles+511)/512);
+
+	float *dst = (float*)out.ptr;
+	const float *srcA = (const float*)in1.ptr;
+	const float *srcB = (const float*)in2.ptr;
+	for (unsigned int v = 0; v < dims.iProjV; ++v, dst += sliceElems, srcA += sliceElems, srcB += sliceElems)
+		devDDtoD<op, 0, 0, 32><<<blocks, threads>>>(dst, srcA, srcB, rowElems, dims.iProjU, dims.iProjAngles);
+
+	cudaTextForceKernelsCompletion();
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+// Explicit instantiations. Each INST_* macro instantiates, for one functor, the matching overload
+// of processVol (for VOL and SINO), processVol3D and processSino3D. First: (out, in, fParam).
+#define INST_DFtoD(name) \
+  template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \
+  template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
+// (out, in) overloads
+#define INST_DtoD(name) \
+  template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \
+  template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
+// (out, in1, in2) overloads
+#define INST_DDtoD(name) \
+  template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \
+  template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
+// (out, in1, in2, fParam) overloads
+#define INST_DDFtoD(name) \
+  template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \
+  template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
+
+// (out) overloads
+#define INST_toD(name) \
+  template void processVol<name, VOL>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name, SINO>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \
+  template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims);
+// (out, fParam) overloads
+#define INST_FtoD(name) \
+  template void processVol<name, VOL>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol<name, SINO>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \
+  template void processVol3D<name>(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); \
+  template void processSino3D<name>(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims);
+
+// One line per functor. The macro chosen has to match the argument list of the functor's
+// operator(), because the instantiated kernel calls op()(...) with exactly those arguments.
+INST_DFtoD(opAddScaled)
+INST_DFtoD(opScaleAndAdd)
+INST_DDFtoD(opAddMulScaled)
+INST_DDtoD(opAddMul)
+INST_DDtoD(opMul2)
+INST_DtoD(opMul)
+INST_DtoD(opDividedBy)
+INST_toD(opInvert)
+INST_FtoD(opSet)
+INST_FtoD(opClampMin)
+INST_FtoD(opClampMax)
+
+
+}
diff --git a/cuda/3d/arith3d.h b/cuda/3d/arith3d.h
new file mode 100644
index 0000000..53c9b79
--- /dev/null
+++ b/cuda/3d/arith3d.h
@@ -0,0 +1,79 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_ARITH3D_H
+#define _CUDA_ARITH3D_H
+
+#include <cuda.h>
+
+namespace astraCUDA3d {
+// Element-wise functors, used as the `op` template argument below; defined in arith3d.cu.
+struct opAddScaled;
+struct opScaleAndAdd;
+struct opAddMulScaled;
+struct opAddMul;
+struct opMul;
+struct opMul2;
+struct opDividedBy;
+struct opInvert;
+struct opSet;
+struct opClampMin;
+struct opClampMax;
+// Second template argument of processVol; arith3d.cu passes its value on as the row padding.
+enum VolType {
+  SINO = 0,
+  VOL = 1
+};
+
+// 2D buffers addressed with an explicit pitch, width and height.
+template<typename op, VolType t> void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height);
+template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height);
+// 3D volumes, sized by dims.iVolX / iVolY / iVolZ.
+template<typename op> void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
+template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
+// 3D projection data, sized by dims.iProjU / iProjAngles / iProjV.
+template<typename op> void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims);
+template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims);
+// NOTE(review): SDimensions3D is not declared by anything this header includes itself, and
+// cudaPitchedPtr possibly is not either; it apparently relies on util3d.h being included first.
+// Only the op/overload combinations explicitly instantiated in arith3d.cu can be linked against.
+}
+
+#endif
diff --git a/cuda/3d/astra3d.cu b/cuda/3d/astra3d.cu
new file mode 100644
index 0000000..fd4b370
--- /dev/null
+++ b/cuda/3d/astra3d.cu
@@ -0,0 +1,1620 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "cgls3d.h"
+#include "sirt3d.h"
+#include "util3d.h"
+#include "cone_fp.h"
+#include "cone_bp.h"
+#include "par3d_fp.h"
+#include "par3d_bp.h"
+#include "fdk.h"
+#include "arith3d.h"
+#include "astra3d.h"
+
+#include <iostream>
+
+using namespace astraCUDA3d;
+
+namespace astra {
+
+enum CUDAProjectionType3d {
+	PROJ_PARALLEL, // selected by AstraSIRT3d::setPar3DGeometry()
+	PROJ_CONE // selected by AstraSIRT3d::setConeGeometry()
+};
+
+
+// Generates one cone-beam projection per angle (new[]-allocated; the caller must delete[] it) by
+// rotating a reference setup with the source at (0, -fOriginSourceDistance, 0) about the Z axis.
+static SConeProjection* genConeProjections(unsigned int iProjAngles,
+                                           unsigned int iProjU,
+                                           unsigned int iProjV,
+                                           double fOriginSourceDistance,
+                                           double fOriginDetectorDistance,
+                                           double fDetUSize,
+                                           double fDetVSize,
+                                           const float *pfAngles)
+{
+	SConeProjection ref;
+	ref.fSrcX = 0.0f;
+	ref.fSrcY = -fOriginSourceDistance;
+	ref.fSrcZ = 0.0f;
+
+	// DetS: shifted by half the detector extent in X and Z (presumably the detector's start corner)
+	ref.fDetSX = iProjU * fDetUSize * -0.5f;
+	ref.fDetSY = fOriginDetectorDistance;
+	ref.fDetSZ = iProjV * fDetVSize * -0.5f;
+
+	// DetU / DetV: one detector pixel size along X and along Z respectively
+	ref.fDetUX = fDetUSize;
+	ref.fDetUY = 0.0f;
+	ref.fDetUZ = 0.0f;
+	ref.fDetVX = 0.0f;
+	ref.fDetVY = 0.0f;
+	ref.fDetVZ = fDetVSize;
+
+	SConeProjection* out = new SConeProjection[iProjAngles];
+
+#define ROTATE_Z(name,i,alpha) do { out[i].f##name##X = ref.f##name##X * cos(alpha) - ref.f##name##Y * sin(alpha); out[i].f##name##Y = ref.f##name##X * sin(alpha) + ref.f##name##Y * cos(alpha); out[i].f##name##Z = ref.f##name##Z; } while(0)
+	for (unsigned int i = 0; i < iProjAngles; ++i) {
+		ROTATE_Z(Src, i, pfAngles[i]);
+		ROTATE_Z(DetS, i, pfAngles[i]);
+		ROTATE_Z(DetU, i, pfAngles[i]);
+		ROTATE_Z(DetV, i, pfAngles[i]);
+	}
+#undef ROTATE_Z
+	return out;
+}
+
+// Generates one parallel-beam projection per angle (new[]-allocated; the caller must delete[] it)
+// by rotating a reference setup with ray direction (0, 1, 0) about the Z axis.
+static SPar3DProjection* genPar3DProjections(unsigned int iProjAngles,
+                                             unsigned int iProjU,
+                                             unsigned int iProjV,
+                                             double fDetUSize,
+                                             double fDetVSize,
+                                             const float *pfAngles)
+{
+	SPar3DProjection ref;
+	ref.fRayX = 0.0f;
+	ref.fRayY = 1.0f;
+	ref.fRayZ = 0.0f;
+
+	// DetS: shifted by half the detector extent in X and Z (presumably the detector's start corner)
+	ref.fDetSX = iProjU * fDetUSize * -0.5f;
+	ref.fDetSY = 0.0f;
+	ref.fDetSZ = iProjV * fDetVSize * -0.5f;
+
+	// DetU / DetV: one detector pixel size along X and along Z respectively
+	ref.fDetUX = fDetUSize;
+	ref.fDetUY = 0.0f;
+	ref.fDetUZ = 0.0f;
+	ref.fDetVX = 0.0f;
+	ref.fDetVY = 0.0f;
+	ref.fDetVZ = fDetVSize;
+
+	SPar3DProjection* out = new SPar3DProjection[iProjAngles];
+
+#define ROTATE_Z(name,i,alpha) do { out[i].f##name##X = ref.f##name##X * cos(alpha) - ref.f##name##Y * sin(alpha); out[i].f##name##Y = ref.f##name##X * sin(alpha) + ref.f##name##Y * cos(alpha); out[i].f##name##Z = ref.f##name##Z; } while(0)
+	for (unsigned int i = 0; i < iProjAngles; ++i) {
+		ROTATE_Z(Ray, i, pfAngles[i]);
+		ROTATE_Z(DetS, i, pfAngles[i]);
+		ROTATE_Z(DetU, i, pfAngles[i]);
+		ROTATE_Z(DetV, i, pfAngles[i]);
+	}
+#undef ROTATE_Z
+	return out;
+}
+
+
+
+
+class AstraSIRT3d_internal { // private state reached through AstraSIRT3d::pData
+public:
+	SDimensions3D dims;
+	CUDAProjectionType3d projType; // PROJ_PARALLEL or PROJ_CONE, chosen by the geometry setters
+
+	float* angles; // nulled in the constructor, delete[]'d in ~AstraSIRT3d()
+	float fOriginSourceDistance;
+	float fOriginDetectorDistance;
+	float fSourceZ;
+	float fDetSize;
+
+	SConeProjection* projs; // cone-beam geometry, one entry per angle (new[])
+	SPar3DProjection* parprojs; // parallel-beam geometry, one entry per angle (new[])
+
+	float fPixelSize;
+
+	bool initialized; // while true, the configuration setters return false
+	bool setStartReconstruction;
+
+	bool useVolumeMask; // result of sirt.enableVolumeMask()
+	bool useSinogramMask; // result of sirt.enableSinogramMask()
+
+	// Input/output; each .ptr is passed to cudaFree() in ~AstraSIRT3d()
+	cudaPitchedPtr D_projData;
+	cudaPitchedPtr D_volumeData;
+	cudaPitchedPtr D_maskData;
+	cudaPitchedPtr D_smaskData;
+
+	SIRT sirt;
+};
+
+// Creates the private state: no buffers, no geometry, zero sizes, one ray per detector
+// pixel and per voxel, and all optional features switched off.
+AstraSIRT3d::AstraSIRT3d()
+{
+	pData = new AstraSIRT3d_internal();
+
+	// host and device pointers start out null
+	pData->angles = 0;
+	pData->D_projData.ptr = 0;
+	pData->D_volumeData.ptr = 0;
+	pData->D_maskData.ptr = 0;
+	pData->D_smaskData.ptr = 0;
+
+	pData->dims.iVolX = 0;
+	pData->dims.iVolY = 0;
+	pData->dims.iVolZ = 0;
+	pData->dims.iProjAngles = 0;
+	pData->dims.iProjU = 0;
+	pData->dims.iProjV = 0;
+	pData->dims.iRaysPerDetDim = 1;
+	pData->dims.iRaysPerVoxelDim = 1;
+
+	// Null both geometry arrays explicitly so the destructor can delete[] either one.
+	pData->projs = 0;
+	pData->parprojs = 0;
+
+	pData->initialized = false;
+	pData->setStartReconstruction = false;
+
+	pData->useVolumeMask = false;
+	pData->useSinogramMask = false;
+}
+
+// Frees the host-side arrays (including the parallel-beam geometry, which used to leak)
+// and the device buffers, then the private state itself.
+AstraSIRT3d::~AstraSIRT3d()
+{
+	delete[] pData->angles;
+	delete[] pData->projs;
+	delete[] pData->parprojs;
+
+	// Pointers that were never allocated are still 0 here, which cudaFree accepts.
+	cudaFree(pData->D_projData.ptr);
+	cudaFree(pData->D_volumeData.ptr);
+	cudaFree(pData->D_maskData.ptr);
+	cudaFree(pData->D_smaskData.ptr);
+
+	// No need to null the members individually: the whole object goes away.
+	delete pData;
+	pData = 0;
+}
+
+// Stores the volume size unless already initialized; true iff all sizes are non-zero (stored either way).
+bool AstraSIRT3d::setReconstructionGeometry(unsigned int iVolX,
+                                            unsigned int iVolY,
+                                            unsigned int iVolZ/*,
+                                            float fPixelSize = 1.0f*/)
+{
+	if (pData->initialized) return false;
+
+	SDimensions3D& d = pData->dims;
+	d.iVolX = iVolX;
+	d.iVolY = iVolY;
+	d.iVolZ = iVolZ;
+	return !(iVolX == 0 || iVolY == 0 || iVolZ == 0);
+}
+
+
+// Installs a parallel-beam geometry from explicit per-angle projections (copied).
+// Fails without changing any state if initialized, a size is zero, or projs is null.
+bool AstraSIRT3d::setPar3DGeometry(unsigned int iProjAngles,
+                                   unsigned int iProjU,
+                                   unsigned int iProjV,
+                                   const SPar3DProjection* projs)
+{
+	if (pData->initialized)
+		return false;
+	// Validate before storing anything, as the angle-based overload does.
+	if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0)
+		return false;
+
+	pData->dims.iProjAngles = iProjAngles;
+	pData->dims.iProjU = iProjU;
+	pData->dims.iProjV = iProjV;
+
+	pData->parprojs = new SPar3DProjection[iProjAngles]; // NOTE(review): an array from an earlier call is not released
+	memcpy(pData->parprojs, projs, iProjAngles * sizeof(projs[0]));
+	pData->projType = PROJ_PARALLEL;
+	return true;
+}
+
+// Installs a parallel-beam geometry generated from detector pixel sizes and angles.
+// Fails without changing any state if initialized, a size is zero, or pfAngles is null.
+bool AstraSIRT3d::setPar3DGeometry(unsigned int iProjAngles,
+                                   unsigned int iProjU,
+                                   unsigned int iProjV,
+                                   float fDetUSize,
+                                   float fDetVSize,
+                                   const float *pfAngles)
+{
+	// reject reconfiguration after init as well as empty or missing input
+	const bool sizesOk = !(iProjAngles == 0 || iProjU == 0 || iProjV == 0);
+	if (pData->initialized || !sizesOk || pfAngles == 0)
+		return false;
+
+	// one SPar3DProjection per angle, allocated by the generator with new[]
+	SPar3DProjection* generated = genPar3DProjections(iProjAngles, iProjU, iProjV,
+	                                                  fDetUSize, fDetVSize, pfAngles);
+
+	pData->dims.iProjAngles = iProjAngles;
+	pData->dims.iProjU = iProjU;
+	pData->dims.iProjV = iProjV;
+	pData->parprojs = generated;
+	pData->projType = PROJ_PARALLEL;
+
+	return true;
+}
+
+
+
+// Installs a cone-beam geometry from explicit per-angle projections (copied).
+// Fails without changing any state if initialized, a size is zero, or projs is null.
+bool AstraSIRT3d::setConeGeometry(unsigned int iProjAngles,
+                                  unsigned int iProjU,
+                                  unsigned int iProjV,
+                                  const SConeProjection* projs)
+{
+	if (pData->initialized)
+		return false;
+	if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0)
+		return false;
+
+	SConeProjection* copy = new SConeProjection[iProjAngles];
+	memcpy(copy, projs, iProjAngles * sizeof(projs[0]));
+	delete[] pData->projs; // 0 after construction; otherwise the array left by an earlier call
+	pData->projs = copy;
+	pData->dims.iProjAngles = iProjAngles;
+	pData->dims.iProjU = iProjU;
+	pData->dims.iProjV = iProjV;
+	pData->projType = PROJ_CONE;
+	return true;
+}
+
+// Configure a cone-beam geometry from a list of projection angles.
+//
+// The angles, source/detector distances and detector sizes are converted to
+// SConeProjection entries by genConeProjections(); the resulting array is
+// stored in pData->projs.
+// Returns false if init() has already completed, if any projection dimension
+// is zero, or if pfAngles is null.
+// NOTE(review): an array stored in pData->projs by an earlier call is
+// overwritten here without being released.
+bool AstraSIRT3d::setConeGeometry(unsigned int iProjAngles,
+                                  unsigned int iProjU,
+                                  unsigned int iProjV,
+                                  float fOriginSourceDistance,
+                                  float fOriginDetectorDistance,
+                                  float fDetUSize,
+                                  float fDetVSize,
+                                  const float *pfAngles)
+{
+	// The geometry can no longer change once the object is initialized.
+	if (pData->initialized)
+		return false;
+
+	const bool emptyProjections = (iProjAngles == 0 || iProjU == 0 || iProjV == 0);
+	if (emptyProjections || pfAngles == 0)
+		return false;
+
+	pData->projs = genConeProjections(iProjAngles, iProjU, iProjV,
+	                                  fOriginSourceDistance,
+	                                  fOriginDetectorDistance,
+	                                  fDetUSize, fDetVSize, pfAngles);
+	pData->projType = PROJ_CONE;
+
+	pData->dims.iProjAngles = iProjAngles;
+	pData->dims.iProjU = iProjU;
+	pData->dims.iProjV = iProjV;
+
+	return true;
+}
+
+// Store the supersampling factors in the dimensions structure:
+// iVoxelSuperSampling becomes dims.iRaysPerVoxelDim and
+// iDetectorSuperSampling becomes dims.iRaysPerDetDim.
+// Returns false if init() has already completed or either factor is zero.
+bool AstraSIRT3d::enableSuperSampling(unsigned int iVoxelSuperSampling,
+                                      unsigned int iDetectorSuperSampling)
+{
+	const bool zeroFactor = (iVoxelSuperSampling == 0) || (iDetectorSuperSampling == 0);
+	if (pData->initialized || zeroFactor)
+		return false;
+
+	pData->dims.iRaysPerDetDim = iDetectorSuperSampling;
+	pData->dims.iRaysPerVoxelDim = iVoxelSuperSampling;
+
+	return true;
+}
+
+// Ask the SIRT object to enable its volume mask and remember the outcome in
+// pData->useVolumeMask, which init() consults when allocating buffers.
+// Must be called before init(); returns false otherwise.
+bool AstraSIRT3d::enableVolumeMask()
+{
+	if (pData->initialized)
+		return false;
+
+	const bool enabled = pData->sirt.enableVolumeMask();
+	pData->useVolumeMask = enabled;
+	return enabled;
+}
+
+// Ask the SIRT object to enable its sinogram mask and remember the outcome in
+// pData->useSinogramMask, which init() consults when allocating buffers.
+// Must be called before init(); returns false otherwise.
+bool AstraSIRT3d::enableSinogramMask()
+{
+	if (pData->initialized)
+		return false;
+
+	const bool enabled = pData->sirt.enableSinogramMask();
+	pData->useSinogramMask = enabled;
+	return enabled;
+}
+	
+// Select the CUDA device with the given index.
+// Returns true if the last CUDA error after cudaSetDevice() is either
+// cudaSuccess or cudaErrorSetOnActiveProcess.
+bool AstraSIRT3d::setGPUIndex(int index)
+{
+	cudaSetDevice(index);
+	const cudaError_t status = cudaGetLastError();
+
+	// Errors caused by calling cudaSetDevice multiple times are ignored.
+	return status == cudaSuccess || status == cudaErrorSetOnActiveProcess;
+}
+
+// Finish configuration: pass the stored geometry to the SIRT object,
+// initialize it, and allocate the device buffers (volume, projections and,
+// when enabled, the volume and sinogram masks).
+//
+// Returns false if the object is already initialized, if no volume or
+// projection geometry has been stored (dims.iVolX or dims.iProjAngles is
+// zero), or if any step fails. Buffers allocated earlier in the same call are
+// freed again when a later allocation fails.
+bool AstraSIRT3d::init()
+{
+	if (pData->initialized)
+		return false;
+
+	if (pData->dims.iVolX == 0 || pData->dims.iProjAngles == 0)
+		return false;
+
+	bool ok;
+
+	if (pData->projType == PROJ_PARALLEL) {
+		ok = pData->sirt.setPar3DGeometry(pData->dims, pData->parprojs);
+	} else {
+		ok = pData->sirt.setConeGeometry(pData->dims, pData->projs);
+	}
+
+	if (!ok)
+		return false;
+
+	ok = pData->sirt.init();
+	if (!ok)
+		return false;
+
+	pData->D_volumeData = allocateVolumeData(pData->dims);
+	ok = (pData->D_volumeData.ptr != 0);
+	if (!ok)
+		return false;
+
+	pData->D_projData = allocateProjectionData(pData->dims);
+	ok = (pData->D_projData.ptr != 0);
+	if (!ok) {
+		cudaFree(pData->D_volumeData.ptr);
+		pData->D_volumeData.ptr = 0;
+		return false;
+	}
+
+	if (pData->useVolumeMask) {
+		pData->D_maskData = allocateVolumeData(pData->dims);
+		ok = (pData->D_maskData.ptr != 0);
+		if (!ok) {
+			cudaFree(pData->D_volumeData.ptr);
+			cudaFree(pData->D_projData.ptr);
+			pData->D_volumeData.ptr = 0;
+			pData->D_projData.ptr = 0;
+			return false;
+		}
+	}
+
+	if (pData->useSinogramMask) {
+		pData->D_smaskData = allocateProjectionData(pData->dims);
+		ok = (pData->D_smaskData.ptr != 0);
+		if (!ok) {
+			// D_maskData is freed unconditionally here, also when no volume
+			// mask was requested.
+			cudaFree(pData->D_volumeData.ptr);
+			cudaFree(pData->D_projData.ptr);
+			cudaFree(pData->D_maskData.ptr);
+			pData->D_volumeData.ptr = 0;
+			pData->D_projData.ptr = 0;
+			pData->D_maskData.ptr = 0;
+			return false;
+		}
+	}
+
+	pData->initialized = true;
+
+	return true;
+}
+
+// Forward fMin to the SIRT object's setMinConstraint().
+// Only valid after init(); returns false when called earlier.
+bool AstraSIRT3d::setMinConstraint(float fMin)
+{
+	if (pData->initialized)
+		return pData->sirt.setMinConstraint(fMin);
+
+	return false;
+}
+
+// Forward fMax to the SIRT object's setMaxConstraint().
+// Only valid after init(); returns false when called earlier.
+bool AstraSIRT3d::setMaxConstraint(float fMax)
+{
+	if (pData->initialized)
+		return pData->sirt.setMaxConstraint(fMax);
+
+	return false;
+}
+
+// Copy a sinogram from host memory into the device projection buffer and
+// hand the volume/projection buffers to the SIRT object.
+//
+// iSinogramPitch is passed through to copyProjectionsToDevice().
+// On success the start-reconstruction flag is cleared, so the next iterate()
+// begins by zeroing the volume.
+// Returns false before init(), for a null pfSinogram, or if a step fails.
+bool AstraSIRT3d::setSinogram(const float* pfSinogram,
+                              unsigned int iSinogramPitch)
+{
+	if (!pData->initialized || !pfSinogram)
+		return false;
+
+	if (!copyProjectionsToDevice(pfSinogram, pData->D_projData, pData->dims, iSinogramPitch))
+		return false;
+
+	if (!pData->sirt.setBuffers(pData->D_volumeData, pData->D_projData))
+		return false;
+
+	pData->setStartReconstruction = false;
+
+	return true;
+}
+
+// Copy a volume mask from host memory into the device mask buffer and pass
+// that buffer to the SIRT object.
+//
+// iMaskPitch is passed through to copyVolumeToDevice().
+// Returns false before init(), when the volume mask was not enabled, for a
+// null pfMask, or if the copy or the SIRT call fails.
+bool AstraSIRT3d::setVolumeMask(const float* pfMask, unsigned int iMaskPitch)
+{
+	if (!pData->initialized || !pData->useVolumeMask || !pfMask)
+		return false;
+
+	const bool copied = copyVolumeToDevice(pfMask, pData->D_maskData,
+	                                       pData->dims, iMaskPitch);
+	if (!copied)
+		return false;
+
+	const bool accepted = pData->sirt.setVolumeMask(pData->D_maskData);
+	return accepted;
+}
+
+// Copy a sinogram mask from host memory into the device sinogram-mask buffer
+// and pass that buffer to the SIRT object.
+//
+// iMaskPitch is passed through to copyProjectionsToDevice().
+// Returns false before init(), when the sinogram mask was not enabled, for a
+// null pfMask, or if the copy or the SIRT call fails.
+bool AstraSIRT3d::setSinogramMask(const float* pfMask, unsigned int iMaskPitch)
+{
+	if (!pData->initialized || !pData->useSinogramMask || !pfMask)
+		return false;
+
+	const bool copied = copyProjectionsToDevice(pfMask, pData->D_smaskData,
+	                                            pData->dims, iMaskPitch);
+	if (!copied)
+		return false;
+
+	const bool accepted = pData->sirt.setSinogramMask(pData->D_smaskData);
+	return accepted;
+}
+
+// Copy an initial reconstruction from host memory into the device volume
+// buffer and set the start-reconstruction flag, which makes iterate() skip
+// zeroing the volume.
+//
+// iReconstructionPitch is passed through to copyVolumeToDevice().
+// Returns false before init(), for a null pfReconstruction, or if the copy
+// fails.
+bool AstraSIRT3d::setStartReconstruction(const float* pfReconstruction,
+                                         unsigned int iReconstructionPitch)
+{
+	if (!pData->initialized || !pfReconstruction)
+		return false;
+
+	const bool copied = copyVolumeToDevice(pfReconstruction, pData->D_volumeData,
+	                                       pData->dims, iReconstructionPitch);
+	if (copied)
+		pData->setStartReconstruction = true;
+
+	return copied;
+}
+
+// Run iIterations iterations of the SIRT object.
+//
+// Unless a start reconstruction has been supplied since the last
+// setSinogram(), the device volume is zeroed first.
+// Returns false before init(), if zeroing the volume fails, or if the SIRT
+// object reports failure.
+bool AstraSIRT3d::iterate(unsigned int iIterations)
+{
+	if (!pData->initialized)
+		return false;
+
+	if (!pData->setStartReconstruction) {
+		// Do not iterate on a volume whose contents could not be cleared.
+		if (!zeroVolumeData(pData->D_volumeData, pData->dims))
+			return false;
+	}
+
+	bool ok = pData->sirt.iterate(iIterations);
+	if (!ok)
+		return false;
+
+	return true;
+}
+
+// Copy the current reconstruction from the device volume buffer to host
+// memory.
+//
+// iReconstructionPitch is passed through to copyVolumeFromDevice().
+// Returns false before init(), for a null pfReconstruction, or if the copy
+// fails.
+bool AstraSIRT3d::getReconstruction(float* pfReconstruction,
+                                    unsigned int iReconstructionPitch) const
+{
+	if (!pData->initialized)
+		return false;
+	// Reject a null destination, matching the null checks in the setters.
+	if (!pfReconstruction)
+		return false;
+
+	bool ok = copyVolumeFromDevice(pfReconstruction, pData->D_volumeData,
+	                               pData->dims, iReconstructionPitch);
+	if (!ok)
+		return false;
+
+	return true;
+}
+
+// Forward an abort request to the SIRT object; ignored before init().
+void AstraSIRT3d::signalAbort()
+{
+	if (pData->initialized)
+		pData->sirt.signalAbort();
+}
+
+// Return the value of the SIRT object's computeDiffNorm().
+// Before init() there is nothing to compute and 0.0f is returned.
+float AstraSIRT3d::computeDiffNorm()
+{
+	if (pData->initialized)
+		return pData->sirt.computeDiffNorm();
+
+	return 0.0f; // FIXME: Error?
+}
+
+
+
+
+// Private state of AstraCGLS3d: the configured geometry, the device buffers
+// allocated by AstraCGLS3d::init() and the CGLS object that the public
+// methods forward to.
+class AstraCGLS3d_internal {
+public:
+	// Volume and projection dimensions plus the supersampling factors.
+	SDimensions3D dims;
+	// PROJ_PARALLEL or PROJ_CONE; recorded by the geometry setters and used by
+	// init() to choose which geometry is handed to cgls.
+	CUDAProjectionType3d projType;
+
+	// Released with delete[] in ~AstraCGLS3d(); the constructor sets it to 0.
+	float* angles;
+	// NOTE(review): the four values below are not referenced by any
+	// AstraCGLS3d method visible in this part of the file - confirm whether
+	// they are still needed.
+	float fOriginSourceDistance;
+	float fOriginDetectorDistance;
+	float fSourceZ;
+	float fDetSize;
+
+	// Cone-beam projections, stored by setConeGeometry().
+	SConeProjection* projs;
+	// Parallel-beam projections, stored by setPar3DGeometry().
+	SPar3DProjection* parprojs;
+
+	// NOTE(review): not referenced by the visible AstraCGLS3d methods either.
+	float fPixelSize;
+
+	// Set to true at the end of a successful init().
+	bool initialized;
+	// True after setStartReconstruction(), cleared by setSinogram(); when
+	// false, iterate() zeroes the volume before iterating.
+	bool setStartReconstruction;
+
+	// Whether init() allocates D_maskData / D_smaskData.
+	bool useVolumeMask;
+	bool useSinogramMask;
+
+	// Input/output
+	// Device buffers; init() fills them from allocateProjectionData() and
+	// allocateVolumeData(), and ~AstraCGLS3d() releases them with cudaFree().
+	cudaPitchedPtr D_projData;
+	cudaPitchedPtr D_volumeData;
+	cudaPitchedPtr D_maskData;
+	cudaPitchedPtr D_smaskData;
+
+	// The object that receives the geometry, buffers and iterate() calls.
+	CGLS cgls;
+};
+
+// Create an empty, uninitialized object: all host and device pointers are
+// null, all dimensions are zero and both supersampling factors are 1.
+AstraCGLS3d::AstraCGLS3d()
+{
+	pData = new AstraCGLS3d_internal();
+
+	pData->angles = 0;
+	pData->D_projData.ptr = 0;
+	pData->D_volumeData.ptr = 0;
+	pData->D_maskData.ptr = 0;
+	pData->D_smaskData.ptr = 0;
+
+	pData->dims.iVolX = 0;
+	pData->dims.iVolY = 0;
+	pData->dims.iVolZ = 0;
+	pData->dims.iProjAngles = 0;
+	pData->dims.iProjU = 0;
+	pData->dims.iProjV = 0;
+	pData->dims.iRaysPerDetDim = 1;
+	pData->dims.iRaysPerVoxelDim = 1;
+
+	pData->projs = 0;
+	// Cleared explicitly, like projs, so the destructor can release it
+	// unconditionally.
+	pData->parprojs = 0;
+
+	pData->initialized = false;
+	pData->setStartReconstruction = false;
+
+	pData->useVolumeMask = false;
+	pData->useSinogramMask = false;
+}
+
+// Release the host-side geometry arrays and the device buffers, then the
+// internal state object itself.
+AstraCGLS3d::~AstraCGLS3d()
+{
+	delete[] pData->angles;
+	pData->angles = 0;
+
+	delete[] pData->projs;
+	pData->projs = 0;
+
+	// The parallel-beam array is allocated with new[] on behalf of
+	// setPar3DGeometry(); it was previously never released.
+	delete[] pData->parprojs;
+	pData->parprojs = 0;
+
+	cudaFree(pData->D_projData.ptr);
+	pData->D_projData.ptr = 0;
+
+	cudaFree(pData->D_volumeData.ptr);
+	pData->D_volumeData.ptr = 0;
+
+	cudaFree(pData->D_maskData.ptr);
+	pData->D_maskData.ptr = 0;
+
+	cudaFree(pData->D_smaskData.ptr);
+	pData->D_smaskData.ptr = 0;
+
+	delete pData;
+	pData = 0;
+}
+
+// Store the volume dimensions.
+// The values are stored even when one of them is zero; the return value is
+// true only if all three are non-zero. Returns false without storing
+// anything once init() has completed.
+bool AstraCGLS3d::setReconstructionGeometry(unsigned int iVolX,
+                                            unsigned int iVolY,
+                                            unsigned int iVolZ/*,
+                                            float fPixelSize = 1.0f*/)
+{
+	if (pData->initialized)
+		return false;
+
+	pData->dims.iVolZ = iVolZ;
+	pData->dims.iVolY = iVolY;
+	pData->dims.iVolX = iVolX;
+
+	const bool anyEmpty = (iVolX == 0) || (iVolY == 0) || (iVolZ == 0);
+	return !anyEmpty;
+}
+
+
+// Configure a parallel-beam geometry from an explicit array of projections.
+//
+// projs must point to iProjAngles SPar3DProjection entries; they are copied
+// into a newly allocated array stored in pData->parprojs.
+// Returns false, leaving the stored geometry untouched, if init() has
+// already completed, if any projection dimension is zero, or if projs is
+// null.
+// NOTE(review): an array stored in pData->parprojs by an earlier call is
+// overwritten here without being released.
+bool AstraCGLS3d::setPar3DGeometry(unsigned int iProjAngles,
+                                   unsigned int iProjU,
+                                   unsigned int iProjV,
+                                   const SPar3DProjection* projs)
+{
+	if (pData->initialized)
+		return false;
+
+	// Validate before modifying any state: init() only checks
+	// dims.iProjAngles, so a rejected call must not leave non-zero
+	// projection dimensions behind without a projection array and type.
+	if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0)
+		return false;
+
+	pData->dims.iProjAngles = iProjAngles;
+	pData->dims.iProjU = iProjU;
+	pData->dims.iProjV = iProjV;
+
+	pData->parprojs = new SPar3DProjection[iProjAngles];
+	memcpy(pData->parprojs, projs, iProjAngles * sizeof(projs[0]));
+
+	pData->projType = PROJ_PARALLEL;
+
+	return true;
+}
+
+// Configure a parallel-beam geometry from a list of projection angles.
+//
+// The angles and detector sizes are converted to SPar3DProjection entries by
+// genPar3DProjections(); the resulting array is stored in pData->parprojs.
+// Returns false if init() has already completed, if any projection dimension
+// is zero, or if pfAngles is null.
+// NOTE(review): an array stored in pData->parprojs by an earlier call is
+// overwritten here without being released.
+bool AstraCGLS3d::setPar3DGeometry(unsigned int iProjAngles,
+                                   unsigned int iProjU,
+                                   unsigned int iProjV,
+                                   float fDetUSize,
+                                   float fDetVSize,
+                                   const float *pfAngles)
+{
+	// The geometry can no longer change once the object is initialized.
+	if (pData->initialized)
+		return false;
+
+	const bool emptyProjections = (iProjAngles == 0 || iProjU == 0 || iProjV == 0);
+	if (emptyProjections || pfAngles == 0)
+		return false;
+
+	pData->parprojs = genPar3DProjections(iProjAngles, iProjU, iProjV,
+	                                      fDetUSize, fDetVSize, pfAngles);
+	pData->projType = PROJ_PARALLEL;
+
+	pData->dims.iProjAngles = iProjAngles;
+	pData->dims.iProjU = iProjU;
+	pData->dims.iProjV = iProjV;
+
+	return true;
+}
+
+
+
+// Configure a cone-beam geometry from an explicit array of projections.
+//
+// projs must point to iProjAngles SConeProjection entries; they are copied
+// into a newly allocated array that replaces pData->projs and is released by
+// the destructor.
+// Returns false, leaving the stored geometry untouched, if init() has
+// already completed, if any projection dimension is zero, or if projs is
+// null.
+bool AstraCGLS3d::setConeGeometry(unsigned int iProjAngles,
+                                  unsigned int iProjU,
+                                  unsigned int iProjV,
+                                  const SConeProjection* projs)
+{
+	if (pData->initialized)
+		return false;
+
+	// Validate before modifying any state: init() only checks
+	// dims.iProjAngles, so a rejected call must not leave non-zero
+	// projection dimensions behind without a projection array and type.
+	if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0)
+		return false;
+
+	pData->dims.iProjAngles = iProjAngles;
+	pData->dims.iProjU = iProjU;
+	pData->dims.iProjV = iProjV;
+
+	// Build the copy first, then release the array from any earlier call so
+	// that repeated configuration does not leak it.
+	SConeProjection* copy = new SConeProjection[iProjAngles];
+	memcpy(copy, projs, iProjAngles * sizeof(projs[0]));
+
+	delete[] pData->projs;
+	pData->projs = copy;
+
+	pData->projType = PROJ_CONE;
+
+	return true;
+}
+
+// Configure a cone-beam geometry from a list of projection angles.
+//
+// The angles, source/detector distances and detector sizes are converted to
+// SConeProjection entries by genConeProjections(); the resulting array
+// replaces pData->projs and is released by the destructor.
+// Returns false if init() has already completed, if any projection dimension
+// is zero, or if pfAngles is null.
+bool AstraCGLS3d::setConeGeometry(unsigned int iProjAngles,
+                                  unsigned int iProjU,
+                                  unsigned int iProjV,
+                                  float fOriginSourceDistance,
+                                  float fOriginDetectorDistance,
+                                  float fDetUSize,
+                                  float fDetVSize,
+                                  const float *pfAngles)
+{
+	if (pData->initialized)
+		return false;
+
+	if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+		return false;
+
+	SConeProjection* p = genConeProjections(iProjAngles,
+                                            iProjU, iProjV,
+                                            fOriginSourceDistance,
+                                            fOriginDetectorDistance,
+                                            fDetUSize, fDetVSize,
+                                            pfAngles);
+
+	pData->dims.iProjAngles = iProjAngles;
+	pData->dims.iProjU = iProjU;
+	pData->dims.iProjV = iProjV;
+
+	// Release the array from any earlier call so that repeated
+	// configuration does not leak it.
+	delete[] pData->projs;
+	pData->projs = p;
+	pData->projType = PROJ_CONE;
+
+	return true;
+}
+
+// Store the supersampling factors in the dimensions structure:
+// iVoxelSuperSampling becomes dims.iRaysPerVoxelDim and
+// iDetectorSuperSampling becomes dims.iRaysPerDetDim.
+// Returns false if init() has already completed or either factor is zero.
+bool AstraCGLS3d::enableSuperSampling(unsigned int iVoxelSuperSampling,
+                                      unsigned int iDetectorSuperSampling)
+{
+	const bool zeroFactor = (iVoxelSuperSampling == 0) || (iDetectorSuperSampling == 0);
+	if (pData->initialized || zeroFactor)
+		return false;
+
+	pData->dims.iRaysPerDetDim = iDetectorSuperSampling;
+	pData->dims.iRaysPerVoxelDim = iVoxelSuperSampling;
+
+	return true;
+}
+
+// Ask the CGLS object to enable its volume mask and remember the outcome in
+// pData->useVolumeMask, which init() consults when allocating buffers.
+// Must be called before init(); returns false otherwise.
+bool AstraCGLS3d::enableVolumeMask()
+{
+	if (pData->initialized)
+		return false;
+
+	const bool enabled = pData->cgls.enableVolumeMask();
+	pData->useVolumeMask = enabled;
+	return enabled;
+}
+
+// Compiled out with #if 0: sinogram-mask support is not built for
+// AstraCGLS3d.
+// NOTE(review): presumably the CGLS class has no enableSinogramMask() -
+// confirm before re-enabling.
+#if 0
+// Would ask the CGLS object to enable its sinogram mask and record the
+// outcome in pData->useSinogramMask; only permitted before init().
+bool AstraCGLS3d::enableSinogramMask()
+{
+	if (pData->initialized)
+		return false;
+
+	bool ok = pData->cgls.enableSinogramMask();
+	pData->useSinogramMask = ok;
+
+	return ok;
+}
+#endif
+	
+// Select the CUDA device with the given index.
+// Returns true if the last CUDA error after cudaSetDevice() is either
+// cudaSuccess or cudaErrorSetOnActiveProcess.
+bool AstraCGLS3d::setGPUIndex(int index)
+{
+	cudaSetDevice(index);
+	const cudaError_t status = cudaGetLastError();
+
+	// Errors caused by calling cudaSetDevice multiple times are ignored.
+	return status == cudaSuccess || status == cudaErrorSetOnActiveProcess;
+}
+
+// Finish configuration: pass the stored geometry to the CGLS object,
+// initialize it, and allocate the device buffers (volume, projections and,
+// when enabled, the volume and sinogram masks).
+//
+// Returns false if the object is already initialized, if no volume or
+// projection geometry has been stored (dims.iVolX or dims.iProjAngles is
+// zero), or if any step fails. Buffers allocated earlier in the same call are
+// freed again when a later allocation fails.
+bool AstraCGLS3d::init()
+{
+	if (pData->initialized)
+		return false;
+
+	if (pData->dims.iVolX == 0 || pData->dims.iProjAngles == 0)
+		return false;
+
+	bool ok;
+
+	if (pData->projType == PROJ_PARALLEL) {
+		ok = pData->cgls.setPar3DGeometry(pData->dims, pData->parprojs);
+	} else {
+		ok = pData->cgls.setConeGeometry(pData->dims, pData->projs);
+	}
+
+	if (!ok)
+		return false;
+
+	ok = pData->cgls.init();
+	if (!ok)
+		return false;
+
+	pData->D_volumeData = allocateVolumeData(pData->dims);
+	ok = (pData->D_volumeData.ptr != 0);
+	if (!ok)
+		return false;
+
+	pData->D_projData = allocateProjectionData(pData->dims);
+	ok = (pData->D_projData.ptr != 0);
+	if (!ok) {
+		cudaFree(pData->D_volumeData.ptr);
+		pData->D_volumeData.ptr = 0;
+		return false;
+	}
+
+	if (pData->useVolumeMask) {
+		pData->D_maskData = allocateVolumeData(pData->dims);
+		ok = (pData->D_maskData.ptr != 0);
+		if (!ok) {
+			cudaFree(pData->D_volumeData.ptr);
+			cudaFree(pData->D_projData.ptr);
+			pData->D_volumeData.ptr = 0;
+			pData->D_projData.ptr = 0;
+			return false;
+		}
+	}
+
+	if (pData->useSinogramMask) {
+		pData->D_smaskData = allocateProjectionData(pData->dims);
+		ok = (pData->D_smaskData.ptr != 0);
+		if (!ok) {
+			// D_maskData is freed unconditionally here, also when no volume
+			// mask was requested; the constructor sets it to null.
+			cudaFree(pData->D_volumeData.ptr);
+			cudaFree(pData->D_projData.ptr);
+			cudaFree(pData->D_maskData.ptr);
+			pData->D_volumeData.ptr = 0;
+			pData->D_projData.ptr = 0;
+			pData->D_maskData.ptr = 0;
+			return false;
+		}
+	}
+
+	pData->initialized = true;
+
+	return true;
+}
+
+// Compiled out with #if 0: min/max constraints are not built for
+// AstraCGLS3d.
+// NOTE(review): presumably the CGLS class has no setMinConstraint() /
+// setMaxConstraint() - confirm before re-enabling.
+#if 0
+// Would forward fMin to the CGLS object; only permitted after init().
+bool AstraCGLS3d::setMinConstraint(float fMin)
+{
+	if (!pData->initialized)
+		return false;
+	return pData->cgls.setMinConstraint(fMin);
+}
+
+// Would forward fMax to the CGLS object; only permitted after init().
+bool AstraCGLS3d::setMaxConstraint(float fMax)
+{
+	if (!pData->initialized)
+		return false;
+	return pData->cgls.setMaxConstraint(fMax);
+}
+#endif
+
+// Copy a sinogram from host memory into the device projection buffer and
+// hand the volume/projection buffers to the CGLS object.
+//
+// iSinogramPitch is passed through to copyProjectionsToDevice().
+// On success the start-reconstruction flag is cleared, so the next iterate()
+// begins by zeroing the volume.
+// Returns false before init(), for a null pfSinogram, or if a step fails.
+bool AstraCGLS3d::setSinogram(const float* pfSinogram,
+                              unsigned int iSinogramPitch)
+{
+	if (!pData->initialized || !pfSinogram)
+		return false;
+
+	if (!copyProjectionsToDevice(pfSinogram, pData->D_projData, pData->dims, iSinogramPitch))
+		return false;
+
+	if (!pData->cgls.setBuffers(pData->D_volumeData, pData->D_projData))
+		return false;
+
+	pData->setStartReconstruction = false;
+
+	return true;
+}
+
+// Copy a volume mask from host memory into the device mask buffer and pass
+// that buffer to the CGLS object.
+//
+// iMaskPitch is passed through to copyVolumeToDevice().
+// Returns false before init(), when the volume mask was not enabled, for a
+// null pfMask, or if the copy or the CGLS call fails.
+bool AstraCGLS3d::setVolumeMask(const float* pfMask, unsigned int iMaskPitch)
+{
+	if (!pData->initialized || !pData->useVolumeMask || !pfMask)
+		return false;
+
+	const bool copied = copyVolumeToDevice(pfMask, pData->D_maskData,
+	                                       pData->dims, iMaskPitch);
+	if (!copied)
+		return false;
+
+	const bool accepted = pData->cgls.setVolumeMask(pData->D_maskData);
+	return accepted;
+}
+
+// Compiled out with #if 0: sinogram-mask support is not built for
+// AstraCGLS3d.
+// NOTE(review): presumably the CGLS class has no setSinogramMask() -
+// confirm before re-enabling.
+#if 0
+// Would copy a sinogram mask from host memory into D_smaskData and pass that
+// buffer to the CGLS object; requires init() and an enabled sinogram mask.
+bool AstraCGLS3d::setSinogramMask(const float* pfMask, unsigned int iMaskPitch)
+{
+	if (!pData->initialized)
+		return false;
+	if (!pData->useSinogramMask)
+		return false;
+	if (!pfMask)
+		return false;
+
+	bool ok = copyProjectionsToDevice(pfMask, pData->D_smaskData, pData->dims, iMaskPitch);
+
+	if (!ok)
+		return false;
+
+	ok = pData->cgls.setSinogramMask(pData->D_smaskData);
+	if (!ok)
+		return false;
+
+	return true;
+}
+#endif
+
+// Copy an initial reconstruction from host memory into the device volume
+// buffer and set the start-reconstruction flag, which makes iterate() skip
+// zeroing the volume.
+//
+// iReconstructionPitch is passed through to copyVolumeToDevice().
+// Returns false before init(), for a null pfReconstruction, or if the copy
+// fails.
+bool AstraCGLS3d::setStartReconstruction(const float* pfReconstruction,
+                                         unsigned int iReconstructionPitch)
+{
+	if (!pData->initialized || !pfReconstruction)
+		return false;
+
+	const bool copied = copyVolumeToDevice(pfReconstruction, pData->D_volumeData,
+	                                       pData->dims, iReconstructionPitch);
+	if (copied)
+		pData->setStartReconstruction = true;
+
+	return copied;
+}
+
+// Run iIterations iterations of the CGLS object.
+//
+// Unless a start reconstruction has been supplied since the last
+// setSinogram(), the device volume is zeroed first.
+// Returns false before init(), if zeroing the volume fails, or if the CGLS
+// object reports failure.
+bool AstraCGLS3d::iterate(unsigned int iIterations)
+{
+	if (!pData->initialized)
+		return false;
+
+	if (!pData->setStartReconstruction) {
+		// Do not iterate on a volume whose contents could not be cleared.
+		if (!zeroVolumeData(pData->D_volumeData, pData->dims))
+			return false;
+	}
+
+	bool ok = pData->cgls.iterate(iIterations);
+	if (!ok)
+		return false;
+
+	return true;
+}
+
+// Copy the current reconstruction from the device volume buffer to host
+// memory.
+//
+// iReconstructionPitch is passed through to copyVolumeFromDevice().
+// Returns false before init(), for a null pfReconstruction, or if the copy
+// fails.
+bool AstraCGLS3d::getReconstruction(float* pfReconstruction,
+                                    unsigned int iReconstructionPitch) const
+{
+	if (!pData->initialized)
+		return false;
+	// Reject a null destination, matching the null checks in the setters.
+	if (!pfReconstruction)
+		return false;
+
+	bool ok = copyVolumeFromDevice(pfReconstruction, pData->D_volumeData,
+	                               pData->dims, iReconstructionPitch);
+	if (!ok)
+		return false;
+
+	return true;
+}
+
+// Forward an abort request to the CGLS object; ignored before init().
+void AstraCGLS3d::signalAbort()
+{
+	if (pData->initialized)
+		pData->cgls.signalAbort();
+}
+
+// Return the value of the CGLS object's computeDiffNorm().
+// Before init() there is nothing to compute and 0.0f is returned.
+float AstraCGLS3d::computeDiffNorm()
+{
+	if (pData->initialized)
+		return pData->cgls.computeDiffNorm();
+
+	return 0.0f; // FIXME: Error?
+}
+
+
+
+// Cone-beam forward projection, angle-list overload.
+//
+// Converts the angles, distances and detector sizes into SConeProjection
+// entries with genConeProjections(), forwards to the SConeProjection
+// overload, and releases the temporary array afterwards.
+// Returns false if any volume or projection dimension is zero or pfAngles is
+// null; otherwise returns the result of the forwarded call.
+bool astraCudaConeFP(const float* pfVolume, float* pfProjections,
+                     unsigned int iVolX,
+                     unsigned int iVolY,
+                     unsigned int iVolZ,
+                     unsigned int iProjAngles,
+                     unsigned int iProjU,
+                     unsigned int iProjV,
+                     float fOriginSourceDistance,
+                     float fOriginDetectorDistance,
+                     float fDetUSize,
+                     float fDetVSize,
+                     const float *pfAngles,
+                     int iGPUIndex, int iDetectorSuperSampling)
+{
+	const bool emptyVolume = (iVolX == 0 || iVolY == 0 || iVolZ == 0);
+	if (emptyVolume)
+		return false;
+
+	const bool emptyProjections = (iProjAngles == 0 || iProjU == 0 || iProjV == 0);
+	if (emptyProjections || pfAngles == 0)
+		return false;
+
+	SConeProjection* coneProjs = genConeProjections(iProjAngles, iProjU, iProjV,
+	                                                fOriginSourceDistance,
+	                                                fOriginDetectorDistance,
+	                                                fDetUSize, fDetVSize,
+	                                                pfAngles);
+
+	const bool ok = astraCudaConeFP(pfVolume, pfProjections,
+	                                iVolX, iVolY, iVolZ,
+	                                iProjAngles, iProjU, iProjV,
+	                                coneProjs, iGPUIndex, iDetectorSuperSampling);
+
+	// The generated array is only needed for the duration of the call.
+	delete[] coneProjs;
+
+	return ok;
+}
+
+// Cone-beam forward projection of a host volume into host projection data.
+//
+// pfVolume is copied to the device with a pitch of iVolX, projected with
+// ConeFP() using the iProjAngles projections in pfAngles, and the result is
+// copied back into pfProjections with a pitch of iProjU. The CUDA device
+// iGPUIndex is selected first. All device memory is released before
+// returning.
+//
+// Returns false if a buffer or pfAngles is null, if any dimension is zero,
+// if iDetectorSuperSampling is not positive, if the device cannot be
+// selected, or if any allocation, copy or projection step fails.
+bool astraCudaConeFP(const float* pfVolume, float* pfProjections,
+                     unsigned int iVolX,
+                     unsigned int iVolY,
+                     unsigned int iVolZ,
+                     unsigned int iProjAngles,
+                     unsigned int iProjU,
+                     unsigned int iProjV,
+                     const SConeProjection *pfAngles,
+                     int iGPUIndex, int iDetectorSuperSampling)
+{
+	if (pfVolume == 0 || pfProjections == 0)
+		return false;
+
+	if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+		return false;
+
+	if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+		return false;
+
+	// The parameter is signed: a negative value would otherwise be stored as
+	// a huge unsigned ray count.
+	if (iDetectorSuperSampling <= 0)
+		return false;
+
+	SDimensions3D dims;
+
+	dims.iVolX = iVolX;
+	dims.iVolY = iVolY;
+	dims.iVolZ = iVolZ;
+
+	dims.iProjAngles = iProjAngles;
+	dims.iProjU = iProjU;
+	dims.iProjV = iProjV;
+
+	dims.iRaysPerDetDim = iDetectorSuperSampling;
+	// No voxel supersampling is requested here; do not leave the field unset.
+	dims.iRaysPerVoxelDim = 1;
+
+	cudaSetDevice(iGPUIndex);
+	cudaError_t err = cudaGetLastError();
+
+	// Ignore errors caused by calling cudaSetDevice multiple times
+	if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+		return false;
+
+	cudaPitchedPtr D_volumeData = allocateVolumeData(dims);
+	if (!D_volumeData.ptr)
+		return false;
+
+	cudaPitchedPtr D_projData = allocateProjectionData(dims);
+	if (!D_projData.ptr) {
+		cudaFree(D_volumeData.ptr);
+		return false;
+	}
+
+	bool ok = copyVolumeToDevice(pfVolume, D_volumeData, dims, dims.iVolX);
+
+	ok &= zeroProjectionData(D_projData, dims);
+
+	if (!ok) {
+		cudaFree(D_volumeData.ptr);
+		cudaFree(D_projData.ptr);
+		return false;
+	}
+
+	ok &= ConeFP(D_volumeData, D_projData, dims, pfAngles, 1.0f);
+
+	ok &= copyProjectionsFromDevice(pfProjections, D_projData,
+	                                dims, dims.iProjU);
+
+	cudaFree(D_volumeData.ptr);
+	cudaFree(D_projData.ptr);
+
+	return ok;
+}
+
+// Parallel-beam forward projection, angle-list overload.
+//
+// Converts the angles and detector sizes into SPar3DProjection entries with
+// genPar3DProjections(), forwards to the SPar3DProjection overload, and
+// releases the temporary array afterwards.
+// Returns false if any volume or projection dimension is zero or pfAngles is
+// null; otherwise returns the result of the forwarded call.
+bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections,
+                      unsigned int iVolX,
+                      unsigned int iVolY,
+                      unsigned int iVolZ,
+                      unsigned int iProjAngles,
+                      unsigned int iProjU,
+                      unsigned int iProjV,
+                      float fDetUSize,
+                      float fDetVSize,
+                      const float *pfAngles,
+                      int iGPUIndex, int iDetectorSuperSampling,
+                      Cuda3DProjectionKernel projKernel)
+{
+	const bool emptyVolume = (iVolX == 0 || iVolY == 0 || iVolZ == 0);
+	if (emptyVolume)
+		return false;
+
+	const bool emptyProjections = (iProjAngles == 0 || iProjU == 0 || iProjV == 0);
+	if (emptyProjections || pfAngles == 0)
+		return false;
+
+	SPar3DProjection* parProjs = genPar3DProjections(iProjAngles, iProjU, iProjV,
+	                                                 fDetUSize, fDetVSize,
+	                                                 pfAngles);
+
+	const bool ok = astraCudaPar3DFP(pfVolume, pfProjections,
+	                                 iVolX, iVolY, iVolZ,
+	                                 iProjAngles, iProjU, iProjV,
+	                                 parProjs, iGPUIndex, iDetectorSuperSampling,
+	                                 projKernel);
+
+	// The generated array is only needed for the duration of the call.
+	delete[] parProjs;
+
+	return ok;
+}
+
+
+// Parallel-beam forward projection of a host volume into host projection
+// data.
+//
+// pfVolume is copied to the device with a pitch of iVolX, projected using
+// the iProjAngles projections in pfAngles, and the result is copied back
+// into pfProjections with a pitch of iProjU. projKernel selects the
+// projector: ker3d_default uses Par3DFP(), ker3d_sum_square_weights uses
+// Par3DFP_SumSqW(). The CUDA device iGPUIndex is selected first. All device
+// memory is released before returning.
+//
+// Returns false if a buffer or pfAngles is null, if any dimension is zero,
+// if iDetectorSuperSampling is not positive, if projKernel is unknown, if
+// the device cannot be selected, or if any allocation, copy or projection
+// step fails.
+bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections,
+                      unsigned int iVolX,
+                      unsigned int iVolY,
+                      unsigned int iVolZ,
+                      unsigned int iProjAngles,
+                      unsigned int iProjU,
+                      unsigned int iProjV,
+                      const SPar3DProjection *pfAngles,
+                      int iGPUIndex, int iDetectorSuperSampling,
+                      Cuda3DProjectionKernel projKernel)
+{
+	if (pfVolume == 0 || pfProjections == 0)
+		return false;
+
+	if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+		return false;
+
+	if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+		return false;
+
+	// The parameter is signed: a negative value would otherwise be stored as
+	// a huge unsigned ray count.
+	if (iDetectorSuperSampling <= 0)
+		return false;
+
+	SDimensions3D dims;
+
+	dims.iVolX = iVolX;
+	dims.iVolY = iVolY;
+	dims.iVolZ = iVolZ;
+
+	dims.iProjAngles = iProjAngles;
+	dims.iProjU = iProjU;
+	dims.iProjV = iProjV;
+
+	dims.iRaysPerDetDim = iDetectorSuperSampling;
+	// No voxel supersampling is requested here; do not leave the field unset.
+	dims.iRaysPerVoxelDim = 1;
+
+	cudaSetDevice(iGPUIndex);
+	cudaError_t err = cudaGetLastError();
+
+	// Ignore errors caused by calling cudaSetDevice multiple times
+	if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+		return false;
+
+	cudaPitchedPtr D_volumeData = allocateVolumeData(dims);
+	if (!D_volumeData.ptr)
+		return false;
+
+	cudaPitchedPtr D_projData = allocateProjectionData(dims);
+	if (!D_projData.ptr) {
+		cudaFree(D_volumeData.ptr);
+		return false;
+	}
+
+	bool ok = copyVolumeToDevice(pfVolume, D_volumeData, dims, dims.iVolX);
+
+	ok &= zeroProjectionData(D_projData, dims);
+
+	if (!ok) {
+		cudaFree(D_volumeData.ptr);
+		cudaFree(D_projData.ptr);
+		return false;
+	}
+
+	switch (projKernel) {
+	case ker3d_default:
+		ok &= Par3DFP(D_volumeData, D_projData, dims, pfAngles, 1.0f);
+		break;
+	case ker3d_sum_square_weights:
+		ok &= Par3DFP_SumSqW(D_volumeData, D_projData, dims, pfAngles, 1.0f);
+		break;
+	default:
+		// assert() is compiled out in release builds; report the failure
+		// instead of returning all-zero projections as a success.
+		assert(false);
+		ok = false;
+		break;
+	}
+
+	ok &= copyProjectionsFromDevice(pfProjections, D_projData,
+	                                dims, dims.iProjU);
+
+	cudaFree(D_volumeData.ptr);
+	cudaFree(D_projData.ptr);
+
+	return ok;
+}
+
+// Cone-beam backprojection, angle-list overload.
+//
+// Converts the angles, distances and detector sizes into SConeProjection
+// entries with genConeProjections(), forwards to the SConeProjection
+// overload, and releases the temporary array afterwards.
+// Returns false if any volume or projection dimension is zero or pfAngles is
+// null; otherwise returns the result of the forwarded call.
+bool astraCudaConeBP(float* pfVolume, const float* pfProjections,
+                     unsigned int iVolX,
+                     unsigned int iVolY,
+                     unsigned int iVolZ,
+                     unsigned int iProjAngles,
+                     unsigned int iProjU,
+                     unsigned int iProjV,
+                     float fOriginSourceDistance,
+                     float fOriginDetectorDistance,
+                     float fDetUSize,
+                     float fDetVSize,
+                     const float *pfAngles,
+                     int iGPUIndex, int iVoxelSuperSampling)
+{
+	const bool emptyVolume = (iVolX == 0 || iVolY == 0 || iVolZ == 0);
+	if (emptyVolume)
+		return false;
+
+	const bool emptyProjections = (iProjAngles == 0 || iProjU == 0 || iProjV == 0);
+	if (emptyProjections || pfAngles == 0)
+		return false;
+
+	SConeProjection* coneProjs = genConeProjections(iProjAngles, iProjU, iProjV,
+	                                                fOriginSourceDistance,
+	                                                fOriginDetectorDistance,
+	                                                fDetUSize, fDetVSize,
+	                                                pfAngles);
+
+	const bool ok = astraCudaConeBP(pfVolume, pfProjections,
+	                                iVolX, iVolY, iVolZ,
+	                                iProjAngles, iProjU, iProjV,
+	                                coneProjs, iGPUIndex, iVoxelSuperSampling);
+
+	// The generated array is only needed for the duration of the call.
+	delete[] coneProjs;
+
+	return ok;
+}
+
+// Cone-beam backprojection of host projection data into a host volume.
+//
+// pfProjections is copied to the device with a pitch of iProjU,
+// backprojected with ConeBP() using the iProjAngles projections in pfAngles
+// into a zeroed volume, and the result is copied back into pfVolume with a
+// pitch of iVolX. The CUDA device iGPUIndex is selected first. All device
+// memory is released before returning.
+//
+// Returns false if a buffer or pfAngles is null, if any dimension is zero,
+// if iVoxelSuperSampling is not positive, if the device cannot be selected,
+// or if any allocation, copy or backprojection step fails.
+bool astraCudaConeBP(float* pfVolume, const float* pfProjections,
+                     unsigned int iVolX,
+                     unsigned int iVolY,
+                     unsigned int iVolZ,
+                     unsigned int iProjAngles,
+                     unsigned int iProjU,
+                     unsigned int iProjV,
+                     const SConeProjection *pfAngles,
+                     int iGPUIndex, int iVoxelSuperSampling)
+{
+	if (pfVolume == 0 || pfProjections == 0)
+		return false;
+
+	if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+		return false;
+
+	if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+		return false;
+
+	// This check used to repeat the projection-dimension test, so the
+	// supersampling factor was never validated. The parameter is signed, so
+	// negative values are rejected as well.
+	if (iVoxelSuperSampling <= 0)
+		return false;
+
+	SDimensions3D dims;
+
+	dims.iVolX = iVolX;
+	dims.iVolY = iVolY;
+	dims.iVolZ = iVolZ;
+
+	dims.iProjAngles = iProjAngles;
+	dims.iProjU = iProjU;
+	dims.iProjV = iProjV;
+
+	dims.iRaysPerVoxelDim = iVoxelSuperSampling;
+	// No detector supersampling is requested here; do not leave the field
+	// unset.
+	dims.iRaysPerDetDim = 1;
+
+	cudaSetDevice(iGPUIndex);
+	cudaError_t err = cudaGetLastError();
+
+	// Ignore errors caused by calling cudaSetDevice multiple times
+	if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+		return false;
+
+	cudaPitchedPtr D_volumeData = allocateVolumeData(dims);
+	if (!D_volumeData.ptr)
+		return false;
+
+	cudaPitchedPtr D_projData = allocateProjectionData(dims);
+	if (!D_projData.ptr) {
+		cudaFree(D_volumeData.ptr);
+		return false;
+	}
+
+	bool ok = copyProjectionsToDevice(pfProjections, D_projData,
+	                                  dims, dims.iProjU);
+
+	ok &= zeroVolumeData(D_volumeData, dims);
+
+	if (!ok) {
+		cudaFree(D_volumeData.ptr);
+		cudaFree(D_projData.ptr);
+		return false;
+	}
+
+	ok &= ConeBP(D_volumeData, D_projData, dims, pfAngles);
+
+	ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX);
+
+	cudaFree(D_volumeData.ptr);
+	cudaFree(D_projData.ptr);
+
+	return ok;
+}
+
+// Parallel-beam backprojection, angle-list overload.
+//
+// Converts the angles and detector sizes into SPar3DProjection entries with
+// genPar3DProjections(), forwards to the SPar3DProjection overload, and
+// releases the temporary array afterwards.
+// Returns false if any volume or projection dimension is zero or pfAngles is
+// null; otherwise returns the result of the forwarded call.
+bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,
+                      unsigned int iVolX,
+                      unsigned int iVolY,
+                      unsigned int iVolZ,
+                      unsigned int iProjAngles,
+                      unsigned int iProjU,
+                      unsigned int iProjV,
+                      float fDetUSize,
+                      float fDetVSize,
+                      const float *pfAngles,
+                      int iGPUIndex, int iVoxelSuperSampling)
+{
+	const bool emptyVolume = (iVolX == 0 || iVolY == 0 || iVolZ == 0);
+	if (emptyVolume)
+		return false;
+
+	const bool emptyProjections = (iProjAngles == 0 || iProjU == 0 || iProjV == 0);
+	if (emptyProjections || pfAngles == 0)
+		return false;
+
+	SPar3DProjection* parProjs = genPar3DProjections(iProjAngles, iProjU, iProjV,
+	                                                 fDetUSize, fDetVSize,
+	                                                 pfAngles);
+
+	const bool ok = astraCudaPar3DBP(pfVolume, pfProjections,
+	                                 iVolX, iVolY, iVolZ,
+	                                 iProjAngles, iProjU, iProjV,
+	                                 parProjs, iGPUIndex, iVoxelSuperSampling);
+
+	// The generated array is only needed for the duration of the call.
+	delete[] parProjs;
+
+	return ok;
+}
+
+
+// Parallel-beam backprojection of host projection data into a host volume.
+//
+// pfProjections is copied to the device with a pitch of iProjU,
+// backprojected with Par3DBP() using the iProjAngles projections in pfAngles
+// into a zeroed volume, and the result is copied back into pfVolume with a
+// pitch of iVolX. The CUDA device iGPUIndex is selected first. All device
+// memory is released before returning.
+//
+// Returns false if a buffer or pfAngles is null, if any dimension is zero,
+// if iVoxelSuperSampling is not positive, if the device cannot be selected,
+// or if any allocation, copy or backprojection step fails.
+bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,
+                      unsigned int iVolX,
+                      unsigned int iVolY,
+                      unsigned int iVolZ,
+                      unsigned int iProjAngles,
+                      unsigned int iProjU,
+                      unsigned int iProjV,
+                      const SPar3DProjection *pfAngles,
+                      int iGPUIndex, int iVoxelSuperSampling)
+{
+	if (pfVolume == 0 || pfProjections == 0)
+		return false;
+
+	if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+		return false;
+
+	if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+		return false;
+
+	// This check used to repeat the projection-dimension test, so the
+	// supersampling factor was never validated. The parameter is signed, so
+	// negative values are rejected as well.
+	if (iVoxelSuperSampling <= 0)
+		return false;
+
+	SDimensions3D dims;
+
+	dims.iVolX = iVolX;
+	dims.iVolY = iVolY;
+	dims.iVolZ = iVolZ;
+
+	dims.iProjAngles = iProjAngles;
+	dims.iProjU = iProjU;
+	dims.iProjV = iProjV;
+
+	dims.iRaysPerVoxelDim = iVoxelSuperSampling;
+	// No detector supersampling is requested here; do not leave the field
+	// unset.
+	dims.iRaysPerDetDim = 1;
+
+	cudaSetDevice(iGPUIndex);
+	cudaError_t err = cudaGetLastError();
+
+	// Ignore errors caused by calling cudaSetDevice multiple times
+	if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+		return false;
+
+	cudaPitchedPtr D_volumeData = allocateVolumeData(dims);
+	if (!D_volumeData.ptr)
+		return false;
+
+	cudaPitchedPtr D_projData = allocateProjectionData(dims);
+	if (!D_projData.ptr) {
+		cudaFree(D_volumeData.ptr);
+		return false;
+	}
+
+	bool ok = copyProjectionsToDevice(pfProjections, D_projData,
+	                                  dims, dims.iProjU);
+
+	ok &= zeroVolumeData(D_volumeData, dims);
+
+	if (!ok) {
+		cudaFree(D_volumeData.ptr);
+		cudaFree(D_projData.ptr);
+		return false;
+	}
+
+	ok &= Par3DBP(D_volumeData, D_projData, dims, pfAngles);
+
+	ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX);
+
+	cudaFree(D_volumeData.ptr);
+	cudaFree(D_projData.ptr);
+
+	return ok;
+}
+
+
+
+// FDK reconstruction of host projection data into a host volume.
+//
+// pfProjections is copied to the device with a pitch of iProjU, FDK() is run
+// into a zeroed volume, and the result is copied back into pfVolume with a
+// pitch of iVolX. The two arguments after the distances in the FDK() call
+// are passed as 0 (see the TODO below). The CUDA device iGPUIndex is
+// selected first. All device memory is released before returning.
+//
+// Returns false if a buffer or pfAngles is null, if any dimension is zero,
+// if iVoxelSuperSampling is not positive, if the device cannot be selected,
+// or if any allocation, copy or reconstruction step fails.
+bool astraCudaFDK(float* pfVolume, const float* pfProjections,
+                  unsigned int iVolX,
+                  unsigned int iVolY,
+                  unsigned int iVolZ,
+                  unsigned int iProjAngles,
+                  unsigned int iProjU,
+                  unsigned int iProjV,
+                  float fOriginSourceDistance,
+                  float fOriginDetectorDistance,
+                  float fDetUSize,
+                  float fDetVSize,
+                  const float *pfAngles,
+                  bool bShortScan,
+                  int iGPUIndex, int iVoxelSuperSampling)
+{
+	if (pfVolume == 0 || pfProjections == 0)
+		return false;
+
+	if (iVolX == 0 || iVolY == 0 || iVolZ == 0)
+		return false;
+
+	if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0)
+		return false;
+
+	// The parameter is signed: a negative value would otherwise be stored as
+	// a huge unsigned ray count.
+	if (iVoxelSuperSampling <= 0)
+		return false;
+
+	SDimensions3D dims;
+
+	dims.iVolX = iVolX;
+	dims.iVolY = iVolY;
+	dims.iVolZ = iVolZ;
+
+	dims.iProjAngles = iProjAngles;
+	dims.iProjU = iProjU;
+	dims.iProjV = iProjV;
+
+	dims.iRaysPerVoxelDim = iVoxelSuperSampling;
+	// No detector supersampling is requested here; do not leave the field
+	// unset.
+	dims.iRaysPerDetDim = 1;
+
+	cudaSetDevice(iGPUIndex);
+	cudaError_t err = cudaGetLastError();
+
+	// Ignore errors caused by calling cudaSetDevice multiple times
+	if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess)
+		return false;
+
+	cudaPitchedPtr D_volumeData = allocateVolumeData(dims);
+	if (!D_volumeData.ptr)
+		return false;
+
+	cudaPitchedPtr D_projData = allocateProjectionData(dims);
+	if (!D_projData.ptr) {
+		cudaFree(D_volumeData.ptr);
+		return false;
+	}
+
+	bool ok = copyProjectionsToDevice(pfProjections, D_projData, dims, dims.iProjU);
+
+	ok &= zeroVolumeData(D_volumeData, dims);
+
+	if (!ok) {
+		cudaFree(D_volumeData.ptr);
+		cudaFree(D_projData.ptr);
+		return false;
+	}
+
+	// TODO: Offer interface for SrcZ, DetZ
+	ok &= FDK(D_volumeData, D_projData, fOriginSourceDistance,
+	          fOriginDetectorDistance, 0, 0, fDetUSize, fDetVSize,
+	          dims, pfAngles, bShortScan);
+
+	ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX);
+
+	cudaFree(D_volumeData.ptr);
+	cudaFree(D_projData.ptr);
+
+	return ok;
+}
+
+
+
+
+}
diff --git a/cuda/3d/astra3d.h b/cuda/3d/astra3d.h
new file mode 100644
index 0000000..5712f89
--- /dev/null
+++ b/cuda/3d/astra3d.h
@@ -0,0 +1,450 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_ASTRA3D_H
+#define _CUDA_ASTRA3D_H
+
+#include "dims3d.h"
+
+namespace astra {
+
+
+// TODO: Switch to a class hierarchy as with the 2D algorithms
+
+
+// Selects the projection kernel used by the astraCudaPar3DFP() entry points
+// declared further down in this header (their projKernel parameter).
+// NOTE(review): what ker3d_sum_square_weights computes is decided by the
+// implementation, which is not part of this header -- confirm there.
+enum Cuda3DProjectionKernel {
+	ker3d_default = 0,
+	ker3d_sum_square_weights
+};
+
+
+class AstraSIRT3d_internal;
+
+
+// Host-side interface to the GPU SIRT reconstruction for 3D volumes.
+//
+// All state lives behind the opaque pData pointer (AstraSIRT3d_internal).
+// The comments on the individual methods give the intended call order:
+// geometry setters, optional enable*() / setGPUIndex(), init(),
+// setSinogram(), optional masks / start reconstruction / constraints,
+// iterate(), getReconstruction().
+class _AstraExport AstraSIRT3d {
+public:
+
+	AstraSIRT3d();
+	~AstraSIRT3d();
+
+	// Set the number of voxels of the reconstruction volume along X, Y and Z.
+	// The voxel size parameter (fPixelSize) is currently commented out of the
+	// signature; the original note states that volume pixels are assumed to
+	// be square.
+	// This must be called before setting the projection geometry.
+	bool setReconstructionGeometry(unsigned int iVolX,
+	                               unsigned int iVolY,
+	                               unsigned int iVolZ/*,
+	                               float fPixelSize = 1.0f*/);
+
+	// Set the projection geometry, either of the cone type (setConeGeometry)
+	// or of the par3D type (setPar3DGeometry). Each comes in two flavours:
+	// an explicit array of per-angle projection descriptions (projs), or
+	// scalar parameters plus an array of angles (pfAngles).
+	// NOTE(review): units of the scalar parameters and the required length of
+	// projs/pfAngles (presumably iProjAngles entries) are not stated in this
+	// header -- confirm against the implementation.
+	bool setConeGeometry(unsigned int iProjAngles,
+	                     unsigned int iProjU,
+	                     unsigned int iProjV,
+	                     const SConeProjection* projs);
+	bool setConeGeometry(unsigned int iProjAngles,
+	                     unsigned int iProjU,
+	                     unsigned int iProjV,
+	                     float fOriginSourceDistance,
+	                     float fOriginDetectorDistance,
+	                     float fSourceZ,
+	                     float fDetSize,
+	                     const float *pfAngles);
+	bool setPar3DGeometry(unsigned int iProjAngles,
+	                      unsigned int iProjU,
+	                      unsigned int iProjV,
+	                      const SPar3DProjection* projs);
+	bool setPar3DGeometry(unsigned int iProjAngles,
+	                      unsigned int iProjU,
+	                      unsigned int iProjV,
+	                      float fSourceZ,
+	                      float fDetSize,
+	                      const float *pfAngles);
+
+	// Enable supersampling.
+	//
+	// The number of rays used in FP is the square of iDetectorSuperSampling.
+	// The number of rays used in BP is the cube of iVoxelSuperSampling.
+	bool enableSuperSampling(unsigned int iVoxelSuperSampling,
+	                         unsigned int iDetectorSuperSampling);
+
+	// Enable volume/sinogram masks
+	//
+	// This may optionally be called before init().
+	// If it is called, setVolumeMask()/setSinogramMask() must be called between
+	// setSinogram() and iterate().
+	bool enableVolumeMask();
+	bool enableSinogramMask();
+
+	// Set GPU index
+	//
+	// This should be called before init(). Note that setting the GPU index
+	// in a thread which has already used the GPU may not work.
+	bool setGPUIndex(int index);
+
+	// Allocate GPU buffers and
+	// precompute geometry-specific data.
+	//
+	// This must be called after calling setReconstructionGeometry() and
+	// one of the setConeGeometry() / setPar3DGeometry() overloads.
+	bool init();
+
+	// Setup input sinogram.
+	// pfSinogram must be a float array.
+	// NOTE(review): the required array size was left unspecified ("XXX") by
+	// the original author -- confirm against the implementation.
+	// NB: iSinogramPitch is measured in floats, not in bytes.
+	//
+	// This must be called after init(), and before iterate(). It may be
+	// called again after iterate()/getReconstruction() to start a new
+	// reconstruction.
+	//
+	// pfSinogram will only be read from during this call.
+	bool setSinogram(const float* pfSinogram, unsigned int iSinogramPitch);
+
+	// Setup volume mask.
+	// pfMask must be a float array (required size unspecified, see the note
+	// on setSinogram()).
+	// NB: iMaskPitch is measured in floats, not in bytes.
+	//
+	// It may only contain the exact values 0.0f and 1.0f. Only volume pixels
+	// for which pfMask[z] is 1.0f are processed.
+	bool setVolumeMask(const float* pfMask, unsigned int iMaskPitch);
+
+	// Setup sinogram mask.
+	// pfMask must be a float array (required size unspecified, see the note
+	// on setSinogram()).
+	// NB: iMaskPitch is measured in floats, not in bytes.
+	//
+	// It may only contain the exact values 0.0f and 1.0f. Only sinogram pixels
+	// for which pfMask[z] is 1.0f are processed.
+	bool setSinogramMask(const float* pfMask, unsigned int iMaskPitch);
+
+	// Set the starting reconstruction for SIRT.
+	// pfReconstruction must be a float array (required size unspecified, see
+	// the note on setSinogram()).
+	// NB: iReconstructionPitch is measured in floats, not in bytes.
+	//
+	// This may be called between setSinogram() and iterate().
+	// If this function is not called before iterate(), SIRT will start
+	// from a zero reconstruction.
+	//
+	// pfReconstruction will only be read from during this call.
+	bool setStartReconstruction(const float* pfReconstruction,
+	                            unsigned int iReconstructionPitch);
+
+	// Enable min/max constraint.
+	//
+	// These may optionally be called between init() and iterate()
+	bool setMinConstraint(float fMin);
+	bool setMaxConstraint(float fMax);
+
+	// Perform a number of (additive) SIRT iterations.
+	// This must be called after setSinogram().
+	//
+	// If called multiple times, without calls to setSinogram() or
+	// setStartReconstruction() in between, iterate() will continue from
+	// the result of the previous call.
+	// Calls to getReconstruction() are allowed between calls to iterate() and
+	// do not change the state.
+	bool iterate(unsigned int iIterations);
+
+	// Get the reconstructed volume.
+	// pfReconstruction must be a float array (required size unspecified, see
+	// the note on setSinogram()).
+	// NB: iReconstructionPitch is measured in floats, not in bytes.
+	//
+	// This may be called after iterate().
+	bool getReconstruction(float* pfReconstruction,
+	                       unsigned int iReconstructionPitch) const;
+
+	// Compute the norm of the difference of the FP of the current
+	// reconstruction and the sinogram. (This performs one FP.)
+	// It can be called after iterate().
+	float computeDiffNorm();
+
+	// Signal the algorithm that it should abort after the current iteration.
+	// This is intended to be called from another thread.
+	void signalAbort();
+
+protected:
+	// Opaque implementation state; the type is only forward-declared here.
+	AstraSIRT3d_internal *pData;
+};
+
+
+class AstraCGLS3d_internal;
+
+
+// Host-side interface to the GPU CGLS reconstruction for 3D volumes.
+//
+// The interface mirrors AstraSIRT3d, except that the sinogram mask and the
+// min/max constraints are disabled (their declarations are commented out).
+// All state lives behind the opaque pData pointer (AstraCGLS3d_internal).
+class _AstraExport AstraCGLS3d {
+public:
+
+	AstraCGLS3d();
+	~AstraCGLS3d();
+
+	// Set the number of voxels of the reconstruction volume along X, Y and Z.
+	// The voxel size parameter (fPixelSize) is currently commented out of the
+	// signature; the original note states that volume pixels are assumed to
+	// be square.
+	// This must be called before setting the projection geometry.
+	bool setReconstructionGeometry(unsigned int iVolX,
+	                               unsigned int iVolY,
+	                               unsigned int iVolZ/*,
+	                               float fPixelSize = 1.0f*/);
+
+	// Set the projection geometry, either of the cone type (setConeGeometry)
+	// or of the par3D type (setPar3DGeometry). Each comes in two flavours:
+	// an explicit array of per-angle projection descriptions (projs), or
+	// scalar parameters plus an array of angles (pfAngles).
+	// NOTE(review): units of the scalar parameters and the required length of
+	// projs/pfAngles (presumably iProjAngles entries) are not stated in this
+	// header -- confirm against the implementation.
+	bool setConeGeometry(unsigned int iProjAngles,
+	                     unsigned int iProjU,
+	                     unsigned int iProjV,
+	                     const SConeProjection* projs);
+	bool setConeGeometry(unsigned int iProjAngles,
+	                     unsigned int iProjU,
+	                     unsigned int iProjV,
+	                     float fOriginSourceDistance,
+	                     float fOriginDetectorDistance,
+	                     float fSourceZ,
+	                     float fDetSize,
+	                     const float *pfAngles);
+	bool setPar3DGeometry(unsigned int iProjAngles,
+	                      unsigned int iProjU,
+	                      unsigned int iProjV,
+	                      const SPar3DProjection* projs);
+	bool setPar3DGeometry(unsigned int iProjAngles,
+	                      unsigned int iProjU,
+	                      unsigned int iProjV,
+	                      float fSourceZ,
+	                      float fDetSize,
+	                      const float *pfAngles);
+
+	// Enable supersampling.
+	//
+	// The number of rays used in FP is the square of iDetectorSuperSampling.
+	// The number of rays used in BP is the cube of iVoxelSuperSampling.
+	bool enableSuperSampling(unsigned int iVoxelSuperSampling,
+	                         unsigned int iDetectorSuperSampling);
+
+	// Enable the volume mask.
+	//
+	// This may optionally be called before init().
+	// If it is called, setVolumeMask() must be called between
+	// setSinogram() and iterate().
+	// The sinogram mask is not available for CGLS (declaration disabled).
+	bool enableVolumeMask();
+	//bool enableSinogramMask();
+
+	// Set GPU index
+	//
+	// This should be called before init(). Note that setting the GPU index
+	// in a thread which has already used the GPU may not work.
+	bool setGPUIndex(int index);
+
+	// Allocate GPU buffers and
+	// precompute geometry-specific data.
+	//
+	// This must be called after calling setReconstructionGeometry() and
+	// one of the setConeGeometry() / setPar3DGeometry() overloads.
+	bool init();
+
+	// Setup input sinogram.
+	// pfSinogram must be a float array.
+	// NOTE(review): the required array size was left unspecified ("XXX") by
+	// the original author -- confirm against the implementation.
+	// NB: iSinogramPitch is measured in floats, not in bytes.
+	//
+	// This must be called after init(), and before iterate(). It may be
+	// called again after iterate()/getReconstruction() to start a new
+	// reconstruction.
+	//
+	// pfSinogram will only be read from during this call.
+	bool setSinogram(const float* pfSinogram, unsigned int iSinogramPitch);
+
+	// Setup volume mask.
+	// pfMask must be a float array (required size unspecified, see the note
+	// on setSinogram()).
+	// NB: iMaskPitch is measured in floats, not in bytes.
+	//
+	// It may only contain the exact values 0.0f and 1.0f. Only volume pixels
+	// for which pfMask[z] is 1.0f are processed.
+	bool setVolumeMask(const float* pfMask, unsigned int iMaskPitch);
+
+	// Sinogram mask: disabled for CGLS. The declaration below is kept,
+	// commented out, for reference only.
+	//bool setSinogramMask(const float* pfMask, unsigned int iMaskPitch);
+
+	// Set the starting reconstruction for CGLS.
+	// pfReconstruction must be a float array (required size unspecified, see
+	// the note on setSinogram()).
+	// NB: iReconstructionPitch is measured in floats, not in bytes.
+	//
+	// This may be called between setSinogram() and iterate().
+	// If this function is not called before iterate(), CGLS will start
+	// from a zero reconstruction.
+	//
+	// pfReconstruction will only be read from during this call.
+	bool setStartReconstruction(const float* pfReconstruction,
+	                            unsigned int iReconstructionPitch);
+
+	// Min/max constraints: disabled for CGLS. The declarations below are
+	// kept, commented out, for reference only.
+	//bool setMinConstraint(float fMin);
+	//bool setMaxConstraint(float fMax);
+
+	// Perform a number of CGLS iterations.
+	// This must be called after setSinogram().
+	//
+	// If called multiple times, without calls to setSinogram() or
+	// setStartReconstruction() in between, iterate() will continue from
+	// the result of the previous call.
+	// Calls to getReconstruction() are allowed between calls to iterate() and
+	// do not change the state.
+	bool iterate(unsigned int iIterations);
+
+	// Get the reconstructed volume.
+	// pfReconstruction must be a float array (required size unspecified, see
+	// the note on setSinogram()).
+	// NB: iReconstructionPitch is measured in floats, not in bytes.
+	//
+	// This may be called after iterate().
+	bool getReconstruction(float* pfReconstruction,
+	                       unsigned int iReconstructionPitch) const;
+
+	// Compute the norm of the difference of the FP of the current
+	// reconstruction and the sinogram. (This performs one FP.)
+	// It can be called after iterate().
+	float computeDiffNorm();
+
+	// Signal the algorithm that it should abort after the current iteration.
+	// This is intended to be called from another thread.
+	void signalAbort();
+
+protected:
+	// Opaque implementation state; the type is only forward-declared here.
+	AstraCGLS3d_internal *pData;
+};
+
+
+
+// ---------------------------------------------------------------------------
+// One-shot entry points.
+//
+// NOTE(review): only the declarations are visible in this header. Judging by
+// the parameter names, every function takes the volume and projection data
+// as plain float arrays (the const-qualified one being the input), runs on
+// the GPU selected by iGPUIndex and reports success through its bool return
+// value -- confirm the exact contract against the definitions.
+// ---------------------------------------------------------------------------
+
+// FP for a cone geometry described by scalar parameters and an array of
+// angles. iDetectorSuperSampling is the FP supersampling factor.
+_AstraExport bool astraCudaConeFP(const float* pfVolume, float* pfProjections,
+                     unsigned int iVolX,
+                     unsigned int iVolY,
+                     unsigned int iVolZ,
+                     unsigned int iProjAngles,
+                     unsigned int iProjU,
+                     unsigned int iProjV,
+                     float fOriginSourceDistance,
+                     float fOriginDetectorDistance,
+                     float fDetUSize,
+                     float fDetVSize,
+                     const float *pfAngles,
+                     int iGPUIndex, int iDetectorSuperSampling);
+
+// FP for a cone geometry given as per-angle SConeProjection descriptions
+// (passed, despite its name, through the pfAngles parameter).
+_AstraExport bool astraCudaConeFP(const float* pfVolume, float* pfProjections,
+                     unsigned int iVolX,
+                     unsigned int iVolY,
+                     unsigned int iVolZ,
+                     unsigned int iProjAngles,
+                     unsigned int iProjU,
+                     unsigned int iProjV,
+                     const SConeProjection *pfAngles,
+                     int iGPUIndex, int iDetectorSuperSampling);
+
+// FP for a par3D geometry described by detector sizes and an array of
+// angles; projKernel selects the projection kernel.
+_AstraExport bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections,
+                      unsigned int iVolX,
+                      unsigned int iVolY,
+                      unsigned int iVolZ,
+                      unsigned int iProjAngles,
+                      unsigned int iProjU,
+                      unsigned int iProjV,
+                      float fDetUSize,
+                      float fDetVSize,
+                      const float *pfAngles,
+                      int iGPUIndex, int iDetectorSuperSampling,
+                      Cuda3DProjectionKernel projKernel);
+
+// FP for a par3D geometry given as per-angle SPar3DProjection descriptions;
+// projKernel selects the projection kernel.
+_AstraExport bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections,
+                      unsigned int iVolX,
+                      unsigned int iVolY,
+                      unsigned int iVolZ,
+                      unsigned int iProjAngles,
+                      unsigned int iProjU,
+                      unsigned int iProjV,
+                      const SPar3DProjection *pfAngles,
+                      int iGPUIndex, int iDetectorSuperSampling,
+                      Cuda3DProjectionKernel projKernel);
+
+
+// BP for a cone geometry described by scalar parameters and an array of
+// angles. iVoxelSuperSampling is the BP supersampling factor.
+_AstraExport bool astraCudaConeBP(float* pfVolume, const float* pfProjections,
+                     unsigned int iVolX,
+                     unsigned int iVolY,
+                     unsigned int iVolZ,
+                     unsigned int iProjAngles,
+                     unsigned int iProjU,
+                     unsigned int iProjV,
+                     float fOriginSourceDistance,
+                     float fOriginDetectorDistance,
+                     float fDetUSize,
+                     float fDetVSize,
+                     const float *pfAngles,
+                     int iGPUIndex, int iVoxelSuperSampling);
+
+// BP for a cone geometry given as per-angle SConeProjection descriptions.
+_AstraExport bool astraCudaConeBP(float* pfVolume, const float* pfProjections,
+                     unsigned int iVolX,
+                     unsigned int iVolY,
+                     unsigned int iVolZ,
+                     unsigned int iProjAngles,
+                     unsigned int iProjU,
+                     unsigned int iProjV,
+                     const SConeProjection *pfAngles,
+                     int iGPUIndex, int iVoxelSuperSampling);
+
+// BP for a par3D geometry described by detector sizes and an array of
+// angles.
+_AstraExport bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,
+                      unsigned int iVolX,
+                      unsigned int iVolY,
+                      unsigned int iVolZ,
+                      unsigned int iProjAngles,
+                      unsigned int iProjU,
+                      unsigned int iProjV,
+                      float fDetUSize,
+                      float fDetVSize,
+                      const float *pfAngles,
+                      int iGPUIndex, int iVoxelSuperSampling);
+
+// BP for a par3D geometry given as per-angle SPar3DProjection descriptions.
+_AstraExport bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections,
+                      unsigned int iVolX,
+                      unsigned int iVolY,
+                      unsigned int iVolZ,
+                      unsigned int iProjAngles,
+                      unsigned int iProjU,
+                      unsigned int iProjV,
+                      const SPar3DProjection *pfAngles,
+                      int iGPUIndex, int iVoxelSuperSampling);
+
+// FDK reconstruction from projections into pfVolume for a cone geometry
+// described by scalar parameters and an array of angles.
+// NOTE(review): bShortScan is presumably forwarded to the FDK routine, and
+// zero-sized dimensions, a null pfAngles or iVoxelSuperSampling == 0 are
+// presumably rejected with false -- confirm against the definition.
+_AstraExport bool astraCudaFDK(float* pfVolume, const float* pfProjections,
+                  unsigned int iVolX,
+                  unsigned int iVolY,
+                  unsigned int iVolZ,
+                  unsigned int iProjAngles,
+                  unsigned int iProjU,
+                  unsigned int iProjV,
+                  float fOriginSourceDistance,
+                  float fOriginDetectorDistance,
+                  float fDetUSize,
+                  float fDetVSize,
+                  const float *pfAngles,
+                  bool bShortScan,
+                  int iGPUIndex, int iVoxelSuperSampling);
+
+}
+
+
+#endif
diff --git a/cuda/3d/cgls3d.cu b/cuda/3d/cgls3d.cu
new file mode 100644
index 0000000..72bb9cd
--- /dev/null
+++ b/cuda/3d/cgls3d.cu
@@ -0,0 +1,428 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "cgls3d.h"
+#include "util3d.h"
+#include "arith3d.h"
+#include "cone_fp.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+namespace astraCUDA3d {
+
+// Construct a CGLS object with no buffers attached.
+//
+// All device pointers start out null and both masks are disabled. The
+// iteration state (gamma, sliceInitialized) is also given a defined value:
+// iterate() reads sliceInitialized, and previously that flag was only ever
+// written by setBuffers()/reset(), so using the object before either of
+// those ran read an indeterminate bool.
+CGLS::CGLS() : ReconAlgo3D()
+{
+	D_maskData.ptr = 0;
+	D_smaskData.ptr = 0;
+
+	D_sinoData.ptr = 0;
+	D_volumeData.ptr = 0;
+
+	D_r.ptr = 0;
+	D_w.ptr = 0;
+	D_z.ptr = 0;
+	D_p.ptr = 0;
+
+	useVolumeMask = false;
+	useSinogramMask = false;
+
+	gamma = 0.0f;
+	sliceInitialized = false;
+}
+
+
+// Destructor: all cleanup is delegated to reset(), which releases the work
+// buffers D_r, D_w, D_z and D_p.
+CGLS::~CGLS()
+{
+	reset();
+}
+
+// Return the object to its freshly-constructed state.
+//
+// Only the four work buffers (D_r, D_w, D_z, D_p) are released with
+// cudaFree(). The volume, sinogram and mask pointers are just cleared, not
+// freed. The mask flags and the slice state are cleared as well, and the
+// base class is reset last.
+void CGLS::reset()
+{
+	// Work buffers: release and forget, in the order r, w, z, p.
+	cudaPitchedPtr* const workBuffers[] = { &D_r, &D_w, &D_z, &D_p };
+	const unsigned int workBufferCount = sizeof(workBuffers) / sizeof(workBuffers[0]);
+	for (unsigned int i = 0; i < workBufferCount; ++i) {
+		cudaFree(workBuffers[i]->ptr);
+		workBuffers[i]->ptr = 0;
+	}
+
+	// Input/output and mask buffers: drop the pointers without freeing them.
+	D_volumeData.ptr = 0;
+	D_sinoData.ptr = 0;
+	D_smaskData.ptr = 0;
+	D_maskData.ptr = 0;
+
+	// Back to the default configuration.
+	sliceInitialized = false;
+	useSinogramMask = false;
+	useVolumeMask = false;
+
+	ReconAlgo3D::reset();
+}
+
+// Switch on use of the volume mask. This only sets the flag; the mask itself
+// is supplied through setVolumeMask(), which asserts that the flag is set.
+// Always returns true.
+bool CGLS::enableVolumeMask()
+{
+	useVolumeMask = true;
+	return true;
+}
+
+// Set the sinogram mask flag. Always returns true.
+// NOTE(review): setSinogramMask() in this file is a stub that always returns
+// false, and iterate() never reads useSinogramMask, so enabling the flag
+// currently has no effect on the reconstruction.
+bool CGLS::enableSinogramMask()
+{
+	useSinogramMask = true;
+	return true;
+}
+
+
+// Allocate the four work buffers used by iterate(): two volume-sized
+// (D_z, D_p) and two projection-sized (D_r, D_w), all according to dims.
+//
+// Returns true only if every allocation succeeded. On failure, whatever was
+// allocated is released again and all four pointers are left null, so that
+// reset() and the destructor remain safe and callers such as doCGLS() can
+// rely on the return value.
+bool CGLS::init()
+{
+	D_z = allocateVolumeData(dims);
+	D_p = allocateVolumeData(dims);
+	D_r = allocateProjectionData(dims);
+	D_w = allocateProjectionData(dims);
+
+	// The allocators signal failure by returning a null ptr.
+	if (D_z.ptr && D_p.ptr && D_r.ptr && D_w.ptr)
+		return true;
+
+	// Partial failure: cudaFree() on a null pointer is a no-op, so all four
+	// can be released unconditionally.
+	cudaFree(D_z.ptr);
+	cudaFree(D_p.ptr);
+	cudaFree(D_r.ptr);
+	cudaFree(D_w.ptr);
+
+	D_z.ptr = 0;
+	D_p.ptr = 0;
+	D_r.ptr = 0;
+	D_w.ptr = 0;
+
+	return false;
+}
+
+// Store the volume mask to be used by iterate() and computeDiffNorm().
+// The cudaPitchedPtr is copied; the memory it points to is not duplicated.
+// Asserts that enableVolumeMask() was called first. Always returns true.
+bool CGLS::setVolumeMask(cudaPitchedPtr& _D_maskData)
+{
+	assert(useVolumeMask);
+
+	D_maskData = _D_maskData;
+
+	return true;
+}
+
+// Sinogram masks are not implemented for CGLS: this always returns false and
+// ignores its argument. The intended implementation is kept below, disabled
+// by the preprocessor.
+bool CGLS::setSinogramMask(cudaPitchedPtr& _D_smaskData)
+{
+	return false;
+#if 0
+	// TODO: Implement this
+	assert(useSinogramMask);
+
+	D_smaskData = _D_smaskData;
+	return true;
+#endif
+}
+
+// Attach the reconstruction volume and the projection data that iterate()
+// will work on. Only the cudaPitchedPtr descriptors are copied.
+//
+// Attaching buffers discards any running iteration state: the next call to
+// iterate() recomputes r, p and gamma. The address of the volume buffer is
+// printed to stderr. Always returns true.
+bool CGLS::setBuffers(cudaPitchedPtr& _D_volumeData,
+                      cudaPitchedPtr& _D_projData)
+{
+	// New buffers: force re-initialisation in iterate().
+	sliceInitialized = false;
+
+	D_sinoData = _D_projData;
+	D_volumeData = _D_volumeData;
+
+	fprintf(stderr, "Reconstruction buffer: %p\n", (void*)D_volumeData.ptr);
+
+	return true;
+}
+
+// Run up to `iterations` CGLS iterations on the buffers given to
+// setBuffers().
+//
+// On the first call after setBuffers() (sliceInitialized == false) the
+// residual r, the search direction p and gamma are computed from the
+// current contents of D_volumeData; later calls continue from the stored
+// r, p and gamma. The loop stops early once shouldAbort is set.
+// Always returns true.
+//
+// NOTE(review): no error status from callFP()/callBP() is checked here, and
+// alpha and beta are formed by plain division, so ww == 0 or gamma == 0
+// would produce non-finite values -- confirm whether a guard is needed.
+bool CGLS::iterate(unsigned int iterations)
+{
+	// Clear any abort request left over from an earlier run.
+	shouldAbort = false;
+
+	if (!sliceInitialized) {
+
+		// copy sinogram
+		duplicateProjectionData(D_r, D_sinoData, dims);
+
+		// r = sino - A*x
+		// (with a volume mask, x is first multiplied by the mask into D_z)
+		if (useVolumeMask) {
+				duplicateVolumeData(D_z, D_volumeData, dims);
+				processVol3D<opMul>(D_z, D_maskData, dims);
+				callFP(D_z, D_r, -1.0f);
+		} else {
+				callFP(D_volumeData, D_r, -1.0f);
+		}
+
+		// p = A'*r
+		zeroVolumeData(D_p, dims);
+		callBP(D_p, D_r);
+		if (useVolumeMask)
+			processVol3D<opMul>(D_p, D_maskData, dims);
+
+		// gamma = <p,p>
+		gamma = dotProduct3D(D_p, dims.iVolX, dims.iVolY, dims.iVolZ);
+
+		sliceInitialized = true;
+
+	}
+
+
+	// iteration
+	for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
+
+		// w = A*p
+		zeroProjectionData(D_w, dims);
+		callFP(D_p, D_w, 1.0f);
+
+		// alpha = gamma / <w,w>
+		float ww = dotProduct3D(D_w, dims.iProjU, dims.iProjAngles, dims.iProjV);
+		float alpha = gamma / ww;
+
+		// x += alpha*p
+		processVol3D<opAddScaled>(D_volumeData, D_p, alpha, dims);
+
+		// r -= alpha*w
+		processSino3D<opAddScaled>(D_r, D_w, -alpha, dims);
+
+		// z = A'*r
+		zeroVolumeData(D_z, dims);
+		callBP(D_z, D_r);
+		if (useVolumeMask)
+			processVol3D<opMul>(D_z, D_maskData, dims);
+
+		// beta = gamma_new / gamma_old, where gamma_new = <z,z>.
+		// The reciprocal of the old gamma is taken before gamma is overwritten.
+		float beta = 1.0f / gamma;
+		gamma = dotProduct3D(D_z, dims.iVolX, dims.iVolY, dims.iVolZ);
+
+		beta *= gamma;
+
+		// p = z + beta*p
+		processVol3D<opScaleAndAdd>(D_p, D_z, beta, dims);
+	}
+
+	return true;
+}
+
+// Return the norm of (sinogram - FP of the current reconstruction).
+//
+// D_w and D_z serve as scratch space; that is safe because they carry no
+// information from one iteration of iterate() to the next.
+float CGLS::computeDiffNorm()
+{
+	// Start from a copy of the sinogram in w.
+	duplicateProjectionData(D_w, D_sinoData, dims);
+
+	// Subtract the forward projection of the (optionally masked) volume.
+	if (!useVolumeMask) {
+		callFP(D_volumeData, D_w, -1.0f);
+	} else {
+		duplicateVolumeData(D_z, D_volumeData, dims);
+		processVol3D<opMul>(D_z, D_maskData, dims);
+		callFP(D_z, D_w, -1.0f);
+	}
+
+	// <w,w> over the whole projection buffer, then its square root.
+	float residualDot = dotProduct3D(D_w, dims.iProjU, dims.iProjAngles, dims.iProjV);
+	return sqrt(residualDot);
+}
+
+
+// Convenience wrapper: configure a temporary CGLS object for the given cone
+// geometry, attach the supplied device buffers and run `iterations`
+// iterations.
+//
+// A volume mask is used only when D_maskData.ptr is non-null. Returns false
+// as soon as a configuration step, init() or the buffer setup fails;
+// otherwise returns the result of iterate().
+bool doCGLS(cudaPitchedPtr& D_volumeData,
+            cudaPitchedPtr& D_sinoData,
+            cudaPitchedPtr& D_maskData,
+            const SDimensions3D& dims, const SConeProjection* angles,
+            unsigned int iterations)
+{
+	CGLS solver;
+
+	// Configuration: geometry first, then (optionally) the mask flag.
+	bool configured = solver.setConeGeometry(dims, angles);
+	if (D_maskData.ptr) {
+		const bool maskEnabled = solver.enableVolumeMask();
+		configured = configured && maskEnabled;
+	}
+	if (!configured)
+		return false;
+
+	if (!solver.init())
+		return false;
+
+	// Attach the mask (if any) and the input/output buffers.
+	bool attached = true;
+	if (D_maskData.ptr) {
+		const bool maskSet = solver.setVolumeMask(D_maskData);
+		attached = attached && maskSet;
+	}
+	const bool buffersSet = solver.setBuffers(D_volumeData, D_sinoData);
+	attached = attached && buffersSet;
+	if (!attached)
+		return false;
+
+	return solver.iterate(iterations);
+}
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA3d;
+
+// Standalone test driver (built only when STANDALONE is defined).
+//
+// Sets up a 256^3 volume and 100 cone projections of 512x512 detector
+// pixels, loads the projection images from Tiffs/NNNN.png, runs 50 CGLS
+// iterations and writes each volume slice to volNNN.png.
+//
+// Returns 0 on success, 1 if the GPU buffers cannot be allocated.
+int main()
+{
+	SDimensions3D dims;
+	dims.iVolX = 256;
+	dims.iVolY = 256;
+	dims.iVolZ = 256;
+	dims.iProjAngles = 100;
+	dims.iProjU = 512;
+	dims.iProjV = 512;
+	dims.iRaysPerDet = 1;
+	// Set the voxel supersampling factor as well, so that no field of dims
+	// that the projectors may read is left uninitialized.
+	dims.iRaysPerVoxelDim = 1;
+
+	SConeProjection angle[100];
+	angle[0].fSrcX = -2905.6;
+	angle[0].fSrcY = 0;
+	angle[0].fSrcZ = 0;
+
+	angle[0].fDetSX = 694.4;
+	angle[0].fDetSY = -122.4704;
+	angle[0].fDetSZ = -122.4704;
+
+	angle[0].fDetUX = 0;
+	angle[0].fDetUY = .4784;
+	//angle[0].fDetUY = .5;
+	angle[0].fDetUZ = 0;
+
+	angle[0].fDetVX = 0;
+	angle[0].fDetVY = 0;
+	angle[0].fDetVZ = .4784;
+
+	// Every further projection is projection 0 rotated about the Z axis:
+	// only the X and Y components change, Z is copied from angle[0].
+#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0)
+	for (int i = 1; i < 100; ++i) {
+		angle[i] = angle[0];
+		ROTATE0(Src, i, i*2*M_PI/100);
+		ROTATE0(DetS, i, i*2*M_PI/100);
+		ROTATE0(DetU, i, i*2*M_PI/100);
+		ROTATE0(DetV, i, i*2*M_PI/100);
+	}
+#undef ROTATE0
+
+
+	cudaPitchedPtr volData = allocateVolumeData(dims);
+	cudaPitchedPtr projData = allocateProjectionData(dims);
+	if (!volData.ptr || !projData.ptr) {
+		// The allocators return a null ptr on failure; cudaFree(0) is a no-op.
+		fprintf(stderr, "Failed to allocate GPU buffers\n");
+		cudaFree(volData.ptr);
+		cudaFree(projData.ptr);
+		return 1;
+	}
+	zeroProjectionData(projData, dims);
+
+	// Round-trip the (zeroed) projections through host memory.
+	float* pbuf = new float[100*512*512];
+	copyProjectionsFromDevice(pbuf, projData, dims);
+	copyProjectionsToDevice(pbuf, projData, dims);
+	delete[] pbuf;
+
+#if 0
+	// Disabled alternative input: forward-project a synthetic sphere.
+	float* slice = new float[256*256];
+	cudaPitchedPtr ptr;
+	ptr.ptr = slice;
+	ptr.pitch = 256*sizeof(float);
+	ptr.xsize = 256*sizeof(float);
+	ptr.ysize = 256;
+
+	for (unsigned int i = 0; i < 256; ++i) {
+		for (unsigned int y = 0; y < 256; ++y)
+			for (unsigned int x = 0; x < 256; ++x)
+				slice[y*256+x] = (i-127.5)*(i-127.5)+(y-127.5)*(y-127.5)+(x-127.5)*(x-127.5) < 4900 ? 1.0f : 0.0f;
+
+		cudaExtent extentS;
+		extentS.width = dims.iVolX*sizeof(float);
+		extentS.height = dims.iVolY;
+		extentS.depth = 1;
+		cudaPos sp = { 0, 0, 0 };
+		cudaPos dp = { 0, 0, i };
+		cudaMemcpy3DParms p;
+		p.srcArray = 0;
+		p.srcPos = sp;
+		p.srcPtr = ptr;
+		p.dstArray = 0;
+		p.dstPos = dp;
+		p.dstPtr = volData;
+		p.extent = extentS;
+		p.kind = cudaMemcpyHostToDevice;
+		cudaMemcpy3D(&p);
+	}
+	astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f);
+	delete[] slice;
+
+#else
+
+	// Load the measured projections, one image per angle.
+	for (int i = 0; i < 100; ++i) {
+		char fname[32];
+		sprintf(fname, "Tiffs/%04d.png", 4*i);
+		unsigned int w,h;
+		float* bufp = loadImage(fname, w,h);
+
+		// Clamp to 236, then take 256 * log(236 / v).
+		for (int j = 0; j < 512*512; ++j) {
+			float v = bufp[j];
+			if (v > 236.0f) v = 236.0f;
+			v = logf(236.0f / v);
+			bufp[j] = 256*v;
+		}
+
+		// Detector row j of projection i goes to row j, angle i on the device.
+		// NOTE(review): this addressing ignores projData.pitch and assumes
+		// rows are packed at 512 floats -- confirm against the allocator.
+		for (int j = 0; j < 512; ++j) {
+			cudaMemcpy(((float*)projData.ptr)+100*512*j+512*i, bufp+512*j, 512*sizeof(float), cudaMemcpyHostToDevice);
+		}
+
+		delete[] bufp;
+
+	}
+#endif
+
+#if 0
+	// Disabled debugging output: dump sinograms and projections as images.
+	float* bufs = new float[100*512];
+
+	for (int i = 0; i < 512; ++i) {
+		cudaMemcpy(bufs, ((float*)projData.ptr)+100*512*i, 100*512*sizeof(float), cudaMemcpyDeviceToHost);
+
+		printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize);
+
+		char fname[20];
+		sprintf(fname, "sino%03d.png", i);
+		saveImage(fname, 100, 512, bufs);
+	}
+	delete[] bufs;
+
+	float* bufp = new float[512*512];
+
+	for (int i = 0; i < 100; ++i) {
+		for (int j = 0; j < 512; ++j) {
+			cudaMemcpy(bufp+512*j, ((float*)projData.ptr)+100*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost);
+		}
+
+		char fname[20];
+		sprintf(fname, "proj%03d.png", i);
+		saveImage(fname, 512, 512, bufp);
+	}
+	delete[] bufp;
+#endif
+
+	zeroVolumeData(volData, dims);
+
+	// A null mask pointer makes doCGLS() run without a volume mask.
+	cudaPitchedPtr maskData;
+	maskData.ptr = 0;
+
+	astraCUDA3d::doCGLS(volData, projData, maskData, dims, angle, 50);
+#if 1
+	// Write the reconstruction out slice by slice.
+	float* buf = new float[256*256];
+
+	for (int i = 0; i < 256; ++i) {
+		cudaMemcpy(buf, ((float*)volData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost);
+
+		char fname[20];
+		sprintf(fname, "vol%03d.png", i);
+		saveImage(fname, 256, 256, buf);
+	}
+
+	delete[] buf;
+#endif
+
+	// Release the device buffers allocated above.
+	cudaFree(volData.ptr);
+	cudaFree(projData.ptr);
+
+	return 0;
+}
+#endif
+
diff --git a/cuda/3d/cgls3d.h b/cuda/3d/cgls3d.h
new file mode 100644
index 0000000..d16b571
--- /dev/null
+++ b/cuda/3d/cgls3d.h
@@ -0,0 +1,114 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_CGLS3D_H
+#define _CUDA_CGLS3D_H
+
+#include "util3d.h"
+#include "algo3d.h"
+
+namespace astraCUDA3d {
+
+// CGLS reconstruction on 3D device buffers, built on ReconAlgo3D.
+//
+// Intended call order, as given by the comments on the methods below: set
+// the geometry (on the base class), optionally enable masks, init(),
+// optionally setVolumeMask(), setBuffers(), then iterate() as often as
+// needed.
+class _AstraExport CGLS : public ReconAlgo3D {
+public:
+	CGLS();
+	~CGLS();
+
+//	bool setConeGeometry(const SDimensions3D& dims, const SConeProjection* projs);
+
+
+	// Enable use of a volume / sinogram mask.
+	bool enableVolumeMask();
+	bool enableSinogramMask();
+
+	// init should be called after setting all geometry
+	bool init();
+
+	// setVolumeMask should be called after init and before iterate,
+	// but only if enableVolumeMask was called before init.
+	// It may be called again after iterate.
+	bool setVolumeMask(cudaPitchedPtr& D_maskData);
+
+	// setSinogramMask should be called after init and before iterate,
+	// but only if enableSinogramMask was called before init.
+	// It may be called again after iterate.
+	// NOTE(review): the definition in cgls3d.cu is a stub that always
+	// returns false, i.e. sinogram masks are not implemented for CGLS.
+	bool setSinogramMask(cudaPitchedPtr& D_smaskData);
+
+
+	// setBuffers should be called after init and before iterate.
+	// It may be called again after iterate.
+	bool setBuffers(cudaPitchedPtr& D_volumeData,
+	                cudaPitchedPtr& D_projData);
+
+
+	// Min/Max constraints are not supported by CGLS: both setters ignore
+	// their argument and return false.
+	bool setMinConstraint(float fMin) { return false; }
+	bool setMaxConstraint(float fMax) { return false; }
+
+	// iterate should be called after init and setBuffers.
+	// It may be called multiple times.
+	bool iterate(unsigned int iterations);
+
+	// Compute the norm of the difference of the FP of the current reconstruction
+	// and the sinogram. (This performs one FP.)
+	// It can be called after iterate.
+	float computeDiffNorm();
+
+protected:
+	// Release the temporary buffers and clear all other state.
+	void reset();
+
+	// Flags set by enableVolumeMask() / enableSinogramMask()
+	bool useVolumeMask;
+	bool useSinogramMask;
+
+	// Masks, as passed to setVolumeMask() / setSinogramMask()
+	cudaPitchedPtr D_maskData;
+	cudaPitchedPtr D_smaskData;
+
+	// Input/output, as passed to setBuffers()
+	cudaPitchedPtr D_sinoData;
+	cudaPitchedPtr D_volumeData;
+
+ 	// Temporary buffers, allocated by init():
+	// D_r and D_w are projection-sized, D_z and D_p are volume-sized.
+	cudaPitchedPtr D_r;
+	cudaPitchedPtr D_w;
+	cudaPitchedPtr D_z;
+	cudaPitchedPtr D_p;
+
+	// Scalar carried from one CGLS iteration to the next
+	float gamma;
+
+	// false until iterate() has computed r, p and gamma for the buffers
+	// currently attached with setBuffers()
+	bool sliceInitialized;
+};
+
+// Convenience wrapper: run `iterations` CGLS iterations for a cone geometry
+// on device buffers that the caller has already allocated and filled.
+//
+// D_volumeData is both the starting point and the result, D_projData is the
+// measured projection data, and a volume mask is used only when
+// D_maskData.ptr is non-null. Returns false if setup or iteration fails.
+//
+// This declaration now matches the definition in cgls3d.cu. The previously
+// declared nine-argument by-value form had no definition in that file, so
+// any use of it would have failed to link.
+_AstraExport bool doCGLS(cudaPitchedPtr& D_volumeData,
+            cudaPitchedPtr& D_projData,
+            cudaPitchedPtr& D_maskData,
+            const SDimensions3D& dims, const SConeProjection* projs,
+            unsigned int iterations);
+
+}
+
+#endif
diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu
new file mode 100644
index 0000000..7f8e320
--- /dev/null
+++ b/cuda/3d/cone_bp.cu
@@ -0,0 +1,481 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+#include "util3d.h"
+
+#ifdef STANDALONE
+#include "cone_fp.h"
+#include "testutil.h"
+#endif
+
+#include "dims3d.h"
+
+typedef texture<float, 3, cudaReadModeElementType> texture3D;
+
+static texture3D gT_coneProjTexture;
+
+namespace astraCUDA3d {
+
+static const unsigned int g_volBlockZ = 16;
+
+static const unsigned int g_anglesPerBlock = 64;
+static const unsigned int g_volBlockX = 32;
+static const unsigned int g_volBlockY = 16;
+
+static const unsigned g_MaxAngles = 1024;
+
+__constant__ float gC_Cux[g_MaxAngles];
+__constant__ float gC_Cuy[g_MaxAngles];
+__constant__ float gC_Cuz[g_MaxAngles];
+__constant__ float gC_Cuc[g_MaxAngles];
+__constant__ float gC_Cvx[g_MaxAngles];
+__constant__ float gC_Cvy[g_MaxAngles];
+__constant__ float gC_Cvz[g_MaxAngles];
+__constant__ float gC_Cvc[g_MaxAngles];
+__constant__ float gC_Cdx[g_MaxAngles];
+__constant__ float gC_Cdy[g_MaxAngles];
+__constant__ float gC_Cdz[g_MaxAngles];
+__constant__ float gC_Cdc[g_MaxAngles];
+
+
+// Bind `array` to gT_coneProjTexture (clamped addressing, linear filtering, unnormalized coords).
+// Returns true on success, false if the CUDA runtime reports an error for the bind.
+bool bindProjDataTexture(const cudaArray* array)
+{
+	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+
+	gT_coneProjTexture.addressMode[0] = cudaAddressModeClamp;
+	gT_coneProjTexture.addressMode[1] = cudaAddressModeClamp;
+	gT_coneProjTexture.addressMode[2] = cudaAddressModeClamp;
+	gT_coneProjTexture.filterMode = cudaFilterModeLinear;
+	gT_coneProjTexture.normalized = false;
+
+	// Surface a failed bind to the caller instead of unconditionally reporting success.
+	cudaError_t err = cudaBindTextureToArray(gT_coneProjTexture, array, channelDesc);
+	return err == cudaSuccess;
+}
+
+// Cone-beam backprojection kernel: each thread adds the samples of up to g_anglesPerBlock angles to one (X,Y) column of a Z slab.
+__global__ void dev_cone_BP(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims)
+{
+	float* volData = (float*)D_volData;
+	// This launch handles angles [startAngle, endAngle), clamped to dims.iProjAngles.
+	int endAngle = startAngle + g_anglesPerBlock;
+	if (endAngle > dims.iProjAngles)
+		endAngle = dims.iProjAngles;
+
+	// threadIdx: x = X offset within the block's tile
+	//            y = Y offset within the block's tile
+
+	// blockIdx:  x = flattened (X tile, Y tile) index, X tiles varying fastest
+	//            y = Z slab index
+
+
+	// TO TRY: precompute part of detector intersection formulas in shared mem?
+	// TO TRY: inner loop over z, gather ray values in shared mem
+
+	const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+	const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+	// Threads that fall outside the volume in a partial tile do nothing.
+	if (X >= dims.iVolX)
+		return;
+	if (Y >= dims.iVolY)
+		return;
+	// Each block covers a slab of up to g_volBlockZ slices, clamped to dims.iVolZ.
+	const int startZ = blockIdx.y * g_volBlockZ;
+	int endZ = startZ + g_volBlockZ;
+	if (endZ > dims.iVolZ)
+		endZ = dims.iVolZ;
+	// Voxel centre coordinates, measured from the centre of the volume in voxel units.
+	float fX = X - 0.5f*dims.iVolX + 0.5f;
+	float fY = Y - 0.5f*dims.iVolY + 0.5f;
+	float fZ = startZ - 0.5f*dims.iVolZ + 0.5f;
+
+	for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f)
+	{
+		// fAngle (angle + 0.5) is passed as the second texture coordinate in the fetch below.
+		float fVal = 0.0f;
+		float fAngle = startAngle + 0.5f;
+
+		for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+		{
+			// Per-angle coefficients read from the constant-memory tables filled by ConeBP_Array.
+			const float fCux = gC_Cux[angle];
+			const float fCuy = gC_Cuy[angle];
+			const float fCuz = gC_Cuz[angle];
+			const float fCuc = gC_Cuc[angle];
+			const float fCvx = gC_Cvx[angle];
+			const float fCvy = gC_Cvy[angle];
+			const float fCvz = gC_Cvz[angle];
+			const float fCvc = gC_Cvc[angle];
+			const float fCdx = gC_Cdx[angle];
+			const float fCdy = gC_Cdy[angle];
+			const float fCdz = gC_Cdz[angle];
+			const float fCdc = gC_Cdc[angle];
+			// Detector coordinates of the voxel centre are the ratios UNum/Den and VNum/Den.
+			const float fUNum = fCuc + fX * fCux + fY * fCuy + fZ * fCuz;
+			const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz;
+			const float fDen = fCdc + fX * fCdx + fY * fCdy + fZ * fCdz;
+			// NOTE(review): the +1.0f offsets presumably account for a one-texel border in the projection array - confirm.
+			const float fU = fUNum / fDen + 1.0f;
+			const float fV = fVNum / fDen + 1.0f;
+
+			fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV);
+
+		}
+		// Add to the existing voxel value; consecutive rows are volPitch floats apart.
+		volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal;
+	}
+
+}
+
+// Supersampling version of dev_cone_BP: every voxel is sampled at iRaysPerVoxelDim^3 sub-positions and the samples are averaged.
+__global__ void dev_cone_BP_SS(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims)
+{
+	float* volData = (float*)D_volData;
+	// This launch handles angles [startAngle, endAngle), clamped to dims.iProjAngles.
+	int endAngle = startAngle + g_anglesPerBlock;
+	if (endAngle > dims.iProjAngles)
+		endAngle = dims.iProjAngles;
+
+	// threadIdx: x = X offset within the block's tile
+	//            y = Y offset within the block's tile
+
+	// blockIdx:  x = flattened (X tile, Y tile) index, X tiles varying fastest
+	//            y = Z slab index
+
+
+	// TO TRY: precompute part of detector intersection formulas in shared mem?
+	// TO TRY: inner loop over z, gather ray values in shared mem
+
+	const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+	const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+	// Threads that fall outside the volume in a partial tile do nothing.
+	if (X >= dims.iVolX)
+		return;
+	if (Y >= dims.iVolY)
+		return;
+	// Each block covers a slab of up to g_volBlockZ slices, clamped to dims.iVolZ.
+	const int startZ = blockIdx.y * g_volBlockZ;
+	int endZ = startZ + g_volBlockZ;
+	if (endZ > dims.iVolZ)
+		endZ = dims.iVolZ;
+	// First sub-sample position: the voxel's low corner plus half a sub-step, relative to the volume centre.
+	float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim;
+	float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim;
+	float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim;
+	const float fSubStep = 1.0f/dims.iRaysPerVoxelDim;
+
+	for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f)
+	{
+		// fAngle (angle + 0.5) is passed as the second texture coordinate in the fetch below.
+		float fVal = 0.0f;
+		float fAngle = startAngle + 0.5f;
+
+		for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+		{
+			// Per-angle coefficients read from the constant-memory tables filled by ConeBP_Array.
+			const float fCux = gC_Cux[angle];
+			const float fCuy = gC_Cuy[angle];
+			const float fCuz = gC_Cuz[angle];
+			const float fCuc = gC_Cuc[angle];
+			const float fCvx = gC_Cvx[angle];
+			const float fCvy = gC_Cvy[angle];
+			const float fCvz = gC_Cvz[angle];
+			const float fCvc = gC_Cvc[angle];
+			const float fCdx = gC_Cdx[angle];
+			const float fCdy = gC_Cdy[angle];
+			const float fCdz = gC_Cdz[angle];
+			const float fCdc = gC_Cdc[angle];
+			// Visit the sub-sample positions in X, Y and Z, fSubStep apart, starting from (fX, fY, fZ).
+			float fXs = fX;
+			for (int iSubX = 0; iSubX < dims.iRaysPerVoxelDim; ++iSubX) {
+			float fYs = fY;
+			for (int iSubY = 0; iSubY < dims.iRaysPerVoxelDim; ++iSubY) {
+			float fZs = fZ;
+			for (int iSubZ = 0; iSubZ < dims.iRaysPerVoxelDim; ++iSubZ) {
+				// Detector coordinates of the sub-sample are the ratios UNum/Den and VNum/Den.
+				const float fUNum = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz;
+				const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz;
+				const float fDen = fCdc + fXs * fCdx + fYs * fCdy + fZs * fCdz;
+				// NOTE(review): the +1.0f offsets presumably account for a one-texel border in the projection array - confirm.
+				const float fU = fUNum / fDen + 1.0f;
+				const float fV = fVNum / fDen + 1.0f;
+
+				fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV);
+
+				fZs += fSubStep;
+			}
+			fYs += fSubStep;
+			}
+			fXs += fSubStep;
+			}
+
+		}
+		// Add the mean over the iRaysPerVoxelDim^3 sub-samples to the existing voxel value.
+		volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal / (dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim);
+	}
+
+}
+
+
+// Back-project the projections in D_projArray into D_volumeData, adding to its contents.
+// Per-angle coefficients go to constant memory; angles are processed g_anglesPerBlock per launch.
+// Returns false if dims.iProjAngles exceeds g_MaxAngles or the texture cannot be bound.
+bool ConeBP_Array(cudaPitchedPtr D_volumeData, cudaArray *D_projArray,
+                  const SDimensions3D& dims, const SConeProjection* angles)
+{
+	// The gC_C* tables (and the kernels indexing them) only cover g_MaxAngles angles.
+	if (dims.iProjAngles > g_MaxAngles || !bindProjDataTexture(D_projArray))
+		return false;
+
+	// transfer angles to constant memory
+	float* tmp = new float[dims.iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(expr,name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = (expr) ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+	TRANSFER_TO_CONSTANT( (angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetVY - (angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetVZ , Cux );
+	TRANSFER_TO_CONSTANT( (angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetVZ -(angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetVX , Cuy );
+	TRANSFER_TO_CONSTANT( (angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetVX - (angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetVY , Cuz );
+	TRANSFER_TO_CONSTANT( (angles[i].fDetSY*angles[i].fDetVZ - angles[i].fDetSZ*angles[i].fDetVY)*angles[i].fSrcX - (angles[i].fDetSX*angles[i].fDetVZ - angles[i].fDetSZ*angles[i].fDetVX)*angles[i].fSrcY + (angles[i].fDetSX*angles[i].fDetVY - angles[i].fDetSY*angles[i].fDetVX)*angles[i].fSrcZ , Cuc );
+
+	TRANSFER_TO_CONSTANT( (angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetUZ-(angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetUY, Cvx );
+	TRANSFER_TO_CONSTANT( (angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetUX - (angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetUZ , Cvy );
+	TRANSFER_TO_CONSTANT((angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetUY-(angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetUX , Cvz );
+	TRANSFER_TO_CONSTANT( -(angles[i].fDetSY*angles[i].fDetUZ - angles[i].fDetSZ*angles[i].fDetUY)*angles[i].fSrcX + (angles[i].fDetSX*angles[i].fDetUZ - angles[i].fDetSZ*angles[i].fDetUX)*angles[i].fSrcY - (angles[i].fDetSX*angles[i].fDetUY - angles[i].fDetSY*angles[i].fDetUX)*angles[i].fSrcZ , Cvc );
+
+	TRANSFER_TO_CONSTANT( angles[i].fDetUY*angles[i].fDetVZ - angles[i].fDetUZ*angles[i].fDetVY , Cdx );
+	TRANSFER_TO_CONSTANT( angles[i].fDetUZ*angles[i].fDetVX - angles[i].fDetUX*angles[i].fDetVZ , Cdy );
+	TRANSFER_TO_CONSTANT( angles[i].fDetUX*angles[i].fDetVY - angles[i].fDetUY*angles[i].fDetVX , Cdz );
+	TRANSFER_TO_CONSTANT( -angles[i].fSrcX * (angles[i].fDetUY*angles[i].fDetVZ - angles[i].fDetUZ*angles[i].fDetVY) - angles[i].fSrcY * (angles[i].fDetUZ*angles[i].fDetVX - angles[i].fDetUX*angles[i].fDetVZ) - angles[i].fSrcZ * (angles[i].fDetUX*angles[i].fDetVY - angles[i].fDetUY*angles[i].fDetVX) , Cdc );
+
+#undef TRANSFER_TO_CONSTANT
+
+	delete[] tmp;
+
+	dim3 dimBlock(g_volBlockX, g_volBlockY);
+
+	dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ);
+
+	// Each launch back-projects one batch of g_anglesPerBlock angles; the plain kernel
+	// is used when iRaysPerVoxelDim is 1, the supersampling kernel otherwise.
+	for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
+		if (dims.iRaysPerVoxelDim == 1)
+			dev_cone_BP<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims);
+		else
+			dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims);
+	}
+
+	// Project helper; by its name it forces the queued kernels to complete before returning.
+	cudaTextForceKernelsCompletion();
+
+	return true;
+}
+
+// Back-project pitched projection data: stage it in a temporary cudaArray, run ConeBP_Array,
+// then free the array. Returns false if no array was obtained, else ConeBP_Array's result.
+bool ConeBP(cudaPitchedPtr D_volumeData, cudaPitchedPtr D_projData,
+            const SDimensions3D& dims, const SConeProjection* angles)
+{
+	// transfer projections to array
+	cudaArray* cuArray = allocateProjectionArray(dims);
+	if (!cuArray)
+		return false;
+	transferProjectionsToArray(D_projData, cuArray, dims);
+
+	bool ret = ConeBP_Array(D_volumeData, cuArray, dims, angles);
+	cudaFreeArray(cuArray);
+	return ret;
+}
+
+
+}
+
+#ifdef STANDALONE
+// Standalone test driver: forward-projects a 256^3 volume of ones over 180 cone angles, zeroes the volume, then back-projects into it.
+int main()
+{
+	SDimensions3D dims;
+	dims.iVolX = 256;
+	dims.iVolY = 256;
+	dims.iVolZ = 256;
+	dims.iProjAngles = 180;
+	dims.iProjU = 512;
+	dims.iProjV = 512;
+	dims.iRaysPerDet = 1;
+	dims.iRaysPerVoxelDim = 1; // read by ConeBP_Array to select the kernel; was previously left uninitialized
+
+	cudaExtent extentV;
+	extentV.width = dims.iVolX*sizeof(float);
+	extentV.height = dims.iVolY;
+	extentV.depth = dims.iVolZ;
+
+	cudaPitchedPtr volData; // pitch, ptr, xsize, ysize
+
+	cudaMalloc3D(&volData, extentV);
+
+	cudaExtent extentP;
+	extentP.width = dims.iProjU*sizeof(float);
+	extentP.height = dims.iProjAngles;
+	extentP.depth = dims.iProjV;
+
+	cudaPitchedPtr projData; // pitch, ptr, xsize, ysize
+
+	cudaMalloc3D(&projData, extentP);
+	cudaMemset3D(projData, 0, extentP);
+	// Host-side slice, copied into the device volume one Z slice at a time.
+	float* slice = new float[256*256];
+	cudaPitchedPtr ptr;
+	ptr.ptr = slice;
+	ptr.pitch = 256*sizeof(float);
+	ptr.xsize = 256*sizeof(float);
+	ptr.ysize = 256;
+
+	for (unsigned int i = 0; i < 256*256; ++i)
+		slice[i] = 1.0f;
+	for (unsigned int i = 0; i < 256; ++i) {
+		cudaExtent extentS;
+		extentS.width = dims.iVolX*sizeof(float);
+		extentS.height = dims.iVolY;
+		extentS.depth = 1;
+		cudaPos sp = { 0, 0, 0 };
+		cudaPos dp = { 0, 0, i };
+		cudaMemcpy3DParms p;
+		p.srcArray = 0;
+		p.srcPos = sp;
+		p.srcPtr = ptr;
+		p.dstArray = 0;
+		p.dstPos = dp;
+		p.dstPtr = volData;
+		p.extent = extentS;
+		p.kind = cudaMemcpyHostToDevice;
+		cudaMemcpy3D(&p);
+#if 0
+		if (i == 128) {
+			for (unsigned int j = 0; j < 256*256; ++j)
+				slice[j] = 0.0f;
+		}
+#endif
+	}
+	// Geometry for angle 0; the remaining angles are rotations of it about the Z axis.
+	SConeProjection angle[180];
+	angle[0].fSrcX = -1536;
+	angle[0].fSrcY = 0;
+	angle[0].fSrcZ = 0;
+
+	angle[0].fDetSX = 512;
+	angle[0].fDetSY = -256;
+	angle[0].fDetSZ = -256;
+
+	angle[0].fDetUX = 0;
+	angle[0].fDetUY = 1;
+	angle[0].fDetUZ = 0;
+
+	angle[0].fDetVX = 0;
+	angle[0].fDetVY = 0;
+	angle[0].fDetVZ = 1;
+
+#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0)
+	for (int i = 1; i < 180; ++i) {
+		angle[i] = angle[0];
+		ROTATE0(Src, i, i*2*M_PI/180);
+		ROTATE0(DetS, i, i*2*M_PI/180);
+		ROTATE0(DetU, i, i*2*M_PI/180);
+		ROTATE0(DetV, i, i*2*M_PI/180);
+	}
+#undef ROTATE0
+
+	astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f);
+#if 0
+	float* bufs = new float[180*512];
+
+	for (int i = 0; i < 512; ++i) {
+		cudaMemcpy(bufs, ((float*)projData.ptr)+180*512*i, 180*512*sizeof(float), cudaMemcpyDeviceToHost);
+
+		printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize);
+
+		char fname[20];
+		sprintf(fname, "sino%03d.png", i);
+		saveImage(fname, 180, 512, bufs);
+	}
+
+	float* bufp = new float[512*512];
+
+	for (int i = 0; i < 180; ++i) {
+		for (int j = 0; j < 512; ++j) {
+			cudaMemcpy(bufp+512*j, ((float*)projData.ptr)+180*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost);
+		}
+
+		char fname[20];
+		sprintf(fname, "proj%03d.png", i);
+		saveImage(fname, 512, 512, bufp);
+	}
+#endif
+	for (unsigned int i = 0; i < 256*256; ++i)
+		slice[i] = 0.0f;
+	for (unsigned int i = 0; i < 256; ++i) {
+		cudaExtent extentS;
+		extentS.width = dims.iVolX*sizeof(float);
+		extentS.height = dims.iVolY;
+		extentS.depth = 1;
+		cudaPos sp = { 0, 0, 0 };
+		cudaPos dp = { 0, 0, i };
+		cudaMemcpy3DParms p;
+		p.srcArray = 0;
+		p.srcPos = sp;
+		p.srcPtr = ptr;
+		p.dstArray = 0;
+		p.dstPos = dp;
+		p.dstPtr = volData;
+		p.extent = extentS;
+		p.kind = cudaMemcpyHostToDevice;
+		cudaMemcpy3D(&p);
+	}
+
+	astraCUDA3d::ConeBP(volData, projData, dims, angle);
+#if 0
+	float* buf = new float[256*256];
+
+	for (int i = 0; i < 256; ++i) {
+		cudaMemcpy(buf, ((float*)volData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost);
+
+		printf("%d %d %d\n", volData.pitch, volData.xsize, volData.ysize);
+
+		char fname[20];
+		sprintf(fname, "vol%03d.png", i);
+		saveImage(fname, 256, 256, buf);
+	}
+#endif
+}
+#endif
diff --git a/cuda/3d/cone_bp.h b/cuda/3d/cone_bp.h
new file mode 100644
index 0000000..c77714e
--- /dev/null
+++ b/cuda/3d/cone_bp.h
@@ -0,0 +1,45 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_CONE_BP_H
+#define _CUDA_CONE_BP_H
+
+namespace astraCUDA3d {
+// Back-project projections stored in the cudaArray D_projArray, adding the result into D_volumeData.
+_AstraExport bool ConeBP_Array(cudaPitchedPtr D_volumeData,
+                  cudaArray *D_projArray,
+                  const SDimensions3D& dims, const SConeProjection* angles);
+// As ConeBP_Array, but takes pitched projection data and copies it into a temporary cudaArray first.
+_AstraExport bool ConeBP(cudaPitchedPtr D_volumeData,
+            cudaPitchedPtr D_projData,
+            const SDimensions3D& dims, const SConeProjection* angles);
+         
+
+}
+
+#endif
diff --git a/cuda/3d/cone_fp.cu b/cuda/3d/cone_fp.cu
new file mode 100644
index 0000000..40dca4f
--- /dev/null
+++ b/cuda/3d/cone_fp.cu
@@ -0,0 +1,513 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+#include "util3d.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+#include "dims3d.h"
+
+// 3D texture reference over float data; with cudaReadModeElementType the
+// fetched values are returned as stored, without conversion.
+typedef texture<float, 3, cudaReadModeElementType> texture3D;
+
+// File-scope texture reference through which the forward projection kernels
+// in this file sample the volume. It is configured and bound to a cudaArray
+// in bindVolumeDataTexture().
+static texture3D gT_coneVolumeTexture;
+
+namespace astraCUDA3d {
+
+// Number of projection angles handled by one thread block (used as
+// blockDim.y when launching the FP kernels).
+static const unsigned int g_anglesPerBlock = 4;
+
+// thickness of the slices we're splitting the volume up into
+// (one kernel launch integrates over at most this many slices)
+static const unsigned int g_blockSlices = 64;
+// Detector tile handled by one thread block: g_detBlockU columns (one per
+// thread in x) by g_detBlockV rows (looped over inside each thread).
+static const unsigned int g_detBlockU = 32;
+static const unsigned int g_detBlockV = 32;
+
+// Capacity of each per-angle geometry table below. The twelve float tables
+// together occupy 12 * 1024 * 4 bytes = 48 KB of constant memory.
+static const unsigned g_MaxAngles = 1024;
+// Per-angle source position (rays are traced from here to the detector).
+__constant__ float gC_SrcX[g_MaxAngles];
+__constant__ float gC_SrcY[g_MaxAngles];
+__constant__ float gC_SrcZ[g_MaxAngles];
+// Per-angle detector origin; detector pixel positions are computed as
+// DetS + u * DetU + v * DetV.
+__constant__ float gC_DetSX[g_MaxAngles];
+__constant__ float gC_DetSY[g_MaxAngles];
+__constant__ float gC_DetSZ[g_MaxAngles];
+// Per-angle step vector for one detector pixel in the u direction.
+__constant__ float gC_DetUX[g_MaxAngles];
+__constant__ float gC_DetUY[g_MaxAngles];
+__constant__ float gC_DetUZ[g_MaxAngles];
+// Per-angle step vector for one detector pixel in the v direction.
+__constant__ float gC_DetVX[g_MaxAngles];
+__constant__ float gC_DetVY[g_MaxAngles];
+__constant__ float gC_DetVZ[g_MaxAngles];
+
+
+// Configure gT_coneVolumeTexture (clamped addressing on all three axes,
+// linear filtering, unnormalized coordinates) and bind it to the given
+// cudaArray of floats.
+//
+// Returns true if the bind succeeded, false if the CUDA runtime reported
+// an error.
+bool bindVolumeDataTexture(const cudaArray* array)
+{
+	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+
+	gT_coneVolumeTexture.addressMode[0] = cudaAddressModeClamp;
+	gT_coneVolumeTexture.addressMode[1] = cudaAddressModeClamp;
+	gT_coneVolumeTexture.addressMode[2] = cudaAddressModeClamp;
+	gT_coneVolumeTexture.filterMode = cudaFilterModeLinear;
+	gT_coneVolumeTexture.normalized = false;
+
+	// Report a failed bind to the caller instead of silently continuing
+	// with an unbound texture.
+	cudaError_t err = cudaBindTextureToArray(gT_coneVolumeTexture, array, channelDesc);
+
+	return err == cudaSuccess;
+}
+
+	// threadIdx: x = detector u within the block's u-tile
+	//            y = relative angle (within the angle block)
+
+	// blockIdx:  x = detector tile (u-tile and v-tile index combined)
+	//            y = angle block
+
+
+// Shared body of the FP_dirX / FP_dirY / FP_dirZ kernels.
+// c0 names the axis along which the volume is traversed slice by slice;
+// c1 and c2 name the two remaining axes.
+// The enclosing kernel must provide: D_projData, projPitch, startSlice,
+// startAngle, endAngle, dims and fOutputScale.
+// Each thread handles one (angle, detectorU) pair, loops over the detectorV
+// rows of its tile, and adds the ray sum over slices [startSlice, endSlice)
+// to D_projData. Threads whose angle or detectorU lies outside the valid
+// range return without writing.
+#define CONE_FP_BODY(c0,c1,c2) \
+	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \
+	if (angle >= endAngle) \
+		return; \
+	\
+	const float fSrcX = gC_SrcX[angle]; \
+	const float fSrcY = gC_SrcY[angle]; \
+	const float fSrcZ = gC_SrcZ[angle]; \
+	const float fDetUX = gC_DetUX[angle]; \
+	const float fDetUY = gC_DetUY[angle]; \
+	const float fDetUZ = gC_DetUZ[angle]; \
+	const float fDetVX = gC_DetVX[angle]; \
+	const float fDetVY = gC_DetVY[angle]; \
+	const float fDetVZ = gC_DetVZ[angle]; \
+	/* detector origin, offset by half a step along both U and V */ \
+	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \
+	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \
+	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \
+	\
+	/* blockIdx.x enumerates u-tiles fastest, then v-tiles */ \
+	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \
+	/* the grid is rounded up to whole u-tiles: skip columns past the detector edge */ \
+	if (detectorU >= dims.iProjU) \
+		return; \
+	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \
+	int endDetectorV = startDetectorV + g_detBlockV; \
+	if (endDetectorV > dims.iProjV) \
+		endDetectorV = dims.iProjV; \
+	\
+	int endSlice = startSlice + g_blockSlices; \
+	if (endSlice > dims.iVol##c0) \
+		endSlice = dims.iVol##c0; \
+	\
+	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \
+	{ \
+		/* Trace ray from Src to (detectorU,detectorV) from */ \
+		/* c0 = startSlice to c0 = endSlice                 */ \
+		\
+		const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; \
+		const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; \
+		const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; \
+		\
+		/* Ray parametrized by its c0 coordinate:           */ \
+		/*   c1 = a_c1 * c0 + b_c1,  c2 = a_c2 * c0 + b_c2  */ \
+		\
+		const float a##c1 = (fSrc##c1 - fDet##c1) / (fSrc##c0 - fDet##c0); \
+		const float a##c2 = (fSrc##c2 - fDet##c2) / (fSrc##c0 - fDet##c0); \
+		const float b##c1 = fSrc##c1 - a##c1 * fSrc##c0; \
+		const float b##c2 = fSrc##c2 - a##c2 * fSrc##c0; \
+		\
+		/* ray length per unit step along c0, times the output scale */ \
+		const float fDistCorr = sqrtf(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \
+		\
+		float fVal = 0.0f; \
+		\
+		/* Texture coordinates of the first sample. The centred volume    */ \
+		/* coordinate is shifted by 0.5*iVol - 0.5 and then by 1.5.       */ \
+		/* NOTE(review): the 1.5 presumably combines the 0.5 texel-centre */ \
+		/* offset with a one-voxel border in the array -- TODO confirm.   */ \
+		float f##c0 = startSlice + 1.5f; \
+		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \
+		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \
+		\
+		for (int s = startSlice; s < endSlice; ++s) \
+		{ \
+			fVal += tex3D(gT_coneVolumeTexture, fX, fY, fZ); \
+			f##c0 += 1.0f; \
+			f##c1 += a##c1; \
+			f##c2 += a##c2; \
+		} \
+		\
+		fVal *= fDistCorr; \
+		\
+		/* projection layout: row index = detectorV*iProjAngles + angle */ \
+		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; \
+	}
+
+// Shared body of the supersampling FP_SS_dirX / FP_SS_dirY / FP_SS_dirZ
+// kernels. Like CONE_FP_BODY, but every detector pixel is sampled with
+// iRaysPerDetDim x iRaysPerDetDim rays whose results are averaged.
+// c0 names the axis along which the volume is traversed slice by slice;
+// c1 and c2 name the two remaining axes.
+// The enclosing kernel must provide: D_projData, projPitch, startSlice,
+// startAngle, endAngle, dims and fOutputScale.
+// The slice range is clamped to the extent of the traversal axis c0, and
+// threads whose angle or detectorU lies outside the valid range return
+// without writing.
+#define CONE_FP_SS_BODY(c0,c1,c2) \
+	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \
+	if (angle >= endAngle) \
+		return; \
+	\
+	const float fSrcX = gC_SrcX[angle]; \
+	const float fSrcY = gC_SrcY[angle]; \
+	const float fSrcZ = gC_SrcZ[angle]; \
+	const float fDetUX = gC_DetUX[angle]; \
+	const float fDetUY = gC_DetUY[angle]; \
+	const float fDetUZ = gC_DetUZ[angle]; \
+	const float fDetVX = gC_DetVX[angle]; \
+	const float fDetVY = gC_DetVY[angle]; \
+	const float fDetVZ = gC_DetVZ[angle]; \
+	/* detector origin, offset by half a step along both U and V */ \
+	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \
+	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \
+	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \
+	\
+	/* blockIdx.x enumerates u-tiles fastest, then v-tiles */ \
+	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \
+	/* the grid is rounded up to whole u-tiles: skip columns past the detector edge */ \
+	if (detectorU >= dims.iProjU) \
+		return; \
+	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \
+	int endDetectorV = startDetectorV + g_detBlockV; \
+	if (endDetectorV > dims.iProjV) \
+		endDetectorV = dims.iProjV; \
+	\
+	/* clamp to the extent of the traversal axis (not always X) */ \
+	int endSlice = startSlice + g_blockSlices; \
+	if (endSlice > dims.iVol##c0) \
+		endSlice = dims.iVol##c0; \
+	\
+	const float fSubStep = 1.0f/dims.iRaysPerDetDim; \
+	\
+	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \
+	{ \
+		/* Trace rays from Src to sub-positions of (detectorU,detectorV) */ \
+		/* from c0 = startSlice to c0 = endSlice                         */ \
+		\
+		float fV = 0.0f; \
+		\
+		/* sub-sample positions step by fSubStep, starting half a */ \
+		/* sub-step past (detector index - 0.5), in both u and v   */ \
+		float fdU = detectorU - 0.5f + 0.5f*fSubStep; \
+		for (int iSubU = 0; iSubU < dims.iRaysPerDetDim; ++iSubU, fdU+=fSubStep) { \
+		float fdV = detectorV - 0.5f + 0.5f*fSubStep; \
+		for (int iSubV = 0; iSubV < dims.iRaysPerDetDim; ++iSubV, fdV+=fSubStep) { \
+		\
+		const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX; \
+		const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY; \
+		const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ; \
+		\
+		/* Ray parametrized by its c0 coordinate:           */ \
+		/*   c1 = a_c1 * c0 + b_c1,  c2 = a_c2 * c0 + b_c2  */ \
+		\
+		const float a##c1 = (fSrc##c1 - fDet##c1) / (fSrc##c0 - fDet##c0); \
+		const float a##c2 = (fSrc##c2 - fDet##c2) / (fSrc##c0 - fDet##c0); \
+		const float b##c1 = fSrc##c1 - a##c1 * fSrc##c0; \
+		const float b##c2 = fSrc##c2 - a##c2 * fSrc##c0; \
+		\
+		/* ray length per unit step along c0, times the output scale */ \
+		const float fDistCorr = sqrtf(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \
+		\
+		float fVal = 0.0f; \
+		\
+		/* Texture coordinates of the first sample. The centred volume    */ \
+		/* coordinate is shifted by 0.5*iVol - 0.5 and then by 1.5.       */ \
+		/* NOTE(review): the 1.5 presumably combines the 0.5 texel-centre */ \
+		/* offset with a one-voxel border in the array -- TODO confirm.   */ \
+		float f##c0 = startSlice + 1.5f; \
+		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \
+		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \
+		\
+		for (int s = startSlice; s < endSlice; ++s) \
+		{ \
+			fVal += tex3D(gT_coneVolumeTexture, fX, fY, fZ); \
+			f##c0 += 1.0f; \
+			f##c1 += a##c1; \
+			f##c2 += a##c2; \
+		} \
+		\
+		fVal *= fDistCorr; \
+		fV += fVal; \
+		\
+		} \
+		} \
+		\
+		/* average over all sub-rays; row index = detectorV*iProjAngles + angle */ \
+		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (dims.iRaysPerDetDim * dims.iRaysPerDetDim); \
+	}
+
+
+
+
+ 
+// Forward projection kernel for angle groups traversed along the X axis
+// (one ray per detector pixel). ConeFP_Array launches it with
+// blockDim = (g_detBlockU, g_anglesPerBlock) and one launch per block of
+// slices beginning at startSlice; results are added to D_projData.
+__global__ void FP_dirX(float* D_projData, unsigned int projPitch,
+                        unsigned int startSlice,
+                        unsigned int startAngle, unsigned int endAngle,
+                        const SDimensions3D dims, float fOutputScale)
+{
+	CONE_FP_BODY(X,Y,Z)
+}
+
+// Forward projection kernel for angle groups traversed along the Y axis
+// (one ray per detector pixel). ConeFP_Array launches it with
+// blockDim = (g_detBlockU, g_anglesPerBlock) and one launch per block of
+// slices beginning at startSlice; results are added to D_projData.
+__global__ void FP_dirY(float* D_projData, unsigned int projPitch,
+                        unsigned int startSlice,
+                        unsigned int startAngle, unsigned int endAngle,
+                        const SDimensions3D dims, float fOutputScale)
+{
+	CONE_FP_BODY(Y,X,Z)
+}
+
+// Forward projection kernel for angle groups traversed along the Z axis
+// (one ray per detector pixel). ConeFP_Array launches it with
+// blockDim = (g_detBlockU, g_anglesPerBlock) and one launch per block of
+// slices beginning at startSlice; results are added to D_projData.
+__global__ void FP_dirZ(float* D_projData, unsigned int projPitch,
+                        unsigned int startSlice,
+                        unsigned int startAngle, unsigned int endAngle,
+                        const SDimensions3D dims, float fOutputScale)
+{
+	CONE_FP_BODY(Z,X,Y)
+}
+
+ 
+// Supersampling forward projection kernel for angle groups traversed along
+// the X axis. ConeFP_Array selects it instead of FP_dirX when
+// dims.iRaysPerDetDim is not 1; launch configuration and parameters are the
+// same, and results are added to D_projData.
+__global__ void FP_SS_dirX(float* D_projData, unsigned int projPitch,
+                           unsigned int startSlice,
+                           unsigned int startAngle, unsigned int endAngle,
+                           const SDimensions3D dims, float fOutputScale)
+{
+	CONE_FP_SS_BODY(X,Y,Z)
+}
+
+// Supersampling forward projection kernel for angle groups traversed along
+// the Y axis. ConeFP_Array selects it instead of FP_dirY when
+// dims.iRaysPerDetDim is not 1; launch configuration and parameters are the
+// same, and results are added to D_projData.
+__global__ void FP_SS_dirY(float* D_projData, unsigned int projPitch,
+                           unsigned int startSlice,
+                           unsigned int startAngle, unsigned int endAngle,
+                           const SDimensions3D dims, float fOutputScale)
+{
+	CONE_FP_SS_BODY(Y,X,Z)
+}
+
+// Supersampling forward projection kernel for angle groups traversed along
+// the Z axis. ConeFP_Array selects it instead of FP_dirZ when
+// dims.iRaysPerDetDim is not 1; launch configuration and parameters are the
+// same, and results are added to D_projData.
+__global__ void FP_SS_dirZ(float* D_projData, unsigned int projPitch,
+                           unsigned int startSlice,
+                           unsigned int startAngle, unsigned int endAngle,
+                           const SDimensions3D dims, float fOutputScale)
+{
+	CONE_FP_SS_BODY(Z,X,Y)
+}
+
+
+
+// Forward-project the volume held in D_volArray and add the result to
+// D_projData.
+//
+// D_volArray   : volume data; sampled through gT_coneVolumeTexture
+// D_projData   : pitched projection buffer, indexed by the kernels as
+//                (v * iProjAngles + angle) * pitch + u. The kernels
+//                accumulate with +=, so existing contents are kept.
+// dims         : volume and detector sizes, number of angles and
+//                iRaysPerDetDim (1 selects the non-supersampling kernels)
+// angles       : dims.iProjAngles projection geometries
+// fOutputScale : factor applied to every ray sum
+//
+// Returns false if dims.iProjAngles exceeds g_MaxAngles, if the volume
+// texture cannot be bound, or if copying the geometry to constant memory
+// fails; true otherwise.
+bool ConeFP_Array(cudaArray *D_volArray,
+                  cudaPitchedPtr D_projData,
+                  const SDimensions3D& dims, const SConeProjection* angles,
+                  float fOutputScale)
+{
+	// The per-angle tables in constant memory hold g_MaxAngles entries;
+	// refuse larger inputs instead of copying/reading past their end.
+	if (dims.iProjAngles > g_MaxAngles)
+		return false;
+
+	if (!bindVolumeDataTexture(D_volArray))
+		return false;
+
+	// transfer angles to constant memory
+	float* tmp = new float[dims.iProjAngles];
+	bool ok = true;
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; if (cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice) != cudaSuccess) ok = false; } while (0)
+
+	TRANSFER_TO_CONSTANT(SrcX);
+	TRANSFER_TO_CONSTANT(SrcY);
+	TRANSFER_TO_CONSTANT(SrcZ);
+	TRANSFER_TO_CONSTANT(DetSX);
+	TRANSFER_TO_CONSTANT(DetSY);
+	TRANSFER_TO_CONSTANT(DetSZ);
+	TRANSFER_TO_CONSTANT(DetUX);
+	TRANSFER_TO_CONSTANT(DetUY);
+	TRANSFER_TO_CONSTANT(DetUZ);
+	TRANSFER_TO_CONSTANT(DetVX);
+	TRANSFER_TO_CONSTANT(DetVY);
+	TRANSFER_TO_CONSTANT(DetVZ);
+
+#undef TRANSFER_TO_CONSTANT
+
+	delete[] tmp;
+
+	// Without valid geometry in constant memory the kernels would compute
+	// garbage, so stop here.
+	if (!ok)
+		return false;
+
+	std::list<cudaStream_t> streams;
+	dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles
+
+	// Run over all angles, grouping consecutive angles that share the same
+	// dominant ray direction (X, Y or Z) into blocks.
+	// Start a stream of grids for each such block.
+
+	unsigned int blockStart = 0;
+	unsigned int blockEnd = 0;
+	int blockDirection = 0;
+
+	// timeval t;
+	// tic(t);
+
+	// The loop runs one step past the last angle (a == iProjAngles) so that
+	// the final block is flushed as well.
+	for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+		// Default to the current direction so that dir holds a defined
+		// value on the final flush iteration, where no angle is classified.
+		int dir = blockDirection;
+		if (a != dims.iProjAngles) {
+			// Absolute components of the vector from the source to the
+			// centre of the detector.
+			float dX = fabsf(angles[a].fSrcX - (angles[a].fDetSX + dims.iProjU*angles[a].fDetUX*0.5f + dims.iProjV*angles[a].fDetVX*0.5f));
+			float dY = fabsf(angles[a].fSrcY - (angles[a].fDetSY + dims.iProjU*angles[a].fDetUY*0.5f + dims.iProjV*angles[a].fDetVY*0.5f));
+			float dZ = fabsf(angles[a].fSrcZ - (angles[a].fDetSZ + dims.iProjU*angles[a].fDetUZ*0.5f + dims.iProjV*angles[a].fDetVZ*0.5f));
+
+			// Traverse the volume along the axis with the largest component.
+			if (dX >= dY && dX >= dZ)
+				dir = 0;
+			else if (dY >= dX && dY >= dZ)
+				dir = 1;
+			else
+				dir = 2;
+		}
+
+		if (a == dims.iProjAngles || dir != blockDirection) {
+			// block done
+
+			blockEnd = a;
+			if (blockStart != blockEnd) {
+
+				// grid.x: all (u-tile, v-tile) combinations; grid.y: angle blocks
+				dim3 dimGrid(
+				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+				             (blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
+				// TODO: check if we can't immediately
+				//       destroy the stream after use
+				cudaStream_t stream;
+				cudaStreamCreate(&stream);
+				streams.push_back(stream);
+
+				// printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y);
+
+				// One launch per slab of g_blockSlices slices. The launches of
+				// one angle block share a stream, so their += updates to the
+				// same projection pixels are issued in order.
+				if (blockDirection == 0) {
+					for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) {
+						if (dims.iRaysPerDetDim == 1)
+							FP_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+						else
+							FP_SS_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+					}
+				} else if (blockDirection == 1) {
+					for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) {
+						if (dims.iRaysPerDetDim == 1)
+							FP_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+						else
+							FP_SS_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+					}
+				} else if (blockDirection == 2) {
+					for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) {
+						if (dims.iRaysPerDetDim == 1)
+							FP_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+						else
+							FP_SS_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+					}
+				}
+
+			}
+
+			blockDirection = dir;
+			blockStart = a;
+		}
+	}
+
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
+		cudaStreamDestroy(*iter);
+
+	streams.clear();
+
+	// Project helper declared outside this file.
+	// NOTE(review): presumably blocks until all queued kernels have
+	// finished -- confirm against its definition.
+	cudaTextForceKernelsCompletion();
+
+	// printf("%f\n", toc(t));
+
+	return true;
+}
+
+// Convenience wrapper around ConeFP_Array for a volume held in pitched
+// device memory: obtains a cudaArray from allocateVolumeArray, transfers
+// the volume into it, runs ConeFP_Array and frees the array again.
+//
+// Returns false if no array could be allocated, otherwise the result of
+// ConeFP_Array.
+bool ConeFP(cudaPitchedPtr D_volumeData,
+            cudaPitchedPtr D_projData,
+            const SDimensions3D& dims, const SConeProjection* angles,
+            float fOutputScale)
+{
+	// transfer volume to array
+
+	cudaArray* cuArray = allocateVolumeArray(dims);
+	// Do not hand a null array to the transfer and texture binding steps.
+	if (!cuArray)
+		return false;
+
+	transferVolumeToArray(D_volumeData, cuArray, dims);
+
+	bool ret = ConeFP_Array(cuArray, D_projData, dims, angles, fOutputScale);
+
+	cudaFreeArray(cuArray);
+
+	return ret;
+}
+
+
+}
+
+#ifdef STANDALONE
+// Standalone test driver (only built with STANDALONE defined): fills a
+// 256^3 volume with ones, forward-projects it onto a 512x512 detector for
+// 32 angles one degree apart, and saves part of the result as proj.png.
+int main()
+{
+	SDimensions3D dims;
+	dims.iVolX = 256;
+	dims.iVolY = 256;
+	dims.iVolZ = 256;
+	dims.iProjAngles = 32;
+	dims.iProjU = 512;
+	dims.iProjV = 512;
+	// ConeFP_Array chooses between the plain and the supersampling kernels
+	// based on iRaysPerDetDim, so that is the field that has to be set.
+	dims.iRaysPerDetDim = 1;
+
+	cudaExtent extentV;
+	extentV.width = dims.iVolX*sizeof(float);
+	extentV.height = dims.iVolY;
+	extentV.depth = dims.iVolZ;
+
+	cudaPitchedPtr volData; // pitch, ptr, xsize, ysize
+
+	cudaMalloc3D(&volData, extentV);
+
+	cudaExtent extentP;
+	extentP.width = dims.iProjU*sizeof(float);
+	extentP.height = dims.iProjV;
+	extentP.depth = dims.iProjAngles;
+
+	cudaPitchedPtr projData; // pitch, ptr, xsize, ysize
+
+	cudaMalloc3D(&projData, extentP);
+	// the projector accumulates into projData, so start from zero
+	cudaMemset3D(projData, 0, extentP);
+
+	// host-side slice of ones, uploaded once per z-slice of the volume
+	float* slice = new float[256*256];
+	cudaPitchedPtr ptr;
+	ptr.ptr = slice;
+	ptr.pitch = 256*sizeof(float);
+	ptr.xsize = 256*sizeof(float);
+	ptr.ysize = 256;
+
+	for (unsigned int i = 0; i < 256*256; ++i)
+		slice[i] = 1.0f;
+	for (unsigned int i = 0; i < 256; ++i) {
+		cudaExtent extentS;
+		extentS.width = dims.iVolX*sizeof(float);
+		extentS.height = dims.iVolY;
+		extentS.depth = 1;
+		cudaPos sp = { 0, 0, 0 };
+		cudaPos dp = { 0, 0, i };
+		// zero-initialize so that fields not set below are well defined
+		cudaMemcpy3DParms p = { 0 };
+		p.srcArray = 0;
+		p.srcPos = sp;
+		p.srcPtr = ptr;
+		p.dstArray = 0;
+		p.dstPos = dp;
+		p.dstPtr = volData;
+		p.extent = extentS;
+		p.kind = cudaMemcpyHostToDevice;
+		cudaError err = cudaMemcpy3D(&p);
+		assert(!err);
+	}
+
+	delete[] slice;
+
+
+	SConeProjection angle[32];
+	angle[0].fSrcX = -1536;
+	angle[0].fSrcY = 0;
+	angle[0].fSrcZ = 200;
+
+	angle[0].fDetSX = 512;
+	angle[0].fDetSY = -256;
+	angle[0].fDetSZ = -256;
+
+	angle[0].fDetUX = 0;
+	angle[0].fDetUY = 1;
+	angle[0].fDetUZ = 0;
+
+	angle[0].fDetVX = 0;
+	angle[0].fDetVY = 0;
+	angle[0].fDetVZ = 1;
+
+	// Rotate the X/Y components of angle[0]'s vector `name` by alpha about
+	// the Z axis and store them in angle[i]; Z is left as copied.
+#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0)
+	for (int i = 1; i < 32; ++i) {
+		angle[i] = angle[0];
+		ROTATE0(Src, i, i*1*M_PI/180);
+		ROTATE0(DetS, i, i*1*M_PI/180);
+		ROTATE0(DetU, i, i*1*M_PI/180);
+		ROTATE0(DetV, i, i*1*M_PI/180);
+	}
+#undef ROTATE0
+
+	astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f);
+
+	float* buf = new float[512*512];
+
+	// NOTE(review): this offset assumes projData.pitch == 512*sizeof(float)
+	// and does not follow the (v, angle, u) layout the kernels write, so the
+	// saved image is a run of consecutive buffer rows rather than a single
+	// projection -- confirm that this is intended.
+	cudaMemcpy(buf, ((float*)projData.ptr)+512*512*8, 512*512*sizeof(float), cudaMemcpyDeviceToHost);
+
+	// pitch/xsize/ysize are size_t; cast to match the format specifiers
+	printf("%lu %lu %lu\n", (unsigned long)projData.pitch, (unsigned long)projData.xsize, (unsigned long)projData.ysize);
+
+	saveImage("proj.png", 512, 512, buf);
+
+	delete[] buf;
+	cudaFree(projData.ptr);
+	cudaFree(volData.ptr);
+
+	return 0;
+}
+#endif
diff --git a/cuda/3d/cone_fp.h b/cuda/3d/cone_fp.h
new file mode 100644
index 0000000..2a0463b
--- /dev/null
+++ b/cuda/3d/cone_fp.h
@@ -0,0 +1,46 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_CONE_FP_H
+#define _CUDA_CONE_FP_H
+
+namespace astraCUDA3d {
+
+// Cone-beam forward projection of a volume that is already stored in a
+// cudaArray. The result is accumulated into the pitched buffer D_projData.
+//   dims         : volume/detector dimensions and number of projection angles
+//   angles       : array of dims.iProjAngles projection geometries
+//   fOutputScale : factor applied to every computed ray sum
+// Returns a bool status.
+_AstraExport bool ConeFP_Array(cudaArray *D_volArray,
+                  cudaPitchedPtr D_projData,
+                  const SDimensions3D& dims, const SConeProjection* angles,
+                  float fOutputScale);
+
+// Same as ConeFP_Array, but takes the volume as pitched device memory; the
+// implementation transfers it into a temporary cudaArray, calls
+// ConeFP_Array and returns its result.
+_AstraExport bool ConeFP(cudaPitchedPtr D_volumeData,
+            cudaPitchedPtr D_projData,
+            const SDimensions3D& dims, const SConeProjection* angles,
+            float fOutputScale);
+
+}
+
+#endif
diff --git a/cuda/3d/darthelper3d.cu b/cuda/3d/darthelper3d.cu
new file mode 100644
index 0000000..68330a1
--- /dev/null
+++ b/cuda/3d/darthelper3d.cu
@@ -0,0 +1,229 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include "util3d.h"
+#include "dims3d.h"
+#include "darthelper3d.h"
+#include <cassert>
+
+namespace astraCUDA3d {
+
+	// -------------------------------------------------------------------------------------------------------------------------------------------------------------------
+	// DART smoothing kernel. Each thread owns one interior (x,y) position and walks all z slices, writing out = (1-b)*in + b*(sum of the neighbouring voxels)/26.
+	__global__ void devDartSmoothing(cudaPitchedPtr out, cudaPitchedPtr in, float b, SDimensions3D dims)
+	{
+		unsigned int x = threadIdx.x + blockDim.x*blockIdx.x;
+		unsigned int y = threadIdx.y + blockDim.y*blockIdx.y;
+		// The bounds are written as x+1 < iVolX so that a zero-sized dimension cannot wrap the unsigned comparison.
+		// Sacrifice the border pixels to simplify the implementation. 
+		if (x > 0 && x + 1 < dims.iVolX && y > 0 && y + 1 < dims.iVolY) {
+			// NOTE(review): one row pitch (taken from out) indexes both buffers; assumes in and out share the same pitch -- confirm.
+			const float* d = (const float*)in.ptr;
+			float* m = (float*)out.ptr;
+
+			unsigned int index;
+			unsigned int p = (out.pitch >> 2);
+
+			for (unsigned int z = 0; z < dims.iVolZ; z++) {
+
+				float res = 0.0f;
+
+				// bottom slice
+				if (z > 0) {
+					index = ((z-1)*dims.iVolY + y) * p + x;
+					res += d[index-p-1] + d[index-p] + d[index-p+1] +
+						d[index  -1] + d[index  ] + d[index  +1] +
+						d[index+p-1] + d[index+p] + d[index+p+1];
+				}
+
+				// top slice
+				if (z + 1 < dims.iVolZ) {
+					index = ((z+1)*dims.iVolY + y) * p + x;
+					res += d[index-p-1] + d[index-p] + d[index-p+1] +
+						d[index  -1] + d[index  ] + d[index  +1] +
+						d[index+p-1] + d[index+p] + d[index+p+1];
+				}
+	
+				// same slice
+				index = (z*dims.iVolY + y) * p + x;
+				res += d[index-p-1] + d[index-p] + d[index-p+1] +
+					d[index  -1] +              d[index  +1] +
+					d[index+p-1] + d[index+p] + d[index+p+1];
+				// NOTE(review): 0.038461538f is 1/26 and is applied on the first and last slice too, where fewer than 26 neighbours were summed.
+				// result
+				m[index] = (1.0f-b) * d[index] + b * 0.038461538f * res;
+
+			}
+
+		}
+	}
+	// -------------------------------------------------------------------------------------------------------------------------------------------------------------------
+	// Host wrapper: uploads `in` and `out`, runs devDartSmoothing with 16x16 blocks and downloads the result into `out`. `radius` is not used.
+	void dartSmoothing(float* out, const float* in, float b, unsigned int radius, SDimensions3D dims)
+	{
+		// Device copies of the source volume and of the destination buffer.
+		cudaPitchedPtr D_inData = allocateVolumeData(dims);
+		cudaPitchedPtr D_outData = allocateVolumeData(dims);
+
+		copyVolumeToDevice(in, D_inData, dims);
+		// `out` is uploaded as well: the kernel skips border voxels, so those come back as they were passed in.
+		copyVolumeToDevice(out, D_outData, dims);
+
+		// One thread per (x,y) position; the grid is rounded up to whole 16x16 tiles.
+		const dim3 blockSize(16,16);
+		const dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+15)/16);
+		devDartSmoothing<<<gridSize, blockSize>>>(D_outData, D_inData, b, dims);
+
+		copyVolumeFromDevice(out, D_outData, dims);
+
+		cudaFree(D_outData.ptr);
+		cudaFree(D_inData.ptr);
+
+	}
+
+
+	// -------------------------------------------------------------------------------------------------------------------------------------------------------------------
+	// CUDA function for the masking of DART with a radius == 1
+	__global__ void devDartMasking(cudaPitchedPtr mask, cudaPitchedPtr in, unsigned int conn, SDimensions3D dims)
+	{
+		unsigned int x = threadIdx.x + blockDim.x*blockIdx.x;
+		unsigned int y = threadIdx.y + blockDim.y*blockIdx.y;
+		// One thread per interior (x,y); bounds are written as x+1 < iVolX so a zero-sized dimension cannot wrap the unsigned comparison.
+		// Sacrifice the border pixels to simplify the implementation. 
+		if (x > 0 && x + 1 < dims.iVolX && y > 0 && y + 1 < dims.iVolY) {
+			// NOTE(review): one row pitch (taken from in) indexes both buffers; assumes mask and in share the same pitch -- confirm.
+			const float* d = (const float*)in.ptr;
+			float* m = (float*)mask.ptr;
+
+			unsigned int index;
+			unsigned int p = (in.pitch >> 2);
+
+			for (unsigned int z = 0; z < dims.iVolZ; z++) {
+				// o2: offset of the voxel being classified; index: offset of the centre of the slice being compared.
+				unsigned int o2 = (z*dims.iVolY + y) * p + x;
+				
+				m[o2] = 0.0f;
+				// The mask becomes 1 as soon as one neighbour differs; for conn values other than 6 and 26 it stays 0.
+				// bottom slice
+				if (z > 0) {
+					index = ((z-1)*dims.iVolY + y) * p + x;
+					if ((conn == 26 && 
+						(d[index-p-1] != d[o2] || d[index-p] != d[o2] || d[index-p+1] != d[o2] || 
+						 d[index  -1] != d[o2] || d[index  ] != d[o2] || d[index  +1] != d[o2] || 
+						 d[index+p-1] != d[o2] || d[index+p] != d[o2] || d[index+p+1] != d[o2] ))
+						|| 
+						(conn == 6 && d[index] != d[o2]))
+					{
+						m[o2] = 1.0f;
+						continue;
+					}
+				}
+
+				// top slice
+				if (z + 1 < dims.iVolZ) {
+					index = ((z+1)*dims.iVolY + y) * p + x;
+					if ((conn == 26 && 
+						(d[index-p-1] != d[o2] || d[index-p] != d[o2] || d[index-p+1] != d[o2] || 
+						 d[index  -1] != d[o2] || d[index  ] != d[o2] || d[index  +1] != d[o2] || 
+						 d[index+p-1] != d[o2] || d[index+p] != d[o2] || d[index+p+1] != d[o2] ))
+						|| 
+						(conn == 6 && d[index] != d[o2]))
+					{
+						m[o2] = 1.0f;
+						continue;
+					}
+				}
+
+				// same slice
+				index = (z*dims.iVolY + y) * p + x;
+				if ((conn == 26 && 
+					(d[index-p-1] != d[o2] || d[index-p] != d[o2] || d[index-p+1] != d[o2] || 
+					 d[index  -1] != d[o2] ||                        d[index  +1] != d[o2] || 
+					 d[index+p-1] != d[o2] || d[index+p] != d[o2] || d[index+p+1] != d[o2] ))
+					|| 
+					(conn == 6 && 
+					(                         d[index-p] != d[o2] || 
+					 d[index  -1] != d[o2] ||                        d[index  +1] != d[o2] || 
+					                          d[index+p] != d[o2]                          )))
+				{
+					m[o2] = 1.0f;
+					continue;
+				}
+
+			}
+
+		}
+	}
+
+
+	// -------------------------------------------------------------------------------------------------------------------------------------------------------------------
+	// Host wrapper: uploads mask and segmentation, runs the masking kernel and downloads the mask. Only threshold == 1 with radius == 1 launches a kernel; otherwise the uploaded mask is downloaded again as is.
+	void dartMasking(float* mask, const float* segmentation, unsigned int conn, unsigned int radius, unsigned int threshold, SDimensions3D dims)
+	{
+		// Device copies of both volumes (allocated first, then filled).
+		cudaPitchedPtr D_maskData = allocateVolumeData(dims);
+		cudaPitchedPtr D_segmentationData = allocateVolumeData(dims);
+
+		copyVolumeToDevice(mask, D_maskData, dims);
+		copyVolumeToDevice(segmentation, D_segmentationData, dims);
+
+		// One thread per (x,y) position, rounded up to whole 16x16 tiles.
+		const dim3 blockSize(16,16);
+		const dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+15)/16);
+
+		if (threshold == 1 && radius == 1) {
+			devDartMasking<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, dims);
+		}
+		//else if (threshold > 1 && radius == 1)
+		//	devADartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height, 1, 1);
+		//else if (threshold == 1 && radius > 1)
+		//	devDartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height, 1, 1);
+		//else 
+		//	devADartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height, 1, 1);
+
+		copyVolumeFromDevice(mask, D_maskData, dims);
+
+		cudaFree(D_maskData.ptr);
+		cudaFree(D_segmentationData.ptr);
+	}
+	// -------------------------------------------------------------------------------------------------------------------------------------------------------------------
+
+	bool setGPUIndex(int iGPUIndex)
+	{
+		// Select the device, then read (and clear) the runtime's last-error state.
+		cudaSetDevice(iGPUIndex);
+		const cudaError_t status = cudaGetLastError();
+
+		// Calling cudaSetDevice more than once may report
+		// cudaErrorSetOnActiveProcess; that case is treated as success.
+		const bool bFailed = (status != cudaSuccess && status != cudaErrorSetOnActiveProcess);
+		return !bFailed;
+	}
+
+
+}
diff --git a/cuda/3d/darthelper3d.h b/cuda/3d/darthelper3d.h
new file mode 100644
index 0000000..7899629
--- /dev/null
+++ b/cuda/3d/darthelper3d.h
@@ -0,0 +1,46 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_DARTHELPER3_H
+#define _CUDA_DARTHELPER3_H
+
+#include <cuda.h>
+#include <driver_types.h>
+#include "util3d.h"
+#include "algo3d.h"
+
+namespace astraCUDA3d {
+	// DART helpers working on host buffers; both take the volume size in dims. In darthelper3d.cu the radius argument of dartSmoothing is unused and dartMasking only acts for radius == 1, threshold == 1.
+	void dartSmoothing(float* out, const float* in, float b, unsigned int radius, SDimensions3D dims);
+	void dartMasking(float* out, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, SDimensions3D dims);
+	// Selects the CUDA device with the given index; returns false if an error other than cudaErrorSetOnActiveProcess is reported.
+	bool setGPUIndex(int index);
+
+}
+
+#endif
diff --git a/cuda/3d/dims3d.h b/cuda/3d/dims3d.h
new file mode 100644
index 0000000..ec3c4a3
--- /dev/null
+++ b/cuda/3d/dims3d.h
@@ -0,0 +1,84 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_CONE_DIMS_H
+#define _CUDA_CONE_DIMS_H
+
+namespace astra {
+// Geometry of one cone-beam projection: source position, detector origin and the two detector pixel edge vectors.
+struct SConeProjection {
+	// the source
+	double fSrcX, fSrcY, fSrcZ;
+
+	// the origin ("bottom left") of the (flat-panel) detector
+	double fDetSX, fDetSY, fDetSZ;
+
+	// the U-edge of a detector pixel
+	double fDetUX, fDetUY, fDetUZ;
+
+	// the V-edge of a detector pixel
+	double fDetVX, fDetVY, fDetVZ;
+};
+// Geometry of one parallel-beam 3D projection: the same detector description, with a ray direction in place of a source position.
+struct SPar3DProjection {
+	// the ray direction
+	double fRayX, fRayY, fRayZ;
+
+	// the origin ("bottom left") of the (flat-panel) detector
+	double fDetSX, fDetSY, fDetSZ;
+
+	// the U-edge of a detector pixel
+	double fDetUX, fDetUY, fDetUZ;
+
+	// the V-edge of a detector pixel
+	double fDetVX, fDetVY, fDetVZ;
+};
+
+}
+
+
+namespace astraCUDA3d {
+
+using astra::SConeProjection;
+using astra::SPar3DProjection;
+// Sizes of the reconstruction volume and of the projection data; handed to kernels by value and to host functions by reference.
+struct SDimensions3D {
+	unsigned int iVolX; // number of voxels along x
+	unsigned int iVolY; // number of voxels along y
+	unsigned int iVolZ; // number of voxels (slices) along z
+	unsigned int iProjAngles; // number of projection angles
+	unsigned int iProjU; // number of detectors in the U direction
+	unsigned int iProjV; // number of detectors in the V direction
+	unsigned int iRaysPerDetDim; // NOTE(review): presumably rays per detector pixel per dimension -- confirm
+	unsigned int iRaysPerVoxelDim; // NOTE(review): presumably rays per voxel per dimension -- confirm
+};
+
+}
+
+#endif
+
diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu
new file mode 100644
index 0000000..ad0604c
--- /dev/null
+++ b/cuda/3d/fdk.cu
@@ -0,0 +1,646 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+#include "util3d.h"
+
+#ifdef STANDALONE
+#include "cone_fp.h"
+#include "testutil.h"
+#endif
+
+#include "dims3d.h"
+#include "../2d/fft.h"
+
+typedef texture<float, 3, cudaReadModeElementType> texture3D;
+
+static texture3D gT_coneProjTexture;
+
+namespace astraCUDA3d {
+
+static const unsigned int g_volBlockZ = 16;
+// Back-projection launch geometry: angles handled per devBP_FDK launch and the x/y thread-block size.
+static const unsigned int g_anglesPerBlock = 64;
+static const unsigned int g_volBlockX = 32;
+static const unsigned int g_volBlockY = 16;
+// Launch geometry of the weighting kernels (devFDK_preweight / devFDK_ParkerWeight).
+static const unsigned int g_anglesPerWeightBlock = 16;
+static const unsigned int g_detBlockU = 32;
+static const unsigned int g_detBlockV = 32;
+// Capacity of the per-angle constant-memory tables declared below.
+static const unsigned g_MaxAngles = 2048;
+
+__constant__ float gC_angle_sin[g_MaxAngles];
+__constant__ float gC_angle_cos[g_MaxAngles];
+__constant__ float gC_angle[g_MaxAngles];
+
+
+// per-detector u/v shifts?
+
+static bool bindProjDataTexture(const cudaArray* array)
+{
+	// Binds `array` to the file-level 3D texture gT_coneProjTexture; returns false if the bind call reports an error.
+	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+	// Clamped, unnormalized coordinates with linear filtering.
+	gT_coneProjTexture.addressMode[0] = cudaAddressModeClamp;
+	gT_coneProjTexture.addressMode[1] = cudaAddressModeClamp;
+	gT_coneProjTexture.addressMode[2] = cudaAddressModeClamp;
+	gT_coneProjTexture.filterMode = cudaFilterModeLinear;
+	gT_coneProjTexture.normalized = false;
+
+	cudaError_t err = cudaBindTextureToArray(gT_coneProjTexture, array, channelDesc);
+
+	// Report bind failures to the caller instead of unconditionally returning true.
+	return err == cudaSuccess;
+}
+
+
+__global__ void devBP_FDK(void* D_volData, unsigned int volPitch, int startAngle, float fSrcOrigin, float fDetOrigin, float fSrcZ, float fDetZ, float fInvDetUSize, float fInvDetVSize, const SDimensions3D dims)
+{
+	// FDK back-projection kernel: each thread adds, for one (X,Y) column and up to g_volBlockZ slices, the texture samples of angles [startAngle, startAngle+g_anglesPerBlock) to the volume.
+	float* volData = (float*)D_volData;
+	int endAngle = startAngle + g_anglesPerBlock;
+	if (endAngle > dims.iProjAngles)
+		endAngle = dims.iProjAngles;
+
+	// threadIdx: x = rel x
+	//            y = rel y
+
+	// blockIdx:  x = x + y
+    //            y = z
+	// The x/y tile index is flattened into blockIdx.x; tiles per row = ceil(iVolX / g_volBlockX).
+
+	// TO TRY: precompute part of detector intersection formulas in shared mem?
+	// TO TRY: inner loop over z, gather ray values in shared mem
+
+	const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x;
+	const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y;
+	// Threads of partial tiles fall outside the volume and must not write; valid indices are 0..iVol-1, hence >=.
+	if (X >= dims.iVolX)
+		return;
+	if (Y >= dims.iVolY)
+		return;
+
+	const int startZ = blockIdx.y * g_volBlockZ;
+	int endZ = startZ + g_volBlockZ;
+	if (endZ > dims.iVolZ)
+		endZ = dims.iVolZ;
+
+	float fX = X - 0.5f*dims.iVolX + 0.5f;
+	float fY = Y - 0.5f*dims.iVolY + 0.5f;
+	float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - fSrcZ;
+	// NOTE(review): the extra 1.5f presumably covers the texel-centre offset plus padding in the projection array -- confirm.
+	const float fU_base = 0.5f*dims.iProjU - 0.5f + 1.5f;
+	const float fV_base = 0.5f*dims.iProjV - 0.5f + 1.5f + (fDetZ-fSrcZ);
+
+	// Note re. fZ/fV_base: the computations below are all relative to the
+	// optical axis, so we do the Z-adjustments beforehand.
+
+	for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f)
+	{
+
+		float fVal = 0.0f;
+		float fAngle = startAngle + 0.5f;
+		// fAngle is the texture coordinate of the current angle (angle index + 0.5).
+		for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f)
+		{
+
+			const float cos_theta = gC_angle_cos[angle];
+			const float sin_theta = gC_angle_sin[angle];
+
+			const float fR = fSrcOrigin;
+			const float fD = fR - fX * sin_theta + fY * cos_theta;
+			float fWeight = fR / fD;
+			fWeight *= fWeight;
+			// NOTE(review): fWeight is computed but never applied to the sample accumulated below -- confirm whether the sample should be scaled by it.
+			const float fScaleFactor = (fR + fDetOrigin) / fD;
+			const float fU = fU_base + (fX*cos_theta+fY*sin_theta) * fScaleFactor * fInvDetUSize;
+			const float fV = fV_base + fZ * fScaleFactor * fInvDetVSize;
+
+			fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV);
+
+		}
+
+		volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal;
+//		projData[(angle*dims.iProjV+detectorV)*projPitch+detectorU] = 10.0f;
+//		if (threadIdx.x == 0 && threadIdx.y == 0) { printf("%d,%d,%d [%d / %d] -> %f\n", angle, detectorU, detectorV, (angle*dims.iProjV+detectorV)*projPitch+detectorU, projPitch, projData[(angle*dims.iProjV+detectorV)*projPitch+detectorU]); }
+	}
+
+}
+
+
+bool FDK_BP(cudaPitchedPtr D_volumeData,
+            cudaPitchedPtr D_projData,
+            float fSrcOrigin, float fDetOrigin,
+            float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize,
+            const SDimensions3D& dims, const float* angles)
+{
+	// Back-projects D_projData into D_volumeData: the projections are copied to a cudaArray and bound to
+	// the 3D texture, then devBP_FDK is launched once per batch of g_anglesPerBlock angles. Returns false
+	// when there are more angles than the constant-memory tables hold or when a setup step fails.
+	if (dims.iProjAngles > g_MaxAngles)
+		return false;
+
+	// transfer projections to array
+	cudaArray* cuArray = allocateProjectionArray(dims);
+	transferProjectionsToArray(D_projData, cuArray, dims);
+
+	bool ok = bindProjDataTexture(cuArray);
+	float* angle_sin = new float[dims.iProjAngles];
+	float* angle_cos = new float[dims.iProjAngles];
+	for (unsigned int i = 0; i < dims.iProjAngles; ++i) {
+		angle_sin[i] = sinf(angles[i]);
+		angle_cos[i] = cosf(angles[i]);
+	}
+	cudaError_t e1 = cudaMemcpyToSymbol(gC_angle_sin, angle_sin, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+	cudaError_t e2 = cudaMemcpyToSymbol(gC_angle_cos, angle_cos, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice);
+	// Checked at run time rather than with assert(), which compiles away under NDEBUG.
+	ok = ok && (e1 == cudaSuccess) && (e2 == cudaSuccess);
+
+	delete[] angle_sin;
+	delete[] angle_cos;
+
+	dim3 dimBlock(g_volBlockX, g_volBlockY);
+	dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ);
+
+	// timeval t;
+	// tic(t);
+
+	for (unsigned int i = 0; ok && i < dims.iProjAngles; i += g_anglesPerBlock) {
+		devBP_FDK<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, 1.0f / fDetUSize, 1.0f / fDetVSize, dims);
+	}
+
+	cudaTextForceKernelsCompletion();
+	cudaFreeArray(cuArray);
+
+	// printf("%f\n", toc(t));
+	return ok;
+}
+
+__global__ void devFDK_preweight(void* D_projData, unsigned int projPitch, unsigned int startAngle, unsigned int endAngle, float fSrcOrigin, float fDetOrigin, float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize, const SDimensions3D dims)
+{
+	// FDK pre-weighting: scales every sample by (central ray length)/(length of the ray to that detector pixel). Thread x = detector column, thread y = angle; each thread covers up to g_detBlockV detector rows.
+	float* projData = (float*)D_projData;
+	int angle = startAngle + blockIdx.y * g_anglesPerWeightBlock + threadIdx.y;
+	if (angle >= endAngle)
+		return;
+	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+	// Columns of a partial block lie past the end of the detector row and must not be touched.
+	if (detectorU >= dims.iProjU)
+		return;
+	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
+	int endDetectorV = startDetectorV + g_detBlockV;
+	if (endDetectorV > dims.iProjV)
+		endDetectorV = dims.iProjV;
+
+	// We need the length of the central ray and the length of the ray(s) to
+	// our detector pixel(s).
+	const float fCentralRayLength = fSrcOrigin + fDetOrigin;
+
+	const float fU = (detectorU - 0.5f*dims.iProjU + 0.5f) * fDetUSize;
+
+	const float fT = fCentralRayLength * fCentralRayLength + fU * fU;
+
+	float fV = (startDetectorV - 0.5f*dims.iProjV + 0.5f) * fDetVSize + fDetZ - fSrcZ;
+
+	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)
+	{
+		const float fRayLength = sqrtf(fT + fV * fV);
+		const float fWeight = fCentralRayLength / fRayLength;
+		projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] *= fWeight;
+		// fV is expressed in detector length units (see its initialisation), so one row is a step of fDetVSize.
+		fV += fDetVSize;
+	}
+}
+
+__global__ void devFDK_ParkerWeight(void* D_projData, unsigned int projPitch, unsigned int startAngle, unsigned int endAngle, float fSrcOrigin, float fDetOrigin, float fSrcZ, float fDetZ, float fDetUSize, float fCentralFanAngle, const SDimensions3D dims)
+{
+	// Short-scan (Parker) weighting: multiplies each sample by a weight derived from the projection angle (read from gC_angle) and the fan angle of its detector column. Same thread layout as devFDK_preweight.
+	float* projData = (float*)D_projData;
+	int angle = startAngle + blockIdx.y * g_anglesPerWeightBlock + threadIdx.y;
+	if (angle >= endAngle)
+		return;
+
+	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x;
+	if (detectorU >= dims.iProjU)	// partial block: this column lies past the end of the detector row
+		return;
+	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV;
+	int endDetectorV = startDetectorV + g_detBlockV;
+	if (endDetectorV > dims.iProjV)
+		endDetectorV = dims.iProjV;
+
+	// We need the length of the central ray and the length of the projection
+	// of our ray onto the central slice
+	const float fCentralRayLength = fSrcOrigin + fDetOrigin;
+
+	// fU is expressed in detector length units (scaled by fDetUSize).
+	const float fU = (detectorU - 0.5f*dims.iProjU + 0.5f) * fDetUSize;
+
+	//const float fGamma = atanf(fU / fCentralRayLength);
+	//const float fBeta = gC_angle[angle];
+	const float fGamma = atanf(fU / fCentralRayLength);
+	const float fBeta = -gC_angle[angle];
+
+	// compute the weight depending on the location in the central fan's radon
+	// space
+	float fWeight;
+
+	if (fBeta <= 0.0f) {
+		fWeight = 0.0f;
+	} else if (fBeta <= 2.0f*(fCentralFanAngle + fGamma)) {
+		fWeight = sinf((M_PI / 4.0f) * fBeta / (fCentralFanAngle + fGamma));
+		fWeight *= fWeight;
+	} else if (fBeta <= M_PI + 2*fGamma) {
+		fWeight = 1.0f;
+	} else if (fBeta <= M_PI + 2*fCentralFanAngle) {
+		fWeight = sinf((M_PI / 4.0f) * (M_PI + 2.0f*fCentralFanAngle - fBeta) / (fCentralFanAngle - fGamma));
+		fWeight *= fWeight;
+	} else {
+		fWeight = 0.0f;
+	}
+
+	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV)
+	{
+		projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] *= fWeight;
+	}
+}
+
+
+
+// Perform the FDK pre-weighting and filtering
+bool FDK_Filter(cudaPitchedPtr D_projData,
+                cufftComplex * D_filter,
+                float fSrcOrigin, float fDetOrigin,
+                float fSrcZ, float fDetZ,
+                float fDetUSize, float fDetVSize, bool bShortScan,
+                const SDimensions3D& dims, const float* angles)
+{
+	// The pre-weighting factor for a ray is the cosine of the angle between
+	// the central line and the ray.
+	// Short-scan weighting reads the angles from constant memory, which holds at most g_MaxAngles entries; reject larger inputs before D_projData is modified.
+	if (bShortScan && dims.iProjAngles > g_MaxAngles)
+		return false;
+
+	dim3 dimBlock(g_detBlockU, g_anglesPerWeightBlock);
+	dim3 dimGrid( ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+	              (dims.iProjAngles+g_anglesPerWeightBlock-1)/g_anglesPerWeightBlock);
+
+	int projPitch = D_projData.pitch/sizeof(float);
+
+	devFDK_preweight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, fDetUSize, fDetVSize, dims);
+
+	cudaTextForceKernelsCompletion();
+
+	if (bShortScan) {
+		// We do short-scan Parker weighting
+
+		cudaError_t e1 = cudaMemcpyToSymbol(gC_angle, angles,
+		                                    dims.iProjAngles*sizeof(float), 0,
+		                                    cudaMemcpyHostToDevice);
+		// Checked at run time rather than with assert(), which compiles away under NDEBUG.
+		if (e1 != cudaSuccess)
+			return false;
+
+		// Half fan angle subtended by the detector, in the same length units as fU in the kernel (hence the fDetUSize factor).
+		float fCentralFanAngle = atanf((dims.iProjU*0.5f*fDetUSize) /
+		                               (fSrcOrigin + fDetOrigin));
+
+		devFDK_ParkerWeight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, fDetUSize, fCentralFanAngle, dims);
+
+	}
+
+	cudaTextForceKernelsCompletion();
+
+	// The filtering is a regular ramp filter per detector line.
+
+	int iPaddedDetCount = calcNextPowerOfTwo(2 * dims.iProjU);
+	int iHalfFFTSize = calcFFTFourSize(iPaddedDetCount);
+
+	// We process one sinogram at a time.
+	float* D_sinoData = (float*)D_projData.ptr;
+
+	cufftComplex * D_sinoFFT = NULL;
+	allocateComplexOnDevice(dims.iProjAngles, iHalfFFTSize, &D_sinoFFT);
+
+	bool ok = true;
+
+	for (unsigned int v = 0; v < dims.iProjV; ++v) {
+		ok = runCudaFFT(dims.iProjAngles, D_sinoData, projPitch, 0,
+		                dims.iProjU, iPaddedDetCount, iHalfFFTSize,
+		                D_sinoFFT);
+
+		if (!ok) break;
+
+		applyFilter(dims.iProjAngles, iHalfFFTSize, D_sinoFFT, D_filter);
+
+		ok = runCudaIFFT(dims.iProjAngles, D_sinoFFT, D_sinoData, projPitch,
+		                 0, dims.iProjU, iPaddedDetCount, iHalfFFTSize);
+
+		if (!ok) break;
+
+		D_sinoData += (dims.iProjAngles * projPitch);
+	}
+
+	freeComplexOnDevice(D_sinoFFT);
+
+	return ok;
+}
+
+
+bool FDK(cudaPitchedPtr D_volumeData,
+         cudaPitchedPtr D_projData,
+         float fSrcOrigin, float fDetOrigin,
+         float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize,
+         const SDimensions3D& dims, const float* angles, bool bShortScan)
+{
+	// FDK reconstruction driver: builds a FILTER_RAMLAK filter, runs FDK_Filter
+	// on the projections, then FDK_BP into D_volumeData. Returns false if
+	// either of those two steps reports failure.
+	// TODO: Check errors
+	const int iPaddedDetCount = calcNextPowerOfTwo(2 * dims.iProjU);
+	const int iHalfFFTSize = calcFFTFourSize(iPaddedDetCount);
+
+	// Generate filter on the host (the buffer is zeroed before genFilter is called)
+	cufftComplex *pHostFilter = new cufftComplex[dims.iProjAngles * iHalfFFTSize];
+	memset(pHostFilter, 0, sizeof(cufftComplex) * dims.iProjAngles * iHalfFFTSize);
+
+	genFilter(FILTER_RAMLAK, 1.0f, dims.iProjAngles, pHostFilter, iPaddedDetCount, iHalfFFTSize);
+
+	// Copy it to the device; the host buffer is released right afterwards
+	cufftComplex * D_filter;
+	allocateComplexOnDevice(dims.iProjAngles, iHalfFFTSize, &D_filter);
+	uploadComplexArrayToDevice(dims.iProjAngles, iHalfFFTSize, pHostFilter, D_filter);
+
+	delete [] pHostFilter;
+
+
+	// Perform filtering
+
+	const bool bFilterOk = FDK_Filter(D_projData, D_filter, fSrcOrigin, fDetOrigin,
+	                                  fSrcZ, fDetZ, fDetUSize, fDetVSize,
+	                                  bShortScan, dims, angles);
+
+	// Clean up filter
+	freeComplexOnDevice(D_filter);
+
+	// Filtering failed: skip the back-projection
+	if (!bFilterOk)
+		return false;
+
+	// Perform BP
+
+	const bool bBPOk = FDK_BP(D_volumeData, D_projData, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ,
+	                          fDetUSize, fDetVSize, dims, angles);
+
+	return bBPOk;
+}
+
+
+}
+
+#ifdef STANDALONE
+void dumpVolume(const char* filespec, const cudaPitchedPtr& data, const SDimensions3D& dims, float fMin, float fMax)
+{
+	// Debug helper: copies each z slice of `data` to the host and saves it as an image; `filespec` is a printf pattern that receives the slice number.
+	float* buf = new float[dims.iVolX*dims.iVolY];
+	unsigned int pitch = data.pitch / sizeof(float);
+	for (unsigned int i = 0; i < dims.iVolZ; ++i) {
+		cudaMemcpy2D(buf, dims.iVolX*sizeof(float), ((float*)data.ptr)+pitch*dims.iVolY*i, data.pitch, dims.iVolX*sizeof(float), dims.iVolY, cudaMemcpyDeviceToHost);
+		char fname[512];
+		snprintf(fname, sizeof(fname), filespec, dims.iVolZ-i-1);	// files are numbered in reverse slice order; bounded to the buffer size
+		saveImage(fname, dims.iVolY, dims.iVolX, buf, fMin, fMax);
+	}
+	delete[] buf;	// release the host staging buffer (it used to be leaked)
+}
+
+void dumpSinograms(const char* filespec, const cudaPitchedPtr& data, const SDimensions3D& dims, float fMin, float fMax)
+{
+	// Debug helper: saves one image (iProjAngles rows of iProjU samples) per detector row v; `filespec` is a printf pattern that receives v.
+	float* bufs = new float[dims.iProjAngles*dims.iProjU];
+	unsigned int pitch = data.pitch / sizeof(float);
+	for (unsigned int i = 0; i < dims.iProjV; ++i) {
+		cudaMemcpy2D(bufs, dims.iProjU*sizeof(float), ((float*)data.ptr)+pitch*dims.iProjAngles*i, data.pitch, dims.iProjU*sizeof(float), dims.iProjAngles, cudaMemcpyDeviceToHost);
+		char fname[512];
+		snprintf(fname, sizeof(fname), filespec, i);	// bounded to the buffer size
+		saveImage(fname, dims.iProjAngles, dims.iProjU, bufs, fMin, fMax);
+	}
+	delete[] bufs;	// release the host staging buffer (it used to be leaked)
+}
+
+void dumpProjections(const char* filespec, const cudaPitchedPtr& data, const SDimensions3D& dims, float fMin, float fMax)
+{
+	// Debug helper: saves one image (iProjV rows of iProjU samples) per projection angle, gathering the rows one at a time from the (v, angle, u)-ordered device buffer.
+	float* bufp = new float[dims.iProjV*dims.iProjU];
+	unsigned int pitch = data.pitch / sizeof(float);
+	for (unsigned int i = 0; i < dims.iProjAngles; ++i) {
+		for (unsigned int j = 0; j < dims.iProjV; ++j) {
+			cudaMemcpy(bufp+dims.iProjU*j, ((float*)data.ptr)+pitch*dims.iProjAngles*j+pitch*i, dims.iProjU*sizeof(float), cudaMemcpyDeviceToHost);
+		}
+		char fname[512];
+		snprintf(fname, sizeof(fname), filespec, i);	// bounded to the buffer size
+		saveImage(fname, dims.iProjV, dims.iProjU, bufp, fMin, fMax);
+	}
+	delete[] bufp;	// release the host staging buffer (it used to be leaked)
+}
+
+
+
+// Standalone test driver (compiled only with STANDALONE defined). The #if 0 branch is a disabled synthetic cone-beam test; the active branch loads projection images, runs FDK and writes the volume slices.
+int main()
+{
+#if 0
+	SDimensions3D dims;
+	dims.iVolX = 512;
+	dims.iVolY = 512;
+	dims.iVolZ = 512;
+	dims.iProjAngles = 180;
+	dims.iProjU = 1024;
+	dims.iProjV = 1024;
+	dims.iRaysPerDet = 1;
+
+	cudaExtent extentV;
+	extentV.width = dims.iVolX*sizeof(float);
+	extentV.height = dims.iVolY;
+	extentV.depth = dims.iVolZ;
+
+	cudaPitchedPtr volData; // pitch, ptr, xsize, ysize
+
+	cudaMalloc3D(&volData, extentV);
+
+	cudaExtent extentP;
+	extentP.width = dims.iProjU*sizeof(float);
+	extentP.height = dims.iProjAngles;
+	extentP.depth = dims.iProjV;
+
+	cudaPitchedPtr projData; // pitch, ptr, xsize, ysize
+
+	cudaMalloc3D(&projData, extentP);
+	cudaMemset3D(projData, 0, extentP);
+
+#if 0
+	float* slice = new float[256*256];
+	cudaPitchedPtr ptr;
+	ptr.ptr = slice;
+	ptr.pitch = 256*sizeof(float);
+	ptr.xsize = 256*sizeof(float);
+	ptr.ysize = 256;
+
+	for (unsigned int i = 0; i < 256*256; ++i)
+		slice[i] = 1.0f;
+	for (unsigned int i = 0; i < 256; ++i) {
+		cudaExtent extentS;
+		extentS.width = dims.iVolX*sizeof(float);
+		extentS.height = dims.iVolY;
+		extentS.depth = 1;
+		cudaPos sp = { 0, 0, 0 };
+		cudaPos dp = { 0, 0, i };
+		cudaMemcpy3DParms p;
+		p.srcArray = 0;
+		p.srcPos = sp;
+		p.srcPtr = ptr;
+		p.dstArray = 0;
+		p.dstPos = dp;
+		p.dstPtr = volData;
+		p.extent = extentS;
+		p.kind = cudaMemcpyHostToDevice;
+		cudaMemcpy3D(&p);
+#if 0
+		if (i == 128) {
+			for (unsigned int j = 0; j < 256*256; ++j)
+				slice[j] = 0.0f;
+		}
+#endif 
+	}
+#endif
+
+	SConeProjection angle[180];
+	angle[0].fSrcX = -1536;
+	angle[0].fSrcY = 0;
+	angle[0].fSrcZ = 0;
+
+	angle[0].fDetSX = 1024;
+	angle[0].fDetSY = -512;
+	angle[0].fDetSZ = 512;
+
+	angle[0].fDetUX = 0;
+	angle[0].fDetUY = 1;
+	angle[0].fDetUZ = 0;
+
+	angle[0].fDetVX = 0;
+	angle[0].fDetVY = 0;
+	angle[0].fDetVZ = -1;
+
+#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0)
+	for (int i = 1; i < 180; ++i) {
+		angle[i] = angle[0];
+		ROTATE0(Src, i, i*2*M_PI/180);
+		ROTATE0(DetS, i, i*2*M_PI/180);
+		ROTATE0(DetU, i, i*2*M_PI/180);
+		ROTATE0(DetV, i, i*2*M_PI/180);
+	}
+#undef ROTATE0
+
+	astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f);
+
+	//dumpSinograms("sino%03d.png", projData, dims, 0, 512);
+	//dumpProjections("proj%03d.png", projData, dims, 0, 512);
+
+	astraCUDA3d::zeroVolumeData(volData, dims);
+
+	float* angles = new float[dims.iProjAngles];
+	for (int i = 0; i < 180; ++i)
+		angles[i] = i*2*M_PI/180;
+
+	astraCUDA3d::FDK(volData, projData, 1536, 512, 0, 0, dims, angles);
+
+	dumpVolume("vol%03d.png", volData, dims, -20, 100);
+
+
+#else
+
+	SDimensions3D dims;
+	dims.iVolX = 1000;
+	dims.iVolY = 999;
+	dims.iVolZ = 500;
+	dims.iProjAngles = 376;
+	dims.iProjU = 1024;
+	dims.iProjV = 524;
+	dims.iRaysPerDet = 1;
+	// NOTE(review): SDimensions3D in dims3d.h declares iRaysPerDetDim and iRaysPerVoxelDim, not iRaysPerDet -- the assignment above looks stale.
+	float* angles = new float[dims.iProjAngles];
+	for (int i = 0; i < dims.iProjAngles; ++i)
+		angles[i] = -i*(M_PI)/360;
+	// The angles above step by half a degree in the negative direction.
+	cudaPitchedPtr volData = astraCUDA3d::allocateVolumeData(dims);
+	cudaPitchedPtr projData = astraCUDA3d::allocateProjectionData(dims);
+	astraCUDA3d::zeroProjectionData(projData, dims);
+	astraCUDA3d::zeroVolumeData(volData, dims);
+	// Load every projection image and copy it row by row into the (v, angle, u)-ordered device buffer.
+	timeval t;
+	tic(t);
+
+	for (int i = 0; i < dims.iProjAngles; ++i) {
+		char fname[256];
+		sprintf(fname, "/home/wpalenst/tmp/Elke/proj%04d.png", i);
+		unsigned int w,h;
+		float* bufp = loadImage(fname, w,h);
+
+		int pitch = projData.pitch / sizeof(float);
+		for (int j = 0; j < dims.iProjV; ++j) {
+			cudaMemcpy(((float*)projData.ptr)+dims.iProjAngles*pitch*j+pitch*i, bufp+dims.iProjU*j, dims.iProjU*sizeof(float), cudaMemcpyHostToDevice);
+		}
+
+		delete[] bufp;
+	}
+	printf("Load time: %f\n", toc(t));
+
+	//dumpSinograms("sino%03d.png", projData, dims, -8.0f, 256.0f);
+	//astraCUDA3d::FDK(volData, projData, 7350, 62355, 0, 10, dims, angles);
+	//astraCUDA3d::FDK(volData, projData, 7350, -380, 0, 10, dims, angles);
+
+	tic(t);
+
+	astraCUDA3d::FDK(volData, projData, 7383.29867, 0, 0, 10, dims, angles);
+	// NOTE(review): fdk.h declares FDK with fDetUSize, fDetVSize and bShortScan as well; the call above passes fewer arguments and looks out of date.
+	printf("FDK time: %f\n", toc(t));
+	tic(t);
+
+	dumpVolume("vol%03d.png", volData, dims, -65.9f, 200.0f);
+	//dumpVolume("vol%03d.png", volData, dims, 0.0f, 256.0f);
+	printf("Save time: %f\n", toc(t));
+
+#endif
+
+
+}
+#endif
diff --git a/cuda/3d/fdk.h b/cuda/3d/fdk.h
new file mode 100644
index 0000000..5443b19
--- /dev/null
+++ b/cuda/3d/fdk.h
@@ -0,0 +1,43 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_FDK_H
+#define _CUDA_FDK_H
+
+namespace astraCUDA3d {
+
+// FDK reconstruction entry point. Only the declaration lives in this header;
+// the parameter notes below are read off the names and types and should be
+// confirmed against the definition in fdk.cu.
+//
+//   D_volumeData, D_projData : pitched buffers for the volume and the
+//                              projection data (the D_ prefix presumably
+//                              marks device memory)
+//   fSrcOrigin, fDetOrigin   : presumably the source-to-origin and
+//                              origin-to-detector distances -- TODO confirm
+//   fSrcZ, fDetZ             : presumably z offsets of source and detector
+//                              -- TODO confirm
+//   fDetUSize, fDetVSize     : presumably the detector pixel size along U
+//                              and V -- TODO confirm
+//   dims                     : volume / projection dimensions
+//   angles                   : array of projection angles; presumably
+//                              dims.iProjAngles entries -- TODO confirm
+//   bShortScan               : presumably enables short-scan handling
+//                              -- TODO confirm
+//
+// Returns a bool; presumably true on success -- TODO confirm.
+bool FDK(cudaPitchedPtr D_volumeData,
+         cudaPitchedPtr D_projData,
+         float fSrcOrigin, float fDetOrigin,
+         float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize,
+         const SDimensions3D& dims, const float* angles, bool bShortScan);
+
+
+}
+
+#endif
diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu
new file mode 100644
index 0000000..872b1eb
--- /dev/null
+++ b/cuda/3d/par3d_bp.cu
@@ -0,0 +1,464 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+#include "util3d.h"
+
+#ifdef STANDALONE
+#include "par3d_fp.h"
+#include "testutil.h"
+#endif
+
+#include "dims3d.h"
+
+// Texture reference type: 3D, float texels, read back unconverted.
+typedef texture<float, 3, cudaReadModeElementType> texture3D;
+
+// Texture through which the backprojection kernels sample the projection
+// data. It is configured and bound to a cudaArray in bindProjDataTexture().
+static texture3D gT_par3DProjTexture;
+
+namespace astraCUDA3d {
+
+// Number of z-slices covered by one value of blockIdx.y in the kernels below.
+static const unsigned int g_volBlockZ = 16;
+
+// Number of projection angles accumulated by a single kernel launch.
+static const unsigned int g_anglesPerBlock = 64;
+// Thread-block size in x and y (one thread per (x,y) voxel column).
+static const unsigned int g_volBlockX = 32;
+static const unsigned int g_volBlockY = 16;
+
+// Capacity, in angles, of each constant-memory table below.
+static const unsigned g_MaxAngles = 1024;
+
+// Per-angle coefficients, filled by Par3DBP_Array and read by the kernels.
+// For a volume position (x,y,z) the kernels evaluate
+//   Cuc + x*Cux + y*Cuy + z*Cuz   (detector U direction)
+//   Cvc + x*Cvx + y*Cvy + z*Cvz   (detector V direction)
+__constant__ float gC_Cux[g_MaxAngles];
+__constant__ float gC_Cuy[g_MaxAngles];
+__constant__ float gC_Cuz[g_MaxAngles];
+__constant__ float gC_Cuc[g_MaxAngles];
+__constant__ float gC_Cvx[g_MaxAngles];
+__constant__ float gC_Cvy[g_MaxAngles];
+__constant__ float gC_Cvz[g_MaxAngles];
+__constant__ float gC_Cvc[g_MaxAngles];
+
+
+// Configure gT_par3DProjTexture (clamped addressing on all three axes,
+// linear filtering, unnormalized coordinates) and bind it to the given
+// CUDA array of floats.
+//
+// Returns true if the bind succeeded, false if the CUDA runtime reported
+// an error.
+static bool bindProjDataTexture(const cudaArray* array)
+{
+	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+
+	gT_par3DProjTexture.addressMode[0] = cudaAddressModeClamp;
+	gT_par3DProjTexture.addressMode[1] = cudaAddressModeClamp;
+	gT_par3DProjTexture.addressMode[2] = cudaAddressModeClamp;
+	gT_par3DProjTexture.filterMode = cudaFilterModeLinear;
+	gT_par3DProjTexture.normalized = false;
+
+	// Report a failed bind to the caller instead of silently continuing
+	// with an unusable texture.
+	const cudaError_t err = cudaBindTextureToArray(gT_par3DProjTexture, array, channelDesc);
+
+	return err == cudaSuccess;
+}
+
+
+// Backprojection kernel for parallel-beam 3D geometry, one sample per voxel.
+//
+// Work layout:
+//   threadIdx.x, threadIdx.y : position inside a g_volBlockX x g_volBlockY tile
+//   blockIdx.x               : tile index (tiles counted along x first, then y)
+//   blockIdx.y               : index of a slab of g_volBlockZ z-slices
+//
+// For every voxel of its (x,y) column inside the slab, a thread sums the
+// samples of gT_par3DProjTexture over the angles
+// [startAngle, startAngle + g_anglesPerBlock), clipped to dims.iProjAngles,
+// and adds that sum to the voxel. volPitch is used as the row stride of
+// D_volData, counted in floats.
+__global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims)
+{
+	// Number of tiles needed to cover the volume along x.
+	const unsigned int tilesInX = (dims.iVolX+g_volBlockX-1)/g_volBlockX;
+
+	const int X = blockIdx.x % tilesInX * g_volBlockX + threadIdx.x;
+	const int Y = blockIdx.x / tilesInX * g_volBlockY + threadIdx.y;
+
+	// Threads of partially filled tiles that fall outside the volume are idle.
+	if (X >= dims.iVolX || Y >= dims.iVolY)
+		return;
+
+	// Angle range of this launch, clipped to the number of angles.
+	int lastAngle = startAngle + g_anglesPerBlock;
+	if (lastAngle > dims.iProjAngles)
+		lastAngle = dims.iProjAngles;
+
+	// Slab of z-slices of this block, clipped to the volume.
+	const int firstZ = blockIdx.y * g_volBlockZ;
+	int lastZ = firstZ + g_volBlockZ;
+	if (lastZ > dims.iVolZ)
+		lastZ = dims.iVolZ;
+
+	// TO TRY: precompute part of detector intersection formulas in shared mem?
+	// TO TRY: inner loop over z, gather ray values in shared mem
+
+	float* const voxels = (float*)D_volData;
+
+	// Voxel centre, measured from the centre of the volume.
+	const float fX = X - 0.5f*dims.iVolX + 0.5f;
+	const float fY = Y - 0.5f*dims.iVolY + 0.5f;
+	float fZ = firstZ - 0.5f*dims.iVolZ + 0.5f;
+
+	for (int Z = firstZ; Z < lastZ; ++Z)
+	{
+		float fSum = 0.0f;
+
+		// Texture coordinate along the angle axis.
+		float fAngle = startAngle + 0.5f;
+
+		for (int a = startAngle; a < lastAngle; ++a)
+		{
+			const float fUNum = gC_Cuc[a] + fX * gC_Cux[a] + fY * gC_Cuy[a] + fZ * gC_Cuz[a];
+			const float fVNum = gC_Cvc[a] + fX * gC_Cvx[a] + fY * gC_Cvy[a] + fZ * gC_Cvz[a];
+
+			const float fU = fUNum + 1.0f;
+			const float fV = fVNum + 1.0f;
+
+			fSum += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order
+
+			fAngle += 1.0f;
+		}
+
+		voxels[(Z*dims.iVolY+Y)*volPitch+X] += fSum;
+
+		fZ += 1.0f;
+	}
+}
+
+// Supersampling variant of dev_par3D_BP.
+//
+// Same work layout as dev_par3D_BP (threadIdx = position in an x/y tile,
+// blockIdx.x = tile index, blockIdx.y = slab of g_volBlockZ z-slices), but
+// every voxel is sampled on a regular grid of dims.iRaysPerVoxelDim points
+// per axis. The accumulated value is divided by the number of sub-samples
+// before it is added to the voxel.
+__global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims)
+{
+	// Number of tiles needed to cover the volume along x.
+	const unsigned int tilesInX = (dims.iVolX+g_volBlockX-1)/g_volBlockX;
+
+	const int X = blockIdx.x % tilesInX * g_volBlockX + threadIdx.x;
+	const int Y = blockIdx.x / tilesInX * g_volBlockY + threadIdx.y;
+
+	// Threads of partially filled tiles that fall outside the volume are idle.
+	if (X >= dims.iVolX || Y >= dims.iVolY)
+		return;
+
+	// Angle range of this launch, clipped to the number of angles.
+	int lastAngle = startAngle + g_anglesPerBlock;
+	if (lastAngle > dims.iProjAngles)
+		lastAngle = dims.iProjAngles;
+
+	// Slab of z-slices of this block, clipped to the volume.
+	const int firstZ = blockIdx.y * g_volBlockZ;
+	int lastZ = firstZ + g_volBlockZ;
+	if (lastZ > dims.iVolZ)
+		lastZ = dims.iVolZ;
+
+	// TO TRY: precompute part of detector intersection formulas in shared mem?
+	// TO TRY: inner loop over z, gather ray values in shared mem
+
+	float* const voxels = (float*)D_volData;
+
+	// Distance between neighbouring sub-samples along one axis.
+	const float fSubStep = 1.0f/dims.iRaysPerVoxelDim;
+
+	// Position of the first sub-sample of the voxel, measured from the
+	// centre of the volume.
+	const float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim;
+	const float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim;
+	float fZ = firstZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim;
+
+	for (int Z = firstZ; Z < lastZ; ++Z)
+	{
+		float fSum = 0.0f;
+
+		// Texture coordinate along the angle axis.
+		float fAngle = startAngle + 0.5f;
+
+		for (int a = startAngle; a < lastAngle; ++a)
+		{
+			const float cux = gC_Cux[a];
+			const float cuy = gC_Cuy[a];
+			const float cuz = gC_Cuz[a];
+			const float cuc = gC_Cuc[a];
+			const float cvx = gC_Cvx[a];
+			const float cvy = gC_Cvy[a];
+			const float cvz = gC_Cvz[a];
+			const float cvc = gC_Cvc[a];
+
+			float fXs = fX;
+			for (int iSubX = 0; iSubX < dims.iRaysPerVoxelDim; ++iSubX, fXs += fSubStep) {
+				float fYs = fY;
+				for (int iSubY = 0; iSubY < dims.iRaysPerVoxelDim; ++iSubY, fYs += fSubStep) {
+					float fZs = fZ;
+					for (int iSubZ = 0; iSubZ < dims.iRaysPerVoxelDim; ++iSubZ, fZs += fSubStep) {
+
+						const float fUNum = cuc + fXs * cux + fYs * cuy + fZs * cuz;
+						const float fVNum = cvc + fXs * cvx + fYs * cvy + fZs * cvz;
+
+						const float fU = fUNum + 1.0f;
+						const float fV = fVNum + 1.0f;
+
+						fSum += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order
+					}
+				}
+			}
+
+			fAngle += 1.0f;
+		}
+
+		// Average over all sub-samples of the voxel.
+		voxels[(Z*dims.iVolY+Y)*volPitch+X] += fSum / (dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim);
+
+		fZ += 1.0f;
+	}
+}
+
+// Backproject projection data stored in a CUDA array into a pitched volume.
+//
+//   D_volumeData : volume to add the backprojection to; its pitch is passed
+//                  to the kernels in units of floats
+//   D_projArray  : projection data, bound to gT_par3DProjTexture
+//   dims         : volume / projection dimensions; dims.iRaysPerVoxelDim
+//                  selects the plain (== 1) or the supersampling kernel
+//   angles       : dims.iProjAngles projection geometries
+//
+// The per-angle coefficients are derived from the ray direction, detector
+// origin and detector U/V vectors of each angle and uploaded to constant
+// memory; the kernels are then launched for g_anglesPerBlock angles at a time.
+//
+// Returns false if there are more angles than the constant-memory tables
+// can hold (g_MaxAngles), or if binding the texture or uploading the
+// coefficients fails; true otherwise.
+bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
+                   cudaArray *D_projArray,
+                   const SDimensions3D& dims, const SPar3DProjection* angles)
+{
+	// The gC_C* tables have g_MaxAngles entries each; refuse larger
+	// geometries instead of overrunning them.
+	if (dims.iProjAngles > g_MaxAngles)
+		return false;
+
+	if (!bindProjDataTexture(D_projArray))
+		return false;
+
+
+	// transfer angles to constant memory
+	float* tmp = new float[dims.iProjAngles];
+
+	// Cleared as soon as one of the uploads below fails.
+	bool ok = true;
+
+#define TRANSFER_TO_CONSTANT(expr,name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = (expr) ; if (cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice) != cudaSuccess) ok = false; } while (0)
+
+#define DENOM (angles[i].fRayX*angles[i].fDetUY*angles[i].fDetVZ - angles[i].fRayX*angles[i].fDetUZ*angles[i].fDetVY - angles[i].fRayY*angles[i].fDetUX*angles[i].fDetVZ + angles[i].fRayY*angles[i].fDetUZ*angles[i].fDetVX + angles[i].fRayZ*angles[i].fDetUX*angles[i].fDetVY - angles[i].fRayZ*angles[i].fDetUY*angles[i].fDetVX)
+
+
+	TRANSFER_TO_CONSTANT( ( - (angles[i].fRayY*angles[i].fDetVZ - angles[i].fRayZ*angles[i].fDetVY)) / DENOM , Cux );
+	TRANSFER_TO_CONSTANT( ( (angles[i].fRayX*angles[i].fDetVZ - angles[i].fRayZ*angles[i].fDetVX)) / DENOM , Cuy );
+	TRANSFER_TO_CONSTANT( (- (angles[i].fRayX*angles[i].fDetVY - angles[i].fRayY*angles[i].fDetVX) ) / DENOM , Cuz );
+	TRANSFER_TO_CONSTANT( (-(angles[i].fDetSY*angles[i].fDetVZ - angles[i].fDetSZ*angles[i].fDetVY)*angles[i].fRayX + (angles[i].fRayY*angles[i].fDetVZ - angles[i].fRayZ*angles[i].fDetVY)*angles[i].fDetSX - (angles[i].fRayY*angles[i].fDetSZ - angles[i].fRayZ*angles[i].fDetSY)*angles[i].fDetVX) / DENOM , Cuc );
+
+	TRANSFER_TO_CONSTANT( ((angles[i].fRayY*angles[i].fDetUZ - angles[i].fRayZ*angles[i].fDetUY) ) / DENOM , Cvx );
+	TRANSFER_TO_CONSTANT( (- (angles[i].fRayX*angles[i].fDetUZ - angles[i].fRayZ*angles[i].fDetUX) ) / DENOM , Cvy );
+	TRANSFER_TO_CONSTANT( ((angles[i].fRayX*angles[i].fDetUY - angles[i].fRayY*angles[i].fDetUX) ) / DENOM , Cvz );
+	TRANSFER_TO_CONSTANT( ((angles[i].fDetSY*angles[i].fDetUZ - angles[i].fDetSZ*angles[i].fDetUY)*angles[i].fRayX - (angles[i].fRayY*angles[i].fDetUZ - angles[i].fRayZ*angles[i].fDetUY)*angles[i].fDetSX + (angles[i].fRayY*angles[i].fDetSZ - angles[i].fRayZ*angles[i].fDetSY)*angles[i].fDetUX ) / DENOM , Cvc );
+
+#undef TRANSFER_TO_CONSTANT
+#undef DENOM
+
+	delete[] tmp;
+
+	// Without valid coefficients the kernels would produce garbage.
+	if (!ok)
+		return false;
+
+	dim3 dimBlock(g_volBlockX, g_volBlockY);
+
+	// grid x enumerates the x/y tiles of the volume, grid y the z slabs
+	dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ);
+
+	// timeval t;
+	// tic(t);
+
+	for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) {
+		// printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr); 
+		if (dims.iRaysPerVoxelDim == 1)
+			dev_par3D_BP<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims);
+		else
+			dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims);
+	}
+
+	// Defined elsewhere; judging by its name it waits for the launched
+	// kernels to finish -- TODO confirm.
+	cudaTextForceKernelsCompletion();
+
+	// printf("%f\n", toc(t));
+
+	return true;
+}
+
+// Backproject projection data held in pitched memory into a pitched volume.
+//
+// The projections are first copied into a temporary CUDA array (needed for
+// texture access), which is handed to Par3DBP_Array and released afterwards.
+//
+// Returns false if the temporary array could not be obtained, otherwise the
+// result of Par3DBP_Array.
+bool Par3DBP(cudaPitchedPtr D_volumeData,
+            cudaPitchedPtr D_projData,
+            const SDimensions3D& dims, const SPar3DProjection* angles)
+{
+	// transfer projections to array
+
+	cudaArray* cuArray = allocateProjectionArray(dims);
+
+	// allocateProjectionArray is defined elsewhere; a null pointer is
+	// treated as an allocation failure rather than passed on.
+	if (!cuArray)
+		return false;
+
+	transferProjectionsToArray(D_projData, cuArray, dims);
+
+	bool ret = Par3DBP_Array(D_volumeData, cuArray, dims, angles);
+
+	cudaFreeArray(cuArray);
+
+	return ret;
+}
+
+
+}
+
+#ifdef STANDALONE
+// Standalone smoke test: fills a 256^3 volume with ones, forward-projects it
+// with Par3DFP over 180 angles, writes the projection data out as PNG files,
+// clears the volume, backprojects with Par3DBP and writes the volume slices
+// out as PNG files.
+int main()
+{
+	SDimensions3D dims;
+	dims.iVolX = 256;
+	dims.iVolY = 256;
+	dims.iVolZ = 256;
+	dims.iProjAngles = 180;
+	dims.iProjU = 512;
+	dims.iProjV = 512;
+	dims.iRaysPerDet = 1;
+	// Par3DBP_Array picks its kernel based on this field, so it must not
+	// be left uninitialised.
+	dims.iRaysPerVoxelDim = 1;
+
+	cudaExtent extentV;
+	extentV.width = dims.iVolX*sizeof(float);
+	extentV.height = dims.iVolY;
+	extentV.depth = dims.iVolZ;
+
+	cudaPitchedPtr volData; // pitch, ptr, xsize, ysize
+
+	cudaMalloc3D(&volData, extentV);
+
+	cudaExtent extentP;
+	extentP.width = dims.iProjU*sizeof(float);
+	extentP.height = dims.iProjAngles;
+	extentP.depth = dims.iProjV;
+
+	cudaPitchedPtr projData; // pitch, ptr, xsize, ysize
+
+	cudaMalloc3D(&projData, extentP);
+	cudaMemset3D(projData, 0, extentP);
+
+	// Host-side staging buffer for one volume slice.
+	float* slice = new float[256*256];
+	cudaPitchedPtr ptr;
+	ptr.ptr = slice;
+	ptr.pitch = 256*sizeof(float);
+	ptr.xsize = 256*sizeof(float);
+	ptr.ysize = 256;
+
+	// Fill the whole volume with ones, one slice at a time.
+	for (unsigned int i = 0; i < 256*256; ++i)
+		slice[i] = 1.0f;
+	for (unsigned int i = 0; i < 256; ++i) {
+		cudaExtent extentS;
+		extentS.width = dims.iVolX*sizeof(float);
+		extentS.height = dims.iVolY;
+		extentS.depth = 1;
+		cudaPos sp = { 0, 0, 0 };
+		cudaPos dp = { 0, 0, i };
+		cudaMemcpy3DParms p;
+		p.srcArray = 0;
+		p.srcPos = sp;
+		p.srcPtr = ptr;
+		p.dstArray = 0;
+		p.dstPos = dp;
+		p.dstPtr = volData;
+		p.extent = extentS;
+		p.kind = cudaMemcpyHostToDevice;
+		cudaMemcpy3D(&p);
+#if 0
+		if (i == 128) {
+			for (unsigned int j = 0; j < 256*256; ++j)
+				slice[j] = 0.0f;
+		}
+#endif 
+	}
+
+
+	// Geometry for angle 0; the remaining angles are rotations of it
+	// in the x/y plane.
+	SPar3DProjection angle[180];
+	angle[0].fRayX = 1;
+	angle[0].fRayY = 0;
+	angle[0].fRayZ = 0;
+
+	angle[0].fDetSX = 512;
+	angle[0].fDetSY = -256;
+	angle[0].fDetSZ = -256;
+
+	angle[0].fDetUX = 0;
+	angle[0].fDetUY = 1;
+	angle[0].fDetUZ = 0;
+
+	angle[0].fDetVX = 0;
+	angle[0].fDetVY = 0;
+	angle[0].fDetVZ = 1;
+
+#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0)
+	for (int i = 1; i < 180; ++i) {
+		angle[i] = angle[0];
+		ROTATE0(Ray, i, i*2*M_PI/180);
+		ROTATE0(DetS, i, i*2*M_PI/180);
+		ROTATE0(DetU, i, i*2*M_PI/180);
+		ROTATE0(DetV, i, i*2*M_PI/180);
+	}
+#undef ROTATE0
+
+	astraCUDA3d::Par3DFP(volData, projData, dims, angle, 1.0f);
+#if 1
+	float* bufs = new float[180*512];
+
+	for (int i = 0; i < 512; ++i) {
+		// One v-slice: 180 rows (angles) of 512 floats. Rows are
+		// projData.pitch bytes apart on the device, so copy row-wise
+		// with the pitch instead of assuming tightly packed rows.
+		cudaMemcpy2D(bufs, 512*sizeof(float), ((char*)projData.ptr)+projData.pitch*180*i, projData.pitch, 512*sizeof(float), 180, cudaMemcpyDeviceToHost);
+
+		// pitch/xsize/ysize are size_t; print them with a matching format.
+		printf("%lu %lu %lu\n", (unsigned long)projData.pitch, (unsigned long)projData.xsize, (unsigned long)projData.ysize);
+
+		char fname[20];
+		sprintf(fname, "sino%03d.png", i);
+		saveImage(fname, 180, 512, bufs, 0, 512);
+	}
+
+	delete[] bufs;
+
+	float* bufp = new float[512*512];
+
+	for (int i = 0; i < 180; ++i) {
+		for (int j = 0; j < 512; ++j) {
+			// Row of angle i in v-slice j, addressed via the pitch.
+			cudaMemcpy(bufp+512*j, ((char*)projData.ptr)+projData.pitch*(180*j+i), 512*sizeof(float), cudaMemcpyDeviceToHost);
+		}
+
+		char fname[20];
+		sprintf(fname, "proj%03d.png", i);
+		saveImage(fname, 512, 512, bufp, 0, 512);
+	}
+
+	delete[] bufp;
+#endif		
+	// Clear the volume before backprojecting into it.
+	for (unsigned int i = 0; i < 256*256; ++i)
+		slice[i] = 0.0f;
+	for (unsigned int i = 0; i < 256; ++i) {
+		cudaExtent extentS;
+		extentS.width = dims.iVolX*sizeof(float);
+		extentS.height = dims.iVolY;
+		extentS.depth = 1;
+		cudaPos sp = { 0, 0, 0 };
+		cudaPos dp = { 0, 0, i };
+		cudaMemcpy3DParms p;
+		p.srcArray = 0;
+		p.srcPos = sp;
+		p.srcPtr = ptr;
+		p.dstArray = 0;
+		p.dstPos = dp;
+		p.dstPtr = volData;
+		p.extent = extentS;
+		p.kind = cudaMemcpyHostToDevice;
+		cudaMemcpy3D(&p);
+	}
+
+	astraCUDA3d::Par3DBP(volData, projData, dims, angle);
+#if 1
+	float* buf = new float[256*256];
+
+	for (int i = 0; i < 256; ++i) {
+		// One z-slice: 256 rows of 256 floats, volData.pitch bytes apart.
+		cudaMemcpy2D(buf, 256*sizeof(float), ((char*)volData.ptr)+volData.pitch*256*i, volData.pitch, 256*sizeof(float), 256, cudaMemcpyDeviceToHost);
+
+		printf("%lu %lu %lu\n", (unsigned long)volData.pitch, (unsigned long)volData.xsize, (unsigned long)volData.ysize);
+
+		char fname[20];
+		sprintf(fname, "vol%03d.png", i);
+		saveImage(fname, 256, 256, buf, 0, 60000);
+	}
+
+	delete[] buf;
+#endif
+
+	// Release host and device buffers.
+	delete[] slice;
+	cudaFree(projData.ptr);
+	cudaFree(volData.ptr);
+
+	return 0;
+}
+#endif
diff --git a/cuda/3d/par3d_bp.h b/cuda/3d/par3d_bp.h
new file mode 100644
index 0000000..399a3cb
--- /dev/null
+++ b/cuda/3d/par3d_bp.h
@@ -0,0 +1,45 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_PAR3D_BP_H
+#define _CUDA_PAR3D_BP_H
+
+namespace astraCUDA3d {
+
+// Backproject projection data that already resides in a CUDA array.
+// The array is bound to a texture, per-angle coefficients derived from
+// `angles` (dims.iProjAngles entries) are uploaded to constant memory, and
+// the result is added to the contents of D_volumeData.
+// Returns a success flag.
+_AstraExport bool Par3DBP_Array(cudaPitchedPtr D_volumeData,
+                   cudaArray *D_projArray,
+                   const SDimensions3D& dims, const SPar3DProjection* angles);
+
+// Backproject projection data held in pitched memory. The projections are
+// copied into a temporary CUDA array, Par3DBP_Array is run on it, and the
+// array is freed again. Returns a success flag.
+_AstraExport bool Par3DBP(cudaPitchedPtr D_volumeData,
+             cudaPitchedPtr D_projData,
+             const SDimensions3D& dims, const SPar3DProjection* angles);
+         
+
+}
+
+#endif
diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu
new file mode 100644
index 0000000..6bf9037
--- /dev/null
+++ b/cuda/3d/par3d_fp.cu
@@ -0,0 +1,814 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include <iostream>
+#include <list>
+
+#include <cuda.h>
+#include "util3d.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+#include "dims3d.h"
+
+// Reference type for a 3D texture of floats that is read back as plain
+// element values (cudaReadModeElementType).
+typedef texture<float, 3, cudaReadModeElementType> texture3D;
+
+// File-local texture through which the projection kernels in this file sample
+// the volume. bindVolumeDataTexture() configures it and binds it to an array.
+static texture3D gT_par3DVolumeTexture;
+
+namespace astraCUDA3d {
+
+// number of projection angles handled by one thread block (used as blockDim.y)
+static const unsigned int g_anglesPerBlock = 4;
+
+// thickness of the slices we're splitting the volume up into
+// (a single kernel launch covers at most this many slices)
+static const unsigned int g_blockSlices = 64;
+// detector columns (u) per thread block, one thread per column (blockDim.x)
+static const unsigned int g_detBlockU = 32;
+// detector rows (v) per thread block; every thread loops over these rows
+static const unsigned int g_detBlockV = 32;
+
+// capacity of each per-angle constant table below
+static const unsigned g_MaxAngles = 1024;
+// Per-angle geometry tables, indexed by angle and filled by the host from the
+// SPar3DProjection fields of the same name (fRayX, fDetSX, ...):
+//   Ray*   ray direction
+//   DetS*  detector start position; the kernels add half a u step and half a
+//          v step to it before indexing detector pixels
+//   DetU*  step per detector column
+//   DetV*  step per detector row
+// Twelve tables of g_MaxAngles floats occupy 48 KiB of constant memory.
+__constant__ float gC_RayX[g_MaxAngles];
+__constant__ float gC_RayY[g_MaxAngles];
+__constant__ float gC_RayZ[g_MaxAngles];
+__constant__ float gC_DetSX[g_MaxAngles];
+__constant__ float gC_DetSY[g_MaxAngles];
+__constant__ float gC_DetSZ[g_MaxAngles];
+__constant__ float gC_DetUX[g_MaxAngles];
+__constant__ float gC_DetUY[g_MaxAngles];
+__constant__ float gC_DetUZ[g_MaxAngles];
+__constant__ float gC_DetVX[g_MaxAngles];
+__constant__ float gC_DetVY[g_MaxAngles];
+__constant__ float gC_DetVZ[g_MaxAngles];
+
+
+// Configure the file-level volume texture and bind it to `array`.
+//
+// The texture is set up with clamped addressing on all three axes, linear
+// filtering and unnormalized coordinates, which is what the projection
+// kernels in this file rely on when sampling between voxels.
+//
+// Returns true when the bind succeeded, false when the CUDA runtime reported
+// an error.
+static bool bindVolumeDataTexture(const cudaArray* array)
+{
+	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+
+	gT_par3DVolumeTexture.addressMode[0] = cudaAddressModeClamp;
+	gT_par3DVolumeTexture.addressMode[1] = cudaAddressModeClamp;
+	gT_par3DVolumeTexture.addressMode[2] = cudaAddressModeClamp;
+	gT_par3DVolumeTexture.filterMode = cudaFilterModeLinear;
+	gT_par3DVolumeTexture.normalized = false;
+
+	// Propagate the runtime's verdict instead of unconditionally claiming
+	// success; a kernel sampling an unbound texture would read garbage.
+	const cudaError_t err = cudaBindTextureToArray(gT_par3DVolumeTexture, array, channelDesc);
+
+	return err == cudaSuccess;
+}
+
+
+
+// Shared body of the par3D_FP_dir{X,Y,Z} kernels.
+//
+// c0 is the axis along which the volume is traversed slice by slice; c1 and
+// c2 are the two remaining axes. The macro expands inside a kernel and uses
+// that kernel's parameters by name: D_projData, projPitch, startSlice,
+// startAngle, endAngle, dims and fOutputScale.
+//
+// threadIdx: x = u detector
+//            y = relative angle
+// blockIdx:  x = u/v detector
+//            y = angle block
+//
+// Each thread handles one (angle, detectorU) pair and loops over up to
+// g_detBlockV detector rows. For every row it sums one texture sample per
+// slice along the ray, over at most g_blockSlices slices starting at
+// startSlice, and ADDS the scaled sum to the matching D_projData element.
+// Threads past the last angle or the last detector column exit immediately.
+
+#define PAR3D_FP_BODY(c0,c1,c2) \
+	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \
+	if (angle >= endAngle) \
+		return; \
+	\
+	const float fRayX = gC_RayX[angle]; \
+	const float fRayY = gC_RayY[angle]; \
+	const float fRayZ = gC_RayZ[angle]; \
+	const float fDetUX = gC_DetUX[angle]; \
+	const float fDetUY = gC_DetUY[angle]; \
+	const float fDetUZ = gC_DetUZ[angle]; \
+	const float fDetVX = gC_DetVX[angle]; \
+	const float fDetVY = gC_DetVY[angle]; \
+	const float fDetVZ = gC_DetVZ[angle]; \
+	/* shift by half a u step and half a v step so that integer detector */ \
+	/* indices below address pixel centres                               */ \
+	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \
+	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \
+	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \
+	\
+	/* blockIdx.x enumerates the u blocks fastest, then the v blocks */ \
+	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \
+	/* The grid is rounded up to whole blocks in u; threads beyond the last */ \
+	/* detector column must not accumulate into memory past the row's data. */ \
+	if (detectorU >= dims.iProjU) \
+		return; \
+	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \
+	int endDetectorV = startDetectorV + g_detBlockV; \
+	if (endDetectorV > dims.iProjV) \
+		endDetectorV = dims.iProjV; \
+	\
+	int endSlice = startSlice + g_blockSlices; \
+	if (endSlice > dims.iVol##c0) \
+		endSlice = dims.iVol##c0; \
+	\
+	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \
+	{ \
+		/* Trace ray in direction Ray to (detectorU,detectorV) from  */ \
+		/* c0 = startSlice to c0 = endSlice                          */ \
+		\
+		const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; \
+		const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; \
+		const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; \
+		\
+		/* Written out for c0 = X:         */ \
+		/*        (x)   ( 1)       ( 0)    */ \
+		/* ray:   (y) = (ay) * x + (by)    */ \
+		/*        (z)   (az)       (bz)    */ \
+		\
+		const float a##c1 = fRay##c1 / fRay##c0; \
+		const float a##c2 = fRay##c2 / fRay##c0; \
+		const float b##c1 = fDet##c1 - a##c1 * fDet##c0; \
+		const float b##c2 = fDet##c2 - a##c2 * fDet##c0; \
+		\
+		/* length of the ray segment crossed per unit step along c0 */ \
+		const float fDistCorr = sqrtf(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \
+		\
+		float fVal = 0.0f; \
+		\
+		/* Texture coordinates of the first sample. NOTE(review): the +1.5f  */ \
+		/* offsets suggest a one-texel border around the volume array --     */ \
+		/* confirm against allocateVolumeArray/transferVolumeToArray.        */ \
+		float f##c0 = startSlice + 1.5f; \
+		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \
+		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \
+		\
+		for (int s = startSlice; s < endSlice; ++s) \
+		{ \
+			/* fX, fY, fZ are the f##c0/f##c1/f##c2 variables in some order */ \
+			fVal += tex3D(gT_par3DVolumeTexture, fX, fY, fZ); \
+			f##c0 += 1.0f; \
+			f##c1 += a##c1; \
+			f##c2 += a##c2; \
+		} \
+		\
+		fVal *= fDistCorr; \
+		\
+		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; \
+	}
+
+
+
+// Supersampling version of the forward-projection kernel body.
+//
+// Same contract as the single-ray body (c0 = traversal axis, c1/c2 = the
+// other two; uses the kernel parameters D_projData, projPitch, startSlice,
+// startAngle, endAngle, dims, fOutputScale by name), but every detector pixel
+// is covered by dims.iRaysPerDetDim x dims.iRaysPerDetDim rays spread evenly
+// over the pixel. The ray sums are averaged before being ADDED to D_projData.
+// Threads past the last angle or the last detector column exit immediately.
+#define PAR3D_FP_SS_BODY(c0,c1,c2) \
+	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \
+	if (angle >= endAngle) \
+		return; \
+	\
+	const float fRayX = gC_RayX[angle]; \
+	const float fRayY = gC_RayY[angle]; \
+	const float fRayZ = gC_RayZ[angle]; \
+	const float fDetUX = gC_DetUX[angle]; \
+	const float fDetUY = gC_DetUY[angle]; \
+	const float fDetUZ = gC_DetUZ[angle]; \
+	const float fDetVX = gC_DetVX[angle]; \
+	const float fDetVY = gC_DetVY[angle]; \
+	const float fDetVZ = gC_DetVZ[angle]; \
+	/* shift by half a u step and half a v step so that integer detector */ \
+	/* indices address pixel centres                                     */ \
+	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \
+	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \
+	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \
+	\
+	/* blockIdx.x enumerates the u blocks fastest, then the v blocks */ \
+	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \
+	/* The grid is rounded up to whole blocks in u; threads beyond the last */ \
+	/* detector column must not accumulate into memory past the row's data. */ \
+	if (detectorU >= dims.iProjU) \
+		return; \
+	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \
+	int endDetectorV = startDetectorV + g_detBlockV; \
+	if (endDetectorV > dims.iProjV) \
+		endDetectorV = dims.iProjV; \
+	\
+	int endSlice = startSlice + g_blockSlices; \
+	if (endSlice > dims.iVol##c0) \
+		endSlice = dims.iVol##c0; \
+	\
+	/* spacing between neighbouring sub-rays, in detector pixels */ \
+	const float fSubStep = 1.0f/dims.iRaysPerDetDim; \
+	\
+	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \
+	{ \
+		\
+		/* sum over all sub-rays of this detector pixel */ \
+		float fV = 0.0f; \
+		\
+		/* start half a sub-step inside the pixel's lower edge */ \
+		float fdU = detectorU - 0.5f + 0.5f*fSubStep; \
+		for (int iSubU = 0; iSubU < dims.iRaysPerDetDim; ++iSubU, fdU+=fSubStep) { \
+		float fdV = detectorV - 0.5f + 0.5f*fSubStep; \
+		for (int iSubV = 0; iSubV < dims.iRaysPerDetDim; ++iSubV, fdV+=fSubStep) { \
+		\
+		/* Trace ray in direction Ray to (fdU,fdV) from              */ \
+		/* c0 = startSlice to c0 = endSlice                          */ \
+		\
+		const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX; \
+		const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY; \
+		const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ; \
+		\
+		/* Written out for c0 = X:         */ \
+		/*        (x)   ( 1)       ( 0)    */ \
+		/* ray:   (y) = (ay) * x + (by)    */ \
+		/*        (z)   (az)       (bz)    */ \
+		\
+		const float a##c1 = fRay##c1 / fRay##c0; \
+		const float a##c2 = fRay##c2 / fRay##c0; \
+		const float b##c1 = fDet##c1 - a##c1 * fDet##c0; \
+		const float b##c2 = fDet##c2 - a##c2 * fDet##c0; \
+		\
+		/* length of the ray segment crossed per unit step along c0 */ \
+		const float fDistCorr = sqrtf(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \
+		\
+		float fVal = 0.0f; \
+		\
+		/* Texture coordinates of the first sample. NOTE(review): the +1.5f  */ \
+		/* offsets suggest a one-texel border around the volume array --     */ \
+		/* confirm against allocateVolumeArray/transferVolumeToArray.        */ \
+		float f##c0 = startSlice + 1.5f; \
+		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \
+		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \
+		\
+		for (int s = startSlice; s < endSlice; ++s) \
+		{ \
+			/* fX, fY, fZ are the f##c0/f##c1/f##c2 variables in some order */ \
+			fVal += tex3D(gT_par3DVolumeTexture, fX, fY, fZ); \
+			f##c0 += 1.0f; \
+			f##c1 += a##c1; \
+			f##c2 += a##c2; \
+		} \
+		\
+		fVal *= fDistCorr; \
+		fV += fVal; \
+		\
+		} \
+		} \
+		\
+		/* average over the sub-rays of this pixel */ \
+		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (dims.iRaysPerDetDim * dims.iRaysPerDetDim); \
+	}
+
+
+
+// Forward-projection kernel that steps through the volume along X
+// (PAR3D_FP_BODY with c0 = X, c1 = Y, c2 = Z). Par3DFP_Array launches it for
+// angle groups whose ray direction is largest in |X|, once per block of
+// g_blockSlices slices, with blockDim = (g_detBlockU, g_anglesPerBlock).
+// Handles angles [startAngle, endAngle) and accumulates into D_projData;
+// projPitch is the row pitch of D_projData in floats.
+__global__ void par3D_FP_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_BODY(X,Y,Z)
+}
+
+// Forward-projection kernel that steps through the volume along Y
+// (PAR3D_FP_BODY with c0 = Y, c1 = X, c2 = Z). Par3DFP_Array launches it for
+// angle groups classified as direction 1 (|Y| ray component dominant), once
+// per block of g_blockSlices slices. Handles angles [startAngle, endAngle)
+// and accumulates into D_projData; projPitch is the row pitch in floats.
+__global__ void par3D_FP_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_BODY(Y,X,Z)
+}
+
+// Forward-projection kernel that steps through the volume along Z
+// (PAR3D_FP_BODY with c0 = Z, c1 = X, c2 = Y). Par3DFP_Array launches it for
+// angle groups classified as direction 2 (neither X nor Y dominant), once per
+// block of g_blockSlices slices. Handles angles [startAngle, endAngle) and
+// accumulates into D_projData; projPitch is the row pitch in floats.
+__global__ void par3D_FP_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_BODY(Z,X,Y)
+}
+
+// Supersampling forward-projection kernel stepping along X
+// (PAR3D_FP_SS_BODY with c0 = X, c1 = Y, c2 = Z). Par3DFP_Array selects it
+// instead of par3D_FP_dirX when dims.iRaysPerDetDim != 1.
+__global__ void par3D_FP_SS_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SS_BODY(X,Y,Z)
+}
+
+// Supersampling forward-projection kernel stepping along Y
+// (PAR3D_FP_SS_BODY with c0 = Y, c1 = X, c2 = Z). Par3DFP_Array selects it
+// instead of par3D_FP_dirY when dims.iRaysPerDetDim != 1.
+__global__ void par3D_FP_SS_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SS_BODY(Y,X,Z)
+}
+
+// Supersampling forward-projection kernel stepping along Z
+// (PAR3D_FP_SS_BODY with c0 = Z, c1 = X, c2 = Y). Par3DFP_Array selects it
+// instead of par3D_FP_dirZ when dims.iRaysPerDetDim != 1.
+__global__ void par3D_FP_SS_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SS_BODY(Z,X,Y)
+}
+
+
+// Per-axis weight used by the SumSqW kernels for a sample at coordinate fX
+// along an axis holding fN voxels.
+//
+// The result is 0 outside (0.5, fN + 1.5], the squared distance to the nearer
+// outer bound inside the first and last unit-wide bands, and t*t + (1-t)*(1-t)
+// in between, where t is the fractional part of fX - 0.5.
+// NOTE(review): this matches the sum of squared linear-interpolation weights
+// for voxel centres at k + 0.5 with one empty border texel on each side --
+// confirm against the volume array layout.
+__device__ float dirWeights(float fX, float fN) {
+	float fWeight = 0.0f; // value for every position outside the image
+
+	if (fX <= 0.5f) {
+		// outside image on left: keep zero
+	} else if (fX <= 1.5f) {
+		// half outside image on left
+		const float fInside = fX - 0.5f;
+		fWeight = fInside * fInside;
+	} else if (fX <= fN + 0.5f) {
+		// inside image
+		const float fShifted = fX - 0.5f;
+		const float t = fShifted - floorf(fShifted);
+		fWeight = t*t + (1-t)*(1-t);
+	} else if (fX <= fN + 1.5f) {
+		// half outside image on right
+		const float fInside = fN + 1.5f - fX;
+		fWeight = fInside * fInside;
+	}
+
+	return fWeight;
+}
+
+// Shared body of the par3D_FP_SumSqW_dir{X,Y,Z} kernels.
+//
+// Walks the same rays as the plain forward-projection body (c0 = traversal
+// axis, c1/c2 = the other two; uses the kernel parameters D_projData,
+// projPitch, startSlice, startAngle, endAngle, dims, fOutputScale by name),
+// but does not sample the volume: per slice it accumulates
+// dirWeights(c1) * dirWeights(c2) * fDistCorr^2 and ADDS the total to the
+// matching D_projData element.
+// Threads past the last angle or the last detector column exit immediately.
+#define PAR3D_FP_SUMSQW_BODY(c0,c1,c2) \
+	int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \
+	if (angle >= endAngle) \
+		return; \
+	\
+	const float fRayX = gC_RayX[angle]; \
+	const float fRayY = gC_RayY[angle]; \
+	const float fRayZ = gC_RayZ[angle]; \
+	const float fDetUX = gC_DetUX[angle]; \
+	const float fDetUY = gC_DetUY[angle]; \
+	const float fDetUZ = gC_DetUZ[angle]; \
+	const float fDetVX = gC_DetVX[angle]; \
+	const float fDetVY = gC_DetVY[angle]; \
+	const float fDetVZ = gC_DetVZ[angle]; \
+	/* shift by half a u step and half a v step so that integer detector */ \
+	/* indices below address pixel centres                               */ \
+	const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \
+	const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \
+	const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \
+	\
+	/* blockIdx.x enumerates the u blocks fastest, then the v blocks */ \
+	const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \
+	/* The grid is rounded up to whole blocks in u; threads beyond the last */ \
+	/* detector column must not accumulate into memory past the row's data. */ \
+	if (detectorU >= dims.iProjU) \
+		return; \
+	const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \
+	int endDetectorV = startDetectorV + g_detBlockV; \
+	if (endDetectorV > dims.iProjV) \
+		endDetectorV = dims.iProjV; \
+	\
+	int endSlice = startSlice + g_blockSlices; \
+	if (endSlice > dims.iVol##c0) \
+		endSlice = dims.iVol##c0; \
+	\
+	for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \
+	{ \
+		/* Trace ray in direction Ray to (detectorU,detectorV) from  */ \
+		/* c0 = startSlice to c0 = endSlice                          */ \
+		\
+		const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; \
+		const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; \
+		const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; \
+		\
+		/* Written out for c0 = X:         */ \
+		/*        (x)   ( 1)       ( 0)    */ \
+		/* ray:   (y) = (ay) * x + (by)    */ \
+		/*        (z)   (az)       (bz)    */ \
+		\
+		const float a##c1 = fRay##c1 / fRay##c0; \
+		const float a##c2 = fRay##c2 / fRay##c0; \
+		const float b##c1 = fDet##c1 - a##c1 * fDet##c0; \
+		const float b##c2 = fDet##c2 - a##c2 * fDet##c0; \
+		\
+		/* length of the ray segment crossed per unit step along c0 */ \
+		const float fDistCorr = sqrtf(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \
+		\
+		float fVal = 0.0f; \
+		\
+		/* Same starting coordinates as the sampling kernels use for the */ \
+		/* texture, so the weights line up with their sample positions.  */ \
+		float f##c0 = startSlice + 1.5f; \
+		float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \
+		float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \
+		\
+		for (int s = startSlice; s < endSlice; ++s) \
+		{ \
+			fVal += dirWeights(f##c1, dims.iVol##c1) * dirWeights(f##c2, dims.iVol##c2) * fDistCorr * fDistCorr; \
+			f##c0 += 1.0f; \
+			f##c1 += a##c1; \
+			f##c2 += a##c2; \
+		} \
+		\
+		D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; \
+	}
+
+// Supersampling version
+// TODO
+
+
+// Squared-weight accumulation kernel stepping along X
+// (PAR3D_FP_SUMSQW_BODY with c0 = X, c1 = Y, c2 = Z). Same parameters as the
+// par3D_FP_dir* kernels; adds its result to D_projData.
+__global__ void par3D_FP_SumSqW_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SUMSQW_BODY(X,Y,Z)
+}
+
+// Squared-weight accumulation kernel stepping along Y
+// (PAR3D_FP_SUMSQW_BODY with c0 = Y, c1 = X, c2 = Z). Same parameters as the
+// par3D_FP_dir* kernels; adds its result to D_projData.
+__global__ void par3D_FP_SumSqW_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SUMSQW_BODY(Y,X,Z)
+}
+
+// Squared-weight accumulation kernel stepping along Z
+// (PAR3D_FP_SUMSQW_BODY with c0 = Z, c1 = X, c2 = Y). Same parameters as the
+// par3D_FP_dir* kernels; adds its result to D_projData.
+__global__ void par3D_FP_SumSqW_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale)
+{
+PAR3D_FP_SUMSQW_BODY(Z,X,Y)
+}
+
+
+
+// Forward-project the volume held in D_volArray into D_projData for a
+// parallel-beam 3D geometry.
+//
+//   D_volArray    volume as a cudaArray; bound to the file-level texture
+//   D_projData    pitched projection buffer; the kernels ADD to its contents
+//   dims          volume and detector sizes, angle count, rays per pixel
+//   angles        dims.iProjAngles geometry entries (ray/detector vectors)
+//   fOutputScale  factor applied to every ray sum
+//
+// Returns false, without launching any kernel, when there are more angles
+// than the constant-memory tables can hold or when binding the texture or
+// uploading the geometry fails. Returns true otherwise.
+bool Par3DFP_Array(cudaArray *D_volArray,
+                   cudaPitchedPtr D_projData,
+                   const SDimensions3D& dims, const SPar3DProjection* angles,
+                   float fOutputScale)
+{
+	// The per-angle geometry lives in fixed-size __constant__ arrays. More
+	// angles than that would overflow them on upload and in every kernel.
+	if (dims.iProjAngles > g_MaxAngles)
+		return false;
+
+	if (!bindVolumeDataTexture(D_volArray))
+		return false;
+
+
+	// transfer angles to constant memory
+	float* tmp = new float[dims.iProjAngles];
+	bool bTransferOk = true;
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; if (cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice) != cudaSuccess) bTransferOk = false; } while (0)
+
+	TRANSFER_TO_CONSTANT(RayX);
+	TRANSFER_TO_CONSTANT(RayY);
+	TRANSFER_TO_CONSTANT(RayZ);
+	TRANSFER_TO_CONSTANT(DetSX);
+	TRANSFER_TO_CONSTANT(DetSY);
+	TRANSFER_TO_CONSTANT(DetSZ);
+	TRANSFER_TO_CONSTANT(DetUX);
+	TRANSFER_TO_CONSTANT(DetUY);
+	TRANSFER_TO_CONSTANT(DetUZ);
+	TRANSFER_TO_CONSTANT(DetVX);
+	TRANSFER_TO_CONSTANT(DetVY);
+	TRANSFER_TO_CONSTANT(DetVZ);
+
+#undef TRANSFER_TO_CONSTANT
+
+	delete[] tmp;
+
+	// Kernels reading stale or partial geometry would produce garbage.
+	if (!bTransferOk)
+		return false;
+
+	std::list<cudaStream_t> streams;
+	dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles
+
+	// Run over all angles, grouping consecutive angles by the axis along
+	// which their ray direction is largest (X, Y or Z).
+	// Start a stream of grids for each such group.
+
+	unsigned int blockStart = 0;
+	unsigned int blockEnd = 0;
+	int blockDirection = 0;
+
+	// timeval t;
+	// tic(t);
+
+	for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+		// a == dims.iProjAngles is a sentinel pass that only flushes the last
+		// group; starting from blockDirection keeps dir well-defined there.
+		int dir = blockDirection;
+		if (a != dims.iProjAngles) {
+			float dX = fabsf(angles[a].fRayX);
+			float dY = fabsf(angles[a].fRayY);
+			float dZ = fabsf(angles[a].fRayZ);
+
+			if (dX >= dY && dX >= dZ)
+				dir = 0;
+			else if (dY >= dX && dY >= dZ)
+				dir = 1;
+			else
+				dir = 2;
+		}
+
+		if (a == dims.iProjAngles || dir != blockDirection) {
+			// block done
+
+			blockEnd = a;
+			if (blockStart != blockEnd) {
+
+				// x: all (u block, v block) pairs; y: blocks of angles
+				dim3 dimGrid(
+				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+				             (blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
+				// TODO: check if we can't immediately
+				//       destroy the stream after use
+				cudaStream_t stream;
+				cudaStreamCreate(&stream);
+				streams.push_back(stream);
+
+				// printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y);
+
+				// All slice blocks of one group go into the same stream, so
+				// their accumulations into D_projData are applied in order.
+				if (blockDirection == 0) {
+					for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) {
+						if (dims.iRaysPerDetDim == 1)
+							par3D_FP_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+						else
+							par3D_FP_SS_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+					}
+				} else if (blockDirection == 1) {
+					for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) {
+						if (dims.iRaysPerDetDim == 1)
+							par3D_FP_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+						else
+							par3D_FP_SS_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+					}
+				} else if (blockDirection == 2) {
+					for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) {
+						if (dims.iRaysPerDetDim == 1)
+							par3D_FP_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+						else
+							par3D_FP_SS_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+					}
+				}
+
+			}
+
+			blockDirection = dir;
+			blockStart = a;
+		}
+	}
+
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
+		cudaStreamDestroy(*iter);
+
+	streams.clear();
+
+	cudaTextForceKernelsCompletion();
+
+
+	// printf("%f\n", toc(t));
+
+	return true;
+}
+// Forward-project a pitched device volume by staging it in a temporary cudaArray for Par3DFP_Array.
+bool Par3DFP(cudaPitchedPtr D_volumeData,
+             cudaPitchedPtr D_projData,
+             const SDimensions3D& dims, const SPar3DProjection* angles,
+             float fOutputScale)
+{
+	// Returns the result of Par3DFP_Array, or false when no staging array is
+	// available. NOTE(review): assumes allocateVolumeArray yields 0 on failure.
+	cudaArray* cuArray = allocateVolumeArray(dims);
+	if (!cuArray)
+		return false;
+	transferVolumeToArray(D_volumeData, cuArray, dims);
+	bool ret = Par3DFP_Array(cuArray, D_projData, dims, angles, fOutputScale);
+	cudaFreeArray(cuArray);
+	return ret;
+}
+
+
+// Launch the par3D_FP_SumSqW_dir{X,Y,Z} kernels for every projection angle, writing into D_projData.
+bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData,
+                    cudaPitchedPtr D_projData,
+                    const SDimensions3D& dims, const SPar3DProjection* angles,
+                    float fOutputScale)
+{
+	// D_volumeData is not referenced here: the kernels only receive the output
+	// buffer, the geometry and fOutputScale.
+	//
+	// Returns true once all launches have been queued and
+	// cudaTextForceKernelsCompletion() has run; CUDA errors are not checked.
+	//
+	// Only one ray per detector pixel is supported. The supersampling branch
+	// used to be assert(false), which compiles away under NDEBUG and then
+	// reported success without launching a single kernel, so refuse up front.
+	if (dims.iRaysPerDetDim != 1)
+		return false;
+
+	// transfer angles to constant memory: each vector component is gathered
+	// into tmp and copied to its own gC_* symbol
+	float* tmp = new float[dims.iProjAngles];
+
+#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0)
+
+	TRANSFER_TO_CONSTANT(RayX);
+	TRANSFER_TO_CONSTANT(RayY);
+	TRANSFER_TO_CONSTANT(RayZ);
+	TRANSFER_TO_CONSTANT(DetSX);
+	TRANSFER_TO_CONSTANT(DetSY);
+	TRANSFER_TO_CONSTANT(DetSZ);
+	TRANSFER_TO_CONSTANT(DetUX);
+	TRANSFER_TO_CONSTANT(DetUY);
+	TRANSFER_TO_CONSTANT(DetUZ);
+	TRANSFER_TO_CONSTANT(DetVX);
+	TRANSFER_TO_CONSTANT(DetVY);
+	TRANSFER_TO_CONSTANT(DetVZ);
+
+#undef TRANSFER_TO_CONSTANT
+
+	delete[] tmp;
+
+	std::list<cudaStream_t> streams;
+	dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles
+
+	// Run over all angles, grouping consecutive ones that share the same
+	// dominant ray axis (X, Y or Z).
+	// Start a stream of grids for each such group.
+
+	unsigned int blockStart = 0;
+	unsigned int blockEnd = 0;
+	int blockDirection = 0;
+
+	// timeval t;
+	// tic(t);
+
+	for (unsigned int a = 0; a <= dims.iProjAngles; ++a) {
+		// The extra pass a == iProjAngles only flushes the last group. dir
+		// starts out as the current direction so that it is never read
+		// uninitialised when blockDirection is updated on that pass.
+		int dir = blockDirection;
+		if (a != dims.iProjAngles) {
+			float dX = fabsf(angles[a].fRayX);
+			float dY = fabsf(angles[a].fRayY);
+			float dZ = fabsf(angles[a].fRayZ);
+
+			if (dX >= dY && dX >= dZ)
+				dir = 0;
+			else if (dY >= dX && dY >= dZ)
+				dir = 1;
+			else
+				dir = 2;
+		}
+
+		if (a == dims.iProjAngles || dir != blockDirection) {
+			// block done
+
+			blockEnd = a;
+			if (blockStart != blockEnd) {
+
+				dim3 dimGrid(
+				             ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV),
+(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock);
+				// TODO: check if we can't immediately
+				//       destroy the stream after use
+				cudaStream_t stream;
+				cudaStreamCreate(&stream);
+				streams.push_back(stream);
+
+				// printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y);
+
+				// One launch per step of g_blockSlices along the dominant axis;
+				// the start index i is passed on to the kernel.
+				if (blockDirection == 0) {
+					for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices)
+						par3D_FP_SumSqW_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+				} else if (blockDirection == 1) {
+					for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices)
+						par3D_FP_SumSqW_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+				} else if (blockDirection == 2) {
+					for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices)
+						par3D_FP_SumSqW_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale);
+				}
+
+			}
+
+			blockDirection = dir;
+			blockStart = a;
+		}
+	}
+
+	// NOTE(review): the streams are destroyed right after the launches are
+	// queued; cudaStreamDestroy is documented to return immediately and to
+	// release the stream once its queued work has completed.
+	for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter)
+		cudaStreamDestroy(*iter);
+
+	streams.clear();
+
+	cudaTextForceKernelsCompletion();
+
+
+	// printf("%f\n", toc(t));
+
+	return true;
+}
+
+
+
+
+
+
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA3d;
+// Standalone timing test: forward-projects a volume of ones with Par3DFP and saves every projection as a PNG.
+int main()
+{
+	cudaSetDevice(1); // NOTE(review): hard-coded device index 1
+
+
+	SDimensions3D dims;
+	dims.iVolX = 500;
+	dims.iVolY = 500;
+	dims.iVolZ = 81;
+	dims.iProjAngles = 241;
+	dims.iProjU = 600;
+	dims.iProjV = 100;
+	dims.iRaysPerDet = 1; // NOTE(review): Par3DFP_SumSqW in this file reads dims.iRaysPerDetDim -- confirm the field name
+	// Base geometry: ray direction (1, 0, 0.1), detector axes U = (0,1,0) and V = (0,0,1).
+	SPar3DProjection base;
+	base.fRayX = 1.0f;
+	base.fRayY = 0.0f;
+	base.fRayZ = 0.1f;
+
+	base.fDetSX = 0.0f;
+	base.fDetSY = -300.0f;
+	base.fDetSZ = -50.0f;
+
+	base.fDetUX = 0.0f;
+	base.fDetUY = 1.0f;
+	base.fDetUZ = 0.0f;
+
+	base.fDetVX = 0.0f;
+	base.fDetVY = 0.0f;
+	base.fDetVZ = 1.0f;
+
+	SPar3DProjection angle[dims.iProjAngles]; // variable-length array: size is a run-time value
+
+	cudaPitchedPtr volData; // pitch, ptr, xsize, ysize
+
+	volData = allocateVolumeData(dims);
+
+	cudaPitchedPtr projData; // pitch, ptr, xsize, ysize
+
+	projData = allocateProjectionData(dims);
+
+	unsigned int ix = 500,iy = 500;
+
+	float* buf = new float[dims.iProjU*dims.iProjV];
+	// One host slice filled with ones; it is uploaded to every z position below.
+	float* slice = new float[dims.iVolX*dims.iVolY];
+	for (int i = 0; i < dims.iVolX*dims.iVolY; ++i)
+		slice[i] = 1.0f;
+	// With iProjAngles == 241 the step equals the bound, so this loop body runs once (a = 0).
+	for (unsigned int a = 0; a < 241; a += dims.iProjAngles) {
+
+		zeroProjectionData(projData, dims);
+
+		for (int y = 0; y < iy; y += dims.iVolY) {
+			for (int x = 0; x < ix; x += dims.iVolX) { 
+
+				timeval st;
+				tic(st);
+				// Copy the host slice into each z-slice of the device volume.
+				for (int z = 0; z < dims.iVolZ; ++z) {
+//					char sfn[256];
+//					sprintf(sfn, "/home/wpalenst/projects/cone_simulation/phantom_4096/mouse_fem_phantom_%04d.png", 30+z);
+//					float* slice = loadSubImage(sfn, x, y, dims.iVolX, dims.iVolY);
+
+					cudaPitchedPtr ptr;
+					ptr.ptr = slice;
+					ptr.pitch = dims.iVolX*sizeof(float);
+					ptr.xsize = dims.iVolX*sizeof(float);
+					ptr.ysize = dims.iVolY;
+					cudaExtent extentS;
+					extentS.width = dims.iVolX*sizeof(float);
+					extentS.height = dims.iVolY;
+					extentS.depth = 1;
+
+					cudaPos sp = { 0, 0, 0 };
+					cudaPos dp = { 0, 0, z };
+					cudaMemcpy3DParms p;
+					p.srcArray = 0;
+					p.srcPos = sp;
+					p.srcPtr = ptr;
+					p.dstArray = 0;
+					p.dstPos = dp;
+					p.dstPtr = volData;
+					p.extent = extentS;
+					p.kind = cudaMemcpyHostToDevice;
+					cudaError err = cudaMemcpy3D(&p);
+					assert(!err);
+//					delete[] slice;
+				}
+
+				printf("Load: %f\n", toc(st));
+
+#if 0
+
+	cudaPos zp = { 0, 0, 0 };
+
+	cudaPitchedPtr t;
+	t.ptr = new float[1024*1024];
+	t.pitch = 1024*4;
+	t.xsize = 1024*4;
+	t.ysize = 1024;
+
+	cudaMemcpy3DParms p;
+	p.srcArray = 0;
+	p.srcPos = zp;
+	p.srcPtr = volData;
+	p.extent = extentS;
+	p.dstArray = 0;
+	p.dstPtr = t;
+	p.dstPos = zp;
+	p.kind = cudaMemcpyDeviceToHost;
+	cudaError err = cudaMemcpy3D(&p);
+	assert(!err);
+
+	char fn[32];
+	sprintf(fn, "t%d%d.png", x / dims.iVolX, y / dims.iVolY);
+	saveImage(fn, 1024, 1024, (float*)t.ptr);
+	saveImage("s.png", 4096, 4096, slice);
+	delete[] (float*)t.ptr;
+#endif
+
+				// Build the angles by rotating the base geometry in the XY plane in steps of 0.8 degrees (Z unchanged).
+#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = base.f##name##X * cos(alpha) - base.f##name##Y * sin(alpha); angle[i].f##name##Y = base.f##name##X * sin(alpha) + base.f##name##Y * cos(alpha); angle[i].f##name##Z = base.f##name##Z; } while(0)
+#define SHIFT(name,i,x,y) do { angle[i].f##name##X += x; angle[i].f##name##Y += y; } while(0)
+				for (int i = 0; i < dims.iProjAngles; ++i) {
+					ROTATE0(Ray, i, (a+i)*.8*M_PI/180);
+					ROTATE0(DetS, i, (a+i)*.8*M_PI/180);
+					ROTATE0(DetU, i, (a+i)*.8*M_PI/180);
+					ROTATE0(DetV, i, (a+i)*.8*M_PI/180);
+
+
+//					SHIFT(Src, i, (-x+1536), (-y+1536));
+//					SHIFT(DetS, i, (-x+1536), (-y+1536));
+				}
+#undef ROTATE0
+#undef SHIFT
+				tic(st);
+
+				astraCUDA3d::Par3DFP(volData, projData, dims, angle, 1.0f);
+
+				printf("FP: %f\n", toc(st));
+
+			}
+		}
+		for (unsigned int aa = 0; aa < dims.iProjAngles; ++aa) { // copy the rows of angle aa into buf, then save them
+			for (unsigned int v = 0; v < dims.iProjV; ++v)
+				cudaMemcpy(buf+v*dims.iProjU, ((float*)projData.ptr)+(v*dims.iProjAngles+aa)*(projData.pitch/sizeof(float)), dims.iProjU*sizeof(float), cudaMemcpyDeviceToHost);
+
+			char fname[32];
+			sprintf(fname, "proj%03d.png", a+aa);
+			saveImage(fname, dims.iProjV, dims.iProjU, buf, 0.0f, 1000.0f);
+		}
+	}
+
+	delete[] buf; // NOTE(review): slice, volData and projData are not released in this function
+
+}
+#endif
diff --git a/cuda/3d/par3d_fp.h b/cuda/3d/par3d_fp.h
new file mode 100644
index 0000000..7208361
--- /dev/null
+++ b/cuda/3d/par3d_fp.h
@@ -0,0 +1,51 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_PAR3D_FP_H
+#define _CUDA_PAR3D_FP_H
+
+namespace astraCUDA3d {
+// Par3D forward projection with the volume supplied as a cudaArray (D_volArray); output goes to D_projData.
+_AstraExport bool Par3DFP_Array(cudaArray *D_volArray,
+                   cudaPitchedPtr D_projData,
+                   const SDimensions3D& dims, const SPar3DProjection* angles,
+                   float fOutputScale);
+// As Par3DFP_Array, but takes a pitched device volume that is first copied into a temporary cudaArray.
+_AstraExport bool Par3DFP(cudaPitchedPtr D_volumeData,
+            cudaPitchedPtr D_projData,
+            const SDimensions3D& dims, const SPar3DProjection* angles,
+            float fOutputScale);
+// Variant that launches the par3D_FP_SumSqW_* kernels; its definition does not read D_volumeData.
+_AstraExport bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData,
+                    cudaPitchedPtr D_projData,
+                    const SDimensions3D& dims, const SPar3DProjection* angles,
+                    float fOutputScale);
+
+}
+
+#endif
diff --git a/cuda/3d/sirt3d.cu b/cuda/3d/sirt3d.cu
new file mode 100644
index 0000000..f615204
--- /dev/null
+++ b/cuda/3d/sirt3d.cu
@@ -0,0 +1,533 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+
+#include "sirt3d.h"
+#include "util3d.h"
+#include "arith3d.h"
+#include "cone_fp.h"
+
+#ifdef STANDALONE
+#include "testutil.h"
+#endif
+
+namespace astraCUDA3d {
+// Start with every device buffer pointer null and all optional features switched off.
+SIRT::SIRT() : ReconAlgo3D()
+{
+	// No masks and no clamping until explicitly requested.
+	useMinConstraint = false;
+	useMaxConstraint = false;
+	useSinogramMask = false;
+	useVolumeMask = false;
+
+	// Buffers supplied from outside (setVolumeMask, setSinogramMask,
+	// setBuffers).
+	D_volumeData.ptr = 0;
+	D_sinoData.ptr = 0;
+	D_smaskData.ptr = 0;
+	D_maskData.ptr = 0;
+
+	// Work buffers allocated by init().
+	D_pixelWeight.ptr = 0;
+	D_lineWeight.ptr = 0;
+	D_tmpData.ptr = D_projData.ptr = 0;
+}
+
+// Releases the internal work buffers by calling reset().
+SIRT::~SIRT()
+{
+	reset();
+}
+// Free the buffers allocated by init(), clear all buffer pointers and mask flags, then reset the base class.
+void SIRT::reset()
+{
+	// Only the four internally allocated buffers are freed; the mask,
+	// sinogram and volume buffers were handed in from outside and are
+	// merely forgotten.
+	cudaFree(D_pixelWeight.ptr);
+	cudaFree(D_lineWeight.ptr);
+	cudaFree(D_tmpData.ptr);
+	cudaFree(D_projData.ptr);
+
+	D_pixelWeight.ptr = 0;
+	D_lineWeight.ptr = 0;
+	D_tmpData.ptr = 0;
+	D_projData.ptr = 0;
+	D_volumeData.ptr = D_sinoData.ptr = 0;
+	D_smaskData.ptr = D_maskData.ptr = 0;
+
+	// The min/max constraint settings are left as they are.
+	useSinogramMask = false;
+	useVolumeMask = false;
+
+	// Finally let the base class clear its own state.
+	ReconAlgo3D::reset();
+}
+// Request use of a volume mask; the mask buffer itself is supplied through setVolumeMask(). Always returns true.
+bool SIRT::enableVolumeMask()
+{
+	useVolumeMask = true;
+	return true;
+}
+// Request use of a sinogram mask; the mask buffer itself is supplied through setSinogramMask(). Always returns true.
+bool SIRT::enableSinogramMask()
+{
+	useSinogramMask = true;
+	return true;
+}
+
+// Allocate and zero the four internal work buffers, and precompute the weights when no mask is in use.
+bool SIRT::init()
+{
+	// Returns false if an allocation failed; buffers obtained before the
+	// failure are released later by reset().
+	D_pixelWeight = allocateVolumeData(dims);
+	D_tmpData = allocateVolumeData(dims);
+	D_projData = allocateProjectionData(dims);
+	D_lineWeight = allocateProjectionData(dims);
+	// NOTE(review): relies on both allocators returning ptr == 0 on failure -- confirm for allocateProjectionData.
+	if (!D_pixelWeight.ptr || !D_tmpData.ptr || !D_projData.ptr || !D_lineWeight.ptr)
+		return false;
+	zeroVolumeData(D_pixelWeight, dims);
+	zeroVolumeData(D_tmpData, dims);
+	zeroProjectionData(D_projData, dims);
+	zeroProjectionData(D_lineWeight, dims);
+
+	// We can't precompute lineWeights and pixelWeights when using a mask
+	if (!useVolumeMask && !useSinogramMask)
+		precomputeWeights();
+	return true;
+}
+// Enable clamping of the volume to a minimum of fMin in subsequent iterate() calls. Always returns true.
+bool SIRT::setMinConstraint(float fMin)
+{
+	fMinConstraint = fMin;
+	useMinConstraint = true;
+	return true;
+}
+// Enable clamping of the volume to a maximum of fMax in subsequent iterate() calls. Always returns true.
+bool SIRT::setMaxConstraint(float fMax)
+{
+	fMaxConstraint = fMax;
+	useMaxConstraint = true;
+	return true;
+}
+// Fill D_lineWeight (opInvert of an FP) and D_pixelWeight (opInvert of a BP), honouring the enabled masks. Always returns true.
+bool SIRT::precomputeWeights()
+{
+	zeroProjectionData(D_lineWeight, dims);
+	if (useVolumeMask) {
+		callFP(D_maskData, D_lineWeight, 1.0f);
+	} else {
+		processVol3D<opSet>(D_tmpData, 1.0f, dims); // D_tmpData is overwritten with ones
+		callFP(D_tmpData, D_lineWeight, 1.0f);
+	}
+	processSino3D<opInvert>(D_lineWeight, dims);
+
+	if (useSinogramMask) {
+		// scale line weights with sinogram mask to zero out masked sinogram pixels
+		processSino3D<opMul>(D_lineWeight, D_smaskData, dims);
+	}
+	// Pixel weights: back-project the sinogram mask, or a sinogram set to 1.0f (this overwrites D_projData).
+	zeroVolumeData(D_pixelWeight, dims);
+
+	if (useSinogramMask) {
+		callBP(D_pixelWeight, D_smaskData);
+	} else {
+		processSino3D<opSet>(D_projData, 1.0f, dims);
+		callBP(D_pixelWeight, D_projData);
+	}
+#if 0
+	float* bufp = new float[512*512];
+
+	for (int i = 0; i < 180; ++i) {
+		for (int j = 0; j < 512; ++j) {
+			cudaMemcpy(bufp+512*j, ((float*)D_projData.ptr)+180*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost);
+		}
+
+		char fname[20];
+		sprintf(fname, "ray%03d.png", i);
+		saveImage(fname, 512, 512, bufp);
+	}
+#endif
+	// (Both #if 0 regions here are disabled debug dumps with hard-coded sizes.)
+#if 0
+	float* buf = new float[256*256];
+
+	for (int i = 0; i < 256; ++i) {
+		cudaMemcpy(buf, ((float*)D_pixelWeight.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost);
+
+		char fname[20];
+		sprintf(fname, "pix%03d.png", i);
+		saveImage(fname, 256, 256, buf);
+	}
+#endif
+	processVol3D<opInvert>(D_pixelWeight, dims);
+
+	if (useVolumeMask) {
+		// scale pixel weights with mask to zero out masked pixels
+		processVol3D<opMul>(D_pixelWeight, D_maskData, dims);
+	}
+
+	return true;
+}
+
+// Store the volume mask buffer (the cudaPitchedPtr is copied, not the data). Always returns true.
+bool SIRT::setVolumeMask(cudaPitchedPtr& _D_maskData)
+{
+	assert(useVolumeMask); // enableVolumeMask() must have been called; only checked when asserts are enabled
+
+	D_maskData = _D_maskData;
+
+	return true;
+}
+// Store the sinogram mask buffer (the cudaPitchedPtr is copied, not the data). Always returns true.
+bool SIRT::setSinogramMask(cudaPitchedPtr& _D_smaskData)
+{
+	assert(useSinogramMask); // enableSinogramMask() must have been called; only checked when asserts are enabled
+
+	D_smaskData = _D_smaskData;
+
+	return true;
+}
+// Store the reconstruction volume and sinogram buffers used by iterate(); logs the volume pointer to stderr.
+bool SIRT::setBuffers(cudaPitchedPtr& _D_volumeData,
+                      cudaPitchedPtr& _D_projData)
+{
+	D_volumeData = _D_volumeData;
+	D_sinoData = _D_projData; // the second argument is kept as the sinogram, not as D_projData
+
+	fprintf(stderr, "Reconstruction buffer: %p\n", (void*)D_volumeData.ptr);
+
+	return true;
+}
+// Run up to `iterations` SIRT updates on D_volumeData, stopping early once shouldAbort is set. Always returns true.
+bool SIRT::iterate(unsigned int iterations)
+{
+	shouldAbort = false;
+	// With a mask enabled, init() skipped the weights, so they are (re)computed on every call.
+	if (useVolumeMask || useSinogramMask)
+		precomputeWeights();
+
+#if 0
+	float* buf = new float[256*256];
+
+	for (int i = 0; i < 256; ++i) {
+		cudaMemcpy(buf, ((float*)D_pixelWeight.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost);
+
+		char fname[20];
+		sprintf(fname, "pix%03d.png", i);
+		saveImage(fname, 256, 256, buf);
+	}
+#endif
+#if 0
+	float* bufp = new float[512*512];
+
+	for (int i = 0; i < 100; ++i) {
+		for (int j = 0; j < 512; ++j) {
+			cudaMemcpy(bufp+512*j, ((float*)D_lineWeight.ptr)+100*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost);
+		}
+
+		char fname[20];
+		sprintf(fname, "ray%03d.png", i);
+		saveImage(fname, 512, 512, bufp);
+	}
+#endif
+
+
+	// iteration
+	for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) {
+		// copy sinogram to projection data
+		duplicateProjectionData(D_projData, D_sinoData, dims);
+
+		// do FP, subtracting projection from sinogram
+		if (useVolumeMask) {
+				duplicateVolumeData(D_tmpData, D_volumeData, dims);
+				processVol3D<opMul>(D_tmpData, D_maskData, dims);
+				callFP(D_tmpData, D_projData, -1.0f);
+		} else {
+				callFP(D_volumeData, D_projData, -1.0f);
+		}
+		// Scale the residual in D_projData by the line weights before it is back-projected.
+		processSino3D<opMul>(D_projData, D_lineWeight, dims);
+
+		zeroVolumeData(D_tmpData, dims);
+#if 0
+	float* bufp = new float[512*512];
+	printf("Dumping projData: %p\n", (void*)D_projData.ptr);
+	for (int i = 0; i < 180; ++i) {
+		for (int j = 0; j < 512; ++j) {
+			cudaMemcpy(bufp+512*j, ((float*)D_projData.ptr)+180*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost);
+		}
+
+		char fname[20];
+		sprintf(fname, "diff%03d.png", i);
+		saveImage(fname, 512, 512, bufp);
+	}
+#endif
+
+		// Back-project the weighted residual into the zeroed D_tmpData.
+		callBP(D_tmpData, D_projData);
+#if 0
+	printf("Dumping tmpData: %p\n", (void*)D_tmpData.ptr);
+	float* buf = new float[256*256];
+
+	for (int i = 0; i < 256; ++i) {
+		cudaMemcpy(buf, ((float*)D_tmpData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost);
+
+		char fname[20];
+		sprintf(fname, "add%03d.png", i);
+		saveImage(fname, 256, 256, buf);
+	}
+#endif
+
+		// Volume update: opAddMul combines D_volumeData with D_tmpData and D_pixelWeight.
+		processVol3D<opAddMul>(D_volumeData, D_tmpData, D_pixelWeight, dims);
+		// Optional clamping of the updated volume.
+		if (useMinConstraint)
+			processVol3D<opClampMin>(D_volumeData, fMinConstraint, dims);
+		if (useMaxConstraint)
+			processVol3D<opClampMax>(D_volumeData, fMaxConstraint, dims);
+	}
+
+	return true;
+}
+// Residual norm: sqrt of dotProduct3D over (sinogram - FP of the current volume); overwrites D_projData.
+float SIRT::computeDiffNorm()
+{
+	// Start from a copy of the measured sinogram ...
+	duplicateProjectionData(D_projData, D_sinoData, dims);
+
+	if (!useVolumeMask) {
+		// ... and subtract the forward projection of the volume itself,
+		callFP(D_volumeData, D_projData, -1.0f);
+	} else {
+		// ... or of a masked copy of the volume, built in D_tmpData.
+		duplicateVolumeData(D_tmpData, D_volumeData, dims);
+		processVol3D<opMul>(D_tmpData, D_maskData, dims);
+		callFP(D_tmpData, D_projData, -1.0f);
+	}
+	float fSumSq = dotProduct3D(D_projData, dims.iProjU, dims.iProjAngles, dims.iProjV);
+	return sqrt(fSumSq);
+}
+
+// One-shot helper: configure a SIRT object for a cone-beam geometry and run the given number of iterations.
+bool doSIRT(cudaPitchedPtr& D_volumeData,
+            cudaPitchedPtr& D_sinoData,
+            cudaPitchedPtr& D_maskData,
+            const SDimensions3D& dims, const SConeProjection* angles,
+            unsigned int iterations)
+{
+	// A null mask pointer means "no volume mask".
+	const bool bHaveMask = (D_maskData.ptr != 0);
+
+	SIRT sirt;
+
+	// Geometry and mask mode have to be fixed before init(). Both calls are
+	// made before either result is inspected.
+	const bool bGeometry = sirt.setConeGeometry(dims, angles);
+	const bool bMaskMode = bHaveMask ? sirt.enableVolumeMask() : true;
+	if (!bGeometry || !bMaskMode)
+		return false;
+
+	if (!sirt.init())
+		return false;
+
+	// Hand over the mask (if any) and the in/out buffers.
+	const bool bMaskSet = bHaveMask ? sirt.setVolumeMask(D_maskData) : true;
+	const bool bBuffers = sirt.setBuffers(D_volumeData, D_sinoData);
+	if (!bMaskSet || !bBuffers)
+		return false;
+
+	// The outcome of the iteration loop is the overall result.
+	return sirt.iterate(iterations);
+}
+
+
+}
+
+#ifdef STANDALONE
+
+using namespace astraCUDA3d;
+// Standalone test driver: 100-angle cone geometry, projections loaded from Tiffs/*.png, 50 SIRT iterations, volume saved as PNGs.
+int main()
+{
+	SDimensions3D dims;
+	dims.iVolX = 256;
+	dims.iVolY = 256;
+	dims.iVolZ = 256;
+	dims.iProjAngles = 100;
+	dims.iProjU = 512;
+	dims.iProjV = 512;
+	dims.iRaysPerDet = 1;
+
+	SConeProjection angle[100];
+	angle[0].fSrcX = -2905.6;
+	angle[0].fSrcY = 0;
+	angle[0].fSrcZ = 0;
+
+	angle[0].fDetSX = 694.4;
+	angle[0].fDetSY = -122.4704;
+	angle[0].fDetSZ = -122.4704;
+
+	angle[0].fDetUX = 0;
+	angle[0].fDetUY = .4784;
+	//angle[0].fDetUY = .5;
+	angle[0].fDetUZ = 0;
+
+	angle[0].fDetVX = 0;
+	angle[0].fDetVY = 0;
+	angle[0].fDetVZ = .4784;
+	// Angles 1..99 are copies of angle 0 with X/Y rotated by i*2*pi/100; Z components are kept.
+#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0)
+	for (int i = 1; i < 100; ++i) {
+		angle[i] = angle[0];
+		ROTATE0(Src, i, i*2*M_PI/100);
+		ROTATE0(DetS, i, i*2*M_PI/100);
+		ROTATE0(DetU, i, i*2*M_PI/100);
+		ROTATE0(DetV, i, i*2*M_PI/100);
+	}
+#undef ROTATE0
+
+
+	cudaPitchedPtr volData = allocateVolumeData(dims);
+	cudaPitchedPtr projData = allocateProjectionData(dims);
+	zeroProjectionData(projData, dims);
+
+	float* pbuf = new float[100*512*512];
+	copyProjectionsFromDevice(pbuf, projData, dims);
+	copyProjectionsToDevice(pbuf, projData, dims);
+	delete[] pbuf;
+
+#if 0
+	float* slice = new float[256*256];
+	cudaPitchedPtr ptr;
+	ptr.ptr = slice;
+	ptr.pitch = 256*sizeof(float);
+	ptr.xsize = 256*sizeof(float);
+	ptr.ysize = 256;
+
+	for (unsigned int i = 0; i < 256; ++i) {
+		for (unsigned int y = 0; y < 256; ++y)
+			for (unsigned int x = 0; x < 256; ++x)
+				slice[y*256+x] = (i-127.5)*(i-127.5)+(y-127.5)*(y-127.5)+(x-127.5)*(x-127.5) < 4900 ? 1.0f : 0.0f;
+
+		cudaExtent extentS;
+		extentS.width = dims.iVolX*sizeof(float);
+		extentS.height = dims.iVolY;
+		extentS.depth = 1;
+		cudaPos sp = { 0, 0, 0 };
+		cudaPos dp = { 0, 0, i };
+		cudaMemcpy3DParms p;
+		p.srcArray = 0;
+		p.srcPos = sp;
+		p.srcPtr = ptr;
+		p.dstArray = 0;
+		p.dstPos = dp;
+		p.dstPtr = volData;
+		p.extent = extentS;
+		p.kind = cudaMemcpyHostToDevice;
+		cudaMemcpy3D(&p);
+	}
+	astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f);
+
+#else
+	// Load every fourth image, clamp it to 236, convert with 256*log(236/v) and upload it row by row.
+	for (int i = 0; i < 100; ++i) {
+		char fname[32];
+		sprintf(fname, "Tiffs/%04d.png", 4*i);
+		unsigned int w,h;
+		float* bufp = loadImage(fname, w,h);
+
+		for (int j = 0; j < 512*512; ++j) {
+			float v = bufp[j];
+			if (v > 236.0f) v = 236.0f;
+			v = logf(236.0f / v);
+			bufp[j] = 256*v;
+		}
+
+		for (int j = 0; j < 512; ++j) {
+			cudaMemcpy(((float*)projData.ptr)+100*512*j+512*i, bufp+512*j, 512*sizeof(float), cudaMemcpyHostToDevice);
+		}
+
+		delete[] bufp;
+
+	}
+#endif
+
+#if 0
+	float* bufs = new float[100*512];
+
+	for (int i = 0; i < 512; ++i) {
+		cudaMemcpy(bufs, ((float*)projData.ptr)+100*512*i, 100*512*sizeof(float), cudaMemcpyDeviceToHost);
+
+		printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize);
+
+		char fname[20];
+		sprintf(fname, "sino%03d.png", i);
+		saveImage(fname, 100, 512, bufs);
+	}
+
+	float* bufp = new float[512*512];
+
+	for (int i = 0; i < 100; ++i) {
+		for (int j = 0; j < 512; ++j) {
+			cudaMemcpy(bufp+512*j, ((float*)projData.ptr)+100*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost);
+		}
+
+		char fname[20];
+		sprintf(fname, "proj%03d.png", i);
+		saveImage(fname, 512, 512, bufp);
+	}
+#endif
+
+	zeroVolumeData(volData, dims);
+
+	cudaPitchedPtr maskData;
+	maskData.ptr = 0; // null mask pointer: doSIRT then runs without a volume mask
+
+	astraCUDA3d::doSIRT(volData, projData, maskData, dims, angle, 50);
+#if 1
+	float* buf = new float[256*256];
+	// NOTE(review): these copies index the device buffer with a stride of 256 floats and ignore volData.pitch.
+	for (int i = 0; i < 256; ++i) {
+		cudaMemcpy(buf, ((float*)volData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost);
+
+		char fname[20];
+		sprintf(fname, "vol%03d.png", i);
+		saveImage(fname, 256, 256, buf);
+	}
+#endif
+
+	return 0;
+}
+#endif
+
diff --git a/cuda/3d/sirt3d.h b/cuda/3d/sirt3d.h
new file mode 100644
index 0000000..c3752c2
--- /dev/null
+++ b/cuda/3d/sirt3d.h
@@ -0,0 +1,118 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_SIRT3D_H
+#define _CUDA_SIRT3D_H
+
+#include "util3d.h"
+#include "algo3d.h"
+
+namespace astraCUDA3d {
+// SIRT reconstruction for 3D geometries, built on ReconAlgo3D; works on device buffers supplied by the caller.
+class _AstraExport SIRT : public ReconAlgo3D {
+public:
+	SIRT();
+	~SIRT();
+
+//	bool setConeGeometry(const SDimensions3D& dims, const SConeProjection* projs);
+
+	// Request mask support. Call before init(): init() only precomputes the weights when no mask is enabled.
+	bool enableVolumeMask();
+	bool enableSinogramMask();
+
+	// init should be called after setting all geometry
+	bool init();
+
+	// setVolumeMask should be called after init and before iterate,
+	// but only if enableVolumeMask was called before init.
+	// It may be called again after iterate.
+	bool setVolumeMask(cudaPitchedPtr& D_maskData);
+
+	// setSinogramMask should be called after init and before iterate,
+	// but only if enableSinogramMask was called before init.
+	// It may be called again after iterate.
+	bool setSinogramMask(cudaPitchedPtr& D_smaskData);
+
+
+	// setBuffers should be called after init and before iterate.
+	// It may be called again after iterate.
+	bool setBuffers(cudaPitchedPtr& D_volumeData,
+	                cudaPitchedPtr& D_projData);
+
+
+	// set Min/Max constraints. They may be called at any time, and will affect
+	// any iterate() calls afterwards.
+	bool setMinConstraint(float fMin);
+	bool setMaxConstraint(float fMax);
+
+	// iterate should be called after init and setBuffers.
+	// It may be called multiple times.
+	bool iterate(unsigned int iterations);
+
+	// Compute the norm of the difference of the FP of the current reconstruction
+	// and the sinogram. (This performs one FP.)
+	// It can be called after iterate.
+	float computeDiffNorm();
+
+protected:
+	void reset(); // frees the internal work buffers and clears all buffer pointers
+	bool precomputeWeights(); // fills D_lineWeight and D_pixelWeight
+
+	bool useVolumeMask;
+	bool useSinogramMask;
+	// Optional clamping applied to the volume at the end of each iteration.
+	bool useMinConstraint;
+	bool useMaxConstraint;
+	float fMinConstraint;
+	float fMaxConstraint;
+	// Mask buffers, stored by setVolumeMask / setSinogramMask.
+	cudaPitchedPtr D_maskData;
+	cudaPitchedPtr D_smaskData;
+
+	// Input/output
+	cudaPitchedPtr D_sinoData;
+	cudaPitchedPtr D_volumeData;
+
+ 	// Temporary buffers
+	cudaPitchedPtr D_projData;
+	cudaPitchedPtr D_tmpData;
+
+	// Geometry-specific precomputed data
+	cudaPitchedPtr D_lineWeight;
+	cudaPitchedPtr D_pixelWeight;
+};
+// One-shot SIRT run; the prototype now matches the definition in sirt3d.cu. A null D_maskData.ptr means no mask.
+bool doSIRT(cudaPitchedPtr& D_volumeData,
+            cudaPitchedPtr& D_projData,
+            cudaPitchedPtr& D_maskData,
+            const SDimensions3D& dims, const SConeProjection* projs,
+            unsigned int iterations);
+
+}
+
+#endif
diff --git a/cuda/3d/util3d.cu b/cuda/3d/util3d.cu
new file mode 100644
index 0000000..81ea823
--- /dev/null
+++ b/cuda/3d/util3d.cu
@@ -0,0 +1,514 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#include <cstdio>
+#include <cassert>
+#include "util3d.h"
+#include "../2d/util.h"
+
+namespace astraCUDA3d {
+
+
+// Allocate a pitched 3D device buffer holding iVolX x iVolY x iVolZ floats.
+// If the allocation fails, the error is reported and the returned
+// cudaPitchedPtr has its ptr member set to 0.
+cudaPitchedPtr allocateVolumeData(const SDimensions3D& dims)
+{
+	// For linear memory the extent width is given in bytes.
+	const cudaExtent volExtent = make_cudaExtent(dims.iVolX*sizeof(float),
+	                                             dims.iVolY,
+	                                             dims.iVolZ);
+
+	cudaPitchedPtr D_vol;
+	cudaError status = cudaMalloc3D(&D_vol, volExtent);
+	if (status == cudaSuccess)
+		return D_vol;
+
+	astraCUDA::reportCudaError(status);
+	fprintf(stderr, "Failed to allocate %dx%dx%d GPU buffer\n", dims.iVolX, dims.iVolY, dims.iVolZ);
+	D_vol.ptr = 0;
+	// TODO: return 0 somehow?
+	return D_vol;
+}
+// Allocate a pitched 3D device buffer holding
+// iProjU x iProjAngles x iProjV floats (angles are the middle dimension).
+// If the allocation fails, the error is reported and the returned
+// cudaPitchedPtr has its ptr member set to 0.
+cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims)
+{
+	// For linear memory the extent width is given in bytes.
+	const cudaExtent projExtent = make_cudaExtent(dims.iProjU*sizeof(float),
+	                                              dims.iProjAngles,
+	                                              dims.iProjV);
+
+	cudaPitchedPtr D_proj;
+	cudaError status = cudaMalloc3D(&D_proj, projExtent);
+	if (status == cudaSuccess)
+		return D_proj;
+
+	astraCUDA::reportCudaError(status);
+	fprintf(stderr, "Failed to allocate %dx%dx%d GPU buffer\n", dims.iProjU, dims.iProjAngles, dims.iProjV);
+	D_proj.ptr = 0;
+	// TODO: return 0 somehow?
+	return D_proj;
+}
+// Set every voxel of a pitched volume buffer to zero.
+// The buffer is cleared one z-slice at a time; consecutive slices are
+// D_data.pitch * iVolY bytes apart.
+// Returns true on success, false as soon as a cudaMemset2D call fails
+// (matching the copy helpers, which also report failure to the caller).
+bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims)
+{
+	char* t = (char*)D_data.ptr;
+	cudaError err;
+
+	for (unsigned int z = 0; z < dims.iVolZ; ++z) {
+		err = cudaMemset2D(t, D_data.pitch, 0, dims.iVolX*sizeof(float), dims.iVolY);
+		ASTRA_CUDA_ASSERT(err);
+		// The assert may be compiled out; do not report success after a failure.
+		if (err != cudaSuccess)
+			return false;
+		t += D_data.pitch * dims.iVolY;
+	}
+	return true;
+}
+// Set every element of a pitched projection buffer to zero.
+// The buffer is cleared one v-slice at a time; consecutive slices are
+// D_data.pitch * iProjAngles bytes apart.
+// Returns true on success, false as soon as a cudaMemset2D call fails
+// (matching the copy helpers, which also report failure to the caller).
+bool zeroProjectionData(cudaPitchedPtr& D_data, const SDimensions3D& dims)
+{
+	char* t = (char*)D_data.ptr;
+	cudaError err;
+
+	for (unsigned int z = 0; z < dims.iProjV; ++z) {
+		err = cudaMemset2D(t, D_data.pitch, 0, dims.iProjU*sizeof(float), dims.iProjAngles);
+		ASTRA_CUDA_ASSERT(err);
+		// The assert may be compiled out; do not report success after a failure.
+		if (err != cudaSuccess)
+			return false;
+		t += D_data.pitch * dims.iProjAngles;
+	}
+
+	return true;
+}
+// Copy a host volume into a pitched device buffer.
+//   data   host array of iVolX x iVolY x iVolZ floats
+//   D_data destination device buffer
+//   pitch  host row stride in floats; 0 means densely packed (iVolX)
+// Returns true iff cudaMemcpy3D succeeded.
+bool copyVolumeToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
+{
+	const unsigned int hostPitch = pitch ? pitch : dims.iVolX;
+
+	// Zero-initialise so that both arrays are null and both positions
+	// are (0,0,0).
+	cudaMemcpy3DParms parms = {0};
+	// const is cast away only because cudaPitchedPtr stores a void*;
+	// data is the source of the copy.
+	parms.srcPtr = make_cudaPitchedPtr(const_cast<float*>(data),
+	                                   hostPitch*sizeof(float),
+	                                   dims.iVolX*sizeof(float),
+	                                   dims.iVolY);
+	parms.dstPtr = D_data;
+	parms.extent = make_cudaExtent(dims.iVolX*sizeof(float), dims.iVolY, dims.iVolZ);
+	parms.kind = cudaMemcpyHostToDevice;
+
+	cudaError status = cudaMemcpy3D(&parms);
+	ASTRA_CUDA_ASSERT(status);
+
+	return status == cudaSuccess;
+}
+
+// Copy host projection data into a pitched device buffer.
+//   data   host array of iProjU x iProjAngles x iProjV floats
+//   D_data destination device buffer
+//   pitch  host row stride in floats; 0 means densely packed (iProjU)
+// Returns true iff cudaMemcpy3D succeeded.
+bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
+{
+	const unsigned int hostPitch = pitch ? pitch : dims.iProjU;
+
+	// Zero-initialise so that both arrays are null and both positions
+	// are (0,0,0).
+	cudaMemcpy3DParms parms = {0};
+	// const is cast away only because cudaPitchedPtr stores a void*;
+	// data is the source of the copy.
+	parms.srcPtr = make_cudaPitchedPtr(const_cast<float*>(data),
+	                                   hostPitch*sizeof(float),
+	                                   dims.iProjU*sizeof(float),
+	                                   dims.iProjAngles);
+	parms.dstPtr = D_data;
+	parms.extent = make_cudaExtent(dims.iProjU*sizeof(float), dims.iProjAngles, dims.iProjV);
+	parms.kind = cudaMemcpyHostToDevice;
+
+	cudaError status = cudaMemcpy3D(&parms);
+	ASTRA_CUDA_ASSERT(status);
+
+	return status == cudaSuccess;
+}
+
+// Copy a pitched device volume back into a host array.
+//   data   destination host array of iVolX x iVolY x iVolZ floats
+//   D_data source device buffer
+//   pitch  host row stride in floats; 0 means densely packed (iVolX)
+// Returns true iff cudaMemcpy3D succeeded.
+bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
+{
+	const unsigned int hostPitch = pitch ? pitch : dims.iVolX;
+
+	// Zero-initialise so that both arrays are null and both positions
+	// are (0,0,0).
+	cudaMemcpy3DParms parms = {0};
+	parms.srcPtr = D_data;
+	parms.dstPtr = make_cudaPitchedPtr(data,
+	                                   hostPitch*sizeof(float),
+	                                   dims.iVolX*sizeof(float),
+	                                   dims.iVolY);
+	parms.extent = make_cudaExtent(dims.iVolX*sizeof(float), dims.iVolY, dims.iVolZ);
+	parms.kind = cudaMemcpyDeviceToHost;
+
+	cudaError status = cudaMemcpy3D(&parms);
+	ASTRA_CUDA_ASSERT(status);
+
+	return status == cudaSuccess;
+}
+// Copy pitched device projection data back into a host array.
+//   data   destination host array of iProjU x iProjAngles x iProjV floats
+//   D_data source device buffer
+//   pitch  host row stride in floats; 0 means densely packed (iProjU)
+// Returns true iff cudaMemcpy3D succeeded.
+bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch)
+{
+	const unsigned int hostPitch = pitch ? pitch : dims.iProjU;
+
+	// Zero-initialise so that both arrays are null and both positions
+	// are (0,0,0).
+	cudaMemcpy3DParms parms = {0};
+	parms.srcPtr = D_data;
+	parms.dstPtr = make_cudaPitchedPtr(data,
+	                                   hostPitch*sizeof(float),
+	                                   dims.iProjU*sizeof(float),
+	                                   dims.iProjAngles);
+	parms.extent = make_cudaExtent(dims.iProjU*sizeof(float), dims.iProjAngles, dims.iProjV);
+	parms.kind = cudaMemcpyDeviceToHost;
+
+	cudaError status = cudaMemcpy3D(&parms);
+	ASTRA_CUDA_ASSERT(status);
+
+	return status == cudaSuccess;
+}
+
+// Device-to-device copy of a whole volume (iVolX x iVolY x iVolZ floats)
+// from D_src into D_dst.
+// Returns true iff cudaMemcpy3D succeeded.
+bool duplicateVolumeData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims)
+{
+	// Zero-initialise so that both arrays are null and both positions
+	// are (0,0,0).
+	cudaMemcpy3DParms parms = {0};
+	parms.srcPtr = D_src;
+	parms.dstPtr = D_dst;
+	parms.extent = make_cudaExtent(dims.iVolX*sizeof(float), dims.iVolY, dims.iVolZ);
+	parms.kind = cudaMemcpyDeviceToDevice;
+
+	cudaError status = cudaMemcpy3D(&parms);
+	ASTRA_CUDA_ASSERT(status);
+
+	return status == cudaSuccess;
+}
+// Device-to-device copy of a whole projection set
+// (iProjU x iProjAngles x iProjV floats) from D_src into D_dst.
+// Returns true iff cudaMemcpy3D succeeded.
+bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims)
+{
+	// Zero-initialise so that both arrays are null and both positions
+	// are (0,0,0).
+	cudaMemcpy3DParms parms = {0};
+	parms.srcPtr = D_src;
+	parms.dstPtr = D_dst;
+	parms.extent = make_cudaExtent(dims.iProjU*sizeof(float), dims.iProjAngles, dims.iProjV);
+	parms.kind = cudaMemcpyDeviceToDevice;
+
+	cudaError status = cudaMemcpy3D(&parms);
+	ASTRA_CUDA_ASSERT(status);
+
+	return status == cudaSuccess;
+}
+
+
+
+// TODO: Consider using a single array of size max(proj,volume) (per dim)
+//       instead of allocating a new one each time
+
+// TODO: Figure out a faster way of zeroing the padding?
+
+// Allocate a float CUDA array for a volume, padded by one element on each
+// side in all three dimensions ((iVolX+2) x (iVolY+2) x (iVolZ+2)), and
+// zero it.
+// Returns the array, or 0 if either the allocation or the zeroing failed
+// (in which case nothing is left allocated).
+cudaArray* allocateVolumeArray(const SDimensions3D& dims)
+{
+	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+	cudaArray* cuArray;
+	cudaExtent extentA;
+	extentA.width = dims.iVolX+2;
+	extentA.height = dims.iVolY+2;
+	extentA.depth = dims.iVolZ+2;
+	cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
+	if (err != cudaSuccess) {
+		astraCUDA::reportCudaError(err);
+		fprintf(stderr, "Failed to allocate %dx%dx%d GPU array\n", dims.iVolX, dims.iVolY, dims.iVolZ);
+		return 0;
+	}
+
+	// Do not hand out an array whose contents (including the border) are
+	// undefined.
+	if (!zeroVolumeArray(cuArray, dims)) {
+		fprintf(stderr, "Failed to zero %dx%dx%d GPU array\n", dims.iVolX, dims.iVolY, dims.iVolZ);
+		cudaFreeArray(cuArray);
+		return 0;
+	}
+
+	return cuArray;
+}
+// Allocate a float CUDA array for projection data, padded by one element on
+// each side in the u and v dimensions but not in the angle dimension
+// ((iProjU+2) x iProjAngles x (iProjV+2)), and zero it.
+// Returns the array, or 0 if either the allocation or the zeroing failed
+// (in which case nothing is left allocated).
+cudaArray* allocateProjectionArray(const SDimensions3D& dims)
+{
+	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+	cudaArray* cuArray;
+	cudaExtent extentA;
+	extentA.width = dims.iProjU+2;
+	extentA.height = dims.iProjAngles;
+	extentA.depth = dims.iProjV+2;
+	cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA);
+
+	if (err != cudaSuccess) {
+		astraCUDA::reportCudaError(err);
+		fprintf(stderr, "Failed to allocate %dx%dx%d GPU array\n", dims.iProjU, dims.iProjAngles, dims.iProjV);
+		return 0;
+	}
+
+	// Do not hand out an array whose contents (including the border) are
+	// undefined.
+	if (!zeroProjectionArray(cuArray, dims)) {
+		fprintf(stderr, "Failed to zero %dx%dx%d GPU array\n", dims.iProjU, dims.iProjAngles, dims.iProjV);
+		cudaFreeArray(cuArray);
+		return 0;
+	}
+
+	return cuArray;
+}
+// Zero a padded volume array of (iVolX+2) x (iVolY+2) x (iVolZ+2) floats.
+// A single zeroed slice is allocated in linear device memory and copied
+// into each z-slice of the array in turn.
+// Returns true on success; false if the scratch allocation, the memset or
+// any of the slice copies failed. The scratch buffer is always released.
+bool zeroVolumeArray(cudaArray* array, const SDimensions3D& dims)
+{
+	cudaPitchedPtr zeroBuf;
+
+	// Extent of the scratch slice in linear memory: width in bytes.
+	cudaExtent extentS;
+	extentS.width = sizeof(float)*(dims.iVolX+2);
+	extentS.height = dims.iVolY+2;
+	extentS.depth = 1;
+
+	// Extent of one slice of the array: width in elements, because a
+	// CUDA array takes part in the copy.
+	cudaExtent extentA;
+	extentA.width = dims.iVolX+2;
+	extentA.height = dims.iVolY+2;
+	extentA.depth = 1;
+
+	cudaError err;
+	err = cudaMalloc3D(&zeroBuf, extentS);
+	ASTRA_CUDA_ASSERT(err);
+	if (err != cudaSuccess)
+		return false; // zeroBuf is not valid, nothing to free
+
+	err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iVolX+2), dims.iVolY+2);
+	ASTRA_CUDA_ASSERT(err);
+
+	// zero array, one slice at a time; stop at the first failure
+	for (unsigned int i = 0; err == cudaSuccess && i < dims.iVolZ+2; ++i) {
+		cudaMemcpy3DParms p;
+		cudaPos zp = {0, 0, 0};
+		cudaPos dp = {0, 0, i};
+		p.srcArray = 0;
+		p.srcPos = zp;
+		p.srcPtr = zeroBuf;
+		p.dstArray = array;
+		p.dstPtr.ptr = 0;
+		p.dstPtr.pitch = 0;
+		p.dstPtr.xsize = 0;
+		p.dstPtr.ysize = 0;
+		p.dstPos = dp;
+		p.extent = extentA;
+		p.kind = cudaMemcpyDeviceToDevice;
+
+		err = cudaMemcpy3D(&p);
+		ASTRA_CUDA_ASSERT(err);
+	}
+	cudaFree(zeroBuf.ptr);
+
+	return err == cudaSuccess;
+}
+// Zero a padded projection array of (iProjU+2) x iProjAngles x (iProjV+2)
+// floats. A single zeroed slice is allocated in linear device memory and
+// copied into each v-slice of the array in turn.
+// Returns true on success; false if the scratch allocation, the memset or
+// any of the slice copies failed. The scratch buffer is always released.
+bool zeroProjectionArray(cudaArray* array, const SDimensions3D& dims)
+{
+	cudaPitchedPtr zeroBuf;
+
+	// Extent of the scratch slice in linear memory: width in bytes.
+	cudaExtent extentS;
+	extentS.width = sizeof(float)*(dims.iProjU+2);
+	extentS.height = dims.iProjAngles;
+	extentS.depth = 1;
+
+	// Extent of one slice of the array: width in elements, because a
+	// CUDA array takes part in the copy.
+	cudaExtent extentA;
+	extentA.width = dims.iProjU+2;
+	extentA.height = dims.iProjAngles;
+	extentA.depth = 1;
+
+	cudaError err;
+	err = cudaMalloc3D(&zeroBuf, extentS);
+	ASTRA_CUDA_ASSERT(err);
+	if (err != cudaSuccess)
+		return false; // zeroBuf is not valid, nothing to free
+
+	err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iProjU+2), dims.iProjAngles);
+	ASTRA_CUDA_ASSERT(err);
+
+	// zero array, one slice at a time; stop at the first failure
+	for (unsigned int i = 0; err == cudaSuccess && i < dims.iProjV+2; ++i) {
+		cudaMemcpy3DParms p;
+		cudaPos zp = {0, 0, 0};
+		cudaPos dp = {0, 0, i};
+		p.srcArray = 0;
+		p.srcPos = zp;
+		p.srcPtr = zeroBuf;
+		p.dstArray = array;
+		p.dstPtr.ptr = 0;
+		p.dstPtr.pitch = 0;
+		p.dstPtr.xsize = 0;
+		p.dstPtr.ysize = 0;
+		p.dstPos = dp;
+		p.extent = extentA;
+		p.kind = cudaMemcpyDeviceToDevice;
+
+		err = cudaMemcpy3D(&p);
+		ASTRA_CUDA_ASSERT(err);
+	}
+	cudaFree(zeroBuf.ptr);
+
+	return err == cudaSuccess;
+}
+
+
+// Copy a pitched device volume (iVolX x iVolY x iVolZ floats) into a CUDA
+// array. The data is written at array offset (1,1,1), so an array that is
+// one element larger on each side keeps its border untouched.
+// Returns true iff cudaMemcpy3D succeeded.
+bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const SDimensions3D& dims)
+{
+	// A CUDA array takes part in the copy, so the extent is in elements.
+	cudaExtent extentA;
+	extentA.width = dims.iVolX;
+	extentA.height = dims.iVolY;
+	extentA.depth = dims.iVolZ;
+
+	cudaMemcpy3DParms p;
+	cudaPos zp = {0, 0, 0};
+	cudaPos dp = {1, 1, 1};
+	p.srcArray = 0;
+	p.srcPos = zp;
+	p.srcPtr = D_volumeData;
+	p.dstArray = array;
+	p.dstPtr.ptr = 0;
+	p.dstPtr.pitch = 0;
+	p.dstPtr.xsize = 0;
+	p.dstPtr.ysize = 0;
+	p.dstPos = dp;
+	p.extent = extentA;
+	p.kind = cudaMemcpyDeviceToDevice;
+
+	cudaError err = cudaMemcpy3D(&p);
+	ASTRA_CUDA_ASSERT(err);
+
+	return err == cudaSuccess;
+}
+// Copy pitched device projection data (iProjU x iProjAngles x iProjV
+// floats) into a CUDA array. The data is written at array offset (1,0,1),
+// so an array that is one element larger on each side in u and v keeps
+// its border untouched.
+// Returns true iff cudaMemcpy3D succeeded.
+bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, const SDimensions3D& dims)
+{
+	// A CUDA array takes part in the copy, so the extent is in elements.
+	cudaExtent extentA;
+	extentA.width = dims.iProjU;
+	extentA.height = dims.iProjAngles;
+	extentA.depth = dims.iProjV;
+
+	cudaMemcpy3DParms p;
+	cudaPos zp = {0, 0, 0};
+	cudaPos dp = {1, 0, 1};
+	p.srcArray = 0;
+	p.srcPos = zp;
+	p.srcPtr = D_projData;
+	p.dstArray = array;
+	p.dstPtr.ptr = 0;
+	p.dstPtr.pitch = 0;
+	p.dstPtr.xsize = 0;
+	p.dstPtr.ysize = 0;
+	p.dstPos = dp;
+	p.extent = extentA;
+	p.kind = cudaMemcpyDeviceToDevice;
+
+	cudaError err = cudaMemcpy3D(&p);
+	ASTRA_CUDA_ASSERT(err);
+
+	return err == cudaSuccess;
+}
+
+
+// Forward a pitched 3D buffer to astraCUDA::dotProduct2D by viewing it as a
+// 2D image that is x elements wide and y*z rows tall, with the row pitch
+// converted from bytes to floats.
+// NOTE(review): presumably this yields the dot product of the data with
+// itself; the meaning of the two trailing 0 arguments is defined by
+// dotProduct2D -- confirm there.
+float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y,
+                   unsigned int z)
+{
+	float* D_base = (float*)data.ptr;
+	const size_t pitchInFloats = data.pitch/sizeof(float);
+	const unsigned int rowCount = y*z;
+
+	return astraCUDA::dotProduct2D(D_base, pitchInFloats, x, rowCount, 0, 0);
+}
+
+
+// Block the host until all previously issued device work has completed.
+// Returns true on success; on failure the CUDA error is printed to stderr
+// and false is returned.
+bool cudaTextForceKernelsCompletion()
+{
+	// cudaDeviceSynchronize is the non-deprecated replacement for
+	// cudaThreadSynchronize.
+	cudaError_t returnedCudaError = cudaDeviceSynchronize();
+
+	if(returnedCudaError != cudaSuccess) {
+		fprintf(stderr, "Failed to force completion of cuda kernels: %d: %s.\n", returnedCudaError, cudaGetErrorString(returnedCudaError));
+		return false;
+	}
+
+	return true;
+}
+
+// Return the smallest power of two that is >= _iValue (1 for _iValue <= 1).
+// If no such power of two is representable in an int, the largest
+// representable power of two is returned instead; doubling further would
+// overflow (undefined behaviour, in practice an endless loop).
+int calcNextPowerOfTwo(int _iValue)
+{
+	// Largest power of two that fits in a (signed) int.
+	const int iMaxPow2 = 1 << (sizeof(int) * 8 - 2);
+
+	int iOutput = 1;
+	// iOutput < iMaxPow2 implies iOutput <= iMaxPow2 / 2, so the doubling
+	// below cannot overflow.
+	while(iOutput < _iValue && iOutput < iMaxPow2)
+		iOutput *= 2;
+	return iOutput;
+}
+
+}
diff --git a/cuda/3d/util3d.h b/cuda/3d/util3d.h
new file mode 100644
index 0000000..cf04a18
--- /dev/null
+++ b/cuda/3d/util3d.h
@@ -0,0 +1,69 @@
+/*
+-----------------------------------------------------------------------
+Copyright 2012 iMinds-Vision Lab, University of Antwerp
+
+Contact: astra@ua.ac.be
+Website: http://astra.ua.ac.be
+
+
+This file is part of the
+All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox").
+
+The ASTRA Toolbox is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+The ASTRA Toolbox is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>.
+
+-----------------------------------------------------------------------
+$Id$
+*/
+
+#ifndef _CUDA_UTIL3D_H
+#define _CUDA_UTIL3D_H
+
+#include <cuda.h>
+#include "dims3d.h"
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+#include "../2d/util.h"
+
+namespace astraCUDA3d {
+
+// Pitched linear device memory helpers.
+// Volume buffers hold iVolX x iVolY x iVolZ floats; projection buffers hold
+// iProjU x iProjAngles x iProjV floats.
+// The allocate functions return a cudaPitchedPtr whose ptr member is 0 if
+// the allocation failed.
+cudaPitchedPtr allocateVolumeData(const SDimensions3D& dims);
+cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims);
+bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims);
+bool zeroProjectionData(cudaPitchedPtr& D_data, const SDimensions3D& dims);
+// Host <-> device copies. pitch is the row stride of the host array in
+// floats; 0 selects a densely packed host array.
+bool copyVolumeToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0);
+bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0);
+bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0);
+bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0);
+// Device-to-device copies of a complete volume / projection buffer.
+bool duplicateVolumeData(cudaPitchedPtr& D_dest, const cudaPitchedPtr& D_src, const SDimensions3D& dims); 
+bool duplicateProjectionData(cudaPitchedPtr& D_dest, const cudaPitchedPtr& D_src, const SDimensions3D& dims); 
+
+
+// CUDA array helpers. The arrays carry a one-element border: volume arrays
+// are (iVolX+2) x (iVolY+2) x (iVolZ+2) floats, projection arrays are
+// (iProjU+2) x iProjAngles x (iProjV+2) floats. The transfer functions fill
+// only the interior; the allocate functions return 0 on failure and
+// otherwise zero the new array.
+bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, const SDimensions3D& dims);
+bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const SDimensions3D& dims);
+bool zeroProjectionArray(cudaArray* array, const SDimensions3D& dims);
+bool zeroVolumeArray(cudaArray* array, const SDimensions3D& dims);
+cudaArray* allocateProjectionArray(const SDimensions3D& dims);
+cudaArray* allocateVolumeArray(const SDimensions3D& dims);
+
+// Synchronize the host with the device; returns false (after printing the
+// error to stderr) if the synchronization reports a CUDA error.
+bool cudaTextForceKernelsCompletion();
+
+// Forwards to astraCUDA::dotProduct2D, treating data as a 2D image that is
+// x elements wide and y*z rows tall.
+float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y, unsigned int z);
+
+// Smallest power of two >= _iValue (1 for _iValue <= 1).
+int calcNextPowerOfTwo(int _iValue);
+
+}
+
+#endif