diff options
Diffstat (limited to 'cuda')
57 files changed, 16248 insertions, 0 deletions
diff --git a/cuda/2d/algo.cu b/cuda/2d/algo.cu new file mode 100644 index 0000000..5ae5d08 --- /dev/null +++ b/cuda/2d/algo.cu @@ -0,0 +1,356 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cassert> + +#include "algo.h" +#include "par_fp.h" +#include "fan_fp.h" +#include "par_bp.h" +#include "fan_bp.h" +#include "util.h" +#include "arith.h" + +namespace astraCUDA { + +ReconAlgo::ReconAlgo() +{ + angles = 0; + TOffsets = 0; + fanProjs = 0; + shouldAbort = false; + + useVolumeMask = false; + useSinogramMask = false; + D_maskData = 0; + D_smaskData = 0; + + D_sinoData = 0; + D_volumeData = 0; + + useMinConstraint = false; + useMaxConstraint = false; + + freeGPUMemory = false; +} + +ReconAlgo::~ReconAlgo() +{ + reset(); +} + +void ReconAlgo::reset() +{ + delete[] angles; + delete[] TOffsets; + delete[] fanProjs; + + if (freeGPUMemory) { + cudaFree(D_maskData); + cudaFree(D_smaskData); + cudaFree(D_sinoData); + cudaFree(D_volumeData); + } + + angles = 0; + TOffsets = 0; + fanProjs = 0; + shouldAbort = false; + + useVolumeMask = false; + useSinogramMask = false; + + D_maskData = 0; + D_smaskData = 0; + + D_sinoData = 0; + D_volumeData = 0; + + useMinConstraint = false; + useMaxConstraint = false; + + freeGPUMemory = false; +} + +bool ReconAlgo::setGPUIndex(int iGPUIndex) +{ + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + return true; +} + +bool ReconAlgo::enableVolumeMask() +{ + useVolumeMask = true; + return true; +} + +bool ReconAlgo::enableSinogramMask() +{ + useSinogramMask = true; + return true; +} + + +bool ReconAlgo::setGeometry(const SDimensions& _dims, const float* _angles) +{ + dims = _dims; + + angles = new float[dims.iProjAngles]; + + memcpy(angles, _angles, sizeof(angles[0]) * dims.iProjAngles); + + delete[] fanProjs; + fanProjs = 0; + + return true; +} + +bool ReconAlgo::setFanGeometry(const SDimensions& _dims, + const SFanProjection* _projs) +{ + dims = _dims; + fanProjs = new SFanProjection[dims.iProjAngles]; + + memcpy(fanProjs, _projs, sizeof(fanProjs[0]) * dims.iProjAngles); + + delete[] angles; + angles = 0; + + return true; +} + + +bool ReconAlgo::setTOffsets(const float* _TOffsets) +{ + // TODO: determine if they're all zero? + TOffsets = new float[dims.iProjAngles]; + memcpy(TOffsets, _TOffsets, sizeof(angles[0]) * dims.iProjAngles); + + return true; +} + + + +bool ReconAlgo::setVolumeMask(float* _D_maskData, unsigned int _maskPitch) +{ + assert(useVolumeMask); + + D_maskData = _D_maskData; + maskPitch = _maskPitch; + + return true; +} + +bool ReconAlgo::setSinogramMask(float* _D_smaskData, unsigned int _smaskPitch) +{ + assert(useSinogramMask); + + D_smaskData = _D_smaskData; + smaskPitch = _smaskPitch; + + return true; +} + +bool ReconAlgo::setBuffers(float* _D_volumeData, unsigned int _volumePitch, + float* _D_projData, unsigned int _projPitch) +{ + D_volumeData = _D_volumeData; + volumePitch = _volumePitch; + D_sinoData = _D_projData; + sinoPitch = _projPitch; + + return true; +} + +bool ReconAlgo::setMinConstraint(float fMin) +{ + fMinConstraint = fMin; + useMinConstraint = true; + return true; +} + +bool ReconAlgo::setMaxConstraint(float fMax) +{ + fMaxConstraint = fMax; + useMaxConstraint = true; + return true; +} + + + +bool ReconAlgo::allocateBuffers() +{ + bool ok; + ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + if (!ok) + return false; + + ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + if (!ok) { + cudaFree(D_volumeData); + D_volumeData = 0; + return false; + } + + if (useVolumeMask) { + ok = allocateVolume(D_maskData, dims.iVolWidth+2, dims.iVolHeight+2, maskPitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + D_volumeData = 0; + D_sinoData = 0; + return false; + } + } + + if (useSinogramMask) { + ok = allocateVolume(D_smaskData, dims.iProjDets+2, dims.iProjAngles, smaskPitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + cudaFree(D_maskData); + D_volumeData = 0; + D_sinoData = 0; + D_maskData = 0; + return false; + } + } + + freeGPUMemory = true; + return true; +} + +bool ReconAlgo::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale, + const float* pfReconstruction, unsigned int iReconstructionPitch, + const float* pfVolMask, unsigned int iVolMaskPitch, + const float* pfSinoMask, unsigned int iSinoMaskPitch) +{ + if (!pfSinogram) + return false; + if (!pfReconstruction) + return false; + + bool ok = copySinogramToDevice(pfSinogram, iSinogramPitch, + dims.iProjDets, + dims.iProjAngles, + D_sinoData, sinoPitch); + if (!ok) + return false; + + // rescale sinogram to adjust for pixel size + processVol<opMul,SINO>(D_sinoData, fSinogramScale, + //1.0f/(fPixelSize*fPixelSize), + sinoPitch, + dims.iProjDets, dims.iProjAngles); + + ok = copyVolumeToDevice(pfReconstruction, iReconstructionPitch, + dims.iVolWidth, dims.iVolHeight, + D_volumeData, volumePitch); + if (!ok) + return false; + + + + if (useVolumeMask) { + if (!pfVolMask) + return false; + + ok = copyVolumeToDevice(pfVolMask, iVolMaskPitch, + dims.iVolWidth, dims.iVolHeight, + D_maskData, maskPitch); + if (!ok) + return false; + } + + if (useSinogramMask) { + if (!pfSinoMask) + return false; + + ok = copySinogramToDevice(pfSinoMask, iSinoMaskPitch, + dims.iProjDets, dims.iProjAngles, + D_smaskData, smaskPitch); + if (!ok) + return false; + } + + return true; +} + +bool ReconAlgo::getReconstruction(float* pfReconstruction, + unsigned int iReconstructionPitch) const +{ + bool ok = copyVolumeFromDevice(pfReconstruction, iReconstructionPitch, + dims.iVolWidth, + dims.iVolHeight, + D_volumeData, volumePitch); + if (!ok) + return false; + + return true; +} + + +bool ReconAlgo::callFP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + float outputScale) +{ + if (angles) { + assert(!fanProjs); + return FP(D_volumeData, volumePitch, D_projData, projPitch, + dims, angles, TOffsets, outputScale); + } else { + assert(fanProjs); + return FanFP(D_volumeData, volumePitch, D_projData, projPitch, + dims, fanProjs, outputScale); + } +} + +bool ReconAlgo::callBP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch) +{ + if (angles) { + assert(!fanProjs); + return BP(D_volumeData, volumePitch, D_projData, projPitch, + dims, angles, TOffsets); + } else { + assert(fanProjs); + return FanBP(D_volumeData, volumePitch, D_projData, projPitch, + dims, fanProjs); + } + +} + + + +} diff --git a/cuda/2d/algo.h b/cuda/2d/algo.h new file mode 100644 index 0000000..96195a3 --- /dev/null +++ b/cuda/2d/algo.h @@ -0,0 +1,155 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_ALGO_H +#define _CUDA_ALGO_H + +#include "util.h" + +namespace astraCUDA { + +class _AstraExport ReconAlgo { +public: + ReconAlgo(); + virtual ~ReconAlgo(); + + bool setGPUIndex(int iGPUIndex); + + bool setGeometry(const SDimensions& dims, const float* angles); + bool setFanGeometry(const SDimensions& dims, const SFanProjection* projs); + + // setTOffsets should (optionally) be called after setGeometry + bool setTOffsets(const float* TOffsets); + + void signalAbort() { shouldAbort = true; } + + virtual bool enableVolumeMask(); + virtual bool enableSinogramMask(); + + // init should be called after setting all geometry + virtual bool init() = 0; + + // setVolumeMask should be called after init and before iterate, + // but only if enableVolumeMask was called before init. + // It may be called again after iterate. + bool setVolumeMask(float* D_maskData, unsigned int maskPitch); + + // setSinogramMask should be called after init and before iterate, + // but only if enableSinogramMask was called before init. + // It may be called again after iterate. + bool setSinogramMask(float* D_smaskData, unsigned int smaskPitch); + + + // setBuffers should be called after init and before iterate. + // It may be called again after iterate. + virtual bool setBuffers(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch); + + + // instead of calling setBuffers, you can also call allocateBuffers + // to let ReconAlgo manage its own GPU memory + virtual bool allocateBuffers(); + virtual bool copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale, + const float* pfReconstruction, unsigned int iReconstructionPitch, + const float* pfVolMask, unsigned int iVolMaskPitch, + const float* pfSinoMask, unsigned int iSinoMaskPitch); + + + + // set Min/Max constraints. They may be called at any time, and will affect + // any iterate() calls afterwards. + virtual bool setMinConstraint(float fMin); + virtual bool setMaxConstraint(float fMax); + + + // iterate should be called after init and setBuffers. + // It may be called multiple times. + virtual bool iterate(unsigned int iterations) = 0; + + // Compute the norm of the difference of the FP of the current + // reconstruction and the sinogram. (This performs one FP.) + // It can be called after iterate. + virtual float computeDiffNorm() = 0; + // TODO: computeDiffNorm shouldn't be virtual, but for it to be + // implemented in ReconAlgo, it needs a way to get a suitable + // temporary sinogram buffer. + + bool getReconstruction(float* pfReconstruction, + unsigned int iReconstructionPitch) const; + + + +protected: + void reset(); + + bool callFP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + float outputScale); + bool callBP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch); + + + SDimensions dims; + float* angles; + float* TOffsets; + SFanProjection* fanProjs; + + volatile bool shouldAbort; + + bool freeGPUMemory; + + // Input/output + float* D_sinoData; + unsigned int sinoPitch; + + float* D_volumeData; + unsigned int volumePitch; + + // Masks + bool useVolumeMask; + bool useSinogramMask; + + float* D_maskData; + unsigned int maskPitch; + float* D_smaskData; + unsigned int smaskPitch; + + // Min/max + bool useMinConstraint; + bool useMaxConstraint; + float fMinConstraint; + float fMaxConstraint; + + +}; + + +} + +#endif + diff --git a/cuda/2d/arith.cu b/cuda/2d/arith.cu new file mode 100644 index 0000000..1ee02ca --- /dev/null +++ b/cuda/2d/arith.cu @@ -0,0 +1,893 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include "util.h" +#include "arith.h" +#include <cassert> + +namespace astraCUDA { + + +struct opAddScaled { + __device__ void operator()(float& out, const float in, const float inp) { + out += in * inp; + } +}; +struct opScaleAndAdd { + __device__ void operator()(float& out, const float in, const float inp) { + out = in + out * inp; + } +}; +struct opAddMulScaled { + __device__ void operator()(float& out, const float in1, const float in2, const float inp) { + out += in1 * in2 * inp; + } +}; +struct opAddMul { + __device__ void operator()(float& out, const float in1, const float in2) { + out += in1 * in2; + } +}; +struct opAdd { + __device__ void operator()(float& out, const float in) { + out += in; + } +}; +struct opAdd2 { + __device__ void operator()(float& out, const float in1, const float in2) { + out += in1 + in2; + } +}; +struct opMul { + __device__ void operator()(float& out, const float in) { + out *= in; + } +}; +struct opMul2 { + __device__ void operator()(float& out, const float in1, const float in2) { + out *= in1 * in2; + } +}; +struct opDividedBy { + __device__ void operator()(float& out, const float in) { + if (out > 0.000001f) // out is assumed to be positive + out = in / out; + else + out = 0.0f; + } +}; +struct opInvert { + __device__ void operator()(float& out) { + if (out > 0.000001f) // out is assumed to be positive + out = 1 / out; + else + out = 0.0f; + } +}; +struct opSet { + __device__ void operator()(float& out, const float inp) { + out = inp; + } +}; +struct opClampMin { + __device__ void operator()(float& out, const float inp) { + if (out < inp) + out = inp; + } +}; +struct opClampMax { + __device__ void operator()(float& out, const float inp) { + if (out > inp) + out = inp; + } +}; +struct opClampMinMask { + __device__ void operator()(float& out, const float in) { + if (out < in) + out = in; + } +}; +struct opClampMaxMask { + __device__ void operator()(float& out, const float in) { + if (out > in) + out = in; + } +}; +struct opSetMaskedValues { + __device__ void operator()(float& out, const float in, const float inp) { + if (!in) + out = inp; + } +}; +struct opSegmentAndMask { + __device__ void operator()(float& out1, float& out2, const float inp1, const float inp2) { + if (out1 >= inp1) { + out1 = inp2; + out2 = 0.0f; + } + + } + +}; +struct opMulMask { + __device__ void operator()(float& out, const float mask, const float in) { + if (mask > 0.0f) { + out *= in; + } + } +}; + + + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off]); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], fParam); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devFFtoDD(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut1[off], pfOut2[off], fParam1, fParam2); + off += pitch; + y++; + } +} + + + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn[off]); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn[off], fParam); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn1[off], pfIn2[off]); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn1[off], pfIn2[off], fParam); + off += pitch; + y++; + } +} + + + + + + + + + + + + + + + + +template<typename op, VolType t> +void processVolCopy(float* out, unsigned int width, unsigned int height) +{ + float* D_out; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + + processVol<op, t>(D_out, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); +} + +template<typename op, VolType t> +void processVolCopy(float* out, float param, unsigned int width, unsigned int height) +{ + float* D_out; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + + processVol<op, t>(D_out, param, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); +} + +template<typename op, VolType t> +void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height) +{ + float* D_out1; + float* D_out2; + + unsigned int pitch; + allocateVolume(D_out1, width+2, height+2, pitch); + copyVolumeToDevice(out1, width, width, height, D_out1, pitch); + allocateVolume(D_out2, width+2, height+2, pitch); + copyVolumeToDevice(out2, width, width, height, D_out2, pitch); + + processVol<op, t>(D_out1, D_out2, param1, param2, pitch, width, height); + + copyVolumeFromDevice(out1, width, width, height, D_out1, pitch); + copyVolumeFromDevice(out2, width, width, height, D_out2, pitch); + + cudaFree(D_out1); + cudaFree(D_out2); +} + + +template<typename op, VolType t> +void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height) +{ + float* D_out; + float* D_in; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + allocateVolume(D_in, width+2, height+2, pitch); + copyVolumeToDevice(in, width, width, height, D_in, pitch); + + processVol<op, t>(D_out, D_in, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); + cudaFree(D_in); +} + +template<typename op, VolType t> +void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height) +{ + float* D_out; + float* D_in; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + allocateVolume(D_in, width+2, height+2, pitch); + copyVolumeToDevice(in, width, width, height, D_in, pitch); + + processVol<op, t>(D_out, D_in, param, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); + cudaFree(D_in); +} + +template<typename op, VolType t> +void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height) +{ + float* D_out; + float* D_in1; + float* D_in2; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + allocateVolume(D_in1, width+2, height+2, pitch); + copyVolumeToDevice(in1, width, width, height, D_in1, pitch); + allocateVolume(D_in2, width+2, height+2, pitch); + copyVolumeToDevice(in2, width, width, height, D_in2, pitch); + + processVol<op, t>(D_out, D_in1, D_in2, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); + cudaFree(D_in1); + cudaFree(D_in2); +} + +template<typename op, VolType t> +void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height) +{ + float* D_out; + float* D_in1; + float* D_in2; + + unsigned int pitch; + allocateVolume(D_out, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_out, pitch); + allocateVolume(D_in1, width+2, height+2, pitch); + copyVolumeToDevice(in1, width, width, height, D_in1, pitch); + allocateVolume(D_in2, width+2, height+2, pitch); + copyVolumeToDevice(in2, width, width, height, D_in2, pitch); + + processVol<op, t>(D_out, D_in1, D_in2, param, pitch, width, height); + + copyVolumeFromDevice(out, width, width, height, D_out, pitch); + + cudaFree(D_out); + cudaFree(D_in1); + cudaFree(D_in2); +} + + + + + + + + + +template<typename op, VolType t> +void processVol(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+511)/512); + + devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(float* pfOut1, float* pfOut2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devFFtoDD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + + +template<typename op, VolType t> +void processVol(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + + + + + + + + + + + + + + + + + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut1 = (float*)out1.ptr; + float *pfOut2 = (float*)out2.ptr; + unsigned int step = out1.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut1 += step; + pfOut2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + + + + + + + + + + + + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut1 = (float*)out1.ptr; + float *pfOut2 = (float*)out2.ptr; + unsigned int step = out1.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devFFtoDD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut1, pfOut2, fParam1, fParam2, out1.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut1 += step; + pfOut2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + + + + + + + + + + + + + + + + + +#define INST_DFtoD(name) \ + template void processVolCopy<name, VOL>(float* out, const float* in, float param, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, const float* in, float param, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, const float* in, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); + +#define INST_DtoD(name) \ + template void processVolCopy<name, VOL>(float* out, const float* in, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, const float* in, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); + +#define INST_DDtoD(name) \ + template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); + +#define INST_DDFtoD(name) \ + template void processVolCopy<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); + + +#define INST_toD(name) \ + template void processVolCopy<name, VOL>(float* out, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); + +#define INST_FtoD(name) \ + template void processVolCopy<name, VOL>(float* out, float param, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out, float param, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out, float param, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, float param, const SDimensions3D& dims); + +#define INST_FFtoDD(name) \ + template void processVolCopy<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ + template void processVolCopy<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int width, unsigned int height); \ + template void processVol<name, VOL>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); + + + +INST_DFtoD(opAddScaled) +INST_DFtoD(opScaleAndAdd) +INST_DDFtoD(opAddMulScaled) +INST_DDtoD(opAddMul) +INST_DDtoD(opMul2) +INST_DDtoD(opAdd2) +INST_DtoD(opMul) +INST_DDtoD(opMulMask) +INST_DtoD(opAdd) +INST_DtoD(opDividedBy) +INST_toD(opInvert) +INST_FtoD(opSet) +INST_FtoD(opMul) +INST_DFtoD(opMulMask) +INST_FtoD(opAdd) +INST_FtoD(opClampMin) +INST_FtoD(opClampMax) +INST_DtoD(opClampMinMask) +INST_DtoD(opClampMaxMask) + +// PDART-specific: +INST_DFtoD(opSetMaskedValues) +INST_FFtoDD(opSegmentAndMask) + +} diff --git a/cuda/2d/arith.h b/cuda/2d/arith.h new file mode 100644 index 0000000..c8c7b41 --- /dev/null +++ b/cuda/2d/arith.h @@ -0,0 +1,101 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_ARITH_H +#define _CUDA_ARITH_H + +#include <cuda.h> + +namespace astraCUDA { + + +struct opAddScaled; +struct opScaleAndAdd; +struct opAddMulScaled; +struct opAddMul; +struct opAdd; +struct opAdd2; +struct opMul; +struct opMul2; +struct opDividedBy; +struct opInvert; +struct opSet; +struct opClampMin; +struct opClampMax; +struct opClampMinMask; +struct opClampMaxMask; +struct opSegmentAndMask; +struct opSetMaskedValues; + +struct opMulMask; + + + +enum VolType { + SINO = 0, + VOL = 1 +}; + + +template<typename op, VolType t> void processVolCopy(float* out, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVolCopy(float* out, float param, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVolCopy(float* out1, float* out2, float param1, float param2, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVolCopy(float* out, const float* in, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVolCopy(float* out, const float* in, float param, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVolCopy(float* out, const float* in1, const float* in2, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVolCopy(float* out, const float* in1, const float* in2, float param, unsigned int width, unsigned int height); + +template<typename op, VolType t> void processVol(float* out, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(float* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(float* out1, float* out2, float fParam1, float fParam2, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(float* out, const float* in, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(float* out, const float* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(float* out, const float* in1, const float* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(float* out, const float* in1, const float* in2, unsigned int pitch, unsigned int width, unsigned int height); + +template<typename op> void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); + +template<typename op> void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out1, cudaPitchedPtr& out2, float fParam1, float fParam2, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); + + + +} + +#endif diff --git a/cuda/2d/astra.cu b/cuda/2d/astra.cu new file mode 100644 index 0000000..71ed025 --- /dev/null +++ b/cuda/2d/astra.cu @@ -0,0 +1,824 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "util.h" +#include "par_fp.h" +#include "fan_fp.h" +#include "par_bp.h" +#include "arith.h" +#include "astra.h" + +#include "fft.h" + +#include <fstream> +#include <cuda.h> + +#include "../../include/astra/Logger.h" + +using namespace astraCUDA; +using namespace std; + + +namespace astra { + +enum CUDAProjectionType { + PROJ_PARALLEL, + PROJ_FAN +}; + + +class AstraFBP_internal { +public: + SDimensions dims; + float* angles; + float* TOffsets; + + float fPixelSize; + + bool initialized; + bool setStartReconstruction; + + float* D_sinoData; + unsigned int sinoPitch; + + float* D_volumeData; + unsigned int volumePitch; + + cufftComplex * m_pDevFilter; +}; + +AstraFBP::AstraFBP() +{ + pData = new AstraFBP_internal(); + + pData->angles = 0; + pData->D_sinoData = 0; + pData->D_volumeData = 0; + + pData->dims.iVolWidth = 0; + pData->dims.iProjAngles = 0; + pData->dims.fDetScale = 1.0f; + pData->dims.iRaysPerDet = 1; + pData->dims.iRaysPerPixelDim = 1; + + pData->initialized = false; + pData->setStartReconstruction = false; + + pData->m_pDevFilter = NULL; +} + +AstraFBP::~AstraFBP() +{ + delete[] pData->angles; + pData->angles = 0; + + delete[] pData->TOffsets; + pData->TOffsets = 0; + + cudaFree(pData->D_sinoData); + pData->D_sinoData = 0; + + cudaFree(pData->D_volumeData); + pData->D_volumeData = 0; + + if(pData->m_pDevFilter != NULL) + { + freeComplexOnDevice(pData->m_pDevFilter); + pData->m_pDevFilter = NULL; + } + + delete pData; + pData = 0; +} + +bool AstraFBP::setReconstructionGeometry(unsigned int iVolWidth, + unsigned int iVolHeight, + float fPixelSize) +{ + if (pData->initialized) + return false; + + pData->dims.iVolWidth = iVolWidth; + pData->dims.iVolHeight = iVolHeight; + + pData->fPixelSize = fPixelSize; + + return (iVolWidth > 0 && iVolHeight > 0 && fPixelSize > 0.0f); +} + +bool AstraFBP::setProjectionGeometry(unsigned int iProjAngles, + unsigned int iProjDets, + const float* pfAngles, + float fDetSize) +{ + if (pData->initialized) + return false; + + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjDets = iProjDets; + pData->dims.fDetScale = fDetSize / pData->fPixelSize; + + if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0) + return false; + + pData->angles = new float[iProjAngles]; + memcpy(pData->angles, pfAngles, iProjAngles * sizeof(pfAngles[0])); + + return true; +} + +bool AstraFBP::setPixelSuperSampling(unsigned int iPixelSuperSampling) +{ + if (pData->initialized) + return false; + + if (iPixelSuperSampling == 0) + return false; + + pData->dims.iRaysPerPixelDim = iPixelSuperSampling; + + return true; +} + + +bool AstraFBP::setTOffsets(const float* pfTOffsets) +{ + if (pData->initialized) + return false; + + if (pfTOffsets == 0) + return false; + + pData->TOffsets = new float[pData->dims.iProjAngles]; + memcpy(pData->TOffsets, pfTOffsets, pData->dims.iProjAngles * sizeof(pfTOffsets[0])); + + return true; +} + +bool AstraFBP::init(int iGPUIndex) +{ + if (pData->initialized) + { + return false; + } + + if (pData->dims.iProjAngles == 0 || pData->dims.iVolWidth == 0) + { + return false; + } + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + { + return false; + } + + bool ok = allocateVolume(pData->D_volumeData, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2, pData->volumePitch); + if (!ok) + { + return false; + } + + ok = allocateVolume(pData->D_sinoData, pData->dims.iProjDets+2, pData->dims.iProjAngles, pData->sinoPitch); + if (!ok) + { + cudaFree(pData->D_volumeData); + pData->D_volumeData = 0; + return false; + } + + pData->initialized = true; + + return true; +} + +bool AstraFBP::setSinogram(const float* pfSinogram, + unsigned int iSinogramPitch) +{ + if (!pData->initialized) + return false; + if (!pfSinogram) + return false; + + bool ok = copySinogramToDevice(pfSinogram, iSinogramPitch, + pData->dims.iProjDets, + pData->dims.iProjAngles, + pData->D_sinoData, pData->sinoPitch); + if (!ok) + return false; + + // rescale sinogram to adjust for pixel size + processVol<opMul,SINO>(pData->D_sinoData, + 1.0f/(pData->fPixelSize*pData->fPixelSize), + pData->sinoPitch, + pData->dims.iProjDets, pData->dims.iProjAngles); + + pData->setStartReconstruction = false; + + return true; +} + +static int calcNextPowerOfTwo(int _iValue) +{ + int iOutput = 1; + + while(iOutput < _iValue) + { + iOutput *= 2; + } + + return iOutput; +} + +bool AstraFBP::run() +{ + if (!pData->initialized) + { + return false; + } + + zeroVolume(pData->D_volumeData, pData->volumePitch, pData->dims.iVolWidth+2, pData->dims.iVolHeight+2); + + bool ok = false; + + if (pData->m_pDevFilter) { + + int iFFTRealDetCount = calcNextPowerOfTwo(2 * pData->dims.iProjDets); + int iFFTFourDetCount = calcFFTFourSize(iFFTRealDetCount); + + cufftComplex * pDevComplexSinogram = NULL; + + allocateComplexOnDevice(pData->dims.iProjAngles, iFFTFourDetCount, &pDevComplexSinogram); + + runCudaFFT(pData->dims.iProjAngles, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount, pDevComplexSinogram); + + applyFilter(pData->dims.iProjAngles, iFFTFourDetCount, pDevComplexSinogram, pData->m_pDevFilter); + + runCudaIFFT(pData->dims.iProjAngles, pDevComplexSinogram, pData->D_sinoData, pData->sinoPitch, 1, pData->dims.iProjDets, iFFTRealDetCount, iFFTFourDetCount); + + freeComplexOnDevice(pDevComplexSinogram); + + } + + ok = BP(pData->D_volumeData, pData->volumePitch, pData->D_sinoData, pData->sinoPitch, pData->dims, pData->angles, pData->TOffsets); + if(!ok) + { + return false; + } + + processVol<opMul,VOL>(pData->D_volumeData, + (M_PI / 2.0f) / (float)pData->dims.iProjAngles, + pData->volumePitch, + pData->dims.iVolWidth, pData->dims.iVolHeight); + + return true; +} + +bool AstraFBP::getReconstruction(float* pfReconstruction, unsigned int iReconstructionPitch) const +{ + if (!pData->initialized) + return false; + + bool ok = copyVolumeFromDevice(pfReconstruction, iReconstructionPitch, + pData->dims.iVolWidth, + pData->dims.iVolHeight, + pData->D_volumeData, pData->volumePitch); + if (!ok) + return false; + + return true; +} + +int AstraFBP::calcFourierFilterSize(int _iDetectorCount) +{ + int iFFTRealDetCount = calcNextPowerOfTwo(2 * _iDetectorCount); + int iFreqBinCount = calcFFTFourSize(iFFTRealDetCount); + + // CHECKME: Matlab makes this at least 64. Do we also need to? + return iFreqBinCount; +} + +bool AstraFBP::setFilter(E_FBPFILTER _eFilter, const float * _pfHostFilter /* = NULL */, int _iFilterWidth /* = 0 */, float _fD /* = 1.0f */, float _fFilterParameter /* = -1.0f */) +{ + if(pData->m_pDevFilter != 0) + { + freeComplexOnDevice(pData->m_pDevFilter); + pData->m_pDevFilter = 0; + } + + if (_eFilter == FILTER_NONE) + return true; // leave pData->m_pDevFilter set to 0 + + + int iFFTRealDetCount = calcNextPowerOfTwo(2 * pData->dims.iProjDets); + int iFreqBinCount = calcFFTFourSize(iFFTRealDetCount); + + cufftComplex * pHostFilter = new cufftComplex[pData->dims.iProjAngles * iFreqBinCount]; + memset(pHostFilter, 0, sizeof(cufftComplex) * pData->dims.iProjAngles * iFreqBinCount); + + allocateComplexOnDevice(pData->dims.iProjAngles, iFreqBinCount, &(pData->m_pDevFilter)); + + switch(_eFilter) + { + case FILTER_NONE: + // handled above + break; + case FILTER_RAMLAK: + case FILTER_SHEPPLOGAN: + case FILTER_COSINE: + case FILTER_HAMMING: + case FILTER_HANN: + case FILTER_TUKEY: + case FILTER_LANCZOS: + case FILTER_TRIANGULAR: + case FILTER_GAUSSIAN: + case FILTER_BARTLETTHANN: + case FILTER_BLACKMAN: + case FILTER_NUTTALL: + case FILTER_BLACKMANHARRIS: + case FILTER_BLACKMANNUTTALL: + case FILTER_FLATTOP: + { + genFilter(_eFilter, _fD, pData->dims.iProjAngles, pHostFilter, iFFTRealDetCount, iFreqBinCount, _fFilterParameter); + uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter); + + break; + } + case FILTER_PROJECTION: + { + // make sure the offered filter has the correct size + assert(_iFilterWidth == iFreqBinCount); + + for(int iFreqBinIndex = 0; iFreqBinIndex < iFreqBinCount; iFreqBinIndex++) + { + float fValue = _pfHostFilter[iFreqBinIndex]; + + for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++) + { + pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].x = fValue; + pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].y = 0.0f; + } + } + uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter); + break; + } + case FILTER_SINOGRAM: + { + // make sure the offered filter has the correct size + assert(_iFilterWidth == iFreqBinCount); + + for(int iFreqBinIndex = 0; iFreqBinIndex < iFreqBinCount; iFreqBinIndex++) + { + for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++) + { + float fValue = _pfHostFilter[iFreqBinIndex + iProjectionIndex * _iFilterWidth]; + + pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].x = fValue; + pHostFilter[iFreqBinIndex + iProjectionIndex * iFreqBinCount].y = 0.0f; + } + } + uploadComplexArrayToDevice(pData->dims.iProjAngles, iFreqBinCount, pHostFilter, pData->m_pDevFilter); + break; + } + case FILTER_RPROJECTION: + { + int iProjectionCount = pData->dims.iProjAngles; + int iRealFilterElementCount = iProjectionCount * iFFTRealDetCount; + float * pfHostRealFilter = new float[iRealFilterElementCount]; + memset(pfHostRealFilter, 0, sizeof(float) * iRealFilterElementCount); + + int iUsedFilterWidth = min(_iFilterWidth, iFFTRealDetCount); + int iStartFilterIndex = (_iFilterWidth - iUsedFilterWidth) / 2; + int iMaxFilterIndex = iStartFilterIndex + iUsedFilterWidth; + + int iFilterShiftSize = _iFilterWidth / 2; + + for(int iDetectorIndex = iStartFilterIndex; iDetectorIndex < iMaxFilterIndex; iDetectorIndex++) + { + int iFFTInFilterIndex = (iDetectorIndex + iFFTRealDetCount - iFilterShiftSize) % iFFTRealDetCount; + float fValue = _pfHostFilter[iDetectorIndex]; + + for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++) + { + pfHostRealFilter[iFFTInFilterIndex + iProjectionIndex * iFFTRealDetCount] = fValue; + } + } + + float* pfDevRealFilter = NULL; + cudaMalloc((void **)&pfDevRealFilter, sizeof(float) * iRealFilterElementCount); // TODO: check for errors + cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice); + delete[] pfHostRealFilter; + + runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, 0, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter); + + cudaFree(pfDevRealFilter); + + break; + } + case FILTER_RSINOGRAM: + { + int iProjectionCount = pData->dims.iProjAngles; + int iRealFilterElementCount = iProjectionCount * iFFTRealDetCount; + float* pfHostRealFilter = new float[iRealFilterElementCount]; + memset(pfHostRealFilter, 0, sizeof(float) * iRealFilterElementCount); + + int iUsedFilterWidth = min(_iFilterWidth, iFFTRealDetCount); + int iStartFilterIndex = (_iFilterWidth - iUsedFilterWidth) / 2; + int iMaxFilterIndex = iStartFilterIndex + iUsedFilterWidth; + + int iFilterShiftSize = _iFilterWidth / 2; + + for(int iDetectorIndex = iStartFilterIndex; iDetectorIndex < iMaxFilterIndex; iDetectorIndex++) + { + int iFFTInFilterIndex = (iDetectorIndex + iFFTRealDetCount - iFilterShiftSize) % iFFTRealDetCount; + + for(int iProjectionIndex = 0; iProjectionIndex < (int)pData->dims.iProjAngles; iProjectionIndex++) + { + float fValue = _pfHostFilter[iDetectorIndex + iProjectionIndex * _iFilterWidth]; + pfHostRealFilter[iFFTInFilterIndex + iProjectionIndex * iFFTRealDetCount] = fValue; + } + } + + float* pfDevRealFilter = NULL; + cudaMalloc((void **)&pfDevRealFilter, sizeof(float) * iRealFilterElementCount); // TODO: check for errors + cudaMemcpy(pfDevRealFilter, pfHostRealFilter, sizeof(float) * iRealFilterElementCount, cudaMemcpyHostToDevice); + delete[] pfHostRealFilter; + + runCudaFFT(iProjectionCount, pfDevRealFilter, iFFTRealDetCount, 0, iFFTRealDetCount, iFFTRealDetCount, iFreqBinCount, pData->m_pDevFilter); + + cudaFree(pfDevRealFilter); + + break; + } + default: + { + fprintf(stderr, "AstraFBP::setFilter: Weird filter type requested"); + delete [] pHostFilter; + return false; + } + } + + delete [] pHostFilter; + + return true; +} + +BPalgo::BPalgo() +{ + +} + +BPalgo::~BPalgo() +{ + +} + +bool BPalgo::init() +{ + return true; +} + +bool BPalgo::iterate(unsigned int) +{ + // TODO: This zeroVolume makes an earlier memcpy of D_volumeData redundant + zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + callBP(D_volumeData, volumePitch, D_sinoData, sinoPitch); + return true; +} + +float BPalgo::computeDiffNorm() +{ + float *D_projData; + unsigned int projPitch; + + allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + + cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); + + float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + + cudaFree(D_projData); + + return sqrt(s); +} + + +bool astraCudaFP(const float* pfVolume, float* pfSinogram, + unsigned int iVolWidth, unsigned int iVolHeight, + unsigned int iProjAngles, unsigned int iProjDets, + const float *pfAngles, const float *pfOffsets, + float fDetSize, unsigned int iDetSuperSampling, + int iGPUIndex) +{ + SDimensions dims; + + if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjDets = iProjDets; + dims.fDetScale = fDetSize; + + if (iDetSuperSampling == 0) + return false; + + dims.iRaysPerDet = iDetSuperSampling; + + if (iVolWidth <= 0 || iVolHeight <= 0) + return false; + + dims.iVolWidth = iVolWidth; + dims.iVolHeight = iVolHeight; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + bool ok; + + float* D_volumeData; + unsigned int volumePitch; + + ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + if (!ok) + return false; + + float* D_sinoData; + unsigned int sinoPitch; + + ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + if (!ok) { + cudaFree(D_volumeData); + return false; + } + + ok = copyVolumeToDevice(pfVolume, dims.iVolWidth, + dims.iVolWidth, dims.iVolHeight, + D_volumeData, volumePitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + ok = FP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pfAngles, pfOffsets, 1.0f); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + ok = copySinogramFromDevice(pfSinogram, dims.iProjDets, + dims.iProjDets, + dims.iProjAngles, + D_sinoData, sinoPitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return true; +} + +bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, + unsigned int iVolWidth, unsigned int iVolHeight, + unsigned int iProjAngles, unsigned int iProjDets, + const float *pfAngles, float fOriginSourceDistance, + float fOriginDetectorDistance, float fPixelSize, + float fDetSize, + unsigned int iDetSuperSampling, + int iGPUIndex) +{ + SDimensions dims; + + if (iProjAngles == 0 || iProjDets == 0 || pfAngles == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjDets = iProjDets; + + if (iDetSuperSampling == 0) + return false; + + dims.iRaysPerDet = iDetSuperSampling; + + if (iVolWidth <= 0 || iVolHeight <= 0) + return false; + + dims.iVolWidth = iVolWidth; + dims.iVolHeight = iVolHeight; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + bool ok; + + float* D_volumeData; + unsigned int volumePitch; + + ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + if (!ok) + return false; + + float* D_sinoData; + unsigned int sinoPitch; + + ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + if (!ok) { + cudaFree(D_volumeData); + return false; + } + + ok = copyVolumeToDevice(pfVolume, dims.iVolWidth, + dims.iVolWidth, dims.iVolHeight, + D_volumeData, volumePitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + + // TODO: Turn this geometry conversion into a util function + SFanProjection* projs = new SFanProjection[dims.iProjAngles]; + + float fSrcX0 = 0.0f; + float fSrcY0 = -fOriginSourceDistance / fPixelSize; + float fDetUX0 = fDetSize / fPixelSize; + float fDetUY0 = 0.0f; + float fDetSX0 = dims.iProjDets * fDetUX0 / -2.0f; + float fDetSY0 = fOriginDetectorDistance / fPixelSize; + +#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = f##name##X0 * cos(alpha) - f##name##Y0 * sin(alpha); projs[i].f##name##Y = f##name##X0 * sin(alpha) + f##name##Y0 * cos(alpha); } while(0) + for (int i = 0; i < dims.iProjAngles; ++i) { + ROTATE0(Src, i, pfAngles[i]); + ROTATE0(DetS, i, pfAngles[i]); + ROTATE0(DetU, i, pfAngles[i]); + } + +#undef ROTATE0 + + ok = FanFP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, projs, 1.0f); + delete[] projs; + + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + ok = copySinogramFromDevice(pfSinogram, dims.iProjDets, + dims.iProjDets, + dims.iProjAngles, + D_sinoData, sinoPitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + cudaFree(D_volumeData); + cudaFree(D_sinoData); + + return true; + +} + + +bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, + unsigned int iVolWidth, unsigned int iVolHeight, + unsigned int iProjAngles, unsigned int iProjDets, + const SFanProjection *pAngles, + unsigned int iDetSuperSampling, + int iGPUIndex) +{ + SDimensions dims; + + if (iProjAngles == 0 || iProjDets == 0 || pAngles == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjDets = iProjDets; + dims.fDetScale = 1.0f; // TODO? + + if (iDetSuperSampling == 0) + return false; + + dims.iRaysPerDet = iDetSuperSampling; + + if (iVolWidth <= 0 || iVolHeight <= 0) + return false; + + dims.iVolWidth = iVolWidth; + dims.iVolHeight = iVolHeight; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + bool ok; + + float* D_volumeData; + unsigned int volumePitch; + + ok = allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + if (!ok) + return false; + + float* D_sinoData; + unsigned int sinoPitch; + + ok = allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + if (!ok) { + cudaFree(D_volumeData); + return false; + } + + ok = copyVolumeToDevice(pfVolume, dims.iVolWidth, + dims.iVolWidth, dims.iVolHeight, + D_volumeData, volumePitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + + ok = FanFP(D_volumeData, volumePitch, D_sinoData, sinoPitch, dims, pAngles, 1.0f); + + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + ok = copySinogramFromDevice(pfSinogram, dims.iProjDets, + dims.iProjDets, + dims.iProjAngles, + D_sinoData, sinoPitch); + if (!ok) { + cudaFree(D_volumeData); + cudaFree(D_sinoData); + return false; + } + + cudaFree(D_volumeData); + cudaFree(D_sinoData); + + return true; + +} + + +} diff --git a/cuda/2d/astra.h b/cuda/2d/astra.h new file mode 100644 index 0000000..9e58301 --- /dev/null +++ b/cuda/2d/astra.h @@ -0,0 +1,205 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_ASTRA_H +#define _CUDA_ASTRA_H + +#include "fft.h" +#include "fbp_filters.h" +#include "dims.h" +#include "algo.h" + +using astraCUDA::SFanProjection; + +namespace astra { + +enum Cuda2DProjectionKernel { + ker2d_default = 0 +}; + +class AstraFBP_internal; + +class _AstraExport AstraFBP { +public: + // Constructor + AstraFBP(); + + // Destructor + ~AstraFBP(); + + // Set the size of the reconstruction rectangle. + // Volume pixels are currently assumed to be 1x1 squares. + bool setReconstructionGeometry(unsigned int iVolWidth, + unsigned int iVolHeight, + float fPixelSize = 1.0f); + + // Set the projection angles and number of detector pixels per angle. + // pfAngles must be a float array of length iProjAngles. + // fDetSize indicates the size of a detector pixel compared to a + // volume pixel edge. + // + // pfAngles will only be read from during this call. + bool setProjectionGeometry(unsigned int iProjAngles, + unsigned int iProjDets, + const float *pfAngles, + float fDetSize = 1.0f); + + // Set linear supersampling factor for the BP. + // (The number of rays is the square of this) + // + // This may optionally be called before init(). + bool setPixelSuperSampling(unsigned int iPixelSuperSampling); + + // Set per-detector shifts. + // + // pfTOffsets will only be read from during this call. + bool setTOffsets(const float *pfTOffsets); + + // Returns the required size of a filter in the fourier domain + // when multiplying it with the fft of the projection data. + // Its value is equal to the smallest power of two larger than + // or equal to twice the number of detectors in the spatial domain. + // + // _iDetectorCount is the number of detectors in the spatial domain. + static int calcFourierFilterSize(int _iDetectorCount); + + // Sets the filter type. Some filter types require the user to supply an + // array containing the filter. + // The number of elements in a filter in the fourier domain should be equal + // to the value returned by calcFourierFilterSize(). + // The following types require a filter: + // + // - FILTER_PROJECTION: + // The filter size should be equal to the output of + // calcFourierFilterSize(). The filtered sinogram is + // multiplied with the supplied filter. + // + // - FILTER_SINOGRAM: + // Same as FILTER_PROJECTION, but now the filter should contain a row for + // every projection direction. + // + // - FILTER_RPROJECTION: + // The filter should now contain one kernel (= ifft of filter), with the + // peak in the center. The filter width + // can be any value. If odd, the peak is assumed to be in the center, if + // even, it is assumed to be at floor(filter-width/2). + // + // - FILTER_RSINOGRAM + // Same as FILTER_RPROJECTION, but now the supplied filter should contain a + // row for every projection direction. + // + // A large number of other filters (FILTER_RAMLAK, FILTER_SHEPPLOGAN, + // FILTER_COSINE, FILTER_HAMMING, and FILTER_HANN) + // have a D variable, which gives the cutoff point in the frequency domain. + // Setting this value to 1.0 will include the whole filter + bool setFilter(E_FBPFILTER _eFilter, + const float * _pfHostFilter = NULL, + int _iFilterWidth = 0, float _fD = 1.0f, float _fFilterParameter = -1.0f); + + // Initialize CUDA, allocate GPU buffers and + // precompute geometry-specific data. + // + // CUDA is set up to use GPU number iGPUIndex. + // + // This must be called after calling setReconstructionGeometry() and + // setProjectionGeometry(). + bool init(int iGPUIndex = 0); + + // Setup input sinogram for a slice. + // pfSinogram must be a float array of size iProjAngles*iSinogramPitch. + // NB: iSinogramPitch is measured in floats, not in bytes. + // + // This must be called after init(), and before iterate(). It may be + // called again after iterate()/getReconstruction() to start a new slice. + // + // pfSinogram will only be read from during this call. + bool setSinogram(const float* pfSinogram, unsigned int iSinogramPitch); + + // Runs an FBP reconstruction. + // This must be called after setSinogram(). + // + // run can be called before setFilter, but will then use the default Ram-Lak filter + bool run(); + + // Get the reconstructed slice. + // pfReconstruction must be a float array of size + // iVolHeight*iReconstructionPitch. + // NB: iReconstructionPitch is measured in floats, not in bytes. + // + // This may be called after run(). + bool getReconstruction(float* pfReconstruction, + unsigned int iReconstructionPitch) const; + +private: + AstraFBP_internal* pData; +}; + +class _AstraExport BPalgo : public astraCUDA::ReconAlgo { +public: + BPalgo(); + ~BPalgo(); + + virtual bool init(); + + virtual bool iterate(unsigned int iterations); + + virtual float computeDiffNorm(); +}; + + + + +// TODO: Clean up this interface to FP + +// Do a single forward projection +_AstraExport bool astraCudaFP(const float* pfVolume, float* pfSinogram, + unsigned int iVolWidth, unsigned int iVolHeight, + unsigned int iProjAngles, unsigned int iProjDets, + const float *pfAngles, const float *pfOffsets, + float fDetSize = 1.0f, unsigned int iDetSuperSampling = 1, + int iGPUIndex = 0); + +// Do a single forward projection, fan beam +_AstraExport bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, + unsigned int iVolWidth, unsigned int iVolHeight, + unsigned int iProjAngles, unsigned int iProjDets, + const float *pfAngles, float fOriginSourceDistance, + float fOriginDetectorDistance, float fPixelSize = 1.0f, + float fDetSize = 1.0f, + unsigned int iDetSuperSampling = 1, + int iGPUIndex = 0); + +_AstraExport bool astraCudaFanFP(const float* pfVolume, float* pfSinogram, + unsigned int iVolWidth, unsigned int iVolHeight, + unsigned int iProjAngles, unsigned int iProjDets, + const SFanProjection *pAngles, + unsigned int iDetSuperSampling = 1, + int iGPUIndex = 0); + +} +#endif diff --git a/cuda/2d/cgls.cu b/cuda/2d/cgls.cu new file mode 100644 index 0000000..5b1cf46 --- /dev/null +++ b/cuda/2d/cgls.cu @@ -0,0 +1,304 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "cgls.h" +#include "util.h" +#include "arith.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +namespace astraCUDA { + +CGLS::CGLS() : ReconAlgo() +{ + D_z = 0; + D_p = 0; + D_r = 0; + D_w = 0; + + sliceInitialized = false; +} + + +CGLS::~CGLS() +{ + reset(); +} + +void CGLS::reset() +{ + cudaFree(D_z); + cudaFree(D_p); + cudaFree(D_r); + cudaFree(D_w); + + D_z = 0; + D_p = 0; + D_r = 0; + D_w = 0; + + ReconAlgo::reset(); +} + +bool CGLS::init() +{ + // Lifetime of z: within an iteration + allocateVolume(D_z, dims.iVolWidth+2, dims.iVolHeight+2, zPitch); + + // Lifetime of p: full algorithm + allocateVolume(D_p, dims.iVolWidth+2, dims.iVolHeight+2, pPitch); + + // Lifetime of r: full algorithm + allocateVolume(D_r, dims.iProjDets+2, dims.iProjAngles, rPitch); + + // Lifetime of w: within an iteration + allocateVolume(D_w, dims.iProjDets+2, dims.iProjAngles, wPitch); + + // TODO: check if allocations succeeded + return true; +} + + +bool CGLS::setBuffers(float* _D_volumeData, unsigned int _volumePitch, + float* _D_projData, unsigned int _projPitch) +{ + bool ok = ReconAlgo::setBuffers(_D_volumeData, _volumePitch, + _D_projData, _projPitch); + + if (!ok) + return false; + + sliceInitialized = false; + + return true; +} + +bool CGLS::copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale, + const float* pfReconstruction, unsigned int iReconstructionPitch, + const float* pfVolMask, unsigned int iVolMaskPitch, + const float* pfSinoMask, unsigned int iSinoMaskPitch) +{ + sliceInitialized = false; + + return ReconAlgo::copyDataToGPU(pfSinogram, iSinogramPitch, fSinogramScale, pfReconstruction, iReconstructionPitch, pfVolMask, iVolMaskPitch, pfSinoMask, iSinoMaskPitch); +} + +bool CGLS::iterate(unsigned int iterations) +{ + shouldAbort = false; + + if (!sliceInitialized) { + + // copy sinogram + cudaMemcpy2D(D_r, sizeof(float)*rPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + + // r = sino - A*x + if (useVolumeMask) { + // Use z as temporary storage here since it is unused + cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); + processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_z, zPitch, D_r, rPitch, -1.0f); + } else { + callFP(D_volumeData, volumePitch, D_r, rPitch, -1.0f); + } + + + // p = A'*r + zeroVolume(D_p, pPitch, dims.iVolWidth+2, dims.iVolHeight+2); + callBP(D_p, pPitch, D_r, rPitch); + if (useVolumeMask) + processVol<opMul, VOL>(D_p, D_maskData, pPitch, dims.iVolWidth, dims.iVolHeight); + + + gamma = dotProduct2D(D_p, pPitch, dims.iVolWidth, dims.iVolHeight, 1, 1); + + sliceInitialized = true; + } + + + // iteration + for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) { + + // w = A*p + zeroVolume(D_w, wPitch, dims.iProjDets+2, dims.iProjAngles); + callFP(D_p, pPitch, D_w, wPitch, 1.0f); + + // alpha = gamma / <w,w> + float ww = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + float alpha = gamma / ww; + + // x += alpha*p + processVol<opAddScaled, VOL>(D_volumeData, D_p, alpha, volumePitch, dims.iVolWidth, dims.iVolHeight); + + // r -= alpha*w + processVol<opAddScaled, SINO>(D_r, D_w, -alpha, rPitch, dims.iProjDets, dims.iProjAngles); + + + // z = A'*r + zeroVolume(D_z, zPitch, dims.iVolWidth+2, dims.iVolHeight+2); + callBP(D_z, zPitch, D_r, rPitch); + if (useVolumeMask) + processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); + + float beta = 1.0f / gamma; + gamma = dotProduct2D(D_z, zPitch, dims.iVolWidth, dims.iVolHeight, 1, 1); + beta *= gamma; + + // p = z + beta*p + processVol<opScaleAndAdd, VOL>(D_p, D_z, beta, pPitch, dims.iVolWidth, dims.iVolHeight); + + } + + return true; +} + + +float CGLS::computeDiffNorm() +{ + // We can use w and z as temporary storage here since they're not + // used outside of iterations. + + // copy sinogram to w + cudaMemcpy2D(D_w, sizeof(float)*wPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + cudaMemcpy2D(D_z, sizeof(float)*zPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); + processVol<opMul, VOL>(D_z, D_maskData, zPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_z, zPitch, D_w, wPitch, -1.0f); + } else { + callFP(D_volumeData, volumePitch, D_w, wPitch, -1.0f); + } + + // compute norm of D_w + + float s = dotProduct2D(D_w, wPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + + return sqrt(s); +} + +bool doCGLS(float* D_volumeData, unsigned int volumePitch, + float* D_sinoData, unsigned int sinoPitch, + const SDimensions& dims, /*const SAugmentedData& augs,*/ + const float* angles, const float* TOffsets, unsigned int iterations) +{ + CGLS cgls; + bool ok = true; + + ok &= cgls.setGeometry(dims, angles); +#if 0 + if (D_maskData) + ok &= cgls.enableVolumeMask(); +#endif + if (TOffsets) + ok &= cgls.setTOffsets(TOffsets); + + if (!ok) + return false; + + ok = cgls.init(); + if (!ok) + return false; + +#if 0 + if (D_maskData) + ok &= cgls.setVolumeMask(D_maskData, maskPitch); +#endif + + ok &= cgls.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch); + if (!ok) + return false; + + ok = cgls.iterate(iterations); + + return ok; +} + +} + +#ifdef STANDALONE + +using namespace astraCUDA; + +int main() +{ + float* D_volumeData; + float* D_sinoData; + + SDimensions dims; + dims.iVolWidth = 1024; + dims.iVolHeight = 1024; + dims.iProjAngles = 512; + dims.iProjDets = 1536; + dims.fDetScale = 1.0f; + dims.iRaysPerDet = 1; + unsigned int volumePitch, sinoPitch; + + allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + printf("pitch: %u\n", volumePitch); + + allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + printf("pitch: %u\n", sinoPitch); + + unsigned int y, x; + float* sino = loadImage("sino.png", y, x); + + float* img = new float[dims.iVolWidth*dims.iVolHeight]; + + copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_sinoData, sinoPitch); + + float* angle = new float[dims.iProjAngles]; + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) + angle[i] = i*(M_PI/dims.iProjAngles); + + CGLS cgls; + + cgls.setGeometry(dims, angle); + cgls.init(); + + cgls.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch); + + cgls.iterate(25); + + delete[] angle; + + copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + + saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img); + + return 0; +} +#endif diff --git a/cuda/2d/cgls.h b/cuda/2d/cgls.h new file mode 100644 index 0000000..1013bf8 --- /dev/null +++ b/cuda/2d/cgls.h @@ -0,0 +1,92 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_CGLS_H +#define _CUDA_CGLS_H + +#include "util.h" +#include "algo.h" + +namespace astraCUDA { + +class _AstraExport CGLS : public ReconAlgo { +public: + CGLS(); + virtual ~CGLS(); + + // disable some features + virtual bool enableSinogramMask() { return false; } + virtual bool setMinConstraint(float) { return false; } + virtual bool setMaxConstraint(float) { return false; } + + virtual bool init(); + + virtual bool setBuffers(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch); + + virtual bool copyDataToGPU(const float* pfSinogram, unsigned int iSinogramPitch, float fSinogramScale, + const float* pfReconstruction, unsigned int iReconstructionPitch, + const float* pfVolMask, unsigned int iVolMaskPitch, + const float* pfSinoMask, unsigned int iSinoMaskPitch); + + + virtual bool iterate(unsigned int iterations); + + virtual float computeDiffNorm(); + +protected: + void reset(); + + bool sliceInitialized; + + // Buffers + float* D_r; + unsigned int rPitch; + + float* D_w; + unsigned int wPitch; + + float* D_z; + unsigned int zPitch; + + float* D_p; + unsigned int pPitch; + + + float gamma; +}; + + +_AstraExport bool doCGLS(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, unsigned int iterations); + +} + +#endif diff --git a/cuda/2d/darthelper.cu b/cuda/2d/darthelper.cu new file mode 100644 index 0000000..db0036e --- /dev/null +++ b/cuda/2d/darthelper.cu @@ -0,0 +1,358 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include "util.h" +#include "darthelper.h" +#include <cassert> + +namespace astraCUDA { + +// CUDA function for the selection of ROI +__global__ void devRoiSelect(float* in, float radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +{ + float x = (float)(threadIdx.x + 16*blockIdx.x); + float y = (float)(threadIdx.y + 16*blockIdx.y); + + float w = (width-1.0f)*0.5f; + float h = (height-1.0f)*0.5f; + + if ((x-w)*(x-w) + (y-h)*(y-h) > radius * radius * 0.25f) + { + float* d = (float*)in; + unsigned int o = (y+padY)*pitch+x+padX; + d[o] = 0.0f; + } +} + +void roiSelect(float* out, float radius, unsigned int width, unsigned int height) +{ + float* D_data; + + unsigned int pitch; + allocateVolume(D_data, width+2, height+2, pitch); + copyVolumeToDevice(out, width, width, height, D_data, pitch); + + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + devRoiSelect<<<gridSize, blockSize>>>(D_data, radius, pitch, width, height, 1, 1); + + copyVolumeFromDevice(out, width, width, height, D_data, pitch); + + cudaFree(D_data); +} + + + + +// CUDA function for the masking of DART with a radius == 1 +__global__ void devDartMask(float* mask, const float* in, unsigned int conn, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > 0 && x < width - 1 && y > 0 && y < height - 1) { + float* d = (float*)in; + float* m = (float*)mask; + + unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. + unsigned int o1 = o2 - pitch; // On previous row. + unsigned int o3 = o2 + pitch; // On next row. + + if ((conn == 8 && // 8-connected + (d[o1 - 1] != d[o2] || d[o1] != d[o2] || d[o1 + 1] != d[o2] || + d[o2 - 1] != d[o2] || d[o2 + 1] != d[o2] || + d[o3 - 1] != d[o2] || d[o3] != d[o2] || d[o3 + 1] != d[o2] )) + || + (conn == 4 && // 4-connected + ( d[o1] != d[o2] || + d[o2 - 1] != d[o2] || d[o3 + 1] != d[o2] || + d[o3] != d[o2] ))) + { + m[o2] = 1.0f; + } + } +} + + +// CUDA function for the masking of DART with a radius > 1 +__global__ void devDartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > radius-1 && x < width - radius && y > radius-1 && y < height - radius) + { + float* d = (float*)in; + float* m = (float*)mask; + + int r = radius; + + // o2: index of the current center pixel + int o2 = (y+padY)*pitch+x+padX; + + if (conn == 8) // 8-connected + { + for (int row = -r; row <= r; row++) + { + int o1 = (y+padY+row)*pitch+x+padX; + for (int col = -r; col <= r; col++) + { + if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;} + } + } + } + else if (conn == 4) // 4-connected + { + // horizontal + unsigned int o1 = (y+padY)*pitch+x+padX; + for (int col = -r; col <= r; col++) + { + if (d[o1 + col] != d[o2]) {m[o2] = 1.0f; return;} + } + + // vertical + for (int row = -r; row <= r; row++) + { + unsigned int o1 = (y+padY+row)*pitch+x+padX; + if (d[o1] != d[o2]) {m[o2] = 1.0f; return;} + } + } + } +} + + +// CUDA function for the masking of ADART with a radius == 1 +__global__ void devADartMask(float* mask, const float* in, unsigned int conn, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > 0 && x < width - 1 && y > 0 && y < height - 1) { + float* d = (float*)in; + float* m = (float*)mask; + + unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. + unsigned int o1 = o2 - pitch; // On previous row. + unsigned int o3 = o2 + pitch; // On next row. + + if (conn == 8) + { + if (d[o1 - 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o1 ] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o1 + 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o2 - 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o2 + 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o3 - 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o3 ] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o3 + 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + } + else if (conn == 4) + { + if (d[o1 ] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o2 - 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o2 + 1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + if (d[o3 ] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + } + } +} + + +// CUDA function for the masking of ADART with a radius > 1 +__global__ void devADartMaskRadius(float* mask, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > radius-1 && x < width - radius && y > radius-1 && y < height - radius) + { + float* d = (float*)in; + float* m = (float*)mask; + + int r = radius; + + unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. + + if (conn == 8) + { + for (int row = -r; row <= r; row++) + { + unsigned int o1 = (y+padY+row)*pitch+x+padX; + for (int col = -r; col <= r; col++) + { + if (d[o1+col] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + } + } + } + else if (conn == 4) + { + // horizontal + for (int col = -r; col <= r; col++) + { + if (d[o2+col] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + } + + // vertical + for (int row = -r; row <= r; row++) + { + unsigned int o1 = (y+padY+row)*pitch+x+padX; + if (d[o1] != d[o2] && --threshold == 0) {m[o2] = 1.0f; return;} + } + } + } +} + + +void dartMask(float* mask, const float* segmentation, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int width, unsigned int height) +{ + float* D_segmentationData; + float* D_maskData; + + unsigned int pitch; + allocateVolume(D_segmentationData, width+2, height+2, pitch); + copyVolumeToDevice(segmentation, width, width, height, D_segmentationData, pitch); + + allocateVolume(D_maskData, width+2, height+2, pitch); + zeroVolume(D_maskData, pitch, width+2, height+2); + + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + if (threshold == 1 && radius == 1) + devDartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, pitch, width, height, 1, 1); + else if (threshold > 1 && radius == 1) + devADartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height, 1, 1); + else if (threshold == 1 && radius > 1) + devDartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height, 1, 1); + else + devADartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height, 1, 1); + + copyVolumeFromDevice(mask, width, width, height, D_maskData, pitch); + + cudaFree(D_segmentationData); + cudaFree(D_maskData); + +} + + +__global__ void devDartSmoothingRadius(float* out, const float* in, float b, unsigned int radius, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > radius-1 && x < width - radius && y > radius-1 && y < height - radius) + { + float* d = (float*)in; + float* m = (float*)out; + + unsigned int o2 = (y+padY)*pitch+x+padX; + int r = radius; + float res = -d[o2]; + + for (int row = -r; row < r; row++) + { + unsigned int o1 = (y+padY+row)*pitch+x+padX; + for (int col = -r; col <= r; col++) + { + res += d[o1+col]; + } + } + + res *= b / 4*r*(r+1); + res += (1.0f-b) * d[o2]; + + m[o2] = res; + } +} + + +__global__ void devDartSmoothing(float* out, const float* in, float b, unsigned int pitch, unsigned int width, unsigned int height, unsigned int padX, unsigned int padY) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > 0 && x < width - 1 && y > 0 && y < height - 1) { + float* d = (float*)in; + float* m = (float*)out; + + unsigned int o2 = (y+padY)*pitch+x+padX; // On this row. + unsigned int o1 = o2 - pitch; // On previous row. + unsigned int o3 = o2 + pitch; // On next row. + + m[o2] = (1.0f-b) * d[o2] + b * 0.125f * (d[o1 - 1] + d[o1] + d[o1 + 1] + d[o2 - 1] + d[o2 + 1] + d[o3 - 1] + d[o3] + d[o3 + 1]); + } +} + + +void dartSmoothing(float* out, const float* in, float b, unsigned int radius, unsigned int width, unsigned int height) +{ + float* D_inData; + float* D_outData; + + unsigned int pitch; + allocateVolume(D_inData, width+2, height+2, pitch); + copyVolumeToDevice(in, width, width, height, D_inData, pitch); + + allocateVolume(D_outData, width+2, height+2, pitch); + zeroVolume(D_outData, pitch, width+2, height+2); + + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + if (radius == 1) + devDartSmoothing<<<gridSize, blockSize>>>(D_outData, D_inData, b, pitch, width, height, 1, 1); + else + devDartSmoothingRadius<<<gridSize, blockSize>>>(D_outData, D_inData, b, radius, pitch, width, height, 1, 1); + + copyVolumeFromDevice(out, width, width, height, D_outData, pitch); + + cudaFree(D_outData); + cudaFree(D_inData); + +} + + + +bool setGPUIndex(int iGPUIndex) +{ + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + return true; +} + + +} diff --git a/cuda/2d/darthelper.h b/cuda/2d/darthelper.h new file mode 100644 index 0000000..e05f01e --- /dev/null +++ b/cuda/2d/darthelper.h @@ -0,0 +1,44 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_ARITH2_H +#define _CUDA_ARITH2_H + +#include <cuda.h> + +namespace astraCUDA { + + void roiSelect(float* out, float radius, unsigned int width, unsigned int height); + void dartMask(float* out, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, unsigned int width, unsigned int height); + void dartSmoothing(float* out, const float* in, float b, unsigned int radius, unsigned int width, unsigned int height); + + bool setGPUIndex(int index); + +} + +#endif diff --git a/cuda/2d/dataop.cu b/cuda/2d/dataop.cu new file mode 100644 index 0000000..68573b2 --- /dev/null +++ b/cuda/2d/dataop.cu @@ -0,0 +1,130 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include "util.h" +#include "dataop.h" +#include "arith.h" +#include <cassert> + +namespace astraCUDA { + +void operationVolumeMult(float* data1, float* data2, unsigned int width, unsigned int height) +{ + float* D_data1; + float* D_data2; + + unsigned int pitch; + allocateVolume(D_data1, width+2, height+2, pitch); + copyVolumeToDevice(data1, width, width, height, D_data1, pitch); + + allocateVolume(D_data2, width+2, height+2, pitch); + copyVolumeToDevice(data2, width, width, height, D_data2, pitch); + + processVol<opMul, VOL>(D_data1, D_data2, pitch, width, height); + + copyVolumeFromDevice(data1, width, width, height, D_data1, pitch); + + cudaFree(D_data1); + cudaFree(D_data2); +} + +void operationVolumeMultScalarMask(float* data, float* mask, float scalar, unsigned int width, unsigned int height) +{ + float* D_data; + float* D_mask; + + unsigned int pitch; + allocateVolume(D_data, width+2, height+2, pitch); + copyVolumeToDevice(data, width, width, height, D_data, pitch); + + allocateVolume(D_mask, width+2, height+2, pitch); + copyVolumeToDevice(mask, width, width, height, D_mask, pitch); + + processVol<opMulMask, VOL>(D_data, D_mask, scalar, pitch, width, height); + + copyVolumeFromDevice(data, width, width, height, D_data, pitch); + + cudaFree(D_data); + cudaFree(D_mask); +} + + +void operationVolumeMultScalar(float* data, float scalar, unsigned int width, unsigned int height) +{ + float* D_data; + + unsigned int pitch; + allocateVolume(D_data, width+2, height+2, pitch); + copyVolumeToDevice(data, width, width, height, D_data, pitch); + + processVol<opMul, VOL>(D_data, scalar, pitch, width, height); + + copyVolumeFromDevice(data, width, width, height, D_data, pitch); + + cudaFree(D_data); +} + + +void operationVolumeAdd(float* data1, float* data2, unsigned int width, unsigned int height) +{ + float* D_data1; + float* D_data2; + + unsigned int pitch; + allocateVolume(D_data1, width+2, height+2, pitch); + copyVolumeToDevice(data1, width, width, height, D_data1, pitch); + + allocateVolume(D_data2, width+2, height+2, pitch); + copyVolumeToDevice(data2, width, width, height, D_data2, pitch); + + processVol<opAdd, VOL>(D_data1, D_data2, pitch, width, height); + + copyVolumeFromDevice(data1, width, width, height, D_data1, pitch); + + cudaFree(D_data1); + cudaFree(D_data2); +} + + +void operationVolumeAddScalar(float* data, float scalar, unsigned int width, unsigned int height) +{ + float* D_data; + + unsigned int pitch; + allocateVolume(D_data, width+2, height+2, pitch); + copyVolumeToDevice(data, width, width, height, D_data, pitch); + + processVol<opAdd, VOL>(D_data, scalar, pitch, width, height); + + copyVolumeFromDevice(data, width, width, height, D_data, pitch); + + cudaFree(D_data); +} + + +} diff --git a/cuda/2d/dataop.h b/cuda/2d/dataop.h new file mode 100644 index 0000000..3e9c7e2 --- /dev/null +++ b/cuda/2d/dataop.h @@ -0,0 +1,47 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_DATAOP_H +#define _CUDA_DATAOP_H + +#include <cuda.h> + +namespace astraCUDA { + + void operationVolumeMult(float* data1, float* data2, unsigned int width, unsigned int height); + + void operationVolumeMultScalar(float* data, float scalar, unsigned int width, unsigned int height); + void operationVolumeMultScalarMask(float* data, float* mask, float scalar, unsigned int width, unsigned int height); + + void operationVolumeAdd(float* data1, float* data2, unsigned int width, unsigned int height); + + void operationVolumeAddScalar(float* data, float scalar, unsigned int width, unsigned int height); + +} + +#endif diff --git a/cuda/2d/dims.h b/cuda/2d/dims.h new file mode 100644 index 0000000..21ccb31 --- /dev/null +++ b/cuda/2d/dims.h @@ -0,0 +1,68 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_DIMS_H +#define _CUDA_DIMS_H + +namespace astraCUDA { + +struct SFanProjection { + // the source + float fSrcX, fSrcY; + + // the start of the (linear) detector + float fDetSX, fDetSY; + + // the length of a single detector pixel + float fDetUX, fDetUY; +}; + + +struct SDimensions { + unsigned int iVolWidth; + unsigned int iVolHeight; + unsigned int iProjAngles; + unsigned int iProjDets; + float fDetScale; // size of detector compared to volume pixels + unsigned int iRaysPerDet; + unsigned int iRaysPerPixelDim; +}; + +struct SDimensions3D { + unsigned int iVolX; + unsigned int iVolY; + unsigned int iVolZ; + unsigned int iProjAngles; + unsigned int iProjU; // number of detectors in the U direction + unsigned int iProjV; // number of detectors in the V direction +}; + +} + +#endif + diff --git a/cuda/2d/em.cu b/cuda/2d/em.cu new file mode 100644 index 0000000..74d1bbf --- /dev/null +++ b/cuda/2d/em.cu @@ -0,0 +1,262 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "em.h" +#include "util.h" +#include "arith.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +namespace astraCUDA { + + +// TODO: ensure non-negativity somewhere?? + + +EM::EM() +{ + D_projData = 0; + D_tmpData = 0; + D_pixelWeight = 0; + +} + + +EM::~EM() +{ + reset(); +} + +void EM::reset() +{ + cudaFree(D_projData); + cudaFree(D_tmpData); + cudaFree(D_pixelWeight); + + D_projData = 0; + D_tmpData = 0; + D_pixelWeight = 0; + + ReconAlgo::reset(); +} + + +bool EM::init() +{ + allocateVolume(D_pixelWeight, dims.iVolWidth+2, dims.iVolHeight+2, pixelPitch); + zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); + + allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + + allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + + // We can't precompute pixelWeights when using a volume mask +#if 0 + if (!useVolumeMask) +#endif + precomputeWeights(); + + // TODO: check if allocations succeeded + return true; +} + +bool EM::precomputeWeights() +{ + zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); +#if 0 + if (useSinogramMask) { + callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch); + } else +#endif + { + processVol<opSet, SINO>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles); + callBP(D_pixelWeight, pixelPitch, D_projData, projPitch); + } + processVol<opInvert, VOL>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); + +#if 0 + if (useVolumeMask) { + // scale pixel weights with mask to zero out masked pixels + processVol<opMul, VOL>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight); + } +#endif + + return true; +} + +bool EM::iterate(unsigned int iterations) +{ + shouldAbort = false; + +#if 0 + if (useVolumeMask) + precomputeWeights(); +#endif + + // iteration + for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) { + + // Do FP of volumeData + zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + callFP(D_volumeData, volumePitch, D_projData, projPitch, 1.0f); + + // Divide sinogram by FP (into projData) + processVol<opDividedBy, SINO>(D_projData, D_sinoData, projPitch, dims.iProjDets, dims.iProjAngles); + + // Do BP of projData into tmpData + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + callBP(D_tmpData, tmpPitch, D_projData, projPitch); + + // Multiply volumeData with tmpData divided by pixel weights + processVol<opMul2, VOL>(D_volumeData, D_tmpData, D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); + + } + + return true; +} + +float EM::computeDiffNorm() +{ + // copy sinogram to projection data + cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); + processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); + } else { + callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); + } + + + // compute norm of D_projData + + float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + + return sqrt(s); +} + + +bool doEM(float* D_volumeData, unsigned int volumePitch, + float* D_sinoData, unsigned int sinoPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, unsigned int iterations) +{ + EM em; + bool ok = true; + + ok &= em.setGeometry(dims, angles); + if (TOffsets) + ok &= em.setTOffsets(TOffsets); + + if (!ok) + return false; + + ok = em.init(); + if (!ok) + return false; + + ok &= em.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch); + if (!ok) + return false; + + ok = em.iterate(iterations); + + return ok; +} + +} + +#ifdef STANDALONE + +using namespace astraCUDA; + +int main() +{ + float* D_volumeData; + float* D_sinoData; + + SDimensions dims; + dims.iVolWidth = 1024; + dims.iVolHeight = 1024; + dims.iProjAngles = 512; + dims.iProjDets = 1536; + dims.fDetScale = 1.0f; + dims.iRaysPerDet = 1; + unsigned int volumePitch, sinoPitch; + + allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + printf("pitch: %u\n", volumePitch); + + allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + printf("pitch: %u\n", sinoPitch); + + unsigned int y, x; + float* sino = loadImage("sino.png", y, x); + + float* img = new float[dims.iVolWidth*dims.iVolHeight]; + + copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_sinoData, sinoPitch); + + float* angle = new float[dims.iProjAngles]; + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) + angle[i] = i*(M_PI/dims.iProjAngles); + + EM em; + + em.setGeometry(dims, angle); + em.init(); + + // TODO: Initialize D_volumeData with an unfiltered backprojection + + em.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch); + + em.iterate(25); + + + delete[] angle; + + copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + + saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img); + + return 0; +} + +#endif diff --git a/cuda/2d/em.h b/cuda/2d/em.h new file mode 100644 index 0000000..5a9ffed --- /dev/null +++ b/cuda/2d/em.h @@ -0,0 +1,77 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_EM_H +#define _CUDA_EM_H + +#include "util.h" +#include "algo.h" + +namespace astraCUDA { + +class _AstraExport EM : public ReconAlgo { +public: + EM(); + virtual ~EM(); + + // disable some features + virtual bool enableSinogramMask() { return false; } + virtual bool enableVolumeMask() { return false; } + virtual bool setMinConstraint(float) { return false; } + virtual bool setMaxConstraint(float) { return false; } + + virtual bool init(); + + virtual bool iterate(unsigned int iterations); + + virtual float computeDiffNorm(); + +protected: + void reset(); + bool precomputeWeights(); + + // Temporary buffers + float* D_projData; + unsigned int projPitch; + + float* D_tmpData; + unsigned int tmpPitch; + + // Geometry-specific precomputed data + float* D_pixelWeight; + unsigned int pixelPitch; +}; + +_AstraExport bool doEM(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, unsigned int iterations); + +} + +#endif diff --git a/cuda/2d/fan_bp.cu b/cuda/2d/fan_bp.cu new file mode 100644 index 0000000..1edc6d9 --- /dev/null +++ b/cuda/2d/fan_bp.cu @@ -0,0 +1,374 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> + +#include "util.h" +#include "arith.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +#define PIXELTRACE + + +typedef texture<float, 2, cudaReadModeElementType> texture2D; + +static texture2D gT_FanProjTexture; + + +namespace astraCUDA { + +const unsigned int g_anglesPerBlock = 16; +const unsigned int g_blockSliceSize = 32; +const unsigned int g_blockSlices = 16; + +const unsigned int g_MaxAngles = 2560; + +__constant__ float gC_SrcX[g_MaxAngles]; +__constant__ float gC_SrcY[g_MaxAngles]; +__constant__ float gC_DetSX[g_MaxAngles]; +__constant__ float gC_DetSY[g_MaxAngles]; +__constant__ float gC_DetUX[g_MaxAngles]; +__constant__ float gC_DetUY[g_MaxAngles]; + + +static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + + gT_FanProjTexture.addressMode[0] = cudaAddressModeClamp; + gT_FanProjTexture.addressMode[1] = cudaAddressModeClamp; + gT_FanProjTexture.filterMode = cudaFilterModeLinear; + gT_FanProjTexture.normalized = false; + + cudaBindTexture2D(0, gT_FanProjTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch); + + // TODO: error value? + + return true; +} + +__global__ void devFanBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims) +{ + const int relX = threadIdx.x; + const int relY = threadIdx.y; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + const int X = blockIdx.x * g_blockSlices + relX; + const int Y = blockIdx.y * g_blockSliceSize + relY; + + if (X >= dims.iVolWidth || Y >= dims.iVolHeight) + return; + + const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ); + const float fY = - ( Y - 0.5f*dims.iVolHeight + 0.5f ); + + float* volData = (float*)D_volData; + + float fVal = 0.0f; + float fA = startAngle + 0.5f; + + // TODO: Distance correction? + + for (int angle = startAngle; angle < endAngle; ++angle) + { + const float fSrcX = gC_SrcX[angle]; + const float fSrcY = gC_SrcY[angle]; + const float fDetSX = gC_DetSX[angle]; + const float fDetSY = gC_DetSY[angle]; + const float fDetUX = gC_DetUX[angle]; + const float fDetUY = gC_DetUY[angle]; + + const float fXD = fSrcX - fX; + const float fYD = fSrcY - fY; + + const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX; + const float fDen = fDetUX * fYD - fDetUY * fXD; + + const float fT = fNum / fDen + 1.0f; + fVal += tex2D(gT_FanProjTexture, fT, fA); + fA += 1.0f; + } + + volData[(Y+1)*volPitch+X+1] += fVal; +} + +// supersampling version +__global__ void devFanBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, const SDimensions dims) +{ + const int relX = threadIdx.x; + const int relY = threadIdx.y; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + const int X = blockIdx.x * g_blockSlices + relX; + const int Y = blockIdx.y * g_blockSliceSize + relY; + + if (X >= dims.iVolWidth || Y >= dims.iVolHeight) + return; + + const float fXb = ( X - 0.5f*dims.iVolWidth + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim); + const float fYb = - ( Y - 0.5f*dims.iVolHeight + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim); + + const float fSubStep = 1.0f/dims.iRaysPerPixelDim; + + float* volData = (float*)D_volData; + + float fVal = 0.0f; + float fA = startAngle + 0.5f; + + // TODO: Distance correction? + + for (int angle = startAngle; angle < endAngle; ++angle) + { + const float fSrcX = gC_SrcX[angle]; + const float fSrcY = gC_SrcY[angle]; + const float fDetSX = gC_DetSX[angle]; + const float fDetSY = gC_DetSY[angle]; + const float fDetUX = gC_DetUX[angle]; + const float fDetUY = gC_DetUY[angle]; + + // TODO: Optimize these loops... + float fX = fXb; + for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) { + float fY = fYb; + for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) { + const float fXD = fSrcX - fX; + const float fYD = fSrcY - fY; + + const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX; + const float fDen = fDetUX * fYD - fDetUY * fXD; + + const float fT = fNum / fDen + 1.0f; + fVal += tex2D(gT_FanProjTexture, fT, fA); + fY -= fSubStep; + } + fX += fSubStep; + } + fA += 1.0f; + } + + volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim); +} + + +// BP specifically for SART. +// It includes (free) weighting with voxel weight. +// It assumes the proj texture is set up _without_ padding, unlike regular BP. +__global__ void devFanBP_SART(float* D_volData, unsigned int volPitch, const SDimensions dims) +{ + const int relX = threadIdx.x; + const int relY = threadIdx.y; + + const int X = blockIdx.x * g_blockSlices + relX; + const int Y = blockIdx.y * g_blockSliceSize + relY; + + if (X >= dims.iVolWidth || Y >= dims.iVolHeight) + return; + + const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ); + const float fY = - ( Y - 0.5f*dims.iVolHeight + 0.5f ); + + float* volData = (float*)D_volData; + + // TODO: Distance correction? + + // TODO: Constant memory vs parameters. + const float fSrcX = gC_SrcX[0]; + const float fSrcY = gC_SrcY[0]; + const float fDetSX = gC_DetSX[0]; + const float fDetSY = gC_DetSY[0]; + const float fDetUX = gC_DetUX[0]; + const float fDetUY = gC_DetUY[0]; + + const float fXD = fSrcX - fX; + const float fYD = fSrcY - fY; + + const float fNum = fDetSY * fXD - fDetSX * fYD + fX*fSrcY - fY*fSrcX; + const float fDen = fDetUX * fYD - fDetUY * fXD; + + const float fT = fNum / fDen; + const float fVal = tex2D(gT_FanProjTexture, fT, 0.5f); + + volData[(Y+1)*volPitch+X+1] += fVal; +} + + +bool FanBP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const SFanProjection* angles) +{ + // TODO: process angles block by block + assert(dims.iProjAngles <= g_MaxAngles); + + bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(SrcX); + TRANSFER_TO_CONSTANT(SrcY); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + dim3 dimBlock(g_blockSlices, g_blockSliceSize); + dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices, + (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) { + if (dims.iRaysPerPixelDim > 1) + devFanBP_SS<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims); + else + devFanBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, dims); + } + cudaThreadSynchronize(); + + cudaTextForceKernelsCompletion(); + + cudaStreamDestroy(stream); + + return true; +} + + +// D_projData is a pointer to one padded sinogram line +bool FanBP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle, + const SDimensions& dims, const SFanProjection* angles) +{ + // only one angle + bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1); + + // transfer angle to constant memory +#define TRANSFER_TO_CONSTANT(name) do { cudaMemcpyToSymbol(gC_##name, &(angles[angle].f##name), sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(SrcX); + TRANSFER_TO_CONSTANT(SrcY); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + +#undef TRANSFER_TO_CONSTANT + + dim3 dimBlock(g_blockSlices, g_blockSliceSize); + dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices, + (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize); + + devFanBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, dims); + cudaThreadSynchronize(); + + cudaTextForceKernelsCompletion(); + + return true; +} + + +} + +#ifdef STANDALONE + +using namespace astraCUDA; + +int main() +{ + float* D_volumeData; + float* D_projData; + + SDimensions dims; + dims.iVolWidth = 128; + dims.iVolHeight = 128; + dims.iProjAngles = 180; + dims.iProjDets = 256; + dims.fDetScale = 1.0f; + dims.iRaysPerDet = 1; + unsigned int volumePitch, projPitch; + + SFanProjection projs[180]; + + projs[0].fSrcX = 0.0f; + projs[0].fSrcY = 1536.0f; + projs[0].fDetSX = 128.0f; + projs[0].fDetSY = -512.0f; + projs[0].fDetUX = -1.0f; + projs[0].fDetUY = 0.0f; + +#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = projs[0].f##name##X * cos(alpha) - projs[0].f##name##Y * sin(alpha); projs[i].f##name##Y = projs[0].f##name##X * sin(alpha) + projs[0].f##name##Y * cos(alpha); } while(0) + + for (int i = 1; i < 180; ++i) { + ROTATE0(Src, i, i*2*M_PI/180); + ROTATE0(DetS, i, i*2*M_PI/180); + ROTATE0(DetU, i, i*2*M_PI/180); + } + +#undef ROTATE0 + + allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + printf("pitch: %u\n", volumePitch); + + allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + printf("pitch: %u\n", projPitch); + + unsigned int y, x; + float* sino = loadImage("sino.png", y, x); + + float* img = new float[dims.iVolWidth*dims.iVolHeight]; + + memset(img, 0, dims.iVolWidth*dims.iVolHeight*sizeof(float)); + + copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); + + FanBP(D_volumeData, volumePitch, D_projData, projPitch, dims, projs); + + copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + + saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img); + + return 0; +} +#endif diff --git a/cuda/2d/fan_bp.h b/cuda/2d/fan_bp.h new file mode 100644 index 0000000..a4e62be --- /dev/null +++ b/cuda/2d/fan_bp.h @@ -0,0 +1,45 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_FAN_BP_H +#define _CUDA_FAN_BP_H + +namespace astraCUDA { + +_AstraExport bool FanBP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const SFanProjection* angles); + +_AstraExport bool FanBP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle, + const SDimensions& dims, const SFanProjection* angles); + +} + +#endif diff --git a/cuda/2d/fan_fp.cu b/cuda/2d/fan_fp.cu new file mode 100644 index 0000000..cf9f352 --- /dev/null +++ b/cuda/2d/fan_fp.cu @@ -0,0 +1,370 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include "util.h" +#include "arith.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + + +typedef texture<float, 2, cudaReadModeElementType> texture2D; + +static texture2D gT_FanVolumeTexture; + + +namespace astraCUDA { + +static const unsigned g_MaxAngles = 2560; +__constant__ float gC_SrcX[g_MaxAngles]; +__constant__ float gC_SrcY[g_MaxAngles]; +__constant__ float gC_DetSX[g_MaxAngles]; +__constant__ float gC_DetSY[g_MaxAngles]; +__constant__ float gC_DetUX[g_MaxAngles]; +__constant__ float gC_DetUY[g_MaxAngles]; + + +// optimization parameters +static const unsigned int g_anglesPerBlock = 16; +static const unsigned int g_detBlockSize = 32; +static const unsigned int g_blockSlices = 64; + +static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + dataArray = 0; + cudaMallocArray(&dataArray, &channelDesc, width, height); + cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice); + + gT_FanVolumeTexture.addressMode[0] = cudaAddressModeClamp; + gT_FanVolumeTexture.addressMode[1] = cudaAddressModeClamp; + gT_FanVolumeTexture.filterMode = cudaFilterModeLinear; + gT_FanVolumeTexture.normalized = false; + + // TODO: For very small sizes (roughly <=512x128) with few angles (<=180) + // not using an array is more efficient. + //cudaBindTexture2D(0, gT_FanVolumeTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch); + cudaBindTextureToArray(gT_FanVolumeTexture, dataArray, channelDesc); + + // TODO: error value? + + return true; +} + +// projection for angles that are roughly horizontal +// (detector roughly vertical) +__global__ void FanFPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) +{ + float* projData = (float*)D_projData; + const int relDet = threadIdx.x; + const int relAngle = threadIdx.y; + + const int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; + if (angle >= endAngle) + return; + + const int detector = blockIdx.y * g_detBlockSize + relDet; + + if (detector < 0 || detector >= dims.iProjDets) + return; + + const float fSrcX = gC_SrcX[angle]; + const float fSrcY = gC_SrcY[angle]; + const float fDetSX = gC_DetSX[angle]; + const float fDetSY = gC_DetSY[angle]; + const float fDetUX = gC_DetUX[angle]; + const float fDetUY = gC_DetUY[angle]; + + float fVal = 0.0f; + + const float fdx = fabsf(fDetSX + detector*fDetUX + 0.5f - fSrcX); + const float fdy = fabsf(fDetSY + detector*fDetUY + 0.5f - fSrcY); + + if (fdy > fdx) + return; + + + for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { + const float fDet = detector + (0.5f + iSubT) / dims.iRaysPerDet; + + const float fDetX = fDetSX + fDet * fDetUX; + const float fDetY = fDetSY + fDet * fDetUY; + + // ray: y = alpha * x + beta + const float alpha = (fSrcY - fDetY) / (fSrcX - fDetX); + const float beta = fSrcY - alpha * fSrcX; + + const float fDistCorr = sqrt(alpha*alpha+1.0f) * outputScale / dims.iRaysPerDet; + + // intersect ray with first slice + + float fY = -alpha * (startSlice - 0.5f*dims.iVolWidth + 0.5f) - beta + 0.5f*dims.iVolHeight - 0.5f + 1.5f; + float fX = startSlice + 1.5f; + + int endSlice = startSlice + g_blockSlices; + if (endSlice > dims.iVolWidth) + endSlice = dims.iVolWidth; + + float fV = 0.0f; + for (int slice = startSlice; slice < endSlice; ++slice) + { + fV += tex2D(gT_FanVolumeTexture, fX, fY); + fY -= alpha; + fX += 1.0f; + } + + fVal += fV * fDistCorr; + + } + + projData[angle*projPitch+detector+1] += fVal; +} + + +// projection for angles that are roughly vertical +// (detector roughly horizontal) +__global__ void FanFPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) +{ + const int relDet = threadIdx.x; + const int relAngle = threadIdx.y; + + const int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; + + if (angle >= endAngle) + return; + + const int detector = blockIdx.y * g_detBlockSize + relDet; + + if (detector < 0 || detector >= dims.iProjDets) + return; + + float* projData = (float*)D_projData; + + const float fSrcX = gC_SrcX[angle]; + const float fSrcY = gC_SrcY[angle]; + const float fDetSX = gC_DetSX[angle]; + const float fDetSY = gC_DetSY[angle]; + const float fDetUX = gC_DetUX[angle]; + const float fDetUY = gC_DetUY[angle]; + + float fVal = 0.0f; + + const float fdx = fabsf(fDetSX + detector*fDetUX + 0.5f - fSrcX); + const float fdy = fabsf(fDetSY + detector*fDetUY + 0.5f - fSrcY); + + if (fdy <= fdx) + return; + + + for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { + const float fDet = detector + (0.5f + iSubT) / dims.iRaysPerDet /*- gC_angle_offset[angle]*/; + + const float fDetX = fDetSX + fDet * fDetUX; + const float fDetY = fDetSY + fDet * fDetUY; + + // ray: x = alpha * y + beta + const float alpha = (fSrcX - fDetX) / (fSrcY - fDetY); + const float beta = fSrcX - alpha * fSrcY; + + const float fDistCorr = sqrt(alpha*alpha+1) * outputScale / dims.iRaysPerDet; + + // intersect ray with first slice + + float fX = -alpha * (startSlice - 0.5f*dims.iVolHeight + 0.5f) + beta + 0.5f*dims.iVolWidth - 0.5f + 1.5f; + float fY = startSlice + 1.5f; + + int endSlice = startSlice + g_blockSlices; + if (endSlice > dims.iVolHeight) + endSlice = dims.iVolHeight; + + float fV = 0.0f; + + for (int slice = startSlice; slice < endSlice; ++slice) + { + fV += tex2D(gT_FanVolumeTexture, fX, fY); + fX -= alpha; + fY += 1.0f; + } + + fVal += fV * fDistCorr; + + } + + projData[angle*projPitch+detector+1] += fVal; +} + +bool FanFP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const SFanProjection* angles, + float outputScale) +{ + // TODO: load angles into constant memory in smaller blocks + assert(dims.iProjAngles <= g_MaxAngles); + + cudaArray* D_dataArray; + bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(SrcX); + TRANSFER_TO_CONSTANT(SrcY); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles + const unsigned int g_blockSliceSize = g_detBlockSize; + + std::list<cudaStream_t> streams; + + + unsigned int blockStart = 0; + unsigned int blockEnd = dims.iProjAngles; + + dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock, + (dims.iProjDets+g_blockSliceSize-1)/g_blockSliceSize); // angle blocks, regions + cudaStream_t stream1; + cudaStreamCreate(&stream1); + streams.push_back(stream1); + for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices) + FanFPhorizontal<<<dimGrid, dimBlock, 0, stream1>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale); + + cudaStream_t stream2; + cudaStreamCreate(&stream2); + streams.push_back(stream2); + for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices) + FanFPvertical<<<dimGrid, dimBlock, 0, stream2>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale); + + cudaStreamDestroy(stream1); + cudaStreamDestroy(stream2); + + cudaThreadSynchronize(); + + cudaTextForceKernelsCompletion(); + + cudaFreeArray(D_dataArray); + + return true; +} + +} + +#ifdef STANDALONE + +using namespace astraCUDA; + +int main() +{ + float* D_volumeData; + float* D_projData; + + SDimensions dims; + dims.iVolWidth = 128; + dims.iVolHeight = 128; + dims.iProjAngles = 180; + dims.iProjDets = 256; + dims.fDetScale = 1.0f; + dims.iRaysPerDet = 1; + unsigned int volumePitch, projPitch; + + SFanProjection projs[180]; + + projs[0].fSrcX = 0.0f; + projs[0].fSrcY = 1536.0f; + projs[0].fDetSX = 128.0f; + projs[0].fDetSY = -512.0f; + projs[0].fDetUX = -1.0f; + projs[0].fDetUY = 0.0f; + +#define ROTATE0(name,i,alpha) do { projs[i].f##name##X = projs[0].f##name##X * cos(alpha) - projs[0].f##name##Y * sin(alpha); projs[i].f##name##Y = projs[0].f##name##X * sin(alpha) + projs[0].f##name##Y * cos(alpha); } while(0) + + for (int i = 1; i < 180; ++i) { + ROTATE0(Src, i, i*2*M_PI/180); + ROTATE0(DetS, i, i*2*M_PI/180); + ROTATE0(DetU, i, i*2*M_PI/180); + } + +#undef ROTATE0 + + allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + printf("pitch: %u\n", volumePitch); + + allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + printf("pitch: %u\n", projPitch); + + unsigned int y, x; + float* img = loadImage("phantom128.png", y, x); + + float* sino = new float[dims.iProjAngles * dims.iProjDets]; + + memset(sino, 0, dims.iProjAngles * dims.iProjDets * sizeof(float)); + + copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); + + float* angle = new float[dims.iProjAngles]; + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) + angle[i] = i*(M_PI/dims.iProjAngles); + + FanFP(D_volumeData, volumePitch, D_projData, projPitch, dims, projs, 1.0f); + + delete[] angle; + + copySinogramFromDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); + + float s = 0.0f; + for (unsigned int y = 0; y < dims.iProjAngles; ++y) + for (unsigned int x = 0; x < dims.iProjDets; ++x) + s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x]; + printf("cpu norm: %f\n", s); + + //zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + printf("gpu norm: %f\n", s); + + saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino); + + + return 0; +} +#endif diff --git a/cuda/2d/fan_fp.h b/cuda/2d/fan_fp.h new file mode 100644 index 0000000..0734f40 --- /dev/null +++ b/cuda/2d/fan_fp.h @@ -0,0 +1,41 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_FAN_FP_H +#define _CUDA_FAN_FP_H + +namespace astraCUDA { + +_AstraExport bool FanFP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const SFanProjection* angles, + float outputScale); + +} + +#endif diff --git a/cuda/2d/fbp_filters.h b/cuda/2d/fbp_filters.h new file mode 100644 index 0000000..1232f8e --- /dev/null +++ b/cuda/2d/fbp_filters.h @@ -0,0 +1,58 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef FBP_FILTERS_H +#define FBP_FILTERS_H + +enum E_FBPFILTER +{ + FILTER_NONE, //< no filter (regular BP) + FILTER_RAMLAK, //< default FBP filter + FILTER_SHEPPLOGAN, //< Shepp-Logan + FILTER_COSINE, //< Cosine + FILTER_HAMMING, //< Hamming filter + FILTER_HANN, //< Hann filter + FILTER_TUKEY, //< Tukey filter + FILTER_LANCZOS, //< Lanczos filter + FILTER_TRIANGULAR, //< Triangular filter + FILTER_GAUSSIAN, //< Gaussian filter + FILTER_BARTLETTHANN, //< Bartlett-Hann filter + FILTER_BLACKMAN, //< Blackman filter + FILTER_NUTTALL, //< Nuttall filter, continuous first derivative + FILTER_BLACKMANHARRIS, //< Blackman-Harris filter + FILTER_BLACKMANNUTTALL, //< Blackman-Nuttall filter + FILTER_FLATTOP, //< Flat top filter + FILTER_KAISER, //< Kaiser filter + FILTER_PARZEN, //< Parzen filter + FILTER_PROJECTION, //< all projection directions share one filter + FILTER_SINOGRAM, //< every projection direction has its own filter + FILTER_RPROJECTION, //< projection filter in real space (as opposed to fourier space) + FILTER_RSINOGRAM, //< sinogram filter in real space +}; + +#endif /* FBP_FILTERS_H */ diff --git a/cuda/2d/fft.cu b/cuda/2d/fft.cu new file mode 100644 index 0000000..79e4be7 --- /dev/null +++ b/cuda/2d/fft.cu @@ -0,0 +1,873 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include "fft.h" +#include "util.h" + +#include <cufft.h> +#include <iostream> +#include <cuda.h> +#include <fstream> + +#include "../../include/astra/Logger.h" + +using namespace astra; + +// TODO: evaluate what we want to do in these situations: + +#define CHECK_ERROR(errorMessage) do { \ + cudaError_t err = cudaThreadSynchronize(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error: %s in file '%s' in line %i : %s.\n", \ + errorMessage, __FILE__, __LINE__, cudaGetErrorString( err) );\ + CLogger::writeTerminalCUDAError(__FILE__, __LINE__, cudaGetErrorString( err)); \ + exit(EXIT_FAILURE); \ + } } while (0) + +#define SAFE_CALL( call) do { \ + cudaError err = call; \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ + __FILE__, __LINE__, cudaGetErrorString( err) ); \ + CLogger::writeTerminalCUDAError(__FILE__, __LINE__, cudaGetErrorString( err)); \ + exit(EXIT_FAILURE); \ + } \ + err = cudaThreadSynchronize(); \ + if( cudaSuccess != err) { \ + fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \ + __FILE__, __LINE__, cudaGetErrorString( err) ); \ + CLogger::writeTerminalCUDAError(__FILE__, __LINE__, cudaGetErrorString( err)); \ + exit(EXIT_FAILURE); \ + } } while (0) + + +__global__ static void applyFilter_kernel(int _iProjectionCount, + int _iFreqBinCount, + cufftComplex * _pSinogram, + cufftComplex * _pFilter) +{ + int iIndex = threadIdx.x + blockIdx.x * blockDim.x; + int iProjectionIndex = iIndex / _iFreqBinCount; + + if(iProjectionIndex >= _iProjectionCount) + { + return; + } + + float fA = _pSinogram[iIndex].x; + float fB = _pSinogram[iIndex].y; + float fC = _pFilter[iIndex].x; + float fD = _pFilter[iIndex].y; + + _pSinogram[iIndex].x = fA * fC - fB * fD; + _pSinogram[iIndex].y = fA * fD + fC * fB; +} + +__global__ static void rescaleInverseFourier_kernel(int _iProjectionCount, + int _iDetectorCount, + float* _pfInFourierOutput) +{ + int iIndex = threadIdx.x + blockIdx.x * blockDim.x; + int iProjectionIndex = iIndex / _iDetectorCount; + int iDetectorIndex = iIndex % _iDetectorCount; + + if(iProjectionIndex >= _iProjectionCount) + { + return; + } + + _pfInFourierOutput[iProjectionIndex * _iDetectorCount + iDetectorIndex] /= (float)_iDetectorCount; +} + +static void rescaleInverseFourier(int _iProjectionCount, int _iDetectorCount, + float * _pfInFourierOutput) +{ + const int iBlockSize = 256; + int iElementCount = _iProjectionCount * _iDetectorCount; + int iBlockCount = (iElementCount + iBlockSize - 1) / iBlockSize; + + rescaleInverseFourier_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount, + _iDetectorCount, + _pfInFourierOutput); + CHECK_ERROR("rescaleInverseFourier_kernel failed"); +} + +void applyFilter(int _iProjectionCount, int _iFreqBinCount, + cufftComplex * _pSinogram, cufftComplex * _pFilter) +{ + const int iBlockSize = 256; + int iElementCount = _iProjectionCount * _iFreqBinCount; + int iBlockCount = (iElementCount + iBlockSize - 1) / iBlockSize; + + applyFilter_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount, + _iFreqBinCount, + _pSinogram, _pFilter); + CHECK_ERROR("applyFilter_kernel failed"); +} + +static bool invokeCudaFFT(int _iProjectionCount, int _iDetectorCount, + const float * _pfDevSource, + cufftComplex * _pDevTargetComplex) +{ + cufftHandle plan; + cufftResult result; + + result = cufftPlan1d(&plan, _iDetectorCount, CUFFT_R2C, _iProjectionCount); + if(result != CUFFT_SUCCESS) + { + std::cerr << "Failed to plan 1d r2c fft" << std::endl; + return false; + } + + result = cufftExecR2C(plan, (cufftReal *)_pfDevSource, _pDevTargetComplex); + cufftDestroy(plan); + + if(result != CUFFT_SUCCESS) + { + std::cerr << "Failed to exec 1d r2c fft" << std::endl; + return false; + } + + return true; +} + +static bool invokeCudaIFFT(int _iProjectionCount, int _iDetectorCount, + const cufftComplex * _pDevSourceComplex, + float * _pfDevTarget) +{ + cufftHandle plan; + cufftResult result; + + result = cufftPlan1d(&plan, _iDetectorCount, CUFFT_C2R, _iProjectionCount); + if(result != CUFFT_SUCCESS) + { + std::cerr << "Failed to plan 1d c2r fft" << std::endl; + return false; + } + + // todo: why do we have to get rid of the const qualifier? + result = cufftExecC2R(plan, (cufftComplex *)_pDevSourceComplex, + (cufftReal *)_pfDevTarget); + cufftDestroy(plan); + + if(result != CUFFT_SUCCESS) + { + std::cerr << "Failed to exec 1d c2r fft" << std::endl; + return false; + } + + return true; +} + +bool allocateComplexOnDevice(int _iProjectionCount, int _iDetectorCount, + cufftComplex ** _ppDevComplex) +{ + size_t bufferSize = sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount; + SAFE_CALL(cudaMalloc((void **)_ppDevComplex, bufferSize)); + return true; +} + +bool freeComplexOnDevice(cufftComplex * _pDevComplex) +{ + SAFE_CALL(cudaFree(_pDevComplex)); + return true; +} + +bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount, + cufftComplex * _pHostComplexSource, + cufftComplex * _pDevComplexTarget) +{ + size_t memSize = sizeof(cufftComplex) * _iProjectionCount * _iDetectorCount; + SAFE_CALL(cudaMemcpy(_pDevComplexTarget, _pHostComplexSource, memSize, cudaMemcpyHostToDevice)); + + return true; +} + +bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource, + int _iSourcePitch, int _iSourcePadX, int _iProjDets, + int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount, + cufftComplex * _pDevTargetComplex) +{ + float * pfDevRealFFTSource = NULL; + size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount; + + SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTSource, bufferMemSize)); + SAFE_CALL(cudaMemset(pfDevRealFFTSource, 0, bufferMemSize)); + + for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) + { + const float * pfSourceLocation = _pfDevRealSource + iProjectionIndex * _iSourcePitch + _iSourcePadX; + float * pfTargetLocation = pfDevRealFFTSource + iProjectionIndex * _iFFTRealDetectorCount; + + SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice)); + } + + bool bResult = invokeCudaFFT(_iProjectionCount, _iFFTRealDetectorCount, + pfDevRealFFTSource, _pDevTargetComplex); + if(!bResult) + { + return false; + } + + SAFE_CALL(cudaFree(pfDevRealFFTSource)); + + return true; +} + +bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex, + float * _pfRealTarget, + int _iTargetPitch, int _iTargetPadX, int _iProjDets, + int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount) +{ + float * pfDevRealFFTTarget = NULL; + size_t bufferMemSize = sizeof(float) * _iProjectionCount * _iFFTRealDetectorCount; + + SAFE_CALL(cudaMalloc((void **)&pfDevRealFFTTarget, bufferMemSize)); + + bool bResult = invokeCudaIFFT(_iProjectionCount, _iFFTRealDetectorCount, + _pDevSourceComplex, pfDevRealFFTTarget); + if(!bResult) + { + return false; + } + + rescaleInverseFourier(_iProjectionCount, _iFFTRealDetectorCount, + pfDevRealFFTTarget); + + SAFE_CALL(cudaMemset(_pfRealTarget, 0, sizeof(float) * _iProjectionCount * _iTargetPitch)); + + for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) + { + const float * pfSourceLocation = pfDevRealFFTTarget + iProjectionIndex * _iFFTRealDetectorCount; + float* pfTargetLocation = _pfRealTarget + iProjectionIndex * _iTargetPitch + _iTargetPadX; + + SAFE_CALL(cudaMemcpy(pfTargetLocation, pfSourceLocation, sizeof(float) * _iProjDets, cudaMemcpyDeviceToDevice)); + } + + SAFE_CALL(cudaFree(pfDevRealFFTTarget)); + + return true; +} + + +// Because the input is real, the Fourier transform is symmetric. +// CUFFT only outputs the first half (ignoring the redundant second half), +// and expects the same as input for the IFFT. +int calcFFTFourSize(int _iFFTRealSize) +{ + int iFFTFourSize = _iFFTRealSize / 2 + 1; + + return iFFTFourSize; +} + +void genIdenFilter(int _iProjectionCount, cufftComplex * _pFilter, + int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount) +{ + for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) + { + for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + int iIndex = iDetectorIndex + iProjectionIndex * _iFFTFourierDetectorCount; + _pFilter[iIndex].x = 1.0f; + _pFilter[iIndex].y = 0.0f; + } + } +} + +void genFilter(E_FBPFILTER _eFilter, float _fD, int _iProjectionCount, + cufftComplex * _pFilter, int _iFFTRealDetectorCount, + int _iFFTFourierDetectorCount, float _fParameter /* = -1.0f */) +{ + float * pfFilt = new float[_iFFTFourierDetectorCount]; + float * pfW = new float[_iFFTFourierDetectorCount]; + + for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fRelIndex = (float)iDetectorIndex / (float)_iFFTRealDetectorCount; + + // filt = 2*( 0:(order/2) )./order; + pfFilt[iDetectorIndex] = 2.0f * fRelIndex; + //pfFilt[iDetectorIndex] = 1.0f; + + // w = 2*pi*(0:size(filt,2)-1)/order + pfW[iDetectorIndex] = 3.1415f * 2.0f * fRelIndex; + } + + switch(_eFilter) + { + case FILTER_RAMLAK: + { + // do nothing + break; + } + case FILTER_SHEPPLOGAN: + { + // filt(2:end) = filt(2:end) .* (sin(w(2:end)/(2*d))./(w(2:end)/(2*d))) + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * (sinf(pfW[iDetectorIndex] / 2.0f / _fD) / (pfW[iDetectorIndex] / 2.0f / _fD)); + } + break; + } + case FILTER_COSINE: + { + // filt(2:end) = filt(2:end) .* cos(w(2:end)/(2*d)) + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * cosf(pfW[iDetectorIndex] / 2.0f / _fD); + } + break; + } + case FILTER_HAMMING: + { + // filt(2:end) = filt(2:end) .* (.54 + .46 * cos(w(2:end)/d)) + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * ( 0.54f + 0.46f * cosf(pfW[iDetectorIndex] / _fD)); + } + break; + } + case FILTER_HANN: + { + // filt(2:end) = filt(2:end) .*(1+cos(w(2:end)./d)) / 2 + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + pfFilt[iDetectorIndex] = pfFilt[iDetectorIndex] * (1.0f + cosf(pfW[iDetectorIndex] / _fD)) / 2.0f; + } + break; + } + case FILTER_TUKEY: + { + float fAlpha = _fParameter; + if(_fParameter < 0.0f) fAlpha = 0.5f; + float fN = (float)_iFFTFourierDetectorCount; + float fHalfN = fN / 2.0f; + float fEnumTerm = fAlpha * fHalfN; + float fDenom = (1.0f - fAlpha) * fHalfN; + float fBlockStart = fHalfN - fEnumTerm; + float fBlockEnd = fHalfN + fEnumTerm; + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fAbsSmallN = fabs((float)iDetectorIndex); + float fStoredValue = 0.0f; + + if((fBlockStart <= fAbsSmallN) && (fAbsSmallN <= fBlockEnd)) + { + fStoredValue = 1.0f; + } + else + { + float fEnum = fAbsSmallN - fEnumTerm; + float fCosInput = M_PI * fEnum / fDenom; + fStoredValue = 0.5f * (1.0f + cosf(fCosInput)); + } + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_LANCZOS: + { + float fDenum = (float)(_iFFTFourierDetectorCount - 1); + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fX = 2.0f * fSmallN / fDenum - 1.0f; + float fSinInput = M_PI * fX; + float fStoredValue = 0.0f; + + if(fabsf(fSinInput) > 0.001f) + { + fStoredValue = sin(fSinInput)/fSinInput; + } + else + { + fStoredValue = 1.0f; + } + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_TRIANGULAR: + { + float fNMinusOne = (float)(_iFFTFourierDetectorCount - 1); + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fAbsInput = fSmallN - fNMinusOne / 2.0f; + float fParenInput = fNMinusOne / 2.0f - fabsf(fAbsInput); + float fStoredValue = 2.0f / fNMinusOne * fParenInput; + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_GAUSSIAN: + { + float fSigma = _fParameter; + if(_fParameter < 0.0f) fSigma = 0.4f; + float fN = (float)_iFFTFourierDetectorCount; + float fQuotient = (fN - 1.0f) / 2.0f; + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fEnum = fSmallN - fQuotient; + float fDenom = fSigma * fQuotient; + float fPower = -0.5f * (fEnum / fDenom) * (fEnum / fDenom); + float fStoredValue = expf(fPower); + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_BARTLETTHANN: + { + const float fA0 = 0.62f; + const float fA1 = 0.48f; + const float fA2 = 0.38f; + float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f; + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fAbsInput = fSmallN / fNMinusOne - 0.5f; + float fFirstTerm = fA1 * fabsf(fAbsInput); + float fCosInput = 2.0f * M_PI * fSmallN / fNMinusOne; + float fSecondTerm = fA2 * cosf(fCosInput); + float fStoredValue = fA0 - fFirstTerm - fSecondTerm; + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_BLACKMAN: + { + float fAlpha = _fParameter; + if(_fParameter < 0.0f) fAlpha = 0.16f; + float fA0 = (1.0f - fAlpha) / 2.0f; + float fA1 = 0.5f; + float fA2 = fAlpha / 2.0f; + float fNMinusOne = (float)(_iFFTFourierDetectorCount - 1); + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fCosInput1 = 2.0f * M_PI * 0.5f * fSmallN / fNMinusOne; + float fCosInput2 = 4.0f * M_PI * 0.5f * fSmallN / fNMinusOne; + float fStoredValue = fA0 - fA1 * cosf(fCosInput1) + fA2 * cosf(fCosInput2); + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_NUTTALL: + { + const float fA0 = 0.355768f; + const float fA1 = 0.487396f; + const float fA2 = 0.144232f; + const float fA3 = 0.012604f; + float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f; + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fBaseCosInput = M_PI * fSmallN / fNMinusOne; + float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput); + float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput); + float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput); + float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm; + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_BLACKMANHARRIS: + { + const float fA0 = 0.35875f; + const float fA1 = 0.48829f; + const float fA2 = 0.14128f; + const float fA3 = 0.01168f; + float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f; + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fBaseCosInput = M_PI * fSmallN / fNMinusOne; + float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput); + float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput); + float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput); + float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm; + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_BLACKMANNUTTALL: + { + const float fA0 = 0.3635819f; + const float fA1 = 0.4891775f; + const float fA2 = 0.1365995f; + const float fA3 = 0.0106411f; + float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f; + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fBaseCosInput = M_PI * fSmallN / fNMinusOne; + float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput); + float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput); + float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput); + float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm; + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_FLATTOP: + { + const float fA0 = 1.0f; + const float fA1 = 1.93f; + const float fA2 = 1.29f; + const float fA3 = 0.388f; + const float fA4 = 0.032f; + float fNMinusOne = (float)(_iFFTFourierDetectorCount) - 1.0f; + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fBaseCosInput = M_PI * fSmallN / fNMinusOne; + float fFirstTerm = fA1 * cosf(2.0f * fBaseCosInput); + float fSecondTerm = fA2 * cosf(4.0f * fBaseCosInput); + float fThirdTerm = fA3 * cosf(6.0f * fBaseCosInput); + float fFourthTerm = fA4 * cosf(8.0f * fBaseCosInput); + float fStoredValue = fA0 - fFirstTerm + fSecondTerm - fThirdTerm + fFourthTerm; + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_KAISER: + { + float fAlpha = _fParameter; + if(_fParameter < 0.0f) fAlpha = 3.0f; + float fPiTimesAlpha = M_PI * fAlpha; + float fNMinusOne = (float)(_iFFTFourierDetectorCount - 1); + float fDenom = (float)j0((double)fPiTimesAlpha); + + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fSquareInput = 2.0f * fSmallN / fNMinusOne - 1; + float fSqrtInput = 1.0f - fSquareInput * fSquareInput; + float fBesselInput = fPiTimesAlpha * sqrt(fSqrtInput); + float fEnum = (float)j0((double)fBesselInput); + float fStoredValue = fEnum / fDenom; + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + case FILTER_PARZEN: + { + for(int iDetectorIndex = 1; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fSmallN = (float)iDetectorIndex; + float fQ = fSmallN / (float)(_iFFTFourierDetectorCount - 1); + float fStoredValue = 0.0f; + + if(fQ <= 0.5f) + { + fStoredValue = 1.0f - 6.0f * fQ * fQ * (1.0f - fQ); + } + else + { + float fCubedValue = 1.0f - fQ; + fStoredValue = 2.0f * fCubedValue * fCubedValue * fCubedValue; + } + + pfFilt[iDetectorIndex] *= fStoredValue; + } + + break; + } + default: + { + std::cerr << "Cannot serve requested filter" << std::endl; + } + } + + // filt(w>pi*d) = 0; + float fPiTimesD = M_PI * _fD; + for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fWValue = pfW[iDetectorIndex]; + + if(fWValue > fPiTimesD) + { + pfFilt[iDetectorIndex] = 0.0f; + } + } + + for(int iDetectorIndex = 0; iDetectorIndex < _iFFTFourierDetectorCount; iDetectorIndex++) + { + float fFilterValue = pfFilt[iDetectorIndex]; + + for(int iProjectionIndex = 0; iProjectionIndex < _iProjectionCount; iProjectionIndex++) + { + int iIndex = iDetectorIndex + iProjectionIndex * _iFFTFourierDetectorCount; + _pFilter[iIndex].x = fFilterValue; + _pFilter[iIndex].y = 0.0f; + } + } + + delete[] pfFilt; + delete[] pfW; +} + +#ifdef STANDALONE + +__global__ static void doubleFourierOutput_kernel(int _iProjectionCount, + int _iDetectorCount, + cufftComplex* _pFourierOutput) +{ + int iIndex = threadIdx.x + blockIdx.x * blockDim.x; + int iProjectionIndex = iIndex / _iDetectorCount; + int iDetectorIndex = iIndex % _iDetectorCount; + + if(iProjectionIndex >= _iProjectionCount) + { + return; + } + + if(iDetectorIndex <= (_iDetectorCount / 2)) + { + return; + } + + int iOtherDetectorIndex = _iDetectorCount - iDetectorIndex; + + _pFourierOutput[iProjectionIndex * _iDetectorCount + iDetectorIndex].x = _pFourierOutput[iProjectionIndex * _iDetectorCount + iOtherDetectorIndex].x; + _pFourierOutput[iProjectionIndex * _iDetectorCount + iDetectorIndex].y = -_pFourierOutput[iProjectionIndex * _iDetectorCount + iOtherDetectorIndex].y; +} + +static void doubleFourierOutput(int _iProjectionCount, int _iDetectorCount, + cufftComplex * _pFourierOutput) +{ + const int iBlockSize = 256; + int iElementCount = _iProjectionCount * _iDetectorCount; + int iBlockCount = (iElementCount + iBlockSize - 1) / iBlockSize; + + doubleFourierOutput_kernel<<< iBlockCount, iBlockSize >>>(_iProjectionCount, + _iDetectorCount, + _pFourierOutput); + CHECK_ERROR("doubleFourierOutput_kernel failed"); +} + + + +static void writeToMatlabFile(const char * _fileName, const float * _pfData, + int _iRowCount, int _iColumnCount) +{ + std::ofstream out(_fileName); + + for(int iRowIndex = 0; iRowIndex < _iRowCount; iRowIndex++) + { + for(int iColumnIndex = 0; iColumnIndex < _iColumnCount; iColumnIndex++) + { + out << _pfData[iColumnIndex + iRowIndex * _iColumnCount] << " "; + } + + out << std::endl; + } +} + +static void convertComplexToRealImg(const cufftComplex * _pComplex, + int _iElementCount, + float * _pfReal, float * _pfImaginary) +{ + for(int iIndex = 0; iIndex < _iElementCount; iIndex++) + { + _pfReal[iIndex] = _pComplex[iIndex].x; + _pfImaginary[iIndex] = _pComplex[iIndex].y; + } +} + +void testCudaFFT() +{ + const int iProjectionCount = 2; + const int iDetectorCount = 1024; + const int iTotalElementCount = iProjectionCount * iDetectorCount; + + float * pfHostProj = new float[iTotalElementCount]; + memset(pfHostProj, 0, sizeof(float) * iTotalElementCount); + + for(int iProjectionIndex = 0; iProjectionIndex < iProjectionCount; iProjectionIndex++) + { + for(int iDetectorIndex = 0; iDetectorIndex < iDetectorCount; iDetectorIndex++) + { +// int + +// pfHostProj[iIndex] = (float)rand() / (float)RAND_MAX; + } + } + + writeToMatlabFile("proj.mat", pfHostProj, iProjectionCount, iDetectorCount); + + float * pfDevProj = NULL; + SAFE_CALL(cudaMalloc((void **)&pfDevProj, sizeof(float) * iTotalElementCount)); + SAFE_CALL(cudaMemcpy(pfDevProj, pfHostProj, sizeof(float) * iTotalElementCount, cudaMemcpyHostToDevice)); + + cufftComplex * pDevFourProj = NULL; + SAFE_CALL(cudaMalloc((void **)&pDevFourProj, sizeof(cufftComplex) * iTotalElementCount)); + + cufftHandle plan; + cufftResult result; + + result = cufftPlan1d(&plan, iDetectorCount, CUFFT_R2C, iProjectionCount); + if(result != CUFFT_SUCCESS) + { + cerr << "Failed to plan 1d r2c fft" << endl; + } + + result = cufftExecR2C(plan, pfDevProj, pDevFourProj); + if(result != CUFFT_SUCCESS) + { + cerr << "Failed to exec 1d r2c fft" << endl; + } + + cufftDestroy(plan); + + doubleFourierOutput(iProjectionCount, iDetectorCount, pDevFourProj); + + cufftComplex * pHostFourProj = new cufftComplex[iTotalElementCount]; + SAFE_CALL(cudaMemcpy(pHostFourProj, pDevFourProj, sizeof(cufftComplex) * iTotalElementCount, cudaMemcpyDeviceToHost)); + + float * pfHostFourProjReal = new float[iTotalElementCount]; + float * pfHostFourProjImaginary = new float[iTotalElementCount]; + + convertComplexToRealImg(pHostFourProj, iTotalElementCount, pfHostFourProjReal, pfHostFourProjImaginary); + + writeToMatlabFile("proj_four_real.mat", pfHostFourProjReal, iProjectionCount, iDetectorCount); + writeToMatlabFile("proj_four_imaginary.mat", pfHostFourProjImaginary, iProjectionCount, iDetectorCount); + + float * pfDevInFourProj = NULL; + SAFE_CALL(cudaMalloc((void **)&pfDevInFourProj, sizeof(float) * iTotalElementCount)); + + result = cufftPlan1d(&plan, iDetectorCount, CUFFT_C2R, iProjectionCount); + if(result != CUFFT_SUCCESS) + { + cerr << "Failed to plan 1d c2r fft" << endl; + } + + result = cufftExecC2R(plan, pDevFourProj, pfDevInFourProj); + if(result != CUFFT_SUCCESS) + { + cerr << "Failed to exec 1d c2r fft" << endl; + } + + cufftDestroy(plan); + + rescaleInverseFourier(iProjectionCount, iDetectorCount, pfDevInFourProj); + + float * pfHostInFourProj = new float[iTotalElementCount]; + SAFE_CALL(cudaMemcpy(pfHostInFourProj, pfDevInFourProj, sizeof(float) * iTotalElementCount, cudaMemcpyDeviceToHost)); + + writeToMatlabFile("in_four.mat", pfHostInFourProj, iProjectionCount, iDetectorCount); + + SAFE_CALL(cudaFree(pDevFourProj)); + SAFE_CALL(cudaFree(pfDevProj)); + + delete [] pfHostInFourProj; + delete [] pfHostFourProjReal; + delete [] pfHostFourProjImaginary; + delete [] pfHostProj; + delete [] pHostFourProj; +} + +void downloadDebugFilterComplex(float * _pfHostSinogram, int _iProjectionCount, + int _iDetectorCount, + cufftComplex * _pDevFilter, + int _iFilterDetCount) +{ + cufftComplex * pHostFilter = NULL; + size_t complMemSize = sizeof(cufftComplex) * _iFilterDetCount * _iProjectionCount; + pHostFilter = (cufftComplex *)malloc(complMemSize); + SAFE_CALL(cudaMemcpy(pHostFilter, _pDevFilter, complMemSize, cudaMemcpyDeviceToHost)); + + for(int iTargetProjIndex = 0; iTargetProjIndex < _iProjectionCount; iTargetProjIndex++) + { + for(int iTargetDetIndex = 0; iTargetDetIndex < min(_iDetectorCount, _iFilterDetCount); iTargetDetIndex++) + { + cufftComplex source = pHostFilter[iTargetDetIndex + iTargetProjIndex * _iFilterDetCount]; + float fReadValue = sqrtf(source.x * source.x + source.y * source.y); + _pfHostSinogram[iTargetDetIndex + iTargetProjIndex * _iDetectorCount] = fReadValue; + } + } + + free(pHostFilter); +} + +void downloadDebugFilterReal(float * _pfHostSinogram, int _iProjectionCount, + int _iDetectorCount, float * _pfDevFilter, + int _iFilterDetCount) +{ + float * pfHostFilter = NULL; + size_t memSize = sizeof(float) * _iFilterDetCount * _iProjectionCount; + pfHostFilter = (float *)malloc(memSize); + SAFE_CALL(cudaMemcpy(pfHostFilter, _pfDevFilter, memSize, cudaMemcpyDeviceToHost)); + + for(int iTargetProjIndex = 0; iTargetProjIndex < _iProjectionCount; iTargetProjIndex++) + { + for(int iTargetDetIndex = 0; iTargetDetIndex < min(_iDetectorCount, _iFilterDetCount); iTargetDetIndex++) + { + float fSource = pfHostFilter[iTargetDetIndex + iTargetProjIndex * _iFilterDetCount]; + _pfHostSinogram[iTargetDetIndex + iTargetProjIndex * _iDetectorCount] = fSource; + } + } + + free(pfHostFilter); +} + + +#endif diff --git a/cuda/2d/fft.h b/cuda/2d/fft.h new file mode 100644 index 0000000..55324e5 --- /dev/null +++ b/cuda/2d/fft.h @@ -0,0 +1,69 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef FFT_H +#define FFT_H + +#include <cufft.h> +#include <cuda.h> + +#include "fbp_filters.h" + +bool allocateComplexOnDevice(int _iProjectionCount, + int _iDetectorCount, + cufftComplex ** _ppDevComplex); + +bool freeComplexOnDevice(cufftComplex * _pDevComplex); + +bool uploadComplexArrayToDevice(int _iProjectionCount, int _iDetectorCount, + cufftComplex * _pHostComplexSource, + cufftComplex * _pDevComplexTarget); + +bool runCudaFFT(int _iProjectionCount, const float * _pfDevRealSource, + int _iSourcePitch, int _iSourcePadX, int _iProjDets, + int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount, + cufftComplex * _pDevTargetComplex); + +bool runCudaIFFT(int _iProjectionCount, const cufftComplex* _pDevSourceComplex, + float * _pfRealTarget, + int _iTargetPitch, int _iTargetPadX, int _iProjDets, + int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount); + +void applyFilter(int _iProjectionCount, int _iFreqBinCount, + cufftComplex * _pSinogram, cufftComplex * _pFilter); + +int calcFFTFourSize(int _iFFTRealSize); + +void genFilter(E_FBPFILTER _eFilter, float _fD, int _iProjectionCount, + cufftComplex * _pFilter, int _iFFTRealDetectorCount, + int _iFFTFourierDetectorCount, float _fParameter = -1.0f); + +void genIdenFilter(int _iProjectionCount, cufftComplex * _pFilter, + int _iFFTRealDetectorCount, int _iFFTFourierDetectorCount); + +#endif /* FFT_H */ diff --git a/cuda/2d/par_bp.cu b/cuda/2d/par_bp.cu new file mode 100644 index 0000000..1057879 --- /dev/null +++ b/cuda/2d/par_bp.cu @@ -0,0 +1,357 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> + +#include "util.h" +#include "arith.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +#define PIXELTRACE + + +typedef texture<float, 2, cudaReadModeElementType> texture2D; + +static texture2D gT_projTexture; + + +namespace astraCUDA { + +const unsigned int g_anglesPerBlock = 16; +const unsigned int g_blockSliceSize = 32; +const unsigned int g_blockSlices = 16; + +const unsigned int g_MaxAngles = 2560; + +__constant__ float gC_angle_sin[g_MaxAngles]; +__constant__ float gC_angle_cos[g_MaxAngles]; +__constant__ float gC_angle_offset[g_MaxAngles]; + +static bool bindProjDataTexture(float* data, unsigned int pitch, unsigned int width, unsigned int height) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + + gT_projTexture.addressMode[0] = cudaAddressModeClamp; + gT_projTexture.addressMode[1] = cudaAddressModeClamp; + gT_projTexture.filterMode = cudaFilterModeLinear; + gT_projTexture.normalized = false; + + cudaBindTexture2D(0, gT_projTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch); + + // TODO: error value? + + return true; +} + +__global__ void devBP(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims) +{ + const int relX = threadIdx.x; + const int relY = threadIdx.y; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + const int X = blockIdx.x * g_blockSlices + relX; + const int Y = blockIdx.y * g_blockSliceSize + relY; + + if (X >= dims.iVolWidth || Y >= dims.iVolHeight) + return; + + const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ) / dims.fDetScale; + const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f ) / dims.fDetScale; + + float* volData = (float*)D_volData; + + float fVal = 0.0f; + float fA = startAngle + 0.5f; + const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f; + + if (offsets) { + + for (int angle = startAngle; angle < endAngle; ++angle) + { + const float cos_theta = gC_angle_cos[angle]; + const float sin_theta = gC_angle_sin[angle]; + const float TOffset = gC_angle_offset[angle]; + + const float fT = fT_base + fX * cos_theta - fY * sin_theta + TOffset; + fVal += tex2D(gT_projTexture, fT, fA); + fA += 1.0f; + } + + } else { + + for (int angle = startAngle; angle < endAngle; ++angle) + { + const float cos_theta = gC_angle_cos[angle]; + const float sin_theta = gC_angle_sin[angle]; + + const float fT = fT_base + fX * cos_theta - fY * sin_theta; + fVal += tex2D(gT_projTexture, fT, fA); + fA += 1.0f; + } + + } + + volData[(Y+1)*volPitch+X+1] += fVal; +} + +// supersampling version +__global__ void devBP_SS(float* D_volData, unsigned int volPitch, unsigned int startAngle, bool offsets, const SDimensions dims) +{ + const int relX = threadIdx.x; + const int relY = threadIdx.y; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + const int X = blockIdx.x * g_blockSlices + relX; + const int Y = blockIdx.y * g_blockSliceSize + relY; + + if (X >= dims.iVolWidth || Y >= dims.iVolHeight) + return; + + const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim) / dims.fDetScale; + const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f - 0.5f + 0.5f/dims.iRaysPerPixelDim) / dims.fDetScale; + + const float fSubStep = 1.0f/(dims.iRaysPerPixelDim * dims.fDetScale); + + float* volData = (float*)D_volData; + + float fVal = 0.0f; + float fA = startAngle + 0.5f; + const float fT_base = 0.5f*dims.iProjDets - 0.5f + 1.5f; + + if (offsets) { + + for (int angle = startAngle; angle < endAngle; ++angle) + { + const float cos_theta = gC_angle_cos[angle]; + const float sin_theta = gC_angle_sin[angle]; + const float TOffset = gC_angle_offset[angle]; + + float fT = fT_base + fX * cos_theta - fY * sin_theta + TOffset; + + for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) { + float fTy = fT; + fT += fSubStep * cos_theta; + for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) { + fVal += tex2D(gT_projTexture, fTy, fA); + fTy -= fSubStep * sin_theta; + } + } + fA += 1.0f; + } + + } else { + + for (int angle = startAngle; angle < endAngle; ++angle) + { + const float cos_theta = gC_angle_cos[angle]; + const float sin_theta = gC_angle_sin[angle]; + + float fT = fT_base + fX * cos_theta - fY * sin_theta; + + for (int iSubX = 0; iSubX < dims.iRaysPerPixelDim; ++iSubX) { + float fTy = fT; + fT += fSubStep * cos_theta; + for (int iSubY = 0; iSubY < dims.iRaysPerPixelDim; ++iSubY) { + fVal += tex2D(gT_projTexture, fTy, fA); + fTy -= fSubStep * sin_theta; + } + } + fA += 1.0f; + + } + + } + + volData[(Y+1)*volPitch+X+1] += fVal / (dims.iRaysPerPixelDim * dims.iRaysPerPixelDim); +} + +__global__ void devBP_SART(float* D_volData, unsigned int volPitch, float offset, float angle_sin, float angle_cos, const SDimensions dims) +{ + const int relX = threadIdx.x; + const int relY = threadIdx.y; + + const int X = blockIdx.x * g_blockSlices + relX; + const int Y = blockIdx.y * g_blockSliceSize + relY; + + if (X >= dims.iVolWidth || Y >= dims.iVolHeight) + return; + + const float fX = ( X - 0.5f*dims.iVolWidth + 0.5f ) / dims.fDetScale; + const float fY = ( Y - 0.5f*dims.iVolHeight + 0.5f ) / dims.fDetScale; + + const float fT_base = 0.5f*dims.iProjDets - 0.5f + 0.5f; + + const float fT = fT_base + fX * angle_cos - fY * angle_sin + offset; + const float fVal = tex2D(gT_projTexture, fT, 0.5f); + + D_volData[(Y+1)*volPitch+X+1] += fVal; +} + + +bool BP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const float* angles, const float* TOffsets) +{ + // TODO: process angles block by block + assert(dims.iProjAngles <= g_MaxAngles); + + float* angle_sin = new float[dims.iProjAngles]; + float* angle_cos = new float[dims.iProjAngles]; + + bindProjDataTexture(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) { + angle_sin[i] = sinf(angles[i]); + angle_cos[i] = cosf(angles[i]); + } + cudaError_t e1 = cudaMemcpyToSymbol(gC_angle_sin, angle_sin, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + cudaError_t e2 = cudaMemcpyToSymbol(gC_angle_cos, angle_cos, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + assert(e1 == cudaSuccess); + assert(e2 == cudaSuccess); + + if (TOffsets) { + cudaError_t e3 = cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + assert(e3 == cudaSuccess); + } + + delete[] angle_sin; + delete[] angle_cos; + + dim3 dimBlock(g_blockSlices, g_blockSliceSize); + dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices, + (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize); + + cudaStream_t stream; + cudaStreamCreate(&stream); + + for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) { + + if (dims.iRaysPerPixelDim > 1) + devBP_SS<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims); + else + devBP<<<dimGrid, dimBlock, 0, stream>>>(D_volumeData, volumePitch, i, (TOffsets != 0), dims); + } + cudaThreadSynchronize(); + + cudaTextForceKernelsCompletion(); + + cudaStreamDestroy(stream); + + return true; +} + +bool BP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle, const SDimensions& dims, + const float* angles, const float* TOffsets) +{ + // only one angle + bindProjDataTexture(D_projData, projPitch, dims.iProjDets, 1); + + float angle_sin = sinf(angles[angle]); + float angle_cos = cosf(angles[angle]); + + float offset = 0.0f; + if (TOffsets) + offset = TOffsets[angle]; + + dim3 dimBlock(g_blockSlices, g_blockSliceSize); + dim3 dimGrid((dims.iVolWidth+g_blockSlices-1)/g_blockSlices, + (dims.iVolHeight+g_blockSliceSize-1)/g_blockSliceSize); + + devBP_SART<<<dimGrid, dimBlock>>>(D_volumeData, volumePitch, offset, angle_sin, angle_cos, dims); + cudaThreadSynchronize(); + + cudaTextForceKernelsCompletion(); + + return true; +} + + +} + +#ifdef STANDALONE + +using namespace astraCUDA; + +int main() +{ + float* D_volumeData; + float* D_projData; + + SDimensions dims; + dims.iVolWidth = 1024; + dims.iVolHeight = 1024; + dims.iProjAngles = 512; + dims.iProjDets = 1536; + dims.fDetScale = 1.0f; + dims.iRaysPerDet = 1; + + unsigned int volumePitch, projPitch; + + allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + printf("pitch: %u\n", volumePitch); + + allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + printf("pitch: %u\n", projPitch); + + unsigned int y, x; + float* sino = loadImage("sino.png", y, x); + + float* img = new float[dims.iVolWidth*dims.iVolHeight]; + + memset(img, 0, dims.iVolWidth*dims.iVolHeight*sizeof(float)); + + copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); + + float* angle = new float[dims.iProjAngles]; + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) + angle[i] = i*(M_PI/dims.iProjAngles); + + BP(D_volumeData, volumePitch, D_projData, projPitch, dims, angle, 0); + + delete[] angle; + + copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + + saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img); + + return 0; +} +#endif diff --git a/cuda/2d/par_bp.h b/cuda/2d/par_bp.h new file mode 100644 index 0000000..c6dbd59 --- /dev/null +++ b/cuda/2d/par_bp.h @@ -0,0 +1,48 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_PAR_BP_H +#define _CUDA_PAR_BP_H + +#include "dims.h" + +namespace astraCUDA { + +_AstraExport bool BP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets); + +_AstraExport bool BP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle, const SDimensions& dims, + const float* angles, const float* TOffsets); + +} + +#endif diff --git a/cuda/2d/par_fp.cu b/cuda/2d/par_fp.cu new file mode 100644 index 0000000..585cb06 --- /dev/null +++ b/cuda/2d/par_fp.cu @@ -0,0 +1,704 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include "util.h" +#include "arith.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +#define PIXELTRACE + + +typedef texture<float, 2, cudaReadModeElementType> texture2D; + +static texture2D gT_volumeTexture; + + +namespace astraCUDA { + +static const unsigned g_MaxAngles = 2560; +__constant__ float gC_angle[g_MaxAngles]; +__constant__ float gC_angle_offset[g_MaxAngles]; + + +// optimization parameters +static const unsigned int g_anglesPerBlock = 16; +static const unsigned int g_detBlockSize = 32; +static const unsigned int g_blockSlices = 64; + +// fixed point scaling factor +#define fPREC_FACTOR 16.0f +#define iPREC_FACTOR 16 + + +// if necessary, a buffer of zeroes of size g_MaxAngles +static float* g_pfZeroes = 0; + + +static bool bindVolumeDataTexture(float* data, cudaArray*& dataArray, unsigned int pitch, unsigned int width, unsigned int height) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + dataArray = 0; + cudaMallocArray(&dataArray, &channelDesc, width, height); + cudaMemcpy2DToArray(dataArray, 0, 0, data, pitch*sizeof(float), width*sizeof(float), height, cudaMemcpyDeviceToDevice); + + gT_volumeTexture.addressMode[0] = cudaAddressModeClamp; + gT_volumeTexture.addressMode[1] = cudaAddressModeClamp; + gT_volumeTexture.filterMode = cudaFilterModeLinear; + gT_volumeTexture.normalized = false; + + // TODO: For very small sizes (roughly <=512x128) with few angles (<=180) + // not using an array is more efficient. +// cudaBindTexture2D(0, gT_volumeTexture, (const void*)data, channelDesc, width, height, sizeof(float)*pitch); + cudaBindTextureToArray(gT_volumeTexture, dataArray, channelDesc); + + // TODO: error value? + + return true; +} + +// projection for angles that are roughly horizontal +// theta between 45 and 135 degrees +__global__ void FPhorizontal(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale) +{ + const int relDet = threadIdx.x; + const int relAngle = threadIdx.y; + + int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; + + if (angle >= endAngle) + return; + + const float theta = gC_angle[angle]; + const float cos_theta = __cosf(theta); + const float sin_theta = __sinf(theta); + + // compute start detector for this block/angle: + // (The same for all threadIdx.x) + // ------------------------------------- + + const int midSlice = startSlice + g_blockSlices / 2; + + // ASSUMPTION: fDetScale >= 1.0f + // problem: detector regions get skipped because slice blocks aren't large + // enough + const unsigned int g_blockSliceSize = g_detBlockSize; + + // project (midSlice,midRegion) on this thread's detector + + const float fBase = 0.5f*dims.iProjDets - 0.5f + + ( + (midSlice - 0.5f*dims.iVolWidth + 0.5f) * cos_theta + - (g_blockSliceSize/2 - 0.5f*dims.iVolHeight + 0.5f) * sin_theta + + gC_angle_offset[angle] + ) / dims.fDetScale; + int iBase = (int)floorf(fBase * fPREC_FACTOR); + int iInc = (int)floorf(g_blockSliceSize * sin_theta / dims.fDetScale * -fPREC_FACTOR); + + // ASSUMPTION: 16 > regionOffset / fDetScale + const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; + const int detPrevRegion = (iBase + (blockIdx.y - regionOffset - 1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; + + if (blockIdx.y > 0 && detRegion == detPrevRegion) + return; + + const int detector = detRegion * g_detBlockSize + relDet; + + // Now project the part of the ray to angle,detector through + // slices startSlice to startSlice+g_blockSlices-1 + + if (detector < 0 || detector >= dims.iProjDets) + return; + + const float fDetStep = -dims.fDetScale / sin_theta; + float fSliceStep = cos_theta / sin_theta; + float fDistCorr; + if (sin_theta > 0.0f) + fDistCorr = -fDetStep; + else + fDistCorr = fDetStep; + fDistCorr *= outputScale; + + float fVal = 0.0f; + // project detector on slice + float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f; + float fS = startSlice + 1.5f; + int endSlice = startSlice + g_blockSlices; + if (endSlice > dims.iVolWidth) + endSlice = dims.iVolWidth; + + if (dims.iRaysPerDet > 1) { + + fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep; + const float fSubDetStep = fDetStep / dims.iRaysPerDet; + fDistCorr /= dims.iRaysPerDet; + + fSliceStep -= dims.iRaysPerDet * fSubDetStep; + + for (int slice = startSlice; slice < endSlice; ++slice) + { + for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { + fVal += tex2D(gT_volumeTexture, fS, fP); + fP += fSubDetStep; + } + fP += fSliceStep; + fS += 1.0f; + } + + } else { + + for (int slice = startSlice; slice < endSlice; ++slice) + { + fVal += tex2D(gT_volumeTexture, fS, fP); + fP += fSliceStep; + fS += 1.0f; + } + + + } + + D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; +} + +// projection for angles that are roughly vertical +// theta between 0 and 45, or 135 and 180 degrees +__global__ void FPvertical(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, int regionOffset, const SDimensions dims, float outputScale) +{ + const int relDet = threadIdx.x; + const int relAngle = threadIdx.y; + + int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; + + if (angle >= endAngle) + return; + + const float theta = gC_angle[angle]; + const float cos_theta = __cosf(theta); + const float sin_theta = __sinf(theta); + + // compute start detector for this block/angle: + // (The same for all threadIdx.x) + // ------------------------------------- + + const int midSlice = startSlice + g_blockSlices / 2; + + // project (midSlice,midRegion) on this thread's detector + + // ASSUMPTION: fDetScale >= 1.0f + // problem: detector regions get skipped because slice blocks aren't large + // enough + const unsigned int g_blockSliceSize = g_detBlockSize; + + const float fBase = 0.5f*dims.iProjDets - 0.5f + + ( + (g_blockSliceSize/2 - 0.5f*dims.iVolWidth + 0.5f) * cos_theta + - (midSlice - 0.5f*dims.iVolHeight + 0.5f) * sin_theta + + gC_angle_offset[angle] + ) / dims.fDetScale; + int iBase = (int)floorf(fBase * fPREC_FACTOR); + int iInc = (int)floorf(g_blockSliceSize * cos_theta / dims.fDetScale * fPREC_FACTOR); + + // ASSUMPTION: 16 > regionOffset / fDetScale + const int detRegion = (iBase + (blockIdx.y - regionOffset)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; + const int detPrevRegion = (iBase + (blockIdx.y - regionOffset-1)*iInc + 16*iPREC_FACTOR*g_detBlockSize) / (iPREC_FACTOR * g_detBlockSize) - 16; + + if (blockIdx.y > 0 && detRegion == detPrevRegion) + return; + + const int detector = detRegion * g_detBlockSize + relDet; + + // Now project the part of the ray to angle,detector through + // slices startSlice to startSlice+g_blockSlices-1 + + if (detector < 0 || detector >= dims.iProjDets) + return; + + const float fDetStep = dims.fDetScale / cos_theta; + float fSliceStep = sin_theta / cos_theta; + float fDistCorr; + if (cos_theta < 0.0f) + fDistCorr = -fDetStep; + else + fDistCorr = fDetStep; + fDistCorr *= outputScale; + + float fVal = 0.0f; + float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f; + float fS = startSlice+1.5f; + int endSlice = startSlice + g_blockSlices; + if (endSlice > dims.iVolHeight) + endSlice = dims.iVolHeight; + + if (dims.iRaysPerDet > 1) { + + fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep; + const float fSubDetStep = fDetStep / dims.iRaysPerDet; + fDistCorr /= dims.iRaysPerDet; + + fSliceStep -= dims.iRaysPerDet * fSubDetStep; + + for (int slice = startSlice; slice < endSlice; ++slice) + { + for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { + fVal += tex2D(gT_volumeTexture, fP, fS); + fP += fSubDetStep; + } + fP += fSliceStep; + fS += 1.0f; + } + + } else { + + for (int slice = startSlice; slice < endSlice; ++slice) + { + fVal += tex2D(gT_volumeTexture, fP, fS); + fP += fSliceStep; + fS += 1.0f; + } + + } + + D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; +} + +// projection for angles that are roughly horizontal +// (detector roughly vertical) +__global__ void FPhorizontal_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) +{ + const int relDet = threadIdx.x; + const int relAngle = threadIdx.y; + + int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; + + if (angle >= endAngle) + return; + + const float theta = gC_angle[angle]; + const float cos_theta = __cosf(theta); + const float sin_theta = __sinf(theta); + + // compute start detector for this block/angle: + const int detRegion = blockIdx.y; + + const int detector = detRegion * g_detBlockSize + relDet; + + // Now project the part of the ray to angle,detector through + // slices startSlice to startSlice+g_blockSlices-1 + + if (detector < 0 || detector >= dims.iProjDets) + return; + + const float fDetStep = -dims.fDetScale / sin_theta; + float fSliceStep = cos_theta / sin_theta; + float fDistCorr; + if (sin_theta > 0.0f) + fDistCorr = -fDetStep; + else + fDistCorr = fDetStep; + fDistCorr *= outputScale; + + float fVal = 0.0f; + // project detector on slice + float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolWidth + 0.5f) * fSliceStep + 0.5f*dims.iVolHeight - 0.5f + 1.5f; + float fS = startSlice + 1.5f; + int endSlice = startSlice + g_blockSlices; + if (endSlice > dims.iVolWidth) + endSlice = dims.iVolWidth; + + if (dims.iRaysPerDet > 1) { + + fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep; + const float fSubDetStep = fDetStep / dims.iRaysPerDet; + fDistCorr /= dims.iRaysPerDet; + + fSliceStep -= dims.iRaysPerDet * fSubDetStep; + + for (int slice = startSlice; slice < endSlice; ++slice) + { + for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { + fVal += tex2D(gT_volumeTexture, fS, fP); + fP += fSubDetStep; + } + fP += fSliceStep; + fS += 1.0f; + } + + } else { + + for (int slice = startSlice; slice < endSlice; ++slice) + { + fVal += tex2D(gT_volumeTexture, fS, fP); + fP += fSliceStep; + fS += 1.0f; + } + + + } + + D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; +} + + +// projection for angles that are roughly vertical +// (detector roughly horizontal) +__global__ void FPvertical_simple(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions dims, float outputScale) +{ + const int relDet = threadIdx.x; + const int relAngle = threadIdx.y; + + int angle = startAngle + blockIdx.x * g_anglesPerBlock + relAngle; + + if (angle >= endAngle) + return; + + const float theta = gC_angle[angle]; + const float cos_theta = __cosf(theta); + const float sin_theta = __sinf(theta); + + // compute start detector for this block/angle: + const int detRegion = blockIdx.y; + + const int detector = detRegion * g_detBlockSize + relDet; + + // Now project the part of the ray to angle,detector through + // slices startSlice to startSlice+g_blockSlices-1 + + if (detector < 0 || detector >= dims.iProjDets) + return; + + const float fDetStep = dims.fDetScale / cos_theta; + float fSliceStep = sin_theta / cos_theta; + float fDistCorr; + if (cos_theta < 0.0f) + fDistCorr = -fDetStep; + else + fDistCorr = fDetStep; + fDistCorr *= outputScale; + + float fVal = 0.0f; + float fP = (detector - 0.5f*dims.iProjDets + 0.5f - gC_angle_offset[angle]) * fDetStep + (startSlice - 0.5f*dims.iVolHeight + 0.5f) * fSliceStep + 0.5f*dims.iVolWidth - 0.5f + 1.5f; + float fS = startSlice+1.5f; + int endSlice = startSlice + g_blockSlices; + if (endSlice > dims.iVolHeight) + endSlice = dims.iVolHeight; + + if (dims.iRaysPerDet > 1) { + + fP += (-0.5f*dims.iRaysPerDet + 0.5f)/dims.iRaysPerDet * fDetStep; + const float fSubDetStep = fDetStep / dims.iRaysPerDet; + fDistCorr /= dims.iRaysPerDet; + + fSliceStep -= dims.iRaysPerDet * fSubDetStep; + + for (int slice = startSlice; slice < endSlice; ++slice) + { + for (int iSubT = 0; iSubT < dims.iRaysPerDet; ++iSubT) { + fVal += tex2D(gT_volumeTexture, fP, fS); + fP += fSubDetStep; + } + fP += fSliceStep; + fS += 1.0f; + } + + } else { + + for (int slice = startSlice; slice < endSlice; ++slice) + { + fVal += tex2D(gT_volumeTexture, fP, fS); + fP += fSliceStep; + fS += 1.0f; + } + + } + + D_projData[angle*projPitch+detector+1] += fVal * fDistCorr; +} + + + +bool FP_simple(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, float outputScale) +{ + // TODO: load angles into constant memory in smaller blocks + assert(dims.iProjAngles <= g_MaxAngles); + + cudaArray* D_dataArray; + bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + + cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + + if (TOffsets) { + cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + } else { + if (!g_pfZeroes) { + g_pfZeroes = new float[g_MaxAngles]; + memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float)); + } + cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + } + + dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // detector block size, angles + + std::list<cudaStream_t> streams; + + + // Run over all angles, grouping them into groups of the same + // orientation (roughly horizontal vs. roughly vertical). + // Start a stream of grids for each such group. + + // TODO: Check if it's worth it to store this info instead + // of recomputing it every FP. + + unsigned int blockStart = 0; + unsigned int blockEnd = 0; + bool blockVertical = false; + for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { + bool vertical; + // TODO: Having <= instead of < below causes a 5% speedup. + // Maybe we should detect corner cases and put them in the optimal + // group of angles. + if (a != dims.iProjAngles) + vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a]))); + if (a == dims.iProjAngles || vertical != blockVertical) { + // block done + + blockEnd = a; + if (blockStart != blockEnd) { + dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock, + (dims.iProjDets+g_detBlockSize-1)/g_detBlockSize); // angle blocks, detector blocks + + // TODO: check if we can't immediately + // destroy the stream after use + cudaStream_t stream; + cudaStreamCreate(&stream); + streams.push_back(stream); + //printf("angle block: %d to %d, %d\n", blockStart, blockEnd, blockVertical); + if (!blockVertical) + for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices) + FPhorizontal_simple<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale); + else + for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices) + FPvertical_simple<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, dims, outputScale); + } + blockVertical = vertical; + blockStart = a; + } + } + + for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) + cudaStreamDestroy(*iter); + + streams.clear(); + + cudaThreadSynchronize(); + + cudaTextForceKernelsCompletion(); + + cudaFreeArray(D_dataArray); + + + return true; +} + +bool FP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, float outputScale) +{ + return FP_simple(D_volumeData, volumePitch, D_projData, projPitch, + dims, angles, TOffsets, outputScale); + + // TODO: Fix bug in this non-simple FP with large detscale and TOffsets +#if 0 + + // TODO: load angles into constant memory in smaller blocks + assert(dims.iProjAngles <= g_MaxAngles); + + // TODO: compute region size dynamically to resolve these two assumptions + // ASSUMPTION: 16 > regionOffset / fDetScale + const unsigned int g_blockSliceSize = g_detBlockSize; + assert(16 > (g_blockSlices / g_blockSliceSize) / dims.fDetScale); + // ASSUMPTION: fDetScale >= 1.0f + assert(dims.fDetScale > 0.9999f); + + cudaArray* D_dataArray; + bindVolumeDataTexture(D_volumeData, D_dataArray, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + + cudaMemcpyToSymbol(gC_angle, angles, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + + if (TOffsets) { + cudaMemcpyToSymbol(gC_angle_offset, TOffsets, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + } else { + if (!g_pfZeroes) { + g_pfZeroes = new float[g_MaxAngles]; + memset(g_pfZeroes, 0, g_MaxAngles * sizeof(float)); + } + cudaMemcpyToSymbol(gC_angle_offset, g_pfZeroes, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + } + + int regionOffset = g_blockSlices / g_blockSliceSize; + + dim3 dimBlock(g_detBlockSize, g_anglesPerBlock); // region size, angles + + std::list<cudaStream_t> streams; + + + // Run over all angles, grouping them into groups of the same + // orientation (roughly horizontal vs. roughly vertical). + // Start a stream of grids for each such group. + + // TODO: Check if it's worth it to store this info instead + // of recomputing it every FP. + + unsigned int blockStart = 0; + unsigned int blockEnd = 0; + bool blockVertical = false; + for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { + bool vertical; + // TODO: Having <= instead of < below causes a 5% speedup. + // Maybe we should detect corner cases and put them in the optimal + // group of angles. + if (a != dims.iProjAngles) + vertical = (fabsf(sinf(angles[a])) <= fabsf(cosf(angles[a]))); + if (a == dims.iProjAngles || vertical != blockVertical) { + // block done + + blockEnd = a; + if (blockStart != blockEnd) { + unsigned int length = dims.iVolHeight; + if (blockVertical) + length = dims.iVolWidth; + dim3 dimGrid((blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock, + (length+g_blockSliceSize-1)/g_blockSliceSize+2*regionOffset); // angle blocks, regions + // TODO: check if we can't immediately + // destroy the stream after use + cudaStream_t stream; + cudaStreamCreate(&stream); + streams.push_back(stream); + //printf("angle block: %d to %d, %d\n", blockStart, blockEnd, blockVertical); + if (!blockVertical) + for (unsigned int i = 0; i < dims.iVolWidth; i += g_blockSlices) + FPhorizontal<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale); + else + for (unsigned int i = 0; i < dims.iVolHeight; i += g_blockSlices) + FPvertical<<<dimGrid, dimBlock, 0, stream>>>(D_projData, projPitch, i, blockStart, blockEnd, regionOffset, dims, outputScale); + } + blockVertical = vertical; + blockStart = a; + } + } + + for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) + cudaStreamDestroy(*iter); + + streams.clear(); + + cudaThreadSynchronize(); + + cudaTextForceKernelsCompletion(); + + cudaFreeArray(D_dataArray); + + + return true; +#endif +} + + +} + +#ifdef STANDALONE + +using namespace astraCUDA; + +int main() +{ + float* D_volumeData; + float* D_projData; + + SDimensions dims; + dims.iVolWidth = 1024; + dims.iVolHeight = 1024; + dims.iProjAngles = 512; + dims.iProjDets = 1536; + dims.fDetScale = 1.0f; + dims.iRaysPerDet = 1; + unsigned int volumePitch, projPitch; + + allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + printf("pitch: %u\n", volumePitch); + + allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + printf("pitch: %u\n", projPitch); + + unsigned int y, x; + float* img = loadImage("phantom.png", y, x); + + float* sino = new float[dims.iProjAngles * dims.iProjDets]; + + memset(sino, 0, dims.iProjAngles * dims.iProjDets * sizeof(float)); + + copyVolumeToDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); + + float* angle = new float[dims.iProjAngles]; + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) + angle[i] = i*(M_PI/dims.iProjAngles); + + FP(D_volumeData, volumePitch, D_projData, projPitch, dims, angle, 0, 1.0f); + + delete[] angle; + + copySinogramFromDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_projData, projPitch); + + float s = 0.0f; + for (unsigned int y = 0; y < dims.iProjAngles; ++y) + for (unsigned int x = 0; x < dims.iProjDets; ++x) + s += sino[y*dims.iProjDets+x] * sino[y*dims.iProjDets+x]; + printf("cpu norm: %f\n", s); + + //zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + printf("gpu norm: %f\n", s); + + saveImage("sino.png",dims.iProjAngles,dims.iProjDets,sino); + + + return 0; +} +#endif diff --git a/cuda/2d/par_fp.h b/cuda/2d/par_fp.h new file mode 100644 index 0000000..3213b14 --- /dev/null +++ b/cuda/2d/par_fp.h @@ -0,0 +1,41 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_PAR_FP_H +#define _CUDA_PAR_FP_H + +namespace astraCUDA { + +_AstraExport bool FP(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, float fOutputScale); + +} + +#endif diff --git a/cuda/2d/sart.cu b/cuda/2d/sart.cu new file mode 100644 index 0000000..a40176d --- /dev/null +++ b/cuda/2d/sart.cu @@ -0,0 +1,283 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "sart.h" +#include "util.h" +#include "arith.h" +#include "fan_fp.h" +#include "fan_bp.h" +#include "par_fp.h" +#include "par_bp.h" + +namespace astraCUDA { + + +__global__ void devMUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + // Copy result down and left one pixel. + pfOut[x + pitch] = pfOut[x + 1] * pfIn[x + 1]; +} + +void MUL_SART(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, 1); + + devMUL_SART<<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width); + + cudaTextForceKernelsCompletion(); +} + + + +SART::SART() : ReconAlgo() +{ + D_projData = 0; + D_tmpData = 0; + + D_lineWeight = 0; + + projectionOrder = 0; + projectionCount = 0; + iteration = 0; + customOrder = false; +} + + +SART::~SART() +{ + reset(); +} + +void SART::reset() +{ + cudaFree(D_projData); + cudaFree(D_tmpData); + cudaFree(D_lineWeight); + + D_projData = 0; + D_tmpData = 0; + + D_lineWeight = 0; + + useVolumeMask = false; + useSinogramMask = false; + + if (projectionOrder != NULL) delete[] projectionOrder; + projectionOrder = 0; + projectionCount = 0; + iteration = 0; + customOrder = false; + + ReconAlgo::reset(); +} + +bool SART::init() +{ + if (useVolumeMask) { + allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + } + + // HACK: D_projData consists of two lines. The first is used padded, + // the second unpadded. This is to satisfy the alignment requirements + // of resp. FP and BP_SART. + allocateVolume(D_projData, dims.iProjDets+2, 2, projPitch); + zeroVolume(D_projData, projPitch, dims.iProjDets+2, 1); + + allocateVolume(D_lineWeight, dims.iProjDets+2, dims.iProjAngles, linePitch); + zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); + + // We can't precompute lineWeights when using a mask + if (!useVolumeMask) + precomputeWeights(); + + // TODO: check if allocations succeeded + return true; +} + +bool SART::setProjectionOrder(int* _projectionOrder, int _projectionCount) +{ + customOrder = true; + projectionCount = _projectionCount; + projectionOrder = new int[projectionCount]; + for (int i = 0; i < projectionCount; i++) { + projectionOrder[i] = _projectionOrder[i]; + } + + return true; +} + + +bool SART::precomputeWeights() +{ + zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); + if (useVolumeMask) { + callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f); + } else { + // Allocate tmpData temporarily + allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + + + processVol<opSet, VOL>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f); + + + cudaFree(D_tmpData); + D_tmpData = 0; + } + processVol<opInvert, SINO>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); + + return true; +} + +bool SART::iterate(unsigned int iterations) +{ + shouldAbort = false; + + if (useVolumeMask) + precomputeWeights(); + + // iteration + for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) { + + int angle; + if (customOrder) { + angle = projectionOrder[iteration % projectionCount]; + } else { + angle = iteration % dims.iProjAngles; + } + + // copy one line of sinogram to projection data + cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData + angle*sinoPitch, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), 1, cudaMemcpyDeviceToDevice); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); + processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + callFP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle, -1.0f); + } else { + callFP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle, -1.0f); + } + + MUL_SART(D_projData, D_lineWeight + angle*linePitch, projPitch, dims.iProjDets); + + if (useVolumeMask) { + // BP, mask, and add back + // TODO: Try putting the masking directly in the BP + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + callBP_SART(D_tmpData, tmpPitch, D_projData, projPitch, angle); + processVol<opAddMul, VOL>(D_volumeData, D_maskData, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight); + } else { + callBP_SART(D_volumeData, volumePitch, D_projData, projPitch, angle); + } + + if (useMinConstraint) + processVol<opClampMin, VOL>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + if (useMaxConstraint) + processVol<opClampMax, VOL>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + + iteration++; + + } + + return true; +} + +float SART::computeDiffNorm() +{ + unsigned int pPitch; + float *D_p; + allocateVolume(D_p, dims.iProjDets+2, dims.iProjAngles, pPitch); + zeroVolume(D_p, pPitch, dims.iProjDets+2, dims.iProjAngles); + + // copy sinogram to D_p + cudaMemcpy2D(D_p, sizeof(float)*pPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); + processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); + } else { + callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); + } + + + // compute norm of D_p + float s = dotProduct2D(D_p, pPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + + cudaFree(D_p); + + return sqrt(s); +} + +bool SART::callFP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle, float outputScale) +{ + SDimensions d = dims; + d.iProjAngles = 1; + if (angles) { + assert(!fanProjs); + return FP(D_volumeData, volumePitch, D_projData, projPitch, + d, &angles[angle], TOffsets, outputScale); + } else { + assert(fanProjs); + return FanFP(D_volumeData, volumePitch, D_projData, projPitch, + d, &fanProjs[angle], outputScale); + } +} + +bool SART::callBP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle) +{ + if (angles) { + assert(!fanProjs); + return BP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch, + angle, dims, angles, TOffsets); + } else { + assert(fanProjs); + return FanBP_SART(D_volumeData, volumePitch, D_projData + projPitch, projPitch, + angle, dims, fanProjs); + } + +} + + +} + + diff --git a/cuda/2d/sart.h b/cuda/2d/sart.h new file mode 100644 index 0000000..ad80259 --- /dev/null +++ b/cuda/2d/sart.h @@ -0,0 +1,85 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_SART_H +#define _CUDA_SART_H + +#include "util.h" +#include "algo.h" + +namespace astraCUDA { + +class _AstraExport SART : public ReconAlgo { +public: + SART(); + ~SART(); + + // disable some features + virtual bool enableSinogramMask() { return false; } + + virtual bool init(); + + virtual bool setProjectionOrder(int* projectionOrder, int projectionCount); + + virtual bool iterate(unsigned int iterations); + + virtual float computeDiffNorm(); + +protected: + void reset(); + bool precomputeWeights(); + + bool callFP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle, float outputScale); + bool callBP_SART(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + unsigned int angle); + + + // projection angle variables + bool customOrder; + int* projectionOrder; + int projectionCount; + int iteration; + + // Temporary buffers + float* D_projData; + unsigned int projPitch; + + float* D_tmpData; // Only used when there's a volume mask + unsigned int tmpPitch; + + // Geometry-specific precomputed data + float* D_lineWeight; + unsigned int linePitch; +}; + +} + +#endif diff --git a/cuda/2d/sirt.cu b/cuda/2d/sirt.cu new file mode 100644 index 0000000..31954e4 --- /dev/null +++ b/cuda/2d/sirt.cu @@ -0,0 +1,342 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "sirt.h" +#include "util.h" +#include "arith.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +namespace astraCUDA { + +SIRT::SIRT() : ReconAlgo() +{ + D_projData = 0; + D_tmpData = 0; + + D_lineWeight = 0; + D_pixelWeight = 0; + + D_minMaskData = 0; + D_maxMaskData = 0; + + freeMinMaxMasks = false; +} + + +SIRT::~SIRT() +{ + reset(); +} + +void SIRT::reset() +{ + cudaFree(D_projData); + cudaFree(D_tmpData); + cudaFree(D_lineWeight); + cudaFree(D_pixelWeight); + if (freeMinMaxMasks) { + cudaFree(D_minMaskData); + cudaFree(D_maxMaskData); + } + + D_projData = 0; + D_tmpData = 0; + + D_lineWeight = 0; + D_pixelWeight = 0; + + freeMinMaxMasks = false; + D_minMaskData = 0; + D_maxMaskData = 0; + + useVolumeMask = false; + useSinogramMask = false; + + ReconAlgo::reset(); +} + +bool SIRT::init() +{ + allocateVolume(D_pixelWeight, dims.iVolWidth+2, dims.iVolHeight+2, pixelPitch); + zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); + + allocateVolume(D_tmpData, dims.iVolWidth+2, dims.iVolHeight+2, tmpPitch); + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + + allocateVolume(D_projData, dims.iProjDets+2, dims.iProjAngles, projPitch); + zeroVolume(D_projData, projPitch, dims.iProjDets+2, dims.iProjAngles); + + allocateVolume(D_lineWeight, dims.iProjDets+2, dims.iProjAngles, linePitch); + zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); + + // We can't precompute lineWeights and pixelWeights when using a mask + if (!useVolumeMask && !useSinogramMask) + precomputeWeights(); + + // TODO: check if allocations succeeded + return true; +} + +bool SIRT::precomputeWeights() +{ + zeroVolume(D_lineWeight, linePitch, dims.iProjDets+2, dims.iProjAngles); + if (useVolumeMask) { + callFP(D_maskData, maskPitch, D_lineWeight, linePitch, 1.0f); + } else { + processVol<opSet, VOL>(D_tmpData, 1.0f, tmpPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_tmpData, tmpPitch, D_lineWeight, linePitch, 1.0f); + } + processVol<opInvert, SINO>(D_lineWeight, linePitch, dims.iProjDets, dims.iProjAngles); + + if (useSinogramMask) { + // scale line weights with sinogram mask to zero out masked sinogram pixels + processVol<opMul, SINO>(D_lineWeight, D_smaskData, linePitch, dims.iProjDets, dims.iProjAngles); + } + + + zeroVolume(D_pixelWeight, pixelPitch, dims.iVolWidth+2, dims.iVolHeight+2); + if (useSinogramMask) { + callBP(D_pixelWeight, pixelPitch, D_smaskData, smaskPitch); + } else { + processVol<opSet, SINO>(D_projData, 1.0f, projPitch, dims.iProjDets, dims.iProjAngles); + callBP(D_pixelWeight, pixelPitch, D_projData, projPitch); + } + processVol<opInvert, VOL>(D_pixelWeight, pixelPitch, dims.iVolWidth, dims.iVolHeight); + + if (useVolumeMask) { + // scale pixel weights with mask to zero out masked pixels + processVol<opMul, VOL>(D_pixelWeight, D_maskData, pixelPitch, dims.iVolWidth, dims.iVolHeight); + } + + return true; +} + +bool SIRT::setMinMaxMasks(float* D_minMaskData_, float* D_maxMaskData_, + unsigned int iPitch) +{ + D_minMaskData = D_minMaskData_; + D_maxMaskData = D_maxMaskData_; + minMaskPitch = iPitch; + maxMaskPitch = iPitch; + + freeMinMaxMasks = false; + return true; +} + +bool SIRT::uploadMinMaxMasks(const float* pfMinMaskData, const float* pfMaxMaskData, + unsigned int iPitch) +{ + freeMinMaxMasks = true; + bool ok = true; + if (pfMinMaskData) { + allocateVolume(D_minMaskData, dims.iVolWidth+2, dims.iVolHeight+2, minMaskPitch); + ok = copyVolumeToDevice(pfMinMaskData, iPitch, + dims.iVolWidth, dims.iVolHeight, + D_minMaskData, minMaskPitch); + } + if (!ok) + return false; + + if (pfMaxMaskData) { + allocateVolume(D_maxMaskData, dims.iVolWidth+2, dims.iVolHeight+2, maxMaskPitch); + ok = copyVolumeToDevice(pfMaxMaskData, iPitch, + dims.iVolWidth, dims.iVolHeight, + D_maxMaskData, maxMaskPitch); + } + if (!ok) + return false; + + return true; +} + +bool SIRT::iterate(unsigned int iterations) +{ + shouldAbort = false; + + if (useVolumeMask || useSinogramMask) + precomputeWeights(); + + // iteration + for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) { + + // copy sinogram to projection data + cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); + processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); + } else { + callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); + } + + processVol<opMul, SINO>(D_projData, D_lineWeight, projPitch, dims.iProjDets, dims.iProjAngles); + + zeroVolume(D_tmpData, tmpPitch, dims.iVolWidth+2, dims.iVolHeight+2); + + callBP(D_tmpData, tmpPitch, D_projData, projPitch); + + processVol<opAddMul, VOL>(D_volumeData, D_pixelWeight, D_tmpData, volumePitch, dims.iVolWidth, dims.iVolHeight); + + if (useMinConstraint) + processVol<opClampMin, VOL>(D_volumeData, fMinConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + if (useMaxConstraint) + processVol<opClampMax, VOL>(D_volumeData, fMaxConstraint, volumePitch, dims.iVolWidth, dims.iVolHeight); + if (D_minMaskData) + processVol<opClampMinMask, VOL>(D_volumeData, D_minMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight); + if (D_maxMaskData) + processVol<opClampMaxMask, VOL>(D_volumeData, D_maxMaskData, volumePitch, dims.iVolWidth, dims.iVolHeight); + } + + return true; +} + +float SIRT::computeDiffNorm() +{ + // copy sinogram to projection data + cudaMemcpy2D(D_projData, sizeof(float)*projPitch, D_sinoData, sizeof(float)*sinoPitch, sizeof(float)*(dims.iProjDets+2), dims.iProjAngles, cudaMemcpyDeviceToDevice); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + cudaMemcpy2D(D_tmpData, sizeof(float)*tmpPitch, D_volumeData, sizeof(float)*volumePitch, sizeof(float)*(dims.iVolWidth+2), dims.iVolHeight+2, cudaMemcpyDeviceToDevice); + processVol<opMul, VOL>(D_tmpData, D_maskData, tmpPitch, dims.iVolWidth, dims.iVolHeight); + callFP(D_tmpData, tmpPitch, D_projData, projPitch, -1.0f); + } else { + callFP(D_volumeData, volumePitch, D_projData, projPitch, -1.0f); + } + + + // compute norm of D_projData + + float s = dotProduct2D(D_projData, projPitch, dims.iProjDets, dims.iProjAngles, 1, 0); + + return sqrt(s); +} + + +bool doSIRT(float* D_volumeData, unsigned int volumePitch, + float* D_sinoData, unsigned int sinoPitch, + float* D_maskData, unsigned int maskPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, unsigned int iterations) +{ + SIRT sirt; + bool ok = true; + + ok &= sirt.setGeometry(dims, angles); + if (D_maskData) + ok &= sirt.enableVolumeMask(); + if (TOffsets) + ok &= sirt.setTOffsets(TOffsets); + + if (!ok) + return false; + + ok = sirt.init(); + if (!ok) + return false; + + if (D_maskData) + ok &= sirt.setVolumeMask(D_maskData, maskPitch); + + ok &= sirt.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch); + if (!ok) + return false; + + ok = sirt.iterate(iterations); + + return ok; +} + +} + +#ifdef STANDALONE + +using namespace astraCUDA; + +int main() +{ + float* D_volumeData; + float* D_sinoData; + + SDimensions dims; + dims.iVolWidth = 1024; + dims.iVolHeight = 1024; + dims.iProjAngles = 512; + dims.iProjDets = 1536; + dims.fDetScale = 1.0f; + dims.iRaysPerDet = 1; + unsigned int volumePitch, sinoPitch; + + allocateVolume(D_volumeData, dims.iVolWidth+2, dims.iVolHeight+2, volumePitch); + zeroVolume(D_volumeData, volumePitch, dims.iVolWidth+2, dims.iVolHeight+2); + printf("pitch: %u\n", volumePitch); + + allocateVolume(D_sinoData, dims.iProjDets+2, dims.iProjAngles, sinoPitch); + zeroVolume(D_sinoData, sinoPitch, dims.iProjDets+2, dims.iProjAngles); + printf("pitch: %u\n", sinoPitch); + + unsigned int y, x; + float* sino = loadImage("sino.png", y, x); + + float* img = new float[dims.iVolWidth*dims.iVolHeight]; + + copySinogramToDevice(sino, dims.iProjDets, dims.iProjDets, dims.iProjAngles, D_sinoData, sinoPitch); + + float* angle = new float[dims.iProjAngles]; + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) + angle[i] = i*(M_PI/dims.iProjAngles); + + SIRT sirt; + + sirt.setGeometry(dims, angle); + sirt.init(); + + sirt.setBuffers(D_volumeData, volumePitch, D_sinoData, sinoPitch); + + sirt.iterate(25); + + + delete[] angle; + + copyVolumeFromDevice(img, dims.iVolWidth, dims.iVolWidth, dims.iVolHeight, D_volumeData, volumePitch); + + saveImage("vol.png",dims.iVolHeight,dims.iVolWidth,img); + + return 0; +} +#endif + diff --git a/cuda/2d/sirt.h b/cuda/2d/sirt.h new file mode 100644 index 0000000..5592616 --- /dev/null +++ b/cuda/2d/sirt.h @@ -0,0 +1,90 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_SIRT_H +#define _CUDA_SIRT_H + +#include "util.h" +#include "algo.h" + +namespace astraCUDA { + +class _AstraExport SIRT : public ReconAlgo { +public: + SIRT(); + ~SIRT(); + + virtual bool init(); + + // Set min/max masks to existing GPU memory buffers + bool setMinMaxMasks(float* D_minMaskData, float* D_maxMaskData, + unsigned int pitch); + + // Set min/max masks from RAM buffers + bool uploadMinMaxMasks(const float* minMaskData, const float* maxMaskData, + unsigned int pitch); + + virtual bool iterate(unsigned int iterations); + + virtual float computeDiffNorm(); + +protected: + void reset(); + bool precomputeWeights(); + + // Temporary buffers + float* D_projData; + unsigned int projPitch; + + float* D_tmpData; + unsigned int tmpPitch; + + // Geometry-specific precomputed data + float* D_lineWeight; + unsigned int linePitch; + + float* D_pixelWeight; + unsigned int pixelPitch; + + // Masks + bool freeMinMaxMasks; + float* D_minMaskData; + unsigned int minMaskPitch; + float* D_maxMaskData; + unsigned int maxMaskPitch; +}; + +bool doSIRT(float* D_volumeData, unsigned int volumePitch, + float* D_projData, unsigned int projPitch, + float* D_maskData, unsigned int maskPitch, + const SDimensions& dims, const float* angles, + const float* TOffsets, unsigned int iterations); + +} + +#endif diff --git a/cuda/2d/util.cu b/cuda/2d/util.cu new file mode 100644 index 0000000..06f6714 --- /dev/null +++ b/cuda/2d/util.cu @@ -0,0 +1,244 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include "util.h" + +namespace astraCUDA { + +bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch, + unsigned int width, unsigned int height, + float* outD_data, unsigned int out_pitch) +{ + // TODO: a full memset isn't necessary. Only the edges. + cudaError_t err; + err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, sizeof(float)*(width+2), height+2); + ASTRA_CUDA_ASSERT(err); + err = cudaMemcpy2D(outD_data + out_pitch + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); + ASTRA_CUDA_ASSERT(err); + assert(err == cudaSuccess); + return true; +} + +bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch, + unsigned int width, unsigned int height, + float* inD_data, unsigned int in_pitch) +{ + cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + (in_pitch + 1), sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); + ASTRA_CUDA_ASSERT(err); + return true; +} + + +bool copySinogramFromDevice(float* out_data, unsigned int out_pitch, + unsigned int width, unsigned int height, + float* inD_data, unsigned int in_pitch) +{ + cudaError_t err = cudaMemcpy2D(out_data, sizeof(float)*out_pitch, inD_data + 1, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyDeviceToHost); + ASTRA_CUDA_ASSERT(err); + return true; +} + +bool copySinogramToDevice(const float* in_data, unsigned int in_pitch, + unsigned int width, unsigned int height, + float* outD_data, unsigned int out_pitch) +{ + // TODO: a full memset isn't necessary. Only the edges. + cudaError_t err; + err = cudaMemset2D(outD_data, sizeof(float)*out_pitch, 0, (width+2)*sizeof(float), height); + ASTRA_CUDA_ASSERT(err); + err = cudaMemcpy2D(outD_data + 1, sizeof(float)*out_pitch, in_data, sizeof(float)*in_pitch, sizeof(float)*width, height, cudaMemcpyHostToDevice); + ASTRA_CUDA_ASSERT(err); + return true; +} + + +bool allocateVolume(float*& ptr, unsigned int width, unsigned int height, unsigned int& pitch) +{ + size_t p; + cudaError_t ret = cudaMallocPitch((void**)&ptr, &p, sizeof(float)*width, height); + if (ret != cudaSuccess) { + reportCudaError(ret); + fprintf(stderr, "Failed to allocate %dx%d GPU buffer\n", width, height); + return false; + } + + assert(p % sizeof(float) == 0); + + pitch = p / sizeof(float); + + return true; +} + +void zeroVolume(float* data, unsigned int pitch, unsigned int width, unsigned int height) +{ + cudaError_t err; + err = cudaMemset2D(data, sizeof(float)*pitch, 0, sizeof(float)*width, height); + ASTRA_CUDA_ASSERT(err); +} + + +template <unsigned int blockSize> +__global__ void reduce1D(float *g_idata, float *g_odata, unsigned int n) +{ + extern __shared__ float sdata[]; + unsigned int tid = threadIdx.x; + + unsigned int i = blockIdx.x*(blockSize*2) + tid; + unsigned int gridSize = blockSize*gridDim.x; + sdata[tid] = 0; + while (i < n) { sdata[tid] += g_idata[i]; i += gridSize; } + __syncthreads(); + if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); } + if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); } + if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); } + if (tid < 32) { + volatile float* smem = sdata; + if (blockSize >= 64) smem[tid] += smem[tid + 32]; + if (blockSize >= 32) smem[tid] += smem[tid + 16]; + if (blockSize >= 16) smem[tid] += smem[tid + 8]; + if (blockSize >= 8) smem[tid] += smem[tid + 4]; + if (blockSize >= 4) smem[tid] += smem[tid + 2]; + if (blockSize >= 2) smem[tid] += smem[tid + 1]; + } + if (tid == 0) g_odata[blockIdx.x] = sdata[0]; +} + +__global__ void reduce2D(float *g_idata, float *g_odata, + unsigned int pitch, + unsigned int nx, unsigned int ny, + unsigned int padX, unsigned int padY) +{ + extern __shared__ float sdata[]; + const unsigned int tidx = threadIdx.x; + const unsigned int tidy = threadIdx.y; + const unsigned int tid = tidy * 16 + tidx; + + unsigned int x = blockIdx.x*16 + tidx; + unsigned int y = blockIdx.y*16 + tidy; + + sdata[tid] = 0; + + if (x >= padX && x < padX + nx) { + + while (y < padY + ny) { + if (y >= padY) + sdata[tid] += (g_idata[pitch*y+x] * g_idata[pitch*y+x]); + y += 16 * gridDim.y; + } + + } + + __syncthreads(); + + if (tid < 128) + sdata[tid] += sdata[tid + 128]; + __syncthreads(); + + if (tid < 64) + sdata[tid] += sdata[tid + 64]; + __syncthreads(); + + if (tid < 32) { // 32 is warp size + volatile float* smem = sdata; + smem[tid] += smem[tid + 32]; + smem[tid] += smem[tid + 16]; + smem[tid] += smem[tid + 8]; + smem[tid] += smem[tid + 4]; + smem[tid] += smem[tid + 2]; + smem[tid] += smem[tid + 1]; + } + + if (tid == 0) + g_odata[blockIdx.y * gridDim.x + blockIdx.x] = sdata[0]; +} + +float dotProduct2D(float* D_data, unsigned int pitch, + unsigned int width, unsigned int height, + unsigned int padX, unsigned int padY) +{ + unsigned int bx = ((width+padX) + 15) / 16; + unsigned int by = ((height+padY) + 127) / 128; + unsigned int shared_mem2 = sizeof(float) * 16 * 16; + + dim3 dimBlock2(16, 16); + dim3 dimGrid2(bx, by); + + float* D_buf; + cudaMalloc(&D_buf, sizeof(float) * (bx * by + 1) ); + + // Step 1: reduce 2D from image to a single vector, taking sum of squares + + reduce2D<<< dimGrid2, dimBlock2, shared_mem2>>>(D_data, D_buf, pitch, width, height, padX, padY); + cudaTextForceKernelsCompletion(); + + // Step 2: reduce 1D: add up elements in vector + if (bx * by > 512) + reduce1D<512><<< 1, 512, sizeof(float)*512>>>(D_buf, D_buf+(bx*by), bx*by); + else if (bx * by > 128) + reduce1D<128><<< 1, 128, sizeof(float)*128>>>(D_buf, D_buf+(bx*by), bx*by); + else if (bx * by > 32) + reduce1D<32><<< 1, 32, sizeof(float)*32*2>>>(D_buf, D_buf+(bx*by), bx*by); + else if (bx * by > 8) + reduce1D<8><<< 1, 8, sizeof(float)*8*2>>>(D_buf, D_buf+(bx*by), bx*by); + else + reduce1D<1><<< 1, 1, sizeof(float)*1*2>>>(D_buf, D_buf+(bx*by), bx*by); + + float x; + cudaMemcpy(&x, D_buf+(bx*by), 4, cudaMemcpyDeviceToHost); + + cudaTextForceKernelsCompletion(); + + cudaFree(D_buf); + + return x; +} + + +bool cudaTextForceKernelsCompletion() +{ + cudaError_t returnedCudaError = cudaThreadSynchronize(); + + if(returnedCudaError != cudaSuccess) { + fprintf(stderr, "Failed to force completion of cuda kernels: %d: %s.\n", returnedCudaError, cudaGetErrorString(returnedCudaError)); + return false; + } + + return true; +} + +void reportCudaError(cudaError_t err) +{ + if(err != cudaSuccess) + fprintf(stderr, "CUDA error %d: %s.\n", err, cudaGetErrorString(err)); +} + + + +} diff --git a/cuda/2d/util.h b/cuda/2d/util.h new file mode 100644 index 0000000..d31e2eb --- /dev/null +++ b/cuda/2d/util.h @@ -0,0 +1,90 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_UTIL_H +#define _CUDA_UTIL_H + +#include <cuda.h> +#include <driver_types.h> + +#ifdef _MSC_VER + +#ifdef DLL_EXPORTS +#define _AstraExport __declspec(dllexport) +#define EXPIMP_TEMPLATE +#else +#define _AstraExport __declspec(dllimport) +#define EXPIMP_TEMPLATE extern +#endif + +#else + +#define _AstraExport + +#endif + +#include "dims.h" + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +#define ASTRA_CUDA_ASSERT(err) do { if (err != cudaSuccess) { astraCUDA::reportCudaError(err); assert(err == cudaSuccess); } } while(0) + + +namespace astraCUDA { + +bool copyVolumeToDevice(const float* in_data, unsigned int in_pitch, + unsigned int width, unsigned int height, + float* outD_data, unsigned int out_pitch); +bool copyVolumeFromDevice(float* out_data, unsigned int out_pitch, + unsigned int width, unsigned int height, + float* inD_data, unsigned int in_pitch); +bool copySinogramFromDevice(float* out_data, unsigned int out_pitch, + unsigned int width, unsigned int height, + float* inD_data, unsigned int in_pitch); +bool copySinogramToDevice(const float* in_data, unsigned int in_pitch, + unsigned int width, unsigned int height, + float* outD_data, unsigned int out_pitch); + +bool allocateVolume(float*& D_ptr, unsigned int width, unsigned int height, unsigned int& pitch); + +void zeroVolume(float* D_data, unsigned int pitch, unsigned int width, unsigned int height); + +bool cudaTextForceKernelsCompletion(); +void reportCudaError(cudaError_t err); + + + +float dotProduct2D(float* D_data, unsigned int pitch, + unsigned int width, unsigned int height, + unsigned int padX, unsigned int padY); + +} + +#endif diff --git a/cuda/3d/algo3d.cu b/cuda/3d/algo3d.cu new file mode 100644 index 0000000..20e7381 --- /dev/null +++ b/cuda/3d/algo3d.cu @@ -0,0 +1,108 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cassert> + +#include "algo3d.h" +#include "cone_fp.h" +#include "cone_bp.h" +#include "par3d_fp.h" +#include "par3d_bp.h" + +namespace astraCUDA3d { + +ReconAlgo3D::ReconAlgo3D() +{ + coneProjs = 0; + par3DProjs = 0; + shouldAbort = false; +} + +ReconAlgo3D::~ReconAlgo3D() +{ + reset(); +} + +void ReconAlgo3D::reset() +{ + delete[] coneProjs; + coneProjs = 0; + delete[] par3DProjs; + par3DProjs = 0; + shouldAbort = false; +} + +bool ReconAlgo3D::setConeGeometry(const SDimensions3D& _dims, const SConeProjection* _angles) +{ + dims = _dims; + + coneProjs = new SConeProjection[dims.iProjAngles]; + par3DProjs = 0; + + memcpy(coneProjs, _angles, sizeof(coneProjs[0]) * dims.iProjAngles); + + return true; +} + +bool ReconAlgo3D::setPar3DGeometry(const SDimensions3D& _dims, const SPar3DProjection* _angles) +{ + dims = _dims; + + par3DProjs = new SPar3DProjection[dims.iProjAngles]; + coneProjs = 0; + + memcpy(par3DProjs, _angles, sizeof(par3DProjs[0]) * dims.iProjAngles); + + return true; +} + + +bool ReconAlgo3D::callFP(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_projData, + float outputScale) +{ + if (coneProjs) { + return ConeFP(D_volumeData, D_projData, dims, coneProjs, outputScale); + } else { + return Par3DFP(D_volumeData, D_projData, dims, par3DProjs, outputScale); + } +} + +bool ReconAlgo3D::callBP(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_projData) +{ + if (coneProjs) { + return ConeBP(D_volumeData, D_projData, dims, coneProjs); + } else { + return Par3DBP(D_volumeData, D_projData, dims, par3DProjs); + } +} + + + +} diff --git a/cuda/3d/algo3d.h b/cuda/3d/algo3d.h new file mode 100644 index 0000000..2b44f6f --- /dev/null +++ b/cuda/3d/algo3d.h @@ -0,0 +1,68 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_ALGO_H +#define _CUDA_ALGO_H + +#include "dims3d.h" +#include "util3d.h" + +namespace astraCUDA3d { + +class _AstraExport ReconAlgo3D { +public: + ReconAlgo3D(); + ~ReconAlgo3D(); + + bool setConeGeometry(const SDimensions3D& dims, const SConeProjection* projs); + bool setPar3DGeometry(const SDimensions3D& dims, const SPar3DProjection* projs); + + void signalAbort() { shouldAbort = true; } + +protected: + void reset(); + + bool callFP(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_projData, + float outputScale); + bool callBP(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_projData); + + SDimensions3D dims; + SConeProjection* coneProjs; + SPar3DProjection* par3DProjs; + + volatile bool shouldAbort; + +}; + + +} + +#endif + diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu new file mode 100644 index 0000000..9a19be0 --- /dev/null +++ b/cuda/3d/arith3d.cu @@ -0,0 +1,610 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include "util3d.h" +#include "arith3d.h" +#include <cassert> + +namespace astraCUDA3d { + +struct opAddScaled { + __device__ void operator()(float& out, const float in, const float inp) { + out += in * inp; + } +}; +struct opScaleAndAdd { + __device__ void operator()(float& out, const float in, const float inp) { + out = in + out * inp; + } +}; +struct opAddMulScaled { + __device__ void operator()(float& out, const float in1, const float in2, const float inp) { + out += in1 * in2 * inp; + } +}; +struct opAddMul { + __device__ void operator()(float& out, const float in1, const float in2) { + out += in1 * in2; + } +}; +struct opMul { + __device__ void operator()(float& out, const float in) { + out *= in; + } +}; +struct opMul2 { + __device__ void operator()(float& out, const float in1, const float in2) { + out *= in1 * in2; + } +}; +struct opDividedBy { + __device__ void operator()(float& out, const float in) { + if (out > 0.000001f) // out is assumed to be positive + out = in / out; + else + out = 0.0f; + } +}; +struct opInvert { + __device__ void operator()(float& out) { + if (out > 0.000001f) // out is assumed to be positive + out = 1 / out; + else + out = 0.0f; + } +}; +struct opSet { + __device__ void operator()(float& out, const float inp) { + out = inp; + } +}; +struct opClampMin { + __device__ void operator()(float& out, const float inp) { + if (out < inp) + out = inp; + } +}; +struct opClampMax { + __device__ void operator()(float& out, const float inp) { + if (out > inp) + out = inp; + } +}; + + + + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off]); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], fParam); + off += pitch; + y++; + } +} + + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn[off]); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn[off], fParam); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn1[off], pfIn2[off]); + off += pitch; + y++; + } +} + +template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +__global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + unsigned int x = threadIdx.x + 16*blockIdx.x; + if (x >= width) return; + + unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; + unsigned int off = (y+padY)*pitch+x+padX; + for (unsigned int i = 0; i < repeat; ++i) { + if (y >= height) + break; + op()(pfOut[off], pfIn1[off], pfIn2[off], fParam); + off += pitch; + y++; + } +} + + + + + + + + + +template<typename op, VolType t> +void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+511)/512); + + float *pfOut = (float*)out; + + devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + float *pfOut = (float*)out; + + devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + float *pfOut = (float*)out; + const float *pfIn = (const float*)in; + + devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + float *pfOut = (float*)out; + const float *pfIn = (const float*)in; + + devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + float *pfOut = (float*)out; + const float *pfIn1 = (const float*)in1; + const float *pfIn2 = (const float*)in2; + + devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + +template<typename op, VolType t> +void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height) +{ + dim3 blockSize(16,16); + dim3 gridSize((width+15)/16, (height+15)/16); + + float *pfOut = (float*)out; + const float *pfIn1 = (const float*)in1; + const float *pfIn2 = (const float*)in2; + + devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height); + + cudaTextForceKernelsCompletion(); +} + + + + + + + + + + + + + + + + + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iVolY; + + for (unsigned int i = 0; i < dims.iVolZ; ++i) { + devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + + + + + + + + + + + + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn = (float*)in.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + +template<typename op> +void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims) +{ + dim3 blockSize(16,16); + dim3 gridSize((dims.iProjU+15)/16, (dims.iProjAngles+511)/512); + float *pfOut = (float*)out.ptr; + float *pfIn1 = (float*)in1.ptr; + float *pfIn2 = (float*)in2.ptr; + unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; + + for (unsigned int i = 0; i < dims.iProjV; ++i) { + devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + pfOut += step; + pfIn1 += step; + pfIn2 += step; + } + + cudaTextForceKernelsCompletion(); +} + + + + + + + + + + + + + + + + + + +#define INST_DFtoD(name) \ + template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); + +#define INST_DtoD(name) \ + template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); + +#define INST_DDtoD(name) \ + template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); + +#define INST_DDFtoD(name) \ + template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); + + +#define INST_toD(name) \ + template void processVol<name, VOL>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); + +#define INST_FtoD(name) \ + template void processVol<name, VOL>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name, SINO>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol3D<name>(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); \ + template void processSino3D<name>(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); + + + +INST_DFtoD(opAddScaled) +INST_DFtoD(opScaleAndAdd) +INST_DDFtoD(opAddMulScaled) +INST_DDtoD(opAddMul) +INST_DDtoD(opMul2) +INST_DtoD(opMul) +INST_DtoD(opDividedBy) +INST_toD(opInvert) +INST_FtoD(opSet) +INST_FtoD(opClampMin) +INST_FtoD(opClampMax) + + +} diff --git a/cuda/3d/arith3d.h b/cuda/3d/arith3d.h new file mode 100644 index 0000000..53c9b79 --- /dev/null +++ b/cuda/3d/arith3d.h @@ -0,0 +1,79 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_ARITH3D_H +#define _CUDA_ARITH3D_H + +#include <cuda.h> + +namespace astraCUDA3d { + +struct opAddScaled; +struct opScaleAndAdd; +struct opAddMulScaled; +struct opAddMul; +struct opMul; +struct opMul2; +struct opDividedBy; +struct opInvert; +struct opSet; +struct opClampMin; +struct opClampMax; + +enum VolType { + SINO = 0, + VOL = 1 +}; + + +template<typename op, VolType t> void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); +template<typename op, VolType t> void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); + +template<typename op> void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); +template<typename op> void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); + +template<typename op> void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); +template<typename op> void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); + + + +} + +#endif diff --git a/cuda/3d/astra3d.cu b/cuda/3d/astra3d.cu new file mode 100644 index 0000000..fd4b370 --- /dev/null +++ b/cuda/3d/astra3d.cu @@ -0,0 +1,1620 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "cgls3d.h" +#include "sirt3d.h" +#include "util3d.h" +#include "cone_fp.h" +#include "cone_bp.h" +#include "par3d_fp.h" +#include "par3d_bp.h" +#include "fdk.h" +#include "arith3d.h" +#include "astra3d.h" + +#include <iostream> + +using namespace astraCUDA3d; + +namespace astra { + +enum CUDAProjectionType3d { + PROJ_PARALLEL, + PROJ_CONE +}; + + +static SConeProjection* genConeProjections(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + double fOriginSourceDistance, + double fOriginDetectorDistance, + double fDetUSize, + double fDetVSize, + const float *pfAngles) +{ + SConeProjection base; + base.fSrcX = 0.0f; + base.fSrcY = -fOriginSourceDistance; + base.fSrcZ = 0.0f; + + base.fDetSX = iProjU * fDetUSize * -0.5f; + base.fDetSY = fOriginDetectorDistance; + base.fDetSZ = iProjV * fDetVSize * -0.5f; + + base.fDetUX = fDetUSize; + base.fDetUY = 0.0f; + base.fDetUZ = 0.0f; + + base.fDetVX = 0.0f; + base.fDetVY = 0.0f; + base.fDetVZ = fDetVSize; + + SConeProjection* p = new SConeProjection[iProjAngles]; + +#define ROTATE0(name,i,alpha) do { p[i].f##name##X = base.f##name##X * cos(alpha) - base.f##name##Y * sin(alpha); p[i].f##name##Y = base.f##name##X * sin(alpha) + base.f##name##Y * cos(alpha); p[i].f##name##Z = base.f##name##Z; } while(0) + + for (unsigned int i = 0; i < iProjAngles; ++i) { + ROTATE0(Src, i, pfAngles[i]); + ROTATE0(DetS, i, pfAngles[i]); + ROTATE0(DetU, i, pfAngles[i]); + ROTATE0(DetV, i, pfAngles[i]); + } + +#undef ROTATE0 + + return p; +} + +static SPar3DProjection* genPar3DProjections(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + double fDetUSize, + double fDetVSize, + const float *pfAngles) +{ + SPar3DProjection base; + base.fRayX = 0.0f; + base.fRayY = 1.0f; + base.fRayZ = 0.0f; + + base.fDetSX = iProjU * fDetUSize * -0.5f; + base.fDetSY = 0.0f; + base.fDetSZ = iProjV * fDetVSize * -0.5f; + + base.fDetUX = fDetUSize; + base.fDetUY = 0.0f; + base.fDetUZ = 0.0f; + + base.fDetVX = 0.0f; + base.fDetVY = 0.0f; + base.fDetVZ = fDetVSize; + + SPar3DProjection* p = new SPar3DProjection[iProjAngles]; + +#define ROTATE0(name,i,alpha) do { p[i].f##name##X = base.f##name##X * cos(alpha) - base.f##name##Y * sin(alpha); p[i].f##name##Y = base.f##name##X * sin(alpha) + base.f##name##Y * cos(alpha); p[i].f##name##Z = base.f##name##Z; } while(0) + + for (unsigned int i = 0; i < iProjAngles; ++i) { + ROTATE0(Ray, i, pfAngles[i]); + ROTATE0(DetS, i, pfAngles[i]); + ROTATE0(DetU, i, pfAngles[i]); + ROTATE0(DetV, i, pfAngles[i]); + } + +#undef ROTATE0 + + return p; +} + + + + +class AstraSIRT3d_internal { +public: + SDimensions3D dims; + CUDAProjectionType3d projType; + + float* angles; + float fOriginSourceDistance; + float fOriginDetectorDistance; + float fSourceZ; + float fDetSize; + + SConeProjection* projs; + SPar3DProjection* parprojs; + + float fPixelSize; + + bool initialized; + bool setStartReconstruction; + + bool useVolumeMask; + bool useSinogramMask; + + // Input/output + cudaPitchedPtr D_projData; + cudaPitchedPtr D_volumeData; + cudaPitchedPtr D_maskData; + cudaPitchedPtr D_smaskData; + + SIRT sirt; +}; + +AstraSIRT3d::AstraSIRT3d() +{ + pData = new AstraSIRT3d_internal(); + + pData->angles = 0; + pData->D_projData.ptr = 0; + pData->D_volumeData.ptr = 0; + pData->D_maskData.ptr = 0; + pData->D_smaskData.ptr = 0; + + pData->dims.iVolX = 0; + pData->dims.iVolY = 0; + pData->dims.iVolZ = 0; + pData->dims.iProjAngles = 0; + pData->dims.iProjU = 0; + pData->dims.iProjV = 0; + pData->dims.iRaysPerDetDim = 1; + pData->dims.iRaysPerVoxelDim = 1; + + pData->projs = 0; + + pData->initialized = false; + pData->setStartReconstruction = false; + + pData->useVolumeMask = false; + pData->useSinogramMask = false; +} + +AstraSIRT3d::~AstraSIRT3d() +{ + delete[] pData->angles; + pData->angles = 0; + + delete[] pData->projs; + pData->projs = 0; + + cudaFree(pData->D_projData.ptr); + pData->D_projData.ptr = 0; + + cudaFree(pData->D_volumeData.ptr); + pData->D_volumeData.ptr = 0; + + cudaFree(pData->D_maskData.ptr); + pData->D_maskData.ptr = 0; + + cudaFree(pData->D_smaskData.ptr); + pData->D_smaskData.ptr = 0; + + delete pData; + pData = 0; +} + +bool AstraSIRT3d::setReconstructionGeometry(unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ/*, + float fPixelSize = 1.0f*/) +{ + if (pData->initialized) + return false; + + pData->dims.iVolX = iVolX; + pData->dims.iVolY = iVolY; + pData->dims.iVolZ = iVolZ; + + return (iVolX > 0 && iVolY > 0 && iVolZ > 0); +} + + +bool AstraSIRT3d::setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection* projs) +{ + if (pData->initialized) + return false; + + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0) + return false; + + pData->parprojs = new SPar3DProjection[iProjAngles]; + memcpy(pData->parprojs, projs, iProjAngles * sizeof(projs[0])); + + pData->projType = PROJ_PARALLEL; + + return true; +} + +bool AstraSIRT3d::setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fDetUSize, + float fDetVSize, + const float *pfAngles) +{ + if (pData->initialized) + return false; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SPar3DProjection* p = genPar3DProjections(iProjAngles, + iProjU, iProjV, + fDetUSize, fDetVSize, + pfAngles); + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + pData->parprojs = p; + pData->projType = PROJ_PARALLEL; + + return true; +} + + + +bool AstraSIRT3d::setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection* projs) +{ + if (pData->initialized) + return false; + + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0) + return false; + + pData->projs = new SConeProjection[iProjAngles]; + memcpy(pData->projs, projs, iProjAngles * sizeof(projs[0])); + + pData->projType = PROJ_CONE; + + return true; +} + +bool AstraSIRT3d::setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles) +{ + if (pData->initialized) + return false; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SConeProjection* p = genConeProjections(iProjAngles, + iProjU, iProjV, + fOriginSourceDistance, + fOriginDetectorDistance, + fDetUSize, fDetVSize, + pfAngles); + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + pData->projs = p; + pData->projType = PROJ_CONE; + + return true; +} + +bool AstraSIRT3d::enableSuperSampling(unsigned int iVoxelSuperSampling, + unsigned int iDetectorSuperSampling) +{ + if (pData->initialized) + return false; + + if (iVoxelSuperSampling == 0 || iDetectorSuperSampling == 0) + return false; + + pData->dims.iRaysPerVoxelDim = iVoxelSuperSampling; + pData->dims.iRaysPerDetDim = iDetectorSuperSampling; + + return true; +} + +bool AstraSIRT3d::enableVolumeMask() +{ + if (pData->initialized) + return false; + + bool ok = pData->sirt.enableVolumeMask(); + pData->useVolumeMask = ok; + + return ok; +} + +bool AstraSIRT3d::enableSinogramMask() +{ + if (pData->initialized) + return false; + + bool ok = pData->sirt.enableSinogramMask(); + pData->useSinogramMask = ok; + + return ok; +} + +bool AstraSIRT3d::setGPUIndex(int index) +{ + cudaSetDevice(index); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + return true; +} + +bool AstraSIRT3d::init() +{ + fprintf(stderr, "001: %d\n", true); + if (pData->initialized) + return false; + fprintf(stderr, "002: %d\n", true); + + if (pData->dims.iVolX == 0 || pData->dims.iProjAngles == 0) + return false; + fprintf(stderr, "003: %d\n", true); + + bool ok; + + if (pData->projType == PROJ_PARALLEL) { + ok = pData->sirt.setPar3DGeometry(pData->dims, pData->parprojs); + } else { + ok = pData->sirt.setConeGeometry(pData->dims, pData->projs); + } + fprintf(stderr, "004: %d\n", ok); + + if (!ok) + return false; + + ok = pData->sirt.init(); + if (!ok) + return false; + fprintf(stderr, "005: %d\n", ok); + + pData->D_volumeData = allocateVolumeData(pData->dims); + ok = pData->D_volumeData.ptr; + if (!ok) + return false; + fprintf(stderr, "006: %d\n", ok); + + fprintf(stderr, "proj: %d %d %d\n", pData->dims.iProjAngles, pData->dims.iProjU, pData->dims.iProjV); + pData->D_projData = allocateProjectionData(pData->dims); + ok = pData->D_projData.ptr; + if (!ok) { + cudaFree(pData->D_volumeData.ptr); + pData->D_volumeData.ptr = 0; + return false; + } + fprintf(stderr, "007: %d\n", ok); + + if (pData->useVolumeMask) { + pData->D_maskData = allocateVolumeData(pData->dims); + ok = pData->D_maskData.ptr; + if (!ok) { + cudaFree(pData->D_volumeData.ptr); + cudaFree(pData->D_projData.ptr); + pData->D_volumeData.ptr = 0; + pData->D_projData.ptr = 0; + return false; + } + } + + if (pData->useSinogramMask) { + pData->D_smaskData = allocateProjectionData(pData->dims); + ok = pData->D_smaskData.ptr; + if (!ok) { + cudaFree(pData->D_volumeData.ptr); + cudaFree(pData->D_projData.ptr); + cudaFree(pData->D_maskData.ptr); + pData->D_volumeData.ptr = 0; + pData->D_projData.ptr = 0; + pData->D_maskData.ptr = 0; + return false; + } + } + fprintf(stderr, "008: %d\n", ok); + + pData->initialized = true; + + return true; +} + +bool AstraSIRT3d::setMinConstraint(float fMin) +{ + if (!pData->initialized) + return false; + return pData->sirt.setMinConstraint(fMin); +} + +bool AstraSIRT3d::setMaxConstraint(float fMax) +{ + if (!pData->initialized) + return false; + return pData->sirt.setMaxConstraint(fMax); +} + +bool AstraSIRT3d::setSinogram(const float* pfSinogram, + unsigned int iSinogramPitch) +{ + if (!pData->initialized) + return false; + if (!pfSinogram) + return false; + + bool ok = copyProjectionsToDevice(pfSinogram, pData->D_projData, pData->dims, iSinogramPitch); + + if (!ok) + return false; + + ok = pData->sirt.setBuffers(pData->D_volumeData, pData->D_projData); + if (!ok) + return false; + + pData->setStartReconstruction = false; + + return true; +} + +bool AstraSIRT3d::setVolumeMask(const float* pfMask, unsigned int iMaskPitch) +{ + if (!pData->initialized) + return false; + if (!pData->useVolumeMask) + return false; + if (!pfMask) + return false; + + bool ok = copyVolumeToDevice(pfMask, pData->D_maskData, + pData->dims, iMaskPitch); + if (!ok) + return false; + + ok = pData->sirt.setVolumeMask(pData->D_maskData); + if (!ok) + return false; + + return true; +} + +bool AstraSIRT3d::setSinogramMask(const float* pfMask, unsigned int iMaskPitch) +{ + if (!pData->initialized) + return false; + if (!pData->useSinogramMask) + return false; + if (!pfMask) + return false; + + bool ok = copyProjectionsToDevice(pfMask, pData->D_smaskData, pData->dims, iMaskPitch); + + if (!ok) + return false; + + ok = pData->sirt.setSinogramMask(pData->D_smaskData); + if (!ok) + return false; + + return true; +} + +bool AstraSIRT3d::setStartReconstruction(const float* pfReconstruction, + unsigned int iReconstructionPitch) +{ + if (!pData->initialized) + return false; + if (!pfReconstruction) + return false; + + bool ok = copyVolumeToDevice(pfReconstruction, pData->D_volumeData, + pData->dims, iReconstructionPitch); + if (!ok) + return false; + + pData->setStartReconstruction = true; + + return true; +} + +bool AstraSIRT3d::iterate(unsigned int iIterations) +{ + if (!pData->initialized) + return false; + + if (!pData->setStartReconstruction) + zeroVolumeData(pData->D_volumeData, pData->dims); + + bool ok = pData->sirt.iterate(iIterations); + if (!ok) + return false; + + return true; +} + +bool AstraSIRT3d::getReconstruction(float* pfReconstruction, + unsigned int iReconstructionPitch) const +{ + if (!pData->initialized) + return false; + + bool ok = copyVolumeFromDevice(pfReconstruction, pData->D_volumeData, + pData->dims, iReconstructionPitch); + if (!ok) + return false; + + return true; +} + +void AstraSIRT3d::signalAbort() +{ + if (!pData->initialized) + return; + + pData->sirt.signalAbort(); +} + +float AstraSIRT3d::computeDiffNorm() +{ + if (!pData->initialized) + return 0.0f; // FIXME: Error? + + return pData->sirt.computeDiffNorm(); +} + + + + +class AstraCGLS3d_internal { +public: + SDimensions3D dims; + CUDAProjectionType3d projType; + + float* angles; + float fOriginSourceDistance; + float fOriginDetectorDistance; + float fSourceZ; + float fDetSize; + + SConeProjection* projs; + SPar3DProjection* parprojs; + + float fPixelSize; + + bool initialized; + bool setStartReconstruction; + + bool useVolumeMask; + bool useSinogramMask; + + // Input/output + cudaPitchedPtr D_projData; + cudaPitchedPtr D_volumeData; + cudaPitchedPtr D_maskData; + cudaPitchedPtr D_smaskData; + + CGLS cgls; +}; + +AstraCGLS3d::AstraCGLS3d() +{ + pData = new AstraCGLS3d_internal(); + + pData->angles = 0; + pData->D_projData.ptr = 0; + pData->D_volumeData.ptr = 0; + pData->D_maskData.ptr = 0; + pData->D_smaskData.ptr = 0; + + pData->dims.iVolX = 0; + pData->dims.iVolY = 0; + pData->dims.iVolZ = 0; + pData->dims.iProjAngles = 0; + pData->dims.iProjU = 0; + pData->dims.iProjV = 0; + pData->dims.iRaysPerDetDim = 1; + pData->dims.iRaysPerVoxelDim = 1; + + pData->projs = 0; + + pData->initialized = false; + pData->setStartReconstruction = false; + + pData->useVolumeMask = false; + pData->useSinogramMask = false; +} + +AstraCGLS3d::~AstraCGLS3d() +{ + delete[] pData->angles; + pData->angles = 0; + + delete[] pData->projs; + pData->projs = 0; + + cudaFree(pData->D_projData.ptr); + pData->D_projData.ptr = 0; + + cudaFree(pData->D_volumeData.ptr); + pData->D_volumeData.ptr = 0; + + cudaFree(pData->D_maskData.ptr); + pData->D_maskData.ptr = 0; + + cudaFree(pData->D_smaskData.ptr); + pData->D_smaskData.ptr = 0; + + delete pData; + pData = 0; +} + +bool AstraCGLS3d::setReconstructionGeometry(unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ/*, + float fPixelSize = 1.0f*/) +{ + if (pData->initialized) + return false; + + pData->dims.iVolX = iVolX; + pData->dims.iVolY = iVolY; + pData->dims.iVolZ = iVolZ; + + return (iVolX > 0 && iVolY > 0 && iVolZ > 0); +} + + +bool AstraCGLS3d::setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection* projs) +{ + if (pData->initialized) + return false; + + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0) + return false; + + pData->parprojs = new SPar3DProjection[iProjAngles]; + memcpy(pData->parprojs, projs, iProjAngles * sizeof(projs[0])); + + pData->projType = PROJ_PARALLEL; + + return true; +} + +bool AstraCGLS3d::setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fDetUSize, + float fDetVSize, + const float *pfAngles) +{ + if (pData->initialized) + return false; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SPar3DProjection* p = genPar3DProjections(iProjAngles, + iProjU, iProjV, + fDetUSize, fDetVSize, + pfAngles); + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + pData->parprojs = p; + pData->projType = PROJ_PARALLEL; + + return true; +} + + + +bool AstraCGLS3d::setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection* projs) +{ + if (pData->initialized) + return false; + + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || projs == 0) + return false; + + pData->projs = new SConeProjection[iProjAngles]; + memcpy(pData->projs, projs, iProjAngles * sizeof(projs[0])); + + pData->projType = PROJ_CONE; + + return true; +} + +bool AstraCGLS3d::setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles) +{ + if (pData->initialized) + return false; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SConeProjection* p = genConeProjections(iProjAngles, + iProjU, iProjV, + fOriginSourceDistance, + fOriginDetectorDistance, + fDetUSize, fDetVSize, + pfAngles); + + pData->dims.iProjAngles = iProjAngles; + pData->dims.iProjU = iProjU; + pData->dims.iProjV = iProjV; + + pData->projs = p; + pData->projType = PROJ_CONE; + + return true; +} + +bool AstraCGLS3d::enableSuperSampling(unsigned int iVoxelSuperSampling, + unsigned int iDetectorSuperSampling) +{ + if (pData->initialized) + return false; + + if (iVoxelSuperSampling == 0 || iDetectorSuperSampling == 0) + return false; + + pData->dims.iRaysPerVoxelDim = iVoxelSuperSampling; + pData->dims.iRaysPerDetDim = iDetectorSuperSampling; + + return true; +} + +bool AstraCGLS3d::enableVolumeMask() +{ + if (pData->initialized) + return false; + + bool ok = pData->cgls.enableVolumeMask(); + pData->useVolumeMask = ok; + + return ok; +} + +#if 0 +bool AstraCGLS3d::enableSinogramMask() +{ + if (pData->initialized) + return false; + + bool ok = pData->cgls.enableSinogramMask(); + pData->useSinogramMask = ok; + + return ok; +} +#endif + +bool AstraCGLS3d::setGPUIndex(int index) +{ + cudaSetDevice(index); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + return true; +} + +bool AstraCGLS3d::init() +{ + fprintf(stderr, "001: %d\n", true); + if (pData->initialized) + return false; + fprintf(stderr, "002: %d\n", true); + + if (pData->dims.iVolX == 0 || pData->dims.iProjAngles == 0) + return false; + fprintf(stderr, "003: %d\n", true); + + bool ok; + + if (pData->projType == PROJ_PARALLEL) { + ok = pData->cgls.setPar3DGeometry(pData->dims, pData->parprojs); + } else { + ok = pData->cgls.setConeGeometry(pData->dims, pData->projs); + } + fprintf(stderr, "004: %d\n", ok); + + if (!ok) + return false; + + ok = pData->cgls.init(); + if (!ok) + return false; + fprintf(stderr, "005: %d\n", ok); + + pData->D_volumeData = allocateVolumeData(pData->dims); + ok = pData->D_volumeData.ptr; + if (!ok) + return false; + fprintf(stderr, "006: %d\n", ok); + + fprintf(stderr, "proj: %d %d %d\n", pData->dims.iProjAngles, pData->dims.iProjU, pData->dims.iProjV); + pData->D_projData = allocateProjectionData(pData->dims); + ok = pData->D_projData.ptr; + if (!ok) { + cudaFree(pData->D_volumeData.ptr); + pData->D_volumeData.ptr = 0; + return false; + } + fprintf(stderr, "007: %d\n", ok); + + if (pData->useVolumeMask) { + pData->D_maskData = allocateVolumeData(pData->dims); + ok = pData->D_maskData.ptr; + if (!ok) { + cudaFree(pData->D_volumeData.ptr); + cudaFree(pData->D_projData.ptr); + pData->D_volumeData.ptr = 0; + pData->D_projData.ptr = 0; + return false; + } + } + + if (pData->useSinogramMask) { + pData->D_smaskData = allocateProjectionData(pData->dims); + ok = pData->D_smaskData.ptr; + if (!ok) { + cudaFree(pData->D_volumeData.ptr); + cudaFree(pData->D_projData.ptr); + cudaFree(pData->D_maskData.ptr); + pData->D_volumeData.ptr = 0; + pData->D_projData.ptr = 0; + pData->D_maskData.ptr = 0; + return false; + } + } + fprintf(stderr, "008: %d\n", ok); + + pData->initialized = true; + + return true; +} + +#if 0 +bool AstraCGLS3d::setMinConstraint(float fMin) +{ + if (!pData->initialized) + return false; + return pData->cgls.setMinConstraint(fMin); +} + +bool AstraCGLS3d::setMaxConstraint(float fMax) +{ + if (!pData->initialized) + return false; + return pData->cgls.setMaxConstraint(fMax); +} +#endif + +bool AstraCGLS3d::setSinogram(const float* pfSinogram, + unsigned int iSinogramPitch) +{ + if (!pData->initialized) + return false; + if (!pfSinogram) + return false; + + bool ok = copyProjectionsToDevice(pfSinogram, pData->D_projData, pData->dims, iSinogramPitch); + + if (!ok) + return false; + + ok = pData->cgls.setBuffers(pData->D_volumeData, pData->D_projData); + if (!ok) + return false; + + pData->setStartReconstruction = false; + + return true; +} + +bool AstraCGLS3d::setVolumeMask(const float* pfMask, unsigned int iMaskPitch) +{ + if (!pData->initialized) + return false; + if (!pData->useVolumeMask) + return false; + if (!pfMask) + return false; + + bool ok = copyVolumeToDevice(pfMask, pData->D_maskData, + pData->dims, iMaskPitch); + if (!ok) + return false; + + ok = pData->cgls.setVolumeMask(pData->D_maskData); + if (!ok) + return false; + + return true; +} + +#if 0 +bool AstraCGLS3d::setSinogramMask(const float* pfMask, unsigned int iMaskPitch) +{ + if (!pData->initialized) + return false; + if (!pData->useSinogramMask) + return false; + if (!pfMask) + return false; + + bool ok = copyProjectionsToDevice(pfMask, pData->D_smaskData, pData->dims, iMaskPitch); + + if (!ok) + return false; + + ok = pData->cgls.setSinogramMask(pData->D_smaskData); + if (!ok) + return false; + + return true; +} +#endif + +bool AstraCGLS3d::setStartReconstruction(const float* pfReconstruction, + unsigned int iReconstructionPitch) +{ + if (!pData->initialized) + return false; + if (!pfReconstruction) + return false; + + bool ok = copyVolumeToDevice(pfReconstruction, pData->D_volumeData, + pData->dims, iReconstructionPitch); + if (!ok) + return false; + + pData->setStartReconstruction = true; + + return true; +} + +bool AstraCGLS3d::iterate(unsigned int iIterations) +{ + if (!pData->initialized) + return false; + + if (!pData->setStartReconstruction) + zeroVolumeData(pData->D_volumeData, pData->dims); + + bool ok = pData->cgls.iterate(iIterations); + if (!ok) + return false; + + return true; +} + +bool AstraCGLS3d::getReconstruction(float* pfReconstruction, + unsigned int iReconstructionPitch) const +{ + if (!pData->initialized) + return false; + + bool ok = copyVolumeFromDevice(pfReconstruction, pData->D_volumeData, + pData->dims, iReconstructionPitch); + if (!ok) + return false; + + return true; +} + +void AstraCGLS3d::signalAbort() +{ + if (!pData->initialized) + return; + + pData->cgls.signalAbort(); +} + +float AstraCGLS3d::computeDiffNorm() +{ + if (!pData->initialized) + return 0.0f; // FIXME: Error? + + return pData->cgls.computeDiffNorm(); +} + + + +bool astraCudaConeFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iDetectorSuperSampling) +{ + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SConeProjection* p = genConeProjections(iProjAngles, + iProjU, iProjV, + fOriginSourceDistance, + fOriginDetectorDistance, + fDetUSize, fDetVSize, + pfAngles); + + bool ok; + ok = astraCudaConeFP(pfVolume, pfProjections, iVolX, iVolY, iVolZ, + iProjAngles, iProjU, iProjV, p, iGPUIndex, iDetectorSuperSampling); + + delete[] p; + + return ok; +} + +bool astraCudaConeFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection *pfAngles, + int iGPUIndex, int iDetectorSuperSampling) +{ + SDimensions3D dims; + + dims.iVolX = iVolX; + dims.iVolY = iVolY; + dims.iVolZ = iVolZ; + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjU = iProjU; + dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + dims.iRaysPerDetDim = iDetectorSuperSampling; + + if (iDetectorSuperSampling == 0) + return false; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + cudaPitchedPtr D_volumeData = allocateVolumeData(dims); + bool ok = D_volumeData.ptr; + if (!ok) + return false; + + cudaPitchedPtr D_projData = allocateProjectionData(dims); + ok = D_projData.ptr; + if (!ok) { + cudaFree(D_volumeData.ptr); + return false; + } + + ok &= copyVolumeToDevice(pfVolume, D_volumeData, dims, dims.iVolX); + + ok &= zeroProjectionData(D_projData, dims); + + if (!ok) { + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + return false; + } + + ok &= ConeFP(D_volumeData, D_projData, dims, pfAngles, 1.0f); + + ok &= copyProjectionsFromDevice(pfProjections, D_projData, + dims, dims.iProjU); + + + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + + return ok; + +} + +bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iDetectorSuperSampling, + Cuda3DProjectionKernel projKernel) +{ + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SPar3DProjection* p = genPar3DProjections(iProjAngles, + iProjU, iProjV, + fDetUSize, fDetVSize, + pfAngles); + + bool ok; + ok = astraCudaPar3DFP(pfVolume, pfProjections, iVolX, iVolY, iVolZ, + iProjAngles, iProjU, iProjV, p, iGPUIndex, iDetectorSuperSampling, + projKernel); + + delete[] p; + + return ok; +} + + +bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection *pfAngles, + int iGPUIndex, int iDetectorSuperSampling, + Cuda3DProjectionKernel projKernel) +{ + SDimensions3D dims; + + dims.iVolX = iVolX; + dims.iVolY = iVolY; + dims.iVolZ = iVolZ; + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjU = iProjU; + dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + dims.iRaysPerDetDim = iDetectorSuperSampling; + + if (iDetectorSuperSampling == 0) + return false; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + + cudaPitchedPtr D_volumeData = allocateVolumeData(dims); + bool ok = D_volumeData.ptr; + if (!ok) + return false; + + cudaPitchedPtr D_projData = allocateProjectionData(dims); + ok = D_projData.ptr; + if (!ok) { + cudaFree(D_volumeData.ptr); + return false; + } + + ok &= copyVolumeToDevice(pfVolume, D_volumeData, dims, dims.iVolX); + + ok &= zeroProjectionData(D_projData, dims); + + if (!ok) { + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + return false; + } + + switch (projKernel) { + case ker3d_default: + ok &= Par3DFP(D_volumeData, D_projData, dims, pfAngles, 1.0f); + break; + case ker3d_sum_square_weights: + ok &= Par3DFP_SumSqW(D_volumeData, D_projData, dims, pfAngles, 1.0f); + break; + default: + assert(false); + } + + ok &= copyProjectionsFromDevice(pfProjections, D_projData, + dims, dims.iProjU); + + + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + + return ok; + +} + +bool astraCudaConeBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iVoxelSuperSampling) +{ + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SConeProjection* p = genConeProjections(iProjAngles, + iProjU, iProjV, + fOriginSourceDistance, + fOriginDetectorDistance, + fDetUSize, fDetVSize, + pfAngles); + + bool ok; + ok = astraCudaConeBP(pfVolume, pfProjections, iVolX, iVolY, iVolZ, + iProjAngles, iProjU, iProjV, p, iGPUIndex, iVoxelSuperSampling); + + delete[] p; + + return ok; +} + +bool astraCudaConeBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection *pfAngles, + int iGPUIndex, int iVoxelSuperSampling) +{ + SDimensions3D dims; + + dims.iVolX = iVolX; + dims.iVolY = iVolY; + dims.iVolZ = iVolZ; + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjU = iProjU; + dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + dims.iRaysPerVoxelDim = iVoxelSuperSampling; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + cudaPitchedPtr D_volumeData = allocateVolumeData(dims); + bool ok = D_volumeData.ptr; + if (!ok) + return false; + + cudaPitchedPtr D_projData = allocateProjectionData(dims); + ok = D_projData.ptr; + if (!ok) { + cudaFree(D_volumeData.ptr); + return false; + } + + ok &= copyProjectionsToDevice(pfProjections, D_projData, + dims, dims.iProjU); + + ok &= zeroVolumeData(D_volumeData, dims); + + if (!ok) { + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + return false; + } + + ok &= ConeBP(D_volumeData, D_projData, dims, pfAngles); + + ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX); + + + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + + return ok; + +} + +bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iVoxelSuperSampling) +{ + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + SPar3DProjection* p = genPar3DProjections(iProjAngles, + iProjU, iProjV, + fDetUSize, fDetVSize, + pfAngles); + + bool ok; + ok = astraCudaPar3DBP(pfVolume, pfProjections, iVolX, iVolY, iVolZ, + iProjAngles, iProjU, iProjV, p, iGPUIndex, iVoxelSuperSampling); + + delete[] p; + + return ok; +} + + +bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection *pfAngles, + int iGPUIndex, int iVoxelSuperSampling) +{ + SDimensions3D dims; + + dims.iVolX = iVolX; + dims.iVolY = iVolY; + dims.iVolZ = iVolZ; + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjU = iProjU; + dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + dims.iRaysPerVoxelDim = iVoxelSuperSampling; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + + cudaPitchedPtr D_volumeData = allocateVolumeData(dims); + bool ok = D_volumeData.ptr; + if (!ok) + return false; + + cudaPitchedPtr D_projData = allocateProjectionData(dims); + ok = D_projData.ptr; + if (!ok) { + cudaFree(D_volumeData.ptr); + return false; + } + + ok &= copyProjectionsToDevice(pfProjections, D_projData, + dims, dims.iProjU); + + ok &= zeroVolumeData(D_volumeData, dims); + + if (!ok) { + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + return false; + } + + ok &= Par3DBP(D_volumeData, D_projData, dims, pfAngles); + + ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX); + + + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + + return ok; + +} + + + +bool astraCudaFDK(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + bool bShortScan, + int iGPUIndex, int iVoxelSuperSampling) +{ + SDimensions3D dims; + + dims.iVolX = iVolX; + dims.iVolY = iVolY; + dims.iVolZ = iVolZ; + if (iVolX == 0 || iVolY == 0 || iVolZ == 0) + return false; + + dims.iProjAngles = iProjAngles; + dims.iProjU = iProjU; + dims.iProjV = iProjV; + + if (iProjAngles == 0 || iProjU == 0 || iProjV == 0 || pfAngles == 0) + return false; + + dims.iRaysPerVoxelDim = iVoxelSuperSampling; + + if (iVoxelSuperSampling == 0) + return false; + + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + + + cudaPitchedPtr D_volumeData = allocateVolumeData(dims); + bool ok = D_volumeData.ptr; + if (!ok) + return false; + + cudaPitchedPtr D_projData = allocateProjectionData(dims); + ok = D_projData.ptr; + if (!ok) { + cudaFree(D_volumeData.ptr); + return false; + } + + ok &= copyProjectionsToDevice(pfProjections, D_projData, dims, dims.iProjU); + + ok &= zeroVolumeData(D_volumeData, dims); + + if (!ok) { + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + return false; + } + + // TODO: Offer interface for SrcZ, DetZ + ok &= FDK(D_volumeData, D_projData, fOriginSourceDistance, + fOriginDetectorDistance, 0, 0, fDetUSize, fDetVSize, + dims, pfAngles, bShortScan); + + ok &= copyVolumeFromDevice(pfVolume, D_volumeData, dims, dims.iVolX); + + + cudaFree(D_volumeData.ptr); + cudaFree(D_projData.ptr); + + return ok; + +} + + + + +} diff --git a/cuda/3d/astra3d.h b/cuda/3d/astra3d.h new file mode 100644 index 0000000..5712f89 --- /dev/null +++ b/cuda/3d/astra3d.h @@ -0,0 +1,450 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_ASTRA3D_H +#define _CUDA_ASTRA3D_H + +#include "dims3d.h" + +namespace astra { + + +// TODO: Switch to a class hierarchy as with the 2D algorithms + + +enum Cuda3DProjectionKernel { + ker3d_default = 0, + ker3d_sum_square_weights +}; + + +class AstraSIRT3d_internal; + + +class _AstraExport AstraSIRT3d { +public: + + AstraSIRT3d(); + ~AstraSIRT3d(); + + // Set the number of pixels in the reconstruction rectangle, + // and the length of the edge of a pixel. + // Volume pixels are assumed to be square. + // This must be called before setting the projection geometry. + bool setReconstructionGeometry(unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ/*, + float fPixelSize = 1.0f*/); + + bool setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection* projs); + bool setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fSourceZ, + float fDetSize, + const float *pfAngles); + bool setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection* projs); + bool setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fSourceZ, + float fDetSize, + const float *pfAngles); + + // Enable supersampling. + // + // The number of rays used in FP is the square of iDetectorSuperSampling. + // The number of rays used in BP is the cube of iVoxelSuperSampling. + bool enableSuperSampling(unsigned int iVoxelSuperSampling, + unsigned int iDetectorSuperSampling); + + // Enable volume/sinogram masks + // + // This may optionally be called before init(). + // If it is called, setVolumeMask()/setSinogramMask() must be called between + // setSinogram() and iterate(). + bool enableVolumeMask(); + bool enableSinogramMask(); + + // Set GPU index + // + // This should be called before init(). Note that setting the GPU index + // in a thread which has already used the GPU may not work. + bool setGPUIndex(int index); + + // Allocate GPU buffers and + // precompute geometry-specific data. + // + // This must be called after calling setReconstructionGeometry() and + // setProjectionGeometry() or setFanProjectionGeometry(). + bool init(); + + // Setup input sinogram for a slice. + // pfSinogram must be a float array of size XXX + // NB: iSinogramPitch is measured in floats, not in bytes. + // + // This must be called after init(), and before iterate(). It may be + // called again after iterate()/getReconstruction() to start a new slice. + // + // pfSinogram will only be read from during this call. + bool setSinogram(const float* pfSinogram, unsigned int iSinogramPitch); + + // Setup volume mask for a slice. + // pfMask must be a float array of size XXX + // NB: iMaskPitch is measured in floats, not in bytes. + // + // It may only contain the exact values 0.0f and 1.0f. Only volume pixels + // for which pfMask[z] is 1.0f are processed. + bool setVolumeMask(const float* pfMask, unsigned int iMaskPitch); + + // Setup sinogram mask for a slice. + // pfMask must be a float array of size XXX + // NB: iMaskPitch is measured in floats, not in bytes. + // + // It may only contain the exact values 0.0f and 1.0f. Only sinogram pixels + // for which pfMask[z] is 1.0f are processed. + bool setSinogramMask(const float* pfMask, unsigned int iMaskPitch); + + // Set the starting reconstruction for SIRT. + // pfReconstruction must be a float array of size XXX + // NB: iReconstructionPitch is measured in floats, not in bytes. + // + // This may be called between setSinogram() and iterate(). + // If this function is not called before iterate(), SIRT will start + // from a zero reconstruction. + // + // pfReconstruction will only be read from during this call. + bool setStartReconstruction(const float* pfReconstruction, + unsigned int iReconstructionPitch); + + // Enable min/max constraint. + // + // These may optionally be called between init() and iterate() + bool setMinConstraint(float fMin); + bool setMaxConstraint(float fMax); + + // Perform a number of (additive) SIRT iterations. + // This must be called after setSinogram(). + // + // If called multiple times, without calls to setSinogram() or + // setStartReconstruction() in between, iterate() will continue from + // the result of the previous call. + // Calls to getReconstruction() are allowed between calls to iterate() and + // do not change the state. + bool iterate(unsigned int iIterations); + + // Get the reconstructed slice. + // pfReconstruction must be a float array of size XXX + // NB: iReconstructionPitch is measured in floats, not in bytes. + // + // This may be called after iterate(). + bool getReconstruction(float* pfReconstruction, + unsigned int iReconstructionPitch) const; + + // Compute the norm of the difference of the FP of the current + // reconstruction and the sinogram. (This performs one FP.) + // It can be called after iterate(). + float computeDiffNorm(); + + // Signal the algorithm that it should abort after the current iteration. + // This is intended to be called from another thread. + void signalAbort(); + +protected: + AstraSIRT3d_internal *pData; +}; + + +class AstraCGLS3d_internal; + + +class _AstraExport AstraCGLS3d { +public: + + AstraCGLS3d(); + ~AstraCGLS3d(); + + // Set the number of pixels in the reconstruction rectangle, + // and the length of the edge of a pixel. + // Volume pixels are assumed to be square. + // This must be called before setting the projection geometry. + bool setReconstructionGeometry(unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ/*, + float fPixelSize = 1.0f*/); + + bool setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection* projs); + bool setConeGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fSourceZ, + float fDetSize, + const float *pfAngles); + bool setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection* projs); + bool setPar3DGeometry(unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fSourceZ, + float fDetSize, + const float *pfAngles); + + // Enable supersampling. + // + // The number of rays used in FP is the square of iDetectorSuperSampling. + // The number of rays used in BP is the cube of iVoxelSuperSampling. + bool enableSuperSampling(unsigned int iVoxelSuperSampling, + unsigned int iDetectorSuperSampling); + + // Enable volume/sinogram masks + // + // This may optionally be called before init(). + // If it is called, setVolumeMask()/setSinogramMask() must be called between + // setSinogram() and iterate(). + bool enableVolumeMask(); + //bool enableSinogramMask(); + + // Set GPU index + // + // This should be called before init(). Note that setting the GPU index + // in a thread which has already used the GPU may not work. + bool setGPUIndex(int index); + + // Allocate GPU buffers and + // precompute geometry-specific data. + // + // This must be called after calling setReconstructionGeometry() and + // setProjectionGeometry() or setFanProjectionGeometry(). + bool init(); + + // Setup input sinogram for a slice. + // pfSinogram must be a float array of size XXX + // NB: iSinogramPitch is measured in floats, not in bytes. + // + // This must be called after init(), and before iterate(). It may be + // called again after iterate()/getReconstruction() to start a new slice. + // + // pfSinogram will only be read from during this call. + bool setSinogram(const float* pfSinogram, unsigned int iSinogramPitch); + + // Setup volume mask for a slice. + // pfMask must be a float array of size XXX + // NB: iMaskPitch is measured in floats, not in bytes. + // + // It may only contain the exact values 0.0f and 1.0f. Only volume pixels + // for which pfMask[z] is 1.0f are processed. + bool setVolumeMask(const float* pfMask, unsigned int iMaskPitch); + + // Setup sinogram mask for a slice. + // pfMask must be a float array of size XXX + // NB: iMaskPitch is measured in floats, not in bytes. + // + // It may only contain the exact values 0.0f and 1.0f. Only sinogram pixels + // for which pfMask[z] is 1.0f are processed. + //bool setSinogramMask(const float* pfMask, unsigned int iMaskPitch); + + // Set the starting reconstruction for SIRT. + // pfReconstruction must be a float array of size XXX + // NB: iReconstructionPitch is measured in floats, not in bytes. + // + // This may be called between setSinogram() and iterate(). + // If this function is not called before iterate(), SIRT will start + // from a zero reconstruction. + // + // pfReconstruction will only be read from during this call. + bool setStartReconstruction(const float* pfReconstruction, + unsigned int iReconstructionPitch); + + // Enable min/max constraint. + // + // These may optionally be called between init() and iterate() + //bool setMinConstraint(float fMin); + //bool setMaxConstraint(float fMax); + + // Perform a number of (additive) SIRT iterations. + // This must be called after setSinogram(). + // + // If called multiple times, without calls to setSinogram() or + // setStartReconstruction() in between, iterate() will continue from + // the result of the previous call. + // Calls to getReconstruction() are allowed between calls to iterate() and + // do not change the state. + bool iterate(unsigned int iIterations); + + // Get the reconstructed slice. + // pfReconstruction must be a float array of size XXX + // NB: iReconstructionPitch is measured in floats, not in bytes. + // + // This may be called after iterate(). + bool getReconstruction(float* pfReconstruction, + unsigned int iReconstructionPitch) const; + + // Compute the norm of the difference of the FP of the current + // reconstruction and the sinogram. (This performs one FP.) + // It can be called after iterate(). + float computeDiffNorm(); + + // Signal the algorithm that it should abort after the current iteration. + // This is intended to be called from another thread. + void signalAbort(); + +protected: + AstraCGLS3d_internal *pData; +}; + + + +_AstraExport bool astraCudaConeFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iDetectorSuperSampling); + +_AstraExport bool astraCudaConeFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection *pfAngles, + int iGPUIndex, int iDetectorSuperSampling); + +_AstraExport bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iDetectorSuperSampling, + Cuda3DProjectionKernel projKernel); + +_AstraExport bool astraCudaPar3DFP(const float* pfVolume, float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection *pfAngles, + int iGPUIndex, int iDetectorSuperSampling, + Cuda3DProjectionKernel projKernel); + + +_AstraExport bool astraCudaConeBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iVoxelSuperSampling); + +_AstraExport bool astraCudaConeBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SConeProjection *pfAngles, + int iGPUIndex, int iVoxelSuperSampling); + +_AstraExport bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + int iGPUIndex, int iVoxelSuperSampling); + +_AstraExport bool astraCudaPar3DBP(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + const SPar3DProjection *pfAngles, + int iGPUIndex, int iVoxelSuperSampling); + +_AstraExport bool astraCudaFDK(float* pfVolume, const float* pfProjections, + unsigned int iVolX, + unsigned int iVolY, + unsigned int iVolZ, + unsigned int iProjAngles, + unsigned int iProjU, + unsigned int iProjV, + float fOriginSourceDistance, + float fOriginDetectorDistance, + float fDetUSize, + float fDetVSize, + const float *pfAngles, + bool bShortScan, + int iGPUIndex, int iVoxelSuperSampling); + +} + + +#endif diff --git a/cuda/3d/cgls3d.cu b/cuda/3d/cgls3d.cu new file mode 100644 index 0000000..72bb9cd --- /dev/null +++ b/cuda/3d/cgls3d.cu @@ -0,0 +1,428 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "cgls3d.h" +#include "util3d.h" +#include "arith3d.h" +#include "cone_fp.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +namespace astraCUDA3d { + +CGLS::CGLS() : ReconAlgo3D() +{ + D_maskData.ptr = 0; + D_smaskData.ptr = 0; + + D_sinoData.ptr = 0; + D_volumeData.ptr = 0; + + D_r.ptr = 0; + D_w.ptr = 0; + D_z.ptr = 0; + D_p.ptr = 0; + + useVolumeMask = false; + useSinogramMask = false; +} + + +CGLS::~CGLS() +{ + reset(); +} + +void CGLS::reset() +{ + cudaFree(D_r.ptr); + cudaFree(D_w.ptr); + cudaFree(D_z.ptr); + cudaFree(D_p.ptr); + + D_maskData.ptr = 0; + D_smaskData.ptr = 0; + + D_sinoData.ptr = 0; + D_volumeData.ptr = 0; + + D_r.ptr = 0; + D_w.ptr = 0; + D_z.ptr = 0; + D_p.ptr = 0; + + useVolumeMask = false; + useSinogramMask = false; + + sliceInitialized = false; + + ReconAlgo3D::reset(); +} + +bool CGLS::enableVolumeMask() +{ + useVolumeMask = true; + return true; +} + +bool CGLS::enableSinogramMask() +{ + useSinogramMask = true; + return true; +} + + +bool CGLS::init() +{ + D_z = allocateVolumeData(dims); + D_p = allocateVolumeData(dims); + D_r = allocateProjectionData(dims); + D_w = allocateProjectionData(dims); + + // TODO: check if allocations succeeded + return true; +} + +bool CGLS::setVolumeMask(cudaPitchedPtr& _D_maskData) +{ + assert(useVolumeMask); + + D_maskData = _D_maskData; + + return true; +} + +bool CGLS::setSinogramMask(cudaPitchedPtr& _D_smaskData) +{ + return false; +#if 0 + // TODO: Implement this + assert(useSinogramMask); + + D_smaskData = _D_smaskData; + return true; +#endif +} + +bool CGLS::setBuffers(cudaPitchedPtr& _D_volumeData, + cudaPitchedPtr& _D_projData) +{ + D_volumeData = _D_volumeData; + D_sinoData = _D_projData; + + fprintf(stderr, "Reconstruction buffer: %p\n", (void*)D_volumeData.ptr); + + sliceInitialized = false; + + return true; +} + +bool CGLS::iterate(unsigned int iterations) +{ + shouldAbort = false; + + if (!sliceInitialized) { + + // copy sinogram + duplicateProjectionData(D_r, D_sinoData, dims); + + // r = sino - A*x + if (useVolumeMask) { + duplicateVolumeData(D_z, D_volumeData, dims); + processVol3D<opMul>(D_z, D_maskData, dims); + callFP(D_z, D_r, -1.0f); + } else { + callFP(D_volumeData, D_r, -1.0f); + } + + // p = A'*r + zeroVolumeData(D_p, dims); + callBP(D_p, D_r); + if (useVolumeMask) + processVol3D<opMul>(D_p, D_maskData, dims); + + gamma = dotProduct3D(D_p, dims.iVolX, dims.iVolY, dims.iVolZ); + + sliceInitialized = true; + + } + + + // iteration + for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) { + + // w = A*p + zeroProjectionData(D_w, dims); + callFP(D_p, D_w, 1.0f); + + // alpha = gamma / <w,w> + float ww = dotProduct3D(D_w, dims.iProjU, dims.iProjAngles, dims.iProjV); + float alpha = gamma / ww; + + // x += alpha*p + processVol3D<opAddScaled>(D_volumeData, D_p, alpha, dims); + + // r -= alpha*w + processSino3D<opAddScaled>(D_r, D_w, -alpha, dims); + + // z = A'*r + zeroVolumeData(D_z, dims); + callBP(D_z, D_r); + if (useVolumeMask) + processVol3D<opMul>(D_z, D_maskData, dims); + + float beta = 1.0f / gamma; + gamma = dotProduct3D(D_z, dims.iVolX, dims.iVolY, dims.iVolZ); + + beta *= gamma; + + // p = z + beta*p + processVol3D<opScaleAndAdd>(D_p, D_z, beta, dims); + } + + return true; +} + +float CGLS::computeDiffNorm() +{ + // We can use w and z as temporary storage here since they're not + // used outside of iterations. + + // copy sinogram to w + duplicateProjectionData(D_w, D_sinoData, dims); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + duplicateVolumeData(D_z, D_volumeData, dims); + processVol3D<opMul>(D_z, D_maskData, dims); + callFP(D_z, D_w, -1.0f); + } else { + callFP(D_volumeData, D_w, -1.0f); + } + + float s = dotProduct3D(D_w, dims.iProjU, dims.iProjAngles, dims.iProjV); + return sqrt(s); +} + + +bool doCGLS(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_sinoData, + cudaPitchedPtr& D_maskData, + const SDimensions3D& dims, const SConeProjection* angles, + unsigned int iterations) +{ + CGLS cgls; + bool ok = true; + + ok &= cgls.setConeGeometry(dims, angles); + if (D_maskData.ptr) + ok &= cgls.enableVolumeMask(); + + if (!ok) + return false; + + ok = cgls.init(); + if (!ok) + return false; + + if (D_maskData.ptr) + ok &= cgls.setVolumeMask(D_maskData); + + ok &= cgls.setBuffers(D_volumeData, D_sinoData); + if (!ok) + return false; + + ok = cgls.iterate(iterations); + + return ok; +} + +} + +#ifdef STANDALONE + +using namespace astraCUDA3d; + +int main() +{ + SDimensions3D dims; + dims.iVolX = 256; + dims.iVolY = 256; + dims.iVolZ = 256; + dims.iProjAngles = 100; + dims.iProjU = 512; + dims.iProjV = 512; + dims.iRaysPerDet = 1; + + SConeProjection angle[100]; + angle[0].fSrcX = -2905.6; + angle[0].fSrcY = 0; + angle[0].fSrcZ = 0; + + angle[0].fDetSX = 694.4; + angle[0].fDetSY = -122.4704; + angle[0].fDetSZ = -122.4704; + + angle[0].fDetUX = 0; + angle[0].fDetUY = .4784; + //angle[0].fDetUY = .5; + angle[0].fDetUZ = 0; + + angle[0].fDetVX = 0; + angle[0].fDetVY = 0; + angle[0].fDetVZ = .4784; + +#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0) + for (int i = 1; i < 100; ++i) { + angle[i] = angle[0]; + ROTATE0(Src, i, i*2*M_PI/100); + ROTATE0(DetS, i, i*2*M_PI/100); + ROTATE0(DetU, i, i*2*M_PI/100); + ROTATE0(DetV, i, i*2*M_PI/100); + } +#undef ROTATE0 + + + cudaPitchedPtr volData = allocateVolumeData(dims); + cudaPitchedPtr projData = allocateProjectionData(dims); + zeroProjectionData(projData, dims); + + float* pbuf = new float[100*512*512]; + copyProjectionsFromDevice(pbuf, projData, dims); + copyProjectionsToDevice(pbuf, projData, dims); + delete[] pbuf; + +#if 0 + float* slice = new float[256*256]; + cudaPitchedPtr ptr; + ptr.ptr = slice; + ptr.pitch = 256*sizeof(float); + ptr.xsize = 256*sizeof(float); + ptr.ysize = 256; + + for (unsigned int i = 0; i < 256; ++i) { + for (unsigned int y = 0; y < 256; ++y) + for (unsigned int x = 0; x < 256; ++x) + slice[y*256+x] = (i-127.5)*(i-127.5)+(y-127.5)*(y-127.5)+(x-127.5)*(x-127.5) < 4900 ? 1.0f : 0.0f; + + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaMemcpy3D(&p); + } + astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f); + +#else + + for (int i = 0; i < 100; ++i) { + char fname[32]; + sprintf(fname, "Tiffs/%04d.png", 4*i); + unsigned int w,h; + float* bufp = loadImage(fname, w,h); + + for (int j = 0; j < 512*512; ++j) { + float v = bufp[j]; + if (v > 236.0f) v = 236.0f; + v = logf(236.0f / v); + bufp[j] = 256*v; + } + + for (int j = 0; j < 512; ++j) { + cudaMemcpy(((float*)projData.ptr)+100*512*j+512*i, bufp+512*j, 512*sizeof(float), cudaMemcpyHostToDevice); + } + + delete[] bufp; + + } +#endif + +#if 0 + float* bufs = new float[100*512]; + + for (int i = 0; i < 512; ++i) { + cudaMemcpy(bufs, ((float*)projData.ptr)+100*512*i, 100*512*sizeof(float), cudaMemcpyDeviceToHost); + + printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize); + + char fname[20]; + sprintf(fname, "sino%03d.png", i); + saveImage(fname, 100, 512, bufs); + } + + float* bufp = new float[512*512]; + + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 512; ++j) { + cudaMemcpy(bufp+512*j, ((float*)projData.ptr)+100*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[20]; + sprintf(fname, "proj%03d.png", i); + saveImage(fname, 512, 512, bufp); + } +#endif + + zeroVolumeData(volData, dims); + + cudaPitchedPtr maskData; + maskData.ptr = 0; + + astraCUDA3d::doCGLS(volData, projData, maskData, dims, angle, 50); +#if 1 + float* buf = new float[256*256]; + + for (int i = 0; i < 256; ++i) { + cudaMemcpy(buf, ((float*)volData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost); + + char fname[20]; + sprintf(fname, "vol%03d.png", i); + saveImage(fname, 256, 256, buf); + } +#endif + + return 0; +} +#endif + diff --git a/cuda/3d/cgls3d.h b/cuda/3d/cgls3d.h new file mode 100644 index 0000000..d16b571 --- /dev/null +++ b/cuda/3d/cgls3d.h @@ -0,0 +1,114 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_CGLS3D_H +#define _CUDA_CGLS3D_H + +#include "util3d.h" +#include "algo3d.h" + +namespace astraCUDA3d { + +class _AstraExport CGLS : public ReconAlgo3D { +public: + CGLS(); + ~CGLS(); + +// bool setConeGeometry(const SDimensions3D& dims, const SConeProjection* projs); + + + bool enableVolumeMask(); + bool enableSinogramMask(); + + // init should be called after setting all geometry + bool init(); + + // setVolumeMask should be called after init and before iterate, + // but only if enableVolumeMask was called before init. + // It may be called again after iterate. + bool setVolumeMask(cudaPitchedPtr& D_maskData); + + // setSinogramMask should be called after init and before iterate, + // but only if enableSinogramMask was called before init. + // It may be called again after iterate. + bool setSinogramMask(cudaPitchedPtr& D_smaskData); + + + // setBuffers should be called after init and before iterate. + // It may be called again after iterate. + bool setBuffers(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_projData); + + + // set Min/Max constraints. They may be called at any time, and will affect + // any iterate() calls afterwards. + bool setMinConstraint(float fMin) { return false; } + bool setMaxConstraint(float fMax) { return false; } + + // iterate should be called after init and setBuffers. + // It may be called multiple times. + bool iterate(unsigned int iterations); + + // Compute the norm of the difference of the FP of the current reconstruction + // and the sinogram. (This performs one FP.) + // It can be called after iterate. + float computeDiffNorm(); + +protected: + void reset(); + + bool useVolumeMask; + bool useSinogramMask; + + cudaPitchedPtr D_maskData; + cudaPitchedPtr D_smaskData; + + // Input/output + cudaPitchedPtr D_sinoData; + cudaPitchedPtr D_volumeData; + + // Temporary buffers + cudaPitchedPtr D_r; + cudaPitchedPtr D_w; + cudaPitchedPtr D_z; + cudaPitchedPtr D_p; + + float gamma; + + bool sliceInitialized; +}; + +_AstraExport bool doCGLS(cudaPitchedPtr D_volumeData, unsigned int volumePitch, + cudaPitchedPtr D_projData, unsigned int projPitch, + cudaPitchedPtr D_maskData, unsigned int maskPitch, + const SDimensions3D& dims, const SConeProjection* projs, + unsigned int iterations); + +} + +#endif diff --git a/cuda/3d/cone_bp.cu b/cuda/3d/cone_bp.cu new file mode 100644 index 0000000..7f8e320 --- /dev/null +++ b/cuda/3d/cone_bp.cu @@ -0,0 +1,481 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> +#include "util3d.h" + +#ifdef STANDALONE +#include "cone_fp.h" +#include "testutil.h" +#endif + +#include "dims3d.h" + +typedef texture<float, 3, cudaReadModeElementType> texture3D; + +static texture3D gT_coneProjTexture; + +namespace astraCUDA3d { + +static const unsigned int g_volBlockZ = 16; + +static const unsigned int g_anglesPerBlock = 64; +static const unsigned int g_volBlockX = 32; +static const unsigned int g_volBlockY = 16; + +static const unsigned g_MaxAngles = 1024; + +__constant__ float gC_Cux[g_MaxAngles]; +__constant__ float gC_Cuy[g_MaxAngles]; +__constant__ float gC_Cuz[g_MaxAngles]; +__constant__ float gC_Cuc[g_MaxAngles]; +__constant__ float gC_Cvx[g_MaxAngles]; +__constant__ float gC_Cvy[g_MaxAngles]; +__constant__ float gC_Cvz[g_MaxAngles]; +__constant__ float gC_Cvc[g_MaxAngles]; +__constant__ float gC_Cdx[g_MaxAngles]; +__constant__ float gC_Cdy[g_MaxAngles]; +__constant__ float gC_Cdz[g_MaxAngles]; +__constant__ float gC_Cdc[g_MaxAngles]; + + +bool bindProjDataTexture(const cudaArray* array) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + + gT_coneProjTexture.addressMode[0] = cudaAddressModeClamp; + gT_coneProjTexture.addressMode[1] = cudaAddressModeClamp; + gT_coneProjTexture.addressMode[2] = cudaAddressModeClamp; + gT_coneProjTexture.filterMode = cudaFilterModeLinear; + gT_coneProjTexture.normalized = false; + + cudaBindTextureToArray(gT_coneProjTexture, array, channelDesc); + + // TODO: error value? + + return true; +} + + +__global__ void dev_cone_BP(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims) +{ + float* volData = (float*)D_volData; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + + // threadIdx: x = rel x + // y = rel y + + // blockIdx: x = x + y + // y = z + + + // TO TRY: precompute part of detector intersection formulas in shared mem? + // TO TRY: inner loop over z, gather ray values in shared mem + + const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; + const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + + if (X >= dims.iVolX) + return; + if (Y >= dims.iVolY) + return; + + const int startZ = blockIdx.y * g_volBlockZ; + int endZ = startZ + g_volBlockZ; + if (endZ > dims.iVolZ) + endZ = dims.iVolZ; + + float fX = X - 0.5f*dims.iVolX + 0.5f; + float fY = Y - 0.5f*dims.iVolY + 0.5f; + float fZ = startZ - 0.5f*dims.iVolZ + 0.5f; + + for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f) + { + + float fVal = 0.0f; + float fAngle = startAngle + 0.5f; + + for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) + { + + const float fCux = gC_Cux[angle]; + const float fCuy = gC_Cuy[angle]; + const float fCuz = gC_Cuz[angle]; + const float fCuc = gC_Cuc[angle]; + const float fCvx = gC_Cvx[angle]; + const float fCvy = gC_Cvy[angle]; + const float fCvz = gC_Cvz[angle]; + const float fCvc = gC_Cvc[angle]; + const float fCdx = gC_Cdx[angle]; + const float fCdy = gC_Cdy[angle]; + const float fCdz = gC_Cdz[angle]; + const float fCdc = gC_Cdc[angle]; + + const float fUNum = fCuc + fX * fCux + fY * fCuy + fZ * fCuz; + const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz; + const float fDen = fCdc + fX * fCdx + fY * fCdy + fZ * fCdz; + + const float fU = fUNum / fDen + 1.0f; + const float fV = fVNum / fDen + 1.0f; + + fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV); + + } + + volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal; + } + +} + +// supersampling version +__global__ void dev_cone_BP_SS(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims) +{ + float* volData = (float*)D_volData; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + + // threadIdx: x = rel x + // y = rel y + + // blockIdx: x = x + y + // y = z + + + // TO TRY: precompute part of detector intersection formulas in shared mem? + // TO TRY: inner loop over z, gather ray values in shared mem + + const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; + const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + + if (X >= dims.iVolX) + return; + if (Y >= dims.iVolY) + return; + + const int startZ = blockIdx.y * g_volBlockZ; + int endZ = startZ + g_volBlockZ; + if (endZ > dims.iVolZ) + endZ = dims.iVolZ; + + float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim; + float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim; + float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim; + const float fSubStep = 1.0f/dims.iRaysPerVoxelDim; + + for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f) + { + + float fVal = 0.0f; + float fAngle = startAngle + 0.5f; + + for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) + { + + const float fCux = gC_Cux[angle]; + const float fCuy = gC_Cuy[angle]; + const float fCuz = gC_Cuz[angle]; + const float fCuc = gC_Cuc[angle]; + const float fCvx = gC_Cvx[angle]; + const float fCvy = gC_Cvy[angle]; + const float fCvz = gC_Cvz[angle]; + const float fCvc = gC_Cvc[angle]; + const float fCdx = gC_Cdx[angle]; + const float fCdy = gC_Cdy[angle]; + const float fCdz = gC_Cdz[angle]; + const float fCdc = gC_Cdc[angle]; + + float fXs = fX; + for (int iSubX = 0; iSubX < dims.iRaysPerVoxelDim; ++iSubX) { + float fYs = fY; + for (int iSubY = 0; iSubY < dims.iRaysPerVoxelDim; ++iSubY) { + float fZs = fZ; + for (int iSubZ = 0; iSubZ < dims.iRaysPerVoxelDim; ++iSubZ) { + + const float fUNum = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz; + const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz; + const float fDen = fCdc + fXs * fCdx + fYs * fCdy + fZs * fCdz; + + const float fU = fUNum / fDen + 1.0f; + const float fV = fVNum / fDen + 1.0f; + + fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV); + + fZs += fSubStep; + } + fYs += fSubStep; + } + fXs += fSubStep; + } + + } + + volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal / (dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim); + } + +} + + +bool ConeBP_Array(cudaPitchedPtr D_volumeData, + cudaArray *D_projArray, + const SDimensions3D& dims, const SConeProjection* angles) +{ + bindProjDataTexture(D_projArray); + + + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(expr,name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = (expr) ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT( (angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetVY - (angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetVZ , Cux ); + TRANSFER_TO_CONSTANT( (angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetVZ -(angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetVX , Cuy ); + TRANSFER_TO_CONSTANT( (angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetVX - (angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetVY , Cuz ); + TRANSFER_TO_CONSTANT( (angles[i].fDetSY*angles[i].fDetVZ - angles[i].fDetSZ*angles[i].fDetVY)*angles[i].fSrcX - (angles[i].fDetSX*angles[i].fDetVZ - angles[i].fDetSZ*angles[i].fDetVX)*angles[i].fSrcY + (angles[i].fDetSX*angles[i].fDetVY - angles[i].fDetSY*angles[i].fDetVX)*angles[i].fSrcZ , Cuc ); + + TRANSFER_TO_CONSTANT( (angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetUZ-(angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetUY, Cvx ); + TRANSFER_TO_CONSTANT( (angles[i].fDetSZ - angles[i].fSrcZ)*angles[i].fDetUX - (angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetUZ , Cvy ); + TRANSFER_TO_CONSTANT((angles[i].fDetSX - angles[i].fSrcX)*angles[i].fDetUY-(angles[i].fDetSY - angles[i].fSrcY)*angles[i].fDetUX , Cvz ); + TRANSFER_TO_CONSTANT( -(angles[i].fDetSY*angles[i].fDetUZ - angles[i].fDetSZ*angles[i].fDetUY)*angles[i].fSrcX + (angles[i].fDetSX*angles[i].fDetUZ - angles[i].fDetSZ*angles[i].fDetUX)*angles[i].fSrcY - (angles[i].fDetSX*angles[i].fDetUY - angles[i].fDetSY*angles[i].fDetUX)*angles[i].fSrcZ , Cvc ); + + TRANSFER_TO_CONSTANT( angles[i].fDetUY*angles[i].fDetVZ - angles[i].fDetUZ*angles[i].fDetVY , Cdx ); + TRANSFER_TO_CONSTANT( angles[i].fDetUZ*angles[i].fDetVX - angles[i].fDetUX*angles[i].fDetVZ , Cdy ); + TRANSFER_TO_CONSTANT( angles[i].fDetUX*angles[i].fDetVY - angles[i].fDetUY*angles[i].fDetVX , Cdz ); + TRANSFER_TO_CONSTANT( -angles[i].fSrcX * (angles[i].fDetUY*angles[i].fDetVZ - angles[i].fDetUZ*angles[i].fDetVY) - angles[i].fSrcY * (angles[i].fDetUZ*angles[i].fDetVX - angles[i].fDetUX*angles[i].fDetVZ) - angles[i].fSrcZ * (angles[i].fDetUX*angles[i].fDetVY - angles[i].fDetUY*angles[i].fDetVX) , Cdc ); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + dim3 dimBlock(g_volBlockX, g_volBlockY); + + dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ); + + // timeval t; + // tic(t); + + for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) { + // printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr); + if (dims.iRaysPerVoxelDim == 1) + dev_cone_BP<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims); + else + dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims); + } + + cudaTextForceKernelsCompletion(); + + // printf("%f\n", toc(t)); + + return true; +} + +bool ConeBP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SConeProjection* angles) +{ + // transfer projections to array + + cudaArray* cuArray = allocateProjectionArray(dims); + transferProjectionsToArray(D_projData, cuArray, dims); + + bool ret = ConeBP_Array(D_volumeData, cuArray, dims, angles); + + cudaFreeArray(cuArray); + + return ret; +} + + +} + +#ifdef STANDALONE +int main() +{ + SDimensions3D dims; + dims.iVolX = 256; + dims.iVolY = 256; + dims.iVolZ = 256; + dims.iProjAngles = 180; + dims.iProjU = 512; + dims.iProjV = 512; + dims.iRaysPerDet = 1; + + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPitchedPtr volData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&volData, extentV); + + cudaExtent extentP; + extentP.width = dims.iProjU*sizeof(float); + extentP.height = dims.iProjAngles; + extentP.depth = dims.iProjV; + + cudaPitchedPtr projData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&projData, extentP); + cudaMemset3D(projData, 0, extentP); + + float* slice = new float[256*256]; + cudaPitchedPtr ptr; + ptr.ptr = slice; + ptr.pitch = 256*sizeof(float); + ptr.xsize = 256*sizeof(float); + ptr.ysize = 256; + + for (unsigned int i = 0; i < 256*256; ++i) + slice[i] = 1.0f; + for (unsigned int i = 0; i < 256; ++i) { + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaMemcpy3D(&p); +#if 0 + if (i == 128) { + for (unsigned int j = 0; j < 256*256; ++j) + slice[j] = 0.0f; + } +#endif + } + + + SConeProjection angle[180]; + angle[0].fSrcX = -1536; + angle[0].fSrcY = 0; + angle[0].fSrcZ = 0; + + angle[0].fDetSX = 512; + angle[0].fDetSY = -256; + angle[0].fDetSZ = -256; + + angle[0].fDetUX = 0; + angle[0].fDetUY = 1; + angle[0].fDetUZ = 0; + + angle[0].fDetVX = 0; + angle[0].fDetVY = 0; + angle[0].fDetVZ = 1; + +#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0) + for (int i = 1; i < 180; ++i) { + angle[i] = angle[0]; + ROTATE0(Src, i, i*2*M_PI/180); + ROTATE0(DetS, i, i*2*M_PI/180); + ROTATE0(DetU, i, i*2*M_PI/180); + ROTATE0(DetV, i, i*2*M_PI/180); + } +#undef ROTATE0 + + astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f); +#if 0 + float* bufs = new float[180*512]; + + for (int i = 0; i < 512; ++i) { + cudaMemcpy(bufs, ((float*)projData.ptr)+180*512*i, 180*512*sizeof(float), cudaMemcpyDeviceToHost); + + printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize); + + char fname[20]; + sprintf(fname, "sino%03d.png", i); + saveImage(fname, 180, 512, bufs); + } + + float* bufp = new float[512*512]; + + for (int i = 0; i < 180; ++i) { + for (int j = 0; j < 512; ++j) { + cudaMemcpy(bufp+512*j, ((float*)projData.ptr)+180*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[20]; + sprintf(fname, "proj%03d.png", i); + saveImage(fname, 512, 512, bufp); + } +#endif + for (unsigned int i = 0; i < 256*256; ++i) + slice[i] = 0.0f; + for (unsigned int i = 0; i < 256; ++i) { + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaMemcpy3D(&p); + } + + astraCUDA3d::ConeBP(volData, projData, dims, angle); +#if 0 + float* buf = new float[256*256]; + + for (int i = 0; i < 256; ++i) { + cudaMemcpy(buf, ((float*)volData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost); + + printf("%d %d %d\n", volData.pitch, volData.xsize, volData.ysize); + + char fname[20]; + sprintf(fname, "vol%03d.png", i); + saveImage(fname, 256, 256, buf); + } +#endif + +} +#endif diff --git a/cuda/3d/cone_bp.h b/cuda/3d/cone_bp.h new file mode 100644 index 0000000..c77714e --- /dev/null +++ b/cuda/3d/cone_bp.h @@ -0,0 +1,45 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_CONE_BP_H +#define _CUDA_CONE_BP_H + +namespace astraCUDA3d { + +_AstraExport bool ConeBP_Array(cudaPitchedPtr D_volumeData, + cudaArray *D_projArray, + const SDimensions3D& dims, const SConeProjection* angles); + +_AstraExport bool ConeBP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SConeProjection* angles); + + +} + +#endif diff --git a/cuda/3d/cone_fp.cu b/cuda/3d/cone_fp.cu new file mode 100644 index 0000000..40dca4f --- /dev/null +++ b/cuda/3d/cone_fp.cu @@ -0,0 +1,513 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> +#include "util3d.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +#include "dims3d.h" + +typedef texture<float, 3, cudaReadModeElementType> texture3D; + +static texture3D gT_coneVolumeTexture; + +namespace astraCUDA3d { + +static const unsigned int g_anglesPerBlock = 4; + +// thickness of the slices we're splitting the volume up into +static const unsigned int g_blockSlices = 64; +static const unsigned int g_detBlockU = 32; +static const unsigned int g_detBlockV = 32; + +static const unsigned g_MaxAngles = 1024; +__constant__ float gC_SrcX[g_MaxAngles]; +__constant__ float gC_SrcY[g_MaxAngles]; +__constant__ float gC_SrcZ[g_MaxAngles]; +__constant__ float gC_DetSX[g_MaxAngles]; +__constant__ float gC_DetSY[g_MaxAngles]; +__constant__ float gC_DetSZ[g_MaxAngles]; +__constant__ float gC_DetUX[g_MaxAngles]; +__constant__ float gC_DetUY[g_MaxAngles]; +__constant__ float gC_DetUZ[g_MaxAngles]; +__constant__ float gC_DetVX[g_MaxAngles]; +__constant__ float gC_DetVY[g_MaxAngles]; +__constant__ float gC_DetVZ[g_MaxAngles]; + + +bool bindVolumeDataTexture(const cudaArray* array) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + + gT_coneVolumeTexture.addressMode[0] = cudaAddressModeClamp; + gT_coneVolumeTexture.addressMode[1] = cudaAddressModeClamp; + gT_coneVolumeTexture.addressMode[2] = cudaAddressModeClamp; + gT_coneVolumeTexture.filterMode = cudaFilterModeLinear; + gT_coneVolumeTexture.normalized = false; + + cudaBindTextureToArray(gT_coneVolumeTexture, array, channelDesc); + + // TODO: error value? + + return true; +} + + // threadIdx: x = ??? detector (u?) + // y = relative angle + + // blockIdx: x = ??? detector (u+v?) + // y = angle block + + +#define CONE_FP_BODY(c0,c1,c2) \ + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \ + if (angle >= endAngle) \ + return; \ + \ + const float fSrcX = gC_SrcX[angle]; \ + const float fSrcY = gC_SrcY[angle]; \ + const float fSrcZ = gC_SrcZ[angle]; \ + const float fDetUX = gC_DetUX[angle]; \ + const float fDetUY = gC_DetUY[angle]; \ + const float fDetUZ = gC_DetUZ[angle]; \ + const float fDetVX = gC_DetVX[angle]; \ + const float fDetVY = gC_DetVY[angle]; \ + const float fDetVZ = gC_DetVZ[angle]; \ + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \ + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \ + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \ + \ + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \ + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \ + int endDetectorV = startDetectorV + g_detBlockV; \ + if (endDetectorV > dims.iProjV) \ + endDetectorV = dims.iProjV; \ + \ + int endSlice = startSlice + g_blockSlices; \ + if (endSlice > dims.iVol##c0) \ + endSlice = dims.iVol##c0; \ + \ + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \ + { \ + /* Trace ray from Src to (detectorU,detectorV) from */ \ + /* X = startSlice to X = endSlice */ \ + \ + const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; \ + const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; \ + const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; \ + \ + /* (x) ( 1) ( 0) */ \ + /* ray: (y) = (ay) * x + (by) */ \ + /* (z) (az) (bz) */ \ + \ + const float a##c1 = (fSrc##c1 - fDet##c1) / (fSrc##c0 - fDet##c0); \ + const float a##c2 = (fSrc##c2 - fDet##c2) / (fSrc##c0 - fDet##c0); \ + const float b##c1 = fSrc##c1 - a##c1 * fSrc##c0; \ + const float b##c2 = fSrc##c2 - a##c2 * fSrc##c0; \ + \ + const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \ + \ + float fVal = 0.0f; \ + \ + float f##c0 = startSlice + 1.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \ + \ + for (int s = startSlice; s < endSlice; ++s) \ + { \ + fVal += tex3D(gT_coneVolumeTexture, fX, fY, fZ); \ + f##c0 += 1.0f; \ + f##c1 += a##c1; \ + f##c2 += a##c2; \ + } \ + \ + fVal *= fDistCorr; \ + \ + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; \ + } + +#define CONE_FP_SS_BODY(c0,c1,c2) \ + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \ + if (angle >= endAngle) \ + return; \ + \ + const float fSrcX = gC_SrcX[angle]; \ + const float fSrcY = gC_SrcY[angle]; \ + const float fSrcZ = gC_SrcZ[angle]; \ + const float fDetUX = gC_DetUX[angle]; \ + const float fDetUY = gC_DetUY[angle]; \ + const float fDetUZ = gC_DetUZ[angle]; \ + const float fDetVX = gC_DetVX[angle]; \ + const float fDetVY = gC_DetVY[angle]; \ + const float fDetVZ = gC_DetVZ[angle]; \ + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \ + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \ + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \ + \ + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \ + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \ + int endDetectorV = startDetectorV + g_detBlockV; \ + if (endDetectorV > dims.iProjV) \ + endDetectorV = dims.iProjV; \ + \ + int endSlice = startSlice + g_blockSlices; \ + if (endSlice > dims.iVolX) \ + endSlice = dims.iVolX; \ + \ + const float fSubStep = 1.0f/dims.iRaysPerDetDim; \ + \ + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \ + { \ + /* Trace ray from Src to (detectorU,detectorV) from */ \ + /* X = startSlice to X = endSlice */ \ + \ + float fV = 0.0f; \ + \ + float fdU = detectorU - 0.5f + 0.5f*fSubStep; \ + for (int iSubU = 0; iSubU < dims.iRaysPerDetDim; ++iSubU, fdU+=fSubStep) { \ + float fdV = detectorV - 0.5f + 0.5f*fSubStep; \ + for (int iSubV = 0; iSubV < dims.iRaysPerDetDim; ++iSubV, fdV+=fSubStep) { \ + \ + const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX; \ + const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY; \ + const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ; \ + \ + /* (x) ( 1) ( 0) */ \ + /* ray: (y) = (ay) * x + (by) */ \ + /* (z) (az) (bz) */ \ + \ + const float a##c1 = (fSrc##c1 - fDet##c1) / (fSrc##c0 - fDet##c0); \ + const float a##c2 = (fSrc##c2 - fDet##c2) / (fSrc##c0 - fDet##c0); \ + const float b##c1 = fSrc##c1 - a##c1 * fSrc##c0; \ + const float b##c2 = fSrc##c2 - a##c2 * fSrc##c0; \ + \ + const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \ + \ + float fVal = 0.0f; \ + \ + float f##c0 = startSlice + 1.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f; \ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f; \ + \ + for (int s = startSlice; s < endSlice; ++s) \ + { \ + fVal += tex3D(gT_coneVolumeTexture, fX, fY, fZ); \ + f##c0 += 1.0f; \ + f##c1 += a##c1; \ + f##c2 += a##c2; \ + } \ + \ + fVal *= fDistCorr; \ + fV += fVal; \ + \ + } \ + } \ + \ + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (dims.iRaysPerDetDim * dims.iRaysPerDetDim);\ + } + + + + + +__global__ void FP_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +CONE_FP_BODY(X,Y,Z) +} + +__global__ void FP_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +CONE_FP_BODY(Y,X,Z) +} + +__global__ void FP_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +CONE_FP_BODY(Z,X,Y) +} + + +__global__ void FP_SS_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +CONE_FP_SS_BODY(X,Y,Z) +} + +__global__ void FP_SS_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +CONE_FP_SS_BODY(Y,X,Z) +} + +__global__ void FP_SS_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +CONE_FP_SS_BODY(Z,X,Y) +} + + + +bool ConeFP_Array(cudaArray *D_volArray, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SConeProjection* angles, + float fOutputScale) +{ + bindVolumeDataTexture(D_volArray); + + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(SrcX); + TRANSFER_TO_CONSTANT(SrcY); + TRANSFER_TO_CONSTANT(SrcZ); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetSZ); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + TRANSFER_TO_CONSTANT(DetUZ); + TRANSFER_TO_CONSTANT(DetVX); + TRANSFER_TO_CONSTANT(DetVY); + TRANSFER_TO_CONSTANT(DetVZ); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + std::list<cudaStream_t> streams; + dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles + + // Run over all angles, grouping them into groups of the same + // orientation (roughly horizontal vs. roughly vertical). + // Start a stream of grids for each such group. + + unsigned int blockStart = 0; + unsigned int blockEnd = 0; + int blockDirection = 0; + + // timeval t; + // tic(t); + + for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { + int dir; + if (a != dims.iProjAngles) { + float dX = fabsf(angles[a].fSrcX - (angles[a].fDetSX + dims.iProjU*angles[a].fDetUX*0.5f + dims.iProjV*angles[a].fDetVX*0.5f)); + float dY = fabsf(angles[a].fSrcY - (angles[a].fDetSY + dims.iProjU*angles[a].fDetUY*0.5f + dims.iProjV*angles[a].fDetVY*0.5f)); + float dZ = fabsf(angles[a].fSrcZ - (angles[a].fDetSZ + dims.iProjU*angles[a].fDetUZ*0.5f + dims.iProjV*angles[a].fDetVZ*0.5f)); + + if (dX >= dY && dX >= dZ) + dir = 0; + else if (dY >= dX && dY >= dZ) + dir = 1; + else + dir = 2; + } + + if (a == dims.iProjAngles || dir != blockDirection) { + // block done + + blockEnd = a; + if (blockStart != blockEnd) { + + dim3 dimGrid( + ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), +(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); + // TODO: check if we can't immediately + // destroy the stream after use + cudaStream_t stream; + cudaStreamCreate(&stream); + streams.push_back(stream); + + // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y); + + if (blockDirection == 0) { + for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + FP_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else + FP_SS_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + } else if (blockDirection == 1) { + for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + FP_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else + FP_SS_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + } else if (blockDirection == 2) { + for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + FP_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else + FP_SS_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + } + + } + + blockDirection = dir; + blockStart = a; + } + } + + for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) + cudaStreamDestroy(*iter); + + streams.clear(); + + cudaTextForceKernelsCompletion(); + + // printf("%f\n", toc(t)); + + return true; +} + +bool ConeFP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SConeProjection* angles, + float fOutputScale) +{ + // transfer volume to array + + cudaArray* cuArray = allocateVolumeArray(dims); + transferVolumeToArray(D_volumeData, cuArray, dims); + + bool ret = ConeFP_Array(cuArray, D_projData, dims, angles, fOutputScale); + + cudaFreeArray(cuArray); + + return ret; +} + + +} + +#ifdef STANDALONE +int main() +{ + SDimensions3D dims; + dims.iVolX = 256; + dims.iVolY = 256; + dims.iVolZ = 256; + dims.iProjAngles = 32; + dims.iProjU = 512; + dims.iProjV = 512; + dims.iRaysPerDet = 1; + + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPitchedPtr volData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&volData, extentV); + + cudaExtent extentP; + extentP.width = dims.iProjU*sizeof(float); + extentP.height = dims.iProjV; + extentP.depth = dims.iProjAngles; + + cudaPitchedPtr projData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&projData, extentP); + cudaMemset3D(projData, 0, extentP); + + float* slice = new float[256*256]; + cudaPitchedPtr ptr; + ptr.ptr = slice; + ptr.pitch = 256*sizeof(float); + ptr.xsize = 256*sizeof(float); + ptr.ysize = 256; + + for (unsigned int i = 0; i < 256*256; ++i) + slice[i] = 1.0f; + for (unsigned int i = 0; i < 256; ++i) { + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaError err = cudaMemcpy3D(&p); + assert(!err); + } + + + SConeProjection angle[32]; + angle[0].fSrcX = -1536; + angle[0].fSrcY = 0; + angle[0].fSrcZ = 200; + + angle[0].fDetSX = 512; + angle[0].fDetSY = -256; + angle[0].fDetSZ = -256; + + angle[0].fDetUX = 0; + angle[0].fDetUY = 1; + angle[0].fDetUZ = 0; + + angle[0].fDetVX = 0; + angle[0].fDetVY = 0; + angle[0].fDetVZ = 1; + +#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0) + for (int i = 1; i < 32; ++i) { + angle[i] = angle[0]; + ROTATE0(Src, i, i*1*M_PI/180); + ROTATE0(DetS, i, i*1*M_PI/180); + ROTATE0(DetU, i, i*1*M_PI/180); + ROTATE0(DetV, i, i*1*M_PI/180); + } +#undef ROTATE0 + + astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f); + + float* buf = new float[512*512]; + + cudaMemcpy(buf, ((float*)projData.ptr)+512*512*8, 512*512*sizeof(float), cudaMemcpyDeviceToHost); + + printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize); + + saveImage("proj.png", 512, 512, buf); + + +} +#endif diff --git a/cuda/3d/cone_fp.h b/cuda/3d/cone_fp.h new file mode 100644 index 0000000..2a0463b --- /dev/null +++ b/cuda/3d/cone_fp.h @@ -0,0 +1,46 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_CONE_FP_H +#define _CUDA_CONE_FP_H + +namespace astraCUDA3d { + +_AstraExport bool ConeFP_Array(cudaArray *D_volArray, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SConeProjection* angles, + float fOutputScale); + +_AstraExport bool ConeFP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SConeProjection* angles, + float fOutputScale); + +} + +#endif diff --git a/cuda/3d/darthelper3d.cu b/cuda/3d/darthelper3d.cu new file mode 100644 index 0000000..68330a1 --- /dev/null +++ b/cuda/3d/darthelper3d.cu @@ -0,0 +1,229 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include "util3d.h" +#include "dims3d.h" +#include "darthelper3d.h" +#include <cassert> + +namespace astraCUDA3d { + + + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------- + __global__ void devDartSmoothing(cudaPitchedPtr out, cudaPitchedPtr in, float b, SDimensions3D dims) + { + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > 0 && x < dims.iVolX - 1 && y > 0 && y < dims.iVolY - 1) { + + float* d = (float*)in.ptr; + float* m = (float*)out.ptr; + + unsigned int index; + unsigned int p = (out.pitch >> 2); + + for (unsigned int z = 0; z <= dims.iVolZ-1; z++) { + + float res = 0.0f; + + // bottom slice + if (z > 0) { + index = ((z-1)*dims.iVolY + y) * p + x; + res += d[index-p-1] + d[index-p] + d[index-p+1] + + d[index -1] + d[index ] + d[index +1] + + d[index+p-1] + d[index+p] + d[index+p+1]; + } + + // top slice + if (z < dims.iVolZ-1) { + index = ((z+1)*dims.iVolY + y) * p + x; + res += d[index-p-1] + d[index-p] + d[index-p+1] + + d[index -1] + d[index ] + d[index +1] + + d[index+p-1] + d[index+p] + d[index+p+1]; + } + + // same slice + index = (z*dims.iVolY + y) * p + x; + res += d[index-p-1] + d[index-p] + d[index-p+1] + + d[index -1] + d[index +1] + + d[index+p-1] + d[index+p] + d[index+p+1]; + + // result + m[index] = (1.0f-b) * d[index] + b * 0.038461538f * res; + + } + + } + } + + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------- + void dartSmoothing(float* out, const float* in, float b, unsigned int radius, SDimensions3D dims) + { + cudaPitchedPtr D_inData; + D_inData = allocateVolumeData(dims); + copyVolumeToDevice(in, D_inData, dims); + + cudaPitchedPtr D_outData; + D_outData = allocateVolumeData(dims); + copyVolumeToDevice(out, D_outData, dims); + + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+15)/16); + + devDartSmoothing<<<gridSize, blockSize>>>(D_outData, D_inData, b, dims); + + copyVolumeFromDevice(out, D_outData, dims); + + cudaFree(D_outData.ptr); + cudaFree(D_inData.ptr); + + } + + + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------- + // CUDA function for the masking of DART with a radius == 1 + __global__ void devDartMasking(cudaPitchedPtr mask, cudaPitchedPtr in, unsigned int conn, SDimensions3D dims) + { + unsigned int x = threadIdx.x + 16*blockIdx.x; + unsigned int y = threadIdx.y + 16*blockIdx.y; + + // Sacrifice the border pixels to simplify the implementation. + if (x > 0 && x < dims.iVolX - 1 && y > 0 && y < dims.iVolY - 1) { + + float* d = (float*)in.ptr; + float* m = (float*)mask.ptr; + + unsigned int index; + unsigned int p = (in.pitch >> 2); + + for (unsigned int z = 0; z <= dims.iVolZ-1; z++) { + + unsigned int o2 = (z*dims.iVolY + y) * p + x; + + m[o2] = 0.0f; + + // bottom slice + if (z > 0) { + index = ((z-1)*dims.iVolY + y) * p + x; + if ((conn == 26 && + (d[index-p-1] != d[o2] || d[index-p] != d[o2] || d[index-p+1] != d[o2] || + d[index -1] != d[o2] || d[index ] != d[o2] || d[index +1] != d[o2] || + d[index+p-1] != d[o2] || d[index+p] != d[o2] || d[index+p+1] != d[o2] )) + || + (conn == 6 && d[index] != d[o2])) + { + m[o2] = 1.0f; + continue; + } + } + + // top slice + if (z < dims.iVolZ-1) { + index = ((z+1)*dims.iVolY + y) * p + x; + if ((conn == 26 && + (d[index-p-1] != d[o2] || d[index-p] != d[o2] || d[index-p+1] != d[o2] || + d[index -1] != d[o2] || d[index ] != d[o2] || d[index +1] != d[o2] || + d[index+p-1] != d[o2] || d[index+p] != d[o2] || d[index+p+1] != d[o2] )) + || + (conn == 6 && d[index] != d[o2])) + { + m[o2] = 1.0f; + continue; + } + } + + // other slices + index = (z*dims.iVolY + y) * p + x; + if ((conn == 26 && + (d[index-p-1] != d[o2] || d[index-p] != d[o2] || d[index-p+1] != d[o2] || + d[index -1] != d[o2] || d[index +1] != d[o2] || + d[index+p-1] != d[o2] || d[index+p] != d[o2] || d[index+p+1] != d[o2] )) + || + (conn == 6 && + ( d[index-p] != d[o2] || + d[index -1] != d[o2] || d[index +1] != d[o2] || + d[index+p] != d[o2] ))) + { + m[o2] = 1.0f; + continue; + } + + } + + } + } + + + + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------- + void dartMasking(float* mask, const float* segmentation, unsigned int conn, unsigned int radius, unsigned int threshold, SDimensions3D dims) + { + cudaPitchedPtr D_maskData; + D_maskData = allocateVolumeData(dims); + copyVolumeToDevice(mask, D_maskData, dims); + + cudaPitchedPtr D_segmentationData; + D_segmentationData = allocateVolumeData(dims); + copyVolumeToDevice(segmentation, D_segmentationData, dims); + + dim3 blockSize(16,16); + dim3 gridSize((dims.iVolX+15)/16, (dims.iVolY+15)/16); + + if (threshold == 1 && radius == 1) + devDartMasking<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, dims); + //else if (threshold > 1 && radius == 1) + // devADartMask<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, threshold, pitch, width, height, 1, 1); + //else if (threshold == 1 && radius > 1) + // devDartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, pitch, width, height, 1, 1); + //else + // devADartMaskRadius<<<gridSize, blockSize>>>(D_maskData, D_segmentationData, conn, radius, threshold, pitch, width, height, 1, 1); + + copyVolumeFromDevice(mask, D_maskData, dims); + + cudaFree(D_maskData.ptr); + cudaFree(D_segmentationData.ptr); + + } + // ------------------------------------------------------------------------------------------------------------------------------------------------------------------- + + bool setGPUIndex(int iGPUIndex) + { + cudaSetDevice(iGPUIndex); + cudaError_t err = cudaGetLastError(); + + // Ignore errors caused by calling cudaSetDevice multiple times + if (err != cudaSuccess && err != cudaErrorSetOnActiveProcess) + return false; + + return true; + } + + +} diff --git a/cuda/3d/darthelper3d.h b/cuda/3d/darthelper3d.h new file mode 100644 index 0000000..7899629 --- /dev/null +++ b/cuda/3d/darthelper3d.h @@ -0,0 +1,46 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_DARTHELPER3_H +#define _CUDA_DARTHELPER3_H + +#include <cuda.h> +#include <driver_types.h> +#include "util3d.h" +#include "algo3d.h" + +namespace astraCUDA3d { + + void dartSmoothing(float* out, const float* in, float b, unsigned int radius, SDimensions3D dims); + void dartMasking(float* out, const float* in, unsigned int conn, unsigned int radius, unsigned int threshold, SDimensions3D dims); + + bool setGPUIndex(int index); + +} + +#endif diff --git a/cuda/3d/dims3d.h b/cuda/3d/dims3d.h new file mode 100644 index 0000000..ec3c4a3 --- /dev/null +++ b/cuda/3d/dims3d.h @@ -0,0 +1,84 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_CONE_DIMS_H +#define _CUDA_CONE_DIMS_H + +namespace astra { + +struct SConeProjection { + // the source + double fSrcX, fSrcY, fSrcZ; + + // the origin ("bottom left") of the (flat-panel) detector + double fDetSX, fDetSY, fDetSZ; + + // the U-edge of a detector pixel + double fDetUX, fDetUY, fDetUZ; + + // the V-edge of a detector pixel + double fDetVX, fDetVY, fDetVZ; +}; + +struct SPar3DProjection { + // the ray direction + double fRayX, fRayY, fRayZ; + + // the origin ("bottom left") of the (flat-panel) detector + double fDetSX, fDetSY, fDetSZ; + + // the U-edge of a detector pixel + double fDetUX, fDetUY, fDetUZ; + + // the V-edge of a detector pixel + double fDetVX, fDetVY, fDetVZ; +}; + +} + + +namespace astraCUDA3d { + +using astra::SConeProjection; +using astra::SPar3DProjection; + +struct SDimensions3D { + unsigned int iVolX; + unsigned int iVolY; + unsigned int iVolZ; + unsigned int iProjAngles; + unsigned int iProjU; // number of detectors in the U direction + unsigned int iProjV; // number of detectors in the V direction + unsigned int iRaysPerDetDim; + unsigned int iRaysPerVoxelDim; +}; + +} + +#endif + diff --git a/cuda/3d/fdk.cu b/cuda/3d/fdk.cu new file mode 100644 index 0000000..ad0604c --- /dev/null +++ b/cuda/3d/fdk.cu @@ -0,0 +1,646 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> +#include "util3d.h" + +#ifdef STANDALONE +#include "cone_fp.h" +#include "testutil.h" +#endif + +#include "dims3d.h" +#include "../2d/fft.h" + +typedef texture<float, 3, cudaReadModeElementType> texture3D; + +static texture3D gT_coneProjTexture; + +namespace astraCUDA3d { + +static const unsigned int g_volBlockZ = 16; + +static const unsigned int g_anglesPerBlock = 64; +static const unsigned int g_volBlockX = 32; +static const unsigned int g_volBlockY = 16; + +static const unsigned int g_anglesPerWeightBlock = 16; +static const unsigned int g_detBlockU = 32; +static const unsigned int g_detBlockV = 32; + +static const unsigned g_MaxAngles = 2048; + +__constant__ float gC_angle_sin[g_MaxAngles]; +__constant__ float gC_angle_cos[g_MaxAngles]; +__constant__ float gC_angle[g_MaxAngles]; + + +// per-detector u/v shifts? + +static bool bindProjDataTexture(const cudaArray* array) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + + gT_coneProjTexture.addressMode[0] = cudaAddressModeClamp; + gT_coneProjTexture.addressMode[1] = cudaAddressModeClamp; + gT_coneProjTexture.addressMode[2] = cudaAddressModeClamp; + gT_coneProjTexture.filterMode = cudaFilterModeLinear; + gT_coneProjTexture.normalized = false; + + cudaBindTextureToArray(gT_coneProjTexture, array, channelDesc); + + // TODO: error value? + + return true; +} + + +__global__ void devBP_FDK(void* D_volData, unsigned int volPitch, int startAngle, float fSrcOrigin, float fDetOrigin, float fSrcZ, float fDetZ, float fInvDetUSize, float fInvDetVSize, const SDimensions3D dims) +{ + float* volData = (float*)D_volData; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + + // threadIdx: x = rel x + // y = rel y + + // blockIdx: x = x + y + // y = z + + + // TO TRY: precompute part of detector intersection formulas in shared mem? + // TO TRY: inner loop over z, gather ray values in shared mem + + const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; + const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + + if (X > dims.iVolX) + return; + if (Y > dims.iVolY) + return; + + const int startZ = blockIdx.y * g_volBlockZ; + int endZ = startZ + g_volBlockZ; + if (endZ > dims.iVolZ) + endZ = dims.iVolZ; + + float fX = X - 0.5f*dims.iVolX + 0.5f; + float fY = Y - 0.5f*dims.iVolY + 0.5f; + float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - fSrcZ; + + const float fU_base = 0.5f*dims.iProjU - 0.5f + 1.5f; + const float fV_base = 0.5f*dims.iProjV - 0.5f + 1.5f + (fDetZ-fSrcZ); + + // Note re. fZ/rV_base: the computations below are all relative to the + // optical axis, so we do the Z-adjustments beforehand. + + for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f) + { + + float fVal = 0.0f; + float fAngle = startAngle + 0.5f; + + for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) + { + + const float cos_theta = gC_angle_cos[angle]; + const float sin_theta = gC_angle_sin[angle]; + + const float fR = fSrcOrigin; + const float fD = fR - fX * sin_theta + fY * cos_theta; + float fWeight = fR / fD; + fWeight *= fWeight; + + const float fScaleFactor = (fR + fDetOrigin) / fD; + const float fU = fU_base + (fX*cos_theta+fY*sin_theta) * fScaleFactor * fInvDetUSize; + const float fV = fV_base + fZ * fScaleFactor * fInvDetVSize; + + fVal += tex3D(gT_coneProjTexture, fU, fAngle, fV); + + } + + volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal; +// projData[(angle*dims.iProjV+detectorV)*projPitch+detectorU] = 10.0f; +// if (threadIdx.x == 0 && threadIdx.y == 0) { printf("%d,%d,%d [%d / %d] -> %f\n", angle, detectorU, detectorV, (angle*dims.iProjV+detectorV)*projPitch+detectorU, projPitch, projData[(angle*dims.iProjV+detectorV)*projPitch+detectorU]); } + } + +} + + +bool FDK_BP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + float fSrcOrigin, float fDetOrigin, + float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize, + const SDimensions3D& dims, const float* angles) +{ + // transfer projections to array + + cudaArray* cuArray = allocateProjectionArray(dims); + transferProjectionsToArray(D_projData, cuArray, dims); + + bindProjDataTexture(cuArray); + + float* angle_sin = new float[dims.iProjAngles]; + float* angle_cos = new float[dims.iProjAngles]; + + for (unsigned int i = 0; i < dims.iProjAngles; ++i) { + angle_sin[i] = sinf(angles[i]); + angle_cos[i] = cosf(angles[i]); + } + cudaError_t e1 = cudaMemcpyToSymbol(gC_angle_sin, angle_sin, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + cudaError_t e2 = cudaMemcpyToSymbol(gC_angle_cos, angle_cos, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + assert(e1 == cudaSuccess); + assert(e2 == cudaSuccess); + + delete[] angle_sin; + delete[] angle_cos; + + dim3 dimBlock(g_volBlockX, g_volBlockY); + + dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ); + + // timeval t; + // tic(t); + + for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) { + devBP_FDK<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, 1.0f / fDetUSize, 1.0f / fDetVSize, dims); + } + + cudaTextForceKernelsCompletion(); + + cudaFreeArray(cuArray); + + // printf("%f\n", toc(t)); + + return true; +} + +__global__ void devFDK_preweight(void* D_projData, unsigned int projPitch, unsigned int startAngle, unsigned int endAngle, float fSrcOrigin, float fDetOrigin, float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize, const SDimensions3D dims) +{ + float* projData = (float*)D_projData; + int angle = startAngle + blockIdx.y * g_anglesPerWeightBlock + threadIdx.y; + if (angle >= endAngle) + return; + + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; + int endDetectorV = startDetectorV + g_detBlockV; + if (endDetectorV > dims.iProjV) + endDetectorV = dims.iProjV; + + // We need the length of the central ray and the length of the ray(s) to + // our detector pixel(s). + + const float fCentralRayLength = fSrcOrigin + fDetOrigin; + + const float fU = (detectorU - 0.5f*dims.iProjU + 0.5f) * fDetUSize; + + const float fT = fCentralRayLength * fCentralRayLength + fU * fU; + + float fV = (startDetectorV - 0.5f*dims.iProjV + 0.5f) * fDetVSize + fDetZ - fSrcZ; + + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) + { + const float fRayLength = sqrtf(fT + fV * fV); + + const float fWeight = fCentralRayLength / fRayLength; + + projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] *= fWeight; + + fV += 1.0f; + } +} + +__global__ void devFDK_ParkerWeight(void* D_projData, unsigned int projPitch, unsigned int startAngle, unsigned int endAngle, float fSrcOrigin, float fDetOrigin, float fSrcZ, float fDetZ, float fDetUSize, float fCentralFanAngle, const SDimensions3D dims) +{ + float* projData = (float*)D_projData; + int angle = startAngle + blockIdx.y * g_anglesPerWeightBlock + threadIdx.y; + if (angle >= endAngle) + return; + + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; + int endDetectorV = startDetectorV + g_detBlockV; + if (endDetectorV > dims.iProjV) + endDetectorV = dims.iProjV; + + // We need the length of the central ray and the length of the projection + // of our ray onto the central slice + + const float fCentralRayLength = fSrcOrigin + fDetOrigin; + + // TODO: Detector pixel size + const float fU = (detectorU - 0.5f*dims.iProjU + 0.5f) * fDetUSize; + + //const float fGamma = atanf(fU / fCentralRayLength); + //const float fBeta = gC_angle[angle]; + const float fGamma = atanf(fU / fCentralRayLength); + const float fBeta = -gC_angle[angle]; + + // compute the weight depending on the location in the central fan's radon + // space + float fWeight; + + if (fBeta <= 0.0f) { + fWeight = 0.0f; + } else if (fBeta <= 2.0f*(fCentralFanAngle + fGamma)) { + fWeight = sinf((M_PI / 4.0f) * fBeta / (fCentralFanAngle + fGamma)); + fWeight *= fWeight; + } else if (fBeta <= M_PI + 2*fGamma) { + fWeight = 1.0f; + } else if (fBeta <= M_PI + 2*fCentralFanAngle) { + fWeight = sinf((M_PI / 4.0f) * (M_PI + 2.0f*fCentralFanAngle - fBeta) / (fCentralFanAngle - fGamma)); + fWeight *= fWeight; + } else { + fWeight = 0.0f; + } + + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) + { + + projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] *= fWeight; + + } +} + + + +// Perform the FDK pre-weighting and filtering +bool FDK_Filter(cudaPitchedPtr D_projData, + cufftComplex * D_filter, + float fSrcOrigin, float fDetOrigin, + float fSrcZ, float fDetZ, + float fDetUSize, float fDetVSize, bool bShortScan, + const SDimensions3D& dims, const float* angles) +{ + // The pre-weighting factor for a ray is the cosine of the angle between + // the central line and the ray. + + dim3 dimBlock(g_detBlockU, g_anglesPerWeightBlock); + dim3 dimGrid( ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), + (dims.iProjAngles+g_anglesPerWeightBlock-1)/g_anglesPerWeightBlock); + + int projPitch = D_projData.pitch/sizeof(float); + + devFDK_preweight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, fDetUSize, fDetVSize, dims); + + cudaTextForceKernelsCompletion(); + + if (bShortScan) { + // We do short-scan Parker weighting + + cudaError_t e1 = cudaMemcpyToSymbol(gC_angle, angles, + dims.iProjAngles*sizeof(float), 0, + cudaMemcpyHostToDevice); + assert(!e1); + + // TODO: detector pixel size! + float fCentralFanAngle = atanf((dims.iProjU*0.5f) / + (fSrcOrigin + fDetOrigin)); + + devFDK_ParkerWeight<<<dimGrid, dimBlock>>>(D_projData.ptr, projPitch, 0, dims.iProjAngles, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, fDetUSize, fCentralFanAngle, dims); + + } + + cudaTextForceKernelsCompletion(); + + + // The filtering is a regular ramp filter per detector line. + + int iPaddedDetCount = calcNextPowerOfTwo(2 * dims.iProjU); + int iHalfFFTSize = calcFFTFourSize(iPaddedDetCount); + + + + // We process one sinogram at a time. + float* D_sinoData = (float*)D_projData.ptr; + + cufftComplex * D_sinoFFT = NULL; + allocateComplexOnDevice(dims.iProjAngles, iHalfFFTSize, &D_sinoFFT); + + bool ok = true; + + for (int v = 0; v < dims.iProjV; ++v) { + + ok = runCudaFFT(dims.iProjAngles, D_sinoData, projPitch, 0, + dims.iProjU, iPaddedDetCount, iHalfFFTSize, + D_sinoFFT); + + if (!ok) break; + + applyFilter(dims.iProjAngles, iHalfFFTSize, D_sinoFFT, D_filter); + + + ok = runCudaIFFT(dims.iProjAngles, D_sinoFFT, D_sinoData, projPitch, + 0, dims.iProjU, iPaddedDetCount, iHalfFFTSize); + + if (!ok) break; + + D_sinoData += (dims.iProjAngles * projPitch); + } + + freeComplexOnDevice(D_sinoFFT); + + return ok; +} + + +bool FDK(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + float fSrcOrigin, float fDetOrigin, + float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize, + const SDimensions3D& dims, const float* angles, bool bShortScan) +{ + bool ok; + // Generate filter + // TODO: Check errors + cufftComplex * D_filter; + int iPaddedDetCount = calcNextPowerOfTwo(2 * dims.iProjU); + int iHalfFFTSize = calcFFTFourSize(iPaddedDetCount); + + cufftComplex *pHostFilter = new cufftComplex[dims.iProjAngles * iHalfFFTSize]; + memset(pHostFilter, 0, sizeof(cufftComplex) * dims.iProjAngles * iHalfFFTSize); + + genFilter(FILTER_RAMLAK, 1.0f, dims.iProjAngles, pHostFilter, iPaddedDetCount, iHalfFFTSize); + + allocateComplexOnDevice(dims.iProjAngles, iHalfFFTSize, &D_filter); + uploadComplexArrayToDevice(dims.iProjAngles, iHalfFFTSize, pHostFilter, D_filter); + + delete [] pHostFilter; + + + // Perform filtering + + ok = FDK_Filter(D_projData, D_filter, fSrcOrigin, fDetOrigin, + fSrcZ, fDetZ, fDetUSize, fDetVSize, + bShortScan, dims, angles); + + // Clean up filter + freeComplexOnDevice(D_filter); + + + if (!ok) + return false; + + // Perform BP + + ok = FDK_BP(D_volumeData, D_projData, fSrcOrigin, fDetOrigin, fSrcZ, fDetZ, + fDetUSize, fDetVSize, dims, angles); + + if (!ok) + return false; + + return true; +} + + +} + +#ifdef STANDALONE +void dumpVolume(const char* filespec, const cudaPitchedPtr& data, const SDimensions3D& dims, float fMin, float fMax) +{ + float* buf = new float[dims.iVolX*dims.iVolY]; + unsigned int pitch = data.pitch / sizeof(float); + + for (int i = 0; i < dims.iVolZ; ++i) { + cudaMemcpy2D(buf, dims.iVolX*sizeof(float), ((float*)data.ptr)+pitch*dims.iVolY*i, data.pitch, dims.iVolX*sizeof(float), dims.iVolY, cudaMemcpyDeviceToHost); + + char fname[512]; + sprintf(fname, filespec, dims.iVolZ-i-1); + saveImage(fname, dims.iVolY, dims.iVolX, buf, fMin, fMax); + } +} + +void dumpSinograms(const char* filespec, const cudaPitchedPtr& data, const SDimensions3D& dims, float fMin, float fMax) +{ + float* bufs = new float[dims.iProjAngles*dims.iProjU]; + unsigned int pitch = data.pitch / sizeof(float); + + for (int i = 0; i < dims.iProjV; ++i) { + cudaMemcpy2D(bufs, dims.iProjU*sizeof(float), ((float*)data.ptr)+pitch*dims.iProjAngles*i, data.pitch, dims.iProjU*sizeof(float), dims.iProjAngles, cudaMemcpyDeviceToHost); + + char fname[512]; + sprintf(fname, filespec, i); + saveImage(fname, dims.iProjAngles, dims.iProjU, bufs, fMin, fMax); + } +} + +void dumpProjections(const char* filespec, const cudaPitchedPtr& data, const SDimensions3D& dims, float fMin, float fMax) +{ + float* bufp = new float[dims.iProjV*dims.iProjU]; + unsigned int pitch = data.pitch / sizeof(float); + + for (int i = 0; i < dims.iProjAngles; ++i) { + for (int j = 0; j < dims.iProjV; ++j) { + cudaMemcpy(bufp+dims.iProjU*j, ((float*)data.ptr)+pitch*dims.iProjAngles*j+pitch*i, dims.iProjU*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[512]; + sprintf(fname, filespec, i); + saveImage(fname, dims.iProjV, dims.iProjU, bufp, fMin, fMax); + } +} + + + + +int main() +{ +#if 0 + SDimensions3D dims; + dims.iVolX = 512; + dims.iVolY = 512; + dims.iVolZ = 512; + dims.iProjAngles = 180; + dims.iProjU = 1024; + dims.iProjV = 1024; + dims.iRaysPerDet = 1; + + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPitchedPtr volData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&volData, extentV); + + cudaExtent extentP; + extentP.width = dims.iProjU*sizeof(float); + extentP.height = dims.iProjAngles; + extentP.depth = dims.iProjV; + + cudaPitchedPtr projData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&projData, extentP); + cudaMemset3D(projData, 0, extentP); + +#if 0 + float* slice = new float[256*256]; + cudaPitchedPtr ptr; + ptr.ptr = slice; + ptr.pitch = 256*sizeof(float); + ptr.xsize = 256*sizeof(float); + ptr.ysize = 256; + + for (unsigned int i = 0; i < 256*256; ++i) + slice[i] = 1.0f; + for (unsigned int i = 0; i < 256; ++i) { + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaMemcpy3D(&p); +#if 0 + if (i == 128) { + for (unsigned int j = 0; j < 256*256; ++j) + slice[j] = 0.0f; + } +#endif + } +#endif + + SConeProjection angle[180]; + angle[0].fSrcX = -1536; + angle[0].fSrcY = 0; + angle[0].fSrcZ = 0; + + angle[0].fDetSX = 1024; + angle[0].fDetSY = -512; + angle[0].fDetSZ = 512; + + angle[0].fDetUX = 0; + angle[0].fDetUY = 1; + angle[0].fDetUZ = 0; + + angle[0].fDetVX = 0; + angle[0].fDetVY = 0; + angle[0].fDetVZ = -1; + +#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0) + for (int i = 1; i < 180; ++i) { + angle[i] = angle[0]; + ROTATE0(Src, i, i*2*M_PI/180); + ROTATE0(DetS, i, i*2*M_PI/180); + ROTATE0(DetU, i, i*2*M_PI/180); + ROTATE0(DetV, i, i*2*M_PI/180); + } +#undef ROTATE0 + + astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f); + + //dumpSinograms("sino%03d.png", projData, dims, 0, 512); + //dumpProjections("proj%03d.png", projData, dims, 0, 512); + + astraCUDA3d::zeroVolumeData(volData, dims); + + float* angles = new float[dims.iProjAngles]; + for (int i = 0; i < 180; ++i) + angles[i] = i*2*M_PI/180; + + astraCUDA3d::FDK(volData, projData, 1536, 512, 0, 0, dims, angles); + + dumpVolume("vol%03d.png", volData, dims, -20, 100); + + +#else + + SDimensions3D dims; + dims.iVolX = 1000; + dims.iVolY = 999; + dims.iVolZ = 500; + dims.iProjAngles = 376; + dims.iProjU = 1024; + dims.iProjV = 524; + dims.iRaysPerDet = 1; + + float* angles = new float[dims.iProjAngles]; + for (int i = 0; i < dims.iProjAngles; ++i) + angles[i] = -i*(M_PI)/360; + + cudaPitchedPtr volData = astraCUDA3d::allocateVolumeData(dims); + cudaPitchedPtr projData = astraCUDA3d::allocateProjectionData(dims); + astraCUDA3d::zeroProjectionData(projData, dims); + astraCUDA3d::zeroVolumeData(volData, dims); + + timeval t; + tic(t); + + for (int i = 0; i < dims.iProjAngles; ++i) { + char fname[256]; + sprintf(fname, "/home/wpalenst/tmp/Elke/proj%04d.png", i); + unsigned int w,h; + float* bufp = loadImage(fname, w,h); + + int pitch = projData.pitch / sizeof(float); + for (int j = 0; j < dims.iProjV; ++j) { + cudaMemcpy(((float*)projData.ptr)+dims.iProjAngles*pitch*j+pitch*i, bufp+dims.iProjU*j, dims.iProjU*sizeof(float), cudaMemcpyHostToDevice); + } + + delete[] bufp; + } + printf("Load time: %f\n", toc(t)); + + //dumpSinograms("sino%03d.png", projData, dims, -8.0f, 256.0f); + //astraCUDA3d::FDK(volData, projData, 7350, 62355, 0, 10, dims, angles); + //astraCUDA3d::FDK(volData, projData, 7350, -380, 0, 10, dims, angles); + + tic(t); + + astraCUDA3d::FDK(volData, projData, 7383.29867, 0, 0, 10, dims, angles); + + printf("FDK time: %f\n", toc(t)); + tic(t); + + dumpVolume("vol%03d.png", volData, dims, -65.9f, 200.0f); + //dumpVolume("vol%03d.png", volData, dims, 0.0f, 256.0f); + printf("Save time: %f\n", toc(t)); + +#endif + + +} +#endif diff --git a/cuda/3d/fdk.h b/cuda/3d/fdk.h new file mode 100644 index 0000000..5443b19 --- /dev/null +++ b/cuda/3d/fdk.h @@ -0,0 +1,43 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_FDK_H +#define _CUDA_FDK_H + +namespace astraCUDA3d { + +bool FDK(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + float fSrcOrigin, float fDetOrigin, + float fSrcZ, float fDetZ, float fDetUSize, float fDetVSize, + const SDimensions3D& dims, const float* angles, bool bShortScan); + + +} + +#endif diff --git a/cuda/3d/par3d_bp.cu b/cuda/3d/par3d_bp.cu new file mode 100644 index 0000000..872b1eb --- /dev/null +++ b/cuda/3d/par3d_bp.cu @@ -0,0 +1,464 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> +#include "util3d.h" + +#ifdef STANDALONE +#include "par3d_fp.h" +#include "testutil.h" +#endif + +#include "dims3d.h" + +typedef texture<float, 3, cudaReadModeElementType> texture3D; + +static texture3D gT_par3DProjTexture; + +namespace astraCUDA3d { + +static const unsigned int g_volBlockZ = 16; + +static const unsigned int g_anglesPerBlock = 64; +static const unsigned int g_volBlockX = 32; +static const unsigned int g_volBlockY = 16; + +static const unsigned g_MaxAngles = 1024; + +__constant__ float gC_Cux[g_MaxAngles]; +__constant__ float gC_Cuy[g_MaxAngles]; +__constant__ float gC_Cuz[g_MaxAngles]; +__constant__ float gC_Cuc[g_MaxAngles]; +__constant__ float gC_Cvx[g_MaxAngles]; +__constant__ float gC_Cvy[g_MaxAngles]; +__constant__ float gC_Cvz[g_MaxAngles]; +__constant__ float gC_Cvc[g_MaxAngles]; + + +static bool bindProjDataTexture(const cudaArray* array) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + + gT_par3DProjTexture.addressMode[0] = cudaAddressModeClamp; + gT_par3DProjTexture.addressMode[1] = cudaAddressModeClamp; + gT_par3DProjTexture.addressMode[2] = cudaAddressModeClamp; + gT_par3DProjTexture.filterMode = cudaFilterModeLinear; + gT_par3DProjTexture.normalized = false; + + cudaBindTextureToArray(gT_par3DProjTexture, array, channelDesc); + + // TODO: error value? + + return true; +} + + +__global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims) +{ + float* volData = (float*)D_volData; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + + // threadIdx: x = rel x + // y = rel y + + // blockIdx: x = x + y + // y = z + + + // TO TRY: precompute part of detector intersection formulas in shared mem? + // TO TRY: inner loop over z, gather ray values in shared mem + + const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; + const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + + if (X >= dims.iVolX) + return; + if (Y >= dims.iVolY) + return; + + const int startZ = blockIdx.y * g_volBlockZ; + int endZ = startZ + g_volBlockZ; + if (endZ > dims.iVolZ) + endZ = dims.iVolZ; + + float fX = X - 0.5f*dims.iVolX + 0.5f; + float fY = Y - 0.5f*dims.iVolY + 0.5f; + float fZ = startZ - 0.5f*dims.iVolZ + 0.5f; + + for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f) + { + + float fVal = 0.0f; + float fAngle = startAngle + 0.5f; + + for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) + { + + const float fCux = gC_Cux[angle]; + const float fCuy = gC_Cuy[angle]; + const float fCuz = gC_Cuz[angle]; + const float fCuc = gC_Cuc[angle]; + const float fCvx = gC_Cvx[angle]; + const float fCvy = gC_Cvy[angle]; + const float fCvz = gC_Cvz[angle]; + const float fCvc = gC_Cvc[angle]; + + const float fUNum = fCuc + fX * fCux + fY * fCuy + fZ * fCuz; + const float fVNum = fCvc + fX * fCvx + fY * fCvy + fZ * fCvz; + + const float fU = fUNum + 1.0f; + const float fV = fVNum + 1.0f; + + fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order + + } + + volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal; + } + +} + +// supersampling version +__global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, int startAngle, const SDimensions3D dims) +{ + float* volData = (float*)D_volData; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles) + endAngle = dims.iProjAngles; + + // threadIdx: x = rel x + // y = rel y + + // blockIdx: x = x + y + // y = z + + + // TO TRY: precompute part of detector intersection formulas in shared mem? + // TO TRY: inner loop over z, gather ray values in shared mem + + const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; + const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + + if (X >= dims.iVolX) + return; + if (Y >= dims.iVolY) + return; + + const int startZ = blockIdx.y * g_volBlockZ; + int endZ = startZ + g_volBlockZ; + if (endZ > dims.iVolZ) + endZ = dims.iVolZ; + + float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim; + float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim; + float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/dims.iRaysPerVoxelDim; + + const float fSubStep = 1.0f/dims.iRaysPerVoxelDim; + + for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f) + { + + float fVal = 0.0f; + float fAngle = startAngle + 0.5f; + + for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) + { + const float fCux = gC_Cux[angle]; + const float fCuy = gC_Cuy[angle]; + const float fCuz = gC_Cuz[angle]; + const float fCuc = gC_Cuc[angle]; + const float fCvx = gC_Cvx[angle]; + const float fCvy = gC_Cvy[angle]; + const float fCvz = gC_Cvz[angle]; + const float fCvc = gC_Cvc[angle]; + + float fXs = fX; + for (int iSubX = 0; iSubX < dims.iRaysPerVoxelDim; ++iSubX) { + float fYs = fY; + for (int iSubY = 0; iSubY < dims.iRaysPerVoxelDim; ++iSubY) { + float fZs = fZ; + for (int iSubZ = 0; iSubZ < dims.iRaysPerVoxelDim; ++iSubZ) { + + const float fUNum = fCuc + fXs * fCux + fYs * fCuy + fZs * fCuz; + const float fVNum = fCvc + fXs * fCvx + fYs * fCvy + fZs * fCvz; + + const float fU = fUNum + 1.0f; + const float fV = fVNum + 1.0f; + + fVal += tex3D(gT_par3DProjTexture, fU, fAngle, fV); // TODO: check order + fZs += fSubStep; + } + fYs += fSubStep; + } + fXs += fSubStep; + } + + } + + volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal / (dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim*dims.iRaysPerVoxelDim); + } + +} + +bool Par3DBP_Array(cudaPitchedPtr D_volumeData, + cudaArray *D_projArray, + const SDimensions3D& dims, const SPar3DProjection* angles) +{ + bindProjDataTexture(D_projArray); + + + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(expr,name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = (expr) ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + +#define DENOM (angles[i].fRayX*angles[i].fDetUY*angles[i].fDetVZ - angles[i].fRayX*angles[i].fDetUZ*angles[i].fDetVY - angles[i].fRayY*angles[i].fDetUX*angles[i].fDetVZ + angles[i].fRayY*angles[i].fDetUZ*angles[i].fDetVX + angles[i].fRayZ*angles[i].fDetUX*angles[i].fDetVY - angles[i].fRayZ*angles[i].fDetUY*angles[i].fDetVX) + + + TRANSFER_TO_CONSTANT( ( - (angles[i].fRayY*angles[i].fDetVZ - angles[i].fRayZ*angles[i].fDetVY)) / DENOM , Cux ); + TRANSFER_TO_CONSTANT( ( (angles[i].fRayX*angles[i].fDetVZ - angles[i].fRayZ*angles[i].fDetVX)) / DENOM , Cuy ); + TRANSFER_TO_CONSTANT( (- (angles[i].fRayX*angles[i].fDetVY - angles[i].fRayY*angles[i].fDetVX) ) / DENOM , Cuz ); + TRANSFER_TO_CONSTANT( (-(angles[i].fDetSY*angles[i].fDetVZ - angles[i].fDetSZ*angles[i].fDetVY)*angles[i].fRayX + (angles[i].fRayY*angles[i].fDetVZ - angles[i].fRayZ*angles[i].fDetVY)*angles[i].fDetSX - (angles[i].fRayY*angles[i].fDetSZ - angles[i].fRayZ*angles[i].fDetSY)*angles[i].fDetVX) / DENOM , Cuc ); + + TRANSFER_TO_CONSTANT( ((angles[i].fRayY*angles[i].fDetUZ - angles[i].fRayZ*angles[i].fDetUY) ) / DENOM , Cvx ); + TRANSFER_TO_CONSTANT( (- (angles[i].fRayX*angles[i].fDetUZ - angles[i].fRayZ*angles[i].fDetUX) ) / DENOM , Cvy ); + TRANSFER_TO_CONSTANT( ((angles[i].fRayX*angles[i].fDetUY - angles[i].fRayY*angles[i].fDetUX) ) / DENOM , Cvz ); + TRANSFER_TO_CONSTANT( ((angles[i].fDetSY*angles[i].fDetUZ - angles[i].fDetSZ*angles[i].fDetUY)*angles[i].fRayX - (angles[i].fRayY*angles[i].fDetUZ - angles[i].fRayZ*angles[i].fDetUY)*angles[i].fDetSX + (angles[i].fRayY*angles[i].fDetSZ - angles[i].fRayZ*angles[i].fDetSY)*angles[i].fDetUX ) / DENOM , Cvc ); + +#undef TRANSFER_TO_CONSTANT +#undef DENOM + + delete[] tmp; + + dim3 dimBlock(g_volBlockX, g_volBlockY); + + dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ); + + // timeval t; + // tic(t); + + for (unsigned int i = 0; i < dims.iProjAngles; i += g_anglesPerBlock) { + // printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr); + if (dims.iRaysPerVoxelDim == 1) + dev_par3D_BP<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims); + else + dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), i, dims); + } + + cudaTextForceKernelsCompletion(); + + // printf("%f\n", toc(t)); + + return true; +} + +bool Par3DBP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles) +{ + // transfer projections to array + + cudaArray* cuArray = allocateProjectionArray(dims); + transferProjectionsToArray(D_projData, cuArray, dims); + + bool ret = Par3DBP_Array(D_volumeData, cuArray, dims, angles); + + cudaFreeArray(cuArray); + + return ret; +} + + +} + +#ifdef STANDALONE +int main() +{ + SDimensions3D dims; + dims.iVolX = 256; + dims.iVolY = 256; + dims.iVolZ = 256; + dims.iProjAngles = 180; + dims.iProjU = 512; + dims.iProjV = 512; + dims.iRaysPerDet = 1; + + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPitchedPtr volData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&volData, extentV); + + cudaExtent extentP; + extentP.width = dims.iProjU*sizeof(float); + extentP.height = dims.iProjAngles; + extentP.depth = dims.iProjV; + + cudaPitchedPtr projData; // pitch, ptr, xsize, ysize + + cudaMalloc3D(&projData, extentP); + cudaMemset3D(projData, 0, extentP); + + float* slice = new float[256*256]; + cudaPitchedPtr ptr; + ptr.ptr = slice; + ptr.pitch = 256*sizeof(float); + ptr.xsize = 256*sizeof(float); + ptr.ysize = 256; + + for (unsigned int i = 0; i < 256*256; ++i) + slice[i] = 1.0f; + for (unsigned int i = 0; i < 256; ++i) { + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaMemcpy3D(&p); +#if 0 + if (i == 128) { + for (unsigned int j = 0; j < 256*256; ++j) + slice[j] = 0.0f; + } +#endif + } + + + SPar3DProjection angle[180]; + angle[0].fRayX = 1; + angle[0].fRayY = 0; + angle[0].fRayZ = 0; + + angle[0].fDetSX = 512; + angle[0].fDetSY = -256; + angle[0].fDetSZ = -256; + + angle[0].fDetUX = 0; + angle[0].fDetUY = 1; + angle[0].fDetUZ = 0; + + angle[0].fDetVX = 0; + angle[0].fDetVY = 0; + angle[0].fDetVZ = 1; + +#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0) + for (int i = 1; i < 180; ++i) { + angle[i] = angle[0]; + ROTATE0(Ray, i, i*2*M_PI/180); + ROTATE0(DetS, i, i*2*M_PI/180); + ROTATE0(DetU, i, i*2*M_PI/180); + ROTATE0(DetV, i, i*2*M_PI/180); + } +#undef ROTATE0 + + astraCUDA3d::Par3DFP(volData, projData, dims, angle, 1.0f); +#if 1 + float* bufs = new float[180*512]; + + for (int i = 0; i < 512; ++i) { + cudaMemcpy(bufs, ((float*)projData.ptr)+180*512*i, 180*512*sizeof(float), cudaMemcpyDeviceToHost); + + printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize); + + char fname[20]; + sprintf(fname, "sino%03d.png", i); + saveImage(fname, 180, 512, bufs, 0, 512); + } + + float* bufp = new float[512*512]; + + for (int i = 0; i < 180; ++i) { + for (int j = 0; j < 512; ++j) { + cudaMemcpy(bufp+512*j, ((float*)projData.ptr)+180*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[20]; + sprintf(fname, "proj%03d.png", i); + saveImage(fname, 512, 512, bufp, 0, 512); + } +#endif + for (unsigned int i = 0; i < 256*256; ++i) + slice[i] = 0.0f; + for (unsigned int i = 0; i < 256; ++i) { + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaMemcpy3D(&p); + } + + astraCUDA3d::Par3DBP(volData, projData, dims, angle); +#if 1 + float* buf = new float[256*256]; + + for (int i = 0; i < 256; ++i) { + cudaMemcpy(buf, ((float*)volData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost); + + printf("%d %d %d\n", volData.pitch, volData.xsize, volData.ysize); + + char fname[20]; + sprintf(fname, "vol%03d.png", i); + saveImage(fname, 256, 256, buf, 0, 60000); + } +#endif + +} +#endif diff --git a/cuda/3d/par3d_bp.h b/cuda/3d/par3d_bp.h new file mode 100644 index 0000000..399a3cb --- /dev/null +++ b/cuda/3d/par3d_bp.h @@ -0,0 +1,45 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_PAR3D_BP_H +#define _CUDA_PAR3D_BP_H + +namespace astraCUDA3d { + +_AstraExport bool Par3DBP_Array(cudaPitchedPtr D_volumeData, + cudaArray *D_projArray, + const SDimensions3D& dims, const SPar3DProjection* angles); + +_AstraExport bool Par3DBP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles); + + +} + +#endif diff --git a/cuda/3d/par3d_fp.cu b/cuda/3d/par3d_fp.cu new file mode 100644 index 0000000..6bf9037 --- /dev/null +++ b/cuda/3d/par3d_fp.cu @@ -0,0 +1,814 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> +#include "util3d.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +#include "dims3d.h" + +typedef texture<float, 3, cudaReadModeElementType> texture3D; + +static texture3D gT_par3DVolumeTexture; + +namespace astraCUDA3d { + +static const unsigned int g_anglesPerBlock = 4; + +// thickness of the slices we're splitting the volume up into +static const unsigned int g_blockSlices = 64; +static const unsigned int g_detBlockU = 32; +static const unsigned int g_detBlockV = 32; + +static const unsigned g_MaxAngles = 1024; +__constant__ float gC_RayX[g_MaxAngles]; +__constant__ float gC_RayY[g_MaxAngles]; +__constant__ float gC_RayZ[g_MaxAngles]; +__constant__ float gC_DetSX[g_MaxAngles]; +__constant__ float gC_DetSY[g_MaxAngles]; +__constant__ float gC_DetSZ[g_MaxAngles]; +__constant__ float gC_DetUX[g_MaxAngles]; +__constant__ float gC_DetUY[g_MaxAngles]; +__constant__ float gC_DetUZ[g_MaxAngles]; +__constant__ float gC_DetVX[g_MaxAngles]; +__constant__ float gC_DetVY[g_MaxAngles]; +__constant__ float gC_DetVZ[g_MaxAngles]; + + +static bool bindVolumeDataTexture(const cudaArray* array) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + + gT_par3DVolumeTexture.addressMode[0] = cudaAddressModeClamp; + gT_par3DVolumeTexture.addressMode[1] = cudaAddressModeClamp; + gT_par3DVolumeTexture.addressMode[2] = cudaAddressModeClamp; + gT_par3DVolumeTexture.filterMode = cudaFilterModeLinear; + gT_par3DVolumeTexture.normalized = false; + + cudaBindTextureToArray(gT_par3DVolumeTexture, array, channelDesc); + + // TODO: error value? + + return true; +} + + + +// threadIdx: x = u detector +// y = relative angle +// blockIdx: x = u/v detector +// y = angle block + +#define PAR3D_FP_BODY(c0,c1,c2) \ + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \ + if (angle >= endAngle) \ + return; \ + \ + const float fRayX = gC_RayX[angle]; \ + const float fRayY = gC_RayY[angle]; \ + const float fRayZ = gC_RayZ[angle]; \ + const float fDetUX = gC_DetUX[angle]; \ + const float fDetUY = gC_DetUY[angle]; \ + const float fDetUZ = gC_DetUZ[angle]; \ + const float fDetVX = gC_DetVX[angle]; \ + const float fDetVY = gC_DetVY[angle]; \ + const float fDetVZ = gC_DetVZ[angle]; \ + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \ + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \ + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \ + \ + \ + \ + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \ + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \ + int endDetectorV = startDetectorV + g_detBlockV; \ + if (endDetectorV > dims.iProjV) \ + endDetectorV = dims.iProjV; \ + \ + int endSlice = startSlice + g_blockSlices; \ + if (endSlice > dims.iVol##c0) \ + endSlice = dims.iVol##c0; \ + \ + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \ + { \ + /* Trace ray in direction Ray to (detectorU,detectorV) from */ \ + /* X = startSlice to X = endSlice */ \ + \ + const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; \ + const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; \ + const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; \ + \ + /* (x) ( 1) ( 0) */ \ + /* ray: (y) = (ay) * x + (by) */ \ + /* (z) (az) (bz) */ \ + \ + const float a##c1 = fRay##c1 / fRay##c0; \ + const float a##c2 = fRay##c2 / fRay##c0; \ + const float b##c1 = fDet##c1 - a##c1 * fDet##c0; \ + const float b##c2 = fDet##c2 - a##c2 * fDet##c0; \ + \ + const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \ + \ + float fVal = 0.0f; \ + \ + float f##c0 = startSlice + 1.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\ + \ + for (int s = startSlice; s < endSlice; ++s) \ + { \ + fVal += tex3D(gT_par3DVolumeTexture, fX, fY, fZ); \ + f##c0 += 1.0f; \ + f##c1 += a##c1; \ + f##c2 += a##c2; \ + } \ + \ + fVal *= fDistCorr; \ + \ + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; \ + } + + + +// Supersampling version +#define PAR3D_FP_SS_BODY(c0,c1,c2) \ + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \ + if (angle >= endAngle) \ + return; \ + \ + const float fRayX = gC_RayX[angle]; \ + const float fRayY = gC_RayY[angle]; \ + const float fRayZ = gC_RayZ[angle]; \ + const float fDetUX = gC_DetUX[angle]; \ + const float fDetUY = gC_DetUY[angle]; \ + const float fDetUZ = gC_DetUZ[angle]; \ + const float fDetVX = gC_DetVX[angle]; \ + const float fDetVY = gC_DetVY[angle]; \ + const float fDetVZ = gC_DetVZ[angle]; \ + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \ + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \ + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \ + \ + \ + \ + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \ + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \ + int endDetectorV = startDetectorV + g_detBlockV; \ + if (endDetectorV > dims.iProjV) \ + endDetectorV = dims.iProjV; \ + \ + int endSlice = startSlice + g_blockSlices; \ + if (endSlice > dims.iVol##c0) \ + endSlice = dims.iVol##c0; \ + \ + const float fSubStep = 1.0f/dims.iRaysPerDetDim; \ + \ + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \ + { \ + \ + float fV = 0.0f; \ + \ + float fdU = detectorU - 0.5f + 0.5f*fSubStep; \ + for (int iSubU = 0; iSubU < dims.iRaysPerDetDim; ++iSubU, fdU+=fSubStep) { \ + float fdV = detectorV - 0.5f + 0.5f*fSubStep; \ + for (int iSubV = 0; iSubV < dims.iRaysPerDetDim; ++iSubV, fdV+=fSubStep) { \ + \ + /* Trace ray in direction Ray to (detectorU,detectorV) from */ \ + /* X = startSlice to X = endSlice */ \ + \ + const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX; \ + const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY; \ + const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ; \ + \ + /* (x) ( 1) ( 0) */ \ + /* ray: (y) = (ay) * x + (by) */ \ + /* (z) (az) (bz) */ \ + \ + const float a##c1 = fRay##c1 / fRay##c0; \ + const float a##c2 = fRay##c2 / fRay##c0; \ + const float b##c1 = fDet##c1 - a##c1 * fDet##c0; \ + const float b##c2 = fDet##c2 - a##c2 * fDet##c0; \ + \ + const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \ + \ + float fVal = 0.0f; \ + \ + float f##c0 = startSlice + 1.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\ + \ + for (int s = startSlice; s < endSlice; ++s) \ + { \ + fVal += tex3D(gT_par3DVolumeTexture, fX, fY, fZ); \ + f##c0 += 1.0f; \ + f##c1 += a##c1; \ + f##c2 += a##c2; \ + } \ + \ + fVal *= fDistCorr; \ + fV += fVal; \ + \ + } \ + } \ + \ + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (dims.iRaysPerDetDim * dims.iRaysPerDetDim);\ + } + + + +__global__ void par3D_FP_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_BODY(X,Y,Z) +} + +__global__ void par3D_FP_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_BODY(Y,X,Z) +} + +__global__ void par3D_FP_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_BODY(Z,X,Y) +} + +__global__ void par3D_FP_SS_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SS_BODY(X,Y,Z) +} + +__global__ void par3D_FP_SS_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SS_BODY(Y,X,Z) +} + +__global__ void par3D_FP_SS_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SS_BODY(Z,X,Y) +} + + +__device__ float dirWeights(float fX, float fN) { + if (fX <= 0.5f) // outside image on left + return 0.0f; + if (fX <= 1.5f) // half outside image on left + return (fX - 0.5f) * (fX - 0.5f); + if (fX <= fN + 0.5f) { // inside image + float t = fX - 0.5f - floorf(fX - 0.5f); + return t*t + (1-t)*(1-t); + } + if (fX <= fN + 1.5f) // half outside image on right + return (fN + 1.5f - fX) * (fN + 1.5f - fX); + return 0.0f; // outside image on right +} + +#define PAR3D_FP_SUMSQW_BODY(c0,c1,c2) \ + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; \ + if (angle >= endAngle) \ + return; \ + \ + const float fRayX = gC_RayX[angle]; \ + const float fRayY = gC_RayY[angle]; \ + const float fRayZ = gC_RayZ[angle]; \ + const float fDetUX = gC_DetUX[angle]; \ + const float fDetUY = gC_DetUY[angle]; \ + const float fDetUZ = gC_DetUZ[angle]; \ + const float fDetVX = gC_DetVX[angle]; \ + const float fDetVY = gC_DetVY[angle]; \ + const float fDetVZ = gC_DetVZ[angle]; \ + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; \ + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; \ + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; \ + \ + \ + \ + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; \ + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; \ + int endDetectorV = startDetectorV + g_detBlockV; \ + if (endDetectorV > dims.iProjV) \ + endDetectorV = dims.iProjV; \ + \ + int endSlice = startSlice + g_blockSlices; \ + if (endSlice > dims.iVol##c0) \ + endSlice = dims.iVol##c0; \ + \ + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) \ + { \ + /* Trace ray in direction Ray to (detectorU,detectorV) from */ \ + /* X = startSlice to X = endSlice */ \ + \ + const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; \ + const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; \ + const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; \ + \ + /* (x) ( 1) ( 0) */ \ + /* ray: (y) = (ay) * x + (by) */ \ + /* (z) (az) (bz) */ \ + \ + const float a##c1 = fRay##c1 / fRay##c0; \ + const float a##c2 = fRay##c2 / fRay##c0; \ + const float b##c1 = fDet##c1 - a##c1 * fDet##c0; \ + const float b##c2 = fDet##c2 - a##c2 * fDet##c0; \ + \ + const float fDistCorr = sqrt(a##c1*a##c1+a##c2*a##c2+1.0f) * fOutputScale; \ + \ + float fVal = 0.0f; \ + \ + float f##c0 = startSlice + 1.5f; \ + float f##c1 = a##c1 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c1 + 0.5f*dims.iVol##c1 - 0.5f + 1.5f;\ + float f##c2 = a##c2 * (startSlice - 0.5f*dims.iVol##c0 + 0.5f) + b##c2 + 0.5f*dims.iVol##c2 - 0.5f + 1.5f;\ + \ + for (int s = startSlice; s < endSlice; ++s) \ + { \ + fVal += dirWeights(f##c1, dims.iVol##c1) * dirWeights(f##c2, dims.iVol##c2) * fDistCorr * fDistCorr; \ + f##c0 += 1.0f; \ + f##c1 += a##c1; \ + f##c2 += a##c2; \ + } \ + \ + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; \ + } + +// Supersampling version +// TODO + + +__global__ void par3D_FP_SumSqW_dirX(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SUMSQW_BODY(X,Y,Z) +} + +__global__ void par3D_FP_SumSqW_dirY(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SUMSQW_BODY(Y,X,Z) +} + +__global__ void par3D_FP_SumSqW_dirZ(float* D_projData, unsigned int projPitch, unsigned int startSlice, unsigned int startAngle, unsigned int endAngle, const SDimensions3D dims, float fOutputScale) +{ +PAR3D_FP_SUMSQW_BODY(Z,X,Y) +} + + + +bool Par3DFP_Array(cudaArray *D_volArray, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + float fOutputScale) +{ + + bindVolumeDataTexture(D_volArray); + + + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(RayX); + TRANSFER_TO_CONSTANT(RayY); + TRANSFER_TO_CONSTANT(RayZ); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetSZ); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + TRANSFER_TO_CONSTANT(DetUZ); + TRANSFER_TO_CONSTANT(DetVX); + TRANSFER_TO_CONSTANT(DetVY); + TRANSFER_TO_CONSTANT(DetVZ); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + std::list<cudaStream_t> streams; + dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles + + // Run over all angles, grouping them into groups of the same + // orientation (roughly horizontal vs. roughly vertical). + // Start a stream of grids for each such group. + + unsigned int blockStart = 0; + unsigned int blockEnd = 0; + int blockDirection = 0; + + // timeval t; + // tic(t); + + for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { + int dir; + if (a != dims.iProjAngles) { + float dX = fabsf(angles[a].fRayX); + float dY = fabsf(angles[a].fRayY); + float dZ = fabsf(angles[a].fRayZ); + + if (dX >= dY && dX >= dZ) + dir = 0; + else if (dY >= dX && dY >= dZ) + dir = 1; + else + dir = 2; + } + + if (a == dims.iProjAngles || dir != blockDirection) { + // block done + + blockEnd = a; + if (blockStart != blockEnd) { + + dim3 dimGrid( + ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), +(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); + // TODO: check if we can't immediately + // destroy the stream after use + cudaStream_t stream; + cudaStreamCreate(&stream); + streams.push_back(stream); + + // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y); + + if (blockDirection == 0) { + for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else + par3D_FP_SS_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + } else if (blockDirection == 1) { + for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else + par3D_FP_SS_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + } else if (blockDirection == 2) { + for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else + par3D_FP_SS_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + } + + } + + blockDirection = dir; + blockStart = a; + } + } + + for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) + cudaStreamDestroy(*iter); + + streams.clear(); + + cudaTextForceKernelsCompletion(); + + + // printf("%f\n", toc(t)); + + return true; +} + +bool Par3DFP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + float fOutputScale) +{ + // transfer volume to array + cudaArray* cuArray = allocateVolumeArray(dims); + transferVolumeToArray(D_volumeData, cuArray, dims); + + bool ret = Par3DFP_Array(cuArray, D_projData, dims, angles, fOutputScale); + + cudaFreeArray(cuArray); + + return ret; +} + + + +bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + float fOutputScale) +{ + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(RayX); + TRANSFER_TO_CONSTANT(RayY); + TRANSFER_TO_CONSTANT(RayZ); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetSZ); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + TRANSFER_TO_CONSTANT(DetUZ); + TRANSFER_TO_CONSTANT(DetVX); + TRANSFER_TO_CONSTANT(DetVY); + TRANSFER_TO_CONSTANT(DetVZ); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + std::list<cudaStream_t> streams; + dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles + + // Run over all angles, grouping them into groups of the same + // orientation (roughly horizontal vs. roughly vertical). + // Start a stream of grids for each such group. + + unsigned int blockStart = 0; + unsigned int blockEnd = 0; + int blockDirection = 0; + + // timeval t; + // tic(t); + + for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { + int dir; + if (a != dims.iProjAngles) { + float dX = fabsf(angles[a].fRayX); + float dY = fabsf(angles[a].fRayY); + float dZ = fabsf(angles[a].fRayZ); + + if (dX >= dY && dX >= dZ) + dir = 0; + else if (dY >= dX && dY >= dZ) + dir = 1; + else + dir = 2; + } + + if (a == dims.iProjAngles || dir != blockDirection) { + // block done + + blockEnd = a; + if (blockStart != blockEnd) { + + dim3 dimGrid( + ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), +(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); + // TODO: check if we can't immediately + // destroy the stream after use + cudaStream_t stream; + cudaStreamCreate(&stream); + streams.push_back(stream); + + // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y); + + if (blockDirection == 0) { + for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_SumSqW_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else +#if 0 + par3D_FP_SS_SumSqW_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); +#else + assert(false); +#endif + } else if (blockDirection == 1) { + for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_SumSqW_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else +#if 0 + par3D_FP_SS_SumSqW_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); +#else + assert(false); +#endif + } else if (blockDirection == 2) { + for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) + if (dims.iRaysPerDetDim == 1) + par3D_FP_SumSqW_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); + else +#if 0 + par3D_FP_SS_SumSqW_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); +#else + assert(false); +#endif + } + + } + + blockDirection = dir; + blockStart = a; + } + } + + for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) + cudaStreamDestroy(*iter); + + streams.clear(); + + cudaTextForceKernelsCompletion(); + + + // printf("%f\n", toc(t)); + + return true; +} + + + + + + + +} + +#ifdef STANDALONE + +using namespace astraCUDA3d; + +int main() +{ + cudaSetDevice(1); + + + SDimensions3D dims; + dims.iVolX = 500; + dims.iVolY = 500; + dims.iVolZ = 81; + dims.iProjAngles = 241; + dims.iProjU = 600; + dims.iProjV = 100; + dims.iRaysPerDet = 1; + + SPar3DProjection base; + base.fRayX = 1.0f; + base.fRayY = 0.0f; + base.fRayZ = 0.1f; + + base.fDetSX = 0.0f; + base.fDetSY = -300.0f; + base.fDetSZ = -50.0f; + + base.fDetUX = 0.0f; + base.fDetUY = 1.0f; + base.fDetUZ = 0.0f; + + base.fDetVX = 0.0f; + base.fDetVY = 0.0f; + base.fDetVZ = 1.0f; + + SPar3DProjection angle[dims.iProjAngles]; + + cudaPitchedPtr volData; // pitch, ptr, xsize, ysize + + volData = allocateVolumeData(dims); + + cudaPitchedPtr projData; // pitch, ptr, xsize, ysize + + projData = allocateProjectionData(dims); + + unsigned int ix = 500,iy = 500; + + float* buf = new float[dims.iProjU*dims.iProjV]; + + float* slice = new float[dims.iVolX*dims.iVolY]; + for (int i = 0; i < dims.iVolX*dims.iVolY; ++i) + slice[i] = 1.0f; + + for (unsigned int a = 0; a < 241; a += dims.iProjAngles) { + + zeroProjectionData(projData, dims); + + for (int y = 0; y < iy; y += dims.iVolY) { + for (int x = 0; x < ix; x += dims.iVolX) { + + timeval st; + tic(st); + + for (int z = 0; z < dims.iVolZ; ++z) { +// char sfn[256]; +// sprintf(sfn, "/home/wpalenst/projects/cone_simulation/phantom_4096/mouse_fem_phantom_%04d.png", 30+z); +// float* slice = loadSubImage(sfn, x, y, dims.iVolX, dims.iVolY); + + cudaPitchedPtr ptr; + ptr.ptr = slice; + ptr.pitch = dims.iVolX*sizeof(float); + ptr.xsize = dims.iVolX*sizeof(float); + ptr.ysize = dims.iVolY; + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, z }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaError err = cudaMemcpy3D(&p); + assert(!err); +// delete[] slice; + } + + printf("Load: %f\n", toc(st)); + +#if 0 + + cudaPos zp = { 0, 0, 0 }; + + cudaPitchedPtr t; + t.ptr = new float[1024*1024]; + t.pitch = 1024*4; + t.xsize = 1024*4; + t.ysize = 1024; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = volData; + p.extent = extentS; + p.dstArray = 0; + p.dstPtr = t; + p.dstPos = zp; + p.kind = cudaMemcpyDeviceToHost; + cudaError err = cudaMemcpy3D(&p); + assert(!err); + + char fn[32]; + sprintf(fn, "t%d%d.png", x / dims.iVolX, y / dims.iVolY); + saveImage(fn, 1024, 1024, (float*)t.ptr); + saveImage("s.png", 4096, 4096, slice); + delete[] (float*)t.ptr; +#endif + + +#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = base.f##name##X * cos(alpha) - base.f##name##Y * sin(alpha); angle[i].f##name##Y = base.f##name##X * sin(alpha) + base.f##name##Y * cos(alpha); angle[i].f##name##Z = base.f##name##Z; } while(0) +#define SHIFT(name,i,x,y) do { angle[i].f##name##X += x; angle[i].f##name##Y += y; } while(0) + for (int i = 0; i < dims.iProjAngles; ++i) { + ROTATE0(Ray, i, (a+i)*.8*M_PI/180); + ROTATE0(DetS, i, (a+i)*.8*M_PI/180); + ROTATE0(DetU, i, (a+i)*.8*M_PI/180); + ROTATE0(DetV, i, (a+i)*.8*M_PI/180); + + +// SHIFT(Src, i, (-x+1536), (-y+1536)); +// SHIFT(DetS, i, (-x+1536), (-y+1536)); + } +#undef ROTATE0 +#undef SHIFT + tic(st); + + astraCUDA3d::Par3DFP(volData, projData, dims, angle, 1.0f); + + printf("FP: %f\n", toc(st)); + + } + } + for (unsigned int aa = 0; aa < dims.iProjAngles; ++aa) { + for (unsigned int v = 0; v < dims.iProjV; ++v) + cudaMemcpy(buf+v*dims.iProjU, ((float*)projData.ptr)+(v*dims.iProjAngles+aa)*(projData.pitch/sizeof(float)), dims.iProjU*sizeof(float), cudaMemcpyDeviceToHost); + + char fname[32]; + sprintf(fname, "proj%03d.png", a+aa); + saveImage(fname, dims.iProjV, dims.iProjU, buf, 0.0f, 1000.0f); + } + } + + delete[] buf; + +} +#endif diff --git a/cuda/3d/par3d_fp.h b/cuda/3d/par3d_fp.h new file mode 100644 index 0000000..7208361 --- /dev/null +++ b/cuda/3d/par3d_fp.h @@ -0,0 +1,51 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_PAR3D_FP_H +#define _CUDA_PAR3D_FP_H + +namespace astraCUDA3d { + +_AstraExport bool Par3DFP_Array(cudaArray *D_volArray, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + float fOutputScale); + +_AstraExport bool Par3DFP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + float fOutputScale); + +_AstraExport bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + float fOutputScale); + +} + +#endif diff --git a/cuda/3d/sirt3d.cu b/cuda/3d/sirt3d.cu new file mode 100644 index 0000000..f615204 --- /dev/null +++ b/cuda/3d/sirt3d.cu @@ -0,0 +1,533 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> + +#include "sirt3d.h" +#include "util3d.h" +#include "arith3d.h" +#include "cone_fp.h" + +#ifdef STANDALONE +#include "testutil.h" +#endif + +namespace astraCUDA3d { + +SIRT::SIRT() : ReconAlgo3D() +{ + D_maskData.ptr = 0; + D_smaskData.ptr = 0; + + D_sinoData.ptr = 0; + D_volumeData.ptr = 0; + + D_projData.ptr = 0; + D_tmpData.ptr = 0; + + D_lineWeight.ptr = 0; + D_pixelWeight.ptr = 0; + + useVolumeMask = false; + useSinogramMask = false; + + useMinConstraint = false; + useMaxConstraint = false; +} + + +SIRT::~SIRT() +{ + reset(); +} + +void SIRT::reset() +{ + cudaFree(D_projData.ptr); + cudaFree(D_tmpData.ptr); + cudaFree(D_lineWeight.ptr); + cudaFree(D_pixelWeight.ptr); + + D_maskData.ptr = 0; + D_smaskData.ptr = 0; + + D_sinoData.ptr = 0; + D_volumeData.ptr = 0; + + D_projData.ptr = 0; + D_tmpData.ptr = 0; + + D_lineWeight.ptr = 0; + D_pixelWeight.ptr = 0; + + useVolumeMask = false; + useSinogramMask = false; + + ReconAlgo3D::reset(); +} + +bool SIRT::enableVolumeMask() +{ + useVolumeMask = true; + return true; +} + +bool SIRT::enableSinogramMask() +{ + useSinogramMask = true; + return true; +} + + +bool SIRT::init() +{ + D_pixelWeight = allocateVolumeData(dims); + zeroVolumeData(D_pixelWeight, dims); + + D_tmpData = allocateVolumeData(dims); + zeroVolumeData(D_tmpData, dims); + + D_projData = allocateProjectionData(dims); + zeroProjectionData(D_projData, dims); + + D_lineWeight = allocateProjectionData(dims); + zeroProjectionData(D_lineWeight, dims); + + // We can't precompute lineWeights and pixelWeights when using a mask + if (!useVolumeMask && !useSinogramMask) + precomputeWeights(); + + // TODO: check if allocations succeeded + return true; +} + +bool SIRT::setMinConstraint(float fMin) +{ + fMinConstraint = fMin; + useMinConstraint = true; + return true; +} + +bool SIRT::setMaxConstraint(float fMax) +{ + fMaxConstraint = fMax; + useMaxConstraint = true; + return true; +} + +bool SIRT::precomputeWeights() +{ + zeroProjectionData(D_lineWeight, dims); + if (useVolumeMask) { + callFP(D_maskData, D_lineWeight, 1.0f); + } else { + processVol3D<opSet>(D_tmpData, 1.0f, dims); + callFP(D_tmpData, D_lineWeight, 1.0f); + } + processSino3D<opInvert>(D_lineWeight, dims); + + if (useSinogramMask) { + // scale line weights with sinogram mask to zero out masked sinogram pixels + processSino3D<opMul>(D_lineWeight, D_smaskData, dims); + } + + zeroVolumeData(D_pixelWeight, dims); + + if (useSinogramMask) { + callBP(D_pixelWeight, D_smaskData); + } else { + processSino3D<opSet>(D_projData, 1.0f, dims); + callBP(D_pixelWeight, D_projData); + } +#if 0 + float* bufp = new float[512*512]; + + for (int i = 0; i < 180; ++i) { + for (int j = 0; j < 512; ++j) { + cudaMemcpy(bufp+512*j, ((float*)D_projData.ptr)+180*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[20]; + sprintf(fname, "ray%03d.png", i); + saveImage(fname, 512, 512, bufp); + } +#endif + +#if 0 + float* buf = new float[256*256]; + + for (int i = 0; i < 256; ++i) { + cudaMemcpy(buf, ((float*)D_pixelWeight.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost); + + char fname[20]; + sprintf(fname, "pix%03d.png", i); + saveImage(fname, 256, 256, buf); + } +#endif + processVol3D<opInvert>(D_pixelWeight, dims); + + if (useVolumeMask) { + // scale pixel weights with mask to zero out masked pixels + processVol3D<opMul>(D_pixelWeight, D_maskData, dims); + } + + return true; +} + + +bool SIRT::setVolumeMask(cudaPitchedPtr& _D_maskData) +{ + assert(useVolumeMask); + + D_maskData = _D_maskData; + + return true; +} + +bool SIRT::setSinogramMask(cudaPitchedPtr& _D_smaskData) +{ + assert(useSinogramMask); + + D_smaskData = _D_smaskData; + + return true; +} + +bool SIRT::setBuffers(cudaPitchedPtr& _D_volumeData, + cudaPitchedPtr& _D_projData) +{ + D_volumeData = _D_volumeData; + D_sinoData = _D_projData; + + fprintf(stderr, "Reconstruction buffer: %p\n", (void*)D_volumeData.ptr); + + return true; +} + +bool SIRT::iterate(unsigned int iterations) +{ + shouldAbort = false; + + if (useVolumeMask || useSinogramMask) + precomputeWeights(); + +#if 0 + float* buf = new float[256*256]; + + for (int i = 0; i < 256; ++i) { + cudaMemcpy(buf, ((float*)D_pixelWeight.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost); + + char fname[20]; + sprintf(fname, "pix%03d.png", i); + saveImage(fname, 256, 256, buf); + } +#endif +#if 0 + float* bufp = new float[512*512]; + + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 512; ++j) { + cudaMemcpy(bufp+512*j, ((float*)D_lineWeight.ptr)+100*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[20]; + sprintf(fname, "ray%03d.png", i); + saveImage(fname, 512, 512, bufp); + } +#endif + + + // iteration + for (unsigned int iter = 0; iter < iterations && !shouldAbort; ++iter) { + // copy sinogram to projection data + duplicateProjectionData(D_projData, D_sinoData, dims); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + duplicateVolumeData(D_tmpData, D_volumeData, dims); + processVol3D<opMul>(D_tmpData, D_maskData, dims); + callFP(D_tmpData, D_projData, -1.0f); + } else { + callFP(D_volumeData, D_projData, -1.0f); + } + + processSino3D<opMul>(D_projData, D_lineWeight, dims); + + zeroVolumeData(D_tmpData, dims); +#if 0 + float* bufp = new float[512*512]; + printf("Dumping projData: %p\n", (void*)D_projData.ptr); + for (int i = 0; i < 180; ++i) { + for (int j = 0; j < 512; ++j) { + cudaMemcpy(bufp+512*j, ((float*)D_projData.ptr)+180*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[20]; + sprintf(fname, "diff%03d.png", i); + saveImage(fname, 512, 512, bufp); + } +#endif + + + callBP(D_tmpData, D_projData); +#if 0 + printf("Dumping tmpData: %p\n", (void*)D_tmpData.ptr); + float* buf = new float[256*256]; + + for (int i = 0; i < 256; ++i) { + cudaMemcpy(buf, ((float*)D_tmpData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost); + + char fname[20]; + sprintf(fname, "add%03d.png", i); + saveImage(fname, 256, 256, buf); + } +#endif + + + processVol3D<opAddMul>(D_volumeData, D_tmpData, D_pixelWeight, dims); + + if (useMinConstraint) + processVol3D<opClampMin>(D_volumeData, fMinConstraint, dims); + if (useMaxConstraint) + processVol3D<opClampMax>(D_volumeData, fMaxConstraint, dims); + } + + return true; +} + +float SIRT::computeDiffNorm() +{ + // copy sinogram to projection data + duplicateProjectionData(D_projData, D_sinoData, dims); + + // do FP, subtracting projection from sinogram + if (useVolumeMask) { + duplicateVolumeData(D_tmpData, D_volumeData, dims); + processVol3D<opMul>(D_tmpData, D_maskData, dims); + callFP(D_tmpData, D_projData, -1.0f); + } else { + callFP(D_volumeData, D_projData, -1.0f); + } + + float s = dotProduct3D(D_projData, dims.iProjU, dims.iProjAngles, dims.iProjV); + return sqrt(s); +} + + +bool doSIRT(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_sinoData, + cudaPitchedPtr& D_maskData, + const SDimensions3D& dims, const SConeProjection* angles, + unsigned int iterations) +{ + SIRT sirt; + bool ok = true; + + ok &= sirt.setConeGeometry(dims, angles); + if (D_maskData.ptr) + ok &= sirt.enableVolumeMask(); + + if (!ok) + return false; + + ok = sirt.init(); + if (!ok) + return false; + + if (D_maskData.ptr) + ok &= sirt.setVolumeMask(D_maskData); + + ok &= sirt.setBuffers(D_volumeData, D_sinoData); + if (!ok) + return false; + + ok = sirt.iterate(iterations); + + return ok; +} + +} + +#ifdef STANDALONE + +using namespace astraCUDA3d; + +int main() +{ + SDimensions3D dims; + dims.iVolX = 256; + dims.iVolY = 256; + dims.iVolZ = 256; + dims.iProjAngles = 100; + dims.iProjU = 512; + dims.iProjV = 512; + dims.iRaysPerDet = 1; + + SConeProjection angle[100]; + angle[0].fSrcX = -2905.6; + angle[0].fSrcY = 0; + angle[0].fSrcZ = 0; + + angle[0].fDetSX = 694.4; + angle[0].fDetSY = -122.4704; + angle[0].fDetSZ = -122.4704; + + angle[0].fDetUX = 0; + angle[0].fDetUY = .4784; + //angle[0].fDetUY = .5; + angle[0].fDetUZ = 0; + + angle[0].fDetVX = 0; + angle[0].fDetVY = 0; + angle[0].fDetVZ = .4784; + +#define ROTATE0(name,i,alpha) do { angle[i].f##name##X = angle[0].f##name##X * cos(alpha) - angle[0].f##name##Y * sin(alpha); angle[i].f##name##Y = angle[0].f##name##X * sin(alpha) + angle[0].f##name##Y * cos(alpha); } while(0) + for (int i = 1; i < 100; ++i) { + angle[i] = angle[0]; + ROTATE0(Src, i, i*2*M_PI/100); + ROTATE0(DetS, i, i*2*M_PI/100); + ROTATE0(DetU, i, i*2*M_PI/100); + ROTATE0(DetV, i, i*2*M_PI/100); + } +#undef ROTATE0 + + + cudaPitchedPtr volData = allocateVolumeData(dims); + cudaPitchedPtr projData = allocateProjectionData(dims); + zeroProjectionData(projData, dims); + + float* pbuf = new float[100*512*512]; + copyProjectionsFromDevice(pbuf, projData, dims); + copyProjectionsToDevice(pbuf, projData, dims); + delete[] pbuf; + +#if 0 + float* slice = new float[256*256]; + cudaPitchedPtr ptr; + ptr.ptr = slice; + ptr.pitch = 256*sizeof(float); + ptr.xsize = 256*sizeof(float); + ptr.ysize = 256; + + for (unsigned int i = 0; i < 256; ++i) { + for (unsigned int y = 0; y < 256; ++y) + for (unsigned int x = 0; x < 256; ++x) + slice[y*256+x] = (i-127.5)*(i-127.5)+(y-127.5)*(y-127.5)+(x-127.5)*(x-127.5) < 4900 ? 1.0f : 0.0f; + + cudaExtent extentS; + extentS.width = dims.iVolX*sizeof(float); + extentS.height = dims.iVolY; + extentS.depth = 1; + cudaPos sp = { 0, 0, 0 }; + cudaPos dp = { 0, 0, i }; + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = sp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = dp; + p.dstPtr = volData; + p.extent = extentS; + p.kind = cudaMemcpyHostToDevice; + cudaMemcpy3D(&p); + } + astraCUDA3d::ConeFP(volData, projData, dims, angle, 1.0f); + +#else + + for (int i = 0; i < 100; ++i) { + char fname[32]; + sprintf(fname, "Tiffs/%04d.png", 4*i); + unsigned int w,h; + float* bufp = loadImage(fname, w,h); + + for (int j = 0; j < 512*512; ++j) { + float v = bufp[j]; + if (v > 236.0f) v = 236.0f; + v = logf(236.0f / v); + bufp[j] = 256*v; + } + + for (int j = 0; j < 512; ++j) { + cudaMemcpy(((float*)projData.ptr)+100*512*j+512*i, bufp+512*j, 512*sizeof(float), cudaMemcpyHostToDevice); + } + + delete[] bufp; + + } +#endif + +#if 0 + float* bufs = new float[100*512]; + + for (int i = 0; i < 512; ++i) { + cudaMemcpy(bufs, ((float*)projData.ptr)+100*512*i, 100*512*sizeof(float), cudaMemcpyDeviceToHost); + + printf("%d %d %d\n", projData.pitch, projData.xsize, projData.ysize); + + char fname[20]; + sprintf(fname, "sino%03d.png", i); + saveImage(fname, 100, 512, bufs); + } + + float* bufp = new float[512*512]; + + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < 512; ++j) { + cudaMemcpy(bufp+512*j, ((float*)projData.ptr)+100*512*j+512*i, 512*sizeof(float), cudaMemcpyDeviceToHost); + } + + char fname[20]; + sprintf(fname, "proj%03d.png", i); + saveImage(fname, 512, 512, bufp); + } +#endif + + zeroVolumeData(volData, dims); + + cudaPitchedPtr maskData; + maskData.ptr = 0; + + astraCUDA3d::doSIRT(volData, projData, maskData, dims, angle, 50); +#if 1 + float* buf = new float[256*256]; + + for (int i = 0; i < 256; ++i) { + cudaMemcpy(buf, ((float*)volData.ptr)+256*256*i, 256*256*sizeof(float), cudaMemcpyDeviceToHost); + + char fname[20]; + sprintf(fname, "vol%03d.png", i); + saveImage(fname, 256, 256, buf); + } +#endif + + return 0; +} +#endif + diff --git a/cuda/3d/sirt3d.h b/cuda/3d/sirt3d.h new file mode 100644 index 0000000..c3752c2 --- /dev/null +++ b/cuda/3d/sirt3d.h @@ -0,0 +1,118 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_SIRT3D_H +#define _CUDA_SIRT3D_H + +#include "util3d.h" +#include "algo3d.h" + +namespace astraCUDA3d { + +class _AstraExport SIRT : public ReconAlgo3D { +public: + SIRT(); + ~SIRT(); + +// bool setConeGeometry(const SDimensions3D& dims, const SConeProjection* projs); + + + bool enableVolumeMask(); + bool enableSinogramMask(); + + // init should be called after setting all geometry + bool init(); + + // setVolumeMask should be called after init and before iterate, + // but only if enableVolumeMask was called before init. + // It may be called again after iterate. + bool setVolumeMask(cudaPitchedPtr& D_maskData); + + // setSinogramMask should be called after init and before iterate, + // but only if enableSinogramMask was called before init. + // It may be called again after iterate. + bool setSinogramMask(cudaPitchedPtr& D_smaskData); + + + // setBuffers should be called after init and before iterate. + // It may be called again after iterate. + bool setBuffers(cudaPitchedPtr& D_volumeData, + cudaPitchedPtr& D_projData); + + + // set Min/Max constraints. They may be called at any time, and will affect + // any iterate() calls afterwards. + bool setMinConstraint(float fMin); + bool setMaxConstraint(float fMax); + + // iterate should be called after init and setBuffers. + // It may be called multiple times. + bool iterate(unsigned int iterations); + + // Compute the norm of the difference of the FP of the current reconstruction + // and the sinogram. (This performs one FP.) + // It can be called after iterate. + float computeDiffNorm(); + +protected: + void reset(); + bool precomputeWeights(); + + bool useVolumeMask; + bool useSinogramMask; + + bool useMinConstraint; + bool useMaxConstraint; + float fMinConstraint; + float fMaxConstraint; + + cudaPitchedPtr D_maskData; + cudaPitchedPtr D_smaskData; + + // Input/output + cudaPitchedPtr D_sinoData; + cudaPitchedPtr D_volumeData; + + // Temporary buffers + cudaPitchedPtr D_projData; + cudaPitchedPtr D_tmpData; + + // Geometry-specific precomputed data + cudaPitchedPtr D_lineWeight; + cudaPitchedPtr D_pixelWeight; +}; + +bool doSIRT(cudaPitchedPtr D_volumeData, unsigned int volumePitch, + cudaPitchedPtr D_projData, unsigned int projPitch, + cudaPitchedPtr D_maskData, unsigned int maskPitch, + const SDimensions3D& dims, const SConeProjection* projs, + unsigned int iterations); + +} + +#endif diff --git a/cuda/3d/util3d.cu b/cuda/3d/util3d.cu new file mode 100644 index 0000000..81ea823 --- /dev/null +++ b/cuda/3d/util3d.cu @@ -0,0 +1,514 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#include <cstdio> +#include <cassert> +#include "util3d.h" +#include "../2d/util.h" + +namespace astraCUDA3d { + + +cudaPitchedPtr allocateVolumeData(const SDimensions3D& dims) +{ + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPitchedPtr volData; + + cudaError err = cudaMalloc3D(&volData, extentV); + if (err != cudaSuccess) { + astraCUDA::reportCudaError(err); + fprintf(stderr, "Failed to allocate %dx%dx%d GPU buffer\n", dims.iVolX, dims.iVolY, dims.iVolZ); + volData.ptr = 0; + // TODO: return 0 somehow? + } + + return volData; +} +cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims) +{ + cudaExtent extentP; + extentP.width = dims.iProjU*sizeof(float); + extentP.height = dims.iProjAngles; + extentP.depth = dims.iProjV; + + cudaPitchedPtr projData; + + cudaError err = cudaMalloc3D(&projData, extentP); + if (err != cudaSuccess) { + astraCUDA::reportCudaError(err); + fprintf(stderr, "Failed to allocate %dx%dx%d GPU buffer\n", dims.iProjU, dims.iProjAngles, dims.iProjV); + projData.ptr = 0; + // TODO: return 0 somehow? + } + + return projData; +} +bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims) +{ + char* t = (char*)D_data.ptr; + cudaError err; + + for (unsigned int z = 0; z < dims.iVolZ; ++z) { + err = cudaMemset2D(t, D_data.pitch, 0, dims.iVolX*sizeof(float), dims.iVolY); + ASTRA_CUDA_ASSERT(err); + t += D_data.pitch * dims.iVolY; + } + return true; +} +bool zeroProjectionData(cudaPitchedPtr& D_data, const SDimensions3D& dims) +{ + char* t = (char*)D_data.ptr; + cudaError err; + + for (unsigned int z = 0; z < dims.iProjV; ++z) { + err = cudaMemset2D(t, D_data.pitch, 0, dims.iProjU*sizeof(float), dims.iProjAngles); + ASTRA_CUDA_ASSERT(err); + t += D_data.pitch * dims.iProjAngles; + } + + return true; +} +bool copyVolumeToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch) +{ + if (!pitch) + pitch = dims.iVolX; + + cudaPitchedPtr ptr; + ptr.ptr = (void*)data; // const cast away + ptr.pitch = pitch*sizeof(float); + ptr.xsize = dims.iVolX*sizeof(float); + ptr.ysize = dims.iVolY; + + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = D_data; + p.extent = extentV; + p.kind = cudaMemcpyHostToDevice; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} + +bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch) +{ + if (!pitch) + pitch = dims.iProjU; + + cudaPitchedPtr ptr; + ptr.ptr = (void*)data; // const cast away + ptr.pitch = pitch*sizeof(float); + ptr.xsize = dims.iProjU*sizeof(float); + ptr.ysize = dims.iProjAngles; + + cudaExtent extentV; + extentV.width = dims.iProjU*sizeof(float); + extentV.height = dims.iProjAngles; + extentV.depth = dims.iProjV; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = ptr; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = D_data; + p.extent = extentV; + p.kind = cudaMemcpyHostToDevice; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} + +bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch) +{ + if (!pitch) + pitch = dims.iVolX; + + cudaPitchedPtr ptr; + ptr.ptr = data; + ptr.pitch = pitch*sizeof(float); + ptr.xsize = dims.iVolX*sizeof(float); + ptr.ysize = dims.iVolY; + + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_data; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = ptr; + p.extent = extentV; + p.kind = cudaMemcpyDeviceToHost; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} +bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch) +{ + if (!pitch) + pitch = dims.iProjU; + + cudaPitchedPtr ptr; + ptr.ptr = data; + ptr.pitch = pitch*sizeof(float); + ptr.xsize = dims.iProjU*sizeof(float); + ptr.ysize = dims.iProjAngles; + + cudaExtent extentV; + extentV.width = dims.iProjU*sizeof(float); + extentV.height = dims.iProjAngles; + extentV.depth = dims.iProjV; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_data; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = ptr; + p.extent = extentV; + p.kind = cudaMemcpyDeviceToHost; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} + +bool duplicateVolumeData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims) +{ + cudaExtent extentV; + extentV.width = dims.iVolX*sizeof(float); + extentV.height = dims.iVolY; + extentV.depth = dims.iVolZ; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_src; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = D_dst; + p.extent = extentV; + p.kind = cudaMemcpyDeviceToDevice; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} +bool duplicateProjectionData(cudaPitchedPtr& D_dst, const cudaPitchedPtr& D_src, const SDimensions3D& dims) +{ + cudaExtent extentV; + extentV.width = dims.iProjU*sizeof(float); + extentV.height = dims.iProjAngles; + extentV.depth = dims.iProjV; + + cudaPos zp = { 0, 0, 0 }; + + cudaMemcpy3DParms p; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_src; + p.dstArray = 0; + p.dstPos = zp; + p.dstPtr = D_dst; + p.extent = extentV; + p.kind = cudaMemcpyDeviceToDevice; + + cudaError err; + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + return err == cudaSuccess; +} + + + +// TODO: Consider using a single array of size max(proj,volume) (per dim) +// instead of allocating a new one each time + +// TODO: Figure out a faster way of zeroing the padding? + +cudaArray* allocateVolumeArray(const SDimensions3D& dims) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + cudaArray* cuArray; + cudaExtent extentA; + extentA.width = dims.iVolX+2; + extentA.height = dims.iVolY+2; + extentA.depth = dims.iVolZ+2; + cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA); + if (err != cudaSuccess) { + astraCUDA::reportCudaError(err); + fprintf(stderr, "Failed to allocate %dx%dx%d GPU array\n", dims.iVolX, dims.iVolY, dims.iVolZ); + return 0; + } + + zeroVolumeArray(cuArray, dims); + + return cuArray; +} +cudaArray* allocateProjectionArray(const SDimensions3D& dims) +{ + cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); + cudaArray* cuArray; + cudaExtent extentA; + extentA.width = dims.iProjU+2; + extentA.height = dims.iProjAngles; + extentA.depth = dims.iProjV+2; + cudaError err = cudaMalloc3DArray(&cuArray, &channelDesc, extentA); + + if (err != cudaSuccess) { + astraCUDA::reportCudaError(err); + fprintf(stderr, "Failed to allocate %dx%dx%d GPU array\n", dims.iProjU, dims.iProjAngles, dims.iProjV); + return 0; + } + + zeroProjectionArray(cuArray, dims); + + return cuArray; +} +bool zeroVolumeArray(cudaArray* array, const SDimensions3D& dims) +{ + cudaPitchedPtr zeroBuf; + cudaExtent extentS; + extentS.width = sizeof(float)*(dims.iVolX+2); + extentS.height = dims.iVolY+2; + extentS.depth = 1; + + cudaExtent extentA; + extentA.width = dims.iVolX+2; + extentA.height = dims.iVolY+2; + extentA.depth = 1; + + + + cudaError err; + err = cudaMalloc3D(&zeroBuf, extentS); + ASTRA_CUDA_ASSERT(err); + err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iVolX+2), dims.iVolY+2); + ASTRA_CUDA_ASSERT(err); + + // zero array + for (unsigned int i = 0; i < dims.iVolZ+2; ++i) { + cudaMemcpy3DParms p; + cudaPos zp = {0, 0, 0}; + cudaPos dp = {0, 0, i}; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = zeroBuf; + p.dstArray = array; + p.dstPtr.ptr = 0; + p.dstPtr.pitch = 0; + p.dstPtr.xsize = 0; + p.dstPtr.ysize = 0; + p.dstPos = dp; + p.extent = extentA; + p.kind = cudaMemcpyDeviceToDevice; + + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + } + cudaFree(zeroBuf.ptr); + + // TODO: check errors + + return true; +} +bool zeroProjectionArray(cudaArray* array, const SDimensions3D& dims) +{ + cudaPitchedPtr zeroBuf; + cudaExtent extentS; + extentS.width = sizeof(float)*(dims.iProjU+2); + extentS.height = dims.iProjAngles; + extentS.depth = 1; + cudaExtent extentA; + extentA.width = dims.iProjU+2; + extentA.height = dims.iProjAngles; + extentA.depth = 1; + + + cudaError err; + err = cudaMalloc3D(&zeroBuf, extentS); + ASTRA_CUDA_ASSERT(err); + err = cudaMemset2D(zeroBuf.ptr, zeroBuf.pitch, 0, sizeof(float)*(dims.iProjU+2), dims.iProjAngles); + ASTRA_CUDA_ASSERT(err); + + for (unsigned int i = 0; i < dims.iProjV+2; ++i) { + cudaMemcpy3DParms p; + cudaPos zp = {0, 0, 0}; + cudaPos dp = {0, 0, i}; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = zeroBuf; + p.dstArray = array; + p.dstPtr.ptr = 0; + p.dstPtr.pitch = 0; + p.dstPtr.xsize = 0; + p.dstPtr.ysize = 0; + p.dstPos = dp; + p.extent = extentA; + p.kind = cudaMemcpyDeviceToDevice; + + err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + } + cudaFree(zeroBuf.ptr); + + // TODO: check errors + return true; +} + + +bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const SDimensions3D& dims) +{ + cudaExtent extentA; + extentA.width = dims.iVolX; + extentA.height = dims.iVolY; + extentA.depth = dims.iVolZ; + + cudaMemcpy3DParms p; + cudaPos zp = {0, 0, 0}; + cudaPos dp = {1, 1, 1}; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_volumeData; + p.dstArray = array; + p.dstPtr.ptr = 0; + p.dstPtr.pitch = 0; + p.dstPtr.xsize = 0; + p.dstPtr.ysize = 0; + p.dstPos = dp; + p.extent = extentA; + p.kind = cudaMemcpyDeviceToDevice; + + cudaError err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + // TODO: check errors + + return true; +} +bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, const SDimensions3D& dims) +{ + cudaExtent extentA; + extentA.width = dims.iProjU; + extentA.height = dims.iProjAngles; + extentA.depth = dims.iProjV; + + cudaMemcpy3DParms p; + cudaPos zp = {0, 0, 0}; + cudaPos dp = {1, 0, 1}; + p.srcArray = 0; + p.srcPos = zp; + p.srcPtr = D_projData; + p.dstArray = array; + p.dstPtr.ptr = 0; + p.dstPtr.pitch = 0; + p.dstPtr.xsize = 0; + p.dstPtr.ysize = 0; + p.dstPos = dp; + p.extent = extentA; + p.kind = cudaMemcpyDeviceToDevice; + + cudaError err = cudaMemcpy3D(&p); + ASTRA_CUDA_ASSERT(err); + + // TODO: check errors + + return true; +} + + +float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y, + unsigned int z) +{ + return astraCUDA::dotProduct2D((float*)data.ptr, data.pitch/sizeof(float), x, y*z, 0, 0); +} + + +bool cudaTextForceKernelsCompletion() +{ + cudaError_t returnedCudaError = cudaThreadSynchronize(); + + if(returnedCudaError != cudaSuccess) { + fprintf(stderr, "Failed to force completion of cuda kernels: %d: %s.\n", returnedCudaError, cudaGetErrorString(returnedCudaError)); + return false; + } + + return true; +} + +int calcNextPowerOfTwo(int _iValue) +{ + int iOutput = 1; + while(iOutput < _iValue) + iOutput *= 2; + return iOutput; +} + +} diff --git a/cuda/3d/util3d.h b/cuda/3d/util3d.h new file mode 100644 index 0000000..cf04a18 --- /dev/null +++ b/cuda/3d/util3d.h @@ -0,0 +1,69 @@ +/* +----------------------------------------------------------------------- +Copyright 2012 iMinds-Vision Lab, University of Antwerp + +Contact: astra@ua.ac.be +Website: http://astra.ua.ac.be + + +This file is part of the +All Scale Tomographic Reconstruction Antwerp Toolbox ("ASTRA Toolbox"). + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +$Id$ +*/ + +#ifndef _CUDA_UTIL3D_H +#define _CUDA_UTIL3D_H + +#include <cuda.h> +#include "dims3d.h" + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif +#include "../2d/util.h" + +namespace astraCUDA3d { + +cudaPitchedPtr allocateVolumeData(const SDimensions3D& dims); +cudaPitchedPtr allocateProjectionData(const SDimensions3D& dims); +bool zeroVolumeData(cudaPitchedPtr& D_data, const SDimensions3D& dims); +bool zeroProjectionData(cudaPitchedPtr& D_data, const SDimensions3D& dims); +bool copyVolumeToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0); +bool copyProjectionsToDevice(const float* data, cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0); +bool copyVolumeFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0); +bool copyProjectionsFromDevice(float* data, const cudaPitchedPtr& D_data, const SDimensions3D& dims, unsigned int pitch = 0); +bool duplicateVolumeData(cudaPitchedPtr& D_dest, const cudaPitchedPtr& D_src, const SDimensions3D& dims); +bool duplicateProjectionData(cudaPitchedPtr& D_dest, const cudaPitchedPtr& D_src, const SDimensions3D& dims); + + +bool transferProjectionsToArray(cudaPitchedPtr D_projData, cudaArray* array, const SDimensions3D& dims); +bool transferVolumeToArray(cudaPitchedPtr D_volumeData, cudaArray* array, const SDimensions3D& dims); +bool zeroProjectionArray(cudaArray* array, const SDimensions3D& dims); +bool zeroVolumeArray(cudaArray* array, const SDimensions3D& dims); +cudaArray* allocateProjectionArray(const SDimensions3D& dims); +cudaArray* allocateVolumeArray(const SDimensions3D& dims); + +bool cudaTextForceKernelsCompletion(); + +float dotProduct3D(cudaPitchedPtr data, unsigned int x, unsigned int y, unsigned int z); + +int calcNextPowerOfTwo(int _iValue); + +} + +#endif |