diff options
author | Willem Jan Palenstijn <WillemJan.Palenstijn@uantwerpen.be> | 2014-04-16 11:13:01 +0000 |
---|---|---|
committer | wpalenst <WillemJan.Palenstijn@uantwerpen.be> | 2014-04-16 11:13:01 +0000 |
commit | 7ce0b7cca179e903e8011cd96c9910cbdf62ae00 (patch) | |
tree | 2ebfa5687c8126d1d2b345a1bfc8c374a62a227b /cuda/3d/arith3d.cu | |
parent | 3a6769465bee7d56d0ddff36613b886446421e07 (diff) | |
download | astra-7ce0b7cca179e903e8011cd96c9910cbdf62ae00.tar.gz astra-7ce0b7cca179e903e8011cd96c9910cbdf62ae00.tar.bz2 astra-7ce0b7cca179e903e8011cd96c9910cbdf62ae00.tar.xz astra-7ce0b7cca179e903e8011cd96c9910cbdf62ae00.zip |
Remove padding in 3D cuda in favour of Border mode
Diffstat (limited to 'cuda/3d/arith3d.cu')
-rw-r--r-- | cuda/3d/arith3d.cu | 90 |
1 files changed, 42 insertions, 48 deletions
diff --git a/cuda/3d/arith3d.cu b/cuda/3d/arith3d.cu index 9a19be0..7cb56f6 100644 --- a/cuda/3d/arith3d.cu +++ b/cuda/3d/arith3d.cu @@ -99,14 +99,14 @@ struct opClampMax { -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat> __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -116,14 +116,14 @@ __global__ void devtoD(float* pfOut, unsigned int pitch, unsigned int width, uns } } -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat> __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -134,14 +134,14 @@ __global__ void devFtoD(float* pfOut, float fParam, unsigned int pitch, unsigned } -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat> __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -151,14 +151,14 @@ __global__ void devDtoD(float* pfOut, const float* pfIn, unsigned int pitch, uns } } -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat> __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -168,14 +168,14 @@ __global__ void devDFtoD(float* pfOut, const float* pfIn, float fParam, unsigned } } -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat> __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -185,14 +185,14 @@ __global__ void devDDtoD(float* pfOut, const float* pfIn1, const float* pfIn2, u } } -template<class op, unsigned int padX, unsigned int padY, unsigned int repeat> +template<class op, unsigned int repeat> __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { unsigned int x = threadIdx.x + 16*blockIdx.x; if (x >= width) return; unsigned int y = (threadIdx.y + 16*blockIdx.y)*repeat; - unsigned int off = (y+padY)*pitch+x+padX; + unsigned int off = y*pitch+x; for (unsigned int i = 0; i < repeat; ++i) { if (y >= height) break; @@ -210,7 +210,7 @@ __global__ void devDDFtoD(float* pfOut, const float* pfIn1, const float* pfIn2, -template<typename op, VolType t> +template<typename op> void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); @@ -218,12 +218,12 @@ void processVol(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsign float *pfOut = (float*)out; - devtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height); + devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pitch, width, height); cudaTextForceKernelsCompletion(); } -template<typename op, VolType t> +template<typename op> void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); @@ -231,12 +231,12 @@ void processVol(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int float *pfOut = (float*)out; - devFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height); + devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, pitch, width, height); cudaTextForceKernelsCompletion(); } -template<typename op, VolType t> +template<typename op> void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); @@ -245,12 +245,12 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, uns float *pfOut = (float*)out; const float *pfIn = (const float*)in; - devDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height); + devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, pitch, width, height); cudaTextForceKernelsCompletion(); } -template<typename op, VolType t> +template<typename op> void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); @@ -259,12 +259,12 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned float *pfOut = (float*)out; const float *pfIn = (const float*)in; - devDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height); + devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, pitch, width, height); cudaTextForceKernelsCompletion(); } -template<typename op, VolType t> +template<typename op> void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); @@ -274,12 +274,12 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2 const float *pfIn1 = (const float*)in1; const float *pfIn2 = (const float*)in2; - devDDFtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); + devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, pitch, width, height); cudaTextForceKernelsCompletion(); } -template<typename op, VolType t> +template<typename op> void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height) { dim3 blockSize(16,16); @@ -289,7 +289,7 @@ void processVol(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2 const float *pfIn1 = (const float*)in1; const float *pfIn2 = (const float*)in2; - devDDtoD<op, 1, t, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height); + devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, pitch, width, height); cudaTextForceKernelsCompletion(); } @@ -319,7 +319,7 @@ void processVol3D(cudaPitchedPtr& out, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; } @@ -335,7 +335,7 @@ void processVol3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; } @@ -352,7 +352,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensio unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn += step; } @@ -370,7 +370,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, c unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn += step; } @@ -389,7 +389,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -409,7 +409,7 @@ void processVol3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitc unsigned int step = out.pitch/sizeof(float) * dims.iVolY; for (unsigned int i = 0; i < dims.iVolZ; ++i) { - devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); + devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iVolX, dims.iVolY); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -439,7 +439,7 @@ void processSino3D(cudaPitchedPtr& out, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devtoD<op, 32><<<gridSize, blockSize>>>(pfOut, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; } @@ -455,7 +455,7 @@ void processSino3D(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims) unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; } @@ -472,7 +472,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensi unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn += step; } @@ -490,7 +490,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn += step; } @@ -509,7 +509,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDDFtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDDFtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, fParam, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -529,7 +529,7 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit unsigned int step = out.pitch/sizeof(float) * dims.iProjAngles; for (unsigned int i = 0; i < dims.iProjV; ++i) { - devDDtoD<op, 0, 0, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); + devDDtoD<op, 32><<<gridSize, blockSize>>>(pfOut, pfIn1, pfIn2, out.pitch/sizeof(float), dims.iProjU, dims.iProjAngles); pfOut += step; pfIn1 += step; pfIn2 += step; @@ -556,39 +556,33 @@ void processSino3D(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPit #define INST_DFtoD(name) \ - template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name>(CUdeviceptr* out, const CUdeviceptr* in, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); \ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, float fParam, const SDimensions3D& dims); #define INST_DtoD(name) \ - template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name>(CUdeviceptr* out, const CUdeviceptr* in, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); \ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in, const SDimensions3D& dims); #define INST_DDtoD(name) \ - template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); \ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, const SDimensions3D& dims); #define INST_DDFtoD(name) \ - template void processVol<name, VOL>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol<name, SINO>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name>(CUdeviceptr* out, const CUdeviceptr* in1, const CUdeviceptr* in2, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); \ template void processSino3D<name>(cudaPitchedPtr& out, const cudaPitchedPtr& in1, const cudaPitchedPtr& in2, float fParam, const SDimensions3D& dims); #define INST_toD(name) \ - template void processVol<name, VOL>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol<name, SINO>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name>(CUdeviceptr* out, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); \ template void processSino3D<name>(cudaPitchedPtr& out, const SDimensions3D& dims); #define INST_FtoD(name) \ - template void processVol<name, VOL>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ - template void processVol<name, SINO>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ + template void processVol<name>(CUdeviceptr* out, float fParam, unsigned int pitch, unsigned int width, unsigned int height); \ template void processVol3D<name>(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); \ template void processSino3D<name>(cudaPitchedPtr& out, float fParam, const SDimensions3D& dims); |