diff options
14 files changed, 3551 insertions, 0 deletions
diff --git a/patches/astra-toolbox-approximate-projectors/cone_bp.cu b/patches/astra-toolbox-approximate-projectors/cone_bp.cu new file mode 100644 index 0000000..01edcb9 --- /dev/null +++ b/patches/astra-toolbox-approximate-projectors/cone_bp.cu @@ -0,0 +1,397 @@ +/* +----------------------------------------------------------------------- +Copyright: 2010-2021, imec Vision Lab, University of Antwerp + 2014-2021, CWI, Amsterdam + +Contact: astra@astra-toolbox.com +Website: http://www.astra-toolbox.com/ + +This file is part of the ASTRA Toolbox. + + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +*/ + +#include "astra/cuda/3d/util3d.h" +#include "astra/cuda/3d/dims3d.h" + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> + +namespace astraCUDA3d { + +static const unsigned int g_volBlockZ = 6; + +static const unsigned int g_anglesPerBlock = 32; +static const unsigned int g_volBlockX = 16; +static const unsigned int g_volBlockY = 32; + +static const unsigned g_MaxAngles = 1024; + +struct DevConeParams { + float4 fNumU; + float4 fNumV; + float4 fDen; +}; + +__constant__ DevConeParams gC_C[g_MaxAngles]; + +#include "rounding.h" + +//__launch_bounds__(32*16, 4) +template<bool FDKWEIGHT, unsigned int ZSIZE> +__global__ void dev_cone_BP(void* D_volData, unsigned int volPitch, + cudaTextureObject_t tex, + int startAngle, int angleOffset, + const astraCUDA3d::SDimensions3D dims, + float fOutputScale) +{ + float* volData = (float*)D_volData; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles - angleOffset) + endAngle = dims.iProjAngles - angleOffset; + + // threadIdx: x = rel x + // y = rel y + + // blockIdx: x = x + y + // y = z + + + + const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; + const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + + if (X >= dims.iVolX) + return; + if (Y >= dims.iVolY) + return; + + const int startZ = blockIdx.y * g_volBlockZ; + const float fX = X - 0.5f*dims.iVolX + 0.5f; + const float fY = Y - 0.5f*dims.iVolY + 0.5f; + const float fZ = startZ - 0.5f*dims.iVolZ + 0.5f; + + float Z[ZSIZE]; + for(int i=0; i < ZSIZE; i++) + Z[i] = 0.0f; + + + { + float fAngle = startAngle + angleOffset + 0.5f; + + for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) + { + float4 fCu = gC_C[angle].fNumU; + float4 fCv = gC_C[angle].fNumV; + float4 fCd = gC_C[angle].fDen; + + float fUNum = fCu.w + fX * fCu.x + fY * fCu.y + fZ * fCu.z; + float fVNum = fCv.w + fX * fCv.x + fY * fCv.y + fZ * fCv.z; + float fDen = (FDKWEIGHT ? 1.0f : fCd.w) + fX * fCd.x + fY * fCd.y + fZ * fCd.z; + + float fU,fV, fr; + + for (int idx = 0; idx < ZSIZE; idx++) + { + fr = __fdividef(1.0f, fDen); + fU = fUNum * fr; + fV = fVNum * fr; + + float fUf = round(fU) - 0.5f; + float fVf = round(fV) - 0.5f; + + textype fU_ = texto(fU); + textype fV_ = texto(fV); + textype fUf_ = texto(fUf); + textype fVf_ = texto(fVf); + + textype fVal0_0; textocheck(fVal0_0, "bp", tex3D<float>(tex, fUf, fAngle, fVf)); + textype fVal1_0; textocheck(fVal1_0, "bp", tex3D<float>(tex, fUf + 1.0f, fAngle, fVf)); + textype fVal0_1; textocheck(fVal0_1, "bp", tex3D<float>(tex, fUf, fAngle, fVf + 1.0f)); + textype fVal1_1; textocheck(fVal1_1, "bp", tex3D<float>(tex, fUf + 1.0f, fAngle, fVf + 1.0f)); + + textype fVal0 = interpolate(fVal0_0, fVal0_1, (fV_ - fVf_)); + textype fVal1 = interpolate(fVal1_0, fVal1_1, (fV_ - fVf_)); + float fVal = texfrom(interpolate(fVal0, fVal1, (fU_ - fUf_))); + +// float fVal = tex3D<float>(tex, fU, fAngle, fV); + Z[idx] += fr*fr*fVal; + + fUNum += fCu.z; + fVNum += fCv.z; + fDen += fCd.z; + } + } + } + + int endZ = ZSIZE; + if (endZ > dims.iVolZ - startZ) + endZ = dims.iVolZ - startZ; + + for(int i=0; i < endZ; i++) + volData[((startZ+i)*dims.iVolY+Y)*volPitch+X] += Z[i] * fOutputScale; +} //End kernel + + + +// supersampling version +__global__ void dev_cone_BP_SS(void* D_volData, unsigned int volPitch, cudaTextureObject_t tex, int startAngle, int angleOffset, const SDimensions3D dims, int iRaysPerVoxelDim, float fOutputScale) +{ + float* volData = (float*)D_volData; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles - angleOffset) + endAngle = dims.iProjAngles - angleOffset; + + // threadIdx: x = rel x + // y = rel y + + // blockIdx: x = x + y + // y = z + + + // TO TRY: precompute part of detector intersection formulas in shared mem? + // TO TRY: inner loop over z, gather ray values in shared mem + + const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; + const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + + if (X >= dims.iVolX) + return; + if (Y >= dims.iVolY) + return; + + const int startZ = blockIdx.y * g_volBlockZ; + int endZ = startZ + g_volBlockZ; + if (endZ > dims.iVolZ) + endZ = dims.iVolZ; + + float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim; + float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim; + float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim; + const float fSubStep = 1.0f/iRaysPerVoxelDim; + + fOutputScale /= (iRaysPerVoxelDim*iRaysPerVoxelDim*iRaysPerVoxelDim); + + + for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f) + { + + float fVal = 0.0f; + float fAngle = startAngle + angleOffset + 0.5f; + + for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) + { + float4 fCu = gC_C[angle].fNumU; + float4 fCv = gC_C[angle].fNumV; + float4 fCd = gC_C[angle].fDen; + + float fXs = fX; + for (int iSubX = 0; iSubX < iRaysPerVoxelDim; ++iSubX) { + float fYs = fY; + for (int iSubY = 0; iSubY < iRaysPerVoxelDim; ++iSubY) { + float fZs = fZ; + for (int iSubZ = 0; iSubZ < iRaysPerVoxelDim; ++iSubZ) { + + const float fUNum = fCu.w + fXs * fCu.x + fYs * fCu.y + fZs * fCu.z; + const float fVNum = fCv.w + fXs * fCv.x + fYs * fCv.y + fZs * fCv.z; + const float fDen = fCd.w + fXs * fCd.x + fYs * fCd.y + fZs * fCd.z; + + const float fr = __fdividef(1.0f, fDen); + const float fU = fUNum * fr; + const float fV = fVNum * fr; + + fVal += tex3D<float>(tex, fU, fAngle, fV) * fr * fr; + + fZs += fSubStep; + } + fYs += fSubStep; + } + fXs += fSubStep; + } + + } + + volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal * fOutputScale; + } +} + + +bool transferConstants(const SConeProjection* angles, unsigned int iProjAngles, const SProjectorParams3D& params) +{ + DevConeParams *p = new DevConeParams[iProjAngles]; + + // We need three things in the kernel: + // projected coordinates of pixels on the detector: + + // u: || (x-s) v (s-d) || / || u v (s-x) || + // v: -|| u (x-s) (s-d) || / || u v (s-x) || + + // ray density weighting factor for the adjoint + // || u v (s-d) ||^2 / ( |cross(u,v)| * || u v (s-x) ||^2 ) + + // FDK weighting factor + // ( || u v s || / || u v (s-x) || ) ^ 2 + + // Since u and v are ratios with the same denominator, we have + // a degree of freedom to scale the denominator. We use that to make + // the square of the denominator equal to the relevant weighting factor. + + + for (unsigned int i = 0; i < iProjAngles; ++i) { + Vec3 u(angles[i].fDetUX, angles[i].fDetUY, angles[i].fDetUZ); + Vec3 v(angles[i].fDetVX, angles[i].fDetVY, angles[i].fDetVZ); + Vec3 s(angles[i].fSrcX, angles[i].fSrcY, angles[i].fSrcZ); + Vec3 d(angles[i].fDetSX, angles[i].fDetSY, angles[i].fDetSZ); + + + + double fScale; + if (!params.bFDKWeighting) { + // goal: 1/fDen^2 = || u v (s-d) ||^2 / ( |cross(u,v)| * || u v (s-x) ||^2 ) + // fDen = ( sqrt(|cross(u,v)|) * || u v (s-x) || ) / || u v (s-d) || + // i.e. scale = sqrt(|cross(u,v)|) * / || u v (s-d) || + + + // NB: for cross(u,v) we invert the volume scaling (for the voxel + // size normalization) to get the proper dimensions for + // the scaling of the adjoint + + fScale = sqrt(scaled_cross3(u,v,Vec3(params.fVolScaleX,params.fVolScaleY,params.fVolScaleZ)).norm()) / det3(u, v, s-d); + } else { + // goal: 1/fDen = || u v s || / || u v (s-x) || + // fDen = || u v (s-x) || / || u v s || + // i.e., scale = 1 / || u v s || + + fScale = 1.0 / det3(u, v, s); + } + + p[i].fNumU.w = fScale * det3(s,v,d); + p[i].fNumU.x = fScale * det3x(v,s-d); + p[i].fNumU.y = fScale * det3y(v,s-d); + p[i].fNumU.z = fScale * det3z(v,s-d); + p[i].fNumV.w = -fScale * det3(s,u,d); + p[i].fNumV.x = -fScale * det3x(u,s-d); + p[i].fNumV.y = -fScale * det3y(u,s-d); + p[i].fNumV.z = -fScale * det3z(u,s-d); + p[i].fDen.w = fScale * det3(u, v, s); // == 1.0 for FDK + p[i].fDen.x = -fScale * det3x(u, v); + p[i].fDen.y = -fScale * det3y(u, v); + p[i].fDen.z = -fScale * det3z(u, v); + } + + // TODO: Check for errors + cudaMemcpyToSymbol(gC_C, p, iProjAngles*sizeof(DevConeParams), 0, cudaMemcpyHostToDevice); + + delete[] p; + + return true; +} + + +bool ConeBP_Array(cudaPitchedPtr D_volumeData, + cudaArray *D_projArray, + const SDimensions3D& dims, const SConeProjection* angles, + const SProjectorParams3D& params) +{ + cudaTextureObject_t D_texObj; + if (!createTextureObject3D(D_projArray, D_texObj)) + return false; + + float fOutputScale; + if (params.bFDKWeighting) { + // NB: assuming cube voxels here + fOutputScale = params.fOutputScale / (params.fVolScaleX); + } else { + fOutputScale = params.fOutputScale * (params.fVolScaleX * params.fVolScaleY * params.fVolScaleZ); + } + + bool ok = true; + + for (unsigned int th = 0; th < dims.iProjAngles; th += g_MaxAngles) { + unsigned int angleCount = g_MaxAngles; + if (th + angleCount > dims.iProjAngles) + angleCount = dims.iProjAngles - th; + + ok = transferConstants(angles, angleCount, params); + if (!ok) + break; + + dim3 dimBlock(g_volBlockX, g_volBlockY); + + dim3 dimGrid(((dims.iVolX/1+g_volBlockX-1)/(g_volBlockX))*((dims.iVolY/1+1*g_volBlockY-1)/(1*g_volBlockY)), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ); + + // timeval t; + // tic(t); + + for (unsigned int i = 0; i < angleCount; i += g_anglesPerBlock) { + // printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr); + if (params.bFDKWeighting) { + if (dims.iVolZ == 1) { + dev_cone_BP<true, 1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale); + } else { + dev_cone_BP<true, g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale); + } + } else if (params.iRaysPerVoxelDim == 1) { + if (dims.iVolZ == 1) { + dev_cone_BP<false, 1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale); + } else { + dev_cone_BP<false, g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale); + } + } else + dev_cone_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, params.iRaysPerVoxelDim, fOutputScale); + } + + // TODO: Consider not synchronizing here, if possible. + ok = checkCuda(cudaThreadSynchronize(), "cone_bp"); + if (!ok) + break; + + angles = angles + angleCount; + // printf("%f\n", toc(t)); + + } + + cudaDestroyTextureObject(D_texObj); + + return ok; +} + +bool ConeBP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SConeProjection* angles, + const SProjectorParams3D& params) +{ + // transfer projections to array + + cudaArray* cuArray = allocateProjectionArray(dims); + transferProjectionsToArray(D_projData, cuArray, dims); + + bool ret = ConeBP_Array(D_volumeData, cuArray, dims, angles, params); + + cudaFreeArray(cuArray); + + return ret; +} + + +} diff --git a/patches/astra-toolbox-approximate-projectors/cone_fp.cu b/patches/astra-toolbox-approximate-projectors/cone_fp.cu new file mode 100644 index 0000000..f11813c --- /dev/null +++ b/patches/astra-toolbox-approximate-projectors/cone_fp.cu @@ -0,0 +1,516 @@ +/* +----------------------------------------------------------------------- +Copyright: 2010-2021, imec Vision Lab, University of Antwerp + 2014-2021, CWI, Amsterdam + +Contact: astra@astra-toolbox.com +Website: http://www.astra-toolbox.com/ + +This file is part of the ASTRA Toolbox. + + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +*/ + +#include "astra/cuda/3d/util3d.h" +#include "astra/cuda/3d/dims3d.h" + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> + +namespace astraCUDA3d { + +static const unsigned int g_anglesPerBlock = 4; + +// thickness of the slices we're splitting the volume up into +static const unsigned int g_blockSlices = 4; +static const unsigned int g_detBlockU = 32; +static const unsigned int g_detBlockV = 32; + +static const unsigned g_MaxAngles = 1024; +__constant__ float gC_SrcX[g_MaxAngles]; +__constant__ float gC_SrcY[g_MaxAngles]; +__constant__ float gC_SrcZ[g_MaxAngles]; +__constant__ float gC_DetSX[g_MaxAngles]; +__constant__ float gC_DetSY[g_MaxAngles]; +__constant__ float gC_DetSZ[g_MaxAngles]; +__constant__ float gC_DetUX[g_MaxAngles]; +__constant__ float gC_DetUY[g_MaxAngles]; +__constant__ float gC_DetUZ[g_MaxAngles]; +__constant__ float gC_DetVX[g_MaxAngles]; +__constant__ float gC_DetVY[g_MaxAngles]; +__constant__ float gC_DetVZ[g_MaxAngles]; + + +// x=0, y=1, z=2 +struct DIR_X { + __device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolX; } + __device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolY; } + __device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; } + __device__ float c0(float x, float y, float z) const { return x; } + __device__ float c1(float x, float y, float z) const { return y; } + __device__ float c2(float x, float y, float z) const { return z; } + __device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f0, f1, f2); } + __device__ float x(float f0, float f1, float f2) const { return f0; } + __device__ float y(float f0, float f1, float f2) const { return f1; } + __device__ float z(float f0, float f1, float f2) const { return f2; } +}; + +// y=0, x=1, z=2 +struct DIR_Y { + __device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolY; } + __device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; } + __device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; } + __device__ float c0(float x, float y, float z) const { return y; } + __device__ float c1(float x, float y, float z) const { return x; } + __device__ float c2(float x, float y, float z) const { return z; } + __device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f0, f2); } + __device__ float x(float f0, float f1, float f2) const { return f1; } + __device__ float y(float f0, float f1, float f2) const { return f0; } + __device__ float z(float f0, float f1, float f2) const { return f2; } +}; + +// z=0, x=1, y=2 +struct DIR_Z { + __device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolZ; } + __device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; } + __device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolY; } + __device__ float c0(float x, float y, float z) const { return z; } + __device__ float c1(float x, float y, float z) const { return x; } + __device__ float c2(float x, float y, float z) const { return y; } + __device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f2, f0); } + __device__ float x(float f0, float f1, float f2) const { return f1; } + __device__ float y(float f0, float f1, float f2) const { return f2; } + __device__ float z(float f0, float f1, float f2) const { return f0; } +}; + +struct SCALE_CUBE { + float fOutputScale; + __device__ float scale(float a1, float a2) const { return sqrt(a1*a1+a2*a2+1.0f) * fOutputScale; } +}; + +struct SCALE_NONCUBE { + float fScale1; + float fScale2; + float fOutputScale; + __device__ float scale(float a1, float a2) const { return sqrt(a1*a1*fScale1+a2*a2*fScale2+1.0f) * fOutputScale; } +}; + + +bool transferConstants(const SConeProjection* angles, unsigned int iProjAngles) +{ + // transfer angles to constant memory + float* tmp = new float[iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(SrcX); + TRANSFER_TO_CONSTANT(SrcY); + TRANSFER_TO_CONSTANT(SrcZ); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetSZ); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + TRANSFER_TO_CONSTANT(DetUZ); + TRANSFER_TO_CONSTANT(DetVX); + TRANSFER_TO_CONSTANT(DetVY); + TRANSFER_TO_CONSTANT(DetVZ); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + return true; +} + + +#include "rounding.h" + + // threadIdx: x = ??? detector (u?) + // y = relative angle + + // blockIdx: x = ??? detector (u+v?) + // y = angle block + +template<class COORD, class SCALE> +__global__ void cone_FP_t(float* D_projData, unsigned int projPitch, + cudaTextureObject_t tex, + unsigned int startSlice, + unsigned int startAngle, unsigned int endAngle, + const SDimensions3D dims, + SCALE sc) +{ + COORD c; + + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; + if (angle >= endAngle) + return; + + const float fSrcX = gC_SrcX[angle]; + const float fSrcY = gC_SrcY[angle]; + const float fSrcZ = gC_SrcZ[angle]; + const float fDetUX = gC_DetUX[angle]; + const float fDetUY = gC_DetUY[angle]; + const float fDetUZ = gC_DetUZ[angle]; + const float fDetVX = gC_DetVX[angle]; + const float fDetVY = gC_DetVY[angle]; + const float fDetVZ = gC_DetVZ[angle]; + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; + + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; + if (detectorU >= dims.iProjU) + return; + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; + int endDetectorV = startDetectorV + g_detBlockV; + if (endDetectorV > dims.iProjV) + endDetectorV = dims.iProjV; + + int endSlice = startSlice + g_blockSlices; + if (endSlice > c.nSlices(dims)) + endSlice = c.nSlices(dims); + + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) + { + /* Trace ray from Src to (detectorU,detectorV) from */ + /* X = startSlice to X = endSlice */ + + const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; + const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; + const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; + + /* (x) ( 1) ( 0) */ + /* ray: (y) = (ay) * x + (by) */ + /* (z) (az) (bz) */ + + const float a1 = (c.c1(fSrcX,fSrcY,fSrcZ) - c.c1(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ)); + const float a2 = (c.c2(fSrcX,fSrcY,fSrcZ) - c.c2(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ)); + const float b1 = c.c1(fSrcX,fSrcY,fSrcZ) - a1 * c.c0(fSrcX,fSrcY,fSrcZ); + const float b2 = c.c2(fSrcX,fSrcY,fSrcZ) - a2 * c.c0(fSrcX,fSrcY,fSrcZ); + + const float fDistCorr = sc.scale(a1, a2); + + float fVal = 0.0f; + + float f0 = startSlice + 0.5f; + float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f; + float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f; + + for (int s = startSlice; s < endSlice; ++s) + { + + float f1f = round(f1) - 0.5f; + float f2f = round(f2) - 0.5f; + + textype f1_ = texto(f1); + textype f2_ = texto(f2); + textype f1f_ = texto(f1f); + textype f2f_ = texto(f2f); + + textype fVal0_0; textocheck(fVal0_0, "fp", c.tex(tex, f0, f1f, f2f)); + textype fVal1_0; textocheck(fVal1_0, "fp", c.tex(tex, f0, f1f + 1.0f, f2f)); + textype fVal0_1; textocheck(fVal0_1, "fp", c.tex(tex, f0, f1f, f2f + 1.0f)); + textype fVal1_1; textocheck(fVal1_1, "fp", c.tex(tex, f0, f1f + 1.0f, f2f + 1.0f)); + + textype fVal0 = interpolate(fVal0_0, fVal0_1, (f2_ - f2f_)); + textype fVal1 = interpolate(fVal1_0, fVal1_1, (f2_ - f2f_)); + fVal += texfrom(interpolate(fVal0, fVal1, (f1_ - f1f_))); + +// fVal += c.tex(tex, f0, f1, f2); + f0 += 1.0f; + f1 += a1; + f2 += a2; + } + + fVal *= fDistCorr; + + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; + } +} + +template<class COORD> +__global__ void cone_FP_SS_t(float* D_projData, unsigned int projPitch, + cudaTextureObject_t tex, + unsigned int startSlice, + unsigned int startAngle, unsigned int endAngle, + const SDimensions3D dims, int iRaysPerDetDim, + SCALE_NONCUBE sc) +{ + COORD c; + + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; + if (angle >= endAngle) + return; + + const float fSrcX = gC_SrcX[angle]; + const float fSrcY = gC_SrcY[angle]; + const float fSrcZ = gC_SrcZ[angle]; + const float fDetUX = gC_DetUX[angle]; + const float fDetUY = gC_DetUY[angle]; + const float fDetUZ = gC_DetUZ[angle]; + const float fDetVX = gC_DetVX[angle]; + const float fDetVY = gC_DetVY[angle]; + const float fDetVZ = gC_DetVZ[angle]; + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; + + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; + if (detectorU >= dims.iProjU) + return; + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; + int endDetectorV = startDetectorV + g_detBlockV; + if (endDetectorV > dims.iProjV) + endDetectorV = dims.iProjV; + + int endSlice = startSlice + g_blockSlices; + if (endSlice > c.nSlices(dims)) + endSlice = c.nSlices(dims); + + const float fSubStep = 1.0f/iRaysPerDetDim; + + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) + { + /* Trace ray from Src to (detectorU,detectorV) from */ + /* X = startSlice to X = endSlice */ + + float fV = 0.0f; + + float fdU = detectorU - 0.5f + 0.5f*fSubStep; + for (int iSubU = 0; iSubU < iRaysPerDetDim; ++iSubU, fdU+=fSubStep) { + float fdV = detectorV - 0.5f + 0.5f*fSubStep; + for (int iSubV = 0; iSubV < iRaysPerDetDim; ++iSubV, fdV+=fSubStep) { + + const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX; + const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY; + const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ; + + /* (x) ( 1) ( 0) */ + /* ray: (y) = (ay) * x + (by) */ + /* (z) (az) (bz) */ + + const float a1 = (c.c1(fSrcX,fSrcY,fSrcZ) - c.c1(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ)); + const float a2 = (c.c2(fSrcX,fSrcY,fSrcZ) - c.c2(fDetX,fDetY,fDetZ)) / (c.c0(fSrcX,fSrcY,fSrcZ) - c.c0(fDetX,fDetY,fDetZ)); + const float b1 = c.c1(fSrcX,fSrcY,fSrcZ) - a1 * c.c0(fSrcX,fSrcY,fSrcZ); + const float b2 = c.c2(fSrcX,fSrcY,fSrcZ) - a2 * c.c0(fSrcX,fSrcY,fSrcZ); + + const float fDistCorr = sc.scale(a1, a2); + + float fVal = 0.0f; + + float f0 = startSlice + 0.5f; + float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f; + float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f; + + for (int s = startSlice; s < endSlice; ++s) + { + fVal += c.tex(tex, f0, f1, f2); + f0 += 1.0f; + f1 += a1; + f2 += a2; + } + + fVal *= fDistCorr; + fV += fVal; + + } + } + + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (iRaysPerDetDim * iRaysPerDetDim); + } +} + + +bool ConeFP_Array_internal(cudaPitchedPtr D_projData, + cudaTextureObject_t D_texObj, + const SDimensions3D& dims, + unsigned int angleCount, const SConeProjection* angles, + const SProjectorParams3D& params) +{ + if (!transferConstants(angles, angleCount)) + return false; + + std::list<cudaStream_t> streams; + dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles + + // Run over all angles, grouping them into groups of the same + // orientation (roughly horizontal vs. roughly vertical). + // Start a stream of grids for each such group. + + unsigned int blockStart = 0; + unsigned int blockEnd = 0; + int blockDirection = 0; + + bool cube = true; + if (abs(params.fVolScaleX / params.fVolScaleY - 1.0) > 0.00001) + cube = false; + if (abs(params.fVolScaleX / params.fVolScaleZ - 1.0) > 0.00001) + cube = false; + + SCALE_CUBE scube; + scube.fOutputScale = params.fOutputScale * params.fVolScaleX; + + SCALE_NONCUBE snoncubeX; + float fS1 = params.fVolScaleY / params.fVolScaleX; + snoncubeX.fScale1 = fS1 * fS1; + float fS2 = params.fVolScaleZ / params.fVolScaleX; + snoncubeX.fScale2 = fS2 * fS2; + snoncubeX.fOutputScale = params.fOutputScale * params.fVolScaleX; + + SCALE_NONCUBE snoncubeY; + fS1 = params.fVolScaleX / params.fVolScaleY; + snoncubeY.fScale1 = fS1 * fS1; + fS2 = params.fVolScaleZ / params.fVolScaleY; + snoncubeY.fScale2 = fS2 * fS2; + snoncubeY.fOutputScale = params.fOutputScale * params.fVolScaleY; + + SCALE_NONCUBE snoncubeZ; + fS1 = params.fVolScaleX / params.fVolScaleZ; + snoncubeZ.fScale1 = fS1 * fS1; + fS2 = params.fVolScaleY / params.fVolScaleZ; + snoncubeZ.fScale2 = fS2 * fS2; + snoncubeZ.fOutputScale = params.fOutputScale * params.fVolScaleZ; + + // timeval t; + // tic(t); + + for (unsigned int a = 0; a <= angleCount; ++a) { + int dir = -1; + if (a != angleCount) { + float dX = fabsf(angles[a].fSrcX - (angles[a].fDetSX + dims.iProjU*angles[a].fDetUX*0.5f + dims.iProjV*angles[a].fDetVX*0.5f)); + float dY = fabsf(angles[a].fSrcY - (angles[a].fDetSY + dims.iProjU*angles[a].fDetUY*0.5f + dims.iProjV*angles[a].fDetVY*0.5f)); + float dZ = fabsf(angles[a].fSrcZ - (angles[a].fDetSZ + dims.iProjU*angles[a].fDetUZ*0.5f + dims.iProjV*angles[a].fDetVZ*0.5f)); + + if (dX >= dY && dX >= dZ) + dir = 0; + else if (dY >= dX && dY >= dZ) + dir = 1; + else + dir = 2; + } + + if (a == angleCount || dir != blockDirection) { + // block done + + blockEnd = a; + if (blockStart != blockEnd) { + + dim3 dimGrid( + ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), +(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); + + // TODO: consider limiting number of handle (chaotic) geoms + // with many alternating directions + cudaStream_t stream; + cudaStreamCreate(&stream); + streams.push_back(stream); + + // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y); + + if (blockDirection == 0) { + for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) + if (params.iRaysPerDetDim == 1) + if (cube) + cone_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube); + else + cone_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, snoncubeX); + else + cone_FP_SS_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeX); + } else if (blockDirection == 1) { + for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) + if (params.iRaysPerDetDim == 1) + if (cube) + cone_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube); + else + cone_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, snoncubeY); + else + cone_FP_SS_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeY); + } else if (blockDirection == 2) { + for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) + if (params.iRaysPerDetDim == 1) + if (cube) + cone_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube); + else + cone_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, snoncubeZ); + else + cone_FP_SS_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeZ); + } + + } + + blockDirection = dir; + blockStart = a; + } + } + + bool ok = true; + + for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) { + ok &= checkCuda(cudaStreamSynchronize(*iter), "cone_fp"); + cudaStreamDestroy(*iter); + } + + // printf("%f\n", toc(t)); + + return ok; +} + + +bool ConeFP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SConeProjection* angles, + const SProjectorParams3D& params) +{ + // transfer volume to array + cudaArray* cuArray = allocateVolumeArray(dims); + transferVolumeToArray(D_volumeData, cuArray, dims); + + cudaTextureObject_t D_texObj; + if (!createTextureObject3D(cuArray, D_texObj)) { + cudaFreeArray(cuArray); + return false; + } + + bool ret; + + for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) { + unsigned int iEndAngle = iAngle + g_MaxAngles; + if (iEndAngle >= dims.iProjAngles) + iEndAngle = dims.iProjAngles; + + cudaPitchedPtr D_subprojData = D_projData; + D_subprojData.ptr = (char*)D_projData.ptr + iAngle * D_projData.pitch; + + ret = ConeFP_Array_internal(D_subprojData, D_texObj, + dims, iEndAngle - iAngle, angles + iAngle, + params); + if (!ret) + break; + } + + cudaFreeArray(cuArray); + + return ret; +} + + +} diff --git a/patches/astra-toolbox-approximate-projectors/par3d_bp.cu b/patches/astra-toolbox-approximate-projectors/par3d_bp.cu new file mode 100644 index 0000000..7958ac9 --- /dev/null +++ b/patches/astra-toolbox-approximate-projectors/par3d_bp.cu @@ -0,0 +1,327 @@ +/* +----------------------------------------------------------------------- +Copyright: 2010-2021, imec Vision Lab, University of Antwerp + 2014-2021, CWI, Amsterdam + +Contact: astra@astra-toolbox.com +Website: http://www.astra-toolbox.com/ + +This file is part of the ASTRA Toolbox. + + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +*/ + +#include "astra/cuda/3d/util3d.h" +#include "astra/cuda/3d/dims3d.h" + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> + +namespace astraCUDA3d { + +static const unsigned int g_volBlockZ = 6; + +static const unsigned int g_anglesPerBlock = 32; +static const unsigned int g_volBlockX = 16; +static const unsigned int g_volBlockY = 32; + +static const unsigned g_MaxAngles = 1024; + +struct DevPar3DParams { + float4 fNumU; + float4 fNumV; +}; + +__constant__ DevPar3DParams gC_C[g_MaxAngles]; +__constant__ float gC_scale[g_MaxAngles]; + + +#include "rounding.h" + +template<unsigned int ZSIZE> +__global__ void dev_par3D_BP(void* D_volData, unsigned int volPitch, cudaTextureObject_t tex, int startAngle, int angleOffset, const SDimensions3D dims, float fOutputScale) +{ + float* volData = (float*)D_volData; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles - angleOffset) + endAngle = dims.iProjAngles - angleOffset; + + // threadIdx: x = rel x + // y = rel y + + // blockIdx: x = x + y + // y = z + + + const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; + const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + + if (X >= dims.iVolX) + return; + if (Y >= dims.iVolY) + return; + + const int startZ = blockIdx.y * g_volBlockZ; + + float fX = X - 0.5f*dims.iVolX + 0.5f; + float fY = Y - 0.5f*dims.iVolY + 0.5f; + float fZ = startZ - 0.5f*dims.iVolZ + 0.5f; + + float Z[ZSIZE]; + for(int i=0; i < ZSIZE; i++) + Z[i] = 0.0f; + + { + float fAngle = startAngle + angleOffset + 0.5f; + + for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) + { + + float4 fCu = gC_C[angle].fNumU; + float4 fCv = gC_C[angle].fNumV; + float fS = gC_scale[angle]; + + float fU = fCu.w + fX * fCu.x + fY * fCu.y + fZ * fCu.z; + float fV = fCv.w + fX * fCv.x + fY * fCv.y + fZ * fCv.z; +// printf("%f %f\n", fU, fV); + + for (int idx = 0; idx < ZSIZE; ++idx) { + float fVal; + textype h5 = texto(0.5f); + textype fU_ = texto(fU); + textype fUf_ = texto(floor(fU)); + float fUf = floor(fU); + + if ((fU - fUf) < 0.5f) { + textype fVal1 = texto(tex3D<float>(tex, fUf - 0.5f, fAngle, fV)); + textype fVal2 = texto(tex3D<float>(tex, fUf + 0.5f, fAngle, fV)); + fVal = texfrom(fVal1 + (fU_ + h5 - fUf_) * (fVal2 - fVal1)); + } else { + textype fVal1 = texto(tex3D<float>(tex, fUf + 0.5f, fAngle, fV)); + textype fVal2 = texto(tex3D<float>(tex, fUf + 1.5f, fAngle, fV)); + fVal = texfrom(fVal1 + (fU_ - h5 - fUf_) * (fVal2 - fVal1)); + } + +// float fVal = tex3D<float>(tex, fU, fAngle, fV); + Z[idx] += fVal * fS; + + fU += fCu.z; + fV += fCv.z; + } + + } + } + + int endZ = ZSIZE; + if (endZ > dims.iVolZ - startZ) + endZ = dims.iVolZ - startZ; + + for(int i=0; i < endZ; i++) + volData[((startZ+i)*dims.iVolY+Y)*volPitch+X] += Z[i] * fOutputScale; +} + +// supersampling version +__global__ void dev_par3D_BP_SS(void* D_volData, unsigned int volPitch, cudaTextureObject_t tex, int startAngle, int angleOffset, const SDimensions3D dims, int iRaysPerVoxelDim, float fOutputScale) +{ + float* volData = (float*)D_volData; + + int endAngle = startAngle + g_anglesPerBlock; + if (endAngle > dims.iProjAngles - angleOffset) + endAngle = dims.iProjAngles - angleOffset; + + // threadIdx: x = rel x + // y = rel y + + // blockIdx: x = x + y + // y = z + + + // TO TRY: precompute part of detector intersection formulas in shared mem? + // TO TRY: inner loop over z, gather ray values in shared mem + + const int X = blockIdx.x % ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockX + threadIdx.x; + const int Y = blockIdx.x / ((dims.iVolX+g_volBlockX-1)/g_volBlockX) * g_volBlockY + threadIdx.y; + + if (X >= dims.iVolX) + return; + if (Y >= dims.iVolY) + return; + + const int startZ = blockIdx.y * g_volBlockZ; + int endZ = startZ + g_volBlockZ; + if (endZ > dims.iVolZ) + endZ = dims.iVolZ; + + float fX = X - 0.5f*dims.iVolX + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim; + float fY = Y - 0.5f*dims.iVolY + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim; + float fZ = startZ - 0.5f*dims.iVolZ + 0.5f - 0.5f + 0.5f/iRaysPerVoxelDim; + + const float fSubStep = 1.0f/iRaysPerVoxelDim; + + fOutputScale /= (iRaysPerVoxelDim*iRaysPerVoxelDim*iRaysPerVoxelDim); + + + for (int Z = startZ; Z < endZ; ++Z, fZ += 1.0f) + { + + float fVal = 0.0f; + float fAngle = startAngle + angleOffset + 0.5f; + + for (int angle = startAngle; angle < endAngle; ++angle, fAngle += 1.0f) + { + float4 fCu = gC_C[angle].fNumU; + float4 fCv = gC_C[angle].fNumV; + float fS = gC_scale[angle]; + + float fXs = fX; + for (int iSubX = 0; iSubX < iRaysPerVoxelDim; ++iSubX) { + float fYs = fY; + for (int iSubY = 0; iSubY < iRaysPerVoxelDim; ++iSubY) { + float fZs = fZ; + for (int iSubZ = 0; iSubZ < iRaysPerVoxelDim; ++iSubZ) { + + const float fU = fCu.w + fXs * fCu.x + fYs * fCu.y + fZs * fCu.z; + const float fV = fCv.w + fXs * fCv.x + fYs * fCv.y + fZs * fCv.z; + + fVal += tex3D<float>(tex, fU, fAngle, fV) * fS; + fZs += fSubStep; + } + fYs += fSubStep; + } + fXs += fSubStep; + } + + } + + volData[(Z*dims.iVolY+Y)*volPitch+X] += fVal * fOutputScale; + } + +} + +bool transferConstants(const SPar3DProjection* angles, unsigned int iProjAngles, const SProjectorParams3D& params) +{ + DevPar3DParams *p = new DevPar3DParams[iProjAngles]; + float *s = new float[iProjAngles]; + + for (unsigned int i = 0; i < iProjAngles; ++i) { + Vec3 u(angles[i].fDetUX, angles[i].fDetUY, angles[i].fDetUZ); + Vec3 v(angles[i].fDetVX, angles[i].fDetVY, angles[i].fDetVZ); + Vec3 r(angles[i].fRayX, angles[i].fRayY, angles[i].fRayZ); + Vec3 d(angles[i].fDetSX, angles[i].fDetSY, angles[i].fDetSZ); + + double fDen = det3(r,u,v); + p[i].fNumU.x = -det3x(r,v) / fDen; + p[i].fNumU.y = -det3y(r,v) / fDen; + p[i].fNumU.z = -det3z(r,v) / fDen; + p[i].fNumU.w = -det3(r,d,v) / fDen; + p[i].fNumV.x = det3x(r,u) / fDen; + p[i].fNumV.y = det3y(r,u) / fDen; + p[i].fNumV.z = det3z(r,u) / fDen; + p[i].fNumV.w = det3(r,d,u) / fDen; + + s[i] = 1.0 / scaled_cross3(u,v,Vec3(params.fVolScaleX,params.fVolScaleY,params.fVolScaleZ)).norm(); + } + + cudaMemcpyToSymbol(gC_C, p, iProjAngles*sizeof(DevPar3DParams), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(gC_scale, s, iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); + + delete[] p; + delete[] s; + + return true; +} + +bool Par3DBP_Array(cudaPitchedPtr D_volumeData, + cudaArray *D_projArray, + const SDimensions3D& dims, const SPar3DProjection* angles, + const SProjectorParams3D& params) +{ + cudaTextureObject_t D_texObj; + if (!createTextureObject3D(D_projArray, D_texObj)) + return false; + + float fOutputScale = params.fOutputScale * params.fVolScaleX * params.fVolScaleY * params.fVolScaleZ; + + bool ok = true; + + for (unsigned int th = 0; th < dims.iProjAngles; th += g_MaxAngles) { + unsigned int angleCount = g_MaxAngles; + if (th + angleCount > dims.iProjAngles) + angleCount = dims.iProjAngles - th; + + ok = transferConstants(angles, angleCount, params); + if (!ok) + break; + + dim3 dimBlock(g_volBlockX, g_volBlockY); + + dim3 dimGrid(((dims.iVolX+g_volBlockX-1)/g_volBlockX)*((dims.iVolY+g_volBlockY-1)/g_volBlockY), (dims.iVolZ+g_volBlockZ-1)/g_volBlockZ); + + // timeval t; + // tic(t); + + for (unsigned int i = 0; i < angleCount; i += g_anglesPerBlock) { + // printf("Calling BP: %d, %dx%d, %dx%d to %p\n", i, dimBlock.x, dimBlock.y, dimGrid.x, dimGrid.y, (void*)D_volumeData.ptr); + if (params.iRaysPerVoxelDim == 1) { + if (dims.iVolZ == 1) { + dev_par3D_BP<1><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale); + } else { + dev_par3D_BP<g_volBlockZ><<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, fOutputScale); + } + } else + dev_par3D_BP_SS<<<dimGrid, dimBlock>>>(D_volumeData.ptr, D_volumeData.pitch/sizeof(float), D_texObj, i, th, dims, params.iRaysPerVoxelDim, fOutputScale); + } + + // TODO: Consider not synchronizing here, if possible. + ok = checkCuda(cudaThreadSynchronize(), "cone_bp"); + if (!ok) + break; + + angles = angles + angleCount; + // printf("%f\n", toc(t)); + + } + + cudaDestroyTextureObject(D_texObj); + + return true; +} + +bool Par3DBP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + const SProjectorParams3D& params) +{ + // transfer projections to array + + cudaArray* cuArray = allocateProjectionArray(dims); + transferProjectionsToArray(D_projData, cuArray, dims); + + bool ret = Par3DBP_Array(D_volumeData, cuArray, dims, angles, params); + + cudaFreeArray(cuArray); + + return ret; +} + + +} diff --git a/patches/astra-toolbox-approximate-projectors/par3d_fp.cu b/patches/astra-toolbox-approximate-projectors/par3d_fp.cu new file mode 100644 index 0000000..075784b --- /dev/null +++ b/patches/astra-toolbox-approximate-projectors/par3d_fp.cu @@ -0,0 +1,770 @@ +/* +----------------------------------------------------------------------- +Copyright: 2010-2021, imec Vision Lab, University of Antwerp + 2014-2021, CWI, Amsterdam + +Contact: astra@astra-toolbox.com +Website: http://www.astra-toolbox.com/ + +This file is part of the ASTRA Toolbox. + + +The ASTRA Toolbox is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +The ASTRA Toolbox is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with the ASTRA Toolbox. If not, see <http://www.gnu.org/licenses/>. + +----------------------------------------------------------------------- +*/ + +#include "astra/cuda/3d/util3d.h" +#include "astra/cuda/3d/dims3d.h" + +#include <cstdio> +#include <cassert> +#include <iostream> +#include <list> + +#include <cuda.h> + +namespace astraCUDA3d { + +static const unsigned int g_anglesPerBlock = 4; + +// thickness of the slices we're splitting the volume up into +static const unsigned int g_blockSlices = 32; +static const unsigned int g_detBlockU = 32; +static const unsigned int g_detBlockV = 32; + +static const unsigned g_MaxAngles = 1024; +__constant__ float gC_RayX[g_MaxAngles]; +__constant__ float gC_RayY[g_MaxAngles]; +__constant__ float gC_RayZ[g_MaxAngles]; +__constant__ float gC_DetSX[g_MaxAngles]; +__constant__ float gC_DetSY[g_MaxAngles]; +__constant__ float gC_DetSZ[g_MaxAngles]; +__constant__ float gC_DetUX[g_MaxAngles]; +__constant__ float gC_DetUY[g_MaxAngles]; +__constant__ float gC_DetUZ[g_MaxAngles]; +__constant__ float gC_DetVX[g_MaxAngles]; +__constant__ float gC_DetVY[g_MaxAngles]; +__constant__ float gC_DetVZ[g_MaxAngles]; + + +// x=0, y=1, z=2 +struct DIR_X { + __device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolX; } + __device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolY; } + __device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; } + __device__ float c0(float x, float y, float z) const { return x; } + __device__ float c1(float x, float y, float z) const { return y; } + __device__ float c2(float x, float y, float z) const { return z; } + __device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f0, f1, f2); } + __device__ float x(float f0, float f1, float f2) const { return f0; } + __device__ float y(float f0, float f1, float f2) const { return f1; } + __device__ float z(float f0, float f1, float f2) const { return f2; } +}; + +// y=0, x=1, z=2 +struct DIR_Y { + __device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolY; } + __device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; } + __device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolZ; } + __device__ float c0(float x, float y, float z) const { return y; } + __device__ float c1(float x, float y, float z) const { return x; } + __device__ float c2(float x, float y, float z) const { return z; } + __device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f0, f2); } + __device__ float x(float f0, float f1, float f2) const { return f1; } + __device__ float y(float f0, float f1, float f2) const { return f0; } + __device__ float z(float f0, float f1, float f2) const { return f2; } +}; + +// z=0, x=1, y=2 +struct DIR_Z { + __device__ float nSlices(const SDimensions3D& dims) const { return dims.iVolZ; } + __device__ float nDim1(const SDimensions3D& dims) const { return dims.iVolX; } + __device__ float nDim2(const SDimensions3D& dims) const { return dims.iVolY; } + __device__ float c0(float x, float y, float z) const { return z; } + __device__ float c1(float x, float y, float z) const { return x; } + __device__ float c2(float x, float y, float z) const { return y; } + __device__ float tex(cudaTextureObject_t tex, float f0, float f1, float f2) const { return tex3D<float>(tex, f1, f2, f0); } + __device__ float x(float f0, float f1, float f2) const { return f1; } + __device__ float y(float f0, float f1, float f2) const { return f2; } + __device__ float z(float f0, float f1, float f2) const { return f0; } +}; + +struct SCALE_CUBE { + float fOutputScale; + __device__ float scale(float a1, float a2) const { return sqrt(a1*a1+a2*a2+1.0f) * fOutputScale; } +}; + +struct SCALE_NONCUBE { + float fScale1; + float fScale2; + float fOutputScale; + __device__ float scale(float a1, float a2) const { return sqrt(a1*a1*fScale1+a2*a2*fScale2+1.0f) * fOutputScale; } +}; + +bool transferConstants(const SPar3DProjection* angles, unsigned int iProjAngles) +{ + // transfer angles to constant memory + float* tmp = new float[iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(RayX); + TRANSFER_TO_CONSTANT(RayY); + TRANSFER_TO_CONSTANT(RayZ); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetSZ); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + TRANSFER_TO_CONSTANT(DetUZ); + TRANSFER_TO_CONSTANT(DetVX); + TRANSFER_TO_CONSTANT(DetVY); + TRANSFER_TO_CONSTANT(DetVZ); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + return true; +} + + +// threadIdx: x = u detector +// y = relative angle +// blockIdx: x = u/v detector +// y = angle block + +#include "rounding.h" + +template<class COORD, class SCALE> +__global__ void par3D_FP_t(float* D_projData, unsigned int projPitch, + cudaTextureObject_t tex, + unsigned int startSlice, + unsigned int startAngle, unsigned int endAngle, + const SDimensions3D dims, + SCALE sc) +{ + COORD c; + + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; + if (angle >= endAngle) + return; + + const float fRayX = gC_RayX[angle]; + const float fRayY = gC_RayY[angle]; + const float fRayZ = gC_RayZ[angle]; + const float fDetUX = gC_DetUX[angle]; + const float fDetUY = gC_DetUY[angle]; + const float fDetUZ = gC_DetUZ[angle]; + const float fDetVX = gC_DetVX[angle]; + const float fDetVY = gC_DetVY[angle]; + const float fDetVZ = gC_DetVZ[angle]; + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; + + const float a1 = c.c1(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ); + const float a2 = c.c2(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ); + const float fDistCorr = sc.scale(a1, a2); + + + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; + if (detectorU >= dims.iProjU) + return; + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; + int endDetectorV = startDetectorV + g_detBlockV; + if (endDetectorV > dims.iProjV) + endDetectorV = dims.iProjV; + + int endSlice = startSlice + g_blockSlices; + if (endSlice > c.nSlices(dims)) + endSlice = c.nSlices(dims); + + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) + { + /* Trace ray in direction Ray to (detectorU,detectorV) from */ + /* X = startSlice to X = endSlice */ + + const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; + const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; + const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; + + /* (x) ( 1) ( 0) */ + /* ray: (y) = (ay) * x + (by) */ + /* (z) (az) (bz) */ + + const float b1 = c.c1(fDetX,fDetY,fDetZ) - a1 * c.c0(fDetX,fDetY,fDetZ); + const float b2 = c.c2(fDetX,fDetY,fDetZ) - a2 * c.c0(fDetX,fDetY,fDetZ); + + float fVal = 0.0f; + + float f0 = startSlice + 0.5f; + float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f; + float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f; + //printf("%f, %f (%f), %f (%f)\n", f0, f1, a1, f2, a2); // Only f1 non linear + + for (int s = startSlice; s < endSlice; ++s) + { + + textype h5 = texto(0.5f); + textype f1_ = texto(f1); + textype f1f_ = texto(floor(f1)); + float f1f = floor(f1); + + if ((f1 - f1f) < 0.5f) { + textype fVal1 = texto(c.tex(tex, f0, f1f - 0.5f, f2)); + textype fVal2 = texto(c.tex(tex, f0, f1f + 0.5f, f2)); + fVal += texfrom(fVal1 + (f1_ + h5 - f1f_) * (fVal2 - fVal1)); +// fVal += texfrom(__hfma(__hadd(h5,__hsub(f1_, f1f_)), __hsub(fVal2, fVal1), fVal1)); + } else { + textype fVal1 = texto(c.tex(tex, f0, f1f + 0.5f, f2)); + textype fVal2 = texto(c.tex(tex, f0, f1f + 1.5f, f2)); + fVal += texfrom(fVal1 + (f1_ - h5 - f1f_) * (fVal2 - fVal1)); + } + +// fVal += c.tex(tex, f0, f1, f2); + f0 += 1.0f; + f1 += a1; + f2 += a2; + } + + fVal *= fDistCorr; + + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; + } +} + +// Supersampling version +template<class COORD> +__global__ void par3D_FP_SS_t(float* D_projData, unsigned int projPitch, + cudaTextureObject_t tex, + unsigned int startSlice, + unsigned int startAngle, unsigned int endAngle, + const SDimensions3D dims, int iRaysPerDetDim, + SCALE_NONCUBE sc) +{ + COORD c; + + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; + if (angle >= endAngle) + return; + + const float fRayX = gC_RayX[angle]; + const float fRayY = gC_RayY[angle]; + const float fRayZ = gC_RayZ[angle]; + const float fDetUX = gC_DetUX[angle]; + const float fDetUY = gC_DetUY[angle]; + const float fDetUZ = gC_DetUZ[angle]; + const float fDetVX = gC_DetVX[angle]; + const float fDetVY = gC_DetVY[angle]; + const float fDetVZ = gC_DetVZ[angle]; + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; + + const float a1 = c.c1(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ); + const float a2 = c.c2(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ); + const float fDistCorr = sc.scale(a1, a2); + + + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; + if (detectorU >= dims.iProjU) + return; + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; + int endDetectorV = startDetectorV + g_detBlockV; + if (endDetectorV > dims.iProjV) + endDetectorV = dims.iProjV; + + int endSlice = startSlice + g_blockSlices; + if (endSlice > c.nSlices(dims)) + endSlice = c.nSlices(dims); + + const float fSubStep = 1.0f/iRaysPerDetDim; + + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) + { + + float fV = 0.0f; + + float fdU = detectorU - 0.5f + 0.5f*fSubStep; + for (int iSubU = 0; iSubU < iRaysPerDetDim; ++iSubU, fdU+=fSubStep) { + float fdV = detectorV - 0.5f + 0.5f*fSubStep; + for (int iSubV = 0; iSubV < iRaysPerDetDim; ++iSubV, fdV+=fSubStep) { + + /* Trace ray in direction Ray to (detectorU,detectorV) from */ + /* X = startSlice to X = endSlice */ + + const float fDetX = fDetSX + fdU*fDetUX + fdV*fDetVX; + const float fDetY = fDetSY + fdU*fDetUY + fdV*fDetVY; + const float fDetZ = fDetSZ + fdU*fDetUZ + fdV*fDetVZ; + + /* (x) ( 1) ( 0) */ + /* ray: (y) = (ay) * x + (by) */ + /* (z) (az) (bz) */ + + const float b1 = c.c1(fDetX,fDetY,fDetZ) - a1 * c.c0(fDetX,fDetY,fDetZ); + const float b2 = c.c2(fDetX,fDetY,fDetZ) - a2 * c.c0(fDetX,fDetY,fDetZ); + + + float fVal = 0.0f; + + float f0 = startSlice + 0.5f; + float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f; + float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f; + + for (int s = startSlice; s < endSlice; ++s) + { + fVal += c.tex(tex, f0, f1, f2); + + f0 += 1.0f; + f1 += a1; + f2 += a2; + } + + fV += fVal; + + } + } + + fV *= fDistCorr; + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fV / (iRaysPerDetDim * iRaysPerDetDim); + } +} + + +__device__ float dirWeights(float fX, float fN) { + if (fX <= -0.5f) // outside image on left + return 0.0f; + if (fX <= 0.5f) // half outside image on left + return (fX + 0.5f) * (fX + 0.5f); + if (fX <= fN - 0.5f) { // inside image + float t = fX + 0.5f - floorf(fX + 0.5f); + return t*t + (1-t)*(1-t); + } + if (fX <= fN + 0.5f) // half outside image on right + return (fN + 0.5f - fX) * (fN + 0.5f - fX); + return 0.0f; // outside image on right +} + +template<class COORD> +__global__ void par3D_FP_SumSqW_t(float* D_projData, unsigned int projPitch, + unsigned int startSlice, + unsigned int startAngle, unsigned int endAngle, + const SDimensions3D dims, + SCALE_NONCUBE sc) +{ + COORD c; + + int angle = startAngle + blockIdx.y * g_anglesPerBlock + threadIdx.y; + if (angle >= endAngle) + return; + + const float fRayX = gC_RayX[angle]; + const float fRayY = gC_RayY[angle]; + const float fRayZ = gC_RayZ[angle]; + const float fDetUX = gC_DetUX[angle]; + const float fDetUY = gC_DetUY[angle]; + const float fDetUZ = gC_DetUZ[angle]; + const float fDetVX = gC_DetVX[angle]; + const float fDetVY = gC_DetVY[angle]; + const float fDetVZ = gC_DetVZ[angle]; + const float fDetSX = gC_DetSX[angle] + 0.5f * fDetUX + 0.5f * fDetVX; + const float fDetSY = gC_DetSY[angle] + 0.5f * fDetUY + 0.5f * fDetVY; + const float fDetSZ = gC_DetSZ[angle] + 0.5f * fDetUZ + 0.5f * fDetVZ; + + const float a1 = c.c1(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ); + const float a2 = c.c2(fRayX,fRayY,fRayZ) / c.c0(fRayX,fRayY,fRayZ); + const float fDistCorr = sc.scale(a1, a2); + + + const int detectorU = (blockIdx.x%((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockU + threadIdx.x; + if (detectorU >= dims.iProjU) + return; + const int startDetectorV = (blockIdx.x/((dims.iProjU+g_detBlockU-1)/g_detBlockU)) * g_detBlockV; + int endDetectorV = startDetectorV + g_detBlockV; + if (endDetectorV > dims.iProjV) + endDetectorV = dims.iProjV; + + int endSlice = startSlice + g_blockSlices; + if (endSlice > c.nSlices(dims)) + endSlice = c.nSlices(dims); + + for (int detectorV = startDetectorV; detectorV < endDetectorV; ++detectorV) + { + /* Trace ray in direction Ray to (detectorU,detectorV) from */ + /* X = startSlice to X = endSlice */ + + const float fDetX = fDetSX + detectorU*fDetUX + detectorV*fDetVX; + const float fDetY = fDetSY + detectorU*fDetUY + detectorV*fDetVY; + const float fDetZ = fDetSZ + detectorU*fDetUZ + detectorV*fDetVZ; + + /* (x) ( 1) ( 0) */ + /* ray: (y) = (ay) * x + (by) */ + /* (z) (az) (bz) */ + + const float b1 = c.c1(fDetX,fDetY,fDetZ) - a1 * c.c0(fDetX,fDetY,fDetZ); + const float b2 = c.c2(fDetX,fDetY,fDetZ) - a2 * c.c0(fDetX,fDetY,fDetZ); + + float fVal = 0.0f; + + float f0 = startSlice + 0.5f; + float f1 = a1 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b1 + 0.5f*c.nDim1(dims) - 0.5f + 0.5f; + float f2 = a2 * (startSlice - 0.5f*c.nSlices(dims) + 0.5f) + b2 + 0.5f*c.nDim2(dims) - 0.5f + 0.5f; + + for (int s = startSlice; s < endSlice; ++s) + { + fVal += dirWeights(f1, c.nDim1(dims)) * dirWeights(f2, c.nDim2(dims)); + f0 += 1.0f; + f1 += a1; + f2 += a2; + } + + fVal *= fDistCorr * fDistCorr; + D_projData[(detectorV*dims.iProjAngles+angle)*projPitch+detectorU] += fVal; + } +} + +// Supersampling version +// TODO + + +bool Par3DFP_Array_internal(cudaPitchedPtr D_projData, + cudaTextureObject_t D_texObj, + const SDimensions3D& dims, + unsigned int angleCount, const SPar3DProjection* angles, + const SProjectorParams3D& params) +{ + if (!transferConstants(angles, angleCount)) + return false; + + std::list<cudaStream_t> streams; + dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles + + // Run over all angles, grouping them into groups of the same + // orientation (roughly horizontal vs. roughly vertical). + // Start a stream of grids for each such group. + + unsigned int blockStart = 0; + unsigned int blockEnd = 0; + int blockDirection = 0; + + bool cube = true; + if (abs(params.fVolScaleX / params.fVolScaleY - 1.0) > 0.00001) + cube = false; + if (abs(params.fVolScaleX / params.fVolScaleZ - 1.0) > 0.00001) + cube = false; + + SCALE_CUBE scube; + scube.fOutputScale = params.fOutputScale * params.fVolScaleX; + + SCALE_NONCUBE snoncubeX; + float fS1 = params.fVolScaleY / params.fVolScaleX; + snoncubeX.fScale1 = fS1 * fS1; + float fS2 = params.fVolScaleZ / params.fVolScaleX; + snoncubeX.fScale2 = fS2 * fS2; + snoncubeX.fOutputScale = params.fOutputScale * params.fVolScaleX; + + SCALE_NONCUBE snoncubeY; + fS1 = params.fVolScaleX / params.fVolScaleY; + snoncubeY.fScale1 = fS1 * fS1; + fS2 = params.fVolScaleY / params.fVolScaleY; + snoncubeY.fScale2 = fS2 * fS2; + snoncubeY.fOutputScale = params.fOutputScale * params.fVolScaleY; + + SCALE_NONCUBE snoncubeZ; + fS1 = params.fVolScaleX / params.fVolScaleZ; + snoncubeZ.fScale1 = fS1 * fS1; + fS2 = params.fVolScaleY / params.fVolScaleZ; + snoncubeZ.fScale2 = fS2 * fS2; + snoncubeZ.fOutputScale = params.fOutputScale * params.fVolScaleZ; + + // timeval t; + // tic(t); + + for (unsigned int a = 0; a <= angleCount; ++a) { + int dir = -1; + if (a != angleCount) { + float dX = fabsf(angles[a].fRayX); + float dY = fabsf(angles[a].fRayY); + float dZ = fabsf(angles[a].fRayZ); + + if (dX >= dY && dX >= dZ) + dir = 0; + else if (dY >= dX && dY >= dZ) + dir = 1; + else + dir = 2; + } + + if (a == angleCount || dir != blockDirection) { + // block done + + blockEnd = a; + if (blockStart != blockEnd) { + + dim3 dimGrid( + ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), +(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); + // TODO: consider limiting number of handle (chaotic) geoms + // with many alternating directions + cudaStream_t stream; + cudaStreamCreate(&stream); + streams.push_back(stream); + + // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y); + + if (blockDirection == 0) { + for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) + if (params.iRaysPerDetDim == 1) + if (cube) + par3D_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj, i, blockStart, blockEnd, dims, scube); + else + par3D_FP_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, snoncubeX); + else + par3D_FP_SS_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeX); + } else if (blockDirection == 1) { + for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) + if (params.iRaysPerDetDim == 1) + if (cube) + par3D_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, scube); + else + par3D_FP_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, snoncubeY); + else + par3D_FP_SS_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeY); + } else if (blockDirection == 2) { + for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) + if (params.iRaysPerDetDim == 1) + if (cube) + par3D_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, scube); + else + par3D_FP_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, snoncubeZ); + else + par3D_FP_SS_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), D_texObj,i, blockStart, blockEnd, dims, params.iRaysPerDetDim, snoncubeZ); + } + + } + + blockDirection = dir; + blockStart = a; + } + } + + bool ok = true; + + for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) { + ok &= checkCuda(cudaStreamSynchronize(*iter), "par3d_fp"); + cudaStreamDestroy(*iter); + } + + // printf("%f\n", toc(t)); + + return ok; +} + +bool Par3DFP(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + const SProjectorParams3D& params) +{ + + // transfer volume to array + cudaArray* cuArray = allocateVolumeArray(dims); + transferVolumeToArray(D_volumeData, cuArray, dims); + + cudaTextureObject_t D_texObj; + if (!createTextureObject3D(cuArray, D_texObj)) { + cudaFreeArray(cuArray); + return false; + } + + bool ret; + + for (unsigned int iAngle = 0; iAngle < dims.iProjAngles; iAngle += g_MaxAngles) { + unsigned int iEndAngle = iAngle + g_MaxAngles; + if (iEndAngle >= dims.iProjAngles) + iEndAngle = dims.iProjAngles; + + cudaPitchedPtr D_subprojData = D_projData; + D_subprojData.ptr = (char*)D_projData.ptr + iAngle * D_projData.pitch; + + ret = Par3DFP_Array_internal(D_subprojData, D_texObj, + dims, iEndAngle - iAngle, angles + iAngle, + params); + if (!ret) + break; + } + + cudaFreeArray(cuArray); + + cudaDestroyTextureObject(D_texObj); + + return ret; +} + + + +bool Par3DFP_SumSqW(cudaPitchedPtr D_volumeData, + cudaPitchedPtr D_projData, + const SDimensions3D& dims, const SPar3DProjection* angles, + const SProjectorParams3D& params) +{ + // transfer angles to constant memory + float* tmp = new float[dims.iProjAngles]; + +#define TRANSFER_TO_CONSTANT(name) do { for (unsigned int i = 0; i < dims.iProjAngles; ++i) tmp[i] = angles[i].f##name ; cudaMemcpyToSymbol(gC_##name, tmp, dims.iProjAngles*sizeof(float), 0, cudaMemcpyHostToDevice); } while (0) + + TRANSFER_TO_CONSTANT(RayX); + TRANSFER_TO_CONSTANT(RayY); + TRANSFER_TO_CONSTANT(RayZ); + TRANSFER_TO_CONSTANT(DetSX); + TRANSFER_TO_CONSTANT(DetSY); + TRANSFER_TO_CONSTANT(DetSZ); + TRANSFER_TO_CONSTANT(DetUX); + TRANSFER_TO_CONSTANT(DetUY); + TRANSFER_TO_CONSTANT(DetUZ); + TRANSFER_TO_CONSTANT(DetVX); + TRANSFER_TO_CONSTANT(DetVY); + TRANSFER_TO_CONSTANT(DetVZ); + +#undef TRANSFER_TO_CONSTANT + + delete[] tmp; + + std::list<cudaStream_t> streams; + dim3 dimBlock(g_detBlockU, g_anglesPerBlock); // region size, angles + + // Run over all angles, grouping them into groups of the same + // orientation (roughly horizontal vs. roughly vertical). + // Start a stream of grids for each such group. + + unsigned int blockStart = 0; + unsigned int blockEnd = 0; + int blockDirection = 0; + + SCALE_NONCUBE snoncubeX; + float fS1 = params.fVolScaleY / params.fVolScaleX; + snoncubeX.fScale1 = fS1 * fS1; + float fS2 = params.fVolScaleZ / params.fVolScaleX; + snoncubeX.fScale2 = fS2 * fS2; + snoncubeX.fOutputScale = params.fOutputScale * params.fVolScaleX; + + SCALE_NONCUBE snoncubeY; + fS1 = params.fVolScaleX / params.fVolScaleY; + snoncubeY.fScale1 = fS1 * fS1; + fS2 = params.fVolScaleY / params.fVolScaleY; + snoncubeY.fScale2 = fS2 * fS2; + snoncubeY.fOutputScale = params.fOutputScale * params.fVolScaleY; + + SCALE_NONCUBE snoncubeZ; + fS1 = params.fVolScaleX / params.fVolScaleZ; + snoncubeZ.fScale1 = fS1 * fS1; + fS2 = params.fVolScaleY / params.fVolScaleZ; + snoncubeZ.fScale2 = fS2 * fS2; + snoncubeZ.fOutputScale = params.fOutputScale * params.fVolScaleZ; + + + // timeval t; + // tic(t); + + for (unsigned int a = 0; a <= dims.iProjAngles; ++a) { + int dir; + if (a != dims.iProjAngles) { + float dX = fabsf(angles[a].fRayX); + float dY = fabsf(angles[a].fRayY); + float dZ = fabsf(angles[a].fRayZ); + + if (dX >= dY && dX >= dZ) + dir = 0; + else if (dY >= dX && dY >= dZ) + dir = 1; + else + dir = 2; + } + + if (a == dims.iProjAngles || dir != blockDirection) { + // block done + + blockEnd = a; + if (blockStart != blockEnd) { + + dim3 dimGrid( + ((dims.iProjU+g_detBlockU-1)/g_detBlockU)*((dims.iProjV+g_detBlockV-1)/g_detBlockV), +(blockEnd-blockStart+g_anglesPerBlock-1)/g_anglesPerBlock); + // TODO: check if we can't immediately + // destroy the stream after use + cudaStream_t stream; + cudaStreamCreate(&stream); + streams.push_back(stream); + + // printf("angle block: %d to %d, %d (%dx%d, %dx%d)\n", blockStart, blockEnd, blockDirection, dimGrid.x, dimGrid.y, dimBlock.x, dimBlock.y); + + if (blockDirection == 0) { + for (unsigned int i = 0; i < dims.iVolX; i += g_blockSlices) + if (params.iRaysPerDetDim == 1) + par3D_FP_SumSqW_t<DIR_X><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, snoncubeX); + else +#if 0 + par3D_FP_SS_SumSqW_dirX<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); +#else + assert(false); +#endif + } else if (blockDirection == 1) { + for (unsigned int i = 0; i < dims.iVolY; i += g_blockSlices) + if (params.iRaysPerDetDim == 1) + par3D_FP_SumSqW_t<DIR_Y><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, snoncubeY); + else +#if 0 + par3D_FP_SS_SumSqW_dirY<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); +#else + assert(false); +#endif + } else if (blockDirection == 2) { + for (unsigned int i = 0; i < dims.iVolZ; i += g_blockSlices) + if (params.iRaysPerDetDim == 1) + par3D_FP_SumSqW_t<DIR_Z><<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, snoncubeZ); + else +#if 0 + par3D_FP_SS_SumSqW_dirZ<<<dimGrid, dimBlock, 0, stream>>>((float*)D_projData.ptr, D_projData.pitch/sizeof(float), i, blockStart, blockEnd, dims, fOutputScale); +#else + assert(false); +#endif + } + + } + + blockDirection = dir; + blockStart = a; + } + } + + bool ok = true; + + for (std::list<cudaStream_t>::iterator iter = streams.begin(); iter != streams.end(); ++iter) { + ok &= checkCuda(cudaStreamSynchronize(*iter), "Par3DFP_SumSqW"); + cudaStreamDestroy(*iter); + } + + // printf("%f\n", toc(t)); + + return ok; +} + + + + + + + +} diff --git a/patches/astra-toolbox-approximate-projectors/rounding.h b/patches/astra-toolbox-approximate-projectors/rounding.h new file mode 100644 index 0000000..c1cbffb --- /dev/null +++ b/patches/astra-toolbox-approximate-projectors/rounding.h @@ -0,0 +1,50 @@ +#include <cuda_fp16.h> + +#define precision 8 +#define approximate_interpolation + +#ifdef approximate_interpolation +# ifdef precision +# if precision == 16 +# define texto(v) __float2half(v) +# define texfrom(v) __half2float(v) +# define textype half +# define interpolate(v0, v1, pos) (v0 + pos*(v1-v0)) +# define textocheck(var,msg,val) var=texto(val); +# else +# define precision_mult ((1<<precision)-1) +# define texto(v) ((unsigned char)(precision_mult*v)) +# define texfrom(v) (1.f * v / precision_mult) +# define textype unsigned +# define interpolate(v0, v1, pos) (v0 + ((unsigned)(256 * pos)) * (v1 - v0) / 256) +# define textocheck(var, msg, val) \ + if ((val<0)||(val>1)) { printf("Received out-of-range value (%f) in %s texture fetch\n", val, msg); } \ + var=texto(val); +# endif +# else +# define texto(v) (v) +# define texfrom(v) (v) +# define textype float +# define interpolate(v0, v1, pos) (v0 + pos*(v1-v0)) +# define textocheck(var,msg,val) var=texto(val); +# endif +#else +# ifdef precision +# if precision == 16 +# define texto(v) __half2float(__float2half(v)) +# define textocheck(var,msg,val) var=texto(val); +# else +# define precision_mult ((1<<precision)-1) +# define texto(v) (floor(precision_mult*v)/precision_mult) +# define textocheck(var, msg, val) \ + if ((val<0)||(val>1)) { printf("Received out-of-range value (%f) in %s texture fetch\n", val, msg); } \ + var=texto(val); +# endif +# else +# define texto(v) (v) +# define textocheck(var,msg,val) var=texto(val); +# endif +# define texfrom(v) (v) +# define textype float +# define interpolate(v0, v1, pos) (v0 + pos*(v1-v0)) +#endif diff --git a/patches/ccpi-regularisation-toolkit-fast-fgptv/CMakeLists.txt b/patches/ccpi-regularisation-toolkit-fast-fgptv/CMakeLists.txt new file mode 100644 index 0000000..893abc5 --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-fgptv/CMakeLists.txt @@ -0,0 +1,141 @@ +# Copyright 2018 Edoardo Pasca +#cmake_minimum_required (VERSION 3.0) + +project(RGL_core) +#https://stackoverflow.com/questions/13298504/using-cmake-with-setup-py + +# The version number. + +set (CIL_VERSION $ENV{CIL_VERSION} CACHE INTERNAL "Core Imaging Library version" FORCE) + +# conda orchestrated build +message("CIL_VERSION ${CIL_VERSION}") +#include (GenerateExportHeader) + +## Build the regularisers package as a library +message("Creating Regularisers as a shared library") + +set(CMAKE_BUILD_TYPE "Release") + +set (EXTRA_LIBRARIES "") +if(WIN32) + set (FLAGS "/DWIN32 /EHsc /DCCPiCore_EXPORTS /openmp") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:MSVCRT.lib") + message("library lib: ${LIBRARY_LIB}") +elseif(APPLE) + set (FLAGS "-DCCPiReconstructionIterative_EXPORTS ") +elseif(UNIX) + set (FLAGS "-O3 -funsigned-char -Wall -Wl,--no-undefined -DCCPiReconstructionIterative_EXPORTS") + set(EXTRA_LIBRARIES "m") +endif() + +#set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc) +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") + +#set(CMAKE_C_COMPILER clang) +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp") + +#set(CMAKE_C_COMPILER gcc-9) +set(CMAKE_C_FLAGS "-march=native -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -fopenmp") +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer") + + set (CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} ${FLAGS}") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${FLAGS}") +message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") + + +## Build the regularisers package as a library +message("Adding regularisers as a shared library") + +add_library(cilreg SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_TV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/SB_TV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PD_TV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TGV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Diffusion_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Diffus4th_order_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/LLT_ROF_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ROF_TV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_dTV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TNV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Nonlocal_TV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PatchSelect_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/utils.c + ) +target_link_libraries(cilreg ${OpenMP_EXE_LINKER_FLAGS} ${EXTRA_LIBRARIES}) +#set (CMAKE_SHARED_LINKER_FLAGS "-g -gdwarf-2 -g3 -fno-omit-frame-pointer") +#target_link_options(cilreg PUBLIC +include_directories(cilreg PUBLIC + ${LIBRARY_INC}/include + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ ) + +## Install + +if (UNIX) +message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib") +install(TARGETS cilreg + LIBRARY DESTINATION lib + CONFIGURATIONS ${CMAKE_BUILD_TYPE} + ) +elseif(WIN32) +message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin") + install(TARGETS cilreg + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + CONFIGURATIONS ${CMAKE_BUILD_TYPE} + ) +endif() + + + +# GPU Regularisers +if (BUILD_CUDA) + find_package(CUDA) + if (CUDA_FOUND) + set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -shared -D_FORCE_INLINES") + message("CUDA FLAGS ${CUDA_NVCC_FLAGS}") + CUDA_ADD_LIBRARY(cilregcuda SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_ROF_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_FGP_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_PD_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_SB_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/LLT_ROF_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TGV_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/dTV_FGP_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/NonlDiff_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/Diffus_4thO_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/PatchSelect_GPU_core.cu + ) + if (UNIX) + message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib") + install(TARGETS cilregcuda + LIBRARY DESTINATION lib + CONFIGURATIONS ${CMAKE_BUILD_TYPE} + ) + elseif(WIN32) + message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin") + install(TARGETS cilregcuda + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + CONFIGURATIONS ${CMAKE_BUILD_TYPE} + ) + endif() + else() + message("CUDA NOT FOUND") + endif() +endif() + +if (${BUILD_MATLAB_WRAPPER}) + if (WIN32) + install(TARGETS cilreg DESTINATION ${MATLAB_DEST}) + if (CUDA_FOUND) + install(TARGETS cilregcuda DESTINATION ${MATLAB_DEST}) + endif() + endif() +endif() diff --git a/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.c b/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.c new file mode 100644 index 0000000..91fd746 --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.c @@ -0,0 +1,266 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2019 Daniil Kazantsev + * Copyright 2019 Srikanth Nagella, Edoardo Pasca + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "FGP_TV_core.h" + +/* C-OMP implementation of FGP-TV [1] denoising/regularization model (2D/3D case) + * + * Input Parameters: + * 1. Noisy image/volume + * 2. lambdaPar - regularization parameter + * 3. Number of iterations + * 4. eplsilon: tolerance constant + * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) + * 6. nonneg: 'nonnegativity (0 is OFF by default) + * + * Output: + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] + * + * This function is based on the Matlab's code and paper by + * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" + */ + +float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ) +{ + int ll; + long j, DimTotal; + float re, re1; + re = 0.0f; re1 = 0.0f; + float tk = 1.0f; + float tkp1 =1.0f; + int count = 0; + + if (dimZ <= 1) { + /*2D case */ + float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL; + DimTotal = (long)(dimX*dimY); + + if (epsil != 0.0f) Output_prev = calloc(DimTotal, sizeof(float)); + P1 = calloc(DimTotal, sizeof(float)); + P2 = calloc(DimTotal, sizeof(float)); + P1_prev = calloc(DimTotal, sizeof(float)); + P2_prev = calloc(DimTotal, sizeof(float)); + R1 = calloc(DimTotal, sizeof(float)); + R2 = calloc(DimTotal, sizeof(float)); + + /* begin iterations */ + for(ll=0; ll<iterationsNumb; ll++) { + + if ((epsil != 0.0f) && (ll % 5 == 0)) copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l); + /* computing the gradient of the objective function */ + Obj_func2D(Input, Output, R1, R2, lambdaPar, (long)(dimX), (long)(dimY)); + + /* apply nonnegativity */ + if (nonneg == 1) for(j=0; j<DimTotal; j++) {if (Output[j] < 0.0f) Output[j] = 0.0f;} + + /*Taking a step towards minus of the gradient*/ + Grad_func2D(P1, P2, Output, R1, R2, lambdaPar, (long)(dimX), (long)(dimY)); + + /* projection step */ + Proj_func2D(P1, P2, methodTV, DimTotal); + + /*updating R and t*/ + tkp1 = (1.0f + sqrtf(1.0f + 4.0f*tk*tk))*0.5f; + Rupd_func2D(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, DimTotal); + + /*storing old values*/ + copyIm(P1, P1_prev, (long)(dimX), (long)(dimY), 1l); + copyIm(P2, P2_prev, (long)(dimX), (long)(dimY), 1l); + tk = tkp1; + + /* check early stopping criteria */ + if ((epsil != 0.0f) && (ll % 5 == 0)) { + re = 0.0f; re1 = 0.0f; + for(j=0; j<DimTotal; j++) + { + re += powf(Output[j] - Output_prev[j],2); + re1 += powf(Output[j],2); + } + re = sqrtf(re)/sqrtf(re1); + if (re < epsil) count++; + if (count > 3) break; + } + } + if (epsil != 0.0f) free(Output_prev); + free(P1); free(P2); free(P1_prev); free(P2_prev); free(R1); free(R2); + } + else { + /*3D case*/ + float *P1=NULL, *P2=NULL, *P3=NULL, *R1=NULL, *R2=NULL, *R3=NULL; + DimTotal = (long)dimX*(long)dimY*(long)dimZ; + + P1 = calloc(DimTotal, sizeof(float)); + P2 = calloc(DimTotal, sizeof(float)); + P3 = calloc(DimTotal, sizeof(float)); + + R1 = calloc(DimTotal, sizeof(float)); + R2 = calloc(DimTotal, sizeof(float)); + R3 = calloc(DimTotal, sizeof(float)); + + /* begin iterations */ + for(ll=0; ll<iterationsNumb; ll++) { + /* computing the gradient of the objective function */ + re = Obj_func3D(Input, Output, R1, R2, R3, lambdaPar, nonneg, (long)(dimX), (long)(dimY), (long)(dimZ)); + + tkp1 = (1.0f + sqrtf(1.0f + 4.0f*tk*tk))*0.5f; + All_func3D(P1, P2, P3, Output, R1, R2, R3, lambdaPar, tkp1, tk, (long)(dimX), (long)(dimY), (long)(dimZ)); + + // calculate norm - stopping rules + if ((epsil != 0.0f) && (ll % 5 == 0)) { + // stop if the norm residual is less than the tolerance EPS + if (re < epsil) count++; + if (count > 3) break; + } + + tk = tkp1; + } + + free(P1); free(P2); free(P3); free(R1); free(R2); free(R3); +// printf("TV iters %i (of %i)\n", ll, iterationsNumb); + } + + /*adding info into info_vector */ + infovector[0] = (float)(ll); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + + return 0; +} + +float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, long dimX, long dimY) +{ + float val1, val2; + long i,j,index; +#pragma omp parallel for shared(A,D,R1,R2) private(index,i,j,val1,val2) + for(j=0; j<dimY; j++) { + for(i=0; i<dimX; i++) { + index = j*dimX+i; + /* boundary conditions */ + if (i == 0) {val1 = 0.0f;} else {val1 = R1[j*dimX + (i-1)];} + if (j == 0) {val2 = 0.0f;} else {val2 = R2[(j-1)*dimX + i];} + D[index] = A[index] - lambda*(R1[index] + R2[index] - val1 - val2); + }} + return *D; +} +float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, long dimX, long dimY) +{ + float val1, val2, multip; + long i,j,index; + multip = (1.0f/(8.0f*lambda)); +#pragma omp parallel for shared(P1,P2,D,R1,R2,multip) private(index,i,j,val1,val2) + for(j=0; j<dimY; j++) { + for(i=0; i<dimX; i++) { + index = j*dimX+i; + /* boundary conditions */ + if (i == dimX-1) val1 = 0.0f; else val1 = D[index] - D[j*dimX + (i+1)]; + if (j == dimY-1) val2 = 0.0f; else val2 = D[index] - D[(j+1)*dimX + i]; + P1[index] = R1[index] + multip*val1; + P2[index] = R2[index] + multip*val2; + }} + return 1; +} +float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, long DimTotal) +{ + long i; + float multip; + multip = ((tk-1.0f)/tkp1); +#pragma omp parallel for shared(P1,P2,P1_old,P2_old,R1,R2,multip) private(i) + for(i=0; i<DimTotal; i++) { + R1[i] = P1[i] + multip*(P1[i] - P1_old[i]); + R2[i] = P2[i] + multip*(P2[i] - P2_old[i]); + } + return 1; +} + + // too much threads poisoning caches. there is a sweet spot +#define n_threads 16 + +/* 3D-case related Functions */ +/*****************************************************************/ +float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int nonneg, long dimX, long dimY, long dimZ) +{ + float D_new; + float val1, val2, val3; + float re = 0.0f, re1 = 0.0f; + long i,j,k,index; + +#pragma omp parallel for reduction(+ : re, re1) shared(A,D,R1,R2,R3) private(index,i,j,k,val1,val2,val3,D_new) num_threads(n_threads) + for(k=0; k<dimZ; k++) { + for(j=0; j<dimY; j++) { + for(i=0; i<dimX; i++) { + index = (dimX*dimY)*k + j*dimX+i; + /* boundary conditions */ + val1 = (i == 0)?0.f:R1[(dimX*dimY)*k + j*dimX + (i-1)]; + val2 = (j == 0)?0.f:R2[(dimX*dimY)*k + (j-1)*dimX + i]; + val3 = (k == 0)?0.f:R3[(dimX*dimY)*(k-1) + j*dimX + i]; + + D_new = A[index] - lambda*(R1[index] + R2[index] + R3[index] - val1 - val2 - val3); + if ((nonneg == 1)&&(D_new < 0.0f)) D_new = 0.0f; + + re += powf(D_new - D[index], 2); + re1 += powf(D_new, 2); + D[index] = D_new; + }}} + + return sqrtf(re)/sqrtf(re1); +} + +float All_func3D(float *P1_old, float *P2_old, float *P3_old, float *D, float *R1, float *R2, float *R3, float lambda, float tkp1, float tk, long dimX, long dimY, long dimZ) +{ + float val1, val2, val3, denom, sq_denom; + float P1_new, P2_new, P3_new; + long i,j,k, index; + float multip = (1.0f/(26.0f*lambda)); + float multip2 = ((tk-1.0f)/tkp1); + +#pragma omp parallel for shared(P1_old,P2_old,P3_old,D,R1,R2,R3) private(index,i,j,k,val1,val2,val3,denom,sq_denom,multip,multip2,P1_new,P2_new,P3_new) num_threads(n_threads) + for(k=0; k<dimZ; k++) { + for(j=0; j<dimY; j++) { + for(i=0; i<dimX; i++) { + index = (dimX*dimY)*k + j*dimX+i; + + /* boundary conditions */ + val1 = (i == dimX-1)?0.0f: D[index] - D[(dimX*dimY)*k + j*dimX + (i+1)]; + val2 = (j == dimY-1)?0.0f: D[index] - D[(dimX*dimY)*k + (j+1)*dimX + i]; + val3 = (k == dimZ-1)?0.0f: D[index] - D[(dimX*dimY)*(k+1) + j*dimX + i]; + + P1_new = R1[index] + multip*val1; + P2_new = R2[index] + multip*val2; + P3_new = R3[index] + multip*val3; + + denom = powf(P1_new,2) + powf(P2_new,2) + powf(P3_new,2); + if (denom > 1.0f) { + sq_denom = 1.0f/sqrtf(denom); + P1_new *= sq_denom; + P2_new *= sq_denom; + P3_new *= sq_denom; + } + + R1[index] = P1_new + multip2*(P1_new - P1_old[index]); + R2[index] = P2_new + multip2*(P2_new - P2_old[index]); + R3[index] = P3_new + multip2*(P3_new - P3_old[index]); + + P1_old[index] = P1_new; + P2_old[index] = P2_new; + P3_old[index] = P3_new; + }}} + + + return 1; +} diff --git a/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.h b/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.h new file mode 100644 index 0000000..cda0f40 --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-fgptv/FGP_TV_core.h @@ -0,0 +1,62 @@ +/* +This work is part of the Core Imaging Library developed by +Visual Analytics and Imaging System Group of the Science Technology +Facilities Council, STFC + +Copyright 2017 Daniil Kazantsev +Copyright 2017 Srikanth Nagella, Edoardo Pasca + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +//#include <matrix.h> +#include <math.h> +#include <stdlib.h> +#include <memory.h> +#include <stdio.h> +#include "omp.h" +#include "utils.h" +#include "CCPiDefines.h" + +/* C-OMP implementation of FGP-TV [1] denoising/regularization model (2D/3D case) + * + * Input Parameters: + * 1. Noisy image/volume + * 2. lambda - regularization parameter + * 3. Number of iterations + * 4. eplsilon: tolerance constant + * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) + * 6. nonneg: 'nonnegativity (0 is OFF by default) + * + * Output: + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] + + * + * This function is based on the Matlab's code and paper by + * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems" + */ + +#ifdef __cplusplus +extern "C" { +#endif +CCPI_EXPORT float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ); + +CCPI_EXPORT float Obj_func2D(float *A, float *D, float *R1, float *R2, float lambda, long dimX, long dimY); +CCPI_EXPORT float Grad_func2D(float *P1, float *P2, float *D, float *R1, float *R2, float lambda, long dimX, long dimY); +CCPI_EXPORT float Rupd_func2D(float *P1, float *P1_old, float *P2, float *P2_old, float *R1, float *R2, float tkp1, float tk, long DimTotal); + +CCPI_EXPORT float Obj_func3D(float *A, float *D, float *R1, float *R2, float *R3, float lambda, int nonneg, long dimX, long dimY, long dimZ); +CCPI_EXPORT float All_func3D(float *P1_old, float *P2_old, float *P3_old, float *D, float *R1, float *R2, float *R3, float lambda, float tkp1, float tk, long dimX, long dimY, long dimZ); + +#ifdef __cplusplus +} +#endif diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/CMakeLists.txt b/patches/ccpi-regularisation-toolkit-fast-tnv/CMakeLists.txt new file mode 100644 index 0000000..1a3b37a --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-tnv/CMakeLists.txt @@ -0,0 +1,169 @@ +# Copyright 2018 Edoardo Pasca +#cmake_minimum_required (VERSION 3.0) + +project(RGL_core) +#https://stackoverflow.com/questions/13298504/using-cmake-with-setup-py + +# The version number. + +set (CIL_VERSION $ENV{CIL_VERSION} CACHE INTERNAL "Core Imaging Library version" FORCE) + +# conda orchestrated build +message("CIL_VERSION ${CIL_VERSION}") +#include (GenerateExportHeader) + +## Build the regularisers package as a library +message("Creating Regularisers as a shared library") + +find_package(GLIB2 REQUIRED) +find_package(OpenMP REQUIRED) +if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") +endif() + +message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer") +message("CMAKE_C_FLAGS ${CMAKE_C_FLAGS} -fno-omit-frame-pointer") +message("CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS}") +message("CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS}") +message("CMAKE_STATIC_LINKER_FLAGS ${CMAKE_STATIC_LINKER_FLAGS}") + +set(CMAKE_BUILD_TYPE "Release") + +if(WIN32) + set (FLAGS "/DWIN32 /EHsc /DCCPiCore_EXPORTS /openmp") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") + set (CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:MSVCRT.lib") + + set (EXTRA_LIBRARIES) + + message("library lib: ${LIBRARY_LIB}") + +elseif(UNIX) + set (FLAGS "-O2 -funsigned-char -Wall -Wl,--no-undefined -DCCPiReconstructionIterative_EXPORTS ") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") + set (CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}") + + set (EXTRA_LIBRARIES + "gomp" + "m" + ) + +endif() +message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") + +## Build the regularisers package as a library +message("Adding regularisers as a shared library") + +#set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc) +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") + +#set(CMAKE_C_COMPILER clang) +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp") + +#set(CMAKE_C_COMPILER gcc-9) +set(CMAKE_C_FLAGS "-march=native -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -fopenmp") +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer") + +#set(CMAKE_C_FLAGS "-acc -Minfo -ta=tesla:cc20 -openmp") +#set(CMAKE_C_FLAGS "-acc -Minfo -ta=multicore -openmp -fPIC") +add_library(cilreg SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_sched.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_thread.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_TV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/SB_TV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PD_TV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TGV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Diffusion_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Diffus4th_order_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/LLT_ROF_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ROF_TV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_dTV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TNV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Nonlocal_TV_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PatchSelect_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/utils.c + ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/Diffusion_Inpaint_core.c + ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/NonlocalMarching_Inpaint_core.c + ) +target_link_libraries(cilreg ${EXTRA_LIBRARIES} ${GLIB2_LIBRARIES} ${GTHREAD2_LIBRARIES} ) +include_directories(cilreg PUBLIC + ${GLIB2_INCLUDE_DIR} + ${LIBRARY_INC}/include + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ + ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/ ) + +## Install + +if (UNIX) +message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib") +install(TARGETS cilreg + LIBRARY DESTINATION lib + CONFIGURATIONS ${CMAKE_BUILD_TYPE} + ) +elseif(WIN32) +message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin") + install(TARGETS cilreg + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + CONFIGURATIONS ${CMAKE_BUILD_TYPE} + ) +endif() + + + +# GPU Regularisers +if (BUILD_CUDA) + find_package(CUDA) + if (CUDA_FOUND) + set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -shared -D_FORCE_INLINES -allow-unsupported-compiler") + message("CUDA FLAGS ${CUDA_NVCC_FLAGS}") + CUDA_ADD_LIBRARY(cilregcuda SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_ROF_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_FGP_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_PD_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TV_SB_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/LLT_ROF_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/TGV_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/dTV_FGP_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/NonlDiff_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/Diffus_4thO_GPU_core.cu + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_GPU/PatchSelect_GPU_core.cu + ) + if (UNIX) + message ("I'd install into ${CMAKE_INSTALL_PREFIX}/lib") + install(TARGETS cilregcuda + LIBRARY DESTINATION lib + CONFIGURATIONS ${CMAKE_BUILD_TYPE} + ) + elseif(WIN32) + message ("I'd install into ${CMAKE_INSTALL_PREFIX} lib bin") + install(TARGETS cilregcuda + RUNTIME DESTINATION bin + ARCHIVE DESTINATION lib + CONFIGURATIONS ${CMAKE_BUILD_TYPE} + ) + endif() + else() + message("CUDA NOT FOUND") + endif() +endif() + +if (${BUILD_MATLAB_WRAPPER}) + if (WIN32) + install(TARGETS cilreg DESTINATION ${MATLAB_DEST}) + if (CUDA_FOUND) + install(TARGETS cilregcuda DESTINATION ${MATLAB_DEST}) + endif() + endif() +endif() diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.c b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.c new file mode 100755 index 0000000..cbce96a --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.c @@ -0,0 +1,668 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2017 Daniil Kazantsev + * Copyright 2017 Srikanth Nagella, Edoardo Pasca + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <malloc.h> +#include "TNV_core.h" + +#define BLOCK 32 +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef _Float16 floatxx; // Large arrays, allways float16 if we go mixed-precision. +//typedef _Float16 floatyy; // Small arrays which we can do both ways. +typedef float floatyy; + +typedef struct { + int offY, stepY, copY; + floatxx *Input, *u, *qx, *qy, *gradx, *grady, *div; + floatyy *div0, *udiff0; + floatyy *gradxdiff, *gradydiff, *ubarx, *ubary, *udiff; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->qx); + free(ctx->qy); + free(ctx->gradx); + free(ctx->grady); + free(ctx->div); + + free(ctx->div0); + free(ctx->udiff0); + + free(ctx->gradxdiff); + free(ctx->gradydiff); + free(ctx->ubarx); + free(ctx->ubary); + free(ctx->udiff); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + +// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*(stepY+1)*padZ); + long DimRow = (long)(dimX * padZ); + long DimCell = (long)(padZ); + + // Auxiliar vectors + ctx->Input = memalign(64, Dim1Total * sizeof(floatxx)); + ctx->u = memalign(64, Dim1Total * sizeof(floatxx)); + ctx->qx = memalign(64, DimTotal * sizeof(floatxx)); + ctx->qy = memalign(64, DimTotal * sizeof(floatxx)); + ctx->gradx = memalign(64, DimTotal * sizeof(floatxx)); + ctx->grady = memalign(64, DimTotal * sizeof(floatxx)); + ctx->div = memalign(64, Dim1Total * sizeof(floatxx)); + + ctx->div0 = memalign(64, DimRow * sizeof(floatyy)); + ctx->udiff0 = memalign(64, DimRow * sizeof(floatyy)); + + ctx->gradxdiff = memalign(64, DimCell * sizeof(floatyy)); + ctx->gradydiff = memalign(64, DimCell * sizeof(floatyy)); + ctx->ubarx = memalign(64, DimCell * sizeof(floatyy)); + ctx->ubary = memalign(64, DimCell * sizeof(floatyy)); + ctx->udiff = memalign(64, DimCell * sizeof(floatyy)); + + if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) { + fprintf(stderr, "Error allocating memory\n"); + exit(-1); + } + + return 0; +} + +static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + +// printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY); + + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + memset(ctx->u, 0, Dim1Total * sizeof(floatxx)); + memset(ctx->qx, 0, DimTotal * sizeof(floatxx)); + memset(ctx->qy, 0, DimTotal * sizeof(floatxx)); + memset(ctx->gradx, 0, DimTotal * sizeof(floatxx)); + memset(ctx->grady, 0, DimTotal * sizeof(floatxx)); + memset(ctx->div, 0, Dim1Total * sizeof(floatxx)); + + for(k=0; k<dimZ; k++) { + for(j=0; j<copY; j++) { + for(i=0; i<dimX; i++) { + ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i]; + ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i]; + } + } + } + + return 0; +} + +static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + for(k=0; k<dimZ; k++) { + for(j=0; j<stepY; j++) { + for(i=0; i<dimX; i++) { + tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k]; + } + } + } + + return 0; +} + + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int copY = ctx->copY; + int padZ = tnv_ctx->padZ; + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + memset(ctx->u, 0, Dim1Total * sizeof(floatxx)); + memset(ctx->qx, 0, DimTotal * sizeof(floatxx)); + memset(ctx->qy, 0, DimTotal * sizeof(floatxx)); + memset(ctx->gradx, 0, DimTotal * sizeof(floatxx)); + memset(ctx->grady, 0, DimTotal * sizeof(floatxx)); + memset(ctx->div, 0, Dim1Total * sizeof(floatxx)); + + return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l, m; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + floatxx *Input = ctx->Input; + floatxx *u = ctx->u; + floatxx *qx = ctx->qx; + floatxx *qy = ctx->qy; + floatxx *gradx = ctx->gradx; + floatxx *grady = ctx->grady; + floatxx *div = ctx->div; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual1 = 0.0f; + float resdual2 = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + floatyy qxdiff; + floatyy qydiff; + floatyy divdiff; + floatyy *gradxdiff = ctx->gradxdiff; + floatyy *gradydiff = ctx->gradydiff; + floatyy *ubarx = ctx->ubarx; + floatyy *ubary = ctx->ubary; + floatyy *udiff = ctx->udiff; + + floatyy *udiff0 = ctx->udiff0; + floatyy *div0 = ctx->div0; + + + j = 0; { +# define TNV_LOOP_FIRST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_FIRST_J + } + + + + for(int j = 1; j < (copY - 1); j++) { + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + } + + for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { + for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2 ++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + i = i1 + i2; + + if (i == (dimX - 1)) break; + if (j == (copY - 1)) { j2 = BLOCK; break; } +# include "TNV_core_loop.h" + } + } + } // i + + } + + for(int j = 1; j < (copY - 1); j++) { + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } + } + + + + for (j = copY - 1; j < stepY; j++) { +# define TNV_LOOP_LAST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_LAST_J + } + + + + ctx->resprimal = resprimal; + ctx->resdual = resdual1 + resdual2; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { + int i, off, size, err; + + if (sched) return; + + tnv_ctx.dimX = dimX; + tnv_ctx.dimY = dimY; + tnv_ctx.dimZ = dimZ; + // Padding seems actually slower +// tnv_ctx.padZ = dimZ; +// tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0)); + tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); + + hw_sched_init(); + + int threads = hw_sched_get_cpu_count(); + if (threads > dimY) threads = dimY/2; + + int step = dimY / threads; + int extra = dimY % threads; + + tnv_ctx.threads = threads; + tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); + for (i = 0, off = 0; i < threads; i++, off += size) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + size = step + ((i < extra)?1:0); + + ctx->offY = off; + ctx->stepY = size; + + if (i == (threads-1)) ctx->copY = ctx->stepY; + else ctx->copY = ctx->stepY + 1; + } + + sched = hw_sched_create(threads); + if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran <joan.duran@uib.es> see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l,m; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + int started = 0; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (j = 1; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + + m = (ctx0->stepY - 1) * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + floatyy divdiff = ctx->div0[l] - ctx->div[l]; + floatyy udiff = ctx->udiff0[l]; + + ctx->div[l] -= ctx0->qy[l + m]; + ctx0->div[m + l + dimX*padZ] = ctx->div[l]; + ctx0->u[m + l + dimX*padZ] = ctx->u[l]; + + divdiff += ctx0->qy[l + m]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + + { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + floatyy divdiff = ctx->div0[l] - ctx->div[l]; + floatyy udiff = ctx->udiff0[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + + for (j = 0; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + resprimal += ctx->resprimal; + resdual += ctx->resdual; + product += ctx->product; + unorm += ctx->unorm; + qnorm += ctx->qnorm; + } + + residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); + float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); + float dual_dot_delta = resdual * s * delta; + float dual_div_delta = (resdual * s) / delta; +// printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + + if(b > 1) { + + // Decrease step-sizes to fit balancing principle + tau = (beta * tau) / b; + sigma = (beta * sigma) / b; + alpha = alpha0; + + if (started) { + fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of iterative optimization! We CAN'T do it in the fast version. The standard TNV recommended\n\n\n"); + } else { + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } + } + } else { + started = 1; + if(resprimal > dual_dot_delta) { + // Increase primal step-size and decrease dual step-size + tau = tau / (1.0f - alpha); + sigma = sigma * (1.0f - alpha); + alpha = alpha * eta; + } else if(resprimal < dual_div_delta) { + // Decrease primal step-size and increase dual step-size + tau = tau * (1.0f - alpha); + sigma = sigma / (1.0f - alpha); + alpha = alpha * eta; + } + } + + if (residual < tol) break; + } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); } + + + printf("Iterations stopped at %i with the residual %f \n", iter, residual); +// printf("Return: %f\n", *uT); + + return *uT; +} diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.h b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.h new file mode 100644 index 0000000..aa050a4 --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core.h @@ -0,0 +1,47 @@ +#include <math.h> +#include <stdlib.h> +#include <memory.h> +#include <stdio.h> +#include "omp.h" +#include "utils.h" +#include "CCPiDefines.h" + +#define fTiny 0.00000001f +#define fLarge 100000000.0f +#define INFNORM -1 + +#define MAX(i,j) ((i)<(j) ? (j):(i)) +#define MIN(i,j) ((i)<(j) ? (i):(j)) + +/* +This work is part of the Core Imaging Library developed by +Visual Analytics and Imaging System Group of the Science Technology +Facilities Council, STFC + +Copyright 2017 Daniil Kazantsev +Copyright 2017 Srikanth Nagella, Edoardo Pasca + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifdef __cplusplus +extern "C" { +#endif +CCPI_EXPORT float TNV_CPU_main(float *Input, float *u, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ); + +/*float PDHG(float *A, float *B, float tau, float sigma, float theta, float lambda, int p, int q, int r, float tol, int maxIter, int d_c, int d_w, int d_h);*/ +CCPI_EXPORT float proxG(float *u_upd, float *v, float *f, float taulambda, long dimX, long dimY, long dimZ); +CCPI_EXPORT float gradient(float *u_upd, float *gradx_upd, float *grady_upd, long dimX, long dimY, long dimZ); +CCPI_EXPORT float proxF(float *gx, float *gy, float *vx, float *vy, float sigma, int p, int q, int r, long dimX, long dimY, long dimZ); +CCPI_EXPORT float divergence(float *qx_upd, float *qy_upd, float *div_upd, long dimX, long dimY, long dimZ); +#ifdef __cplusplus +} +#endif
\ No newline at end of file diff --git a/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h new file mode 100644 index 0000000..86299cd --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fast-tnv/TNV_core_loop.h @@ -0,0 +1,107 @@ + { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + l = (j * dimX + i) * padZ; + m = dimX * padZ; + + floatxx *__restrict u_next = u + l + padZ; + floatxx *__restrict u_current = u + l; + floatxx *__restrict u_next_row = u + l + m; + + + floatxx *__restrict qx_current = qx + l; + floatxx *__restrict qy_current = qy + l; + floatxx *__restrict qx_prev = qx + l - padZ; + floatxx *__restrict qy_prev = qy + l - m; + + +// __assume(padZ%16==0); + +#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { + floatyy u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads + udiff[k] = u[l + k] - u_upd; // cache 1w + u[l + k] = u_upd; // 1 write + +#ifdef TNV_LOOP_FIRST_J + udiff0[l + k] = udiff[k]; + div0[l + k] = div[l + k]; +#endif + +#ifdef TNV_LOOP_LAST_I + floatyy gradx_upd = 0; +#else + floatyy u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads + floatyy gradx_upd = (u_next_upd - u_upd); // 2 reads +#endif + +#ifdef TNV_LOOP_LAST_J + floatyy grady_upd = 0; +#else + floatyy u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads + floatyy grady_upd = (u_next_row_upd - u_upd); // 1 read +#endif + + gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w + gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w + gradx[l + k] = gradx_upd; // 1 write + grady[l + k] = grady_upd; // 1 write + + ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w + ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w + + floatyy vx = ubarx[k] + divsigma * qx[l + k]; // 1 read + floatyy vy = ubary[k] + divsigma * qy[l + k]; // 1 read + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < padZ; k++) { + floatyy vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r + floatyy vy = ubary[k] + divsigma * qy_current[k]; // cache 2r + floatyy gx_upd = vx * t[0] + vy * t[1]; + floatyy gy_upd = vx * t[1] + vy * t[2]; + + qxdiff = sigma * (ubarx[k] - gx_upd); + qydiff = sigma * (ubary[k] - gy_upd); + + qx_current[k] += qxdiff; // write 1 + qy_current[k] += qydiff; // write 1 + + unorm += (udiff[k] * udiff[k]); + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + floatyy div_upd = 0; + +#ifndef TNV_LOOP_FIRST_I + div_upd -= qx_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_FIRST_J + div_upd -= qy_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_LAST_I + div_upd += qx_current[k]; +#endif +#ifndef TNV_LOOP_LAST_J + div_upd += qy_current[k]; +#endif + + divdiff = div[l + k] - div_upd; // 1 read + div[l + k] = div_upd; // 1 write + +#ifndef TNV_LOOP_FIRST_J + resprimal += fabs(divtau * udiff[k] + divdiff); +#endif + + // We need to have two independent accumulators to allow gcc-autovectorization + resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r + resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r + product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + } + } +
\ No newline at end of file diff --git a/patches/ccpi-regularisation-toolkit-fgptv-integer-overflow.patch b/patches/ccpi-regularisation-toolkit-fgptv-integer-overflow.patch new file mode 100644 index 0000000..cc53b1d --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fgptv-integer-overflow.patch @@ -0,0 +1,12 @@ +diff -dPNur CCPi-Regularisation-Toolkit-orig/src/Core/regularisers_CPU/FGP_TV_core.c CCPi-Regularisation-Toolkit/src/Core/regularisers_CPU/FGP_TV_core.c +--- CCPi-Regularisation-Toolkit-orig/src/Core/regularisers_CPU/FGP_TV_core.c 2021-12-17 12:17:12.000000000 +0100 ++++ CCPi-Regularisation-Toolkit/src/Core/regularisers_CPU/FGP_TV_core.c 2022-09-06 18:52:32.460225737 +0200 +@@ -104,7 +104,7 @@ + else { + /*3D case*/ + float *Output_prev=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL; +- DimTotal = (long)(dimX*dimY*dimZ); ++ DimTotal = (long)dimX*(long)dimY*(long)dimZ; + + if (epsil != 0.0f) Output_prev = calloc(DimTotal, sizeof(float)); + P1 = calloc(DimTotal, sizeof(float)); diff --git a/patches/ccpi-regularisation-toolkit-fgptv-openmp.patch b/patches/ccpi-regularisation-toolkit-fgptv-openmp.patch new file mode 100644 index 0000000..09b5f51 --- /dev/null +++ b/patches/ccpi-regularisation-toolkit-fgptv-openmp.patch @@ -0,0 +1,19 @@ +--- a/src/Core/regularisers_CPU/utils.c ++++ b/src/Core/regularisers_CPU/utils.c +@@ -176,6 +176,7 @@ float Proj_func2D(float *P1, float *P2, int methTV, long DimTotal) + } + return 1; + } ++ + /*3D Projection onto convex set for P (called in PD_TV, FGP_TV, FGP_dTV methods)*/ + float Proj_func3D(float *P1, float *P2, float *P3, int methTV, long DimTotal) + { +@@ -183,7 +184,7 @@ float Proj_func3D(float *P1, float *P2, float *P3, int methTV, long DimTotal) + long i; + if (methTV == 0) { + /* isotropic TV*/ +-#pragma omp parallel for shared(P1,P2,P3) private(i,val1,val2,val3,sq_denom) ++#pragma omp parallel for shared(P1,P2,P3) private(i,val1,val2,val3,denom,sq_denom) + for(i=0; i<DimTotal; i++) { + denom = powf(P1[i],2) + powf(P2[i],2) + powf(P3[i],2); + if (denom > 1.0f) { |