Diffstat (limited to 'src')
-rw-r--r-- | src/Core/regularisers_CPU/FGP_TV_core.c | 21
-rw-r--r-- | src/Core/regularisers_CPU/ROF_TV_core.c | 9
-rwxr-xr-x | src/Core/regularisers_GPU/TV_FGP_GPU_core.cu | 71
-rwxr-xr-x | src/Core/regularisers_GPU/TV_FGP_GPU_core.h | 2
-rwxr-xr-x | src/Core/regularisers_GPU/TV_ROF_GPU_core.cu | 156
-rwxr-xr-x | src/Core/regularisers_GPU/TV_ROF_GPU_core.h | 2
-rwxr-xr-x | src/Core/regularisers_GPU/TV_SB_GPU_core.cu | 30
-rw-r--r-- | src/Core/regularisers_GPU/dTV_FGP_GPU_core.cu | 30
-rw-r--r-- | src/Core/regularisers_GPU/shared.h | 9
-rwxr-xr-x | src/Matlab/CMakeLists.txt | 6
-rw-r--r-- | src/Python/ccpi/filters/regularisers.py | 6
-rw-r--r-- | src/Python/setup-regularisers.py.in | 2
-rw-r--r-- | src/Python/src/gpu_regularisers.pyx | 72
13 files changed, 255 insertions, 161 deletions
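The commit threads a two-element information vector (iteration count, reached tolerance) through the ROF-TV and FGP-TV entry points, adds a tolerance argument to the ROF GPU core, and drops the printM flag from FGP. The sketch below is illustrative only and is not part of the commit: it assumes the updated headers shown in this diff and a built, linked GPU version of the toolkit; main(), the image size and the parameter values are made up for the example.

/* Minimal host-side sketch of the updated signatures (assumption: library is built and linked). */
#include <stdio.h>
#include <stdlib.h>
#include "TV_ROF_GPU_core.h"
#include "TV_FGP_GPU_core.h"

int main(void)
{
    int dimX = 256, dimY = 256, dimZ = 1;
    size_t n = (size_t)dimX * dimY * dimZ;
    float *in  = (float*)calloc(n, sizeof(float));   /* noisy input would go here */
    float *out = (float*)calloc(n, sizeof(float));
    float info[2] = {0.0f, 0.0f};   /* [0] iterations performed, [1] reached tolerance */

    /* ROF-TV: infovector and the tolerance epsil are new in this commit */
    TV_ROF_GPU_main(in, out, info, 0.04f, 1000, 0.025f, 1.0e-06f, dimX, dimY, dimZ);
    printf("ROF stopped after %d iterations, residual %e\n", (int)info[0], info[1]);

    /* FGP-TV: printM is gone, infovector is added */
    TV_FGP_GPU_main(in, out, info, 0.04f, 500, 1.0e-06f, 0, 0, dimX, dimY, dimZ);
    printf("FGP stopped after %d iterations, residual %e\n", (int)info[0], info[1]);

    free(in); free(out);
    return 0;
}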
diff --git a/src/Core/regularisers_CPU/FGP_TV_core.c b/src/Core/regularisers_CPU/FGP_TV_core.c
index 3c3cbd4..a6bb7e0 100644
--- a/src/Core/regularisers_CPU/FGP_TV_core.c
+++ b/src/Core/regularisers_CPU/FGP_TV_core.c
@@ -86,15 +86,17 @@ float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lamb
/* check early stopping criteria */
if (epsil != 0.0f) {
+ re = 0.0f; re1 = 0.0f;
for(j=0; j<DimTotal; j++)
{
- re += pow(Output[j] - Output_prev[j],2);
- re1 += pow(Output[j],2);
+ re += powf(Output[j] - Output_prev[j],2);
+ re1 += powf(Output[j],2);
}
- re = sqrt(re)/sqrt(re1);
- if (re < epsil) count++;
- if (count > 4) break;
- copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l);
+ re = sqrtf(re)/sqrtf(re1);
+ if (re < epsil) count++;
+ if (count > 4) break;
+
+ copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l);
}
}
if (epsil != 0.0f) free(Output_prev);
@@ -137,12 +139,13 @@ float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lamb
/* calculate norm - stopping rules*/
if (epsil != 0.0f) {
+ re = 0.0f; re1 = 0.0f;
for(j=0; j<DimTotal; j++)
{
- re += pow(Output[j] - Output_prev[j],2);
- re1 += pow(Output[j],2);
+ re += powf(Output[j] - Output_prev[j],2);
+ re1 += powf(Output[j],2);
}
- re = sqrt(re)/sqrt(re1);
+ re = sqrtf(re)/sqrtf(re1);
/* stop if the norm residual is less than the tolerance EPS */
if (re < epsil) count++;
if (count > 4) break;
diff --git a/src/Core/regularisers_CPU/ROF_TV_core.c b/src/Core/regularisers_CPU/ROF_TV_core.c
index 1b7cf76..a7291d7 100644
--- a/src/Core/regularisers_CPU/ROF_TV_core.c
+++ b/src/Core/regularisers_CPU/ROF_TV_core.c
@@ -19,7 +19,7 @@
#include "ROF_TV_core.h"
-#define EPS 1.0e-15
+#define EPS 1.0e-8
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
@@ -76,12 +76,13 @@ float TV_ROF_CPU_main(float *Input, float *Output, float *infovector, float lamb
/* check early stopping criteria */
if (epsil != 0.0f) {
+ re = 0.0f; re1 = 0.0f;
for(j=0; j<DimTotal; j++)
{
- re += pow(Output[j] - Output_prev[j],2);
- re1 += pow(Output[j],2);
+ re += powf(Output[j] - Output_prev[j],2);
+ re1 += powf(Output[j],2);
}
- re = sqrt(re)/sqrt(re1);
+ re = sqrtf(re)/sqrtf(re1);
if (re < epsil) count++;
if (count > 4) break;
copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ));
diff --git a/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu b/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu
index b371c5d..96b1173 100755
--- a/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu
+++ b/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu
@@ -19,6 +19,7 @@ limitations under the License.
#include "TV_FGP_GPU_core.h"
#include "shared.h"
+#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
@@ -31,10 +32,10 @@ limitations under the License.
* 4. eplsilon: tolerance constant
* 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
* 6. nonneg: 'nonnegativity (0 is OFF by default)
- * 7. print information: 0 (off) or 1 (on)
*
* Output:
- * [1] Filtered/regularized image
+ * [1] Filtered/regularized image/volume
+ * [2] Information vector which contains [iteration no., reached tolerance]
*
* This function is based on the Matlab's code and paper by
* [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
@@ -49,7 +50,7 @@ limitations under the License.
#define BLKZSIZE 8
#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
-struct square { __host__ __device__ float operator()(float x) { return x * x; } };
+// struct square { __host__ __device__ float operator()(float x) { return x * x; } };
/************************************************/
/*****************2D modules*********************/
@@ -343,7 +344,7 @@ __global__ void FGPResidCalc3D_kernel(float *Input1, float *Input2, float* Outpu
/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
////////////MAIN HOST FUNCTION ///////////////
-extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
+extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ)
{
int deviceCount = -1; // number of devices
cudaGetDeviceCount(&deviceCount);
@@ -354,20 +355,21 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int
int count = 0, i;
float re, multip,multip2;
- float tk = 1.0f;
+ re = 0.0f;
+ float tk = 1.0f;
float tkp1=1.0f;
if (dimZ <= 1) {
/*2D verson*/
- int ImSize = dimX*dimY;
- float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;
+ int ImSize = dimX*dimY;
+ float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;
- dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
- dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D));
+ dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
+ dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D));
/*allocate space for images on device*/
- checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
- checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );
+ checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
+ checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );
if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) );
checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
@@ -416,40 +418,43 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int
Rupd_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, multip2, dimX, dimY, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() );
+
+ FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize);
+ checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
+ FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize);
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() );
+
+ tk = tkp1;
+
if (epsil != 0.0f) {
/* calculate norm - stopping rules using the Thrust library */
- FGPResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize);
+ FGPResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1, dimX, dimY, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
- thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize);
- float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
+ // setup arguments
+ square<float> unary_op;
+ thrust::plus<float> binary_op;
+ thrust::device_vector<float> d_vec(P1, P1 + ImSize);
+ float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op));
thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);
- float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
-
+ float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op));
+
+ // compute norm
re = (reduction/reduction2);
if (re < epsil) count++;
- if (count > 4) break;
+ if (count > 4) break;
FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
- }
-
- FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize);
- checkCudaErrors( cudaDeviceSynchronize() );
- checkCudaErrors(cudaPeekAtLastError() );
-
- FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize);
- checkCudaErrors( cudaDeviceSynchronize() );
- checkCudaErrors(cudaPeekAtLastError() );
-
- tk = tkp1;
- }
- if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);
- /***************************************************************/
+ }
+
+ }
//copy result matrix from device to host memory
cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
@@ -542,7 +547,6 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int
tk = tkp1;
}
- if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);
/***************************************************************/
//copy result matrix from device to host memory
cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
@@ -560,5 +564,8 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int
cudaFree(R3);
}
//cudaDeviceReset();
+ /*adding info into info_vector */
+ infovector[0] = (float)(i); /*iterations number (if stopped earlier based on tolerance)*/
+ infovector[1] = re; /* reached tolerance */
return 0;
}
diff --git a/src/Core/regularisers_GPU/TV_FGP_GPU_core.h b/src/Core/regularisers_GPU/TV_FGP_GPU_core.h
index bf13508..597b6c6 100755
--- a/src/Core/regularisers_GPU/TV_FGP_GPU_core.h
+++ b/src/Core/regularisers_GPU/TV_FGP_GPU_core.h
@@ -4,6 +4,6 @@
#include "CCPiDefines.h"
#include <memory.h>
-extern "C" CCPI_EXPORT int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
+extern "C" CCPI_EXPORT int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ);
#endif
diff --git a/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu b/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu
index 76f5be9..97f2351 100755
--- a/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu
+++ b/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu
@@ -3,8 +3,8 @@ This work is part of the Core Imaging Library developed by
Visual Analytics and Imaging System Group of the Science Technology
Facilities Council, STFC
-Copyright 2017 Daniil Kazantsev
-Copyright 2017 Srikanth Nagella, Edoardo Pasca
+Copyright 2019 Daniil Kazantsev
+Copyright 2019 Srikanth Nagella, Edoardo Pasca
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -18,6 +18,10 @@ limitations under the License.
*/
#include "TV_ROF_GPU_core.h"
+#include "shared.h"
+#include <thrust/functional.h>
+#include <thrust/device_vector.h>
+#include <thrust/transform_reduce.h>
/* C-OMP implementation of ROF-TV denoising/regularization model [1] (2D/3D case)
*
@@ -26,16 +30,16 @@ limitations under the License.
* 2. lambda - regularization parameter [REQUIRED]
* 3. tau - marching step for explicit scheme, ~0.1 is recommended [REQUIRED]
* 4. Number of iterations, for explicit scheme >= 150 is recommended [REQUIRED]
-*
-* Output:
-* [1] Regularized image/volume
+* 5. eplsilon: tolerance constant
+ * Output:
+ * [1] Filtered/regularized image/volume
+ * [2] Information vector which contains [iteration no., reached tolerance]
+
+ * This function is based on the paper by
* [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms"
-*
-* D. Kazantsev, 2016-18
*/
-#include "shared.h"
#define BLKXSIZE 8
#define BLKYSIZE 8
@@ -43,7 +47,7 @@ limitations under the License.
#define BLKXSIZE2D 16
#define BLKYSIZE2D 16
-#define EPS 1.0e-12
+#define EPS 1.0e-8
#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
@@ -136,7 +140,7 @@ __host__ __device__ int sign (float x)
dv1 = D1[index] - D1[j2*N + i];
dv2 = D2[index] - D2[j*N + i2];
- Update[index] += tau*(2.0f*lambda*(dv1 + dv2) - (Update[index] - Input[index]));
+ Update[index] += tau*(lambda*(dv1 + dv2) - (Update[index] - Input[index]));
}
}
@@ -286,42 +290,116 @@ __host__ __device__ int sign (float x)
dv2 = D2[index] - D2[(dimX*dimY)*k + j*dimX+i2];
dv3 = D3[index] - D3[(dimX*dimY)*k2 + j*dimX+i];
- Update[index] += tau*(2.0f*lambda*(dv1 + dv2 + dv3) - (Update[index] - Input[index]));
+ Update[index] += tau*(lambda*(dv1 + dv2 + dv3) - (Update[index] - Input[index]));
}
}
+__global__ void ROFcopy_kernel2D(float *Input, float* Output, int N, int M, int num_total)
+{
+ int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+ int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+ if (index < num_total) {
+ Output[index] = Input[index];
+ }
+}
+
+
+__global__ void ROFResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total)
+{
+ int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+ int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+ if (index < num_total) {
+ Output[index] = Input1[index] - Input2[index];
+ }
+}
/////////////////////////////////////////////////
-// HOST FUNCTION
-extern "C" int TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z)
+///////////////// HOST FUNCTION /////////////////
+extern "C" int TV_ROF_GPU_main(float* Input, float* Output, float *infovector, float lambdaPar, int iter, float tau, float epsil, int N, int M, int Z)
{
- // set up device
- int dev = 0;
- CHECK(cudaSetDevice(dev));
- float *d_input, *d_update, *d_D1, *d_D2;
+ int deviceCount = -1; // number of devices
+ cudaGetDeviceCount(&deviceCount);
+ if (deviceCount == 0) {
+ fprintf(stderr, "No CUDA devices found\n");
+ return -1;
+ }
+ float re;
+ re = 0.0f;
+ int ImSize, count, n;
+ count = 0; n = 0;
+ float *d_input, *d_update, *d_D1, *d_D2, *d_update_prev=NULL;
if (Z == 0) Z = 1;
- CHECK(cudaMalloc((void**)&d_input,N*M*Z*sizeof(float)));
- CHECK(cudaMalloc((void**)&d_update,N*M*Z*sizeof(float)));
- CHECK(cudaMalloc((void**)&d_D1,N*M*Z*sizeof(float)));
- CHECK(cudaMalloc((void**)&d_D2,N*M*Z*sizeof(float)));
+ ImSize = N*M*Z;
+ CHECK(cudaMalloc((void**)&d_input,ImSize*sizeof(float)));
+ CHECK(cudaMalloc((void**)&d_update,ImSize*sizeof(float)));
+ CHECK(cudaMalloc((void**)&d_D1,ImSize*sizeof(float)));
+ CHECK(cudaMalloc((void**)&d_D2,ImSize*sizeof(float)));
+ if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) );
- CHECK(cudaMemcpy(d_input,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice));
- CHECK(cudaMemcpy(d_update,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice));
+ CHECK(cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
+ CHECK(cudaMemcpy(d_update,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
- if (Z > 1) {
- // TV - 3D case
+ if (Z == 1) {
+ // TV - 2D case
+ dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
+ dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D));
+
+ for(n=0; n < iter; n++) {
+ /* calculate differences */
+ D1_func2D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M);
+ CHECK(cudaDeviceSynchronize());
+ D2_func2D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M);
+ CHECK(cudaDeviceSynchronize());
+ /*running main kernel*/
+ TV_kernel2D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_update, d_input, lambdaPar, tau, N, M);
+ CHECK(cudaDeviceSynchronize());
+
+ if (epsil != 0.0f) {
+ /* calculate norm - stopping rules using the Thrust library */
+ ROFResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_D1, N, M, ImSize);
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors( cudaPeekAtLastError() );
+
+ // setup arguments
+ square<float> unary_op;
+ thrust::plus<float> binary_op;
+ thrust::device_vector<float> d_vec(d_D1, d_D1 + ImSize);
+ float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op));
+ thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);
+ float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op));
+
+ // compute norm
+ re = (reduction/reduction2);
+ if (re < epsil) count++;
+ if (count > 4) break;
+
+ ROFcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, N, M, ImSize);
+ checkCudaErrors( cudaDeviceSynchronize() );
+ checkCudaErrors(cudaPeekAtLastError() );
+ }
+
+ }
+ }
+ else {
+ // TV - 3D case
dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKXSIZE));
float *d_D3;
CHECK(cudaMalloc((void**)&d_D3,N*M*Z*sizeof(float)));
- for(int n=0; n < iter; n++) {
+ for(n=0; n < iter; n++) {
/* calculate differences */
D1_func3D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M, Z);
CHECK(cudaDeviceSynchronize());
- D2_func3D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M, Z);
+ D2_func3D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M, Z);
CHECK(cudaDeviceSynchronize());
D3_func3D<<<dimGrid,dimBlock>>>(d_update, d_D3, N, M, Z);
CHECK(cudaDeviceSynchronize());
@@ -331,28 +409,16 @@ extern "C" int TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int
}
CHECK(cudaFree(d_D3));
- }
- else {
- // TV - 2D case
- dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
- dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D));
-
- for(int n=0; n < iter; n++) {
- /* calculate differences */
- D1_func2D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M);
- CHECK(cudaDeviceSynchronize());
- D2_func2D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M);
- CHECK(cudaDeviceSynchronize());
- /*running main kernel*/
- TV_kernel2D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_update, d_input, lambdaPar, tau, N, M);
- CHECK(cudaDeviceSynchronize());
- }
- }
+ }
CHECK(cudaMemcpy(Output,d_update,N*M*Z*sizeof(float),cudaMemcpyDeviceToHost));
+ if (epsil != 0.0f) cudaFree(d_update_prev);
CHECK(cudaFree(d_input));
CHECK(cudaFree(d_update));
CHECK(cudaFree(d_D1));
CHECK(cudaFree(d_D2));
- //cudaDeviceReset();
+
+ infovector[0] = (float)(n); /*iterations number (if stopped earlier based on tolerance)*/
+ infovector[1] = re; /* reached tolerance */
+
return 0;
}
diff --git a/src/Core/regularisers_GPU/TV_ROF_GPU_core.h b/src/Core/regularisers_GPU/TV_ROF_GPU_core.h
index 3a09296..0a75124 100755
--- a/src/Core/regularisers_GPU/TV_ROF_GPU_core.h
+++ b/src/Core/regularisers_GPU/TV_ROF_GPU_core.h
@@ -3,6 +3,6 @@
#include "CCPiDefines.h"
#include <stdio.h>
-extern "C" CCPI_EXPORT int TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z);
+extern "C" CCPI_EXPORT int TV_ROF_GPU_main(float* Input, float* Output, float *infovector, float lambdaPar, int iter, float tau, float epsil, int N, int M, int Z);
#endif
diff --git a/src/Core/regularisers_GPU/TV_SB_GPU_core.cu b/src/Core/regularisers_GPU/TV_SB_GPU_core.cu
index 1f494ee..c31f104 100755
--- a/src/Core/regularisers_GPU/TV_SB_GPU_core.cu
+++ b/src/Core/regularisers_GPU/TV_SB_GPU_core.cu
@@ -49,7 +49,7 @@ limitations under the License.
#define BLKZSIZE 8
#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
-struct square { __host__ __device__ float operator()(float x) { return x * x; } };
+// struct square { __host__ __device__ float operator()(float x) { return x * x; } };
/************************************************/
/*****************2D modules*********************/
@@ -430,14 +430,14 @@ extern "C" int TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, f
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
- thrust::device_vector<float> d_vec(d_res, d_res + DimTotal);
- float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
- thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal);
- float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
+// thrust::device_vector<float> d_vec(d_res, d_res + DimTotal);
+ // float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
+ // thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal);
+ // float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
- re = (reduction/reduction2);
- if (re < epsil) count++;
- if (count > 4) break;
+ // re = (reduction/reduction2);
+ // if (re < epsil) count++;
+ // if (count > 4) break;
}
}
@@ -521,14 +521,14 @@ extern "C" int TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, f
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
- thrust::device_vector<float> d_vec(d_res, d_res + DimTotal);
- float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
- thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal);
- float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
+ // thrust::device_vector<float> d_vec(d_res, d_res + DimTotal);
+ // float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
+ // thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal);
+// float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
- re = (reduction/reduction2);
- if (re < epsil) count++;
- if (count > 4) break;
+ // re = (reduction/reduction2);
+ // if (re < epsil) count++;
+ // if (count > 4) break;
}
}
if (printM == 1) printf("SB-TV iterations stopped at iteration %i \n", ll);
diff --git a/src/Core/regularisers_GPU/dTV_FGP_GPU_core.cu b/src/Core/regularisers_GPU/dTV_FGP_GPU_core.cu
index 7503ec7..9db594e 100644
--- a/src/Core/regularisers_GPU/dTV_FGP_GPU_core.cu
+++ b/src/Core/regularisers_GPU/dTV_FGP_GPU_core.cu
@@ -53,7 +53,7 @@ limitations under the License.
#define BLKZSIZE 8
#define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
-struct square { __host__ __device__ float operator()(float x) { return x * x; } };
+//struct square { __host__ __device__ float operator()(float x) { return x * x; } };
/************************************************/
/*****************2D modules*********************/
@@ -552,14 +552,14 @@ extern "C" int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, fl
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
- thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize);
- float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
- thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);
- float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
+ // thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize);
+ // float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
+ // thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);
+ // float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
- re = (reduction/reduction2);
- if (re < epsil) count++;
- if (count > 4) break;
+ // re = (reduction/reduction2);
+ // if (re < epsil) count++;
+ // if (count > 4) break;
dTVcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
@@ -686,14 +686,14 @@ extern "C" int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, fl
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
- thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize);
- float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
- thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);
- float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
+ // thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize);
+ // float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
+ // thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);
+ // float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
- re = (reduction/reduction2);
- if (re < epsil) count++;
- if (count > 4) break;
+ // re = (reduction/reduction2);
+ // if (re < epsil) count++;
+ // if (count > 4) break;
dTVcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
diff --git a/src/Core/regularisers_GPU/shared.h b/src/Core/regularisers_GPU/shared.h
index fe98cd6..698dddd 100644
--- a/src/Core/regularisers_GPU/shared.h
+++ b/src/Core/regularisers_GPU/shared.h
@@ -1,4 +1,13 @@
/*shared macros*/
+template <typename T>
+struct square
+{
+ __host__ __device__
+ T operator()(const T& x) const {
+ return (float)(x*x);
+ }
+};
+
/*checks CUDA call, should be used in functions returning <int> value
diff --git a/src/Matlab/CMakeLists.txt b/src/Matlab/CMakeLists.txt
index b97f845..6c5e6be 100755
--- a/src/Matlab/CMakeLists.txt
+++ b/src/Matlab/CMakeLists.txt
@@ -38,9 +38,9 @@ find_package(Matlab REQUIRED COMPONENTS MAIN_PROGRAM MX_LIBRARY ENG_LIBRARY )
#set (MEX_TARGETS "CPU_TNV;CPU_ROF")
#list(APPEND MEX_TARGETS "CPU_TNV")
#list(APPEND MEX_TARGETS "CPU_ROF")
-
+
file(GLOB CPU_MEX_FILES
- "${CMAKE_SOURCE_DIR}/Matlab/mex_compile/regularisers_CPU/*.c"
+ "${CMAKE_SOURCE_DIR}/src/Matlab/mex_compile/regularisers_CPU/*.c"
#"${CMAKE_SOURCE_DIR}/Matlab/mex_compile/regularisers_GPU/*.c"
)
@@ -101,7 +101,7 @@ if (BUILD_CUDA)
find_package(CUDA)
if (CUDA_FOUND)
file(GLOB GPU_MEX_FILES
- "${CMAKE_SOURCE_DIR}/Matlab/mex_compile/regularisers_GPU/*.cpp"
+ "${CMAKE_SOURCE_DIR}/src/Matlab/mex_compile/regularisers_GPU/*.cpp"
)
list(LENGTH GPU_MEX_FILES num)
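Both GPU cores now evaluate the stopping metric on the device with Thrust, using the square<T> functor added to shared.h in the hunks above. The self-contained sketch below is not taken from the commit; it only illustrates the relative residual re = ||x_k - x_{k-1}|| / ||x_k|| that the ROF/FGP iteration loops compare against epsil. The helper name relative_residual and the local redeclaration of square<T> are hypothetical.

// Illustrative sketch (CUDA C++/Thrust), compiled with nvcc.
#include <cmath>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform.h>
#include <thrust/transform_reduce.h>

template <typename T>
struct square {
    __host__ __device__ T operator()(const T& x) const { return x * x; }
};

float relative_residual(const thrust::device_vector<float>& x_cur,
                        const thrust::device_vector<float>& x_prev)
{
    // residual = x_cur - x_prev, computed element-wise on the device
    thrust::device_vector<float> resid(x_cur.size());
    thrust::transform(x_cur.begin(), x_cur.end(), x_prev.begin(),
                      resid.begin(), thrust::minus<float>());

    square<float> unary_op;
    thrust::plus<float> binary_op;
    float num = std::sqrt(thrust::transform_reduce(resid.begin(), resid.end(),
                                                   unary_op, 0.0f, binary_op));
    float den = std::sqrt(thrust::transform_reduce(x_cur.begin(), x_cur.end(),
                                                   unary_op, 0.0f, binary_op));
    return num / den;   // the cores compare this value against epsil each iteration
}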
diff --git a/src/Python/ccpi/filters/regularisers.py b/src/Python/ccpi/filters/regularisers.py
index 67f432b..05120af 100644
--- a/src/Python/ccpi/filters/regularisers.py
+++ b/src/Python/ccpi/filters/regularisers.py
@@ -22,7 +22,8 @@ def ROF_TV(inputData, regularisation_parameter, iterations,
return TV_ROF_GPU(inputData,
regularisation_parameter,
iterations,
- time_marching_parameter)
+ time_marching_parameter,
+ tolerance_param)
else:
if not gpu_enabled and device == 'gpu':
raise ValueError ('GPU is not available')
@@ -44,8 +45,7 @@ def FGP_TV(inputData, regularisation_parameter,iterations,
iterations,
tolerance_param,
methodTV,
- nonneg,
- 1)
+ nonneg)
else:
if not gpu_enabled and device == 'gpu':
raise ValueError ('GPU is not available')
diff --git a/src/Python/setup-regularisers.py.in b/src/Python/setup-regularisers.py.in
index 39b820a..4c578e3 100644
--- a/src/Python/setup-regularisers.py.in
+++ b/src/Python/setup-regularisers.py.in
@@ -68,7 +68,7 @@ setup(
],
zip_safe = False,
- packages = {'ccpi','ccpi.filters', 'ccpi.supp'},
+ packages = {'ccpi', 'ccpi.filters', 'ccpi.supp'},
)
diff --git a/src/Python/src/gpu_regularisers.pyx b/src/Python/src/gpu_regularisers.pyx
index b52f669..db4fa67 100644
--- a/src/Python/src/gpu_regularisers.pyx
+++ b/src/Python/src/gpu_regularisers.pyx
@@ -20,8 +20,8 @@ cimport numpy as np
CUDAErrorMessage = 'CUDA error'
-cdef extern int TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z);
-cdef extern int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z);
+cdef extern int TV_ROF_GPU_main(float* Input, float* Output, float *infovector, float lambdaPar, int iter, float tau, float epsil, int N, int M, int Z);
+cdef extern int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int N, int M, int Z);
cdef extern int TV_SB_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int printM, int N, int M, int Z);
cdef extern int TGV_GPU_main(float *Input, float *Output, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY, int dimZ);
cdef extern int LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int N, int M, int Z);
@@ -34,17 +34,20 @@ cdef extern int PatchSelect_GPU_main(float *Input, unsigned short *H_i, unsigned
def TV_ROF_GPU(inputData,
regularisation_parameter,
iterations,
- time_marching_parameter):
+ time_marching_parameter,
+ tolerance_param):
if inputData.ndim == 2:
return ROFTV2D(inputData,
regularisation_parameter,
iterations,
- time_marching_parameter)
+ time_marching_parameter,
+ tolerance_param)
elif inputData.ndim == 3:
return ROFTV3D(inputData,
regularisation_parameter,
iterations,
- time_marching_parameter)
+ time_marching_parameter,
+ tolerance_param)
# Total-variation Fast-Gradient-Projection (FGP)
def TV_FGP_GPU(inputData,
@@ -52,24 +55,21 @@ def TV_FGP_GPU(inputData,
iterations,
tolerance_param,
methodTV,
- nonneg,
- printM):
+ nonneg):
if inputData.ndim == 2:
return FGPTV2D(inputData,
regularisation_parameter,
iterations,
tolerance_param,
methodTV,
- nonneg,
- printM)
+ nonneg)
elif inputData.ndim == 3:
return FGPTV3D(inputData,
regularisation_parameter,
iterations,
tolerance_param,
methodTV,
- nonneg,
- printM)
+ nonneg)
# Total-variation Split Bregman (SB)
def TV_SB_GPU(inputData,
regularisation_parameter,
@@ -179,7 +179,8 @@ def Diff4th_GPU(inputData,
def ROFTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
float regularisation_parameter,
int iterations,
- float time_marching_parameter):
+ float time_marching_parameter,
+ float tolerance_param):
cdef long dims[2]
dims[0] = inputData.shape[0]
@@ -187,22 +188,26 @@ def ROFTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \
np.zeros([dims[0],dims[1]], dtype='float32')
+ cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \
+ np.ones([2], dtype='float32')
# Running CUDA code here
if (TV_ROF_GPU_main(
- &inputData[0,0], &outputData[0,0],
+ &inputData[0,0], &outputData[0,0], &infovec[0],
regularisation_parameter,
- iterations ,
+ iterations,
time_marching_parameter,
+ tolerance_param,
dims[1], dims[0], 1)==0):
- return outputData;
+ return (outputData,infovec)
else:
raise ValueError(CUDAErrorMessage);
def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
float regularisation_parameter,
int iterations,
- float time_marching_parameter):
+ float time_marching_parameter,
+ float tolerance_param):
cdef long dims[3]
dims[0] = inputData.shape[0]
@@ -211,15 +216,18 @@ def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \
np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
+ cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \
+ np.ones([2], dtype='float32')
# Running CUDA code here
if (TV_ROF_GPU_main(
- &inputData[0,0,0], &outputData[0,0,0],
+ &inputData[0,0,0], &outputData[0,0,0], &infovec[0],
regularisation_parameter,
- iterations ,
+ iterations,
time_marching_parameter,
+ tolerance_param,
dims[2], dims[1], dims[0])==0):
- return outputData;
+ return (outputData,infovec)
else:
raise ValueError(CUDAErrorMessage);
#****************************************************************#
@@ -231,8 +239,7 @@ def FGPTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
int iterations,
float tolerance_param,
int methodTV,
- int nonneg,
- int printM):
+ int nonneg):
cdef long dims[2]
dims[0] = inputData.shape[0]
@@ -240,47 +247,48 @@ def FGPTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \
np.zeros([dims[0],dims[1]], dtype='float32')
+ cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \
+ np.ones([2], dtype='float32')
# Running CUDA code here
- if (TV_FGP_GPU_main(&inputData[0,0], &outputData[0,0],
+ if (TV_FGP_GPU_main(&inputData[0,0], &outputData[0,0], &infovec[0],
regularisation_parameter,
- iterations,
+ iterations,
tolerance_param,
methodTV,
nonneg,
- printM,
dims[1], dims[0], 1)==0):
- return outputData;
+ return (outputData,infovec)
else:
raise ValueError(CUDAErrorMessage);
-
def FGPTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
float regularisation_parameter,
int iterations,
float tolerance_param,
int methodTV,
- int nonneg,
- int printM):
+ int nonneg):
cdef long dims[3]
dims[0] = inputData.shape[0]
dims[1] = inputData.shape[1]
dims[2] = inputData.shape[2]
+
cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \
np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
+ cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \
+ np.ones([2], dtype='float32')
# Running CUDA code here
- if (TV_FGP_GPU_main(&inputData[0,0,0], &outputData[0,0,0],
- regularisation_parameter ,
+ if (TV_FGP_GPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0],
+ regularisation_parameter,
iterations,
tolerance_param,
methodTV,
nonneg,
- printM,
dims[2], dims[1], dims[0])==0):
- return outputData;
+ return (outputData,infovec)
else:
raise ValueError(CUDAErrorMessage);
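For callers of the updated entry points, the returned information vector holds the number of iterations actually performed in element [0] and the last relative residual in element [1]; the loops stop early once that residual has fallen below epsil more than four times. A hypothetical caller-side check (not part of the commit; report() and its arguments are made up) might look like this:

// Illustrative convergence report based on the infovector semantics above.
#include <cstdio>

void report(const float infovector[2], int iter_requested, float epsil)
{
    int   performed = (int)infovector[0];   // iterations actually run
    float residual  = infovector[1];        // last relative residual
    if (epsil != 0.0f && performed < iter_requested)
        std::printf("converged: %d/%d iterations, residual %.3e < %.3e\n",
                    performed, iter_requested, residual, epsil);
    else
        std::printf("ran full %d iterations, residual %.3e\n", performed, residual);
}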