summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rwxr-xr-xCore/regularisers_GPU/TV_FGP_GPU_core.cu65
-rwxr-xr-xCore/regularisers_GPU/TV_SB_GPU_core.cu63
-rw-r--r--Core/regularisers_GPU/dTV_FGP_GPU_core.cu69
-rw-r--r--Core/regularisers_GPU/utils_cu.h56
-rw-r--r--Wrappers/Matlab/mex_compile/compileGPU_mex.m2
5 files changed, 173 insertions, 82 deletions
diff --git a/Core/regularisers_GPU/TV_FGP_GPU_core.cu b/Core/regularisers_GPU/TV_FGP_GPU_core.cu
index 35267f2..c0ae890 100755
--- a/Core/regularisers_GPU/TV_FGP_GPU_core.cu
+++ b/Core/regularisers_GPU/TV_FGP_GPU_core.cu
@@ -18,7 +18,6 @@ limitations under the License.
*/
#include "TV_FGP_GPU_core.h"
-#include "utils_cu.h"
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
@@ -302,6 +301,56 @@ __global__ void nonneg3D_kernel(float* Output, int N, int M, int Z, int num_tota
if (Output[index] < 0.0f) Output[index] = 0.0f;
}
}
+__global__ void FGPcopy_kernel2D(float *Input, float* Output, int N, int M, int num_total)
+{
+ int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+ int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+ if (index < num_total) {
+ Output[index] = Input[index];
+ }
+}
+
+__global__ void FGPcopy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total)
+{
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ int j = blockDim.y * blockIdx.y + threadIdx.y;
+ int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = (N*M)*k + i + N*j;
+
+ if (index < num_total) {
+ Output[index] = Input[index];
+ }
+}
+
+__global__ void FGPResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total)
+{
+ int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+ int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+ if (index < num_total) {
+ Output[index] = Input1[index] - Input2[index];
+ }
+}
+
+__global__ void FGPResidCalc3D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int Z, int num_total)
+{
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ int j = blockDim.y * blockIdx.y + threadIdx.y;
+ int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = (N*M)*k + i + N*j;
+
+ if (index < num_total) {
+ Output[index] = Input1[index] - Input2[index];
+ }
+}
+
/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
////////////MAIN HOST FUNCTION ///////////////
@@ -382,7 +431,7 @@ extern "C" void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, in
if (epsil != 0.0f) {
/* calculate norm - stopping rules using the Thrust library */
- ResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize);
+ FGPResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
@@ -395,16 +444,16 @@ extern "C" void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, in
if (re < epsil) count++;
if (count > 4) break;
- copy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize);
+ FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
}
- copy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize);
+ FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
- copy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize);
+ FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
@@ -490,15 +539,15 @@ extern "C" void TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, in
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
- copy_kernel3D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, dimZ, ImSize);
+ FGPcopy_kernel3D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, dimZ, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
- copy_kernel3D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, dimZ, ImSize);
+ FGPcopy_kernel3D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, dimZ, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
- copy_kernel3D<<<dimGrid,dimBlock>>>(P3, P3_prev, dimX, dimY, dimZ, ImSize);
+ FGPcopy_kernel3D<<<dimGrid,dimBlock>>>(P3, P3_prev, dimX, dimY, dimZ, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
diff --git a/Core/regularisers_GPU/TV_SB_GPU_core.cu b/Core/regularisers_GPU/TV_SB_GPU_core.cu
index c6f1e29..0bc466f 100755
--- a/Core/regularisers_GPU/TV_SB_GPU_core.cu
+++ b/Core/regularisers_GPU/TV_SB_GPU_core.cu
@@ -18,7 +18,6 @@ limitations under the License.
*/
#include "TV_SB_GPU_core.h"
-#include "utils_cu.h"
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
@@ -312,6 +311,56 @@ __global__ void updBxBy3D_kernel(float *U, float *Dx, float *Dy, float *Dz, floa
return;
}
+__global__ void SBcopy_kernel2D(float *Input, float* Output, int N, int M, int num_total)
+{
+ int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+ int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+ if (index < num_total) {
+ Output[index] = Input[index];
+ }
+}
+
+__global__ void SBcopy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total)
+{
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ int j = blockDim.y * blockIdx.y + threadIdx.y;
+ int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = (N*M)*k + i + N*j;
+
+ if (index < num_total) {
+ Output[index] = Input[index];
+ }
+}
+
+__global__ void SBResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total)
+{
+ int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+ int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+ if (index < num_total) {
+ Output[index] = Input1[index] - Input2[index];
+ }
+}
+
+__global__ void SBResidCalc3D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int Z, int num_total)
+{
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ int j = blockDim.y * blockIdx.y + threadIdx.y;
+ int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = (N*M)*k + i + N*j;
+
+ if (index < num_total) {
+ Output[index] = Input1[index] - Input2[index];
+ }
+}
+
/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
/********************* MAIN HOST FUNCTION ******************/
/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
@@ -361,7 +410,7 @@ extern "C" void TV_SB_GPU_main(float *Input, float *Output, float mu, int iter,
for (ll = 0; ll < iter; ll++) {
/* storing old value */
- copy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, DimTotal);
+ SBcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, DimTotal);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
@@ -369,7 +418,7 @@ extern "C" void TV_SB_GPU_main(float *Input, float *Output, float mu, int iter,
gauss_seidel2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, d_update_prev, Dx, Dy, Bx, By, lambda, mu, normConst, dimX, dimY, DimTotal);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
- copy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, DimTotal);
+ SBcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, DimTotal);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
/* 2nd GS iteration */
@@ -388,7 +437,7 @@ extern "C" void TV_SB_GPU_main(float *Input, float *Output, float mu, int iter,
if (epsil != 0.0f) {
/* calculate norm - stopping rules using the Thrust library */
- ResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_res, dimX, dimY, DimTotal);
+ SBResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_res, dimX, dimY, DimTotal);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
@@ -452,7 +501,7 @@ extern "C" void TV_SB_GPU_main(float *Input, float *Output, float mu, int iter,
for (ll = 0; ll < iter; ll++) {
/* storing old value */
- copy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, DimTotal);
+ SBcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, DimTotal);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
@@ -460,7 +509,7 @@ extern "C" void TV_SB_GPU_main(float *Input, float *Output, float mu, int iter,
gauss_seidel3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, d_update_prev, Dx, Dy, Dz, Bx, By, Bz, lambda, mu, normConst, dimX, dimY, dimZ, DimTotal);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
- copy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, DimTotal);
+ SBcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, DimTotal);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
/* 2nd GS iteration */
@@ -479,7 +528,7 @@ extern "C" void TV_SB_GPU_main(float *Input, float *Output, float mu, int iter,
if (epsil != 0.0f) {
/* calculate norm - stopping rules using the Thrust library */
- ResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_res, dimX, dimY, dimZ, DimTotal);
+ SBResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_res, dimX, dimY, dimZ, DimTotal);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
diff --git a/Core/regularisers_GPU/dTV_FGP_GPU_core.cu b/Core/regularisers_GPU/dTV_FGP_GPU_core.cu
index 9e425ba..d4699f9 100644
--- a/Core/regularisers_GPU/dTV_FGP_GPU_core.cu
+++ b/Core/regularisers_GPU/dTV_FGP_GPU_core.cu
@@ -18,7 +18,6 @@ limitations under the License.
*/
#include "dTV_FGP_GPU_core.h"
-#include "utils_cu.h"
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
@@ -230,6 +229,56 @@ __global__ void dTVnonneg2D_kernel(float* Output, int N, int M, int num_total)
if (Output[index] < 0.0f) Output[index] = 0.0f;
}
}
+__global__ void dTVcopy_kernel2D(float *Input, float* Output, int N, int M, int num_total)
+{
+ int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+ int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+ if (index < num_total) {
+ Output[index] = Input[index];
+ }
+}
+
+__global__ void dTVcopy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total)
+{
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ int j = blockDim.y * blockIdx.y + threadIdx.y;
+ int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = (N*M)*k + i + N*j;
+
+ if (index < num_total) {
+ Output[index] = Input[index];
+ }
+}
+
+__global__ void dTVResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total)
+{
+ int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+ int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+
+ int index = xIndex + N*yIndex;
+
+ if (index < num_total) {
+ Output[index] = Input1[index] - Input2[index];
+ }
+}
+
+__global__ void dTVResidCalc3D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int Z, int num_total)
+{
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ int j = blockDim.y * blockIdx.y + threadIdx.y;
+ int k = blockDim.z * blockIdx.z + threadIdx.z;
+
+ int index = (N*M)*k + i + N*j;
+
+ if (index < num_total) {
+ Output[index] = Input1[index] - Input2[index];
+ }
+}
+
/************************************************/
/*****************3D modules*********************/
/************************************************/
@@ -512,7 +561,7 @@ extern "C" void dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, f
if (epsil != 0.0f) {
/* calculate norm - stopping rules using the Thrust library */
- ResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize);
+ dTVResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
@@ -525,16 +574,16 @@ extern "C" void dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, f
if (re < epsil) count++;
if (count > 4) break;
- copy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize);
+ dTVcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
}
- copy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize);
+ dTVcopy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
- copy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize);
+ dTVcopy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
@@ -646,7 +695,7 @@ extern "C" void dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, f
if (epsil != 0.0f) {
/* calculate norm - stopping rules using the Thrust library */
- ResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, dimZ, ImSize);
+ dTVResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, dimZ, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
@@ -659,20 +708,20 @@ extern "C" void dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, f
if (re < epsil) count++;
if (count > 4) break;
- copy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, ImSize);
+ dTVcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
}
- copy_kernel3D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, dimZ, ImSize);
+ dTVcopy_kernel3D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, dimZ, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
- copy_kernel3D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, dimZ, ImSize);
+ dTVcopy_kernel3D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, dimZ, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
- copy_kernel3D<<<dimGrid,dimBlock>>>(P3, P3_prev, dimX, dimY, dimZ, ImSize);
+ dTVcopy_kernel3D<<<dimGrid,dimBlock>>>(P3, P3_prev, dimX, dimY, dimZ, ImSize);
checkCudaErrors( cudaDeviceSynchronize() );
checkCudaErrors(cudaPeekAtLastError() );
diff --git a/Core/regularisers_GPU/utils_cu.h b/Core/regularisers_GPU/utils_cu.h
deleted file mode 100644
index aff17a2..0000000
--- a/Core/regularisers_GPU/utils_cu.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <memory.h>
-
-/*Some CUDA functions which frequently re-used from various modules */
-/***********************************************************************/
-__global__ void copy_kernel2D(float *Input, float* Output, int N, int M, int num_total)
-{
- int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
- int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
-
- int index = xIndex + N*yIndex;
-
- if (index < num_total) {
- Output[index] = Input[index];
- }
-}
-
-__global__ void copy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total)
-{
- int i = blockDim.x * blockIdx.x + threadIdx.x;
- int j = blockDim.y * blockIdx.y + threadIdx.y;
- int k = blockDim.z * blockIdx.z + threadIdx.z;
-
- int index = (N*M)*k + i + N*j;
-
- if (index < num_total) {
- Output[index] = Input[index];
- }
-}
-
-__global__ void ResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total)
-{
- int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
- int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
-
- int index = xIndex + N*yIndex;
-
- if (index < num_total) {
- Output[index] = Input1[index] - Input2[index];
- }
-}
-
-__global__ void ResidCalc3D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int Z, int num_total)
-{
- int i = blockDim.x * blockIdx.x + threadIdx.x;
- int j = blockDim.y * blockIdx.y + threadIdx.y;
- int k = blockDim.z * blockIdx.z + threadIdx.z;
-
- int index = (N*M)*k + i + N*j;
-
- if (index < num_total) {
- Output[index] = Input1[index] - Input2[index];
- }
-}
-
diff --git a/Wrappers/Matlab/mex_compile/compileGPU_mex.m b/Wrappers/Matlab/mex_compile/compileGPU_mex.m
index 0143c69..3dbeb8a 100644
--- a/Wrappers/Matlab/mex_compile/compileGPU_mex.m
+++ b/Wrappers/Matlab/mex_compile/compileGPU_mex.m
@@ -31,7 +31,7 @@ movefile SB_TV_GPU.mex* ../installed/
mex -g -I/usr/local/cuda-7.5/include -L/usr/local/cuda-7.5/lib64 -lcudart -lcufft -lmwgpu FGP_dTV_GPU.cpp dTV_FGP_GPU_core.o
movefile FGP_dTV_GPU.mex* ../installed/
-delete TV_ROF_GPU_core* TV_FGP_GPU_core* TV_SB_GPU_core* dTV_FGP_GPU_core* CCPiDefines.h utils_cu.h
+delete TV_ROF_GPU_core* TV_FGP_GPU_core* TV_SB_GPU_core* dTV_FGP_GPU_core* CCPiDefines.h
fprintf('%s \n', 'All successfully compiled!');
cd ../../