diff options
author | Daniil Kazantsev <dkazanc@hotmail.com> | 2019-03-08 14:22:43 +0000 |
---|---|---|
committer | Daniil Kazantsev <dkazanc@hotmail.com> | 2019-03-08 14:22:43 +0000 |
commit | 49761c3730e2ddf2ec40c84952572c43e9334ccb (patch) | |
tree | ca76282ad2a8ce7c67d55efb462ec9efa8ae8f21 | |
parent | 47693d15132130513f8d0f74fd4831a3bbf69159 (diff) | |
download | regularization-49761c3730e2ddf2ec40c84952572c43e9334ccb.tar.gz regularization-49761c3730e2ddf2ec40c84952572c43e9334ccb.tar.bz2 regularization-49761c3730e2ddf2ec40c84952572c43e9334ccb.tar.xz regularization-49761c3730e2ddf2ec40c84952572c43e9334ccb.zip |
SBTV,LLTROF completed
-rw-r--r-- | demos/demo_cpu_regularisers.py | 89 | ||||
-rw-r--r-- | demos/demo_cpu_regularisers3D.py | 69 | ||||
-rw-r--r-- | demos/demo_gpu_regularisers.py | 84 | ||||
-rw-r--r-- | demos/demo_gpu_regularisers3D.py | 61 | ||||
-rw-r--r-- | src/Core/regularisers_CPU/FGP_TV_core.c | 8 | ||||
-rw-r--r-- | src/Core/regularisers_CPU/LLT_ROF_core.c | 141 | ||||
-rw-r--r-- | src/Core/regularisers_CPU/LLT_ROF_core.h | 14 | ||||
-rw-r--r-- | src/Core/regularisers_CPU/ROF_TV_core.c | 4 | ||||
-rwxr-xr-x | src/Core/regularisers_CPU/SB_TV_core.c | 156 | ||||
-rw-r--r-- | src/Core/regularisers_CPU/SB_TV_core.h | 8 | ||||
-rw-r--r-- | src/Core/regularisers_GPU/LLT_ROF_GPU_core.cu | 360 | ||||
-rw-r--r-- | src/Core/regularisers_GPU/LLT_ROF_GPU_core.h | 4 | ||||
-rwxr-xr-x | src/Core/regularisers_GPU/TV_FGP_GPU_core.cu | 287 | ||||
-rwxr-xr-x | src/Core/regularisers_GPU/TV_ROF_GPU_core.cu | 299 | ||||
-rwxr-xr-x | src/Core/regularisers_GPU/TV_SB_GPU_core.cu | 383 | ||||
-rwxr-xr-x | src/Core/regularisers_GPU/TV_SB_GPU_core.h | 4 | ||||
-rw-r--r-- | src/Python/ccpi/filters/regularisers.py | 30 | ||||
-rw-r--r-- | src/Python/src/cpu_regularisers.pyx | 132 | ||||
-rw-r--r-- | src/Python/src/gpu_regularisers.pyx | 72 |
19 files changed, 1215 insertions, 990 deletions
diff --git a/demos/demo_cpu_regularisers.py b/demos/demo_cpu_regularisers.py index 4866811..f2d2f33 100644 --- a/demos/demo_cpu_regularisers.py +++ b/demos/demo_cpu_regularisers.py @@ -32,7 +32,7 @@ def printParametersToString(pars): ############################################################################### #filename = os.path.join( "data" ,"lena_gray_512.tif") -filename = "/home/kjy41806/Documents/SOFT/CCPi-Regularisation-Toolkit/test/lena_gray_512.tif" +filename = "/home/algol/Documents/DEV/CCPi-Regularisation-Toolkit/test/lena_gray_512.tif" # read image Im = plt.imread(filename) @@ -130,14 +130,14 @@ imgplot = plt.imshow(u0,cmap="gray") pars = {'algorithm' : FGP_TV, \ 'input' : u0,\ 'regularisation_parameter':0.02, \ - 'number_of_iterations' :200 ,\ + 'number_of_iterations' :400 ,\ 'tolerance_constant':1e-06,\ 'methodTV': 0 ,\ 'nonneg': 0} print ("#############FGP TV CPU####################") start_time = timeit.default_timer() -fgp_cpu,info_vec_cpu = FGP_TV(pars['input'], +(fgp_cpu,info_vec_cpu) = FGP_TV(pars['input'], pars['regularisation_parameter'], pars['number_of_iterations'], pars['tolerance_constant'], @@ -175,21 +175,18 @@ imgplot = plt.imshow(u0,cmap="gray") # set parameters pars = {'algorithm' : SB_TV, \ 'input' : u0,\ - 'regularisation_parameter':0.04, \ - 'number_of_iterations' :150 ,\ + 'regularisation_parameter':0.02, \ + 'number_of_iterations' :250 ,\ 'tolerance_constant':1e-06,\ - 'methodTV': 0 ,\ - 'printingOut': 0 - } + 'methodTV': 0} print ("#############SB TV CPU####################") start_time = timeit.default_timer() -sb_cpu = SB_TV(pars['input'], +(sb_cpu,info_vec_cpu) = SB_TV(pars['input'], pars['regularisation_parameter'], pars['number_of_iterations'], pars['tolerance_constant'], - pars['methodTV'], - pars['printingOut'],'cpu') + pars['methodTV'],'cpu') Qtools = QualityTools(Im, sb_cpu) pars['rmse'] = Qtools.rmse() @@ -209,37 +206,35 @@ plt.title('{}'.format('CPU results')) #%% print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") -print ("_____Total Generalised Variation (2D)______") +print ("______________LLT- ROF (2D)________________") print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") ## plot fig = plt.figure() -plt.suptitle('Performance of TGV regulariser using the CPU') +plt.suptitle('Performance of LLT-ROF regulariser using the CPU') a=fig.add_subplot(1,2,1) a.set_title('Noisy Image') imgplot = plt.imshow(u0,cmap="gray") # set parameters -pars = {'algorithm' : TGV, \ +pars = {'algorithm' : LLT_ROF, \ 'input' : u0,\ - 'regularisation_parameter':0.04, \ - 'alpha1':1.0,\ - 'alpha0':2.0,\ - 'number_of_iterations' :1350 ,\ - 'LipshitzConstant' :12 ,\ - } + 'regularisation_parameterROF':0.01, \ + 'regularisation_parameterLLT':0.0085, \ + 'number_of_iterations' :6000 ,\ + 'time_marching_parameter' :0.001 ,\ + 'tolerance_constant':1e-06} -print ("#############TGV CPU####################") +print ("#############LLT- ROF CPU####################") start_time = timeit.default_timer() -tgv_cpu = TGV(pars['input'], - pars['regularisation_parameter'], - pars['alpha1'], - pars['alpha0'], +(lltrof_cpu,info_vec_cpu) = LLT_ROF(pars['input'], + pars['regularisation_parameterROF'], + pars['regularisation_parameterLLT'], pars['number_of_iterations'], - pars['LipshitzConstant'],'cpu') - - -Qtools = QualityTools(Im, tgv_cpu) + pars['time_marching_parameter'], + pars['tolerance_constant'], 'cpu') + +Qtools = QualityTools(Im, lltrof_cpu) pars['rmse'] = Qtools.rmse() txtstr = printParametersToString(pars) @@ -252,40 +247,42 @@ props = dict(boxstyle='round', facecolor='wheat', alpha=0.75) # place a text box in upper left in axes coords a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14, verticalalignment='top', bbox=props) -imgplot = plt.imshow(tgv_cpu, cmap="gray") +imgplot = plt.imshow(lltrof_cpu, cmap="gray") plt.title('{}'.format('CPU results')) #%% - print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") -print ("______________LLT- ROF (2D)________________") +print ("_____Total Generalised Variation (2D)______") print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") ## plot fig = plt.figure() -plt.suptitle('Performance of LLT-ROF regulariser using the CPU') +plt.suptitle('Performance of TGV regulariser using the CPU') a=fig.add_subplot(1,2,1) a.set_title('Noisy Image') imgplot = plt.imshow(u0,cmap="gray") # set parameters -pars = {'algorithm' : LLT_ROF, \ +pars = {'algorithm' : TGV, \ 'input' : u0,\ - 'regularisation_parameterROF':0.04, \ - 'regularisation_parameterLLT':0.01, \ - 'number_of_iterations' :500 ,\ - 'time_marching_parameter' :0.0025 ,\ + 'regularisation_parameter':0.04, \ + 'alpha1':1.0,\ + 'alpha0':2.0,\ + 'number_of_iterations' :1350 ,\ + 'LipshitzConstant' :12 ,\ } -print ("#############LLT- ROF CPU####################") +print ("#############TGV CPU####################") start_time = timeit.default_timer() -lltrof_cpu = LLT_ROF(pars['input'], - pars['regularisation_parameterROF'], - pars['regularisation_parameterLLT'], +tgv_cpu = TGV(pars['input'], + pars['regularisation_parameter'], + pars['alpha1'], + pars['alpha0'], pars['number_of_iterations'], - pars['time_marching_parameter'],'cpu') - -Qtools = QualityTools(Im, lltrof_cpu) + pars['LipshitzConstant'],'cpu') + + +Qtools = QualityTools(Im, tgv_cpu) pars['rmse'] = Qtools.rmse() txtstr = printParametersToString(pars) @@ -298,7 +295,7 @@ props = dict(boxstyle='round', facecolor='wheat', alpha=0.75) # place a text box in upper left in axes coords a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14, verticalalignment='top', bbox=props) -imgplot = plt.imshow(lltrof_cpu, cmap="gray") +imgplot = plt.imshow(tgv_cpu, cmap="gray") plt.title('{}'.format('CPU results')) #%% diff --git a/demos/demo_cpu_regularisers3D.py b/demos/demo_cpu_regularisers3D.py index fd6c545..0f9cd1a 100644 --- a/demos/demo_cpu_regularisers3D.py +++ b/demos/demo_cpu_regularisers3D.py @@ -29,8 +29,9 @@ def printParametersToString(pars): txt += '\n' return txt ############################################################################### -#%% -filename = os.path.join( "data" ,"lena_gray_512.tif") + +# filename = os.path.join( "data" ,"lena_gray_512.tif") +filename = "/home/algol/Documents/DEV/CCPi-Regularisation-Toolkit/test/lena_gray_512.tif" # read image Im = plt.imread(filename) @@ -94,16 +95,18 @@ imgplot = plt.imshow(noisyVol[10,:,:],cmap="gray") # set parameters pars = {'algorithm': ROF_TV, \ 'input' : noisyVol,\ - 'regularisation_parameter':0.04,\ - 'number_of_iterations': 500,\ - 'time_marching_parameter': 0.0025 - } + 'regularisation_parameter':0.02,\ + 'number_of_iterations': 7000,\ + 'time_marching_parameter': 0.0007,\ + 'tolerance_constant':1e-06} + print ("#############ROF TV CPU####################") start_time = timeit.default_timer() -rof_cpu3D = ROF_TV(pars['input'], +(rof_cpu3D, info_vec_cpu) = ROF_TV(pars['input'], pars['regularisation_parameter'], pars['number_of_iterations'], - pars['time_marching_parameter'],'cpu') + pars['time_marching_parameter'], + pars['tolerance_constant'], 'cpu') Qtools = QualityTools(idealVol, rof_cpu3D) pars['rmse'] = Qtools.rmse() @@ -136,23 +139,20 @@ imgplot = plt.imshow(noisyVol[10,:,:],cmap="gray") # set parameters pars = {'algorithm' : FGP_TV, \ 'input' : noisyVol,\ - 'regularisation_parameter':0.04, \ - 'number_of_iterations' :300 ,\ - 'tolerance_constant':0.00001,\ + 'regularisation_parameter':0.02, \ + 'number_of_iterations' :1000 ,\ + 'tolerance_constant':1e-06,\ 'methodTV': 0 ,\ - 'nonneg': 0 ,\ - 'printingOut': 0 - } - -print ("#############FGP TV CPU####################") + 'nonneg': 0} + +print ("#############FGP TV GPU####################") start_time = timeit.default_timer() -fgp_cpu3D = FGP_TV(pars['input'], +(fgp_cpu3D, info_vec_cpu) = FGP_TV(pars['input'], pars['regularisation_parameter'], pars['number_of_iterations'], pars['tolerance_constant'], pars['methodTV'], - pars['nonneg'], - pars['printingOut'],'cpu') + pars['nonneg'], 'cpu') Qtools = QualityTools(idealVol, fgp_cpu3D) pars['rmse'] = Qtools.rmse() @@ -185,22 +185,18 @@ imgplot = plt.imshow(noisyVol[10,:,:],cmap="gray") # set parameters pars = {'algorithm' : SB_TV, \ 'input' : noisyVol,\ - 'regularisation_parameter':0.04, \ - 'number_of_iterations' :150 ,\ - 'tolerance_constant':0.00001,\ - 'methodTV': 0 ,\ - 'printingOut': 0 - } + 'regularisation_parameter':0.02, \ + 'number_of_iterations' :250 ,\ + 'tolerance_constant':1e-06,\ + 'methodTV': 0} print ("#############SB TV CPU####################") start_time = timeit.default_timer() -sb_cpu3D = SB_TV(pars['input'], +(sb_cpu3D, info_vec_cpu) = SB_TV(pars['input'], pars['regularisation_parameter'], pars['number_of_iterations'], pars['tolerance_constant'], - pars['methodTV'], - pars['printingOut'],'cpu') - + pars['methodTV'],'cpu') Qtools = QualityTools(idealVol, sb_cpu3D) pars['rmse'] = Qtools.rmse() @@ -234,19 +230,20 @@ imgplot = plt.imshow(noisyVol[10,:,:],cmap="gray") # set parameters pars = {'algorithm' : LLT_ROF, \ 'input' : noisyVol,\ - 'regularisation_parameterROF':0.04, \ - 'regularisation_parameterLLT':0.015, \ - 'number_of_iterations' :300 ,\ - 'time_marching_parameter' :0.0025 ,\ - } + 'regularisation_parameterROF':0.01, \ + 'regularisation_parameterLLT':0.008, \ + 'number_of_iterations' :500 ,\ + 'time_marching_parameter' :0.001 ,\ + 'tolerance_constant':1e-06} print ("#############LLT ROF CPU####################") start_time = timeit.default_timer() -lltrof_cpu3D = LLT_ROF(pars['input'], +(lltrof_cpu3D,info_vec_cpu) = LLT_ROF(pars['input'], pars['regularisation_parameterROF'], pars['regularisation_parameterLLT'], pars['number_of_iterations'], - pars['time_marching_parameter'],'cpu') + pars['time_marching_parameter'], + pars['tolerance_constant'], 'cpu') Qtools = QualityTools(idealVol, lltrof_cpu3D) diff --git a/demos/demo_gpu_regularisers.py b/demos/demo_gpu_regularisers.py index 212ad5a..6aec283 100644 --- a/demos/demo_gpu_regularisers.py +++ b/demos/demo_gpu_regularisers.py @@ -84,7 +84,7 @@ imgplot = plt.imshow(u0,cmap="gray") pars = {'algorithm': ROF_TV, \ 'input' : u0,\ 'regularisation_parameter':0.02,\ - 'number_of_iterations': 5000,\ + 'number_of_iterations': 6000,\ 'time_marching_parameter': 0.001,\ 'tolerance_constant':1e-06} @@ -128,7 +128,7 @@ imgplot = plt.imshow(u0,cmap="gray") pars = {'algorithm' : FGP_TV, \ 'input' : u0,\ 'regularisation_parameter':0.02, \ - 'number_of_iterations' :300 ,\ + 'number_of_iterations' :400 ,\ 'tolerance_constant':1e-06,\ 'methodTV': 0 ,\ 'nonneg': 0} @@ -171,21 +171,18 @@ imgplot = plt.imshow(u0,cmap="gray") # set parameters pars = {'algorithm' : SB_TV, \ 'input' : u0,\ - 'regularisation_parameter':0.04, \ - 'number_of_iterations' :150 ,\ + 'regularisation_parameter':0.02, \ + 'number_of_iterations' :250 ,\ 'tolerance_constant':1e-06,\ - 'methodTV': 0 ,\ - 'printingOut': 0 - } + 'methodTV': 0} print ("##############SB TV GPU##################") start_time = timeit.default_timer() -sb_gpu = SB_TV(pars['input'], +(sb_gpu, info_vec_gpu) = SB_TV(pars['input'], pars['regularisation_parameter'], pars['number_of_iterations'], pars['tolerance_constant'], - pars['methodTV'], - pars['printingOut'],'gpu') + pars['methodTV'], 'gpu') Qtools = QualityTools(Im, sb_gpu) pars['rmse'] = Qtools.rmse() @@ -205,36 +202,35 @@ plt.title('{}'.format('GPU results')) #%% print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") -print ("_____Total Generalised Variation (2D)______") +print ("______________LLT- ROF (2D)________________") print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") ## plot fig = plt.figure() -plt.suptitle('Performance of TGV regulariser using the GPU') +plt.suptitle('Performance of LLT-ROF regulariser using the GPU') a=fig.add_subplot(1,2,1) a.set_title('Noisy Image') imgplot = plt.imshow(u0,cmap="gray") # set parameters -pars = {'algorithm' : TGV, \ +pars = {'algorithm' : LLT_ROF, \ 'input' : u0,\ - 'regularisation_parameter':0.04, \ - 'alpha1':1.0,\ - 'alpha0':2.0,\ - 'number_of_iterations' :1250 ,\ - 'LipshitzConstant' :12 ,\ - } + 'regularisation_parameterROF':0.01, \ + 'regularisation_parameterLLT':0.0085, \ + 'number_of_iterations' : 6000 ,\ + 'time_marching_parameter' :0.001 ,\ + 'tolerance_constant':1e-06} -print ("#############TGV CPU####################") +print ("#############LLT- ROF GPU####################") start_time = timeit.default_timer() -tgv_gpu = TGV(pars['input'], - pars['regularisation_parameter'], - pars['alpha1'], - pars['alpha0'], +(lltrof_gpu, info_vec_gpu) = LLT_ROF(pars['input'], + pars['regularisation_parameterROF'], + pars['regularisation_parameterLLT'], pars['number_of_iterations'], - pars['LipshitzConstant'],'gpu') - -Qtools = QualityTools(Im, tgv_gpu) + pars['time_marching_parameter'], + pars['tolerance_constant'], 'gpu') + +Qtools = QualityTools(Im, lltrof_gpu) pars['rmse'] = Qtools.rmse() txtstr = printParametersToString(pars) txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time) @@ -246,40 +242,42 @@ props = dict(boxstyle='round', facecolor='wheat', alpha=0.75) # place a text box in upper left in axes coords a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14, verticalalignment='top', bbox=props) -imgplot = plt.imshow(tgv_gpu, cmap="gray") +imgplot = plt.imshow(lltrof_gpu, cmap="gray") plt.title('{}'.format('GPU results')) #%% print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") -print ("______________LLT- ROF (2D)________________") +print ("_____Total Generalised Variation (2D)______") print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") ## plot fig = plt.figure() -plt.suptitle('Performance of LLT-ROF regulariser using the GPU') +plt.suptitle('Performance of TGV regulariser using the GPU') a=fig.add_subplot(1,2,1) a.set_title('Noisy Image') imgplot = plt.imshow(u0,cmap="gray") # set parameters -pars = {'algorithm' : LLT_ROF, \ +pars = {'algorithm' : TGV, \ 'input' : u0,\ - 'regularisation_parameterROF':0.04, \ - 'regularisation_parameterLLT':0.01, \ - 'number_of_iterations' :500 ,\ - 'time_marching_parameter' :0.0025 ,\ + 'regularisation_parameter':0.04, \ + 'alpha1':1.0,\ + 'alpha0':2.0,\ + 'number_of_iterations' :1250 ,\ + 'LipshitzConstant' :12 ,\ } -print ("#############LLT- ROF GPU####################") +print ("#############TGV CPU####################") start_time = timeit.default_timer() -lltrof_gpu = LLT_ROF(pars['input'], - pars['regularisation_parameterROF'], - pars['regularisation_parameterLLT'], +tgv_gpu = TGV(pars['input'], + pars['regularisation_parameter'], + pars['alpha1'], + pars['alpha0'], pars['number_of_iterations'], - pars['time_marching_parameter'],'gpu') - -Qtools = QualityTools(Im, lltrof_gpu) + pars['LipshitzConstant'],'gpu') + +Qtools = QualityTools(Im, tgv_gpu) pars['rmse'] = Qtools.rmse() txtstr = printParametersToString(pars) txtstr += "%s = %.3fs" % ('elapsed time',timeit.default_timer() - start_time) @@ -291,7 +289,7 @@ props = dict(boxstyle='round', facecolor='wheat', alpha=0.75) # place a text box in upper left in axes coords a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14, verticalalignment='top', bbox=props) -imgplot = plt.imshow(lltrof_gpu, cmap="gray") +imgplot = plt.imshow(tgv_gpu, cmap="gray") plt.title('{}'.format('GPU results')) #%% diff --git a/demos/demo_gpu_regularisers3D.py b/demos/demo_gpu_regularisers3D.py index be16921..1a13c86 100644 --- a/demos/demo_gpu_regularisers3D.py +++ b/demos/demo_gpu_regularisers3D.py @@ -101,16 +101,18 @@ imgplot = plt.imshow(noisyVol[10,:,:],cmap="gray") # set parameters pars = {'algorithm': ROF_TV, \ 'input' : noisyVol,\ - 'regularisation_parameter':0.04,\ - 'number_of_iterations': 500,\ - 'time_marching_parameter': 0.0025 - } -print ("#############ROF TV GPU####################") + 'regularisation_parameter':0.02,\ + 'number_of_iterations': 7000,\ + 'time_marching_parameter': 0.0007,\ + 'tolerance_constant':1e-06} + +print ("#############ROF TV CPU####################") start_time = timeit.default_timer() -rof_gpu3D = ROF_TV(pars['input'], +(rof_gpu3D, info_vec_gpu) = ROF_TV(pars['input'], pars['regularisation_parameter'], pars['number_of_iterations'], - pars['time_marching_parameter'],'gpu') + pars['time_marching_parameter'], + pars['tolerance_constant'], 'gpu') Qtools = QualityTools(idealVol, rof_gpu3D) pars['rmse'] = Qtools.rmse() @@ -141,23 +143,20 @@ imgplot = plt.imshow(noisyVol[10,:,:],cmap="gray") # set parameters pars = {'algorithm' : FGP_TV, \ 'input' : noisyVol,\ - 'regularisation_parameter':0.04, \ - 'number_of_iterations' :300 ,\ - 'tolerance_constant':0.00001,\ + 'regularisation_parameter':0.02, \ + 'number_of_iterations' :1000 ,\ + 'tolerance_constant':1e-06,\ 'methodTV': 0 ,\ - 'nonneg': 0 ,\ - 'printingOut': 0 - } + 'nonneg': 0} print ("#############FGP TV GPU####################") start_time = timeit.default_timer() -fgp_gpu3D = FGP_TV(pars['input'], +(fgp_gpu3D, info_vec_gpu) = FGP_TV(pars['input'], pars['regularisation_parameter'], pars['number_of_iterations'], pars['tolerance_constant'], pars['methodTV'], - pars['nonneg'], - pars['printingOut'],'gpu') + pars['nonneg'], 'gpu') Qtools = QualityTools(idealVol, fgp_gpu3D) pars['rmse'] = Qtools.rmse() @@ -189,21 +188,18 @@ imgplot = plt.imshow(noisyVol[10,:,:],cmap="gray") # set parameters pars = {'algorithm' : SB_TV, \ 'input' : noisyVol,\ - 'regularisation_parameter':0.04, \ - 'number_of_iterations' :100 ,\ - 'tolerance_constant':1e-05,\ - 'methodTV': 0 ,\ - 'printingOut': 0 - } + 'regularisation_parameter':0.02, \ + 'number_of_iterations' :300 ,\ + 'tolerance_constant':1e-06,\ + 'methodTV': 0 } print ("#############SB TV GPU####################") start_time = timeit.default_timer() -sb_gpu3D = SB_TV(pars['input'], +(sb_gpu3D, info_vec_gpu) = SB_TV(pars['input'], pars['regularisation_parameter'], pars['number_of_iterations'], pars['tolerance_constant'], - pars['methodTV'], - pars['printingOut'],'gpu') + pars['methodTV'],'gpu') Qtools = QualityTools(idealVol, sb_gpu3D) pars['rmse'] = Qtools.rmse() @@ -235,19 +231,20 @@ imgplot = plt.imshow(noisyVol[10,:,:],cmap="gray") # set parameters pars = {'algorithm' : LLT_ROF, \ 'input' : noisyVol,\ - 'regularisation_parameterROF':0.04, \ - 'regularisation_parameterLLT':0.015, \ - 'number_of_iterations' :300 ,\ - 'time_marching_parameter' :0.0025 ,\ - } + 'regularisation_parameterROF':0.01, \ + 'regularisation_parameterLLT':0.008, \ + 'number_of_iterations' : 500 ,\ + 'time_marching_parameter' :0.001 ,\ + 'tolerance_constant':1e-06} print ("#############LLT ROF CPU####################") start_time = timeit.default_timer() -lltrof_gpu3D = LLT_ROF(pars['input'], +(lltrof_gpu3D,info_vec_gpu) = LLT_ROF(pars['input'], pars['regularisation_parameterROF'], pars['regularisation_parameterLLT'], pars['number_of_iterations'], - pars['time_marching_parameter'],'gpu') + pars['time_marching_parameter'], + pars['tolerance_constant'], 'gpu') Qtools = QualityTools(idealVol, lltrof_gpu3D) pars['rmse'] = Qtools.rmse() diff --git a/src/Core/regularisers_CPU/FGP_TV_core.c b/src/Core/regularisers_CPU/FGP_TV_core.c index a6bb7e0..3248867 100644 --- a/src/Core/regularisers_CPU/FGP_TV_core.c +++ b/src/Core/regularisers_CPU/FGP_TV_core.c @@ -85,7 +85,7 @@ float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lamb tk = tkp1; /* check early stopping criteria */ - if (epsil != 0.0f) { + if ((epsil != 0.0f) && (ll % 5 == 0)) { re = 0.0f; re1 = 0.0f; for(j=0; j<DimTotal; j++) { @@ -94,7 +94,7 @@ float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lamb } re = sqrtf(re)/sqrtf(re1); if (re < epsil) count++; - if (count > 4) break; + if (count > 3) break; copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l); } @@ -138,7 +138,7 @@ float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lamb Rupd_func3D(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, DimTotal); /* calculate norm - stopping rules*/ - if (epsil != 0.0f) { + if ((epsil != 0.0f) && (ll % 5 == 0)) { re = 0.0f; re1 = 0.0f; for(j=0; j<DimTotal; j++) { @@ -148,7 +148,7 @@ float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lamb re = sqrtf(re)/sqrtf(re1); /* stop if the norm residual is less than the tolerance EPS */ if (re < epsil) count++; - if (count > 4) break; + if (count > 3) break; copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); } diff --git a/src/Core/regularisers_CPU/LLT_ROF_core.c b/src/Core/regularisers_CPU/LLT_ROF_core.c index 8416a14..f9fea66 100644 --- a/src/Core/regularisers_CPU/LLT_ROF_core.c +++ b/src/Core/regularisers_CPU/LLT_ROF_core.c @@ -18,7 +18,7 @@ limitations under the License. */ #include "LLT_ROF_core.h" -#define EPS_LLT 0.01 +#define EPS_LLT 1.0e-12 #define EPS_ROF 1.0e-12 #define MAX(x, y) (((x) > (y)) ? (x) : (y)) #define MIN(x, y) (((x) < (y)) ? (x) : (y)) @@ -29,49 +29,55 @@ int signLLT(float x) { } /* C-OMP implementation of Lysaker, Lundervold and Tai (LLT) model [1] combined with Rudin-Osher-Fatemi [2] TV regularisation penalty. - * -* This penalty can deliver visually pleasant piecewise-smooth recovery if regularisation parameters are selected well. -* The rule of thumb for selection is to start with lambdaLLT = 0 (just the ROF-TV model) and then proceed to increase -* lambdaLLT starting with smaller values. + * +* This penalty can deliver visually pleasant piecewise-smooth recovery if regularisation parameters are selected well. +* The rule of thumb for selection is to start with lambdaLLT = 0 (just the ROF-TV model) and then proceed to increase +* lambdaLLT starting with smaller values. * * Input Parameters: * 1. U0 - original noise image/volume * 2. lambdaROF - ROF-related regularisation parameter * 3. lambdaLLT - LLT-related regularisation parameter -* 4. tau - time-marching step +* 4. tau - time-marching step * 5. iter - iterations number (for both models) +* 6. eplsilon: tolerance constant * * Output: -* Filtered/regularised image +* [1] Filtered/regularized image/volume +* [2] Information vector which contains [iteration no., reached tolerance] * -* References: +* References: * [1] Lysaker, M., Lundervold, A. and Tai, X.C., 2003. Noise removal using fourth-order partial differential equation with applications to medical magnetic resonance images in space and time. IEEE Transactions on image processing, 12(12), pp.1579-1590. * [2] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms" */ -float LLT_ROF_CPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int dimX, int dimY, int dimZ) +float LLT_ROF_CPU_main(float *Input, float *Output, float *infovector, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, float epsil, int dimX, int dimY, int dimZ) { long DimTotal; - int ll; - float *D1_LLT=NULL, *D2_LLT=NULL, *D3_LLT=NULL, *D1_ROF=NULL, *D2_ROF=NULL, *D3_ROF=NULL; - - DimTotal = (long)(dimX*dimY*dimZ); - + int ll, j; + float re, re1; + re = 0.0f; re1 = 0.0f; + int count = 0; + + float *D1_LLT=NULL, *D2_LLT=NULL, *D3_LLT=NULL, *D1_ROF=NULL, *D2_ROF=NULL, *D3_ROF=NULL, *Output_prev=NULL; + DimTotal = (long)(dimX*dimY*dimZ); + D1_ROF = calloc(DimTotal, sizeof(float)); D2_ROF = calloc(DimTotal, sizeof(float)); D3_ROF = calloc(DimTotal, sizeof(float)); - + D1_LLT = calloc(DimTotal, sizeof(float)); D2_LLT = calloc(DimTotal, sizeof(float)); D3_LLT = calloc(DimTotal, sizeof(float)); - + copyIm(Input, Output, (long)(dimX), (long)(dimY), (long)(dimZ)); /* initialize */ - - for(ll = 0; ll < iterationsNumb; ll++) { + if (epsil != 0.0f) Output_prev = calloc(DimTotal, sizeof(float)); + + for(ll = 0; ll < iterationsNumb; ll++) { if (dimZ == 1) { - /* 2D case */ - /****************ROF******************/ - /* calculate first-order differences */ + /* 2D case */ + /****************ROF******************/ + /* calculate first-order differences */ D1_func_ROF(Output, D1_ROF, (long)(dimX), (long)(dimY), 1l); D2_func_ROF(Output, D2_ROF, (long)(dimX), (long)(dimY), 1l); /****************LLT******************/ @@ -81,21 +87,41 @@ float LLT_ROF_CPU_main(float *Input, float *Output, float lambdaROF, float lambd Update2D_LLT_ROF(Input, Output, D1_LLT, D2_LLT, D1_ROF, D2_ROF, lambdaROF, lambdaLLT, tau, (long)(dimX), (long)(dimY), 1l); } else { - /* 3D case */ - /* calculate first-order differences */ + /* 3D case */ + /* calculate first-order differences */ D1_func_ROF(Output, D1_ROF, (long)(dimX), (long)(dimY), (long)(dimZ)); D2_func_ROF(Output, D2_ROF, (long)(dimX), (long)(dimY), (long)(dimZ)); - D3_func_ROF(Output, D3_ROF, (long)(dimX), (long)(dimY), (long)(dimZ)); + D3_func_ROF(Output, D3_ROF, (long)(dimX), (long)(dimY), (long)(dimZ)); /****************LLT******************/ /* estimate second-order derrivatives */ der3D_LLT(Output, D1_LLT, D2_LLT, D3_LLT,(long)(dimX), (long)(dimY), (long)(dimZ)); /* Joint update for ROF and LLT models */ Update3D_LLT_ROF(Input, Output, D1_LLT, D2_LLT, D3_LLT, D1_ROF, D2_ROF, D3_ROF, lambdaROF, lambdaLLT, tau, (long)(dimX), (long)(dimY), (long)(dimZ)); - } - } /*end of iterations*/ + } + + /* check early stopping criteria */ + if ((epsil != 0.0f) && (ll % 5 == 0)) { + re = 0.0f; re1 = 0.0f; + for(j=0; j<DimTotal; j++) + { + re += powf(Output[j] - Output_prev[j],2); + re1 += powf(Output[j],2); + } + re = sqrtf(re)/sqrtf(re1); + if (re < epsil) count++; + if (count > 3) break; + copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); + } + + } /*end of iterations*/ free(D1_LLT);free(D2_LLT);free(D3_LLT); free(D1_ROF);free(D2_ROF);free(D3_ROF); - return *Output; + if (epsil != 0.0f) free(Output_prev); + + /*adding info into info_vector */ + infovector[0] = (float)(ll); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + return 0; } /*************************************************************************/ @@ -143,17 +169,17 @@ float der3D_LLT(float *U, float *D1, float *D2, float *D3, long dimX, long dimY, j_m = j - 1; if (j_m < 0) j_m = j + 1; k_p = k + 1; if (k_p == dimZ) k_p = k - 1; k_m = k - 1; if (k_m < 0) k_m = k + 1; - + index = (dimX*dimY)*k + j*dimX+i; - + dxx = U[(dimX*dimY)*k + j*dimX+i_p] - 2.0f*U[index] + U[(dimX*dimY)*k + j*dimX+i_m]; dyy = U[(dimX*dimY)*k + j_p*dimX+i] - 2.0f*U[index] + U[(dimX*dimY)*k + j_m*dimX+i]; dzz = U[(dimX*dimY)*k_p + j*dimX+i] - 2.0f*U[index] + U[(dimX*dimY)*k_m + j*dimX+i]; - + denom_xx = fabs(dxx) + EPS_LLT; denom_yy = fabs(dyy) + EPS_LLT; denom_zz = fabs(dzz) + EPS_LLT; - + D1[index] = dxx / denom_xx; D2[index] = dyy / denom_yy; D3[index] = dzz / denom_zz; @@ -172,7 +198,7 @@ float D1_func_ROF(float *A, float *D1, long dimX, long dimY, long dimZ) { float NOMx_1, NOMy_1, NOMy_0, NOMz_1, NOMz_0, denom1, denom2,denom3, T1; long i,j,k,i1,i2,k1,j1,j2,k2,index; - + if (dimZ > 1) { #pragma omp parallel for shared (A, D1, dimX, dimY, dimZ) private(index, i, j, k, i1, j1, k1, i2, j2, k2, NOMx_1,NOMy_1,NOMy_0,NOMz_1,NOMz_0,denom1,denom2,denom3,T1) for(j=0; j<dimY; j++) { @@ -186,17 +212,17 @@ float D1_func_ROF(float *A, float *D1, long dimX, long dimY, long dimZ) j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - + /* Forward-backward differences */ NOMx_1 = A[(dimX*dimY)*k + j1*dimX + i] - A[index]; /* x+ */ NOMy_1 = A[(dimX*dimY)*k + j*dimX + i1] - A[index]; /* y+ */ /*NOMx_0 = (A[(i)*dimY + j] - A[(i2)*dimY + j]); */ /* x- */ NOMy_0 = A[index] - A[(dimX*dimY)*k + j*dimX + i2]; /* y- */ - + NOMz_1 = A[(dimX*dimY)*k1 + j*dimX + i] - A[index]; /* z+ */ NOMz_0 = A[index] - A[(dimX*dimY)*k2 + j*dimX + i]; /* z- */ - - + + denom1 = NOMx_1*NOMx_1; denom2 = 0.5f*(signLLT(NOMy_1) + signLLT(NOMy_0))*(MIN(fabs(NOMy_1),fabs(NOMy_0))); denom2 = denom2*denom2; @@ -216,13 +242,13 @@ float D1_func_ROF(float *A, float *D1, long dimX, long dimY, long dimZ) i2 = i - 1; if (i2 < 0) i2 = i+1; j1 = j + 1; if (j1 >= dimY) j1 = j-1; j2 = j - 1; if (j2 < 0) j2 = j+1; - + /* Forward-backward differences */ NOMx_1 = A[j1*dimX + i] - A[index]; /* x+ */ NOMy_1 = A[j*dimX + i1] - A[index]; /* y+ */ /*NOMx_0 = (A[(i)*dimY + j] - A[(i2)*dimY + j]); */ /* x- */ NOMy_0 = A[index] - A[(j)*dimX + i2]; /* y- */ - + denom1 = NOMx_1*NOMx_1; denom2 = 0.5f*(signLLT(NOMy_1) + signLLT(NOMy_0))*(MIN(fabs(NOMy_1),fabs(NOMy_0))); denom2 = denom2*denom2; @@ -237,7 +263,7 @@ float D2_func_ROF(float *A, float *D2, long dimX, long dimY, long dimZ) { float NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2; long i,j,k,i1,i2,k1,j1,j2,k2,index; - + if (dimZ > 1) { #pragma omp parallel for shared (A, D2, dimX, dimY, dimZ) private(index, i, j, k, i1, j1, k1, i2, j2, k2, NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2) for(j=0; j<dimY; j++) { @@ -251,16 +277,16 @@ float D2_func_ROF(float *A, float *D2, long dimX, long dimY, long dimZ) j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - - + + /* Forward-backward differences */ NOMx_1 = A[(dimX*dimY)*k + (j1)*dimX + i] - A[index]; /* x+ */ NOMy_1 = A[(dimX*dimY)*k + (j)*dimX + i1] - A[index]; /* y+ */ NOMx_0 = A[index] - A[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ NOMz_1 = A[(dimX*dimY)*k1 + j*dimX + i] - A[index]; /* z+ */ NOMz_0 = A[index] - A[(dimX*dimY)*k2 + (j)*dimX + i]; /* z- */ - - + + denom1 = NOMy_1*NOMy_1; denom2 = 0.5f*(signLLT(NOMx_1) + signLLT(NOMx_0))*(MIN(fabs(NOMx_1),fabs(NOMx_0))); denom2 = denom2*denom2; @@ -280,13 +306,13 @@ float D2_func_ROF(float *A, float *D2, long dimX, long dimY, long dimZ) i2 = i - 1; if (i2 < 0) i2 = i+1; j1 = j + 1; if (j1 >= dimY) j1 = j-1; j2 = j - 1; if (j2 < 0) j2 = j+1; - + /* Forward-backward differences */ NOMx_1 = A[j1*dimX + i] - A[index]; /* x+ */ NOMy_1 = A[j*dimX + i1] - A[index]; /* y+ */ NOMx_0 = A[index] - A[j2*dimX + i]; /* x- */ /*NOMy_0 = A[(i)*dimY + j] - A[(i)*dimY + j2]; */ /* y- */ - + denom1 = NOMy_1*NOMy_1; denom2 = 0.5f*(signLLT(NOMx_1) + signLLT(NOMx_0))*(MIN(fabs(NOMx_1),fabs(NOMx_0))); denom2 = denom2*denom2; @@ -302,7 +328,7 @@ float D3_func_ROF(float *A, float *D3, long dimX, long dimY, long dimZ) { float NOMx_1, NOMy_1, NOMx_0, NOMy_0, NOMz_1, denom1, denom2, denom3, T3; long index,i,j,k,i1,i2,k1,j1,j2,k2; - + #pragma omp parallel for shared (A, D3, dimX, dimY, dimZ) private(index, i, j, k, i1, j1, k1, i2, j2, k2, NOMx_1, NOMy_1, NOMy_0, NOMx_0, NOMz_1, denom1, denom2, denom3, T3) for(j=0; j<dimY; j++) { for(i=0; i<dimX; i++) { @@ -315,7 +341,7 @@ float D3_func_ROF(float *A, float *D3, long dimX, long dimY, long dimZ) j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - + /* Forward-backward differences */ NOMx_1 = A[(dimX*dimY)*k + (j1)*dimX + i] - A[index]; /* x+ */ NOMy_1 = A[(dimX*dimY)*k + (j)*dimX + i1] - A[index]; /* y+ */ @@ -323,7 +349,7 @@ float D3_func_ROF(float *A, float *D3, long dimX, long dimY, long dimZ) NOMx_0 = A[index] - A[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ NOMz_1 = A[(dimX*dimY)*k1 + j*dimX + i] - A[index]; /* z+ */ /*NOMz_0 = A[(dimX*dimY)*k + (i)*dimY + j] - A[(dimX*dimY)*k2 + (i)*dimY + j]; */ /* z- */ - + denom1 = NOMz_1*NOMz_1; denom2 = 0.5f*(signLLT(NOMx_1) + signLLT(NOMx_0))*(MIN(fabs(NOMx_1),fabs(NOMx_0))); denom2 = denom2*denom2; @@ -352,19 +378,19 @@ float Update2D_LLT_ROF(float *U0, float *U, float *D1_LLT, float *D2_LLT, float i_m = i - 1; if (i_m < 0) i_m = i + 1; j_p = j + 1; if (j_p == dimY) j_p = j - 1; j_m = j - 1; if (j_m < 0) j_m = j + 1; - + /*LLT-related part*/ dxx = D1_LLT[j*dimX+i_p] - 2.0f*D1_LLT[index] + D1_LLT[j*dimX+i_m]; dyy = D2_LLT[j_p*dimX+i] - 2.0f*D2_LLT[index] + D2_LLT[j_m*dimX+i]; laplc = dxx + dyy; /*build Laplacian*/ - + /*ROF-related part*/ dv1 = D1_ROF[index] - D1_ROF[j_m*dimX + i]; dv2 = D2_ROF[index] - D2_ROF[j*dimX + i_m]; div = dv1 + dv2; /*build Divirgent*/ - + /*combine all into one cost function to minimise */ - U[index] += tau*(2.0f*lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); + U[index] += tau*(lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); } } return *U; @@ -385,26 +411,25 @@ float Update3D_LLT_ROF(float *U0, float *U, float *D1_LLT, float *D2_LLT, float j_m = j - 1; if (j_m < 0) j_m = j + 1; k_p = k + 1; if (k_p == dimZ) k_p = k - 1; k_m = k - 1; if (k_m < 0) k_m = k + 1; - + index = (dimX*dimY)*k + j*dimX+i; - + /*LLT-related part*/ dxx = D1_LLT[(dimX*dimY)*k + j*dimX+i_p] - 2.0f*D1_LLT[index] + D1_LLT[(dimX*dimY)*k + j*dimX+i_m]; dyy = D2_LLT[(dimX*dimY)*k + j_p*dimX+i] - 2.0f*D2_LLT[index] + D2_LLT[(dimX*dimY)*k + j_m*dimX+i]; dzz = D3_LLT[(dimX*dimY)*k_p + j*dimX+i] - 2.0f*D3_LLT[index] + D3_LLT[(dimX*dimY)*k_m + j*dimX+i]; laplc = dxx + dyy + dzz; /*build Laplacian*/ - + /*ROF-related part*/ dv1 = D1_ROF[index] - D1_ROF[(dimX*dimY)*k + j_m*dimX+i]; dv2 = D2_ROF[index] - D2_ROF[(dimX*dimY)*k + j*dimX+i_m]; dv3 = D3_ROF[index] - D3_ROF[(dimX*dimY)*k_m + j*dimX+i]; div = dv1 + dv2 + dv3; /*build Divirgent*/ - + /*combine all into one cost function to minimise */ - U[index] += tau*(2.0f*lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); + U[index] += tau*(lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); } } } return *U; } - diff --git a/src/Core/regularisers_CPU/LLT_ROF_core.h b/src/Core/regularisers_CPU/LLT_ROF_core.h index 8e6591e..abf0d60 100644 --- a/src/Core/regularisers_CPU/LLT_ROF_core.h +++ b/src/Core/regularisers_CPU/LLT_ROF_core.h @@ -26,22 +26,22 @@ limitations under the License. #include "CCPiDefines.h" /* C-OMP implementation of Lysaker, Lundervold and Tai (LLT) model [1] combined with Rudin-Osher-Fatemi [2] TV regularisation penalty. - * -* This penalty can deliver visually pleasant piecewise-smooth recovery if regularisation parameters are selected well. -* The rule of thumb for selection is to start with lambdaLLT = 0 (just the ROF-TV model) and then proceed to increase -* lambdaLLT starting with smaller values. + * +* This penalty can deliver visually pleasant piecewise-smooth recovery if regularisation parameters are selected well. +* The rule of thumb for selection is to start with lambdaLLT = 0 (just the ROF-TV model) and then proceed to increase +* lambdaLLT starting with smaller values. * * Input Parameters: * 1. U0 - original noise image/volume * 2. lambdaROF - ROF-related regularisation parameter * 3. lambdaLLT - LLT-related regularisation parameter -* 4. tau - time-marching step +* 4. tau - time-marching step * 5. iter - iterations number (for both models) * * Output: * Filtered/regularised image * -* References: +* References: * [1] Lysaker, M., Lundervold, A. and Tai, X.C., 2003. Noise removal using fourth-order partial differential equation with applications to medical magnetic resonance images in space and time. IEEE Transactions on image processing, 12(12), pp.1579-1590. * [2] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms" */ @@ -49,7 +49,7 @@ limitations under the License. #ifdef __cplusplus extern "C" { #endif -CCPI_EXPORT float LLT_ROF_CPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int dimX, int dimY, int dimZ); +CCPI_EXPORT float LLT_ROF_CPU_main(float *Input, float *Output, float *infovector, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, float epsil, int dimX, int dimY, int dimZ); CCPI_EXPORT float der2D_LLT(float *U, float *D1, float *D2, long dimX, long dimY, long dimZ); CCPI_EXPORT float der3D_LLT(float *U, float *D1, float *D2, float *D3, long dimX, long dimY, long dimZ); diff --git a/src/Core/regularisers_CPU/ROF_TV_core.c b/src/Core/regularisers_CPU/ROF_TV_core.c index a7291d7..8ea2552 100644 --- a/src/Core/regularisers_CPU/ROF_TV_core.c +++ b/src/Core/regularisers_CPU/ROF_TV_core.c @@ -75,7 +75,7 @@ float TV_ROF_CPU_main(float *Input, float *Output, float *infovector, float lamb TV_kernel(D1, D2, D3, Output, Input, lambdaPar, tau, (long)(dimX), (long)(dimY), (long)(dimZ)); /* check early stopping criteria */ - if (epsil != 0.0f) { + if ((epsil != 0.0f) && (i % 5 == 0)) { re = 0.0f; re1 = 0.0f; for(j=0; j<DimTotal; j++) { @@ -84,7 +84,7 @@ float TV_ROF_CPU_main(float *Input, float *Output, float *infovector, float lamb } re = sqrtf(re)/sqrtf(re1); if (re < epsil) count++; - if (count > 4) break; + if (count > 3) break; copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); } } diff --git a/src/Core/regularisers_CPU/SB_TV_core.c b/src/Core/regularisers_CPU/SB_TV_core.c index 769ea67..07ed9b0 100755 --- a/src/Core/regularisers_CPU/SB_TV_core.c +++ b/src/Core/regularisers_CPU/SB_TV_core.c @@ -27,122 +27,120 @@ limitations under the License. * 3. Number of iterations [OPTIONAL parameter] * 4. eplsilon - tolerance constant [OPTIONAL parameter] * 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter] -* 6. print information: 0 (off) or 1 (on) [OPTIONAL parameter] * * Output: -* 1. Filtered/regularized image +* [1] Filtered/regularized image/volume +* [2] Information vector which contains [iteration no., reached tolerance] * * [1]. Goldstein, T. and Osher, S., 2009. The split Bregman method for L1-regularized problems. SIAM journal on imaging sciences, 2(2), pp.323-343. */ - -float SB_TV_CPU_main(float *Input, float *Output, float mu, int iter, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ) + +float SB_TV_CPU_main(float *Input, float *Output, float *infovector, float mu, int iter, float epsil, int methodTV, int dimX, int dimY, int dimZ) { int ll; - long j, DimTotal; - float re, re1, lambda; + long j, DimTotal; + float re, re1, lambda; + re = 0.0f; re1 = 0.0f; int count = 0; mu = 1.0f/mu; lambda = 2.0f*mu; - if (dimZ <= 1) { - /* 2D case */ - float *Output_prev=NULL, *Dx=NULL, *Dy=NULL, *Bx=NULL, *By=NULL; - DimTotal = (long)(dimX*dimY); - - Output_prev = calloc(DimTotal, sizeof(float)); + float *Output_prev=NULL, *Dx=NULL, *Dy=NULL, *Bx=NULL, *By=NULL; + DimTotal = (long)(dimX*dimY*dimZ); + Output_prev = calloc(DimTotal, sizeof(float)); Dx = calloc(DimTotal, sizeof(float)); Dy = calloc(DimTotal, sizeof(float)); Bx = calloc(DimTotal, sizeof(float)); By = calloc(DimTotal, sizeof(float)); - + + if (dimZ == 1) { + /* 2D case */ copyIm(Input, Output, (long)(dimX), (long)(dimY), 1l); /*initialize */ - + /* begin outer SB iterations */ for(ll=0; ll<iter; ll++) { - + /* storing old estimate */ copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l); - + /* perform two GS iterations (normally 2 is enough for the convergence) */ gauss_seidel2D(Output, Input, Output_prev, Dx, Dy, Bx, By, (long)(dimX), (long)(dimY), lambda, mu); copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l); /*GS iteration */ gauss_seidel2D(Output, Input, Output_prev, Dx, Dy, Bx, By, (long)(dimX), (long)(dimY), lambda, mu); - + /* TV-related step */ if (methodTV == 1) updDxDy_shrinkAniso2D(Output, Dx, Dy, Bx, By, (long)(dimX), (long)(dimY), lambda); else updDxDy_shrinkIso2D(Output, Dx, Dy, Bx, By, (long)(dimX), (long)(dimY), lambda); - + /* update for Bregman variables */ updBxBy2D(Output, Dx, Dy, Bx, By, (long)(dimX), (long)(dimY)); - + /* check early stopping criteria if epsilon not equal zero */ - if (epsil != 0) { + if ((epsil != 0.0f) && (ll % 5 == 0)) { re = 0.0f; re1 = 0.0f; - for(j=0; j<DimTotal; j++) { - re += pow(Output[j] - Output_prev[j],2); - re1 += pow(Output[j],2); - } - re = sqrt(re)/sqrt(re1); + for(j=0; j<DimTotal; j++) + { + re += powf(Output[j] - Output_prev[j],2); + re1 += powf(Output[j],2); + } + re = sqrtf(re)/sqrtf(re1); + /* stop if the norm residual is less than the tolerance EPS */ if (re < epsil) count++; - if (count > 4) break; - } - /*printf("%f %i %i \n", re, ll, count); */ + if (count > 3) break; + } } - if (printM == 1) printf("SB-TV iterations stopped at iteration %i \n", ll); - free(Output_prev); free(Dx); free(Dy); free(Bx); free(By); } else { /* 3D case */ - float *Output_prev=NULL, *Dx=NULL, *Dy=NULL, *Dz=NULL, *Bx=NULL, *By=NULL, *Bz=NULL; - DimTotal = (long)(dimX*dimY*dimZ); - - Output_prev = calloc(DimTotal, sizeof(float)); - Dx = calloc(DimTotal, sizeof(float)); - Dy = calloc(DimTotal, sizeof(float)); + float *Dz=NULL, *Bz=NULL; + Dz = calloc(DimTotal, sizeof(float)); - Bx = calloc(DimTotal, sizeof(float)); - By = calloc(DimTotal, sizeof(float)); Bz = calloc(DimTotal, sizeof(float)); - + copyIm(Input, Output, (long)(dimX), (long)(dimY), (long)(dimZ)); /*initialize */ - + /* begin outer SB iterations */ for(ll=0; ll<iter; ll++) { - + /* storing old estimate */ copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); - + /* perform two GS iterations (normally 2 is enough for the convergence) */ gauss_seidel3D(Output, Input, Output_prev, Dx, Dy, Dz, Bx, By, Bz, (long)(dimX), (long)(dimY), (long)(dimZ), lambda, mu); copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ)); /*GS iteration */ gauss_seidel3D(Output, Input, Output_prev, Dx, Dy, Dz, Bx, By, Bz, (long)(dimX), (long)(dimY), (long)(dimZ), lambda, mu); - + /* TV-related step */ if (methodTV == 1) updDxDyDz_shrinkAniso3D(Output, Dx, Dy, Dz, Bx, By, Bz, (long)(dimX), (long)(dimY), (long)(dimZ), lambda); else updDxDyDz_shrinkIso3D(Output, Dx, Dy, Dz, Bx, By, Bz, (long)(dimX), (long)(dimY), (long)(dimZ), lambda); - + /* update for Bregman variables */ updBxByBz3D(Output, Dx, Dy, Dz, Bx, By, Bz, (long)(dimX), (long)(dimY), (long)(dimZ)); - + /* check early stopping criteria if epsilon not equal zero */ - if (epsil != 0) { + if ((epsil != 0.0f) && (ll % 5 == 0)) { re = 0.0f; re1 = 0.0f; - for(j=0; j<DimTotal; j++) { - re += pow(Output[j] - Output_prev[j],2); - re1 += pow(Output[j],2); - } - re = sqrt(re)/sqrt(re1); + for(j=0; j<DimTotal; j++) + { + re += powf(Output[j] - Output_prev[j],2); + re1 += powf(Output[j],2); + } + re = sqrtf(re)/sqrtf(re1); + /* stop if the norm residual is less than the tolerance EPS */ if (re < epsil) count++; - if (count > 4) break; - } - /*printf("%f %i %i \n", re, ll, count); */ + if (count > 3) break; + } } - if (printM == 1) printf("SB-TV iterations stopped at iteration %i \n", ll); - free(Output_prev); free(Dx); free(Dy); free(Dz); free(Bx); free(By); free(Bz); + free(Dz); free(Bz); } - return *Output; + + free(Output_prev); free(Dx); free(Dy); free(Bx); free(By); + /*adding info into info_vector */ + infovector[0] = (float)(ll); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + return 0; } /********************************************************************/ @@ -153,7 +151,7 @@ float gauss_seidel2D(float *U, float *A, float *U_prev, float *Dx, float *Dy, fl float sum, normConst; long i,j,i1,i2,j1,j2,index; normConst = 1.0f/(mu + 4.0f*lambda); - + #pragma omp parallel for shared(U) private(index,i,j,i1,i2,j1,j2,sum) for(i=0; i<dimX; i++) { /* symmetric boundary conditions (Neuman) */ @@ -164,7 +162,7 @@ float gauss_seidel2D(float *U, float *A, float *U_prev, float *Dx, float *Dy, fl j1 = j+1; if (j1 == dimY) j1 = j-1; j2 = j-1; if (j2 < 0) j2 = j+1; index = j*dimX+i; - + sum = Dx[j*dimX+i2] - Dx[index] + Dy[j2*dimX+i] - Dy[index] - Bx[j*dimX+i2] + Bx[index] - By[j2*dimX+i] + By[index]; sum += U_prev[j*dimX+i1] + U_prev[j*dimX+i2] + U_prev[j1*dimX+i] + U_prev[j2*dimX+i]; sum *= lambda; @@ -186,16 +184,16 @@ float updDxDy_shrinkAniso2D(float *U, float *Dx, float *Dy, float *Bx, float *By i1 = i+1; if (i1 == dimX) i1 = i-1; j1 = j+1; if (j1 == dimY) j1 = j-1; index = j*dimX+i; - + val1 = (U[j*dimX+i1] - U[index]) + Bx[index]; val2 = (U[j1*dimX+i] - U[index]) + By[index]; - + val11 = fabs(val1) - denom_lam; if (val11 < 0) val11 = 0; val22 = fabs(val2) - denom_lam; if (val22 < 0) val22 = 0; - + if (val1 !=0) Dx[index] = (val1/fabs(val1))*val11; else Dx[index] = 0; if (val2 !=0) Dy[index] = (val2/fabs(val2))*val22; else Dy[index] = 0; - + }} return 1; } @@ -204,7 +202,7 @@ float updDxDy_shrinkIso2D(float *U, float *Dx, float *Dy, float *Bx, float *By, long i,j,i1,j1,index; float val1, val11, val2, denom, denom_lam; denom_lam = 1.0f/lambda; - + #pragma omp parallel for shared(U,denom_lam) private(index,i,j,i1,j1,val1,val11,val2,denom) for(i=0; i<dimX; i++) { for(j=0; j<dimY; j++) { @@ -212,14 +210,14 @@ float updDxDy_shrinkIso2D(float *U, float *Dx, float *Dy, float *Bx, float *By, i1 = i+1; if (i1 == dimX) i1 = i-1; j1 = j+1; if (j1 == dimY) j1 = j-1; index = j*dimX+i; - + val1 = (U[j*dimX+i1] - U[index]) + Bx[index]; val2 = (U[j1*dimX+i] - U[index]) + By[index]; - + denom = sqrt(val1*val1 + val2*val2); - + val11 = (denom - denom_lam); if (val11 < 0) val11 = 0.0f; - + if (denom != 0.0f) { Dx[index] = val11*(val1/denom); Dy[index] = val11*(val2/denom); @@ -241,7 +239,7 @@ float updBxBy2D(float *U, float *Dx, float *Dy, float *Bx, float *By, long dimX, i1 = i+1; if (i1 == dimX) i1 = i-1; j1 = j+1; if (j1 == dimY) j1 = j-1; index = j*dimX+i; - + Bx[index] += (U[j*dimX+i1] - U[index]) - Dx[index]; By[index] += (U[j1*dimX+i] - U[index]) - Dy[index]; }} @@ -269,7 +267,7 @@ float gauss_seidel3D(float *U, float *A, float *U_prev, float *Dx, float *Dy, fl k1 = k+1; if (k1 == dimZ) k1 = k-1; k2 = k-1; if (k2 < 0) k2 = k+1; index = (dimX*dimY)*k + j*dimX+i; - + d_val = Dx[(dimX*dimY)*k + j*dimX+i2] - Dx[index] + Dy[(dimX*dimY)*k + j2*dimX+i] - Dy[index] + Dz[(dimX*dimY)*k2 + j*dimX+i] - Dz[index]; b_val = -Bx[(dimX*dimY)*k + j*dimX+i2] + Bx[index] - By[(dimX*dimY)*k + j2*dimX+i] + By[index] - Bz[(dimX*dimY)*k2 + j*dimX+i] + Bz[index]; sum = d_val + b_val; @@ -295,19 +293,19 @@ float updDxDyDz_shrinkAniso3D(float *U, float *Dx, float *Dy, float *Dz, float * i1 = i+1; if (i1 == dimX) i1 = i-1; j1 = j+1; if (j1 == dimY) j1 = j-1; k1 = k+1; if (k1 == dimZ) k1 = k-1; - + val1 = (U[(dimX*dimY)*k + j*dimX+i1] - U[index]) + Bx[index]; val2 = (U[(dimX*dimY)*k + j1*dimX+i] - U[index]) + By[index]; val3 = (U[(dimX*dimY)*k1 + j*dimX+i] - U[index]) + Bz[index]; - + val11 = fabs(val1) - denom_lam; if (val11 < 0.0f) val11 = 0.0f; val22 = fabs(val2) - denom_lam; if (val22 < 0.0f) val22 = 0.0f; val33 = fabs(val3) - denom_lam; if (val33 < 0.0f) val33 = 0.0f; - + if (val1 !=0.0f) Dx[index] = (val1/fabs(val1))*val11; else Dx[index] = 0.0f; if (val2 !=0.0f) Dy[index] = (val2/fabs(val2))*val22; else Dy[index] = 0.0f; if (val3 !=0.0f) Dz[index] = (val3/fabs(val3))*val33; else Dz[index] = 0.0f; - + }}} return 1; } @@ -325,15 +323,15 @@ float updDxDyDz_shrinkIso3D(float *U, float *Dx, float *Dy, float *Dz, float *Bx i1 = i+1; if (i1 == dimX) i1 = i-1; j1 = j+1; if (j1 == dimY) j1 = j-1; k1 = k+1; if (k1 == dimZ) k1 = k-1; - + val1 = (U[(dimX*dimY)*k + j*dimX+i1] - U[index]) + Bx[index]; val2 = (U[(dimX*dimY)*k + j1*dimX+i] - U[index]) + By[index]; val3 = (U[(dimX*dimY)*k1 + j*dimX+i] - U[index]) + Bz[index]; - + denom = sqrt(val1*val1 + val2*val2 + val3*val3); - + val11 = (denom - denom_lam); if (val11 < 0) val11 = 0.0f; - + if (denom != 0.0f) { Dx[index] = val11*(val1/denom); Dy[index] = val11*(val2/denom); @@ -359,7 +357,7 @@ float updBxByBz3D(float *U, float *Dx, float *Dy, float *Dz, float *Bx, float *B i1 = i+1; if (i1 == dimX) i1 = i-1; j1 = j+1; if (j1 == dimY) j1 = j-1; k1 = k+1; if (k1 == dimZ) k1 = k-1; - + Bx[index] += (U[(dimX*dimY)*k + j*dimX+i1] - U[index]) - Dx[index]; By[index] += (U[(dimX*dimY)*k + j1*dimX+i] - U[index]) - Dy[index]; Bz[index] += (U[(dimX*dimY)*k1 + j*dimX+i] - U[index]) - Dz[index]; diff --git a/src/Core/regularisers_CPU/SB_TV_core.h b/src/Core/regularisers_CPU/SB_TV_core.h index 7485e3b..b94da6a 100644 --- a/src/Core/regularisers_CPU/SB_TV_core.h +++ b/src/Core/regularisers_CPU/SB_TV_core.h @@ -34,10 +34,10 @@ limitations under the License. * 3. Number of iterations [OPTIONAL parameter] * 4. eplsilon - tolerance constant [OPTIONAL parameter] * 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter] -* 6. print information: 0 (off) or 1 (on) [OPTIONAL parameter] -* + * Output: -* 1. Filtered/regularized image +* [1] Filtered/regularized image/volume +* [2] Information vector which contains [iteration no., reached tolerance] * * [1]. Goldstein, T. and Osher, S., 2009. The split Bregman method for L1-regularized problems. SIAM journal on imaging sciences, 2(2), pp.323-343. */ @@ -45,7 +45,7 @@ limitations under the License. #ifdef __cplusplus extern "C" { #endif -CCPI_EXPORT float SB_TV_CPU_main(float *Input, float *Output, float mu, int iter, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ); +CCPI_EXPORT float SB_TV_CPU_main(float *Input, float *Output, float *infovector, float mu, int iter, float epsil, int methodTV, int dimX, int dimY, int dimZ); CCPI_EXPORT float gauss_seidel2D(float *U, float *A, float *U_prev, float *Dx, float *Dy, float *Bx, float *By, long dimX, long dimY, float lambda, float mu); CCPI_EXPORT float updDxDy_shrinkAniso2D(float *U, float *Dx, float *Dy, float *Bx, float *By, long dimX, long dimY, float lambda); diff --git a/src/Core/regularisers_GPU/LLT_ROF_GPU_core.cu b/src/Core/regularisers_GPU/LLT_ROF_GPU_core.cu index 87871be..64f04d5 100644 --- a/src/Core/regularisers_GPU/LLT_ROF_GPU_core.cu +++ b/src/Core/regularisers_GPU/LLT_ROF_GPU_core.cu @@ -15,28 +15,33 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ +*/ #include "LLT_ROF_GPU_core.h" #include "shared.h" +#include <thrust/functional.h> +#include <thrust/device_vector.h> +#include <thrust/transform_reduce.h> /* CUDA implementation of Lysaker, Lundervold and Tai (LLT) model [1] combined with Rudin-Osher-Fatemi [2] TV regularisation penalty. - * -* This penalty can deliver visually pleasant piecewise-smooth recovery if regularisation parameters are selected well. -* The rule of thumb for selection is to start with lambdaLLT = 0 (just the ROF-TV model) and then proceed to increase -* lambdaLLT starting with smaller values. + * +* This penalty can deliver visually pleasant piecewise-smooth recovery if regularisation parameters are selected well. +* The rule of thumb for selection is to start with lambdaLLT = 0 (just the ROF-TV model) and then proceed to increase +* lambdaLLT starting with smaller values. * * Input Parameters: * 1. U0 - original noise image/volume * 2. lambdaROF - ROF-related regularisation parameter * 3. lambdaLLT - LLT-related regularisation parameter -* 4. tau - time-marching step -* 5. iter - iterations number (for both models) -* -* Output: -* Filtered/regularised image +* 4. iter - iterations number (for both models) +* 5. tau - time-marching step +* 6. eplsilon: tolerance constant + + * Output: + * [1] Filtered/regularized image/volume + * [2] Information vector which contains [iteration no., reached tolerance] * -* References: +* References: * [1] Lysaker, M., Lundervold, A. and Tai, X.C., 2003. Noise removal using fourth-order partial differential equation with applications to medical magnetic resonance images in space and time. IEEE Transactions on image processing, 12(12), pp.1579-1590. * [2] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms" */ @@ -44,12 +49,12 @@ limitations under the License. #define BLKXSIZE 8 #define BLKYSIZE 8 #define BLKZSIZE 8 - + #define BLKXSIZE2D 16 #define BLKYSIZE2D 16 -#define EPS_LLT 0.01 +#define EPS_LLT 1.0e-12 #define EPS_ROF 1.0e-12 #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) ) @@ -60,8 +65,8 @@ limitations under the License. __host__ __device__ int signLLT (float x) { return (x > 0) - (x < 0); -} - +} + /*************************************************************************/ /**********************LLT-related functions *****************************/ /*************************************************************************/ @@ -71,11 +76,11 @@ __global__ void der2D_LLT_kernel(float *U, float *D1, float *D2, int dimX, int d float dxx, dyy, denom_xx, denom_yy; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - + int index = i + dimX*j; - + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY)) { - + /* symmetric boundary conditions (Neuman) */ i_p = i + 1; if (i_p == dimX) i_p = i - 1; i_m = i - 1; if (i_m < 0) i_m = i + 1; @@ -92,18 +97,18 @@ __global__ void der2D_LLT_kernel(float *U, float *D1, float *D2, int dimX, int d D2[index] = dyy / denom_yy; } } - + __global__ void der3D_LLT_kernel(float* U, float *D1, float *D2, float *D3, int dimX, int dimY, int dimZ) { int i_p, i_m, j_m, j_p, k_p, k_m; float dxx, dyy, dzz, denom_xx, denom_yy, denom_zz; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - + /* symmetric boundary conditions (Neuman) */ i_p = i + 1; if (i_p == dimX) i_p = i - 1; i_m = i - 1; if (i_m < 0) i_m = i + 1; @@ -111,17 +116,17 @@ __global__ void der3D_LLT_kernel(float* U, float *D1, float *D2, float *D3, int j_m = j - 1; if (j_m < 0) j_m = j + 1; k_p = k + 1; if (k_p == dimZ) k_p = k - 1; k_m = k - 1; if (k_m < 0) k_m = k + 1; - + int index = (dimX*dimY)*k + j*dimX+i; - + dxx = U[(dimX*dimY)*k + j*dimX+i_p] - 2.0f*U[index] + U[(dimX*dimY)*k + j*dimX+i_m]; dyy = U[(dimX*dimY)*k + j_p*dimX+i] - 2.0f*U[index] + U[(dimX*dimY)*k + j_m*dimX+i]; dzz = U[(dimX*dimY)*k_p + j*dimX+i] - 2.0f*U[index] + U[(dimX*dimY)*k_m + j*dimX+i]; - + denom_xx = abs(dxx) + EPS_LLT; denom_yy = abs(dyy) + EPS_LLT; denom_zz = abs(dzz) + EPS_LLT; - + D1[index] = dxx / denom_xx; D2[index] = dyy / denom_yy; D3[index] = dzz / denom_zz; @@ -139,74 +144,74 @@ __global__ void D1_func2D_ROF_kernel(float* Input, float* D1, int N, int M) float NOMx_1,NOMy_1,NOMy_0,denom1,denom2,T1; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - - int index = i + N*j; - + + int index = i + N*j; + if ((i >= 0) && (i < N) && (j >= 0) && (j < M)) { - + /* boundary conditions (Neumann reflections) */ i1 = i + 1; if (i1 >= N) i1 = i-1; i2 = i - 1; if (i2 < 0) i2 = i+1; j1 = j + 1; if (j1 >= M) j1 = j-1; - + /* Forward-backward differences */ NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */ - NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ + NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ NOMy_0 = Input[index] - Input[j*N + i2]; /* y- */ - + denom1 = NOMx_1*NOMx_1; denom2 = 0.5f*(signLLT((float)NOMy_1) + signLLT((float)NOMy_0))*(MIN(abs((float)NOMy_1),abs((float)NOMy_0))); denom2 = denom2*denom2; T1 = sqrt(denom1 + denom2 + EPS_ROF); D1[index] = NOMx_1/T1; - } + } } - + /* differences 2 */ -__global__ void D2_func2D_ROF_kernel(float* Input, float* D2, int N, int M) +__global__ void D2_func2D_ROF_kernel(float* Input, float* D2, int N, int M) { int i1, j1, j2; float NOMx_1,NOMy_1,NOMx_0,denom1,denom2,T2; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - - int index = i + N*j; - + + int index = i + N*j; + if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) { - + /* boundary conditions (Neumann reflections) */ i1 = i + 1; if (i1 >= N) i1 = i-1; j1 = j + 1; if (j1 >= M) j1 = j-1; - j2 = j - 1; if (j2 < 0) j2 = j+1; - + j2 = j - 1; if (j2 < 0) j2 = j+1; + /* Forward-backward differences */ NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */ NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ NOMx_0 = Input[index] - Input[j2*N + i]; /* x- */ - + denom1 = NOMy_1*NOMy_1; denom2 = 0.5f*(signLLT((float)NOMx_1) + signLLT((float)NOMx_0))*(MIN(abs((float)NOMx_1),abs((float)NOMx_0))); denom2 = denom2*denom2; T2 = sqrt(denom1 + denom2 + EPS_ROF); - D2[index] = NOMy_1/T2; - } + D2[index] = NOMy_1/T2; + } } - + /* differences 1 */ -__global__ void D1_func3D_ROF_kernel(float* Input, float* D1, int dimX, int dimY, int dimZ) +__global__ void D1_func3D_ROF_kernel(float* Input, float* D1, int dimX, int dimY, int dimZ) { float NOMx_1, NOMy_1, NOMy_0, NOMz_1, NOMz_0, denom1, denom2,denom3, T1; int i1,i2,k1,j1,j2,k2; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (dimX*dimY)*k + j*dimX+i; - + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - + /* symmetric boundary conditions (Neuman) */ i1 = i + 1; if (i1 >= dimX) i1 = i-1; i2 = i - 1; if (i2 < 0) i2 = i+1; @@ -214,38 +219,38 @@ __global__ void D1_func3D_ROF_kernel(float* Input, float* D1, int dimX, int dimY j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - + /* Forward-backward differences */ NOMx_1 = Input[(dimX*dimY)*k + j1*dimX + i] - Input[index]; /* x+ */ - NOMy_1 = Input[(dimX*dimY)*k + j*dimX + i1] - Input[index]; /* y+ */ + NOMy_1 = Input[(dimX*dimY)*k + j*dimX + i1] - Input[index]; /* y+ */ NOMy_0 = Input[index] - Input[(dimX*dimY)*k + j*dimX + i2]; /* y- */ - + NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + j*dimX + i]; /* z- */ - - + + denom1 = NOMx_1*NOMx_1; denom2 = 0.5*(signLLT(NOMy_1) + signLLT(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0))); denom2 = denom2*denom2; denom3 = 0.5*(signLLT(NOMz_1) + signLLT(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0))); denom3 = denom3*denom3; T1 = sqrt(denom1 + denom2 + denom3 + EPS_ROF); - D1[index] = NOMx_1/T1; - } - } + D1[index] = NOMx_1/T1; + } + } /* differences 2 */ - __global__ void D2_func3D_ROF_kernel(float* Input, float* D2, int dimX, int dimY, int dimZ) + __global__ void D2_func3D_ROF_kernel(float* Input, float* D2, int dimX, int dimY, int dimZ) { float NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2; int i1,i2,k1,j1,j2,k2; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - + + int index = (dimX*dimY)*k + j*dimX+i; + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { /* symmetric boundary conditions (Neuman) */ i1 = i + 1; if (i1 >= dimX) i1 = i-1; @@ -254,16 +259,16 @@ __global__ void D1_func3D_ROF_kernel(float* Input, float* D1, int dimX, int dimY j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - - + + /* Forward-backward differences */ NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */ NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */ NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + (j)*dimX + i]; /* z- */ - - + + denom1 = NOMy_1*NOMy_1; denom2 = 0.5*(signLLT(NOMx_1) + signLLT(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0))); denom2 = denom2*denom2; @@ -273,19 +278,19 @@ __global__ void D1_func3D_ROF_kernel(float* Input, float* D1, int dimX, int dimY D2[index] = NOMy_1/T2; } } - + /* differences 3 */ - __global__ void D3_func3D_ROF_kernel(float* Input, float* D3, int dimX, int dimY, int dimZ) + __global__ void D3_func3D_ROF_kernel(float* Input, float* D3, int dimX, int dimY, int dimZ) { float NOMx_1, NOMy_1, NOMx_0, NOMy_0, NOMz_1, denom1, denom2, denom3, T3; int i1,i2,k1,j1,j2,k2; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - + + int index = (dimX*dimY)*k + j*dimX+i; + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { i1 = i + 1; if (i1 >= dimX) i1 = i-1; @@ -294,14 +299,14 @@ __global__ void D1_func3D_ROF_kernel(float* Input, float* D1, int dimX, int dimY j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - + /* Forward-backward differences */ NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */ NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */ NOMy_0 = Input[index] - Input[(dimX*dimY)*k + (j)*dimX + i2]; /* y- */ NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ - + denom1 = NOMz_1*NOMz_1; denom2 = 0.5*(signLLT(NOMx_1) + signLLT(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0))); denom2 = denom2*denom2; @@ -317,17 +322,17 @@ __global__ void D1_func3D_ROF_kernel(float* Input, float* D1, int dimX, int dimY __global__ void Update2D_LLT_ROF_kernel(float *U0, float *U, float *D1_LLT, float *D2_LLT, float *D1_ROF, float *D2_ROF, float lambdaROF, float lambdaLLT, float tau, int dimX, int dimY) { - + int i_p, i_m, j_m, j_p; float div, laplc, dxx, dyy, dv1, dv2; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - + int index = i + dimX*j; - + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY)) { - + /* symmetric boundary conditions (Neuman) */ i_p = i + 1; if (i_p == dimX) i_p = i - 1; i_m = i - 1; if (i_m < 0) i_m = i + 1; @@ -335,7 +340,7 @@ __global__ void Update2D_LLT_ROF_kernel(float *U0, float *U, float *D1_LLT, floa j_m = j - 1; if (j_m < 0) j_m = j + 1; index = j*dimX+i; - + /*LLT-related part*/ dxx = D1_LLT[j*dimX+i_p] - 2.0f*D1_LLT[index] + D1_LLT[j*dimX+i_m]; dyy = D2_LLT[j_p*dimX+i] - 2.0f*D2_LLT[index] + D2_LLT[j_m*dimX+i]; @@ -344,9 +349,9 @@ __global__ void Update2D_LLT_ROF_kernel(float *U0, float *U, float *D1_LLT, floa dv1 = D1_ROF[index] - D1_ROF[j_m*dimX + i]; dv2 = D2_ROF[index] - D2_ROF[j*dimX + i_m]; div = dv1 + dv2; /*build Divirgent*/ - + /*combine all into one cost function to minimise */ - U[index] += tau*(2.0f*lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); + U[index] += tau*(lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); } } @@ -354,13 +359,13 @@ __global__ void Update3D_LLT_ROF_kernel(float *U0, float *U, float *D1_LLT, floa { int i_p, i_m, j_m, j_p, k_p, k_m; float div, laplc, dxx, dyy, dzz, dv1, dv2, dv3; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - + /* symmetric boundary conditions (Neuman) */ i_p = i + 1; if (i_p == dimX) i_p = i - 1; i_m = i - 1; if (i_m < 0) i_m = i + 1; @@ -368,106 +373,225 @@ __global__ void Update3D_LLT_ROF_kernel(float *U0, float *U, float *D1_LLT, floa j_m = j - 1; if (j_m < 0) j_m = j + 1; k_p = k + 1; if (k_p == dimZ) k_p = k - 1; k_m = k - 1; if (k_m < 0) k_m = k + 1; - + int index = (dimX*dimY)*k + j*dimX+i; - + /*LLT-related part*/ dxx = D1_LLT[(dimX*dimY)*k + j*dimX+i_p] - 2.0f*D1_LLT[index] + D1_LLT[(dimX*dimY)*k + j*dimX+i_m]; dyy = D2_LLT[(dimX*dimY)*k + j_p*dimX+i] - 2.0f*D2_LLT[index] + D2_LLT[(dimX*dimY)*k + j_m*dimX+i]; dzz = D3_LLT[(dimX*dimY)*k_p + j*dimX+i] - 2.0f*D3_LLT[index] + D3_LLT[(dimX*dimY)*k_m + j*dimX+i]; laplc = dxx + dyy + dzz; /*build Laplacian*/ - + /*ROF-related part*/ dv1 = D1_ROF[index] - D1_ROF[(dimX*dimY)*k + j_m*dimX+i]; dv2 = D2_ROF[index] - D2_ROF[(dimX*dimY)*k + j*dimX+i_m]; dv3 = D3_ROF[index] - D3_ROF[(dimX*dimY)*k_m + j*dimX+i]; div = dv1 + dv2 + dv3; /*build Divirgent*/ - + /*combine all into one cost function to minimise */ - U[index] += tau*(2.0f*lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); + U[index] += tau*(lambdaROF*(div) - lambdaLLT*(laplc) - (U[index] - U0[index])); } } +__global__ void ROFLLTcopy_kernel2D(float *Input, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input[index]; + } +} + + +__global__ void ROFLLTResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total) +{ + int xIndex = blockDim.x * blockIdx.x + threadIdx.x; + int yIndex = blockDim.y * blockIdx.y + threadIdx.y; + + int index = xIndex + N*yIndex; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + +__global__ void ROFLLTcopy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input[index]; + } +} + +__global__ void ROFLLTResidCalc3D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + /*******************************************************************/ /************************ HOST FUNCTION ****************************/ /*******************************************************************/ -extern "C" int LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int N, int M, int Z) +extern "C" int LLT_ROF_GPU_main(float *Input, float *Output, float *infovector, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, float epsil, int N, int M, int Z) { - // set up device - int dev = 0; - int DimTotal; - DimTotal = N*M*Z; - CHECK(cudaSetDevice(dev)); + int deviceCount = -1; // number of devices + cudaGetDeviceCount(&deviceCount); + if (deviceCount == 0) { + fprintf(stderr, "No CUDA devices found\n"); + return -1; + } + float re; + re = 0.0f; + int DimTotal,count,n; + count = 0; n = 0; float *d_input, *d_update; - float *D1_LLT=NULL, *D2_LLT=NULL, *D3_LLT=NULL, *D1_ROF=NULL, *D2_ROF=NULL, *D3_ROF=NULL; - + float *D1_LLT=NULL, *D2_LLT=NULL, *D1_ROF=NULL, *D2_ROF=NULL, *d_update_prev=NULL; + if (Z == 0) {Z = 1;} - + DimTotal = N*M*Z; + CHECK(cudaMalloc((void**)&d_input,DimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&d_update,DimTotal*sizeof(float))); - + CHECK(cudaMalloc((void**)&D1_LLT,DimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&D2_LLT,DimTotal*sizeof(float))); - CHECK(cudaMalloc((void**)&D3_LLT,DimTotal*sizeof(float))); - + CHECK(cudaMalloc((void**)&D1_ROF,DimTotal*sizeof(float))); CHECK(cudaMalloc((void**)&D2_ROF,DimTotal*sizeof(float))); - CHECK(cudaMalloc((void**)&D3_ROF,DimTotal*sizeof(float))); - + + CHECK(cudaMemcpy(d_input,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); - CHECK(cudaMemcpy(d_update,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); - + CHECK(cudaMemcpy(d_update,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)) + if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,DimTotal*sizeof(float)) );; + if (Z == 1) { // TV - 2D case dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D)); - - for(int n=0; n < iterationsNumb; n++) { + + for(n=0; n < iterationsNumb; n++) { /****************ROF******************/ /* calculate first-order differences */ D1_func2D_ROF_kernel<<<dimGrid,dimBlock>>>(d_update, D1_ROF, N, M); CHECK(cudaDeviceSynchronize()); D2_func2D_ROF_kernel<<<dimGrid,dimBlock>>>(d_update, D2_ROF, N, M); - CHECK(cudaDeviceSynchronize()); + CHECK(cudaDeviceSynchronize()); /****************LLT******************/ /* estimate second-order derrivatives */ der2D_LLT_kernel<<<dimGrid,dimBlock>>>(d_update, D1_LLT, D2_LLT, N, M); /* Joint update for ROF and LLT models */ Update2D_LLT_ROF_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, D1_LLT, D2_LLT, D1_ROF, D2_ROF, lambdaROF, lambdaLLT, tau, N, M); CHECK(cudaDeviceSynchronize()); + + if ((epsil != 0.0f) && (n % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + ROFLLTResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, D1_ROF, N, M, DimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors( cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(D1_ROF, D1_ROF + DimTotal); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + + ROFLLTcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, N, M, DimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } } } else { // 3D case dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKXSIZE)); - - for(int n=0; n < iterationsNumb; n++) { + + float *D3_LLT=NULL, *D3_ROF=NULL; + + CHECK(cudaMalloc((void**)&D3_LLT,DimTotal*sizeof(float))); + CHECK(cudaMalloc((void**)&D3_ROF,DimTotal*sizeof(float))); + + for(n=0; n < iterationsNumb; n++) { /****************ROF******************/ /* calculate first-order differences */ D1_func3D_ROF_kernel<<<dimGrid,dimBlock>>>(d_update, D1_ROF, N, M, Z); CHECK(cudaDeviceSynchronize()); D2_func3D_ROF_kernel<<<dimGrid,dimBlock>>>(d_update, D2_ROF, N, M, Z); - CHECK(cudaDeviceSynchronize()); + CHECK(cudaDeviceSynchronize()); D3_func3D_ROF_kernel<<<dimGrid,dimBlock>>>(d_update, D3_ROF, N, M, Z); - CHECK(cudaDeviceSynchronize()); + CHECK(cudaDeviceSynchronize()); /****************LLT******************/ /* estimate second-order derrivatives */ der3D_LLT_kernel<<<dimGrid,dimBlock>>>(d_update, D1_LLT, D2_LLT, D3_LLT, N, M, Z); /* Joint update for ROF and LLT models */ Update3D_LLT_ROF_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, D1_LLT, D2_LLT, D3_LLT, D1_ROF, D2_ROF, D3_ROF, lambdaROF, lambdaLLT, tau, N, M, Z); CHECK(cudaDeviceSynchronize()); + + + if ((epsil != 0.0f) && (n % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + ROFLLTResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, D1_ROF, N, M, Z, DimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors( cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(D1_ROF, D1_ROF + DimTotal); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + + ROFLLTcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, N, M, Z, DimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + } - } + CHECK(cudaFree(D3_LLT)); + CHECK(cudaFree(D3_ROF)); + } /*end of else */ + CHECK(cudaMemcpy(Output,d_update,DimTotal*sizeof(float),cudaMemcpyDeviceToHost)); + CHECK(cudaFree(d_input)); CHECK(cudaFree(d_update)); + if (epsil != 0.0f) cudaFree(d_update_prev); CHECK(cudaFree(D1_LLT)); CHECK(cudaFree(D2_LLT)); - CHECK(cudaFree(D3_LLT)); CHECK(cudaFree(D1_ROF)); CHECK(cudaFree(D2_ROF)); - CHECK(cudaFree(D3_ROF)); + + infovector[0] = (float)(n); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + return 0; } diff --git a/src/Core/regularisers_GPU/LLT_ROF_GPU_core.h b/src/Core/regularisers_GPU/LLT_ROF_GPU_core.h index a6bfcc7..2d566d2 100644 --- a/src/Core/regularisers_GPU/LLT_ROF_GPU_core.h +++ b/src/Core/regularisers_GPU/LLT_ROF_GPU_core.h @@ -3,6 +3,6 @@ #include "CCPiDefines.h" #include <stdio.h> -extern "C" CCPI_EXPORT int LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int N, int M, int Z); +extern "C" CCPI_EXPORT int LLT_ROF_GPU_main(float *Input, float *Output, float *infovector, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, float epsil, int N, int M, int Z); -#endif +#endif diff --git a/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu b/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu index 96b1173..34be05c 100755 --- a/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu +++ b/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu @@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ +*/ #include "TV_FGP_GPU_core.h" #include "shared.h" @@ -26,12 +26,12 @@ limitations under the License. /* CUDA implementation of FGP-TV [1] denoising/regularization model (2D/3D case) * * Input Parameters: - * 1. Noisy image/volume - * 2. lambdaPar - regularization parameter + * 1. Noisy image/volume + * 2. lambdaPar - regularization parameter * 3. Number of iterations - * 4. eplsilon: tolerance constant + * 4. eplsilon: tolerance constant * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1) - * 6. nonneg: 'nonnegativity (0 is OFF by default) + * 6. nonneg: 'nonnegativity (0 is OFF by default) * * Output: * [1] Filtered/regularized image/volume @@ -57,16 +57,16 @@ limitations under the License. /************************************************/ __global__ void Obj_func2D_kernel(float *Ad, float *D, float *R1, float *R2, int N, int M, int ImSize, float lambda) { - + float val1,val2; - + //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - - int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { + + int index = xIndex + N*yIndex; + + if ((xIndex < N) && (yIndex < M)) { if (xIndex <= 0) {val1 = 0.0f;} else {val1 = R1[(xIndex-1) + N*yIndex];} if (yIndex <= 0) {val2 = 0.0f;} else {val2 = R2[xIndex + N*(yIndex-1)];} //Write final result to global memory @@ -77,21 +77,21 @@ __global__ void Obj_func2D_kernel(float *Ad, float *D, float *R1, float *R2, int __global__ void Grad_func2D_kernel(float *P1, float *P2, float *D, float *R1, float *R2, int N, int M, int ImSize, float multip) { - + float val1,val2; - + //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - + int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { - + + if ((xIndex < N) && (yIndex < M)) { + /* boundary conditions */ if (xIndex >= N-1) val1 = 0.0f; else val1 = D[index] - D[(xIndex+1) + N*yIndex]; if (yIndex >= M-1) val2 = 0.0f; else val2 = D[index] - D[(xIndex) + N*(yIndex + 1)]; - + //Write final result to global memory P1[index] = R1[index] + multip*val1; P2[index] = R2[index] + multip*val2; @@ -101,16 +101,16 @@ __global__ void Grad_func2D_kernel(float *P1, float *P2, float *D, float *R1, fl __global__ void Proj_func2D_iso_kernel(float *P1, float *P2, int N, int M, int ImSize) { - - float denom; + + float denom; //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - + int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { - denom = pow(P1[index],2) + pow(P2[index],2); + + if ((xIndex < N) && (yIndex < M)) { + denom = pow(P1[index],2) + pow(P2[index],2); if (denom > 1.0f) { P1[index] = P1[index]/sqrt(denom); P2[index] = P2[index]/sqrt(denom); @@ -120,15 +120,15 @@ __global__ void Proj_func2D_iso_kernel(float *P1, float *P2, int N, int M, int I } __global__ void Proj_func2D_aniso_kernel(float *P1, float *P2, int N, int M, int ImSize) { - - float val1, val2; + + float val1, val2; //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - + int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { + + if ((xIndex < N) && (yIndex < M)) { val1 = abs(P1[index]); val2 = abs(P2[index]); if (val1 < 1.0f) {val1 = 1.0f;} @@ -143,10 +143,10 @@ __global__ void Rupd_func2D_kernel(float *P1, float *P1_old, float *P2, float *P //calculate each thread global index const int xIndex=blockIdx.x*blockDim.x+threadIdx.x; const int yIndex=blockIdx.y*blockDim.y+threadIdx.y; - + int index = xIndex + N*yIndex; - - if ((xIndex < N) && (yIndex < M)) { + + if ((xIndex < N) && (yIndex < M)) { R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]); R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]); } @@ -156,9 +156,9 @@ __global__ void nonneg2D_kernel(float* Output, int N, int M, int num_total) { int xIndex = blockDim.x * blockIdx.x + threadIdx.x; int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - + int index = xIndex + N*yIndex; - + if (index < num_total) { if (Output[index] < 0.0f) Output[index] = 0.0f; } @@ -168,17 +168,17 @@ __global__ void nonneg2D_kernel(float* Output, int N, int M, int num_total) /************************************************/ __global__ void Obj_func3D_kernel(float *Ad, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float lambda) { - + float val1,val2,val3; - + //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - - if ((i < N) && (j < M) && (k < Z)) { + + if ((i < N) && (j < M) && (k < Z)) { if (i <= 0) {val1 = 0.0f;} else {val1 = R1[(N*M)*(k) + (i-1) + N*j];} if (j <= 0) {val2 = 0.0f;} else {val2 = R2[(N*M)*(k) + i + N*(j-1)];} if (k <= 0) {val3 = 0.0f;} else {val3 = R3[(N*M)*(k-1) + i + N*j];} @@ -190,22 +190,22 @@ __global__ void Obj_func3D_kernel(float *Ad, float *D, float *R1, float *R2, flo __global__ void Grad_func3D_kernel(float *P1, float *P2, float *P3, float *D, float *R1, float *R2, float *R3, int N, int M, int Z, int ImSize, float multip) { - + float val1,val2,val3; - + //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - - if ((i < N) && (j < M) && (k < Z)) { + + if ((i < N) && (j < M) && (k < Z)) { /* boundary conditions */ if (i >= N-1) val1 = 0.0f; else val1 = D[index] - D[(N*M)*(k) + (i+1) + N*j]; if (j >= M-1) val2 = 0.0f; else val2 = D[index] - D[(N*M)*(k) + i + N*(j+1)]; if (k >= Z-1) val3 = 0.0f; else val3 = D[index] - D[(N*M)*(k+1) + i + N*j]; - + //Write final result to global memory P1[index] = R1[index] + multip*val1; P2[index] = R2[index] + multip*val2; @@ -216,18 +216,18 @@ __global__ void Grad_func3D_kernel(float *P1, float *P2, float *P3, float *D, fl __global__ void Proj_func3D_iso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) { - - float denom,sq_denom; + + float denom,sq_denom; //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { denom = pow(P1[index],2) + pow(P2[index],2) + pow(P3[index],2); - + if (denom > 1.0f) { sq_denom = 1.0f/sqrt(denom); P1[index] = P1[index]*sq_denom; @@ -240,15 +240,15 @@ __global__ void Proj_func3D_iso_kernel(float *P1, float *P2, float *P3, int N, i __global__ void Proj_func3D_aniso_kernel(float *P1, float *P2, float *P3, int N, int M, int Z, int ImSize) { - - float val1, val2, val3; + + float val1, val2, val3; //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { val1 = abs(P1[index]); val2 = abs(P2[index]); @@ -268,10 +268,10 @@ __global__ void Rupd_func3D_kernel(float *P1, float *P1_old, float *P2, float *P int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - - if ((i < N) && (j < M) && (k < Z)) { + + if ((i < N) && (j < M) && (k < Z)) { R1[index] = P1[index] + multip2*(P1[index] - P1_old[index]); R2[index] = P2[index] + multip2*(P2[index] - P2_old[index]); R3[index] = P3[index] + multip2*(P3[index] - P3_old[index]); @@ -284,9 +284,9 @@ __global__ void nonneg3D_kernel(float* Output, int N, int M, int Z, int num_tota int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if (index < num_total) { if (Output[index] < 0.0f) Output[index] = 0.0f; } @@ -295,9 +295,9 @@ __global__ void FGPcopy_kernel2D(float *Input, float* Output, int N, int M, int { int xIndex = blockDim.x * blockIdx.x + threadIdx.x; int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - + int index = xIndex + N*yIndex; - + if (index < num_total) { Output[index] = Input[index]; } @@ -308,9 +308,9 @@ __global__ void FGPcopy_kernel3D(float *Input, float* Output, int N, int M, int int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if (index < num_total) { Output[index] = Input[index]; } @@ -320,9 +320,9 @@ __global__ void FGPResidCalc2D_kernel(float *Input1, float *Input2, float* Outpu { int xIndex = blockDim.x * blockIdx.x + threadIdx.x; int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - + int index = xIndex + N*yIndex; - + if (index < num_total) { Output[index] = Input1[index] - Input2[index]; } @@ -333,9 +333,9 @@ __global__ void FGPResidCalc3D_kernel(float *Input1, float *Input2, float* Outpu int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if (index < num_total) { Output[index] = Input1[index] - Input2[index]; } @@ -352,21 +352,21 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, f fprintf(stderr, "No CUDA devices found\n"); return -1; } - + int count = 0, i; - float re, multip,multip2; + float re, multip,multip2; re = 0.0f; float tk = 1.0f; float tkp1=1.0f; - + if (dimZ <= 1) { /*2D verson*/ - int ImSize = dimX*dimY; + int ImSize = dimX*dimY; float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL; - + dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D)); - + /*allocate space for images on device*/ checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); @@ -377,7 +377,7 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, f checkCudaErrors( cudaMalloc((void**)&P2_prev,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) ); - + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); cudaMemset(P1, 0, ImSize*sizeof(float)); cudaMemset(P2, 0, ImSize*sizeof(float)); @@ -386,78 +386,78 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, f cudaMemset(R1, 0, ImSize*sizeof(float)); cudaMemset(R2, 0, ImSize*sizeof(float)); - /********************** Run CUDA 2D kernel here ********************/ + /********************** Run CUDA 2D kernel here ********************/ multip = (1.0f/(8.0f*lambdaPar)); - + /* The main kernel */ for (i = 0; i < iter; i++) { - + /* computing the gradient of the objective function */ Obj_func2D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, dimX, dimY, ImSize, lambdaPar); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + if (nonneg != 0) { nonneg2D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); } - + /*Taking a step towards minus of the gradient*/ Grad_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P2, d_update, R1, R2, dimX, dimY, ImSize, multip); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + /* projection step */ if (methodTV == 0) Proj_func2D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*isotropic TV*/ - else Proj_func2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/ + else Proj_func2D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, dimX, dimY, ImSize); /*anisotropic TV*/ checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; multip2 = ((tk-1.0f)/tkp1); - + Rupd_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, multip2, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + tk = tkp1; - - if (epsil != 0.0f) { + + if ((epsil != 0.0f) && (i % 5 == 0)) { /* calculate norm - stopping rules using the Thrust library */ FGPResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + // setup arguments square<float> unary_op; thrust::plus<float> binary_op; - thrust::device_vector<float> d_vec(P1, P1 + ImSize); - float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec(P1, P1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); - float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); - + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + // compute norm - re = (reduction/reduction2); + re = (reduction/reduction2); if (re < epsil) count++; - if (count > 4) break; - + if (count > 3) break; + FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); + checkCudaErrors(cudaPeekAtLastError() ); } - - } + + } //copy result matrix from device to host memory cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); - + cudaFree(d_input); cudaFree(d_update); if (epsil != 0.0f) cudaFree(d_update_prev); @@ -470,15 +470,16 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, f } else { /*3D verson*/ - int ImSize = dimX*dimY*dimZ; - float *d_input, *d_update=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL; - + int ImSize = dimX*dimY*dimZ; + float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P3=NULL, *P1_prev=NULL, *P2_prev=NULL, *P3_prev=NULL, *R1=NULL, *R2=NULL, *R3=NULL; + dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE)); - + + if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) ); /*allocate space for images on device*/ checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&P3,ImSize*sizeof(float)) ); @@ -488,7 +489,7 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, f checkCudaErrors( cudaMalloc((void**)&R1,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&R2,ImSize*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&R3,ImSize*sizeof(float)) ); - + checkCudaErrors( cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); cudaMemset(P1, 0, ImSize*sizeof(float)); cudaMemset(P2, 0, ImSize*sizeof(float)); @@ -499,60 +500,86 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, f cudaMemset(R1, 0, ImSize*sizeof(float)); cudaMemset(R2, 0, ImSize*sizeof(float)); cudaMemset(R3, 0, ImSize*sizeof(float)); - /********************** Run CUDA 3D kernel here ********************/ + /********************** Run CUDA 3D kernel here ********************/ multip = (1.0f/(26.0f*lambdaPar)); - + /* The main kernel */ for (i = 0; i < iter; i++) { - + /* computing the gradient of the objective function */ Obj_func3D_kernel<<<dimGrid,dimBlock>>>(d_input, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, lambdaPar); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + if (nonneg != 0) { nonneg3D_kernel<<<dimGrid,dimBlock>>>(d_update, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); } - + /*Taking a step towards minus of the gradient*/ Grad_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, d_update, R1, R2, R3, dimX, dimY, dimZ, ImSize, multip); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + /* projection step */ if (methodTV == 0) Proj_func3D_iso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* isotropic kernel */ else Proj_func3D_aniso_kernel<<<dimGrid,dimBlock>>>(P1, P2, P3, dimX, dimY, dimZ, ImSize); /* anisotropic kernel */ checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + tkp1 = (1.0f + sqrt(1.0f + 4.0f*tk*tk))*0.5f; multip2 = ((tk-1.0f)/tkp1); - + Rupd_func3D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, P3, P3_prev, R1, R2, R3, tkp1, tk, multip2, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + FGPcopy_kernel3D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); checkCudaErrors(cudaPeekAtLastError() ); - + FGPcopy_kernel3D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + FGPcopy_kernel3D<<<dimGrid,dimBlock>>>(P3, P3_prev, dimX, dimY, dimZ, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + tk = tkp1; + + if ((epsil != 0.0f) && (i % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + FGPResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(P1, P1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + + FGPcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + } - /***************************************************************/ + /***************************************************************/ //copy result matrix from device to host memory cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost); - + if (epsil != 0.0f) cudaFree(d_update_prev); + cudaFree(d_input); - cudaFree(d_update); + cudaFree(d_update); cudaFree(P1); cudaFree(P2); cudaFree(P3); @@ -560,9 +587,9 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, f cudaFree(P2_prev); cudaFree(P3_prev); cudaFree(R1); - cudaFree(R2); - cudaFree(R3); - } + cudaFree(R2); + cudaFree(R3); + } //cudaDeviceReset(); /*adding info into info_vector */ infovector[0] = (float)(i); /*iterations number (if stopped earlier based on tolerance)*/ diff --git a/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu b/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu index 97f2351..e14681c 100755 --- a/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu +++ b/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu @@ -15,7 +15,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ +*/ #include "TV_ROF_GPU_core.h" #include "shared.h" @@ -30,25 +30,25 @@ limitations under the License. * 2. lambda - regularization parameter [REQUIRED] * 3. tau - marching step for explicit scheme, ~0.1 is recommended [REQUIRED] * 4. Number of iterations, for explicit scheme >= 150 is recommended [REQUIRED] -* 5. eplsilon: tolerance constant +* 5. eplsilon: tolerance constant * Output: * [1] Filtered/regularized image/volume * [2] Information vector which contains [iteration no., reached tolerance] - - + + * This function is based on the paper by * [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms" */ - + #define BLKXSIZE 8 #define BLKYSIZE 8 #define BLKZSIZE 8 - + #define BLKXSIZE2D 16 #define BLKYSIZE2D 16 #define EPS 1.0e-8 - + #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) ) #define MAX(x, y) (((x) > (y)) ? (x) : (y)) @@ -57,148 +57,148 @@ limitations under the License. __host__ __device__ int sign (float x) { return (x > 0) - (x < 0); -} - -/*********************2D case****************************/ - +} + +/*********************2D case****************************/ + /* differences 1 */ - __global__ void D1_func2D(float* Input, float* D1, int N, int M) + __global__ void D1_func2D(float* Input, float* D1, int N, int M) { int i1, j1, i2; float NOMx_1,NOMy_1,NOMy_0,denom1,denom2,T1; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - - int index = i + N*j; - + + int index = i + N*j; + if ((i >= 0) && (i < N) && (j >= 0) && (j < M)) { - + /* boundary conditions (Neumann reflections) */ i1 = i + 1; if (i1 >= N) i1 = i-1; i2 = i - 1; if (i2 < 0) i2 = i+1; j1 = j + 1; if (j1 >= M) j1 = j-1; - + /* Forward-backward differences */ NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */ - NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ + NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ NOMy_0 = Input[index] - Input[j*N + i2]; /* y- */ - + denom1 = NOMx_1*NOMx_1; denom2 = 0.5f*(sign((float)NOMy_1) + sign((float)NOMy_0))*(MIN(abs((float)NOMy_1), abs((float)NOMy_0))); denom2 = denom2*denom2; T1 = sqrt(denom1 + denom2 + EPS); D1[index] = NOMx_1/T1; - } - } - + } + } + /* differences 2 */ - __global__ void D2_func2D(float* Input, float* D2, int N, int M) + __global__ void D2_func2D(float* Input, float* D2, int N, int M) { int i1, j1, j2; float NOMx_1,NOMy_1,NOMx_0,denom1,denom2,T2; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - + int index = i + N*j; - + if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) { - + /* boundary conditions (Neumann reflections) */ i1 = i + 1; if (i1 >= N) i1 = i-1; j1 = j + 1; if (j1 >= M) j1 = j-1; - j2 = j - 1; if (j2 < 0) j2 = j+1; - + j2 = j - 1; if (j2 < 0) j2 = j+1; + /* Forward-backward differences */ NOMx_1 = Input[j1*N + i] - Input[index]; /* x+ */ NOMy_1 = Input[j*N + i1] - Input[index]; /* y+ */ NOMx_0 = Input[index] - Input[j2*N + i]; /* x- */ - + denom1 = NOMy_1*NOMy_1; denom2 = 0.5f*(sign((float)NOMx_1) + sign((float)NOMx_0))*(MIN(abs((float)NOMx_1), abs((float)NOMx_0))); denom2 = denom2*denom2; T2 = sqrt(denom1 + denom2 + EPS); D2[index] = NOMy_1/T2; - } + } } - - __global__ void TV_kernel2D(float *D1, float *D2, float *Update, float *Input, float lambda, float tau, int N, int M) + + __global__ void TV_kernel2D(float *D1, float *D2, float *Update, float *Input, float lambda, float tau, int N, int M) { int i2, j2; float dv1,dv2; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; - - int index = i + N*j; - + + int index = i + N*j; + if ((i >= 0) && (i < (N)) && (j >= 0) && (j < (M))) { - + /* boundary conditions (Neumann reflections) */ i2 = i - 1; if (i2 < 0) i2 = i+1; - j2 = j - 1; if (j2 < 0) j2 = j+1; - + j2 = j - 1; if (j2 < 0) j2 = j+1; + /* divergence components */ dv1 = D1[index] - D1[j2*N + i]; dv2 = D2[index] - D2[j*N + i2]; - - Update[index] += tau*(lambda*(dv1 + dv2) - (Update[index] - Input[index])); - - } - } -/*********************3D case****************************/ - + + Update[index] += tau*(lambda*(dv1 + dv2) - (Update[index] - Input[index])); + + } + } +/*********************3D case****************************/ + /* differences 1 */ - __global__ void D1_func3D(float* Input, float* D1, int dimX, int dimY, int dimZ) + __global__ void D1_func3D(float* Input, float* D1, int dimX, int dimY, int dimZ) { float NOMx_1, NOMy_1, NOMy_0, NOMz_1, NOMz_0, denom1, denom2,denom3, T1; int i1,i2,k1,j1,j2,k2; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - + + int index = (dimX*dimY)*k + j*dimX+i; + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - + /* symmetric boundary conditions (Neuman) */ i1 = i + 1; if (i1 >= dimX) i1 = i-1; i2 = i - 1; if (i2 < 0) i2 = i+1; j1 = j + 1; if (j1 >= dimY) j1 = j-1; j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; - k2 = k - 1; if (k2 < 0) k2 = k+1; - + k2 = k - 1; if (k2 < 0) k2 = k+1; + /* Forward-backward differences */ NOMx_1 = Input[(dimX*dimY)*k + j1*dimX + i] - Input[index]; /* x+ */ - NOMy_1 = Input[(dimX*dimY)*k + j*dimX + i1] - Input[index]; /* y+ */ + NOMy_1 = Input[(dimX*dimY)*k + j*dimX + i1] - Input[index]; /* y+ */ NOMy_0 = Input[index] - Input[(dimX*dimY)*k + j*dimX + i2]; /* y- */ - + NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + j*dimX + i]; /* z- */ - - + + denom1 = NOMx_1*NOMx_1; denom2 = 0.5*(sign(NOMy_1) + sign(NOMy_0))*(MIN(abs(NOMy_1),abs(NOMy_0))); denom2 = denom2*denom2; denom3 = 0.5*(sign(NOMz_1) + sign(NOMz_0))*(MIN(abs(NOMz_1),abs(NOMz_0))); denom3 = denom3*denom3; T1 = sqrt(denom1 + denom2 + denom3 + EPS); - D1[index] = NOMx_1/T1; - } - } + D1[index] = NOMx_1/T1; + } + } /* differences 2 */ - __global__ void D2_func3D(float* Input, float* D2, int dimX, int dimY, int dimZ) + __global__ void D2_func3D(float* Input, float* D2, int dimX, int dimY, int dimZ) { float NOMx_1, NOMy_1, NOMx_0, NOMz_1, NOMz_0, denom1, denom2, denom3, T2; int i1,i2,k1,j1,j2,k2; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - + + int index = (dimX*dimY)*k + j*dimX+i; + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { /* symmetric boundary conditions (Neuman) */ i1 = i + 1; if (i1 >= dimX) i1 = i-1; @@ -207,16 +207,16 @@ __host__ __device__ int sign (float x) j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - - + + /* Forward-backward differences */ NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */ NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */ NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ NOMz_0 = Input[index] - Input[(dimX*dimY)*k2 + (j)*dimX + i]; /* z- */ - - + + denom1 = NOMy_1*NOMy_1; denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0))); denom2 = denom2*denom2; @@ -226,19 +226,19 @@ __host__ __device__ int sign (float x) D2[index] = NOMy_1/T2; } } - + /* differences 3 */ - __global__ void D3_func3D(float* Input, float* D3, int dimX, int dimY, int dimZ) + __global__ void D3_func3D(float* Input, float* D3, int dimX, int dimY, int dimZ) { float NOMx_1, NOMy_1, NOMx_0, NOMy_0, NOMz_1, denom1, denom2, denom3, T3; int i1,i2,k1,j1,j2,k2; - + int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - + + int index = (dimX*dimY)*k + j*dimX+i; + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { i1 = i + 1; if (i1 >= dimX) i1 = i-1; @@ -247,14 +247,14 @@ __host__ __device__ int sign (float x) j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - + /* Forward-backward differences */ NOMx_1 = Input[(dimX*dimY)*k + (j1)*dimX + i] - Input[index]; /* x+ */ NOMy_1 = Input[(dimX*dimY)*k + (j)*dimX + i1] - Input[index]; /* y+ */ NOMy_0 = Input[index] - Input[(dimX*dimY)*k + (j)*dimX + i2]; /* y- */ NOMx_0 = Input[index] - Input[(dimX*dimY)*k + (j2)*dimX + i]; /* x- */ NOMz_1 = Input[(dimX*dimY)*k1 + j*dimX + i] - Input[index]; /* z+ */ - + denom1 = NOMz_1*NOMz_1; denom2 = 0.5*(sign(NOMx_1) + sign(NOMx_0))*(MIN(abs(NOMx_1),abs(NOMx_0))); denom2 = denom2*denom2; @@ -265,18 +265,18 @@ __host__ __device__ int sign (float x) } } - __global__ void TV_kernel3D(float *D1, float *D2, float *D3, float *Update, float *Input, float lambda, float tau, int dimX, int dimY, int dimZ) + __global__ void TV_kernel3D(float *D1, float *D2, float *D3, float *Update, float *Input, float lambda, float tau, int dimX, int dimY, int dimZ) { float dv1, dv2, dv3; int i1,i2,k1,j1,j2,k2; int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - - int index = (dimX*dimY)*k + j*dimX+i; - + + int index = (dimX*dimY)*k + j*dimX+i; + if ((i >= 0) && (i < dimX) && (j >= 0) && (j < dimY) && (k >= 0) && (k < dimZ)) { - + /* symmetric boundary conditions (Neuman) */ i1 = i + 1; if (i1 >= dimX) i1 = i-1; i2 = i - 1; if (i2 < 0) i2 = i+1; @@ -284,23 +284,23 @@ __host__ __device__ int sign (float x) j2 = j - 1; if (j2 < 0) j2 = j+1; k1 = k + 1; if (k1 >= dimZ) k1 = k-1; k2 = k - 1; if (k2 < 0) k2 = k+1; - + /*divergence components */ dv1 = D1[index] - D1[(dimX*dimY)*k + j2*dimX+i]; dv2 = D2[index] - D2[(dimX*dimY)*k + j*dimX+i2]; dv3 = D3[index] - D3[(dimX*dimY)*k2 + j*dimX+i]; - + Update[index] += tau*(lambda*(dv1 + dv2 + dv3) - (Update[index] - Input[index])); - - } + + } } __global__ void ROFcopy_kernel2D(float *Input, float* Output, int N, int M, int num_total) { int xIndex = blockDim.x * blockIdx.x + threadIdx.x; int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - + int index = xIndex + N*yIndex; - + if (index < num_total) { Output[index] = Input[index]; } @@ -311,14 +311,41 @@ __global__ void ROFResidCalc2D_kernel(float *Input1, float *Input2, float* Outpu { int xIndex = blockDim.x * blockIdx.x + threadIdx.x; int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - + int index = xIndex + N*yIndex; - + if (index < num_total) { Output[index] = Input1[index] - Input2[index]; } } +__global__ void ROFcopy_kernel3D(float *Input, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input[index]; + } +} + +__global__ void ROFResidCalc3D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int Z, int num_total) +{ + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k = blockDim.z * blockIdx.z + threadIdx.z; + + int index = (N*M)*k + i + N*j; + + if (index < num_total) { + Output[index] = Input1[index] - Input2[index]; + } +} + + ///////////////////////////////////////////////// ///////////////// HOST FUNCTION ///////////////// extern "C" int TV_ROF_GPU_main(float* Input, float* Output, float *infovector, float lambdaPar, int iter, float tau, float epsil, int N, int M, int Z) @@ -329,12 +356,12 @@ extern "C" int TV_ROF_GPU_main(float* Input, float* Output, float *infovector, f fprintf(stderr, "No CUDA devices found\n"); return -1; } - float re; - re = 0.0f; - int ImSize, count, n; + float re; + re = 0.0f; + int ImSize, count, n; count = 0; n = 0; float *d_input, *d_update, *d_D1, *d_D2, *d_update_prev=NULL; - + if (Z == 0) Z = 1; ImSize = N*M*Z; CHECK(cudaMalloc((void**)&d_input,ImSize*sizeof(float))); @@ -342,83 +369,109 @@ extern "C" int TV_ROF_GPU_main(float* Input, float* Output, float *infovector, f CHECK(cudaMalloc((void**)&d_D1,ImSize*sizeof(float))); CHECK(cudaMalloc((void**)&d_D2,ImSize*sizeof(float))); if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) ); - + CHECK(cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); - CHECK(cudaMemcpy(d_update,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); - + CHECK(cudaMemcpy(d_update,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice)); + if (Z == 1) { // TV - 2D case dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D)); - + for(n=0; n < iter; n++) { /* calculate differences */ D1_func2D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M); CHECK(cudaDeviceSynchronize()); D2_func2D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M); - CHECK(cudaDeviceSynchronize()); + CHECK(cudaDeviceSynchronize()); /*running main kernel*/ TV_kernel2D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_update, d_input, lambdaPar, tau, N, M); CHECK(cudaDeviceSynchronize()); - - if (epsil != 0.0f) { + + if ((epsil != 0.0f) && (n % 5 == 0)) { /* calculate norm - stopping rules using the Thrust library */ ROFResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_D1, N, M, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors( cudaPeekAtLastError() ); - + checkCudaErrors( cudaPeekAtLastError() ); + // setup arguments square<float> unary_op; thrust::plus<float> binary_op; - thrust::device_vector<float> d_vec(d_D1, d_D1 + ImSize); - float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec(d_D1, d_D1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); - float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); - + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + // compute norm - re = (reduction/reduction2); + re = (reduction/reduction2); if (re < epsil) count++; - if (count > 4) break; - + if (count > 3) break; + ROFcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, N, M, ImSize); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); + checkCudaErrors(cudaPeekAtLastError() ); } - + } } else { // TV - 3D case dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); - dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKXSIZE)); - + dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKXSIZE)); + float *d_D3; CHECK(cudaMalloc((void**)&d_D3,N*M*Z*sizeof(float))); - + for(n=0; n < iter; n++) { /* calculate differences */ D1_func3D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M, Z); CHECK(cudaDeviceSynchronize()); D2_func3D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M, Z); - CHECK(cudaDeviceSynchronize()); + CHECK(cudaDeviceSynchronize()); D3_func3D<<<dimGrid,dimBlock>>>(d_update, d_D3, N, M, Z); - CHECK(cudaDeviceSynchronize()); + CHECK(cudaDeviceSynchronize()); /*running main kernel*/ TV_kernel3D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_D3, d_update, d_input, lambdaPar, tau, N, M, Z); CHECK(cudaDeviceSynchronize()); + + if ((epsil != 0.0f) && (n % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + ROFResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_D1, N, M, Z, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors( cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(d_D1, d_D1 + ImSize); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + ImSize); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + + ROFcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, N, M, Z, ImSize); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); } - + + + } + CHECK(cudaFree(d_D3)); - } + } CHECK(cudaMemcpy(Output,d_update,N*M*Z*sizeof(float),cudaMemcpyDeviceToHost)); if (epsil != 0.0f) cudaFree(d_update_prev); CHECK(cudaFree(d_input)); CHECK(cudaFree(d_update)); CHECK(cudaFree(d_D1)); - CHECK(cudaFree(d_D2)); - - infovector[0] = (float)(n); /*iterations number (if stopped earlier based on tolerance)*/ + CHECK(cudaFree(d_D2)); + + infovector[0] = (float)(n); /*iterations number (if stopped earlier based on tolerance)*/ infovector[1] = re; /* reached tolerance */ - + return 0; } diff --git a/src/Core/regularisers_GPU/TV_SB_GPU_core.cu b/src/Core/regularisers_GPU/TV_SB_GPU_core.cu index c31f104..b163791 100755 --- a/src/Core/regularisers_GPU/TV_SB_GPU_core.cu +++ b/src/Core/regularisers_GPU/TV_SB_GPU_core.cu @@ -15,10 +15,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -*/ +*/ #include "TV_SB_GPU_core.h" #include "shared.h" +#include <thrust/functional.h> #include <thrust/device_vector.h> #include <thrust/transform_reduce.h> @@ -31,15 +32,14 @@ limitations under the License. * 4. eplsilon - tolerance constant [OPTIONAL parameter] * 5. TV-type: 'iso' or 'l1' [OPTIONAL parameter] * 6. nonneg: 'nonnegativity (0 is OFF by default) [OPTIONAL parameter] -* 7. print information: 0 (off) or 1 (on) [OPTIONAL parameter] * * Output: -* 1. Filtered/regularized image -* +* [1] Filtered/regularized image/volume +* [2] Information vector which contains [iteration no., reached tolerance] + * [1]. Goldstein, T. and Osher, S., 2009. The split Bregman method for L1-regularized problems. SIAM journal on imaging sciences, 2(2), pp.323-343. */ -// This will output the proper CUDA error strings in the event that a CUDA host call returns an error #define BLKXSIZE2D 16 #define BLKYSIZE2D 16 @@ -56,22 +56,22 @@ limitations under the License. /************************************************/ __global__ void gauss_seidel2D_kernel(float *U, float *A, float *U_prev, float *Dx, float *Dy, float *Bx, float *By, float lambda, float mu, float normConst, int N, int M, int ImSize) { - + float sum; int i1,i2,j1,j2; - + //calculate each thread global index const int i=blockIdx.x*blockDim.x+threadIdx.x; const int j=blockIdx.y*blockDim.y+threadIdx.y; - + int index = j*N+i; - + if ((i < N) && (j < M)) { i1 = i+1; if (i1 == N) i1 = i-1; i2 = i-1; if (i2 < 0) i2 = i+1; j1 = j+1; if (j1 == M) j1 = j-1; j2 = j-1; if (j2 < 0) j2 = j+1; - + sum = Dx[j*N+i2] - Dx[index] + Dy[j2*N+i] - Dy[index] - Bx[j*N+i2] + Bx[index] - By[j2*N+i] + By[index]; sum += U_prev[j*N+i1] + U_prev[j*N+i2] + U_prev[j1*N+i] + U_prev[j2*N+i]; sum *= lambda; @@ -82,27 +82,27 @@ __global__ void gauss_seidel2D_kernel(float *U, float *A, float *U_prev, float * } __global__ void updDxDy_shrinkAniso2D_kernel(float *U, float *Dx, float *Dy, float *Bx, float *By, float lambda, int N, int M, int ImSize) { - + int i1,j1; float val1, val11, val2, val22, denom_lam; denom_lam = 1.0f/lambda; - + //calculate each thread global index const int i=blockIdx.x*blockDim.x+threadIdx.x; const int j=blockIdx.y*blockDim.y+threadIdx.y; - + int index = j*N+i; - + if ((i < N) && (j < M)) { i1 = i+1; if (i1 == N) i1 = i-1; j1 = j+1; if (j1 == M) j1 = j-1; - + val1 = (U[j*N+i1] - U[index]) + Bx[index]; val2 = (U[j1*N+i] - U[index]) + By[index]; - + val11 = abs(val1) - denom_lam; if (val11 < 0) val11 = 0; val22 = abs(val2) - denom_lam; if (val22 < 0) val22 = 0; - + if (val1 !=0) Dx[index] = (val1/abs(val1))*val11; else Dx[index] = 0; if (val2 !=0) Dy[index] = (val2/abs(val2))*val22; else Dy[index] = 0; } @@ -111,28 +111,28 @@ __global__ void updDxDy_shrinkAniso2D_kernel(float *U, float *Dx, float *Dy, flo __global__ void updDxDy_shrinkIso2D_kernel(float *U, float *Dx, float *Dy, float *Bx, float *By, float lambda, int N, int M, int ImSize) { - + int i1,j1; float val1, val11, val2, denom_lam, denom; denom_lam = 1.0f/lambda; - + //calculate each thread global index const int i=blockIdx.x*blockDim.x+threadIdx.x; const int j=blockIdx.y*blockDim.y+threadIdx.y; - + int index = j*N+i; - + if ((i < N) && (j < M)) { i1 = i+1; if (i1 == N) i1 = i-1; j1 = j+1; if (j1 == M) j1 = j-1; - + val1 = (U[j*N+i1] - U[index]) + Bx[index]; val2 = (U[j1*N+i] - U[index]) + By[index]; - + denom = sqrt(val1*val1 + val2*val2); - + val11 = (denom - denom_lam); if (val11 < 0) val11 = 0.0f; - + if (denom != 0.0f) { Dx[index] = val11*(val1/denom); Dy[index] = val11*(val2/denom); @@ -146,20 +146,20 @@ __global__ void updDxDy_shrinkIso2D_kernel(float *U, float *Dx, float *Dy, float } __global__ void updBxBy2D_kernel(float *U, float *Dx, float *Dy, float *Bx, float *By, int N, int M, int ImSize) -{ +{ int i1,j1; - + //calculate each thread global index const int i=blockIdx.x*blockDim.x+threadIdx.x; const int j=blockIdx.y*blockDim.y+threadIdx.y; - + int index = j*N+i; - + if ((i < N) && (j < M)) { /* symmetric boundary conditions (Neuman) */ i1 = i+1; if (i1 == N) i1 = i-1; j1 = j+1; if (j1 == M) j1 = j-1; - + Bx[index] += (U[j*N+i1] - U[index]) - Dx[index]; By[index] += (U[j1*N+i] - U[index]) - Dy[index]; } @@ -172,17 +172,17 @@ __global__ void updBxBy2D_kernel(float *U, float *Dx, float *Dy, float *Bx, floa /************************************************/ __global__ void gauss_seidel3D_kernel(float *U, float *A, float *U_prev, float *Dx, float *Dy, float *Dz, float *Bx, float *By, float *Bz, float lambda, float mu, float normConst, int N, int M, int Z, int ImSize) { - + float sum,d_val,b_val; int i1,i2,j1,j2,k1,k2; - + //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { i1 = i+1; if (i1 == N) i1 = i-1; i2 = i-1; if (i2 < 0) i2 = i+1; @@ -190,7 +190,7 @@ __global__ void gauss_seidel3D_kernel(float *U, float *A, float *U_prev, float * j2 = j-1; if (j2 < 0) j2 = j+1; k1 = k+1; if (k1 == Z) k1 = k-1; k2 = k-1; if (k2 < 0) k2 = k+1; - + d_val = Dx[(N*M)*k + j*N+i2] - Dx[index] + Dy[(N*M)*k + j2*N+i] - Dy[index] + Dz[(N*M)*k2 + j*N+i] - Dz[index]; b_val = -Bx[(N*M)*k + j*N+i2] + Bx[index] - By[(N*M)*k + j2*N+i] + By[index] - Bz[(N*M)*k2 + j*N+i] + Bz[index]; sum = d_val + b_val; @@ -203,31 +203,31 @@ __global__ void gauss_seidel3D_kernel(float *U, float *A, float *U_prev, float * } __global__ void updDxDy_shrinkAniso3D_kernel(float *U, float *Dx, float *Dy, float *Dz, float *Bx, float *By, float *Bz, float lambda, int N, int M, int Z, int ImSize) { - + int i1,j1,k1; float val1, val11, val2, val3, val22, val33, denom_lam; denom_lam = 1.0f/lambda; - + //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { i1 = i+1; if (i1 == N) i1 = i-1; j1 = j+1; if (j1 == M) j1 = j-1; k1 = k+1; if (k1 == Z) k1 = k-1; - + val1 = (U[(N*M)*k + i1 + N*j] - U[index]) + Bx[index]; val2 = (U[(N*M)*k + i + N*j1] - U[index]) + By[index]; val3 = (U[(N*M)*k1 + i + N*j] - U[index]) + Bz[index]; - + val11 = abs(val1) - denom_lam; if (val11 < 0.0f) val11 = 0.0f; val22 = abs(val2) - denom_lam; if (val22 < 0.0f) val22 = 0.0f; val33 = abs(val3) - denom_lam; if (val33 < 0.0f) val33 = 0.0f; - + if (val1 !=0.0f) Dx[index] = (val1/abs(val1))*val11; else Dx[index] = 0.0f; if (val2 !=0.0f) Dy[index] = (val2/abs(val2))*val22; else Dy[index] = 0.0f; if (val3 !=0.0f) Dz[index] = (val3/abs(val3))*val33; else Dz[index] = 0.0f; @@ -237,31 +237,31 @@ __global__ void updDxDy_shrinkAniso3D_kernel(float *U, float *Dx, float *Dy, flo __global__ void updDxDy_shrinkIso3D_kernel(float *U, float *Dx, float *Dy, float *Dz, float *Bx, float *By, float *Bz, float lambda, int N, int M, int Z, int ImSize) { - + int i1,j1,k1; float val1, val11, val2, val3, denom_lam, denom; denom_lam = 1.0f/lambda; - + //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { i1 = i+1; if (i1 == N) i1 = i-1; j1 = j+1; if (j1 == M) j1 = j-1; k1 = k+1; if (k1 == Z) k1 = k-1; - + val1 = (U[(N*M)*k + i1 + N*j] - U[index]) + Bx[index]; val2 = (U[(N*M)*k + i + N*j1] - U[index]) + By[index]; val3 = (U[(N*M)*k1 + i + N*j] - U[index]) + Bz[index]; - + denom = sqrt(val1*val1 + val2*val2 + val3*val3); - + val11 = (denom - denom_lam); if (val11 < 0.0f) val11 = 0.0f; - + if (denom != 0.0f) { Dx[index] = val11*(val1/denom); Dy[index] = val11*(val2/denom); @@ -277,22 +277,22 @@ __global__ void updDxDy_shrinkIso3D_kernel(float *U, float *Dx, float *Dy, float } __global__ void updBxBy3D_kernel(float *U, float *Dx, float *Dy, float *Dz, float *Bx, float *By, float *Bz, int N, int M, int Z, int ImSize) -{ +{ int i1,j1,k1; - + //calculate each thread global index int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if ((i < N) && (j < M) && (k < Z)) { /* symmetric boundary conditions (Neuman) */ i1 = i+1; if (i1 == N) i1 = i-1; j1 = j+1; if (j1 == M) j1 = j-1; k1 = k+1; if (k1 == Z) k1 = k-1; - + Bx[index] += (U[(N*M)*k + i1 + N*j] - U[index]) - Dx[index]; By[index] += (U[(N*M)*k + i + N*j1] - U[index]) - Dy[index]; Bz[index] += (U[(N*M)*k1 + i + N*j] - U[index]) - Dz[index]; @@ -304,9 +304,9 @@ __global__ void SBcopy_kernel2D(float *Input, float* Output, int N, int M, int n { int xIndex = blockDim.x * blockIdx.x + threadIdx.x; int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - + int index = xIndex + N*yIndex; - + if (index < num_total) { Output[index] = Input[index]; } @@ -317,9 +317,9 @@ __global__ void SBcopy_kernel3D(float *Input, float* Output, int N, int M, int Z int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if (index < num_total) { Output[index] = Input[index]; } @@ -329,9 +329,9 @@ __global__ void SBResidCalc2D_kernel(float *Input1, float *Input2, float* Output { int xIndex = blockDim.x * blockIdx.x + threadIdx.x; int yIndex = blockDim.y * blockIdx.y + threadIdx.y; - + int index = xIndex + N*yIndex; - + if (index < num_total) { Output[index] = Input1[index] - Input2[index]; } @@ -342,9 +342,9 @@ __global__ void SBResidCalc3D_kernel(float *Input1, float *Input2, float* Output int i = blockDim.x * blockIdx.x + threadIdx.x; int j = blockDim.y * blockIdx.y + threadIdx.y; int k = blockDim.z * blockIdx.z + threadIdx.z; - + int index = (N*M)*k + i + N*j; - + if (index < num_total) { Output[index] = Input1[index] - Input2[index]; } @@ -353,7 +353,7 @@ __global__ void SBResidCalc3D_kernel(float *Input1, float *Input2, float* Output /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ /********************* MAIN HOST FUNCTION ******************/ /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ -extern "C" int TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ) +extern "C" int TV_SB_GPU_main(float *Input, float *Output, float *infovector, float mu, int iter, float epsil, int methodTV, int dimX, int dimY, int dimZ) { int deviceCount = -1; // number of devices cudaGetDeviceCount(&deviceCount); @@ -361,134 +361,111 @@ extern "C" int TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, f fprintf(stderr, "No CUDA devices found\n"); return -1; } - + int ll, DimTotal; float re, lambda, normConst; - int count = 0; - mu = 1.0f/mu; + re = 0.0f; + int count = 0; + mu = 1.0f/mu; lambda = 2.0f*mu; + DimTotal = dimX*dimY*dimZ; + float *d_input, *d_update, *d_res, *d_update_prev=NULL, *Dx=NULL, *Dy=NULL, *Bx=NULL, *By=NULL; + + /*allocate space for images on device*/ + checkCudaErrors( cudaMalloc((void**)&d_input,DimTotal*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update,DimTotal*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&d_update_prev,DimTotal*sizeof(float)) ); + if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_res,DimTotal*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&Dx,DimTotal*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&Dy,DimTotal*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&Bx,DimTotal*sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**)&By,DimTotal*sizeof(float)) ); + + checkCudaErrors( cudaMemcpy(d_input,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); + checkCudaErrors( cudaMemcpy(d_update,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); + cudaMemset(Dx, 0, DimTotal*sizeof(float)); + cudaMemset(Dy, 0, DimTotal*sizeof(float)); + cudaMemset(Bx, 0, DimTotal*sizeof(float)); + cudaMemset(By, 0, DimTotal*sizeof(float)); if (dimZ <= 1) { /*2D verson*/ - DimTotal = dimX*dimY; normConst = 1.0f/(mu + 4.0f*lambda); - float *d_input, *d_update, *d_res, *d_update_prev=NULL, *Dx=NULL, *Dy=NULL, *Bx=NULL, *By=NULL; - dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D); dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D)); - - /*allocate space for images on device*/ - checkCudaErrors( cudaMalloc((void**)&d_input,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&d_update,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&d_update_prev,DimTotal*sizeof(float)) ); - if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_res,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&Dx,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&Dy,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&Bx,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&By,DimTotal*sizeof(float)) ); - - checkCudaErrors( cudaMemcpy(d_input,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); - checkCudaErrors( cudaMemcpy(d_update,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); - cudaMemset(Dx, 0, DimTotal*sizeof(float)); - cudaMemset(Dy, 0, DimTotal*sizeof(float)); - cudaMemset(Bx, 0, DimTotal*sizeof(float)); - cudaMemset(By, 0, DimTotal*sizeof(float)); - - /********************** Run CUDA 2D kernels here ********************/ - /* The main kernel */ + /********************** Run CUDA 2D kernels here ********************/ for (ll = 0; ll < iter; ll++) { - + /* storing old value */ SBcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); + checkCudaErrors(cudaPeekAtLastError() ); /* perform two GS iterations (normally 2 is enough for the convergence) */ gauss_seidel2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, d_update_prev, Dx, Dy, Bx, By, lambda, mu, normConst, dimX, dimY, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); + checkCudaErrors(cudaPeekAtLastError() ); SBcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); + checkCudaErrors(cudaPeekAtLastError() ); /* 2nd GS iteration */ gauss_seidel2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, d_update_prev, Dx, Dy, Bx, By, lambda, mu, normConst, dimX, dimY, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + /* TV-related step */ - if (methodTV == 1) updDxDy_shrinkAniso2D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Bx, By, lambda, dimX, dimY, DimTotal); - else updDxDy_shrinkIso2D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Bx, By, lambda, dimX, dimY, DimTotal); - + if (methodTV == 1) updDxDy_shrinkAniso2D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Bx, By, lambda, dimX, dimY, DimTotal); + else updDxDy_shrinkIso2D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Bx, By, lambda, dimX, dimY, DimTotal); + /* update for Bregman variables */ updBxBy2D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Bx, By, dimX, dimY, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - if (epsil != 0.0f) { - /* calculate norm - stopping rules using the Thrust library */ - SBResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_res, dimX, dimY, DimTotal); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - -// thrust::device_vector<float> d_vec(d_res, d_res + DimTotal); - // float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>())); - // thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal); - // float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>())); - - // re = (reduction/reduction2); - // if (re < epsil) count++; - // if (count > 4) break; - } - - } - if (printM == 1) printf("SB-TV iterations stopped at iteration %i \n", ll); - /***************************************************************/ - //copy result matrix from device to host memory - cudaMemcpy(Output,d_update,DimTotal*sizeof(float),cudaMemcpyDeviceToHost); - - cudaFree(d_input); - cudaFree(d_update); - cudaFree(d_update_prev); - if (epsil != 0.0f) cudaFree(d_res); - cudaFree(Dx); - cudaFree(Dy); - cudaFree(Bx); - cudaFree(By); - } + checkCudaErrors(cudaPeekAtLastError() ); + + if ((epsil != 0.0f) && (ll % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + SBResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_res, dimX, dimY, DimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(d_res, d_res + DimTotal); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + + SBcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, DimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } + } + /***************************************************************/ + } else { /*3D verson*/ - DimTotal = dimX*dimY*dimZ; normConst = 1.0f/(mu + 6.0f*lambda); - float *d_input, *d_update, *d_res, *d_update_prev=NULL, *Dx=NULL, *Dy=NULL, *Dz=NULL, *Bx=NULL, *By=NULL, *Bz=NULL; - - dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); - dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE)); - + float *Dz=NULL, *Bz=NULL; + + dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE); + dim3 dimGrid(idivup(dimX,BLKXSIZE), idivup(dimY,BLKYSIZE),idivup(dimZ,BLKZSIZE)); + /*allocate space for images on device*/ - checkCudaErrors( cudaMalloc((void**)&d_input,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&d_update,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&d_update_prev,DimTotal*sizeof(float)) ); - if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_res,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&Dx,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&Dy,DimTotal*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&Dz,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&Bx,DimTotal*sizeof(float)) ); - checkCudaErrors( cudaMalloc((void**)&By,DimTotal*sizeof(float)) ); checkCudaErrors( cudaMalloc((void**)&Bz,DimTotal*sizeof(float)) ); - - checkCudaErrors( cudaMemcpy(d_input,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); - checkCudaErrors( cudaMemcpy(d_update,Input,DimTotal*sizeof(float),cudaMemcpyHostToDevice)); - cudaMemset(Dx, 0, DimTotal*sizeof(float)); - cudaMemset(Dy, 0, DimTotal*sizeof(float)); - cudaMemset(Dz, 0, DimTotal*sizeof(float)); - cudaMemset(Bx, 0, DimTotal*sizeof(float)); - cudaMemset(By, 0, DimTotal*sizeof(float)); - cudaMemset(Bz, 0, DimTotal*sizeof(float)); - - /********************** Run CUDA 3D kernels here ********************/ + + cudaMemset(Dz, 0, DimTotal*sizeof(float)); + cudaMemset(Bz, 0, DimTotal*sizeof(float)); + /********************** Run CUDA 3D kernels here ********************/ /* The main kernel */ for (ll = 0; ll < iter; ll++) { - + /* storing old value */ SBcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); @@ -497,56 +474,68 @@ extern "C" int TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, f /* perform two GS iterations (normally 2 is enough for the convergence) */ gauss_seidel3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, d_update_prev, Dx, Dy, Dz, Bx, By, Bz, lambda, mu, normConst, dimX, dimY, dimZ, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); + checkCudaErrors(cudaPeekAtLastError() ); SBcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); + checkCudaErrors(cudaPeekAtLastError() ); /* 2nd GS iteration */ gauss_seidel3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_input, d_update_prev, Dx, Dy, Dz, Bx, By, Bz, lambda, mu, normConst, dimX, dimY, dimZ, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - + checkCudaErrors(cudaPeekAtLastError() ); + /* TV-related step */ if (methodTV == 1) updDxDy_shrinkAniso3D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Dz, Bx, By, Bz, lambda, dimX, dimY, dimZ, DimTotal); else updDxDy_shrinkIso3D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Dz, Bx, By, Bz, lambda, dimX, dimY, dimZ, DimTotal); - + /* update for Bregman variables */ updBxBy3D_kernel<<<dimGrid,dimBlock>>>(d_update, Dx, Dy, Dz, Bx, By, Bz, dimX, dimY, dimZ, DimTotal); checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - if (epsil != 0.0f) { - /* calculate norm - stopping rules using the Thrust library */ - SBResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_res, dimX, dimY, dimZ, DimTotal); - checkCudaErrors( cudaDeviceSynchronize() ); - checkCudaErrors(cudaPeekAtLastError() ); - - // thrust::device_vector<float> d_vec(d_res, d_res + DimTotal); - // float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>())); - // thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal); -// float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>())); - - // re = (reduction/reduction2); - // if (re < epsil) count++; - // if (count > 4) break; - } + checkCudaErrors(cudaPeekAtLastError() ); + + if ((epsil != 0.0f) && (ll % 5 == 0)) { + /* calculate norm - stopping rules using the Thrust library */ + SBResidCalc3D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_res, dimX, dimY, dimZ, DimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + + // setup arguments + square<float> unary_op; + thrust::plus<float> binary_op; + thrust::device_vector<float> d_vec(d_res, d_res + DimTotal); + float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op)); + thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal); + float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op)); + + // compute norm + re = (reduction/reduction2); + if (re < epsil) count++; + if (count > 3) break; + + SBcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, DimTotal); + checkCudaErrors( cudaDeviceSynchronize() ); + checkCudaErrors(cudaPeekAtLastError() ); + } } - if (printM == 1) printf("SB-TV iterations stopped at iteration %i \n", ll); - /***************************************************************/ - //copy result matrix from device to host memory - cudaMemcpy(Output,d_update,DimTotal*sizeof(float),cudaMemcpyDeviceToHost); - - cudaFree(d_input); - cudaFree(d_update); - cudaFree(d_update_prev); - if (epsil != 0.0f) cudaFree(d_res); - cudaFree(Dx); - cudaFree(Dy); - cudaFree(Dz); - cudaFree(Bx); - cudaFree(By); - cudaFree(Bz); - } - //cudaDeviceReset(); + cudaFree(Dz); + cudaFree(Bz); + + } /**************************end else (3D)********************************/ + + //copy result matrix from device to host memory + cudaMemcpy(Output,d_update,DimTotal*sizeof(float),cudaMemcpyDeviceToHost); + + cudaFree(d_input); + cudaFree(d_update); + cudaFree(d_update_prev); + if (epsil != 0.0f) cudaFree(d_res); + cudaFree(Dx); + cudaFree(Dy); + cudaFree(Bx); + cudaFree(By); + + /*adding info into info_vector */ + infovector[0] = (float)(ll); /*iterations number (if stopped earlier based on tolerance)*/ + infovector[1] = re; /* reached tolerance */ + return 0; } diff --git a/src/Core/regularisers_GPU/TV_SB_GPU_core.h b/src/Core/regularisers_GPU/TV_SB_GPU_core.h index 901b90f..1eaa6af 100755 --- a/src/Core/regularisers_GPU/TV_SB_GPU_core.h +++ b/src/Core/regularisers_GPU/TV_SB_GPU_core.h @@ -5,6 +5,6 @@ #include <memory.h> -extern "C" CCPI_EXPORT int TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ); +extern "C" CCPI_EXPORT int TV_SB_GPU_main(float *Input, float *Output, float *infovector, float mu, int iter, float epsil, int methodTV, int dimX, int dimY, int dimZ); -#endif +#endif diff --git a/src/Python/ccpi/filters/regularisers.py b/src/Python/ccpi/filters/regularisers.py index 05120af..09b465a 100644 --- a/src/Python/ccpi/filters/regularisers.py +++ b/src/Python/ccpi/filters/regularisers.py @@ -52,21 +52,30 @@ def FGP_TV(inputData, regularisation_parameter,iterations, raise ValueError('Unknown device {0}. Expecting gpu or cpu'\ .format(device)) def SB_TV(inputData, regularisation_parameter, iterations, - tolerance_param, methodTV, printM, device='cpu'): + tolerance_param, methodTV, device='cpu'): if device == 'cpu': return TV_SB_CPU(inputData, regularisation_parameter, iterations, tolerance_param, - methodTV, - printM) + methodTV) elif device == 'gpu' and gpu_enabled: return TV_SB_GPU(inputData, regularisation_parameter, iterations, tolerance_param, - methodTV, - printM) + methodTV) + else: + if not gpu_enabled and device == 'gpu': + raise ValueError ('GPU is not available') + raise ValueError('Unknown device {0}. Expecting gpu or cpu'\ + .format(device)) +def LLT_ROF(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, + time_marching_parameter, tolerance_param, device='cpu'): + if device == 'cpu': + return LLT_ROF_CPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param) + elif device == 'gpu' and gpu_enabled: + return LLT_ROF_GPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param) else: if not gpu_enabled and device == 'gpu': raise ValueError ('GPU is not available') @@ -194,17 +203,6 @@ def TGV(inputData, regularisation_parameter, alpha1, alpha0, iterations, raise ValueError ('GPU is not available') raise ValueError('Unknown device {0}. Expecting gpu or cpu'\ .format(device)) -def LLT_ROF(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, - time_marching_parameter, device='cpu'): - if device == 'cpu': - return LLT_ROF_CPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter) - elif device == 'gpu' and gpu_enabled: - return LLT_ROF_GPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter) - else: - if not gpu_enabled and device == 'gpu': - raise ValueError ('GPU is not available') - raise ValueError('Unknown device {0}. Expecting gpu or cpu'\ - .format(device)) def NDF_INP(inputData, maskData, regularisation_parameter, edge_parameter, iterations, time_marching_parameter, penalty_type): return NDF_INPAINT_CPU(inputData, maskData, regularisation_parameter, diff --git a/src/Python/src/cpu_regularisers.pyx b/src/Python/src/cpu_regularisers.pyx index aeca141..f2276bb 100644 --- a/src/Python/src/cpu_regularisers.pyx +++ b/src/Python/src/cpu_regularisers.pyx @@ -20,8 +20,8 @@ cimport numpy as np cdef extern float TV_ROF_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float tau, float epsil, int dimX, int dimY, int dimZ); cdef extern float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ); -cdef extern float SB_TV_CPU_main(float *Input, float *Output, float lambdaPar, int iterationsNumb, float epsil, int methodTV, int printM, int dimX, int dimY, int dimZ); -cdef extern float LLT_ROF_CPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int dimX, int dimY, int dimZ); +cdef extern float SB_TV_CPU_main(float *Input, float *Output, float *infovector, float mu, int iter, float epsil, int methodTV, int dimX, int dimY, int dimZ); +cdef extern float LLT_ROF_CPU_main(float *Input, float *Output, float *infovector, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, float epsil, int dimX, int dimY, int dimZ); cdef extern float TGV_main(float *Input, float *Output, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY, int dimZ); cdef extern float Diffusion_CPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int dimX, int dimY, int dimZ); cdef extern float Diffus4th_CPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int dimX, int dimY, int dimZ); @@ -149,18 +149,17 @@ def TV_FGP_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, #********************** Total-variation SB *********************# #***************************************************************# #*************** Total-variation Split Bregman (SB)*************# -def TV_SB_CPU(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, printM): +def TV_SB_CPU(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV): if inputData.ndim == 2: - return TV_SB_2D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, printM) + return TV_SB_2D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV) elif inputData.ndim == 3: - return TV_SB_3D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV, printM) + return TV_SB_3D(inputData, regularisation_parameter, iterationsNumb, tolerance_param, methodTV) def TV_SB_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, float regularisation_parameter, int iterationsNumb, float tolerance_param, - int methodTV, - int printM): + int methodTV): cdef long dims[2] dims[0] = inputData.shape[0] @@ -168,23 +167,24 @@ def TV_SB_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') #/* Run SB-TV iterations for 2D data */ - SB_TV_CPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameter, + SB_TV_CPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], + regularisation_parameter, iterationsNumb, tolerance_param, methodTV, - printM, - dims[1],dims[0],1) + dims[1],dims[0], 1) - return outputData + return (outputData,infovec) def TV_SB_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, float regularisation_parameter, int iterationsNumb, float tolerance_param, - int methodTV, - int printM): + int methodTV): cdef long dims[3] dims[0] = inputData.shape[0] dims[1] = inputData.shape[1] @@ -192,16 +192,71 @@ def TV_SB_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ np.zeros([dims[0], dims[1], dims[2]], dtype='float32') - + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + #/* Run SB-TV iterations for 3D data */ - SB_TV_CPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameter, + SB_TV_CPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], + regularisation_parameter, iterationsNumb, tolerance_param, methodTV, - printM, dims[2], dims[1], dims[0]) - return outputData + return (outputData,infovec) +#***************************************************************# +#******************* ROF - LLT regularisation ******************# +#***************************************************************# +def LLT_ROF_CPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param): + if inputData.ndim == 2: + return LLT_ROF_2D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param) + elif inputData.ndim == 3: + return LLT_ROF_3D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param) +def LLT_ROF_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, + float regularisation_parameterROF, + float regularisation_parameterLLT, + int iterations, + float time_marching_parameter, + float tolerance_param): + + cdef long dims[2] + dims[0] = inputData.shape[0] + dims[1] = inputData.shape[1] + + cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ + np.zeros([dims[0],dims[1]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + + #/* Run ROF-LLT iterations for 2D data */ + LLT_ROF_CPU_main(&inputData[0,0], &outputData[0,0], &infovec[0], regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, + tolerance_param, + dims[1],dims[0],1) + return (outputData,infovec) + +def LLT_ROF_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, + float regularisation_parameterROF, + float regularisation_parameterLLT, + int iterations, + float time_marching_parameter, + float tolerance_param): + + cdef long dims[3] + dims[0] = inputData.shape[0] + dims[1] = inputData.shape[1] + dims[2] = inputData.shape[2] + + cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ + np.zeros([dims[0], dims[1], dims[2]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.zeros([2], dtype='float32') + + #/* Run ROF-LLT iterations for 3D data */ + LLT_ROF_CPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], regularisation_parameterROF, regularisation_parameterLLT, iterations, + time_marching_parameter, + tolerance_param, + dims[2], dims[1], dims[0]) + return (outputData,infovec) #***************************************************************# #***************** Total Generalised Variation *****************# #***************************************************************# @@ -259,49 +314,6 @@ def TGV_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, dims[2], dims[1], dims[0]) return outputData -#***************************************************************# -#******************* ROF - LLT regularisation ******************# -#***************************************************************# -def LLT_ROF_CPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter): - if inputData.ndim == 2: - return LLT_ROF_2D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter) - elif inputData.ndim == 3: - return LLT_ROF_3D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter) - -def LLT_ROF_2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, - float regularisation_parameterROF, - float regularisation_parameterLLT, - int iterations, - float time_marching_parameter): - - cdef long dims[2] - dims[0] = inputData.shape[0] - dims[1] = inputData.shape[1] - - cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ - np.zeros([dims[0],dims[1]], dtype='float32') - - #/* Run ROF-LLT iterations for 2D data */ - LLT_ROF_CPU_main(&inputData[0,0], &outputData[0,0], regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, dims[1],dims[0],1) - return outputData - -def LLT_ROF_3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, - float regularisation_parameterROF, - float regularisation_parameterLLT, - int iterations, - float time_marching_parameter): - - cdef long dims[3] - dims[0] = inputData.shape[0] - dims[1] = inputData.shape[1] - dims[2] = inputData.shape[2] - - cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ - np.zeros([dims[0], dims[1], dims[2]], dtype='float32') - - #/* Run ROF-LLT iterations for 3D data */ - LLT_ROF_CPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, dims[2], dims[1], dims[0]) - return outputData #****************************************************************# #**************Directional Total-variation FGP ******************# diff --git a/src/Python/src/gpu_regularisers.pyx b/src/Python/src/gpu_regularisers.pyx index db4fa67..6fc4135 100644 --- a/src/Python/src/gpu_regularisers.pyx +++ b/src/Python/src/gpu_regularisers.pyx @@ -22,9 +22,9 @@ CUDAErrorMessage = 'CUDA error' cdef extern int TV_ROF_GPU_main(float* Input, float* Output, float *infovector, float lambdaPar, int iter, float tau, float epsil, int N, int M, int Z); cdef extern int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int N, int M, int Z); -cdef extern int TV_SB_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int printM, int N, int M, int Z); +cdef extern int TV_SB_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int N, int M, int Z); +cdef extern int LLT_ROF_GPU_main(float *Input, float *Output, float *infovector, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, float epsil, int N, int M, int Z); cdef extern int TGV_GPU_main(float *Input, float *Output, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY, int dimZ); -cdef extern int LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int N, int M, int Z); cdef extern int NonlDiff_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int penaltytype, int N, int M, int Z); cdef extern int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, float lambdaPar, int iterationsNumb, float epsil, float eta, int methodTV, int nonneg, int printM, int N, int M, int Z); cdef extern int Diffus4th_GPU_main(float *Input, float *Output, float lambdaPar, float sigmaPar, int iterationsNumb, float tau, int N, int M, int Z); @@ -75,28 +75,25 @@ def TV_SB_GPU(inputData, regularisation_parameter, iterations, tolerance_param, - methodTV, - printM): + methodTV): if inputData.ndim == 2: return SBTV2D(inputData, regularisation_parameter, iterations, tolerance_param, - methodTV, - printM) + methodTV) elif inputData.ndim == 3: return SBTV3D(inputData, regularisation_parameter, iterations, tolerance_param, - methodTV, - printM) + methodTV) # LLT-ROF model -def LLT_ROF_GPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter): +def LLT_ROF_GPU(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param): if inputData.ndim == 2: - return LLT_ROF_GPU2D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter) + return LLT_ROF_GPU2D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param) elif inputData.ndim == 3: - return LLT_ROF_GPU3D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter) + return LLT_ROF_GPU3D(inputData, regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, tolerance_param) # Total Generilised Variation (TGV) def TGV_GPU(inputData, regularisation_parameter, alpha1, alpha0, iterations, LipshitzConst): if inputData.ndim == 2: @@ -300,8 +297,7 @@ def SBTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, float regularisation_parameter, int iterations, float tolerance_param, - int methodTV, - int printM): + int methodTV): cdef long dims[2] dims[0] = inputData.shape[0] @@ -309,16 +305,17 @@ def SBTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') - # Running CUDA code here - if (TV_SB_GPU_main(&inputData[0,0], &outputData[0,0], + # Running CUDA code here + if (TV_SB_GPU_main(&inputData[0,0], &outputData[0,0],&infovec[0], regularisation_parameter, iterations, tolerance_param, methodTV, - printM, dims[1], dims[0], 1)==0): - return outputData; + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); @@ -327,8 +324,7 @@ def SBTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, float regularisation_parameter, int iterations, float tolerance_param, - int methodTV, - int printM): + int methodTV): cdef long dims[3] dims[0] = inputData.shape[0] @@ -337,16 +333,17 @@ def SBTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ np.zeros([dims[0],dims[1],dims[2]], dtype='float32') - + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + # Running CUDA code here - if (TV_SB_GPU_main(&inputData[0,0,0], &outputData[0,0,0], + if (TV_SB_GPU_main(&inputData[0,0,0], &outputData[0,0,0],&infovec[0], regularisation_parameter , iterations, tolerance_param, methodTV, - printM, dims[2], dims[1], dims[0])==0): - return outputData; + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); @@ -359,7 +356,8 @@ def LLT_ROF_GPU2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, float regularisation_parameterROF, float regularisation_parameterLLT, int iterations, - float time_marching_parameter): + float time_marching_parameter, + float tolerance_param): cdef long dims[2] dims[0] = inputData.shape[0] @@ -367,10 +365,15 @@ def LLT_ROF_GPU2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \ np.zeros([dims[0],dims[1]], dtype='float32') - + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') + # Running CUDA code here - if (LLT_ROF_GPU_main(&inputData[0,0], &outputData[0,0],regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, dims[1],dims[0],1)==0): - return outputData; + if (LLT_ROF_GPU_main(&inputData[0,0], &outputData[0,0],&infovec[0],regularisation_parameterROF, regularisation_parameterLLT, iterations, + time_marching_parameter, + tolerance_param, + dims[1],dims[0],1)==0): + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); @@ -379,7 +382,8 @@ def LLT_ROF_GPU3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, float regularisation_parameterROF, float regularisation_parameterLLT, int iterations, - float time_marching_parameter): + float time_marching_parameter, + float tolerance_param): cdef long dims[3] dims[0] = inputData.shape[0] @@ -388,10 +392,16 @@ def LLT_ROF_GPU3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \ np.zeros([dims[0],dims[1],dims[2]], dtype='float32') + cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \ + np.ones([2], dtype='float32') - # Running CUDA code here - if (LLT_ROF_GPU_main(&inputData[0,0,0], &outputData[0,0,0], regularisation_parameterROF, regularisation_parameterLLT, iterations, time_marching_parameter, dims[2], dims[1], dims[0])==0): - return outputData; + # Running CUDA code here + if (LLT_ROF_GPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0], regularisation_parameterROF, regularisation_parameterLLT, + iterations, + time_marching_parameter, + tolerance_param, + dims[2], dims[1], dims[0])==0): + return (outputData,infovec) else: raise ValueError(CUDAErrorMessage); |