16 files changed, 280 insertions, 188 deletions
diff --git a/demos/demo_cpu_regularisers.py b/demos/demo_cpu_regularisers.py
index 32b97ce..4866811 100644
--- a/demos/demo_cpu_regularisers.py
+++ b/demos/demo_cpu_regularisers.py
@@ -32,7 +32,7 @@ def printParametersToString(pars):
 ###############################################################################
 
 #filename = os.path.join( "data" ,"lena_gray_512.tif")
-filename = "/home/algol/Documents/DEV/CCPi-Regularisation-Toolkit/test/lena_gray_512.tif"
+filename = "/home/kjy41806/Documents/SOFT/CCPi-Regularisation-Toolkit/test/lena_gray_512.tif"
 
 # read image
 Im = plt.imread(filename)
@@ -86,13 +86,13 @@ imgplot = plt.imshow(u0,cmap="gray")
 pars = {'algorithm': ROF_TV, \
         'input' : u0,\
         'regularisation_parameter':0.02,\
-        'number_of_iterations': 5000,\
-        'time_marching_parameter': 0.000385,\
+        'number_of_iterations': 1000,\
+        'time_marching_parameter': 0.001,\
         'tolerance_constant':1e-06}
 
 print ("#############ROF TV CPU####################")
 start_time = timeit.default_timer()
-(rof_cpu,info_vec) = ROF_TV(pars['input'],
+(rof_cpu,info_vec_cpu) = ROF_TV(pars['input'],
              pars['regularisation_parameter'],
              pars['number_of_iterations'],
              pars['time_marching_parameter'],
@@ -130,14 +130,14 @@ imgplot = plt.imshow(u0,cmap="gray")
 pars = {'algorithm' : FGP_TV, \
         'input' : u0,\
         'regularisation_parameter':0.02, \
-        'number_of_iterations' :800 ,\
+        'number_of_iterations' :200 ,\
         'tolerance_constant':1e-06,\
         'methodTV': 0 ,\
         'nonneg': 0}
         
 print ("#############FGP TV CPU####################")
 start_time = timeit.default_timer()
-fgp_cpu,info_vec = FGP_TV(pars['input'], 
+fgp_cpu,info_vec_cpu = FGP_TV(pars['input'], 
               pars['regularisation_parameter'],
               pars['number_of_iterations'],
               pars['tolerance_constant'], 
diff --git a/demos/demo_gpu_regularisers.py b/demos/demo_gpu_regularisers.py
index 89bb948..212ad5a 100644
--- a/demos/demo_gpu_regularisers.py
+++ b/demos/demo_gpu_regularisers.py
@@ -83,16 +83,18 @@ imgplot = plt.imshow(u0,cmap="gray")
 # set parameters
 pars = {'algorithm': ROF_TV, \
         'input' : u0,\
-        'regularisation_parameter':0.04,\
-        'number_of_iterations': 1200,\
-        'time_marching_parameter': 0.0025
-        }
+        'regularisation_parameter':0.02,\
+        'number_of_iterations': 5000,\
+        'time_marching_parameter': 0.001,\
+        'tolerance_constant':1e-06}
+
 print ("##############ROF TV GPU##################")
 start_time = timeit.default_timer()
-rof_gpu = ROF_TV(pars['input'], 
-                     pars['regularisation_parameter'],
-                     pars['number_of_iterations'], 
-                     pars['time_marching_parameter'],'gpu')
+(rof_gpu, info_vec_gpu) = ROF_TV(pars['input'],
+             pars['regularisation_parameter'],
+             pars['number_of_iterations'],
+             pars['time_marching_parameter'],
+             pars['tolerance_constant'], 'gpu')
 
 Qtools = QualityTools(Im, rof_gpu)
 pars['rmse'] = Qtools.rmse()
@@ -125,23 +127,20 @@ imgplot = plt.imshow(u0,cmap="gray")
 # set parameters
 pars = {'algorithm' : FGP_TV, \
         'input' : u0,\
-        'regularisation_parameter':0.04, \
-        'number_of_iterations' :1200 ,\
+        'regularisation_parameter':0.02, \
+        'number_of_iterations' :300 ,\
         'tolerance_constant':1e-06,\
         'methodTV': 0 ,\
-        'nonneg': 0 ,\
-        'printingOut': 0 
-        }
+        'nonneg': 0}
 
 print ("##############FGP TV GPU##################")
 start_time = timeit.default_timer()
-fgp_gpu = FGP_TV(pars['input'], 
+(fgp_gpu, info_vec_gpu) = FGP_TV(pars['input'], 
               pars['regularisation_parameter'],
               pars['number_of_iterations'],
               pars['tolerance_constant'], 
               pars['methodTV'],
-              pars['nonneg'],
-              pars['printingOut'],'gpu')
+              pars['nonneg'],'gpu')
 Qtools = QualityTools(Im, fgp_gpu)
 pars['rmse'] = Qtools.rmse()
 pars['algorithm'] = FGP_TV
@@ -157,7 +156,6 @@ a.text(0.15, 0.25, txtstr, transform=a.transAxes, fontsize=14,
          verticalalignment='top', bbox=props)
 imgplot = plt.imshow(fgp_gpu, cmap="gray")
 plt.title('{}'.format('GPU results'))
-
 #%%
 print ("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
 print ("____________SB-TV regulariser______________")
diff --git a/run.sh b/run.sh
index c5c396a..b2be0d6 100755
--- a/run.sh
+++ b/run.sh
@@ -8,17 +8,17 @@ cd build_proj/
 #make clean
 export CIL_VERSION=19.03
 # install Python modules without CUDA
-cmake ../ -DBUILD_PYTHON_WRAPPER=ON -DBUILD_MATLAB_WRAPPER=OFF -DBUILD_CUDA=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./install
+# cmake ../ -DBUILD_PYTHON_WRAPPER=ON -DBUILD_MATLAB_WRAPPER=OFF -DBUILD_CUDA=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./install
 # install Python modules with CUDA
-# cmake ../ -DBUILD_PYTHON_WRAPPER=ON -DBUILD_MATLAB_WRAPPER=OFF -DBUILD_CUDA=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./install
+cmake ../ -DBUILD_PYTHON_WRAPPER=ON -DBUILD_MATLAB_WRAPPER=OFF -DBUILD_CUDA=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./install
 # install Matlab modules without CUDA
-# cmake ../ -DBUILD_PYTHON_WRAPPER=OFF -DMatlab_ROOT_DIR=/dls_sw/apps/matlab/r2014a/ -DBUILD_MATLAB_WRAPPER=ON -DBUILD_CUDA=OFF -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./install
+# cmake ../ -DBUILD_PYTHON_WRAPPER=OFF -DMatlab_ROOT_DIR=/dls_sw/apps/matlab/r2014a/ -DBUILD_MATLAB_WRAPPER=ON -DBUILD_CUDA=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./install
 # install Matlab modules with CUDA
 # cmake ../ -DBUILD_PYTHON_WRAPPER=OFF -DMatlab_ROOT_DIR=/dls_sw/apps/matlab/r2014a/ -DBUILD_MATLAB_WRAPPER=ON -DBUILD_CUDA=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=./install
 make install
 #### Python
 #cp install/lib/libcilreg.so install/python/ccpi/filters
-cd install/python
+# cd install/python
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../lib
 spyder
 ##### Matlab (Linux)
diff --git a/src/Core/regularisers_CPU/FGP_TV_core.c b/src/Core/regularisers_CPU/FGP_TV_core.c
index 3c3cbd4..a6bb7e0 100644
--- a/src/Core/regularisers_CPU/FGP_TV_core.c
+++ b/src/Core/regularisers_CPU/FGP_TV_core.c
@@ -86,15 +86,17 @@ float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lamb
             
             /* check early stopping criteria */
             if (epsil != 0.0f) {
+            re = 0.0f; re1 = 0.0f;
 	            for(j=0; j<DimTotal; j++)
         	    {
-        	        re += pow(Output[j] - Output_prev[j],2);
-        	        re1 += pow(Output[j],2);
+        	        re += powf(Output[j] - Output_prev[j],2);
+        	        re1 += powf(Output[j],2);
         	    }
-              re = sqrt(re)/sqrt(re1);
-              if (re < epsil)  count++;
-              if (count > 4) break;         
-            copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l);
+           re = sqrtf(re)/sqrtf(re1);
+           if (re < epsil)  count++;
+           if (count > 4) break;         
+           
+           copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), 1l);
             }
      }        	
 	if (epsil != 0.0f) free(Output_prev); 	
@@ -137,12 +139,13 @@ float TV_FGP_CPU_main(float *Input, float *Output, float *infovector, float lamb
             
             /* calculate norm - stopping rules*/
             if (epsil != 0.0f) {
+            re = 0.0f; re1 = 0.0f;
 	            for(j=0; j<DimTotal; j++)
         	    {
-        	        re += pow(Output[j] - Output_prev[j],2);
-        	        re1 += pow(Output[j],2);
+        	        re += powf(Output[j] - Output_prev[j],2);
+        	        re1 += powf(Output[j],2);
         	    }
-            re = sqrt(re)/sqrt(re1);
+            re = sqrtf(re)/sqrtf(re1);
             /* stop if the norm residual is less than the tolerance EPS */
             if (re < epsil)  count++;
             if (count > 4) break; 
diff --git a/src/Core/regularisers_CPU/ROF_TV_core.c b/src/Core/regularisers_CPU/ROF_TV_core.c
index 1b7cf76..a7291d7 100644
--- a/src/Core/regularisers_CPU/ROF_TV_core.c
+++ b/src/Core/regularisers_CPU/ROF_TV_core.c
@@ -19,7 +19,7 @@
 
 #include "ROF_TV_core.h"
 
-#define EPS 1.0e-15
+#define EPS 1.0e-8
 #define MAX(x, y) (((x) > (y)) ? (x) : (y))
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 
@@ -76,12 +76,13 @@ float TV_ROF_CPU_main(float *Input, float *Output, float *infovector, float lamb
             
             /* check early stopping criteria */
             if (epsil != 0.0f) {
+            re = 0.0f; re1 = 0.0f;
 	            for(j=0; j<DimTotal; j++)
         	    {
-        	        re += pow(Output[j] - Output_prev[j],2);
-        	        re1 += pow(Output[j],2);
+        	        re += powf(Output[j] - Output_prev[j],2);
+        	        re1 += powf(Output[j],2);
         	    }
-              re = sqrt(re)/sqrt(re1);
+              re = sqrtf(re)/sqrtf(re1);
               if (re < epsil)  count++;
               if (count > 4) break;         
             copyIm(Output, Output_prev, (long)(dimX), (long)(dimY), (long)(dimZ));
diff --git a/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu b/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu
index b371c5d..96b1173 100755
--- a/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu
+++ b/src/Core/regularisers_GPU/TV_FGP_GPU_core.cu
@@ -19,6 +19,7 @@ limitations under the License.
 
 #include "TV_FGP_GPU_core.h"
 #include "shared.h"
+#include <thrust/functional.h>
 #include <thrust/device_vector.h>
 #include <thrust/transform_reduce.h>
 
@@ -31,10 +32,10 @@ limitations under the License.
  * 4. eplsilon: tolerance constant 
  * 5. TV-type: methodTV - 'iso' (0) or 'l1' (1)
  * 6. nonneg: 'nonnegativity (0 is OFF by default) 
- * 7. print information: 0 (off) or 1 (on) 
  *
  * Output:
- * [1] Filtered/regularized image
+ * [1] Filtered/regularized image/volume
+ * [2] Information vector which contains [iteration no., reached tolerance]
  *
  * This function is based on the Matlab's code and paper by
  * [1] Amir Beck and Marc Teboulle, "Fast Gradient-Based Algorithms for Constrained Total Variation Image Denoising and Deblurring Problems"
@@ -49,7 +50,7 @@ limitations under the License.
 #define BLKZSIZE 8
 
 #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
-struct square { __host__ __device__ float operator()(float x) { return x * x; } };
+// struct square { __host__ __device__ float operator()(float x) { return x * x; } };
 
 /************************************************/
 /*****************2D modules*********************/
@@ -343,7 +344,7 @@ __global__ void FGPResidCalc3D_kernel(float *Input1, float *Input2, float* Outpu
 /*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/
 
 ////////////MAIN HOST FUNCTION ///////////////
-extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ)
+extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ)
 {
     int deviceCount = -1; // number of devices
     cudaGetDeviceCount(&deviceCount);
@@ -354,20 +355,21 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int
     
     int count = 0, i;
     float re, multip,multip2;    
-	float tk = 1.0f;
+    re = 0.0f;
+    float tk = 1.0f;
     float tkp1=1.0f;
         
     if (dimZ <= 1) {
 		/*2D verson*/
-		int ImSize = dimX*dimY;    
-		float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;
+      int ImSize = dimX*dimY;    
+      float *d_input, *d_update=NULL, *d_update_prev=NULL, *P1=NULL, *P2=NULL, *P1_prev=NULL, *P2_prev=NULL, *R1=NULL, *R2=NULL;
    
-		dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
-		dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D));
+      dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
+      dim3 dimGrid(idivup(dimX,BLKXSIZE2D), idivup(dimY,BLKYSIZE2D));
     
 		/*allocate space for images on device*/
-		checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
-		checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );
+       checkCudaErrors( cudaMalloc((void**)&d_input,ImSize*sizeof(float)) );
+       checkCudaErrors( cudaMalloc((void**)&d_update,ImSize*sizeof(float)) );
 		if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) );
 		checkCudaErrors( cudaMalloc((void**)&P1,ImSize*sizeof(float)) );
 		checkCudaErrors( cudaMalloc((void**)&P2,ImSize*sizeof(float)) );
@@ -416,40 +418,43 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int
         
             Rupd_func2D_kernel<<<dimGrid,dimBlock>>>(P1, P1_prev, P2, P2_prev, R1, R2, tkp1, tk, multip2, dimX, dimY, ImSize);
             checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );                
+        
+            FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
             checkCudaErrors(cudaPeekAtLastError() );
         
+            FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize);
+            checkCudaErrors( cudaDeviceSynchronize() );
+            checkCudaErrors(cudaPeekAtLastError() );       
+ 
+            tk = tkp1;
+            
             if (epsil != 0.0f) {
                 /* calculate norm - stopping rules using the Thrust library */
-                FGPResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1_prev, dimX, dimY, ImSize);
+                FGPResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, P1, dimX, dimY, ImSize);
                 checkCudaErrors( cudaDeviceSynchronize() );
                 checkCudaErrors(cudaPeekAtLastError() );               
                 
-                thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize); 
-                float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
+                // setup arguments
+		square<float>        unary_op;
+		thrust::plus<float> binary_op;
+                thrust::device_vector<float> d_vec(P1, P1 + ImSize); 
+		float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op));                
                 thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);
-                float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
-                    
+      		float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op));      		
+                   
+                // compute norm
                 re = (reduction/reduction2);      
                 if (re < epsil)  count++;
-                    if (count > 4) break;       
+                if (count > 4) break;       
              
                 FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize);
                 checkCudaErrors( cudaDeviceSynchronize() );
                 checkCudaErrors(cudaPeekAtLastError() );                                              
-            }                  
-        
-            FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(P1, P1_prev, dimX, dimY, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );
-        
-            FGPcopy_kernel2D<<<dimGrid,dimBlock>>>(P2, P2_prev, dimX, dimY, ImSize);
-            checkCudaErrors( cudaDeviceSynchronize() );
-            checkCudaErrors(cudaPeekAtLastError() );       
- 
-            tk = tkp1;
-        }
-        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);   
-            /***************************************************************/    
+            }
+            
+        }	      
             //copy result matrix from device to host memory
             cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
     
@@ -542,7 +547,6 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int
  
             tk = tkp1;
         }
-        if (printM == 1) printf("FGP-TV iterations stopped at iteration %i \n", i);   
             /***************************************************************/    
             //copy result matrix from device to host memory
             cudaMemcpy(Output,d_update,ImSize*sizeof(float),cudaMemcpyDeviceToHost);
@@ -560,5 +564,8 @@ extern "C" int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int
             cudaFree(R3);        
     } 
     //cudaDeviceReset();
+    /*adding info into info_vector */
+    infovector[0] = (float)(i);  /*iterations number (if stopped earlier based on tolerance)*/
+    infovector[1] = re;  /* reached tolerance */
     return 0;
 }
diff --git a/src/Core/regularisers_GPU/TV_FGP_GPU_core.h b/src/Core/regularisers_GPU/TV_FGP_GPU_core.h
index bf13508..597b6c6 100755
--- a/src/Core/regularisers_GPU/TV_FGP_GPU_core.h
+++ b/src/Core/regularisers_GPU/TV_FGP_GPU_core.h
@@ -4,6 +4,6 @@
 #include "CCPiDefines.h"
 #include <memory.h>
 
-extern "C" CCPI_EXPORT int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int dimX, int dimY, int dimZ);
+extern "C" CCPI_EXPORT int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int dimX, int dimY, int dimZ);
 
 #endif 
diff --git a/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu b/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu
index 76f5be9..97f2351 100755
--- a/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu
+++ b/src/Core/regularisers_GPU/TV_ROF_GPU_core.cu
@@ -3,8 +3,8 @@ This work is part of the Core Imaging Library developed by
 Visual Analytics and Imaging System Group of the Science Technology
 Facilities Council, STFC
 
-Copyright 2017 Daniil Kazantsev
-Copyright 2017 Srikanth Nagella, Edoardo Pasca
+Copyright 2019 Daniil Kazantsev
+Copyright 2019 Srikanth Nagella, Edoardo Pasca
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -18,6 +18,10 @@ limitations under the License.
 */ 
 
 #include "TV_ROF_GPU_core.h"
+#include "shared.h"
+#include <thrust/functional.h>
+#include <thrust/device_vector.h>
+#include <thrust/transform_reduce.h>
 
 /* C-OMP implementation of ROF-TV denoising/regularization model [1] (2D/3D case)
 *
@@ -26,16 +30,16 @@ limitations under the License.
 * 2. lambda - regularization parameter [REQUIRED]
 * 3. tau - marching step for explicit scheme, ~0.1 is recommended [REQUIRED]
 * 4. Number of iterations, for explicit scheme >= 150 is recommended [REQUIRED]
-*
-* Output:
-* [1] Regularized image/volume
+* 5. eplsilon: tolerance constant 
 
+ * Output:
+ * [1] Filtered/regularized image/volume
+ * [2] Information vector which contains [iteration no., reached tolerance]
+ 
+ 
  * This function is based on the paper by
 * [1] Rudin, Osher, Fatemi, "Nonlinear Total Variation based noise removal algorithms"
-*
-* D. Kazantsev, 2016-18
 */
-#include "shared.h"
     
 #define BLKXSIZE 8
 #define BLKYSIZE 8
@@ -43,7 +47,7 @@ limitations under the License.
     
 #define BLKXSIZE2D 16
 #define BLKYSIZE2D 16
-#define EPS 1.0e-12
+#define EPS 1.0e-8
     
 #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
 
@@ -136,7 +140,7 @@ __host__ __device__ int sign (float x)
                 dv1 = D1[index] - D1[j2*N + i];
                 dv2 = D2[index] - D2[j*N + i2];
                 
-                Update[index] += tau*(2.0f*lambda*(dv1 + dv2) - (Update[index] - Input[index]));      
+                Update[index] += tau*(lambda*(dv1 + dv2) - (Update[index] - Input[index]));      
 		
 		}  
 	}   
@@ -286,42 +290,116 @@ __host__ __device__ int sign (float x)
                     dv2 = D2[index] - D2[(dimX*dimY)*k + j*dimX+i2];
                     dv3 = D3[index] - D3[(dimX*dimY)*k2 + j*dimX+i];
                     
-                    Update[index] += tau*(2.0f*lambda*(dv1 + dv2 + dv3) - (Update[index] - Input[index]));
+                    Update[index] += tau*(lambda*(dv1 + dv2 + dv3) - (Update[index] - Input[index]));
 		
 		}  
 	}
+__global__ void ROFcopy_kernel2D(float *Input, float* Output, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if (index < num_total)	{
+        Output[index] = Input[index];
+    }
+}
+
+
+__global__ void ROFResidCalc2D_kernel(float *Input1, float *Input2, float* Output, int N, int M, int num_total)
+{
+    int xIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int yIndex = blockDim.y * blockIdx.y + threadIdx.y;
+    
+    int index = xIndex + N*yIndex;
+    
+    if (index < num_total)	{
+        Output[index] = Input1[index] - Input2[index];
+    }
+}
 
 /////////////////////////////////////////////////
-// HOST FUNCTION
-extern "C" int TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z)
+///////////////// HOST FUNCTION /////////////////
+extern "C" int TV_ROF_GPU_main(float* Input, float* Output, float *infovector, float lambdaPar, int iter, float tau, float epsil, int N, int M, int Z)
 {
-	    // set up device
-		int dev = 0;
-		CHECK(cudaSetDevice(dev));
-        float *d_input, *d_update, *d_D1, *d_D2;
+     int deviceCount = -1; // number of devices
+     cudaGetDeviceCount(&deviceCount);
+     if (deviceCount == 0) {
+         fprintf(stderr, "No CUDA devices found\n");
+          return -1;
+      }
+        float re;    
+        re = 0.0f;			
+	int ImSize, count, n;	
+	count = 0; n = 0;
+        float *d_input, *d_update, *d_D1, *d_D2, *d_update_prev=NULL;
         
 	if (Z == 0) Z = 1;
-        CHECK(cudaMalloc((void**)&d_input,N*M*Z*sizeof(float)));
-        CHECK(cudaMalloc((void**)&d_update,N*M*Z*sizeof(float)));
-        CHECK(cudaMalloc((void**)&d_D1,N*M*Z*sizeof(float)));
-        CHECK(cudaMalloc((void**)&d_D2,N*M*Z*sizeof(float)));
+	ImSize = N*M*Z;
+        CHECK(cudaMalloc((void**)&d_input,ImSize*sizeof(float)));
+        CHECK(cudaMalloc((void**)&d_update,ImSize*sizeof(float)));
+        CHECK(cudaMalloc((void**)&d_D1,ImSize*sizeof(float)));
+        CHECK(cudaMalloc((void**)&d_D2,ImSize*sizeof(float)));
+        if (epsil != 0.0f) checkCudaErrors( cudaMalloc((void**)&d_update_prev,ImSize*sizeof(float)) );
         
-        CHECK(cudaMemcpy(d_input,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice));
-        CHECK(cudaMemcpy(d_update,Input,N*M*Z*sizeof(float),cudaMemcpyHostToDevice));      
+        CHECK(cudaMemcpy(d_input,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));
+        CHECK(cudaMemcpy(d_update,Input,ImSize*sizeof(float),cudaMemcpyHostToDevice));              
         
-        if (Z > 1) {
-			// TV - 3D case
+        if (Z == 1) {
+             // TV - 2D case
+            dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
+            dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D));
+             
+            for(n=0; n < iter; n++) {
+                /* calculate differences */
+                D1_func2D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M);
+                CHECK(cudaDeviceSynchronize());
+		D2_func2D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M);
+                CHECK(cudaDeviceSynchronize());        
+                /*running main kernel*/
+                TV_kernel2D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_update, d_input, lambdaPar, tau, N, M);
+                CHECK(cudaDeviceSynchronize());
+                
+                if (epsil != 0.0f) {
+                /* calculate norm - stopping rules using the Thrust library */
+                ROFResidCalc2D_kernel<<<dimGrid,dimBlock>>>(d_update, d_update_prev, d_D1, N, M, ImSize);
+                checkCudaErrors( cudaDeviceSynchronize() );
+                checkCudaErrors( cudaPeekAtLastError() );               
+                
+                // setup arguments
+		square<float>        unary_op;
+		thrust::plus<float> binary_op;
+                thrust::device_vector<float> d_vec(d_D1, d_D1 + ImSize); 
+		float reduction = std::sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), unary_op, 0.0f, binary_op));                
+                thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);
+      		float reduction2 = std::sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), unary_op, 0.0f, binary_op));      		
+                   
+                // compute norm
+                re = (reduction/reduction2);      
+                if (re < epsil)  count++;
+                if (count > 4) break;       
+             
+                ROFcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, N, M, ImSize);
+                checkCudaErrors( cudaDeviceSynchronize() );
+                checkCudaErrors(cudaPeekAtLastError() );                                              
+           	}
+                
+            }
+        }
+	 else {
+	    // TV - 3D case
             dim3 dimBlock(BLKXSIZE,BLKYSIZE,BLKZSIZE);
             dim3 dimGrid(idivup(N,BLKXSIZE), idivup(M,BLKYSIZE),idivup(Z,BLKXSIZE));            
             
             float *d_D3;
             CHECK(cudaMalloc((void**)&d_D3,N*M*Z*sizeof(float)));
             
-            for(int n=0; n < iter; n++) {
+            for(n=0; n < iter; n++) {
                 /* calculate differences */
                 D1_func3D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M, Z);
                 CHECK(cudaDeviceSynchronize());
-				D2_func3D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M, Z);
+		D2_func3D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M, Z);
                 CHECK(cudaDeviceSynchronize());        
                 D3_func3D<<<dimGrid,dimBlock>>>(d_update, d_D3, N, M, Z);
                 CHECK(cudaDeviceSynchronize());        
@@ -331,28 +409,16 @@ extern "C" int TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int
             }
             
             CHECK(cudaFree(d_D3));
-        }
-        else {
-	    // TV - 2D case
-            dim3 dimBlock(BLKXSIZE2D,BLKYSIZE2D);
-            dim3 dimGrid(idivup(N,BLKXSIZE2D), idivup(M,BLKYSIZE2D));
-             
-            for(int n=0; n < iter; n++) {
-                /* calculate differences */
-                D1_func2D<<<dimGrid,dimBlock>>>(d_update, d_D1, N, M);
-                CHECK(cudaDeviceSynchronize());
-				D2_func2D<<<dimGrid,dimBlock>>>(d_update, d_D2, N, M);
-                CHECK(cudaDeviceSynchronize());        
-                /*running main kernel*/
-                TV_kernel2D<<<dimGrid,dimBlock>>>(d_D1, d_D2, d_update, d_input, lambdaPar, tau, N, M);
-                CHECK(cudaDeviceSynchronize());
-            }
-        }        
+        }      
         CHECK(cudaMemcpy(Output,d_update,N*M*Z*sizeof(float),cudaMemcpyDeviceToHost));
+        if (epsil != 0.0f) cudaFree(d_update_prev);
         CHECK(cudaFree(d_input));
         CHECK(cudaFree(d_update));
         CHECK(cudaFree(d_D1));
         CHECK(cudaFree(d_D2));        
-        //cudaDeviceReset();
+	
+	infovector[0] = (float)(n);  /*iterations number (if stopped earlier based on tolerance)*/
+        infovector[1] = re;  /* reached tolerance */
+        
         return 0;
 }
diff --git a/src/Core/regularisers_GPU/TV_ROF_GPU_core.h b/src/Core/regularisers_GPU/TV_ROF_GPU_core.h
index 3a09296..0a75124 100755
--- a/src/Core/regularisers_GPU/TV_ROF_GPU_core.h
+++ b/src/Core/regularisers_GPU/TV_ROF_GPU_core.h
@@ -3,6 +3,6 @@
 #include "CCPiDefines.h"
 #include <stdio.h>
 
-extern "C" CCPI_EXPORT int TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z);
+extern "C" CCPI_EXPORT int TV_ROF_GPU_main(float* Input, float* Output, float *infovector, float lambdaPar, int iter, float tau, float epsil, int N, int M, int Z);
 
 #endif 
diff --git a/src/Core/regularisers_GPU/TV_SB_GPU_core.cu b/src/Core/regularisers_GPU/TV_SB_GPU_core.cu
index 1f494ee..c31f104 100755
--- a/src/Core/regularisers_GPU/TV_SB_GPU_core.cu
+++ b/src/Core/regularisers_GPU/TV_SB_GPU_core.cu
@@ -49,7 +49,7 @@ limitations under the License.
 #define BLKZSIZE 8
 
 #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
-struct square { __host__ __device__ float operator()(float x) { return x * x; } };
+// struct square { __host__ __device__ float operator()(float x) { return x * x; } };
 
 /************************************************/
 /*****************2D modules*********************/
@@ -430,14 +430,14 @@ extern "C" int TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, f
                 checkCudaErrors( cudaDeviceSynchronize() );
                 checkCudaErrors(cudaPeekAtLastError() );               
                 
-                thrust::device_vector<float> d_vec(d_res, d_res + DimTotal);
-                float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));		
-                thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal);  		
-                float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
+//                thrust::device_vector<float> d_vec(d_res, d_res + DimTotal);
+  //              float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));		
+    //            thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal);  		
+      //          float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
                     
-                re = (reduction/reduction2);      
-                if (re < epsil)  count++;
-                    if (count > 4) break;
+        //        re = (reduction/reduction2);      
+         //       if (re < epsil)  count++;
+          //          if (count > 4) break;
           }
         
         }
@@ -521,14 +521,14 @@ extern "C" int TV_SB_GPU_main(float *Input, float *Output, float mu, int iter, f
                 checkCudaErrors( cudaDeviceSynchronize() );
                 checkCudaErrors(cudaPeekAtLastError() );               
                 
-                thrust::device_vector<float> d_vec(d_res, d_res + DimTotal);
-                float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));		
-                thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal);  		
-                float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
+      //          thrust::device_vector<float> d_vec(d_res, d_res + DimTotal);
+    //            float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));		
+    //            thrust::device_vector<float> d_vec2(d_update, d_update + DimTotal);  		
+//		  float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
                     
-                re = (reduction/reduction2);
-                if (re < epsil)  count++;
-                    if (count > 4) break;
+        //        re = (reduction/reduction2);
+        //        if (re < epsil)  count++;
+        //            if (count > 4) break;
           }
         }
         if (printM == 1) printf("SB-TV iterations stopped at iteration %i \n", ll);   
diff --git a/src/Core/regularisers_GPU/dTV_FGP_GPU_core.cu b/src/Core/regularisers_GPU/dTV_FGP_GPU_core.cu
index 7503ec7..9db594e 100644
--- a/src/Core/regularisers_GPU/dTV_FGP_GPU_core.cu
+++ b/src/Core/regularisers_GPU/dTV_FGP_GPU_core.cu
@@ -53,7 +53,7 @@ limitations under the License.
 #define BLKZSIZE 8
 
 #define idivup(a, b) ( ((a)%(b) != 0) ? (a)/(b)+1 : (a)/(b) )
-struct square { __host__ __device__ float operator()(float x) { return x * x; } };
+//struct square { __host__ __device__ float operator()(float x) { return x * x; } };
 
 /************************************************/
 /*****************2D modules*********************/
@@ -552,14 +552,14 @@ extern "C" int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, fl
                 checkCudaErrors( cudaDeviceSynchronize() );
                 checkCudaErrors(cudaPeekAtLastError() );               
                 
-                thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize); 
-                float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
-                thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);
-                float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
+          //      thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize); 
+         //       float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
+          //      thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);
+         //       float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
                     
-                re = (reduction/reduction2);      
-                if (re < epsil)  count++;
-                    if (count > 4) break;       
+          //      re = (reduction/reduction2);      
+          //      if (re < epsil)  count++;
+          //          if (count > 4) break;       
              
                 dTVcopy_kernel2D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, ImSize);
                 checkCudaErrors( cudaDeviceSynchronize() );
@@ -686,14 +686,14 @@ extern "C" int dTV_FGP_GPU_main(float *Input, float *InputRef, float *Output, fl
                 checkCudaErrors( cudaDeviceSynchronize() );
                 checkCudaErrors(cudaPeekAtLastError() );               
                 
-                thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize); 
-                float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
-                thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);
-                float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
+          //      thrust::device_vector<float> d_vec(P1_prev, P1_prev + ImSize); 
+         //       float reduction = sqrt(thrust::transform_reduce(d_vec.begin(), d_vec.end(), square(), 0.0f, thrust::plus<float>()));
+         //       thrust::device_vector<float> d_vec2(d_update, d_update + ImSize);
+         //       float reduction2 = sqrt(thrust::transform_reduce(d_vec2.begin(), d_vec2.end(), square(), 0.0f, thrust::plus<float>()));
                     
-                re = (reduction/reduction2);      
-                if (re < epsil)  count++;
-                    if (count > 4) break;       
+        //        re = (reduction/reduction2);      
+         //       if (re < epsil)  count++;
+        //            if (count > 4) break;       
              
                 dTVcopy_kernel3D<<<dimGrid,dimBlock>>>(d_update, d_update_prev, dimX, dimY, dimZ, ImSize);
                 checkCudaErrors( cudaDeviceSynchronize() );
diff --git a/src/Core/regularisers_GPU/shared.h b/src/Core/regularisers_GPU/shared.h
index fe98cd6..698dddd 100644
--- a/src/Core/regularisers_GPU/shared.h
+++ b/src/Core/regularisers_GPU/shared.h
@@ -1,4 +1,13 @@
 /*shared macros*/
+template <typename T>
+struct square
+{
+    __host__ __device__
+        T operator()(const T& x) const { 
+            return (float)(x*x);
+        }
+};
+
 
 
 /*checks CUDA call, should be used in functions returning <int> value
diff --git a/src/Matlab/CMakeLists.txt b/src/Matlab/CMakeLists.txt
index b97f845..6c5e6be 100755
--- a/src/Matlab/CMakeLists.txt
+++ b/src/Matlab/CMakeLists.txt
@@ -38,9 +38,9 @@ find_package(Matlab REQUIRED COMPONENTS MAIN_PROGRAM MX_LIBRARY ENG_LIBRARY )
 #set (MEX_TARGETS "CPU_TNV;CPU_ROF")
 #list(APPEND MEX_TARGETS "CPU_TNV")
 #list(APPEND MEX_TARGETS "CPU_ROF")
-
+
 file(GLOB CPU_MEX_FILES
-    "${CMAKE_SOURCE_DIR}/Matlab/mex_compile/regularisers_CPU/*.c"
+    "${CMAKE_SOURCE_DIR}/src/Matlab/mex_compile/regularisers_CPU/*.c"
     #"${CMAKE_SOURCE_DIR}/Matlab/mex_compile/regularisers_GPU/*.c"
 )
 
@@ -101,7 +101,7 @@ if (BUILD_CUDA)
     find_package(CUDA)
     if (CUDA_FOUND)
       file(GLOB GPU_MEX_FILES
-        "${CMAKE_SOURCE_DIR}/Matlab/mex_compile/regularisers_GPU/*.cpp"
+        "${CMAKE_SOURCE_DIR}/src/Matlab/mex_compile/regularisers_GPU/*.cpp"
       )
 
       list(LENGTH GPU_MEX_FILES num)
diff --git a/src/Python/ccpi/filters/regularisers.py b/src/Python/ccpi/filters/regularisers.py
index 67f432b..05120af 100644
--- a/src/Python/ccpi/filters/regularisers.py
+++ b/src/Python/ccpi/filters/regularisers.py
@@ -22,7 +22,8 @@ def ROF_TV(inputData, regularisation_parameter, iterations,
         return TV_ROF_GPU(inputData,
                      regularisation_parameter,
                      iterations, 
-                     time_marching_parameter)
+                     time_marching_parameter,
+                     tolerance_param)
     else:
         if not gpu_enabled and device == 'gpu':
             raise ValueError ('GPU is not available')
@@ -44,8 +45,7 @@ def FGP_TV(inputData, regularisation_parameter,iterations,
                      iterations, 
                      tolerance_param,
                      methodTV,
-                     nonneg,
-                     1)
+                     nonneg)
     else:
         if not gpu_enabled and device == 'gpu':
             raise ValueError ('GPU is not available')
diff --git a/src/Python/setup-regularisers.py.in b/src/Python/setup-regularisers.py.in
index 39b820a..4c578e3 100644
--- a/src/Python/setup-regularisers.py.in
+++ b/src/Python/setup-regularisers.py.in
@@ -68,7 +68,7 @@ setup(
     
     ],
 	zip_safe = False,	
-	packages = {'ccpi','ccpi.filters', 'ccpi.supp'},
+	packages = {'ccpi', 'ccpi.filters', 'ccpi.supp'},
 )
 
 
diff --git a/src/Python/src/gpu_regularisers.pyx b/src/Python/src/gpu_regularisers.pyx
index b52f669..db4fa67 100644
--- a/src/Python/src/gpu_regularisers.pyx
+++ b/src/Python/src/gpu_regularisers.pyx
@@ -20,8 +20,8 @@ cimport numpy as np
 
 CUDAErrorMessage = 'CUDA error'
 
-cdef extern int TV_ROF_GPU_main(float* Input, float* Output, float lambdaPar, int iter, float tau, int N, int M, int Z);
-cdef extern int TV_FGP_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int printM, int N, int M, int Z);
+cdef extern int TV_ROF_GPU_main(float* Input, float* Output, float *infovector, float lambdaPar, int iter, float tau, float epsil, int N, int M, int Z);
+cdef extern int TV_FGP_GPU_main(float *Input, float *Output, float *infovector, float lambdaPar, int iter, float epsil, int methodTV, int nonneg, int N, int M, int Z);
 cdef extern int TV_SB_GPU_main(float *Input, float *Output, float lambdaPar, int iter, float epsil, int methodTV, int printM, int N, int M, int Z);
 cdef extern int TGV_GPU_main(float *Input, float *Output, float lambdaPar, float alpha1, float alpha0, int iterationsNumb, float L2, int dimX, int dimY, int dimZ);
 cdef extern int LLT_ROF_GPU_main(float *Input, float *Output, float lambdaROF, float lambdaLLT, int iterationsNumb, float tau, int N, int M, int Z);
@@ -34,17 +34,20 @@ cdef extern int PatchSelect_GPU_main(float *Input, unsigned short *H_i, unsigned
 def TV_ROF_GPU(inputData,
                      regularisation_parameter,
                      iterations, 
-                     time_marching_parameter):
+                     time_marching_parameter,
+                     tolerance_param):
     if inputData.ndim == 2:
         return ROFTV2D(inputData, 
                      regularisation_parameter,
                      iterations,
-                     time_marching_parameter)
+                     time_marching_parameter,
+                     tolerance_param)
     elif inputData.ndim == 3:
         return ROFTV3D(inputData, 
                      regularisation_parameter,
                      iterations, 
-                     time_marching_parameter)
+                     time_marching_parameter,
+                     tolerance_param)
                      
 # Total-variation Fast-Gradient-Projection (FGP)
 def TV_FGP_GPU(inputData,
@@ -52,24 +55,21 @@ def TV_FGP_GPU(inputData,
                      iterations, 
                      tolerance_param,
                      methodTV,
-                     nonneg,
-                     printM):
+                     nonneg):
     if inputData.ndim == 2:
         return FGPTV2D(inputData,
                      regularisation_parameter,
                      iterations, 
                      tolerance_param,
                      methodTV,
-                     nonneg,
-                     printM)
+                     nonneg)
     elif inputData.ndim == 3:
         return FGPTV3D(inputData,
                      regularisation_parameter,
                      iterations, 
                      tolerance_param,
                      methodTV,
-                     nonneg,
-                     printM)
+                     nonneg)
 # Total-variation Split Bregman (SB)
 def TV_SB_GPU(inputData,
                      regularisation_parameter,
@@ -179,7 +179,8 @@ def Diff4th_GPU(inputData,
 def ROFTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData, 
                      float regularisation_parameter,
                      int iterations, 
-                     float time_marching_parameter):
+                     float time_marching_parameter,
+                     float tolerance_param):
     
     cdef long dims[2]
     dims[0] = inputData.shape[0]
@@ -187,22 +188,26 @@ def ROFTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
 
     cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \
 		    np.zeros([dims[0],dims[1]], dtype='float32')
+    cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \
+            np.ones([2], dtype='float32')
           
     # Running CUDA code here
     if (TV_ROF_GPU_main(
-            &inputData[0,0], &outputData[0,0], 
+            &inputData[0,0], &outputData[0,0], &infovec[0],
                        regularisation_parameter,
-                       iterations , 
+                       iterations, 
                        time_marching_parameter, 
+                       tolerance_param,
                        dims[1], dims[0], 1)==0):
-        return outputData;
+        return (outputData,infovec)
     else:
         raise ValueError(CUDAErrorMessage);
     
 def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
                      float regularisation_parameter,
                      int iterations, 
-                     float time_marching_parameter):
+                     float time_marching_parameter,
+                     float tolerance_param):
     
     cdef long dims[3]
     dims[0] = inputData.shape[0]
@@ -211,15 +216,18 @@ def ROFTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData,
 
     cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \
 		    np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
+    cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \
+            np.ones([2], dtype='float32')
           
     # Running CUDA code here    
     if (TV_ROF_GPU_main(
-            &inputData[0,0,0], &outputData[0,0,0], 
+            &inputData[0,0,0], &outputData[0,0,0], &infovec[0],
                        regularisation_parameter,
-                       iterations , 
+                       iterations, 
                        time_marching_parameter, 
+                       tolerance_param,
                        dims[2], dims[1], dims[0])==0):
-        return outputData;
+        return (outputData,infovec)
     else:
         raise ValueError(CUDAErrorMessage);
 #****************************************************************#
@@ -231,8 +239,7 @@ def FGPTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
                      int iterations, 
                      float tolerance_param,
                      int methodTV,
-                     int nonneg,
-                     int printM):
+                     int nonneg):
     
     cdef long dims[2]
     dims[0] = inputData.shape[0]
@@ -240,47 +247,48 @@ def FGPTV2D(np.ndarray[np.float32_t, ndim=2, mode="c"] inputData,
 
     cdef np.ndarray[np.float32_t, ndim=2, mode="c"] outputData = \
 		    np.zeros([dims[0],dims[1]], dtype='float32')
+    cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \
+            np.ones([2], dtype='float32')
           
     # Running CUDA code here    
-    if (TV_FGP_GPU_main(&inputData[0,0], &outputData[0,0],
+    if (TV_FGP_GPU_main(&inputData[0,0], &outputData[0,0], &infovec[0],
                        regularisation_parameter, 
-                       iterations, 
+                       iterations,
                        tolerance_param,
                        methodTV,
                        nonneg,
-                       printM,
                        dims[1], dims[0], 1)==0):
-        return outputData;
+        return (outputData,infovec)
     else:
         raise ValueError(CUDAErrorMessage);
-
     
 def FGPTV3D(np.ndarray[np.float32_t, ndim=3, mode="c"] inputData, 
                      float regularisation_parameter,
                      int iterations, 
                      float tolerance_param,
                      int methodTV,
-                     int nonneg,
-                     int printM):
+                     int nonneg):
     
     cdef long dims[3]
     dims[0] = inputData.shape[0]
     dims[1] = inputData.shape[1]
     dims[2] = inputData.shape[2]
 
+
     cdef np.ndarray[np.float32_t, ndim=3, mode="c"] outputData = \
 		    np.zeros([dims[0],dims[1],dims[2]], dtype='float32')
+    cdef np.ndarray[np.float32_t, ndim=1, mode="c"] infovec = \
+            np.ones([2], dtype='float32')
           
     # Running CUDA code here    
-    if (TV_FGP_GPU_main(&inputData[0,0,0], &outputData[0,0,0],
-                       regularisation_parameter , 
+    if (TV_FGP_GPU_main(&inputData[0,0,0], &outputData[0,0,0], &infovec[0],
+                       regularisation_parameter, 
                        iterations, 
                        tolerance_param,
                        methodTV,
                        nonneg,
-                       printM,
                        dims[2], dims[1], dims[0])==0):
-        return outputData;
+        return (outputData,infovec)
     else:
         raise ValueError(CUDAErrorMessage);