From da30ce619bce168fc24ed0c17f04e411626ba18b Mon Sep 17 00:00:00 2001
From: "Suren A. Chilingaryan" <csa@suren.me>
Date: Sun, 29 Mar 2020 20:34:12 +0200
Subject: Eliminate conditionals in the innermost loop to help gcc
 auto-vectorizer

---
 src/Core/CMakeLists.txt                            |  13 +-
 src/Core/regularisers_CPU/TNV_core.c               | 216 +++++++++++----------
 src/Core/regularisers_CPU/TNV_core_backtrack.c     | 189 +++++++++---------
 .../regularisers_CPU/TNV_core_backtrack_loop.h     | 100 ++++++++++
 src/Core/regularisers_CPU/TNV_core_loop.h          | 107 ++++++++++
 5 files changed, 423 insertions(+), 202 deletions(-)
 create mode 100644 src/Core/regularisers_CPU/TNV_core_backtrack_loop.h
 create mode 100644 src/Core/regularisers_CPU/TNV_core_loop.h

diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt
index 10b16f3..76b0f3e 100644
--- a/src/Core/CMakeLists.txt
+++ b/src/Core/CMakeLists.txt
@@ -60,9 +60,16 @@ message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}")
 message("Adding regularisers as a shared library")
 
 #set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc)
-#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=2 -qopt-report-phase=vec -qopenmp")
-set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp")
-#set(CMAKE_C_FLAGS "-march=native -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -mavx -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")
+#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g")
+#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g")
+#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g")
+
+#set(CMAKE_C_COMPILER clang)
+#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp")
+
+#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp")
+set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")
+#set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")
 #set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer")
 
 #set(CMAKE_C_FLAGS "-acc -Minfo -ta=tesla:cc20 -openmp")
diff --git a/src/Core/regularisers_CPU/TNV_core.c b/src/Core/regularisers_CPU/TNV_core.c
index dce414a..415c644 100755
--- a/src/Core/regularisers_CPU/TNV_core.c
+++ b/src/Core/regularisers_CPU/TNV_core.c
@@ -5,12 +5,6 @@
  *
  * Copyright 2017 Daniil Kazantsev
  * Copyright 2017 Srikanth Nagella, Edoardo Pasca
- * 
- * Copyriht 2020 Suren A. Chlingaryan
- * Optimized version with 1/3 of memory consumption and ~10x performance
- * This version is not able to perform back-track except during first iterations
- * But warning would be printed if backtracking is required and slower version (TNV_core_backtrack.c) 
- * could be executed instead. It still better than original with 1/2 of memory consumption and 4x performance gain
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +17,7 @@
  * limitations under the License.
  */
 
+#include <malloc.h>
 #include "TNV_core.h"
 
 #define BLOCK 32
@@ -143,6 +138,7 @@ typedef struct {
     int offY, stepY, copY;
     float *Input, *u, *qx, *qy, *gradx, *grady, *div;
     float *div0, *udiff0;
+    float *gradxdiff, *gradydiff, *ubarx, *ubary, *udiff;
     float resprimal, resdual;
     float unorm, qnorm, product;
 } tnv_thread_t;
@@ -174,7 +170,13 @@ static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) {
     
     free(ctx->div0);
     free(ctx->udiff0);
-    
+
+    free(ctx->gradxdiff); 
+    free(ctx->gradydiff);
+    free(ctx->ubarx);
+    free(ctx->ubary);
+    free(ctx->udiff);
+
     return 0;
 }
 
@@ -194,6 +196,7 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {
     long DimTotal = (long)(dimX*stepY*padZ);
     long Dim1Total = (long)(dimX*(stepY+1)*padZ);
     long DimRow = (long)(dimX * padZ);
+    long DimCell = (long)(padZ);
 
     // Auxiliar vectors
     ctx->Input = memalign(64, Dim1Total * sizeof(float));
@@ -207,7 +210,13 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {
     ctx->div0 = memalign(64, DimRow * sizeof(float));
     ctx->udiff0 = memalign(64, DimRow * sizeof(float));
 
-    if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff0)) {
+    ctx->gradxdiff = memalign(64, DimCell * sizeof(float));
+    ctx->gradydiff = memalign(64, DimCell * sizeof(float));
+    ctx->ubarx = memalign(64, DimCell * sizeof(float));
+    ctx->ubary = memalign(64, DimCell * sizeof(float));
+    ctx->udiff = memalign(64, DimCell * sizeof(float));
+    
+    if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) {
         fprintf(stderr, "Error allocating memory\n");
         exit(-1);
     }
@@ -303,7 +312,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
 
 
 static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
-    long i, j, k, l;
+    long i, j, k, l, m;
 
     tnv_context_t *tnv_ctx = (tnv_context_t*)data;
     tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
@@ -340,7 +349,8 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
     float constant = 1.0f + taulambda;
 
     float resprimal = 0.0f;
-    float resdual = 0.0f;
+    float resdual1 = 0.0f;
+    float resdual2 = 0.0f;
     float product = 0.0f;
     float unorm = 0.0f;
     float qnorm = 0.0f;
@@ -348,100 +358,92 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
     float qxdiff;
     float qydiff;
     float divdiff;
-    float gradxdiff[dimZ] __attribute__((aligned(64)));
-    float gradydiff[dimZ] __attribute__((aligned(64)));
-    float ubarx[dimZ] __attribute__((aligned(64)));
-    float ubary[dimZ] __attribute__((aligned(64)));
-    float udiff[dimZ] __attribute__((aligned(64)));
-
-
-    for(i=0; i < dimX; i++) {
-        for(k = 0; k < dimZ; k++) { 
-            int l = i * padZ + k;
-            float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant;
-            float udiff = u[l] - u_upd;
-            ctx->udiff0[l] = udiff;
-            ctx->div0[l] = div[l];
+    float *gradxdiff = ctx->gradxdiff;
+    float *gradydiff = ctx->gradydiff;
+    float *ubarx = ctx->ubarx;
+    float *ubary = ctx->ubary;
+    float *udiff = ctx->udiff;
+
+    float *udiff0 = ctx->udiff0;
+    float *div0 = ctx->div0;
+
+
+    j = 0; {
+#       define TNV_LOOP_FIRST_J
+        i = 0; {
+#           define TNV_LOOP_FIRST_I
+#           include "TNV_core_loop.h"
+#           undef TNV_LOOP_FIRST_I
+        }
+        for(i = 1; i < (dimX - 1); i++) {
+#           include "TNV_core_loop.h"
         }
+        i = dimX - 1; {
+#           define TNV_LOOP_LAST_I
+#           include "TNV_core_loop.h"
+#           undef TNV_LOOP_LAST_I
+        }
+#       undef TNV_LOOP_FIRST_J
     }
 
-    for(int j1 = 0; j1 < stepY; j1 += BLOCK) {
-     for(int i1 = 0; i1 < dimX; i1 += BLOCK) {
-      for(int j2 = 0; j2 < BLOCK; j2++) {
-        j = j1 + j2;
-        for(int i2 = 0; i2 < BLOCK; i2++) {
-            float t[3];
-            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
-
-            i = i1 + i2;
-            if (i == dimX) break;
-            if (j == stepY) { j2 = BLOCK; break; }
-            l = (j * dimX  + i) * padZ;
-
-#pragma vector aligned
-#pragma GCC ivdep 
-            for(k = 0; k < dimZ; k++) {
-                float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant;
-                udiff[k] = u[l + k] - u_upd;
-                u[l + k] = u_upd;
-
-                float gradx_upd = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd);
-                float grady_upd = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd);
-                gradxdiff[k] = gradx[l + k] - gradx_upd;
-                gradydiff[k] = grady[l + k] - grady_upd;
-                gradx[l + k] = gradx_upd;
-                grady[l + k] = grady_upd;
-                
-                ubarx[k] = gradx_upd - theta * gradxdiff[k];
-                ubary[k] = grady_upd - theta * gradydiff[k];
-
-                float vx = ubarx[k] + divsigma * qx[l + k];
-                float vy = ubary[k] + divsigma * qy[l + k];
-
-                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
-            }
 
-            coefF(t, M1, M2, M3, sigma, p, q, r);
-            
-#pragma vector aligned
-#pragma GCC ivdep 
-            for(k = 0; k < dimZ; k++) {
-                float vx = ubarx[k] + divsigma * qx[l + k];
-                float vy = ubary[k] + divsigma * qy[l + k];
-                float gx_upd = vx * t[0] + vy * t[1];
-                float gy_upd = vx * t[1] + vy * t[2];
-
-                qxdiff = sigma * (ubarx[k] - gx_upd);
-                qydiff = sigma * (ubary[k] - gy_upd);
-                
-                qx[l + k] += qxdiff;
-                qy[l + k] += qydiff;
-
-                unorm += (udiff[k] * udiff[k]);
-                qnorm += (qxdiff * qxdiff + qydiff * qydiff);
-
-                float div_upd = 0;
-                div_upd -= (i > 0)?qx[l + k - padZ]:0;
-                div_upd -= (j > 0)?qy[l + k - dimX*padZ]:0;
-                div_upd += (i < (dimX-1))?qx[l + k]:0;
-                div_upd += (j < (copY-1))?qy[l + k]:0;
-                divdiff = div[l + k] - div_upd;  
-                div[l + k] = div_upd;
-
-                resprimal +=  ((offY == 0)||(j > 0))?fabs(divtau * udiff[k] + divdiff):0; 
-                resdual += fabs(divsigma * qxdiff + gradxdiff[k]);
-                resdual += fabs(divsigma * qydiff + gradydiff[k]);
-                product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+
+    for(int j = 1; j < (copY - 1); j++) {
+        i = 0; {
+#           define TNV_LOOP_FIRST_I
+#           include "TNV_core_loop.h"
+#           undef TNV_LOOP_FIRST_I
+        }
+    }
+
+    for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) {
+        for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) {
+            for(int j2 = 0; j2 < BLOCK; j2 ++) {
+                j = j1 + j2;
+                for(int i2 = 0; i2 < BLOCK; i2++) {
+                    i = i1 + i2;
+                    
+                    if (i == (dimX - 1)) break;
+                    if (j == (copY - 1)) { j2 = BLOCK; break; }
+#           include "TNV_core_loop.h"
+                }   
             }
-            
         } // i
-       } // j
-     } // i
-    } // j
+
+    }
+
+    for(int j = 1; j < (copY - 1); j++) {
+        i = dimX - 1; {
+#           define TNV_LOOP_LAST_I
+#           include "TNV_core_loop.h"
+#           undef TNV_LOOP_LAST_I
+        }
+    }
+
+
+
+    for (j = copY - 1; j < stepY; j++) {
+#       define TNV_LOOP_LAST_J
+        i = 0; {
+#           define TNV_LOOP_FIRST_I
+#           include "TNV_core_loop.h"
+#           undef TNV_LOOP_FIRST_I
+        }
+        for(i = 1; i < (dimX - 1); i++) {
+#           include "TNV_core_loop.h"
+        }
+        i = dimX - 1; {
+#           define TNV_LOOP_LAST_I
+#           include "TNV_core_loop.h"
+#           undef TNV_LOOP_LAST_I
+        }
+#       undef TNV_LOOP_LAST_J
+    }
+
 
 
     ctx->resprimal = resprimal;
-    ctx->resdual = resdual;
+    ctx->resdual = resdual1 + resdual2;
     ctx->product = product;
     ctx->unorm = unorm;
     ctx->qnorm = qnorm;
@@ -458,9 +460,9 @@ static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ)
     tnv_ctx.dimY = dimY;
     tnv_ctx.dimZ = dimZ;
         // Padding seems actually slower
-//    tnv_ctx.padZ = dimZ;
+    tnv_ctx.padZ = dimZ;
 //    tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0));
-    tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0));
+//    tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0));
     
     hw_sched_init();
 
@@ -580,15 +582,28 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to
                     float udiff = ctx->udiff0[l];
 
                     ctx->div[l] -= ctx0->qy[l + m];
-                    ctx0->div[m + l + dimX * padZ] = ctx->div[l];
-                    ctx0->u[m + l + dimX * padZ] = ctx->u[l];
-
+                    ctx0->div[m + l + dimX*padZ] = ctx->div[l];
+                    ctx0->u[m + l + dimX*padZ] = ctx->u[l];
+                    
                     divdiff += ctx0->qy[l + m];
                     resprimal += fabs(divtau * udiff + divdiff); 
                 }
             }
         }
 
+        {
+            tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0;
+            for(i = 0; i < dimX; i++) {
+                for(k = 0; k < dimZ; k++) {
+                    int l = i * padZ + k;
+                        
+                    float divdiff = ctx->div0[l] - ctx->div[l];
+                    float udiff = ctx->udiff0[l];
+                    resprimal += fabs(divtau * udiff + divdiff); 
+                }
+            }
+        }
+
         for (j = 0; j < tnv_ctx.threads; j++) {
             tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
             resprimal += ctx->resprimal;
@@ -645,6 +660,5 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to
     printf("Iterations stopped at %i with the residual %f \n", iter, residual);
     printf("Return: %f\n", *uT);
 
-//    exit(-1);
     return *uT;
 }
diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c
index 7192475..9b19ed5 100755
--- a/src/Core/regularisers_CPU/TNV_core_backtrack.c
+++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c
@@ -335,6 +335,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
 
 static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
     long i, j, k, l;
+    long i1, i2, j1, j2;
 
     tnv_context_t *tnv_ctx = (tnv_context_t*)data;
     tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
@@ -376,11 +377,12 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
     float theta1 = 1.0f + theta;
     float constant = 1.0f + taulambda;
 
-    float resprimal = 0.0;
-    float resdual = 0.0;
-    float product = 0.0;
-    float unorm = 0.0;
-    float qnorm = 0.0;
+    float resprimal = 0.0f;
+    float resdual1 = 0.0f;
+    float resdual2 = 0.0f;
+    float product = 0.0f;
+    float unorm = 0.0f;
+    float qnorm = 0.0f;
 
     float udiff[dimZ] __attribute__((aligned(64)));
     float qxdiff __attribute__((aligned(64)));
@@ -389,104 +391,82 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
     float gradxdiff[dimZ] __attribute__((aligned(64)));
     float gradydiff[dimZ] __attribute__((aligned(64)));
 
-    for(int j1 = 0; j1 < stepY; j1 += BLOCK) {
-     for(int i1 = 0; i1 < dimX; i1 += BLOCK) {
-      for(int j2 = 0; j2 < BLOCK; j2++) {
-        j = j1 + j2;
-        for(int i2 = 0; i2 < BLOCK; i2++) {
-            float t[3];
-            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
-
-            i = i1 + i2;
-            if (i == dimX) break;
-            if (j == stepY) { j2 = BLOCK; break; }
-            l = (j * dimX  + i) * padZ;
-        
-//#pragma vector aligned
-#pragma GCC ivdep 
-            for(k = 0; k < dimZ; k++) {
-                u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant;
-                udiff[k] = u[l + k] - u_upd[l + k];
-                unorm += (udiff[k] * udiff[k]);
-
-                gradx_upd[l + k] = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]);
-                grady_upd[l + k] = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]);
-                gradxdiff[k] = gradx[l + k] - gradx_upd[l + k];
-                gradydiff[k] = grady[l + k] - grady_upd[l + k];
-
-                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
-                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
-//#define TNV_NEW_STYLE                
-#ifdef TNV_NEW_STYLE                
-                qx_upd[l + k] = qx[l + k] + sigma * ubarx;
-                qy_upd[l + k] = qy[l + k] + sigma * ubary;
-
-                float vx = divsigma * qx_upd[l + k]; //+ ubarx
-                float vy = divsigma * qy_upd[l + k]; //+ ubary
-#else
-                float vx = ubarx + divsigma * qx[l + k];
-                float vy = ubary + divsigma * qy[l + k];
-#endif
-
-                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
-            }
 
-            coefF(t, M1, M2, M3, sigma, p, q, r);
-            
-//#pragma vector aligned
-#pragma GCC ivdep 
-            for(k = 0; k < dimZ; k++) {
-#ifdef TNV_NEW_STYLE    
-                float vx = divsigma * qx_upd[l + k];
-                float vy = divsigma * qy_upd[l + k];
-
-                float gx_upd = vx * t[0] + vy * t[1];
-                float gy_upd = vx * t[1] + vy * t[2];
-
-                qx_upd[l + k] -= sigma * gx_upd;
-                qy_upd[l + k] -= sigma * gy_upd;
-#else
-                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
-                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
-                float vx = ubarx + divsigma * qx[l + k];
-                float vy = ubary + divsigma * qy[l + k];
-
-                float gx_upd = vx * t[0] + vy * t[1];
-                float gy_upd = vx * t[1] + vy * t[2];
-
-                qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd);
-                qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd);
-#endif
-
-                float div_upd_val = 0;
-                div_upd_val -= (i > 0)?qx_upd[l + k - padZ]:0;
-                div_upd_val -= (j > 0)?qy_upd[l + k - dimX * padZ]:0;
-                div_upd_val += (i < (dimX-1))?qx_upd[l + k]:0;
-                div_upd_val += (j < (copY-1))?qy_upd[l + k]:0;
-                div_upd[l + k] = div_upd_val;
-
-                qxdiff = qx[l + k] - qx_upd[l + k];
-                qydiff = qy[l + k] - qy_upd[l + k];
-                qnorm += (qxdiff * qxdiff + qydiff * qydiff);
-
-                resdual += fabs(divsigma * qxdiff - gradxdiff[k]);
-                resdual += fabs(divsigma * qydiff - gradydiff[k]);
-                product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
-
-                if ((offY == 0)||(j > 0)) {
-                    divdiff = div[l + k] - div_upd[l + k];  // Multiple steps... How we compute without history?
-                    resprimal += fabs(divtau * udiff[k] + divdiff); 
-                }
+    j = 0; {
+#       define TNV_LOOP_FIRST_J
+        i = 0; {
+#           define TNV_LOOP_FIRST_I
+#           include "TNV_core_backtrack_loop.h"
+#           undef TNV_LOOP_FIRST_I
+        }
+        for(i = 1; i < (dimX - 1); i++) {
+#           include "TNV_core_backtrack_loop.h"
+        }
+        i = dimX - 1; {
+#           define TNV_LOOP_LAST_I
+#           include "TNV_core_backtrack_loop.h"
+#           undef TNV_LOOP_LAST_I
+        }
+#       undef TNV_LOOP_FIRST_J
+    }
+
+
+
+    for(int j = 1; j < (copY - 1); j++) {
+        i = 0; {
+#           define TNV_LOOP_FIRST_I
+#           include "TNV_core_backtrack_loop.h"
+#           undef TNV_LOOP_FIRST_I
+        }
+    }
+
+    for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) {
+        for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) {
+            for(int j2 = 0; j2 < BLOCK; j2 ++) {
+                j = j1 + j2;
+                for(int i2 = 0; i2 < BLOCK; i2++) {
+                    i = i1 + i2;
+                    
+                    if (i == (dimX - 1)) break;
+                    if (j == (copY - 1)) { j2 = BLOCK; break; }
+#           include "TNV_core_backtrack_loop.h"
+                }   
             }
-            
         } // i
-       } // j
-      } // i
-    } // j
-    
+
+    }
+
+    for(int j = 1; j < (copY - 1); j++) {
+        i = dimX - 1; {
+#           define TNV_LOOP_LAST_I
+#           include "TNV_core_backtrack_loop.h"
+#           undef TNV_LOOP_LAST_I
+        }
+    }
+
+
+
+    for (j = copY - 1; j < stepY; j++) {
+#       define TNV_LOOP_LAST_J
+        i = 0; {
+#           define TNV_LOOP_FIRST_I
+#           include "TNV_core_backtrack_loop.h"
+#           undef TNV_LOOP_FIRST_I
+        }
+        for(i = 1; i < (dimX - 1); i++) {
+#           include "TNV_core_backtrack_loop.h"
+        }
+        i = dimX - 1; {
+#           define TNV_LOOP_LAST_I
+#           include "TNV_core_backtrack_loop.h"
+#           undef TNV_LOOP_LAST_I
+        }
+#       undef TNV_LOOP_LAST_J
+    }
+
     
     ctx->resprimal = resprimal;
-    ctx->resdual = resdual;
+    ctx->resdual = resdual1 + resdual2;
     ctx->product = product;
     ctx->unorm = unorm;
     ctx->qnorm = qnorm;
@@ -630,6 +610,19 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to
             }
         }
 
+        {
+            tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0;
+            for(i = 0; i < dimX; i++) {
+                for(k = 0; k < dimZ; k++) {
+                    int l = i * padZ + k;
+
+                    float divdiff = ctx->div[l] - ctx->div_upd[l];
+                    float udiff = ctx->u[l] - ctx->u_upd[l];
+                    resprimal += fabs(divtau * udiff + divdiff); 
+                }
+            }
+        }
+
         for (j = 0; j < tnv_ctx.threads; j++) {
             tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
             resprimal += ctx->resprimal;
diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h
new file mode 100644
index 0000000..3ec4250
--- /dev/null
+++ b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h
@@ -0,0 +1,100 @@
+            float t[3];
+            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+
+            l = (j * dimX  + i) * padZ;
+        
+//#pragma vector aligned
+#pragma GCC ivdep 
+            for(k = 0; k < dimZ; k++) {
+                u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant;
+                udiff[k] = u[l + k] - u_upd[l + k];
+                unorm += (udiff[k] * udiff[k]);
+
+#ifdef TNV_LOOP_LAST_I
+                gradx_upd[l + k] = 0;
+#else
+                gradx_upd[l + k] = ((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]);
+#endif
+
+#ifdef TNV_LOOP_LAST_J
+                grady_upd[l + k] = 0;
+#else
+                grady_upd[l + k] = ((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]);
+#endif
+
+                gradxdiff[k] = gradx[l + k] - gradx_upd[l + k];
+                gradydiff[k] = grady[l + k] - grady_upd[l + k];
+
+                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+//#define TNV_NEW_STYLE                
+#ifdef TNV_NEW_STYLE                
+                qx_upd[l + k] = qx[l + k] + sigma * ubarx;
+                qy_upd[l + k] = qy[l + k] + sigma * ubary;
+
+                float vx = divsigma * qx_upd[l + k]; //+ ubarx
+                float vy = divsigma * qy_upd[l + k]; //+ ubary
+#else
+                float vx = ubarx + divsigma * qx[l + k];
+                float vy = ubary + divsigma * qy[l + k];
+#endif
+
+                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+            }
+
+            coefF(t, M1, M2, M3, sigma, p, q, r);
+            
+//#pragma vector aligned
+#pragma GCC ivdep 
+            for(k = 0; k < dimZ; k++) {
+#ifdef TNV_NEW_STYLE    
+                float vx = divsigma * qx_upd[l + k];
+                float vy = divsigma * qy_upd[l + k];
+
+                float gx_upd = vx * t[0] + vy * t[1];
+                float gy_upd = vx * t[1] + vy * t[2];
+
+                qx_upd[l + k] -= sigma * gx_upd;
+                qy_upd[l + k] -= sigma * gy_upd;
+#else
+                float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+                float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+                float vx = ubarx + divsigma * qx[l + k];
+                float vy = ubary + divsigma * qy[l + k];
+
+                float gx_upd = vx * t[0] + vy * t[1];
+                float gy_upd = vx * t[1] + vy * t[2];
+
+                qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd);
+                qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd);
+#endif
+
+                float div_upd_val = 0;
+#ifndef TNV_LOOP_FIRST_I
+                div_upd_val -= qx_upd[l + k - padZ];
+#endif
+
+#ifndef TNV_LOOP_FIRST_J
+                div_upd_val -= qy_upd[l + k - dimX * padZ];
+#endif
+#ifndef TNV_LOOP_LAST_I
+                div_upd_val += qx_upd[l + k];
+#endif
+#ifndef TNV_LOOP_LAST_J
+                div_upd_val += qy_upd[l + k];
+#endif
+                div_upd[l + k] = div_upd_val;
+
+                qxdiff = qx[l + k] - qx_upd[l + k];
+                qydiff = qy[l + k] - qy_upd[l + k];
+                qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+                resdual1 += fabs(divsigma * qxdiff - gradxdiff[k]);
+                resdual2 += fabs(divsigma * qydiff - gradydiff[k]);
+                product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+
+#ifndef TNV_LOOP_FIRST_J
+                divdiff = div[l + k] - div_upd[l + k];  // Multiple steps... How we compute without history?
+                resprimal += fabs(divtau * udiff[k] + divdiff); 
+#endif
+            }
diff --git a/src/Core/regularisers_CPU/TNV_core_loop.h b/src/Core/regularisers_CPU/TNV_core_loop.h
new file mode 100644
index 0000000..34e7139
--- /dev/null
+++ b/src/Core/regularisers_CPU/TNV_core_loop.h
@@ -0,0 +1,107 @@
+    {
+            float t[3];
+            float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+            l = (j * dimX  + i) * padZ;
+            m = dimX * padZ;
+        
+            float *__restrict u_next = u + l + padZ;
+            float *__restrict u_current = u + l;
+            float *__restrict u_next_row = u + l + m;
+
+
+            float *__restrict qx_current = qx + l;
+            float *__restrict qy_current = qy + l;
+            float *__restrict qx_prev = qx + l - padZ;
+            float *__restrict qy_prev = qy + l - m;
+
+
+//  __assume(padZ%16==0);
+
+//#pragma vector aligned
+#pragma GCC ivdep 
+            for(k = 0; k < dimZ; k++) {
+                float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads
+                udiff[k] = u[l + k] - u_upd; // cache 1w
+                u[l + k] = u_upd; // 1 write
+
+#ifdef TNV_LOOP_FIRST_J
+                udiff0[l + k] = udiff[k];
+                div0[l + k] = div[l + k];
+#endif
+
+#ifdef TNV_LOOP_LAST_I
+                float gradx_upd = 0;
+#else
+                float u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads
+                float gradx_upd = (u_next_upd - u_upd); // 2 reads
+#endif
+
+#ifdef TNV_LOOP_LAST_J
+                float grady_upd = 0;
+#else
+                float u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads
+                float grady_upd = (u_next_row_upd - u_upd); // 1 read
+#endif
+
+                gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w
+                gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w
+                gradx[l + k] = gradx_upd; // 1 write
+                grady[l + k] = grady_upd; // 1 write
+                
+                ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w
+                ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w
+
+                float vx = ubarx[k] + divsigma * qx[l + k]; // 1 read
+                float vy = ubary[k] + divsigma * qy[l + k]; // 1 read
+
+                M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+            }
+
+            coefF(t, M1, M2, M3, sigma, p, q, r);
+            
+//#pragma vector aligned
+#pragma GCC ivdep 
+            for(k = 0; k < padZ; k++) {
+                float vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r
+                float vy = ubary[k] + divsigma * qy_current[k]; // cache 2r
+                float gx_upd = vx * t[0] + vy * t[1];
+                float gy_upd = vx * t[1] + vy * t[2];
+
+                qxdiff = sigma * (ubarx[k] - gx_upd);
+                qydiff = sigma * (ubary[k] - gy_upd);
+                
+                qx_current[k] += qxdiff; // write 1
+                qy_current[k] += qydiff; // write 1
+
+                unorm += (udiff[k] * udiff[k]);
+                qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+                float div_upd = 0;
+
+#ifndef TNV_LOOP_FIRST_I
+                div_upd -= qx_prev[k]; // 1 read
+#endif
+#ifndef TNV_LOOP_FIRST_J
+                div_upd -= qy_prev[k]; // 1 read
+#endif
+#ifndef TNV_LOOP_LAST_I
+                div_upd += qx_current[k]; 
+#endif
+#ifndef TNV_LOOP_LAST_J
+                div_upd += qy_current[k]; 
+#endif
+
+                divdiff = div[l + k] - div_upd;  // 1 read
+                div[l + k] = div_upd; // 1 write
+
+#ifndef TNV_LOOP_FIRST_J
+                resprimal += fabs(divtau * udiff[k] + divdiff); 
+#endif
+
+                    // We need to have two independent accumulators to allow gcc-autovectorization
+                resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r
+                resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r
+                product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+            }
+    }
+    
\ No newline at end of file
-- 
cgit v1.2.3