summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorSuren A. Chilingaryan <csa@suren.me>2020-03-29 20:34:12 +0200
committerSuren A. Chilingaryan <csa@suren.me>2020-03-29 20:34:12 +0200
commitda30ce619bce168fc24ed0c17f04e411626ba18b (patch)
treee0835acdf2bc3e0e3831cf83addaf00d93d87097 /src
parent9b2ffcb1eb7e87b2b08670bbbd2fa8db6e5b2e1e (diff)
downloadregularization-da30ce619bce168fc24ed0c17f04e411626ba18b.tar.gz
regularization-da30ce619bce168fc24ed0c17f04e411626ba18b.tar.bz2
regularization-da30ce619bce168fc24ed0c17f04e411626ba18b.tar.xz
regularization-da30ce619bce168fc24ed0c17f04e411626ba18b.zip
Eliminate conditionals in the innermost loop to help gcc auto-vectorizer
Diffstat (limited to 'src')
-rw-r--r--src/Core/CMakeLists.txt13
-rwxr-xr-xsrc/Core/regularisers_CPU/TNV_core.c216
-rwxr-xr-xsrc/Core/regularisers_CPU/TNV_core_backtrack.c189
-rw-r--r--src/Core/regularisers_CPU/TNV_core_backtrack_loop.h100
-rw-r--r--src/Core/regularisers_CPU/TNV_core_loop.h107
5 files changed, 423 insertions, 202 deletions
diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt
index 10b16f3..76b0f3e 100644
--- a/src/Core/CMakeLists.txt
+++ b/src/Core/CMakeLists.txt
@@ -60,9 +60,16 @@ message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}")
message("Adding regularisers as a shared library")
#set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc)
-#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=2 -qopt-report-phase=vec -qopenmp")
-set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp")
-#set(CMAKE_C_FLAGS "-march=native -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -mavx -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")
+#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g")
+#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g")
+#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g")
+
+#set(CMAKE_C_COMPILER clang)
+#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp")
+
+#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp")
+set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")
+#set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")
#set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer")
#set(CMAKE_C_FLAGS "-acc -Minfo -ta=tesla:cc20 -openmp")
diff --git a/src/Core/regularisers_CPU/TNV_core.c b/src/Core/regularisers_CPU/TNV_core.c
index dce414a..415c644 100755
--- a/src/Core/regularisers_CPU/TNV_core.c
+++ b/src/Core/regularisers_CPU/TNV_core.c
@@ -5,12 +5,6 @@
*
* Copyright 2017 Daniil Kazantsev
* Copyright 2017 Srikanth Nagella, Edoardo Pasca
- *
- * Copyriht 2020 Suren A. Chlingaryan
- * Optimized version with 1/3 of memory consumption and ~10x performance
- * This version is not able to perform back-track except during first iterations
- * But warning would be printed if backtracking is required and slower version (TNV_core_backtrack.c)
- * could be executed instead. It still better than original with 1/2 of memory consumption and 4x performance gain
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -23,6 +17,7 @@
* limitations under the License.
*/
+#include <malloc.h>
#include "TNV_core.h"
#define BLOCK 32
@@ -143,6 +138,7 @@ typedef struct {
int offY, stepY, copY;
float *Input, *u, *qx, *qy, *gradx, *grady, *div;
float *div0, *udiff0;
+ float *gradxdiff, *gradydiff, *ubarx, *ubary, *udiff;
float resprimal, resdual;
float unorm, qnorm, product;
} tnv_thread_t;
@@ -174,7 +170,13 @@ static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) {
free(ctx->div0);
free(ctx->udiff0);
-
+
+ free(ctx->gradxdiff);
+ free(ctx->gradydiff);
+ free(ctx->ubarx);
+ free(ctx->ubary);
+ free(ctx->udiff);
+
return 0;
}
@@ -194,6 +196,7 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {
long DimTotal = (long)(dimX*stepY*padZ);
long Dim1Total = (long)(dimX*(stepY+1)*padZ);
long DimRow = (long)(dimX * padZ);
+ long DimCell = (long)(padZ);
// Auxiliar vectors
ctx->Input = memalign(64, Dim1Total * sizeof(float));
@@ -207,7 +210,13 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) {
ctx->div0 = memalign(64, DimRow * sizeof(float));
ctx->udiff0 = memalign(64, DimRow * sizeof(float));
- if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff0)) {
+ ctx->gradxdiff = memalign(64, DimCell * sizeof(float));
+ ctx->gradydiff = memalign(64, DimCell * sizeof(float));
+ ctx->ubarx = memalign(64, DimCell * sizeof(float));
+ ctx->ubary = memalign(64, DimCell * sizeof(float));
+ ctx->udiff = memalign(64, DimCell * sizeof(float));
+
+ if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) {
fprintf(stderr, "Error allocating memory\n");
exit(-1);
}
@@ -303,7 +312,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
- long i, j, k, l;
+ long i, j, k, l, m;
tnv_context_t *tnv_ctx = (tnv_context_t*)data;
tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
@@ -340,7 +349,8 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
float constant = 1.0f + taulambda;
float resprimal = 0.0f;
- float resdual = 0.0f;
+ float resdual1 = 0.0f;
+ float resdual2 = 0.0f;
float product = 0.0f;
float unorm = 0.0f;
float qnorm = 0.0f;
@@ -348,100 +358,92 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
float qxdiff;
float qydiff;
float divdiff;
- float gradxdiff[dimZ] __attribute__((aligned(64)));
- float gradydiff[dimZ] __attribute__((aligned(64)));
- float ubarx[dimZ] __attribute__((aligned(64)));
- float ubary[dimZ] __attribute__((aligned(64)));
- float udiff[dimZ] __attribute__((aligned(64)));
-
-
- for(i=0; i < dimX; i++) {
- for(k = 0; k < dimZ; k++) {
- int l = i * padZ + k;
- float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant;
- float udiff = u[l] - u_upd;
- ctx->udiff0[l] = udiff;
- ctx->div0[l] = div[l];
+ float *gradxdiff = ctx->gradxdiff;
+ float *gradydiff = ctx->gradydiff;
+ float *ubarx = ctx->ubarx;
+ float *ubary = ctx->ubary;
+ float *udiff = ctx->udiff;
+
+ float *udiff0 = ctx->udiff0;
+ float *div0 = ctx->div0;
+
+
+ j = 0; {
+# define TNV_LOOP_FIRST_J
+ i = 0; {
+# define TNV_LOOP_FIRST_I
+# include "TNV_core_loop.h"
+# undef TNV_LOOP_FIRST_I
+ }
+ for(i = 1; i < (dimX - 1); i++) {
+# include "TNV_core_loop.h"
}
+ i = dimX - 1; {
+# define TNV_LOOP_LAST_I
+# include "TNV_core_loop.h"
+# undef TNV_LOOP_LAST_I
+ }
+# undef TNV_LOOP_FIRST_J
}
- for(int j1 = 0; j1 < stepY; j1 += BLOCK) {
- for(int i1 = 0; i1 < dimX; i1 += BLOCK) {
- for(int j2 = 0; j2 < BLOCK; j2++) {
- j = j1 + j2;
- for(int i2 = 0; i2 < BLOCK; i2++) {
- float t[3];
- float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
-
- i = i1 + i2;
- if (i == dimX) break;
- if (j == stepY) { j2 = BLOCK; break; }
- l = (j * dimX + i) * padZ;
-
-#pragma vector aligned
-#pragma GCC ivdep
- for(k = 0; k < dimZ; k++) {
- float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant;
- udiff[k] = u[l + k] - u_upd;
- u[l + k] = u_upd;
-
- float gradx_upd = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd);
- float grady_upd = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd);
- gradxdiff[k] = gradx[l + k] - gradx_upd;
- gradydiff[k] = grady[l + k] - grady_upd;
- gradx[l + k] = gradx_upd;
- grady[l + k] = grady_upd;
-
- ubarx[k] = gradx_upd - theta * gradxdiff[k];
- ubary[k] = grady_upd - theta * gradydiff[k];
-
- float vx = ubarx[k] + divsigma * qx[l + k];
- float vy = ubary[k] + divsigma * qy[l + k];
-
- M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
- }
- coefF(t, M1, M2, M3, sigma, p, q, r);
-
-#pragma vector aligned
-#pragma GCC ivdep
- for(k = 0; k < dimZ; k++) {
- float vx = ubarx[k] + divsigma * qx[l + k];
- float vy = ubary[k] + divsigma * qy[l + k];
- float gx_upd = vx * t[0] + vy * t[1];
- float gy_upd = vx * t[1] + vy * t[2];
-
- qxdiff = sigma * (ubarx[k] - gx_upd);
- qydiff = sigma * (ubary[k] - gy_upd);
-
- qx[l + k] += qxdiff;
- qy[l + k] += qydiff;
-
- unorm += (udiff[k] * udiff[k]);
- qnorm += (qxdiff * qxdiff + qydiff * qydiff);
-
- float div_upd = 0;
- div_upd -= (i > 0)?qx[l + k - padZ]:0;
- div_upd -= (j > 0)?qy[l + k - dimX*padZ]:0;
- div_upd += (i < (dimX-1))?qx[l + k]:0;
- div_upd += (j < (copY-1))?qy[l + k]:0;
- divdiff = div[l + k] - div_upd;
- div[l + k] = div_upd;
-
- resprimal += ((offY == 0)||(j > 0))?fabs(divtau * udiff[k] + divdiff):0;
- resdual += fabs(divsigma * qxdiff + gradxdiff[k]);
- resdual += fabs(divsigma * qydiff + gradydiff[k]);
- product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+
+ for(int j = 1; j < (copY - 1); j++) {
+ i = 0; {
+# define TNV_LOOP_FIRST_I
+# include "TNV_core_loop.h"
+# undef TNV_LOOP_FIRST_I
+ }
+ }
+
+ for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) {
+ for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) {
+ for(int j2 = 0; j2 < BLOCK; j2 ++) {
+ j = j1 + j2;
+ for(int i2 = 0; i2 < BLOCK; i2++) {
+ i = i1 + i2;
+
+ if (i == (dimX - 1)) break;
+ if (j == (copY - 1)) { j2 = BLOCK; break; }
+# include "TNV_core_loop.h"
+ }
}
-
} // i
- } // j
- } // i
- } // j
+
+ }
+
+ for(int j = 1; j < (copY - 1); j++) {
+ i = dimX - 1; {
+# define TNV_LOOP_LAST_I
+# include "TNV_core_loop.h"
+# undef TNV_LOOP_LAST_I
+ }
+ }
+
+
+
+ for (j = copY - 1; j < stepY; j++) {
+# define TNV_LOOP_LAST_J
+ i = 0; {
+# define TNV_LOOP_FIRST_I
+# include "TNV_core_loop.h"
+# undef TNV_LOOP_FIRST_I
+ }
+ for(i = 1; i < (dimX - 1); i++) {
+# include "TNV_core_loop.h"
+ }
+ i = dimX - 1; {
+# define TNV_LOOP_LAST_I
+# include "TNV_core_loop.h"
+# undef TNV_LOOP_LAST_I
+ }
+# undef TNV_LOOP_LAST_J
+ }
+
ctx->resprimal = resprimal;
- ctx->resdual = resdual;
+ ctx->resdual = resdual1 + resdual2;
ctx->product = product;
ctx->unorm = unorm;
ctx->qnorm = qnorm;
@@ -458,9 +460,9 @@ static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ)
tnv_ctx.dimY = dimY;
tnv_ctx.dimZ = dimZ;
// Padding seems actually slower
-// tnv_ctx.padZ = dimZ;
+ tnv_ctx.padZ = dimZ;
// tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0));
- tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0));
+// tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0));
hw_sched_init();
@@ -580,15 +582,28 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to
float udiff = ctx->udiff0[l];
ctx->div[l] -= ctx0->qy[l + m];
- ctx0->div[m + l + dimX * padZ] = ctx->div[l];
- ctx0->u[m + l + dimX * padZ] = ctx->u[l];
-
+ ctx0->div[m + l + dimX*padZ] = ctx->div[l];
+ ctx0->u[m + l + dimX*padZ] = ctx->u[l];
+
divdiff += ctx0->qy[l + m];
resprimal += fabs(divtau * udiff + divdiff);
}
}
}
+ {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0;
+ for(i = 0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+
+ float divdiff = ctx->div0[l] - ctx->div[l];
+ float udiff = ctx->udiff0[l];
+ resprimal += fabs(divtau * udiff + divdiff);
+ }
+ }
+ }
+
for (j = 0; j < tnv_ctx.threads; j++) {
tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
resprimal += ctx->resprimal;
@@ -645,6 +660,5 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to
printf("Iterations stopped at %i with the residual %f \n", iter, residual);
printf("Return: %f\n", *uT);
-// exit(-1);
return *uT;
}
diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c
index 7192475..9b19ed5 100755
--- a/src/Core/regularisers_CPU/TNV_core_backtrack.c
+++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c
@@ -335,6 +335,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
long i, j, k, l;
+ long i1, i2, j1, j2;
tnv_context_t *tnv_ctx = (tnv_context_t*)data;
tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
@@ -376,11 +377,12 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
float theta1 = 1.0f + theta;
float constant = 1.0f + taulambda;
- float resprimal = 0.0;
- float resdual = 0.0;
- float product = 0.0;
- float unorm = 0.0;
- float qnorm = 0.0;
+ float resprimal = 0.0f;
+ float resdual1 = 0.0f;
+ float resdual2 = 0.0f;
+ float product = 0.0f;
+ float unorm = 0.0f;
+ float qnorm = 0.0f;
float udiff[dimZ] __attribute__((aligned(64)));
float qxdiff __attribute__((aligned(64)));
@@ -389,104 +391,82 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
float gradxdiff[dimZ] __attribute__((aligned(64)));
float gradydiff[dimZ] __attribute__((aligned(64)));
- for(int j1 = 0; j1 < stepY; j1 += BLOCK) {
- for(int i1 = 0; i1 < dimX; i1 += BLOCK) {
- for(int j2 = 0; j2 < BLOCK; j2++) {
- j = j1 + j2;
- for(int i2 = 0; i2 < BLOCK; i2++) {
- float t[3];
- float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
-
- i = i1 + i2;
- if (i == dimX) break;
- if (j == stepY) { j2 = BLOCK; break; }
- l = (j * dimX + i) * padZ;
-
-//#pragma vector aligned
-#pragma GCC ivdep
- for(k = 0; k < dimZ; k++) {
- u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant;
- udiff[k] = u[l + k] - u_upd[l + k];
- unorm += (udiff[k] * udiff[k]);
-
- gradx_upd[l + k] = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]);
- grady_upd[l + k] = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]);
- gradxdiff[k] = gradx[l + k] - gradx_upd[l + k];
- gradydiff[k] = grady[l + k] - grady_upd[l + k];
-
- float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
- float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
-//#define TNV_NEW_STYLE
-#ifdef TNV_NEW_STYLE
- qx_upd[l + k] = qx[l + k] + sigma * ubarx;
- qy_upd[l + k] = qy[l + k] + sigma * ubary;
-
- float vx = divsigma * qx_upd[l + k]; //+ ubarx
- float vy = divsigma * qy_upd[l + k]; //+ ubary
-#else
- float vx = ubarx + divsigma * qx[l + k];
- float vy = ubary + divsigma * qy[l + k];
-#endif
-
- M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
- }
- coefF(t, M1, M2, M3, sigma, p, q, r);
-
-//#pragma vector aligned
-#pragma GCC ivdep
- for(k = 0; k < dimZ; k++) {
-#ifdef TNV_NEW_STYLE
- float vx = divsigma * qx_upd[l + k];
- float vy = divsigma * qy_upd[l + k];
-
- float gx_upd = vx * t[0] + vy * t[1];
- float gy_upd = vx * t[1] + vy * t[2];
-
- qx_upd[l + k] -= sigma * gx_upd;
- qy_upd[l + k] -= sigma * gy_upd;
-#else
- float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
- float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
- float vx = ubarx + divsigma * qx[l + k];
- float vy = ubary + divsigma * qy[l + k];
-
- float gx_upd = vx * t[0] + vy * t[1];
- float gy_upd = vx * t[1] + vy * t[2];
-
- qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd);
- qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd);
-#endif
-
- float div_upd_val = 0;
- div_upd_val -= (i > 0)?qx_upd[l + k - padZ]:0;
- div_upd_val -= (j > 0)?qy_upd[l + k - dimX * padZ]:0;
- div_upd_val += (i < (dimX-1))?qx_upd[l + k]:0;
- div_upd_val += (j < (copY-1))?qy_upd[l + k]:0;
- div_upd[l + k] = div_upd_val;
-
- qxdiff = qx[l + k] - qx_upd[l + k];
- qydiff = qy[l + k] - qy_upd[l + k];
- qnorm += (qxdiff * qxdiff + qydiff * qydiff);
-
- resdual += fabs(divsigma * qxdiff - gradxdiff[k]);
- resdual += fabs(divsigma * qydiff - gradydiff[k]);
- product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
-
- if ((offY == 0)||(j > 0)) {
- divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history?
- resprimal += fabs(divtau * udiff[k] + divdiff);
- }
+ j = 0; {
+# define TNV_LOOP_FIRST_J
+ i = 0; {
+# define TNV_LOOP_FIRST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_FIRST_I
+ }
+ for(i = 1; i < (dimX - 1); i++) {
+# include "TNV_core_backtrack_loop.h"
+ }
+ i = dimX - 1; {
+# define TNV_LOOP_LAST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_LAST_I
+ }
+# undef TNV_LOOP_FIRST_J
+ }
+
+
+
+ for(int j = 1; j < (copY - 1); j++) {
+ i = 0; {
+# define TNV_LOOP_FIRST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_FIRST_I
+ }
+ }
+
+ for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) {
+ for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) {
+ for(int j2 = 0; j2 < BLOCK; j2 ++) {
+ j = j1 + j2;
+ for(int i2 = 0; i2 < BLOCK; i2++) {
+ i = i1 + i2;
+
+ if (i == (dimX - 1)) break;
+ if (j == (copY - 1)) { j2 = BLOCK; break; }
+# include "TNV_core_backtrack_loop.h"
+ }
}
-
} // i
- } // j
- } // i
- } // j
-
+
+ }
+
+ for(int j = 1; j < (copY - 1); j++) {
+ i = dimX - 1; {
+# define TNV_LOOP_LAST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_LAST_I
+ }
+ }
+
+
+
+ for (j = copY - 1; j < stepY; j++) {
+# define TNV_LOOP_LAST_J
+ i = 0; {
+# define TNV_LOOP_FIRST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_FIRST_I
+ }
+ for(i = 1; i < (dimX - 1); i++) {
+# include "TNV_core_backtrack_loop.h"
+ }
+ i = dimX - 1; {
+# define TNV_LOOP_LAST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_LAST_I
+ }
+# undef TNV_LOOP_LAST_J
+ }
+
ctx->resprimal = resprimal;
- ctx->resdual = resdual;
+ ctx->resdual = resdual1 + resdual2;
ctx->product = product;
ctx->unorm = unorm;
ctx->qnorm = qnorm;
@@ -630,6 +610,19 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to
}
}
+ {
+ tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0;
+ for(i = 0; i < dimX; i++) {
+ for(k = 0; k < dimZ; k++) {
+ int l = i * padZ + k;
+
+ float divdiff = ctx->div[l] - ctx->div_upd[l];
+ float udiff = ctx->u[l] - ctx->u_upd[l];
+ resprimal += fabs(divtau * udiff + divdiff);
+ }
+ }
+ }
+
for (j = 0; j < tnv_ctx.threads; j++) {
tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
resprimal += ctx->resprimal;
diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h
new file mode 100644
index 0000000..3ec4250
--- /dev/null
+++ b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h
@@ -0,0 +1,100 @@
+ float t[3];
+ float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+
+ l = (j * dimX + i) * padZ;
+
+//#pragma vector aligned
+#pragma GCC ivdep
+ for(k = 0; k < dimZ; k++) {
+ u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant;
+ udiff[k] = u[l + k] - u_upd[l + k];
+ unorm += (udiff[k] * udiff[k]);
+
+#ifdef TNV_LOOP_LAST_I
+ gradx_upd[l + k] = 0;
+#else
+ gradx_upd[l + k] = ((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]);
+#endif
+
+#ifdef TNV_LOOP_LAST_J
+ grady_upd[l + k] = 0;
+#else
+ grady_upd[l + k] = ((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]);
+#endif
+
+ gradxdiff[k] = gradx[l + k] - gradx_upd[l + k];
+ gradydiff[k] = grady[l + k] - grady_upd[l + k];
+
+ float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+ float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+//#define TNV_NEW_STYLE
+#ifdef TNV_NEW_STYLE
+ qx_upd[l + k] = qx[l + k] + sigma * ubarx;
+ qy_upd[l + k] = qy[l + k] + sigma * ubary;
+
+ float vx = divsigma * qx_upd[l + k]; //+ ubarx
+ float vy = divsigma * qy_upd[l + k]; //+ ubary
+#else
+ float vx = ubarx + divsigma * qx[l + k];
+ float vy = ubary + divsigma * qy[l + k];
+#endif
+
+ M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+ }
+
+ coefF(t, M1, M2, M3, sigma, p, q, r);
+
+//#pragma vector aligned
+#pragma GCC ivdep
+ for(k = 0; k < dimZ; k++) {
+#ifdef TNV_NEW_STYLE
+ float vx = divsigma * qx_upd[l + k];
+ float vy = divsigma * qy_upd[l + k];
+
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qx_upd[l + k] -= sigma * gx_upd;
+ qy_upd[l + k] -= sigma * gy_upd;
+#else
+ float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+ float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+ float vx = ubarx + divsigma * qx[l + k];
+ float vy = ubary + divsigma * qy[l + k];
+
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd);
+ qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd);
+#endif
+
+ float div_upd_val = 0;
+#ifndef TNV_LOOP_FIRST_I
+ div_upd_val -= qx_upd[l + k - padZ];
+#endif
+
+#ifndef TNV_LOOP_FIRST_J
+ div_upd_val -= qy_upd[l + k - dimX * padZ];
+#endif
+#ifndef TNV_LOOP_LAST_I
+ div_upd_val += qx_upd[l + k];
+#endif
+#ifndef TNV_LOOP_LAST_J
+ div_upd_val += qy_upd[l + k];
+#endif
+ div_upd[l + k] = div_upd_val;
+
+ qxdiff = qx[l + k] - qx_upd[l + k];
+ qydiff = qy[l + k] - qy_upd[l + k];
+ qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+ resdual1 += fabs(divsigma * qxdiff - gradxdiff[k]);
+ resdual2 += fabs(divsigma * qydiff - gradydiff[k]);
+ product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+
+#ifndef TNV_LOOP_FIRST_J
+ divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history?
+ resprimal += fabs(divtau * udiff[k] + divdiff);
+#endif
+ }
diff --git a/src/Core/regularisers_CPU/TNV_core_loop.h b/src/Core/regularisers_CPU/TNV_core_loop.h
new file mode 100644
index 0000000..34e7139
--- /dev/null
+++ b/src/Core/regularisers_CPU/TNV_core_loop.h
@@ -0,0 +1,107 @@
+ {
+ float t[3];
+ float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+ l = (j * dimX + i) * padZ;
+ m = dimX * padZ;
+
+ float *__restrict u_next = u + l + padZ;
+ float *__restrict u_current = u + l;
+ float *__restrict u_next_row = u + l + m;
+
+
+ float *__restrict qx_current = qx + l;
+ float *__restrict qy_current = qy + l;
+ float *__restrict qx_prev = qx + l - padZ;
+ float *__restrict qy_prev = qy + l - m;
+
+
+// __assume(padZ%16==0);
+
+//#pragma vector aligned
+#pragma GCC ivdep
+ for(k = 0; k < dimZ; k++) {
+ float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads
+ udiff[k] = u[l + k] - u_upd; // cache 1w
+ u[l + k] = u_upd; // 1 write
+
+#ifdef TNV_LOOP_FIRST_J
+ udiff0[l + k] = udiff[k];
+ div0[l + k] = div[l + k];
+#endif
+
+#ifdef TNV_LOOP_LAST_I
+ float gradx_upd = 0;
+#else
+ float u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads
+ float gradx_upd = (u_next_upd - u_upd); // 2 reads
+#endif
+
+#ifdef TNV_LOOP_LAST_J
+ float grady_upd = 0;
+#else
+ float u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads
+ float grady_upd = (u_next_row_upd - u_upd); // 1 read
+#endif
+
+ gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w
+ gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w
+ gradx[l + k] = gradx_upd; // 1 write
+ grady[l + k] = grady_upd; // 1 write
+
+ ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w
+ ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w
+
+ float vx = ubarx[k] + divsigma * qx[l + k]; // 1 read
+ float vy = ubary[k] + divsigma * qy[l + k]; // 1 read
+
+ M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+ }
+
+ coefF(t, M1, M2, M3, sigma, p, q, r);
+
+//#pragma vector aligned
+#pragma GCC ivdep
+ for(k = 0; k < padZ; k++) {
+ float vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r
+ float vy = ubary[k] + divsigma * qy_current[k]; // cache 2r
+ float gx_upd = vx * t[0] + vy * t[1];
+ float gy_upd = vx * t[1] + vy * t[2];
+
+ qxdiff = sigma * (ubarx[k] - gx_upd);
+ qydiff = sigma * (ubary[k] - gy_upd);
+
+ qx_current[k] += qxdiff; // write 1
+ qy_current[k] += qydiff; // write 1
+
+ unorm += (udiff[k] * udiff[k]);
+ qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+ float div_upd = 0;
+
+#ifndef TNV_LOOP_FIRST_I
+ div_upd -= qx_prev[k]; // 1 read
+#endif
+#ifndef TNV_LOOP_FIRST_J
+ div_upd -= qy_prev[k]; // 1 read
+#endif
+#ifndef TNV_LOOP_LAST_I
+ div_upd += qx_current[k];
+#endif
+#ifndef TNV_LOOP_LAST_J
+ div_upd += qy_current[k];
+#endif
+
+ divdiff = div[l + k] - div_upd; // 1 read
+ div[l + k] = div_upd; // 1 write
+
+#ifndef TNV_LOOP_FIRST_J
+ resprimal += fabs(divtau * udiff[k] + divdiff);
+#endif
+
+ // We need to have two independent accumulators to allow gcc-autovectorization
+ resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r
+ resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r
+ product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+ }
+ }
+ \ No newline at end of file