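/*
 * Per-pixel TNV update body (hedged description, inferred from the code below).
 * This block appears to be a macro-parameterized loop body: the TNV_LOOP_FIRST_I /
 * TNV_LOOP_LAST_I / TNV_LOOP_FIRST_J / TNV_LOOP_LAST_J macros select the boundary
 * handling for the forward differences and the divergence at the image borders.
 * All variables not declared here (u, qx, qy, div, Input, tau, sigma, theta, ...)
 * are assumed to come from the enclosing scope.
 */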
{
    float t[3];
    float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
    l = (j * dimX + i) * padZ;
    m = dimX * padZ;
    floatxx *__restrict u_next = u + l + padZ;
    floatxx *__restrict u_current = u + l;
    floatxx *__restrict u_next_row = u + l + m;
    floatxx *__restrict qx_current = qx + l;
    floatxx *__restrict qy_current = qy + l;
    floatxx *__restrict qx_prev = qx + l - padZ;
    floatxx *__restrict qy_prev = qy + l - m;
    // __assume(padZ%16==0);
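    // Pass 1 (description inferred from the statements below): primal update of u,
    // forward-difference gradients with border handling selected by the TNV_LOOP_* macros,
    // extrapolated "ubar" values, and accumulation of the 2x2 moment matrix
    // [M1 M2; M2 M3] of (vx, vy) over the z/channel dimension.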
#pragma vector aligned
#pragma GCC ivdep
    for(k = 0; k < dimZ; k++) {
        floatyy u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads
        udiff[k] = u[l + k] - u_upd; // cache 1w
        u[l + k] = u_upd; // 1 write
#ifdef TNV_LOOP_FIRST_J
        udiff0[l + k] = udiff[k];
        div0[l + k] = div[l + k];
#endif
#ifdef TNV_LOOP_LAST_I
        floatyy gradx_upd = 0;
#else
        floatyy u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads
        floatyy gradx_upd = (u_next_upd - u_upd); // 2 reads
#endif
#ifdef TNV_LOOP_LAST_J
        floatyy grady_upd = 0;
#else
        floatyy u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads
        floatyy grady_upd = (u_next_row_upd - u_upd); // 1 read
#endif
        gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w
        gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w
        gradx[l + k] = gradx_upd; // 1 write
        grady[l + k] = grady_upd; // 1 write
        ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w
        ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w
        floatyy vx = ubarx[k] + divsigma * qx[l + k]; // 1 read
        floatyy vy = ubary[k] + divsigma * qy[l + k]; // 1 read
        M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
    }
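    // coefF() fills t[0..2] with the entries of a symmetric 2x2 matrix [t0 t1; t1 t2]
    // built from the moments M1, M2, M3; presumably this implements the projection
    // used by the TNV dual update (its definition lies outside this fragment).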
    coefF(t, M1, M2, M3, sigma, p, q, r);
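    // Pass 2 (description inferred from the statements below): dual update of (qx, qy),
    // recomputation of the backward divergence with the same border macros, and
    // accumulation of the norms and primal/dual residuals.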
#pragma vector aligned
#pragma GCC ivdep
    for(k = 0; k < padZ; k++) {
        floatyy vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r
        floatyy vy = ubary[k] + divsigma * qy_current[k]; // cache 2r
        floatyy gx_upd = vx * t[0] + vy * t[1];
        floatyy gy_upd = vx * t[1] + vy * t[2];
        qxdiff = sigma * (ubarx[k] - gx_upd);
        qydiff = sigma * (ubary[k] - gy_upd);
        qx_current[k] += qxdiff; // write 1
        qy_current[k] += qydiff; // write 1
        unorm += (udiff[k] * udiff[k]);
        qnorm += (qxdiff * qxdiff + qydiff * qydiff);
        floatyy div_upd = 0;
#ifndef TNV_LOOP_FIRST_I
        div_upd -= qx_prev[k]; // 1 read
#endif
#ifndef TNV_LOOP_FIRST_J
        div_upd -= qy_prev[k]; // 1 read
#endif
#ifndef TNV_LOOP_LAST_I
        div_upd += qx_current[k];
#endif
#ifndef TNV_LOOP_LAST_J
        div_upd += qy_current[k];
#endif
        divdiff = div[l + k] - div_upd; // 1 read
        div[l + k] = div_upd; // 1 write
#ifndef TNV_LOOP_FIRST_J
        resprimal += fabs(divtau * udiff[k] + divdiff);
#endif
        // Two independent accumulators are needed to allow gcc auto-vectorization
        resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r
        resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r
        product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
    }
}