1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
|
#define _POSIX_C_SOURCE 200112L
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdint.h>
#include <assert.h>
#include <ctype.h>
#include <time.h>
#include <sched.h>
#include <arpa/inet.h>
#include <sys/time.h>
#include "cpu.h"
#include "pci.h"
#include "tools.h"
#include "error.h"
/*
void *memcpy128(void * dst, void const * src, size_t len) {
long pos = - (len>>2);
char * plDst = (char *) dst - 4 * pos;
char const * plSrc = (char const *) src - 4 * pos;
if (pos) {
__asm__ __volatile__ (
"1: \n\t"
"mov (%0,%2,4), %%edi \n\t"
"mov %%edi, (%1,%2,4) \n\t"
"inc %2 \n\t"
"jnz 1b \n\t"
:
: "r" (plSrc), "r" (plDst), "r" (pos)
: "%edi"
);
}
long pos = - ((len>>4)<<4);
char * plDst = (char *) dst - pos;
char const * plSrc = (char const *) src - pos;
if (pos) {
__asm__ __volatile__ (
"1: \n\t"
// "movdqa (%0,%2), %%xmm0 \n\t"
"mov (%0,%2), %%esi \n\t"
"movd %%esi, %%xmm0 \n\t"
"mov 4(%0,%2), %%esi \n\t"
"movd %%esi, %%xmm1 \n\t"
"mov 8(%0,%2), %%esi \n\t"
"movd %%esi, %%xmm2 \n\t"
"mov 12(%0,%2), %%esi \n\t"
"movd %%esi, %%xmm3 \n\t"
"pslldq $4, %%xmm1 \n\t"
"por %%xmm1, %%xmm0 \n\t"
"pslldq $8, %%xmm2 \n\t"
"por %%xmm2, %%xmm0 \n\t"
"pslldq $12, %%xmm3 \n\t"
"por %%xmm3, %%xmm0 \n\t"
"movntdq %%xmm0, (%1,%2) \n\t"
"add $16, %2 \n\t"
"jnz 1b \n\t"
:
: "r" (plSrc), "r" (plDst), "r" (pos)
: "%rsi"
);
}
len &= 0x3;
char * pcDst = (char *) plDst;
char const * pcSrc = (char const *) plSrc;
while (len--) {
*pcDst++ = *pcSrc++;
}
return (dst);
}
*/
/**
 * Copy memory in 512-byte blocks using AVX loads and non-temporal stores.
 *
 * Each loop iteration loads 16 x 32 bytes from src with vmovdqa and writes
 * them to dst with vmovntps (non-temporal store), then an sfence
 * orders the non-temporal stores before the function returns.
 *
 * @param dst  destination buffer; must be 32-byte aligned (vmovntps faults
 *             on a misaligned address)
 * @param src  source buffer; must be 32-byte aligned (vmovdqa faults on a
 *             misaligned address)
 * @param size number of bytes to copy; only whole 512-byte blocks are
 *             copied, the trailing (size % 512) bytes are NOT copied and
 *             a size below 512 copies nothing
 *
 * The caller is responsible for making sure the CPU supports AVX.
 */
void pcilib_memcpy4k_avx(void *dst, void *src, size_t size) {
    size_t blocks = size / 512;   /* number of 512-byte iterations */
    size_t offset = 0;            /* running byte offset into both buffers */

    /* The asm loop is do-while shaped: entering it with a zero counter
     * would copy one block and then wrap the counter around. */
    if (blocks == 0)
        return;

    __asm__ __volatile__ (
        "1:                                         \n\t"
        /* load one 512-byte block from the source */
        "vmovdqa     (%[src],%[off]), %%ymm0        \n\t"
        "vmovdqa   32(%[src],%[off]), %%ymm1        \n\t"
        "vmovdqa   64(%[src],%[off]), %%ymm2        \n\t"
        "vmovdqa   96(%[src],%[off]), %%ymm3        \n\t"
        "vmovdqa  128(%[src],%[off]), %%ymm4        \n\t"
        "vmovdqa  160(%[src],%[off]), %%ymm5        \n\t"
        "vmovdqa  192(%[src],%[off]), %%ymm6        \n\t"
        "vmovdqa  224(%[src],%[off]), %%ymm7        \n\t"
        "vmovdqa  256(%[src],%[off]), %%ymm8        \n\t"
        "vmovdqa  288(%[src],%[off]), %%ymm9        \n\t"
        "vmovdqa  320(%[src],%[off]), %%ymm10       \n\t"
        "vmovdqa  352(%[src],%[off]), %%ymm11       \n\t"
        "vmovdqa  384(%[src],%[off]), %%ymm12       \n\t"
        "vmovdqa  416(%[src],%[off]), %%ymm13       \n\t"
        "vmovdqa  448(%[src],%[off]), %%ymm14       \n\t"
        "vmovdqa  480(%[src],%[off]), %%ymm15       \n\t"
        /* stream the block to the destination */
        "vmovntps %%ymm0,     (%[dst],%[off])       \n\t"
        "vmovntps %%ymm1,   32(%[dst],%[off])       \n\t"
        "vmovntps %%ymm2,   64(%[dst],%[off])       \n\t"
        "vmovntps %%ymm3,   96(%[dst],%[off])       \n\t"
        "vmovntps %%ymm4,  128(%[dst],%[off])       \n\t"
        "vmovntps %%ymm5,  160(%[dst],%[off])       \n\t"
        "vmovntps %%ymm6,  192(%[dst],%[off])       \n\t"
        "vmovntps %%ymm7,  224(%[dst],%[off])       \n\t"
        "vmovntps %%ymm8,  256(%[dst],%[off])       \n\t"
        "vmovntps %%ymm9,  288(%[dst],%[off])       \n\t"
        "vmovntps %%ymm10, 320(%[dst],%[off])       \n\t"
        "vmovntps %%ymm11, 352(%[dst],%[off])       \n\t"
        "vmovntps %%ymm12, 384(%[dst],%[off])       \n\t"
        "vmovntps %%ymm13, 416(%[dst],%[off])       \n\t"
        "vmovntps %%ymm14, 448(%[dst],%[off])       \n\t"
        "vmovntps %%ymm15, 480(%[dst],%[off])       \n\t"
        "add $512, %[off]                           \n\t"
        "dec %[cnt]                                 \n\t"
        "jnz 1b                                     \n\t"
        /* order the non-temporal stores before returning */
        "sfence                                     \n\t"
        /* leave the upper ymm halves clean for SSE code that follows */
        "vzeroupper"
        /* Both counters are modified by the loop, so they are read-write
         * operands; early-clobber keeps them apart from the pointers. */
        : [off] "+&r" (offset), [cnt] "+&r" (blocks)
        : [dst] "r" (dst), [src] "r" (src)
        /* xmmN names the same physical register as ymmN */
        : "memory", "cc",
          "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
          "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
    );
}
/**
 * Copy size bytes from src to dst, using the AVX block copy when possible.
 *
 * The AVX routine is used only when all of the following hold:
 *  - pcilib_get_cpu_gen() reports a generation above 3
 *    (NOTE(review): presumably the first generation with AVX - confirm
 *    against pcilib_get_cpu_gen());
 *  - size is a non-zero multiple of 4096;
 *  - both dst and src are 32-byte aligned.
 * In every other case the copy falls back to memcpy().
 *
 * @param dst  destination buffer
 * @param src  source buffer
 * @param size number of bytes to copy
 */
void pcilib_pagecpy(void *dst, void *src, size_t size) {
    int gen = pcilib_get_cpu_gen();

    /* size > 0 is required explicitly: 0 % 4096 == 0 would otherwise send
     * an empty copy to the AVX loop, which always runs at least once. */
    if ((gen > 3) && (size > 0) && (size % 4096 == 0) &&
        ((uintptr_t)dst % 32 == 0) && ((uintptr_t)src % 32 == 0)) {
        pcilib_memcpy4k_avx(dst, src, size);
    } else {
        memcpy(dst, src, size);
    }
}
|