/*
 * pcilib/pagecpy.c
 * (blob f474f9ff61132519f19890df5315549a493a2f9a)
 */
#define _POSIX_C_SOURCE 200112L
#define _GNU_SOURCE

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdint.h>
#include <assert.h>
#include <ctype.h>
#include <time.h>
#include <sched.h>
#include <arpa/inet.h>
#include <sys/time.h>

#include "cpu.h"
#include "pci.h"
#include "tools.h"
#include "error.h"


/*
void *memcpy128(void * dst, void const * src, size_t len) {

    long pos = - (len>>2);
    char * plDst = (char *) dst - 4 * pos;
    char const * plSrc = (char const *) src - 4 * pos;

    if (pos) {
        __asm__ __volatile__ (
            "1:						\n\t"
            "mov	(%0,%2,4), %%edi		\n\t"
            "mov	%%edi, (%1,%2,4)		\n\t"
            "inc	%2				\n\t"
            "jnz 	1b				\n\t"
	: 
	: "r" (plSrc), "r" (plDst), "r" (pos)
	: "%edi"
        );
    }



    long pos = - ((len>>4)<<4);
    char * plDst = (char *) dst - pos;
    char const * plSrc = (char const *) src - pos;

    if (pos) {
        __asm__ __volatile__ (
            "1:						\n\t"
//            "movdqa	(%0,%2), %%xmm0			\n\t"
            "mov	(%0,%2), %%esi			\n\t"
            "movd	%%esi, %%xmm0			\n\t"
            "mov	4(%0,%2), %%esi			\n\t"
            "movd	%%esi, %%xmm1			\n\t"
            "mov	8(%0,%2), %%esi			\n\t"
            "movd	%%esi, %%xmm2			\n\t"
            "mov	12(%0,%2), %%esi		\n\t"
            "movd	%%esi, %%xmm3			\n\t"
	    "pslldq	$4, %%xmm1			\n\t"
	    "por	%%xmm1, %%xmm0			\n\t"
	    "pslldq	$8, %%xmm2			\n\t"
	    "por	%%xmm2, %%xmm0			\n\t"
	    "pslldq	$12, %%xmm3			\n\t"
	    "por	%%xmm3, %%xmm0			\n\t"
	    
            "movntdq	%%xmm0, (%1,%2)			\n\t"
            "add	$16, %2				\n\t"
            "jnz 	1b				\n\t"
	: 
	: "r" (plSrc), "r" (plDst), "r" (pos)
	: "%rsi"
        );
    }



    len &= 0x3;

    char * pcDst = (char *) plDst;
    char const * pcSrc = (char const *) plSrc;

    while (len--) {
        *pcDst++ = *pcSrc++;
    }

    return (dst);
} 
*/

/**
 * Copies a buffer from `src` to `dst` using 256-bit AVX loads and
 * non-temporal (streaming) stores.
 *
 * The data is moved in 512-byte blocks: sixteen aligned `vmovdqa` loads fill
 * ymm0..ymm15, then sixteen `vmovntps` stores write them out. Any remainder
 * smaller than one block is copied with plain memcpy(), so no bytes are
 * silently dropped and a request shorter than 512 bytes never enters the
 * assembly loop.
 *
 * Requirements:
 *  - x86-64 CPU with AVX (the caller is responsible for checking this);
 *  - `dst` and `src` must both be 32-byte aligned, otherwise the aligned
 *    load/store instructions raise a fault;
 *  - the buffers must not overlap.
 *
 * @param dst   destination buffer (32-byte aligned)
 * @param src   source buffer (32-byte aligned)
 * @param size  number of bytes to copy
 */
void pcilib_memcpy4k_avx(void *dst, void *src, size_t size) {
    size_t blocks = size / 512;		/* number of full 512-byte blocks */
    size_t tail = size % 512;		/* bytes left after the last full block */
    size_t offset = 0;			/* running byte offset, advanced by the asm loop */

    /* The loop below is do-while shaped: entering it with a zero counter would
     * wrap the counter around and run far past the buffers. */
    if (blocks) {
        __asm__ __volatile__ (
            "1:						\n\t"

            /* load 512 bytes from the source */
            "vmovdqa	   (%[src],%[off]), %%ymm0	\n\t"
            "vmovdqa	 32(%[src],%[off]), %%ymm1	\n\t"
            "vmovdqa	 64(%[src],%[off]), %%ymm2	\n\t"
            "vmovdqa	 96(%[src],%[off]), %%ymm3	\n\t"
            "vmovdqa	128(%[src],%[off]), %%ymm4	\n\t"
            "vmovdqa	160(%[src],%[off]), %%ymm5	\n\t"
            "vmovdqa	192(%[src],%[off]), %%ymm6	\n\t"
            "vmovdqa	224(%[src],%[off]), %%ymm7	\n\t"

            "vmovdqa	256(%[src],%[off]), %%ymm8	\n\t"
            "vmovdqa	288(%[src],%[off]), %%ymm9	\n\t"
            "vmovdqa	320(%[src],%[off]), %%ymm10	\n\t"
            "vmovdqa	352(%[src],%[off]), %%ymm11	\n\t"
            "vmovdqa	384(%[src],%[off]), %%ymm12	\n\t"
            "vmovdqa	416(%[src],%[off]), %%ymm13	\n\t"
            "vmovdqa	448(%[src],%[off]), %%ymm14	\n\t"
            "vmovdqa	480(%[src],%[off]), %%ymm15	\n\t"

            /* stream them to the destination, bypassing the cache */
            "vmovntps	%%ymm0,     (%[dst],%[off])	\n\t"
            "vmovntps	%%ymm1,   32(%[dst],%[off])	\n\t"
            "vmovntps	%%ymm2,   64(%[dst],%[off])	\n\t"
            "vmovntps	%%ymm3,   96(%[dst],%[off])	\n\t"
            "vmovntps	%%ymm4,  128(%[dst],%[off])	\n\t"
            "vmovntps	%%ymm5,  160(%[dst],%[off])	\n\t"
            "vmovntps	%%ymm6,  192(%[dst],%[off])	\n\t"
            "vmovntps	%%ymm7,  224(%[dst],%[off])	\n\t"

            "vmovntps	%%ymm8,  256(%[dst],%[off])	\n\t"
            "vmovntps	%%ymm9,  288(%[dst],%[off])	\n\t"
            "vmovntps	%%ymm10, 320(%[dst],%[off])	\n\t"
            "vmovntps	%%ymm11, 352(%[dst],%[off])	\n\t"
            "vmovntps	%%ymm12, 384(%[dst],%[off])	\n\t"
            "vmovntps	%%ymm13, 416(%[dst],%[off])	\n\t"
            "vmovntps	%%ymm14, 448(%[dst],%[off])	\n\t"
            "vmovntps	%%ymm15, 480(%[dst],%[off])	\n\t"

            "add	$512, %[off]			\n\t"
            "dec	%[cnt]				\n\t"
            "jnz 	1b				\n\t"

            /* make the non-temporal stores globally visible before returning */
            "sfence					\n\t"
            /* clear the upper ymm halves so later SSE code pays no transition penalty */
            "vzeroupper"
        /* Both counters are modified by the loop, so they are read-write operands;
         * early-clobber keeps them out of the registers holding src/dst. */
        : [off] "+&r" (offset), [cnt] "+&r" (blocks)
        : [src] "r" (src), [dst] "r" (dst)
        /* xmmN names the same hardware register as ymmN */
        : "memory", "cc",
          "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
          "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
        );
    }

    /* After the loop `offset` equals the number of bytes already copied. */
    if (tail) {
        memcpy((char *)dst + offset, (char *)src + offset, tail);
    }
}

/**
 * Copies `size` bytes from `src` to `dst`, dispatching to
 * pcilib_memcpy4k_avx() when the request qualifies and to memcpy() otherwise.
 *
 * The AVX routine is used only when all of the following hold:
 *  - pcilib_get_cpu_gen() reports a value above 3
 *    (NOTE(review): presumably the first AVX-capable generation - confirm in cpu.h);
 *  - `size` is a non-zero multiple of 4096;
 *  - both `dst` and `src` are 32-byte aligned.
 *
 * A zero-length request is deliberately kept away from the AVX routine:
 * 0 is a multiple of 4096, but the block-copy loop there executes its body
 * before testing the block counter.
 *
 * @param dst   destination buffer
 * @param src   source buffer
 * @param size  number of bytes to copy
 */
void pcilib_pagecpy(void *dst, void *src, size_t size) {
    int gen = pcilib_get_cpu_gen();
    /* 32 is a power of two, so OR-ing the addresses tests both alignments at once */
    int aligned = ((((uintptr_t)dst) | ((uintptr_t)src)) % 32) == 0;

    if ((gen > 3) && (size > 0) && (size % 4096 == 0) && aligned) {
	pcilib_memcpy4k_avx(dst, src, size);
    } else {
	memcpy(dst, src, size);
    }
}