/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
 */

#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/asm.h>

/*
 * __copy_user_nocache - Uncached memory copy with exception handling
 *
 * This copies from user space into kernel space, but the kernel
 * space accesses can take a machine check exception, so they too
 * need exception handling.
 *
 * Note: only 32-bit and 64-bit stores have non-temporal versions,
 * and we only use aligned versions. Any unaligned parts at the
 * start or end of the copy will be done using normal cached stores.
 *
 * Input:
 * rdi destination
 * rsi source
 * edx count
 *
 * Output:
 * rax uncopied bytes or 0 if successful.
 */
SYM_FUNC_START(__copy_user_nocache)
	/*
	 * Register roles throughout this function:
	 *   rdi = destination cursor
	 *   rsi = source cursor
	 *   edx = bytes still to be copied
	 * Scratch registers: rax, r8-r11. The result is returned in eax.
	 *
	 * Every load and store that may fault carries a numeric label
	 * (10:, 20:, ...). The _ASM_EXTABLE_UA() lines name, for each
	 * such label, the fixup code to continue at if that access faults.
	 */

	/* If destination is not 8-byte aligned, we'll have to align it */
	testb $7,%dil
	jne .Lalign

.Lis_aligned:
	/* Fewer than 64 bytes left: skip the unrolled loop entirely */
	cmp $64,%edx
	jb .Lquadwords

	.p2align 4,0x90
.Lunrolled:
	/* 64 bytes per iteration: load 32, store 32, load 32, store 32 */
10:	movq (%rsi),%r8
11:	movq 8(%rsi),%r9
12:	movq 16(%rsi),%r10
13:	movq 24(%rsi),%r11
20:	movnti %r8,(%rdi)
21:	movnti %r9,8(%rdi)
22:	movnti %r10,16(%rdi)
23:	movnti %r11,24(%rdi)
30:	movq 32(%rsi),%r8
31:	movq 40(%rsi),%r9
32:	movq 48(%rsi),%r10
33:	movq 56(%rsi),%r11
40:	movnti %r8,32(%rdi)
41:	movnti %r9,40(%rdi)
42:	movnti %r10,48(%rdi)
43:	movnti %r11,56(%rdi)

	/* Pointers and count are only advanced once all 64 bytes are done */
	addq $64,%rsi
	addq $64,%rdi
	sub $64,%edx
	cmp $64,%edx
	jae .Lunrolled

/*
 * First set of user mode loads have been done
 * without any stores, so if they fail, we can
 * just try the non-unrolled loop.
 */
_ASM_EXTABLE_UA(10b, .Lquadwords)
_ASM_EXTABLE_UA(11b, .Lquadwords)
_ASM_EXTABLE_UA(12b, .Lquadwords)
_ASM_EXTABLE_UA(13b, .Lquadwords)

/*
 * The second set of user mode loads have been
 * done with 32 bytes stored to the destination,
 * so we need to take that into account before
 * falling back to the non-unrolled loop.
 */
_ASM_EXTABLE_UA(30b, .Lfixup32)
_ASM_EXTABLE_UA(31b, .Lfixup32)
_ASM_EXTABLE_UA(32b, .Lfixup32)
_ASM_EXTABLE_UA(33b, .Lfixup32)

/*
 * An exception on a write means that we're
 * done, but we need to update the count
 * depending on where in the unrolled loop
 * we were.
 */
_ASM_EXTABLE_UA(20b, .Ldone0)
_ASM_EXTABLE_UA(21b, .Ldone8)
_ASM_EXTABLE_UA(22b, .Ldone16)
_ASM_EXTABLE_UA(23b, .Ldone24)
_ASM_EXTABLE_UA(40b, .Ldone32)
_ASM_EXTABLE_UA(41b, .Ldone40)
_ASM_EXTABLE_UA(42b, .Ldone48)
_ASM_EXTABLE_UA(43b, .Ldone56)

	/* Non-unrolled loop: one 8-byte load + non-temporal store per pass */
.Lquadwords:
	cmp $8,%edx
	jb .Llong
50:	movq (%rsi),%rax
51:	movnti %rax,(%rdi)
	addq $8,%rsi
	addq $8,%rdi
	sub $8,%edx
	jmp .Lquadwords

/*
 * If we fail on the last full quadword, we will
 * not try to do any byte-wise cached accesses.
 * We will try to do one more 4-byte uncached
 * one, though.
 */
_ASM_EXTABLE_UA(50b, .Llast4)
_ASM_EXTABLE_UA(51b, .Ldone0)

	/*
	 * Tail: fewer than 8 bytes remain here, so bits 2, 1 and 0 of
	 * the count select a 4-byte, 2-byte and 1-byte copy respectively.
	 */
.Llong:
	test $4,%dl
	je .Lword
60:	movl (%rsi),%eax
61:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
.Lword:
	/*
	 * Fence the non-temporal (movnti) stores issued so far; every
	 * store from here on is a regular movw/movb.
	 */
	sfence
	test $2,%dl
	je .Lbyte
70:	movw (%rsi),%ax
71:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lbyte:
	test $1,%dl
	je .Ldone
80:	movb (%rsi),%al
81:	movb %al,(%rdi)
	dec %edx
.Ldone:
	/* Return the number of bytes that were not copied */
	mov %edx,%eax
	RET

/*
 * If we fail on the last four bytes, we won't
 * bother with any fixups. It's dead, Jim. Note
 * that there's no need for 'sfence' for any
 * of this, since the exception will have been
 * serializing.
 */
_ASM_EXTABLE_UA(60b, .Ldone)
_ASM_EXTABLE_UA(61b, .Ldone)
_ASM_EXTABLE_UA(70b, .Ldone)
_ASM_EXTABLE_UA(71b, .Ldone)
_ASM_EXTABLE_UA(80b, .Ldone)
_ASM_EXTABLE_UA(81b, .Ldone)

/*
 * This is the "head needs aligning" case when
 * the destination isn't 8-byte aligned. The
 * 4-byte case can be done uncached, but any
 * smaller alignment is done with regular stores.
 *
 * Each step first checks that enough bytes remain;
 * if not, it branches into the matching tail step
 * (.Ldone, .Lbyte or .Lword) instead.
 */
.Lalign:
	test $1,%dil
	je .Lalign_word
	test %edx,%edx
	je .Ldone
90:	movb (%rsi),%al
91:	movb %al,(%rdi)
	inc %rsi
	inc %rdi
	dec %edx
.Lalign_word:
	test $2,%dil
	je .Lalign_long
	cmp $2,%edx
	jb .Lbyte
92:	movw (%rsi),%ax
93:	movw %ax,(%rdi)
	addq $2,%rsi
	addq $2,%rdi
	sub $2,%edx
.Lalign_long:
	test $4,%dil
	je .Lis_aligned
	cmp $4,%edx
	jb .Lword
94:	movl (%rsi),%eax
95:	movnti %eax,(%rdi)
	addq $4,%rsi
	addq $4,%rdi
	sub $4,%edx
	jmp .Lis_aligned

/*
 * If we fail on the initial alignment accesses,
 * we're all done. Again, no point in trying to
 * do byte-by-byte probing if the 4-byte load
 * fails - we're not doing any uncached accesses
 * any more.
 */
_ASM_EXTABLE_UA(90b, .Ldone)
_ASM_EXTABLE_UA(91b, .Ldone)
_ASM_EXTABLE_UA(92b, .Ldone)
_ASM_EXTABLE_UA(93b, .Ldone)
_ASM_EXTABLE_UA(94b, .Ldone)
_ASM_EXTABLE_UA(95b, .Ldone)

/*
 * Exception table fixups for faults in the middle
 *
 * The .LdoneNN labels fall through into each other:
 * entering at .LdoneNN subtracts NN from the count,
 * i.e. 8 bytes for every store of the unrolled loop
 * iteration that had already completed.
 */
.Ldone56: sub $8,%edx
.Ldone48: sub $8,%edx
.Ldone40: sub $8,%edx
.Ldone32: sub $8,%edx
.Ldone24: sub $8,%edx
.Ldone16: sub $8,%edx
.Ldone8: sub $8,%edx
.Ldone0:
	mov %edx,%eax
	RET

	/*
	 * A load in the second half of the unrolled loop faulted: the
	 * first 32 bytes of this iteration are already stored, so step
	 * past them and retry the rest one quadword at a time.
	 */
.Lfixup32:
	addq $32,%rsi
	addq $32,%rdi
	sub $32,%edx
	jmp .Lquadwords

	/*
	 * The 8-byte load at 50: faulted. Try one last 4-byte load and
	 * non-temporal store; if that works, 4 fewer bytes are left
	 * uncopied. If either access faults, return the count as it is.
	 */
.Llast4:
52:	movl (%rsi),%eax
53:	movnti %eax,(%rdi)
	sfence
	sub $4,%edx
	mov %edx,%eax
	RET
_ASM_EXTABLE_UA(52b, .Ldone0)
_ASM_EXTABLE_UA(53b, .Ldone0)

SYM_FUNC_END(__copy_user_nocache)
EXPORT_SYMBOL(__copy_user_nocache)
/* Source: linux/arch/x86/lib/copy_user_uncached_64.S */