1 | //===----------------------Hexagon builtin routine ------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // An optimized version of a memcpy which is equivalent to the following loop: |
10 | // |
11 | // volatile unsigned *dest; |
12 | // unsigned *src; |
13 | // |
14 | // for (i = 0; i < num_words; ++i) |
15 | // *dest++ = *src++; |
16 | // |
17 | // The corresponding C prototype for this function would be |
18 | // void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest, |
19 | // const unsigned *src, |
20 | // unsigned num_words); |
21 | // |
22 | // *** Both dest and src must be aligned to 32-bit boundaries. *** |
23 | // The code does not perform any runtime checks for this, and will fail |
24 | // in bad ways if this requirement is not met. |
25 | // |
26 | // The "forward" in the name refers to the fact that the function copies |
27 | // the words going forward in memory. It is incorrect to use this function |
28 | // for cases where the original code copied words in any other order. |
29 | // |
30 | // *** This function is only for the use by the compiler. *** |
31 | // The only intended use is for the LLVM compiler to generate calls to |
32 | // this function, when a mem-copy loop, like the one above, is detected. |
33 | |
.text

//===----------------------------------------------------------------------===//
// hexagon_memcpy_forward_vp4cp4n2
//
// Register interface on entry:
//   r0 = dest       destination pointer
//   r1 = src        source pointer
//   r2 = num_words  number of 32-bit words to copy
//
// Registers written by this routine: r0-r5 and p0, plus the hardware loop
// state set up by loop0/loop1.  Returns through r31.
//
// The copy runs in three phases:
//   1. head  - word by word, up to the next 4096-byte boundary of src
//   2. pages - whole 4096-byte pages, 8 bytes loaded per iteration
//   3. tail  - word by word for whatever is left (fewer than 1024 words)
// Each phase issues an l2fetch for the source range it is about to read.
//===----------------------------------------------------------------------===//

.globl  hexagon_memcpy_forward_vp4cp4n2
.balign 32
.type   hexagon_memcpy_forward_vp4cp4n2,@function
hexagon_memcpy_forward_vp4cp4n2:

    // ---- Phase 1: head ----------------------------------------------------
    // r3 = 4096 - src.  The low 12 bits of that difference are the number of
    // bytes from src to the next 4096-byte boundary (all zero when src
    // already sits on one).
    // r5 = num_words / 8 = count of 32-byte blocks in the whole request.
    {
        r3 = sub(##4096, r1)
        r5 = lsr(r2, #3)
    }

    // Both extracts take bit fields of the byte distance computed above
    // (instructions grouped in one packet read the register values from
    // before the packet, so r4 is not derived from the new r3):
    //   r3 = bits [11:2] -> words left before the boundary
    //   r4 = bits [11:5] -> 32-byte blocks left before the boundary
    {
        r3 = extractu(r3, #10, #2)
        r4 = extractu(r3, #7, #5)
    }

    // Clamp both counts so the head never reaches past the request itself.
    {
        r3 = minu(r2, r3)
        r4 = minu(r5, r4)
    }

    // Merge the block count into the prefetch descriptor 0x202000, and
    // bypass the head phase entirely when it has no words to copy.
    {
        r4 = or(r4, ##0x202000)
        p0 = cmp.eq(r3, #0)
        if (p0.new) jump:nt .Lhead_done
    }
    l2fetch(r1, r4)
    {
        loop0(.Lhead_copy, r3)
        r2 = sub(r2, r3)                // words still to copy after the head
    }
    .falign
.Lhead_copy:
    {
        r4 = memw(r1++#4)
        memw(r0++#4) = r4.new
    } :endloop0

.Lhead_done:
    // ---- Phase 2: whole pages ---------------------------------------------
    // r3 = number of complete pages remaining (1024 words each); go straight
    // to the tail when there is none.
    {
        r3 = lsr(r2, #10)
        if (cmp.eq(r3.new, #0)) jump:nt .Lpages_done
    }

    // The outer loop runs once per page.  r2 keeps only the sub-page
    // remainder (r2 & 1023) and r3 becomes the per-page prefetch descriptor:
    // 0x202080 = 0x202000 | 128, and 128 blocks of 32 bytes are one
    // 4096-byte page.
    {
        loop1(.Lpage_outer, r3)
        r2 = extractu(r2, #10, #0)
        r3 = ##0x202080
    }
    .falign
.Lpage_outer:
    l2fetch(r1, r3)                     // prefetch the page about to be copied
    loop0(.Lpage_inner, #512)           // 512 iterations x 8 bytes = 4096 bytes
    .falign
.Lpage_inner:
    // src is on a 4096-byte boundary whenever this phase runs (the head
    // phase stopped there), so a doubleword load is aligned.
    r5:4 = memd(r1++#8)
    // dest is only promised to be 32-bit aligned, so the doubleword goes out
    // as two word stores: low word to the current dest, high word to dest+4.
    {
        memw(r0++#8) = r4
        memw(r0+#4) = r5
    } :endloop0:endloop1

.Lpages_done:
    // ---- Phase 3: tail ----------------------------------------------------
    // Return right away if nothing is left; otherwise r4 = r2 / 8 is the
    // number of 32-byte blocks to prefetch for the tail.
    {
        r3 = ##0x202000
        r4 = lsr(r2, #3)
        p0 = cmp.eq(r2, #0)
        if (p0.new) jumpr:nt r31
    }
    {
        r3 = or(r3, r4)
        loop0(.Ltail_copy, r2)
    }
    l2fetch(r1, r3)
    .falign
.Ltail_copy:
    {
        r4 = memw(r1++#4)
        memw(r0++#4) = r4.new
    } :endloop0

    jumpr   r31

.size   hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2
125 | |