//===----------------------Hexagon builtin routine ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// An optimized version of a memcpy which is equivalent to the following loop:
//
//   volatile unsigned *dest;
//   unsigned *src;
//
//   for (i = 0; i < num_words; ++i)
//     *dest++ = *src++;
//
// The corresponding C prototype for this function would be
// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
//                                      const unsigned *src,
//                                      unsigned num_words);
//
// *** Both dest and src must be aligned to 32-bit boundaries. ***
// The code does not perform any runtime checks for this, and will fail
// in bad ways if this requirement is not met.
//
// The "forward" in the name refers to the fact that the function copies
// the words going forward in memory.  It is incorrect to use this function
// for cases where the original code copied words in any other order.
//
// *** This function is only for use by the compiler. ***
// The only intended use is for the LLVM compiler to generate calls to
// this function, when a mem-copy loop, like the one above, is detected.

//-----------------------------------------------------------------------------
// hexagon_memcpy_forward_vp4cp4n2
//
// Copies num_words 32-bit words from src to dest, lowest address first.
//
// Inputs:
//   r0: dest       (must be 32-bit aligned; not checked)
//   r1: src        (must be 32-bit aligned; not checked)
//   r2: num_words
//
// Registers written: r0-r5, p0, and the hardware-loop state set up by
// loop0/loop1.  The stack is not touched.
//
// The copy runs in three phases, split at the 4096-byte page boundaries of
// src (1 page = 1024 words):
//   1. prolog: single words up to the next src page boundary,
//   2. main:   whole pages, two words per iteration,
//   3. epilog: the remaining (< 1024) single words.
// Each phase issues an l2fetch (prefetch) on src before it starts copying.
//
// Reminder for readers of the packets below: instructions grouped in { }
// read the register values from *before* the packet, unless an operand is
// explicitly marked .new.
//-----------------------------------------------------------------------------

  .text

  .globl hexagon_memcpy_forward_vp4cp4n2
  .balign 32
  .type hexagon_memcpy_forward_vp4cp4n2,@function
hexagon_memcpy_forward_vp4cp4n2:

  // r3 = 4096 - src.  Only its low 12 bits are used: they are the byte
  // distance from src to the end of its page (0 if src is page-aligned).
  // r5 = num_words / 8 = number of whole 32-byte blocks in the entire copy.
  {
    r3 = sub(##4096, r1)
    r5 = lsr(r2, #3)
  }
  {
    // Both extracts read the r3 produced by the previous packet; the r3
    // written by the first instruction is not seen by the second.
    //   r3 = bits [11:2] -> words left before the end of the page
    //   r4 = bits [11:5] -> whole 32-byte blocks left before end of page
    // (If the address in r1 was already page-aligned, both are 0.)
    r3 = extractu(r3, #10, #2)
    r4 = extractu(r3, #7, #5)
  }
  {
    // Clamp both counts so they never exceed the size of the request.
    r3 = minu(r2, r3)           // r3 = words to copy in the prolog.
    r4 = minu(r5, r4)           // r4 = 32-byte blocks to prefetch for it.
  }
  {
    // Merge the block count into the prefetch descriptor, and skip the
    // prolog when it has nothing to copy (src page-aligned or
    // num_words == 0).
    // NOTE(review): 0x202000 presumably encodes stride = 32 and width = 32
    // bytes, with the low byte (taken from r4) as the line count -- confirm
    // against the l2fetch description in the Hexagon PRM.
    r4 = or(r4, ##2105344)      // 2105344 = 0x202000
    p0 = cmp.eq(r3, #0)
    if (p0.new) jump:nt .Lskipprolog
  }
    l2fetch(r1, r4)
  {
    loop0(.Lprolog, r3)
    r2 = sub(r2, r3)            // r2 = number of words left after the prolog.
  }
  .falign
.Lprolog:
  {
    // Copy one word; r4.new forwards the value loaded in this same packet.
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0
.Lskipprolog:
  // From here on r1 is page-aligned whenever r2 != 0: either it was aligned
  // on entry, or the prolog ran exactly up to the page boundary.  (If the
  // prolog stopped short of the boundary it consumed every word, so r2 == 0.)
  // That alignment is what permits the 64-bit loads in the main loop.
  {
    // Let r3 = number of whole pages left (page = 1024 words).
    r3 = lsr(r2, #10)
    if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain
  }
  {
    // loop1 consumes the page count in r3 in the same packet that reloads
    // r3 with the per-page prefetch descriptor.
    // NOTE(review): low byte 0x80 = 128 lines; 128 * 32 bytes = one page,
    // assuming the descriptor layout noted above.
    loop1(.Lout, r3)
    r2 = extractu(r2, #10, #0)  // r2 = r2 & 1023
    r3 = ##2105472              // r3 = 0x202080 (prefetch info)
  }
    // Iterate over pages.
  .falign
.Lout:
    // Prefetch each individual page.
    l2fetch(r1, r3)
    // 512 iterations * 8 bytes = 4096 bytes = one page.
    loop0(.Lpage, #512)
  .falign
.Lpage:
    // Read two source words with a single 64-bit load.
    r5:4 = memd(r1++#8)
  {
    // Write them back as two separate word stores (dest is only required
    // to be 32-bit aligned).  The second store addresses through the r0
    // value from before this packet's post-increment, so r4 lands at
    // dest[0] and r5 at dest[1] of the current pair.
    // This packet closes both the per-page loop and the page loop.
    memw(r0++#8) = r4
    memw(r0+#4) = r5
  } :endloop0:endloop1
.Lskipmain:
  {
    // r2 < 1024 here: it was either masked above, or r2 >> 10 was 0.
    r3 = ##2105344              // r3 = 0x202000 (prefetch info)
    r4 = lsr(r2, #3)            // r4 = number of 32-byte blocks remaining.
    p0 = cmp.eq(r2, #0)
    if (p0.new) jumpr:nt r31    // Nothing left to copy: return.
  }
  {
    r3 = or(r3, r4)
    loop0(.Lepilog, r2)
  }
    l2fetch(r1, r3)
  .falign
.Lepilog:
  {
    // Copy the remaining words one at a time.
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0

    jumpr r31

.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2


// Source: compiler-rt/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S