| 1 | //===----------------------Hexagon builtin routine ------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // An optimized version of a memcpy which is equivalent to the following loop: |
| 10 | // |
| 11 | // volatile unsigned *dest; |
| 12 | // unsigned *src; |
| 13 | // |
| 14 | // for (i = 0; i < num_words; ++i) |
| 15 | // *dest++ = *src++; |
| 16 | // |
| 17 | // The corresponding C prototype for this function would be |
| 18 | // void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest, |
| 19 | // const unsigned *src, |
| 20 | // unsigned num_words); |
| 21 | // |
| 22 | // *** Both dest and src must be aligned to 32-bit boundaries. *** |
| 23 | // The code does not perform any runtime checks for this, and will fail |
| 24 | // in bad ways if this requirement is not met. |
| 25 | // |
| 26 | // The "forward" in the name refers to the fact that the function copies |
| 27 | // the words going forward in memory. It is incorrect to use this function |
| 28 | // for cases where the original code copied words in any other order. |
| 29 | // |
| 30 | // *** This function is only for the use by the compiler. *** |
| 31 | // The only intended use is for the LLVM compiler to generate calls to |
| 32 | // this function, when a mem-copy loop, like the one above, is detected. |
| 33 | |
| 34 | .text |
| 35 | |
| 36 | // hexagon_memcpy_forward_vp4cp4n2: copies num_words 32-bit words from src to dest in ascending address order, in three phases (prolog up to the end of src's current page, whole pages, epilog). Inputs: |
| 37 | // r0: dest (advanced by every store as the copy proceeds) |
| 38 | // r1: src (advanced by every load as the copy proceeds) |
| 39 | // r2: num_words (reduced along the way to the number of words still to copy). Overwritten as scratch: r3, r4, r5, p0 and both hardware-loop setups (loop0/loop1). |
| 40 | |
| 41 | .globl hexagon_memcpy_forward_vp4cp4n2 |
| 42 | .balign 32 |
| 43 | .type hexagon_memcpy_forward_vp4cp4n2,@function |
| 44 | hexagon_memcpy_forward_vp4cp4n2: |
| 45 | |
| 46 | // Prolog setup: compute r3 to be the number of words remaining in the current 4096-byte page of src. |
| 47 | // At the same time, compute r4 to be the number of 32-byte blocks |
| 48 | // remaining in the page (for prefetch), and r5 = num_words / 8, i.e. the number of whole 32-byte blocks in the entire request. |
| 49 | { |
| 50 | r3 = sub(##4096, r1) |
| 51 | r5 = lsr(r2, #3) |
| 52 | } |
| 53 | { |
| 54 | // r3 is 4096 - src here: its 12 lowest bits are the byte count before end-of-page, so bits [11:2] give the word count and bits [11:5] the count of whole 32-byte blocks. |
| 55 | // (If the address in r1 was already page-aligned, the bits are 0.) Both extracts rely on reading the r3 written by the packet above, not each other's result. |
| 56 | r3 = extractu(r3, #10, #2) |
| 57 | r4 = extractu(r3, #7, #5) |
| 58 | } |
| 59 | { |
| 60 | r3 = minu(r2, r3) |
| 61 | r4 = minu(r5, r4) |
| 62 | } |
| 63 | { |
| 64 | r4 = or(r4, ##2105344) // 2105344 = 0x202000 (prefetch info; r4's low bits hold the block count clamped above to num_words / 8). r3 was clamped to num_words; when it is 0 (no words requested, or src already page-aligned) the prolog is skipped. |
| 65 | p0 = cmp.eq(r3, #0) |
| 66 | if (p0.new) jump:nt .Lskipprolog |
| 67 | } |
| 68 | l2fetch(r1, r4) |
| 69 | { |
| 70 | loop0(.Lprolog, r3) |
| 71 | r2 = sub(r2, r3) // r2 = number of words left after the prolog. The prolog loop below runs r3 times, copying one word per iteration (load into r4, store of r4.new in the same packet). |
| 72 | } |
| 73 | .falign |
| 74 | .Lprolog: |
| 75 | { |
| 76 | r4 = memw(r1++#4) |
| 77 | memw(r0++#4) = r4.new |
| 78 | } :endloop0 |
| 79 | .Lskipprolog: |
| 80 | { |
| 81 | // Let r3 = number of whole pages left (page = 1024 words). If there are none, jump straight to the epilog at .Lskipmain. |
| 82 | r3 = lsr(r2, #10) |
| 83 | if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain |
| 84 | } |
| 85 | { |
| 86 | loop1(.Lout, r3) |
| 87 | r2 = extractu(r2, #10, #0) // r2 = r2 & 1023, i.e. the words left over for the epilog once the whole pages are copied. |
| 88 | r3 = ##2105472 // r3 = 0x202080 (prefetch info; presumably the same stride/width as 0x202000 with 0x80 = 128 blocks of 32 bytes, one full page -- confirm against the l2fetch encoding) |
| 89 | } |
| 90 | // Iterate over pages: loop1 runs once per whole page, and the inner loop0 runs 512 times per page, moving 8 bytes per iteration (one memd load, two memw stores). src is page-aligned whenever this loop runs (the prolog either stopped at the page boundary or consumed every word), so the 8-byte loads are naturally aligned; dest is only ever written with 4-byte stores. |
| 91 | .falign |
| 92 | .Lout: |
| 93 | // Prefetch each individual page before copying it. |
| 94 | l2fetch(r1, r3) |
| 95 | loop0(.Lpage, #512) |
| 96 | .falign |
| 97 | .Lpage: |
| 98 | r5:4 = memd(r1++#8) |
| 99 | { |
| 100 | memw(r0++#8) = r4 |
| 101 | memw(r0+#4) = r5 |
| 102 | } :endloop0:endloop1 |
| 103 | .Lskipmain: |
| 104 | { |
| 105 | r3 = ##2105344 // r3 = 0x202000 (prefetch info) |
| 106 | r4 = lsr(r2, #3) // r4 = number of 32-byte blocks remaining. r2 is below 1024 here on both paths into .Lskipmain; if it is 0 the function returns from this packet, otherwise the epilog loop copies the last r2 words one at a time. |
| 107 | p0 = cmp.eq(r2, #0) |
| 108 | if (p0.new) jumpr:nt r31 |
| 109 | } |
| 110 | { |
| 111 | r3 = or(r3, r4) |
| 112 | loop0(.Lepilog, r2) |
| 113 | } |
| 114 | l2fetch(r1, r3) |
| 115 | .falign |
| 116 | .Lepilog: |
| 117 | { |
| 118 | r4 = memw(r1++#4) |
| 119 | memw(r0++#4) = r4.new |
| 120 | } :endloop0 |
| 121 | |
| 122 | jumpr r31 |
| 123 | |
| 124 | .size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2 |
| 125 | |