memcpy_forward_vp4cp4n2.S source code [compiler-rt/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S]

//===----------------------Hexagon builtin routine ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// An optimized version of a memcpy which is equivalent to the following loop:
//
//   volatile unsigned *dest;
//   unsigned *src;
//
//   for (i = 0; i < num_words; ++i)
//     *dest++ = *src++;
//
// The corresponding C prototype for this function would be
// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
//                                      const unsigned *src,
//                                      unsigned num_words);
//
// *** Both dest and src must be aligned to 32-bit boundaries. ***
// The code does not perform any runtime checks for this, and will fail
// in bad ways if this requirement is not met.
//
// The "forward" in the name refers to the fact that the function copies
// the words going forward in memory.  It is incorrect to use this function
// for cases where the original code copied words in any other order.
//
// *** This function is only for the use by the compiler. ***
// The only intended use is for the LLVM compiler to generate calls to
// this function, when a mem-copy loop, like the one above, is detected.

// Code for the word-wise forward copy routine.
  .text

//-----------------------------------------------------------------------------
// void hexagon_memcpy_forward_vp4cp4n2(volatile unsigned *dest,
//                                      const unsigned *src,
//                                      unsigned num_words);
//
// Inputs:
//   r0: dest       (32-bit aligned; advanced as words are stored)
//   r1: src        (32-bit aligned; advanced as words are loaded)
//   r2: num_words  (count of 32-bit words; 0 returns without copying)
//
// Registers written: r0-r5, p0, and the hardware-loop state programmed by
// loop0/loop1.  No stack is used; the routine returns through r31.
//
// Stages (a "page" is 4096 bytes = 1024 words of the SOURCE address space):
//   prolog : single-word copies until src reaches a page boundary
//            (or until num_words is exhausted, whichever comes first);
//   main   : one whole page per outer iteration, using 64-bit loads, which
//            is safe because src is page-aligned whenever this stage runs;
//   epilog : single-word copies of the remaining (< 1024) words.
//
// Each stage first issues an l2fetch covering the source words it is about
// to read.  The l2fetch control word is built from 0x00202000, i.e. (per the
// Hexagon l2fetch operand layout) stride = 32 bytes in bits 31:16,
// width = 32 bytes in bits 15:8, and the number of 32-byte lines in bits 7:0.
//
// Reminder on packets: instructions grouped in { } execute together and read
// the register values from BEFORE the packet, unless a ".new" operand is used.
//-----------------------------------------------------------------------------

  .globl  hexagon_memcpy_forward_vp4cp4n2
  .balign 32
  .type   hexagon_memcpy_forward_vp4cp4n2,@function
hexagon_memcpy_forward_vp4cp4n2:

  // Compute r3 to be the number of words remaining in the current page.
  // At the same time, compute r4 to be the number of 32-byte blocks
  // remaining in the page (for prefetch).
  {
    r3 = sub(##4096, r1)        // low 12 bits = bytes until the page end
    r5 = lsr(r2, #3)            // r5 = num_words / 8 = 32-byte blocks requested
  }
  {
    // The low 12 bits of r3 hold the BYTE distance to the end of the page
    // (0 if r1 is already page-aligned).  Bits [11:2] are that distance in
    // words and bits [11:5] are the same distance in 32-byte blocks.
    // Both extracts read the r3 produced by the previous packet.
    r3 = extractu(r3, #10, #2)  // r3 = words until the page end
    r4 = extractu(r3, #7, #5)   // r4 = 32-byte blocks until the page end
  }
  {
    r3 = minu(r2, r3)           // never copy more than num_words in the prolog
    r4 = minu(r5, r4)           // never prefetch more blocks than requested
  }
  {
    r4 = or(r4, ##2105344)      // 2105344 = 0x202000 (line count in low byte)
    p0 = cmp.eq(r3, #0)
    if (p0.new) jump:nt .Lskipprolog    // nothing to do before the page end
  }
    l2fetch(r1, r4)
  {
    loop0(.Lprolog, r3)
    r2 = sub(r2, r3)            // r2 = number of words left after the prolog.
  }
  .falign
.Lprolog:
  {
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new       // store the word loaded in this same packet
  } :endloop0
.Lskipprolog:
  {
    // Let r3 = number of whole pages left (page = 1024 words).
    r3 = lsr(r2, #10)
    if (cmp.eq(r3.new, #0)) jump:nt .Lskipmain
  }
  {
    loop1(.Lout, r3)            // outer loop: one iteration per whole page
    r2 = extractu(r2, #10, #0)  // r2 = r2 & 1023 (words left for the epilog)
    r3 = ##2105472              // r3 = 0x202080 (prefetch info: 128 lines)
  }
    // Iterate over pages.
  .falign
.Lout:
    // Prefetch each individual page.
    l2fetch(r1, r3)
    loop0(.Lpage, #512)         // 512 double-words = 1024 words = one page
  .falign
.Lpage:
    r5:4 = memd(r1++#8)         // 64-bit load: r4 = low word, r5 = high word
  {
    // Two 32-bit stores, because dest is only guaranteed word-aligned.
    // Both address computations use r0 from before this packet.
    memw(r0++#8) = r4
    memw(r0+#4) = r5
  } :endloop0:endloop1
.Lskipmain:
  {
    r3 = ##2105344              // r3 = 0x202000 (prefetch info)
    r4 = lsr(r2, #3)            // r4 = number of 32-byte blocks remaining.
    p0 = cmp.eq(r2, #0)
    if (p0.new) jumpr:nt r31    // no tail words: return immediately
  }
  {
    r3 = or(r3, r4)             // r2 < 1024 here, so r4 fits the low byte
    loop0(.Lepilog, r2)
  }
    l2fetch(r1, r3)
  .falign
.Lepilog:
  {
    r4 = memw(r1++#4)
    memw(r0++#4) = r4.new
  } :endloop0

    jumpr   r31

.size hexagon_memcpy_forward_vp4cp4n2, . - hexagon_memcpy_forward_vp4cp4n2


source code of compiler-rt/lib/builtins/hexagon/memcpy_forward_vp4cp4n2.S