/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * OpenRISC memset.S
 *
 * Hand-optimized assembler version of memset for OpenRISC.
 * Algorithm inspired by several other arch-specific memset routines
 * in the kernel tree.
 *
 * Copyright (C) 2015 Olof Kindgren <olof.kindgren@gmail.com>
 */
11 | |
12 | .global memset |
13 | .type memset, @function |
14 | memset: |
15 | /* arguments: |
16 | * r3 = *s |
17 | * r4 = c |
18 | * r5 = n |
19 | * r13, r15, r17, r19 used as temp regs |
20 | */ |
21 | |
22 | /* Exit if n == 0 */ |
23 | l.sfeqi r5, 0 |
24 | l.bf 4f |
25 | |
26 | /* Truncate c to char */ |
27 | l.andi r13, r4, 0xff |
28 | |
29 | /* Skip word extension if c is 0 */ |
30 | l.sfeqi r13, 0 |
31 | l.bf 1f |
32 | /* Check for at least two whole words (8 bytes) */ |
33 | l.sfleui r5, 7 |
34 | |
35 | /* Extend char c to 32-bit word cccc in r13 */ |
36 | l.slli r15, r13, 16 // r13 = 000c, r15 = 0c00 |
37 | l.or r13, r13, r15 // r13 = 0c0c, r15 = 0c00 |
38 | l.slli r15, r13, 8 // r13 = 0c0c, r15 = c0c0 |
39 | l.or r13, r13, r15 // r13 = cccc, r15 = c0c0 |
40 | |
41 | 1: l.addi r19, r3, 0 // Set r19 = src |
42 | /* Jump to byte copy loop if less than two words */ |
43 | l.bf 3f |
44 | l.or r17, r5, r0 // Set r17 = n |
45 | |
46 | /* Mask out two LSBs to check alignment */ |
47 | l.andi r15, r3, 0x3 |
48 | |
49 | /* lsb == 00, jump to word copy loop */ |
50 | l.sfeqi r15, 0 |
51 | l.bf 2f |
52 | l.addi r19, r3, 0 // Set r19 = src |
53 | |
54 | /* lsb == 01,10 or 11 */ |
55 | l.sb 0(r3), r13 // *src = c |
56 | l.addi r17, r17, -1 // Decrease n |
57 | |
58 | l.sfeqi r15, 3 |
59 | l.bf 2f |
60 | l.addi r19, r3, 1 // src += 1 |
61 | |
62 | /* lsb == 01 or 10 */ |
63 | l.sb 1(r3), r13 // *(src+1) = c |
64 | l.addi r17, r17, -1 // Decrease n |
65 | |
66 | l.sfeqi r15, 2 |
67 | l.bf 2f |
68 | l.addi r19, r3, 2 // src += 2 |
69 | |
70 | /* lsb == 01 */ |
71 | l.sb 2(r3), r13 // *(src+2) = c |
72 | l.addi r17, r17, -1 // Decrease n |
73 | l.addi r19, r3, 3 // src += 3 |
74 | |
75 | /* Word copy loop */ |
76 | 2: l.sw 0(r19), r13 // *src = cccc |
77 | l.addi r17, r17, -4 // Decrease n |
78 | l.sfgeui r17, 4 |
79 | l.bf 2b |
80 | l.addi r19, r19, 4 // Increase src |
81 | |
82 | /* When n > 0, copy the remaining bytes, otherwise jump to exit */ |
83 | l.sfeqi r17, 0 |
84 | l.bf 4f |
85 | |
86 | /* Byte copy loop */ |
87 | 3: l.addi r17, r17, -1 // Decrease n |
88 | l.sb 0(r19), r13 // *src = cccc |
89 | l.sfnei r17, 0 |
90 | l.bf 3b |
91 | l.addi r19, r19, 1 // Increase src |
92 | |
93 | 4: l.jr r9 |
94 | l.ori r11, r3, 0 |
95 | |