// SPDX-License-Identifier: GPL-2.0-only
/*
 * Just-In-Time compiler for eBPF filters on 32bit ARM
 *
 * Copyright (c) 2023 Puranjay Mohan <puranjay12@gmail.com>
 * Copyright (c) 2017 Shubham Bansal <illusionist.neo@gmail.com>
 * Copyright (c) 2011 Mircea Gherzan <mgherzan@gmail.com>
 */

#include <linux/bpf.h>
#include <linux/bitops.h>
#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/filter.h>
#include <linux/netdevice.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/if_vlan.h>
#include <linux/math64.h>

#include <asm/cacheflush.h>
#include <asm/hwcap.h>
#include <asm/opcodes.h>
#include <asm/system_info.h>

#include "bpf_jit_32.h"

/*
 * eBPF prog stack layout:
 *
 *                          high
 * original ARM_SP =>     +-----+
 *                        |     | callee saved registers
 *                        +-----+ <= (BPF_FP + SCRATCH_SIZE)
 *                        | ... | eBPF JIT scratch space
 * eBPF fp register =>    +-----+
 *   (BPF_FP)             | ... | eBPF prog stack
 *                        +-----+
 *                        |RSVD | JIT scratchpad
 * current ARM_SP =>      +-----+ <= (BPF_FP - STACK_SIZE + SCRATCH_SIZE)
 *                        | ... | caller-saved registers
 *                        +-----+
 *                        | ... | arguments passed on stack
 * ARM_SP during call =>  +-----+
 *                        |     |
 *                        | ... | Function call stack
 *                        |     |
 *                        +-----+
 *                          low
 *
 * The callee saved registers depend on whether frame pointers are enabled.
 * With frame pointers (to be compliant with the ABI):
 *
 *                              high
 * original ARM_SP =>     +--------------+ \
 *                        |      pc      | |
 * current ARM_FP =>      +--------------+ } callee saved registers
 *                        |r4-r9,fp,ip,lr| |
 *                        +--------------+ /
 *                              low
 *
 * Without frame pointers:
 *
 *                              high
 * original ARM_SP =>     +--------------+
 *                        |  r4-r9,fp,lr | callee saved registers
 * current ARM_FP =>      +--------------+
 *                              low
 *
 * When popping registers off the stack at the end of a BPF function, we
 * reference them via the current ARM_FP register.
 *
 * Some eBPF operations are implemented via a call to a helper function.
 * Such calls are "invisible" in the eBPF code, so it is up to the calling
 * program to preserve any caller-saved ARM registers during the call. The
 * JIT emits code to push and pop those registers onto the stack, immediately
 * above the callee stack frame.
 */
#define CALLEE_MASK	(1 << ARM_R4 | 1 << ARM_R5 | 1 << ARM_R6 | \
			 1 << ARM_R7 | 1 << ARM_R8 | 1 << ARM_R9 | \
			 1 << ARM_FP)
#define CALLEE_PUSH_MASK (CALLEE_MASK | 1 << ARM_LR)
#define CALLEE_POP_MASK  (CALLEE_MASK | 1 << ARM_PC)

#define CALLER_MASK	(1 << ARM_R0 | 1 << ARM_R1 | 1 << ARM_R2 | 1 << ARM_R3)

enum {
	/* Stack layout - these are offsets from (top of stack - 4) */
	BPF_R2_HI,
	BPF_R2_LO,
	BPF_R3_HI,
	BPF_R3_LO,
	BPF_R4_HI,
	BPF_R4_LO,
	BPF_R5_HI,
	BPF_R5_LO,
	BPF_R7_HI,
	BPF_R7_LO,
	BPF_R8_HI,
	BPF_R8_LO,
	BPF_R9_HI,
	BPF_R9_LO,
	BPF_FP_HI,
	BPF_FP_LO,
	BPF_TC_HI,
	BPF_TC_LO,
	BPF_AX_HI,
	BPF_AX_LO,
	/* Stack space for BPF_REG_2, BPF_REG_3, BPF_REG_4,
	 * BPF_REG_5, BPF_REG_7, BPF_REG_8, BPF_REG_9,
	 * BPF_REG_FP and Tail call counts.
	 */
	BPF_JIT_SCRATCH_REGS,
};

/*
 * Negative "register" values indicate the register is stored on the stack
 * and are the offset from the top of the eBPF JIT scratch space.
 */
#define STACK_OFFSET(k)	(-4 - (k) * 4)
#define SCRATCH_SIZE	(BPF_JIT_SCRATCH_REGS * 4)

#ifdef CONFIG_FRAME_POINTER
#define EBPF_SCRATCH_TO_ARM_FP(x) ((x) - 4 * hweight16(CALLEE_PUSH_MASK) - 4)
#else
#define EBPF_SCRATCH_TO_ARM_FP(x) (x)
#endif
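
/*
 * Worked example (not in the original source): with CONFIG_FRAME_POINTER,
 * CALLEE_PUSH_MASK names 8 registers (r4-r9, fp, lr), so a scratch slot at
 * STACK_OFFSET(BPF_R2_HI) == -4 is addressed as [fp, #-4 - 32 - 4], i.e.
 * [fp, #-40]. Without frame pointers, ARM_FP is set to the stack top right
 * after the push, so the same slot is simply [fp, #-4].
 */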

#define TMP_REG_1	(MAX_BPF_JIT_REG + 0)	/* TEMP Register 1 */
#define TMP_REG_2	(MAX_BPF_JIT_REG + 1)	/* TEMP Register 2 */
#define TCALL_CNT	(MAX_BPF_JIT_REG + 2)	/* Tail Call Count */

#define FLAG_IMM_OVERFLOW	(1 << 0)

/*
 * Map eBPF registers to ARM 32-bit registers or stack scratch space.
 *
 * 1. The first argument is passed in ARM 32-bit registers; the remaining
 *    arguments are passed via the stack scratch space.
 * 2. The first callee-saved register pair is mapped to ARM 32-bit
 *    registers; the remaining callee-saved registers are mapped to
 *    scratch space on the stack.
 * 3. We need two 64-bit temp registers to do complex operations on eBPF
 *    registers.
 *
 * As the eBPF registers are all 64 bits wide and ARM has only 32-bit
 * registers, we map each eBPF register to a pair of ARM 32-bit registers
 * (or to stack scratch space) and build the 64-bit value from the pair.
 */
static const s8 bpf2a32[][2] = {
	/* return value from in-kernel function, and exit value from eBPF */
	[BPF_REG_0] = {ARM_R1, ARM_R0},
	/* arguments from eBPF program to in-kernel function */
	[BPF_REG_1] = {ARM_R3, ARM_R2},
	/* Stored on stack scratch space */
	[BPF_REG_2] = {STACK_OFFSET(BPF_R2_HI), STACK_OFFSET(BPF_R2_LO)},
	[BPF_REG_3] = {STACK_OFFSET(BPF_R3_HI), STACK_OFFSET(BPF_R3_LO)},
	[BPF_REG_4] = {STACK_OFFSET(BPF_R4_HI), STACK_OFFSET(BPF_R4_LO)},
	[BPF_REG_5] = {STACK_OFFSET(BPF_R5_HI), STACK_OFFSET(BPF_R5_LO)},
	/* callee saved registers that in-kernel function will preserve */
	[BPF_REG_6] = {ARM_R5, ARM_R4},
	/* Stored on stack scratch space */
	[BPF_REG_7] = {STACK_OFFSET(BPF_R7_HI), STACK_OFFSET(BPF_R7_LO)},
	[BPF_REG_8] = {STACK_OFFSET(BPF_R8_HI), STACK_OFFSET(BPF_R8_LO)},
	[BPF_REG_9] = {STACK_OFFSET(BPF_R9_HI), STACK_OFFSET(BPF_R9_LO)},
	/* Read only Frame Pointer to access Stack */
	[BPF_REG_FP] = {STACK_OFFSET(BPF_FP_HI), STACK_OFFSET(BPF_FP_LO)},
	/* Temporary Register for BPF JIT, can be used
	 * for constant blindings and others.
	 */
	[TMP_REG_1] = {ARM_R7, ARM_R6},
	[TMP_REG_2] = {ARM_R9, ARM_R8},
	/* Tail call count. Stored on stack scratch space. */
	[TCALL_CNT] = {STACK_OFFSET(BPF_TC_HI), STACK_OFFSET(BPF_TC_LO)},
	/* temporary register for blinding constants.
	 * Stored on stack scratch space.
	 */
	[BPF_REG_AX] = {STACK_OFFSET(BPF_AX_HI), STACK_OFFSET(BPF_AX_LO)},
};

#define dst_lo	dst[1]
#define dst_hi	dst[0]
#define src_lo	src[1]
#define src_hi	src[0]

/*
 * JIT Context:
 *
 * prog			:	bpf_prog
 * idx			:	index of the current last JITed instruction.
 * prologue_bytes	:	bytes used in prologue.
 * epilogue_offset	:	offset where the epilogue starts.
 * offsets		:	array of eBPF instruction offsets in
 *				JITed code.
 * target		:	final JITed code.
 * epilogue_bytes	:	number of bytes used in epilogue.
 * imm_count		:	number of immediates used for global
 *				variables.
 * imms			:	array of global variable addresses.
 */

struct jit_ctx {
	const struct bpf_prog *prog;
	unsigned int idx;
	unsigned int prologue_bytes;
	unsigned int epilogue_offset;
	unsigned int cpu_architecture;
	u32 flags;
	u32 *offsets;
	u32 *target;
	u32 stack_size;
#if __LINUX_ARM_ARCH__ < 7
	u16 epilogue_bytes;
	u16 imm_count;
	u32 *imms;
#endif
};

/*
 * Wrappers which handle both OABI and EABI and assure Thumb2 interworking
 * (where the assembly routines like __aeabi_uidiv could cause problems).
 */
static u32 jit_udiv32(u32 dividend, u32 divisor)
{
	return dividend / divisor;
}

static u32 jit_mod32(u32 dividend, u32 divisor)
{
	return dividend % divisor;
}

static s32 jit_sdiv32(s32 dividend, s32 divisor)
{
	return dividend / divisor;
}

static s32 jit_smod32(s32 dividend, s32 divisor)
{
	return dividend % divisor;
}

/* Wrappers for 64-bit div/mod */
static u64 jit_udiv64(u64 dividend, u64 divisor)
{
	return div64_u64(dividend, divisor);
}

static u64 jit_mod64(u64 dividend, u64 divisor)
{
	u64 rem;

	div64_u64_rem(dividend, divisor, &rem);
	return rem;
}

static s64 jit_sdiv64(s64 dividend, s64 divisor)
{
	return div64_s64(dividend, divisor);
}

static s64 jit_smod64(s64 dividend, s64 divisor)
{
	u64 q;

	q = div64_s64(dividend, divisor);

	return dividend - q * divisor;
}
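
/*
 * Illustration (not in the original source): div64_s64() truncates toward
 * zero, so jit_smod64() computes the C99-style signed remainder, e.g.
 * jit_smod64(-7, 2): q = -3, result = -7 - (-3 * 2) = -1.
 */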

static inline void _emit(int cond, u32 inst, struct jit_ctx *ctx)
{
	inst |= (cond << 28);
	inst = __opcode_to_mem_arm(inst);

	if (ctx->target != NULL)
		ctx->target[ctx->idx] = inst;

	ctx->idx++;
}

/*
 * Emit an instruction that will be executed unconditionally.
 */
static inline void emit(u32 inst, struct jit_ctx *ctx)
{
	_emit(ARM_COND_AL, inst, ctx);
}

/*
 * This is rather horrid, but necessary to convert an integer constant
 * to an immediate operand for the opcodes, and be able to detect at
 * build time whether the constant can't be converted (iow, usable in
 * BUILD_BUG_ON()).
 */
#define imm12val(v, s) (rol32(v, (s)) | (s) << 7)
#define const_imm8m(x)				\
	({ int r;				\
	   u32 v = (x);				\
	   if (!(v & ~0x000000ff))		\
		r = imm12val(v, 0);		\
	   else if (!(v & ~0xc000003f))		\
		r = imm12val(v, 2);		\
	   else if (!(v & ~0xf000000f))		\
		r = imm12val(v, 4);		\
	   else if (!(v & ~0xfc000003))		\
		r = imm12val(v, 6);		\
	   else if (!(v & ~0xff000000))		\
		r = imm12val(v, 8);		\
	   else if (!(v & ~0x3fc00000))		\
		r = imm12val(v, 10);		\
	   else if (!(v & ~0x0ff00000))		\
		r = imm12val(v, 12);		\
	   else if (!(v & ~0x03fc0000))		\
		r = imm12val(v, 14);		\
	   else if (!(v & ~0x00ff0000))		\
		r = imm12val(v, 16);		\
	   else if (!(v & ~0x003fc000))		\
		r = imm12val(v, 18);		\
	   else if (!(v & ~0x000ff000))		\
		r = imm12val(v, 20);		\
	   else if (!(v & ~0x0003fc00))		\
		r = imm12val(v, 22);		\
	   else if (!(v & ~0x0000ff00))		\
		r = imm12val(v, 24);		\
	   else if (!(v & ~0x00003fc0))		\
		r = imm12val(v, 26);		\
	   else if (!(v & ~0x00000ff0))		\
		r = imm12val(v, 28);		\
	   else if (!(v & ~0x000003fc))		\
		r = imm12val(v, 30);		\
	   else					\
		r = -1;				\
	   r; })

/*
 * Checks whether an immediate value can be encoded as an ARM rotated
 * immediate (imm12) operand.
 */
static int imm8m(u32 x)
{
	u32 rot;

	for (rot = 0; rot < 16; rot++)
		if ((x & ~ror32(0xff, 2 * rot)) == 0)
			return rol32(x, 2 * rot) | (rot << 8);
	return -1;
}

#define imm8m(x) (__builtin_constant_p(x) ? const_imm8m(x) : imm8m(x))
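
/*
 * Worked example (not in the original source): an ARM data-processing
 * immediate is an 8-bit value rotated right by twice the 4-bit rotate
 * field. 0x3fc00000 is 0xff rotated right by 10 bits, so imm8m(0x3fc00000)
 * returns 0x5ff (rotate field 5, value 0xff), while imm8m(0x101) returns
 * -1 because the set bits span more than 8 contiguous (rotated) bit
 * positions.
 */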

static u32 arm_bpf_ldst_imm12(u32 op, u8 rt, u8 rn, s16 imm12)
{
	op |= rt << 12 | rn << 16;
	if (imm12 >= 0)
		op |= ARM_INST_LDST__U;
	else
		imm12 = -imm12;
	return op | (imm12 & ARM_INST_LDST__IMM12);
}

static u32 arm_bpf_ldst_imm8(u32 op, u8 rt, u8 rn, s16 imm8)
{
	op |= rt << 12 | rn << 16;
	if (imm8 >= 0)
		op |= ARM_INST_LDST__U;
	else
		imm8 = -imm8;
	return op | (imm8 & 0xf0) << 4 | (imm8 & 0x0f);
}
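
/*
 * Encoding note (not in the original source): the imm8 form used by
 * LDRD/STRD/LDRH/STRH splits the 8-bit offset into two nibbles; e.g. an
 * offset of -12 (0x0c) clears the U (add) bit and places 0x0 in bits 8-11
 * and 0xc in bits 0-3 of the instruction.
 */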

#define ARM_LDR_I(rt, rn, off)	arm_bpf_ldst_imm12(ARM_INST_LDR_I, rt, rn, off)
#define ARM_LDRB_I(rt, rn, off)	arm_bpf_ldst_imm12(ARM_INST_LDRB_I, rt, rn, off)
#define ARM_LDRD_I(rt, rn, off)	arm_bpf_ldst_imm8(ARM_INST_LDRD_I, rt, rn, off)
#define ARM_LDRH_I(rt, rn, off)	arm_bpf_ldst_imm8(ARM_INST_LDRH_I, rt, rn, off)

#define ARM_LDRSH_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRSH_I, rt, rn, off)
#define ARM_LDRSB_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRSB_I, rt, rn, off)

#define ARM_STR_I(rt, rn, off)	arm_bpf_ldst_imm12(ARM_INST_STR_I, rt, rn, off)
#define ARM_STRB_I(rt, rn, off)	arm_bpf_ldst_imm12(ARM_INST_STRB_I, rt, rn, off)
#define ARM_STRD_I(rt, rn, off)	arm_bpf_ldst_imm8(ARM_INST_STRD_I, rt, rn, off)
#define ARM_STRH_I(rt, rn, off)	arm_bpf_ldst_imm8(ARM_INST_STRH_I, rt, rn, off)

/*
 * Initializes the JIT space with undefined instructions.
 */
static void jit_fill_hole(void *area, unsigned int size)
{
	u32 *ptr;
	/* We are guaranteed to have aligned memory. */
	for (ptr = area; size >= sizeof(u32); size -= sizeof(u32))
		*ptr++ = __opcode_to_mem_arm(ARM_INST_UDF);
}

#if defined(CONFIG_AEABI) && (__LINUX_ARM_ARCH__ >= 5)
/* EABI requires the stack to be aligned to 64-bit boundaries */
#define STACK_ALIGNMENT	8
#else
/* Stack must be aligned to 32-bit boundaries */
#define STACK_ALIGNMENT	4
#endif

/* total stack size used in JITed code */
#define _STACK_SIZE	(ctx->prog->aux->stack_depth + SCRATCH_SIZE)
#define STACK_SIZE	ALIGN(_STACK_SIZE, STACK_ALIGNMENT)
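
/*
 * Worked example (not in the original source): BPF_JIT_SCRATCH_REGS is 20,
 * so SCRATCH_SIZE is 80 bytes. A program with stack_depth == 12 therefore
 * uses _STACK_SIZE == 92, rounded up to STACK_SIZE == 96 on EABI
 * (8-byte alignment).
 */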

#if __LINUX_ARM_ARCH__ < 7

static u16 imm_offset(u32 k, struct jit_ctx *ctx)
{
	unsigned int i = 0, offset;
	u16 imm;

	/* on the "fake" run we just count them (duplicates included) */
	if (ctx->target == NULL) {
		ctx->imm_count++;
		return 0;
	}

	while ((i < ctx->imm_count) && ctx->imms[i]) {
		if (ctx->imms[i] == k)
			break;
		i++;
	}

	if (ctx->imms[i] == 0)
		ctx->imms[i] = k;

	/* constants go just after the epilogue */
	offset = ctx->offsets[ctx->prog->len - 1] * 4;
	offset += ctx->prologue_bytes;
	offset += ctx->epilogue_bytes;
	offset += i * 4;

	ctx->target[offset / 4] = k;

	/* PC in ARM mode == address of the instruction + 8 */
	imm = offset - (8 + ctx->idx * 4);

	if (imm & ~0xfff) {
		/*
		 * literal pool is too far, signal it into flags. we
		 * can only detect it on the second pass unfortunately.
		 */
		ctx->flags |= FLAG_IMM_OVERFLOW;
		return 0;
	}

	return imm;
}

#endif /* __LINUX_ARM_ARCH__ */

static inline int bpf2a32_offset(int bpf_to, int bpf_from,
				 const struct jit_ctx *ctx) {
	int to, from;

	if (ctx->target == NULL)
		return 0;
	to = ctx->offsets[bpf_to];
	from = ctx->offsets[bpf_from];

	return to - from - 1;
}
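
/*
 * Note (not in the original source): ctx->offsets[i] records the ARM
 * instruction index just past the code for eBPF instruction i. The branch
 * is the last word emitted for 'bpf_from' (index from - 1) and the ARM PC
 * reads two words ahead, so the branch offset works out to
 * to - ((from - 1) + 2) = to - from - 1.
 */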

/*
 * Move an immediate that's not an imm8m to a core register.
 */
static inline void emit_mov_i_no8m(const u8 rd, u32 val, struct jit_ctx *ctx)
{
#if __LINUX_ARM_ARCH__ < 7
	emit(ARM_LDR_I(rd, ARM_PC, imm_offset(val, ctx)), ctx);
#else
	emit(ARM_MOVW(rd, val & 0xffff), ctx);
	if (val > 0xffff)
		emit(ARM_MOVT(rd, val >> 16), ctx);
#endif
}

static inline void emit_mov_i(const u8 rd, u32 val, struct jit_ctx *ctx)
{
	int imm12 = imm8m(val);

	if (imm12 >= 0)
		emit(ARM_MOV_I(rd, imm12), ctx);
	else
		emit_mov_i_no8m(rd, val, ctx);
}

static void emit_bx_r(u8 tgt_reg, struct jit_ctx *ctx)
{
	if (elf_hwcap & HWCAP_THUMB)
		emit(ARM_BX(tgt_reg), ctx);
	else
		emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx);
}

static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx)
{
#if __LINUX_ARM_ARCH__ < 5
	emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx);
	emit_bx_r(tgt_reg, ctx);
#else
	emit(ARM_BLX_R(tgt_reg), ctx);
#endif
}

static inline int epilogue_offset(const struct jit_ctx *ctx)
{
	int to, from;
	/* No need for 1st dummy run */
	if (ctx->target == NULL)
		return 0;
	to = ctx->epilogue_offset;
	from = ctx->idx;

	return to - from - 2;
}
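
/*
 * Note (not in the original source): here 'from' is the index of the
 * branch itself, and the ARM PC is two instructions ahead of the one
 * executing, hence the -2 (versus -1 above, where 'from' already points
 * one past the branch).
 */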

static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op, u8 sign)
{
	const int exclude_mask = BIT(ARM_R0) | BIT(ARM_R1);
	const s8 *tmp = bpf2a32[TMP_REG_1];
	u32 dst;

#if __LINUX_ARM_ARCH__ == 7
	if (elf_hwcap & HWCAP_IDIVA) {
		if (op == BPF_DIV) {
			emit(sign ? ARM_SDIV(rd, rm, rn) : ARM_UDIV(rd, rm, rn), ctx);
		} else {
			emit(sign ? ARM_SDIV(ARM_IP, rm, rn) : ARM_UDIV(ARM_IP, rm, rn), ctx);
			emit(ARM_MLS(rd, rn, ARM_IP, rm), ctx);
		}
		return;
	}
#endif

	/*
	 * For BPF_ALU | BPF_DIV | BPF_K instructions:
	 * ARM_R0 and ARM_R1 hold the first argument of the BPF
	 * function, so we save them on the caller side to keep
	 * them from being clobbered within the callee.
	 * After the return from the callee, we restore ARM_R0
	 * and ARM_R1.
	 */
	if (rn != ARM_R1) {
		emit(ARM_MOV_R(tmp[0], ARM_R1), ctx);
		emit(ARM_MOV_R(ARM_R1, rn), ctx);
	}
	if (rm != ARM_R0) {
		emit(ARM_MOV_R(tmp[1], ARM_R0), ctx);
		emit(ARM_MOV_R(ARM_R0, rm), ctx);
	}

	/* Push caller-saved registers on stack */
	emit(ARM_PUSH(CALLER_MASK & ~exclude_mask), ctx);

	/* Call appropriate function */
	if (sign) {
		if (op == BPF_DIV)
			dst = (u32)jit_sdiv32;
		else
			dst = (u32)jit_smod32;
	} else {
		if (op == BPF_DIV)
			dst = (u32)jit_udiv32;
		else
			dst = (u32)jit_mod32;
	}

	emit_mov_i(ARM_IP, dst, ctx);
	emit_blx_r(ARM_IP, ctx);

	/* Restore caller-saved registers from stack */
	emit(ARM_POP(CALLER_MASK & ~exclude_mask), ctx);

	/* Save return value */
	if (rd != ARM_R0)
		emit(ARM_MOV_R(rd, ARM_R0), ctx);

	/* Restore ARM_R0 and ARM_R1 */
	if (rn != ARM_R1)
		emit(ARM_MOV_R(ARM_R1, tmp[0]), ctx);
	if (rm != ARM_R0)
		emit(ARM_MOV_R(ARM_R0, tmp[1]), ctx);
}
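
/*
 * Illustrative emitted sequence (not in the original source): for an
 * unsigned 32-bit BPF_DIV with rd == rm == r4 and rn == r5 on a core
 * without IDIVA, the helper path is roughly:
 *
 *	mov	r7, r1		@ save r1 in tmp[0]
 *	mov	r1, r5		@ divisor
 *	mov	r6, r0		@ save r0 in tmp[1]
 *	mov	r0, r4		@ dividend
 *	push	{r2, r3}
 *	mov	ip, #<jit_udiv32>	@ via emit_mov_i (movw/movt or pool)
 *	blx	ip
 *	pop	{r2, r3}
 *	mov	r4, r0		@ result
 *	mov	r1, r7		@ restore
 *	mov	r0, r6		@ restore
 */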

static inline void emit_udivmod64(const s8 *rd, const s8 *rm, const s8 *rn, struct jit_ctx *ctx,
				  u8 op, u8 sign)
{
	u32 dst;

	/* Push caller-saved registers on stack */
	emit(ARM_PUSH(CALLER_MASK), ctx);

	/*
	 * As we are implementing 64-bit div/mod as function calls, we need to put the dividend in
	 * R0-R1 and the divisor in R2-R3. As we have already pushed these registers on the stack,
	 * we can recover them later after returning from the function call.
	 */
	if (rm[1] != ARM_R0 || rn[1] != ARM_R2) {
		/*
		 * Move Rm to {R1, R0} if it is not already there.
		 */
		if (rm[1] != ARM_R0) {
			if (rn[1] == ARM_R0)
				emit(ARM_PUSH(BIT(ARM_R0) | BIT(ARM_R1)), ctx);
			emit(ARM_MOV_R(ARM_R1, rm[0]), ctx);
			emit(ARM_MOV_R(ARM_R0, rm[1]), ctx);
			if (rn[1] == ARM_R0) {
				emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx);
				goto cont;
			}
		}
		/*
		 * Move Rn to {R3, R2} if it is not already there.
		 */
		if (rn[1] != ARM_R2) {
			emit(ARM_MOV_R(ARM_R3, rn[0]), ctx);
			emit(ARM_MOV_R(ARM_R2, rn[1]), ctx);
		}
	}

cont:

	/* Call appropriate function */
	if (sign) {
		if (op == BPF_DIV)
			dst = (u32)jit_sdiv64;
		else
			dst = (u32)jit_smod64;
	} else {
		if (op == BPF_DIV)
			dst = (u32)jit_udiv64;
		else
			dst = (u32)jit_mod64;
	}

	emit_mov_i(ARM_IP, dst, ctx);
	emit_blx_r(ARM_IP, ctx);

	/* Save return value */
	if (rd[1] != ARM_R0) {
		emit(ARM_MOV_R(rd[0], ARM_R1), ctx);
		emit(ARM_MOV_R(rd[1], ARM_R0), ctx);
	}

	/* Recover {R3, R2} and {R1, R0} from stack if they are not Rd */
	if (rd[1] != ARM_R0 && rd[1] != ARM_R2) {
		emit(ARM_POP(CALLER_MASK), ctx);
	} else if (rd[1] != ARM_R0) {
		emit(ARM_POP(BIT(ARM_R0) | BIT(ARM_R1)), ctx);
		emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx);
	} else {
		emit(ARM_ADD_I(ARM_SP, ARM_SP, 8), ctx);
		emit(ARM_POP(BIT(ARM_R2) | BIT(ARM_R3)), ctx);
	}
}

/* Is the translated BPF register on stack? */
static bool is_stacked(s8 reg)
{
	return reg < 0;
}

/* If a BPF register is on the stack (its mapping is negative), load it
 * into the supplied temporary register and return the temporary register
 * for subsequent operations, otherwise just use the CPU register.
 */
static s8 arm_bpf_get_reg32(s8 reg, s8 tmp, struct jit_ctx *ctx)
{
	if (is_stacked(reg)) {
		emit(ARM_LDR_I(tmp, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(reg)), ctx);
		reg = tmp;
	}
	return reg;
}

static const s8 *arm_bpf_get_reg64(const s8 *reg, const s8 *tmp,
				   struct jit_ctx *ctx)
{
	if (is_stacked(reg[1])) {
		if (__LINUX_ARM_ARCH__ >= 6 ||
		    ctx->cpu_architecture >= CPU_ARCH_ARMv5TE) {
			emit(ARM_LDRD_I(tmp[1], ARM_FP,
					EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx);
		} else {
			emit(ARM_LDR_I(tmp[1], ARM_FP,
				       EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx);
			emit(ARM_LDR_I(tmp[0], ARM_FP,
				       EBPF_SCRATCH_TO_ARM_FP(reg[0])), ctx);
		}
		reg = tmp;
	}
	return reg;
}

/* If a BPF register is on the stack (its mapping is negative), save the
 * register back to the stack. If the source register is not the same,
 * then move it into the correct register.
 */
static void arm_bpf_put_reg32(s8 reg, s8 src, struct jit_ctx *ctx)
{
	if (is_stacked(reg))
		emit(ARM_STR_I(src, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(reg)), ctx);
	else if (reg != src)
		emit(ARM_MOV_R(reg, src), ctx);
}

static void arm_bpf_put_reg64(const s8 *reg, const s8 *src,
			      struct jit_ctx *ctx)
{
	if (is_stacked(reg[1])) {
		if (__LINUX_ARM_ARCH__ >= 6 ||
		    ctx->cpu_architecture >= CPU_ARCH_ARMv5TE) {
			emit(ARM_STRD_I(src[1], ARM_FP,
					EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx);
		} else {
			emit(ARM_STR_I(src[1], ARM_FP,
				       EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx);
			emit(ARM_STR_I(src[0], ARM_FP,
				       EBPF_SCRATCH_TO_ARM_FP(reg[0])), ctx);
		}
	} else {
		if (reg[1] != src[1])
			emit(ARM_MOV_R(reg[1], src[1]), ctx);
		if (reg[0] != src[0])
			emit(ARM_MOV_R(reg[0], src[0]), ctx);
	}
}

static inline void emit_a32_mov_i(const s8 dst, const u32 val,
				  struct jit_ctx *ctx)
{
	const s8 *tmp = bpf2a32[TMP_REG_1];

	if (is_stacked(dst)) {
		emit_mov_i(tmp[1], val, ctx);
		arm_bpf_put_reg32(dst, tmp[1], ctx);
	} else {
		emit_mov_i(dst, val, ctx);
	}
}

static void emit_a32_mov_i64(const s8 dst[], u64 val, struct jit_ctx *ctx)
{
	const s8 *tmp = bpf2a32[TMP_REG_1];
	const s8 *rd = is_stacked(dst_lo) ? tmp : dst;

	emit_mov_i(rd[1], (u32)val, ctx);
	emit_mov_i(rd[0], val >> 32, ctx);

	arm_bpf_put_reg64(dst, rd, ctx);
}
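
/*
 * Illustration (not in the original source): on ARMv7, a 64-bit constant
 * such as 0x123456789abcdef0 is built as two 32-bit halves, e.g.
 *
 *	movw	rd_lo, #0xdef0
 *	movt	rd_lo, #0x9abc
 *	movw	rd_hi, #0x5678
 *	movt	rd_hi, #0x1234
 *
 * (or a single mov when a half fits an imm8m encoding).
 */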

/* Sign extended move */
static inline void emit_a32_mov_se_i64(const bool is64, const s8 dst[],
				       const u32 val, struct jit_ctx *ctx) {
	u64 val64 = val;

	if (is64 && (val & (1<<31)))
		val64 |= 0xffffffff00000000ULL;
	emit_a32_mov_i64(dst, val64, ctx);
}

static inline void emit_a32_add_r(const u8 dst, const u8 src,
				  const bool is64, const bool hi,
				  struct jit_ctx *ctx) {
	/* 64 bit :
	 *	adds dst_lo, dst_lo, src_lo
	 *	adc dst_hi, dst_hi, src_hi
	 * 32 bit :
	 *	add dst_lo, dst_lo, src_lo
	 */
	if (!hi && is64)
		emit(ARM_ADDS_R(dst, dst, src), ctx);
	else if (hi && is64)
		emit(ARM_ADC_R(dst, dst, src), ctx);
	else
		emit(ARM_ADD_R(dst, dst, src), ctx);
}

static inline void emit_a32_sub_r(const u8 dst, const u8 src,
				  const bool is64, const bool hi,
				  struct jit_ctx *ctx) {
	/* 64 bit :
	 *	subs dst_lo, dst_lo, src_lo
	 *	sbc dst_hi, dst_hi, src_hi
	 * 32 bit :
	 *	sub dst_lo, dst_lo, src_lo
	 */
	if (!hi && is64)
		emit(ARM_SUBS_R(dst, dst, src), ctx);
	else if (hi && is64)
		emit(ARM_SBC_R(dst, dst, src), ctx);
	else
		emit(ARM_SUB_R(dst, dst, src), ctx);
}

static inline void emit_alu_r(const u8 dst, const u8 src, const bool is64,
			      const bool hi, const u8 op, struct jit_ctx *ctx){
	switch (BPF_OP(op)) {
	/* dst = dst + src */
	case BPF_ADD:
		emit_a32_add_r(dst, src, is64, hi, ctx);
		break;
	/* dst = dst - src */
	case BPF_SUB:
		emit_a32_sub_r(dst, src, is64, hi, ctx);
		break;
	/* dst = dst | src */
	case BPF_OR:
		emit(ARM_ORR_R(dst, dst, src), ctx);
		break;
	/* dst = dst & src */
	case BPF_AND:
		emit(ARM_AND_R(dst, dst, src), ctx);
		break;
	/* dst = dst ^ src */
	case BPF_XOR:
		emit(ARM_EOR_R(dst, dst, src), ctx);
		break;
	/* dst = dst * src */
	case BPF_MUL:
		emit(ARM_MUL(dst, dst, src), ctx);
		break;
	/* dst = dst << src */
	case BPF_LSH:
		emit(ARM_LSL_R(dst, dst, src), ctx);
		break;
	/* dst = dst >> src */
	case BPF_RSH:
		emit(ARM_LSR_R(dst, dst, src), ctx);
		break;
	/* dst = dst >> src (signed) */
	case BPF_ARSH:
		emit(ARM_MOV_SR(dst, dst, SRTYPE_ASR, src), ctx);
		break;
	}
}

/* ALU operation (64 bit) */
static inline void emit_a32_alu_r64(const bool is64, const s8 dst[],
				    const s8 src[], struct jit_ctx *ctx,
				    const u8 op) {
	const s8 *tmp = bpf2a32[TMP_REG_1];
	const s8 *tmp2 = bpf2a32[TMP_REG_2];
	const s8 *rd;

	rd = arm_bpf_get_reg64(dst, tmp, ctx);
	if (is64) {
		const s8 *rs;

		rs = arm_bpf_get_reg64(src, tmp2, ctx);

		/* ALU operation */
		emit_alu_r(rd[1], rs[1], true, false, op, ctx);
		emit_alu_r(rd[0], rs[0], true, true, op, ctx);
	} else {
		s8 rs;

		rs = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);

		/* ALU operation */
		emit_alu_r(rd[1], rs, true, false, op, ctx);
		if (!ctx->prog->aux->verifier_zext)
			emit_a32_mov_i(rd[0], 0, ctx);
	}

	arm_bpf_put_reg64(dst, rd, ctx);
}

/* dst = src (4 bytes) */
static inline void emit_a32_mov_r(const s8 dst, const s8 src, const u8 off,
				  struct jit_ctx *ctx) {
	const s8 *tmp = bpf2a32[TMP_REG_1];
	s8 rt;

	rt = arm_bpf_get_reg32(src, tmp[0], ctx);
	if (off && off != 32) {
		emit(ARM_LSL_I(rt, rt, 32 - off), ctx);
		emit(ARM_ASR_I(rt, rt, 32 - off), ctx);
	}
	arm_bpf_put_reg32(dst, rt, ctx);
}

/* dst = src */
static inline void emit_a32_mov_r64(const bool is64, const s8 dst[],
				    const s8 src[],
				    struct jit_ctx *ctx) {
	if (!is64) {
		emit_a32_mov_r(dst_lo, src_lo, 0, ctx);
		if (!ctx->prog->aux->verifier_zext)
			/* Zero out high 4 bytes */
			emit_a32_mov_i(dst_hi, 0, ctx);
	} else if (__LINUX_ARM_ARCH__ < 6 &&
		   ctx->cpu_architecture < CPU_ARCH_ARMv5TE) {
		/* complete 8 byte move */
		emit_a32_mov_r(dst_lo, src_lo, 0, ctx);
		emit_a32_mov_r(dst_hi, src_hi, 0, ctx);
	} else if (is_stacked(src_lo) && is_stacked(dst_lo)) {
		const s8 *tmp = bpf2a32[TMP_REG_1];

		emit(ARM_LDRD_I(tmp[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(src_lo)), ctx);
		emit(ARM_STRD_I(tmp[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(dst_lo)), ctx);
	} else if (is_stacked(src_lo)) {
		emit(ARM_LDRD_I(dst[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(src_lo)), ctx);
	} else if (is_stacked(dst_lo)) {
		emit(ARM_STRD_I(src[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(dst_lo)), ctx);
	} else {
		emit(ARM_MOV_R(dst[0], src[0]), ctx);
		emit(ARM_MOV_R(dst[1], src[1]), ctx);
	}
}

/* dst = (signed)src */
static inline void emit_a32_movsx_r64(const bool is64, const u8 off, const s8 dst[], const s8 src[],
				      struct jit_ctx *ctx) {
	const s8 *tmp = bpf2a32[TMP_REG_1];
	const s8 *rt;

	rt = arm_bpf_get_reg64(dst, tmp, ctx);

	emit_a32_mov_r(dst_lo, src_lo, off, ctx);
	if (!is64) {
		if (!ctx->prog->aux->verifier_zext)
			/* Zero out high 4 bytes */
			emit_a32_mov_i(dst_hi, 0, ctx);
	} else {
		emit(ARM_ASR_I(rt[0], rt[1], 31), ctx);
	}
}

/* Shift and negate operations */
static inline void emit_a32_alu_i(const s8 dst, const u32 val,
				  struct jit_ctx *ctx, const u8 op) {
	const s8 *tmp = bpf2a32[TMP_REG_1];
	s8 rd;

	rd = arm_bpf_get_reg32(dst, tmp[0], ctx);

	/* Do shift operation */
	switch (op) {
	case BPF_LSH:
		emit(ARM_LSL_I(rd, rd, val), ctx);
		break;
	case BPF_RSH:
		emit(ARM_LSR_I(rd, rd, val), ctx);
		break;
	case BPF_ARSH:
		emit(ARM_ASR_I(rd, rd, val), ctx);
		break;
	case BPF_NEG:
		emit(ARM_RSB_I(rd, rd, val), ctx);
		break;
	}

	arm_bpf_put_reg32(dst, rd, ctx);
}

/* dst = -dst (64 bit) */
static inline void emit_a32_neg64(const s8 dst[],
				  struct jit_ctx *ctx){
	const s8 *tmp = bpf2a32[TMP_REG_1];
	const s8 *rd;

	/* Setup Operand */
	rd = arm_bpf_get_reg64(dst, tmp, ctx);

	/* Do Negate Operation */
	emit(ARM_RSBS_I(rd[1], rd[1], 0), ctx);
	emit(ARM_RSC_I(rd[0], rd[0], 0), ctx);

	arm_bpf_put_reg64(dst, rd, ctx);
}

/* dst = dst << src */
static inline void emit_a32_lsh_r64(const s8 dst[], const s8 src[],
				    struct jit_ctx *ctx) {
	const s8 *tmp = bpf2a32[TMP_REG_1];
	const s8 *tmp2 = bpf2a32[TMP_REG_2];
	const s8 *rd;
	s8 rt;

	/* Setup Operands */
	rt = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
	rd = arm_bpf_get_reg64(dst, tmp, ctx);

	/* Do LSH operation */
	emit(ARM_SUB_I(ARM_IP, rt, 32), ctx);
	emit(ARM_RSB_I(tmp2[0], rt, 32), ctx);
	emit(ARM_MOV_SR(ARM_LR, rd[0], SRTYPE_ASL, rt), ctx);
	emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[1], SRTYPE_ASL, ARM_IP), ctx);
	emit(ARM_ORR_SR(ARM_IP, ARM_LR, rd[1], SRTYPE_LSR, tmp2[0]), ctx);
	emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_ASL, rt), ctx);

	arm_bpf_put_reg32(dst_lo, ARM_LR, ctx);
	arm_bpf_put_reg32(dst_hi, ARM_IP, ctx);
}

/* dst = dst >> src (signed) */
static inline void emit_a32_arsh_r64(const s8 dst[], const s8 src[],
				     struct jit_ctx *ctx) {
	const s8 *tmp = bpf2a32[TMP_REG_1];
	const s8 *tmp2 = bpf2a32[TMP_REG_2];
	const s8 *rd;
	s8 rt;

	/* Setup Operands */
	rt = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
	rd = arm_bpf_get_reg64(dst, tmp, ctx);

	/* Do the ARSH operation */
	emit(ARM_RSB_I(ARM_IP, rt, 32), ctx);
	emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx);
	emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_LSR, rt), ctx);
	emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASL, ARM_IP), ctx);
	_emit(ARM_COND_PL,
	      ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASR, tmp2[0]), ctx);
	emit(ARM_MOV_SR(ARM_IP, rd[0], SRTYPE_ASR, rt), ctx);

	arm_bpf_put_reg32(dst_lo, ARM_LR, ctx);
	arm_bpf_put_reg32(dst_hi, ARM_IP, ctx);
}

/* dst = dst >> src */
static inline void emit_a32_rsh_r64(const s8 dst[], const s8 src[],
				    struct jit_ctx *ctx) {
	const s8 *tmp = bpf2a32[TMP_REG_1];
	const s8 *tmp2 = bpf2a32[TMP_REG_2];
	const s8 *rd;
	s8 rt;

	/* Setup Operands */
	rt = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
	rd = arm_bpf_get_reg64(dst, tmp, ctx);

	/* Do RSH operation */
	emit(ARM_RSB_I(ARM_IP, rt, 32), ctx);
	emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx);
	emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_LSR, rt), ctx);
	emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASL, ARM_IP), ctx);
	emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_LSR, tmp2[0]), ctx);
	emit(ARM_MOV_SR(ARM_IP, rd[0], SRTYPE_LSR, rt), ctx);

	arm_bpf_put_reg32(dst_lo, ARM_LR, ctx);
	arm_bpf_put_reg32(dst_hi, ARM_IP, ctx);
}

/* dst = dst << val */
static inline void emit_a32_lsh_i64(const s8 dst[],
				    const u32 val, struct jit_ctx *ctx){
	const s8 *tmp = bpf2a32[TMP_REG_1];
	const s8 *tmp2 = bpf2a32[TMP_REG_2];
	const s8 *rd;

	/* Setup operands */
	rd = arm_bpf_get_reg64(dst, tmp, ctx);

	/* Do LSH operation */
	if (val < 32) {
		emit(ARM_MOV_SI(tmp2[0], rd[0], SRTYPE_ASL, val), ctx);
		emit(ARM_ORR_SI(rd[0], tmp2[0], rd[1], SRTYPE_LSR, 32 - val), ctx);
		emit(ARM_MOV_SI(rd[1], rd[1], SRTYPE_ASL, val), ctx);
	} else {
		if (val == 32)
			emit(ARM_MOV_R(rd[0], rd[1]), ctx);
		else
			emit(ARM_MOV_SI(rd[0], rd[1], SRTYPE_ASL, val - 32), ctx);
		emit(ARM_EOR_R(rd[1], rd[1], rd[1]), ctx);
	}

	arm_bpf_put_reg64(dst, rd, ctx);
}
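
/*
 * Worked example (not in the original source): for val == 8,
 * 0x1122334455667788 << 8 becomes hi = (hi << 8) | (lo >> 24) = 0x22334455
 * and lo = lo << 8 = 0x66778800, matching the three-instruction sequence
 * in the val < 32 branch above.
 */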

/* dst = dst >> val */
static inline void emit_a32_rsh_i64(const s8 dst[],
				    const u32 val, struct jit_ctx *ctx) {
	const s8 *tmp = bpf2a32[TMP_REG_1];
	const s8 *tmp2 = bpf2a32[TMP_REG_2];
	const s8 *rd;

	/* Setup operands */
	rd = arm_bpf_get_reg64(dst, tmp, ctx);

	/* Do LSR operation */
	if (val == 0) {
		/* An immediate value of 0 encodes a shift amount of 32
		 * for LSR. To shift by 0, don't do anything.
		 */
	} else if (val < 32) {
		emit(ARM_MOV_SI(tmp2[1], rd[1], SRTYPE_LSR, val), ctx);
		emit(ARM_ORR_SI(rd[1], tmp2[1], rd[0], SRTYPE_ASL, 32 - val), ctx);
		emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_LSR, val), ctx);
	} else if (val == 32) {
		emit(ARM_MOV_R(rd[1], rd[0]), ctx);
		emit(ARM_MOV_I(rd[0], 0), ctx);
	} else {
		emit(ARM_MOV_SI(rd[1], rd[0], SRTYPE_LSR, val - 32), ctx);
		emit(ARM_MOV_I(rd[0], 0), ctx);
	}

	arm_bpf_put_reg64(dst, rd, ctx);
}

/* dst = dst >> val (signed) */
static inline void emit_a32_arsh_i64(const s8 dst[],
				     const u32 val, struct jit_ctx *ctx){
	const s8 *tmp = bpf2a32[TMP_REG_1];
	const s8 *tmp2 = bpf2a32[TMP_REG_2];
	const s8 *rd;

	/* Setup operands */
	rd = arm_bpf_get_reg64(dst, tmp, ctx);

	/* Do ARSH operation */
	if (val == 0) {
		/* An immediate value of 0 encodes a shift amount of 32
		 * for ASR. To shift by 0, don't do anything.
		 */
	} else if (val < 32) {
		emit(ARM_MOV_SI(tmp2[1], rd[1], SRTYPE_LSR, val), ctx);
		emit(ARM_ORR_SI(rd[1], tmp2[1], rd[0], SRTYPE_ASL, 32 - val), ctx);
		emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_ASR, val), ctx);
	} else if (val == 32) {
		emit(ARM_MOV_R(rd[1], rd[0]), ctx);
		emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_ASR, 31), ctx);
	} else {
		emit(ARM_MOV_SI(rd[1], rd[0], SRTYPE_ASR, val - 32), ctx);
		emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_ASR, 31), ctx);
	}

	arm_bpf_put_reg64(dst, rd, ctx);
}

static inline void emit_a32_mul_r64(const s8 dst[], const s8 src[],
				    struct jit_ctx *ctx) {
	const s8 *tmp = bpf2a32[TMP_REG_1];
	const s8 *tmp2 = bpf2a32[TMP_REG_2];
	const s8 *rd, *rt;

	/* Setup operands for multiplication */
	rd = arm_bpf_get_reg64(dst, tmp, ctx);
	rt = arm_bpf_get_reg64(src, tmp2, ctx);

	/* Do Multiplication */
	emit(ARM_MUL(ARM_IP, rd[1], rt[0]), ctx);
	emit(ARM_MUL(ARM_LR, rd[0], rt[1]), ctx);
	emit(ARM_ADD_R(ARM_LR, ARM_IP, ARM_LR), ctx);

	emit(ARM_UMULL(ARM_IP, rd[0], rd[1], rt[1]), ctx);
	emit(ARM_ADD_R(rd[0], ARM_LR, rd[0]), ctx);

	arm_bpf_put_reg32(dst_lo, ARM_IP, ctx);
	arm_bpf_put_reg32(dst_hi, rd[0], ctx);
}
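
/*
 * Note (not in the original source): this is the schoolbook decomposition
 * (a_hi*2^32 + a_lo) * (b_hi*2^32 + b_lo) mod 2^64: UMULL produces the
 * full 64-bit a_lo*b_lo, and the low 32 bits of a_lo*b_hi + a_hi*b_lo are
 * added into its high word; the a_hi*b_hi term only affects bits >= 2^64
 * and is dropped.
 */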

static bool is_ldst_imm(s16 off, const u8 size)
{
	s16 off_max = 0;

	switch (size) {
	case BPF_B:
	case BPF_W:
		off_max = 0xfff;
		break;
	case BPF_H:
		off_max = 0xff;
		break;
	case BPF_DW:
		/* Need to make sure off+4 does not overflow. */
		off_max = 0xfff - 4;
		break;
	}
	return -off_max <= off && off <= off_max;
}

static bool is_ldst_imm8(s16 off, const u8 size)
{
	s16 off_max = 0;

	switch (size) {
	case BPF_B:
		off_max = 0xff;
		break;
	case BPF_W:
		off_max = 0xfff;
		break;
	case BPF_H:
		off_max = 0xff;
		break;
	}
	return -off_max <= off && off <= off_max;
}

/* *(size *)(dst + off) = src */
static inline void emit_str_r(const s8 dst, const s8 src[],
			      s16 off, struct jit_ctx *ctx, const u8 sz){
	const s8 *tmp = bpf2a32[TMP_REG_1];
	s8 rd;

	rd = arm_bpf_get_reg32(dst, tmp[1], ctx);

	if (!is_ldst_imm(off, sz)) {
		emit_a32_mov_i(tmp[0], off, ctx);
		emit(ARM_ADD_R(tmp[0], tmp[0], rd), ctx);
		rd = tmp[0];
		off = 0;
	}
	switch (sz) {
	case BPF_B:
		/* Store a Byte */
		emit(ARM_STRB_I(src_lo, rd, off), ctx);
		break;
	case BPF_H:
		/* Store a HalfWord */
		emit(ARM_STRH_I(src_lo, rd, off), ctx);
		break;
	case BPF_W:
		/* Store a Word */
		emit(ARM_STR_I(src_lo, rd, off), ctx);
		break;
	case BPF_DW:
		/* Store a Double Word */
		emit(ARM_STR_I(src_lo, rd, off), ctx);
		emit(ARM_STR_I(src_hi, rd, off + 4), ctx);
		break;
	}
}

/* dst = *(size*)(src + off) */
static inline void emit_ldx_r(const s8 dst[], const s8 src,
			      s16 off, struct jit_ctx *ctx, const u8 sz){
	const s8 *tmp = bpf2a32[TMP_REG_1];
	const s8 *rd = is_stacked(dst_lo) ? tmp : dst;
	s8 rm = src;

	if (!is_ldst_imm(off, sz)) {
		emit_a32_mov_i(tmp[0], off, ctx);
		emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx);
		rm = tmp[0];
		off = 0;
	} else if (rd[1] == rm) {
		emit(ARM_MOV_R(tmp[0], rm), ctx);
		rm = tmp[0];
	}
	switch (sz) {
	case BPF_B:
		/* Load a Byte */
		emit(ARM_LDRB_I(rd[1], rm, off), ctx);
		if (!ctx->prog->aux->verifier_zext)
			emit_a32_mov_i(rd[0], 0, ctx);
		break;
	case BPF_H:
		/* Load a HalfWord */
		emit(ARM_LDRH_I(rd[1], rm, off), ctx);
		if (!ctx->prog->aux->verifier_zext)
			emit_a32_mov_i(rd[0], 0, ctx);
		break;
	case BPF_W:
		/* Load a Word */
		emit(ARM_LDR_I(rd[1], rm, off), ctx);
		if (!ctx->prog->aux->verifier_zext)
			emit_a32_mov_i(rd[0], 0, ctx);
		break;
	case BPF_DW:
		/* Load a Double Word */
		emit(ARM_LDR_I(rd[1], rm, off), ctx);
		emit(ARM_LDR_I(rd[0], rm, off + 4), ctx);
		break;
	}
	arm_bpf_put_reg64(dst, rd, ctx);
}

/* dst = *(signed size*)(src + off) */
static inline void emit_ldsx_r(const s8 dst[], const s8 src,
			       s16 off, struct jit_ctx *ctx, const u8 sz){
	const s8 *tmp = bpf2a32[TMP_REG_1];
	const s8 *rd = is_stacked(dst_lo) ? tmp : dst;
	s8 rm = src;
	int add_off;

	if (!is_ldst_imm8(off, sz)) {
		/*
		 * offset does not fit in the load/store immediate,
		 * construct an ADD instruction to apply the offset.
		 */
		add_off = imm8m(off);
		if (add_off > 0) {
			emit(ARM_ADD_I(tmp[0], src, add_off), ctx);
			rm = tmp[0];
		} else {
			emit_a32_mov_i(tmp[0], off, ctx);
			emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx);
			rm = tmp[0];
		}
		off = 0;
	}

	switch (sz) {
	case BPF_B:
		/* Load a Byte with sign extension */
		emit(ARM_LDRSB_I(rd[1], rm, off), ctx);
		break;
	case BPF_H:
		/* Load a HalfWord with sign extension */
		emit(ARM_LDRSH_I(rd[1], rm, off), ctx);
		break;
	case BPF_W:
		/* Load a Word */
		emit(ARM_LDR_I(rd[1], rm, off), ctx);
		break;
	}
	/* Carry the sign extension to upper 32 bits */
	emit(ARM_ASR_I(rd[0], rd[1], 31), ctx);
	arm_bpf_put_reg64(dst, rd, ctx);
}

/* Arithmetic Operation */
static inline void emit_ar_r(const u8 rd, const u8 rt, const u8 rm,
			     const u8 rn, struct jit_ctx *ctx, u8 op,
			     bool is_jmp64) {
	switch (op) {
	case BPF_JSET:
		if (is_jmp64) {
			emit(ARM_AND_R(ARM_IP, rt, rn), ctx);
			emit(ARM_AND_R(ARM_LR, rd, rm), ctx);
			emit(ARM_ORRS_R(ARM_IP, ARM_LR, ARM_IP), ctx);
		} else {
			emit(ARM_ANDS_R(ARM_IP, rt, rn), ctx);
		}
		break;
	case BPF_JEQ:
	case BPF_JNE:
	case BPF_JGT:
	case BPF_JGE:
	case BPF_JLE:
	case BPF_JLT:
		if (is_jmp64) {
			emit(ARM_CMP_R(rd, rm), ctx);
			/* Only compare the low halves if the high halves are equal. */
			_emit(ARM_COND_EQ, ARM_CMP_R(rt, rn), ctx);
		} else {
			emit(ARM_CMP_R(rt, rn), ctx);
		}
		break;
	case BPF_JSLE:
	case BPF_JSGT:
		emit(ARM_CMP_R(rn, rt), ctx);
		if (is_jmp64)
			emit(ARM_SBCS_R(ARM_IP, rm, rd), ctx);
		break;
	case BPF_JSLT:
	case BPF_JSGE:
		emit(ARM_CMP_R(rt, rn), ctx);
		if (is_jmp64)
			emit(ARM_SBCS_R(ARM_IP, rd, rm), ctx);
		break;
	}
}

static int out_offset = -1; /* initialized on the first pass of build_body() */
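
/*
 * Note (not in the original source): the forward jumps to the "out" label
 * below are emitted while out_offset is still unknown on the first pass,
 * so the tail-call sequence must JIT to the same number of instructions
 * on every pass; the cur_offset/out_offset check at the end enforces this.
 */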
static int emit_bpf_tail_call(struct jit_ctx *ctx)
{

	/* bpf_tail_call(void *prog_ctx, struct bpf_array *array, u64 index) */
	const s8 *r2 = bpf2a32[BPF_REG_2];
	const s8 *r3 = bpf2a32[BPF_REG_3];
	const s8 *tmp = bpf2a32[TMP_REG_1];
	const s8 *tmp2 = bpf2a32[TMP_REG_2];
	const s8 *tcc = bpf2a32[TCALL_CNT];
	const s8 *tc;
	const int idx0 = ctx->idx;
#define cur_offset (ctx->idx - idx0)
#define jmp_offset (out_offset - (cur_offset) - 2)
	u32 lo, hi;
	s8 r_array, r_index;
	int off;

	/* if (index >= array->map.max_entries)
	 *	goto out;
	 */
	BUILD_BUG_ON(offsetof(struct bpf_array, map.max_entries) >
		     ARM_INST_LDST__IMM12);
	off = offsetof(struct bpf_array, map.max_entries);
	r_array = arm_bpf_get_reg32(r2[1], tmp2[0], ctx);
	/* index is 32-bit for arrays */
	r_index = arm_bpf_get_reg32(r3[1], tmp2[1], ctx);
	/* array->map.max_entries */
	emit(ARM_LDR_I(tmp[1], r_array, off), ctx);
	/* index >= array->map.max_entries */
	emit(ARM_CMP_R(r_index, tmp[1]), ctx);
	_emit(ARM_COND_CS, ARM_B(jmp_offset), ctx);

	/* tmp2[0] = array, tmp2[1] = index */

	/*
	 * if (tail_call_cnt >= MAX_TAIL_CALL_CNT)
	 *	goto out;
	 * tail_call_cnt++;
	 */
	lo = (u32)MAX_TAIL_CALL_CNT;
	hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32);
	tc = arm_bpf_get_reg64(tcc, tmp, ctx);
	emit(ARM_CMP_I(tc[0], hi), ctx);
	_emit(ARM_COND_EQ, ARM_CMP_I(tc[1], lo), ctx);
	_emit(ARM_COND_CS, ARM_B(jmp_offset), ctx);
	emit(ARM_ADDS_I(tc[1], tc[1], 1), ctx);
	emit(ARM_ADC_I(tc[0], tc[0], 0), ctx);
	arm_bpf_put_reg64(tcc, tmp, ctx);

	/* prog = array->ptrs[index]
	 * if (prog == NULL)
	 *	goto out;
	 */
	BUILD_BUG_ON(imm8m(offsetof(struct bpf_array, ptrs)) < 0);
	off = imm8m(offsetof(struct bpf_array, ptrs));
	emit(ARM_ADD_I(tmp[1], r_array, off), ctx);
	emit(ARM_LDR_R_SI(tmp[1], tmp[1], r_index, SRTYPE_ASL, 2), ctx);
	emit(ARM_CMP_I(tmp[1], 0), ctx);
	_emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx);

	/* goto *(prog->bpf_func + prologue_size); */
	BUILD_BUG_ON(offsetof(struct bpf_prog, bpf_func) >
		     ARM_INST_LDST__IMM12);
	off = offsetof(struct bpf_prog, bpf_func);
	emit(ARM_LDR_I(tmp[1], tmp[1], off), ctx);
	emit(ARM_ADD_I(tmp[1], tmp[1], ctx->prologue_bytes), ctx);
	emit_bx_r(tmp[1], ctx);

	/* out: */
	if (out_offset == -1)
		out_offset = cur_offset;
	if (cur_offset != out_offset) {
		pr_err_once("tail_call out_offset = %d, expected %d!\n",
			    cur_offset, out_offset);
		return -1;
	}
	return 0;
#undef cur_offset
#undef jmp_offset
}

/* 0xabcd => 0xcdab */
static inline void emit_rev16(const u8 rd, const u8 rn, struct jit_ctx *ctx)
{
#if __LINUX_ARM_ARCH__ < 6
	const s8 *tmp2 = bpf2a32[TMP_REG_2];

	emit(ARM_AND_I(tmp2[1], rn, 0xff), ctx);
	emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 8), ctx);
	emit(ARM_AND_I(tmp2[0], tmp2[0], 0xff), ctx);
	emit(ARM_ORR_SI(rd, tmp2[0], tmp2[1], SRTYPE_LSL, 8), ctx);
#else /* ARMv6+ */
	emit(ARM_REV16(rd, rn), ctx);
#endif
}

/* 0xabcdefgh => 0xghefcdab */
static inline void emit_rev32(const u8 rd, const u8 rn, struct jit_ctx *ctx)
{
#if __LINUX_ARM_ARCH__ < 6
	const s8 *tmp2 = bpf2a32[TMP_REG_2];

	emit(ARM_AND_I(tmp2[1], rn, 0xff), ctx);
	emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 24), ctx);
	emit(ARM_ORR_SI(ARM_IP, tmp2[0], tmp2[1], SRTYPE_LSL, 24), ctx);

	emit(ARM_MOV_SI(tmp2[1], rn, SRTYPE_LSR, 8), ctx);
	emit(ARM_AND_I(tmp2[1], tmp2[1], 0xff), ctx);
	emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 16), ctx);
	emit(ARM_AND_I(tmp2[0], tmp2[0], 0xff), ctx);
	emit(ARM_MOV_SI(tmp2[0], tmp2[0], SRTYPE_LSL, 8), ctx);
	emit(ARM_ORR_SI(tmp2[0], tmp2[0], tmp2[1], SRTYPE_LSL, 16), ctx);
	emit(ARM_ORR_R(rd, ARM_IP, tmp2[0]), ctx);

#else /* ARMv6+ */
	emit(ARM_REV(rd, rn), ctx);
#endif
}

// push the 64-bit source register pair onto the top of the stack
static inline void emit_push_r64(const s8 src[], struct jit_ctx *ctx)
{
	const s8 *tmp2 = bpf2a32[TMP_REG_2];
	const s8 *rt;
	u16 reg_set = 0;

	rt = arm_bpf_get_reg64(src, tmp2, ctx);

	reg_set = (1 << rt[1]) | (1 << rt[0]);
	emit(ARM_PUSH(reg_set), ctx);
}

static void build_prologue(struct jit_ctx *ctx)
{
	const s8 arm_r0 = bpf2a32[BPF_REG_0][1];
	const s8 *bpf_r1 = bpf2a32[BPF_REG_1];
	const s8 *bpf_fp = bpf2a32[BPF_REG_FP];
	const s8 *tcc = bpf2a32[TCALL_CNT];

	/* Save callee saved registers. */
#ifdef CONFIG_FRAME_POINTER
	u16 reg_set = CALLEE_PUSH_MASK | 1 << ARM_IP | 1 << ARM_PC;
	emit(ARM_MOV_R(ARM_IP, ARM_SP), ctx);
	emit(ARM_PUSH(reg_set), ctx);
	emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx);
#else
	emit(ARM_PUSH(CALLEE_PUSH_MASK), ctx);
	emit(ARM_MOV_R(ARM_FP, ARM_SP), ctx);
#endif
	/* mov r3, #0 */
	/* sub r2, sp, #SCRATCH_SIZE */
	emit(ARM_MOV_I(bpf_r1[0], 0), ctx);
	emit(ARM_SUB_I(bpf_r1[1], ARM_SP, SCRATCH_SIZE), ctx);

	ctx->stack_size = imm8m(STACK_SIZE);

	/* Set up function call stack */
	emit(ARM_SUB_I(ARM_SP, ARM_SP, ctx->stack_size), ctx);

	/* Set up BPF prog stack base register */
	emit_a32_mov_r64(true, bpf_fp, bpf_r1, ctx);

	/* Initialize Tail Call Count */
	emit(ARM_MOV_I(bpf_r1[1], 0), ctx);
	emit_a32_mov_r64(true, tcc, bpf_r1, ctx);

	/* Move BPF_CTX to BPF_R1 */
	emit(ARM_MOV_R(bpf_r1[1], arm_r0), ctx);

	/* end of prologue */
}

/* restore callee saved registers. */
static void build_epilogue(struct jit_ctx *ctx)
{
#ifdef CONFIG_FRAME_POINTER
	/* When using frame pointers, some additional registers need to
	 * be loaded.
	 */
	u16 reg_set = CALLEE_POP_MASK | 1 << ARM_SP;
	emit(ARM_SUB_I(ARM_SP, ARM_FP, hweight16(reg_set) * 4), ctx);
	emit(ARM_LDM(ARM_SP, reg_set), ctx);
#else
	/* Restore callee saved registers. */
	emit(ARM_MOV_R(ARM_SP, ARM_FP), ctx);
	emit(ARM_POP(CALLEE_POP_MASK), ctx);
#endif
}

/*
 * Convert an eBPF instruction to a native instruction, i.e.,
 * JITs an eBPF instruction.
 * Returns :
 *	0  - Successfully JITed an 8-byte eBPF instruction
 *	>0 - Successfully JITed a 16-byte eBPF instruction
 *	<0 - Failed to JIT.
 */
static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
{
	const u8 code = insn->code;
	const s8 *dst = bpf2a32[insn->dst_reg];
	const s8 *src = bpf2a32[insn->src_reg];
	const s8 *tmp = bpf2a32[TMP_REG_1];
	const s8 *tmp2 = bpf2a32[TMP_REG_2];
	const s16 off = insn->off;
	const s32 imm = insn->imm;
	const int i = insn - ctx->prog->insnsi;
	const bool is64 = BPF_CLASS(code) == BPF_ALU64;
	const s8 *rd, *rs;
	s8 rd_lo, rt, rm, rn;
	s32 jmp_offset;

#define check_imm(bits, imm) do {				\
	if ((imm) >= (1 << ((bits) - 1)) ||			\
	    (imm) < -(1 << ((bits) - 1))) {			\
		pr_info("[%2d] imm=%d(0x%x) out of range\n",	\
			i, imm, imm);				\
		return -EINVAL;					\
	}							\
} while (0)
#define check_imm24(imm) check_imm(24, imm)

1584 | switch (code) { |
1585 | /* ALU operations */ |
1586 | |
1587 | /* dst = src */ |
1588 | case BPF_ALU | BPF_MOV | BPF_K: |
1589 | case BPF_ALU | BPF_MOV | BPF_X: |
1590 | case BPF_ALU64 | BPF_MOV | BPF_K: |
1591 | case BPF_ALU64 | BPF_MOV | BPF_X: |
1592 | switch (BPF_SRC(code)) { |
1593 | case BPF_X: |
1594 | if (imm == 1) { |
1595 | /* Special mov32 for zext */ |
1596 | emit_a32_mov_i(dst_hi, val: 0, ctx); |
1597 | break; |
1598 | } |
1599 | if (insn->off) |
1600 | emit_a32_movsx_r64(is64, off: insn->off, dst, src, ctx); |
1601 | else |
1602 | emit_a32_mov_r64(is64, dst, src, ctx); |
1603 | break; |
1604 | case BPF_K: |
1605 | /* Sign-extend immediate value to destination reg */ |
1606 | emit_a32_mov_se_i64(is64, dst, val: imm, ctx); |
1607 | break; |
1608 | } |
1609 | break; |
1610 | /* dst = dst + src/imm */ |
1611 | /* dst = dst - src/imm */ |
1612 | /* dst = dst | src/imm */ |
1613 | /* dst = dst & src/imm */ |
1614 | /* dst = dst ^ src/imm */ |
1615 | /* dst = dst * src/imm */ |
1616 | /* dst = dst << src */ |
1617 | /* dst = dst >> src */ |
1618 | case BPF_ALU | BPF_ADD | BPF_K: |
1619 | case BPF_ALU | BPF_ADD | BPF_X: |
1620 | case BPF_ALU | BPF_SUB | BPF_K: |
1621 | case BPF_ALU | BPF_SUB | BPF_X: |
1622 | case BPF_ALU | BPF_OR | BPF_K: |
1623 | case BPF_ALU | BPF_OR | BPF_X: |
1624 | case BPF_ALU | BPF_AND | BPF_K: |
1625 | case BPF_ALU | BPF_AND | BPF_X: |
1626 | case BPF_ALU | BPF_XOR | BPF_K: |
1627 | case BPF_ALU | BPF_XOR | BPF_X: |
1628 | case BPF_ALU | BPF_MUL | BPF_K: |
1629 | case BPF_ALU | BPF_MUL | BPF_X: |
1630 | case BPF_ALU | BPF_LSH | BPF_X: |
1631 | case BPF_ALU | BPF_RSH | BPF_X: |
1632 | case BPF_ALU | BPF_ARSH | BPF_X: |
1633 | case BPF_ALU64 | BPF_ADD | BPF_K: |
1634 | case BPF_ALU64 | BPF_ADD | BPF_X: |
1635 | case BPF_ALU64 | BPF_SUB | BPF_K: |
1636 | case BPF_ALU64 | BPF_SUB | BPF_X: |
1637 | case BPF_ALU64 | BPF_OR | BPF_K: |
1638 | case BPF_ALU64 | BPF_OR | BPF_X: |
1639 | case BPF_ALU64 | BPF_AND | BPF_K: |
1640 | case BPF_ALU64 | BPF_AND | BPF_X: |
1641 | case BPF_ALU64 | BPF_XOR | BPF_K: |
1642 | case BPF_ALU64 | BPF_XOR | BPF_X: |
1643 | switch (BPF_SRC(code)) { |
1644 | case BPF_X: |
1645 | emit_a32_alu_r64(is64, dst, src, ctx, BPF_OP(code)); |
1646 | break; |
1647 | case BPF_K: |
1648 | /* Move immediate value to the temporary register |
1649 | * and then do the ALU operation on the temporary |
1650 | * register as this will sign-extend the immediate |
1651 | * value into temporary reg and then it would be |
1652 | * safe to do the operation on it. |
1653 | */ |
1654 | emit_a32_mov_se_i64(is64, dst: tmp2, val: imm, ctx); |
1655 | emit_a32_alu_r64(is64, dst, src: tmp2, ctx, BPF_OP(code)); |
1656 | break; |
1657 | } |
1658 | break; |
	/* dst = dst / src/imm */
	/* dst = dst % src/imm */
	case BPF_ALU | BPF_DIV | BPF_K:
	case BPF_ALU | BPF_DIV | BPF_X:
	case BPF_ALU | BPF_MOD | BPF_K:
	case BPF_ALU | BPF_MOD | BPF_X:
		rd_lo = arm_bpf_get_reg32(dst_lo, tmp2[1], ctx);
		switch (BPF_SRC(code)) {
		case BPF_X:
			rt = arm_bpf_get_reg32(src_lo, tmp2[0], ctx);
			break;
		case BPF_K:
			rt = tmp2[0];
			emit_a32_mov_i(rt, imm, ctx);
			break;
		default:
			rt = src_lo;
			break;
		}
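		/* insn->off selects the signed variant (BPF v4 sdiv/smod);
		 * emit_udivmod() dispatches on it.
		 */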
		emit_udivmod(rd_lo, rd_lo, rt, ctx, BPF_OP(code), off);
		arm_bpf_put_reg32(dst_lo, rd_lo, ctx);
		if (!ctx->prog->aux->verifier_zext)
			emit_a32_mov_i(dst_hi, 0, ctx);
		break;
	case BPF_ALU64 | BPF_DIV | BPF_K:
	case BPF_ALU64 | BPF_DIV | BPF_X:
	case BPF_ALU64 | BPF_MOD | BPF_K:
	case BPF_ALU64 | BPF_MOD | BPF_X:
		rd = arm_bpf_get_reg64(dst, tmp2, ctx);
		switch (BPF_SRC(code)) {
		case BPF_X:
			rs = arm_bpf_get_reg64(src, tmp, ctx);
			break;
		case BPF_K:
			rs = tmp;
			emit_a32_mov_se_i64(is64, rs, imm, ctx);
			break;
		}
		emit_udivmod64(rd, rd, rs, ctx, BPF_OP(code), off);
		arm_bpf_put_reg64(dst, rd, ctx);
		break;
	/* dst = dst << imm */
	/* dst = dst >> imm */
	/* dst = dst >> imm (signed) */
	case BPF_ALU | BPF_LSH | BPF_K:
	case BPF_ALU | BPF_RSH | BPF_K:
	case BPF_ALU | BPF_ARSH | BPF_K:
		if (unlikely(imm > 31))
			return -EINVAL;
		if (imm)
			emit_a32_alu_i(dst_lo, imm, ctx, BPF_OP(code));
		if (!ctx->prog->aux->verifier_zext)
			emit_a32_mov_i(dst_hi, 0, ctx);
		break;
	/* dst = dst << imm */
	case BPF_ALU64 | BPF_LSH | BPF_K:
		if (unlikely(imm > 63))
			return -EINVAL;
		emit_a32_lsh_i64(dst, imm, ctx);
		break;
	/* dst = dst >> imm */
	case BPF_ALU64 | BPF_RSH | BPF_K:
		if (unlikely(imm > 63))
			return -EINVAL;
		emit_a32_rsh_i64(dst, imm, ctx);
		break;
	/* dst = dst << src */
	case BPF_ALU64 | BPF_LSH | BPF_X:
		emit_a32_lsh_r64(dst, src, ctx);
		break;
	/* dst = dst >> src */
	case BPF_ALU64 | BPF_RSH | BPF_X:
		emit_a32_rsh_r64(dst, src, ctx);
		break;
	/* dst = dst >> src (signed) */
	case BPF_ALU64 | BPF_ARSH | BPF_X:
		emit_a32_arsh_r64(dst, src, ctx);
		break;
	/* dst = dst >> imm (signed) */
	case BPF_ALU64 | BPF_ARSH | BPF_K:
		if (unlikely(imm > 63))
			return -EINVAL;
		emit_a32_arsh_i64(dst, imm, ctx);
		break;
	/* dst = -dst */
	case BPF_ALU | BPF_NEG:
		emit_a32_alu_i(dst_lo, 0, ctx, BPF_OP(code));
		if (!ctx->prog->aux->verifier_zext)
			emit_a32_mov_i(dst_hi, 0, ctx);
		break;
	/* dst = -dst (64 bit) */
	case BPF_ALU64 | BPF_NEG:
		emit_a32_neg64(dst, ctx);
		break;
	/* dst = dst * src/imm */
	case BPF_ALU64 | BPF_MUL | BPF_X:
	case BPF_ALU64 | BPF_MUL | BPF_K:
		switch (BPF_SRC(code)) {
		case BPF_X:
			emit_a32_mul_r64(dst, src, ctx);
			break;
		case BPF_K:
			/* Move the immediate into a temporary register
			 * pair first: emit_a32_mov_se_i64() sign-extends
			 * it to 64 bits, after which the multiplication
			 * can safely be done register-to-register.
			 */
			emit_a32_mov_se_i64(is64, tmp2, imm, ctx);
			emit_a32_mul_r64(dst, tmp2, ctx);
			break;
		}
		break;
	/* dst = htole(dst) */
	/* dst = htobe(dst) */
	case BPF_ALU | BPF_END | BPF_FROM_LE: /* also BPF_TO_LE */
	case BPF_ALU | BPF_END | BPF_FROM_BE: /* also BPF_TO_BE */
	/* dst = bswap(dst) */
	case BPF_ALU64 | BPF_END | BPF_FROM_LE: /* also BPF_TO_LE */
		rd = arm_bpf_get_reg64(dst, tmp, ctx);
		if (BPF_SRC(code) == BPF_FROM_LE && BPF_CLASS(code) != BPF_ALU64)
			goto emit_bswap_uxt;
		switch (imm) {
		case 16:
			emit_rev16(rd[1], rd[1], ctx);
			goto emit_bswap_uxt;
		case 32:
			emit_rev32(rd[1], rd[1], ctx);
			goto emit_bswap_uxt;
		case 64:
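			/* Byte-swap each word and exchange the halves;
			 * LR is free to use as scratch at this point.
			 */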
			emit_rev32(ARM_LR, rd[1], ctx);
			emit_rev32(rd[1], rd[0], ctx);
			emit(ARM_MOV_R(rd[0], ARM_LR), ctx);
			break;
		}
		goto exit;
emit_bswap_uxt:
		switch (imm) {
		case 16:
			/* zero-extend 16 bits into 64 bits */
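			/* ARMv5 has no UXTH, so mask with 0xffff instead */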
#if __LINUX_ARM_ARCH__ < 6
			emit_a32_mov_i(tmp2[1], 0xffff, ctx);
			emit(ARM_AND_R(rd[1], rd[1], tmp2[1]), ctx);
#else /* ARMv6+ */
			emit(ARM_UXTH(rd[1], rd[1]), ctx);
#endif
			if (!ctx->prog->aux->verifier_zext)
				emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx);
			break;
		case 32:
			/* zero-extend 32 bits into 64 bits */
			if (!ctx->prog->aux->verifier_zext)
				emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx);
			break;
		case 64:
			/* nop */
			break;
		}
exit:
		arm_bpf_put_reg64(dst, rd, ctx);
		break;
	/* dst = imm64 */
	case BPF_LD | BPF_IMM | BPF_DW:
	{
		u64 val = (u32)imm | (u64)insn[1].imm << 32;

		emit_a32_mov_i64(dst, val, ctx);

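		/* A double-word immediate spans two BPF insns; returning 1
		 * tells build_body() to skip the second slot.
		 */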
		return 1;
	}
	/* LDX: dst = *(size *)(src + off) */
	case BPF_LDX | BPF_MEM | BPF_W:
	case BPF_LDX | BPF_MEM | BPF_H:
	case BPF_LDX | BPF_MEM | BPF_B:
	case BPF_LDX | BPF_MEM | BPF_DW:
	/* LDSX: dst = *(signed size *)(src + off) */
	case BPF_LDX | BPF_MEMSX | BPF_B:
	case BPF_LDX | BPF_MEMSX | BPF_H:
	case BPF_LDX | BPF_MEMSX | BPF_W:
		rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
		if (BPF_MODE(insn->code) == BPF_MEMSX)
			emit_ldsx_r(dst, rn, off, ctx, BPF_SIZE(code));
		else
			emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code));
		break;
	/* speculation barrier */
	case BPF_ST | BPF_NOSPEC:
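		/* no mitigation is emitted here; this is a nop on 32-bit ARM */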
		break;
	/* ST: *(size *)(dst + off) = imm */
	case BPF_ST | BPF_MEM | BPF_W:
	case BPF_ST | BPF_MEM | BPF_H:
	case BPF_ST | BPF_MEM | BPF_B:
	case BPF_ST | BPF_MEM | BPF_DW:
		switch (BPF_SIZE(code)) {
		case BPF_DW:
			/* Sign-extend immediate value into temp reg */
			emit_a32_mov_se_i64(true, tmp2, imm, ctx);
			break;
		case BPF_W:
		case BPF_H:
		case BPF_B:
			emit_a32_mov_i(tmp2[1], imm, ctx);
			break;
		}
		emit_str_r(dst_lo, tmp2, off, ctx, BPF_SIZE(code));
		break;
	/* Atomic ops */
	case BPF_STX | BPF_ATOMIC | BPF_W:
	case BPF_STX | BPF_ATOMIC | BPF_DW:
		goto notyet;
	/* STX: *(size *)(dst + off) = src */
	case BPF_STX | BPF_MEM | BPF_W:
	case BPF_STX | BPF_MEM | BPF_H:
	case BPF_STX | BPF_MEM | BPF_B:
	case BPF_STX | BPF_MEM | BPF_DW:
		rs = arm_bpf_get_reg64(src, tmp2, ctx);
		emit_str_r(dst_lo, rs, off, ctx, BPF_SIZE(code));
		break;
	/* PC += off if dst == src */
	/* PC += off if dst > src */
	/* PC += off if dst >= src */
	/* PC += off if dst < src */
	/* PC += off if dst <= src */
	/* PC += off if dst != src */
	/* PC += off if dst > src (signed) */
	/* PC += off if dst >= src (signed) */
	/* PC += off if dst < src (signed) */
	/* PC += off if dst <= src (signed) */
	/* PC += off if dst & src */
	case BPF_JMP | BPF_JEQ | BPF_X:
	case BPF_JMP | BPF_JGT | BPF_X:
	case BPF_JMP | BPF_JGE | BPF_X:
	case BPF_JMP | BPF_JNE | BPF_X:
	case BPF_JMP | BPF_JSGT | BPF_X:
	case BPF_JMP | BPF_JSGE | BPF_X:
	case BPF_JMP | BPF_JSET | BPF_X:
	case BPF_JMP | BPF_JLE | BPF_X:
	case BPF_JMP | BPF_JLT | BPF_X:
	case BPF_JMP | BPF_JSLT | BPF_X:
	case BPF_JMP | BPF_JSLE | BPF_X:
	case BPF_JMP32 | BPF_JEQ | BPF_X:
	case BPF_JMP32 | BPF_JGT | BPF_X:
	case BPF_JMP32 | BPF_JGE | BPF_X:
	case BPF_JMP32 | BPF_JNE | BPF_X:
	case BPF_JMP32 | BPF_JSGT | BPF_X:
	case BPF_JMP32 | BPF_JSGE | BPF_X:
	case BPF_JMP32 | BPF_JSET | BPF_X:
	case BPF_JMP32 | BPF_JLE | BPF_X:
	case BPF_JMP32 | BPF_JLT | BPF_X:
	case BPF_JMP32 | BPF_JSLT | BPF_X:
	case BPF_JMP32 | BPF_JSLE | BPF_X:
		/* Setup source registers */
		rm = arm_bpf_get_reg32(src_hi, tmp2[0], ctx);
		rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx);
		goto go_jmp;
	/* PC += off if dst == imm */
	/* PC += off if dst > imm */
	/* PC += off if dst >= imm */
	/* PC += off if dst < imm */
	/* PC += off if dst <= imm */
	/* PC += off if dst != imm */
	/* PC += off if dst > imm (signed) */
	/* PC += off if dst >= imm (signed) */
	/* PC += off if dst < imm (signed) */
	/* PC += off if dst <= imm (signed) */
	/* PC += off if dst & imm */
	case BPF_JMP | BPF_JEQ | BPF_K:
	case BPF_JMP | BPF_JGT | BPF_K:
	case BPF_JMP | BPF_JGE | BPF_K:
	case BPF_JMP | BPF_JNE | BPF_K:
	case BPF_JMP | BPF_JSGT | BPF_K:
	case BPF_JMP | BPF_JSGE | BPF_K:
	case BPF_JMP | BPF_JSET | BPF_K:
	case BPF_JMP | BPF_JLT | BPF_K:
	case BPF_JMP | BPF_JLE | BPF_K:
	case BPF_JMP | BPF_JSLT | BPF_K:
	case BPF_JMP | BPF_JSLE | BPF_K:
	case BPF_JMP32 | BPF_JEQ | BPF_K:
	case BPF_JMP32 | BPF_JGT | BPF_K:
	case BPF_JMP32 | BPF_JGE | BPF_K:
	case BPF_JMP32 | BPF_JNE | BPF_K:
	case BPF_JMP32 | BPF_JSGT | BPF_K:
	case BPF_JMP32 | BPF_JSGE | BPF_K:
	case BPF_JMP32 | BPF_JSET | BPF_K:
	case BPF_JMP32 | BPF_JLT | BPF_K:
	case BPF_JMP32 | BPF_JLE | BPF_K:
	case BPF_JMP32 | BPF_JSLT | BPF_K:
	case BPF_JMP32 | BPF_JSLE | BPF_K:
		if (off == 0)
			break;
		rm = tmp2[0];
		rn = tmp2[1];
		/* Sign-extend immediate value */
		emit_a32_mov_se_i64(true, tmp2, imm, ctx);
go_jmp:
		/* Setup destination register */
		rd = arm_bpf_get_reg64(dst, tmp, ctx);

		/* Check for the condition */
		emit_ar_r(rd[0], rd[1], rm, rn, ctx, BPF_OP(code),
			  BPF_CLASS(code) == BPF_JMP);

		/* Setup JUMP instruction */
		jmp_offset = bpf2a32_offset(i+off, i, ctx);
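		/* Unsigned tests map directly onto the C/Z flags (HI, CS,
		 * LS, CC).  The LT/GE pairs below each cover two signed
		 * conditions because emit_ar_r() reverses the operand
		 * order for JSGT/JSLE.
		 */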
		switch (BPF_OP(code)) {
		case BPF_JNE:
		case BPF_JSET:
			_emit(ARM_COND_NE, ARM_B(jmp_offset), ctx);
			break;
		case BPF_JEQ:
			_emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx);
			break;
		case BPF_JGT:
			_emit(ARM_COND_HI, ARM_B(jmp_offset), ctx);
			break;
		case BPF_JGE:
			_emit(ARM_COND_CS, ARM_B(jmp_offset), ctx);
			break;
		case BPF_JSGT:
			_emit(ARM_COND_LT, ARM_B(jmp_offset), ctx);
			break;
		case BPF_JSGE:
			_emit(ARM_COND_GE, ARM_B(jmp_offset), ctx);
			break;
		case BPF_JLE:
			_emit(ARM_COND_LS, ARM_B(jmp_offset), ctx);
			break;
		case BPF_JLT:
			_emit(ARM_COND_CC, ARM_B(jmp_offset), ctx);
			break;
		case BPF_JSLT:
			_emit(ARM_COND_LT, ARM_B(jmp_offset), ctx);
			break;
		case BPF_JSLE:
			_emit(ARM_COND_GE, ARM_B(jmp_offset), ctx);
			break;
		}
		break;
	/* JMP OFF */
	case BPF_JMP | BPF_JA:
	case BPF_JMP32 | BPF_JA:
	{
		if (BPF_CLASS(code) == BPF_JMP32 && imm != 0)
			jmp_offset = bpf2a32_offset(i + imm, i, ctx);
		else if (BPF_CLASS(code) == BPF_JMP && off != 0)
			jmp_offset = bpf2a32_offset(i + off, i, ctx);
		else
			break;

		check_imm24(jmp_offset);
		emit(ARM_B(jmp_offset), ctx);
		break;
	}
	/* tail call */
	case BPF_JMP | BPF_TAIL_CALL:
		if (emit_bpf_tail_call(ctx))
			return -EFAULT;
		break;
	/* function call */
	case BPF_JMP | BPF_CALL:
	{
		const s8 *r0 = bpf2a32[BPF_REG_0];
		const s8 *r1 = bpf2a32[BPF_REG_1];
		const s8 *r2 = bpf2a32[BPF_REG_2];
		const s8 *r3 = bpf2a32[BPF_REG_3];
		const s8 *r4 = bpf2a32[BPF_REG_4];
		const s8 *r5 = bpf2a32[BPF_REG_5];
		const u32 func = (u32)__bpf_call_base + (u32)imm;

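		/* Marshal the five 64-bit BPF arguments into the AAPCS
		 * calling convention: R1 lands in r0/r1, R2 in r2/r3, and
		 * R3-R5 are pushed onto the stack (24 bytes, popped again
		 * right after the call returns).
		 */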
		emit_a32_mov_r64(true, r0, r1, ctx);
		emit_a32_mov_r64(true, r1, r2, ctx);
		emit_push_r64(r5, ctx);
		emit_push_r64(r4, ctx);
		emit_push_r64(r3, ctx);

		emit_a32_mov_i(tmp[1], func, ctx);
		emit_blx_r(tmp[1], ctx);

		emit(ARM_ADD_I(ARM_SP, ARM_SP, imm8m(24)), ctx); /* caller pops stacked args */
		break;
	}
	/* function return */
	case BPF_JMP | BPF_EXIT:
		/* Optimization: when the last instruction is EXIT,
		 * simply fall through to the epilogue.
		 */
		if (i == ctx->prog->len - 1)
			break;
		jmp_offset = epilogue_offset(ctx);
		check_imm24(jmp_offset);
		emit(ARM_B(jmp_offset), ctx);
		break;
notyet:
		pr_info_once("*** NOT YET: opcode %02x ***\n", code);
		return -EFAULT;
	default:
		pr_err_once("unknown opcode %02x\n", code);
		return -EINVAL;
	}

	if (ctx->flags & FLAG_IMM_OVERFLOW)
		/*
		 * this instruction generated an overflow when
		 * trying to access the literal pool, so
		 * delegate this filter to the kernel interpreter.
		 */
		return -1;
	return 0;
}

static int build_body(struct jit_ctx *ctx)
{
	const struct bpf_prog *prog = ctx->prog;
	unsigned int i;

	for (i = 0; i < prog->len; i++) {
		const struct bpf_insn *insn = &(prog->insnsi[i]);
		int ret;

		ret = build_insn(insn, ctx);

		/* A positive return value means a 64-bit immediate load
		 * consumed the following insn slot as well; skip it.
		 */
		if (ret > 0) {
			i++;
			if (ctx->target == NULL)
				ctx->offsets[i] = ctx->idx;
			continue;
		}

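		/* During the first (sizing) pass, record where each BPF
		 * insn ends in the JITed image; bpf2a32_offset() uses
		 * these offsets to resolve branch targets later.
		 */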
		if (ctx->target == NULL)
			ctx->offsets[i] = ctx->idx;

		/* If unsuccessful, return with error code */
		if (ret)
			return ret;
	}
	return 0;
}

static int validate_code(struct jit_ctx *ctx)
{
	int i;

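	/* jit_fill_hole() pre-filled the image with ARM_INST_UDF, so any
	 * UDF still present means an instruction was never written or
	 * failed to encode.
	 */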
	for (i = 0; i < ctx->idx; i++) {
		if (ctx->target[i] == __opcode_to_mem_arm(ARM_INST_UDF))
			return -1;
	}

	return 0;
}

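/* Each 64-bit BPF register lives in a pair of 32-bit ARM registers (or
 * stack slots), so ask the verifier to insert explicit zero-extension
 * instructions rather than having the JIT clear the high word after
 * every 32-bit ALU operation.
 */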
bool bpf_jit_needs_zext(void)
{
	return true;
}

struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
{
	struct bpf_prog *tmp, *orig_prog = prog;
	struct bpf_binary_header *header;
	bool tmp_blinded = false;
	struct jit_ctx ctx;
	unsigned int tmp_idx;
	unsigned int image_size;
	u8 *image_ptr;

	/* If BPF JIT was not enabled then we must fall back to
	 * the interpreter.
	 */
	if (!prog->jit_requested)
		return orig_prog;

	/* If constant blinding was enabled and we failed during blinding
	 * then we must fall back to the interpreter. Otherwise, we save
	 * the new JITed code.
	 */
	tmp = bpf_jit_blind_constants(prog);

	if (IS_ERR(tmp))
		return orig_prog;
	if (tmp != prog) {
		tmp_blinded = true;
		prog = tmp;
	}

	memset(&ctx, 0, sizeof(ctx));
	ctx.prog = prog;
	ctx.cpu_architecture = cpu_architecture();

	/* If we cannot allocate memory for offsets[], we must fall
	 * back to the interpreter.
	 */
	ctx.offsets = kcalloc(prog->len, sizeof(int), GFP_KERNEL);
	if (ctx.offsets == NULL) {
		prog = orig_prog;
		goto out;
	}

	/* 1) fake pass to compute the length of the JITed code, to fill
	 * in ctx->offsets and the other context variables needed to
	 * generate the final code. The real image start is chosen later
	 * by bpf_jit_binary_alloc(), which prefixes it with a random
	 * number of fault instructions.
	 *
	 * If the first pass fails then there is no chance of it
	 * being successful in the second pass, so just fall back
	 * to the interpreter.
	 */
	if (build_body(&ctx)) {
		prog = orig_prog;
		goto out_off;
	}

	tmp_idx = ctx.idx;
	build_prologue(&ctx);
	ctx.prologue_bytes = (ctx.idx - tmp_idx) * 4;

	ctx.epilogue_offset = ctx.idx;

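	/* Pre-ARMv7 cores cannot build arbitrary 32-bit immediates with
	 * movw/movt, so immediates may be loaded from a literal pool
	 * placed after the epilogue; reserve one word per cached
	 * immediate (ctx.imm_count) here.
	 */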
#if __LINUX_ARM_ARCH__ < 7
	tmp_idx = ctx.idx;
	build_epilogue(&ctx);
	ctx.epilogue_bytes = (ctx.idx - tmp_idx) * 4;

	ctx.idx += ctx.imm_count;
	if (ctx.imm_count) {
		ctx.imms = kcalloc(ctx.imm_count, sizeof(u32), GFP_KERNEL);
		if (ctx.imms == NULL) {
			prog = orig_prog;
			goto out_off;
		}
	}
#else
	/* there's nothing special about the epilogue on ARMv7 */
	build_epilogue(&ctx);
#endif
	/* Now we can get the actual image size of the JITed ARM code.
	 * THUMB-2 instructions are currently not emitted, although they
	 * could decrease the size of the image.
	 *
	 * As each ARM instruction is 32 bits wide, the image size is
	 * simply the number of JITed instructions times four.
	 */
	image_size = sizeof(u32) * ctx.idx;

	/* Now we know the size of the image to allocate */
	header = bpf_jit_binary_alloc(image_size, &image_ptr,
				      sizeof(u32), jit_fill_hole);
	/* If we cannot allocate memory for the image, we must fall
	 * back to the interpreter.
	 */
	if (header == NULL) {
		prog = orig_prog;
		goto out_imms;
	}

	/* 2) actual pass to generate the final JIT code */
	ctx.target = (u32 *) image_ptr;
	ctx.idx = 0;

	build_prologue(&ctx);

	/* If building the body of the JITed code fails somehow,
	 * we fall back to the interpreter.
	 */
	if (build_body(&ctx) < 0) {
		image_ptr = NULL;
		bpf_jit_binary_free(header);
		prog = orig_prog;
		goto out_imms;
	}
	build_epilogue(&ctx);

	/* 3) extra pass to validate the JITed code */
	if (validate_code(&ctx)) {
		image_ptr = NULL;
		bpf_jit_binary_free(header);
		prog = orig_prog;
		goto out_imms;
	}
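	/* Make the freshly written instructions visible to instruction
	 * fetch before the program can be executed.
	 */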
	flush_icache_range((u32)header, (u32)(ctx.target + ctx.idx));

	if (bpf_jit_enable > 1)
		/* there are 2 passes here */
		bpf_jit_dump(prog->len, image_size, 2, ctx.target);

	bpf_jit_binary_lock_ro(header);
	prog->bpf_func = (void *)ctx.target;
	prog->jited = 1;
	prog->jited_len = image_size;

out_imms:
#if __LINUX_ARM_ARCH__ < 7
	if (ctx.imm_count)
		kfree(ctx.imms);
#endif
out_off:
	kfree(ctx.offsets);
out:
	if (tmp_blinded)
		bpf_jit_prog_release_other(prog, prog == orig_prog ?
					   tmp : orig_prog);
	return prog;
}