/*
 * Copyright © 2014 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/**
 * DOC: Shader validator for VC4.
 *
 * Since the VC4 has no IOMMU between it and system memory, a user
 * with access to execute shaders could escalate privilege by
 * overwriting system memory (using the VPM write address register in
 * the general-purpose DMA mode) or by reading system memory it
 * shouldn't (reading it as a texture, as uniform data, or through a
 * direct-addressed TMU lookup).
 *
 * The shader validator walks over a shader's BO, ensuring that its
 * accesses are appropriately bounded, and recording where texture
 * accesses are made so that we can do relocations for them in the
 * uniform stream.
 *
 * Shader BOs are immutable for their lifetimes (enforced by not
 * allowing mmaps, GEM prime export, or rendering to them from a CL),
 * so this validation is only performed at BO creation time.
 */

#include "vc4_drv.h"
#include "vc4_qpu_defines.h"

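/* One live-value slot per writable location we track: 32 registers in
 * each of the A and B register files, plus the four accumulators r0-r3
 * (see waddr_to_live_reg_index() for the index layout).
 */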
#define LIVE_REG_COUNT (32 + 32 + 4)

struct vc4_shader_validation_state {
	/* Current IP being validated. */
	uint32_t ip;

	/* IP at the end of the BO, do not read shader[max_ip] */
	uint32_t max_ip;

	uint64_t *shader;

	struct vc4_texture_sample_info tmu_setup[2];
	int tmu_write_count[2];

	/* For registers that were last written to by a MIN instruction with
	 * one argument being a uniform, the address of the uniform.
	 * Otherwise, ~0.
	 *
	 * This is used for the validation of direct address memory reads.
	 */
	uint32_t live_min_clamp_offsets[LIVE_REG_COUNT];
	bool live_max_clamp_regs[LIVE_REG_COUNT];
	uint32_t live_immediates[LIVE_REG_COUNT];

	/* Bitfield of which IPs are used as branch targets.
	 *
	 * Used for validation that the uniform stream is updated at the right
	 * points and clearing the texturing/clamping state.
	 */
	unsigned long *branch_targets;

	/* Set when entering a basic block, and cleared when the uniform
	 * address update is found. This is used to make sure that we don't
	 * read uniforms when the address is undefined.
	 */
	bool needs_uniform_address_update;

	/* Set when we find a backwards branch. If the branch is backwards,
	 * the target is probably doing an address reset to read uniforms,
	 * and so we need to be sure that a uniforms address is present in
	 * the stream, even if the shader didn't need to read uniforms in
	 * later basic blocks.
	 */
	bool needs_uniform_address_for_loop;

	/* Set when we find an instruction writing the top half of the
	 * register files. If we allowed writing the unusable regs in
	 * a threaded shader, then the clamp validation of the other
	 * shader running on our QPU would be invalid.
	 */
	bool all_registers_used;
};
99 | |
100 | static uint32_t |
101 | waddr_to_live_reg_index(uint32_t waddr, bool is_b) |
102 | { |
103 | if (waddr < 32) { |
104 | if (is_b) |
105 | return 32 + waddr; |
106 | else |
107 | return waddr; |
108 | } else if (waddr <= QPU_W_ACC3) { |
109 | return 64 + waddr - QPU_W_ACC0; |
110 | } else { |
111 | return ~0; |
112 | } |
113 | } |
114 | |
115 | static uint32_t |
116 | raddr_add_a_to_live_reg_index(uint64_t inst) |
117 | { |
118 | uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); |
119 | uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A); |
120 | uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); |
121 | uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); |
122 | |
123 | if (add_a == QPU_MUX_A) |
124 | return raddr_a; |
125 | else if (add_a == QPU_MUX_B && sig != QPU_SIG_SMALL_IMM) |
126 | return 32 + raddr_b; |
127 | else if (add_a <= QPU_MUX_R3) |
128 | return 64 + add_a; |
129 | else |
130 | return ~0; |
131 | } |
132 | |
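/* In threaded mode, the two threads sharing a QPU each get only the
 * bottom half (registers 0-15) of the A and B register files; these are
 * the live-reg indices that fall in the unusable top half.
 */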
static bool
live_reg_is_upper_half(uint32_t lri)
{
	return (lri >= 16 && lri < 32) ||
	       (lri >= 32 + 16 && lri < 32 + 32);
}

static bool
is_tmu_submit(uint32_t waddr)
{
	return (waddr == QPU_W_TMU0_S ||
		waddr == QPU_W_TMU1_S);
}

static bool
is_tmu_write(uint32_t waddr)
{
	return (waddr >= QPU_W_TMU0_S &&
		waddr <= QPU_W_TMU1_B);
}

static bool
record_texture_sample(struct vc4_validated_shader_info *validated_shader,
		      struct vc4_shader_validation_state *validation_state,
		      int tmu)
{
	uint32_t s = validated_shader->num_texture_samples;
	int i;
	struct vc4_texture_sample_info *temp_samples;

	temp_samples = krealloc(validated_shader->texture_samples,
				(s + 1) * sizeof(*temp_samples),
				GFP_KERNEL);
	if (!temp_samples)
		return false;

	memcpy(&temp_samples[s],
	       &validation_state->tmu_setup[tmu],
	       sizeof(*temp_samples));

	validated_shader->num_texture_samples = s + 1;
	validated_shader->texture_samples = temp_samples;

	for (i = 0; i < 4; i++)
		validation_state->tmu_setup[tmu].p_offset[i] = ~0;

	return true;
}

static bool
check_tmu_write(struct vc4_validated_shader_info *validated_shader,
		struct vc4_shader_validation_state *validation_state,
		bool is_mul)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t waddr = (is_mul ?
			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	int tmu = waddr > QPU_W_TMU0_B;
	bool submit = is_tmu_submit(waddr);
	bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0;
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

	if (is_direct) {
		uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
		uint32_t clamp_reg, clamp_offset;

		if (sig == QPU_SIG_SMALL_IMM) {
			DRM_DEBUG("direct TMU read used small immediate\n");
			return false;
		}

		/* Make sure that this texture load is an add of the base
		 * address of the UBO to a clamped offset within the UBO.
		 */
		if (is_mul ||
		    QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
			DRM_DEBUG("direct TMU load wasn't an add\n");
			return false;
		}

		/* We assert that the clamped address is the first
		 * argument, and the UBO base address is the second argument.
		 * This is arbitrary, but simpler than supporting flipping the
		 * two either way.
		 */
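		/* A hedged sketch of the accepted shape, in illustrative
		 * QPU assembly (register names are hypothetical):
		 *
		 *	max	ra0, x, 0	; tracked by track_live_clamps()
		 *	min	ra1, ra0, unif	; unif = UBO size bound
		 *	add	t0s, ra1, unif	; unif = UBO base address
		 *
		 * The two clamp steps were recorded earlier; here we verify
		 * the final add.
		 */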
		clamp_reg = raddr_add_a_to_live_reg_index(inst);
		if (clamp_reg == ~0) {
			DRM_DEBUG("direct TMU load wasn't clamped\n");
			return false;
		}

		clamp_offset = validation_state->live_min_clamp_offsets[clamp_reg];
		if (clamp_offset == ~0) {
			DRM_DEBUG("direct TMU load wasn't clamped\n");
			return false;
		}

		/* Store the clamp value's offset in p1 (see reloc_tex() in
		 * vc4_validate.c).
		 */
		validation_state->tmu_setup[tmu].p_offset[1] =
			clamp_offset;

		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
			DRM_DEBUG("direct TMU load didn't add to a uniform\n");
			return false;
		}

		validation_state->tmu_setup[tmu].is_direct = true;
	} else {
		if (raddr_a == QPU_R_UNIF || (sig != QPU_SIG_SMALL_IMM &&
					      raddr_b == QPU_R_UNIF)) {
			DRM_DEBUG("uniform read in the same instruction as "
				  "texture setup.\n");
			return false;
		}
	}

	if (validation_state->tmu_write_count[tmu] >= 4) {
		DRM_DEBUG("TMU%d got too many parameters before dispatch\n",
			  tmu);
		return false;
	}
	validation_state->tmu_setup[tmu].p_offset[validation_state->tmu_write_count[tmu]] =
		validated_shader->uniforms_size;
	validation_state->tmu_write_count[tmu]++;
	/* Since direct uses a RADDR uniform reference, it will get counted in
	 * check_instruction_reads().
	 */
	if (!is_direct) {
		if (validation_state->needs_uniform_address_update) {
			DRM_DEBUG("Texturing with undefined uniform address\n");
			return false;
		}

		validated_shader->uniforms_size += 4;
	}

	if (submit) {
		if (!record_texture_sample(validated_shader,
					   validation_state, tmu)) {
			return false;
		}

		validation_state->tmu_write_count[tmu] = 0;
	}

	return true;
}

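/* Notes in the validated shader info that a uniforms-stream address must
 * be supplied at the current uniform offset; the render job validation in
 * vc4_validate.c uses these recorded offsets to patch the real address in
 * when copying the user's uniform stream.
 */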
static bool require_uniform_address_uniform(struct vc4_validated_shader_info *validated_shader)
{
	uint32_t o = validated_shader->num_uniform_addr_offsets;
	uint32_t num_uniforms = validated_shader->uniforms_size / 4;

	validated_shader->uniform_addr_offsets =
		krealloc(validated_shader->uniform_addr_offsets,
			 (o + 1) *
			 sizeof(*validated_shader->uniform_addr_offsets),
			 GFP_KERNEL);
	if (!validated_shader->uniform_addr_offsets)
		return false;

	validated_shader->uniform_addr_offsets[o] = num_uniforms;
	validated_shader->num_uniform_addr_offsets++;

	return true;
}

static bool
validate_uniform_address_write(struct vc4_validated_shader_info *validated_shader,
			       struct vc4_shader_validation_state *validation_state,
			       bool is_mul)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	u32 add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
	u32 raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	u32 raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	u32 add_lri = raddr_add_a_to_live_reg_index(inst);
	/* We want our reset to be pointing at whatever uniform follows the
	 * uniforms base address.
	 */
	u32 expected_offset = validated_shader->uniforms_size + 4;

	/* We only support absolute uniform address changes, and we
	 * require that they be in the current basic block before any
	 * of its uniform reads.
	 *
	 * One could potentially emit more efficient QPU code, by
	 * noticing that (say) an if statement does uniform control
	 * flow for all threads and that the if reads the same number
	 * of uniforms on each side. However, this scheme is easy to
	 * validate so it's all we allow for now.
	 */
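	/* Illustrative shape of an accepted reset (hypothetical QPU
	 * assembly); the immediate must name the offset of the uniform
	 * right after the new-address uniform itself:
	 *
	 *	load_imm	ra0, <uniforms_size + 4>
	 *	add		unif_addr, ra0, unif
	 */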
	switch (QPU_GET_FIELD(inst, QPU_SIG)) {
	case QPU_SIG_NONE:
	case QPU_SIG_SCOREBOARD_UNLOCK:
	case QPU_SIG_COLOR_LOAD:
	case QPU_SIG_LOAD_TMU0:
	case QPU_SIG_LOAD_TMU1:
		break;
	default:
		DRM_DEBUG("uniforms address change must be "
			  "normal math\n");
		return false;
	}

	if (is_mul || QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_ADD) {
		DRM_DEBUG("Uniform address reset must be an ADD.\n");
		return false;
	}

	if (QPU_GET_FIELD(inst, QPU_COND_ADD) != QPU_COND_ALWAYS) {
		DRM_DEBUG("Uniform address reset must be unconditional.\n");
		return false;
	}

	if (QPU_GET_FIELD(inst, QPU_PACK) != QPU_PACK_A_NOP &&
	    !(inst & QPU_PM)) {
		DRM_DEBUG("No packing allowed on uniforms reset\n");
		return false;
	}

	if (add_lri == -1) {
		DRM_DEBUG("First argument of uniform address write must be "
			  "an immediate value.\n");
		return false;
	}

	if (validation_state->live_immediates[add_lri] != expected_offset) {
		DRM_DEBUG("Resetting uniforms with offset %db instead of %db\n",
			  validation_state->live_immediates[add_lri],
			  expected_offset);
		return false;
	}

	if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
	    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) {
		DRM_DEBUG("Second argument of uniform address write must be "
			  "a uniform.\n");
		return false;
	}

	validation_state->needs_uniform_address_update = false;
	validation_state->needs_uniform_address_for_loop = false;
	return require_uniform_address_uniform(validated_shader);
}

static bool
check_reg_write(struct vc4_validated_shader_info *validated_shader,
		struct vc4_shader_validation_state *validation_state,
		bool is_mul)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t waddr = (is_mul ?
			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
	bool ws = inst & QPU_WS;
	bool is_b = is_mul ^ ws;
	u32 lri = waddr_to_live_reg_index(waddr, is_b);

	if (lri != -1) {
		uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
		uint32_t cond_mul = QPU_GET_FIELD(inst, QPU_COND_MUL);

		if (sig == QPU_SIG_LOAD_IMM &&
		    QPU_GET_FIELD(inst, QPU_PACK) == QPU_PACK_A_NOP &&
		    ((is_mul && cond_mul == QPU_COND_ALWAYS) ||
		     (!is_mul && cond_add == QPU_COND_ALWAYS))) {
			validation_state->live_immediates[lri] =
				QPU_GET_FIELD(inst, QPU_LOAD_IMM);
		} else {
			validation_state->live_immediates[lri] = ~0;
		}

		if (live_reg_is_upper_half(lri))
			validation_state->all_registers_used = true;
	}

	switch (waddr) {
	case QPU_W_UNIFORMS_ADDRESS:
		if (is_b) {
			DRM_DEBUG("relative uniforms address change "
				  "unsupported\n");
			return false;
		}

		return validate_uniform_address_write(validated_shader,
						      validation_state,
						      is_mul);

	case QPU_W_TLB_COLOR_MS:
	case QPU_W_TLB_COLOR_ALL:
	case QPU_W_TLB_Z:
		/* These only interact with the tile buffer, not main memory,
		 * so they're safe.
		 */
		return true;

	case QPU_W_TMU0_S:
	case QPU_W_TMU0_T:
	case QPU_W_TMU0_R:
	case QPU_W_TMU0_B:
	case QPU_W_TMU1_S:
	case QPU_W_TMU1_T:
	case QPU_W_TMU1_R:
	case QPU_W_TMU1_B:
		return check_tmu_write(validated_shader, validation_state,
				       is_mul);

	case QPU_W_HOST_INT:
	case QPU_W_TMU_NOSWAP:
	case QPU_W_TLB_ALPHA_MASK:
	case QPU_W_MUTEX_RELEASE:
		/* XXX: I haven't thought about these, so don't support them
		 * for now.
		 */
		DRM_DEBUG("Unsupported waddr %d\n", waddr);
		return false;

	case QPU_W_VPM_ADDR:
		DRM_DEBUG("General VPM DMA unsupported\n");
		return false;

	case QPU_W_VPM:
	case QPU_W_VPMVCD_SETUP:
		/* We allow VPM setup in general, even including VPM DMA
		 * configuration setup, because the (unsafe) DMA can only be
		 * triggered by QPU_W_VPM_ADDR writes.
		 */
		return true;

	case QPU_W_TLB_STENCIL_SETUP:
		return true;
	}

	return true;
}

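/* Propagates the live clamp state that direct-addressed TMU validation
 * relies on: a MAX(x, 0) flags its destination in live_max_clamp_regs[],
 * and a following MIN(that, uniform) records the uniform's offset in
 * live_min_clamp_offsets[]. Any other write to a register kills its
 * tracked state.
 */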
static void
track_live_clamps(struct vc4_validated_shader_info *validated_shader,
		  struct vc4_shader_validation_state *validation_state)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
	uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
	uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
	bool ws = inst & QPU_WS;
	uint32_t lri_add_a, lri_add, lri_mul;
	bool add_a_is_min_0;

	/* Check whether OP_ADD's A argument comes from a live MAX(x, 0),
	 * before we clear previous live state.
	 */
	lri_add_a = raddr_add_a_to_live_reg_index(inst);
	add_a_is_min_0 = (lri_add_a != ~0 &&
			  validation_state->live_max_clamp_regs[lri_add_a]);

	/* Clear live state for registers written by our instruction. */
	lri_add = waddr_to_live_reg_index(waddr_add, ws);
	lri_mul = waddr_to_live_reg_index(waddr_mul, !ws);
	if (lri_mul != ~0) {
		validation_state->live_max_clamp_regs[lri_mul] = false;
		validation_state->live_min_clamp_offsets[lri_mul] = ~0;
	}
	if (lri_add != ~0) {
		validation_state->live_max_clamp_regs[lri_add] = false;
		validation_state->live_min_clamp_offsets[lri_add] = ~0;
	} else {
		/* Nothing further to do for live tracking, since only ADDs
		 * generate new live clamp registers.
		 */
		return;
	}

	/* Now, handle remaining live clamp tracking for the ADD operation. */

	if (cond_add != QPU_COND_ALWAYS)
		return;

	if (op_add == QPU_A_MAX) {
		/* Track live clamps of a value to a minimum of 0 (in either
		 * arg).
		 */
		if (sig != QPU_SIG_SMALL_IMM || raddr_b != 0 ||
		    (add_a != QPU_MUX_B && add_b != QPU_MUX_B)) {
			return;
		}

		validation_state->live_max_clamp_regs[lri_add] = true;
	} else if (op_add == QPU_A_MIN) {
		/* Track live clamps of a value clamped to a minimum of 0 and
		 * a maximum of some uniform's offset.
		 */
		if (!add_a_is_min_0)
			return;

		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
		      sig != QPU_SIG_SMALL_IMM)) {
			return;
		}

		validation_state->live_min_clamp_offsets[lri_add] =
			validated_shader->uniforms_size;
	}
}

static bool
check_instruction_writes(struct vc4_validated_shader_info *validated_shader,
			 struct vc4_shader_validation_state *validation_state)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
	bool ok;

	if (is_tmu_write(waddr_add) && is_tmu_write(waddr_mul)) {
		DRM_DEBUG("ADD and MUL both set up textures\n");
		return false;
	}

	ok = (check_reg_write(validated_shader, validation_state, false) &&
	      check_reg_write(validated_shader, validation_state, true));

	track_live_clamps(validated_shader, validation_state);

	return ok;
}

static bool
check_branch(uint64_t inst,
	     struct vc4_validated_shader_info *validated_shader,
	     struct vc4_shader_validation_state *validation_state,
	     int ip)
{
	int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);

	if ((int)branch_imm < 0)
		validation_state->needs_uniform_address_for_loop = true;

	/* We don't want to have to worry about validation of this, and
	 * there's no need for it.
	 */
	if (waddr_add != QPU_W_NOP || waddr_mul != QPU_W_NOP) {
		DRM_DEBUG("branch instruction at %d wrote a register.\n",
			  validation_state->ip);
		return false;
	}

	return true;
}

static bool
check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
			struct vc4_shader_validation_state *validation_state)
{
	uint64_t inst = validation_state->shader[validation_state->ip];
	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

	if (raddr_a == QPU_R_UNIF ||
	    (raddr_b == QPU_R_UNIF && sig != QPU_SIG_SMALL_IMM)) {
		/* This can't overflow the uint32_t, because we're reading 8
		 * bytes of instruction to increment by 4 here, so we'd
		 * already be OOM.
		 */
		validated_shader->uniforms_size += 4;

		if (validation_state->needs_uniform_address_update) {
			DRM_DEBUG("Uniform read with undefined uniform "
				  "address\n");
			return false;
		}
	}

	if ((raddr_a >= 16 && raddr_a < 32) ||
	    (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
		validation_state->all_registers_used = true;
	}

	return true;
}

/* Make sure that all branches are absolute and point within the shader, and
 * note their targets for later.
 */
static bool
vc4_validate_branches(struct vc4_shader_validation_state *validation_state)
{
	uint32_t max_branch_target = 0;
	int ip;
	int last_branch = -2;

	for (ip = 0; ip < validation_state->max_ip; ip++) {
		uint64_t inst = validation_state->shader[ip];
		int32_t branch_imm = QPU_GET_FIELD(inst, QPU_BRANCH_TARGET);
		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
		uint32_t after_delay_ip = ip + 4;
		uint32_t branch_target_ip;

		if (sig == QPU_SIG_PROG_END) {
			/* There are two delay slots after program end is
			 * signaled that are still executed, then we're
			 * finished. validation_state->max_ip is the
			 * instruction after the last valid instruction in the
			 * program.
			 */
			validation_state->max_ip = ip + 3;
			continue;
		}

		if (sig != QPU_SIG_BRANCH)
			continue;

		if (ip - last_branch < 4) {
			DRM_DEBUG("Branch at %d during delay slots\n", ip);
			return false;
		}
		last_branch = ip;

		if (inst & QPU_BRANCH_REG) {
			DRM_DEBUG("branching from register relative "
				  "not supported\n");
			return false;
		}

		if (!(inst & QPU_BRANCH_REL)) {
			DRM_DEBUG("relative branching required\n");
			return false;
		}

		/* The actual branch target is the instruction after the delay
		 * slots, plus whatever byte offset is in the low 32 bits of
		 * the instruction. Make sure we're not branching beyond the
		 * end of the shader object.
		 */
		if (branch_imm % sizeof(inst) != 0) {
			DRM_DEBUG("branch target not aligned\n");
			return false;
		}

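		/* Worked example: a branch at ip 10 with branch_imm -64
		 * bytes has after_delay_ip 14 and targets instruction
		 * 14 + (-64 / 8) = 6.
		 */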
		branch_target_ip = after_delay_ip + (branch_imm >> 3);
		if (branch_target_ip >= validation_state->max_ip) {
			DRM_DEBUG("Branch at %d outside of shader (ip %d/%d)\n",
				  ip, branch_target_ip,
				  validation_state->max_ip);
			return false;
		}
		set_bit(branch_target_ip, validation_state->branch_targets);

		/* Make sure that the non-branching path is also not outside
		 * the shader.
		 */
		if (after_delay_ip >= validation_state->max_ip) {
			DRM_DEBUG("Branch at %d continues past shader end "
				  "(%d/%d)\n",
				  ip, after_delay_ip, validation_state->max_ip);
			return false;
		}
		set_bit(after_delay_ip, validation_state->branch_targets);
		max_branch_target = max(max_branch_target, after_delay_ip);
	}

	if (max_branch_target > validation_state->max_ip - 3) {
		DRM_DEBUG("Branch landed after QPU_SIG_PROG_END");
		return false;
	}

	return true;
}

/* Resets any known state for the shader, used when we may be branched to from
 * multiple locations in the program (or at shader start).
 */
static void
reset_validation_state(struct vc4_shader_validation_state *validation_state)
{
	int i;

	for (i = 0; i < 8; i++)
		validation_state->tmu_setup[i / 4].p_offset[i % 4] = ~0;

	for (i = 0; i < LIVE_REG_COUNT; i++) {
		validation_state->live_min_clamp_offsets[i] = ~0;
		validation_state->live_max_clamp_regs[i] = false;
		validation_state->live_immediates[i] = ~0;
	}
}

static bool
texturing_in_progress(struct vc4_shader_validation_state *validation_state)
{
	return (validation_state->tmu_write_count[0] != 0 ||
		validation_state->tmu_write_count[1] != 0);
}

static bool
vc4_handle_branch_target(struct vc4_shader_validation_state *validation_state)
{
	uint32_t ip = validation_state->ip;

	if (!test_bit(ip, validation_state->branch_targets))
		return true;

	if (texturing_in_progress(validation_state)) {
		DRM_DEBUG("Branch target landed during TMU setup\n");
		return false;
	}

	/* Reset our live values tracking, since this instruction may have
	 * multiple predecessors.
	 *
	 * One could potentially do analysis to determine that, for
	 * example, all predecessors have a live max clamp in the same
	 * register, but we don't bother with that.
	 */
	reset_validation_state(validation_state);

	/* Since we've entered a basic block from potentially multiple
	 * predecessors, we need the uniforms address to be updated before
	 * any uniforms are read. We require that after any branch point,
	 * the next uniform to be loaded is a uniform address offset. That
	 * uniform's offset will be marked by the uniform address register
	 * write validation, or by a one-off check at the end of the program.
	 */
	validation_state->needs_uniform_address_update = true;

	return true;
}
777 | |
778 | struct vc4_validated_shader_info * |
779 | vc4_validate_shader(struct drm_gem_dma_object *shader_obj) |
780 | { |
781 | struct vc4_dev *vc4 = to_vc4_dev(shader_obj->base.dev); |
782 | bool found_shader_end = false; |
783 | int shader_end_ip = 0; |
784 | uint32_t last_thread_switch_ip = -3; |
785 | uint32_t ip; |
786 | struct vc4_validated_shader_info *validated_shader = NULL; |
787 | struct vc4_shader_validation_state validation_state; |
788 | |
789 | if (WARN_ON_ONCE(vc4->is_vc5)) |
790 | return NULL; |
791 | |
792 | memset(&validation_state, 0, sizeof(validation_state)); |
793 | validation_state.shader = shader_obj->vaddr; |
794 | validation_state.max_ip = shader_obj->base.size / sizeof(uint64_t); |
795 | |
796 | reset_validation_state(validation_state: &validation_state); |
797 | |
798 | validation_state.branch_targets = |
799 | kcalloc(BITS_TO_LONGS(validation_state.max_ip), |
800 | size: sizeof(unsigned long), GFP_KERNEL); |
801 | if (!validation_state.branch_targets) |
802 | goto fail; |
803 | |
804 | validated_shader = kcalloc(n: 1, size: sizeof(*validated_shader), GFP_KERNEL); |
805 | if (!validated_shader) |
806 | goto fail; |
807 | |
808 | if (!vc4_validate_branches(validation_state: &validation_state)) |
809 | goto fail; |
810 | |
	for (ip = 0; ip < validation_state.max_ip; ip++) {
		uint64_t inst = validation_state.shader[ip];
		uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);

		validation_state.ip = ip;

		if (!vc4_handle_branch_target(&validation_state))
			goto fail;

		if (ip == last_thread_switch_ip + 3) {
			/* Reset r0-r3 live clamp data */
			int i;

			for (i = 64; i < LIVE_REG_COUNT; i++) {
				validation_state.live_min_clamp_offsets[i] = ~0;
				validation_state.live_max_clamp_regs[i] = false;
				validation_state.live_immediates[i] = ~0;
			}
		}

		switch (sig) {
		case QPU_SIG_NONE:
		case QPU_SIG_WAIT_FOR_SCOREBOARD:
		case QPU_SIG_SCOREBOARD_UNLOCK:
		case QPU_SIG_COLOR_LOAD:
		case QPU_SIG_LOAD_TMU0:
		case QPU_SIG_LOAD_TMU1:
		case QPU_SIG_PROG_END:
		case QPU_SIG_SMALL_IMM:
		case QPU_SIG_THREAD_SWITCH:
		case QPU_SIG_LAST_THREAD_SWITCH:
			if (!check_instruction_writes(validated_shader,
						      &validation_state)) {
				DRM_DEBUG("Bad write at ip %d\n", ip);
				goto fail;
			}

			if (!check_instruction_reads(validated_shader,
						     &validation_state))
				goto fail;

			if (sig == QPU_SIG_PROG_END) {
				found_shader_end = true;
				shader_end_ip = ip;
			}

			if (sig == QPU_SIG_THREAD_SWITCH ||
			    sig == QPU_SIG_LAST_THREAD_SWITCH) {
				validated_shader->is_threaded = true;

				if (ip < last_thread_switch_ip + 3) {
					DRM_DEBUG("Thread switch too soon after "
						  "last switch at ip %d\n", ip);
					goto fail;
				}
				last_thread_switch_ip = ip;
			}

			break;

		case QPU_SIG_LOAD_IMM:
			if (!check_instruction_writes(validated_shader,
						      &validation_state)) {
				DRM_DEBUG("Bad LOAD_IMM write at ip %d\n", ip);
				goto fail;
			}
			break;

		case QPU_SIG_BRANCH:
			if (!check_branch(inst, validated_shader,
					  &validation_state, ip))
				goto fail;

			if (ip < last_thread_switch_ip + 3) {
				DRM_DEBUG("Branch in thread switch at ip %d",
					  ip);
				goto fail;
			}

			break;
		default:
			DRM_DEBUG("Unsupported QPU signal %d at "
				  "instruction %d\n", sig, ip);
			goto fail;
		}

		/* There are two delay slots after program end is signaled
		 * that are still executed, then we're finished.
		 */
		if (found_shader_end && ip == shader_end_ip + 2)
			break;
	}

	if (ip == validation_state.max_ip) {
		DRM_DEBUG("shader failed to terminate before "
			  "shader BO end at %zd\n",
			  shader_obj->base.size);
		goto fail;
	}

	/* Might corrupt other thread */
	if (validated_shader->is_threaded &&
	    validation_state.all_registers_used) {
		DRM_DEBUG("Shader uses threading, but uses the upper "
			  "half of the registers, too\n");
		goto fail;
	}

	/* If we did a backwards branch and we haven't emitted a uniforms
	 * reset since then, we still need the uniforms stream to have the
	 * uniforms address available so that the backwards branch can do its
	 * uniforms reset.
	 *
	 * We could potentially prove that the backwards branch doesn't
	 * contain any uses of uniforms until program exit, but that doesn't
	 * seem to be worth the trouble.
	 */
	if (validation_state.needs_uniform_address_for_loop) {
		if (!require_uniform_address_uniform(validated_shader))
			goto fail;
		validated_shader->uniforms_size += 4;
	}

	/* Again, no chance of integer overflow here because the worst case
	 * scenario is 8 bytes of uniforms plus handles per 8-byte
	 * instruction.
	 */
	validated_shader->uniforms_src_size =
		(validated_shader->uniforms_size +
		 4 * validated_shader->num_texture_samples);

	kfree(validation_state.branch_targets);

	return validated_shader;

fail:
	kfree(validation_state.branch_targets);
	if (validated_shader) {
		kfree(validated_shader->uniform_addr_offsets);
		kfree(validated_shader->texture_samples);
		kfree(validated_shader);
	}
	return NULL;
}
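
/*
 * A minimal sketch of the expected calling pattern (hedged; the in-tree
 * caller is the shader BO creation path in vc4_bo.c):
 *
 *	struct vc4_validated_shader_info *info;
 *
 *	info = vc4_validate_shader(&bo->base);
 *	if (!info)
 *		return -EINVAL;
 *
 * On success the caller owns @info and must eventually free its
 * texture_samples and uniform_addr_offsets arrays along with the struct
 * itself.
 */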
955 | |