1 | // SPDX-License-Identifier: GPL-2.0 OR MIT |
2 | /* |
3 | * Copyright 2015-2022 Advanced Micro Devices, Inc. |
4 | * |
5 | * Permission is hereby granted, free of charge, to any person obtaining a |
6 | * copy of this software and associated documentation files (the "Software"), |
7 | * to deal in the Software without restriction, including without limitation |
8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
9 | * and/or sell copies of the Software, and to permit persons to whom the |
10 | * Software is furnished to do so, subject to the following conditions: |
11 | * |
12 | * The above copyright notice and this permission notice shall be included in |
13 | * all copies or substantial portions of the Software. |
14 | * |
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
18 | * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR |
19 | * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, |
20 | * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
21 | * OTHER DEALINGS IN THE SOFTWARE. |
22 | */ |
23 | |
24 | #include <linux/pci.h> |
25 | #include <linux/acpi.h> |
26 | #include "kfd_crat.h" |
27 | #include "kfd_priv.h" |
28 | #include "kfd_topology.h" |
29 | #include "amdgpu.h" |
30 | #include "amdgpu_amdkfd.h" |
31 | |
32 | /* GPU Processor ID base for dGPUs for which VCRAT needs to be created. |
 * GPU processor IDs are expressed with Bit[31]=1.
34 | * The base is set to 0x8000_0000 + 0x1000 to avoid collision with GPU IDs |
35 | * used in the CRAT. |
36 | */ |
37 | static uint32_t gpu_processor_id_low = 0x80001000; |
38 | |
/* Return the next available gpu_processor_id and increment it for the next GPU
 * @total_cu_count - Total CUs present in the GPU, including those
 *                   masked off
42 | */ |
43 | static inline unsigned int get_and_inc_gpu_processor_id( |
44 | unsigned int total_cu_count) |
45 | { |
	unsigned int current_id = gpu_processor_id_low;
47 | |
48 | gpu_processor_id_low += total_cu_count; |
49 | return current_id; |
50 | } |
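
/*
 * Illustrative example (hypothetical CU counts, not part of the driver):
 * if the first two GPUs report 64 and 40 total CUs, the allocator above
 * hands out processor ID bases as follows:
 *
 *	id0 = get_and_inc_gpu_processor_id(64);		id0 = 0x80001000
 *	id1 = get_and_inc_gpu_processor_id(40);		id1 = 0x80001040
 *
 * Each GPU thus reserves one processor ID per CU (masked-off CUs
 * included) starting at gpu_processor_id_low.
 */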
51 | |
52 | |
53 | static struct kfd_gpu_cache_info kaveri_cache_info[] = { |
54 | { |
55 | /* TCP L1 Cache per CU */ |
56 | .cache_size = 16, |
57 | .cache_level = 1, |
58 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
59 | CRAT_CACHE_FLAGS_DATA_CACHE | |
60 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
61 | .num_cu_shared = 1, |
62 | }, |
63 | { |
64 | /* Scalar L1 Instruction Cache (in SQC module) per bank */ |
65 | .cache_size = 16, |
66 | .cache_level = 1, |
67 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
68 | CRAT_CACHE_FLAGS_INST_CACHE | |
69 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
70 | .num_cu_shared = 2, |
71 | }, |
72 | { |
73 | /* Scalar L1 Data Cache (in SQC module) per bank */ |
74 | .cache_size = 8, |
75 | .cache_level = 1, |
76 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
77 | CRAT_CACHE_FLAGS_DATA_CACHE | |
78 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
79 | .num_cu_shared = 2, |
80 | }, |
81 | |
82 | /* TODO: Add L2 Cache information */ |
83 | }; |
84 | |
85 | |
86 | static struct kfd_gpu_cache_info carrizo_cache_info[] = { |
87 | { |
88 | /* TCP L1 Cache per CU */ |
89 | .cache_size = 16, |
90 | .cache_level = 1, |
91 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
92 | CRAT_CACHE_FLAGS_DATA_CACHE | |
93 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
94 | .num_cu_shared = 1, |
95 | }, |
96 | { |
97 | /* Scalar L1 Instruction Cache (in SQC module) per bank */ |
98 | .cache_size = 8, |
99 | .cache_level = 1, |
100 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
101 | CRAT_CACHE_FLAGS_INST_CACHE | |
102 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
103 | .num_cu_shared = 4, |
104 | }, |
105 | { |
106 | /* Scalar L1 Data Cache (in SQC module) per bank. */ |
107 | .cache_size = 4, |
108 | .cache_level = 1, |
109 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
110 | CRAT_CACHE_FLAGS_DATA_CACHE | |
111 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
112 | .num_cu_shared = 4, |
113 | }, |
114 | |
115 | /* TODO: Add L2 Cache information */ |
116 | }; |
117 | |
118 | #define hawaii_cache_info kaveri_cache_info |
119 | #define tonga_cache_info carrizo_cache_info |
120 | #define fiji_cache_info carrizo_cache_info |
121 | #define polaris10_cache_info carrizo_cache_info |
122 | #define polaris11_cache_info carrizo_cache_info |
123 | #define polaris12_cache_info carrizo_cache_info |
124 | #define vegam_cache_info carrizo_cache_info |
125 | |
126 | /* NOTE: L1 cache information has been updated and L2/L3 |
127 | * cache information has been added for Vega10 and |
128 | * newer ASICs. The unit for cache_size is KiB. |
 * Going forward, cache details must be checked and
 * updated for every new ASIC.
131 | */ |
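
/* For example, in the vega10 table below the L2 entry has
 * .cache_size = 4096, i.e. a 4 MiB cache; .num_cu_shared is the number
 * of CUs that share one instance of that cache.
 */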
132 | |
133 | static struct kfd_gpu_cache_info vega10_cache_info[] = { |
134 | { |
135 | /* TCP L1 Cache per CU */ |
136 | .cache_size = 16, |
137 | .cache_level = 1, |
138 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
139 | CRAT_CACHE_FLAGS_DATA_CACHE | |
140 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
141 | .num_cu_shared = 1, |
142 | }, |
143 | { |
144 | /* Scalar L1 Instruction Cache per SQC */ |
145 | .cache_size = 32, |
146 | .cache_level = 1, |
147 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
148 | CRAT_CACHE_FLAGS_INST_CACHE | |
149 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
150 | .num_cu_shared = 3, |
151 | }, |
152 | { |
153 | /* Scalar L1 Data Cache per SQC */ |
154 | .cache_size = 16, |
155 | .cache_level = 1, |
156 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
157 | CRAT_CACHE_FLAGS_DATA_CACHE | |
158 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
159 | .num_cu_shared = 3, |
160 | }, |
161 | { |
162 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
163 | .cache_size = 4096, |
164 | .cache_level = 2, |
165 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
166 | CRAT_CACHE_FLAGS_DATA_CACHE | |
167 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
168 | .num_cu_shared = 16, |
169 | }, |
170 | }; |
171 | |
172 | static struct kfd_gpu_cache_info raven_cache_info[] = { |
173 | { |
174 | /* TCP L1 Cache per CU */ |
175 | .cache_size = 16, |
176 | .cache_level = 1, |
177 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
178 | CRAT_CACHE_FLAGS_DATA_CACHE | |
179 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
180 | .num_cu_shared = 1, |
181 | }, |
182 | { |
183 | /* Scalar L1 Instruction Cache per SQC */ |
184 | .cache_size = 32, |
185 | .cache_level = 1, |
186 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
187 | CRAT_CACHE_FLAGS_INST_CACHE | |
188 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
189 | .num_cu_shared = 3, |
190 | }, |
191 | { |
192 | /* Scalar L1 Data Cache per SQC */ |
193 | .cache_size = 16, |
194 | .cache_level = 1, |
195 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
196 | CRAT_CACHE_FLAGS_DATA_CACHE | |
197 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
198 | .num_cu_shared = 3, |
199 | }, |
200 | { |
201 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
202 | .cache_size = 1024, |
203 | .cache_level = 2, |
204 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
205 | CRAT_CACHE_FLAGS_DATA_CACHE | |
206 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
207 | .num_cu_shared = 11, |
208 | }, |
209 | }; |
210 | |
211 | static struct kfd_gpu_cache_info renoir_cache_info[] = { |
212 | { |
213 | /* TCP L1 Cache per CU */ |
214 | .cache_size = 16, |
215 | .cache_level = 1, |
216 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
217 | CRAT_CACHE_FLAGS_DATA_CACHE | |
218 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
219 | .num_cu_shared = 1, |
220 | }, |
221 | { |
222 | /* Scalar L1 Instruction Cache per SQC */ |
223 | .cache_size = 32, |
224 | .cache_level = 1, |
225 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
226 | CRAT_CACHE_FLAGS_INST_CACHE | |
227 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
228 | .num_cu_shared = 3, |
229 | }, |
230 | { |
231 | /* Scalar L1 Data Cache per SQC */ |
232 | .cache_size = 16, |
233 | .cache_level = 1, |
234 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
235 | CRAT_CACHE_FLAGS_DATA_CACHE | |
236 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
237 | .num_cu_shared = 3, |
238 | }, |
239 | { |
240 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
241 | .cache_size = 1024, |
242 | .cache_level = 2, |
243 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
244 | CRAT_CACHE_FLAGS_DATA_CACHE | |
245 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
246 | .num_cu_shared = 8, |
247 | }, |
248 | }; |
249 | |
250 | static struct kfd_gpu_cache_info vega12_cache_info[] = { |
251 | { |
252 | /* TCP L1 Cache per CU */ |
253 | .cache_size = 16, |
254 | .cache_level = 1, |
255 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
256 | CRAT_CACHE_FLAGS_DATA_CACHE | |
257 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
258 | .num_cu_shared = 1, |
259 | }, |
260 | { |
261 | /* Scalar L1 Instruction Cache per SQC */ |
262 | .cache_size = 32, |
263 | .cache_level = 1, |
264 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
265 | CRAT_CACHE_FLAGS_INST_CACHE | |
266 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
267 | .num_cu_shared = 3, |
268 | }, |
269 | { |
270 | /* Scalar L1 Data Cache per SQC */ |
271 | .cache_size = 16, |
272 | .cache_level = 1, |
273 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
274 | CRAT_CACHE_FLAGS_DATA_CACHE | |
275 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
276 | .num_cu_shared = 3, |
277 | }, |
278 | { |
279 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
280 | .cache_size = 2048, |
281 | .cache_level = 2, |
282 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
283 | CRAT_CACHE_FLAGS_DATA_CACHE | |
284 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
285 | .num_cu_shared = 5, |
286 | }, |
287 | }; |
288 | |
289 | static struct kfd_gpu_cache_info vega20_cache_info[] = { |
290 | { |
291 | /* TCP L1 Cache per CU */ |
292 | .cache_size = 16, |
293 | .cache_level = 1, |
294 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
295 | CRAT_CACHE_FLAGS_DATA_CACHE | |
296 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
297 | .num_cu_shared = 1, |
298 | }, |
299 | { |
300 | /* Scalar L1 Instruction Cache per SQC */ |
301 | .cache_size = 32, |
302 | .cache_level = 1, |
303 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
304 | CRAT_CACHE_FLAGS_INST_CACHE | |
305 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
306 | .num_cu_shared = 3, |
307 | }, |
308 | { |
309 | /* Scalar L1 Data Cache per SQC */ |
310 | .cache_size = 16, |
311 | .cache_level = 1, |
312 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
313 | CRAT_CACHE_FLAGS_DATA_CACHE | |
314 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
315 | .num_cu_shared = 3, |
316 | }, |
317 | { |
318 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
319 | .cache_size = 8192, |
320 | .cache_level = 2, |
321 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
322 | CRAT_CACHE_FLAGS_DATA_CACHE | |
323 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
324 | .num_cu_shared = 16, |
325 | }, |
326 | }; |
327 | |
328 | static struct kfd_gpu_cache_info aldebaran_cache_info[] = { |
329 | { |
330 | /* TCP L1 Cache per CU */ |
331 | .cache_size = 16, |
332 | .cache_level = 1, |
333 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
334 | CRAT_CACHE_FLAGS_DATA_CACHE | |
335 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
336 | .num_cu_shared = 1, |
337 | }, |
338 | { |
339 | /* Scalar L1 Instruction Cache per SQC */ |
340 | .cache_size = 32, |
341 | .cache_level = 1, |
342 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
343 | CRAT_CACHE_FLAGS_INST_CACHE | |
344 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
345 | .num_cu_shared = 2, |
346 | }, |
347 | { |
348 | /* Scalar L1 Data Cache per SQC */ |
349 | .cache_size = 16, |
350 | .cache_level = 1, |
351 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
352 | CRAT_CACHE_FLAGS_DATA_CACHE | |
353 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
354 | .num_cu_shared = 2, |
355 | }, |
356 | { |
357 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
358 | .cache_size = 8192, |
359 | .cache_level = 2, |
360 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
361 | CRAT_CACHE_FLAGS_DATA_CACHE | |
362 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
363 | .num_cu_shared = 14, |
364 | }, |
365 | }; |
366 | |
367 | static struct kfd_gpu_cache_info navi10_cache_info[] = { |
368 | { |
369 | /* TCP L1 Cache per CU */ |
370 | .cache_size = 16, |
371 | .cache_level = 1, |
372 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
373 | CRAT_CACHE_FLAGS_DATA_CACHE | |
374 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
375 | .num_cu_shared = 1, |
376 | }, |
377 | { |
378 | /* Scalar L1 Instruction Cache per SQC */ |
379 | .cache_size = 32, |
380 | .cache_level = 1, |
381 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
382 | CRAT_CACHE_FLAGS_INST_CACHE | |
383 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
384 | .num_cu_shared = 2, |
385 | }, |
386 | { |
387 | /* Scalar L1 Data Cache per SQC */ |
388 | .cache_size = 16, |
389 | .cache_level = 1, |
390 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
391 | CRAT_CACHE_FLAGS_DATA_CACHE | |
392 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
393 | .num_cu_shared = 2, |
394 | }, |
395 | { |
396 | /* GL1 Data Cache per SA */ |
397 | .cache_size = 128, |
398 | .cache_level = 1, |
399 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
400 | CRAT_CACHE_FLAGS_DATA_CACHE | |
401 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
402 | .num_cu_shared = 10, |
403 | }, |
404 | { |
405 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
406 | .cache_size = 4096, |
407 | .cache_level = 2, |
408 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
409 | CRAT_CACHE_FLAGS_DATA_CACHE | |
410 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
411 | .num_cu_shared = 10, |
412 | }, |
413 | }; |
414 | |
415 | static struct kfd_gpu_cache_info vangogh_cache_info[] = { |
416 | { |
417 | /* TCP L1 Cache per CU */ |
418 | .cache_size = 16, |
419 | .cache_level = 1, |
420 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
421 | CRAT_CACHE_FLAGS_DATA_CACHE | |
422 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
423 | .num_cu_shared = 1, |
424 | }, |
425 | { |
426 | /* Scalar L1 Instruction Cache per SQC */ |
427 | .cache_size = 32, |
428 | .cache_level = 1, |
429 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
430 | CRAT_CACHE_FLAGS_INST_CACHE | |
431 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
432 | .num_cu_shared = 2, |
433 | }, |
434 | { |
435 | /* Scalar L1 Data Cache per SQC */ |
436 | .cache_size = 16, |
437 | .cache_level = 1, |
438 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
439 | CRAT_CACHE_FLAGS_DATA_CACHE | |
440 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
441 | .num_cu_shared = 2, |
442 | }, |
443 | { |
444 | /* GL1 Data Cache per SA */ |
445 | .cache_size = 128, |
446 | .cache_level = 1, |
447 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
448 | CRAT_CACHE_FLAGS_DATA_CACHE | |
449 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
450 | .num_cu_shared = 8, |
451 | }, |
452 | { |
453 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
454 | .cache_size = 1024, |
455 | .cache_level = 2, |
456 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
457 | CRAT_CACHE_FLAGS_DATA_CACHE | |
458 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
459 | .num_cu_shared = 8, |
460 | }, |
461 | }; |
462 | |
463 | static struct kfd_gpu_cache_info navi14_cache_info[] = { |
464 | { |
465 | /* TCP L1 Cache per CU */ |
466 | .cache_size = 16, |
467 | .cache_level = 1, |
468 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
469 | CRAT_CACHE_FLAGS_DATA_CACHE | |
470 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
471 | .num_cu_shared = 1, |
472 | }, |
473 | { |
474 | /* Scalar L1 Instruction Cache per SQC */ |
475 | .cache_size = 32, |
476 | .cache_level = 1, |
477 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
478 | CRAT_CACHE_FLAGS_INST_CACHE | |
479 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
480 | .num_cu_shared = 2, |
481 | }, |
482 | { |
483 | /* Scalar L1 Data Cache per SQC */ |
484 | .cache_size = 16, |
485 | .cache_level = 1, |
486 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
487 | CRAT_CACHE_FLAGS_DATA_CACHE | |
488 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
489 | .num_cu_shared = 2, |
490 | }, |
491 | { |
492 | /* GL1 Data Cache per SA */ |
493 | .cache_size = 128, |
494 | .cache_level = 1, |
495 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
496 | CRAT_CACHE_FLAGS_DATA_CACHE | |
497 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
498 | .num_cu_shared = 12, |
499 | }, |
500 | { |
501 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
502 | .cache_size = 2048, |
503 | .cache_level = 2, |
504 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
505 | CRAT_CACHE_FLAGS_DATA_CACHE | |
506 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
507 | .num_cu_shared = 12, |
508 | }, |
509 | }; |
510 | |
511 | static struct kfd_gpu_cache_info sienna_cichlid_cache_info[] = { |
512 | { |
513 | /* TCP L1 Cache per CU */ |
514 | .cache_size = 16, |
515 | .cache_level = 1, |
516 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
517 | CRAT_CACHE_FLAGS_DATA_CACHE | |
518 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
519 | .num_cu_shared = 1, |
520 | }, |
521 | { |
522 | /* Scalar L1 Instruction Cache per SQC */ |
523 | .cache_size = 32, |
524 | .cache_level = 1, |
525 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
526 | CRAT_CACHE_FLAGS_INST_CACHE | |
527 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
528 | .num_cu_shared = 2, |
529 | }, |
530 | { |
531 | /* Scalar L1 Data Cache per SQC */ |
532 | .cache_size = 16, |
533 | .cache_level = 1, |
534 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
535 | CRAT_CACHE_FLAGS_DATA_CACHE | |
536 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
537 | .num_cu_shared = 2, |
538 | }, |
539 | { |
540 | /* GL1 Data Cache per SA */ |
541 | .cache_size = 128, |
542 | .cache_level = 1, |
543 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
544 | CRAT_CACHE_FLAGS_DATA_CACHE | |
545 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
546 | .num_cu_shared = 10, |
547 | }, |
548 | { |
549 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
550 | .cache_size = 4096, |
551 | .cache_level = 2, |
552 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
553 | CRAT_CACHE_FLAGS_DATA_CACHE | |
554 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
555 | .num_cu_shared = 10, |
556 | }, |
557 | { |
558 | /* L3 Data Cache per GPU */ |
559 | .cache_size = 128*1024, |
560 | .cache_level = 3, |
561 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
562 | CRAT_CACHE_FLAGS_DATA_CACHE | |
563 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
564 | .num_cu_shared = 10, |
565 | }, |
566 | }; |
567 | |
568 | static struct kfd_gpu_cache_info navy_flounder_cache_info[] = { |
569 | { |
570 | /* TCP L1 Cache per CU */ |
571 | .cache_size = 16, |
572 | .cache_level = 1, |
573 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
574 | CRAT_CACHE_FLAGS_DATA_CACHE | |
575 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
576 | .num_cu_shared = 1, |
577 | }, |
578 | { |
579 | /* Scalar L1 Instruction Cache per SQC */ |
580 | .cache_size = 32, |
581 | .cache_level = 1, |
582 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
583 | CRAT_CACHE_FLAGS_INST_CACHE | |
584 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
585 | .num_cu_shared = 2, |
586 | }, |
587 | { |
588 | /* Scalar L1 Data Cache per SQC */ |
589 | .cache_size = 16, |
590 | .cache_level = 1, |
591 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
592 | CRAT_CACHE_FLAGS_DATA_CACHE | |
593 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
594 | .num_cu_shared = 2, |
595 | }, |
596 | { |
597 | /* GL1 Data Cache per SA */ |
598 | .cache_size = 128, |
599 | .cache_level = 1, |
600 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
601 | CRAT_CACHE_FLAGS_DATA_CACHE | |
602 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
603 | .num_cu_shared = 10, |
604 | }, |
605 | { |
606 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
607 | .cache_size = 3072, |
608 | .cache_level = 2, |
609 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
610 | CRAT_CACHE_FLAGS_DATA_CACHE | |
611 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
612 | .num_cu_shared = 10, |
613 | }, |
614 | { |
615 | /* L3 Data Cache per GPU */ |
616 | .cache_size = 96*1024, |
617 | .cache_level = 3, |
618 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
619 | CRAT_CACHE_FLAGS_DATA_CACHE | |
620 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
621 | .num_cu_shared = 10, |
622 | }, |
623 | }; |
624 | |
625 | static struct kfd_gpu_cache_info dimgrey_cavefish_cache_info[] = { |
626 | { |
627 | /* TCP L1 Cache per CU */ |
628 | .cache_size = 16, |
629 | .cache_level = 1, |
630 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
631 | CRAT_CACHE_FLAGS_DATA_CACHE | |
632 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
633 | .num_cu_shared = 1, |
634 | }, |
635 | { |
636 | /* Scalar L1 Instruction Cache per SQC */ |
637 | .cache_size = 32, |
638 | .cache_level = 1, |
639 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
640 | CRAT_CACHE_FLAGS_INST_CACHE | |
641 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
642 | .num_cu_shared = 2, |
643 | }, |
644 | { |
645 | /* Scalar L1 Data Cache per SQC */ |
646 | .cache_size = 16, |
647 | .cache_level = 1, |
648 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
649 | CRAT_CACHE_FLAGS_DATA_CACHE | |
650 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
651 | .num_cu_shared = 2, |
652 | }, |
653 | { |
654 | /* GL1 Data Cache per SA */ |
655 | .cache_size = 128, |
656 | .cache_level = 1, |
657 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
658 | CRAT_CACHE_FLAGS_DATA_CACHE | |
659 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
660 | .num_cu_shared = 8, |
661 | }, |
662 | { |
663 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
664 | .cache_size = 2048, |
665 | .cache_level = 2, |
666 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
667 | CRAT_CACHE_FLAGS_DATA_CACHE | |
668 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
669 | .num_cu_shared = 8, |
670 | }, |
671 | { |
672 | /* L3 Data Cache per GPU */ |
673 | .cache_size = 32*1024, |
674 | .cache_level = 3, |
675 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
676 | CRAT_CACHE_FLAGS_DATA_CACHE | |
677 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
678 | .num_cu_shared = 8, |
679 | }, |
680 | }; |
681 | |
682 | static struct kfd_gpu_cache_info beige_goby_cache_info[] = { |
683 | { |
684 | /* TCP L1 Cache per CU */ |
685 | .cache_size = 16, |
686 | .cache_level = 1, |
687 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
688 | CRAT_CACHE_FLAGS_DATA_CACHE | |
689 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
690 | .num_cu_shared = 1, |
691 | }, |
692 | { |
693 | /* Scalar L1 Instruction Cache per SQC */ |
694 | .cache_size = 32, |
695 | .cache_level = 1, |
696 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
697 | CRAT_CACHE_FLAGS_INST_CACHE | |
698 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
699 | .num_cu_shared = 2, |
700 | }, |
701 | { |
702 | /* Scalar L1 Data Cache per SQC */ |
703 | .cache_size = 16, |
704 | .cache_level = 1, |
705 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
706 | CRAT_CACHE_FLAGS_DATA_CACHE | |
707 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
708 | .num_cu_shared = 2, |
709 | }, |
710 | { |
711 | /* GL1 Data Cache per SA */ |
712 | .cache_size = 128, |
713 | .cache_level = 1, |
714 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
715 | CRAT_CACHE_FLAGS_DATA_CACHE | |
716 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
717 | .num_cu_shared = 8, |
718 | }, |
719 | { |
720 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
721 | .cache_size = 1024, |
722 | .cache_level = 2, |
723 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
724 | CRAT_CACHE_FLAGS_DATA_CACHE | |
725 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
726 | .num_cu_shared = 8, |
727 | }, |
728 | { |
729 | /* L3 Data Cache per GPU */ |
730 | .cache_size = 16*1024, |
731 | .cache_level = 3, |
732 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
733 | CRAT_CACHE_FLAGS_DATA_CACHE | |
734 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
735 | .num_cu_shared = 8, |
736 | }, |
737 | }; |
738 | |
739 | static struct kfd_gpu_cache_info yellow_carp_cache_info[] = { |
740 | { |
741 | /* TCP L1 Cache per CU */ |
742 | .cache_size = 16, |
743 | .cache_level = 1, |
744 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
745 | CRAT_CACHE_FLAGS_DATA_CACHE | |
746 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
747 | .num_cu_shared = 1, |
748 | }, |
749 | { |
750 | /* Scalar L1 Instruction Cache per SQC */ |
751 | .cache_size = 32, |
752 | .cache_level = 1, |
753 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
754 | CRAT_CACHE_FLAGS_INST_CACHE | |
755 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
756 | .num_cu_shared = 2, |
757 | }, |
758 | { |
759 | /* Scalar L1 Data Cache per SQC */ |
760 | .cache_size = 16, |
761 | .cache_level = 1, |
762 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
763 | CRAT_CACHE_FLAGS_DATA_CACHE | |
764 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
765 | .num_cu_shared = 2, |
766 | }, |
767 | { |
768 | /* GL1 Data Cache per SA */ |
769 | .cache_size = 128, |
770 | .cache_level = 1, |
771 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
772 | CRAT_CACHE_FLAGS_DATA_CACHE | |
773 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
774 | .num_cu_shared = 6, |
775 | }, |
776 | { |
777 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
778 | .cache_size = 2048, |
779 | .cache_level = 2, |
780 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
781 | CRAT_CACHE_FLAGS_DATA_CACHE | |
782 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
783 | .num_cu_shared = 6, |
784 | }, |
785 | }; |
786 | |
787 | static struct kfd_gpu_cache_info gfx1037_cache_info[] = { |
788 | { |
789 | /* TCP L1 Cache per CU */ |
790 | .cache_size = 16, |
791 | .cache_level = 1, |
792 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
793 | CRAT_CACHE_FLAGS_DATA_CACHE | |
794 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
795 | .num_cu_shared = 1, |
796 | }, |
797 | { |
798 | /* Scalar L1 Instruction Cache per SQC */ |
799 | .cache_size = 32, |
800 | .cache_level = 1, |
801 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
802 | CRAT_CACHE_FLAGS_INST_CACHE | |
803 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
804 | .num_cu_shared = 2, |
805 | }, |
806 | { |
807 | /* Scalar L1 Data Cache per SQC */ |
808 | .cache_size = 16, |
809 | .cache_level = 1, |
810 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
811 | CRAT_CACHE_FLAGS_DATA_CACHE | |
812 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
813 | .num_cu_shared = 2, |
814 | }, |
815 | { |
816 | /* GL1 Data Cache per SA */ |
817 | .cache_size = 128, |
818 | .cache_level = 1, |
819 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
820 | CRAT_CACHE_FLAGS_DATA_CACHE | |
821 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
822 | .num_cu_shared = 2, |
823 | }, |
824 | { |
825 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
826 | .cache_size = 256, |
827 | .cache_level = 2, |
828 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
829 | CRAT_CACHE_FLAGS_DATA_CACHE | |
830 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
831 | .num_cu_shared = 2, |
832 | }, |
833 | }; |
834 | |
835 | static struct kfd_gpu_cache_info gc_10_3_6_cache_info[] = { |
836 | { |
837 | /* TCP L1 Cache per CU */ |
838 | .cache_size = 16, |
839 | .cache_level = 1, |
840 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
841 | CRAT_CACHE_FLAGS_DATA_CACHE | |
842 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
843 | .num_cu_shared = 1, |
844 | }, |
845 | { |
846 | /* Scalar L1 Instruction Cache per SQC */ |
847 | .cache_size = 32, |
848 | .cache_level = 1, |
849 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
850 | CRAT_CACHE_FLAGS_INST_CACHE | |
851 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
852 | .num_cu_shared = 2, |
853 | }, |
854 | { |
855 | /* Scalar L1 Data Cache per SQC */ |
856 | .cache_size = 16, |
857 | .cache_level = 1, |
858 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
859 | CRAT_CACHE_FLAGS_DATA_CACHE | |
860 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
861 | .num_cu_shared = 2, |
862 | }, |
863 | { |
864 | /* GL1 Data Cache per SA */ |
865 | .cache_size = 128, |
866 | .cache_level = 1, |
867 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
868 | CRAT_CACHE_FLAGS_DATA_CACHE | |
869 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
870 | .num_cu_shared = 2, |
871 | }, |
872 | { |
873 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
874 | .cache_size = 256, |
875 | .cache_level = 2, |
876 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
877 | CRAT_CACHE_FLAGS_DATA_CACHE | |
878 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
879 | .num_cu_shared = 2, |
880 | }, |
881 | }; |
882 | |
883 | static struct kfd_gpu_cache_info dummy_cache_info[] = { |
884 | { |
885 | /* TCP L1 Cache per CU */ |
886 | .cache_size = 16, |
887 | .cache_level = 1, |
888 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
889 | CRAT_CACHE_FLAGS_DATA_CACHE | |
890 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
891 | .num_cu_shared = 1, |
892 | }, |
893 | { |
894 | /* Scalar L1 Instruction Cache per SQC */ |
895 | .cache_size = 32, |
896 | .cache_level = 1, |
897 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
898 | CRAT_CACHE_FLAGS_INST_CACHE | |
899 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
900 | .num_cu_shared = 2, |
901 | }, |
902 | { |
903 | /* Scalar L1 Data Cache per SQC */ |
904 | .cache_size = 16, |
905 | .cache_level = 1, |
906 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
907 | CRAT_CACHE_FLAGS_DATA_CACHE | |
908 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
909 | .num_cu_shared = 2, |
910 | }, |
911 | { |
912 | /* GL1 Data Cache per SA */ |
913 | .cache_size = 128, |
914 | .cache_level = 1, |
915 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
916 | CRAT_CACHE_FLAGS_DATA_CACHE | |
917 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
918 | .num_cu_shared = 6, |
919 | }, |
920 | { |
921 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
922 | .cache_size = 2048, |
923 | .cache_level = 2, |
924 | .flags = (CRAT_CACHE_FLAGS_ENABLED | |
925 | CRAT_CACHE_FLAGS_DATA_CACHE | |
926 | CRAT_CACHE_FLAGS_SIMD_CACHE), |
927 | .num_cu_shared = 6, |
928 | }, |
929 | }; |
930 | |
931 | static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, |
932 | struct crat_subtype_computeunit *cu) |
933 | { |
934 | dev->node_props.cpu_cores_count = cu->num_cpu_cores; |
935 | dev->node_props.cpu_core_id_base = cu->processor_id_low; |
936 | if (cu->hsa_capability & CRAT_CU_FLAGS_IOMMU_PRESENT) |
937 | dev->node_props.capability |= HSA_CAP_ATS_PRESENT; |
938 | |
	pr_debug("CU CPU: cores=%d id_base=%d\n", cu->num_cpu_cores,
940 | cu->processor_id_low); |
941 | } |
942 | |
943 | static void kfd_populated_cu_info_gpu(struct kfd_topology_device *dev, |
944 | struct crat_subtype_computeunit *cu) |
945 | { |
946 | dev->node_props.simd_id_base = cu->processor_id_low; |
947 | dev->node_props.simd_count = cu->num_simd_cores; |
948 | dev->node_props.lds_size_in_kb = cu->lds_size_in_kb; |
949 | dev->node_props.max_waves_per_simd = cu->max_waves_simd; |
950 | dev->node_props.wave_front_size = cu->wave_front_size; |
951 | dev->node_props.array_count = cu->array_count; |
952 | dev->node_props.cu_per_simd_array = cu->num_cu_per_array; |
953 | dev->node_props.simd_per_cu = cu->num_simd_per_cu; |
954 | dev->node_props.max_slots_scratch_cu = cu->max_slots_scatch_cu; |
955 | if (cu->hsa_capability & CRAT_CU_FLAGS_HOT_PLUGGABLE) |
956 | dev->node_props.capability |= HSA_CAP_HOT_PLUGGABLE; |
	pr_debug("CU GPU: id_base=%d\n", cu->processor_id_low);
958 | } |
959 | |
/* kfd_parse_subtype_cu - parse compute unit subtypes and attach them to the
 * correct topology device present in the device_list
962 | */ |
963 | static int kfd_parse_subtype_cu(struct crat_subtype_computeunit *cu, |
964 | struct list_head *device_list) |
965 | { |
966 | struct kfd_topology_device *dev; |
967 | |
	pr_debug("Found CU entry in CRAT table with proximity_domain=%d caps=%x\n",
969 | cu->proximity_domain, cu->hsa_capability); |
970 | list_for_each_entry(dev, device_list, list) { |
971 | if (cu->proximity_domain == dev->proximity_domain) { |
972 | if (cu->flags & CRAT_CU_FLAGS_CPU_PRESENT) |
973 | kfd_populated_cu_info_cpu(dev, cu); |
974 | |
975 | if (cu->flags & CRAT_CU_FLAGS_GPU_PRESENT) |
976 | kfd_populated_cu_info_gpu(dev, cu); |
977 | break; |
978 | } |
979 | } |
980 | |
981 | return 0; |
982 | } |
983 | |
984 | static struct kfd_mem_properties * |
985 | find_subtype_mem(uint32_t heap_type, uint32_t flags, uint32_t width, |
986 | struct kfd_topology_device *dev) |
987 | { |
988 | struct kfd_mem_properties *props; |
989 | |
990 | list_for_each_entry(props, &dev->mem_props, list) { |
991 | if (props->heap_type == heap_type |
992 | && props->flags == flags |
993 | && props->width == width) |
994 | return props; |
995 | } |
996 | |
997 | return NULL; |
998 | } |

/* kfd_parse_subtype_mem - parse memory subtypes and attach them to the
 * correct topology device present in the device_list
 */
1002 | static int kfd_parse_subtype_mem(struct crat_subtype_memory *mem, |
1003 | struct list_head *device_list) |
1004 | { |
1005 | struct kfd_mem_properties *props; |
1006 | struct kfd_topology_device *dev; |
1007 | uint32_t heap_type; |
1008 | uint64_t size_in_bytes; |
1009 | uint32_t flags = 0; |
1010 | uint32_t width; |
1011 | |
	pr_debug("Found memory entry in CRAT table with proximity_domain=%d\n",
1013 | mem->proximity_domain); |
1014 | list_for_each_entry(dev, device_list, list) { |
1015 | if (mem->proximity_domain == dev->proximity_domain) { |
1016 | /* We're on GPU node */ |
1017 | if (dev->node_props.cpu_cores_count == 0) { |
1018 | /* APU */ |
1019 | if (mem->visibility_type == 0) |
1020 | heap_type = |
1021 | HSA_MEM_HEAP_TYPE_FB_PRIVATE; |
1022 | /* dGPU */ |
1023 | else |
1024 | heap_type = mem->visibility_type; |
1025 | } else |
1026 | heap_type = HSA_MEM_HEAP_TYPE_SYSTEM; |
1027 | |
1028 | if (mem->flags & CRAT_MEM_FLAGS_HOT_PLUGGABLE) |
1029 | flags |= HSA_MEM_FLAGS_HOT_PLUGGABLE; |
1030 | if (mem->flags & CRAT_MEM_FLAGS_NON_VOLATILE) |
1031 | flags |= HSA_MEM_FLAGS_NON_VOLATILE; |
1032 | |
1033 | size_in_bytes = |
1034 | ((uint64_t)mem->length_high << 32) + |
1035 | mem->length_low; |
1036 | width = mem->width; |
1037 | |
1038 | /* Multiple banks of the same type are aggregated into |
1039 | * one. User mode doesn't care about multiple physical |
1040 | * memory segments. It's managed as a single virtual |
1041 | * heap for user mode. |
1042 | */ |
1043 | props = find_subtype_mem(heap_type, flags, width, dev); |
1044 | if (props) { |
1045 | props->size_in_bytes += size_in_bytes; |
1046 | break; |
1047 | } |
1048 | |
1049 | props = kfd_alloc_struct(props); |
1050 | if (!props) |
1051 | return -ENOMEM; |
1052 | |
1053 | props->heap_type = heap_type; |
1054 | props->flags = flags; |
1055 | props->size_in_bytes = size_in_bytes; |
1056 | props->width = width; |
1057 | |
1058 | dev->node_props.mem_banks_count++; |
			list_add_tail(&props->list, &dev->mem_props);
1060 | |
1061 | break; |
1062 | } |
1063 | } |
1064 | |
1065 | return 0; |
1066 | } |
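
/*
 * Worked example for the size computation above (hypothetical values):
 * a memory subtype with length_high = 0x1 and length_low = 0x80000000
 * yields
 *
 *	size_in_bytes = ((uint64_t)0x1 << 32) + 0x80000000 = 0x180000000
 *
 * i.e. 6 GiB. A later bank with the same heap_type/flags/width is folded
 * into the existing kfd_mem_properties by adding its size.
 */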
1067 | |
/* kfd_parse_subtype_cache - parse cache subtypes and attach them to the
 * correct topology device present in the device_list
1070 | */ |
1071 | static int kfd_parse_subtype_cache(struct crat_subtype_cache *cache, |
1072 | struct list_head *device_list) |
1073 | { |
1074 | struct kfd_cache_properties *props; |
1075 | struct kfd_topology_device *dev; |
1076 | uint32_t id; |
1077 | uint32_t total_num_of_cu; |
1078 | |
1079 | id = cache->processor_id_low; |
1080 | |
	pr_debug("Found cache entry in CRAT table with processor_id=%d\n", id);
1082 | list_for_each_entry(dev, device_list, list) { |
1083 | total_num_of_cu = (dev->node_props.array_count * |
1084 | dev->node_props.cu_per_simd_array); |
1085 | |
		/* Cache information in CRAT doesn't have proximity_domain
		 * information as it is associated with a CPU core or GPU
		 * Compute Unit. So map the cache using the CPU core ID or
		 * SIMD (GPU) ID.
		 * TODO: This works because currently we can safely assume that
		 * Compute Units are parsed before caches are parsed. In the
		 * future, remove this dependency.
1093 | */ |
1094 | if ((id >= dev->node_props.cpu_core_id_base && |
1095 | id <= dev->node_props.cpu_core_id_base + |
1096 | dev->node_props.cpu_cores_count) || |
1097 | (id >= dev->node_props.simd_id_base && |
1098 | id < dev->node_props.simd_id_base + |
1099 | total_num_of_cu)) { |
1100 | props = kfd_alloc_struct(props); |
1101 | if (!props) |
1102 | return -ENOMEM; |
1103 | |
1104 | props->processor_id_low = id; |
1105 | props->cache_level = cache->cache_level; |
1106 | props->cache_size = cache->cache_size; |
1107 | props->cacheline_size = cache->cache_line_size; |
1108 | props->cachelines_per_tag = cache->lines_per_tag; |
1109 | props->cache_assoc = cache->associativity; |
1110 | props->cache_latency = cache->cache_latency; |
1111 | |
1112 | memcpy(props->sibling_map, cache->sibling_map, |
1113 | CRAT_SIBLINGMAP_SIZE); |
1114 | |
1115 | /* set the sibling_map_size as 32 for CRAT from ACPI */ |
1116 | props->sibling_map_size = CRAT_SIBLINGMAP_SIZE; |
1117 | |
1118 | if (cache->flags & CRAT_CACHE_FLAGS_DATA_CACHE) |
1119 | props->cache_type |= HSA_CACHE_TYPE_DATA; |
1120 | if (cache->flags & CRAT_CACHE_FLAGS_INST_CACHE) |
1121 | props->cache_type |= HSA_CACHE_TYPE_INSTRUCTION; |
1122 | if (cache->flags & CRAT_CACHE_FLAGS_CPU_CACHE) |
1123 | props->cache_type |= HSA_CACHE_TYPE_CPU; |
1124 | if (cache->flags & CRAT_CACHE_FLAGS_SIMD_CACHE) |
1125 | props->cache_type |= HSA_CACHE_TYPE_HSACU; |
1126 | |
1127 | dev->node_props.caches_count++; |
			list_add_tail(&props->list, &dev->cache_props);
1129 | |
1130 | break; |
1131 | } |
1132 | } |
1133 | |
1134 | return 0; |
1135 | } |
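
/*
 * Illustrative match (hypothetical values): for a device with
 * simd_id_base = 0x80001000, array_count = 4 and cu_per_simd_array = 16,
 * total_num_of_cu = 64, so a cache subtype whose processor_id_low falls
 * in [0x80001000, 0x80001040) is attached to that device.
 */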
1136 | |
/* kfd_parse_subtype_iolink - parse iolink subtypes and attach them to the
 * correct topology device present in the device_list
1139 | */ |
1140 | static int kfd_parse_subtype_iolink(struct crat_subtype_iolink *iolink, |
1141 | struct list_head *device_list) |
1142 | { |
1143 | struct kfd_iolink_properties *props = NULL, *props2; |
1144 | struct kfd_topology_device *dev, *to_dev; |
1145 | uint32_t id_from; |
1146 | uint32_t id_to; |
1147 | |
1148 | id_from = iolink->proximity_domain_from; |
1149 | id_to = iolink->proximity_domain_to; |
1150 | |
	pr_debug("Found IO link entry in CRAT table with id_from=%d, id_to %d\n",
1152 | id_from, id_to); |
1153 | list_for_each_entry(dev, device_list, list) { |
1154 | if (id_from == dev->proximity_domain) { |
1155 | props = kfd_alloc_struct(props); |
1156 | if (!props) |
1157 | return -ENOMEM; |
1158 | |
1159 | props->node_from = id_from; |
1160 | props->node_to = id_to; |
1161 | props->ver_maj = iolink->version_major; |
1162 | props->ver_min = iolink->version_minor; |
1163 | props->iolink_type = iolink->io_interface_type; |
1164 | |
1165 | if (props->iolink_type == CRAT_IOLINK_TYPE_PCIEXPRESS) |
1166 | props->weight = 20; |
1167 | else if (props->iolink_type == CRAT_IOLINK_TYPE_XGMI) |
1168 | props->weight = iolink->weight_xgmi; |
1169 | else |
1170 | props->weight = node_distance(id_from, id_to); |
1171 | |
1172 | props->min_latency = iolink->minimum_latency; |
1173 | props->max_latency = iolink->maximum_latency; |
1174 | props->min_bandwidth = iolink->minimum_bandwidth_mbs; |
1175 | props->max_bandwidth = iolink->maximum_bandwidth_mbs; |
1176 | props->rec_transfer_size = |
1177 | iolink->recommended_transfer_size; |
1178 | |
1179 | dev->node_props.io_links_count++; |
			list_add_tail(&props->list, &dev->io_link_props);
1181 | break; |
1182 | } |
1183 | } |
1184 | |
1185 | /* CPU topology is created before GPUs are detected, so CPU->GPU |
1186 | * links are not built at that time. If a PCIe type is discovered, it |
1187 | * means a GPU is detected and we are adding GPU->CPU to the topology. |
	 * At this time, also add the corresponding CPU->GPU link if the GPU
	 * has a large BAR.
	 * For xGMI, only one direction of the link is present in the CRAT
	 * table; add the corresponding reverse-direction link now.
1192 | */ |
1193 | if (props && (iolink->flags & CRAT_IOLINK_FLAGS_BI_DIRECTIONAL)) { |
		to_dev = kfd_topology_device_by_proximity_domain_no_lock(id_to);
1195 | if (!to_dev) |
1196 | return -ENODEV; |
1197 | /* same everything but the other direction */ |
		props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
1199 | if (!props2) |
1200 | return -ENOMEM; |
1201 | |
1202 | props2->node_from = id_to; |
1203 | props2->node_to = id_from; |
1204 | props2->kobj = NULL; |
1205 | to_dev->node_props.io_links_count++; |
		list_add_tail(&props2->list, &to_dev->io_link_props);
1207 | } |
1208 | |
1209 | return 0; |
1210 | } |
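
/*
 * Sketch of the reverse-link handling above (hypothetical domains):
 * given a bi-directional CRAT entry from domain 1 (GPU) to domain 0
 * (CPU), the forward 1->0 link is added to device 1, then kmemdup()
 * clones it and the endpoints are swapped (node_from = 0, node_to = 1)
 * so device 0 exposes the return link as well.
 */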
1211 | |
/* kfd_parse_subtype - parse a subtype and attach it to the correct topology
 * device present in the device_list
1214 | * @sub_type_hdr - subtype section of crat_image |
1215 | * @device_list - list of topology devices present in this crat_image |
1216 | */ |
1217 | static int kfd_parse_subtype(struct crat_subtype_generic *sub_type_hdr, |
1218 | struct list_head *device_list) |
1219 | { |
1220 | struct crat_subtype_computeunit *cu; |
1221 | struct crat_subtype_memory *mem; |
1222 | struct crat_subtype_cache *cache; |
1223 | struct crat_subtype_iolink *iolink; |
1224 | int ret = 0; |
1225 | |
1226 | switch (sub_type_hdr->type) { |
1227 | case CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY: |
1228 | cu = (struct crat_subtype_computeunit *)sub_type_hdr; |
1229 | ret = kfd_parse_subtype_cu(cu, device_list); |
1230 | break; |
1231 | case CRAT_SUBTYPE_MEMORY_AFFINITY: |
1232 | mem = (struct crat_subtype_memory *)sub_type_hdr; |
1233 | ret = kfd_parse_subtype_mem(mem, device_list); |
1234 | break; |
1235 | case CRAT_SUBTYPE_CACHE_AFFINITY: |
1236 | cache = (struct crat_subtype_cache *)sub_type_hdr; |
1237 | ret = kfd_parse_subtype_cache(cache, device_list); |
1238 | break; |
1239 | case CRAT_SUBTYPE_TLB_AFFINITY: |
1240 | /* |
1241 | * For now, nothing to do here |
1242 | */ |
		pr_debug("Found TLB entry in CRAT table (not processing)\n");
1244 | break; |
1245 | case CRAT_SUBTYPE_CCOMPUTE_AFFINITY: |
1246 | /* |
1247 | * For now, nothing to do here |
1248 | */ |
		pr_debug("Found CCOMPUTE entry in CRAT table (not processing)\n");
1250 | break; |
1251 | case CRAT_SUBTYPE_IOLINK_AFFINITY: |
1252 | iolink = (struct crat_subtype_iolink *)sub_type_hdr; |
1253 | ret = kfd_parse_subtype_iolink(iolink, device_list); |
1254 | break; |
1255 | default: |
		pr_warn("Unknown subtype %d in CRAT\n",
			sub_type_hdr->type);
1258 | } |
1259 | |
1260 | return ret; |
1261 | } |
1262 | |
/* kfd_parse_crat_table - parse CRAT table. For each node present in CRAT,
 * create a kfd_topology_device and add it to device_list. Also parse
 * CRAT subtypes and attach them to the appropriate kfd_topology_device.
1266 | * @crat_image - input image containing CRAT |
1267 | * @device_list - [OUT] list of kfd_topology_device generated after |
1268 | * parsing crat_image |
1269 | * @proximity_domain - Proximity domain of the first device in the table |
1270 | * |
1271 | * Return - 0 if successful else -ve value |
1272 | */ |
1273 | int kfd_parse_crat_table(void *crat_image, struct list_head *device_list, |
1274 | uint32_t proximity_domain) |
1275 | { |
1276 | struct kfd_topology_device *top_dev = NULL; |
1277 | struct crat_subtype_generic *sub_type_hdr; |
1278 | uint16_t node_id; |
1279 | int ret = 0; |
1280 | struct crat_header *crat_table = (struct crat_header *)crat_image; |
1281 | uint16_t num_nodes; |
1282 | uint32_t image_len; |
1283 | |
1284 | if (!crat_image) |
1285 | return -EINVAL; |
1286 | |
	if (!list_empty(device_list)) {
		pr_warn("Error device list should be empty\n");
1289 | return -EINVAL; |
1290 | } |
1291 | |
1292 | num_nodes = crat_table->num_domains; |
1293 | image_len = crat_table->length; |
1294 | |
	pr_debug("Parsing CRAT table with %d nodes\n", num_nodes);
1296 | |
1297 | for (node_id = 0; node_id < num_nodes; node_id++) { |
1298 | top_dev = kfd_create_topology_device(device_list); |
1299 | if (!top_dev) |
1300 | break; |
1301 | top_dev->proximity_domain = proximity_domain++; |
1302 | } |
1303 | |
1304 | if (!top_dev) { |
1305 | ret = -ENOMEM; |
1306 | goto err; |
1307 | } |
1308 | |
1309 | memcpy(top_dev->oem_id, crat_table->oem_id, CRAT_OEMID_LENGTH); |
1310 | memcpy(top_dev->oem_table_id, crat_table->oem_table_id, |
1311 | CRAT_OEMTABLEID_LENGTH); |
1312 | top_dev->oem_revision = crat_table->oem_revision; |
1313 | |
1314 | sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1); |
1315 | while ((char *)sub_type_hdr + sizeof(struct crat_subtype_generic) < |
1316 | ((char *)crat_image) + image_len) { |
1317 | if (sub_type_hdr->flags & CRAT_SUBTYPE_FLAGS_ENABLED) { |
1318 | ret = kfd_parse_subtype(sub_type_hdr, device_list); |
1319 | if (ret) |
1320 | break; |
1321 | } |
1322 | |
1323 | sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + |
1324 | sub_type_hdr->length); |
1325 | } |
1326 | |
1327 | err: |
1328 | if (ret) |
1329 | kfd_release_topology_device_list(device_list); |
1330 | |
1331 | return ret; |
1332 | } |
1333 | |
1334 | |
1335 | static int kfd_fill_gpu_cache_info_from_gfx_config(struct kfd_dev *kdev, |
1336 | struct kfd_gpu_cache_info *pcache_info) |
1337 | { |
1338 | struct amdgpu_device *adev = kdev->adev; |
1339 | int i = 0; |
1340 | |
1341 | /* TCP L1 Cache per CU */ |
1342 | if (adev->gfx.config.gc_tcp_l1_size) { |
1343 | pcache_info[i].cache_size = adev->gfx.config.gc_tcp_l1_size; |
1344 | pcache_info[i].cache_level = 1; |
1345 | pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | |
1346 | CRAT_CACHE_FLAGS_DATA_CACHE | |
1347 | CRAT_CACHE_FLAGS_SIMD_CACHE); |
		pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_tcp_per_wpg / 2;
1349 | i++; |
1350 | } |
1351 | /* Scalar L1 Instruction Cache per SQC */ |
1352 | if (adev->gfx.config.gc_l1_instruction_cache_size_per_sqc) { |
1353 | pcache_info[i].cache_size = |
1354 | adev->gfx.config.gc_l1_instruction_cache_size_per_sqc; |
1355 | pcache_info[i].cache_level = 1; |
1356 | pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | |
1357 | CRAT_CACHE_FLAGS_INST_CACHE | |
1358 | CRAT_CACHE_FLAGS_SIMD_CACHE); |
1359 | pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2; |
1360 | i++; |
1361 | } |
1362 | /* Scalar L1 Data Cache per SQC */ |
1363 | if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) { |
1364 | pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc; |
1365 | pcache_info[i].cache_level = 1; |
1366 | pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | |
1367 | CRAT_CACHE_FLAGS_DATA_CACHE | |
1368 | CRAT_CACHE_FLAGS_SIMD_CACHE); |
1369 | pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_sqc_per_wgp * 2; |
1370 | i++; |
1371 | } |
1372 | /* GL1 Data Cache per SA */ |
1373 | if (adev->gfx.config.gc_gl1c_per_sa && |
1374 | adev->gfx.config.gc_gl1c_size_per_instance) { |
1375 | pcache_info[i].cache_size = adev->gfx.config.gc_gl1c_per_sa * |
1376 | adev->gfx.config.gc_gl1c_size_per_instance; |
1377 | pcache_info[i].cache_level = 1; |
1378 | pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | |
1379 | CRAT_CACHE_FLAGS_DATA_CACHE | |
1380 | CRAT_CACHE_FLAGS_SIMD_CACHE); |
1381 | pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; |
1382 | i++; |
1383 | } |
1384 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
1385 | if (adev->gfx.config.gc_gl2c_per_gpu) { |
1386 | pcache_info[i].cache_size = adev->gfx.config.gc_gl2c_per_gpu; |
1387 | pcache_info[i].cache_level = 2; |
1388 | pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | |
1389 | CRAT_CACHE_FLAGS_DATA_CACHE | |
1390 | CRAT_CACHE_FLAGS_SIMD_CACHE); |
1391 | pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; |
1392 | i++; |
1393 | } |
1394 | /* L3 Data Cache per GPU */ |
1395 | if (adev->gmc.mall_size) { |
1396 | pcache_info[i].cache_size = adev->gmc.mall_size / 1024; |
1397 | pcache_info[i].cache_level = 3; |
1398 | pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | |
1399 | CRAT_CACHE_FLAGS_DATA_CACHE | |
1400 | CRAT_CACHE_FLAGS_SIMD_CACHE); |
1401 | pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; |
1402 | i++; |
1403 | } |
1404 | return i; |
1405 | } |
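
/*
 * Worked example for the derivation above (hypothetical gfx config
 * values): with gc_tcp_l1_size = 16, gc_num_tcp_per_wpg = 4 and
 * gc_num_sqc_per_wgp = 1, the first two entries become a 16 KiB L1
 * data cache shared by 4 / 2 = 2 CUs, and an instruction cache shared
 * by 1 * 2 = 2 CUs per SQC.
 */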
1406 | |
1407 | static int kfd_fill_gpu_cache_info_from_gfx_config_v2(struct kfd_dev *kdev, |
1408 | struct kfd_gpu_cache_info *pcache_info) |
1409 | { |
1410 | struct amdgpu_device *adev = kdev->adev; |
1411 | int i = 0; |
1412 | |
1413 | /* TCP L1 Cache per CU */ |
1414 | if (adev->gfx.config.gc_tcp_size_per_cu) { |
1415 | pcache_info[i].cache_size = adev->gfx.config.gc_tcp_size_per_cu; |
1416 | pcache_info[i].cache_level = 1; |
1417 | pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | |
1418 | CRAT_CACHE_FLAGS_DATA_CACHE | |
1419 | CRAT_CACHE_FLAGS_SIMD_CACHE); |
1420 | pcache_info[i].num_cu_shared = 1; |
1421 | i++; |
1422 | } |
1423 | /* Scalar L1 Instruction Cache per SQC */ |
1424 | if (adev->gfx.config.gc_l1_instruction_cache_size_per_sqc) { |
1425 | pcache_info[i].cache_size = |
1426 | adev->gfx.config.gc_l1_instruction_cache_size_per_sqc; |
1427 | pcache_info[i].cache_level = 1; |
1428 | pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | |
1429 | CRAT_CACHE_FLAGS_INST_CACHE | |
1430 | CRAT_CACHE_FLAGS_SIMD_CACHE); |
1431 | pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_cu_per_sqc; |
1432 | i++; |
1433 | } |
1434 | /* Scalar L1 Data Cache per SQC */ |
1435 | if (adev->gfx.config.gc_l1_data_cache_size_per_sqc) { |
1436 | pcache_info[i].cache_size = adev->gfx.config.gc_l1_data_cache_size_per_sqc; |
1437 | pcache_info[i].cache_level = 1; |
1438 | pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | |
1439 | CRAT_CACHE_FLAGS_DATA_CACHE | |
1440 | CRAT_CACHE_FLAGS_SIMD_CACHE); |
1441 | pcache_info[i].num_cu_shared = adev->gfx.config.gc_num_cu_per_sqc; |
1442 | i++; |
1443 | } |
1444 | /* L2 Data Cache per GPU (Total Tex Cache) */ |
1445 | if (adev->gfx.config.gc_tcc_size) { |
1446 | pcache_info[i].cache_size = adev->gfx.config.gc_tcc_size; |
1447 | pcache_info[i].cache_level = 2; |
1448 | pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | |
1449 | CRAT_CACHE_FLAGS_DATA_CACHE | |
1450 | CRAT_CACHE_FLAGS_SIMD_CACHE); |
1451 | pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; |
1452 | i++; |
1453 | } |
1454 | /* L3 Data Cache per GPU */ |
1455 | if (adev->gmc.mall_size) { |
1456 | pcache_info[i].cache_size = adev->gmc.mall_size / 1024; |
1457 | pcache_info[i].cache_level = 3; |
1458 | pcache_info[i].flags = (CRAT_CACHE_FLAGS_ENABLED | |
1459 | CRAT_CACHE_FLAGS_DATA_CACHE | |
1460 | CRAT_CACHE_FLAGS_SIMD_CACHE); |
1461 | pcache_info[i].num_cu_shared = adev->gfx.config.max_cu_per_sh; |
1462 | i++; |
1463 | } |
1464 | return i; |
1465 | } |
1466 | |
1467 | int kfd_get_gpu_cache_info(struct kfd_node *kdev, struct kfd_gpu_cache_info **pcache_info) |
1468 | { |
1469 | int num_of_cache_types = 0; |
1470 | |
1471 | switch (kdev->adev->asic_type) { |
1472 | case CHIP_KAVERI: |
1473 | *pcache_info = kaveri_cache_info; |
1474 | num_of_cache_types = ARRAY_SIZE(kaveri_cache_info); |
1475 | break; |
1476 | case CHIP_HAWAII: |
1477 | *pcache_info = hawaii_cache_info; |
1478 | num_of_cache_types = ARRAY_SIZE(hawaii_cache_info); |
1479 | break; |
1480 | case CHIP_CARRIZO: |
1481 | *pcache_info = carrizo_cache_info; |
1482 | num_of_cache_types = ARRAY_SIZE(carrizo_cache_info); |
1483 | break; |
1484 | case CHIP_TONGA: |
1485 | *pcache_info = tonga_cache_info; |
1486 | num_of_cache_types = ARRAY_SIZE(tonga_cache_info); |
1487 | break; |
1488 | case CHIP_FIJI: |
1489 | *pcache_info = fiji_cache_info; |
1490 | num_of_cache_types = ARRAY_SIZE(fiji_cache_info); |
1491 | break; |
1492 | case CHIP_POLARIS10: |
1493 | *pcache_info = polaris10_cache_info; |
1494 | num_of_cache_types = ARRAY_SIZE(polaris10_cache_info); |
1495 | break; |
1496 | case CHIP_POLARIS11: |
1497 | *pcache_info = polaris11_cache_info; |
1498 | num_of_cache_types = ARRAY_SIZE(polaris11_cache_info); |
1499 | break; |
1500 | case CHIP_POLARIS12: |
1501 | *pcache_info = polaris12_cache_info; |
1502 | num_of_cache_types = ARRAY_SIZE(polaris12_cache_info); |
1503 | break; |
1504 | case CHIP_VEGAM: |
1505 | *pcache_info = vegam_cache_info; |
1506 | num_of_cache_types = ARRAY_SIZE(vegam_cache_info); |
1507 | break; |
1508 | default: |
1509 | switch (KFD_GC_VERSION(kdev)) { |
1510 | case IP_VERSION(9, 0, 1): |
1511 | *pcache_info = vega10_cache_info; |
1512 | num_of_cache_types = ARRAY_SIZE(vega10_cache_info); |
1513 | break; |
1514 | case IP_VERSION(9, 2, 1): |
1515 | *pcache_info = vega12_cache_info; |
1516 | num_of_cache_types = ARRAY_SIZE(vega12_cache_info); |
1517 | break; |
1518 | case IP_VERSION(9, 4, 0): |
1519 | case IP_VERSION(9, 4, 1): |
1520 | *pcache_info = vega20_cache_info; |
1521 | num_of_cache_types = ARRAY_SIZE(vega20_cache_info); |
1522 | break; |
1523 | case IP_VERSION(9, 4, 2): |
1524 | *pcache_info = aldebaran_cache_info; |
1525 | num_of_cache_types = ARRAY_SIZE(aldebaran_cache_info); |
1526 | break; |
1527 | case IP_VERSION(9, 4, 3): |
1528 | num_of_cache_types = |
				kfd_fill_gpu_cache_info_from_gfx_config_v2(kdev->kfd,
									    *pcache_info);
1531 | break; |
1532 | case IP_VERSION(9, 1, 0): |
1533 | case IP_VERSION(9, 2, 2): |
1534 | *pcache_info = raven_cache_info; |
1535 | num_of_cache_types = ARRAY_SIZE(raven_cache_info); |
1536 | break; |
1537 | case IP_VERSION(9, 3, 0): |
1538 | *pcache_info = renoir_cache_info; |
1539 | num_of_cache_types = ARRAY_SIZE(renoir_cache_info); |
1540 | break; |
1541 | case IP_VERSION(10, 1, 10): |
1542 | case IP_VERSION(10, 1, 2): |
1543 | case IP_VERSION(10, 1, 3): |
1544 | case IP_VERSION(10, 1, 4): |
1545 | *pcache_info = navi10_cache_info; |
1546 | num_of_cache_types = ARRAY_SIZE(navi10_cache_info); |
1547 | break; |
1548 | case IP_VERSION(10, 1, 1): |
1549 | *pcache_info = navi14_cache_info; |
1550 | num_of_cache_types = ARRAY_SIZE(navi14_cache_info); |
1551 | break; |
1552 | case IP_VERSION(10, 3, 0): |
1553 | *pcache_info = sienna_cichlid_cache_info; |
1554 | num_of_cache_types = ARRAY_SIZE(sienna_cichlid_cache_info); |
1555 | break; |
1556 | case IP_VERSION(10, 3, 2): |
1557 | *pcache_info = navy_flounder_cache_info; |
1558 | num_of_cache_types = ARRAY_SIZE(navy_flounder_cache_info); |
1559 | break; |
1560 | case IP_VERSION(10, 3, 4): |
1561 | *pcache_info = dimgrey_cavefish_cache_info; |
1562 | num_of_cache_types = ARRAY_SIZE(dimgrey_cavefish_cache_info); |
1563 | break; |
1564 | case IP_VERSION(10, 3, 1): |
1565 | *pcache_info = vangogh_cache_info; |
1566 | num_of_cache_types = ARRAY_SIZE(vangogh_cache_info); |
1567 | break; |
1568 | case IP_VERSION(10, 3, 5): |
1569 | *pcache_info = beige_goby_cache_info; |
1570 | num_of_cache_types = ARRAY_SIZE(beige_goby_cache_info); |
1571 | break; |
1572 | case IP_VERSION(10, 3, 3): |
1573 | *pcache_info = yellow_carp_cache_info; |
1574 | num_of_cache_types = ARRAY_SIZE(yellow_carp_cache_info); |
1575 | break; |
1576 | case IP_VERSION(10, 3, 6): |
1577 | *pcache_info = gc_10_3_6_cache_info; |
1578 | num_of_cache_types = ARRAY_SIZE(gc_10_3_6_cache_info); |
1579 | break; |
1580 | case IP_VERSION(10, 3, 7): |
1581 | *pcache_info = gfx1037_cache_info; |
1582 | num_of_cache_types = ARRAY_SIZE(gfx1037_cache_info); |
1583 | break; |
1584 | case IP_VERSION(11, 0, 0): |
1585 | case IP_VERSION(11, 0, 1): |
1586 | case IP_VERSION(11, 0, 2): |
1587 | case IP_VERSION(11, 0, 3): |
1588 | case IP_VERSION(11, 0, 4): |
1589 | case IP_VERSION(11, 5, 0): |
1590 | num_of_cache_types = |
				kfd_fill_gpu_cache_info_from_gfx_config(kdev->kfd, *pcache_info);
1592 | break; |
1593 | default: |
1594 | *pcache_info = dummy_cache_info; |
1595 | num_of_cache_types = ARRAY_SIZE(dummy_cache_info); |
			pr_warn("dummy cache info is used temporarily; real cache info needs to be updated later.\n");
1597 | break; |
1598 | } |
1599 | } |
1600 | return num_of_cache_types; |
1601 | } |
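
/*
 * Usage sketch (illustrative; the caller shown is hypothetical):
 *
 *	struct kfd_gpu_cache_info local_info[16];
 *	struct kfd_gpu_cache_info *pcache_info = local_info;
 *	int num_types = kfd_get_gpu_cache_info(kdev, &pcache_info);
 *
 * For ASICs with static tables above, *pcache_info is repointed at the
 * per-ASIC array; for the gfx-config paths, the helpers fill the buffer
 * that *pcache_info already points to. Either way, num_types entries are
 * valid afterwards.
 */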
1602 | |
1603 | /* Memory required to create Virtual CRAT. |
1604 | * Since there is no easy way to predict the amount of memory required, the |
1605 | * following amount is allocated for GPU Virtual CRAT. This is |
 * expected to cover all known conditions. But to be safe, an additional
 * check is put in the code to ensure we don't overrun the buffer.
1608 | */ |
1609 | #define VCRAT_SIZE_FOR_GPU (4 * PAGE_SIZE) |
1610 | |
1611 | /* kfd_fill_cu_for_cpu - Fill in Compute info for the given CPU NUMA node |
1612 | * |
 * @numa_node_id: CPU NUMA node id
 * @avail_size: Available size remaining in the crat_image
 * @proximity_domain: Proximity domain to assign to the CPU node
 * @sub_type_hdr: Memory into which compute info will be filled in
1616 | * |
1617 | * Return 0 if successful else return -ve value |
1618 | */ |
1619 | static int kfd_fill_cu_for_cpu(int numa_node_id, int *avail_size, |
1620 | int proximity_domain, |
1621 | struct crat_subtype_computeunit *sub_type_hdr) |
1622 | { |
1623 | const struct cpumask *cpumask; |
1624 | |
1625 | *avail_size -= sizeof(struct crat_subtype_computeunit); |
1626 | if (*avail_size < 0) |
1627 | return -ENOMEM; |
1628 | |
1629 | memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit)); |
1630 | |
1631 | /* Fill in subtype header data */ |
1632 | sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY; |
1633 | sub_type_hdr->length = sizeof(struct crat_subtype_computeunit); |
1634 | sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; |
1635 | |
	cpumask = cpumask_of_node(numa_node_id);
1637 | |
1638 | /* Fill in CU data */ |
1639 | sub_type_hdr->flags |= CRAT_CU_FLAGS_CPU_PRESENT; |
1640 | sub_type_hdr->proximity_domain = proximity_domain; |
1641 | sub_type_hdr->processor_id_low = kfd_numa_node_to_apic_id(numa_node_id); |
1642 | if (sub_type_hdr->processor_id_low == -1) |
1643 | return -EINVAL; |
1644 | |
	sub_type_hdr->num_cpu_cores = cpumask_weight(cpumask);
1646 | |
1647 | return 0; |
1648 | } |
1649 | |
1650 | /* kfd_fill_mem_info_for_cpu - Fill in Memory info for the given CPU NUMA node |
1651 | * |
 * @numa_node_id: CPU NUMA node id
 * @avail_size: Available size remaining in the crat_image
 * @proximity_domain: Proximity domain to assign to the CPU node
 * @sub_type_hdr: Memory into which memory info will be filled in
1655 | * |
1656 | * Return 0 if successful else return -ve value |
1657 | */ |
1658 | static int kfd_fill_mem_info_for_cpu(int numa_node_id, int *avail_size, |
1659 | int proximity_domain, |
1660 | struct crat_subtype_memory *sub_type_hdr) |
1661 | { |
1662 | uint64_t mem_in_bytes = 0; |
1663 | pg_data_t *pgdat; |
1664 | int zone_type; |
1665 | |
1666 | *avail_size -= sizeof(struct crat_subtype_memory); |
1667 | if (*avail_size < 0) |
1668 | return -ENOMEM; |
1669 | |
1670 | memset(sub_type_hdr, 0, sizeof(struct crat_subtype_memory)); |
1671 | |
1672 | /* Fill in subtype header data */ |
1673 | sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY; |
1674 | sub_type_hdr->length = sizeof(struct crat_subtype_memory); |
1675 | sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED; |
1676 | |
1677 | /* Fill in Memory Subunit data */ |
1678 | |
1679 | /* Unlike si_meminfo, si_meminfo_node is not exported. So |
1680 | * the following lines are duplicated from si_meminfo_node |
1681 | * function |
1682 | */ |
1683 | pgdat = NODE_DATA(numa_node_id); |
1684 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) |
1685 | mem_in_bytes += zone_managed_pages(zone: &pgdat->node_zones[zone_type]); |
1686 | mem_in_bytes <<= PAGE_SHIFT; |
1687 | |
1688 | sub_type_hdr->length_low = lower_32_bits(mem_in_bytes); |
1689 | sub_type_hdr->length_high = upper_32_bits(mem_in_bytes); |
1690 | sub_type_hdr->proximity_domain = proximity_domain; |
1691 | |
1692 | return 0; |
1693 | } |

#ifdef CONFIG_X86_64
static int kfd_fill_iolink_info_for_cpu(int numa_node_id, int *avail_size,
				uint32_t *num_entries,
				struct crat_subtype_iolink *sub_type_hdr)
{
	int nid;
	struct cpuinfo_x86 *c = &cpu_data(0);
	uint8_t link_type;

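	/* Classify the CPU interconnect coarsely by vendor:
	 * HyperTransport on AMD parts, QPI otherwise.
	 */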
	if (c->x86_vendor == X86_VENDOR_AMD)
		link_type = CRAT_IOLINK_TYPE_HYPERTRANSPORT;
	else
		link_type = CRAT_IOLINK_TYPE_QPI_1_1;

	*num_entries = 0;

	/* Create IO links from this node to other CPU nodes */
	for_each_online_node(nid) {
		if (nid == numa_node_id) /* node itself */
			continue;

		*avail_size -= sizeof(struct crat_subtype_iolink);
		if (*avail_size < 0)
			return -ENOMEM;

		memset(sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));

		/* Fill in subtype header data */
		sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
		sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
		sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

		/* Fill in IO link data */
		sub_type_hdr->proximity_domain_from = numa_node_id;
		sub_type_hdr->proximity_domain_to = nid;
		sub_type_hdr->io_interface_type = link_type;

		(*num_entries)++;
		sub_type_hdr++;
	}

	return 0;
}
#endif
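
/* Each CPU node thus emits one iolink entry per *other* online node:
 * on a hypothetical two-socket system, node 0 emits a single 0->1 link
 * and node 1 a single 1->0 link, so N nodes produce N * (N - 1)
 * entries in total.
 */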

/* kfd_create_vcrat_image_cpu - Create Virtual CRAT for CPU
 *
 * @pcrat_image: Fill in VCRAT for CPU
 * @size:	[IN] allocated size of crat_image.
 *		[OUT] actual size of data filled in crat_image
 */
static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size)
{
	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
	struct acpi_table_header *acpi_table;
	acpi_status status;
	struct crat_subtype_generic *sub_type_hdr;
	int avail_size = *size;
	int numa_node_id;
#ifdef CONFIG_X86_64
	uint32_t entries = 0;
#endif
	int ret = 0;

	if (!pcrat_image)
		return -EINVAL;

	/* Fill in CRAT Header.
	 * Modify length and total_entries as subunits are added.
	 */
	avail_size -= sizeof(struct crat_header);
	if (avail_size < 0)
		return -ENOMEM;

	memset(crat_table, 0, sizeof(struct crat_header));
	memcpy(&crat_table->signature, CRAT_SIGNATURE,
			sizeof(crat_table->signature));
	crat_table->length = sizeof(struct crat_header);

	status = acpi_get_table("DSDT", 0, &acpi_table);
	if (status != AE_OK)
		pr_warn("DSDT table not found for OEM information\n");
	else {
		crat_table->oem_revision = acpi_table->revision;
		memcpy(crat_table->oem_id, acpi_table->oem_id,
				CRAT_OEMID_LENGTH);
		memcpy(crat_table->oem_table_id, acpi_table->oem_table_id,
				CRAT_OEMTABLEID_LENGTH);
		acpi_put_table(acpi_table);
	}
	crat_table->total_entries = 0;
	crat_table->num_domains = 0;

	sub_type_hdr = (struct crat_subtype_generic *)(crat_table+1);
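
	/* Subtype entries are packed back to back immediately after the
	 * CRAT header; sub_type_hdr advances past each entry as it is
	 * filled in below.
	 */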
	for_each_online_node(numa_node_id) {
		if (kfd_numa_node_to_apic_id(numa_node_id) == -1)
			continue;

		/* Fill in Subtype: Compute Unit */
		ret = kfd_fill_cu_for_cpu(numa_node_id, &avail_size,
				crat_table->num_domains,
				(struct crat_subtype_computeunit *)sub_type_hdr);
		if (ret < 0)
			return ret;
		crat_table->length += sub_type_hdr->length;
		crat_table->total_entries++;

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length);

		/* Fill in Subtype: Memory */
		ret = kfd_fill_mem_info_for_cpu(numa_node_id, &avail_size,
				crat_table->num_domains,
				(struct crat_subtype_memory *)sub_type_hdr);
		if (ret < 0)
			return ret;
		crat_table->length += sub_type_hdr->length;
		crat_table->total_entries++;

		sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
				sub_type_hdr->length);

		/* Fill in Subtype: IO Link */
#ifdef CONFIG_X86_64
		ret = kfd_fill_iolink_info_for_cpu(numa_node_id, &avail_size,
				&entries,
				(struct crat_subtype_iolink *)sub_type_hdr);
		if (ret < 0)
			return ret;

		if (entries) {
			crat_table->length += (sub_type_hdr->length * entries);
			crat_table->total_entries += entries;

			sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
					sub_type_hdr->length * entries);
		}
#else
		pr_info("IO link not available for non-x86 platforms\n");
#endif

		crat_table->num_domains++;
	}

	/* TODO: Add cache Subtype for CPU.
	 * Currently, CPU cache information is available in function
	 * detect_cache_attributes(cpu) defined in the file
	 * ./arch/x86/kernel/cpu/intel_cacheinfo.c. This function is not
	 * exported and to get the same information the code needs to be
	 * duplicated.
	 */

	*size = crat_table->length;
	pr_info("Virtual CRAT table created for CPU\n");

	return 0;
}

static int kfd_fill_gpu_memory_affinity(int *avail_size,
		struct kfd_node *kdev, uint8_t type, uint64_t size,
		struct crat_subtype_memory *sub_type_hdr,
		uint32_t proximity_domain,
		const struct kfd_local_mem_info *local_mem_info)
{
	*avail_size -= sizeof(struct crat_subtype_memory);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_memory));
	sub_type_hdr->type = CRAT_SUBTYPE_MEMORY_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_memory);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;

	sub_type_hdr->proximity_domain = proximity_domain;

	pr_debug("Fill gpu memory affinity - type 0x%x size 0x%llx\n",
			type, size);

	sub_type_hdr->length_low = lower_32_bits(size);
	sub_type_hdr->length_high = upper_32_bits(size);

	sub_type_hdr->width = local_mem_info->vram_width;
	sub_type_hdr->visibility_type = type;

	return 0;
}

#ifdef CONFIG_ACPI_NUMA
static void kfd_find_numa_node_in_srat(struct kfd_node *kdev)
{
	struct acpi_table_header *table_header = NULL;
	struct acpi_subtable_header *sub_header = NULL;
	unsigned long table_end, subtable_len;
	u32 pci_id = pci_domain_nr(kdev->adev->pdev->bus) << 16 |
			pci_dev_id(kdev->adev->pdev);
	u32 bdf;
	acpi_status status;
	struct acpi_srat_cpu_affinity *cpu;
	struct acpi_srat_generic_affinity *gpu;
	int pxm = 0, max_pxm = 0;
	int numa_node = NUMA_NO_NODE;
	bool found = false;

	/* Fetch the SRAT table from ACPI */
	status = acpi_get_table(ACPI_SIG_SRAT, 0, &table_header);
	if (status == AE_NOT_FOUND) {
		pr_warn("SRAT table not found\n");
		return;
	} else if (ACPI_FAILURE(status)) {
		const char *err = acpi_format_exception(status);

		pr_err("SRAT table error: %s\n", err);
		return;
	}

	table_end = (unsigned long)table_header + table_header->length;

	/* Parse all entries looking for a match. */
	sub_header = (struct acpi_subtable_header *)
			((unsigned long)table_header +
			sizeof(struct acpi_table_srat));
	subtable_len = sub_header->length;

	while (((unsigned long)sub_header) + subtable_len < table_end) {
		/*
		 * If length is 0, break from this loop to avoid
		 * infinite loop.
		 */
		if (subtable_len == 0) {
			pr_err("SRAT invalid zero length\n");
			break;
		}

		switch (sub_header->type) {
		case ACPI_SRAT_TYPE_CPU_AFFINITY:
			cpu = (struct acpi_srat_cpu_affinity *)sub_header;
			pxm = *((u32 *)cpu->proximity_domain_hi) << 8 |
					cpu->proximity_domain_lo;
			if (pxm > max_pxm)
				max_pxm = pxm;
			break;
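		/* Per the ACPI SRAT Generic Affinity definition, a PCI
		 * device handle stores the segment in bytes 0-1 and the
		 * BDF in bytes 2-3, matching how pci_id above was packed
		 * (domain << 16 | bus << 8 | devfn).
		 */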
		case ACPI_SRAT_TYPE_GENERIC_AFFINITY:
			gpu = (struct acpi_srat_generic_affinity *)sub_header;
			bdf = *((u16 *)(&gpu->device_handle[0])) << 16 |
					*((u16 *)(&gpu->device_handle[2]));
			if (bdf == pci_id) {
				found = true;
				numa_node = pxm_to_node(gpu->proximity_domain);
			}
			break;
		default:
			break;
		}

		if (found)
			break;

		sub_header = (struct acpi_subtable_header *)
				((unsigned long)sub_header + subtable_len);
		subtable_len = sub_header->length;
	}

	acpi_put_table(table_header);

	/* Workaround bad cpu-gpu binding case */
	if (found && (numa_node < 0 ||
			numa_node > pxm_to_node(max_pxm)))
		numa_node = 0;

	if (numa_node != NUMA_NO_NODE)
		set_dev_node(&kdev->adev->pdev->dev, numa_node);
}
#endif

#define KFD_CRAT_INTRA_SOCKET_WEIGHT	13
#define KFD_CRAT_XGMI_WEIGHT		15
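/* Relative link weights: lower means a closer, cheaper link, so an
 * intra-socket hop (13) is preferred over an xGMI hop (15).
 */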

/* kfd_fill_gpu_direct_io_link_to_cpu - Fill in direct io link from GPU
 * to its NUMA node
 * @avail_size: Available size in the memory
 * @kdev: [IN] GPU device
 * @sub_type_hdr: Memory into which io link info will be filled in
 * @proximity_domain: Proximity domain of the GPU node
 *
 * Return 0 if successful else return -ve value
 */
static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
			struct kfd_node *kdev,
			struct crat_subtype_iolink *sub_type_hdr,
			uint32_t proximity_domain)
{
	*avail_size -= sizeof(struct crat_subtype_iolink);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));

	/* Fill in subtype header data */
	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED;
	if (kfd_dev_is_large_bar(kdev))
		sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;

	/* Fill in IOLINK subtype.
	 * TODO: Fill-in other fields of iolink subtype
	 */
	if (kdev->adev->gmc.xgmi.connected_to_cpu ||
	    (KFD_GC_VERSION(kdev) == IP_VERSION(9, 4, 3) &&
	     kdev->adev->smuio.funcs->get_pkg_type(kdev->adev) ==
	     AMDGPU_PKG_TYPE_APU)) {
		bool ext_cpu = KFD_GC_VERSION(kdev) != IP_VERSION(9, 4, 3);
		int mem_bw = 819200, weight = ext_cpu ? KFD_CRAT_XGMI_WEIGHT :
							KFD_CRAT_INTRA_SOCKET_WEIGHT;
		uint32_t bandwidth = ext_cpu ? amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(
							kdev->adev, NULL, true) : mem_bw;

		/*
		 * With a host-GPU xGMI link, the host can access GPU memory
		 * whether or not the PCIe BAR type is large, so always create
		 * a bidirectional io link.
		 */
		sub_type_hdr->flags |= CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;
		sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
		sub_type_hdr->weight_xgmi = weight;
		sub_type_hdr->minimum_bandwidth_mbs = bandwidth;
		sub_type_hdr->maximum_bandwidth_mbs = bandwidth;
	} else {
		sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_PCIEXPRESS;
		sub_type_hdr->minimum_bandwidth_mbs =
			amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, true);
		sub_type_hdr->maximum_bandwidth_mbs =
			amdgpu_amdkfd_get_pcie_bandwidth_mbytes(kdev->adev, false);
	}

	sub_type_hdr->proximity_domain_from = proximity_domain;

#ifdef CONFIG_ACPI_NUMA
	if (kdev->adev->pdev->dev.numa_node == NUMA_NO_NODE &&
	    num_possible_nodes() > 1)
		kfd_find_numa_node_in_srat(kdev);
#endif
#ifdef CONFIG_NUMA
	if (kdev->adev->pdev->dev.numa_node == NUMA_NO_NODE)
		sub_type_hdr->proximity_domain_to = 0;
	else
		sub_type_hdr->proximity_domain_to = kdev->adev->pdev->dev.numa_node;
#else
	sub_type_hdr->proximity_domain_to = 0;
#endif
	return 0;
}

static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
			struct kfd_node *kdev,
			struct kfd_node *peer_kdev,
			struct crat_subtype_iolink *sub_type_hdr,
			uint32_t proximity_domain_from,
			uint32_t proximity_domain_to)
{
	bool use_ta_info = kdev->kfd->num_nodes == 1;

	*avail_size -= sizeof(struct crat_subtype_iolink);
	if (*avail_size < 0)
		return -ENOMEM;

	memset((void *)sub_type_hdr, 0, sizeof(struct crat_subtype_iolink));

	sub_type_hdr->type = CRAT_SUBTYPE_IOLINK_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_iolink);
	sub_type_hdr->flags |= CRAT_SUBTYPE_FLAGS_ENABLED |
			       CRAT_IOLINK_FLAGS_BI_DIRECTIONAL;

	sub_type_hdr->io_interface_type = CRAT_IOLINK_TYPE_XGMI;
	sub_type_hdr->proximity_domain_from = proximity_domain_from;
	sub_type_hdr->proximity_domain_to = proximity_domain_to;

	if (use_ta_info) {
		sub_type_hdr->weight_xgmi = KFD_CRAT_XGMI_WEIGHT *
			amdgpu_amdkfd_get_xgmi_hops_count(kdev->adev, peer_kdev->adev);
		sub_type_hdr->maximum_bandwidth_mbs =
			amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev,
							peer_kdev->adev, false);
		sub_type_hdr->minimum_bandwidth_mbs = sub_type_hdr->maximum_bandwidth_mbs ?
			amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev, NULL, true) : 0;
	} else {
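		/* Without TA info, fall back to fixed weights: a single
		 * hop inside one socket costs KFD_CRAT_INTRA_SOCKET_WEIGHT
		 * (13), while a multi-hop path is modeled as two
		 * intra-socket hops plus one xGMI hop (2 * 13 + 15 = 41).
		 */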
		bool is_single_hop = kdev->kfd == peer_kdev->kfd;
		int weight = is_single_hop ? KFD_CRAT_INTRA_SOCKET_WEIGHT :
			(2 * KFD_CRAT_INTRA_SOCKET_WEIGHT) + KFD_CRAT_XGMI_WEIGHT;
		int mem_bw = 819200;

		sub_type_hdr->weight_xgmi = weight;
		sub_type_hdr->maximum_bandwidth_mbs = is_single_hop ? mem_bw : 0;
		sub_type_hdr->minimum_bandwidth_mbs = is_single_hop ? mem_bw : 0;
	}

	return 0;
}

/* kfd_create_vcrat_image_gpu - Create Virtual CRAT for GPU
 *
 * @pcrat_image: Fill in VCRAT for GPU
 * @size:	[IN] allocated size of crat_image.
 *		[OUT] actual size of data filled in crat_image
 */
static int kfd_create_vcrat_image_gpu(void *pcrat_image,
				      size_t *size, struct kfd_node *kdev,
				      uint32_t proximity_domain)
{
	struct crat_header *crat_table = (struct crat_header *)pcrat_image;
	struct amdgpu_gfx_config *gfx_info = &kdev->adev->gfx.config;
	struct amdgpu_cu_info *cu_info = &kdev->adev->gfx.cu_info;
	struct crat_subtype_generic *sub_type_hdr;
	struct kfd_local_mem_info local_mem_info;
	struct kfd_topology_device *peer_dev;
	struct crat_subtype_computeunit *cu;
	int avail_size = *size;
	uint32_t total_num_of_cu;
	uint32_t nid = 0;
	int ret = 0;

	if (!pcrat_image || avail_size < VCRAT_SIZE_FOR_GPU)
		return -EINVAL;

	/* Fill the CRAT Header.
	 * Modify length and total_entries as subunits are added.
	 */
	avail_size -= sizeof(struct crat_header);
	if (avail_size < 0)
		return -ENOMEM;

	memset(crat_table, 0, sizeof(struct crat_header));

	memcpy(&crat_table->signature, CRAT_SIGNATURE,
			sizeof(crat_table->signature));
	/* Change length as we add more subtypes */
	crat_table->length = sizeof(struct crat_header);
	crat_table->num_domains = 1;
	crat_table->total_entries = 0;

	/* Fill in Subtype: Compute Unit
	 * First fill in the sub type header and then sub type data
	 */
	avail_size -= sizeof(struct crat_subtype_computeunit);
	if (avail_size < 0)
		return -ENOMEM;

	sub_type_hdr = (struct crat_subtype_generic *)(crat_table + 1);
	memset(sub_type_hdr, 0, sizeof(struct crat_subtype_computeunit));

	sub_type_hdr->type = CRAT_SUBTYPE_COMPUTEUNIT_AFFINITY;
	sub_type_hdr->length = sizeof(struct crat_subtype_computeunit);
	sub_type_hdr->flags = CRAT_SUBTYPE_FLAGS_ENABLED;

	/* Fill CU subtype data */
	cu = (struct crat_subtype_computeunit *)sub_type_hdr;
	cu->flags |= CRAT_CU_FLAGS_GPU_PRESENT;
	cu->proximity_domain = proximity_domain;

	cu->num_simd_per_cu = cu_info->simd_per_cu;
	cu->num_simd_cores = cu_info->simd_per_cu *
			(cu_info->number / kdev->kfd->num_nodes);
	cu->max_waves_simd = cu_info->max_waves_per_simd;

	cu->wave_front_size = cu_info->wave_front_size;
	cu->array_count = gfx_info->max_sh_per_se *
		gfx_info->max_shader_engines;
	total_num_of_cu = (cu->array_count * gfx_info->max_cu_per_sh);
	cu->processor_id_low = get_and_inc_gpu_processor_id(total_num_of_cu);
	cu->num_cu_per_array = gfx_info->max_cu_per_sh;
	cu->max_slots_scatch_cu = cu_info->max_scratch_slots_per_cu;
	cu->num_banks = gfx_info->max_shader_engines;
	cu->lds_size_in_kb = cu_info->lds_size;

	cu->hsa_capability = 0;

	crat_table->length += sub_type_hdr->length;
	crat_table->total_entries++;

	/* Fill in Subtype: Memory. Only on systems with large BAR (no
	 * private FB), report memory as public. On other systems
	 * report the total FB size (public+private) as a single
	 * private heap.
	 */
	local_mem_info = kdev->local_mem_info;
	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
			sub_type_hdr->length);

	if (kdev->adev->debug_largebar)
		local_mem_info.local_mem_size_private = 0;

	if (local_mem_info.local_mem_size_private == 0)
		ret = kfd_fill_gpu_memory_affinity(&avail_size,
				kdev, HSA_MEM_HEAP_TYPE_FB_PUBLIC,
				local_mem_info.local_mem_size_public,
				(struct crat_subtype_memory *)sub_type_hdr,
				proximity_domain,
				&local_mem_info);
	else
		ret = kfd_fill_gpu_memory_affinity(&avail_size,
				kdev, HSA_MEM_HEAP_TYPE_FB_PRIVATE,
				local_mem_info.local_mem_size_public +
				local_mem_info.local_mem_size_private,
				(struct crat_subtype_memory *)sub_type_hdr,
				proximity_domain,
				&local_mem_info);
	if (ret < 0)
		return ret;

	crat_table->length += sizeof(struct crat_subtype_memory);
	crat_table->total_entries++;

	/* Fill in Subtype: IO_LINKS
	 * Only direct links are added here, i.e. the link from the GPU to
	 * its NUMA node. Indirect links are added by userspace.
	 */
	sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr +
		sub_type_hdr->length);
	ret = kfd_fill_gpu_direct_io_link_to_cpu(&avail_size, kdev,
		(struct crat_subtype_iolink *)sub_type_hdr, proximity_domain);

	if (ret < 0)
		return ret;

	crat_table->length += sub_type_hdr->length;
	crat_table->total_entries++;

	/* Fill in Subtype: IO_LINKS
	 * Direct links from this GPU to other GPUs through xGMI.
	 * We loop over the GPUs that have already been processed (those
	 * with a lower proximity_domain value) and add a link from this
	 * GPU to each one that shares the same hive id. The reversed
	 * iolink (from the other GPU to this GPU) is added in
	 * kfd_parse_subtype_iolink.
	 */
	if (kdev->kfd->hive_id) {
		for (nid = 0; nid < proximity_domain; ++nid) {
			peer_dev = kfd_topology_device_by_proximity_domain_no_lock(nid);
			if (!peer_dev->gpu)
				continue;
			if (peer_dev->gpu->kfd->hive_id != kdev->kfd->hive_id)
				continue;
			sub_type_hdr = (typeof(sub_type_hdr))(
				(char *)sub_type_hdr +
				sizeof(struct crat_subtype_iolink));
			ret = kfd_fill_gpu_xgmi_link_to_gpu(
				&avail_size, kdev, peer_dev->gpu,
				(struct crat_subtype_iolink *)sub_type_hdr,
				proximity_domain, nid);
			if (ret < 0)
				return ret;
			crat_table->length += sub_type_hdr->length;
			crat_table->total_entries++;
		}
	}
	*size = crat_table->length;
	pr_info("Virtual CRAT table created for GPU\n");

	return ret;
}

/* kfd_create_crat_image_virtual - Allocates memory for CRAT image and
 * creates a Virtual CRAT (VCRAT) image
 *
 * NOTE: Call kfd_destroy_crat_image to free CRAT image memory
 *
 * @crat_image: VCRAT image created because ACPI does not have a
 *	CRAT for this device
 * @size: [OUT] size of virtual crat_image
 * @flags:	COMPUTE_UNIT_CPU - Create VCRAT for CPU device
 *	COMPUTE_UNIT_GPU - Create VCRAT for GPU
 *	(COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU) - Create VCRAT for APU
 *	-- this option is not currently implemented.
 *	The assumption is that all AMD APUs will have CRAT
 * @kdev: Valid kfd_node required if flags contain COMPUTE_UNIT_GPU
 *
 * Return 0 if successful else return -ve value
 */
int kfd_create_crat_image_virtual(void **crat_image, size_t *size,
				  int flags, struct kfd_node *kdev,
				  uint32_t proximity_domain)
{
	void *pcrat_image = NULL;
	int ret = 0, num_nodes;
	size_t dyn_size;

	if (!crat_image)
		return -EINVAL;

	*crat_image = NULL;

	/* Allocate the CPU Virtual CRAT size based on the number of online
	 * nodes. Allocate VCRAT_SIZE_FOR_GPU for the GPU virtual CRAT image.
	 * This should cover all the current conditions. A check is in place
	 * so we do not write beyond the allocated size for GPUs.
	 */
	switch (flags) {
	case COMPUTE_UNIT_CPU:
		num_nodes = num_online_nodes();
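		/* One CU and one memory entry per node, plus one iolink
		 * per other node; e.g. a hypothetical two-node system
		 * needs header + 2 * (CU + memory + 1 iolink).
		 */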
		dyn_size = sizeof(struct crat_header) +
			num_nodes * (sizeof(struct crat_subtype_computeunit) +
			sizeof(struct crat_subtype_memory) +
			(num_nodes - 1) * sizeof(struct crat_subtype_iolink));
		pcrat_image = kvmalloc(dyn_size, GFP_KERNEL);
		if (!pcrat_image)
			return -ENOMEM;
		*size = dyn_size;
		pr_debug("CRAT size is %zu\n", dyn_size);
		ret = kfd_create_vcrat_image_cpu(pcrat_image, size);
		break;
	case COMPUTE_UNIT_GPU:
		if (!kdev)
			return -EINVAL;
		pcrat_image = kvmalloc(VCRAT_SIZE_FOR_GPU, GFP_KERNEL);
		if (!pcrat_image)
			return -ENOMEM;
		*size = VCRAT_SIZE_FOR_GPU;
		ret = kfd_create_vcrat_image_gpu(pcrat_image, size, kdev,
						 proximity_domain);
		break;
	case (COMPUTE_UNIT_CPU | COMPUTE_UNIT_GPU):
		/* TODO: */
		ret = -EINVAL;
		pr_err("VCRAT not implemented for APU\n");
		break;
	default:
		ret = -EINVAL;
	}

	if (!ret)
		*crat_image = pcrat_image;
	else
		kvfree(pcrat_image);

	return ret;
}


/* kfd_destroy_crat_image
 *
 * @crat_image: [IN] - crat_image from kfd_create_crat_image_xxx(..)
 *
 */
void kfd_destroy_crat_image(void *crat_image)
{
	kvfree(crat_image);
}