amdgpu_vm.c source code [linux/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c]

1	/*
2	* Copyright 2008 Advanced Micro Devices, Inc.
3	* Copyright 2008 Red Hat Inc.
4	* Copyright 2009 Jerome Glisse.
5	*
6	* Permission is hereby granted, free of charge, to any person obtaining a
7	* copy of this software and associated documentation files (the "Software"),
8	* to deal in the Software without restriction, including without limitation
9	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
10	* and/or sell copies of the Software, and to permit persons to whom the
11	* Software is furnished to do so, subject to the following conditions:
12	*
13	* The above copyright notice and this permission notice shall be included in
14	* all copies or substantial portions of the Software.
15	*
16	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19	* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20	* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21	* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22	* OTHER DEALINGS IN THE SOFTWARE.
23	*
24	* Authors: Dave Airlie
25	* Alex Deucher
26	* Jerome Glisse
27	*/
28
29	#include <linux/dma-fence-array.h>
30	#include <linux/interval_tree_generic.h>
31	#include <linux/idr.h>
32	#include <linux/dma-buf.h>
33
34	#include <drm/amdgpu_drm.h>
35	#include <drm/drm_drv.h>
36	#include <drm/ttm/ttm_tt.h>
37	#include <drm/drm_exec.h>
38	#include "amdgpu.h"
39	#include "amdgpu_trace.h"
40	#include "amdgpu_amdkfd.h"
41	#include "amdgpu_gmc.h"
42	#include "amdgpu_xgmi.h"
43	#include "amdgpu_dma_buf.h"
44	#include "amdgpu_res_cursor.h"
45	#include "kfd_svm.h"
46
47	/**
48	* DOC: GPUVM
49	*
50	* GPUVM is the MMU functionality provided on the GPU.
51	* GPUVM is similar to the legacy GART on older asics, however
52	* rather than there being a single global GART table
53	* for the entire GPU, there can be multiple GPUVM page tables active
54	* at any given time. The GPUVM page tables can contain a mix of
55	* VRAM pages and system pages (both memory and MMIO) and system pages
56	* can be mapped as snooped (cached system pages) or unsnooped
57	* (uncached system pages).
58	*
59	* Each active GPUVM has an ID associated with it and there is a page table
60	* linked with each VMID. When executing a command buffer,
61	* the kernel tells the engine what VMID to use for that command
62	* buffer. VMIDs are allocated dynamically as commands are submitted.
63	* The userspace drivers maintain their own address space and the kernel
64	* sets up their pages tables accordingly when they submit their
65	* command buffers and a VMID is assigned.
66	* The hardware supports up to 16 active GPUVMs at any given time.
67	*
68	* Each GPUVM is represented by a 1-2 or 1-5 level page table, depending
69	* on the ASIC family. GPUVM supports RWX attributes on each page as well
70	* as other features such as encryption and caching attributes.
71	*
72	* VMID 0 is special. It is the GPUVM used for the kernel driver. In
73	* addition to an aperture managed by a page table, VMID 0 also has
74	* several other apertures. There is an aperture for direct access to VRAM
75	* and there is a legacy AGP aperture which just forwards accesses directly
76	* to the matching system physical addresses (or IOVAs when an IOMMU is
77	* present). These apertures provide direct access to these memories without
78	* incurring the overhead of a page table. VMID 0 is used by the kernel
79	* driver for tasks like memory management.
80	*
81	* GPU clients (i.e., engines on the GPU) use GPUVM VMIDs to access memory.
82	* For user applications, each application can have their own unique GPUVM
83	* address space. The application manages the address space and the kernel
84	* driver manages the GPUVM page tables for each process. If a GPU client
85	* accesses an invalid page, it will generate a GPU page fault, similar to
86	* accessing an invalid page on a CPU.
87	*/
88
89	#define START(node) ((node)->start)
90	#define LAST(node) ((node)->last)
91
92	INTERVAL_TREE_DEFINE(struct amdgpu_bo_va_mapping, rb, uint64_t, __subtree_last,
93	START, LAST, static, amdgpu_vm_it)
94
95	#undef START
96	#undef LAST
97
98	/**
99	* struct amdgpu_prt_cb - Helper to disable partial resident texture feature from a fence callback
100	*/
101	struct amdgpu_prt_cb {
102
103	/**
104	* @adev: amdgpu device
105	*/
106	struct amdgpu_device *adev;
107
108	/**
109	* @cb: callback
110	*/
111	struct dma_fence_cb cb;
112	};
113
114	/**
115	* struct amdgpu_vm_tlb_seq_struct - Helper to increment the TLB flush sequence
116	*/
117	struct amdgpu_vm_tlb_seq_struct {
118	/**
119	* @vm: pointer to the amdgpu_vm structure to set the fence sequence on
120	*/
121	struct amdgpu_vm *vm;
122
123	/**
124	* @cb: callback
125	*/
126	struct dma_fence_cb cb;
127	};
128
129	/**
130	* amdgpu_vm_set_pasid - manage pasid and vm ptr mapping
131	*
132	* @adev: amdgpu_device pointer
133	* @vm: amdgpu_vm pointer
134	* @pasid: the pasid the VM is using on this GPU
135	*
136	* Set the pasid this VM is using on this GPU, can also be used to remove the
137	* pasid by passing in zero.
138	*
139	*/
140	int amdgpu_vm_set_pasid(struct amdgpu_device adev, struct* amdgpu_vm *vm,
141	u32 pasid)
142	{
143	int r;
144
145	if (vm->pasid == pasid)
146	return `0`;
147
148	if (vm->pasid) {
149	r = xa_err(entry: xa_erase_irq(xa: &adev->vm_manager.pasids, index: vm->pasid));
150	if (r < `0`)
151	return r;
152
153	vm->pasid = `0`;
154	}
155
156	if (pasid) {
157	r = xa_err(entry: xa_store_irq(xa: &adev->vm_manager.pasids, index: pasid, entry: vm,
158	GFP_KERNEL));
159	if (r < `0`)
160	return r;
161
162	vm->pasid = pasid;
163	}
164
165
166	return `0`;
167	}
168
169	/**
170	* amdgpu_vm_bo_evicted - vm_bo is evicted
171	*
172	* @vm_bo: vm_bo which is evicted
173	*
174	* State for PDs/PTs and per VM BOs which are not at the location they should
175	* be.
176	*/
177	static void amdgpu_vm_bo_evicted(struct amdgpu_vm_bo_base *vm_bo)
178	{
179	struct amdgpu_vm *vm = vm_bo->vm;
180	struct amdgpu_bo *bo = vm_bo->bo;
181
182	vm_bo->moved = true;
183	spin_lock(lock: &vm_bo->vm->status_lock);
184	if (bo->tbo.type == ttm_bo_type_kernel)
185	list_move(list: &vm_bo->vm_status, head: &vm->evicted);
186	else
187	list_move_tail(list: &vm_bo->vm_status, head: &vm->evicted);
188	spin_unlock(lock: &vm_bo->vm->status_lock);
189	}
190	/**
191	* amdgpu_vm_bo_moved - vm_bo is moved
192	*
193	* @vm_bo: vm_bo which is moved
194	*
195	* State for per VM BOs which are moved, but that change is not yet reflected
196	* in the page tables.
197	*/
198	static void amdgpu_vm_bo_moved(struct amdgpu_vm_bo_base *vm_bo)
199	{
200	spin_lock(lock: &vm_bo->vm->status_lock);
201	list_move(list: &vm_bo->vm_status, head: &vm_bo->vm->moved);
202	spin_unlock(lock: &vm_bo->vm->status_lock);
203	}
204
205	/**
206	* amdgpu_vm_bo_idle - vm_bo is idle
207	*
208	* @vm_bo: vm_bo which is now idle
209	*
210	* State for PDs/PTs and per VM BOs which have gone through the state machine
211	* and are now idle.
212	*/
213	static void amdgpu_vm_bo_idle(struct amdgpu_vm_bo_base *vm_bo)
214	{
215	spin_lock(lock: &vm_bo->vm->status_lock);
216	list_move(list: &vm_bo->vm_status, head: &vm_bo->vm->idle);
217	spin_unlock(lock: &vm_bo->vm->status_lock);
218	vm_bo->moved = false;
219	}
220
221	/**
222	* amdgpu_vm_bo_invalidated - vm_bo is invalidated
223	*
224	* @vm_bo: vm_bo which is now invalidated
225	*
226	* State for normal BOs which are invalidated and that change not yet reflected
227	* in the PTs.
228	*/
229	static void amdgpu_vm_bo_invalidated(struct amdgpu_vm_bo_base *vm_bo)
230	{
231	spin_lock(lock: &vm_bo->vm->status_lock);
232	list_move(list: &vm_bo->vm_status, head: &vm_bo->vm->invalidated);
233	spin_unlock(lock: &vm_bo->vm->status_lock);
234	}
235
236	/**
237	* amdgpu_vm_bo_evicted_user - vm_bo is evicted
238	*
239	* @vm_bo: vm_bo which is evicted
240	*
241	* State for BOs used by user mode queues which are not at the location they
242	* should be.
243	*/
244	static void amdgpu_vm_bo_evicted_user(struct amdgpu_vm_bo_base *vm_bo)
245	{
246	vm_bo->moved = true;
247	spin_lock(lock: &vm_bo->vm->status_lock);
248	list_move(list: &vm_bo->vm_status, head: &vm_bo->vm->evicted_user);
249	spin_unlock(lock: &vm_bo->vm->status_lock);
250	}
251
252	/**
253	* amdgpu_vm_bo_relocated - vm_bo is reloacted
254	*
255	* @vm_bo: vm_bo which is relocated
256	*
257	* State for PDs/PTs which needs to update their parent PD.
258	* For the root PD, just move to idle state.
259	*/
260	static void amdgpu_vm_bo_relocated(struct amdgpu_vm_bo_base *vm_bo)
261	{
262	if (vm_bo->bo->parent) {
263	spin_lock(lock: &vm_bo->vm->status_lock);
264	list_move(list: &vm_bo->vm_status, head: &vm_bo->vm->relocated);
265	spin_unlock(lock: &vm_bo->vm->status_lock);
266	} else {
267	amdgpu_vm_bo_idle(vm_bo);
268	}
269	}
270
271	/**
272	* amdgpu_vm_bo_done - vm_bo is done
273	*
274	* @vm_bo: vm_bo which is now done
275	*
276	* State for normal BOs which are invalidated and that change has been updated
277	* in the PTs.
278	*/
279	static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo)
280	{
281	spin_lock(lock: &vm_bo->vm->status_lock);
282	list_move(list: &vm_bo->vm_status, head: &vm_bo->vm->done);
283	spin_unlock(lock: &vm_bo->vm->status_lock);
284	}
285
286	/**
287	* amdgpu_vm_bo_reset_state_machine - reset the vm_bo state machine
288	* @vm: the VM which state machine to reset
289	*
290	* Move all vm_bo object in the VM into a state where they will be updated
291	* again during validation.
292	*/
293	static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm)
294	{
295	struct amdgpu_vm_bo_base vm_bo, tmp;
296
297	spin_lock(lock: &vm->status_lock);
298	list_splice_init(list: &vm->done, head: &vm->invalidated);
299	list_for_each_entry(vm_bo, &vm->invalidated, vm_status)
300	vm_bo->moved = true;
301	list_for_each_entry_safe(vm_bo, tmp, &vm->idle, vm_status) {
302	struct amdgpu_bo *bo = vm_bo->bo;
303
304	vm_bo->moved = true;
305	if (!bo \|\| bo->tbo.type != ttm_bo_type_kernel)
306	list_move(list: &vm_bo->vm_status, head: &vm_bo->vm->moved);
307	else if (bo->parent)
308	list_move(list: &vm_bo->vm_status, head: &vm_bo->vm->relocated);
309	}
310	spin_unlock(lock: &vm->status_lock);
311	}
312
313	/**
314	* amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
315	*
316	* @base: base structure for tracking BO usage in a VM
317	* @vm: vm to which bo is to be added
318	* @bo: amdgpu buffer object
319	*
320	* Initialize a bo_va_base structure and add it to the appropriate lists
321	*
322	*/
323	void amdgpu_vm_bo_base_init(struct amdgpu_vm_bo_base *base,
324	struct amdgpu_vm vm, struct* amdgpu_bo *bo)
325	{
326	base->vm = vm;
327	base->bo = bo;
328	base->next = NULL;
329	INIT_LIST_HEAD(list: &base->vm_status);
330
331	if (!bo)
332	return;
333	base->next = bo->vm_bo;
334	bo->vm_bo = base;
335
336	if (bo->tbo.base.resv != vm->root.bo->tbo.base.resv)
337	return;
338
339	dma_resv_assert_held(vm->root.bo->tbo.base.resv);
340
341	ttm_bo_set_bulk_move(bo: &bo->tbo, bulk: &vm->lru_bulk_move);
342	if (bo->tbo.type == ttm_bo_type_kernel && bo->parent)
343	amdgpu_vm_bo_relocated(vm_bo: base);
344	else
345	amdgpu_vm_bo_idle(vm_bo: base);
346
347	if (bo->preferred_domains &
348	amdgpu_mem_type_to_domain(mem_type: bo->tbo.resource->mem_type))
349	return;
350
351	/*
352	* we checked all the prerequisites, but it looks like this per vm bo
353	* is currently evicted. add the bo to the evicted list to make sure it
354	* is validated on next vm use to avoid fault.
355	* */
356	amdgpu_vm_bo_evicted(vm_bo: base);
357	}
358
359	/**
360	* amdgpu_vm_lock_pd - lock PD in drm_exec
361	*
362	* @vm: vm providing the BOs
363	* @exec: drm execution context
364	* @num_fences: number of extra fences to reserve
365	*
366	* Lock the VM root PD in the DRM execution context.
367	*/
368	int amdgpu_vm_lock_pd(struct amdgpu_vm vm, struct* drm_exec *exec,
369	unsigned int num_fences)
370	{
371	/ We need at least two fences for the VM PD/PT updates /
372	return drm_exec_prepare_obj(exec, obj: &vm->root.bo->tbo.base,
373	num_fences: `2` + num_fences);
374	}
375
376	/**
377	* amdgpu_vm_move_to_lru_tail - move all BOs to the end of LRU
378	*
379	* @adev: amdgpu device pointer
380	* @vm: vm providing the BOs
381	*
382	* Move all BOs to the end of LRU and remember their positions to put them
383	* together.
384	*/
385	void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
386	struct amdgpu_vm *vm)
387	{
388	spin_lock(lock: &adev->mman.bdev.lru_lock);
389	ttm_lru_bulk_move_tail(bulk: &vm->lru_bulk_move);
390	spin_unlock(lock: &adev->mman.bdev.lru_lock);
391	}
392
393	/ Create scheduler entities for page table updates /
394	static int amdgpu_vm_init_entities(struct amdgpu_device *adev,
395	struct amdgpu_vm *vm)
396	{
397	int r;
398
399	r = drm_sched_entity_init(entity: &vm->immediate, priority: DRM_SCHED_PRIORITY_NORMAL,
400	sched_list: adev->vm_manager.vm_pte_scheds,
401	num_sched_list: adev->vm_manager.vm_pte_num_scheds, NULL);
402	if (r)
403	goto error;
404
405	return drm_sched_entity_init(entity: &vm->delayed, priority: DRM_SCHED_PRIORITY_NORMAL,
406	sched_list: adev->vm_manager.vm_pte_scheds,
407	num_sched_list: adev->vm_manager.vm_pte_num_scheds, NULL);
408
409	error:
410	drm_sched_entity_destroy(entity: &vm->immediate);
411	return r;
412	}
413
414	/ Destroy the entities for page table updates again /
415	static void amdgpu_vm_fini_entities(struct amdgpu_vm *vm)
416	{
417	drm_sched_entity_destroy(entity: &vm->immediate);
418	drm_sched_entity_destroy(entity: &vm->delayed);
419	}
420
421	/**
422	* amdgpu_vm_generation - return the page table re-generation counter
423	* @adev: the amdgpu_device
424	* @vm: optional VM to check, might be NULL
425	*
426	* Returns a page table re-generation token to allow checking if submissions
427	* are still valid to use this VM. The VM parameter might be NULL in which case
428	* just the VRAM lost counter will be used.
429	*/
430	uint64_t amdgpu_vm_generation(struct amdgpu_device adev, struct* amdgpu_vm *vm)
431	{
432	uint64_t result = (u64)atomic_read(v: &adev->vram_lost_counter) << `32`;
433
434	if (!vm)
435	return result;
436
437	result += vm->generation;
438	/ Add one if the page tables will be re-generated on next CS /
439	if (drm_sched_entity_error(entity: &vm->delayed))
440	++result;
441
442	return result;
443	}
444
445	/**
446	* amdgpu_vm_validate - validate evicted BOs tracked in the VM
447	*
448	* @adev: amdgpu device pointer
449	* @vm: vm providing the BOs
450	* @ticket: optional reservation ticket used to reserve the VM
451	* @validate: callback to do the validation
452	* @param: parameter for the validation callback
453	*
454	* Validate the page table BOs and per-VM BOs on command submission if
455	* necessary. If a ticket is given, also try to validate evicted user queue
456	* BOs. They must already be reserved with the given ticket.
457	*
458	* Returns:
459	* Validation result.
460	*/
461	int amdgpu_vm_validate(struct amdgpu_device adev, struct* amdgpu_vm *vm,
462	struct ww_acquire_ctx *ticket,
463	int (validate)(void* p, struct* amdgpu_bo *bo),
464	void *param)
465	{
466	struct amdgpu_vm_bo_base *bo_base;
467	struct amdgpu_bo *shadow;
468	struct amdgpu_bo *bo;
469	int r;
470
471	if (drm_sched_entity_error(entity: &vm->delayed)) {
472	++vm->generation;
473	amdgpu_vm_bo_reset_state_machine(vm);
474	amdgpu_vm_fini_entities(vm);
475	r = amdgpu_vm_init_entities(adev, vm);
476	if (r)
477	return r;
478	}
479
480	spin_lock(lock: &vm->status_lock);
481	while (!list_empty(head: &vm->evicted)) {
482	bo_base = list_first_entry(&vm->evicted,
483	struct amdgpu_vm_bo_base,
484	vm_status);
485	spin_unlock(lock: &vm->status_lock);
486
487	bo = bo_base->bo;
488	shadow = amdgpu_bo_shadowed(bo);
489
490	r = validate(param, bo);
491	if (r)
492	return r;
493	if (shadow) {
494	r = validate(param, shadow);
495	if (r)
496	return r;
497	}
498
499	if (bo->tbo.type != ttm_bo_type_kernel) {
500	amdgpu_vm_bo_moved(vm_bo: bo_base);
501	} else {
502	vm->update_funcs->map_table(to_amdgpu_bo_vm(bo));
503	amdgpu_vm_bo_relocated(vm_bo: bo_base);
504	}
505	spin_lock(lock: &vm->status_lock);
506	}
507	while (ticket && !list_empty(head: &vm->evicted_user)) {
508	bo_base = list_first_entry(&vm->evicted_user,
509	struct amdgpu_vm_bo_base,
510	vm_status);
511	spin_unlock(lock: &vm->status_lock);
512
513	bo = bo_base->bo;
514
515	if (dma_resv_locking_ctx(obj: bo->tbo.base.resv) != ticket) {
516	struct amdgpu_task_info *ti = amdgpu_vm_get_task_info_vm(vm);
517
518	pr_warn_ratelimited("Evicted user BO is not reserved\n");
519	if (ti) {
520	pr_warn_ratelimited("pid %d\n", ti->pid);
521	amdgpu_vm_put_task_info(task_info: ti);
522	}
523
524	return -EINVAL;
525	}
526
527	r = validate(param, bo);
528	if (r)
529	return r;
530
531	amdgpu_vm_bo_invalidated(vm_bo: bo_base);
532
533	spin_lock(lock: &vm->status_lock);
534	}
535	spin_unlock(lock: &vm->status_lock);
536
537	amdgpu_vm_eviction_lock(vm);
538	vm->evicting = false;
539	amdgpu_vm_eviction_unlock(vm);
540
541	return `0`;
542	}
543
544	/**
545	* amdgpu_vm_ready - check VM is ready for updates
546	*
547	* @vm: VM to check
548	*
549	* Check if all VM PDs/PTs are ready for updates
550	*
551	* Returns:
552	* True if VM is not evicting.
553	*/
554	bool amdgpu_vm_ready(struct amdgpu_vm *vm)
555	{
556	bool empty;
557	bool ret;
558
559	amdgpu_vm_eviction_lock(vm);
560	ret = !vm->evicting;
561	amdgpu_vm_eviction_unlock(vm);
562
563	spin_lock(lock: &vm->status_lock);
564	empty = list_empty(head: &vm->evicted);
565	spin_unlock(lock: &vm->status_lock);
566
567	return ret && empty;
568	}
569
570	/**
571	* amdgpu_vm_check_compute_bug - check whether asic has compute vm bug
572	*
573	* @adev: amdgpu_device pointer
574	*/
575	void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev)
576	{
577	const struct amdgpu_ip_block *ip_block;
578	bool has_compute_vm_bug;
579	struct amdgpu_ring *ring;
580	int i;
581
582	has_compute_vm_bug = false;
583
584	ip_block = amdgpu_device_ip_get_ip_block(adev, type: AMD_IP_BLOCK_TYPE_GFX);
585	if (ip_block) {
586	/ Compute has a VM bug for GFX version < 7.*
587	Compute has a VM bug for GFX 8 MEC firmware version < 673./*
588	if (ip_block->version->major <= `7`)
589	has_compute_vm_bug = true;
590	else if (ip_block->version->major == `8`)
591	if (adev->gfx.mec_fw_version < `673`)
592	has_compute_vm_bug = true;
593	}
594
595	for (i = `0`; i < adev->num_rings; i++) {
596	ring = adev->rings[i];
597	if (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
598	/ only compute rings /
599	ring->has_compute_vm_bug = has_compute_vm_bug;
600	else
601	ring->has_compute_vm_bug = false;
602	}
603	}
604
605	/**
606	* amdgpu_vm_need_pipeline_sync - Check if pipe sync is needed for job.
607	*
608	* @ring: ring on which the job will be submitted
609	* @job: job to submit
610	*
611	* Returns:
612	* True if sync is needed.
613	*/
614	bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
615	struct amdgpu_job *job)
616	{
617	struct amdgpu_device *adev = ring->adev;
618	unsigned vmhub = ring->vm_hub;
619	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
620
621	if (job->vmid == `0`)
622	return false;
623
624	if (job->vm_needs_flush \|\| ring->has_compute_vm_bug)
625	return true;
626
627	if (ring->funcs->emit_gds_switch && job->gds_switch_needed)
628	return true;
629
630	if (amdgpu_vmid_had_gpu_reset(adev, id: &id_mgr->ids[job->vmid]))
631	return true;
632
633	return false;
634	}
635
636	/**
637	* amdgpu_vm_flush - hardware flush the vm
638	*
639	* @ring: ring to use for flush
640	* @job: related job
641	* @need_pipe_sync: is pipe sync needed
642	*
643	* Emit a VM flush when it is necessary.
644	*
645	* Returns:
646	* 0 on success, errno otherwise.
647	*/
648	int amdgpu_vm_flush(struct amdgpu_ring ring, struct* amdgpu_job *job,
649	bool need_pipe_sync)
650	{
651	struct amdgpu_device *adev = ring->adev;
652	unsigned vmhub = ring->vm_hub;
653	struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
654	struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
655	bool spm_update_needed = job->spm_update_needed;
656	bool gds_switch_needed = ring->funcs->emit_gds_switch &&
657	job->gds_switch_needed;
658	bool vm_flush_needed = job->vm_needs_flush;
659	struct dma_fence *fence = NULL;
660	bool pasid_mapping_needed = false;
661	unsigned int patch;
662	int r;
663
664	if (amdgpu_vmid_had_gpu_reset(adev, id)) {
665	gds_switch_needed = true;
666	vm_flush_needed = true;
667	pasid_mapping_needed = true;
668	spm_update_needed = true;
669	}
670
671	mutex_lock(&id_mgr->lock);
672	if (id->pasid != job->pasid \|\| !id->pasid_mapping \|\|
673	!dma_fence_is_signaled(fence: id->pasid_mapping))
674	pasid_mapping_needed = true;
675	mutex_unlock(lock: &id_mgr->lock);
676
677	gds_switch_needed &= !!ring->funcs->emit_gds_switch;
678	vm_flush_needed &= !!ring->funcs->emit_vm_flush &&
679	job->vm_pd_addr != AMDGPU_BO_INVALID_OFFSET;
680	pasid_mapping_needed &= adev->gmc.gmc_funcs->emit_pasid_mapping &&
681	ring->funcs->emit_wreg;
682
683	if (!vm_flush_needed && !gds_switch_needed && !need_pipe_sync)
684	return `0`;
685
686	amdgpu_ring_ib_begin(ring);
687	if (ring->funcs->init_cond_exec)
688	patch = amdgpu_ring_init_cond_exec(ring,
689	ring->cond_exe_gpu_addr);
690
691	if (need_pipe_sync)
692	amdgpu_ring_emit_pipeline_sync(ring);
693
694	if (vm_flush_needed) {
695	trace_amdgpu_vm_flush(ring, vmid: job->vmid, pd_addr: job->vm_pd_addr);
696	amdgpu_ring_emit_vm_flush(ring, job->vmid, job->vm_pd_addr);
697	}
698
699	if (pasid_mapping_needed)
700	amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
701
702	if (spm_update_needed && adev->gfx.rlc.funcs->update_spm_vmid)
703	adev->gfx.rlc.funcs->update_spm_vmid(adev, ring, job->vmid);
704
705	if (!ring->is_mes_queue && ring->funcs->emit_gds_switch &&
706	gds_switch_needed) {
707	amdgpu_ring_emit_gds_switch(ring, job->vmid, job->gds_base,
708	job->gds_size, job->gws_base,
709	job->gws_size, job->oa_base,
710	job->oa_size);
711	}
712
713	if (vm_flush_needed \|\| pasid_mapping_needed) {
714	r = amdgpu_fence_emit(ring, fence: &fence, NULL, flags: `0`);
715	if (r)
716	return r;
717	}
718
719	if (vm_flush_needed) {
720	mutex_lock(&id_mgr->lock);
721	dma_fence_put(fence: id->last_flush);
722	id->last_flush = dma_fence_get(fence);
723	id->current_gpu_reset_count =
724	atomic_read(v: &adev->gpu_reset_counter);
725	mutex_unlock(lock: &id_mgr->lock);
726	}
727
728	if (pasid_mapping_needed) {
729	mutex_lock(&id_mgr->lock);
730	id->pasid = job->pasid;
731	dma_fence_put(fence: id->pasid_mapping);
732	id->pasid_mapping = dma_fence_get(fence);
733	mutex_unlock(lock: &id_mgr->lock);
734	}
735	dma_fence_put(fence);
736
737	amdgpu_ring_patch_cond_exec(ring, offset: patch);
738
739	/ the double SWITCH_BUFFER here cannot be skipped by COND_EXEC /
740	if (ring->funcs->emit_switch_buffer) {
741	amdgpu_ring_emit_switch_buffer(ring);
742	amdgpu_ring_emit_switch_buffer(ring);
743	}
744	amdgpu_ring_ib_end(ring);
745	return `0`;
746	}
747
748	/**
749	* amdgpu_vm_bo_find - find the bo_va for a specific vm & bo
750	*
751	* @vm: requested vm
752	* @bo: requested buffer object
753	*
754	* Find @bo inside the requested vm.
755	* Search inside the @bos vm list for the requested vm
756	* Returns the found bo_va or NULL if none is found
757	*
758	* Object has to be reserved!
759	*
760	* Returns:
761	* Found bo_va or NULL.
762	*/
763	struct amdgpu_bo_va amdgpu_vm_bo_find(struct* amdgpu_vm *vm,
764	struct amdgpu_bo *bo)
765	{
766	struct amdgpu_vm_bo_base *base;
767
768	for (base = bo->vm_bo; base; base = base->next) {
769	if (base->vm != vm)
770	continue;
771
772	return container_of(base, struct amdgpu_bo_va, base);
773	}
774	return NULL;
775	}
776
777	/**
778	* amdgpu_vm_map_gart - Resolve gart mapping of addr
779	*
780	* @pages_addr: optional DMA address to use for lookup
781	* @addr: the unmapped addr
782	*
783	* Look up the physical address of the page that the pte resolves
784	* to.
785	*
786	* Returns:
787	* The pointer for the page table entry.
788	*/
789	uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr)
790	{
791	uint64_t result;
792
793	/ page table offset /
794	result = pages_addr[addr >> PAGE_SHIFT];
795
796	/ in case cpu page size != gpu page size/
797	result \|= addr & (~PAGE_MASK);
798
799	result &= `0xFFFFFFFFFFFFF000ULL`;
800
801	return result;
802	}
803
804	/**
805	* amdgpu_vm_update_pdes - make sure that all directories are valid
806	*
807	* @adev: amdgpu_device pointer
808	* @vm: requested vm
809	* @immediate: submit immediately to the paging queue
810	*
811	* Makes sure all directories are up to date.
812	*
813	* Returns:
814	* 0 for success, error for failure.
815	*/
816	int amdgpu_vm_update_pdes(struct amdgpu_device *adev,
817	struct amdgpu_vm *vm, bool immediate)
818	{
819	struct amdgpu_vm_update_params params;
820	struct amdgpu_vm_bo_base *entry;
821	bool flush_tlb_needed = false;
822	LIST_HEAD(relocated);
823	int r, idx;
824
825	spin_lock(lock: &vm->status_lock);
826	list_splice_init(list: &vm->relocated, head: &relocated);
827	spin_unlock(lock: &vm->status_lock);
828
829	if (list_empty(head: &relocated))
830	return `0`;
831
832	if (!drm_dev_enter(dev: adev_to_drm(adev), idx: &idx))
833	return -ENODEV;
834
835	memset(&params, `0`, sizeof(params));
836	params.adev = adev;
837	params.vm = vm;
838	params.immediate = immediate;
839
840	r = vm->update_funcs->prepare(&params, NULL, AMDGPU_SYNC_EXPLICIT);
841	if (r)
842	goto error;
843
844	list_for_each_entry(entry, &relocated, vm_status) {
845	/ vm_flush_needed after updating moved PDEs /
846	flush_tlb_needed \|= entry->moved;
847
848	r = amdgpu_vm_pde_update(params: &params, entry);
849	if (r)
850	goto error;
851	}
852
853	r = vm->update_funcs->commit(&params, &vm->last_update);
854	if (r)
855	goto error;
856
857	if (flush_tlb_needed)
858	atomic64_inc(v: &vm->tlb_seq);
859
860	while (!list_empty(head: &relocated)) {
861	entry = list_first_entry(&relocated, struct amdgpu_vm_bo_base,
862	vm_status);
863	amdgpu_vm_bo_idle(vm_bo: entry);
864	}
865
866	error:
867	drm_dev_exit(idx);
868	return r;
869	}
870
871	/**
872	* amdgpu_vm_tlb_seq_cb - make sure to increment tlb sequence
873	* @fence: unused
874	* @cb: the callback structure
875	*
876	* Increments the tlb sequence to make sure that future CS execute a VM flush.
877	*/
878	static void amdgpu_vm_tlb_seq_cb(struct dma_fence *fence,
879	struct dma_fence_cb *cb)
880	{
881	struct amdgpu_vm_tlb_seq_struct *tlb_cb;
882
883	tlb_cb = container_of(cb, typeof(*tlb_cb), cb);
884	atomic64_inc(v: &tlb_cb->vm->tlb_seq);
885	kfree(objp: tlb_cb);
886	}
887
888	/**
889	* amdgpu_vm_update_range - update a range in the vm page table
890	*
891	* @adev: amdgpu_device pointer to use for commands
892	* @vm: the VM to update the range
893	* @immediate: immediate submission in a page fault
894	* @unlocked: unlocked invalidation during MM callback
895	* @flush_tlb: trigger tlb invalidation after update completed
896	* @allow_override: change MTYPE for local NUMA nodes
897	* @resv: fences we need to sync to
898	* @start: start of mapped range
899	* @last: last mapped entry
900	* @flags: flags for the entries
901	* @offset: offset into nodes and pages_addr
902	* @vram_base: base for vram mappings
903	* @res: ttm_resource to map
904	* @pages_addr: DMA addresses to use for mapping
905	* @fence: optional resulting fence
906	*
907	* Fill in the page table entries between @start and @last.
908	*
909	* Returns:
910	* 0 for success, negative erro code for failure.
911	*/
912	int amdgpu_vm_update_range(struct amdgpu_device adev, struct* amdgpu_vm *vm,
913	bool immediate, bool unlocked, bool flush_tlb, bool allow_override,
914	struct dma_resv *resv, uint64_t start, uint64_t last,
915	uint64_t flags, uint64_t offset, uint64_t vram_base,
916	struct ttm_resource res, dma_addr_t pages_addr,
917	struct dma_fence **fence)
918	{
919	struct amdgpu_vm_update_params params;
920	struct amdgpu_vm_tlb_seq_struct *tlb_cb;
921	struct amdgpu_res_cursor cursor;
922	enum amdgpu_sync_mode sync_mode;
923	int r, idx;
924
925	if (!drm_dev_enter(dev: adev_to_drm(adev), idx: &idx))
926	return -ENODEV;
927
928	tlb_cb = kmalloc(size: sizeof(*tlb_cb), GFP_KERNEL);
929	if (!tlb_cb) {
930	r = -ENOMEM;
931	goto error_unlock;
932	}
933
934	/ Vega20+XGMI where PTEs get inadvertently cached in L2 texture cache,*
935	* heavy-weight flush TLB unconditionally.
936	*/
937	flush_tlb \|= adev->gmc.xgmi.num_physical_nodes &&
938	amdgpu_ip_version(adev, ip: GC_HWIP, inst: `0`) == IP_VERSION(`9`, `4`, `0`);
939
940	/*
941	* On GFX8 and older any 8 PTE block with a valid bit set enters the TLB
942	*/
943	flush_tlb \|= amdgpu_ip_version(adev, ip: GC_HWIP, inst: `0`) < IP_VERSION(`9`, `0`, `0`);
944
945	memset(&params, `0`, sizeof(params));
946	params.adev = adev;
947	params.vm = vm;
948	params.immediate = immediate;
949	params.pages_addr = pages_addr;
950	params.unlocked = unlocked;
951	params.allow_override = allow_override;
952
953	/ Implicitly sync to command submissions in the same VM before*
954	* unmapping. Sync to moving fences before mapping.
955	*/
956	if (!(flags & AMDGPU_PTE_VALID))
957	sync_mode = AMDGPU_SYNC_EQ_OWNER;
958	else
959	sync_mode = AMDGPU_SYNC_EXPLICIT;
960
961	amdgpu_vm_eviction_lock(vm);
962	if (vm->evicting) {
963	r = -EBUSY;
964	goto error_free;
965	}
966
967	if (!unlocked && !dma_fence_is_signaled(fence: vm->last_unlocked)) {
968	struct dma_fence *tmp = dma_fence_get_stub();
969
970	amdgpu_bo_fence(bo: vm->root.bo, fence: vm->last_unlocked, shared: true);
971	swap(vm->last_unlocked, tmp);
972	dma_fence_put(fence: tmp);
973	}
974
975	r = vm->update_funcs->prepare(&params, resv, sync_mode);
976	if (r)
977	goto error_free;
978
979	amdgpu_res_first(res: pages_addr ? NULL : res, start: offset,
980	size: (last - start + `1`) * AMDGPU_GPU_PAGE_SIZE, cur: &cursor);
981	while (cursor.remaining) {
982	uint64_t tmp, num_entries, addr;
983
984	num_entries = cursor.size >> AMDGPU_GPU_PAGE_SHIFT;
985	if (pages_addr) {
986	bool contiguous = true;
987
988	if (num_entries > AMDGPU_GPU_PAGES_IN_CPU_PAGE) {
989	uint64_t pfn = cursor.start >> PAGE_SHIFT;
990	uint64_t count;
991
992	contiguous = pages_addr[pfn + `1`] ==
993	pages_addr[pfn] + PAGE_SIZE;
994
995	tmp = num_entries /
996	AMDGPU_GPU_PAGES_IN_CPU_PAGE;
997	for (count = `2`; count < tmp; ++count) {
998	uint64_t idx = pfn + count;
999
1000	if (contiguous != (pages_addr[idx] ==
1001	pages_addr[idx - `1`] + PAGE_SIZE))
1002	break;
1003	}
1004	if (!contiguous)
1005	count--;
1006	num_entries = count *
1007	AMDGPU_GPU_PAGES_IN_CPU_PAGE;
1008	}
1009
1010	if (!contiguous) {
1011	addr = cursor.start;
1012	params.pages_addr = pages_addr;
1013	} else {
1014	addr = pages_addr[cursor.start >> PAGE_SHIFT];
1015	params.pages_addr = NULL;
1016	}
1017
1018	} else if (flags & (AMDGPU_PTE_VALID \| AMDGPU_PTE_PRT)) {
1019	addr = vram_base + cursor.start;
1020	} else {
1021	addr = `0`;
1022	}
1023
1024	tmp = start + num_entries;
1025	r = amdgpu_vm_ptes_update(params: &params, start, end: tmp, dst: addr, flags);
1026	if (r)
1027	goto error_free;
1028
1029	amdgpu_res_next(cur: &cursor, size: num_entries * AMDGPU_GPU_PAGE_SIZE);
1030	start = tmp;
1031	}
1032
1033	r = vm->update_funcs->commit(&params, fence);
1034
1035	if (flush_tlb \|\| params.table_freed) {
1036	tlb_cb->vm = vm;
1037	if (fence && *fence &&
1038	!dma_fence_add_callback(fence: *fence, cb: &tlb_cb->cb,
1039	func: amdgpu_vm_tlb_seq_cb)) {
1040	dma_fence_put(fence: vm->last_tlb_flush);
1041	vm->last_tlb_flush = dma_fence_get(fence: *fence);
1042	} else {
1043	amdgpu_vm_tlb_seq_cb(NULL, cb: &tlb_cb->cb);
1044	}
1045	tlb_cb = NULL;
1046	}
1047
1048	error_free:
1049	kfree(objp: tlb_cb);
1050
1051	error_unlock:
1052	amdgpu_vm_eviction_unlock(vm);
1053	drm_dev_exit(idx);
1054	return r;
1055	}
1056
1057	static void amdgpu_vm_bo_get_memory(struct amdgpu_bo_va *bo_va,
1058	struct amdgpu_mem_stats *stats)
1059	{
1060	struct amdgpu_vm *vm = bo_va->base.vm;
1061	struct amdgpu_bo *bo = bo_va->base.bo;
1062
1063	if (!bo)
1064	return;
1065
1066	/*
1067	* For now ignore BOs which are currently locked and potentially
1068	* changing their location.
1069	*/
1070	if (bo->tbo.base.resv != vm->root.bo->tbo.base.resv &&
1071	!dma_resv_trylock(obj: bo->tbo.base.resv))
1072	return;
1073
1074	amdgpu_bo_get_memory(bo, stats);
1075	if (bo->tbo.base.resv != vm->root.bo->tbo.base.resv)
1076	dma_resv_unlock(obj: bo->tbo.base.resv);
1077	}
1078
1079	void amdgpu_vm_get_memory(struct amdgpu_vm *vm,
1080	struct amdgpu_mem_stats *stats)
1081	{
1082	struct amdgpu_bo_va bo_va, tmp;
1083
1084	spin_lock(lock: &vm->status_lock);
1085	list_for_each_entry_safe(bo_va, tmp, &vm->idle, base.vm_status)
1086	amdgpu_vm_bo_get_memory(bo_va, stats);
1087
1088	list_for_each_entry_safe(bo_va, tmp, &vm->evicted, base.vm_status)
1089	amdgpu_vm_bo_get_memory(bo_va, stats);
1090
1091	list_for_each_entry_safe(bo_va, tmp, &vm->relocated, base.vm_status)
1092	amdgpu_vm_bo_get_memory(bo_va, stats);
1093
1094	list_for_each_entry_safe(bo_va, tmp, &vm->moved, base.vm_status)
1095	amdgpu_vm_bo_get_memory(bo_va, stats);
1096
1097	list_for_each_entry_safe(bo_va, tmp, &vm->invalidated, base.vm_status)
1098	amdgpu_vm_bo_get_memory(bo_va, stats);
1099
1100	list_for_each_entry_safe(bo_va, tmp, &vm->done, base.vm_status)
1101	amdgpu_vm_bo_get_memory(bo_va, stats);
1102	spin_unlock(lock: &vm->status_lock);
1103	}
1104
1105	/**
1106	* amdgpu_vm_bo_update - update all BO mappings in the vm page table
1107	*
1108	* @adev: amdgpu_device pointer
1109	* @bo_va: requested BO and VM object
1110	* @clear: if true clear the entries
1111	*
1112	* Fill in the page table entries for @bo_va.
1113	*
1114	* Returns:
1115	* 0 for success, -EINVAL for failure.
1116	*/
1117	int amdgpu_vm_bo_update(struct amdgpu_device adev, struct* amdgpu_bo_va *bo_va,
1118	bool clear)
1119	{
1120	struct amdgpu_bo *bo = bo_va->base.bo;
1121	struct amdgpu_vm *vm = bo_va->base.vm;
1122	struct amdgpu_bo_va_mapping *mapping;
1123	dma_addr_t *pages_addr = NULL;
1124	struct ttm_resource *mem;
1125	struct dma_fence **last_update;
1126	bool flush_tlb = clear;
1127	bool uncached;
1128	struct dma_resv *resv;
1129	uint64_t vram_base;
1130	uint64_t flags;
1131	int r;
1132
1133	if (clear \|\| !bo) {
1134	mem = NULL;
1135	resv = vm->root.bo->tbo.base.resv;
1136	} else {
1137	struct drm_gem_object *obj = &bo->tbo.base;
1138
1139	resv = bo->tbo.base.resv;
1140	if (obj->import_attach && bo_va->is_xgmi) {
1141	struct dma_buf *dma_buf = obj->import_attach->dmabuf;
1142	struct drm_gem_object *gobj = dma_buf->priv;
1143	struct amdgpu_bo *abo = gem_to_amdgpu_bo(gobj);
1144
1145	if (abo->tbo.resource &&
1146	abo->tbo.resource->mem_type == TTM_PL_VRAM)
1147	bo = gem_to_amdgpu_bo(gobj);
1148	}
1149	mem = bo->tbo.resource;
1150	if (mem && (mem->mem_type == TTM_PL_TT \|\|
1151	mem->mem_type == AMDGPU_PL_PREEMPT))
1152	pages_addr = bo->tbo.ttm->dma_address;
1153	}
1154
1155	if (bo) {
1156	struct amdgpu_device *bo_adev;
1157
1158	flags = amdgpu_ttm_tt_pte_flags(adev, ttm: bo->tbo.ttm, mem);
1159
1160	if (amdgpu_bo_encrypted(bo))
1161	flags \|= AMDGPU_PTE_TMZ;
1162
1163	bo_adev = amdgpu_ttm_adev(bdev: bo->tbo.bdev);
1164	vram_base = bo_adev->vm_manager.vram_base_offset;
1165	uncached = (bo->flags & AMDGPU_GEM_CREATE_UNCACHED) != `0`;
1166	} else {
1167	flags = `0x0`;
1168	vram_base = `0`;
1169	uncached = false;
1170	}
1171
1172	if (clear \|\| (bo && bo->tbo.base.resv ==
1173	vm->root.bo->tbo.base.resv))
1174	last_update = &vm->last_update;
1175	else
1176	last_update = &bo_va->last_pt_update;
1177
1178	if (!clear && bo_va->base.moved) {
1179	flush_tlb = true;
1180	list_splice_init(list: &bo_va->valids, head: &bo_va->invalids);
1181
1182	} else if (bo_va->cleared != clear) {
1183	list_splice_init(list: &bo_va->valids, head: &bo_va->invalids);
1184	}
1185
1186	list_for_each_entry(mapping, &bo_va->invalids, list) {
1187	uint64_t update_flags = flags;
1188
1189	/ normally,bo_va->flags only contians READABLE and WIRTEABLE bit go here*
1190	* but in case of something, we filter the flags in first place
1191	*/
1192	if (!(mapping->flags & AMDGPU_PTE_READABLE))
1193	update_flags &= ~AMDGPU_PTE_READABLE;
1194	if (!(mapping->flags & AMDGPU_PTE_WRITEABLE))
1195	update_flags &= ~AMDGPU_PTE_WRITEABLE;
1196
1197	/ Apply ASIC specific mapping flags /
1198	amdgpu_gmc_get_vm_pte(adev, mapping, &update_flags);
1199
1200	trace_amdgpu_vm_bo_update(mapping);
1201
1202	r = amdgpu_vm_update_range(adev, vm, immediate: false, unlocked: false, flush_tlb,
1203	allow_override: !uncached, resv, start: mapping->start, last: mapping->last,
1204	flags: update_flags, offset: mapping->offset,
1205	vram_base, res: mem, pages_addr,
1206	fence: last_update);
1207	if (r)
1208	return r;
1209	}
1210
1211	/ If the BO is not in its preferred location add it back to*
1212	* the evicted list so that it gets validated again on the
1213	* next command submission.
1214	*/
1215	if (bo && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) {
1216	uint32_t mem_type = bo->tbo.resource->mem_type;
1217
1218	if (!(bo->preferred_domains &
1219	amdgpu_mem_type_to_domain(mem_type)))
1220	amdgpu_vm_bo_evicted(vm_bo: &bo_va->base);
1221	else
1222	amdgpu_vm_bo_idle(vm_bo: &bo_va->base);
1223	} else {
1224	amdgpu_vm_bo_done(vm_bo: &bo_va->base);
1225	}
1226
1227	list_splice_init(list: &bo_va->invalids, head: &bo_va->valids);
1228	bo_va->cleared = clear;
1229	bo_va->base.moved = false;
1230
1231	if (trace_amdgpu_vm_bo_mapping_enabled()) {
1232	list_for_each_entry(mapping, &bo_va->valids, list)
1233	trace_amdgpu_vm_bo_mapping(mapping);
1234	}
1235
1236	return `0`;
1237	}
1238
1239	/**
1240	* amdgpu_vm_update_prt_state - update the global PRT state
1241	*
1242	* @adev: amdgpu_device pointer
1243	*/
1244	static void amdgpu_vm_update_prt_state(struct amdgpu_device *adev)
1245	{
1246	unsigned long flags;
1247	bool enable;
1248
1249	spin_lock_irqsave(&adev->vm_manager.prt_lock, flags);
1250	enable = !!atomic_read(v: &adev->vm_manager.num_prt_users);
1251	adev->gmc.gmc_funcs->set_prt(adev, enable);
1252	spin_unlock_irqrestore(lock: &adev->vm_manager.prt_lock, flags);
1253	}
1254
1255	/**
1256	* amdgpu_vm_prt_get - add a PRT user
1257	*
1258	* @adev: amdgpu_device pointer
1259	*/
1260	static void amdgpu_vm_prt_get(struct amdgpu_device *adev)
1261	{
1262	if (!adev->gmc.gmc_funcs->set_prt)
1263	return;
1264
1265	if (atomic_inc_return(v: &adev->vm_manager.num_prt_users) == `1`)
1266	amdgpu_vm_update_prt_state(adev);
1267	}
1268
1269	/**
1270	* amdgpu_vm_prt_put - drop a PRT user
1271	*
1272	* @adev: amdgpu_device pointer
1273	*/
1274	static void amdgpu_vm_prt_put(struct amdgpu_device *adev)
1275	{
1276	if (atomic_dec_return(v: &adev->vm_manager.num_prt_users) == `0`)
1277	amdgpu_vm_update_prt_state(adev);
1278	}
1279
1280	/**
1281	* amdgpu_vm_prt_cb - callback for updating the PRT status
1282	*
1283	* @fence: fence for the callback
1284	* @_cb: the callback function
1285	*/
1286	static void amdgpu_vm_prt_cb(struct dma_fence fence, struct* dma_fence_cb *_cb)
1287	{
1288	struct amdgpu_prt_cb cb = container_of(_cb, struct* amdgpu_prt_cb, cb);
1289
1290	amdgpu_vm_prt_put(adev: cb->adev);
1291	kfree(objp: cb);
1292	}
1293
1294	/**
1295	* amdgpu_vm_add_prt_cb - add callback for updating the PRT status
1296	*
1297	* @adev: amdgpu_device pointer
1298	* @fence: fence for the callback
1299	*/
1300	static void amdgpu_vm_add_prt_cb(struct amdgpu_device *adev,
1301	struct dma_fence *fence)
1302	{
1303	struct amdgpu_prt_cb *cb;
1304
1305	if (!adev->gmc.gmc_funcs->set_prt)
1306	return;
1307
1308	cb = kmalloc(size: sizeof(struct amdgpu_prt_cb), GFP_KERNEL);
1309	if (!cb) {
1310	/ Last resort when we are OOM /
1311	if (fence)
1312	dma_fence_wait(fence, intr: false);
1313
1314	amdgpu_vm_prt_put(adev);
1315	} else {
1316	cb->adev = adev;
1317	if (!fence \|\| dma_fence_add_callback(fence, cb: &cb->cb,
1318	func: amdgpu_vm_prt_cb))
1319	amdgpu_vm_prt_cb(fence, cb: &cb->cb);
1320	}
1321	}
1322
1323	/**
1324	* amdgpu_vm_free_mapping - free a mapping
1325	*
1326	* @adev: amdgpu_device pointer
1327	* @vm: requested vm
1328	* @mapping: mapping to be freed
1329	* @fence: fence of the unmap operation
1330	*
1331	* Free a mapping and make sure we decrease the PRT usage count if applicable.
1332	*/
1333	static void amdgpu_vm_free_mapping(struct amdgpu_device *adev,
1334	struct amdgpu_vm *vm,
1335	struct amdgpu_bo_va_mapping *mapping,
1336	struct dma_fence *fence)
1337	{
1338	if (mapping->flags & AMDGPU_PTE_PRT)
1339	amdgpu_vm_add_prt_cb(adev, fence);
1340	kfree(objp: mapping);
1341	}
1342
1343	/**
1344	* amdgpu_vm_prt_fini - finish all prt mappings
1345	*
1346	* @adev: amdgpu_device pointer
1347	* @vm: requested vm
1348	*
1349	* Register a cleanup callback to disable PRT support after VM dies.
1350	*/
1351	static void amdgpu_vm_prt_fini(struct amdgpu_device adev, struct* amdgpu_vm *vm)
1352	{
1353	struct dma_resv *resv = vm->root.bo->tbo.base.resv;
1354	struct dma_resv_iter cursor;
1355	struct dma_fence *fence;
1356
1357	dma_resv_for_each_fence(&cursor, resv, DMA_RESV_USAGE_BOOKKEEP, fence) {
1358	/ Add a callback for each fence in the reservation object /
1359	amdgpu_vm_prt_get(adev);
1360	amdgpu_vm_add_prt_cb(adev, fence);
1361	}
1362	}
1363
1364	/**
1365	* amdgpu_vm_clear_freed - clear freed BOs in the PT
1366	*
1367	* @adev: amdgpu_device pointer
1368	* @vm: requested vm
1369	* @fence: optional resulting fence (unchanged if no work needed to be done
1370	* or if an error occurred)
1371	*
1372	* Make sure all freed BOs are cleared in the PT.
1373	* PTs have to be reserved and mutex must be locked!
1374	*
1375	* Returns:
1376	* 0 for success.
1377	*
1378	*/
1379	int amdgpu_vm_clear_freed(struct amdgpu_device *adev,
1380	struct amdgpu_vm *vm,
1381	struct dma_fence **fence)
1382	{
1383	struct dma_resv *resv = vm->root.bo->tbo.base.resv;
1384	struct amdgpu_bo_va_mapping *mapping;
1385	uint64_t init_pte_value = `0`;
1386	struct dma_fence *f = NULL;
1387	int r;
1388
1389	while (!list_empty(head: &vm->freed)) {
1390	mapping = list_first_entry(&vm->freed,
1391	struct amdgpu_bo_va_mapping, list);
1392	list_del(entry: &mapping->list);
1393
1394	r = amdgpu_vm_update_range(adev, vm, immediate: false, unlocked: false, flush_tlb: true, allow_override: false,
1395	resv, start: mapping->start, last: mapping->last,
1396	flags: init_pte_value, offset: `0`, vram_base: `0`, NULL, NULL,
1397	fence: &f);
1398	amdgpu_vm_free_mapping(adev, vm, mapping, fence: f);
1399	if (r) {
1400	dma_fence_put(fence: f);
1401	return r;
1402	}
1403	}
1404
1405	if (fence && f) {
1406	dma_fence_put(fence: *fence);
1407	*fence = f;
1408	} else {
1409	dma_fence_put(fence: f);
1410	}
1411
1412	return `0`;
1413
1414	}
1415
1416	/**
1417	* amdgpu_vm_handle_moved - handle moved BOs in the PT
1418	*
1419	* @adev: amdgpu_device pointer
1420	* @vm: requested vm
1421	* @ticket: optional reservation ticket used to reserve the VM
1422	*
1423	* Make sure all BOs which are moved are updated in the PTs.
1424	*
1425	* Returns:
1426	* 0 for success.
1427	*
1428	* PTs have to be reserved!
1429	*/
1430	int amdgpu_vm_handle_moved(struct amdgpu_device *adev,
1431	struct amdgpu_vm *vm,
1432	struct ww_acquire_ctx *ticket)
1433	{
1434	struct amdgpu_bo_va *bo_va;
1435	struct dma_resv *resv;
1436	bool clear, unlock;
1437	int r;
1438
1439	spin_lock(lock: &vm->status_lock);
1440	while (!list_empty(head: &vm->moved)) {
1441	bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va,
1442	base.vm_status);
1443	spin_unlock(lock: &vm->status_lock);
1444
1445	/ Per VM BOs never need to bo cleared in the page tables /
1446	r = amdgpu_vm_bo_update(adev, bo_va, clear: false);
1447	if (r)
1448	return r;
1449	spin_lock(lock: &vm->status_lock);
1450	}
1451
1452	while (!list_empty(head: &vm->invalidated)) {
1453	bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
1454	base.vm_status);
1455	resv = bo_va->base.bo->tbo.base.resv;
1456	spin_unlock(lock: &vm->status_lock);
1457
1458	/ Try to reserve the BO to avoid clearing its ptes /
1459	if (!adev->debug_vm && dma_resv_trylock(obj: resv)) {
1460	clear = false;
1461	unlock = true;
1462	/ The caller is already holding the reservation lock /
1463	} else if (ticket && dma_resv_locking_ctx(obj: resv) == ticket) {
1464	clear = false;
1465	unlock = false;
1466	/ Somebody else is using the BO right now /
1467	} else {
1468	clear = true;
1469	unlock = false;
1470	}
1471
1472	r = amdgpu_vm_bo_update(adev, bo_va, clear);
1473
1474	if (unlock)
1475	dma_resv_unlock(obj: resv);
1476	if (r)
1477	return r;
1478
1479	/ Remember evicted DMABuf imports in compute VMs for later*
1480	* validation
1481	*/
1482	if (vm->is_compute_context &&
1483	bo_va->base.bo->tbo.base.import_attach &&
1484	(!bo_va->base.bo->tbo.resource \|\|
1485	bo_va->base.bo->tbo.resource->mem_type == TTM_PL_SYSTEM))
1486	amdgpu_vm_bo_evicted_user(vm_bo: &bo_va->base);
1487
1488	spin_lock(lock: &vm->status_lock);
1489	}
1490	spin_unlock(lock: &vm->status_lock);
1491
1492	return `0`;
1493	}
1494
1495	/**
1496	* amdgpu_vm_flush_compute_tlb - Flush TLB on compute VM
1497	*
1498	* @adev: amdgpu_device pointer
1499	* @vm: requested vm
1500	* @flush_type: flush type
1501	* @xcc_mask: mask of XCCs that belong to the compute partition in need of a TLB flush.
1502	*
1503	* Flush TLB if needed for a compute VM.
1504	*
1505	* Returns:
1506	* 0 for success.
1507	*/
1508	int amdgpu_vm_flush_compute_tlb(struct amdgpu_device *adev,
1509	struct amdgpu_vm *vm,
1510	uint32_t flush_type,
1511	uint32_t xcc_mask)
1512	{
1513	uint64_t tlb_seq = amdgpu_vm_tlb_seq(vm);
1514	bool all_hub = false;
1515	int xcc = `0`, r = `0`;
1516
1517	WARN_ON_ONCE(!vm->is_compute_context);
1518
1519	/*
1520	* It can be that we race and lose here, but that is extremely unlikely
1521	* and the worst thing which could happen is that we flush the changes
1522	* into the TLB once more which is harmless.
1523	*/
1524	if (atomic64_xchg(v: &vm->kfd_last_flushed_seq, new: tlb_seq) == tlb_seq)
1525	return `0`;
1526
1527	if (adev->family == AMDGPU_FAMILY_AI \|\|
1528	adev->family == AMDGPU_FAMILY_RV)
1529	all_hub = true;
1530
1531	for_each_inst(xcc, xcc_mask) {
1532	r = amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid: vm->pasid, flush_type,
1533	all_hub, inst: xcc);
1534	if (r)
1535	break;
1536	}
1537	return r;
1538	}
1539
1540	/**
1541	* amdgpu_vm_bo_add - add a bo to a specific vm
1542	*
1543	* @adev: amdgpu_device pointer
1544	* @vm: requested vm
1545	* @bo: amdgpu buffer object
1546	*
1547	* Add @bo into the requested vm.
1548	* Add @bo to the list of bos associated with the vm
1549	*
1550	* Returns:
1551	* Newly added bo_va or NULL for failure
1552	*
1553	* Object has to be reserved!
1554	*/
1555	struct amdgpu_bo_va amdgpu_vm_bo_add(struct* amdgpu_device *adev,
1556	struct amdgpu_vm *vm,
1557	struct amdgpu_bo *bo)
1558	{
1559	struct amdgpu_bo_va *bo_va;
1560
1561	bo_va = kzalloc(size: sizeof(struct amdgpu_bo_va), GFP_KERNEL);
1562	if (bo_va == NULL) {
1563	return NULL;
1564	}
1565	amdgpu_vm_bo_base_init(base: &bo_va->base, vm, bo);
1566
1567	bo_va->ref_count = `1`;
1568	bo_va->last_pt_update = dma_fence_get_stub();
1569	INIT_LIST_HEAD(list: &bo_va->valids);
1570	INIT_LIST_HEAD(list: &bo_va->invalids);
1571
1572	if (!bo)
1573	return bo_va;
1574
1575	dma_resv_assert_held(bo->tbo.base.resv);
1576	if (amdgpu_dmabuf_is_xgmi_accessible(adev, bo)) {
1577	bo_va->is_xgmi = true;
1578	/ Power up XGMI if it can be potentially used /
1579	amdgpu_xgmi_set_pstate(adev, pstate: AMDGPU_XGMI_PSTATE_MAX_VEGA20);
1580	}
1581
1582	return bo_va;
1583	}
1584
1585
1586	/**
1587	* amdgpu_vm_bo_insert_map - insert a new mapping
1588	*
1589	* @adev: amdgpu_device pointer
1590	* @bo_va: bo_va to store the address
1591	* @mapping: the mapping to insert
1592	*
1593	* Insert a new mapping into all structures.
1594	*/
1595	static void amdgpu_vm_bo_insert_map(struct amdgpu_device *adev,
1596	struct amdgpu_bo_va *bo_va,
1597	struct amdgpu_bo_va_mapping *mapping)
1598	{
1599	struct amdgpu_vm *vm = bo_va->base.vm;
1600	struct amdgpu_bo *bo = bo_va->base.bo;
1601
1602	mapping->bo_va = bo_va;
1603	list_add(new: &mapping->list, head: &bo_va->invalids);
1604	amdgpu_vm_it_insert(node: mapping, root: &vm->va);
1605
1606	if (mapping->flags & AMDGPU_PTE_PRT)
1607	amdgpu_vm_prt_get(adev);
1608
1609	if (bo && bo->tbo.base.resv == vm->root.bo->tbo.base.resv &&
1610	!bo_va->base.moved) {
1611	amdgpu_vm_bo_moved(vm_bo: &bo_va->base);
1612	}
1613	trace_amdgpu_vm_bo_map(bo_va, mapping);
1614	}
1615
1616	/ Validate operation parameters to prevent potential abuse /
1617	static int amdgpu_vm_verify_parameters(struct amdgpu_device *adev,
1618	struct amdgpu_bo *bo,
1619	uint64_t saddr,
1620	uint64_t offset,
1621	uint64_t size)
1622	{
1623	uint64_t tmp, lpfn;
1624
1625	if (saddr & AMDGPU_GPU_PAGE_MASK
1626	\|\| offset & AMDGPU_GPU_PAGE_MASK
1627	\|\| size & AMDGPU_GPU_PAGE_MASK)
1628	return -EINVAL;
1629
1630	if (check_add_overflow(saddr, size, &tmp)
1631	\|\| check_add_overflow(offset, size, &tmp)
1632	\|\| size == `0` / which also leads to end < begin /)
1633	return -EINVAL;
1634
1635	/ make sure object fit at this offset /
1636	if (bo && offset + size > amdgpu_bo_size(bo))
1637	return -EINVAL;
1638
1639	/ Ensure last pfn not exceed max_pfn /
1640	lpfn = (saddr + size - `1`) >> AMDGPU_GPU_PAGE_SHIFT;
1641	if (lpfn >= adev->vm_manager.max_pfn)
1642	return -EINVAL;
1643
1644	return `0`;
1645	}
1646
1647	/**
1648	* amdgpu_vm_bo_map - map bo inside a vm
1649	*
1650	* @adev: amdgpu_device pointer
1651	* @bo_va: bo_va to store the address
1652	* @saddr: where to map the BO
1653	* @offset: requested offset in the BO
1654	* @size: BO size in bytes
1655	* @flags: attributes of pages (read/write/valid/etc.)
1656	*
1657	* Add a mapping of the BO at the specefied addr into the VM.
1658	*
1659	* Returns:
1660	* 0 for success, error for failure.
1661	*
1662	* Object has to be reserved and unreserved outside!
1663	*/
1664	int amdgpu_vm_bo_map(struct amdgpu_device *adev,
1665	struct amdgpu_bo_va *bo_va,
1666	uint64_t saddr, uint64_t offset,
1667	uint64_t size, uint64_t flags)
1668	{
1669	struct amdgpu_bo_va_mapping mapping, tmp;
1670	struct amdgpu_bo *bo = bo_va->base.bo;
1671	struct amdgpu_vm *vm = bo_va->base.vm;
1672	uint64_t eaddr;
1673	int r;
1674
1675	r = amdgpu_vm_verify_parameters(adev, bo, saddr, offset, size);
1676	if (r)
1677	return r;
1678
1679	saddr /= AMDGPU_GPU_PAGE_SIZE;
1680	eaddr = saddr + (size - `1`) / AMDGPU_GPU_PAGE_SIZE;
1681
1682	tmp = amdgpu_vm_it_iter_first(root: &vm->va, start: saddr, last: eaddr);
1683	if (tmp) {
1684	/ bo and tmp overlap, invalid addr /
1685	dev_err(adev->dev, "bo %p va 0x%010Lx-0x%010Lx conflict with "
1686	"0x%010Lx-0x%010Lx\n", bo, saddr, eaddr,
1687	tmp->start, tmp->last + `1`);
1688	return -EINVAL;
1689	}
1690
1691	mapping = kmalloc(size: sizeof(*mapping), GFP_KERNEL);
1692	if (!mapping)
1693	return -ENOMEM;
1694
1695	mapping->start = saddr;
1696	mapping->last = eaddr;
1697	mapping->offset = offset;
1698	mapping->flags = flags;
1699
1700	amdgpu_vm_bo_insert_map(adev, bo_va, mapping);
1701
1702	return `0`;
1703	}
1704
1705	/**
1706	* amdgpu_vm_bo_replace_map - map bo inside a vm, replacing existing mappings
1707	*
1708	* @adev: amdgpu_device pointer
1709	* @bo_va: bo_va to store the address
1710	* @saddr: where to map the BO
1711	* @offset: requested offset in the BO
1712	* @size: BO size in bytes
1713	* @flags: attributes of pages (read/write/valid/etc.)
1714	*
1715	* Add a mapping of the BO at the specefied addr into the VM. Replace existing
1716	* mappings as we do so.
1717	*
1718	* Returns:
1719	* 0 for success, error for failure.
1720	*
1721	* Object has to be reserved and unreserved outside!
1722	*/
1723	int amdgpu_vm_bo_replace_map(struct amdgpu_device *adev,
1724	struct amdgpu_bo_va *bo_va,
1725	uint64_t saddr, uint64_t offset,
1726	uint64_t size, uint64_t flags)
1727	{
1728	struct amdgpu_bo_va_mapping *mapping;
1729	struct amdgpu_bo *bo = bo_va->base.bo;
1730	uint64_t eaddr;
1731	int r;
1732
1733	r = amdgpu_vm_verify_parameters(adev, bo, saddr, offset, size);
1734	if (r)
1735	return r;
1736
1737	/ Allocate all the needed memory /
1738	mapping = kmalloc(size: sizeof(*mapping), GFP_KERNEL);
1739	if (!mapping)
1740	return -ENOMEM;
1741
1742	r = amdgpu_vm_bo_clear_mappings(adev, vm: bo_va->base.vm, saddr, size);
1743	if (r) {
1744	kfree(objp: mapping);
1745	return r;
1746	}
1747
1748	saddr /= AMDGPU_GPU_PAGE_SIZE;
1749	eaddr = saddr + (size - `1`) / AMDGPU_GPU_PAGE_SIZE;
1750
1751	mapping->start = saddr;
1752	mapping->last = eaddr;
1753	mapping->offset = offset;
1754	mapping->flags = flags;
1755
1756	amdgpu_vm_bo_insert_map(adev, bo_va, mapping);
1757
1758	return `0`;
1759	}
1760
1761	/**
1762	* amdgpu_vm_bo_unmap - remove bo mapping from vm
1763	*
1764	* @adev: amdgpu_device pointer
1765	* @bo_va: bo_va to remove the address from
1766	* @saddr: where to the BO is mapped
1767	*
1768	* Remove a mapping of the BO at the specefied addr from the VM.
1769	*
1770	* Returns:
1771	* 0 for success, error for failure.
1772	*
1773	* Object has to be reserved and unreserved outside!
1774	*/
1775	int amdgpu_vm_bo_unmap(struct amdgpu_device *adev,
1776	struct amdgpu_bo_va *bo_va,
1777	uint64_t saddr)
1778	{
1779	struct amdgpu_bo_va_mapping *mapping;
1780	struct amdgpu_vm *vm = bo_va->base.vm;
1781	bool valid = true;
1782
1783	saddr /= AMDGPU_GPU_PAGE_SIZE;
1784
1785	list_for_each_entry(mapping, &bo_va->valids, list) {
1786	if (mapping->start == saddr)
1787	break;
1788	}
1789
1790	if (&mapping->list == &bo_va->valids) {
1791	valid = false;
1792
1793	list_for_each_entry(mapping, &bo_va->invalids, list) {
1794	if (mapping->start == saddr)
1795	break;
1796	}
1797
1798	if (&mapping->list == &bo_va->invalids)
1799	return -ENOENT;
1800	}
1801
1802	list_del(entry: &mapping->list);
1803	amdgpu_vm_it_remove(node: mapping, root: &vm->va);
1804	mapping->bo_va = NULL;
1805	trace_amdgpu_vm_bo_unmap(bo_va, mapping);
1806
1807	if (valid)
1808	list_add(new: &mapping->list, head: &vm->freed);
1809	else
1810	amdgpu_vm_free_mapping(adev, vm, mapping,
1811	fence: bo_va->last_pt_update);
1812
1813	return `0`;
1814	}
1815
1816	/**
1817	* amdgpu_vm_bo_clear_mappings - remove all mappings in a specific range
1818	*
1819	* @adev: amdgpu_device pointer
1820	* @vm: VM structure to use
1821	* @saddr: start of the range
1822	* @size: size of the range
1823	*
1824	* Remove all mappings in a range, split them as appropriate.
1825	*
1826	* Returns:
1827	* 0 for success, error for failure.
1828	*/
1829	int amdgpu_vm_bo_clear_mappings(struct amdgpu_device *adev,
1830	struct amdgpu_vm *vm,
1831	uint64_t saddr, uint64_t size)
1832	{
1833	struct amdgpu_bo_va_mapping before, after, tmp, next;
1834	LIST_HEAD(removed);
1835	uint64_t eaddr;
1836	int r;
1837
1838	r = amdgpu_vm_verify_parameters(adev, NULL, saddr, offset: `0`, size);
1839	if (r)
1840	return r;
1841
1842	saddr /= AMDGPU_GPU_PAGE_SIZE;
1843	eaddr = saddr + (size - `1`) / AMDGPU_GPU_PAGE_SIZE;
1844
1845	/ Allocate all the needed memory /
1846	before = kzalloc(size: sizeof(*before), GFP_KERNEL);
1847	if (!before)
1848	return -ENOMEM;
1849	INIT_LIST_HEAD(list: &before->list);
1850
1851	after = kzalloc(size: sizeof(*after), GFP_KERNEL);
1852	if (!after) {
1853	kfree(objp: before);
1854	return -ENOMEM;
1855	}
1856	INIT_LIST_HEAD(list: &after->list);
1857
1858	/ Now gather all removed mappings /
1859	tmp = amdgpu_vm_it_iter_first(root: &vm->va, start: saddr, last: eaddr);
1860	while (tmp) {
1861	/ Remember mapping split at the start /
1862	if (tmp->start < saddr) {
1863	before->start = tmp->start;
1864	before->last = saddr - `1`;
1865	before->offset = tmp->offset;
1866	before->flags = tmp->flags;
1867	before->bo_va = tmp->bo_va;
1868	list_add(new: &before->list, head: &tmp->bo_va->invalids);
1869	}
1870
1871	/ Remember mapping split at the end /
1872	if (tmp->last > eaddr) {
1873	after->start = eaddr + `1`;
1874	after->last = tmp->last;
1875	after->offset = tmp->offset;
1876	after->offset += (after->start - tmp->start) << PAGE_SHIFT;
1877	after->flags = tmp->flags;
1878	after->bo_va = tmp->bo_va;
1879	list_add(new: &after->list, head: &tmp->bo_va->invalids);
1880	}
1881
1882	list_del(entry: &tmp->list);
1883	list_add(new: &tmp->list, head: &removed);
1884
1885	tmp = amdgpu_vm_it_iter_next(node: tmp, start: saddr, last: eaddr);
1886	}
1887
1888	/ And free them up /
1889	list_for_each_entry_safe(tmp, next, &removed, list) {
1890	amdgpu_vm_it_remove(node: tmp, root: &vm->va);
1891	list_del(entry: &tmp->list);
1892
1893	if (tmp->start < saddr)
1894	tmp->start = saddr;
1895	if (tmp->last > eaddr)
1896	tmp->last = eaddr;
1897
1898	tmp->bo_va = NULL;
1899	list_add(new: &tmp->list, head: &vm->freed);
1900	trace_amdgpu_vm_bo_unmap(NULL, mapping: tmp);
1901	}
1902
1903	/ Insert partial mapping before the range /
1904	if (!list_empty(head: &before->list)) {
1905	struct amdgpu_bo *bo = before->bo_va->base.bo;
1906
1907	amdgpu_vm_it_insert(node: before, root: &vm->va);
1908	if (before->flags & AMDGPU_PTE_PRT)
1909	amdgpu_vm_prt_get(adev);
1910
1911	if (bo && bo->tbo.base.resv == vm->root.bo->tbo.base.resv &&
1912	!before->bo_va->base.moved)
1913	amdgpu_vm_bo_moved(vm_bo: &before->bo_va->base);
1914	} else {
1915	kfree(objp: before);
1916	}
1917
1918	/ Insert partial mapping after the range /
1919	if (!list_empty(head: &after->list)) {
1920	struct amdgpu_bo *bo = after->bo_va->base.bo;
1921
1922	amdgpu_vm_it_insert(node: after, root: &vm->va);
1923	if (after->flags & AMDGPU_PTE_PRT)
1924	amdgpu_vm_prt_get(adev);
1925
1926	if (bo && bo->tbo.base.resv == vm->root.bo->tbo.base.resv &&
1927	!after->bo_va->base.moved)
1928	amdgpu_vm_bo_moved(vm_bo: &after->bo_va->base);
1929	} else {
1930	kfree(objp: after);
1931	}
1932
1933	return `0`;
1934	}
1935
1936	/**
1937	* amdgpu_vm_bo_lookup_mapping - find mapping by address
1938	*
1939	* @vm: the requested VM
1940	* @addr: the address
1941	*
1942	* Find a mapping by it's address.
1943	*
1944	* Returns:
1945	* The amdgpu_bo_va_mapping matching for addr or NULL
1946	*
1947	*/
1948	struct amdgpu_bo_va_mapping amdgpu_vm_bo_lookup_mapping(struct* amdgpu_vm *vm,
1949	uint64_t addr)
1950	{
1951	return amdgpu_vm_it_iter_first(root: &vm->va, start: addr, last: addr);
1952	}
1953
1954	/**
1955	* amdgpu_vm_bo_trace_cs - trace all reserved mappings
1956	*
1957	* @vm: the requested vm
1958	* @ticket: CS ticket
1959	*
1960	* Trace all mappings of BOs reserved during a command submission.
1961	*/
1962	void amdgpu_vm_bo_trace_cs(struct amdgpu_vm vm, struct* ww_acquire_ctx *ticket)
1963	{
1964	struct amdgpu_bo_va_mapping *mapping;
1965
1966	if (!trace_amdgpu_vm_bo_cs_enabled())
1967	return;
1968
1969	for (mapping = amdgpu_vm_it_iter_first(root: &vm->va, start: `0`, U64_MAX); mapping;
1970	mapping = amdgpu_vm_it_iter_next(node: mapping, start: `0`, U64_MAX)) {
1971	if (mapping->bo_va && mapping->bo_va->base.bo) {
1972	struct amdgpu_bo *bo;
1973
1974	bo = mapping->bo_va->base.bo;
1975	if (dma_resv_locking_ctx(obj: bo->tbo.base.resv) !=
1976	ticket)
1977	continue;
1978	}
1979
1980	trace_amdgpu_vm_bo_cs(mapping);
1981	}
1982	}
1983
1984	/**
1985	* amdgpu_vm_bo_del - remove a bo from a specific vm
1986	*
1987	* @adev: amdgpu_device pointer
1988	* @bo_va: requested bo_va
1989	*
1990	* Remove @bo_va->bo from the requested vm.
1991	*
1992	* Object have to be reserved!
1993	*/
1994	void amdgpu_vm_bo_del(struct amdgpu_device *adev,
1995	struct amdgpu_bo_va *bo_va)
1996	{
1997	struct amdgpu_bo_va_mapping mapping, next;
1998	struct amdgpu_bo *bo = bo_va->base.bo;
1999	struct amdgpu_vm *vm = bo_va->base.vm;
2000	struct amdgpu_vm_bo_base **base;
2001
2002	dma_resv_assert_held(vm->root.bo->tbo.base.resv);
2003
2004	if (bo) {
2005	dma_resv_assert_held(bo->tbo.base.resv);
2006	if (bo->tbo.base.resv == vm->root.bo->tbo.base.resv)
2007	ttm_bo_set_bulk_move(bo: &bo->tbo, NULL);
2008
2009	for (base = &bo_va->base.bo->vm_bo; *base;
2010	base = &(*base)->next) {
2011	if (*base != &bo_va->base)
2012	continue;
2013
2014	*base = bo_va->base.next;
2015	break;
2016	}
2017	}
2018
2019	spin_lock(lock: &vm->status_lock);
2020	list_del(entry: &bo_va->base.vm_status);
2021	spin_unlock(lock: &vm->status_lock);
2022
2023	list_for_each_entry_safe(mapping, next, &bo_va->valids, list) {
2024	list_del(entry: &mapping->list);
2025	amdgpu_vm_it_remove(node: mapping, root: &vm->va);
2026	mapping->bo_va = NULL;
2027	trace_amdgpu_vm_bo_unmap(bo_va, mapping);
2028	list_add(new: &mapping->list, head: &vm->freed);
2029	}
2030	list_for_each_entry_safe(mapping, next, &bo_va->invalids, list) {
2031	list_del(entry: &mapping->list);
2032	amdgpu_vm_it_remove(node: mapping, root: &vm->va);
2033	amdgpu_vm_free_mapping(adev, vm, mapping,
2034	fence: bo_va->last_pt_update);
2035	}
2036
2037	dma_fence_put(fence: bo_va->last_pt_update);
2038
2039	if (bo && bo_va->is_xgmi)
2040	amdgpu_xgmi_set_pstate(adev, pstate: AMDGPU_XGMI_PSTATE_MIN);
2041
2042	kfree(objp: bo_va);
2043	}
2044
2045	/**
2046	* amdgpu_vm_evictable - check if we can evict a VM
2047	*
2048	* @bo: A page table of the VM.
2049	*
2050	* Check if it is possible to evict a VM.
2051	*/
2052	bool amdgpu_vm_evictable(struct amdgpu_bo *bo)
2053	{
2054	struct amdgpu_vm_bo_base *bo_base = bo->vm_bo;
2055
2056	/ Page tables of a destroyed VM can go away immediately /
2057	if (!bo_base \|\| !bo_base->vm)
2058	return true;
2059
2060	/ Don't evict VM page tables while they are busy /
2061	if (!dma_resv_test_signaled(obj: bo->tbo.base.resv, usage: DMA_RESV_USAGE_BOOKKEEP))
2062	return false;
2063
2064	/ Try to block ongoing updates /
2065	if (!amdgpu_vm_eviction_trylock(vm: bo_base->vm))
2066	return false;
2067
2068	/ Don't evict VM page tables while they are updated /
2069	if (!dma_fence_is_signaled(fence: bo_base->vm->last_unlocked)) {
2070	amdgpu_vm_eviction_unlock(vm: bo_base->vm);
2071	return false;
2072	}
2073
2074	bo_base->vm->evicting = true;
2075	amdgpu_vm_eviction_unlock(vm: bo_base->vm);
2076	return true;
2077	}
2078
2079	/**
2080	* amdgpu_vm_bo_invalidate - mark the bo as invalid
2081	*
2082	* @adev: amdgpu_device pointer
2083	* @bo: amdgpu buffer object
2084	* @evicted: is the BO evicted
2085	*
2086	* Mark @bo as invalid.
2087	*/
2088	void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
2089	struct amdgpu_bo *bo, bool evicted)
2090	{
2091	struct amdgpu_vm_bo_base *bo_base;
2092
2093	/ shadow bo doesn't have bo base, its validation needs its parent /
2094	if (bo->parent && (amdgpu_bo_shadowed(bo: bo->parent) == bo))
2095	bo = bo->parent;
2096
2097	for (bo_base = bo->vm_bo; bo_base; bo_base = bo_base->next) {
2098	struct amdgpu_vm *vm = bo_base->vm;
2099
2100	if (evicted && bo->tbo.base.resv == vm->root.bo->tbo.base.resv) {
2101	amdgpu_vm_bo_evicted(vm_bo: bo_base);
2102	continue;
2103	}
2104
2105	if (bo_base->moved)
2106	continue;
2107	bo_base->moved = true;
2108
2109	if (bo->tbo.type == ttm_bo_type_kernel)
2110	amdgpu_vm_bo_relocated(vm_bo: bo_base);
2111	else if (bo->tbo.base.resv == vm->root.bo->tbo.base.resv)
2112	amdgpu_vm_bo_moved(vm_bo: bo_base);
2113	else
2114	amdgpu_vm_bo_invalidated(vm_bo: bo_base);
2115	}
2116	}
2117
2118	/**
2119	* amdgpu_vm_get_block_size - calculate VM page table size as power of two
2120	*
2121	* @vm_size: VM size
2122	*
2123	* Returns:
2124	* VM page table as power of two
2125	*/
2126	static uint32_t amdgpu_vm_get_block_size(uint64_t vm_size)
2127	{
2128	/ Total bits covered by PD + PTs /
2129	unsigned bits = ilog2(vm_size) + `18`;
2130
2131	/ Make sure the PD is 4K in size up to 8GB address space.*
2132	Above that split equal between PD and PTs /*
2133	if (vm_size <= `8`)
2134	return (bits - `9`);
2135	else
2136	return ((bits + `3`) / `2`);
2137	}
2138
2139	/**
2140	* amdgpu_vm_adjust_size - adjust vm size, block size and fragment size
2141	*
2142	* @adev: amdgpu_device pointer
2143	* @min_vm_size: the minimum vm size in GB if it's set auto
2144	* @fragment_size_default: Default PTE fragment size
2145	* @max_level: max VMPT level
2146	* @max_bits: max address space size in bits
2147	*
2148	*/
2149	void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint32_t min_vm_size,
2150	uint32_t fragment_size_default, unsigned max_level,
2151	unsigned max_bits)
2152	{
2153	unsigned int max_size = `1` << (max_bits - `30`);
2154	unsigned int vm_size;
2155	uint64_t tmp;
2156
2157	/ adjust vm size first /
2158	if (amdgpu_vm_size != -`1`) {
2159	vm_size = amdgpu_vm_size;
2160	if (vm_size > max_size) {
2161	dev_warn(adev->dev, "VM size (%d) too large, max is %u GB\n",
2162	amdgpu_vm_size, max_size);
2163	vm_size = max_size;
2164	}
2165	} else {
2166	struct sysinfo si;
2167	unsigned int phys_ram_gb;
2168
2169	/ Optimal VM size depends on the amount of physical*
2170	* RAM available. Underlying requirements and
2171	* assumptions:
2172	*
2173	* - Need to map system memory and VRAM from all GPUs
2174	* - VRAM from other GPUs not known here
2175	* - Assume VRAM <= system memory
2176	* - On GFX8 and older, VM space can be segmented for
2177	* different MTYPEs
2178	* - Need to allow room for fragmentation, guard pages etc.
2179	*
2180	* This adds up to a rough guess of system memory x3.
2181	* Round up to power of two to maximize the available
2182	* VM size with the given page table size.
2183	*/
2184	si_meminfo(val: &si);
2185	phys_ram_gb = ((uint64_t)si.totalram * si.mem_unit +
2186	(`1` << `30`) - `1`) >> `30`;
2187	vm_size = roundup_pow_of_two(
2188	min(max(phys_ram_gb * `3`, min_vm_size), max_size));
2189	}
2190
2191	adev->vm_manager.max_pfn = (uint64_t)vm_size << `18`;
2192
2193	tmp = roundup_pow_of_two(adev->vm_manager.max_pfn);
2194	if (amdgpu_vm_block_size != -`1`)
2195	tmp >>= amdgpu_vm_block_size - `9`;
2196	tmp = DIV_ROUND_UP(fls64(tmp) - `1`, `9`) - `1`;
2197	adev->vm_manager.num_level = min_t(unsigned int, max_level, tmp);
2198	switch (adev->vm_manager.num_level) {
2199	case `3`:
2200	adev->vm_manager.root_level = AMDGPU_VM_PDB2;
2201	break;
2202	case `2`:
2203	adev->vm_manager.root_level = AMDGPU_VM_PDB1;
2204	break;
2205	case `1`:
2206	adev->vm_manager.root_level = AMDGPU_VM_PDB0;
2207	break;
2208	default:
2209	dev_err(adev->dev, "VMPT only supports 2~4+1 levels\n");
2210	}
2211	/ block size depends on vm size and hw setup/
2212	if (amdgpu_vm_block_size != -`1`)
2213	adev->vm_manager.block_size =
2214	min((unsigned)amdgpu_vm_block_size, max_bits
2215	- AMDGPU_GPU_PAGE_SHIFT
2216	- `9` * adev->vm_manager.num_level);
2217	else if (adev->vm_manager.num_level > `1`)
2218	adev->vm_manager.block_size = `9`;
2219	else
2220	adev->vm_manager.block_size = amdgpu_vm_get_block_size(vm_size: tmp);
2221
2222	if (amdgpu_vm_fragment_size == -`1`)
2223	adev->vm_manager.fragment_size = fragment_size_default;
2224	else
2225	adev->vm_manager.fragment_size = amdgpu_vm_fragment_size;
2226
2227	DRM_INFO("vm size is %u GB, %u levels, block size is %u-bit, fragment size is %u-bit\n",
2228	vm_size, adev->vm_manager.num_level + `1`,
2229	adev->vm_manager.block_size,
2230	adev->vm_manager.fragment_size);
2231	}
2232
2233	/**
2234	* amdgpu_vm_wait_idle - wait for the VM to become idle
2235	*
2236	* @vm: VM object to wait for
2237	* @timeout: timeout to wait for VM to become idle
2238	*/
2239	long amdgpu_vm_wait_idle(struct amdgpu_vm vm, long* timeout)
2240	{
2241	timeout = dma_resv_wait_timeout(obj: vm->root.bo->tbo.base.resv,
2242	usage: DMA_RESV_USAGE_BOOKKEEP,
2243	intr: true, timeout);
2244	if (timeout <= `0`)
2245	return timeout;
2246
2247	return dma_fence_wait_timeout(vm->last_unlocked, intr: true, timeout);
2248	}
2249
2250	static void amdgpu_vm_destroy_task_info(struct kref *kref)
2251	{
2252	struct amdgpu_task_info ti = container_of(kref, struct* amdgpu_task_info, refcount);
2253
2254	kfree(objp: ti);
2255	}
2256
2257	static inline struct amdgpu_vm *
2258	amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid)
2259	{
2260	struct amdgpu_vm *vm;
2261	unsigned long flags;
2262
2263	xa_lock_irqsave(&adev->vm_manager.pasids, flags);
2264	vm = xa_load(&adev->vm_manager.pasids, index: pasid);
2265	xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
2266
2267	return vm;
2268	}
2269
2270	/**
2271	* amdgpu_vm_put_task_info - reference down the vm task_info ptr
2272	*
2273	* @task_info: task_info struct under discussion.
2274	*
2275	* frees the vm task_info ptr at the last put
2276	*/
2277	void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info)
2278	{
2279	kref_put(kref: &task_info->refcount, release: amdgpu_vm_destroy_task_info);
2280	}
2281
2282	/**
2283	* amdgpu_vm_get_task_info_vm - Extracts task info for a vm.
2284	*
2285	* @vm: VM to get info from
2286	*
2287	* Returns the reference counted task_info structure, which must be
2288	* referenced down with amdgpu_vm_put_task_info.
2289	*/
2290	struct amdgpu_task_info *
2291	amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm)
2292	{
2293	struct amdgpu_task_info *ti = NULL;
2294
2295	if (vm) {
2296	ti = vm->task_info;
2297	kref_get(kref: &vm->task_info->refcount);
2298	}
2299
2300	return ti;
2301	}
2302
2303	/**
2304	* amdgpu_vm_get_task_info_pasid - Extracts task info for a PASID.
2305	*
2306	* @adev: drm device pointer
2307	* @pasid: PASID identifier for VM
2308	*
2309	* Returns the reference counted task_info structure, which must be
2310	* referenced down with amdgpu_vm_put_task_info.
2311	*/
2312	struct amdgpu_task_info *
2313	amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid)
2314	{
2315	return amdgpu_vm_get_task_info_vm(
2316	vm: amdgpu_vm_get_vm_from_pasid(adev, pasid));
2317	}
2318
2319	static int amdgpu_vm_create_task_info(struct amdgpu_vm *vm)
2320	{
2321	vm->task_info = kzalloc(size: sizeof(struct amdgpu_task_info), GFP_KERNEL);
2322	if (!vm->task_info)
2323	return -ENOMEM;
2324
2325	kref_init(kref: &vm->task_info->refcount);
2326	return `0`;
2327	}
2328
2329	/**
2330	* amdgpu_vm_set_task_info - Sets VMs task info.
2331	*
2332	* @vm: vm for which to set the info
2333	*/
2334	void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
2335	{
2336	if (!vm->task_info)
2337	return;
2338
2339	if (vm->task_info->pid == current->pid)
2340	return;
2341
2342	vm->task_info->pid = current->pid;
2343	get_task_comm(vm->task_info->task_name, current);
2344
2345	if (current->group_leader->mm != current->mm)
2346	return;
2347
2348	vm->task_info->tgid = current->group_leader->pid;
2349	get_task_comm(vm->task_info->process_name, current->group_leader);
2350	}
2351
2352	/**
2353	* amdgpu_vm_init - initialize a vm instance
2354	*
2355	* @adev: amdgpu_device pointer
2356	* @vm: requested vm
2357	* @xcp_id: GPU partition selection id
2358	*
2359	* Init @vm fields.
2360	*
2361	* Returns:
2362	* 0 for success, error for failure.
2363	*/
2364	int amdgpu_vm_init(struct amdgpu_device adev, struct* amdgpu_vm *vm,
2365	int32_t xcp_id)
2366	{
2367	struct amdgpu_bo *root_bo;
2368	struct amdgpu_bo_vm *root;
2369	int r, i;
2370
2371	vm->va = RB_ROOT_CACHED;
2372	for (i = `0`; i < AMDGPU_MAX_VMHUBS; i++)
2373	vm->reserved_vmid[i] = NULL;
2374	INIT_LIST_HEAD(list: &vm->evicted);
2375	INIT_LIST_HEAD(list: &vm->evicted_user);
2376	INIT_LIST_HEAD(list: &vm->relocated);
2377	INIT_LIST_HEAD(list: &vm->moved);
2378	INIT_LIST_HEAD(list: &vm->idle);
2379	INIT_LIST_HEAD(list: &vm->invalidated);
2380	spin_lock_init(&vm->status_lock);
2381	INIT_LIST_HEAD(list: &vm->freed);
2382	INIT_LIST_HEAD(list: &vm->done);
2383	INIT_LIST_HEAD(list: &vm->pt_freed);
2384	INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
2385	INIT_KFIFO(vm->faults);
2386
2387	r = amdgpu_vm_init_entities(adev, vm);
2388	if (r)
2389	return r;
2390
2391	vm->is_compute_context = false;
2392
2393	vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
2394	AMDGPU_VM_USE_CPU_FOR_GFX);
2395
2396	DRM_DEBUG_DRIVER("VM update mode is %s\n",
2397	vm->use_cpu_for_update ? "CPU" : "SDMA");
2398	WARN_ONCE((vm->use_cpu_for_update &&
2399	!amdgpu_gmc_vram_full_visible(&adev->gmc)),
2400	"CPU update of VM recommended only for large BAR system\n");
2401
2402	if (vm->use_cpu_for_update)
2403	vm->update_funcs = &amdgpu_vm_cpu_funcs;
2404	else
2405	vm->update_funcs = &amdgpu_vm_sdma_funcs;
2406
2407	vm->last_update = dma_fence_get_stub();
2408	vm->last_unlocked = dma_fence_get_stub();
2409	vm->last_tlb_flush = dma_fence_get_stub();
2410	vm->generation = `0`;
2411
2412	mutex_init(&vm->eviction_lock);
2413	vm->evicting = false;
2414
2415	r = amdgpu_vm_pt_create(adev, vm, level: adev->vm_manager.root_level,
2416	immediate: false, vmbo: &root, xcp_id);
2417	if (r)
2418	goto error_free_delayed;
2419
2420	root_bo = amdgpu_bo_ref(bo: &root->bo);
2421	r = amdgpu_bo_reserve(bo: root_bo, no_intr: true);
2422	if (r) {
2423	amdgpu_bo_unref(bo: &root->shadow);
2424	amdgpu_bo_unref(bo: &root_bo);
2425	goto error_free_delayed;
2426	}
2427
2428	amdgpu_vm_bo_base_init(base: &vm->root, vm, bo: root_bo);
2429	r = dma_resv_reserve_fences(obj: root_bo->tbo.base.resv, num_fences: `1`);
2430	if (r)
2431	goto error_free_root;
2432
2433	r = amdgpu_vm_pt_clear(adev, vm, vmbo: root, immediate: false);
2434	if (r)
2435	goto error_free_root;
2436
2437	r = amdgpu_vm_create_task_info(vm);
2438	if (r)
2439	DRM_DEBUG("Failed to create task info for VM\n");
2440
2441	amdgpu_bo_unreserve(bo: vm->root.bo);
2442	amdgpu_bo_unref(bo: &root_bo);
2443
2444	return `0`;
2445
2446	error_free_root:
2447	amdgpu_vm_pt_free_root(adev, vm);
2448	amdgpu_bo_unreserve(bo: vm->root.bo);
2449	amdgpu_bo_unref(bo: &root_bo);
2450
2451	error_free_delayed:
2452	dma_fence_put(fence: vm->last_tlb_flush);
2453	dma_fence_put(fence: vm->last_unlocked);
2454	amdgpu_vm_fini_entities(vm);
2455
2456	return r;
2457	}
2458
2459	/**
2460	* amdgpu_vm_make_compute - Turn a GFX VM into a compute VM
2461	*
2462	* @adev: amdgpu_device pointer
2463	* @vm: requested vm
2464	*
2465	* This only works on GFX VMs that don't have any BOs added and no
2466	* page tables allocated yet.
2467	*
2468	* Changes the following VM parameters:
2469	* - use_cpu_for_update
2470	* - pte_supports_ats
2471	*
2472	* Reinitializes the page directory to reflect the changed ATS
2473	* setting.
2474	*
2475	* Returns:
2476	* 0 for success, -errno for errors.
2477	*/
2478	int amdgpu_vm_make_compute(struct amdgpu_device adev, struct* amdgpu_vm *vm)
2479	{
2480	int r;
2481
2482	r = amdgpu_bo_reserve(bo: vm->root.bo, no_intr: true);
2483	if (r)
2484	return r;
2485
2486	/ Update VM state /
2487	vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
2488	AMDGPU_VM_USE_CPU_FOR_COMPUTE);
2489	DRM_DEBUG_DRIVER("VM update mode is %s\n",
2490	vm->use_cpu_for_update ? "CPU" : "SDMA");
2491	WARN_ONCE((vm->use_cpu_for_update &&
2492	!amdgpu_gmc_vram_full_visible(&adev->gmc)),
2493	"CPU update of VM recommended only for large BAR system\n");
2494
2495	if (vm->use_cpu_for_update) {
2496	/ Sync with last SDMA update/clear before switching to CPU /
2497	r = amdgpu_bo_sync_wait(bo: vm->root.bo,
2498	AMDGPU_FENCE_OWNER_UNDEFINED, intr: true);
2499	if (r)
2500	goto unreserve_bo;
2501
2502	vm->update_funcs = &amdgpu_vm_cpu_funcs;
2503	r = amdgpu_vm_pt_map_tables(adev, vm);
2504	if (r)
2505	goto unreserve_bo;
2506
2507	} else {
2508	vm->update_funcs = &amdgpu_vm_sdma_funcs;
2509	}
2510
2511	dma_fence_put(fence: vm->last_update);
2512	vm->last_update = dma_fence_get_stub();
2513	vm->is_compute_context = true;
2514
2515	/ Free the shadow bo for compute VM /
2516	amdgpu_bo_unref(bo: &to_amdgpu_bo_vm(vm->root.bo)->shadow);
2517
2518	goto unreserve_bo;
2519
2520	unreserve_bo:
2521	amdgpu_bo_unreserve(bo: vm->root.bo);
2522	return r;
2523	}
2524
2525	/**
2526	* amdgpu_vm_release_compute - release a compute vm
2527	* @adev: amdgpu_device pointer
2528	* @vm: a vm turned into compute vm by calling amdgpu_vm_make_compute
2529	*
2530	* This is a correspondant of amdgpu_vm_make_compute. It decouples compute
2531	* pasid from vm. Compute should stop use of vm after this call.
2532	*/
2533	void amdgpu_vm_release_compute(struct amdgpu_device adev, struct* amdgpu_vm *vm)
2534	{
2535	amdgpu_vm_set_pasid(adev, vm, pasid: `0`);
2536	vm->is_compute_context = false;
2537	}
2538
2539	/**
2540	* amdgpu_vm_fini - tear down a vm instance
2541	*
2542	* @adev: amdgpu_device pointer
2543	* @vm: requested vm
2544	*
2545	* Tear down @vm.
2546	* Unbind the VM and remove all bos from the vm bo list
2547	*/
2548	void amdgpu_vm_fini(struct amdgpu_device adev, struct* amdgpu_vm *vm)
2549	{
2550	struct amdgpu_bo_va_mapping mapping, tmp;
2551	bool prt_fini_needed = !!adev->gmc.gmc_funcs->set_prt;
2552	struct amdgpu_bo *root;
2553	unsigned long flags;
2554	int i;
2555
2556	amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
2557
2558	flush_work(work: &vm->pt_free_work);
2559
2560	root = amdgpu_bo_ref(bo: vm->root.bo);
2561	amdgpu_bo_reserve(bo: root, no_intr: true);
2562	amdgpu_vm_put_task_info(task_info: vm->task_info);
2563	amdgpu_vm_set_pasid(adev, vm, pasid: `0`);
2564	dma_fence_wait(fence: vm->last_unlocked, intr: false);
2565	dma_fence_put(fence: vm->last_unlocked);
2566	dma_fence_wait(fence: vm->last_tlb_flush, intr: false);
2567	/ Make sure that all fence callbacks have completed /
2568	spin_lock_irqsave(vm->last_tlb_flush->lock, flags);
2569	spin_unlock_irqrestore(lock: vm->last_tlb_flush->lock, flags);
2570	dma_fence_put(fence: vm->last_tlb_flush);
2571
2572	list_for_each_entry_safe(mapping, tmp, &vm->freed, list) {
2573	if (mapping->flags & AMDGPU_PTE_PRT && prt_fini_needed) {
2574	amdgpu_vm_prt_fini(adev, vm);
2575	prt_fini_needed = false;
2576	}
2577
2578	list_del(entry: &mapping->list);
2579	amdgpu_vm_free_mapping(adev, vm, mapping, NULL);
2580	}
2581
2582	amdgpu_vm_pt_free_root(adev, vm);
2583	amdgpu_bo_unreserve(bo: root);
2584	amdgpu_bo_unref(bo: &root);
2585	WARN_ON(vm->root.bo);
2586
2587	amdgpu_vm_fini_entities(vm);
2588
2589	if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
2590	dev_err(adev->dev, "still active bo inside vm\n");
2591	}
2592	rbtree_postorder_for_each_entry_safe(mapping, tmp,
2593	&vm->va.rb_root, rb) {
2594	/ Don't remove the mapping here, we don't want to trigger a*
2595	* rebalance and the tree is about to be destroyed anyway.
2596	*/
2597	list_del(entry: &mapping->list);
2598	kfree(objp: mapping);
2599	}
2600
2601	dma_fence_put(fence: vm->last_update);
2602
2603	for (i = `0`; i < AMDGPU_MAX_VMHUBS; i++) {
2604	if (vm->reserved_vmid[i]) {
2605	amdgpu_vmid_free_reserved(adev, vmhub: i);
2606	vm->reserved_vmid[i] = false;
2607	}
2608	}
2609
2610	}
2611
2612	/**
2613	* amdgpu_vm_manager_init - init the VM manager
2614	*
2615	* @adev: amdgpu_device pointer
2616	*
2617	* Initialize the VM manager structures
2618	*/
2619	void amdgpu_vm_manager_init(struct amdgpu_device *adev)
2620	{
2621	unsigned i;
2622
2623	/ Concurrent flushes are only possible starting with Vega10 and*
2624	* are broken on Navi10 and Navi14.
2625	*/
2626	adev->vm_manager.concurrent_flush = !(adev->asic_type < CHIP_VEGA10 \|\|
2627	adev->asic_type == CHIP_NAVI10 \|\|
2628	adev->asic_type == CHIP_NAVI14);
2629	amdgpu_vmid_mgr_init(adev);
2630
2631	adev->vm_manager.fence_context =
2632	dma_fence_context_alloc(AMDGPU_MAX_RINGS);
2633	for (i = `0`; i < AMDGPU_MAX_RINGS; ++i)
2634	adev->vm_manager.seqno[i] = `0`;
2635
2636	spin_lock_init(&adev->vm_manager.prt_lock);
2637	atomic_set(v: &adev->vm_manager.num_prt_users, i: `0`);
2638
2639	/ If not overridden by the user, by default, only in large BAR systems*
2640	* Compute VM tables will be updated by CPU
2641	*/
2642	#ifdef CONFIG_X86_64
2643	if (amdgpu_vm_update_mode == -`1`) {
2644	/ For asic with VF MMIO access protection*
2645	* avoid using CPU for VM table updates
2646	*/
2647	if (amdgpu_gmc_vram_full_visible(gmc: &adev->gmc) &&
2648	!amdgpu_sriov_vf_mmio_access_protection(adev))
2649	adev->vm_manager.vm_update_mode =
2650	AMDGPU_VM_USE_CPU_FOR_COMPUTE;
2651	else
2652	adev->vm_manager.vm_update_mode = `0`;
2653	} else
2654	adev->vm_manager.vm_update_mode = amdgpu_vm_update_mode;
2655	#else
2656	adev->vm_manager.vm_update_mode = `0`;
2657	#endif
2658
2659	xa_init_flags(xa: &adev->vm_manager.pasids, XA_FLAGS_LOCK_IRQ);
2660	}
2661
2662	/**
2663	* amdgpu_vm_manager_fini - cleanup VM manager
2664	*
2665	* @adev: amdgpu_device pointer
2666	*
2667	* Cleanup the VM manager and free resources.
2668	*/
2669	void amdgpu_vm_manager_fini(struct amdgpu_device *adev)
2670	{
2671	WARN_ON(!xa_empty(&adev->vm_manager.pasids));
2672	xa_destroy(&adev->vm_manager.pasids);
2673
2674	amdgpu_vmid_mgr_fini(adev);
2675	}
2676
2677	/**
2678	* amdgpu_vm_ioctl - Manages VMID reservation for vm hubs.
2679	*
2680	* @dev: drm device pointer
2681	* @data: drm_amdgpu_vm
2682	* @filp: drm file pointer
2683	*
2684	* Returns:
2685	* 0 for success, -errno for errors.
2686	*/
2687	int amdgpu_vm_ioctl(struct drm_device dev, void* data, struct* drm_file *filp)
2688	{
2689	union drm_amdgpu_vm *args = data;
2690	struct amdgpu_device *adev = drm_to_adev(ddev: dev);
2691	struct amdgpu_fpriv *fpriv = filp->driver_priv;
2692
2693	/ No valid flags defined yet /
2694	if (args->in.flags)
2695	return -EINVAL;
2696
2697	switch (args->in.op) {
2698	case AMDGPU_VM_OP_RESERVE_VMID:
2699	/ We only have requirement to reserve vmid from gfxhub /
2700	if (!fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(`0`)]) {
2701	amdgpu_vmid_alloc_reserved(adev, AMDGPU_GFXHUB(`0`));
2702	fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(`0`)] = true;
2703	}
2704
2705	break;
2706	case AMDGPU_VM_OP_UNRESERVE_VMID:
2707	if (fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(`0`)]) {
2708	amdgpu_vmid_free_reserved(adev, AMDGPU_GFXHUB(`0`));
2709	fpriv->vm.reserved_vmid[AMDGPU_GFXHUB(`0`)] = false;
2710	}
2711	break;
2712	default:
2713	return -EINVAL;
2714	}
2715
2716	return `0`;
2717	}
2718
2719	/**
2720	* amdgpu_vm_handle_fault - graceful handling of VM faults.
2721	* @adev: amdgpu device pointer
2722	* @pasid: PASID of the VM
2723	* @vmid: VMID, only used for GFX 9.4.3.
2724	* @node_id: Node_id received in IH cookie. Only applicable for
2725	* GFX 9.4.3.
2726	* @addr: Address of the fault
2727	* @write_fault: true is write fault, false is read fault
2728	*
2729	* Try to gracefully handle a VM fault. Return true if the fault was handled and
2730	* shouldn't be reported any more.
2731	*/
2732	bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
2733	u32 vmid, u32 node_id, uint64_t addr,
2734	bool write_fault)
2735	{
2736	bool is_compute_context = false;
2737	struct amdgpu_bo *root;
2738	unsigned long irqflags;
2739	uint64_t value, flags;
2740	struct amdgpu_vm *vm;
2741	int r;
2742
2743	xa_lock_irqsave(&adev->vm_manager.pasids, irqflags);
2744	vm = xa_load(&adev->vm_manager.pasids, index: pasid);
2745	if (vm) {
2746	root = amdgpu_bo_ref(bo: vm->root.bo);
2747	is_compute_context = vm->is_compute_context;
2748	} else {
2749	root = NULL;
2750	}
2751	xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
2752
2753	if (!root)
2754	return false;
2755
2756	addr /= AMDGPU_GPU_PAGE_SIZE;
2757
2758	if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
2759	node_id, addr, write_fault)) {
2760	amdgpu_bo_unref(bo: &root);
2761	return true;
2762	}
2763
2764	r = amdgpu_bo_reserve(bo: root, no_intr: true);
2765	if (r)
2766	goto error_unref;
2767
2768	/ Double check that the VM still exists /
2769	xa_lock_irqsave(&adev->vm_manager.pasids, irqflags);
2770	vm = xa_load(&adev->vm_manager.pasids, index: pasid);
2771	if (vm && vm->root.bo != root)
2772	vm = NULL;
2773	xa_unlock_irqrestore(&adev->vm_manager.pasids, irqflags);
2774	if (!vm)
2775	goto error_unlock;
2776
2777	flags = AMDGPU_PTE_VALID \| AMDGPU_PTE_SNOOPED \|
2778	AMDGPU_PTE_SYSTEM;
2779
2780	if (is_compute_context) {
2781	/ Intentionally setting invalid PTE flag*
2782	* combination to force a no-retry-fault
2783	*/
2784	flags = AMDGPU_VM_NORETRY_FLAGS;
2785	value = `0`;
2786	} else if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_NEVER) {
2787	/ Redirect the access to the dummy page /
2788	value = adev->dummy_page_addr;
2789	flags \|= AMDGPU_PTE_EXECUTABLE \| AMDGPU_PTE_READABLE \|
2790	AMDGPU_PTE_WRITEABLE;
2791
2792	} else {
2793	/ Let the hw retry silently on the PTE /
2794	value = `0`;
2795	}
2796
2797	r = dma_resv_reserve_fences(obj: root->tbo.base.resv, num_fences: `1`);
2798	if (r) {
2799	pr_debug("failed %d to reserve fence slot\n", r);
2800	goto error_unlock;
2801	}
2802
2803	r = amdgpu_vm_update_range(adev, vm, immediate: true, unlocked: false, flush_tlb: false, allow_override: false,
2804	NULL, start: addr, last: addr, flags, offset: value, vram_base: `0`, NULL, NULL, NULL);
2805	if (r)
2806	goto error_unlock;
2807
2808	r = amdgpu_vm_update_pdes(adev, vm, immediate: true);
2809
2810	error_unlock:
2811	amdgpu_bo_unreserve(bo: root);
2812	if (r < `0`)
2813	DRM_ERROR("Can't handle page fault (%d)\n", r);
2814
2815	error_unref:
2816	amdgpu_bo_unref(bo: &root);
2817
2818	return false;
2819	}
2820
#if defined(CONFIG_DEBUG_FS)
/*
 * Print @title followed by one entry per BO found on the status list @head.
 * Entries without a BO are skipped. The number of printed BOs is stored in
 * @num_objs and the sum of the values returned by amdgpu_bo_print_info() is
 * returned. Takes no lock itself.
 */
static u64 amdgpu_debugfs_vm_print_bo_list(struct seq_file *m,
					   const char *title,
					   struct list_head *head,
					   unsigned int *num_objs)
{
	struct amdgpu_bo_va *bo_va, *next;
	unsigned int id = 0;
	u64 total = 0;

	seq_puts(m, title);
	list_for_each_entry_safe(bo_va, next, head, base.vm_status) {
		if (!bo_va->base.bo)
			continue;
		total += amdgpu_bo_print_info(id++, bo_va->base.bo, m);
	}
	*num_objs = id;

	return total;
}

/**
 * amdgpu_debugfs_vm_bo_info - print BO info for the VM
 *
 * @vm: Requested VM for printing BO info
 * @m: debugfs file
 *
 * Print BO information in debugfs file for the VM: every status list is
 * dumped while vm->status_lock is held, followed by one summary line per
 * list.
 */
void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m)
{
	u64 total_idle, total_evicted, total_relocated;
	u64 total_moved, total_invalidated, total_done;
	unsigned int total_idle_objs, total_evicted_objs;
	unsigned int total_relocated_objs, total_moved_objs;
	unsigned int total_invalidated_objs, total_done_objs;

	spin_lock(&vm->status_lock);
	total_idle = amdgpu_debugfs_vm_print_bo_list(m, "\tIdle BOs:\n",
						     &vm->idle,
						     &total_idle_objs);
	total_evicted = amdgpu_debugfs_vm_print_bo_list(m, "\tEvicted BOs:\n",
							&vm->evicted,
							&total_evicted_objs);
	total_relocated = amdgpu_debugfs_vm_print_bo_list(m, "\tRelocated BOs:\n",
							  &vm->relocated,
							  &total_relocated_objs);
	total_moved = amdgpu_debugfs_vm_print_bo_list(m, "\tMoved BOs:\n",
						      &vm->moved,
						      &total_moved_objs);
	total_invalidated = amdgpu_debugfs_vm_print_bo_list(m, "\tInvalidated BOs:\n",
							    &vm->invalidated,
							    &total_invalidated_objs);
	total_done = amdgpu_debugfs_vm_print_bo_list(m, "\tDone BOs:\n",
						     &vm->done,
						     &total_done_objs);
	spin_unlock(&vm->status_lock);

	seq_printf(m, "\tTotal idle size: %12lld\tobjs:\t%d\n", total_idle,
		   total_idle_objs);
	seq_printf(m, "\tTotal evicted size: %12lld\tobjs:\t%d\n", total_evicted,
		   total_evicted_objs);
	seq_printf(m, "\tTotal relocated size: %12lld\tobjs:\t%d\n", total_relocated,
		   total_relocated_objs);
	seq_printf(m, "\tTotal moved size: %12lld\tobjs:\t%d\n", total_moved,
		   total_moved_objs);
	seq_printf(m, "\tTotal invalidated size: %12lld\tobjs:\t%d\n", total_invalidated,
		   total_invalidated_objs);
	seq_printf(m, "\tTotal done size: %12lld\tobjs:\t%d\n", total_done,
		   total_done_objs);
}
#endif
2916
2917	/**
2918	* amdgpu_vm_update_fault_cache - update cached fault into.
2919	* @adev: amdgpu device pointer
2920	* @pasid: PASID of the VM
2921	* @addr: Address of the fault
2922	* @status: GPUVM fault status register
2923	* @vmhub: which vmhub got the fault
2924	*
2925	* Cache the fault info for later use by userspace in debugging.
2926	*/
2927	void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
2928	unsigned int pasid,
2929	uint64_t addr,
2930	uint32_t status,
2931	unsigned int vmhub)
2932	{
2933	struct amdgpu_vm *vm;
2934	unsigned long flags;
2935
2936	xa_lock_irqsave(&adev->vm_manager.pasids, flags);
2937
2938	vm = xa_load(&adev->vm_manager.pasids, index: pasid);
2939	/ Don't update the fault cache if status is 0. In the multiple*
2940	* fault case, subsequent faults will return a 0 status which is
2941	* useless for userspace and replaces the useful fault status, so
2942	* only update if status is non-0.
2943	*/
2944	if (vm && status) {
2945	vm->fault_info.addr = addr;
2946	vm->fault_info.status = status;
2947	if (AMDGPU_IS_GFXHUB(vmhub)) {
2948	vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_GFX;
2949	vm->fault_info.vmhub \|=
2950	(vmhub - AMDGPU_GFXHUB_START) << AMDGPU_VMHUB_IDX_SHIFT;
2951	} else if (AMDGPU_IS_MMHUB0(vmhub)) {
2952	vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_MM0;
2953	vm->fault_info.vmhub \|=
2954	(vmhub - AMDGPU_MMHUB0_START) << AMDGPU_VMHUB_IDX_SHIFT;
2955	} else if (AMDGPU_IS_MMHUB1(vmhub)) {
2956	vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_MM1;
2957	vm->fault_info.vmhub \|=
2958	(vmhub - AMDGPU_MMHUB1_START) << AMDGPU_VMHUB_IDX_SHIFT;
2959	} else {
2960	WARN_ONCE(`1`, "Invalid vmhub %u\n", vmhub);
2961	}
2962	}
2963	xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
2964	}
2965
2966

source code of linux/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c