memory.c source code [linux/drivers/accel/habanalabs/common/memory.c]

1	// SPDX-License-Identifier: GPL-2.0
2
3	/*
4	* Copyright 2016-2022 HabanaLabs, Ltd.
5	* All Rights Reserved.
6	*/
7
8	#include <uapi/drm/habanalabs_accel.h>
9	#include "habanalabs.h"
10	#include "../include/hw_ip/mmu/mmu_general.h"
11
12	#include <linux/uaccess.h>
13	#include <linux/slab.h>
14	#include <linux/vmalloc.h>
15	#include <linux/pci-p2pdma.h>
16
17	MODULE_IMPORT_NS(DMA_BUF);
18
19	#define HL_MMU_DEBUG 0
20
/* use small pages for supporting non-pow2 (32M/40M/48M) DRAM phys page sizes */
22	#define DRAM_POOL_PAGE_SIZE SZ_8M
23
24	#define MEM_HANDLE_INVALID ULONG_MAX
25
26	static int allocate_timestamps_buffers(struct hl_fpriv *hpriv,
27	struct hl_mem_in args, u64 handle);
28
29	static int set_alloc_page_size(struct hl_device hdev, struct* hl_mem_in args, u32 page_size)
30	{
31	struct asic_fixed_properties *prop = &hdev->asic_prop;
32	u64 psize;
33
34	/*
35	* for ASIC that supports setting the allocation page size by user we will address
36	* user's choice only if it is not 0 (as 0 means taking the default page size)
37	*/
38	if (prop->supports_user_set_page_size && args->alloc.page_size) {
39	psize = args->alloc.page_size;
40
41	if (!is_power_of_2(n: psize)) {
42	dev_err(hdev->dev, "user page size (%#llx) is not power of 2\n", psize);
43	return -EINVAL;
44	}
45	} else {
46	psize = prop->device_mem_alloc_default_page_size;
47	}
48
49	*page_size = psize;
50
51	return `0`;
52	}
53
54	/*
55	* The va ranges in context object contain a list with the available chunks of
56	* device virtual memory.
57	* There is one range for host allocations and one for DRAM allocations.
58	*
59	* On initialization each range contains one chunk of all of its available
60	* virtual range which is a half of the total device virtual range.
61	*
62	* On each mapping of physical pages, a suitable virtual range chunk (with a
63	* minimum size) is selected from the list. If the chunk size equals the
64	* requested size, the chunk is returned. Otherwise, the chunk is split into
65	* two chunks - one to return as result and a remainder to stay in the list.
66	*
67	* On each Unmapping of a virtual address, the relevant virtual chunk is
68	* returned to the list. The chunk is added to the list and if its edges match
69	* the edges of the adjacent chunks (means a contiguous chunk can be created),
70	* the chunks are merged.
71	*
72	* On finish, the list is checked to have only one chunk of all the relevant
73	* virtual range (which is a half of the device total virtual range).
74	* If not (means not all mappings were unmapped), a warning is printed.
75	*/
76
77	/*
78	* alloc_device_memory() - allocate device memory.
79	* @ctx: pointer to the context structure.
80	* @args: host parameters containing the requested size.
81	* @ret_handle: result handle.
82	*
83	* This function does the following:
84	* - Allocate the requested size rounded up to 'dram_page_size' pages.
85	* - Return unique handle for later map/unmap/free.
86	*/
87	static int alloc_device_memory(struct hl_ctx ctx, struct* hl_mem_in *args,
88	u32 *ret_handle)
89	{
90	struct hl_device *hdev = ctx->hdev;
91	struct hl_vm *vm = &hdev->vm;
92	struct hl_vm_phys_pg_pack *phys_pg_pack;
93	u64 paddr = `0`, total_size, num_pgs, i;
94	u32 num_curr_pgs, page_size;
95	bool contiguous;
96	int handle, rc;
97
98	num_curr_pgs = `0`;
99
100	rc = set_alloc_page_size(hdev, args, page_size: &page_size);
101	if (rc)
102	return rc;
103
104	num_pgs = DIV_ROUND_UP_ULL(args->alloc.mem_size, page_size);
105	total_size = num_pgs * page_size;
106
107	if (!total_size) {
108	dev_err(hdev->dev, "Cannot allocate 0 bytes\n");
109	return -EINVAL;
110	}
111
112	contiguous = args->flags & HL_MEM_CONTIGUOUS;
113
114	if (contiguous) {
115	if (is_power_of_2(n: page_size))
116	paddr = (uintptr_t) gen_pool_dma_alloc_align(pool: vm->dram_pg_pool,
117	size: total_size, NULL, align: page_size);
118	else
119	paddr = gen_pool_alloc(pool: vm->dram_pg_pool, size: total_size);
120	if (!paddr) {
121	dev_err(hdev->dev,
122	"Cannot allocate %llu contiguous pages with total size of %llu\n",
123	num_pgs, total_size);
124	return -ENOMEM;
125	}
126	}
127
128	phys_pg_pack = kzalloc(size: sizeof(*phys_pg_pack), GFP_KERNEL);
129	if (!phys_pg_pack) {
130	rc = -ENOMEM;
131	goto pages_pack_err;
132	}
133
134	phys_pg_pack->vm_type = VM_TYPE_PHYS_PACK;
135	phys_pg_pack->asid = ctx->asid;
136	phys_pg_pack->npages = num_pgs;
137	phys_pg_pack->page_size = page_size;
138	phys_pg_pack->total_size = total_size;
139	phys_pg_pack->flags = args->flags;
140	phys_pg_pack->contiguous = contiguous;
141
142	phys_pg_pack->pages = kvmalloc_array(n: num_pgs, size: sizeof(u64), GFP_KERNEL);
143	if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {
144	rc = -ENOMEM;
145	goto pages_arr_err;
146	}
147
148	if (phys_pg_pack->contiguous) {
149	for (i = `0` ; i < num_pgs ; i++)
150	phys_pg_pack->pages[i] = paddr + i * page_size;
151	} else {
152	for (i = `0` ; i < num_pgs ; i++) {
153	if (is_power_of_2(n: page_size))
154	phys_pg_pack->pages[i] =
155	(uintptr_t)gen_pool_dma_alloc_align(pool: vm->dram_pg_pool,
156	size: page_size, NULL,
157	align: page_size);
158	else
159	phys_pg_pack->pages[i] = gen_pool_alloc(pool: vm->dram_pg_pool,
160	size: page_size);
161
162	if (!phys_pg_pack->pages[i]) {
163	dev_err(hdev->dev,
164	"Cannot allocate device memory (out of memory)\n");
165	rc = -ENOMEM;
166	goto page_err;
167	}
168
169	num_curr_pgs++;
170	}
171	}
172
173	spin_lock(lock: &vm->idr_lock);
174	handle = idr_alloc(&vm->phys_pg_pack_handles, ptr: phys_pg_pack, start: `1`, end: `0`,
175	GFP_ATOMIC);
176	spin_unlock(lock: &vm->idr_lock);
177
178	if (handle < `0`) {
179	dev_err(hdev->dev, "Failed to get handle for page\n");
180	rc = -EFAULT;
181	goto idr_err;
182	}
183
184	for (i = `0` ; i < num_pgs ; i++)
185	kref_get(kref: &vm->dram_pg_pool_refcount);
186
187	phys_pg_pack->handle = handle;
188
189	atomic64_add(i: phys_pg_pack->total_size, v: &ctx->dram_phys_mem);
190	atomic64_add(i: phys_pg_pack->total_size, v: &hdev->dram_used_mem);
191
192	*ret_handle = handle;
193
194	return `0`;
195
196	idr_err:
197	page_err:
198	if (!phys_pg_pack->contiguous)
199	for (i = `0` ; i < num_curr_pgs ; i++)
200	gen_pool_free(pool: vm->dram_pg_pool, addr: phys_pg_pack->pages[i],
201	size: page_size);
202
203	kvfree(addr: phys_pg_pack->pages);
204	pages_arr_err:
205	kfree(objp: phys_pg_pack);
206	pages_pack_err:
207	if (contiguous)
208	gen_pool_free(pool: vm->dram_pg_pool, addr: paddr, size: total_size);
209
210	return rc;
211	}
212
213	/**
214	* dma_map_host_va() - DMA mapping of the given host virtual address.
215	* @hdev: habanalabs device structure.
216	* @addr: the host virtual address of the memory area.
217	* @size: the size of the memory area.
218	* @p_userptr: pointer to result userptr structure.
219	*
220	* This function does the following:
221	* - Allocate userptr structure.
222	* - Pin the given host memory using the userptr structure.
223	* - Perform DMA mapping to have the DMA addresses of the pages.
224	*/
225	static int dma_map_host_va(struct hl_device *hdev, u64 addr, u64 size,
226	struct hl_userptr **p_userptr)
227	{
228	struct hl_userptr *userptr;
229	int rc;
230
231	userptr = kzalloc(size: sizeof(*userptr), GFP_KERNEL);
232	if (!userptr) {
233	rc = -ENOMEM;
234	goto userptr_err;
235	}
236
237	rc = hl_pin_host_memory(hdev, addr, size, userptr);
238	if (rc)
239	goto pin_err;
240
241	userptr->dma_mapped = true;
242	userptr->dir = DMA_BIDIRECTIONAL;
243	userptr->vm_type = VM_TYPE_USERPTR;
244
245	*p_userptr = userptr;
246
247	rc = hl_dma_map_sgtable(hdev, userptr->sgt, DMA_BIDIRECTIONAL);
248	if (rc) {
249	dev_err(hdev->dev, "failed to map sgt with DMA region\n");
250	goto dma_map_err;
251	}
252
253	return `0`;
254
255	dma_map_err:
256	hl_unpin_host_memory(hdev, userptr);
257	pin_err:
258	kfree(objp: userptr);
259	userptr_err:
260
261	return rc;
262	}
263
/**
 * dma_unmap_host_va() - DMA unmapping of the given host virtual address.
 * @hdev: habanalabs device structure.
 * @userptr: userptr to free.
 *
 * This function does the following:
 * - Unpins the physical pages.
 * - Frees the userptr structure.
 */
static void dma_unmap_host_va(struct hl_device *hdev,
				struct hl_userptr *userptr)
{
	hl_unpin_host_memory(hdev, userptr);
	kfree(userptr);
}
279
280	/**
281	* dram_pg_pool_do_release() - free DRAM pages pool
282	* @ref: pointer to reference object.
283	*
284	* This function does the following:
285	* - Frees the idr structure of physical pages handles.
286	* - Frees the generic pool of DRAM physical pages.
287	*/
288	static void dram_pg_pool_do_release(struct kref *ref)
289	{
290	struct hl_vm vm = container_of(ref, struct* hl_vm,
291	dram_pg_pool_refcount);
292
293	/*
294	* free the idr here as only here we know for sure that there are no
295	* allocated physical pages and hence there are no handles in use
296	*/
297	idr_destroy(&vm->phys_pg_pack_handles);
298	gen_pool_destroy(vm->dram_pg_pool);
299	}
300
301	/**
302	* free_phys_pg_pack() - free physical page pack.
303	* @hdev: habanalabs device structure.
304	* @phys_pg_pack: physical page pack to free.
305	*
306	* This function does the following:
307	* - For DRAM memory only
308	* - iterate over the pack, free each physical block structure by
309	* returning it to the general pool.
310	* - Free the hl_vm_phys_pg_pack structure.
311	*/
312	static void free_phys_pg_pack(struct hl_device *hdev,
313	struct hl_vm_phys_pg_pack *phys_pg_pack)
314	{
315	struct hl_vm *vm = &hdev->vm;
316	u64 i;
317
318	if (phys_pg_pack->created_from_userptr)
319	goto end;
320
321	if (phys_pg_pack->contiguous) {
322	gen_pool_free(pool: vm->dram_pg_pool, addr: phys_pg_pack->pages[`0`],
323	size: phys_pg_pack->total_size);
324
325	for (i = `0`; i < phys_pg_pack->npages ; i++)
326	kref_put(kref: &vm->dram_pg_pool_refcount,
327	release: dram_pg_pool_do_release);
328	} else {
329	for (i = `0` ; i < phys_pg_pack->npages ; i++) {
330	gen_pool_free(pool: vm->dram_pg_pool,
331	addr: phys_pg_pack->pages[i],
332	size: phys_pg_pack->page_size);
333	kref_put(kref: &vm->dram_pg_pool_refcount,
334	release: dram_pg_pool_do_release);
335	}
336	}
337
338	end:
339	kvfree(addr: phys_pg_pack->pages);
340	kfree(objp: phys_pg_pack);
341
342	return;
343	}
344
345	/**
346	* free_device_memory() - free device memory.
347	* @ctx: pointer to the context structure.
348	* @args: host parameters containing the requested size.
349	*
350	* This function does the following:
351	* - Free the device memory related to the given handle.
352	*/
353	static int free_device_memory(struct hl_ctx ctx, struct* hl_mem_in *args)
354	{
355	struct hl_device *hdev = ctx->hdev;
356	struct hl_vm *vm = &hdev->vm;
357	struct hl_vm_phys_pg_pack *phys_pg_pack;
358	u32 handle = args->free.handle;
359
360	spin_lock(lock: &vm->idr_lock);
361	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, id: handle);
362	if (!phys_pg_pack) {
363	spin_unlock(lock: &vm->idr_lock);
364	dev_err(hdev->dev, "free device memory failed, no match for handle %u\n", handle);
365	return -EINVAL;
366	}
367
368	if (atomic_read(v: &phys_pg_pack->mapping_cnt) > `0`) {
369	spin_unlock(lock: &vm->idr_lock);
370	dev_err(hdev->dev, "handle %u is mapped, cannot free\n", handle);
371	return -EINVAL;
372	}
373
374	/ must remove from idr before the freeing of the physical pages as the refcount of the pool*
375	* is also the trigger of the idr destroy
376	*/
377	idr_remove(&vm->phys_pg_pack_handles, id: handle);
378	spin_unlock(lock: &vm->idr_lock);
379
380	atomic64_sub(i: phys_pg_pack->total_size, v: &ctx->dram_phys_mem);
381	atomic64_sub(i: phys_pg_pack->total_size, v: &hdev->dram_used_mem);
382
383	free_phys_pg_pack(hdev, phys_pg_pack);
384
385	return `0`;
386	}
387
388	/**
389	* clear_va_list_locked() - free virtual addresses list.
390	* @hdev: habanalabs device structure.
391	* @va_list: list of virtual addresses to free.
392	*
393	* This function does the following:
394	* - Iterate over the list and free each virtual addresses block.
395	*
396	* This function should be called only when va_list lock is taken.
397	*/
398	static void clear_va_list_locked(struct hl_device *hdev,
399	struct list_head *va_list)
400	{
401	struct hl_vm_va_block va_block, tmp;
402
403	list_for_each_entry_safe(va_block, tmp, va_list, node) {
404	list_del(entry: &va_block->node);
405	kfree(objp: va_block);
406	}
407	}
408
/**
 * print_va_list_locked() - dump a virtual addresses list to the debug log.
 * @hdev: habanalabs device structure.
 * @va_list: list of virtual addresses to print.
 *
 * Emits one debug line per block in the list. The body is compiled in only
 * when HL_MMU_DEBUG is non-zero; otherwise the function does nothing.
 *
 * This function should be called only when va_list lock is taken.
 */
static void print_va_list_locked(struct hl_device *hdev,
		struct list_head *va_list)
{
#if HL_MMU_DEBUG
	struct hl_vm_va_block *blk;

	dev_dbg(hdev->dev, "print va list:\n");

	list_for_each_entry(blk, va_list, node) {
		dev_dbg(hdev->dev,
			"va block, start: 0x%llx, end: 0x%llx, size: %llu\n",
			blk->start, blk->end, blk->size);
	}
#endif
}
433
434	/**
435	* merge_va_blocks_locked() - merge a virtual block if possible.
436	* @hdev: pointer to the habanalabs device structure.
437	* @va_list: pointer to the virtual addresses block list.
438	* @va_block: virtual block to merge with adjacent blocks.
439	*
440	* This function does the following:
441	* - Merge the given blocks with the adjacent blocks if their virtual ranges
442	* create a contiguous virtual range.
443	*
444	* This Function should be called only when va_list lock is taken.
445	*/
446	static void merge_va_blocks_locked(struct hl_device *hdev,
447	struct list_head va_list, struct* hl_vm_va_block *va_block)
448	{
449	struct hl_vm_va_block prev, next;
450
451	prev = list_prev_entry(va_block, node);
452	if (&prev->node != va_list && prev->end + `1` == va_block->start) {
453	prev->end = va_block->end;
454	prev->size = prev->end - prev->start + `1`;
455	list_del(entry: &va_block->node);
456	kfree(objp: va_block);
457	va_block = prev;
458	}
459
460	next = list_next_entry(va_block, node);
461	if (&next->node != va_list && va_block->end + `1` == next->start) {
462	next->start = va_block->start;
463	next->size = next->end - next->start + `1`;
464	list_del(entry: &va_block->node);
465	kfree(objp: va_block);
466	}
467	}
468
469	/**
470	* add_va_block_locked() - add a virtual block to the virtual addresses list.
471	* @hdev: pointer to the habanalabs device structure.
472	* @va_list: pointer to the virtual addresses block list.
473	* @start: start virtual address.
474	* @end: end virtual address.
475	*
476	* This function does the following:
477	* - Add the given block to the virtual blocks list and merge with other blocks
478	* if a contiguous virtual block can be created.
479	*
480	* This Function should be called only when va_list lock is taken.
481	*/
482	static int add_va_block_locked(struct hl_device *hdev,
483	struct list_head *va_list, u64 start, u64 end)
484	{
485	struct hl_vm_va_block va_block, res = NULL;
486	u64 size = end - start + `1`;
487
488	print_va_list_locked(hdev, va_list);
489
490	list_for_each_entry(va_block, va_list, node) {
491	/ TODO: remove upon matureness /
492	if (hl_mem_area_crosses_range(address: start, size, range_start_address: va_block->start,
493	range_end_address: va_block->end)) {
494	dev_err(hdev->dev,
495	"block crossing ranges at start 0x%llx, end 0x%llx\n",
496	va_block->start, va_block->end);
497	return -EINVAL;
498	}
499
500	if (va_block->end < start)
501	res = va_block;
502	}
503
504	va_block = kmalloc(size: sizeof(*va_block), GFP_KERNEL);
505	if (!va_block)
506	return -ENOMEM;
507
508	va_block->start = start;
509	va_block->end = end;
510	va_block->size = size;
511
512	if (!res)
513	list_add(new: &va_block->node, head: va_list);
514	else
515	list_add(new: &va_block->node, head: &res->node);
516
517	merge_va_blocks_locked(hdev, va_list, va_block);
518
519	print_va_list_locked(hdev, va_list);
520
521	return `0`;
522	}
523
524	/**
525	* add_va_block() - wrapper for add_va_block_locked.
526	* @hdev: pointer to the habanalabs device structure.
527	* @va_range: pointer to the virtual addresses range object.
528	* @start: start virtual address.
529	* @end: end virtual address.
530	*
531	* This function does the following:
532	* - Takes the list lock and calls add_va_block_locked.
533	*/
534	static inline int add_va_block(struct hl_device *hdev,
535	struct hl_va_range *va_range, u64 start, u64 end)
536	{
537	int rc;
538
539	mutex_lock(&va_range->lock);
540	rc = add_va_block_locked(hdev, va_list: &va_range->list, start, end);
541	mutex_unlock(lock: &va_range->lock);
542
543	return rc;
544	}
545
546	/**
547	* is_hint_crossing_range() - check if hint address crossing specified reserved.
548	* @range_type: virtual space range type.
549	* @start_addr: start virtual address.
550	* @size: block size.
551	* @prop: asic properties structure to retrieve reserved ranges from.
552	*/
553	static inline bool is_hint_crossing_range(enum hl_va_range_type range_type,
554	u64 start_addr, u32 size, struct asic_fixed_properties *prop) {
555	bool range_cross;
556
557	if (range_type == HL_VA_RANGE_TYPE_DRAM)
558	range_cross =
559	hl_mem_area_crosses_range(address: start_addr, size,
560	range_start_address: prop->hints_dram_reserved_va_range.start_addr,
561	range_end_address: prop->hints_dram_reserved_va_range.end_addr);
562	else if (range_type == HL_VA_RANGE_TYPE_HOST)
563	range_cross =
564	hl_mem_area_crosses_range(address: start_addr, size,
565	range_start_address: prop->hints_host_reserved_va_range.start_addr,
566	range_end_address: prop->hints_host_reserved_va_range.end_addr);
567	else
568	range_cross =
569	hl_mem_area_crosses_range(address: start_addr, size,
570	range_start_address: prop->hints_host_hpage_reserved_va_range.start_addr,
571	range_end_address: prop->hints_host_hpage_reserved_va_range.end_addr);
572
573	return range_cross;
574	}
575
576	/**
577	* get_va_block() - get a virtual block for the given size and alignment.
578	*
579	* @hdev: pointer to the habanalabs device structure.
580	* @va_range: pointer to the virtual addresses range.
581	* @size: requested block size.
582	* @hint_addr: hint for requested address by the user.
583	* @va_block_align: required alignment of the virtual block start address.
584	* @range_type: va range type (host, dram)
585	* @flags: additional memory flags, currently only uses HL_MEM_FORCE_HINT
586	*
587	* This function does the following:
588	* - Iterate on the virtual block list to find a suitable virtual block for the
589	* given size, hint address and alignment.
590	* - Reserve the requested block and update the list.
591	* - Return the start address of the virtual block.
592	*/
593	static u64 get_va_block(struct hl_device *hdev,
594	struct hl_va_range *va_range,
595	u64 size, u64 hint_addr, u32 va_block_align,
596	enum hl_va_range_type range_type,
597	u32 flags)
598	{
599	struct hl_vm_va_block va_block, new_va_block = NULL;
600	struct asic_fixed_properties *prop = &hdev->asic_prop;
601	u64 tmp_hint_addr, valid_start, valid_size, prev_start, prev_end,
602	align_mask, reserved_valid_start = `0`, reserved_valid_size = `0`,
603	dram_hint_mask = prop->dram_hints_align_mask;
604	bool add_prev = false;
605	bool is_align_pow_2 = is_power_of_2(n: va_range->page_size);
606	bool is_hint_dram_addr = hl_is_dram_va(hdev, virt_addr: hint_addr);
607	bool force_hint = flags & HL_MEM_FORCE_HINT;
608	int rc;
609
610	if (is_align_pow_2)
611	align_mask = ~((u64)va_block_align - `1`);
612	else
613	/*
614	* with non-power-of-2 range we work only with page granularity
615	* and the start address is page aligned,
616	* so no need for alignment checking.
617	*/
618	size = DIV_ROUND_UP_ULL(size, va_range->page_size) *
619	va_range->page_size;
620
621	tmp_hint_addr = hint_addr & ~dram_hint_mask;
622
623	/ Check if we need to ignore hint address /
624	if ((is_align_pow_2 && (hint_addr & (va_block_align - `1`))) \|\|
625	(!is_align_pow_2 && is_hint_dram_addr &&
626	do_div(tmp_hint_addr, va_range->page_size))) {
627
628	if (force_hint) {
629	/ Hint must be respected, so here we just fail /
630	dev_err(hdev->dev,
631	"Hint address 0x%llx is not page aligned - cannot be respected\n",
632	hint_addr);
633	return `0`;
634	}
635
636	dev_dbg(hdev->dev,
637	"Hint address 0x%llx will be ignored because it is not aligned\n",
638	hint_addr);
639	hint_addr = `0`;
640	}
641
642	mutex_lock(&va_range->lock);
643
644	print_va_list_locked(hdev, va_list: &va_range->list);
645
646	list_for_each_entry(va_block, &va_range->list, node) {
647	/ Calc the first possible aligned addr /
648	valid_start = va_block->start;
649
650	if (is_align_pow_2 && (valid_start & (va_block_align - `1`))) {
651	valid_start &= align_mask;
652	valid_start += va_block_align;
653	if (valid_start > va_block->end)
654	continue;
655	}
656
657	valid_size = va_block->end - valid_start + `1`;
658	if (valid_size < size)
659	continue;
660
661	/*
662	* In case hint address is 0, and hints_range_reservation
663	* property enabled, then avoid allocating va blocks from the
664	* range reserved for hint addresses
665	*/
666	if (prop->hints_range_reservation && !hint_addr)
667	if (is_hint_crossing_range(range_type, start_addr: valid_start,
668	size, prop))
669	continue;
670
671	/ Pick the minimal length block which has the required size /
672	if (!new_va_block \|\| (valid_size < reserved_valid_size)) {
673	new_va_block = va_block;
674	reserved_valid_start = valid_start;
675	reserved_valid_size = valid_size;
676	}
677
678	if (hint_addr && hint_addr >= valid_start &&
679	(hint_addr + size) <= va_block->end) {
680	new_va_block = va_block;
681	reserved_valid_start = hint_addr;
682	reserved_valid_size = valid_size;
683	break;
684	}
685	}
686
687	if (!new_va_block) {
688	dev_err(hdev->dev, "no available va block for size %llu\n",
689	size);
690	goto out;
691	}
692
693	if (force_hint && reserved_valid_start != hint_addr) {
694	/ Hint address must be respected. If we are here - this means*
695	* we could not respect it.
696	*/
697	dev_err(hdev->dev,
698	"Hint address 0x%llx could not be respected\n",
699	hint_addr);
700	reserved_valid_start = `0`;
701	goto out;
702	}
703
704	/*
705	* Check if there is some leftover range due to reserving the new
706	* va block, then return it to the main virtual addresses list.
707	*/
708	if (reserved_valid_start > new_va_block->start) {
709	prev_start = new_va_block->start;
710	prev_end = reserved_valid_start - `1`;
711
712	new_va_block->start = reserved_valid_start;
713	new_va_block->size = reserved_valid_size;
714
715	add_prev = true;
716	}
717
718	if (new_va_block->size > size) {
719	new_va_block->start += size;
720	new_va_block->size = new_va_block->end - new_va_block->start + `1`;
721	} else {
722	list_del(entry: &new_va_block->node);
723	kfree(objp: new_va_block);
724	}
725
726	if (add_prev) {
727	rc = add_va_block_locked(hdev, va_list: &va_range->list, start: prev_start, end: prev_end);
728	if (rc) {
729	reserved_valid_start = `0`;
730	goto out;
731	}
732	}
733
734	print_va_list_locked(hdev, va_list: &va_range->list);
735	out:
736	mutex_unlock(lock: &va_range->lock);
737
738	return reserved_valid_start;
739	}
740
741	/*
742	* hl_reserve_va_block() - reserve a virtual block of a given size.
743	* @hdev: pointer to the habanalabs device structure.
744	* @ctx: current context
745	* @type: virtual addresses range type.
746	* @size: requested block size.
747	* @alignment: required alignment in bytes of the virtual block start address,
748	* 0 means no alignment.
749	*
750	* This function does the following:
751	* - Iterate on the virtual block list to find a suitable virtual block for the
752	* given size and alignment.
753	* - Reserve the requested block and update the list.
754	* - Return the start address of the virtual block.
755	*/
756	u64 hl_reserve_va_block(struct hl_device hdev, struct* hl_ctx *ctx,
757	enum hl_va_range_type type, u64 size, u32 alignment)
758	{
759	return get_va_block(hdev, va_range: ctx->va_range[type], size, hint_addr: `0`,
760	max(alignment, ctx->va_range[type]->page_size),
761	range_type: type, flags: `0`);
762	}
763
764	/**
765	* hl_get_va_range_type() - get va_range type for the given address and size.
766	* @ctx: context to fetch va_range from.
767	* @address: the start address of the area we want to validate.
768	* @size: the size in bytes of the area we want to validate.
769	* @type: returned va_range type.
770	*
771	* Return: true if the area is inside a valid range, false otherwise.
772	*/
773	static int hl_get_va_range_type(struct hl_ctx *ctx, u64 address, u64 size,
774	enum hl_va_range_type *type)
775	{
776	int i;
777
778	for (i = `0` ; i < HL_VA_RANGE_TYPE_MAX; i++) {
779	if (hl_mem_area_inside_range(address, size,
780	range_start_address: ctx->va_range[i]->start_addr,
781	range_end_address: ctx->va_range[i]->end_addr)) {
782	*type = i;
783	return `0`;
784	}
785	}
786
787	return -EINVAL;
788	}
789
790	/**
791	* hl_unreserve_va_block() - wrapper for add_va_block to unreserve a va block.
792	* @hdev: pointer to the habanalabs device structure
793	* @ctx: pointer to the context structure.
794	* @start_addr: start virtual address.
795	* @size: number of bytes to unreserve.
796	*
797	* This function does the following:
798	* - Takes the list lock and calls add_va_block_locked.
799	*/
800	int hl_unreserve_va_block(struct hl_device hdev, struct* hl_ctx *ctx,
801	u64 start_addr, u64 size)
802	{
803	enum hl_va_range_type type;
804	int rc;
805
806	rc = hl_get_va_range_type(ctx, address: start_addr, size, type: &type);
807	if (rc) {
808	dev_err(hdev->dev,
809	"cannot find va_range for va %#llx size %llu",
810	start_addr, size);
811	return rc;
812	}
813
814	rc = add_va_block(hdev, va_range: ctx->va_range[type], start: start_addr,
815	end: start_addr + size - `1`);
816	if (rc)
817	dev_warn(hdev->dev,
818	"add va block failed for vaddr: 0x%llx\n", start_addr);
819
820	return rc;
821	}
822
823	/**
824	* init_phys_pg_pack_from_userptr() - initialize physical page pack from host
825	* memory
826	* @ctx: pointer to the context structure.
827	* @userptr: userptr to initialize from.
828	* @pphys_pg_pack: result pointer.
829	* @force_regular_page: tell the function to ignore huge page optimization,
830	* even if possible. Needed for cases where the device VA
831	* is allocated before we know the composition of the
832	* physical pages
833	*
834	* This function does the following:
835	* - Create a physical page pack from the physical pages related to the given
836	* virtual block.
837	*/
838	static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
839	struct hl_userptr *userptr,
840	struct hl_vm_phys_pg_pack **pphys_pg_pack,
841	bool force_regular_page)
842	{
843	u32 npages, page_size = PAGE_SIZE,
844	huge_page_size = ctx->hdev->asic_prop.pmmu_huge.page_size;
845	u32 pgs_in_huge_page = huge_page_size >> __ffs(page_size);
846	struct hl_vm_phys_pg_pack *phys_pg_pack;
847	bool first = true, is_huge_page_opt;
848	u64 page_mask, total_npages;
849	struct scatterlist *sg;
850	dma_addr_t dma_addr;
851	int rc, i, j;
852
853	phys_pg_pack = kzalloc(size: sizeof(*phys_pg_pack), GFP_KERNEL);
854	if (!phys_pg_pack)
855	return -ENOMEM;
856
857	phys_pg_pack->vm_type = userptr->vm_type;
858	phys_pg_pack->created_from_userptr = true;
859	phys_pg_pack->asid = ctx->asid;
860	atomic_set(v: &phys_pg_pack->mapping_cnt, i: `1`);
861
862	is_huge_page_opt = (force_regular_page ? false : true);
863
864	/ Only if all dma_addrs are aligned to 2MB and their*
865	* sizes is at least 2MB, we can use huge page mapping.
866	* We limit the 2MB optimization to this condition,
867	* since later on we acquire the related VA range as one
868	* consecutive block.
869	*/
870	total_npages = `0`;
871	for_each_sgtable_dma_sg(userptr->sgt, sg, i) {
872	npages = hl_get_sg_info(sg, dma_addr: &dma_addr);
873
874	total_npages += npages;
875
876	if ((npages % pgs_in_huge_page) \|\|
877	(dma_addr & (huge_page_size - `1`)))
878	is_huge_page_opt = false;
879	}
880
881	if (is_huge_page_opt) {
882	page_size = huge_page_size;
883	do_div(total_npages, pgs_in_huge_page);
884	}
885
886	page_mask = ~(((u64) page_size) - `1`);
887
888	phys_pg_pack->pages = kvmalloc_array(n: total_npages, size: sizeof(u64),
889	GFP_KERNEL);
890	if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {
891	rc = -ENOMEM;
892	goto page_pack_arr_mem_err;
893	}
894
895	phys_pg_pack->npages = total_npages;
896	phys_pg_pack->page_size = page_size;
897	phys_pg_pack->total_size = total_npages * page_size;
898
899	j = `0`;
900	for_each_sgtable_dma_sg(userptr->sgt, sg, i) {
901	npages = hl_get_sg_info(sg, dma_addr: &dma_addr);
902
903	/ align down to physical page size and save the offset /
904	if (first) {
905	first = false;
906	phys_pg_pack->offset = dma_addr & (page_size - `1`);
907	dma_addr &= page_mask;
908	}
909
910	while (npages) {
911	phys_pg_pack->pages[j++] = dma_addr;
912	dma_addr += page_size;
913
914	if (is_huge_page_opt)
915	npages -= pgs_in_huge_page;
916	else
917	npages--;
918	}
919	}
920
921	*pphys_pg_pack = phys_pg_pack;
922
923	return `0`;
924
925	page_pack_arr_mem_err:
926	kfree(objp: phys_pg_pack);
927
928	return rc;
929	}
930
931	/**
932	* map_phys_pg_pack() - maps the physical page pack..
933	* @ctx: pointer to the context structure.
934	* @vaddr: start address of the virtual area to map from.
935	* @phys_pg_pack: the pack of physical pages to map to.
936	*
937	* This function does the following:
938	* - Maps each chunk of virtual memory to matching physical chunk.
939	* - Stores number of successful mappings in the given argument.
940	* - Returns 0 on success, error code otherwise.
941	*/
942	static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
943	struct hl_vm_phys_pg_pack *phys_pg_pack)
944	{
945	struct hl_device *hdev = ctx->hdev;
946	u64 next_vaddr = vaddr, paddr, mapped_pg_cnt = `0`, i;
947	u32 page_size = phys_pg_pack->page_size;
948	int rc = `0`;
949	bool is_host_addr;
950
951	for (i = `0` ; i < phys_pg_pack->npages ; i++) {
952	paddr = phys_pg_pack->pages[i];
953
954	rc = hl_mmu_map_page(ctx, virt_addr: next_vaddr, phys_addr: paddr, page_size,
955	flush_pte: (i + `1`) == phys_pg_pack->npages);
956	if (rc) {
957	dev_err(hdev->dev,
958	"map failed (%d) for handle %u, npages: %llu, mapped: %llu\n",
959	rc, phys_pg_pack->handle, phys_pg_pack->npages,
960	mapped_pg_cnt);
961	goto err;
962	}
963
964	mapped_pg_cnt++;
965	next_vaddr += page_size;
966	}
967
968	return `0`;
969
970	err:
971	is_host_addr = !hl_is_dram_va(hdev, virt_addr: vaddr);
972
973	next_vaddr = vaddr;
974	for (i = `0` ; i < mapped_pg_cnt ; i++) {
975	if (hl_mmu_unmap_page(ctx, virt_addr: next_vaddr, page_size,
976	flush_pte: (i + `1`) == mapped_pg_cnt))
977	dev_warn_ratelimited(hdev->dev,
978	"failed to unmap handle %u, va: 0x%llx, pa: 0x%llx, page size: %u\n",
979	phys_pg_pack->handle, next_vaddr,
980	phys_pg_pack->pages[i], page_size);
981
982	next_vaddr += page_size;
983
984	/*
985	* unmapping on Palladium can be really long, so avoid a CPU
986	* soft lockup bug by sleeping a little between unmapping pages
987	*
988	* In addition, on host num of pages could be huge,
989	* because page size could be 4KB, so when unmapping host
990	* pages sleep every 32K pages to avoid soft lockup
991	*/
992	if (hdev->pldm \|\| (is_host_addr && (i & `0x7FFF`) == `0`))
993	usleep_range(min: `50`, max: `200`);
994	}
995
996	return rc;
997	}
998
999	/**
1000	* unmap_phys_pg_pack() - unmaps the physical page pack.
1001	* @ctx: pointer to the context structure.
1002	* @vaddr: start address of the virtual area to unmap.
1003	* @phys_pg_pack: the pack of physical pages to unmap.
1004	*/
1005	static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
1006	struct hl_vm_phys_pg_pack *phys_pg_pack)
1007	{
1008	struct hl_device *hdev = ctx->hdev;
1009	u64 next_vaddr, i;
1010	bool is_host_addr;
1011	u32 page_size;
1012
1013	is_host_addr = !hl_is_dram_va(hdev, virt_addr: vaddr);
1014	page_size = phys_pg_pack->page_size;
1015	next_vaddr = vaddr;
1016
1017	for (i = `0` ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size) {
1018	if (hl_mmu_unmap_page(ctx, virt_addr: next_vaddr, page_size,
1019	flush_pte: (i + `1`) == phys_pg_pack->npages))
1020	dev_warn_ratelimited(hdev->dev,
1021	"unmap failed for vaddr: 0x%llx\n", next_vaddr);
1022
1023	/*
1024	* unmapping on Palladium can be really long, so avoid a CPU
1025	* soft lockup bug by sleeping a little between unmapping pages
1026	*
1027	* In addition, on host num of pages could be huge,
1028	* because page size could be 4KB, so when unmapping host
1029	* pages sleep every 32K pages to avoid soft lockup
1030	*/
1031	if (hdev->pldm \|\| (is_host_addr && (i & `0x7FFF`) == `0`))
1032	usleep_range(min: `50`, max: `200`);
1033	}
1034	}
1035
1036	/**
1037	* map_device_va() - map the given memory.
1038	* @ctx: pointer to the context structure.
1039	* @args: host parameters with handle/host virtual address.
1040	* @device_addr: pointer to result device virtual address.
1041	*
1042	* This function does the following:
1043	* - If given a physical device memory handle, map to a device virtual block
1044	* and return the start address of this block.
1045	* - If given a host virtual address and size, find the related physical pages,
1046	* map a device virtual block to this pages and return the start address of
1047	* this block.
1048	*/
1049	static int map_device_va(struct hl_ctx ctx, struct* hl_mem_in args, u64 device_addr)
1050	{
1051	struct hl_vm_phys_pg_pack *phys_pg_pack;
1052	enum hl_va_range_type va_range_type = `0`;
1053	struct hl_device *hdev = ctx->hdev;
1054	struct hl_userptr *userptr = NULL;
1055	u32 handle = `0`, va_block_align;
1056	struct hl_vm_hash_node *hnode;
1057	struct hl_vm *vm = &hdev->vm;
1058	struct hl_va_range *va_range;
1059	bool is_userptr, do_prefetch;
1060	u64 ret_vaddr, hint_addr;
1061	enum vm_type *vm_type;
1062	int rc;
1063
1064	/ set map flags /
1065	is_userptr = args->flags & HL_MEM_USERPTR;
1066	do_prefetch = hdev->supports_mmu_prefetch && (args->flags & HL_MEM_PREFETCH);
1067
1068	/ Assume failure /
1069	*device_addr = `0`;
1070
1071	if (is_userptr) {
1072	u64 addr = args->map_host.host_virt_addr,
1073	size = args->map_host.mem_size;
1074	u32 page_size = hdev->asic_prop.pmmu.page_size,
1075	huge_page_size = hdev->asic_prop.pmmu_huge.page_size;
1076
1077	rc = dma_map_host_va(hdev, addr, size, p_userptr: &userptr);
1078	if (rc)
1079	return rc;
1080
1081	rc = init_phys_pg_pack_from_userptr(ctx, userptr,
1082	pphys_pg_pack: &phys_pg_pack, force_regular_page: false);
1083	if (rc) {
1084	dev_err(hdev->dev,
1085	"unable to init page pack for vaddr 0x%llx\n",
1086	addr);
1087	goto init_page_pack_err;
1088	}
1089
1090	vm_type = (enum vm_type *) userptr;
1091	hint_addr = args->map_host.hint_addr;
1092	handle = phys_pg_pack->handle;
1093
1094	/ get required alignment /
1095	if (phys_pg_pack->page_size == page_size) {
1096	va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];
1097	va_range_type = HL_VA_RANGE_TYPE_HOST;
1098	/*
1099	* huge page alignment may be needed in case of regular
1100	* page mapping, depending on the host VA alignment
1101	*/
1102	if (addr & (huge_page_size - `1`))
1103	va_block_align = page_size;
1104	else
1105	va_block_align = huge_page_size;
1106	} else {
1107	/*
1108	* huge page alignment is needed in case of huge page
1109	* mapping
1110	*/
1111	va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];
1112	va_range_type = HL_VA_RANGE_TYPE_HOST_HUGE;
1113	va_block_align = huge_page_size;
1114	}
1115	} else {
1116	handle = lower_32_bits(args->map_device.handle);
1117
1118	spin_lock(lock: &vm->idr_lock);
1119	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, id: handle);
1120	if (!phys_pg_pack) {
1121	spin_unlock(lock: &vm->idr_lock);
1122	dev_err(hdev->dev,
1123	"no match for handle %u\n", handle);
1124	return -EINVAL;
1125	}
1126
1127	/ increment now to avoid freeing device memory while mapping /
1128	atomic_inc(v: &phys_pg_pack->mapping_cnt);
1129
1130	spin_unlock(lock: &vm->idr_lock);
1131
1132	vm_type = (enum vm_type *) phys_pg_pack;
1133
1134	hint_addr = args->map_device.hint_addr;
1135
1136	/ DRAM VA alignment is the same as the MMU page size /
1137	va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];
1138	va_range_type = HL_VA_RANGE_TYPE_DRAM;
1139	va_block_align = hdev->asic_prop.dmmu.page_size;
1140	}
1141
1142	/*
1143	* relevant for mapping device physical memory only, as host memory is
1144	* implicitly shared
1145	*/
1146	if (!is_userptr && !(phys_pg_pack->flags & HL_MEM_SHARED) &&
1147	phys_pg_pack->asid != ctx->asid) {
1148	dev_err(hdev->dev,
1149	"Failed to map memory, handle %u is not shared\n",
1150	handle);
1151	rc = -EPERM;
1152	goto shared_err;
1153	}
1154
1155	hnode = kzalloc(size: sizeof(*hnode), GFP_KERNEL);
1156	if (!hnode) {
1157	rc = -ENOMEM;
1158	goto hnode_err;
1159	}
1160
1161	if (hint_addr && phys_pg_pack->offset) {
1162	if (args->flags & HL_MEM_FORCE_HINT) {
1163	/ Fail if hint must be respected but it can't be /
1164	dev_err(hdev->dev,
1165	"Hint address 0x%llx cannot be respected because source memory is not aligned 0x%x\n",
1166	hint_addr, phys_pg_pack->offset);
1167	rc = -EINVAL;
1168	goto va_block_err;
1169	}
1170	dev_dbg(hdev->dev,
1171	"Hint address 0x%llx will be ignored because source memory is not aligned 0x%x\n",
1172	hint_addr, phys_pg_pack->offset);
1173	}
1174
1175	ret_vaddr = get_va_block(hdev, va_range, size: phys_pg_pack->total_size,
1176	hint_addr, va_block_align,
1177	range_type: va_range_type, flags: args->flags);
1178	if (!ret_vaddr) {
1179	dev_err(hdev->dev, "no available va block for handle %u\n",
1180	handle);
1181	rc = -ENOMEM;
1182	goto va_block_err;
1183	}
1184
1185	mutex_lock(&hdev->mmu_lock);
1186
1187	rc = map_phys_pg_pack(ctx, vaddr: ret_vaddr, phys_pg_pack);
1188	if (rc) {
1189	dev_err(hdev->dev, "mapping page pack failed (%d) for handle %u\n",
1190	rc, handle);
1191	mutex_unlock(lock: &hdev->mmu_lock);
1192	goto map_err;
1193	}
1194
1195	rc = hl_mmu_invalidate_cache_range(hdev, is_hard: false, flags: *vm_type \| MMU_OP_SKIP_LOW_CACHE_INV,
1196	asid: ctx->asid, va: ret_vaddr, size: phys_pg_pack->total_size);
1197	mutex_unlock(lock: &hdev->mmu_lock);
1198	if (rc)
1199	goto map_err;
1200
1201	/*
1202	* prefetch is done upon user's request. it is performed in WQ as and so can
1203	* be outside the MMU lock. the operation itself is already protected by the mmu lock
1204	*/
1205	if (do_prefetch) {
1206	rc = hl_mmu_prefetch_cache_range(ctx, flags: *vm_type, asid: ctx->asid, va: ret_vaddr,
1207	size: phys_pg_pack->total_size);
1208	if (rc)
1209	goto map_err;
1210	}
1211
1212	ret_vaddr += phys_pg_pack->offset;
1213
1214	hnode->ptr = vm_type;
1215	hnode->vaddr = ret_vaddr;
1216	hnode->handle = is_userptr ? MEM_HANDLE_INVALID : handle;
1217
1218	mutex_lock(&ctx->mem_hash_lock);
1219	hash_add(ctx->mem_hash, &hnode->node, ret_vaddr);
1220	mutex_unlock(lock: &ctx->mem_hash_lock);
1221
1222	*device_addr = ret_vaddr;
1223
1224	if (is_userptr)
1225	free_phys_pg_pack(hdev, phys_pg_pack);
1226
1227	return rc;
1228
1229	map_err:
1230	if (add_va_block(hdev, va_range, start: ret_vaddr,
1231	end: ret_vaddr + phys_pg_pack->total_size - `1`))
1232	dev_warn(hdev->dev,
1233	"release va block failed for handle 0x%x, vaddr: 0x%llx\n",
1234	handle, ret_vaddr);
1235
1236	va_block_err:
1237	kfree(objp: hnode);
1238	hnode_err:
1239	shared_err:
1240	atomic_dec(v: &phys_pg_pack->mapping_cnt);
1241	if (is_userptr)
1242	free_phys_pg_pack(hdev, phys_pg_pack);
1243	init_page_pack_err:
1244	if (is_userptr)
1245	dma_unmap_host_va(hdev, userptr);
1246
1247	return rc;
1248	}
1249
1250	/ Should be called while the context's mem_hash_lock is taken /
1251	static struct hl_vm_hash_node get_vm_hash_node_locked(struct* hl_ctx *ctx, u64 vaddr)
1252	{
1253	struct hl_vm_hash_node *hnode;
1254
1255	hash_for_each_possible(ctx->mem_hash, hnode, node, vaddr)
1256	if (vaddr == hnode->vaddr)
1257	return hnode;
1258
1259	return NULL;
1260	}
1261
1262	/**
1263	* unmap_device_va() - unmap the given device virtual address.
1264	* @ctx: pointer to the context structure.
1265	* @args: host parameters with device virtual address to unmap.
1266	* @ctx_free: true if in context free flow, false otherwise.
1267	*
1268	* This function does the following:
1269	* - unmap the physical pages related to the given virtual address.
1270	* - return the device virtual block to the virtual block list.
1271	*/
1272	static int unmap_device_va(struct hl_ctx ctx, struct* hl_mem_in *args,
1273	bool ctx_free)
1274	{
1275	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
1276	u64 vaddr = args->unmap.device_virt_addr;
1277	struct asic_fixed_properties *prop;
1278	struct hl_device *hdev = ctx->hdev;
1279	struct hl_userptr *userptr = NULL;
1280	struct hl_vm_hash_node *hnode;
1281	struct hl_va_range *va_range;
1282	enum vm_type *vm_type;
1283	bool is_userptr;
1284	int rc = `0`;
1285
1286	prop = &hdev->asic_prop;
1287
1288	/ protect from double entrance /
1289	mutex_lock(&ctx->mem_hash_lock);
1290	hnode = get_vm_hash_node_locked(ctx, vaddr);
1291	if (!hnode) {
1292	mutex_unlock(lock: &ctx->mem_hash_lock);
1293	dev_err(hdev->dev, "unmap failed, no mem hnode for vaddr 0x%llx\n", vaddr);
1294	return -EINVAL;
1295	}
1296
1297	if (hnode->export_cnt) {
1298	mutex_unlock(lock: &ctx->mem_hash_lock);
1299	dev_err(hdev->dev, "failed to unmap %#llx, memory is exported\n", vaddr);
1300	return -EINVAL;
1301	}
1302
1303	hash_del(node: &hnode->node);
1304	mutex_unlock(lock: &ctx->mem_hash_lock);
1305
1306	vm_type = hnode->ptr;
1307
1308	if (*vm_type == VM_TYPE_USERPTR) {
1309	is_userptr = true;
1310	userptr = hnode->ptr;
1311
1312	rc = init_phys_pg_pack_from_userptr(ctx, userptr, pphys_pg_pack: &phys_pg_pack,
1313	force_regular_page: false);
1314	if (rc) {
1315	dev_err(hdev->dev,
1316	"unable to init page pack for vaddr 0x%llx\n",
1317	vaddr);
1318	goto vm_type_err;
1319	}
1320
1321	if (phys_pg_pack->page_size ==
1322	hdev->asic_prop.pmmu.page_size)
1323	va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];
1324	else
1325	va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];
1326	} else if (*vm_type == VM_TYPE_PHYS_PACK) {
1327	is_userptr = false;
1328	va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];
1329	phys_pg_pack = hnode->ptr;
1330	} else {
1331	dev_warn(hdev->dev,
1332	"unmap failed, unknown vm desc for vaddr 0x%llx\n",
1333	vaddr);
1334	rc = -EFAULT;
1335	goto vm_type_err;
1336	}
1337
1338	if (atomic_read(v: &phys_pg_pack->mapping_cnt) == `0`) {
1339	dev_err(hdev->dev, "vaddr 0x%llx is not mapped\n", vaddr);
1340	rc = -EINVAL;
1341	goto mapping_cnt_err;
1342	}
1343
1344	if (!is_userptr && !is_power_of_2(n: phys_pg_pack->page_size))
1345	vaddr = prop->dram_base_address +
1346	DIV_ROUND_DOWN_ULL(vaddr - prop->dram_base_address,
1347	phys_pg_pack->page_size) *
1348	phys_pg_pack->page_size;
1349	else
1350	vaddr &= ~(((u64) phys_pg_pack->page_size) - `1`);
1351
1352	mutex_lock(&hdev->mmu_lock);
1353
1354	unmap_phys_pg_pack(ctx, vaddr, phys_pg_pack);
1355
1356	/*
1357	* During context free this function is called in a loop to clean all
1358	* the context mappings. Hence the cache invalidation can be called once
1359	* at the loop end rather than for each iteration
1360	*/
1361	if (!ctx_free)
1362	rc = hl_mmu_invalidate_cache_range(hdev, is_hard: true, flags: *vm_type, asid: ctx->asid, va: vaddr,
1363	size: phys_pg_pack->total_size);
1364
1365	mutex_unlock(lock: &hdev->mmu_lock);
1366
1367	/*
1368	* If the context is closing we don't need to check for the MMU cache
1369	* invalidation return code and update the VA free list as in this flow
1370	* we invalidate the MMU cache outside of this unmap function and the VA
1371	* free list will be freed anyway.
1372	*/
1373	if (!ctx_free) {
1374	int tmp_rc;
1375
1376	tmp_rc = add_va_block(hdev, va_range, start: vaddr,
1377	end: vaddr + phys_pg_pack->total_size - `1`);
1378	if (tmp_rc) {
1379	dev_warn(hdev->dev,
1380	"add va block failed for vaddr: 0x%llx\n",
1381	vaddr);
1382	if (!rc)
1383	rc = tmp_rc;
1384	}
1385	}
1386
1387	atomic_dec(v: &phys_pg_pack->mapping_cnt);
1388	kfree(objp: hnode);
1389
1390	if (is_userptr) {
1391	free_phys_pg_pack(hdev, phys_pg_pack);
1392	dma_unmap_host_va(hdev, userptr);
1393	}
1394
1395	return rc;
1396
1397	mapping_cnt_err:
1398	if (is_userptr)
1399	free_phys_pg_pack(hdev, phys_pg_pack);
1400	vm_type_err:
1401	mutex_lock(&ctx->mem_hash_lock);
1402	hash_add(ctx->mem_hash, &hnode->node, vaddr);
1403	mutex_unlock(lock: &ctx->mem_hash_lock);
1404
1405	return rc;
1406	}
1407
1408	static int map_block(struct hl_device hdev, u64 address, u64 handle, u32 *size)
1409	{
1410	u32 block_id;
1411	int rc;
1412
1413	*handle = `0`;
1414	if (size)
1415	*size = `0`;
1416
1417	rc = hdev->asic_funcs->get_hw_block_id(hdev, address, size, &block_id);
1418	if (rc)
1419	return rc;
1420
1421	*handle = block_id \| HL_MMAP_TYPE_BLOCK;
1422	*handle <<= PAGE_SHIFT;
1423
1424	return `0`;
1425	}
1426
1427	static void hw_block_vm_close(struct vm_area_struct *vma)
1428	{
1429	struct hl_vm_hw_block_list_node *lnode =
1430	(struct hl_vm_hw_block_list_node *) vma->vm_private_data;
1431	struct hl_ctx *ctx = lnode->ctx;
1432	long new_mmap_size;
1433
1434	new_mmap_size = lnode->mapped_size - (vma->vm_end - vma->vm_start);
1435	if (new_mmap_size > `0`) {
1436	lnode->mapped_size = new_mmap_size;
1437	return;
1438	}
1439
1440	mutex_lock(&ctx->hw_block_list_lock);
1441	list_del(entry: &lnode->node);
1442	mutex_unlock(lock: &ctx->hw_block_list_lock);
1443	hl_ctx_put(ctx);
1444	kfree(objp: lnode);
1445	vma->vm_private_data = NULL;
1446	}
1447
1448	static const struct vm_operations_struct hw_block_vm_ops = {
1449	.close = hw_block_vm_close
1450	};
1451
1452	/**
1453	* hl_hw_block_mmap() - mmap a hw block to user.
1454	* @hpriv: pointer to the private data of the fd
1455	* @vma: pointer to vm_area_struct of the process
1456	*
1457	* Driver increments context reference for every HW block mapped in order
1458	* to prevent user from closing FD without unmapping first
1459	*/
1460	int hl_hw_block_mmap(struct hl_fpriv hpriv, struct* vm_area_struct *vma)
1461	{
1462	struct hl_vm_hw_block_list_node *lnode;
1463	struct hl_device *hdev = hpriv->hdev;
1464	struct hl_ctx *ctx = hpriv->ctx;
1465	u32 block_id, block_size;
1466	int rc;
1467
1468	/ We use the page offset to hold the block id and thus we need to clear*
1469	* it before doing the mmap itself
1470	*/
1471	block_id = vma->vm_pgoff;
1472	vma->vm_pgoff = `0`;
1473
1474	/ Driver only allows mapping of a complete HW block /
1475	block_size = vma->vm_end - vma->vm_start;
1476
1477	if (!access_ok((void __user *) (uintptr_t) vma->vm_start, block_size)) {
1478	dev_err(hdev->dev,
1479	"user pointer is invalid - 0x%lx\n",
1480	vma->vm_start);
1481
1482	return -EINVAL;
1483	}
1484
1485	lnode = kzalloc(size: sizeof(*lnode), GFP_KERNEL);
1486	if (!lnode)
1487	return -ENOMEM;
1488
1489	rc = hdev->asic_funcs->hw_block_mmap(hdev, vma, block_id, block_size);
1490	if (rc) {
1491	kfree(objp: lnode);
1492	return rc;
1493	}
1494
1495	hl_ctx_get(ctx);
1496
1497	lnode->ctx = ctx;
1498	lnode->vaddr = vma->vm_start;
1499	lnode->block_size = block_size;
1500	lnode->mapped_size = lnode->block_size;
1501	lnode->id = block_id;
1502
1503	vma->vm_private_data = lnode;
1504	vma->vm_ops = &hw_block_vm_ops;
1505
1506	mutex_lock(&ctx->hw_block_list_lock);
1507	list_add_tail(new: &lnode->node, head: &ctx->hw_block_mem_list);
1508	mutex_unlock(lock: &ctx->hw_block_list_lock);
1509
1510	vma->vm_pgoff = block_id;
1511
1512	return `0`;
1513	}
1514
1515	static int set_dma_sg(struct scatterlist *sg, u64 bar_address, u64 chunk_size,
1516	struct device dev, enum* dma_data_direction dir)
1517	{
1518	dma_addr_t addr;
1519	int rc;
1520
1521	addr = dma_map_resource(dev, phys_addr: bar_address, size: chunk_size, dir,
1522	DMA_ATTR_SKIP_CPU_SYNC);
1523	rc = dma_mapping_error(dev, dma_addr: addr);
1524	if (rc)
1525	return rc;
1526
1527	sg_set_page(sg, NULL, len: chunk_size, offset: `0`);
1528	sg_dma_address(sg) = addr;
1529	sg_dma_len(sg) = chunk_size;
1530
1531	return `0`;
1532	}
1533
1534	static struct sg_table alloc_sgt_from_device_pages(struct* hl_device hdev, u64 pages, u64 npages,
1535	u64 page_size, u64 exported_size, u64 offset,
1536	struct device dev, enum* dma_data_direction dir)
1537	{
1538	u64 dma_max_seg_size, curr_page, size, chunk_size, left_size_to_export, left_size_in_page,
1539	left_size_in_dma_seg, device_address, bar_address, start_page;
1540	struct asic_fixed_properties *prop = &hdev->asic_prop;
1541	struct scatterlist *sg;
1542	unsigned int nents, i;
1543	struct sg_table *sgt;
1544	bool next_sg_entry;
1545	int rc;
1546
1547	/ Align max segment size to PAGE_SIZE to fit the minimal IOMMU mapping granularity /
1548	dma_max_seg_size = ALIGN_DOWN(dma_get_max_seg_size(dev), PAGE_SIZE);
1549	if (dma_max_seg_size < PAGE_SIZE) {
1550	dev_err_ratelimited(hdev->dev,
1551	"dma_max_seg_size %llu can't be smaller than PAGE_SIZE\n",
1552	dma_max_seg_size);
1553	return ERR_PTR(error: -EINVAL);
1554	}
1555
1556	sgt = kzalloc(size: sizeof(*sgt), GFP_KERNEL);
1557	if (!sgt)
1558	return ERR_PTR(error: -ENOMEM);
1559
1560	/ Use the offset to move to the actual first page that is exported /
1561	for (start_page = `0` ; start_page < npages ; ++start_page) {
1562	if (offset < page_size)
1563	break;
1564
1565	/ The offset value was validated so there can't be an underflow /
1566	offset -= page_size;
1567	}
1568
1569	/ Calculate the required number of entries for the SG table /
1570	curr_page = start_page;
1571	nents = `1`;
1572	left_size_to_export = exported_size;
1573	left_size_in_page = page_size - offset;
1574	left_size_in_dma_seg = dma_max_seg_size;
1575	next_sg_entry = false;
1576
1577	while (true) {
1578	size = min3(left_size_to_export, left_size_in_page, left_size_in_dma_seg);
1579	left_size_to_export -= size;
1580	left_size_in_page -= size;
1581	left_size_in_dma_seg -= size;
1582
1583	if (!left_size_to_export)
1584	break;
1585
1586	if (!left_size_in_page) {
1587	/ left_size_to_export is not zero so there must be another page /
1588	if (pages[curr_page] + page_size != pages[curr_page + `1`])
1589	next_sg_entry = true;
1590
1591	++curr_page;
1592	left_size_in_page = page_size;
1593	}
1594
1595	if (!left_size_in_dma_seg) {
1596	next_sg_entry = true;
1597	left_size_in_dma_seg = dma_max_seg_size;
1598	}
1599
1600	if (next_sg_entry) {
1601	++nents;
1602	next_sg_entry = false;
1603	}
1604	}
1605
1606	rc = sg_alloc_table(sgt, nents, GFP_KERNEL \| __GFP_ZERO);
1607	if (rc)
1608	goto err_free_sgt;
1609
1610	/ Prepare the SG table entries /
1611	curr_page = start_page;
1612	device_address = pages[curr_page] + offset;
1613	left_size_to_export = exported_size;
1614	left_size_in_page = page_size - offset;
1615	left_size_in_dma_seg = dma_max_seg_size;
1616	next_sg_entry = false;
1617
1618	for_each_sgtable_dma_sg(sgt, sg, i) {
1619	bar_address = hdev->dram_pci_bar_start + (device_address - prop->dram_base_address);
1620	chunk_size = `0`;
1621
1622	for ( ; curr_page < npages ; ++curr_page) {
1623	size = min3(left_size_to_export, left_size_in_page, left_size_in_dma_seg);
1624	chunk_size += size;
1625	left_size_to_export -= size;
1626	left_size_in_page -= size;
1627	left_size_in_dma_seg -= size;
1628
1629	if (!left_size_to_export)
1630	break;
1631
1632	if (!left_size_in_page) {
1633	/ left_size_to_export is not zero so there must be another page /
1634	if (pages[curr_page] + page_size != pages[curr_page + `1`]) {
1635	device_address = pages[curr_page + `1`];
1636	next_sg_entry = true;
1637	}
1638
1639	left_size_in_page = page_size;
1640	}
1641
1642	if (!left_size_in_dma_seg) {
1643	/*
1644	* Skip setting a new device address if already moving to a page
1645	* which is not contiguous with the current page.
1646	*/
1647	if (!next_sg_entry) {
1648	device_address += chunk_size;
1649	next_sg_entry = true;
1650	}
1651
1652	left_size_in_dma_seg = dma_max_seg_size;
1653	}
1654
1655	if (next_sg_entry) {
1656	next_sg_entry = false;
1657	break;
1658	}
1659	}
1660
1661	rc = set_dma_sg(sg, bar_address, chunk_size, dev, dir);
1662	if (rc)
1663	goto err_unmap;
1664	}
1665
1666	/ There should be nothing left to export exactly after looping over all SG elements /
1667	if (left_size_to_export) {
1668	dev_err(hdev->dev,
1669	"left size to export %#llx after initializing %u SG elements\n",
1670	left_size_to_export, sgt->nents);
1671	rc = -ENOMEM;
1672	goto err_unmap;
1673	}
1674
1675	/*
1676	* Because we are not going to include a CPU list, we want to have some chance that other
1677	* users will detect this when going over SG table, by setting the orig_nents to 0 and using
1678	* only nents (length of DMA list).
1679	*/
1680	sgt->orig_nents = `0`;
1681
1682	dev_dbg(hdev->dev, "prepared SG table with %u entries for importer %s\n",
1683	nents, dev_name(dev));
1684	for_each_sgtable_dma_sg(sgt, sg, i)
1685	dev_dbg(hdev->dev,
1686	"SG entry %d: address %#llx, length %#x\n",
1687	i, sg_dma_address(sg), sg_dma_len(sg));
1688
1689	return sgt;
1690
1691	err_unmap:
1692	for_each_sgtable_dma_sg(sgt, sg, i) {
1693	if (!sg_dma_len(sg))
1694	continue;
1695
1696	dma_unmap_resource(dev, sg_dma_address(sg), sg_dma_len(sg), dir,
1697	DMA_ATTR_SKIP_CPU_SYNC);
1698	}
1699
1700	sg_free_table(sgt);
1701
1702	err_free_sgt:
1703	kfree(objp: sgt);
1704	return ERR_PTR(error: rc);
1705	}
1706
1707	static int hl_dmabuf_attach(struct dma_buf *dmabuf,
1708	struct dma_buf_attachment *attachment)
1709	{
1710	struct hl_dmabuf_priv *hl_dmabuf;
1711	struct hl_device *hdev;
1712	int rc;
1713
1714	hl_dmabuf = dmabuf->priv;
1715	hdev = hl_dmabuf->ctx->hdev;
1716
1717	rc = pci_p2pdma_distance(provider: hdev->pdev, client: attachment->dev, verbose: true);
1718
1719	if (rc < `0`)
1720	attachment->peer2peer = false;
1721	return `0`;
1722	}
1723
1724	static struct sg_table hl_map_dmabuf(struct* dma_buf_attachment *attachment,
1725	enum dma_data_direction dir)
1726	{
1727	u64 *pages, npages, page_size, exported_size, offset;
1728	struct dma_buf *dma_buf = attachment->dmabuf;
1729	struct hl_vm_phys_pg_pack *phys_pg_pack;
1730	struct hl_dmabuf_priv *hl_dmabuf;
1731	struct hl_device *hdev;
1732	struct sg_table *sgt;
1733
1734	hl_dmabuf = dma_buf->priv;
1735	hdev = hl_dmabuf->ctx->hdev;
1736
1737	if (!attachment->peer2peer) {
1738	dev_dbg(hdev->dev, "Failed to map dmabuf because p2p is disabled\n");
1739	return ERR_PTR(error: -EPERM);
1740	}
1741
1742	exported_size = hl_dmabuf->dmabuf->size;
1743	offset = hl_dmabuf->offset;
1744	phys_pg_pack = hl_dmabuf->phys_pg_pack;
1745
1746	if (phys_pg_pack) {
1747	pages = phys_pg_pack->pages;
1748	npages = phys_pg_pack->npages;
1749	page_size = phys_pg_pack->page_size;
1750	} else {
1751	pages = &hl_dmabuf->device_phys_addr;
1752	npages = `1`;
1753	page_size = hl_dmabuf->dmabuf->size;
1754	}
1755
1756	sgt = alloc_sgt_from_device_pages(hdev, pages, npages, page_size, exported_size, offset,
1757	dev: attachment->dev, dir);
1758	if (IS_ERR(ptr: sgt))
1759	dev_err(hdev->dev, "failed (%ld) to initialize sgt for dmabuf\n", PTR_ERR(sgt));
1760
1761	return sgt;
1762	}
1763
1764	static void hl_unmap_dmabuf(struct dma_buf_attachment *attachment,
1765	struct sg_table *sgt,
1766	enum dma_data_direction dir)
1767	{
1768	struct scatterlist *sg;
1769	int i;
1770
1771	/ The memory behind the dma-buf has always resided on the device itself, i.e. it lives*
1772	* only in the 'device' domain (after all, it maps a PCI bar address which points to the
1773	* device memory).
1774	*
1775	* Therefore, it was never in the 'CPU' domain and hence, there is no need to perform
1776	* a sync of the memory to the CPU's cache, as it never resided inside that cache.
1777	*/
1778	for_each_sgtable_dma_sg(sgt, sg, i)
1779	dma_unmap_resource(dev: attachment->dev, sg_dma_address(sg),
1780	sg_dma_len(sg), dir,
1781	DMA_ATTR_SKIP_CPU_SYNC);
1782
1783	/ Need to restore orig_nents because sg_free_table use that field /
1784	sgt->orig_nents = sgt->nents;
1785	sg_free_table(sgt);
1786	kfree(objp: sgt);
1787	}
1788
1789	static struct hl_vm_hash_node memhash_node_export_get(struct* hl_ctx *ctx, u64 addr)
1790	{
1791	struct hl_device *hdev = ctx->hdev;
1792	struct hl_vm_hash_node *hnode;
1793
1794	/ get the memory handle /
1795	mutex_lock(&ctx->mem_hash_lock);
1796	hnode = get_vm_hash_node_locked(ctx, vaddr: addr);
1797	if (!hnode) {
1798	mutex_unlock(lock: &ctx->mem_hash_lock);
1799	dev_dbg(hdev->dev, "map address %#llx not found\n", addr);
1800	return ERR_PTR(error: -EINVAL);
1801	}
1802
1803	if (upper_32_bits(hnode->handle)) {
1804	mutex_unlock(lock: &ctx->mem_hash_lock);
1805	dev_dbg(hdev->dev, "invalid handle %#llx for map address %#llx\n",
1806	hnode->handle, addr);
1807	return ERR_PTR(error: -EINVAL);
1808	}
1809
1810	/*
1811	* node found, increase export count so this memory cannot be unmapped
1812	* and the hash node cannot be deleted.
1813	*/
1814	hnode->export_cnt++;
1815	mutex_unlock(lock: &ctx->mem_hash_lock);
1816
1817	return hnode;
1818	}
1819
1820	static void memhash_node_export_put(struct hl_ctx ctx, struct* hl_vm_hash_node *hnode)
1821	{
1822	mutex_lock(&ctx->mem_hash_lock);
1823	hnode->export_cnt--;
1824	mutex_unlock(lock: &ctx->mem_hash_lock);
1825	}
1826
1827	static void hl_release_dmabuf(struct dma_buf *dmabuf)
1828	{
1829	struct hl_dmabuf_priv *hl_dmabuf = dmabuf->priv;
1830	struct hl_ctx *ctx;
1831
1832	if (!hl_dmabuf)
1833	return;
1834
1835	ctx = hl_dmabuf->ctx;
1836
1837	if (hl_dmabuf->memhash_hnode)
1838	memhash_node_export_put(ctx, hnode: hl_dmabuf->memhash_hnode);
1839
1840	atomic_dec(v: &ctx->hdev->dmabuf_export_cnt);
1841	hl_ctx_put(ctx);
1842
1843	/ Paired with get_file() in export_dmabuf() /
1844	fput(ctx->hpriv->file_priv->filp);
1845
1846	kfree(objp: hl_dmabuf);
1847	}
1848
1849	static const struct dma_buf_ops habanalabs_dmabuf_ops = {
1850	.attach = hl_dmabuf_attach,
1851	.map_dma_buf = hl_map_dmabuf,
1852	.unmap_dma_buf = hl_unmap_dmabuf,
1853	.release = hl_release_dmabuf,
1854	};
1855
1856	static int export_dmabuf(struct hl_ctx *ctx,
1857	struct hl_dmabuf_priv *hl_dmabuf,
1858	u64 total_size, int flags, int *dmabuf_fd)
1859	{
1860	DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
1861	struct hl_device *hdev = ctx->hdev;
1862	int rc, fd;
1863
1864	exp_info.ops = &habanalabs_dmabuf_ops;
1865	exp_info.size = total_size;
1866	exp_info.flags = flags;
1867	exp_info.priv = hl_dmabuf;
1868
1869	hl_dmabuf->dmabuf = dma_buf_export(exp_info: &exp_info);
1870	if (IS_ERR(ptr: hl_dmabuf->dmabuf)) {
1871	dev_err(hdev->dev, "failed to export dma-buf\n");
1872	return PTR_ERR(ptr: hl_dmabuf->dmabuf);
1873	}
1874
1875	fd = dma_buf_fd(dmabuf: hl_dmabuf->dmabuf, flags);
1876	if (fd < `0`) {
1877	dev_err(hdev->dev, "failed to get a file descriptor for a dma-buf, %d\n", fd);
1878	rc = fd;
1879	goto err_dma_buf_put;
1880	}
1881
1882	hl_dmabuf->ctx = ctx;
1883	hl_ctx_get(ctx: hl_dmabuf->ctx);
1884	atomic_inc(v: &ctx->hdev->dmabuf_export_cnt);
1885
1886	/ Get compute device file to enforce release order, such that all exported dma-buf will be*
1887	* released first and only then the compute device.
1888	* Paired with fput() in hl_release_dmabuf().
1889	*/
1890	get_file(f: ctx->hpriv->file_priv->filp);
1891
1892	*dmabuf_fd = fd;
1893
1894	return `0`;
1895
1896	err_dma_buf_put:
1897	hl_dmabuf->dmabuf->priv = NULL;
1898	dma_buf_put(dmabuf: hl_dmabuf->dmabuf);
1899	return rc;
1900	}
1901
1902	static int validate_export_params_common(struct hl_device *hdev, u64 addr, u64 size, u64 offset)
1903	{
1904	if (!PAGE_ALIGNED(addr)) {
1905	dev_dbg(hdev->dev,
1906	"exported device memory address 0x%llx should be aligned to PAGE_SIZE 0x%lx\n",
1907	addr, PAGE_SIZE);
1908	return -EINVAL;
1909	}
1910
1911	if (!size \|\| !PAGE_ALIGNED(size)) {
1912	dev_dbg(hdev->dev,
1913	"exported device memory size %llu should be a multiple of PAGE_SIZE %lu\n",
1914	size, PAGE_SIZE);
1915	return -EINVAL;
1916	}
1917
1918	if (!PAGE_ALIGNED(offset)) {
1919	dev_dbg(hdev->dev,
1920	"exported device memory offset %llu should be a multiple of PAGE_SIZE %lu\n",
1921	offset, PAGE_SIZE);
1922	return -EINVAL;
1923	}
1924
1925	return `0`;
1926	}
1927
1928	static int validate_export_params_no_mmu(struct hl_device *hdev, u64 device_addr, u64 size)
1929	{
1930	struct asic_fixed_properties *prop = &hdev->asic_prop;
1931	u64 bar_address;
1932	int rc;
1933
1934	rc = validate_export_params_common(hdev, addr: device_addr, size, offset: `0`);
1935	if (rc)
1936	return rc;
1937
1938	if (device_addr < prop->dram_user_base_address \|\|
1939	(device_addr + size) > prop->dram_end_address \|\|
1940	(device_addr + size) < device_addr) {
1941	dev_dbg(hdev->dev,
1942	"DRAM memory range 0x%llx (+0x%llx) is outside of DRAM boundaries\n",
1943	device_addr, size);
1944	return -EINVAL;
1945	}
1946
1947	bar_address = hdev->dram_pci_bar_start + (device_addr - prop->dram_base_address);
1948
1949	if ((bar_address + size) > (hdev->dram_pci_bar_start + prop->dram_pci_bar_size) \|\|
1950	(bar_address + size) < bar_address) {
1951	dev_dbg(hdev->dev,
1952	"DRAM memory range 0x%llx (+0x%llx) is outside of PCI BAR boundaries\n",
1953	device_addr, size);
1954	return -EINVAL;
1955	}
1956
1957	return `0`;
1958	}
1959
1960	static int validate_export_params(struct hl_device *hdev, u64 device_addr, u64 size, u64 offset,
1961	struct hl_vm_phys_pg_pack *phys_pg_pack)
1962	{
1963	struct asic_fixed_properties *prop = &hdev->asic_prop;
1964	u64 bar_address;
1965	int i, rc;
1966
1967	rc = validate_export_params_common(hdev, addr: device_addr, size, offset);
1968	if (rc)
1969	return rc;
1970
1971	if ((offset + size) > phys_pg_pack->total_size) {
1972	dev_dbg(hdev->dev, "offset %#llx and size %#llx exceed total map size %#llx\n",
1973	offset, size, phys_pg_pack->total_size);
1974	return -EINVAL;
1975	}
1976
1977	for (i = `0` ; i < phys_pg_pack->npages ; i++) {
1978	bar_address = hdev->dram_pci_bar_start +
1979	(phys_pg_pack->pages[i] - prop->dram_base_address);
1980
1981	if ((bar_address + phys_pg_pack->page_size) >
1982	(hdev->dram_pci_bar_start + prop->dram_pci_bar_size) \|\|
1983	(bar_address + phys_pg_pack->page_size) < bar_address) {
1984	dev_dbg(hdev->dev,
1985	"DRAM memory range 0x%llx (+0x%x) is outside of PCI BAR boundaries\n",
1986	phys_pg_pack->pages[i], phys_pg_pack->page_size);
1987	return -EINVAL;
1988	}
1989	}
1990
1991	return `0`;
1992	}
1993
1994	static struct hl_vm_phys_pg_pack get_phys_pg_pack_from_hash_node(struct* hl_device *hdev,
1995	struct hl_vm_hash_node *hnode)
1996	{
1997	struct hl_vm_phys_pg_pack *phys_pg_pack;
1998	struct hl_vm *vm = &hdev->vm;
1999
2000	spin_lock(lock: &vm->idr_lock);
2001	phys_pg_pack = idr_find(&vm->phys_pg_pack_handles, id: (u32) hnode->handle);
2002	if (!phys_pg_pack) {
2003	spin_unlock(lock: &vm->idr_lock);
2004	dev_dbg(hdev->dev, "no match for handle 0x%x\n", (u32) hnode->handle);
2005	return ERR_PTR(error: -EINVAL);
2006	}
2007
2008	spin_unlock(lock: &vm->idr_lock);
2009
2010	if (phys_pg_pack->vm_type != VM_TYPE_PHYS_PACK) {
2011	dev_dbg(hdev->dev, "handle 0x%llx does not represent DRAM memory\n", hnode->handle);
2012	return ERR_PTR(error: -EINVAL);
2013	}
2014
2015	return phys_pg_pack;
2016	}
2017
2018	/**
2019	* export_dmabuf_from_addr() - export a dma-buf object for the given memory
2020	* address and size.
2021	* @ctx: pointer to the context structure.
2022	* @addr: device address.
2023	* @size: size of device memory to export.
2024	* @offset: the offset into the buffer from which to start exporting
2025	* @flags: DMA-BUF file/FD flags.
2026	* @dmabuf_fd: pointer to result FD that represents the dma-buf object.
2027	*
2028	* Create and export a dma-buf object for an existing memory allocation inside
2029	* the device memory, and return a FD which is associated with the dma-buf
2030	* object.
2031	*
2032	* Return: 0 on success, non-zero for failure.
2033	*/
2034	static int export_dmabuf_from_addr(struct hl_ctx *ctx, u64 addr, u64 size, u64 offset,
2035	int flags, int *dmabuf_fd)
2036	{
2037	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
2038	struct hl_vm_hash_node *hnode = NULL;
2039	struct asic_fixed_properties *prop;
2040	struct hl_dmabuf_priv *hl_dmabuf;
2041	struct hl_device *hdev;
2042	int rc;
2043
2044	hdev = ctx->hdev;
2045	prop = &hdev->asic_prop;
2046
2047	/ offset must be 0 in devices without virtual memory support /
2048	if (!prop->dram_supports_virtual_memory && offset) {
2049	dev_dbg(hdev->dev, "offset is not allowed in device without virtual memory\n");
2050	return -EINVAL;
2051	}
2052
2053	hl_dmabuf = kzalloc(size: sizeof(*hl_dmabuf), GFP_KERNEL);
2054	if (!hl_dmabuf)
2055	return -ENOMEM;
2056
2057	if (prop->dram_supports_virtual_memory) {
2058	hnode = memhash_node_export_get(ctx, addr);
2059	if (IS_ERR(ptr: hnode)) {
2060	rc = PTR_ERR(ptr: hnode);
2061	goto err_free_dmabuf_wrapper;
2062	}
2063	phys_pg_pack = get_phys_pg_pack_from_hash_node(hdev, hnode);
2064	if (IS_ERR(ptr: phys_pg_pack)) {
2065	rc = PTR_ERR(ptr: phys_pg_pack);
2066	goto dec_memhash_export_cnt;
2067	}
2068	rc = validate_export_params(hdev, device_addr: addr, size, offset, phys_pg_pack);
2069	if (rc)
2070	goto dec_memhash_export_cnt;
2071
2072	hl_dmabuf->phys_pg_pack = phys_pg_pack;
2073	hl_dmabuf->memhash_hnode = hnode;
2074	hl_dmabuf->offset = offset;
2075	} else {
2076	rc = validate_export_params_no_mmu(hdev, device_addr: addr, size);
2077	if (rc)
2078	goto err_free_dmabuf_wrapper;
2079
2080	hl_dmabuf->device_phys_addr = addr;
2081	}
2082
2083	rc = export_dmabuf(ctx, hl_dmabuf, total_size: size, flags, dmabuf_fd);
2084	if (rc)
2085	goto dec_memhash_export_cnt;
2086
2087	return `0`;
2088
2089	dec_memhash_export_cnt:
2090	if (prop->dram_supports_virtual_memory)
2091	memhash_node_export_put(ctx, hnode);
2092	err_free_dmabuf_wrapper:
2093	kfree(objp: hl_dmabuf);
2094	return rc;
2095	}
2096
2097	static void ts_buff_release(struct hl_mmap_mem_buf *buf)
2098	{
2099	struct hl_ts_buff *ts_buff = buf->private;
2100
2101	vfree(addr: ts_buff->kernel_buff_address);
2102	vfree(addr: ts_buff->user_buff_address);
2103	kfree(objp: ts_buff);
2104	}
2105
2106	static int hl_ts_mmap(struct hl_mmap_mem_buf buf, struct* vm_area_struct vma, void* *args)
2107	{
2108	struct hl_ts_buff *ts_buff = buf->private;
2109
2110	vm_flags_set(vma, VM_DONTEXPAND \| VM_DONTDUMP \| VM_DONTCOPY \| VM_NORESERVE);
2111	return remap_vmalloc_range(vma, addr: ts_buff->user_buff_address, pgoff: `0`);
2112	}
2113
2114	static int hl_ts_alloc_buf(struct hl_mmap_mem_buf buf, gfp_t gfp, void* *args)
2115	{
2116	struct hl_ts_buff *ts_buff = NULL;
2117	u32 num_elements;
2118	size_t size;
2119	void *p;
2120
2121	num_elements = (u32 )args;
2122
2123	ts_buff = kzalloc(size: sizeof(*ts_buff), flags: gfp);
2124	if (!ts_buff)
2125	return -ENOMEM;
2126
2127	/ Allocate the user buffer /
2128	size = num_elements * sizeof(u64);
2129	p = vmalloc_user(size);
2130	if (!p)
2131	goto free_mem;
2132
2133	ts_buff->user_buff_address = p;
2134	buf->mappable_size = size;
2135
2136	/ Allocate the internal kernel buffer /
2137	size = num_elements * sizeof(struct hl_user_pending_interrupt);
2138	p = vzalloc(size);
2139	if (!p)
2140	goto free_user_buff;
2141
2142	ts_buff->kernel_buff_address = p;
2143	ts_buff->kernel_buff_size = size;
2144
2145	buf->private = ts_buff;
2146
2147	return `0`;
2148
2149	free_user_buff:
2150	vfree(addr: ts_buff->user_buff_address);
2151	free_mem:
2152	kfree(objp: ts_buff);
2153	return -ENOMEM;
2154	}
2155
2156	static struct hl_mmap_mem_buf_behavior hl_ts_behavior = {
2157	.topic = "TS",
2158	.mem_id = HL_MMAP_TYPE_TS_BUFF,
2159	.mmap = hl_ts_mmap,
2160	.alloc = hl_ts_alloc_buf,
2161	.release = ts_buff_release,
2162	};
2163
2164	/**
2165	* allocate_timestamps_buffers() - allocate timestamps buffers
2166	* This function will allocate ts buffer that will later on be mapped to the user
2167	* in order to be able to read the timestamp.
2168	* in addition it'll allocate an extra buffer for registration management.
2169	* since we cannot fail during registration for out-of-memory situation, so
2170	* we'll prepare a pool which will be used as user interrupt nodes and instead
2171	* of dynamically allocating nodes while registration we'll pick the node from
2172	* this pool. in addition it'll add node to the mapping hash which will be used
2173	* to map user ts buffer to the internal kernel ts buffer.
2174	* @hpriv: pointer to the private data of the fd
2175	* @args: ioctl input
2176	* @handle: user timestamp buffer handle as an output
2177	*/
2178	static int allocate_timestamps_buffers(struct hl_fpriv hpriv, struct* hl_mem_in args, u64 handle)
2179	{
2180	struct hl_mem_mgr *mmg = &hpriv->mem_mgr;
2181	struct hl_mmap_mem_buf *buf;
2182
2183	if (args->num_of_elements > TS_MAX_ELEMENTS_NUM) {
2184	dev_err(mmg->dev, "Num of elements exceeds Max allowed number (0x%x > 0x%x)\n",
2185	args->num_of_elements, TS_MAX_ELEMENTS_NUM);
2186	return -EINVAL;
2187	}
2188
2189	buf = hl_mmap_mem_buf_alloc(mmg, behavior: &hl_ts_behavior, GFP_KERNEL, args: &args->num_of_elements);
2190	if (!buf)
2191	return -ENOMEM;
2192
2193	*handle = buf->handle;
2194
2195	return `0`;
2196	}
2197
2198	int hl_mem_ioctl(struct drm_device ddev, void* data, struct* drm_file *file_priv)
2199	{
2200	struct hl_fpriv *hpriv = file_priv->driver_priv;
2201	enum hl_device_status status;
2202	union hl_mem_args *args = data;
2203	struct hl_device *hdev = hpriv->hdev;
2204	struct hl_ctx *ctx = hpriv->ctx;
2205	u64 block_handle, device_addr = `0`;
2206	u32 handle = `0`, block_size;
2207	int rc, dmabuf_fd = -EBADF;
2208
2209	if (!hl_device_operational(hdev, status: &status)) {
2210	dev_dbg_ratelimited(hdev->dev,
2211	"Device is %s. Can't execute MEMORY IOCTL\n",
2212	hdev->status[status]);
2213	return -EBUSY;
2214	}
2215
2216	switch (args->in.op) {
2217	case HL_MEM_OP_ALLOC:
2218	if (args->in.alloc.mem_size == `0`) {
2219	dev_err(hdev->dev,
2220	"alloc size must be larger than 0\n");
2221	rc = -EINVAL;
2222	goto out;
2223	}
2224
2225	/ If DRAM does not support virtual memory the driver won't*
2226	* handle the allocation/freeing of that memory. However, for
2227	* system administration/monitoring purposes, the driver will
2228	* keep track of the amount of DRAM memory that is allocated
2229	* and freed by the user. Because this code totally relies on
2230	* the user's input, the driver can't ensure the validity
2231	* of this accounting.
2232	*/
2233	if (!hdev->asic_prop.dram_supports_virtual_memory) {
2234	atomic64_add(i: args->in.alloc.mem_size,
2235	v: &ctx->dram_phys_mem);
2236	atomic64_add(i: args->in.alloc.mem_size,
2237	v: &hdev->dram_used_mem);
2238
2239	dev_dbg(hdev->dev, "DRAM alloc is not supported\n");
2240	rc = `0`;
2241
2242	memset(args, `0`, sizeof(*args));
2243	args->out.handle = `0`;
2244	goto out;
2245	}
2246
2247	rc = alloc_device_memory(ctx, args: &args->in, ret_handle: &handle);
2248
2249	memset(args, `0`, sizeof(*args));
2250	args->out.handle = (__u64) handle;
2251	break;
2252
2253	case HL_MEM_OP_FREE:
2254	/ If DRAM does not support virtual memory the driver won't*
2255	* handle the allocation/freeing of that memory. However, for
2256	* system administration/monitoring purposes, the driver will
2257	* keep track of the amount of DRAM memory that is allocated
2258	* and freed by the user. Because this code totally relies on
2259	* the user's input, the driver can't ensure the validity
2260	* of this accounting.
2261	*/
2262	if (!hdev->asic_prop.dram_supports_virtual_memory) {
2263	atomic64_sub(i: args->in.alloc.mem_size,
2264	v: &ctx->dram_phys_mem);
2265	atomic64_sub(i: args->in.alloc.mem_size,
2266	v: &hdev->dram_used_mem);
2267
2268	dev_dbg(hdev->dev, "DRAM alloc is not supported\n");
2269	rc = `0`;
2270
2271	goto out;
2272	}
2273
2274	rc = free_device_memory(ctx, args: &args->in);
2275	break;
2276
2277	case HL_MEM_OP_MAP:
2278	rc = map_device_va(ctx, args: &args->in, device_addr: &device_addr);
2279
2280	memset(args, `0`, sizeof(*args));
2281	args->out.device_virt_addr = device_addr;
2282	break;
2283
2284	case HL_MEM_OP_UNMAP:
2285	rc = unmap_device_va(ctx, args: &args->in, ctx_free: false);
2286	break;
2287
2288	case HL_MEM_OP_MAP_BLOCK:
2289	rc = map_block(hdev, address: args->in.map_block.block_addr,
2290	handle: &block_handle, size: &block_size);
2291	args->out.block_handle = block_handle;
2292	args->out.block_size = block_size;
2293	break;
2294
2295	case HL_MEM_OP_EXPORT_DMABUF_FD:
2296	rc = export_dmabuf_from_addr(ctx,
2297	addr: args->in.export_dmabuf_fd.addr,
2298	size: args->in.export_dmabuf_fd.mem_size,
2299	offset: args->in.export_dmabuf_fd.offset,
2300	flags: args->in.flags,
2301	dmabuf_fd: &dmabuf_fd);
2302	memset(args, `0`, sizeof(*args));
2303	args->out.fd = dmabuf_fd;
2304	break;
2305
2306	case HL_MEM_OP_TS_ALLOC:
2307	rc = allocate_timestamps_buffers(hpriv, args: &args->in, handle: &args->out.handle);
2308	break;
2309	default:
2310	dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
2311	rc = -EINVAL;
2312	break;
2313	}
2314
2315	out:
2316	return rc;
2317	}
2318
2319	static int get_user_memory(struct hl_device *hdev, u64 addr, u64 size,
2320	u32 npages, u64 start, u32 offset,
2321	struct hl_userptr *userptr)
2322	{
2323	int rc;
2324
2325	if (!access_ok((void __user *) (uintptr_t) addr, size)) {
2326	dev_err(hdev->dev, "user pointer is invalid - 0x%llx\n", addr);
2327	return -EFAULT;
2328	}
2329
2330	userptr->pages = kvmalloc_array(n: npages, size: sizeof(struct page *), GFP_KERNEL);
2331	if (!userptr->pages)
2332	return -ENOMEM;
2333
2334	rc = pin_user_pages_fast(start, nr_pages: npages, gup_flags: FOLL_WRITE \| FOLL_LONGTERM,
2335	pages: userptr->pages);
2336
2337	if (rc != npages) {
2338	dev_err(hdev->dev,
2339	"Failed (%d) to pin host memory with user ptr 0x%llx, size 0x%llx, npages %d\n",
2340	rc, addr, size, npages);
2341	if (rc < `0`)
2342	goto destroy_pages;
2343	npages = rc;
2344	rc = -EFAULT;
2345	goto put_pages;
2346	}
2347	userptr->npages = npages;
2348
2349	rc = sg_alloc_table_from_pages(sgt: userptr->sgt,
2350	pages: userptr->pages,
2351	n_pages: npages, offset, size, GFP_KERNEL);
2352	if (rc < `0`) {
2353	dev_err(hdev->dev, "failed to create SG table from pages\n");
2354	goto put_pages;
2355	}
2356
2357	return `0`;
2358
2359	put_pages:
2360	unpin_user_pages(pages: userptr->pages, npages);
2361	destroy_pages:
2362	kvfree(addr: userptr->pages);
2363	return rc;
2364	}
2365
2366	/**
2367	* hl_pin_host_memory() - pins a chunk of host memory.
2368	* @hdev: pointer to the habanalabs device structure.
2369	* @addr: the host virtual address of the memory area.
2370	* @size: the size of the memory area.
2371	* @userptr: pointer to hl_userptr structure.
2372	*
2373	* This function does the following:
2374	* - Pins the physical pages.
2375	* - Create an SG list from those pages.
2376	*/
2377	int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
2378	struct hl_userptr *userptr)
2379	{
2380	u64 start, end;
2381	u32 npages, offset;
2382	int rc;
2383
2384	if (!size) {
2385	dev_err(hdev->dev, "size to pin is invalid - %llu\n", size);
2386	return -EINVAL;
2387	}
2388
2389	/*
2390	* If the combination of the address and size requested for this memory
2391	* region causes an integer overflow, return error.
2392	*/
2393	if (((addr + size) < addr) \|\|
2394	PAGE_ALIGN(addr + size) < (addr + size)) {
2395	dev_err(hdev->dev,
2396	"user pointer 0x%llx + %llu causes integer overflow\n",
2397	addr, size);
2398	return -EINVAL;
2399	}
2400
2401	userptr->pid = current->pid;
2402	userptr->sgt = kzalloc(size: sizeof(*userptr->sgt), GFP_KERNEL);
2403	if (!userptr->sgt)
2404	return -ENOMEM;
2405
2406	start = addr & PAGE_MASK;
2407	offset = addr & ~PAGE_MASK;
2408	end = PAGE_ALIGN(addr + size);
2409	npages = (end - start) >> PAGE_SHIFT;
2410
2411	userptr->size = size;
2412	userptr->addr = addr;
2413	userptr->dma_mapped = false;
2414	INIT_LIST_HEAD(list: &userptr->job_node);
2415
2416	rc = get_user_memory(hdev, addr, size, npages, start, offset,
2417	userptr);
2418	if (rc) {
2419	dev_err(hdev->dev,
2420	"failed to get user memory for address 0x%llx\n",
2421	addr);
2422	goto free_sgt;
2423	}
2424
2425	hl_debugfs_add_userptr(hdev, userptr);
2426
2427	return `0`;
2428
2429	free_sgt:
2430	kfree(objp: userptr->sgt);
2431	return rc;
2432	}
2433
2434	/*
2435	* hl_unpin_host_memory - unpins a chunk of host memory.
2436	* @hdev: pointer to the habanalabs device structure
2437	* @userptr: pointer to hl_userptr structure
2438	*
2439	* This function does the following:
2440	* - Unpins the physical pages related to the host memory
2441	* - Free the SG list
2442	*/
2443	void hl_unpin_host_memory(struct hl_device hdev, struct* hl_userptr *userptr)
2444	{
2445	hl_debugfs_remove_userptr(hdev, userptr);
2446
2447	if (userptr->dma_mapped)
2448	hl_dma_unmap_sgtable(hdev, userptr->sgt, userptr->dir);
2449
2450	unpin_user_pages_dirty_lock(pages: userptr->pages, npages: userptr->npages, make_dirty: true);
2451	kvfree(addr: userptr->pages);
2452
2453	list_del(entry: &userptr->job_node);
2454
2455	sg_free_table(userptr->sgt);
2456	kfree(objp: userptr->sgt);
2457	}
2458
2459	/**
2460	* hl_userptr_delete_list() - clear userptr list.
2461	* @hdev: pointer to the habanalabs device structure.
2462	* @userptr_list: pointer to the list to clear.
2463	*
2464	* This function does the following:
2465	* - Iterates over the list and unpins the host memory and frees the userptr
2466	* structure.
2467	*/
2468	void hl_userptr_delete_list(struct hl_device *hdev,
2469	struct list_head *userptr_list)
2470	{
2471	struct hl_userptr userptr, tmp;
2472
2473	list_for_each_entry_safe(userptr, tmp, userptr_list, job_node) {
2474	hl_unpin_host_memory(hdev, userptr);
2475	kfree(objp: userptr);
2476	}
2477
2478	INIT_LIST_HEAD(list: userptr_list);
2479	}
2480
2481	/**
2482	* hl_userptr_is_pinned() - returns whether the given userptr is pinned.
2483	* @hdev: pointer to the habanalabs device structure.
2484	* @addr: user address to check.
2485	* @size: user block size to check.
2486	* @userptr_list: pointer to the list to clear.
2487	* @userptr: pointer to userptr to check.
2488	*
2489	* This function does the following:
2490	* - Iterates over the list and checks if the given userptr is in it, means is
2491	* pinned. If so, returns true, otherwise returns false.
2492	*/
2493	bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr,
2494	u32 size, struct list_head *userptr_list,
2495	struct hl_userptr **userptr)
2496	{
2497	list_for_each_entry((*userptr), userptr_list, job_node) {
2498	if ((addr == (userptr)->addr) && (size == (userptr)->size))
2499	return true;
2500	}
2501
2502	return false;
2503	}
2504
2505	/**
2506	* va_range_init() - initialize virtual addresses range.
2507	* @hdev: pointer to the habanalabs device structure.
2508	* @va_ranges: pointer to va_ranges array.
2509	* @range_type: virtual address range type.
2510	* @start: range start address, inclusive.
2511	* @end: range end address, inclusive.
2512	* @page_size: page size for this va_range.
2513	*
2514	* This function does the following:
2515	* - Initializes the virtual addresses list of the given range with the given
2516	* addresses.
2517	*/
2518	static int va_range_init(struct hl_device hdev, struct* hl_va_range **va_ranges,
2519	enum hl_va_range_type range_type, u64 start,
2520	u64 end, u32 page_size)
2521	{
2522	struct hl_va_range *va_range = va_ranges[range_type];
2523	int rc;
2524
2525	INIT_LIST_HEAD(list: &va_range->list);
2526
2527	/*
2528	* PAGE_SIZE alignment
2529	* it is the caller's responsibility to align the addresses if the
2530	* page size is not a power of 2
2531	*/
2532
2533	if (is_power_of_2(n: page_size)) {
2534	start = round_up(start, page_size);
2535
2536	/*
2537	* The end of the range is inclusive, hence we need to align it
2538	* to the end of the last full page in the range. For example if
2539	* end = 0x3ff5 with page size 0x1000, we need to align it to
2540	* 0x2fff. The remaining 0xff5 bytes do not form a full page.
2541	*/
2542	end = round_down(end + `1`, page_size) - `1`;
2543	}
2544
2545	if (start >= end) {
2546	dev_err(hdev->dev, "too small vm range for va list\n");
2547	return -EFAULT;
2548	}
2549
2550	rc = add_va_block(hdev, va_range, start, end);
2551
2552	if (rc) {
2553	dev_err(hdev->dev, "Failed to init host va list\n");
2554	return rc;
2555	}
2556
2557	va_range->start_addr = start;
2558	va_range->end_addr = end;
2559	va_range->page_size = page_size;
2560
2561	return `0`;
2562	}
2563
2564	/**
2565	* va_range_fini() - clear a virtual addresses range.
2566	* @hdev: pointer to the habanalabs structure.
2567	* @va_range: pointer to virtual addresses range.
2568	*
2569	* This function does the following:
2570	* - Frees the virtual addresses block list and its lock.
2571	*/
2572	static void va_range_fini(struct hl_device hdev, struct* hl_va_range *va_range)
2573	{
2574	mutex_lock(&va_range->lock);
2575	clear_va_list_locked(hdev, va_list: &va_range->list);
2576	mutex_unlock(lock: &va_range->lock);
2577
2578	mutex_destroy(lock: &va_range->lock);
2579	kfree(objp: va_range);
2580	}
2581
2582	/**
2583	* vm_ctx_init_with_ranges() - initialize virtual memory for context.
2584	* @ctx: pointer to the habanalabs context structure.
2585	* @host_range_start: host virtual addresses range start.
2586	* @host_range_end: host virtual addresses range end.
2587	* @host_page_size: host page size.
2588	* @host_huge_range_start: host virtual addresses range start for memory
2589	* allocated with huge pages.
2590	* @host_huge_range_end: host virtual addresses range end for memory allocated
2591	* with huge pages.
2592	* @host_huge_page_size: host huge page size.
2593	* @dram_range_start: dram virtual addresses range start.
2594	* @dram_range_end: dram virtual addresses range end.
2595	* @dram_page_size: dram page size.
2596	*
2597	* This function initializes the following:
2598	* - MMU for context.
2599	* - Virtual address to area descriptor hashtable.
2600	* - Virtual block list of available virtual memory.
2601	*/
2602	static int vm_ctx_init_with_ranges(struct hl_ctx *ctx,
2603	u64 host_range_start,
2604	u64 host_range_end,
2605	u32 host_page_size,
2606	u64 host_huge_range_start,
2607	u64 host_huge_range_end,
2608	u32 host_huge_page_size,
2609	u64 dram_range_start,
2610	u64 dram_range_end,
2611	u32 dram_page_size)
2612	{
2613	struct hl_device *hdev = ctx->hdev;
2614	int i, rc;
2615
2616	for (i = `0` ; i < HL_VA_RANGE_TYPE_MAX ; i++) {
2617	ctx->va_range[i] =
2618	kzalloc(size: sizeof(struct hl_va_range), GFP_KERNEL);
2619	if (!ctx->va_range[i]) {
2620	rc = -ENOMEM;
2621	goto free_va_range;
2622	}
2623	}
2624
2625	rc = hl_mmu_ctx_init(ctx);
2626	if (rc) {
2627	dev_err(hdev->dev, "failed to init context %d\n", ctx->asid);
2628	goto free_va_range;
2629	}
2630
2631	mutex_init(&ctx->mem_hash_lock);
2632	hash_init(ctx->mem_hash);
2633
2634	mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
2635
2636	rc = va_range_init(hdev, va_ranges: ctx->va_range, range_type: HL_VA_RANGE_TYPE_HOST,
2637	start: host_range_start, end: host_range_end, page_size: host_page_size);
2638	if (rc) {
2639	dev_err(hdev->dev, "failed to init host vm range\n");
2640	goto mmu_ctx_fini;
2641	}
2642
2643	if (hdev->pmmu_huge_range) {
2644	mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
2645
2646	rc = va_range_init(hdev,
2647	va_ranges: ctx->va_range, range_type: HL_VA_RANGE_TYPE_HOST_HUGE,
2648	start: host_huge_range_start, end: host_huge_range_end,
2649	page_size: host_huge_page_size);
2650	if (rc) {
2651	dev_err(hdev->dev,
2652	"failed to init host huge vm range\n");
2653	goto clear_host_va_range;
2654	}
2655	} else {
2656	kfree(objp: ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]);
2657	ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE] =
2658	ctx->va_range[HL_VA_RANGE_TYPE_HOST];
2659	}
2660
2661	mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock);
2662
2663	rc = va_range_init(hdev, va_ranges: ctx->va_range, range_type: HL_VA_RANGE_TYPE_DRAM,
2664	start: dram_range_start, end: dram_range_end, page_size: dram_page_size);
2665	if (rc) {
2666	dev_err(hdev->dev, "failed to init dram vm range\n");
2667	goto clear_host_huge_va_range;
2668	}
2669
2670	hl_debugfs_add_ctx_mem_hash(hdev, ctx);
2671
2672	return `0`;
2673
2674	clear_host_huge_va_range:
2675	mutex_destroy(lock: &ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock);
2676
2677	if (hdev->pmmu_huge_range) {
2678	mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
2679	clear_va_list_locked(hdev,
2680	va_list: &ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->list);
2681	mutex_unlock(lock: &ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
2682	}
2683	clear_host_va_range:
2684	if (hdev->pmmu_huge_range)
2685	mutex_destroy(lock: &ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
2686	mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
2687	clear_va_list_locked(hdev, va_list: &ctx->va_range[HL_VA_RANGE_TYPE_HOST]->list);
2688	mutex_unlock(lock: &ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
2689	mmu_ctx_fini:
2690	mutex_destroy(lock: &ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
2691	mutex_destroy(lock: &ctx->mem_hash_lock);
2692	hl_mmu_ctx_fini(ctx);
2693	free_va_range:
2694	for (i = `0` ; i < HL_VA_RANGE_TYPE_MAX ; i++)
2695	kfree(objp: ctx->va_range[i]);
2696
2697	return rc;
2698	}
2699
2700	int hl_vm_ctx_init(struct hl_ctx *ctx)
2701	{
2702	struct asic_fixed_properties *prop = &ctx->hdev->asic_prop;
2703	u64 host_range_start, host_range_end, host_huge_range_start,
2704	host_huge_range_end, dram_range_start, dram_range_end;
2705	u32 host_page_size, host_huge_page_size, dram_page_size;
2706
2707	atomic64_set(v: &ctx->dram_phys_mem, i: `0`);
2708
2709	/*
2710	* In case of DRAM mapping, the returned address is the physical
2711	* address of the memory related to the given handle.
2712	*/
2713	if (ctx->hdev->mmu_disable)
2714	return `0`;
2715
2716	dram_range_start = prop->dmmu.start_addr;
2717	dram_range_end = prop->dmmu.end_addr - `1`;
2718	dram_page_size = prop->dram_page_size ?
2719	prop->dram_page_size : prop->dmmu.page_size;
2720	host_range_start = prop->pmmu.start_addr;
2721	host_range_end = prop->pmmu.end_addr - `1`;
2722	host_page_size = prop->pmmu.page_size;
2723	host_huge_range_start = prop->pmmu_huge.start_addr;
2724	host_huge_range_end = prop->pmmu_huge.end_addr - `1`;
2725	host_huge_page_size = prop->pmmu_huge.page_size;
2726
2727	return vm_ctx_init_with_ranges(ctx, host_range_start, host_range_end,
2728	host_page_size, host_huge_range_start,
2729	host_huge_range_end, host_huge_page_size,
2730	dram_range_start, dram_range_end, dram_page_size);
2731	}
2732
2733	/**
2734	* hl_vm_ctx_fini() - virtual memory teardown of context.
2735	* @ctx: pointer to the habanalabs context structure.
2736	*
2737	* This function perform teardown the following:
2738	* - Virtual block list of available virtual memory.
2739	* - Virtual address to area descriptor hashtable.
2740	* - MMU for context.
2741	*
2742	* In addition this function does the following:
2743	* - Unmaps the existing hashtable nodes if the hashtable is not empty. The
2744	* hashtable should be empty as no valid mappings should exist at this
2745	* point.
2746	* - Frees any existing physical page list from the idr which relates to the
2747	* current context asid.
2748	* - This function checks the virtual block list for correctness. At this point
2749	* the list should contain one element which describes the whole virtual
2750	* memory range of the context. Otherwise, a warning is printed.
2751	*/
2752	void hl_vm_ctx_fini(struct hl_ctx *ctx)
2753	{
2754	struct hl_vm_phys_pg_pack phys_pg_list, tmp_phys_node;
2755	struct hl_device *hdev = ctx->hdev;
2756	struct hl_vm_hash_node *hnode;
2757	struct hl_vm *vm = &hdev->vm;
2758	struct hlist_node *tmp_node;
2759	struct list_head free_list;
2760	struct hl_mem_in args;
2761	int i;
2762
2763	if (hdev->mmu_disable)
2764	return;
2765
2766	hl_debugfs_remove_ctx_mem_hash(hdev, ctx);
2767
2768	/*
2769	* Clearly something went wrong on hard reset so no point in printing
2770	* another side effect error
2771	*/
2772	if (!hdev->reset_info.hard_reset_pending && !hash_empty(ctx->mem_hash))
2773	dev_dbg(hdev->dev,
2774	"user released device without removing its memory mappings\n");
2775
2776	hash_for_each_safe(ctx->mem_hash, i, tmp_node, hnode, node) {
2777	dev_dbg(hdev->dev,
2778	"hl_mem_hash_node of vaddr 0x%llx of asid %d is still alive\n",
2779	hnode->vaddr, ctx->asid);
2780	args.unmap.device_virt_addr = hnode->vaddr;
2781	unmap_device_va(ctx, args: &args, ctx_free: true);
2782	}
2783
2784	mutex_lock(&hdev->mmu_lock);
2785
2786	/ invalidate the cache once after the unmapping loop /
2787	hl_mmu_invalidate_cache(hdev, is_hard: true, flags: MMU_OP_USERPTR);
2788	hl_mmu_invalidate_cache(hdev, is_hard: true, flags: MMU_OP_PHYS_PACK);
2789
2790	mutex_unlock(lock: &hdev->mmu_lock);
2791
2792	INIT_LIST_HEAD(list: &free_list);
2793
2794	spin_lock(lock: &vm->idr_lock);
2795	idr_for_each_entry(&vm->phys_pg_pack_handles, phys_pg_list, i)
2796	if (phys_pg_list->asid == ctx->asid) {
2797	dev_dbg(hdev->dev,
2798	"page list 0x%px of asid %d is still alive\n",
2799	phys_pg_list, ctx->asid);
2800
2801	atomic64_sub(i: phys_pg_list->total_size, v: &hdev->dram_used_mem);
2802	idr_remove(&vm->phys_pg_pack_handles, id: i);
2803	list_add(new: &phys_pg_list->node, head: &free_list);
2804	}
2805	spin_unlock(lock: &vm->idr_lock);
2806
2807	list_for_each_entry_safe(phys_pg_list, tmp_phys_node, &free_list, node)
2808	free_phys_pg_pack(hdev, phys_pg_pack: phys_pg_list);
2809
2810	va_range_fini(hdev, va_range: ctx->va_range[HL_VA_RANGE_TYPE_DRAM]);
2811	va_range_fini(hdev, va_range: ctx->va_range[HL_VA_RANGE_TYPE_HOST]);
2812
2813	if (hdev->pmmu_huge_range)
2814	va_range_fini(hdev, va_range: ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]);
2815
2816	mutex_destroy(lock: &ctx->mem_hash_lock);
2817	hl_mmu_ctx_fini(ctx);
2818
2819	/ In this case we need to clear the global accounting of DRAM usage*
2820	* because the user notifies us on allocations. If the user is no more,
2821	* all DRAM is available
2822	*/
2823	if (ctx->asid != HL_KERNEL_ASID_ID &&
2824	!hdev->asic_prop.dram_supports_virtual_memory)
2825	atomic64_set(v: &hdev->dram_used_mem, i: `0`);
2826	}
2827
2828	/**
2829	* hl_vm_init() - initialize virtual memory module.
2830	* @hdev: pointer to the habanalabs device structure.
2831	*
2832	* This function initializes the following:
2833	* - MMU module.
2834	* - DRAM physical pages pool of 2MB.
2835	* - Idr for device memory allocation handles.
2836	*/
2837	int hl_vm_init(struct hl_device *hdev)
2838	{
2839	struct asic_fixed_properties *prop = &hdev->asic_prop;
2840	struct hl_vm *vm = &hdev->vm;
2841	int rc;
2842
2843	if (is_power_of_2(n: prop->dram_page_size))
2844	vm->dram_pg_pool =
2845	gen_pool_create(__ffs(prop->dram_page_size), -`1`);
2846	else
2847	vm->dram_pg_pool =
2848	gen_pool_create(__ffs(DRAM_POOL_PAGE_SIZE), -`1`);
2849
2850	if (!vm->dram_pg_pool) {
2851	dev_err(hdev->dev, "Failed to create dram page pool\n");
2852	return -ENOMEM;
2853	}
2854
2855	kref_init(kref: &vm->dram_pg_pool_refcount);
2856
2857	rc = gen_pool_add(pool: vm->dram_pg_pool, addr: prop->dram_user_base_address,
2858	size: prop->dram_end_address - prop->dram_user_base_address,
2859	nid: -`1`);
2860
2861	if (rc) {
2862	dev_err(hdev->dev,
2863	"Failed to add memory to dram page pool %d\n", rc);
2864	goto pool_add_err;
2865	}
2866
2867	spin_lock_init(&vm->idr_lock);
2868	idr_init(idr: &vm->phys_pg_pack_handles);
2869
2870	atomic64_set(v: &hdev->dram_used_mem, i: `0`);
2871
2872	vm->init_done = true;
2873
2874	return `0`;
2875
2876	pool_add_err:
2877	gen_pool_destroy(vm->dram_pg_pool);
2878
2879	return rc;
2880	}
2881
2882	/**
2883	* hl_vm_fini() - virtual memory module teardown.
2884	* @hdev: pointer to the habanalabs device structure.
2885	*
2886	* This function perform teardown to the following:
2887	* - Idr for device memory allocation handles.
2888	* - DRAM physical pages pool of 2MB.
2889	* - MMU module.
2890	*/
2891	void hl_vm_fini(struct hl_device *hdev)
2892	{
2893	struct hl_vm *vm = &hdev->vm;
2894
2895	if (!vm->init_done)
2896	return;
2897
2898	/*
2899	* At this point all the contexts should be freed and hence no DRAM
2900	* memory should be in use. Hence the DRAM pool should be freed here.
2901	*/
2902	if (kref_put(kref: &vm->dram_pg_pool_refcount, release: dram_pg_pool_do_release) != `1`)
2903	dev_warn(hdev->dev, "dram_pg_pool was not destroyed on %s\n",
2904	__func__);
2905
2906	vm->init_done = false;
2907	}
2908
2909	/**
2910	* hl_hw_block_mem_init() - HW block memory initialization.
2911	* @ctx: pointer to the habanalabs context structure.
2912	*
2913	* This function initializes the HW block virtual mapped addresses list and
2914	* it's lock.
2915	*/
2916	void hl_hw_block_mem_init(struct hl_ctx *ctx)
2917	{
2918	mutex_init(&ctx->hw_block_list_lock);
2919	INIT_LIST_HEAD(list: &ctx->hw_block_mem_list);
2920	}
2921
2922	/**
2923	* hl_hw_block_mem_fini() - HW block memory teardown.
2924	* @ctx: pointer to the habanalabs context structure.
2925	*
2926	* This function clears the HW block virtual mapped addresses list and destroys
2927	* it's lock.
2928	*/
2929	void hl_hw_block_mem_fini(struct hl_ctx *ctx)
2930	{
2931	struct hl_vm_hw_block_list_node lnode, tmp;
2932
2933	if (!list_empty(head: &ctx->hw_block_mem_list))
2934	dev_crit(ctx->hdev->dev, "HW block mem list isn't empty\n");
2935
2936	list_for_each_entry_safe(lnode, tmp, &ctx->hw_block_mem_list, node) {
2937	list_del(entry: &lnode->node);
2938	kfree(objp: lnode);
2939	}
2940
2941	mutex_destroy(lock: &ctx->hw_block_list_lock);
2942	}
2943

source code of linux/drivers/accel/habanalabs/common/memory.c