// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2016-2021 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#include <uapi/drm/habanalabs_accel.h>
#include "habanalabs.h"

#include <linux/uaccess.h>
#include <linux/slab.h>

#define HL_CS_FLAGS_TYPE_MASK	(HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT | \
		HL_CS_FLAGS_COLLECTIVE_WAIT | HL_CS_FLAGS_RESERVE_SIGNALS_ONLY | \
		HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY | HL_CS_FLAGS_ENGINE_CORE_COMMAND | \
		HL_CS_FLAGS_ENGINES_COMMAND | HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES)

#define MAX_TS_ITER_NUM 100

/**
 * enum hl_cs_wait_status - cs wait status
 * @CS_WAIT_STATUS_BUSY: cs was not completed yet
 * @CS_WAIT_STATUS_COMPLETED: cs completed
 * @CS_WAIT_STATUS_GONE: cs completed but fence is already gone
 */
enum hl_cs_wait_status {
	CS_WAIT_STATUS_BUSY,
	CS_WAIT_STATUS_COMPLETED,
	CS_WAIT_STATUS_GONE
};

/*
 * Data used while handling wait/timestamp nodes.
 * The purpose of this struct is to store the needed data for both operations
 * in one variable instead of passing a large number of arguments to functions.
 */
struct wait_interrupt_data {
	struct hl_user_interrupt *interrupt;
	struct hl_mmap_mem_buf *buf;
	struct hl_mem_mgr *mmg;
	struct hl_cb *cq_cb;
	u64 ts_handle;
	u64 ts_offset;
	u64 cq_handle;
	u64 cq_offset;
	u64 target_value;
	u64 intr_timeout_us;
};

static void job_wq_completion(struct work_struct *work);
static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 seq,
				enum hl_cs_wait_status *status, s64 *timestamp);
static void cs_do_release(struct kref *ref);

static void hl_push_cs_outcome(struct hl_device *hdev,
		struct hl_cs_outcome_store *outcome_store,
		u64 seq, ktime_t ts, int error)
{
	struct hl_cs_outcome *node;
	unsigned long flags;

	/*
	 * CS outcome store supports the following operations:
	 * push outcome - store a recent CS outcome in the store
	 * pop outcome - retrieve a SPECIFIC (by seq) CS outcome from the store
	 * It uses 2 lists: used list and free list.
	 * It has a pre-allocated amount of nodes, each node stores
	 * a single CS outcome.
	 * Initially, all the nodes are in the free list.
	 * On push outcome, a node (any) is taken from the free list, its
	 * information is filled in, and the node is moved to the used list.
	 * It is possible that there are no nodes left in the free list.
	 * In this case, we will lose some information about old outcomes. We
	 * will pop the OLDEST node from the used list, and make it free.
	 * On pop, the node is searched for in the used list (using a search
	 * index).
	 * If found, the node is then removed from the used list, and moved
	 * back to the free list. The outcome data that the node contained is
	 * returned back to the user.
	 */

	spin_lock_irqsave(&outcome_store->db_lock, flags);

	if (list_empty(&outcome_store->free_list)) {
		node = list_last_entry(&outcome_store->used_list,
				struct hl_cs_outcome, list_link);
		hash_del(&node->map_link);
		dev_dbg(hdev->dev, "CS %llu outcome was lost\n", node->seq);
	} else {
		node = list_last_entry(&outcome_store->free_list,
				struct hl_cs_outcome, list_link);
	}

	list_del_init(&node->list_link);

	node->seq = seq;
	node->ts = ts;
	node->error = error;

	list_add(&node->list_link, &outcome_store->used_list);
	hash_add(outcome_store->outcome_map, &node->map_link, node->seq);

	spin_unlock_irqrestore(&outcome_store->db_lock, flags);
}

static bool hl_pop_cs_outcome(struct hl_cs_outcome_store *outcome_store,
		u64 seq, ktime_t *ts, int *error)
{
	struct hl_cs_outcome *node;
	unsigned long flags;

	spin_lock_irqsave(&outcome_store->db_lock, flags);

	hash_for_each_possible(outcome_store->outcome_map, node, map_link, seq)
		if (node->seq == seq) {
			*ts = node->ts;
			*error = node->error;

			hash_del(&node->map_link);
			list_del_init(&node->list_link);
			list_add(&node->list_link, &outcome_store->free_list);

			spin_unlock_irqrestore(&outcome_store->db_lock, flags);

			return true;
		}

	spin_unlock_irqrestore(&outcome_store->db_lock, flags);

	return false;
}
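
/*
 * Editorial note: within this file, outcomes are pushed from cs_do_release()
 * when the CS was submitted with HL_CS_FLAGS_TIMESTAMP set; the matching
 * pop-by-sequence is expected to happen on the CS-wait path, so a waiter can
 * still report the stored timestamp/error after the fence itself is gone
 * (see CS_WAIT_STATUS_GONE above).
 */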

static void hl_sob_reset(struct kref *ref)
{
	struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
							kref);
	struct hl_device *hdev = hw_sob->hdev;

	dev_dbg(hdev->dev, "reset sob id %u\n", hw_sob->sob_id);

	hdev->asic_funcs->reset_sob(hdev, hw_sob);

	hw_sob->need_reset = false;
}

void hl_sob_reset_error(struct kref *ref)
{
	struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
							kref);
	struct hl_device *hdev = hw_sob->hdev;

	dev_crit(hdev->dev,
		"SOB release shouldn't be called here, q_idx: %d, sob_id: %d\n",
		hw_sob->q_idx, hw_sob->sob_id);
}

void hw_sob_put(struct hl_hw_sob *hw_sob)
{
	if (hw_sob)
		kref_put(&hw_sob->kref, hl_sob_reset);
}

static void hw_sob_put_err(struct hl_hw_sob *hw_sob)
{
	if (hw_sob)
		kref_put(&hw_sob->kref, hl_sob_reset_error);
}

void hw_sob_get(struct hl_hw_sob *hw_sob)
{
	if (hw_sob)
		kref_get(&hw_sob->kref);
}

/**
 * hl_gen_sob_mask() - Generates a sob mask to be used in a monitor arm packet
 * @sob_base: sob base id
 * @sob_mask: sob user mask, each bit represents a sob offset from sob base
 * @mask: generated mask
 *
 * Return: 0 if given parameters are valid
 */
int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask)
{
	int i;

	if (sob_mask == 0)
		return -EINVAL;

	if (sob_mask == 0x1) {
		*mask = ~(1 << (sob_base & 0x7));
	} else {
		/* find msb in order to verify sob range is valid */
		for (i = BITS_PER_BYTE - 1 ; i >= 0 ; i--)
			if (BIT(i) & sob_mask)
				break;

		if (i > (HL_MAX_SOBS_PER_MONITOR - (sob_base & 0x7) - 1))
			return -EINVAL;

		*mask = ~sob_mask;
	}

	return 0;
}
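
/*
 * Worked example (editorial, not from the original source): with sob_base = 3
 * and sob_mask = 0x1, the single monitored SOB is the base itself, so
 * *mask = (u8)~(1 << 3) = 0xf7. With sob_mask = 0x5 (offsets 0 and 2 from the
 * base), the msb is bit 2, which must fit in the window of
 * HL_MAX_SOBS_PER_MONITOR SOBs starting at (sob_base & 0x7); the result is
 * *mask = (u8)~0x5 = 0xfa. The inversion suggests the monitor treats cleared
 * mask bits as "watch this SOB", though that is hardware-defined.
 */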

static void hl_fence_release(struct kref *kref)
{
	struct hl_fence *fence =
		container_of(kref, struct hl_fence, refcount);
	struct hl_cs_compl *hl_cs_cmpl =
		container_of(fence, struct hl_cs_compl, base_fence);

	kfree(hl_cs_cmpl);
}

void hl_fence_put(struct hl_fence *fence)
{
	if (IS_ERR_OR_NULL(fence))
		return;
	kref_put(&fence->refcount, hl_fence_release);
}

void hl_fences_put(struct hl_fence **fence, int len)
{
	int i;

	for (i = 0; i < len; i++, fence++)
		hl_fence_put(*fence);
}

void hl_fence_get(struct hl_fence *fence)
{
	if (fence)
		kref_get(&fence->refcount);
}

static void hl_fence_init(struct hl_fence *fence, u64 sequence)
{
	kref_init(&fence->refcount);
	fence->cs_sequence = sequence;
	fence->error = 0;
	fence->timestamp = ktime_set(0, 0);
	fence->mcs_handling_done = false;
	init_completion(&fence->completion);
}

void cs_get(struct hl_cs *cs)
{
	kref_get(&cs->refcount);
}

static int cs_get_unless_zero(struct hl_cs *cs)
{
	return kref_get_unless_zero(&cs->refcount);
}

static void cs_put(struct hl_cs *cs)
{
	kref_put(&cs->refcount, cs_do_release);
}

static void cs_job_do_release(struct kref *ref)
{
	struct hl_cs_job *job = container_of(ref, struct hl_cs_job, refcount);

	kfree(job);
}

static void hl_cs_job_put(struct hl_cs_job *job)
{
	kref_put(&job->refcount, cs_job_do_release);
}

bool cs_needs_completion(struct hl_cs *cs)
{
	/* In case this is a staged CS, only the last CS in sequence should
	 * get a completion; any non-staged CS will always get a completion
	 */
	if (cs->staged_cs && !cs->staged_last)
		return false;

	return true;
}

bool cs_needs_timeout(struct hl_cs *cs)
{
	/* In case this is a staged CS, only the first CS in sequence should
	 * get a timeout; any non-staged CS will always get a timeout
	 */
	if (cs->staged_cs && !cs->staged_first)
		return false;

	return true;
}
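
/*
 * Editorial summary of the two helpers above: in a staged submission the CS
 * marked 'staged_first' owns the TDR timeout and the CS marked 'staged_last'
 * owns the completion; a standalone (non-staged) CS owns both. A single-CS
 * staged submission carries both indications and thus behaves like a
 * standalone CS.
 */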

static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
{
	/* Patched CB is created for external queues jobs */
	return (job->queue_type == QUEUE_TYPE_EXT);
}

/*
 * cs_parser - parse the user command submission
 *
 * @hpriv	: pointer to the private data of the fd
 * @job	: pointer to the job that holds the command submission info
 *
 * The function parses the command submission of the user. It calls the
 * ASIC specific parser, which returns a list of memory blocks to send
 * to the device as different command buffers
 *
 */
static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
{
	struct hl_device *hdev = hpriv->hdev;
	struct hl_cs_parser parser;
	int rc;

	parser.ctx_id = job->cs->ctx->asid;
	parser.cs_sequence = job->cs->sequence;
	parser.job_id = job->id;

	parser.hw_queue_id = job->hw_queue_id;
	parser.job_userptr_list = &job->userptr_list;
	parser.patched_cb = NULL;
	parser.user_cb = job->user_cb;
	parser.user_cb_size = job->user_cb_size;
	parser.queue_type = job->queue_type;
	parser.is_kernel_allocated_cb = job->is_kernel_allocated_cb;
	job->patched_cb = NULL;
	parser.completion = cs_needs_completion(job->cs);

	rc = hdev->asic_funcs->cs_parser(hdev, &parser);

	if (is_cb_patched(hdev, job)) {
		if (!rc) {
			job->patched_cb = parser.patched_cb;
			job->job_cb_size = parser.patched_cb_size;
			job->contains_dma_pkt = parser.contains_dma_pkt;
			atomic_inc(&job->patched_cb->cs_cnt);
		}

		/*
		 * Whether the parsing worked or not, we don't need the
		 * original CB anymore because it was already parsed and
		 * won't be accessed again for this CS
		 */
		atomic_dec(&job->user_cb->cs_cnt);
		hl_cb_put(job->user_cb);
		job->user_cb = NULL;
	} else if (!rc) {
		job->job_cb_size = job->user_cb_size;
	}

	return rc;
}

static void hl_complete_job(struct hl_device *hdev, struct hl_cs_job *job)
{
	struct hl_cs *cs = job->cs;

	if (is_cb_patched(hdev, job)) {
		hl_userptr_delete_list(hdev, &job->userptr_list);

		/*
		 * We might arrive here from rollback and patched CB wasn't
		 * created, so we need to check it's not NULL
		 */
		if (job->patched_cb) {
			atomic_dec(&job->patched_cb->cs_cnt);
			hl_cb_put(job->patched_cb);
		}
	}

	/* For H/W queue jobs, if a user CB was allocated by the driver,
	 * the user CB isn't released in cs_parser() and thus should be
	 * released here. This is also true for INT queues jobs which were
	 * allocated by the driver.
	 */
	if (job->is_kernel_allocated_cb &&
			(job->queue_type == QUEUE_TYPE_HW || job->queue_type == QUEUE_TYPE_INT)) {
		atomic_dec(&job->user_cb->cs_cnt);
		hl_cb_put(job->user_cb);
	}

	/*
	 * This is the only place where there can be multiple threads
	 * modifying the list at the same time
	 */
	spin_lock(&cs->job_lock);
	list_del(&job->cs_node);
	spin_unlock(&cs->job_lock);

	hl_debugfs_remove_job(hdev, job);

	/* We decrement the reference only for a CS that gets completion,
	 * because the reference was incremented only for this kind of CS
	 * right before it was scheduled.
	 *
	 * In staged submission, only the last CS marked as 'staged_last'
	 * gets completion, hence its release function will be called from here.
	 * As for all the other CSs in the staged submission which do not get
	 * completion, their CS reference will be decremented by the
	 * 'staged_last' CS during the CS release flow.
	 * All relevant PQ CI counters will be incremented during the CS release
	 * flow by calling 'hl_hw_queue_update_ci'.
	 */
	if (cs_needs_completion(cs) &&
			(job->queue_type == QUEUE_TYPE_EXT || job->queue_type == QUEUE_TYPE_HW)) {

		/* In CS based completions, the timestamp is already available,
		 * so no need to extract it from the job
		 */
		if (hdev->asic_prop.completion_mode == HL_COMPLETION_MODE_JOB)
			cs->completion_timestamp = job->timestamp;

		cs_put(cs);
	}

	hl_cs_job_put(job);
}

/*
 * hl_staged_cs_find_first - locate the first CS in this staged submission
 *
 * @hdev: pointer to device structure
 * @cs_seq: staged submission sequence number
 *
 * @note: This function must be called under 'hdev->cs_mirror_lock'
 *
 * Find and return a CS pointer with the given sequence
 */
struct hl_cs *hl_staged_cs_find_first(struct hl_device *hdev, u64 cs_seq)
{
	struct hl_cs *cs;

	list_for_each_entry_reverse(cs, &hdev->cs_mirror_list, mirror_node)
		if (cs->staged_cs && cs->staged_first &&
				cs->sequence == cs_seq)
			return cs;

	return NULL;
}

/*
 * is_staged_cs_last_exists - returns true if the last CS in sequence exists
 *
 * @hdev: pointer to device structure
 * @cs: staged submission member
 *
 */
bool is_staged_cs_last_exists(struct hl_device *hdev, struct hl_cs *cs)
{
	struct hl_cs *last_entry;

	last_entry = list_last_entry(&cs->staged_cs_node, struct hl_cs,
								staged_cs_node);

	if (last_entry->staged_last)
		return true;

	return false;
}

/*
 * staged_cs_get - get CS reference if this CS is a part of a staged CS
 *
 * @hdev: pointer to device structure
 * @cs: current CS
 * @cs_seq: staged submission sequence number
 *
 * Increment CS reference for every CS in this staged submission except for
 * the CS which gets completion.
 */
static void staged_cs_get(struct hl_device *hdev, struct hl_cs *cs)
{
	/* Only the last CS in this staged submission will get a completion.
	 * We must increment the reference for all other CS's in this
	 * staged submission.
	 * Once we get a completion we will release the whole staged submission.
	 */
	if (!cs->staged_last)
		cs_get(cs);
}

/*
 * staged_cs_put - put a CS in case it is part of staged submission
 *
 * @hdev: pointer to device structure
 * @cs: CS to put
 *
 * This function decrements a CS reference (for a non completion CS)
 */
static void staged_cs_put(struct hl_device *hdev, struct hl_cs *cs)
{
	/* We release all CS's in a staged submission except the last
	 * CS, whose reference we have never incremented.
	 */
	if (!cs_needs_completion(cs))
		cs_put(cs);
}

static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs)
{
	struct hl_cs *next = NULL, *iter, *first_cs;

	if (!cs_needs_timeout(cs))
		return;

	spin_lock(&hdev->cs_mirror_lock);

	/* We need to handle tdr only once for the complete staged submission.
	 * Hence, we choose the CS that reaches this function first, which is
	 * the CS marked as 'staged_last'.
	 * In case a single staged cs was submitted which has both first and
	 * last indications, then "cs_find_first" below will return NULL,
	 * since we removed the cs node from the list before getting here.
	 * In such cases just continue with the cs to cancel its TDR work.
	 */
	if (cs->staged_cs && cs->staged_last) {
		first_cs = hl_staged_cs_find_first(hdev, cs->staged_sequence);
		if (first_cs)
			cs = first_cs;
	}

	spin_unlock(&hdev->cs_mirror_lock);

	/* Don't cancel TDR in case this CS was timedout because we might be
	 * running from the TDR context
	 */
	if (cs->timedout || hdev->timeout_jiffies == MAX_SCHEDULE_TIMEOUT)
		return;

	if (cs->tdr_active)
		cancel_delayed_work_sync(&cs->work_tdr);

	spin_lock(&hdev->cs_mirror_lock);

	/* queue TDR for next CS */
	list_for_each_entry(iter, &hdev->cs_mirror_list, mirror_node)
		if (cs_needs_timeout(iter)) {
			next = iter;
			break;
		}

	if (next && !next->tdr_active) {
		next->tdr_active = true;
		schedule_delayed_work(&next->work_tdr, next->timeout_jiffies);
	}

	spin_unlock(&hdev->cs_mirror_lock);
}

/*
 * force_complete_multi_cs - complete all contexts that wait on multi-CS
 *
 * @hdev: pointer to habanalabs device structure
 */
static void force_complete_multi_cs(struct hl_device *hdev)
{
	int i;

	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
		struct multi_cs_completion *mcs_compl;

		mcs_compl = &hdev->multi_cs_completion[i];

		spin_lock(&mcs_compl->lock);

		if (!mcs_compl->used) {
			spin_unlock(&mcs_compl->lock);
			continue;
		}

		/* when calling force complete no context should be waiting on
		 * multi-CS.
		 * We are calling the function as a protection for such case,
		 * to free any pending context and print an error message
		 */
		dev_err(hdev->dev,
			"multi-CS completion context %d still waiting when calling force completion\n",
			i);
		complete_all(&mcs_compl->completion);
		spin_unlock(&mcs_compl->lock);
	}
}

/*
 * complete_multi_cs - complete all waiting entities on multi-CS
 *
 * @hdev: pointer to habanalabs device structure
 * @cs: CS structure
 * The function signals a waiting entity that has overlapping stream masters
 * with the completed CS.
 * For example:
 * - a completed CS worked on stream master QID 4, multi CS completion
 *   is actively waiting on stream master QIDs 3, 5. don't send signal as no
 *   common stream master QID
 * - a completed CS worked on stream master QID 4, multi CS completion
 *   is actively waiting on stream master QIDs 3, 4. send signal as stream
 *   master QID 4 is common
 */
static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
{
	struct hl_fence *fence = cs->fence;
	int i;

	/* in case of multi CS check for completion only for the first CS */
	if (cs->staged_cs && !cs->staged_first)
		return;

	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
		struct multi_cs_completion *mcs_compl;

		mcs_compl = &hdev->multi_cs_completion[i];
		if (!mcs_compl->used)
			continue;

		spin_lock(&mcs_compl->lock);

		/*
		 * complete if:
		 * 1. still waiting for completion
		 * 2. the completed CS has at least one overlapping stream
		 *    master with the stream masters in the completion
		 */
		if (mcs_compl->used &&
				(fence->stream_master_qid_map &
					mcs_compl->stream_master_qid_map)) {
			/* extract the timestamp only of first completed CS */
			if (!mcs_compl->timestamp)
				mcs_compl->timestamp = ktime_to_ns(fence->timestamp);

			complete_all(&mcs_compl->completion);

			/*
			 * Setting mcs_handling_done inside the lock ensures
			 * at least one fence has mcs_handling_done set to
			 * true before wait for mcs finish. This ensures at
			 * least one CS will be set as completed when polling
			 * mcs fences.
			 */
			fence->mcs_handling_done = true;
		}

		spin_unlock(&mcs_compl->lock);
	}
	/* In case CS completed without mcs completion initialized */
	fence->mcs_handling_done = true;
}

static inline void cs_release_sob_reset_handler(struct hl_device *hdev,
					struct hl_cs *cs,
					struct hl_cs_compl *hl_cs_cmpl)
{
	/* Skip this handler if the cs wasn't submitted, to avoid putting
	 * the hw_sob twice, since that case is already handled at this point.
	 * Also skip if the hw_sob pointer wasn't set.
	 */
	if (!hl_cs_cmpl->hw_sob || !cs->submitted)
		return;

	spin_lock(&hl_cs_cmpl->lock);

	/*
	 * we get refcount upon reservation of signals or signal/wait cs for the
	 * hw_sob object, and need to put it when the first staged cs
	 * (which contains the encaps signals) or cs signal/wait is completed.
	 */
	if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
			(hl_cs_cmpl->type == CS_TYPE_WAIT) ||
			(hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT) ||
			(!!hl_cs_cmpl->encaps_signals)) {
		dev_dbg(hdev->dev,
			"CS 0x%llx type %d finished, sob_id: %d, sob_val: %u\n",
			hl_cs_cmpl->cs_seq,
			hl_cs_cmpl->type,
			hl_cs_cmpl->hw_sob->sob_id,
			hl_cs_cmpl->sob_val);

		hw_sob_put(hl_cs_cmpl->hw_sob);

		if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)
			hdev->asic_funcs->reset_sob_group(hdev,
					hl_cs_cmpl->sob_group);
	}

	spin_unlock(&hl_cs_cmpl->lock);
}

static void cs_do_release(struct kref *ref)
{
	struct hl_cs *cs = container_of(ref, struct hl_cs, refcount);
	struct hl_device *hdev = cs->ctx->hdev;
	struct hl_cs_job *job, *tmp;
	struct hl_cs_compl *hl_cs_cmpl =
			container_of(cs->fence, struct hl_cs_compl, base_fence);

	cs->completed = true;

	/*
	 * Although if we reached here it means that all external jobs have
	 * finished, because each one of them took refcnt to CS, we still
	 * need to go over the internal jobs and complete them. Otherwise, we
	 * will have leaked memory and what's worse, the CS object (and
	 * potentially the CTX object) could be released, while the JOB
	 * still holds a pointer to them (but no reference).
	 */
	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
		hl_complete_job(hdev, job);

	if (!cs->submitted) {
		/*
		 * In case the wait for signal CS was submitted, the fence put
		 * occurs in init_signal_wait_cs() or collective_wait_init_cs()
		 * right before hanging on the PQ.
		 */
		if (cs->type == CS_TYPE_WAIT ||
				cs->type == CS_TYPE_COLLECTIVE_WAIT)
			hl_fence_put(cs->signal_fence);

		goto out;
	}

	/* Need to update CI for all queue jobs that do not get completion */
	hl_hw_queue_update_ci(cs);

	/* remove CS from CS mirror list */
	spin_lock(&hdev->cs_mirror_lock);
	list_del_init(&cs->mirror_node);
	spin_unlock(&hdev->cs_mirror_lock);

	cs_handle_tdr(hdev, cs);

	if (cs->staged_cs) {
		/* the completion CS decrements reference for the entire
		 * staged submission
		 */
		if (cs->staged_last) {
			struct hl_cs *staged_cs, *tmp_cs;

			list_for_each_entry_safe(staged_cs, tmp_cs,
					&cs->staged_cs_node, staged_cs_node)
				staged_cs_put(hdev, staged_cs);
		}

		/* A staged CS will be a member in the list only after it
		 * was submitted. We used 'cs_mirror_lock' when inserting
		 * it to the list, so we will use it again when removing it
		 */
		if (cs->submitted) {
			spin_lock(&hdev->cs_mirror_lock);
			list_del(&cs->staged_cs_node);
			spin_unlock(&hdev->cs_mirror_lock);
		}

		/* decrement refcount to handle when first staged cs
		 * with encaps signals is completed.
		 */
		if (hl_cs_cmpl->encaps_signals)
			kref_put(&hl_cs_cmpl->encaps_sig_hdl->refcount,
					hl_encaps_release_handle_and_put_ctx);
	}

	if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT) && cs->encaps_signals)
		kref_put(&cs->encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);

out:
	/* Must be called before hl_ctx_put because inside we use ctx to get
	 * the device
	 */
	hl_debugfs_remove_cs(cs);

	hdev->shadow_cs_queue[cs->sequence & (hdev->asic_prop.max_pending_cs - 1)] = NULL;

	/* We need to mark an error for the not-submitted case because in that
	 * case the hl fence release flow is different. Mainly, we don't need
	 * to handle hw_sob for signal/wait
	 */
	if (cs->timedout)
		cs->fence->error = -ETIMEDOUT;
	else if (cs->aborted)
		cs->fence->error = -EIO;
	else if (!cs->submitted)
		cs->fence->error = -EBUSY;

	if (unlikely(cs->skip_reset_on_timeout)) {
		dev_err(hdev->dev,
			"Command submission %llu completed after %llu (s)\n",
			cs->sequence,
			div_u64(jiffies - cs->submission_time_jiffies, HZ));
	}

	if (cs->timestamp) {
		cs->fence->timestamp = cs->completion_timestamp;
		hl_push_cs_outcome(hdev, &cs->ctx->outcome_store, cs->sequence,
				   cs->fence->timestamp, cs->fence->error);
	}

	hl_ctx_put(cs->ctx);

	complete_all(&cs->fence->completion);
	complete_multi_cs(hdev, cs);

	cs_release_sob_reset_handler(hdev, cs, hl_cs_cmpl);

	hl_fence_put(cs->fence);

	kfree(cs->jobs_in_queue_cnt);
	kfree(cs);
}

static void cs_timedout(struct work_struct *work)
{
	struct hl_cs *cs = container_of(work, struct hl_cs, work_tdr.work);
	bool skip_reset_on_timeout, device_reset = false;
	struct hl_device *hdev;
	u64 event_mask = 0x0;
	uint timeout_sec;
	int rc;

	skip_reset_on_timeout = cs->skip_reset_on_timeout;

	rc = cs_get_unless_zero(cs);
	if (!rc)
		return;

	if ((!cs->submitted) || (cs->completed)) {
		cs_put(cs);
		return;
	}

	hdev = cs->ctx->hdev;

	if (likely(!skip_reset_on_timeout)) {
		if (hdev->reset_on_lockup)
			device_reset = true;
		else
			hdev->reset_info.needs_reset = true;

		/* Mark the CS as timed out so we won't try to cancel its TDR */
		cs->timedout = true;
	}

	/* Save only the first CS timeout parameters */
	rc = atomic_cmpxchg(&hdev->captured_err_info.cs_timeout.write_enable, 1, 0);
	if (rc) {
		hdev->captured_err_info.cs_timeout.timestamp = ktime_get();
		hdev->captured_err_info.cs_timeout.seq = cs->sequence;
		event_mask |= HL_NOTIFIER_EVENT_CS_TIMEOUT;
	}

	timeout_sec = jiffies_to_msecs(hdev->timeout_jiffies) / 1000;

	switch (cs->type) {
	case CS_TYPE_SIGNAL:
		dev_err(hdev->dev,
			"Signal command submission %llu has not finished in %u seconds!\n",
			cs->sequence, timeout_sec);
		break;

	case CS_TYPE_WAIT:
		dev_err(hdev->dev,
			"Wait command submission %llu has not finished in %u seconds!\n",
			cs->sequence, timeout_sec);
		break;

	case CS_TYPE_COLLECTIVE_WAIT:
		dev_err(hdev->dev,
			"Collective Wait command submission %llu has not finished in %u seconds!\n",
			cs->sequence, timeout_sec);
		break;

	default:
		dev_err(hdev->dev,
			"Command submission %llu has not finished in %u seconds!\n",
			cs->sequence, timeout_sec);
		break;
	}

	rc = hl_state_dump(hdev);
	if (rc)
		dev_err(hdev->dev, "Error during system state dump %d\n", rc);

	cs_put(cs);

	if (device_reset) {
		event_mask |= HL_NOTIFIER_EVENT_DEVICE_RESET;
		hl_device_cond_reset(hdev, HL_DRV_RESET_TDR, event_mask);
	} else if (event_mask) {
		hl_notifier_event_send_all(hdev, event_mask);
	}
}

static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
			enum hl_cs_type cs_type, u64 user_sequence,
			struct hl_cs **cs_new, u32 flags, u32 timeout)
{
	struct hl_cs_counters_atomic *cntr;
	struct hl_fence *other = NULL;
	struct hl_cs_compl *cs_cmpl;
	struct hl_cs *cs;
	int rc;

	cntr = &hdev->aggregated_cs_counters;

	cs = kzalloc(sizeof(*cs), GFP_ATOMIC);
	if (!cs)
		cs = kzalloc(sizeof(*cs), GFP_KERNEL);

	if (!cs) {
		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
		atomic64_inc(&cntr->out_of_mem_drop_cnt);
		return -ENOMEM;
	}

	/* increment refcnt for context */
	hl_ctx_get(ctx);

	cs->ctx = ctx;
	cs->submitted = false;
	cs->completed = false;
	cs->type = cs_type;
	cs->timestamp = !!(flags & HL_CS_FLAGS_TIMESTAMP);
	cs->encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
	cs->timeout_jiffies = timeout;
	cs->skip_reset_on_timeout =
		hdev->reset_info.skip_reset_on_timeout ||
		!!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
	cs->submission_time_jiffies = jiffies;
	INIT_LIST_HEAD(&cs->job_list);
	INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
	kref_init(&cs->refcount);
	spin_lock_init(&cs->job_lock);

	cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_ATOMIC);
	if (!cs_cmpl)
		cs_cmpl = kzalloc(sizeof(*cs_cmpl), GFP_KERNEL);

	if (!cs_cmpl) {
		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
		atomic64_inc(&cntr->out_of_mem_drop_cnt);
		rc = -ENOMEM;
		goto free_cs;
	}

	cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
			sizeof(*cs->jobs_in_queue_cnt), GFP_ATOMIC);
	if (!cs->jobs_in_queue_cnt)
		cs->jobs_in_queue_cnt = kcalloc(hdev->asic_prop.max_queues,
				sizeof(*cs->jobs_in_queue_cnt), GFP_KERNEL);

	if (!cs->jobs_in_queue_cnt) {
		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
		atomic64_inc(&cntr->out_of_mem_drop_cnt);
		rc = -ENOMEM;
		goto free_cs_cmpl;
	}

	cs_cmpl->hdev = hdev;
	cs_cmpl->type = cs->type;
	spin_lock_init(&cs_cmpl->lock);
	cs->fence = &cs_cmpl->base_fence;

	spin_lock(&ctx->cs_lock);

	cs_cmpl->cs_seq = ctx->cs_sequence;
	other = ctx->cs_pending[cs_cmpl->cs_seq &
				(hdev->asic_prop.max_pending_cs - 1)];

	if (other && !completion_done(&other->completion)) {
		/* If the following statement is true, it means we have reached
		 * a point in which only part of the staged submission was
		 * submitted and we don't have enough room in the 'cs_pending'
		 * array for the rest of the submission.
		 * This causes a deadlock because this CS will never be
		 * completed as it depends on future CS's for completion.
		 */
		if (other->cs_sequence == user_sequence)
			dev_crit_ratelimited(hdev->dev,
				"Staged CS %llu deadlock due to lack of resources",
				user_sequence);

		dev_dbg_ratelimited(hdev->dev,
			"Rejecting CS because of too many in-flight CSs\n");
		atomic64_inc(&ctx->cs_counters.max_cs_in_flight_drop_cnt);
		atomic64_inc(&cntr->max_cs_in_flight_drop_cnt);
		rc = -EAGAIN;
		goto free_fence;
	}

	/* init hl_fence */
	hl_fence_init(&cs_cmpl->base_fence, cs_cmpl->cs_seq);

	cs->sequence = cs_cmpl->cs_seq;

	ctx->cs_pending[cs_cmpl->cs_seq &
			(hdev->asic_prop.max_pending_cs - 1)] =
			&cs_cmpl->base_fence;
	ctx->cs_sequence++;

	hl_fence_get(&cs_cmpl->base_fence);

	hl_fence_put(other);

	spin_unlock(&ctx->cs_lock);

	*cs_new = cs;

	return 0;

free_fence:
	spin_unlock(&ctx->cs_lock);
	kfree(cs->jobs_in_queue_cnt);
free_cs_cmpl:
	kfree(cs_cmpl);
free_cs:
	kfree(cs);
	hl_ctx_put(ctx);
	return rc;
}
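
/*
 * Editorial note on the flow above: 'cs_pending' is a power-of-two ring
 * indexed by (sequence & (max_pending_cs - 1)). A new CS may take a slot only
 * once the fence currently occupying it has completed; otherwise the
 * submission is rejected with -EAGAIN, which user-space is expected to treat
 * as "retry later".
 */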

static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
{
	struct hl_cs_job *job, *tmp;

	staged_cs_put(hdev, cs);

	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
		hl_complete_job(hdev, job);
}

/*
 * release_reserved_encaps_signals() - release reserved encapsulated signals.
 * @hdev: pointer to habanalabs device structure
 *
 * Release reserved encapsulated signals which weren't un-reserved, or for which a CS with
 * encapsulated signals wasn't submitted and thus weren't released as part of CS roll-back.
 * For these signals we also need to put the refcount of the H/W SOB which was taken at the
 * reservation.
 */
static void release_reserved_encaps_signals(struct hl_device *hdev)
{
	struct hl_ctx *ctx = hl_get_compute_ctx(hdev);
	struct hl_cs_encaps_sig_handle *handle;
	struct hl_encaps_signals_mgr *mgr;
	u32 id;

	if (!ctx)
		return;

	mgr = &ctx->sig_mgr;

	idr_for_each_entry(&mgr->handles, handle, id)
		if (handle->cs_seq == ULLONG_MAX)
			kref_put(&handle->refcount, hl_encaps_release_handle_and_put_sob_ctx);

	hl_ctx_put(ctx);
}

void hl_cs_rollback_all(struct hl_device *hdev, bool skip_wq_flush)
{
	int i;
	struct hl_cs *cs, *tmp;

	if (!skip_wq_flush) {
		flush_workqueue(hdev->ts_free_obj_wq);

		/* flush all completions before iterating over the CS mirror list in
		 * order to avoid a race with the release functions
		 */
		for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
			flush_workqueue(hdev->cq_wq[i]);

		flush_workqueue(hdev->cs_cmplt_wq);
	}

	/* Make sure we don't have leftovers in the CS mirror list */
	list_for_each_entry_safe(cs, tmp, &hdev->cs_mirror_list, mirror_node) {
		cs_get(cs);
		cs->aborted = true;
		dev_warn_ratelimited(hdev->dev, "Killing CS %d.%llu\n",
					cs->ctx->asid, cs->sequence);
		cs_rollback(hdev, cs);
		cs_put(cs);
	}

	force_complete_multi_cs(hdev);

	release_reserved_encaps_signals(hdev);
}

static void
wake_pending_user_interrupt_threads(struct hl_user_interrupt *interrupt)
{
	struct hl_user_pending_interrupt *pend, *temp;
	unsigned long flags;

	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
	list_for_each_entry_safe(pend, temp, &interrupt->wait_list_head, list_node) {
		pend->fence.error = -EIO;
		complete_all(&pend->fence.completion);
	}
	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);

	spin_lock_irqsave(&interrupt->ts_list_lock, flags);
	list_for_each_entry_safe(pend, temp, &interrupt->ts_list_head, list_node) {
		list_del(&pend->list_node);
		hl_mmap_mem_buf_put(pend->ts_reg_info.buf);
		hl_cb_put(pend->ts_reg_info.cq_cb);
	}
	spin_unlock_irqrestore(&interrupt->ts_list_lock, flags);
}

void hl_release_pending_user_interrupts(struct hl_device *hdev)
{
	struct asic_fixed_properties *prop = &hdev->asic_prop;
	struct hl_user_interrupt *interrupt;
	int i;

	if (!prop->user_interrupt_count)
		return;

	/* We iterate through the user interrupt requests and wake up all
	 * user threads waiting for interrupt completion. We iterate the
	 * list under a lock, this is why all user threads, once awake,
	 * will wait on the same lock and will release the waiting object upon
	 * unlock.
	 */

	for (i = 0 ; i < prop->user_interrupt_count ; i++) {
		interrupt = &hdev->user_interrupt[i];
		wake_pending_user_interrupt_threads(interrupt);
	}

	interrupt = &hdev->common_user_cq_interrupt;
	wake_pending_user_interrupt_threads(interrupt);

	interrupt = &hdev->common_decoder_interrupt;
	wake_pending_user_interrupt_threads(interrupt);
}

static void force_complete_cs(struct hl_device *hdev)
{
	struct hl_cs *cs;

	spin_lock(&hdev->cs_mirror_lock);

	list_for_each_entry(cs, &hdev->cs_mirror_list, mirror_node) {
		cs->fence->error = -EIO;
		complete_all(&cs->fence->completion);
	}

	spin_unlock(&hdev->cs_mirror_lock);
}

void hl_abort_waiting_for_cs_completions(struct hl_device *hdev)
{
	force_complete_cs(hdev);
	force_complete_multi_cs(hdev);
}

static void job_wq_completion(struct work_struct *work)
{
	struct hl_cs_job *job = container_of(work, struct hl_cs_job,
						finish_work);
	struct hl_cs *cs = job->cs;
	struct hl_device *hdev = cs->ctx->hdev;

	/* job is no longer needed */
	hl_complete_job(hdev, job);
}

static void cs_completion(struct work_struct *work)
{
	struct hl_cs *cs = container_of(work, struct hl_cs, finish_work);
	struct hl_device *hdev = cs->ctx->hdev;
	struct hl_cs_job *job, *tmp;

	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
		hl_complete_job(hdev, job);
}

u32 hl_get_active_cs_num(struct hl_device *hdev)
{
	u32 active_cs_num = 0;
	struct hl_cs *cs;

	spin_lock(&hdev->cs_mirror_lock);

	list_for_each_entry(cs, &hdev->cs_mirror_list, mirror_node)
		if (!cs->completed)
			active_cs_num++;

	spin_unlock(&hdev->cs_mirror_lock);

	return active_cs_num;
}

static int validate_queue_index(struct hl_device *hdev,
				struct hl_cs_chunk *chunk,
				enum hl_queue_type *queue_type,
				bool *is_kernel_allocated_cb)
{
	struct asic_fixed_properties *asic = &hdev->asic_prop;
	struct hw_queue_properties *hw_queue_prop;

	/* This must be checked here to prevent out-of-bounds access to
	 * hw_queues_props array
	 */
	if (chunk->queue_index >= asic->max_queues) {
		dev_err(hdev->dev, "Queue index %d is invalid\n",
			chunk->queue_index);
		return -EINVAL;
	}

	hw_queue_prop = &asic->hw_queues_props[chunk->queue_index];

	if (hw_queue_prop->type == QUEUE_TYPE_NA) {
		dev_err(hdev->dev, "Queue index %d is not applicable\n",
			chunk->queue_index);
		return -EINVAL;
	}

	if (hw_queue_prop->binned) {
		dev_err(hdev->dev, "Queue index %d is binned out\n",
			chunk->queue_index);
		return -EINVAL;
	}

	if (hw_queue_prop->driver_only) {
		dev_err(hdev->dev,
			"Queue index %d is restricted for the kernel driver\n",
			chunk->queue_index);
		return -EINVAL;
	}

	/* When hw queue type isn't QUEUE_TYPE_HW,
	 * USER_ALLOC_CB flag shall be referred as "don't care".
	 */
	if (hw_queue_prop->type == QUEUE_TYPE_HW) {
		if (chunk->cs_chunk_flags & HL_CS_CHUNK_FLAGS_USER_ALLOC_CB) {
			if (!(hw_queue_prop->cb_alloc_flags & CB_ALLOC_USER)) {
				dev_err(hdev->dev,
					"Queue index %d doesn't support user CB\n",
					chunk->queue_index);
				return -EINVAL;
			}

			*is_kernel_allocated_cb = false;
		} else {
			if (!(hw_queue_prop->cb_alloc_flags &
					CB_ALLOC_KERNEL)) {
				dev_err(hdev->dev,
					"Queue index %d doesn't support kernel CB\n",
					chunk->queue_index);
				return -EINVAL;
			}

			*is_kernel_allocated_cb = true;
		}
	} else {
		*is_kernel_allocated_cb = !!(hw_queue_prop->cb_alloc_flags
						& CB_ALLOC_KERNEL);
	}

	*queue_type = hw_queue_prop->type;
	return 0;
}

static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev,
					struct hl_mem_mgr *mmg,
					struct hl_cs_chunk *chunk)
{
	struct hl_cb *cb;

	cb = hl_cb_get(mmg, chunk->cb_handle);
	if (!cb) {
		dev_err(hdev->dev, "CB handle 0x%llx invalid\n", chunk->cb_handle);
		return NULL;
	}

	if ((chunk->cb_size < 8) || (chunk->cb_size > cb->size)) {
		dev_err(hdev->dev, "CB size %u invalid\n", chunk->cb_size);
		goto release_cb;
	}

	atomic_inc(&cb->cs_cnt);

	return cb;

release_cb:
	hl_cb_put(cb);
	return NULL;
}

struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
		enum hl_queue_type queue_type, bool is_kernel_allocated_cb)
{
	struct hl_cs_job *job;

	job = kzalloc(sizeof(*job), GFP_ATOMIC);
	if (!job)
		job = kzalloc(sizeof(*job), GFP_KERNEL);

	if (!job)
		return NULL;

	kref_init(&job->refcount);
	job->queue_type = queue_type;
	job->is_kernel_allocated_cb = is_kernel_allocated_cb;

	if (is_cb_patched(hdev, job))
		INIT_LIST_HEAD(&job->userptr_list);

	if (job->queue_type == QUEUE_TYPE_EXT)
		INIT_WORK(&job->finish_work, job_wq_completion);

	return job;
}

static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags)
{
	if (cs_type_flags & HL_CS_FLAGS_SIGNAL)
		return CS_TYPE_SIGNAL;
	else if (cs_type_flags & HL_CS_FLAGS_WAIT)
		return CS_TYPE_WAIT;
	else if (cs_type_flags & HL_CS_FLAGS_COLLECTIVE_WAIT)
		return CS_TYPE_COLLECTIVE_WAIT;
	else if (cs_type_flags & HL_CS_FLAGS_RESERVE_SIGNALS_ONLY)
		return CS_RESERVE_SIGNALS;
	else if (cs_type_flags & HL_CS_FLAGS_UNRESERVE_SIGNALS_ONLY)
		return CS_UNRESERVE_SIGNALS;
	else if (cs_type_flags & HL_CS_FLAGS_ENGINE_CORE_COMMAND)
		return CS_TYPE_ENGINE_CORE;
	else if (cs_type_flags & HL_CS_FLAGS_ENGINES_COMMAND)
		return CS_TYPE_ENGINES;
	else if (cs_type_flags & HL_CS_FLAGS_FLUSH_PCI_HBW_WRITES)
		return CS_TYPE_FLUSH_PCI_HBW_WRITES;
	else
		return CS_TYPE_DEFAULT;
}

static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
{
	struct hl_device *hdev = hpriv->hdev;
	struct hl_ctx *ctx = hpriv->ctx;
	u32 cs_type_flags, num_chunks;
	enum hl_device_status status;
	enum hl_cs_type cs_type;
	bool is_sync_stream;
	int i;

	for (i = 0 ; i < sizeof(args->in.pad) ; i++)
		if (args->in.pad[i]) {
			dev_dbg(hdev->dev, "Padding bytes must be 0\n");
			return -EINVAL;
		}

	if (!hl_device_operational(hdev, &status))
		return -EBUSY;

	if ((args->in.cs_flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
			!hdev->supports_staged_submission) {
		dev_err(hdev->dev, "staged submission not supported");
		return -EPERM;
	}

	cs_type_flags = args->in.cs_flags & HL_CS_FLAGS_TYPE_MASK;

	if (unlikely(cs_type_flags && !is_power_of_2(cs_type_flags))) {
		dev_err(hdev->dev,
			"CS type flags are mutually exclusive, context %d\n",
			ctx->asid);
		return -EINVAL;
	}

	cs_type = hl_cs_get_cs_type(cs_type_flags);
	num_chunks = args->in.num_chunks_execute;

	is_sync_stream = (cs_type == CS_TYPE_SIGNAL || cs_type == CS_TYPE_WAIT ||
			cs_type == CS_TYPE_COLLECTIVE_WAIT);

	if (unlikely(is_sync_stream && !hdev->supports_sync_stream)) {
		dev_err(hdev->dev, "Sync stream CS is not supported\n");
		return -EINVAL;
	}

	if (cs_type == CS_TYPE_DEFAULT) {
		if (!num_chunks) {
			dev_err(hdev->dev, "Got execute CS with 0 chunks, context %d\n", ctx->asid);
			return -EINVAL;
		}
	} else if (is_sync_stream && num_chunks != 1) {
		dev_err(hdev->dev,
			"Sync stream CS mandates one chunk only, context %d\n",
			ctx->asid);
		return -EINVAL;
	}

	return 0;
}

static int hl_cs_copy_chunk_array(struct hl_device *hdev,
					struct hl_cs_chunk **cs_chunk_array,
					void __user *chunks, u32 num_chunks,
					struct hl_ctx *ctx)
{
	u32 size_to_copy;

	if (num_chunks > HL_MAX_JOBS_PER_CS) {
		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
		atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
		dev_err(hdev->dev,
			"Number of chunks can NOT be larger than %d\n",
			HL_MAX_JOBS_PER_CS);
		return -EINVAL;
	}

	*cs_chunk_array = kmalloc_array(num_chunks, sizeof(**cs_chunk_array),
					GFP_ATOMIC);
	if (!*cs_chunk_array)
		*cs_chunk_array = kmalloc_array(num_chunks,
					sizeof(**cs_chunk_array), GFP_KERNEL);
	if (!*cs_chunk_array) {
		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
		atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt);
		return -ENOMEM;
	}

	size_to_copy = num_chunks * sizeof(struct hl_cs_chunk);
	if (copy_from_user(*cs_chunk_array, chunks, size_to_copy)) {
		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
		atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
		dev_err(hdev->dev, "Failed to copy cs chunk array from user\n");
		kfree(*cs_chunk_array);
		return -EFAULT;
	}

	return 0;
}

static int cs_staged_submission(struct hl_device *hdev, struct hl_cs *cs,
				u64 sequence, u32 flags,
				u32 encaps_signal_handle)
{
	if (!(flags & HL_CS_FLAGS_STAGED_SUBMISSION))
		return 0;

	cs->staged_last = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_LAST);
	cs->staged_first = !!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST);

	if (cs->staged_first) {
		/* Staged CS sequence is the first CS sequence */
		INIT_LIST_HEAD(&cs->staged_cs_node);
		cs->staged_sequence = cs->sequence;

		if (cs->encaps_signals)
			cs->encaps_sig_hdl_id = encaps_signal_handle;
	} else {
		/* User sequence will be validated in 'hl_hw_queue_schedule_cs'
		 * under the cs_mirror_lock
		 */
		cs->staged_sequence = sequence;
	}

	/* Increment CS reference if needed */
	staged_cs_get(hdev, cs);

	cs->staged_cs = true;

	return 0;
}
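
/*
 * Illustration (editorial): a three-part staged submission is expected to be
 * submitted with HL_CS_FLAGS_STAGED_SUBMISSION | HL_CS_FLAGS_STAGED_SUBMISSION_FIRST
 * for the first CS, HL_CS_FLAGS_STAGED_SUBMISSION alone for the middle CS,
 * and HL_CS_FLAGS_STAGED_SUBMISSION | HL_CS_FLAGS_STAGED_SUBMISSION_LAST for
 * the final one. All three end up sharing the first CS's sequence number as
 * their 'staged_sequence'.
 */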

static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid)
{
	int i;

	for (i = 0; i < hdev->stream_master_qid_arr_size; i++)
		if (qid == hdev->stream_master_qid_arr[i])
			return BIT(i);

	return 0;
}
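
/*
 * Example with hypothetical values: if hdev->stream_master_qid_arr holds
 * {8, 9, 12, 13}, then qid 12 maps to BIT(2) = 0x4. The per-CS bitmap built
 * from these masks is what complete_multi_cs() intersects with a waiter's
 * stream_master_qid_map to decide whether to signal it.
 */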

static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
				u32 num_chunks, u64 *cs_seq, u32 flags,
				u32 encaps_signals_handle, u32 timeout,
				u16 *signal_initial_sob_count)
{
	bool staged_mid, int_queues_only = true, using_hw_queues = false;
	struct hl_device *hdev = hpriv->hdev;
	struct hl_cs_chunk *cs_chunk_array;
	struct hl_cs_counters_atomic *cntr;
	struct hl_ctx *ctx = hpriv->ctx;
	struct hl_cs_job *job;
	struct hl_cs *cs;
	struct hl_cb *cb;
	u64 user_sequence;
	u8 stream_master_qid_map = 0;
	int rc, i;

	cntr = &hdev->aggregated_cs_counters;
	user_sequence = *cs_seq;
	*cs_seq = ULLONG_MAX;

	rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
			hpriv->ctx);
	if (rc)
		goto out;

	if ((flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
			!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST))
		staged_mid = true;
	else
		staged_mid = false;

	rc = allocate_cs(hdev, hpriv->ctx, CS_TYPE_DEFAULT,
			staged_mid ? user_sequence : ULLONG_MAX, &cs, flags,
			timeout);
	if (rc)
		goto free_cs_chunk_array;

	*cs_seq = cs->sequence;

	hl_debugfs_add_cs(cs);

	rc = cs_staged_submission(hdev, cs, user_sequence, flags,
						encaps_signals_handle);
	if (rc)
		goto free_cs_object;

	/* If this is a staged submission we must return the staged sequence
	 * rather than the internal CS sequence
	 */
	if (cs->staged_cs)
		*cs_seq = cs->staged_sequence;

	/* Validate ALL the CS chunks before submitting the CS */
	for (i = 0 ; i < num_chunks ; i++) {
		struct hl_cs_chunk *chunk = &cs_chunk_array[i];
		enum hl_queue_type queue_type;
		bool is_kernel_allocated_cb;

		rc = validate_queue_index(hdev, chunk, &queue_type,
						&is_kernel_allocated_cb);
		if (rc) {
			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
			atomic64_inc(&cntr->validation_drop_cnt);
			goto free_cs_object;
		}

		if (is_kernel_allocated_cb) {
			cb = get_cb_from_cs_chunk(hdev, &hpriv->mem_mgr, chunk);
			if (!cb) {
				atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
				atomic64_inc(&cntr->validation_drop_cnt);
				rc = -EINVAL;
				goto free_cs_object;
			}
		} else {
			cb = (struct hl_cb *) (uintptr_t) chunk->cb_handle;
		}

		if (queue_type == QUEUE_TYPE_EXT ||
				queue_type == QUEUE_TYPE_HW) {
			int_queues_only = false;

			/*
			 * store which streams are being used for external/HW
			 * queues of this CS
			 */
			if (hdev->supports_wait_for_multi_cs)
				stream_master_qid_map |=
					get_stream_master_qid_mask(hdev,
							chunk->queue_index);
		}

		if (queue_type == QUEUE_TYPE_HW)
			using_hw_queues = true;

		job = hl_cs_allocate_job(hdev, queue_type,
						is_kernel_allocated_cb);
		if (!job) {
			atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
			atomic64_inc(&cntr->out_of_mem_drop_cnt);
			dev_err(hdev->dev, "Failed to allocate a new job\n");
			rc = -ENOMEM;
			if (is_kernel_allocated_cb)
				goto release_cb;

			goto free_cs_object;
		}

		job->id = i + 1;
		job->cs = cs;
		job->user_cb = cb;
		job->user_cb_size = chunk->cb_size;
		job->hw_queue_id = chunk->queue_index;

		cs->jobs_in_queue_cnt[job->hw_queue_id]++;
		cs->jobs_cnt++;

		list_add_tail(&job->cs_node, &cs->job_list);

		/*
		 * Increment CS reference. When CS reference is 0, CS is
		 * done and can be signaled to user, and all its resources can
		 * be freed. Only increment for JOBs on external or H/W queues,
		 * because only for those JOBs we get completion
		 */
		if (cs_needs_completion(cs) &&
			(job->queue_type == QUEUE_TYPE_EXT ||
				job->queue_type == QUEUE_TYPE_HW))
			cs_get(cs);

		hl_debugfs_add_job(hdev, job);

		rc = cs_parser(hpriv, job);
		if (rc) {
			atomic64_inc(&ctx->cs_counters.parsing_drop_cnt);
			atomic64_inc(&cntr->parsing_drop_cnt);
			dev_err(hdev->dev,
				"Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n",
				cs->ctx->asid, cs->sequence, job->id, rc);
			goto free_cs_object;
		}
	}

	/* We allow a CS with any queue type combination as long as it does
	 * not get a completion
	 */
	if (int_queues_only && cs_needs_completion(cs)) {
		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
		atomic64_inc(&cntr->validation_drop_cnt);
		dev_err(hdev->dev,
			"Reject CS %d.%llu since it contains only internal queues jobs and needs completion\n",
			cs->ctx->asid, cs->sequence);
		rc = -EINVAL;
		goto free_cs_object;
	}

	if (using_hw_queues)
		INIT_WORK(&cs->finish_work, cs_completion);

	/*
	 * store the (external/HW queues) streams used by the CS in the
	 * fence object for multi-CS completion
	 */
	if (hdev->supports_wait_for_multi_cs)
		cs->fence->stream_master_qid_map = stream_master_qid_map;

	rc = hl_hw_queue_schedule_cs(cs);
	if (rc) {
		if (rc != -EAGAIN)
			dev_err(hdev->dev,
				"Failed to submit CS %d.%llu to H/W queues, error %d\n",
				cs->ctx->asid, cs->sequence, rc);
		goto free_cs_object;
	}

	*signal_initial_sob_count = cs->initial_sob_count;

	rc = HL_CS_STATUS_SUCCESS;
	goto put_cs;

release_cb:
	atomic_dec(&cb->cs_cnt);
	hl_cb_put(cb);
free_cs_object:
	cs_rollback(hdev, cs);
	*cs_seq = ULLONG_MAX;
	/* The path below is both for good and erroneous exits */
put_cs:
	/* We finished with the CS in this function, so put the ref */
	cs_put(cs);
free_cs_chunk_array:
	kfree(cs_chunk_array);
out:
	return rc;
}

static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
				u64 *cs_seq)
{
	struct hl_device *hdev = hpriv->hdev;
	struct hl_ctx *ctx = hpriv->ctx;
	bool need_soft_reset = false;
	int rc = 0, do_ctx_switch = 0;
	void __user *chunks;
	u32 num_chunks, tmp;
	u16 sob_count;
	int ret;

	if (hdev->supports_ctx_switch)
		do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0);

	if (do_ctx_switch || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) {
		mutex_lock(&hpriv->restore_phase_mutex);

		if (do_ctx_switch) {
			rc = hdev->asic_funcs->context_switch(hdev, ctx->asid);
			if (rc) {
				dev_err_ratelimited(hdev->dev,
					"Failed to switch to context %d, rejecting CS! %d\n",
					ctx->asid, rc);
				/*
				 * If we timed out, or if the device is not
				 * IDLE while we want to do a context switch
				 * (-EBUSY), we need to soft-reset because QMAN
				 * is probably stuck. However, we can't call
				 * reset here directly because of a deadlock,
				 * so we need to do it at the very end of this
				 * function.
				 */
				if ((rc == -ETIMEDOUT) || (rc == -EBUSY))
					need_soft_reset = true;
				mutex_unlock(&hpriv->restore_phase_mutex);
				goto out;
			}
		}

		hdev->asic_funcs->restore_phase_topology(hdev);

		chunks = (void __user *) (uintptr_t) args->in.chunks_restore;
		num_chunks = args->in.num_chunks_restore;

		if (!num_chunks) {
			dev_dbg(hdev->dev,
				"Need to run restore phase but restore CS is empty\n");
			rc = 0;
		} else {
			rc = cs_ioctl_default(hpriv, chunks, num_chunks,
					cs_seq, 0, 0, hdev->timeout_jiffies, &sob_count);
		}

		mutex_unlock(&hpriv->restore_phase_mutex);

		if (rc) {
			dev_err(hdev->dev,
				"Failed to submit restore CS for context %d (%d)\n",
				ctx->asid, rc);
			goto out;
		}

		/* Need to wait for restore completion before execution phase */
		if (num_chunks) {
			enum hl_cs_wait_status status;

			ret = _hl_cs_wait_ioctl(hdev, ctx,
					jiffies_to_usecs(hdev->timeout_jiffies),
					*cs_seq, &status, NULL);
			if (ret) {
				dev_err(hdev->dev,
					"Restore CS for context %d failed to complete %d\n",
					ctx->asid, ret);
				rc = -ENOEXEC;
				goto out;
			}
		}

		if (hdev->supports_ctx_switch)
			ctx->thread_ctx_switch_wait_token = 1;

	} else if (hdev->supports_ctx_switch && !ctx->thread_ctx_switch_wait_token) {
		rc = hl_poll_timeout_memory(hdev,
			&ctx->thread_ctx_switch_wait_token, tmp, (tmp == 1),
			100, jiffies_to_usecs(hdev->timeout_jiffies), false);

		if (rc == -ETIMEDOUT) {
			dev_err(hdev->dev,
				"context switch phase timeout (%d)\n", tmp);
			goto out;
		}
	}

out:
	if ((rc == -ETIMEDOUT || rc == -EBUSY) && (need_soft_reset))
		hl_device_reset(hdev, 0);

	return rc;
}
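
/*
 * The one-shot gate above relies on atomic_cmpxchg(): only the first
 * thread that swaps thread_ctx_switch_token from 1 to 0 performs the
 * context switch; every other thread skips it and polls the wait token
 * instead. A minimal sketch of the same pattern (names are illustrative,
 * not driver API):
 *
 *	static atomic_t once_token = ATOMIC_INIT(1);
 *
 *	if (atomic_cmpxchg(&once_token, 1, 0) == 1)
 *		do_one_time_init();	// exactly one caller enters here
 */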

/*
 * hl_cs_signal_sob_wraparound_handler: handle the SOB value wraparound case.
 * If the SOB value reaches the max value, move to the other SOB reserved
 * for the queue.
 * @hdev: pointer to device structure
 * @q_idx: stream queue index
 * @hw_sob: the H/W SOB used in this signal CS.
 * @count: signals count
 * @encaps_sig: tells whether it's a reservation for encaps signals or not.
 *
 * Note that this function must be called while hw_queues_lock is taken.
 */
int hl_cs_signal_sob_wraparound_handler(struct hl_device *hdev, u32 q_idx,
			struct hl_hw_sob **hw_sob, u32 count, bool encaps_sig)
{
	struct hl_sync_stream_properties *prop;
	struct hl_hw_sob *sob = *hw_sob, *other_sob;
	u8 other_sob_offset;

	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;

	hw_sob_get(sob);

	/* check for wraparound */
	if (prop->next_sob_val + count >= HL_MAX_SOB_VAL) {
		/*
		 * Decrement as we reached the max value.
		 * The release function won't be called here as we've
		 * just incremented the refcount right before calling this
		 * function.
		 */
		hw_sob_put_err(sob);

		/*
		 * check the other sob value, if it is still in use then fail,
		 * otherwise make the switch
		 */
		other_sob_offset = (prop->curr_sob_offset + 1) % HL_RSVD_SOBS;
		other_sob = &prop->hw_sob[other_sob_offset];

		if (kref_read(&other_sob->kref) != 1) {
			dev_err(hdev->dev, "error: Cannot switch SOBs q_idx: %d\n",
								q_idx);
			return -EINVAL;
		}

		/*
		 * next_sob_val always points to the next available signal
		 * in the sob, so in encaps signals it will be the next one
		 * after reserving the required amount.
		 */
		if (encaps_sig)
			prop->next_sob_val = count + 1;
		else
			prop->next_sob_val = count;

		/* only two SOBs are currently in use */
		prop->curr_sob_offset = other_sob_offset;
		*hw_sob = other_sob;

		/*
		 * check if other_sob needs a reset, then do it before using it
		 * for the reservation or the next signal cs.
		 * we do it here, for both the encaps and regular signal cs
		 * cases, in order to avoid a possible race of two kref_put
		 * calls on the sob which could occur at the same time if we
		 * moved the sob reset (kref_put) to the cs_do_release
		 * function.
		 * in addition, if we have a combination of cs signal and
		 * encaps, and at the point we need to reset the sob there
		 * were no more reservations and only signal cs keep coming,
		 * in such a case we need signal_cs to put the refcount and
		 * reset the sob.
		 */
		if (other_sob->need_reset)
			hw_sob_put(other_sob);

		if (encaps_sig) {
			/* set reset indication for the sob */
			sob->need_reset = true;
			hw_sob_get(other_sob);
		}

		dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n",
				prop->curr_sob_offset, q_idx);
	} else {
		prop->next_sob_val += count;
	}

	return 0;
}
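
/*
 * Worked example of the wraparound arithmetic above (the HL_MAX_SOB_VAL
 * value here is an assumption for illustration): with
 * HL_MAX_SOB_VAL = 1 << 15, next_sob_val = 32760 and count = 10, the sum
 * 32770 >= 32768 triggers the switch to the other reserved SOB, where
 * next_sob_val restarts at 10 for a regular signal CS, or at 11 for an
 * encaps reservation, since for encaps the value must point at the first
 * signal after the reserved range. In the common (no wraparound) case the
 * value simply advances by count.
 */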

static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
		struct hl_cs_chunk *chunk, u64 *signal_seq, struct hl_ctx *ctx,
		bool encaps_signals)
{
	u64 *signal_seq_arr = NULL;
	u32 size_to_copy, signal_seq_arr_len;
	int rc = 0;

	if (encaps_signals) {
		*signal_seq = chunk->encaps_signal_seq;
		return 0;
	}

	signal_seq_arr_len = chunk->num_signal_seq_arr;

	/* currently only one signal seq is supported */
	if (signal_seq_arr_len != 1) {
		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
		atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
		dev_err(hdev->dev,
			"Wait for signal CS supports only one signal CS seq\n");
		return -EINVAL;
	}

	signal_seq_arr = kmalloc_array(signal_seq_arr_len,
					sizeof(*signal_seq_arr),
					GFP_ATOMIC);
	if (!signal_seq_arr)
		signal_seq_arr = kmalloc_array(signal_seq_arr_len,
						sizeof(*signal_seq_arr),
						GFP_KERNEL);
	if (!signal_seq_arr) {
		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
		atomic64_inc(&hdev->aggregated_cs_counters.out_of_mem_drop_cnt);
		return -ENOMEM;
	}

	size_to_copy = signal_seq_arr_len * sizeof(*signal_seq_arr);
	if (copy_from_user(signal_seq_arr,
				u64_to_user_ptr(chunk->signal_seq_arr),
				size_to_copy)) {
		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
		atomic64_inc(&hdev->aggregated_cs_counters.validation_drop_cnt);
		dev_err(hdev->dev,
			"Failed to copy signal seq array from user\n");
		rc = -EFAULT;
		goto out;
	}

	/* currently it is guaranteed to have only one signal seq */
	*signal_seq = signal_seq_arr[0];

out:
	kfree(signal_seq_arr);

	return rc;
}
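
/*
 * The allocation in cs_ioctl_extract_signal_seq() tries GFP_ATOMIC first
 * (cannot sleep, fails fast under memory pressure) and only then falls
 * back to GFP_KERNEL, which may sleep but is far more likely to succeed.
 * A minimal sketch of this fallback pattern (illustrative, not a driver
 * helper):
 *
 *	void *p = kmalloc(len, GFP_ATOMIC);
 *	if (!p)
 *		p = kmalloc(len, GFP_KERNEL);	// may sleep here
 *	if (!p)
 *		return -ENOMEM;
 */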

static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
		struct hl_ctx *ctx, struct hl_cs *cs,
		enum hl_queue_type q_type, u32 q_idx, u32 encaps_signal_offset)
{
	struct hl_cs_counters_atomic *cntr;
	struct hl_cs_job *job;
	struct hl_cb *cb;
	u32 cb_size;

	cntr = &hdev->aggregated_cs_counters;

	job = hl_cs_allocate_job(hdev, q_type, true);
	if (!job) {
		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
		atomic64_inc(&cntr->out_of_mem_drop_cnt);
		dev_err(hdev->dev, "Failed to allocate a new job\n");
		return -ENOMEM;
	}

	if (cs->type == CS_TYPE_WAIT)
		cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
	else
		cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);

	cb = hl_cb_kernel_create(hdev, cb_size, q_type == QUEUE_TYPE_HW);
	if (!cb) {
		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
		atomic64_inc(&cntr->out_of_mem_drop_cnt);
		kfree(job);
		return -EFAULT;
	}

	job->id = 0;
	job->cs = cs;
	job->user_cb = cb;
	atomic_inc(&job->user_cb->cs_cnt);
	job->user_cb_size = cb_size;
	job->hw_queue_id = q_idx;

	if ((cs->type == CS_TYPE_WAIT || cs->type == CS_TYPE_COLLECTIVE_WAIT)
			&& cs->encaps_signals)
		job->encaps_sig_wait_offset = encaps_signal_offset;
	/*
	 * No need for parsing, the user CB is already the patched CB.
	 * We call hl_cb_destroy() for two reasons - we don't need the CB in
	 * the CB idr anymore, and we need to decrement its refcount as it was
	 * incremented inside hl_cb_kernel_create().
	 */
	job->patched_cb = job->user_cb;
	job->job_cb_size = job->user_cb_size;
	hl_cb_destroy(&hdev->kernel_mem_mgr, cb->buf->handle);

	/* increment refcount as for external queues we get completion */
	cs_get(cs);

	cs->jobs_in_queue_cnt[job->hw_queue_id]++;
	cs->jobs_cnt++;

	list_add_tail(&job->cs_node, &cs->job_list);

	hl_debugfs_add_job(hdev, job);

	return 0;
}

static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
				u32 q_idx, u32 count,
				u32 *handle_id, u32 *sob_addr,
				u32 *signals_count)
{
	struct hw_queue_properties *hw_queue_prop;
	struct hl_sync_stream_properties *prop;
	struct hl_device *hdev = hpriv->hdev;
	struct hl_cs_encaps_sig_handle *handle;
	struct hl_encaps_signals_mgr *mgr;
	struct hl_hw_sob *hw_sob;
	int hdl_id;
	int rc = 0;

	if (count >= HL_MAX_SOB_VAL) {
		dev_err(hdev->dev, "signals count(%u) exceeds the max SOB value\n",
						count);
		rc = -EINVAL;
		goto out;
	}

	if (q_idx >= hdev->asic_prop.max_queues) {
		dev_err(hdev->dev, "Queue index %d is invalid\n",
			q_idx);
		rc = -EINVAL;
		goto out;
	}

	hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];

	if (!hw_queue_prop->supports_sync_stream) {
		dev_err(hdev->dev,
			"Queue index %d does not support sync stream operations\n",
									q_idx);
		rc = -EINVAL;
		goto out;
	}

	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;

	handle = kzalloc(sizeof(*handle), GFP_KERNEL);
	if (!handle) {
		rc = -ENOMEM;
		goto out;
	}

	handle->count = count;

	hl_ctx_get(hpriv->ctx);
	handle->ctx = hpriv->ctx;
	mgr = &hpriv->ctx->sig_mgr;

	spin_lock(&mgr->lock);
	hdl_id = idr_alloc(&mgr->handles, handle, 1, 0, GFP_ATOMIC);
	spin_unlock(&mgr->lock);

	if (hdl_id < 0) {
		dev_err(hdev->dev, "Failed to allocate IDR for a new signal reservation\n");
		rc = -EINVAL;
		goto put_ctx;
	}

	handle->id = hdl_id;
	handle->q_idx = q_idx;
	handle->hdev = hdev;
	kref_init(&handle->refcount);

	hdev->asic_funcs->hw_queues_lock(hdev);

	hw_sob = &prop->hw_sob[prop->curr_sob_offset];

	/*
	 * Increment the SOB value by the user-requested count in order to
	 * reserve those signals.
	 * Check whether the amount of signals to reserve would exceed the max
	 * SOB value; if so, switch SOBs.
	 */
	rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, count,
								true);
	if (rc) {
		dev_err(hdev->dev, "Failed to switch SOB\n");
		hdev->asic_funcs->hw_queues_unlock(hdev);
		rc = -EINVAL;
		goto remove_idr;
	}
	/* set the hw_sob to the handle after calling the sob wraparound handler
	 * since the sob could have changed.
	 */
	handle->hw_sob = hw_sob;

	/* store the current sob value for the unreserve validity check, and
	 * for signal offset support
	 */
	handle->pre_sob_val = prop->next_sob_val - handle->count;

	handle->cs_seq = ULLONG_MAX;

	*signals_count = prop->next_sob_val;
	hdev->asic_funcs->hw_queues_unlock(hdev);

	*sob_addr = handle->hw_sob->sob_addr;
	*handle_id = hdl_id;

	dev_dbg(hdev->dev,
		"Signals reserved, sob_id: %d, sob addr: 0x%x, last sob_val: %u, q_idx: %d, hdl_id: %d\n",
			hw_sob->sob_id, handle->hw_sob->sob_addr,
			prop->next_sob_val - 1, q_idx, hdl_id);
	goto out;

remove_idr:
	spin_lock(&mgr->lock);
	idr_remove(&mgr->handles, hdl_id);
	spin_unlock(&mgr->lock);

put_ctx:
	hl_ctx_put(handle->ctx);
	kfree(handle);

out:
	return rc;
}
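
/*
 * Illustrative userspace flow for the reservation above (a sketch only:
 * the fd and error handling are assumptions; the ioctl, flag and field
 * names are taken from uapi/drm/habanalabs_accel.h):
 *
 *	union hl_cs_args args = {0};
 *
 *	args.in.cs_flags = HL_CS_FLAGS_RESERVE_SIGNALS_ONLY;
 *	args.in.encaps_signals_q_idx = q_idx;
 *	args.in.encaps_signals_count = 16;
 *	if (!ioctl(fd, DRM_IOCTL_HL_CS, &args))
 *		handle_id = args.out.handle_id;	// keep for unreserve
 */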

static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id)
{
	struct hl_cs_encaps_sig_handle *encaps_sig_hdl;
	struct hl_sync_stream_properties *prop;
	struct hl_device *hdev = hpriv->hdev;
	struct hl_encaps_signals_mgr *mgr;
	struct hl_hw_sob *hw_sob;
	u32 q_idx, sob_addr;
	int rc = 0;

	mgr = &hpriv->ctx->sig_mgr;

	spin_lock(&mgr->lock);
	encaps_sig_hdl = idr_find(&mgr->handles, handle_id);
	if (encaps_sig_hdl) {
		dev_dbg(hdev->dev, "unreserve signals, handle: %u, SOB:0x%x, count: %u\n",
				handle_id, encaps_sig_hdl->hw_sob->sob_addr,
					encaps_sig_hdl->count);

		hdev->asic_funcs->hw_queues_lock(hdev);

		q_idx = encaps_sig_hdl->q_idx;
		prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
		hw_sob = &prop->hw_sob[prop->curr_sob_offset];
		sob_addr = hdev->asic_funcs->get_sob_addr(hdev, hw_sob->sob_id);

		/* Check if sob_val got out of sync due to other
		 * signal submission requests which were handled
		 * between the reserve-unreserve calls, or due to a SOB switch
		 * upon reaching the SOB max value.
		 */
		if (encaps_sig_hdl->pre_sob_val + encaps_sig_hdl->count
				!= prop->next_sob_val ||
				sob_addr != encaps_sig_hdl->hw_sob->sob_addr) {
			dev_err(hdev->dev, "Cannot unreserve signals, SOB val ran out of sync, expected: %u, actual val: %u\n",
				encaps_sig_hdl->pre_sob_val,
				(prop->next_sob_val - encaps_sig_hdl->count));

			hdev->asic_funcs->hw_queues_unlock(hdev);
			rc = -EINVAL;
			goto out_unlock;
		}

		/*
		 * Decrement the SOB value by the user-requested count in
		 * order to unreserve those signals
		 */
		prop->next_sob_val -= encaps_sig_hdl->count;

		hdev->asic_funcs->hw_queues_unlock(hdev);

		hw_sob_put(hw_sob);

		/* Release the id and free allocated memory of the handle */
		idr_remove(&mgr->handles, handle_id);

		/* unlock before calling ctx_put, where we might sleep */
		spin_unlock(&mgr->lock);
		hl_ctx_put(encaps_sig_hdl->ctx);
		kfree(encaps_sig_hdl);
		goto out;
	} else {
		rc = -EINVAL;
		dev_err(hdev->dev, "failed to unreserve signals, cannot find handler\n");
	}

out_unlock:
	spin_unlock(&mgr->lock);

out:
	return rc;
}

static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
				void __user *chunks, u32 num_chunks,
				u64 *cs_seq, u32 flags, u32 timeout,
				u32 *signal_sob_addr_offset, u16 *signal_initial_sob_count)
{
	struct hl_cs_encaps_sig_handle *encaps_sig_hdl = NULL;
	bool handle_found = false, is_wait_cs = false,
			wait_cs_submitted = false,
			cs_encaps_signals = false;
	struct hl_cs_chunk *cs_chunk_array, *chunk;
	bool staged_cs_with_encaps_signals = false;
	struct hw_queue_properties *hw_queue_prop;
	struct hl_device *hdev = hpriv->hdev;
	struct hl_cs_compl *sig_waitcs_cmpl;
	u32 q_idx, collective_engine_id = 0;
	struct hl_cs_counters_atomic *cntr;
	struct hl_fence *sig_fence = NULL;
	struct hl_ctx *ctx = hpriv->ctx;
	enum hl_queue_type q_type;
	struct hl_cs *cs;
	u64 signal_seq;
	int rc;

	cntr = &hdev->aggregated_cs_counters;
	*cs_seq = ULLONG_MAX;

	rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks,
			ctx);
	if (rc)
		goto out;

	/* currently it is guaranteed to have only one chunk */
	chunk = &cs_chunk_array[0];

	if (chunk->queue_index >= hdev->asic_prop.max_queues) {
		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
		atomic64_inc(&cntr->validation_drop_cnt);
		dev_err(hdev->dev, "Queue index %d is invalid\n",
			chunk->queue_index);
		rc = -EINVAL;
		goto free_cs_chunk_array;
	}

	q_idx = chunk->queue_index;
	hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
	q_type = hw_queue_prop->type;

	if (!hw_queue_prop->supports_sync_stream) {
		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
		atomic64_inc(&cntr->validation_drop_cnt);
		dev_err(hdev->dev,
			"Queue index %d does not support sync stream operations\n",
			q_idx);
		rc = -EINVAL;
		goto free_cs_chunk_array;
	}

	if (cs_type == CS_TYPE_COLLECTIVE_WAIT) {
		if (hw_queue_prop->collective_mode != HL_COLLECTIVE_MASTER) {
			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
			atomic64_inc(&cntr->validation_drop_cnt);
			dev_err(hdev->dev,
				"Queue index %d is invalid\n", q_idx);
			rc = -EINVAL;
			goto free_cs_chunk_array;
		}

		if (!hdev->nic_ports_mask) {
			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
			atomic64_inc(&cntr->validation_drop_cnt);
			dev_err(hdev->dev,
				"Collective operations not supported when NIC ports are disabled");
			rc = -EINVAL;
			goto free_cs_chunk_array;
		}

		collective_engine_id = chunk->collective_engine_id;
	}

	is_wait_cs = !!(cs_type == CS_TYPE_WAIT ||
			cs_type == CS_TYPE_COLLECTIVE_WAIT);

	cs_encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);

	if (is_wait_cs) {
		rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq,
				ctx, cs_encaps_signals);
		if (rc)
			goto free_cs_chunk_array;

		if (cs_encaps_signals) {
			/* check if cs sequence has encapsulated
			 * signals handle
			 */
			struct idr *idp;
			u32 id;

			spin_lock(&ctx->sig_mgr.lock);
			idp = &ctx->sig_mgr.handles;
			idr_for_each_entry(idp, encaps_sig_hdl, id) {
				if (encaps_sig_hdl->cs_seq == signal_seq) {
					/* get refcount to protect removing this handle from idr,
					 * needed when multiple wait cs are used with offset
					 * to wait on reserved encaps signals.
					 * Since the kref_put of this handle is executed outside
					 * the current lock, it is possible that the handle
					 * refcount is 0 but it has yet to be removed from the
					 * list. In this case we need to consider the handle
					 * as not valid.
					 */
					if (kref_get_unless_zero(&encaps_sig_hdl->refcount))
						handle_found = true;
					break;
				}
			}
			spin_unlock(&ctx->sig_mgr.lock);

			if (!handle_found) {
				/* treat as signal CS already finished */
				dev_dbg(hdev->dev, "Cannot find encapsulated signals handle for seq 0x%llx\n",
						signal_seq);
				rc = 0;
				goto free_cs_chunk_array;
			}

			/* validate also the signal offset value */
			if (chunk->encaps_signal_offset >
					encaps_sig_hdl->count) {
				dev_err(hdev->dev, "offset(%u) value exceeds max reserved signals count(%u)!\n",
						chunk->encaps_signal_offset,
						encaps_sig_hdl->count);
				rc = -EINVAL;
				goto free_cs_chunk_array;
			}
		}

		sig_fence = hl_ctx_get_fence(ctx, signal_seq);
		if (IS_ERR(sig_fence)) {
			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
			atomic64_inc(&cntr->validation_drop_cnt);
			dev_err(hdev->dev,
				"Failed to get signal CS with seq 0x%llx\n",
				signal_seq);
			rc = PTR_ERR(sig_fence);
			goto free_cs_chunk_array;
		}

		if (!sig_fence) {
			/* signal CS already finished */
			rc = 0;
			goto free_cs_chunk_array;
		}

		sig_waitcs_cmpl =
			container_of(sig_fence, struct hl_cs_compl, base_fence);

		staged_cs_with_encaps_signals = !!
				(sig_waitcs_cmpl->type == CS_TYPE_DEFAULT &&
				(flags & HL_CS_FLAGS_ENCAP_SIGNALS));

		if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL &&
				!staged_cs_with_encaps_signals) {
			atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
			atomic64_inc(&cntr->validation_drop_cnt);
			dev_err(hdev->dev,
				"CS seq 0x%llx is not of a signal/encaps-signal CS\n",
				signal_seq);
			hl_fence_put(sig_fence);
			rc = -EINVAL;
			goto free_cs_chunk_array;
		}

		if (completion_done(&sig_fence->completion)) {
			/* signal CS already finished */
			hl_fence_put(sig_fence);
			rc = 0;
			goto free_cs_chunk_array;
		}
	}

	rc = allocate_cs(hdev, ctx, cs_type, ULLONG_MAX, &cs, flags, timeout);
	if (rc) {
		if (is_wait_cs)
			hl_fence_put(sig_fence);

		goto free_cs_chunk_array;
	}

	/*
	 * Save the signal CS fence for later initialization right before
	 * hanging the wait CS on the queue.
	 * For the encaps signals case, we save the cs sequence and handle
	 * pointer for later initialization.
	 */
	if (is_wait_cs) {
		cs->signal_fence = sig_fence;
		/* store the handle pointer, so we don't have to
		 * look for it again later in the flow,
		 * when we need to set SOB info in hw_queue.
		 */
		if (cs->encaps_signals)
			cs->encaps_sig_hdl = encaps_sig_hdl;
	}

	hl_debugfs_add_cs(cs);

	*cs_seq = cs->sequence;

	if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_SIGNAL)
		rc = cs_ioctl_signal_wait_create_jobs(hdev, ctx, cs, q_type,
				q_idx, chunk->encaps_signal_offset);
	else if (cs_type == CS_TYPE_COLLECTIVE_WAIT)
		rc = hdev->asic_funcs->collective_wait_create_jobs(hdev, ctx,
				cs, q_idx, collective_engine_id,
				chunk->encaps_signal_offset);
	else {
		atomic64_inc(&ctx->cs_counters.validation_drop_cnt);
		atomic64_inc(&cntr->validation_drop_cnt);
		rc = -EINVAL;
	}

	if (rc)
		goto free_cs_object;

	if (q_type == QUEUE_TYPE_HW)
		INIT_WORK(&cs->finish_work, cs_completion);

	rc = hl_hw_queue_schedule_cs(cs);
	if (rc) {
		/* In case the wait cs failed here, it means the signal cs
		 * already completed. We want to free all of its related
		 * objects, but we don't want to fail the ioctl.
		 */
		if (is_wait_cs)
			rc = 0;
		else if (rc != -EAGAIN)
			dev_err(hdev->dev,
				"Failed to submit CS %d.%llu to H/W queues, error %d\n",
				ctx->asid, cs->sequence, rc);
		goto free_cs_object;
	}

	*signal_sob_addr_offset = cs->sob_addr_offset;
	*signal_initial_sob_count = cs->initial_sob_count;

	rc = HL_CS_STATUS_SUCCESS;
	if (is_wait_cs)
		wait_cs_submitted = true;
	goto put_cs;

free_cs_object:
	cs_rollback(hdev, cs);
	*cs_seq = ULLONG_MAX;
	/* The path below is both for good and erroneous exits */
put_cs:
	/* We finished with the CS in this function, so put the ref */
	cs_put(cs);
free_cs_chunk_array:
	if (!wait_cs_submitted && cs_encaps_signals && handle_found && is_wait_cs)
		kref_put(&encaps_sig_hdl->refcount, hl_encaps_release_handle_and_put_ctx);
	kfree(cs_chunk_array);
out:
	return rc;
}

static int cs_ioctl_engine_cores(struct hl_fpriv *hpriv, u64 engine_cores,
						u32 num_engine_cores, u32 core_command)
{
	struct hl_device *hdev = hpriv->hdev;
	void __user *engine_cores_arr;
	u32 *cores;
	int rc;

	if (!hdev->asic_prop.supports_engine_modes)
		return -EPERM;

	if (!num_engine_cores || num_engine_cores > hdev->asic_prop.num_engine_cores) {
		dev_err(hdev->dev, "Number of engine cores %d is invalid\n", num_engine_cores);
		return -EINVAL;
	}

	if (core_command != HL_ENGINE_CORE_RUN && core_command != HL_ENGINE_CORE_HALT) {
		dev_err(hdev->dev, "Engine core command is invalid\n");
		return -EINVAL;
	}

	engine_cores_arr = (void __user *) (uintptr_t) engine_cores;
	cores = kmalloc_array(num_engine_cores, sizeof(u32), GFP_KERNEL);
	if (!cores)
		return -ENOMEM;

	if (copy_from_user(cores, engine_cores_arr, num_engine_cores * sizeof(u32))) {
		dev_err(hdev->dev, "Failed to copy core-ids array from user\n");
		kfree(cores);
		return -EFAULT;
	}

	rc = hdev->asic_funcs->set_engine_cores(hdev, cores, num_engine_cores, core_command);
	kfree(cores);

	return rc;
}

static int cs_ioctl_engines(struct hl_fpriv *hpriv, u64 engines_arr_user_addr,
						u32 num_engines, enum hl_engine_command command)
{
	struct hl_device *hdev = hpriv->hdev;
	u32 *engines, max_num_of_engines;
	void __user *engines_arr;
	int rc;

	if (!hdev->asic_prop.supports_engine_modes)
		return -EPERM;

	if (command >= HL_ENGINE_COMMAND_MAX) {
		dev_err(hdev->dev, "Engine command is invalid\n");
		return -EINVAL;
	}

	max_num_of_engines = hdev->asic_prop.max_num_of_engines;
	if (command == HL_ENGINE_CORE_RUN || command == HL_ENGINE_CORE_HALT)
		max_num_of_engines = hdev->asic_prop.num_engine_cores;

	if (!num_engines || num_engines > max_num_of_engines) {
		dev_err(hdev->dev, "Number of engines %d is invalid\n", num_engines);
		return -EINVAL;
	}

	engines_arr = (void __user *) (uintptr_t) engines_arr_user_addr;
	engines = kmalloc_array(num_engines, sizeof(u32), GFP_KERNEL);
	if (!engines)
		return -ENOMEM;

	if (copy_from_user(engines, engines_arr, num_engines * sizeof(u32))) {
		dev_err(hdev->dev, "Failed to copy engine-ids array from user\n");
		kfree(engines);
		return -EFAULT;
	}

	rc = hdev->asic_funcs->set_engines(hdev, engines, num_engines, command);
	kfree(engines);

	return rc;
}

static int cs_ioctl_flush_pci_hbw_writes(struct hl_fpriv *hpriv)
{
	struct hl_device *hdev = hpriv->hdev;
	struct asic_fixed_properties *prop = &hdev->asic_prop;

	if (!prop->hbw_flush_reg) {
		dev_dbg(hdev->dev, "HBW flush is not supported\n");
		return -EOPNOTSUPP;
	}

	RREG32(prop->hbw_flush_reg);

	return 0;
}
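
/*
 * The single RREG32() above is the classic flush of PCI posted writes:
 * a read over the same path cannot complete before the writes queued
 * ahead of it, so once the read returns, all prior HBW writes have
 * reached the device. A generic sketch of the pattern (the register
 * name is illustrative):
 *
 *	writel(val, base + REG_DOORBELL);
 *	(void)readl(base + REG_DOORBELL);	// flush the posted write
 */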

int hl_cs_ioctl(struct drm_device *ddev, void *data, struct drm_file *file_priv)
{
	struct hl_fpriv *hpriv = file_priv->driver_priv;
	union hl_cs_args *args = data;
	enum hl_cs_type cs_type = 0;
	u64 cs_seq = ULONG_MAX;
	void __user *chunks;
	u32 num_chunks, flags, timeout,
		signals_count = 0, sob_addr = 0, handle_id = 0;
	u16 sob_initial_count = 0;
	int rc;

	rc = hl_cs_sanity_checks(hpriv, args);
	if (rc)
		goto out;

	rc = hl_cs_ctx_switch(hpriv, args, &cs_seq);
	if (rc)
		goto out;

	cs_type = hl_cs_get_cs_type(args->in.cs_flags &
					~HL_CS_FLAGS_FORCE_RESTORE);
	chunks = (void __user *) (uintptr_t) args->in.chunks_execute;
	num_chunks = args->in.num_chunks_execute;
	flags = args->in.cs_flags;

	/* In case this is a staged CS, user should supply the CS sequence */
	if ((flags & HL_CS_FLAGS_STAGED_SUBMISSION) &&
			!(flags & HL_CS_FLAGS_STAGED_SUBMISSION_FIRST))
		cs_seq = args->in.seq;

	timeout = flags & HL_CS_FLAGS_CUSTOM_TIMEOUT
			? msecs_to_jiffies(args->in.timeout * 1000)
			: hpriv->hdev->timeout_jiffies;

	switch (cs_type) {
	case CS_TYPE_SIGNAL:
	case CS_TYPE_WAIT:
	case CS_TYPE_COLLECTIVE_WAIT:
		rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks,
					&cs_seq, args->in.cs_flags, timeout,
					&sob_addr, &sob_initial_count);
		break;
	case CS_RESERVE_SIGNALS:
		rc = cs_ioctl_reserve_signals(hpriv,
					args->in.encaps_signals_q_idx,
					args->in.encaps_signals_count,
					&handle_id, &sob_addr, &signals_count);
		break;
	case CS_UNRESERVE_SIGNALS:
		rc = cs_ioctl_unreserve_signals(hpriv,
					args->in.encaps_sig_handle_id);
		break;
	case CS_TYPE_ENGINE_CORE:
		rc = cs_ioctl_engine_cores(hpriv, args->in.engine_cores,
				args->in.num_engine_cores, args->in.core_command);
		break;
	case CS_TYPE_ENGINES:
		rc = cs_ioctl_engines(hpriv, args->in.engines,
				args->in.num_engines, args->in.engine_command);
		break;
	case CS_TYPE_FLUSH_PCI_HBW_WRITES:
		rc = cs_ioctl_flush_pci_hbw_writes(hpriv);
		break;
	default:
		rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq,
						args->in.cs_flags,
						args->in.encaps_sig_handle_id,
						timeout, &sob_initial_count);
		break;
	}
out:
	if (rc != -EAGAIN) {
		memset(args, 0, sizeof(*args));

		switch (cs_type) {
		case CS_RESERVE_SIGNALS:
			args->out.handle_id = handle_id;
			args->out.sob_base_addr_offset = sob_addr;
			args->out.count = signals_count;
			break;
		case CS_TYPE_SIGNAL:
			args->out.sob_base_addr_offset = sob_addr;
			args->out.sob_count_before_submission = sob_initial_count;
			args->out.seq = cs_seq;
			break;
		case CS_TYPE_DEFAULT:
			args->out.sob_count_before_submission = sob_initial_count;
			args->out.seq = cs_seq;
			break;
		default:
			args->out.seq = cs_seq;
			break;
		}

		args->out.status = rc;
	}

	return rc;
}
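
/*
 * Illustrative userspace submission through the default path above
 * (a sketch only: fd handling and CB creation are assumptions; the
 * structures come from uapi/drm/habanalabs_accel.h):
 *
 *	struct hl_cs_chunk chunk = {
 *		.cb_handle = cb_handle,
 *		.queue_index = q_idx,
 *		.cb_size = cb_size,
 *	};
 *	union hl_cs_args args = {0};
 *
 *	args.in.chunks_execute = (__u64)(uintptr_t)&chunk;
 *	args.in.num_chunks_execute = 1;
 *	if (!ioctl(fd, DRM_IOCTL_HL_CS, &args))
 *		seq = args.out.seq;	// pass to the wait ioctl later
 */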

static int hl_wait_for_fence(struct hl_ctx *ctx, u64 seq, struct hl_fence *fence,
				enum hl_cs_wait_status *status, u64 timeout_us, s64 *timestamp)
{
	struct hl_device *hdev = ctx->hdev;
	ktime_t timestamp_kt;
	long completion_rc;
	int rc = 0, error;

	if (IS_ERR(fence)) {
		rc = PTR_ERR(fence);
		if (rc == -EINVAL)
			dev_notice_ratelimited(hdev->dev,
				"Can't wait on CS %llu because current CS is at seq %llu\n",
				seq, ctx->cs_sequence);
		return rc;
	}

	if (!fence) {
		if (!hl_pop_cs_outcome(&ctx->outcome_store, seq, &timestamp_kt, &error)) {
			dev_dbg(hdev->dev,
				"Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
				seq, ctx->cs_sequence);
			*status = CS_WAIT_STATUS_GONE;
			return 0;
		}

		completion_rc = 1;
		goto report_results;
	}

	if (!timeout_us) {
		completion_rc = completion_done(&fence->completion);
	} else {
		unsigned long timeout;

		timeout = (timeout_us == MAX_SCHEDULE_TIMEOUT) ?
				timeout_us : usecs_to_jiffies(timeout_us);
		completion_rc =
			wait_for_completion_interruptible_timeout(
				&fence->completion, timeout);
	}

	error = fence->error;
	timestamp_kt = fence->timestamp;

report_results:
	if (completion_rc > 0) {
		*status = CS_WAIT_STATUS_COMPLETED;
		if (timestamp)
			*timestamp = ktime_to_ns(timestamp_kt);
	} else {
		*status = CS_WAIT_STATUS_BUSY;
	}

	if (completion_rc == -ERESTARTSYS)
		rc = completion_rc;
	else if (error == -ETIMEDOUT || error == -EIO)
		rc = error;

	return rc;
}
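
/*
 * Result mapping of hl_wait_for_fence() at a glance:
 *	completion_rc > 0               -> CS_WAIT_STATUS_COMPLETED (+timestamp)
 *	completion_rc == 0              -> CS_WAIT_STATUS_BUSY (poll miss/timeout)
 *	completion_rc == -ERESTARTSYS   -> rc = -ERESTARTSYS (signal received)
 *	fence->error is -ETIMEDOUT/-EIO -> rc = fence->error (CS itself failed)
 */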

/*
 * hl_cs_poll_fences - iterate CS fences to check for CS completion
 *
 * @mcs_data: multi-CS internal data
 * @mcs_compl: multi-CS completion structure
 *
 * @return 0 on success, otherwise non 0 error code
 *
 * The function iterates over all CS sequences in the list and sets a bit in
 * completion_bitmap for each completed CS.
 * While iterating, the function sets the stream map of each fence in the fence
 * array in the completion QID stream map to be used by CSs to perform
 * completion to the multi-CS context.
 * This function shall be called after taking context ref
 */
static int hl_cs_poll_fences(struct multi_cs_data *mcs_data, struct multi_cs_completion *mcs_compl)
{
	struct hl_fence **fence_ptr = mcs_data->fence_arr;
	struct hl_device *hdev = mcs_data->ctx->hdev;
	int i, rc, arr_len = mcs_data->arr_len;
	u64 *seq_arr = mcs_data->seq_arr;
	ktime_t max_ktime, first_cs_time;
	enum hl_cs_wait_status status;

	memset(fence_ptr, 0, arr_len * sizeof(struct hl_fence *));

	/* get all fences under the same lock */
	rc = hl_ctx_get_fences(mcs_data->ctx, seq_arr, fence_ptr, arr_len);
	if (rc)
		return rc;

	/*
	 * re-initialize the completion here to handle 2 possible cases:
	 * 1. CS will complete the multi-CS prior to clearing the completion, in which
	 *    case the fence iteration is guaranteed to catch the CS completion.
	 * 2. the completion will occur after re-init of the completion,
	 *    in which case we will wake up immediately in wait_for_completion.
	 */
	reinit_completion(&mcs_compl->completion);

	/*
	 * set to maximum time to verify timestamp is valid: if at the end
	 * this value is maintained - no timestamp was updated
	 */
	max_ktime = ktime_set(KTIME_SEC_MAX, 0);
	first_cs_time = max_ktime;

	for (i = 0; i < arr_len; i++, fence_ptr++) {
		struct hl_fence *fence = *fence_ptr;

		/*
		 * In order to prevent a case where we wait until timeout even though a CS
		 * associated with the multi-CS actually completed, we do things in the
		 * below order:
		 * 1. for each fence set its QID map in the multi-CS completion QID map. This way
		 *    any CS can, potentially, complete the multi CS for the specific QID (note
		 *    that once completion is initialized, calling complete* and then wait on the
		 *    completion will cause it to return at once)
		 * 2. only after allowing multi-CS completion for the specific QID we check whether
		 *    the specific CS already completed (and thus the wait for completion part will
		 *    be skipped). if the CS did not complete, it is guaranteed that a completing CS
		 *    will wake up the completion.
		 */
		if (fence)
			mcs_compl->stream_master_qid_map |= fence->stream_master_qid_map;

		/*
		 * function won't sleep as it is called with timeout 0 (i.e.
		 * poll the fence)
		 */
		rc = hl_wait_for_fence(mcs_data->ctx, seq_arr[i], fence, &status, 0, NULL);
		if (rc) {
			dev_err(hdev->dev,
				"wait_for_fence error: %d for CS seq %llu\n",
								rc, seq_arr[i]);
			break;
		}

		switch (status) {
		case CS_WAIT_STATUS_BUSY:
			/* CS did not finish, QID to wait on already stored */
			break;
		case CS_WAIT_STATUS_COMPLETED:
			/*
			 * Using mcs_handling_done to avoid the possibility of mcs_data
			 * returning to the user and indicating CS completed before it
			 * finished all of its mcs handling, to avoid a race the next
			 * time the user waits for mcs.
			 * note: when reaching this case fence is definitely not NULL
			 *       but the NULL check was added to overcome static analysis
			 */
			if (fence && !fence->mcs_handling_done) {
				/*
				 * in case multi CS is completed but MCS handling not done
				 * we "complete" the multi CS to prevent it from waiting
				 * until time-out and the "multi-CS handling done" will have
				 * another chance at the next iteration
				 */
				complete_all(&mcs_compl->completion);
				break;
			}

			mcs_data->completion_bitmap |= BIT(i);
			/*
			 * For all completed CSs we take the earliest timestamp.
			 * For this we have to validate that the timestamp is
			 * earliest of all timestamps so far.
			 */
			if (fence && mcs_data->update_ts &&
					(ktime_compare(fence->timestamp, first_cs_time) < 0))
				first_cs_time = fence->timestamp;
			break;
		case CS_WAIT_STATUS_GONE:
			mcs_data->update_ts = false;
			mcs_data->gone_cs = true;
			/*
			 * It is possible to get an old sequence number from user
			 * which is related to an already completed CS whose fence is
			 * already gone. In this case, the CS is set as completed but
			 * there is no need to consider its QID for mcs completion.
			 */
			mcs_data->completion_bitmap |= BIT(i);
			break;
		default:
			dev_err(hdev->dev, "Invalid fence status\n");
			rc = -EINVAL;
			break;
		}

	}

	hl_fences_put(mcs_data->fence_arr, arr_len);

	if (mcs_data->update_ts &&
			(ktime_compare(first_cs_time, max_ktime) != 0))
		mcs_data->timestamp = ktime_to_ns(first_cs_time);

	return rc;
}

static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 seq,
				enum hl_cs_wait_status *status, s64 *timestamp)
{
	struct hl_fence *fence;
	int rc = 0;

	if (timestamp)
		*timestamp = 0;

	hl_ctx_get(ctx);

	fence = hl_ctx_get_fence(ctx, seq);

	rc = hl_wait_for_fence(ctx, seq, fence, status, timeout_us, timestamp);
	hl_fence_put(fence);
	hl_ctx_put(ctx);

	return rc;
}

static inline unsigned long hl_usecs64_to_jiffies(const u64 usecs)
{
	if (usecs <= U32_MAX)
		return usecs_to_jiffies(usecs);

	/*
	 * If the value in nanoseconds would overflow 64 bits, use the
	 * largest 64-bit value.
	 */
	if (usecs >= ((u64)(U64_MAX / NSEC_PER_USEC)))
		return nsecs_to_jiffies(U64_MAX);

	return nsecs_to_jiffies(usecs * NSEC_PER_USEC);
}
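
/*
 * Worked example for hl_usecs64_to_jiffies(): U64_MAX / NSEC_PER_USEC is
 * roughly 1.8e16 us (about 584 years), so any larger input would overflow
 * the usecs * NSEC_PER_USEC multiplication and is clamped to
 * nsecs_to_jiffies(U64_MAX). Inputs up to U32_MAX (about 71 minutes) take
 * the cheap usecs_to_jiffies() path.
 */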

/*
 * hl_wait_multi_cs_completion_init - init completion structure
 *
 * @hdev: pointer to habanalabs device structure
 * @stream_master_bitmap: stream master QIDs map, a set bit indicates a stream
 *                        master QID to wait on
 *
 * @return valid completion struct pointer on success, otherwise error pointer
 *
 * up to MULTI_CS_MAX_USER_CTX calls can be done concurrently to the driver.
 * the function gets the first available completion (by marking it "used")
 * and initializes its values.
 */
static struct multi_cs_completion *hl_wait_multi_cs_completion_init(struct hl_device *hdev)
{
	struct multi_cs_completion *mcs_compl;
	int i;

	/* find free multi_cs completion structure */
	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
		mcs_compl = &hdev->multi_cs_completion[i];
		spin_lock(&mcs_compl->lock);
		if (!mcs_compl->used) {
			mcs_compl->used = 1;
			mcs_compl->timestamp = 0;
			/*
			 * init QID map to 0 to avoid completion by CSs. the actual QID map
			 * to multi-CS CSs will be set incrementally at a later stage
			 */
			mcs_compl->stream_master_qid_map = 0;
			spin_unlock(&mcs_compl->lock);
			break;
		}
		spin_unlock(&mcs_compl->lock);
	}

	if (i == MULTI_CS_MAX_USER_CTX) {
		dev_err(hdev->dev, "no available multi-CS completion structure\n");
		return ERR_PTR(-ENOMEM);
	}
	return mcs_compl;
}
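
/*
 * The loop above is a small fixed pool guarded by per-slot locks: each
 * slot is examined under its own spinlock and claimed by setting 'used'
 * before the lock is dropped, so two concurrent callers can never grab
 * the same slot. A minimal sketch of the same idea (types and names are
 * illustrative):
 *
 *	for (i = 0; i < POOL_SIZE; i++) {
 *		spin_lock(&pool[i].lock);
 *		if (!pool[i].used) {
 *			pool[i].used = 1;	// claimed while locked
 *			spin_unlock(&pool[i].lock);
 *			return &pool[i];
 *		}
 *		spin_unlock(&pool[i].lock);
 *	}
 *	return ERR_PTR(-ENOMEM);
 */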

/*
 * hl_wait_multi_cs_completion_fini - return completion structure and set as
 *                                    unused
 *
 * @mcs_compl: pointer to the completion structure
 */
static void hl_wait_multi_cs_completion_fini(
					struct multi_cs_completion *mcs_compl)
{
	/*
	 * free completion structure, do it under lock to be in-sync with the
	 * thread that signals completion
	 */
	spin_lock(&mcs_compl->lock);
	mcs_compl->used = 0;
	spin_unlock(&mcs_compl->lock);
}

/*
 * hl_wait_multi_cs_completion - wait for first CS to complete
 *
 * @mcs_data: multi-CS internal data
 *
 * @return 0 on success, otherwise non 0 error code
 */
static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data,
						struct multi_cs_completion *mcs_compl)
{
	long completion_rc;

	completion_rc = wait_for_completion_interruptible_timeout(&mcs_compl->completion,
									mcs_data->timeout_jiffies);

	/* update timestamp */
	if (completion_rc > 0)
		mcs_data->timestamp = mcs_compl->timestamp;

	if (completion_rc == -ERESTARTSYS)
		return completion_rc;

	mcs_data->wait_status = completion_rc;

	return 0;
}

/*
 * hl_multi_cs_completion_init - init array of multi-CS completion structures
 *
 * @hdev: pointer to habanalabs device structure
 */
void hl_multi_cs_completion_init(struct hl_device *hdev)
{
	struct multi_cs_completion *mcs_cmpl;
	int i;

	for (i = 0; i < MULTI_CS_MAX_USER_CTX; i++) {
		mcs_cmpl = &hdev->multi_cs_completion[i];
		mcs_cmpl->used = 0;
		spin_lock_init(&mcs_cmpl->lock);
		init_completion(&mcs_cmpl->completion);
	}
}

/*
 * hl_multi_cs_wait_ioctl - implementation of the multi-CS wait ioctl
 *
 * @hpriv: pointer to the private data of the fd
 * @data: pointer to multi-CS wait ioctl in/out args
 *
 */
static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
{
	struct multi_cs_completion *mcs_compl;
	struct hl_device *hdev = hpriv->hdev;
	struct multi_cs_data mcs_data = {};
	union hl_wait_cs_args *args = data;
	struct hl_ctx *ctx = hpriv->ctx;
	struct hl_fence **fence_arr;
	void __user *seq_arr;
	u32 size_to_copy;
	u64 *cs_seq_arr;
	u8 seq_arr_len;
	int rc, i;

	for (i = 0 ; i < sizeof(args->in.pad) ; i++)
		if (args->in.pad[i]) {
			dev_dbg(hdev->dev, "Padding bytes must be 0\n");
			return -EINVAL;
		}

	if (!hdev->supports_wait_for_multi_cs) {
		dev_err(hdev->dev, "Wait for multi CS is not supported\n");
		return -EPERM;
	}

	seq_arr_len = args->in.seq_arr_len;

	if (seq_arr_len > HL_WAIT_MULTI_CS_LIST_MAX_LEN) {
		dev_err(hdev->dev, "Can wait only up to %d CSs, input sequence is of length %u\n",
				HL_WAIT_MULTI_CS_LIST_MAX_LEN, seq_arr_len);
		return -EINVAL;
	}

	/* allocate memory for sequence array */
	cs_seq_arr =
		kmalloc_array(seq_arr_len, sizeof(*cs_seq_arr), GFP_KERNEL);
	if (!cs_seq_arr)
		return -ENOMEM;

	/* copy CS sequence array from user */
	seq_arr = (void __user *) (uintptr_t) args->in.seq;
	size_to_copy = seq_arr_len * sizeof(*cs_seq_arr);
	if (copy_from_user(cs_seq_arr, seq_arr, size_to_copy)) {
		dev_err(hdev->dev, "Failed to copy multi-cs sequence array from user\n");
		rc = -EFAULT;
		goto free_seq_arr;
	}

	/* allocate array for the fences */
	fence_arr = kmalloc_array(seq_arr_len, sizeof(struct hl_fence *), GFP_KERNEL);
	if (!fence_arr) {
		rc = -ENOMEM;
		goto free_seq_arr;
	}

	/* initialize the multi-CS internal data */
	mcs_data.ctx = ctx;
	mcs_data.seq_arr = cs_seq_arr;
	mcs_data.fence_arr = fence_arr;
	mcs_data.arr_len = seq_arr_len;

	hl_ctx_get(ctx);

	/* wait (with timeout) for the first CS to be completed */
	mcs_data.timeout_jiffies = hl_usecs64_to_jiffies(args->in.timeout_us);
	mcs_compl = hl_wait_multi_cs_completion_init(hdev);
	if (IS_ERR(mcs_compl)) {
		rc = PTR_ERR(mcs_compl);
		goto put_ctx;
	}

	/* poll all CS fences, extract timestamp */
	mcs_data.update_ts = true;
	rc = hl_cs_poll_fences(&mcs_data, mcs_compl);
	/*
	 * skip wait for CS completion when one of the below is true:
	 * - an error on the poll function
	 * - one or more CS in the list completed
	 * - the user called ioctl with timeout 0
	 */
	if (rc || mcs_data.completion_bitmap || !args->in.timeout_us)
		goto completion_fini;

	while (true) {
		rc = hl_wait_multi_cs_completion(&mcs_data, mcs_compl);
		if (rc || (mcs_data.wait_status == 0))
			break;

		/*
		 * poll fences once again to update the CS map.
		 * no timestamp should be updated this time.
		 */
		mcs_data.update_ts = false;
		rc = hl_cs_poll_fences(&mcs_data, mcs_compl);

		if (rc || mcs_data.completion_bitmap)
			break;

		/*
		 * if hl_wait_multi_cs_completion returned before the timeout (i.e.
		 * it got a completion), it either got completed by a CS in the
		 * multi-CS list (in which case the indication is a non-empty
		 * completion_bitmap), or it got completed by a CS submitted to one
		 * of the shared stream masters but not in the multi-CS list (in
		 * which case we should wait again, but modify the timeout and set
		 * the timestamp to zero to let a CS related to the current
		 * multi-CS set a new, relevant, timestamp)
		 */
		mcs_data.timeout_jiffies = mcs_data.wait_status;
		mcs_compl->timestamp = 0;
	}

completion_fini:
	hl_wait_multi_cs_completion_fini(mcs_compl);

put_ctx:
	hl_ctx_put(ctx);
	kfree(fence_arr);

free_seq_arr:
	kfree(cs_seq_arr);

	if (rc == -ERESTARTSYS) {
		dev_err_ratelimited(hdev->dev,
				"user process got signal while waiting for Multi-CS\n");
		rc = -EINTR;
	}

	if (rc)
		return rc;

	/* update output args */
	memset(args, 0, sizeof(*args));

	if (mcs_data.completion_bitmap) {
		args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
		args->out.cs_completion_map = mcs_data.completion_bitmap;

		/* if timestamp is not 0 - it's valid */
		if (mcs_data.timestamp) {
			args->out.timestamp_nsec = mcs_data.timestamp;
			args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
		}

		/* update if some CS was gone */
		if (!mcs_data.timestamp)
			args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
	} else {
		args->out.status = HL_WAIT_CS_STATUS_BUSY;
	}

	return 0;
}

static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
{
	struct hl_device *hdev = hpriv->hdev;
	union hl_wait_cs_args *args = data;
	enum hl_cs_wait_status status;
	u64 seq = args->in.seq;
	s64 timestamp;
	int rc;

	rc = _hl_cs_wait_ioctl(hdev, hpriv->ctx, args->in.timeout_us, seq, &status, &timestamp);

	if (rc == -ERESTARTSYS) {
		dev_err_ratelimited(hdev->dev,
			"user process got signal while waiting for CS handle %llu\n",
			seq);
		return -EINTR;
	}

	memset(args, 0, sizeof(*args));

	if (rc) {
		if (rc == -ETIMEDOUT) {
			dev_err_ratelimited(hdev->dev,
				"CS %llu has timed-out while user process is waiting for it\n",
				seq);
			args->out.status = HL_WAIT_CS_STATUS_TIMEDOUT;
		} else if (rc == -EIO) {
			dev_err_ratelimited(hdev->dev,
				"CS %llu has been aborted while user process is waiting for it\n",
				seq);
			args->out.status = HL_WAIT_CS_STATUS_ABORTED;
		}
		return rc;
	}

	if (timestamp) {
		args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
		args->out.timestamp_nsec = timestamp;
	}

	switch (status) {
	case CS_WAIT_STATUS_GONE:
		args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
		fallthrough;
	case CS_WAIT_STATUS_COMPLETED:
		args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
		break;
	case CS_WAIT_STATUS_BUSY:
	default:
		args->out.status = HL_WAIT_CS_STATUS_BUSY;
		break;
	}

	return 0;
}
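
/*
 * Illustrative userspace wait on a sequence returned by the CS ioctl
 * (a sketch only: fd and error handling are assumptions; the names come
 * from uapi/drm/habanalabs_accel.h):
 *
 *	union hl_wait_cs_args wait = {0};
 *
 *	wait.in.seq = seq;
 *	wait.in.timeout_us = 1000000;	// one second
 *	if (!ioctl(fd, DRM_IOCTL_HL_WAIT_CS, &wait) &&
 *	    wait.out.status == HL_WAIT_CS_STATUS_COMPLETED)
 *		cs_done = true;
 */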
3215 | |
3216 | static inline void set_record_cq_info(struct hl_user_pending_interrupt *record, |
3217 | struct hl_cb *cq_cb, u32 cq_offset, u32 target_value) |
3218 | { |
3219 | record->ts_reg_info.cq_cb = cq_cb; |
3220 | record->cq_kernel_addr = (u64 *) cq_cb->kernel_address + cq_offset; |
3221 | record->cq_target_value = target_value; |
3222 | } |
3223 | |
3224 | static int validate_and_get_ts_record(struct device *dev, |
3225 | struct hl_ts_buff *ts_buff, u64 ts_offset, |
3226 | struct hl_user_pending_interrupt **req_event_record) |
3227 | { |
3228 | struct hl_user_pending_interrupt *ts_cb_last; |
3229 | |
3230 | *req_event_record = (struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address + |
3231 | ts_offset; |
3232 | ts_cb_last = (struct hl_user_pending_interrupt *)ts_buff->kernel_buff_address + |
3233 | (ts_buff->kernel_buff_size / sizeof(struct hl_user_pending_interrupt)); |
3234 | |
3235 | /* Validate ts_offset not exceeding last max */ |
3236 | if (*req_event_record >= ts_cb_last) { |
3237 | dev_err(dev, "Ts offset(%llu) exceeds max CB offset(0x%llx)\n" , |
3238 | ts_offset, (u64)(uintptr_t)ts_cb_last); |
3239 | return -EINVAL; |
3240 | } |
3241 | |
3242 | return 0; |
3243 | } |
3244 | |
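/*
 * Remove a timestamp record from its interrupt's ts list (if it is still in
 * use) and drop the buffer and CQ CB references that were taken when the
 * record was registered. If @need_lock is set, the interrupt's ts_list_lock
 * is taken here; otherwise the caller is assumed to hold it.
 */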
3245 | static void unregister_timestamp_node(struct hl_device *hdev, |
3246 | struct hl_user_pending_interrupt *record, bool need_lock) |
3247 | { |
3248 | struct hl_user_interrupt *interrupt = record->ts_reg_info.interrupt; |
3249 | bool ts_rec_found = false; |
3250 | unsigned long flags; |
3251 | |
3252 | if (need_lock) |
3253 | spin_lock_irqsave(&interrupt->ts_list_lock, flags); |
3254 | |
3255 | if (record->ts_reg_info.in_use) { |
3256 | record->ts_reg_info.in_use = false; |
		list_del(&record->list_node);
3258 | ts_rec_found = true; |
3259 | } |
3260 | |
3261 | if (need_lock) |
3262 | spin_unlock_irqrestore(lock: &interrupt->ts_list_lock, flags); |
3263 | |
3264 | /* Put refcounts that were taken when we registered the event */ |
3265 | if (ts_rec_found) { |
		hl_mmap_mem_buf_put(record->ts_reg_info.buf);
		hl_cb_put(record->ts_reg_info.cq_cb);
3268 | } |
3269 | } |
3270 | |
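/*
 * Get the timestamp record matching the requested offset, unregister it
 * first if it is already in use (possibly on a different interrupt), and
 * then fill it with the new registration data. Called with the target
 * interrupt's ts_list_lock held; the lock may be dropped and re-acquired
 * internally for the unregister step.
 */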
3271 | static int ts_get_and_handle_kernel_record(struct hl_device *hdev, struct hl_ctx *ctx, |
3272 | struct wait_interrupt_data *data, unsigned long *flags, |
3273 | struct hl_user_pending_interrupt **pend) |
3274 | { |
3275 | struct hl_user_pending_interrupt *req_offset_record; |
3276 | struct hl_ts_buff *ts_buff = data->buf->private; |
3277 | bool need_lock = false; |
3278 | int rc; |
3279 | |
	rc = validate_and_get_ts_record(data->buf->mmg->dev, ts_buff, data->ts_offset,
			&req_offset_record);
3282 | if (rc) |
3283 | return rc; |
3284 | |
3285 | /* In case the node already registered, need to unregister first then re-use */ |
3286 | if (req_offset_record->ts_reg_info.in_use) { |
3287 | dev_dbg(data->buf->mmg->dev, |
3288 | "Requested record %p is in use on irq: %u ts addr: %p, unregister first then put on irq: %u\n" , |
3289 | req_offset_record, |
3290 | req_offset_record->ts_reg_info.interrupt->interrupt_id, |
3291 | req_offset_record->ts_reg_info.timestamp_kernel_addr, |
3292 | data->interrupt->interrupt_id); |
3293 | /* |
3294 | * Since interrupt here can be different than the one the node currently registered |
3295 | * on, and we don't want to lock two lists while we're doing unregister, so |
3296 | * unlock the new interrupt wait list here and acquire the lock again after you done |
3297 | */ |
3298 | if (data->interrupt->interrupt_id != |
3299 | req_offset_record->ts_reg_info.interrupt->interrupt_id) { |
3300 | |
3301 | need_lock = true; |
3302 | spin_unlock_irqrestore(lock: &data->interrupt->ts_list_lock, flags: *flags); |
3303 | } |
3304 | |
		unregister_timestamp_node(hdev, req_offset_record, need_lock);
3306 | |
3307 | if (need_lock) |
3308 | spin_lock_irqsave(&data->interrupt->ts_list_lock, *flags); |
3309 | } |
3310 | |
3311 | /* Fill up the new registration node info and add it to the list */ |
3312 | req_offset_record->ts_reg_info.in_use = true; |
3313 | req_offset_record->ts_reg_info.buf = data->buf; |
3314 | req_offset_record->ts_reg_info.timestamp_kernel_addr = |
3315 | (u64 *) ts_buff->user_buff_address + data->ts_offset; |
3316 | req_offset_record->ts_reg_info.interrupt = data->interrupt; |
	set_record_cq_info(req_offset_record, data->cq_cb, data->cq_offset,
			data->target_value);
3319 | |
3320 | *pend = req_offset_record; |
3321 | |
3322 | return rc; |
3323 | } |
3324 | |
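/*
 * Register a timestamp record on a user interrupt. Once the interrupt fires
 * and the CQ counter reaches the target value, the interrupt handler writes
 * the timestamp into the user's buffer. This call returns immediately and
 * never blocks.
 */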
3325 | static int _hl_interrupt_ts_reg_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, |
3326 | struct wait_interrupt_data *data, |
3327 | u32 *status, u64 *timestamp) |
3328 | { |
3329 | struct hl_user_pending_interrupt *pend; |
3330 | unsigned long flags; |
3331 | int rc = 0; |
3332 | |
3333 | hl_ctx_get(ctx); |
3334 | |
3335 | data->cq_cb = hl_cb_get(mmg: data->mmg, handle: data->cq_handle); |
3336 | if (!data->cq_cb) { |
3337 | rc = -EINVAL; |
3338 | goto put_ctx; |
3339 | } |
3340 | |
3341 | /* Validate the cq offset */ |
3342 | if (((u64 *) data->cq_cb->kernel_address + data->cq_offset) >= |
3343 | ((u64 *) data->cq_cb->kernel_address + (data->cq_cb->size / sizeof(u64)))) { |
3344 | rc = -EINVAL; |
3345 | goto put_cq_cb; |
3346 | } |
3347 | |
3348 | dev_dbg(hdev->dev, "Timestamp registration: interrupt id: %u, handle: 0x%llx, ts offset: %llu, cq_offset: %llu\n" , |
3349 | data->interrupt->interrupt_id, data->ts_handle, |
3350 | data->ts_offset, data->cq_offset); |
3351 | |
3352 | data->buf = hl_mmap_mem_buf_get(mmg: data->mmg, handle: data->ts_handle); |
3353 | if (!data->buf) { |
3354 | rc = -EINVAL; |
3355 | goto put_cq_cb; |
3356 | } |
3357 | |
3358 | spin_lock_irqsave(&data->interrupt->ts_list_lock, flags); |
3359 | |
3360 | /* get ts buffer record */ |
	rc = ts_get_and_handle_kernel_record(hdev, ctx, data, &flags, &pend);
3362 | if (rc) { |
		spin_unlock_irqrestore(&data->interrupt->ts_list_lock, flags);
3364 | goto put_ts_buff; |
3365 | } |
3366 | |
3367 | /* We check for completion value as interrupt could have been received |
3368 | * before we add the timestamp node to the ts list. |
3369 | */ |
3370 | if (*pend->cq_kernel_addr >= data->target_value) { |
		spin_unlock_irqrestore(&data->interrupt->ts_list_lock, flags);
3372 | |
3373 | dev_dbg(hdev->dev, "Target value already reached release ts record: pend: %p, offset: %llu, interrupt: %u\n" , |
3374 | pend, data->ts_offset, data->interrupt->interrupt_id); |
3375 | |
3376 | pend->ts_reg_info.in_use = 0; |
3377 | *status = HL_WAIT_CS_STATUS_COMPLETED; |
3378 | *pend->ts_reg_info.timestamp_kernel_addr = ktime_get_ns(); |
3379 | |
3380 | goto put_ts_buff; |
3381 | } |
3382 | |
	list_add_tail(&pend->list_node, &data->interrupt->ts_list_head);
	spin_unlock_irqrestore(&data->interrupt->ts_list_lock, flags);
3385 | |
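	/* HL_WAIT_CS_STATUS_COMPLETED is 0 in the uAPI, so this also sets the
	 * success return code.
	 */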
3386 | rc = *status = HL_WAIT_CS_STATUS_COMPLETED; |
3387 | |
3388 | hl_ctx_put(ctx); |
3389 | |
3390 | return rc; |
3391 | |
3392 | put_ts_buff: |
3393 | hl_mmap_mem_buf_put(buf: data->buf); |
3394 | put_cq_cb: |
3395 | hl_cb_put(cb: data->cq_cb); |
3396 | put_ctx: |
3397 | hl_ctx_put(ctx); |
3398 | |
3399 | return rc; |
3400 | } |
3401 | |
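/*
 * Block until the CQ counter at the given offset reaches the target value,
 * until the timeout expires, or until the wait is interrupted. The wait node
 * is allocated here and freed before returning.
 */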
3402 | static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, |
3403 | struct wait_interrupt_data *data, |
3404 | u32 *status, u64 *timestamp) |
3405 | { |
3406 | struct hl_user_pending_interrupt *pend; |
3407 | unsigned long timeout, flags; |
3408 | long completion_rc; |
3409 | int rc = 0; |
3410 | |
3411 | timeout = hl_usecs64_to_jiffies(usecs: data->intr_timeout_us); |
3412 | |
3413 | hl_ctx_get(ctx); |
3414 | |
3415 | data->cq_cb = hl_cb_get(mmg: data->mmg, handle: data->cq_handle); |
3416 | if (!data->cq_cb) { |
3417 | rc = -EINVAL; |
3418 | goto put_ctx; |
3419 | } |
3420 | |
3421 | /* Validate the cq offset */ |
3422 | if (((u64 *) data->cq_cb->kernel_address + data->cq_offset) >= |
3423 | ((u64 *) data->cq_cb->kernel_address + (data->cq_cb->size / sizeof(u64)))) { |
3424 | rc = -EINVAL; |
3425 | goto put_cq_cb; |
3426 | } |
3427 | |
	pend = kzalloc(sizeof(*pend), GFP_KERNEL);
3429 | if (!pend) { |
3430 | rc = -ENOMEM; |
3431 | goto put_cq_cb; |
3432 | } |
3433 | |
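	/* The fence here is only signaled by the interrupt handler and is not
	 * tied to any CS, hence the ULONG_MAX sequence number.
	 */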
	hl_fence_init(&pend->fence, ULONG_MAX);
	pend->cq_kernel_addr = (u64 *) data->cq_cb->kernel_address + data->cq_offset;
	pend->cq_target_value = data->target_value;

	spin_lock_irqsave(&data->interrupt->wait_list_lock, flags);

3440 | /* We check for completion value as interrupt could have been received |
3441 | * before we add the wait node to the wait list. |
3442 | */ |
3443 | if (*pend->cq_kernel_addr >= data->target_value || (!data->intr_timeout_us)) { |
3444 | spin_unlock_irqrestore(lock: &data->interrupt->wait_list_lock, flags); |
3445 | |
3446 | if (*pend->cq_kernel_addr >= data->target_value) |
3447 | *status = HL_WAIT_CS_STATUS_COMPLETED; |
3448 | else |
3449 | *status = HL_WAIT_CS_STATUS_BUSY; |
3450 | |
3451 | pend->fence.timestamp = ktime_get(); |
3452 | goto set_timestamp; |
3453 | } |
3454 | |
3455 | /* Add pending user interrupt to relevant list for the interrupt |
3456 | * handler to monitor. |
3457 | * Note that we cannot have sorted list by target value, |
3458 | * in order to shorten the list pass loop, since |
3459 | * same list could have nodes for different cq counter handle. |
3460 | */ |
	list_add_tail(&pend->list_node, &data->interrupt->wait_list_head);
	spin_unlock_irqrestore(&data->interrupt->wait_list_lock, flags);
3463 | |
3464 | /* Wait for interrupt handler to signal completion */ |
	completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
									timeout);
3467 | if (completion_rc > 0) { |
3468 | if (pend->fence.error == -EIO) { |
3469 | dev_err_ratelimited(hdev->dev, |
3470 | "interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n" , |
3471 | pend->fence.error); |
3472 | rc = -EIO; |
3473 | *status = HL_WAIT_CS_STATUS_ABORTED; |
3474 | } else { |
3475 | *status = HL_WAIT_CS_STATUS_COMPLETED; |
3476 | } |
3477 | } else { |
3478 | if (completion_rc == -ERESTARTSYS) { |
3479 | dev_err_ratelimited(hdev->dev, |
3480 | "user process got signal while waiting for interrupt ID %d\n" , |
3481 | data->interrupt->interrupt_id); |
3482 | rc = -EINTR; |
3483 | *status = HL_WAIT_CS_STATUS_ABORTED; |
3484 | } else { |
3485 | /* The wait has timed-out. We don't know anything beyond that |
3486 | * because the workload was not submitted through the driver. |
3487 | * Therefore, from driver's perspective, the workload is still |
3488 | * executing. |
3489 | */ |
3490 | rc = 0; |
3491 | *status = HL_WAIT_CS_STATUS_BUSY; |
3492 | } |
3493 | } |
3494 | |
3495 | /* |
3496 | * We keep removing the node from list here, and not at the irq handler |
3497 | * for completion timeout case. and if it's a registration |
3498 | * for ts record, the node will be deleted in the irq handler after |
3499 | * we reach the target value. |
3500 | */ |
3501 | spin_lock_irqsave(&data->interrupt->wait_list_lock, flags); |
	list_del(&pend->list_node);
	spin_unlock_irqrestore(&data->interrupt->wait_list_lock, flags);
3504 | |
3505 | set_timestamp: |
	*timestamp = ktime_to_ns(pend->fence.timestamp);
	kfree(pend);
	hl_cb_put(data->cq_cb);
3509 | hl_ctx_put(ctx); |
3510 | |
3511 | return rc; |
3512 | |
3513 | put_cq_cb: |
3514 | hl_cb_put(cb: data->cq_cb); |
3515 | put_ctx: |
3516 | hl_ctx_put(ctx); |
3517 | |
3518 | return rc; |
3519 | } |
3520 | |
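/*
 * Same flow as _hl_interrupt_wait_ioctl(), except that the completion value
 * is read from a user-space address with copy_from_user() instead of from a
 * kernel-mapped CQ counters buffer.
 */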
3521 | static int _hl_interrupt_wait_ioctl_user_addr(struct hl_device *hdev, struct hl_ctx *ctx, |
3522 | u64 timeout_us, u64 user_address, |
3523 | u64 target_value, struct hl_user_interrupt *interrupt, |
3524 | u32 *status, |
3525 | u64 *timestamp) |
3526 | { |
3527 | struct hl_user_pending_interrupt *pend; |
3528 | unsigned long timeout, flags; |
3529 | u64 completion_value; |
3530 | long completion_rc; |
3531 | int rc = 0; |
3532 | |
	timeout = hl_usecs64_to_jiffies(timeout_us);
3534 | |
3535 | hl_ctx_get(ctx); |
3536 | |
	pend = kzalloc(sizeof(*pend), GFP_KERNEL);
3538 | if (!pend) { |
3539 | hl_ctx_put(ctx); |
3540 | return -ENOMEM; |
3541 | } |
3542 | |
	hl_fence_init(&pend->fence, ULONG_MAX);
3544 | |
3545 | /* Add pending user interrupt to relevant list for the interrupt |
3546 | * handler to monitor |
3547 | */ |
3548 | spin_lock_irqsave(&interrupt->wait_list_lock, flags); |
3549 | list_add_tail(new: &pend->list_node, head: &interrupt->wait_list_head); |
3550 | spin_unlock_irqrestore(lock: &interrupt->wait_list_lock, flags); |
3551 | |
3552 | /* We check for completion value as interrupt could have been received |
3553 | * before we added the node to the wait list |
3554 | */ |
	if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) {
		dev_err(hdev->dev, "Failed to copy completion value from user\n");
3557 | rc = -EFAULT; |
3558 | goto remove_pending_user_interrupt; |
3559 | } |
3560 | |
3561 | if (completion_value >= target_value) { |
3562 | *status = HL_WAIT_CS_STATUS_COMPLETED; |
3563 | /* There was no interrupt, we assume the completion is now. */ |
3564 | pend->fence.timestamp = ktime_get(); |
3565 | } else { |
3566 | *status = HL_WAIT_CS_STATUS_BUSY; |
3567 | } |
3568 | |
3569 | if (!timeout_us || (*status == HL_WAIT_CS_STATUS_COMPLETED)) |
3570 | goto remove_pending_user_interrupt; |
3571 | |
3572 | wait_again: |
3573 | /* Wait for interrupt handler to signal completion */ |
	completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
									timeout);
3576 | |
3577 | /* If timeout did not expire we need to perform the comparison. |
3578 | * If comparison fails, keep waiting until timeout expires |
3579 | */ |
3580 | if (completion_rc > 0) { |
3581 | spin_lock_irqsave(&interrupt->wait_list_lock, flags); |
3582 | /* reinit_completion must be called before we check for user |
3583 | * completion value, otherwise, if interrupt is received after |
3584 | * the comparison and before the next wait_for_completion, |
3585 | * we will reach timeout and fail |
3586 | */ |
		reinit_completion(&pend->fence.completion);
		spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3589 | |
		if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) {
			dev_err(hdev->dev, "Failed to copy completion value from user\n");
3592 | rc = -EFAULT; |
3593 | |
3594 | goto remove_pending_user_interrupt; |
3595 | } |
3596 | |
3597 | if (completion_value >= target_value) { |
3598 | *status = HL_WAIT_CS_STATUS_COMPLETED; |
3599 | } else if (pend->fence.error) { |
3600 | dev_err_ratelimited(hdev->dev, |
3601 | "interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n" , |
3602 | pend->fence.error); |
3603 | /* set the command completion status as ABORTED */ |
3604 | *status = HL_WAIT_CS_STATUS_ABORTED; |
3605 | } else { |
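			/* Target value not reached and no error was recorded,
			 * so wait again with the time that remained from the
			 * previous wait (the positive return value of
			 * wait_for_completion_interruptible_timeout()).
			 */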
3606 | timeout = completion_rc; |
3607 | goto wait_again; |
3608 | } |
3609 | } else if (completion_rc == -ERESTARTSYS) { |
3610 | dev_err_ratelimited(hdev->dev, |
3611 | "user process got signal while waiting for interrupt ID %d\n" , |
3612 | interrupt->interrupt_id); |
3613 | rc = -EINTR; |
3614 | } else { |
3615 | /* The wait has timed-out. We don't know anything beyond that |
3616 | * because the workload wasn't submitted through the driver. |
3617 | * Therefore, from driver's perspective, the workload is still |
3618 | * executing. |
3619 | */ |
3620 | rc = 0; |
3621 | *status = HL_WAIT_CS_STATUS_BUSY; |
3622 | } |
3623 | |
3624 | remove_pending_user_interrupt: |
3625 | spin_lock_irqsave(&interrupt->wait_list_lock, flags); |
	list_del(&pend->list_node);
	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
3628 | |
	*timestamp = ktime_to_ns(pend->fence.timestamp);
3630 | |
	kfree(pend);
3632 | hl_ctx_put(ctx); |
3633 | |
3634 | return rc; |
3635 | } |
3636 | |
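/*
 * Dispatch an interrupt-based wait: resolve the target interrupt object from
 * the ioctl flags, then run the timestamp-registration flow, the kernel CQ
 * counter wait or the user-address wait, as requested by the flags.
 */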
3637 | static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data) |
3638 | { |
3639 | u16 interrupt_id, first_interrupt, last_interrupt; |
3640 | struct hl_device *hdev = hpriv->hdev; |
3641 | struct asic_fixed_properties *prop; |
3642 | struct hl_user_interrupt *interrupt; |
3643 | union hl_wait_cs_args *args = data; |
3644 | u32 status = HL_WAIT_CS_STATUS_BUSY; |
3645 | u64 timestamp = 0; |
3646 | int rc, int_idx; |
3647 | |
3648 | prop = &hdev->asic_prop; |
3649 | |
3650 | if (!(prop->user_interrupt_count + prop->user_dec_intr_count)) { |
3651 | dev_err(hdev->dev, "no user interrupts allowed" ); |
3652 | return -EPERM; |
3653 | } |
3654 | |
3655 | interrupt_id = FIELD_GET(HL_WAIT_CS_FLAGS_INTERRUPT_MASK, args->in.flags); |
3656 | |
3657 | first_interrupt = prop->first_available_user_interrupt; |
3658 | last_interrupt = prop->first_available_user_interrupt + prop->user_interrupt_count - 1; |
3659 | |
3660 | if (interrupt_id < prop->user_dec_intr_count) { |
3661 | |
3662 | /* Check if the requested core is enabled */ |
3663 | if (!(prop->decoder_enabled_mask & BIT(interrupt_id))) { |
3664 | dev_err(hdev->dev, "interrupt on a disabled core(%u) not allowed" , |
3665 | interrupt_id); |
3666 | return -EINVAL; |
3667 | } |
3668 | |
3669 | interrupt = &hdev->user_interrupt[interrupt_id]; |
3670 | |
3671 | } else if (interrupt_id >= first_interrupt && interrupt_id <= last_interrupt) { |
3672 | |
3673 | int_idx = interrupt_id - first_interrupt + prop->user_dec_intr_count; |
3674 | interrupt = &hdev->user_interrupt[int_idx]; |
3675 | |
3676 | } else if (interrupt_id == HL_COMMON_USER_CQ_INTERRUPT_ID) { |
3677 | interrupt = &hdev->common_user_cq_interrupt; |
3678 | } else if (interrupt_id == HL_COMMON_DEC_INTERRUPT_ID) { |
3679 | interrupt = &hdev->common_decoder_interrupt; |
3680 | } else { |
3681 | dev_err(hdev->dev, "invalid user interrupt %u" , interrupt_id); |
3682 | return -EINVAL; |
3683 | } |
3684 | |
3685 | if (args->in.flags & HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ) { |
3686 | struct wait_interrupt_data wait_intr_data = {0}; |
3687 | |
3688 | wait_intr_data.interrupt = interrupt; |
3689 | wait_intr_data.mmg = &hpriv->mem_mgr; |
3690 | wait_intr_data.cq_handle = args->in.cq_counters_handle; |
3691 | wait_intr_data.cq_offset = args->in.cq_counters_offset; |
3692 | wait_intr_data.ts_handle = args->in.timestamp_handle; |
3693 | wait_intr_data.ts_offset = args->in.timestamp_offset; |
3694 | wait_intr_data.target_value = args->in.target; |
3695 | wait_intr_data.intr_timeout_us = args->in.interrupt_timeout_us; |
3696 | |
3697 | if (args->in.flags & HL_WAIT_CS_FLAGS_REGISTER_INTERRUPT) { |
3698 | /* |
3699 | * Allow only one registration at a time. this is needed in order to prevent |
3700 | * issues while handling the flow of re-use of the same offset. |
3701 | * Since the registration flow is protected only by the interrupt lock, |
3702 | * re-use flow might request to move ts node to another interrupt list, |
3703 | * and in such case we're not protected. |
3704 | */ |
3705 | mutex_lock(&hpriv->ctx->ts_reg_lock); |
3706 | |
			rc = _hl_interrupt_ts_reg_ioctl(hdev, hpriv->ctx, &wait_intr_data,
							&status, &timestamp);

			mutex_unlock(&hpriv->ctx->ts_reg_lock);
		} else
			rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, &wait_intr_data,
							&status, &timestamp);
3714 | } else { |
		rc = _hl_interrupt_wait_ioctl_user_addr(hdev, hpriv->ctx,
				args->in.interrupt_timeout_us, args->in.addr,
				args->in.target, interrupt, &status,
				&timestamp);
3719 | } |
3720 | |
3721 | if (rc) |
3722 | return rc; |
3723 | |
3724 | memset(args, 0, sizeof(*args)); |
3725 | args->out.status = status; |
3726 | |
3727 | if (timestamp) { |
3728 | args->out.timestamp_nsec = timestamp; |
3729 | args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD; |
3730 | } |
3731 | |
3732 | return 0; |
3733 | } |
3734 | |
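/**
 * hl_wait_ioctl() - entry point of the wait ioctl
 * @ddev: pointer to the drm device
 * @data: ioctl input/output arguments (union hl_wait_cs_args)
 * @file_priv: pointer to the drm file private data
 *
 * Dispatch according to the input flags to the interrupt-based wait, the
 * multi-CS wait or the single-CS wait handler.
 */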
3735 | int hl_wait_ioctl(struct drm_device *ddev, void *data, struct drm_file *file_priv) |
3736 | { |
3737 | struct hl_fpriv *hpriv = file_priv->driver_priv; |
3738 | struct hl_device *hdev = hpriv->hdev; |
3739 | union hl_wait_cs_args *args = data; |
3740 | u32 flags = args->in.flags; |
3741 | int rc; |
3742 | |
3743 | /* If the device is not operational, or if an error has happened and user should release the |
3744 | * device, there is no point in waiting for any command submission or user interrupt. |
3745 | */ |
	if (!hl_device_operational(hpriv->hdev, NULL) || hdev->reset_info.watchdog_active)
3747 | return -EBUSY; |
3748 | |
3749 | if (flags & HL_WAIT_CS_FLAGS_INTERRUPT) |
3750 | rc = hl_interrupt_wait_ioctl(hpriv, data); |
3751 | else if (flags & HL_WAIT_CS_FLAGS_MULTI_CS) |
3752 | rc = hl_multi_cs_wait_ioctl(hpriv, data); |
3753 | else |
3754 | rc = hl_cs_wait_ioctl(hpriv, data); |
3755 | |
3756 | return rc; |
3757 | } |
3758 | |