// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 */

/**
 * DOC: Enclave lifetime management driver for Nitro Enclaves (NE).
 * Nitro is a hypervisor that has been developed by Amazon.
 */

#include <linux/anon_inodes.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/hugetlb.h>
#include <linux/limits.h>
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/nitro_enclaves.h>
#include <linux/pci.h>
#include <linux/poll.h>
#include <linux/range.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <uapi/linux/vm_sockets.h>

#include "ne_misc_dev.h"
#include "ne_pci_dev.h"

/**
 * NE_CPUS_SIZE - Size for max 128 CPUs, for now, in a cpu-list string, comma
 *		  separated. The NE CPU pool includes CPUs from a single NUMA
 *		  node.
 */
#define NE_CPUS_SIZE		(512)

/**
 * NE_EIF_LOAD_OFFSET - The offset where to copy the Enclave Image Format (EIF)
 *			image in enclave memory.
 */
#define NE_EIF_LOAD_OFFSET	(8 * 1024UL * 1024UL)

/**
 * NE_MIN_ENCLAVE_MEM_SIZE - The minimum memory size an enclave can be launched
 *			     with.
 */
#define NE_MIN_ENCLAVE_MEM_SIZE	(64 * 1024UL * 1024UL)

/**
 * NE_MIN_MEM_REGION_SIZE - The minimum size of an enclave memory region.
 */
#define NE_MIN_MEM_REGION_SIZE	(2 * 1024UL * 1024UL)

/**
 * NE_PARENT_VM_CID - The CID for the vsock device of the primary / parent VM.
 */
#define NE_PARENT_VM_CID	(3)

static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg);

static const struct file_operations ne_fops = {
	.owner		= THIS_MODULE,
	.llseek		= noop_llseek,
	.unlocked_ioctl	= ne_ioctl,
};

static struct miscdevice ne_misc_dev = {
	.minor	= MISC_DYNAMIC_MINOR,
	.name	= "nitro_enclaves",
	.fops	= &ne_fops,
	.mode	= 0660,
};

struct ne_devs ne_devs = {
	.ne_misc_dev	= &ne_misc_dev,
};
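
/*
 * Illustrative sketch, not part of the driver: the misc device registered
 * above is exposed to user space as /dev/nitro_enclaves (mode 0660), so the
 * entry point for any enclave operation is a plain open() of that node:
 *
 *	int ne_dev_fd = open("/dev/nitro_enclaves", O_RDWR | O_CLOEXEC);
 *
 *	if (ne_dev_fd < 0)
 *		exit(EXIT_FAILURE);
 */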

/*
 * TODO: Update logic to create new sysfs entries instead of using
 * a kernel parameter e.g. if multiple sysfs files needed.
 */
static int ne_set_kernel_param(const char *val, const struct kernel_param *kp);

static const struct kernel_param_ops ne_cpu_pool_ops = {
	.get	= param_get_string,
	.set	= ne_set_kernel_param,
};

static char ne_cpus[NE_CPUS_SIZE];
static struct kparam_string ne_cpus_arg = {
	.maxlen	= sizeof(ne_cpus),
	.string	= ne_cpus,
};

module_param_cb(ne_cpus, &ne_cpu_pool_ops, &ne_cpus_arg, 0644);
/* https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists */
MODULE_PARM_DESC(ne_cpus, "<cpu-list> - CPU pool used for Nitro Enclaves");
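
/*
 * Example usage, assuming an instance with Nitro Enclaves support: the NE
 * CPU pool can be set at boot via the kernel command line or at runtime via
 * the module parameter sysfs file, using the cpu-list format linked above:
 *
 *	nitro_enclaves.ne_cpus=2-7
 *
 *	echo 2-7 > /sys/module/nitro_enclaves/parameters/ne_cpus
 */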

/**
 * struct ne_cpu_pool - CPU pool used for Nitro Enclaves.
 * @avail_threads_per_core:	Available full CPU cores to be dedicated to
 *				enclave(s). The cpumasks from the array, indexed
 *				by core id, contain all the threads from the
 *				available cores, that are not set for created
 *				enclave(s). The full CPU cores are part of the
 *				NE CPU pool.
 * @mutex:			Mutex for the access to the NE CPU pool.
 * @nr_parent_vm_cores:		The size of the available threads per core array.
 *				The total number of CPU cores available on the
 *				primary / parent VM.
 * @nr_threads_per_core:	The number of threads that a full CPU core has.
 * @numa_node:			NUMA node of the CPUs in the pool.
 */
struct ne_cpu_pool {
	cpumask_var_t	*avail_threads_per_core;
	struct mutex	mutex;
	unsigned int	nr_parent_vm_cores;
	unsigned int	nr_threads_per_core;
	int		numa_node;
};

static struct ne_cpu_pool ne_cpu_pool;

/**
 * struct ne_phys_contig_mem_regions - Contiguous physical memory regions.
 * @num:	The number of memory regions added so far to the array.
 * @regions:	The array of physical memory regions.
 */
struct ne_phys_contig_mem_regions {
	unsigned long num;
	struct range *regions;
};

/**
 * ne_check_enclaves_created() - Verify if at least one enclave has been created.
 * @void: No parameters provided.
 *
 * Context: Process context.
 * Return:
 * * True if at least one enclave is created.
 * * False otherwise.
 */
static bool ne_check_enclaves_created(void)
{
	struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
	bool ret = false;

	if (!ne_pci_dev)
		return ret;

	mutex_lock(&ne_pci_dev->enclaves_list_mutex);

	if (!list_empty(&ne_pci_dev->enclaves_list))
		ret = true;

	mutex_unlock(&ne_pci_dev->enclaves_list_mutex);

	return ret;
}

/**
 * ne_setup_cpu_pool() - Set the NE CPU pool after handling sanity checks such
 *			 as not sharing CPU cores with the primary / parent VM
 *			 or not using CPU 0, which should remain available for
 *			 the primary / parent VM. Offline the CPUs from the
 *			 pool after the checks passed.
 * @ne_cpu_list:	The CPU list used for setting NE CPU pool.
 *
 * Context: Process context.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
 */
static int ne_setup_cpu_pool(const char *ne_cpu_list)
{
	int core_id = -1;
	unsigned int cpu = 0;
	cpumask_var_t cpu_pool;
	unsigned int cpu_sibling = 0;
	unsigned int i = 0;
	int numa_node = -1;
	int rc = -EINVAL;

	if (!zalloc_cpumask_var(&cpu_pool, GFP_KERNEL))
		return -ENOMEM;

	mutex_lock(&ne_cpu_pool.mutex);

	rc = cpulist_parse(ne_cpu_list, cpu_pool);
	if (rc < 0) {
		pr_err("%s: Error in cpulist parse [rc=%d]\n", ne_misc_dev.name, rc);

		goto free_pool_cpumask;
	}

	cpu = cpumask_any(cpu_pool);
	if (cpu >= nr_cpu_ids) {
		pr_err("%s: No CPUs available in CPU pool\n", ne_misc_dev.name);

		rc = -EINVAL;

		goto free_pool_cpumask;
	}

	/*
	 * Check if the CPUs are online, to further get info about them
	 * e.g. numa node, core id, siblings.
	 */
	for_each_cpu(cpu, cpu_pool)
		if (cpu_is_offline(cpu)) {
			pr_err("%s: CPU %d is offline, has to be online to get its metadata\n",
			       ne_misc_dev.name, cpu);

			rc = -EINVAL;

			goto free_pool_cpumask;
		}

	/*
	 * Check if the CPUs from the NE CPU pool are from the same NUMA node.
	 */
	for_each_cpu(cpu, cpu_pool)
		if (numa_node < 0) {
			numa_node = cpu_to_node(cpu);
			if (numa_node < 0) {
				pr_err("%s: Invalid NUMA node %d\n",
				       ne_misc_dev.name, numa_node);

				rc = -EINVAL;

				goto free_pool_cpumask;
			}
		} else {
			if (numa_node != cpu_to_node(cpu)) {
				pr_err("%s: CPUs with different NUMA nodes\n",
				       ne_misc_dev.name);

				rc = -EINVAL;

				goto free_pool_cpumask;
			}
		}

	/*
	 * Check if CPU 0 and its siblings are included in the provided CPU pool.
	 * They should remain available for the primary / parent VM.
	 */
	if (cpumask_test_cpu(0, cpu_pool)) {
		pr_err("%s: CPU 0 has to remain available\n", ne_misc_dev.name);

		rc = -EINVAL;

		goto free_pool_cpumask;
	}

	for_each_cpu(cpu_sibling, topology_sibling_cpumask(0)) {
		if (cpumask_test_cpu(cpu_sibling, cpu_pool)) {
			pr_err("%s: CPU sibling %d for CPU 0 is in CPU pool\n",
			       ne_misc_dev.name, cpu_sibling);

			rc = -EINVAL;

			goto free_pool_cpumask;
		}
	}

	/*
	 * Check if CPU siblings are included in the provided CPU pool. The
	 * expectation is that full CPU cores are made available in the CPU pool
	 * for enclaves.
	 */
	for_each_cpu(cpu, cpu_pool) {
		for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu)) {
			if (!cpumask_test_cpu(cpu_sibling, cpu_pool)) {
				pr_err("%s: CPU %d is not in CPU pool\n",
				       ne_misc_dev.name, cpu_sibling);

				rc = -EINVAL;

				goto free_pool_cpumask;
			}
		}
	}

	/* Calculate the number of threads from a full CPU core. */
	cpu = cpumask_any(cpu_pool);
	for_each_cpu(cpu_sibling, topology_sibling_cpumask(cpu))
		ne_cpu_pool.nr_threads_per_core++;

	ne_cpu_pool.nr_parent_vm_cores = nr_cpu_ids / ne_cpu_pool.nr_threads_per_core;

	ne_cpu_pool.avail_threads_per_core = kcalloc(ne_cpu_pool.nr_parent_vm_cores,
						     sizeof(*ne_cpu_pool.avail_threads_per_core),
						     GFP_KERNEL);
	if (!ne_cpu_pool.avail_threads_per_core) {
		rc = -ENOMEM;

		goto free_pool_cpumask;
	}

	for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
		if (!zalloc_cpumask_var(&ne_cpu_pool.avail_threads_per_core[i], GFP_KERNEL)) {
			rc = -ENOMEM;

			goto free_cores_cpumask;
		}

	/*
	 * Split the NE CPU pool in threads per core to keep the CPU topology
	 * after offlining the CPUs.
	 */
	for_each_cpu(cpu, cpu_pool) {
		core_id = topology_core_id(cpu);
		if (core_id < 0 || core_id >= ne_cpu_pool.nr_parent_vm_cores) {
			pr_err("%s: Invalid core id %d for CPU %d\n",
			       ne_misc_dev.name, core_id, cpu);

			rc = -EINVAL;

			goto clear_cpumask;
		}

		cpumask_set_cpu(cpu, ne_cpu_pool.avail_threads_per_core[core_id]);
	}

	/*
	 * CPUs that are given to enclave(s) should not be considered online
	 * by Linux anymore, as the hypervisor will degrade them to floating.
	 * The physical CPUs (full cores) are carved out of the primary / parent
	 * VM and given to the enclave VM. The same number of vCPUs would run
	 * on less pCPUs for the primary / parent VM.
	 *
	 * We offline them here, to not degrade performance and expose correct
	 * topology to Linux and user space.
	 */
	for_each_cpu(cpu, cpu_pool) {
		rc = remove_cpu(cpu);
		if (rc != 0) {
			pr_err("%s: CPU %d is not offlined [rc=%d]\n",
			       ne_misc_dev.name, cpu, rc);

			goto online_cpus;
		}
	}

	free_cpumask_var(cpu_pool);

	ne_cpu_pool.numa_node = numa_node;

	mutex_unlock(&ne_cpu_pool.mutex);

	return 0;

online_cpus:
	for_each_cpu(cpu, cpu_pool)
		add_cpu(cpu);
clear_cpumask:
	for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
		cpumask_clear(ne_cpu_pool.avail_threads_per_core[i]);
free_cores_cpumask:
	for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
		free_cpumask_var(ne_cpu_pool.avail_threads_per_core[i]);
	kfree(ne_cpu_pool.avail_threads_per_core);
free_pool_cpumask:
	free_cpumask_var(cpu_pool);
	ne_cpu_pool.nr_parent_vm_cores = 0;
	ne_cpu_pool.nr_threads_per_core = 0;
	ne_cpu_pool.numa_node = -1;
	mutex_unlock(&ne_cpu_pool.mutex);

	return rc;
}
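
/*
 * Worked example with a hypothetical topology: on a parent VM with 2 threads
 * per core and sibling pairs (0,4), (1,5), (2,6), (3,7), the cpu-list
 * "1-3,5-7" passes all the checks above - CPU 0 and its sibling 4 stay with
 * the parent VM and every selected core is complete. The pool is then split
 * per core id, e.g. avail_threads_per_core[1] = {1,5}, and CPUs 1-3 and 5-7
 * are offlined.
 */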

/**
 * ne_teardown_cpu_pool() - Online the CPUs from the NE CPU pool and cleanup the
 *			    CPU pool.
 * @void: No parameters provided.
 *
 * Context: Process context.
 */
static void ne_teardown_cpu_pool(void)
{
	unsigned int cpu = 0;
	unsigned int i = 0;
	int rc = -EINVAL;

	mutex_lock(&ne_cpu_pool.mutex);

	if (!ne_cpu_pool.nr_parent_vm_cores) {
		mutex_unlock(&ne_cpu_pool.mutex);

		return;
	}

	for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++) {
		for_each_cpu(cpu, ne_cpu_pool.avail_threads_per_core[i]) {
			rc = add_cpu(cpu);
			if (rc != 0)
				pr_err("%s: CPU %d is not onlined [rc=%d]\n",
				       ne_misc_dev.name, cpu, rc);
		}

		cpumask_clear(ne_cpu_pool.avail_threads_per_core[i]);

		free_cpumask_var(ne_cpu_pool.avail_threads_per_core[i]);
	}

	kfree(ne_cpu_pool.avail_threads_per_core);
	ne_cpu_pool.nr_parent_vm_cores = 0;
	ne_cpu_pool.nr_threads_per_core = 0;
	ne_cpu_pool.numa_node = -1;

	mutex_unlock(&ne_cpu_pool.mutex);
}

/**
 * ne_set_kernel_param() - Set the NE CPU pool value via the NE kernel parameter.
 * @val:	NE CPU pool string value.
 * @kp:		NE kernel parameter associated with the NE CPU pool.
 *
 * Context: Process context.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
 */
static int ne_set_kernel_param(const char *val, const struct kernel_param *kp)
{
	char error_val[] = "";
	int rc = -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (ne_check_enclaves_created()) {
		pr_err("%s: The CPU pool is used by enclave(s)\n", ne_misc_dev.name);

		return -EPERM;
	}

	ne_teardown_cpu_pool();

	rc = ne_setup_cpu_pool(val);
	if (rc < 0) {
		pr_err("%s: Error in setup CPU pool [rc=%d]\n", ne_misc_dev.name, rc);

		param_set_copystring(error_val, kp);

		return rc;
	}

	rc = param_set_copystring(val, kp);
	if (rc < 0) {
		pr_err("%s: Error in param set copystring [rc=%d]\n", ne_misc_dev.name, rc);

		ne_teardown_cpu_pool();

		param_set_copystring(error_val, kp);

		return rc;
	}

	return 0;
}

/**
 * ne_donated_cpu() - Check if the provided CPU is already used by the enclave.
 * @ne_enclave:	Private data associated with the current enclave.
 * @cpu:	CPU to check if already used.
 *
 * Context: Process context. This function is called with the ne_enclave mutex held.
 * Return:
 * * True if the provided CPU is already used by the enclave.
 * * False otherwise.
 */
static bool ne_donated_cpu(struct ne_enclave *ne_enclave, unsigned int cpu)
{
	if (cpumask_test_cpu(cpu, ne_enclave->vcpu_ids))
		return true;

	return false;
}

/**
 * ne_get_unused_core_from_cpu_pool() - Get the id of a full core from the
 *					NE CPU pool.
 * @void: No parameters provided.
 *
 * Context: Process context. This function is called with the ne_enclave and
 *	    ne_cpu_pool mutexes held.
 * Return:
 * * Core id.
 * * -1 if no CPU core available in the pool.
 */
static int ne_get_unused_core_from_cpu_pool(void)
{
	int core_id = -1;
	unsigned int i = 0;

	for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
		if (!cpumask_empty(ne_cpu_pool.avail_threads_per_core[i])) {
			core_id = i;

			break;
		}

	return core_id;
}

/**
 * ne_set_enclave_threads_per_core() - Set the threads of the provided core in
 *				       the enclave data structure.
 * @ne_enclave:	Private data associated with the current enclave.
 * @core_id:	Core id to get its threads from the NE CPU pool.
 * @vcpu_id:	vCPU id part of the provided core.
 *
 * Context: Process context. This function is called with the ne_enclave and
 *	    ne_cpu_pool mutexes held.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
 */
static int ne_set_enclave_threads_per_core(struct ne_enclave *ne_enclave,
					   int core_id, u32 vcpu_id)
{
	unsigned int cpu = 0;

	if (core_id < 0 && vcpu_id == 0) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "No CPUs available in NE CPU pool\n");

		return -NE_ERR_NO_CPUS_AVAIL_IN_POOL;
	}

	if (core_id < 0) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "CPU %d is not in NE CPU pool\n", vcpu_id);

		return -NE_ERR_VCPU_NOT_IN_CPU_POOL;
	}

	if (core_id >= ne_enclave->nr_parent_vm_cores) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Invalid core id %d - ne_enclave\n", core_id);

		return -NE_ERR_VCPU_INVALID_CPU_CORE;
	}

	for_each_cpu(cpu, ne_cpu_pool.avail_threads_per_core[core_id])
		cpumask_set_cpu(cpu, ne_enclave->threads_per_core[core_id]);

	cpumask_clear(ne_cpu_pool.avail_threads_per_core[core_id]);

	return 0;
}

/**
 * ne_get_cpu_from_cpu_pool() - Get a CPU from the NE CPU pool, either from the
 *				remaining sibling(s) of a CPU core or the first
 *				sibling of a new CPU core.
 * @ne_enclave:	Private data associated with the current enclave.
 * @vcpu_id:	vCPU to get from the NE CPU pool.
 *
 * Context: Process context. This function is called with the ne_enclave mutex held.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
 */
static int ne_get_cpu_from_cpu_pool(struct ne_enclave *ne_enclave, u32 *vcpu_id)
{
	int core_id = -1;
	unsigned int cpu = 0;
	unsigned int i = 0;
	int rc = -EINVAL;

	/*
	 * If previously allocated a thread of a core to this enclave, first
	 * check remaining sibling(s) for new CPU allocations, so that full
	 * CPU cores are used for the enclave.
	 */
	for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
		for_each_cpu(cpu, ne_enclave->threads_per_core[i])
			if (!ne_donated_cpu(ne_enclave, cpu)) {
				*vcpu_id = cpu;

				return 0;
			}

	mutex_lock(&ne_cpu_pool.mutex);

	/*
	 * If no remaining siblings, get a core from the NE CPU pool and keep
	 * track of all the threads in the enclave threads per core data structure.
	 */
	core_id = ne_get_unused_core_from_cpu_pool();

	rc = ne_set_enclave_threads_per_core(ne_enclave, core_id, *vcpu_id);
	if (rc < 0)
		goto unlock_mutex;

	*vcpu_id = cpumask_any(ne_enclave->threads_per_core[core_id]);

	rc = 0;

unlock_mutex:
	mutex_unlock(&ne_cpu_pool.mutex);

	return rc;
}

/**
 * ne_get_vcpu_core_from_cpu_pool() - Get from the NE CPU pool the id of the
 *				      core associated with the provided vCPU.
 * @vcpu_id:	Provided vCPU id to get its associated core id.
 *
 * Context: Process context. This function is called with the ne_enclave and
 *	    ne_cpu_pool mutexes held.
 * Return:
 * * Core id.
 * * -1 if the provided vCPU is not in the pool.
 */
static int ne_get_vcpu_core_from_cpu_pool(u32 vcpu_id)
{
	int core_id = -1;
	unsigned int i = 0;

	for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
		if (cpumask_test_cpu(vcpu_id, ne_cpu_pool.avail_threads_per_core[i])) {
			core_id = i;

			break;
		}

	return core_id;
}

/**
 * ne_check_cpu_in_cpu_pool() - Check if the given vCPU is in the available CPUs
 *				from the pool.
 * @ne_enclave:	Private data associated with the current enclave.
 * @vcpu_id:	ID of the vCPU to check if available in the NE CPU pool.
 *
 * Context: Process context. This function is called with the ne_enclave mutex held.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
 */
static int ne_check_cpu_in_cpu_pool(struct ne_enclave *ne_enclave, u32 vcpu_id)
{
	int core_id = -1;
	unsigned int i = 0;
	int rc = -EINVAL;

	if (ne_donated_cpu(ne_enclave, vcpu_id)) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "CPU %d already used\n", vcpu_id);

		return -NE_ERR_VCPU_ALREADY_USED;
	}

	/*
	 * If previously allocated a thread of a core to this enclave, but not
	 * the full core, first check remaining sibling(s).
	 */
	for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
		if (cpumask_test_cpu(vcpu_id, ne_enclave->threads_per_core[i]))
			return 0;

	mutex_lock(&ne_cpu_pool.mutex);

	/*
	 * If no remaining siblings, get from the NE CPU pool the core
	 * associated with the vCPU and keep track of all the threads in the
	 * enclave threads per core data structure.
	 */
	core_id = ne_get_vcpu_core_from_cpu_pool(vcpu_id);

	rc = ne_set_enclave_threads_per_core(ne_enclave, core_id, vcpu_id);
	if (rc < 0)
		goto unlock_mutex;

	rc = 0;

unlock_mutex:
	mutex_unlock(&ne_cpu_pool.mutex);

	return rc;
}

/**
 * ne_add_vcpu_ioctl() - Add a vCPU to the slot associated with the current
 *			 enclave.
 * @ne_enclave:	Private data associated with the current enclave.
 * @vcpu_id:	ID of the CPU to be associated with the given slot,
 *		apic id on x86.
 *
 * Context: Process context. This function is called with the ne_enclave mutex held.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
 */
static int ne_add_vcpu_ioctl(struct ne_enclave *ne_enclave, u32 vcpu_id)
{
	struct ne_pci_dev_cmd_reply cmd_reply = {};
	struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
	int rc = -EINVAL;
	struct slot_add_vcpu_req slot_add_vcpu_req = {};

	if (ne_enclave->mm != current->mm)
		return -EIO;

	slot_add_vcpu_req.slot_uid = ne_enclave->slot_uid;
	slot_add_vcpu_req.vcpu_id = vcpu_id;

	rc = ne_do_request(pdev, SLOT_ADD_VCPU,
			   &slot_add_vcpu_req, sizeof(slot_add_vcpu_req),
			   &cmd_reply, sizeof(cmd_reply));
	if (rc < 0) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Error in slot add vCPU [rc=%d]\n", rc);

		return rc;
	}

	cpumask_set_cpu(vcpu_id, ne_enclave->vcpu_ids);

	ne_enclave->nr_vcpus++;

	return 0;
}
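
/*
 * Illustrative user space sketch, assuming an enclave fd obtained via
 * NE_CREATE_VM and the <linux/nitro_enclaves.h> uapi header: a vCPU id of 0
 * asks the driver to auto-select a CPU from the NE CPU pool; the chosen id
 * is written back to user space on success.
 *
 *	__u32 vcpu_id = 0;
 *
 *	if (ioctl(enclave_fd, NE_ADD_VCPU, &vcpu_id) < 0)
 *		exit(EXIT_FAILURE);
 */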

/**
 * ne_sanity_check_user_mem_region() - Sanity check the user space memory
 *				       region received during the set user
 *				       memory region ioctl call.
 * @ne_enclave:	Private data associated with the current enclave.
 * @mem_region:	User space memory region to be sanity checked.
 *
 * Context: Process context. This function is called with the ne_enclave mutex held.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
 */
static int ne_sanity_check_user_mem_region(struct ne_enclave *ne_enclave,
					   struct ne_user_memory_region mem_region)
{
	struct ne_mem_region *ne_mem_region = NULL;

	if (ne_enclave->mm != current->mm)
		return -EIO;

	if (mem_region.memory_size & (NE_MIN_MEM_REGION_SIZE - 1)) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "User space memory size is not multiple of 2 MiB\n");

		return -NE_ERR_INVALID_MEM_REGION_SIZE;
	}

	if (!IS_ALIGNED(mem_region.userspace_addr, NE_MIN_MEM_REGION_SIZE)) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "User space address is not 2 MiB aligned\n");

		return -NE_ERR_UNALIGNED_MEM_REGION_ADDR;
	}

	if ((mem_region.userspace_addr & (NE_MIN_MEM_REGION_SIZE - 1)) ||
	    !access_ok((void __user *)(unsigned long)mem_region.userspace_addr,
		       mem_region.memory_size)) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Invalid user space address range\n");

		return -NE_ERR_INVALID_MEM_REGION_ADDR;
	}

	list_for_each_entry(ne_mem_region, &ne_enclave->mem_regions_list,
			    mem_region_list_entry) {
		u64 memory_size = ne_mem_region->memory_size;
		u64 userspace_addr = ne_mem_region->userspace_addr;

		if ((userspace_addr <= mem_region.userspace_addr &&
		     mem_region.userspace_addr < (userspace_addr + memory_size)) ||
		    (mem_region.userspace_addr <= userspace_addr &&
		     (mem_region.userspace_addr + mem_region.memory_size) > userspace_addr)) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "User space memory region already used\n");

			return -NE_ERR_MEM_REGION_ALREADY_USED;
		}
	}

	return 0;
}
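
/*
 * Worked example with hypothetical addresses: given an existing region with
 * userspace_addr 0x7f0000000000 and memory_size 4 MiB, a new region starting
 * at 0x7f0000200000 is rejected by the overlap check above, while one
 * starting at 0x7f0000400000 (the first byte past the existing range) is
 * accepted.
 */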

/**
 * ne_sanity_check_user_mem_region_page() - Sanity check a page from the user space
 *					    memory region received during the set
 *					    user memory region ioctl call.
 * @ne_enclave:		Private data associated with the current enclave.
 * @mem_region_page:	Page from the user space memory region to be sanity checked.
 *
 * Context: Process context. This function is called with the ne_enclave mutex held.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
 */
static int ne_sanity_check_user_mem_region_page(struct ne_enclave *ne_enclave,
						struct page *mem_region_page)
{
	if (!PageHuge(mem_region_page)) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Not a hugetlbfs page\n");

		return -NE_ERR_MEM_NOT_HUGE_PAGE;
	}

	if (page_size(mem_region_page) & (NE_MIN_MEM_REGION_SIZE - 1)) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Page size not multiple of 2 MiB\n");

		return -NE_ERR_INVALID_PAGE_SIZE;
	}

	if (ne_enclave->numa_node != page_to_nid(mem_region_page)) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Page is not from NUMA node %d\n",
				    ne_enclave->numa_node);

		return -NE_ERR_MEM_DIFFERENT_NUMA_NODE;
	}

	return 0;
}

/**
 * ne_sanity_check_phys_mem_region() - Sanity check the start address and the size
 *				       of a physical memory region.
 * @phys_mem_region_paddr:	Physical start address of the region to be sanity checked.
 * @phys_mem_region_size:	Length of the region to be sanity checked.
 *
 * Context: Process context. This function is called with the ne_enclave mutex held.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
 */
static int ne_sanity_check_phys_mem_region(u64 phys_mem_region_paddr,
					   u64 phys_mem_region_size)
{
	if (phys_mem_region_size & (NE_MIN_MEM_REGION_SIZE - 1)) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Physical mem region size is not multiple of 2 MiB\n");

		return -EINVAL;
	}

	if (!IS_ALIGNED(phys_mem_region_paddr, NE_MIN_MEM_REGION_SIZE)) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Physical mem region address is not 2 MiB aligned\n");

		return -EINVAL;
	}

	return 0;
}

/**
 * ne_merge_phys_contig_memory_regions() - Add a memory region and merge the adjacent
 *					   regions if they are physically contiguous.
 * @phys_contig_regions:	Private data associated with the contiguous physical memory regions.
 * @page_paddr:			Physical start address of the region to be added.
 * @page_size:			Length of the region to be added.
 *
 * Context: Process context. This function is called with the ne_enclave mutex held.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
 */
static int
ne_merge_phys_contig_memory_regions(struct ne_phys_contig_mem_regions *phys_contig_regions,
				    u64 page_paddr, u64 page_size)
{
	unsigned long num = phys_contig_regions->num;
	int rc = 0;

	rc = ne_sanity_check_phys_mem_region(page_paddr, page_size);
	if (rc < 0)
		return rc;

	/* Physically contiguous, just merge */
	if (num && (phys_contig_regions->regions[num - 1].end + 1) == page_paddr) {
		phys_contig_regions->regions[num - 1].end += page_size;
	} else {
		phys_contig_regions->regions[num].start = page_paddr;
		phys_contig_regions->regions[num].end = page_paddr + page_size - 1;
		phys_contig_regions->num++;
	}

	return 0;
}
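
/*
 * Worked example with hypothetical addresses: adding a 2 MiB page at
 * physical address 0x200000 and then another at 0x400000 yields a single
 * region {.start = 0x200000, .end = 0x5fffff}, since 0x3fffff + 1 ==
 * 0x400000; a page at 0x800000 would instead start a new entry in the array.
 */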

/**
 * ne_set_user_memory_region_ioctl() - Add user space memory region to the slot
 *				       associated with the current enclave.
 * @ne_enclave:	Private data associated with the current enclave.
 * @mem_region:	User space memory region to be associated with the given slot.
 *
 * Context: Process context. This function is called with the ne_enclave mutex held.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
 */
static int ne_set_user_memory_region_ioctl(struct ne_enclave *ne_enclave,
					   struct ne_user_memory_region mem_region)
{
	long gup_rc = 0;
	unsigned long i = 0;
	unsigned long max_nr_pages = 0;
	unsigned long memory_size = 0;
	struct ne_mem_region *ne_mem_region = NULL;
	struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
	struct ne_phys_contig_mem_regions phys_contig_mem_regions = {};
	int rc = -EINVAL;

	rc = ne_sanity_check_user_mem_region(ne_enclave, mem_region);
	if (rc < 0)
		return rc;

	ne_mem_region = kzalloc(sizeof(*ne_mem_region), GFP_KERNEL);
	if (!ne_mem_region)
		return -ENOMEM;

	max_nr_pages = mem_region.memory_size / NE_MIN_MEM_REGION_SIZE;

	ne_mem_region->pages = kcalloc(max_nr_pages, sizeof(*ne_mem_region->pages),
				       GFP_KERNEL);
	if (!ne_mem_region->pages) {
		rc = -ENOMEM;

		goto free_mem_region;
	}

	phys_contig_mem_regions.regions = kcalloc(max_nr_pages,
						  sizeof(*phys_contig_mem_regions.regions),
						  GFP_KERNEL);
	if (!phys_contig_mem_regions.regions) {
		rc = -ENOMEM;

		goto free_mem_region;
	}

	do {
		i = ne_mem_region->nr_pages;

		if (i == max_nr_pages) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Reached max nr of pages in the pages data struct\n");

			rc = -ENOMEM;

			goto put_pages;
		}

		gup_rc = get_user_pages_unlocked(mem_region.userspace_addr + memory_size, 1,
						 ne_mem_region->pages + i, FOLL_GET);

		if (gup_rc < 0) {
			rc = gup_rc;

			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Error in get user pages [rc=%d]\n", rc);

			goto put_pages;
		}

		rc = ne_sanity_check_user_mem_region_page(ne_enclave, ne_mem_region->pages[i]);
		if (rc < 0)
			goto put_pages;

		rc = ne_merge_phys_contig_memory_regions(&phys_contig_mem_regions,
							 page_to_phys(ne_mem_region->pages[i]),
							 page_size(ne_mem_region->pages[i]));
		if (rc < 0)
			goto put_pages;

		memory_size += page_size(ne_mem_region->pages[i]);

		ne_mem_region->nr_pages++;
	} while (memory_size < mem_region.memory_size);

	if ((ne_enclave->nr_mem_regions + phys_contig_mem_regions.num) >
	    ne_enclave->max_mem_regions) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Reached max memory regions %lld\n",
				    ne_enclave->max_mem_regions);

		rc = -NE_ERR_MEM_MAX_REGIONS;

		goto put_pages;
	}

	for (i = 0; i < phys_contig_mem_regions.num; i++) {
		u64 phys_region_addr = phys_contig_mem_regions.regions[i].start;
		u64 phys_region_size = range_len(&phys_contig_mem_regions.regions[i]);

		rc = ne_sanity_check_phys_mem_region(phys_region_addr, phys_region_size);
		if (rc < 0)
			goto put_pages;
	}

	ne_mem_region->memory_size = mem_region.memory_size;
	ne_mem_region->userspace_addr = mem_region.userspace_addr;

	list_add(&ne_mem_region->mem_region_list_entry, &ne_enclave->mem_regions_list);

	for (i = 0; i < phys_contig_mem_regions.num; i++) {
		struct ne_pci_dev_cmd_reply cmd_reply = {};
		struct slot_add_mem_req slot_add_mem_req = {};

		slot_add_mem_req.slot_uid = ne_enclave->slot_uid;
		slot_add_mem_req.paddr = phys_contig_mem_regions.regions[i].start;
		slot_add_mem_req.size = range_len(&phys_contig_mem_regions.regions[i]);

		rc = ne_do_request(pdev, SLOT_ADD_MEM,
				   &slot_add_mem_req, sizeof(slot_add_mem_req),
				   &cmd_reply, sizeof(cmd_reply));
		if (rc < 0) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Error in slot add mem [rc=%d]\n", rc);

			kfree(phys_contig_mem_regions.regions);

			/*
			 * Exit here without put pages as memory regions may
			 * have already been added.
			 */
			return rc;
		}

		ne_enclave->mem_size += slot_add_mem_req.size;
		ne_enclave->nr_mem_regions++;
	}

	kfree(phys_contig_mem_regions.regions);

	return 0;

put_pages:
	for (i = 0; i < ne_mem_region->nr_pages; i++)
		put_page(ne_mem_region->pages[i]);
free_mem_region:
	kfree(phys_contig_mem_regions.regions);
	kfree(ne_mem_region->pages);
	kfree(ne_mem_region);

	return rc;
}
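
/*
 * Illustrative user space sketch, assuming an enclave fd obtained via
 * NE_CREATE_VM and the <linux/nitro_enclaves.h> uapi header: enclave memory
 * has to come from huge pages, e.g. a MAP_HUGETLB mapping, handed over to
 * the driver one region at a time:
 *
 *	struct ne_user_memory_region mem_region = {
 *		.flags		= NE_DEFAULT_MEMORY_REGION,
 *		.memory_size	= 2 * 1024 * 1024,
 *	};
 *
 *	mem_region.userspace_addr = (__u64)mmap(NULL, mem_region.memory_size,
 *						PROT_READ | PROT_WRITE,
 *						MAP_PRIVATE | MAP_ANONYMOUS |
 *						MAP_HUGETLB, -1, 0);
 *
 *	if (ioctl(enclave_fd, NE_SET_USER_MEMORY_REGION, &mem_region) < 0)
 *		exit(EXIT_FAILURE);
 */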

/**
 * ne_start_enclave_ioctl() - Trigger enclave start after the enclave resources,
 *			      such as memory and CPU, have been set.
 * @ne_enclave:		Private data associated with the current enclave.
 * @enclave_start_info:	Enclave info that includes enclave cid and flags.
 *
 * Context: Process context. This function is called with the ne_enclave mutex held.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
 */
static int ne_start_enclave_ioctl(struct ne_enclave *ne_enclave,
				  struct ne_enclave_start_info *enclave_start_info)
{
	struct ne_pci_dev_cmd_reply cmd_reply = {};
	unsigned int cpu = 0;
	struct enclave_start_req enclave_start_req = {};
	unsigned int i = 0;
	struct pci_dev *pdev = ne_devs.ne_pci_dev->pdev;
	int rc = -EINVAL;

	if (!ne_enclave->nr_mem_regions) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Enclave has no mem regions\n");

		return -NE_ERR_NO_MEM_REGIONS_ADDED;
	}

	if (ne_enclave->mem_size < NE_MIN_ENCLAVE_MEM_SIZE) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Enclave memory is less than %ld\n",
				    NE_MIN_ENCLAVE_MEM_SIZE);

		return -NE_ERR_ENCLAVE_MEM_MIN_SIZE;
	}

	if (!ne_enclave->nr_vcpus) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Enclave has no vCPUs\n");

		return -NE_ERR_NO_VCPUS_ADDED;
	}

	for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
		for_each_cpu(cpu, ne_enclave->threads_per_core[i])
			if (!cpumask_test_cpu(cpu, ne_enclave->vcpu_ids)) {
				dev_err_ratelimited(ne_misc_dev.this_device,
						    "Full CPU cores not used\n");

				return -NE_ERR_FULL_CORES_NOT_USED;
			}

	enclave_start_req.enclave_cid = enclave_start_info->enclave_cid;
	enclave_start_req.flags = enclave_start_info->flags;
	enclave_start_req.slot_uid = ne_enclave->slot_uid;

	rc = ne_do_request(pdev, ENCLAVE_START,
			   &enclave_start_req, sizeof(enclave_start_req),
			   &cmd_reply, sizeof(cmd_reply));
	if (rc < 0) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Error in enclave start [rc=%d]\n", rc);

		return rc;
	}

	ne_enclave->state = NE_STATE_RUNNING;

	enclave_start_info->enclave_cid = cmd_reply.enclave_cid;

	return 0;
}
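
/*
 * Illustrative end-to-end user space flow, a sketch assuming the
 * <linux/nitro_enclaves.h> uapi header, with error handling elided: create
 * the enclave slot, add full CPU core(s) and memory, then start the enclave.
 *
 *	int ne_dev_fd = open("/dev/nitro_enclaves", O_RDWR | O_CLOEXEC);
 *	__u64 slot_uid = 0;
 *	int enclave_fd = ioctl(ne_dev_fd, NE_CREATE_VM, &slot_uid);
 *
 *	__u32 vcpu_id = 0;
 *	ioctl(enclave_fd, NE_ADD_VCPU, &vcpu_id);
 *
 *	// NE_GET_IMAGE_LOAD_INFO provides the offset at which to copy the
 *	// EIF image into the enclave memory, then each hugetlbfs region is
 *	// handed over via NE_SET_USER_MEMORY_REGION.
 *
 *	struct ne_enclave_start_info start_info = { .enclave_cid = 0 };
 *	ioctl(enclave_fd, NE_START_ENCLAVE, &start_info);
 */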

/**
 * ne_enclave_ioctl() - Ioctl function provided by the enclave file.
 * @file:	File associated with this ioctl function.
 * @cmd:	The command that is set for the ioctl call.
 * @arg:	The argument that is provided for the ioctl call.
 *
 * Context: Process context.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
 */
static long ne_enclave_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct ne_enclave *ne_enclave = file->private_data;

	switch (cmd) {
	case NE_ADD_VCPU: {
		int rc = -EINVAL;
		u32 vcpu_id = 0;

		if (copy_from_user(&vcpu_id, (void __user *)arg, sizeof(vcpu_id)))
			return -EFAULT;

		mutex_lock(&ne_enclave->enclave_info_mutex);

		if (ne_enclave->state != NE_STATE_INIT) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Enclave is not in init state\n");

			mutex_unlock(&ne_enclave->enclave_info_mutex);

			return -NE_ERR_NOT_IN_INIT_STATE;
		}

		if (vcpu_id >= (ne_enclave->nr_parent_vm_cores *
				ne_enclave->nr_threads_per_core)) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "vCPU id higher than max CPU id\n");

			mutex_unlock(&ne_enclave->enclave_info_mutex);

			return -NE_ERR_INVALID_VCPU;
		}

		if (!vcpu_id) {
			/* Use the CPU pool for choosing a CPU for the enclave. */
			rc = ne_get_cpu_from_cpu_pool(ne_enclave, &vcpu_id);
			if (rc < 0) {
				dev_err_ratelimited(ne_misc_dev.this_device,
						    "Error in get CPU from pool [rc=%d]\n",
						    rc);

				mutex_unlock(&ne_enclave->enclave_info_mutex);

				return rc;
			}
		} else {
			/* Check if the provided vCPU is available in the NE CPU pool. */
			rc = ne_check_cpu_in_cpu_pool(ne_enclave, vcpu_id);
			if (rc < 0) {
				dev_err_ratelimited(ne_misc_dev.this_device,
						    "Error in check CPU %d in pool [rc=%d]\n",
						    vcpu_id, rc);

				mutex_unlock(&ne_enclave->enclave_info_mutex);

				return rc;
			}
		}

		rc = ne_add_vcpu_ioctl(ne_enclave, vcpu_id);
		if (rc < 0) {
			mutex_unlock(&ne_enclave->enclave_info_mutex);

			return rc;
		}

		mutex_unlock(&ne_enclave->enclave_info_mutex);

		if (copy_to_user((void __user *)arg, &vcpu_id, sizeof(vcpu_id)))
			return -EFAULT;

		return 0;
	}

	case NE_GET_IMAGE_LOAD_INFO: {
		struct ne_image_load_info image_load_info = {};

		if (copy_from_user(&image_load_info, (void __user *)arg, sizeof(image_load_info)))
			return -EFAULT;

		mutex_lock(&ne_enclave->enclave_info_mutex);

		if (ne_enclave->state != NE_STATE_INIT) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Enclave is not in init state\n");

			mutex_unlock(&ne_enclave->enclave_info_mutex);

			return -NE_ERR_NOT_IN_INIT_STATE;
		}

		mutex_unlock(&ne_enclave->enclave_info_mutex);

		if (!image_load_info.flags ||
		    image_load_info.flags >= NE_IMAGE_LOAD_MAX_FLAG_VAL) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Incorrect flag in enclave image load info\n");

			return -NE_ERR_INVALID_FLAG_VALUE;
		}

		if (image_load_info.flags == NE_EIF_IMAGE)
			image_load_info.memory_offset = NE_EIF_LOAD_OFFSET;

		if (copy_to_user((void __user *)arg, &image_load_info, sizeof(image_load_info)))
			return -EFAULT;

		return 0;
	}

	case NE_SET_USER_MEMORY_REGION: {
		struct ne_user_memory_region mem_region = {};
		int rc = -EINVAL;

		if (copy_from_user(&mem_region, (void __user *)arg, sizeof(mem_region)))
			return -EFAULT;

		if (mem_region.flags >= NE_MEMORY_REGION_MAX_FLAG_VAL) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Incorrect flag for user memory region\n");

			return -NE_ERR_INVALID_FLAG_VALUE;
		}

		mutex_lock(&ne_enclave->enclave_info_mutex);

		if (ne_enclave->state != NE_STATE_INIT) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Enclave is not in init state\n");

			mutex_unlock(&ne_enclave->enclave_info_mutex);

			return -NE_ERR_NOT_IN_INIT_STATE;
		}

		rc = ne_set_user_memory_region_ioctl(ne_enclave, mem_region);
		if (rc < 0) {
			mutex_unlock(&ne_enclave->enclave_info_mutex);

			return rc;
		}

		mutex_unlock(&ne_enclave->enclave_info_mutex);

		return 0;
	}

	case NE_START_ENCLAVE: {
		struct ne_enclave_start_info enclave_start_info = {};
		int rc = -EINVAL;

		if (copy_from_user(&enclave_start_info, (void __user *)arg,
				   sizeof(enclave_start_info)))
			return -EFAULT;

		if (enclave_start_info.flags >= NE_ENCLAVE_START_MAX_FLAG_VAL) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Incorrect flag in enclave start info\n");

			return -NE_ERR_INVALID_FLAG_VALUE;
		}

		/*
		 * Do not use well-known CIDs - 0, 1, 2 - for enclaves.
		 * VMADDR_CID_ANY = -1U
		 * VMADDR_CID_HYPERVISOR = 0
		 * VMADDR_CID_LOCAL = 1
		 * VMADDR_CID_HOST = 2
		 * Note: 0 is used as a placeholder to auto-generate an enclave CID.
		 * http://man7.org/linux/man-pages/man7/vsock.7.html
		 */
		if (enclave_start_info.enclave_cid > 0 &&
		    enclave_start_info.enclave_cid <= VMADDR_CID_HOST) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Well-known CID value, not to be used for enclaves\n");

			return -NE_ERR_INVALID_ENCLAVE_CID;
		}

		if (enclave_start_info.enclave_cid == U32_MAX) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Well-known CID value, not to be used for enclaves\n");

			return -NE_ERR_INVALID_ENCLAVE_CID;
		}

		/*
		 * Do not use the CID of the primary / parent VM for enclaves.
		 */
		if (enclave_start_info.enclave_cid == NE_PARENT_VM_CID) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "CID of the parent VM, not to be used for enclaves\n");

			return -NE_ERR_INVALID_ENCLAVE_CID;
		}

		/* 64-bit CIDs are not yet supported for the vsock device. */
		if (enclave_start_info.enclave_cid > U32_MAX) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "64-bit CIDs not yet supported for the vsock device\n");

			return -NE_ERR_INVALID_ENCLAVE_CID;
		}

		mutex_lock(&ne_enclave->enclave_info_mutex);

		if (ne_enclave->state != NE_STATE_INIT) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Enclave is not in init state\n");

			mutex_unlock(&ne_enclave->enclave_info_mutex);

			return -NE_ERR_NOT_IN_INIT_STATE;
		}

		rc = ne_start_enclave_ioctl(ne_enclave, &enclave_start_info);
		if (rc < 0) {
			mutex_unlock(&ne_enclave->enclave_info_mutex);

			return rc;
		}

		mutex_unlock(&ne_enclave->enclave_info_mutex);

		if (copy_to_user((void __user *)arg, &enclave_start_info,
				 sizeof(enclave_start_info)))
			return -EFAULT;

		return 0;
	}

	default:
		return -ENOTTY;
	}

	return 0;
}

/**
 * ne_enclave_remove_all_mem_region_entries() - Remove all memory region entries
 *						from the enclave data structure.
 * @ne_enclave:	Private data associated with the current enclave.
 *
 * Context: Process context. This function is called with the ne_enclave mutex held.
 */
static void ne_enclave_remove_all_mem_region_entries(struct ne_enclave *ne_enclave)
{
	unsigned long i = 0;
	struct ne_mem_region *ne_mem_region = NULL;
	struct ne_mem_region *ne_mem_region_tmp = NULL;

	list_for_each_entry_safe(ne_mem_region, ne_mem_region_tmp,
				 &ne_enclave->mem_regions_list,
				 mem_region_list_entry) {
		list_del(&ne_mem_region->mem_region_list_entry);

		for (i = 0; i < ne_mem_region->nr_pages; i++)
			put_page(ne_mem_region->pages[i]);

		kfree(ne_mem_region->pages);

		kfree(ne_mem_region);
	}
}

/**
 * ne_enclave_remove_all_vcpu_id_entries() - Remove all vCPU id entries from
 *					     the enclave data structure.
 * @ne_enclave:	Private data associated with the current enclave.
 *
 * Context: Process context. This function is called with the ne_enclave mutex held.
 */
static void ne_enclave_remove_all_vcpu_id_entries(struct ne_enclave *ne_enclave)
{
	unsigned int cpu = 0;
	unsigned int i = 0;

	mutex_lock(&ne_cpu_pool.mutex);

	for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++) {
		for_each_cpu(cpu, ne_enclave->threads_per_core[i])
			/* Update the available NE CPU pool. */
			cpumask_set_cpu(cpu, ne_cpu_pool.avail_threads_per_core[i]);

		free_cpumask_var(ne_enclave->threads_per_core[i]);
	}

	mutex_unlock(&ne_cpu_pool.mutex);

	kfree(ne_enclave->threads_per_core);

	free_cpumask_var(ne_enclave->vcpu_ids);
}

/**
 * ne_pci_dev_remove_enclave_entry() - Remove the enclave entry from the data
 *				       structure that is part of the NE PCI
 *				       device private data.
 * @ne_enclave:	Private data associated with the current enclave.
 * @ne_pci_dev:	Private data associated with the PCI device.
 *
 * Context: Process context. This function is called with the ne_pci_dev enclave
 *	    mutex held.
 */
static void ne_pci_dev_remove_enclave_entry(struct ne_enclave *ne_enclave,
					    struct ne_pci_dev *ne_pci_dev)
{
	struct ne_enclave *ne_enclave_entry = NULL;
	struct ne_enclave *ne_enclave_entry_tmp = NULL;

	list_for_each_entry_safe(ne_enclave_entry, ne_enclave_entry_tmp,
				 &ne_pci_dev->enclaves_list, enclave_list_entry) {
		if (ne_enclave_entry->slot_uid == ne_enclave->slot_uid) {
			list_del(&ne_enclave_entry->enclave_list_entry);

			break;
		}
	}
}

/**
 * ne_enclave_release() - Release function provided by the enclave file.
 * @inode:	Inode associated with this file release function.
 * @file:	File associated with this release function.
 *
 * Context: Process context.
 * Return:
 * * 0 on success.
 * * Negative return value on failure.
 */
static int ne_enclave_release(struct inode *inode, struct file *file)
{
	struct ne_pci_dev_cmd_reply cmd_reply = {};
	struct enclave_stop_req enclave_stop_request = {};
	struct ne_enclave *ne_enclave = file->private_data;
	struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
	struct pci_dev *pdev = ne_pci_dev->pdev;
	int rc = -EINVAL;
	struct slot_free_req slot_free_req = {};

	if (!ne_enclave)
		return 0;

	/*
	 * Early exit in case there is an error in the enclave creation logic
	 * and fput() is called on the cleanup path.
	 */
	if (!ne_enclave->slot_uid)
		return 0;

	/*
	 * Acquire the enclave list mutex before the enclave mutex
	 * in order to avoid deadlocks with @ref ne_event_work_handler.
	 */
	mutex_lock(&ne_pci_dev->enclaves_list_mutex);
	mutex_lock(&ne_enclave->enclave_info_mutex);

	if (ne_enclave->state != NE_STATE_INIT && ne_enclave->state != NE_STATE_STOPPED) {
		enclave_stop_request.slot_uid = ne_enclave->slot_uid;

		rc = ne_do_request(pdev, ENCLAVE_STOP,
				   &enclave_stop_request, sizeof(enclave_stop_request),
				   &cmd_reply, sizeof(cmd_reply));
		if (rc < 0) {
			dev_err_ratelimited(ne_misc_dev.this_device,
					    "Error in enclave stop [rc=%d]\n", rc);

			goto unlock_mutex;
		}

		memset(&cmd_reply, 0, sizeof(cmd_reply));
	}

	slot_free_req.slot_uid = ne_enclave->slot_uid;

	rc = ne_do_request(pdev, SLOT_FREE,
			   &slot_free_req, sizeof(slot_free_req),
			   &cmd_reply, sizeof(cmd_reply));
	if (rc < 0) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Error in slot free [rc=%d]\n", rc);

		goto unlock_mutex;
	}

	ne_pci_dev_remove_enclave_entry(ne_enclave, ne_pci_dev);
	ne_enclave_remove_all_mem_region_entries(ne_enclave);
	ne_enclave_remove_all_vcpu_id_entries(ne_enclave);

	mutex_unlock(&ne_enclave->enclave_info_mutex);
	mutex_unlock(&ne_pci_dev->enclaves_list_mutex);

	kfree(ne_enclave);

	return 0;

unlock_mutex:
	mutex_unlock(&ne_enclave->enclave_info_mutex);
	mutex_unlock(&ne_pci_dev->enclaves_list_mutex);

	return rc;
}

/**
 * ne_enclave_poll() - Poll functionality used for enclave out-of-band events.
 * @file:	File associated with this poll function.
 * @wait:	Poll table data structure.
 *
 * Context: Process context.
 * Return:
 * * Poll mask.
 */
static __poll_t ne_enclave_poll(struct file *file, poll_table *wait)
{
	__poll_t mask = 0;
	struct ne_enclave *ne_enclave = file->private_data;

	poll_wait(file, &ne_enclave->eventq, wait);

	if (ne_enclave->has_event)
		mask |= EPOLLHUP;

	return mask;
}
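
/*
 * Illustrative user space sketch, assuming an enclave fd obtained via
 * NE_CREATE_VM: an out-of-band enclave exit is observed as (E)POLLHUP on
 * the enclave fd:
 *
 *	struct pollfd pfd = { .fd = enclave_fd, .events = 0 };
 *
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLHUP))
 *		printf("Enclave exited\n");
 */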

static const struct file_operations ne_enclave_fops = {
	.owner		= THIS_MODULE,
	.llseek		= noop_llseek,
	.poll		= ne_enclave_poll,
	.unlocked_ioctl	= ne_enclave_ioctl,
	.release	= ne_enclave_release,
};

/**
 * ne_create_vm_ioctl() - Alloc slot to be associated with an enclave. Create
 *			  enclave file descriptor to be further used for enclave
 *			  resources handling e.g. memory regions and CPUs.
 * @ne_pci_dev:	Private data associated with the PCI device.
 * @slot_uid:	User pointer to store the generated unique slot id
 *		associated with an enclave to.
 *
 * Context: Process context. This function is called with the ne_pci_dev enclave
 *	    mutex held.
 * Return:
 * * Enclave fd on success.
 * * Negative return value on failure.
 */
static int ne_create_vm_ioctl(struct ne_pci_dev *ne_pci_dev, u64 __user *slot_uid)
{
	struct ne_pci_dev_cmd_reply cmd_reply = {};
	int enclave_fd = -1;
	struct file *enclave_file = NULL;
	unsigned int i = 0;
	struct ne_enclave *ne_enclave = NULL;
	struct pci_dev *pdev = ne_pci_dev->pdev;
	int rc = -EINVAL;
	struct slot_alloc_req slot_alloc_req = {};

	mutex_lock(&ne_cpu_pool.mutex);

	for (i = 0; i < ne_cpu_pool.nr_parent_vm_cores; i++)
		if (!cpumask_empty(ne_cpu_pool.avail_threads_per_core[i]))
			break;

	if (i == ne_cpu_pool.nr_parent_vm_cores) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "No CPUs available in CPU pool\n");

		mutex_unlock(&ne_cpu_pool.mutex);

		return -NE_ERR_NO_CPUS_AVAIL_IN_POOL;
	}

	mutex_unlock(&ne_cpu_pool.mutex);

	ne_enclave = kzalloc(sizeof(*ne_enclave), GFP_KERNEL);
	if (!ne_enclave)
		return -ENOMEM;

	mutex_lock(&ne_cpu_pool.mutex);

	ne_enclave->nr_parent_vm_cores = ne_cpu_pool.nr_parent_vm_cores;
	ne_enclave->nr_threads_per_core = ne_cpu_pool.nr_threads_per_core;
	ne_enclave->numa_node = ne_cpu_pool.numa_node;

	mutex_unlock(&ne_cpu_pool.mutex);

	ne_enclave->threads_per_core = kcalloc(ne_enclave->nr_parent_vm_cores,
					       sizeof(*ne_enclave->threads_per_core),
					       GFP_KERNEL);
	if (!ne_enclave->threads_per_core) {
		rc = -ENOMEM;

		goto free_ne_enclave;
	}

	for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
		if (!zalloc_cpumask_var(&ne_enclave->threads_per_core[i], GFP_KERNEL)) {
			rc = -ENOMEM;

			goto free_cpumask;
		}

	if (!zalloc_cpumask_var(&ne_enclave->vcpu_ids, GFP_KERNEL)) {
		rc = -ENOMEM;

		goto free_cpumask;
	}

	enclave_fd = get_unused_fd_flags(O_CLOEXEC);
	if (enclave_fd < 0) {
		rc = enclave_fd;

		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Error in getting unused fd [rc=%d]\n", rc);

		goto free_cpumask;
	}

	enclave_file = anon_inode_getfile("ne-vm", &ne_enclave_fops, ne_enclave, O_RDWR);
	if (IS_ERR(enclave_file)) {
		rc = PTR_ERR(enclave_file);

		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Error in anon inode get file [rc=%d]\n", rc);

		goto put_fd;
	}

	rc = ne_do_request(pdev, SLOT_ALLOC,
			   &slot_alloc_req, sizeof(slot_alloc_req),
			   &cmd_reply, sizeof(cmd_reply));
	if (rc < 0) {
		dev_err_ratelimited(ne_misc_dev.this_device,
				    "Error in slot alloc [rc=%d]\n", rc);

		goto put_file;
	}

	init_waitqueue_head(&ne_enclave->eventq);
	ne_enclave->has_event = false;
	mutex_init(&ne_enclave->enclave_info_mutex);
	ne_enclave->max_mem_regions = cmd_reply.mem_regions;
	INIT_LIST_HEAD(&ne_enclave->mem_regions_list);
	ne_enclave->mm = current->mm;
	ne_enclave->slot_uid = cmd_reply.slot_uid;
	ne_enclave->state = NE_STATE_INIT;

	list_add(&ne_enclave->enclave_list_entry, &ne_pci_dev->enclaves_list);

	if (copy_to_user(slot_uid, &ne_enclave->slot_uid, sizeof(ne_enclave->slot_uid))) {
		/*
		 * As we're holding the only reference to 'enclave_file', fput()
		 * will call ne_enclave_release() which will do a proper cleanup
		 * of all so far allocated resources, leaving only the unused fd
		 * for us to free.
		 */
		fput(enclave_file);
		put_unused_fd(enclave_fd);

		return -EFAULT;
	}

	fd_install(enclave_fd, enclave_file);

	return enclave_fd;

put_file:
	fput(enclave_file);
put_fd:
	put_unused_fd(enclave_fd);
free_cpumask:
	free_cpumask_var(ne_enclave->vcpu_ids);
	for (i = 0; i < ne_enclave->nr_parent_vm_cores; i++)
		free_cpumask_var(ne_enclave->threads_per_core[i]);
	kfree(ne_enclave->threads_per_core);
free_ne_enclave:
	kfree(ne_enclave);

	return rc;
}

/**
 * ne_ioctl() - Ioctl function provided by the NE misc device.
 * @file:	File associated with this ioctl function.
 * @cmd:	The command that is set for the ioctl call.
 * @arg:	The argument that is provided for the ioctl call.
 *
 * Context: Process context.
 * Return:
 * * Ioctl result (e.g. enclave file descriptor) on success.
 * * Negative return value on failure.
 */
static long ne_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case NE_CREATE_VM: {
		int enclave_fd = -1;
		struct ne_pci_dev *ne_pci_dev = ne_devs.ne_pci_dev;
		u64 __user *slot_uid = (void __user *)arg;

		mutex_lock(&ne_pci_dev->enclaves_list_mutex);
		enclave_fd = ne_create_vm_ioctl(ne_pci_dev, slot_uid);
		mutex_unlock(&ne_pci_dev->enclaves_list_mutex);

		return enclave_fd;
	}

	default:
		return -ENOTTY;
	}

	return 0;
}

#if defined(CONFIG_NITRO_ENCLAVES_MISC_DEV_TEST)
#include "ne_misc_dev_test.c"
#endif

static int __init ne_init(void)
{
	mutex_init(&ne_cpu_pool.mutex);

	return pci_register_driver(&ne_pci_driver);
}

static void __exit ne_exit(void)
{
	pci_unregister_driver(&ne_pci_driver);

	ne_teardown_cpu_pool();
}

module_init(ne_init);
module_exit(ne_exit);

MODULE_AUTHOR("Amazon.com, Inc. or its affiliates");
MODULE_DESCRIPTION("Nitro Enclaves Driver");
MODULE_LICENSE("GPL v2");