1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Copyright (c) 2017, Oracle and/or its affiliates. All rights reserved. |
4 | */ |
5 | |
6 | /* |
7 | * Oracle Data Analytics Accelerator (DAX) |
8 | * |
9 | * DAX is a coprocessor which resides on the SPARC M7 (DAX1) and M8 |
10 | * (DAX2) processor chips, and has direct access to the CPU's L3 |
11 | * caches as well as physical memory. It can perform several |
12 | * operations on data streams with various input and output formats. |
13 | * The driver provides a transport mechanism only and has limited |
14 | * knowledge of the various opcodes and data formats. A user space |
15 | * library provides high level services and translates these into low |
16 | * level commands which are then passed into the driver and |
17 | * subsequently the hypervisor and the coprocessor. The library is |
18 | * the recommended way for applications to use the coprocessor, and |
19 | * the driver interface is not intended for general use. |
20 | * |
21 | * See Documentation/arch/sparc/oradax/oracle-dax.rst for more details. |
22 | */ |
23 | |
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/delay.h>
#include <linux/cdev.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/mm.h>
30 | |
31 | #include <asm/hypervisor.h> |
32 | #include <asm/mdesc.h> |
33 | #include <asm/oradax.h> |
34 | |
35 | MODULE_LICENSE("GPL" ); |
36 | MODULE_DESCRIPTION("Driver for Oracle Data Analytics Accelerator" ); |
37 | |
/* bits tested against dax_debug to select which debug messages are logged */
#define DAX_DBG_FLG_BASIC	(1 << 0)
#define DAX_DBG_FLG_STAT	(1 << 1)
#define DAX_DBG_FLG_INFO	(1 << 2)
#define DAX_DBG_FLG_ALL		0xff
42 | |
/* unconditional logging, prefixed with the name of the calling function */
#define dax_err(fmt, ...)	pr_err("%s: " fmt "\n", __func__, ##__VA_ARGS__)
#define dax_info(fmt, ...)	pr_info("%s: " fmt "\n", __func__, ##__VA_ARGS__)

/* log through dax_info() only when @flag is set in dax_debug */
#define dax_dbg_if(flag, fmt, ...)				\
	do {							\
		if (dax_debug & (flag))				\
			dax_info(fmt, ##__VA_ARGS__);		\
	} while (0)

#define dax_dbg(fmt, ...)					\
	dax_dbg_if(DAX_DBG_FLG_BASIC, fmt, ##__VA_ARGS__)
#define dax_stat_dbg(fmt, ...)					\
	dax_dbg_if(DAX_DBG_FLG_STAT, fmt, ##__VA_ARGS__)
#define dax_info_dbg(fmt, ...)					\
	dax_dbg_if(DAX_DBG_FLG_INFO, fmt, ##__VA_ARGS__)
58 | |
/* hypervisor API version requested for each coprocessor generation */
#define DAX1_MAJOR	1
#define DAX1_MINOR	1
#define DAX2_MAJOR	2
#define DAX2_MINOR	0

/* machine description "compatible" strings identifying the DAX type */
#define DAX1_STR	"ORCL,sun4v-dax"
#define DAX2_STR	"ORCL,sun4v-dax2"

/* number of completion area entries that fit in the mmap'ed buffer */
#define DAX_CA_ELEMS	(DAX_MMAP_LEN / sizeof(struct dax_cca))

/* polling: delay per retry in microseconds, and maximum number of retries */
#define DAX_CCB_USEC	100
#define DAX_CCB_RETRIES	10000
71 | |
/* stream types; used as the second index of the per-CCB locked page array */
enum {
	OUT = 0,
	PRI = 1,
	SEC = 2,
	TBL = 3,
	NUM_STREAM_TYPES	/* count of the entries above; keep last */
};
80 | |
/* completion status: values of the status byte of a completion area entry */
#define CCA_STAT_NOT_COMPLETED	0
#define CCA_STAT_COMPLETED	1
#define CCA_STAT_FAILED		2
#define CCA_STAT_KILLED		3
#define CCA_STAT_NOT_RUN	4
#define CCA_STAT_PIPE_OUT	5
#define CCA_STAT_PIPE_SRC	6
#define CCA_STAT_PIPE_DST	7
90 | |
/* completion err: values of the err byte of a completion area entry */
#define CCA_ERR_SUCCESS		0x0	/* no error */
#define CCA_ERR_OVERFLOW	0x1	/* buffer overflow */
#define CCA_ERR_DECODE		0x2	/* CCB decode error */
#define CCA_ERR_PAGE_OVERFLOW	0x3	/* page overflow */
#define CCA_ERR_KILLED		0x7	/* command was killed */
#define CCA_ERR_TIMEOUT		0x8	/* timeout */
#define CCA_ERR_ADI		0x9	/* ADI error */
#define CCA_ERR_DATA_FMT	0xA	/* data format error */
#define CCA_ERR_OTHER_NO_RETRY	0xE	/* other error, do not retry */
#define CCA_ERR_OTHER_RETRY	0xF	/* other error, retry */
#define CCA_ERR_PARTIAL_SYMBOL	0x80	/* QP partial symbol warning */
103 | |
/* CCB address types: values of the *_addr_type fields of the CCB header */
#define DAX_ADDR_TYPE_NONE	0
#define DAX_ADDR_TYPE_VA_ALT	1	/* virtual address, secondary context */
#define DAX_ADDR_TYPE_RA	2	/* real address */
#define DAX_ADDR_TYPE_VA	3	/* virtual address */
109 | |
/*
 * Values of the opcode field of struct dax_header. These are the opcodes
 * dax_preprocess_usr_ccbs() accepts in a user supplied CCB.
 */
#define DAX_OP_SYNC_NOP		0x0
#define DAX_OP_EXTRACT		0x1
#define DAX_OP_SCAN_VALUE	0x2
#define DAX_OP_SCAN_RANGE	0x3
#define DAX_OP_TRANSLATE	0x4
#define DAX_OP_SELECT		0x5
#define DAX_OP_INVERT		0x10	/* OR with translate, scan opcodes */
118 | |
119 | struct { |
120 | u32 :4; /* 31:28 CCB Version */ |
121 | /* 27:24 Sync Flags */ |
122 | u32 :1; /* Pipeline */ |
123 | u32 :1; /* Longccb. Set for scan with lu2, lu3, lu4. */ |
124 | u32 :1; /* Conditional */ |
125 | u32 :1; /* Serial */ |
126 | u32 :8; /* 23:16 Opcode */ |
127 | /* 15:0 Address Type. */ |
128 | u32 :3; /* 15:13 reserved */ |
129 | u32 :2; /* 12:11 Huffman Table Address Type */ |
130 | u32 :3; /* 10:8 Destination Address Type */ |
131 | u32 :3; /* 7:5 Secondary Source Address Type */ |
132 | u32 :3; /* 4:2 Primary Source Address Type */ |
133 | u32 :2; /* 1:0 Completion Address Type */ |
134 | }; |
135 | |
136 | struct dax_control { |
137 | u32 pri_fmt:4; /* 31:28 Primary Input Format */ |
138 | u32 pri_elem_size:5; /* 27:23 Primary Input Element Size(less1) */ |
139 | u32 pri_offset:3; /* 22:20 Primary Input Starting Offset */ |
140 | u32 sec_encoding:1; /* 19 Secondary Input Encoding */ |
141 | /* (must be 0 for Select) */ |
142 | u32 sec_offset:3; /* 18:16 Secondary Input Starting Offset */ |
143 | u32 sec_elem_size:2; /* 15:14 Secondary Input Element Size */ |
144 | /* (must be 0 for Select) */ |
145 | u32 out_fmt:2; /* 13:12 Output Format */ |
146 | u32 out_elem_size:2; /* 11:10 Output Element Size */ |
147 | u32 misc:10; /* 9:0 Opcode specific info */ |
148 | }; |
149 | |
150 | struct dax_data_access { |
151 | u64 flow_ctrl:2; /* 63:62 Flow Control Type */ |
152 | u64 pipe_target:2; /* 61:60 Pipeline Target */ |
153 | u64 out_buf_size:20; /* 59:40 Output Buffer Size */ |
154 | /* (cachelines less 1) */ |
155 | u64 unused1:8; /* 39:32 Reserved, Set to 0 */ |
156 | u64 out_alloc:5; /* 31:27 Output Allocation */ |
157 | u64 unused2:1; /* 26 Reserved */ |
158 | u64 pri_len_fmt:2; /* 25:24 Input Length Format */ |
159 | u64 pri_len:24; /* 23:0 Input Element/Byte/Bit Count */ |
160 | /* (less 1) */ |
161 | }; |
162 | |
163 | struct dax_ccb { |
164 | struct dax_header hdr; /* CCB Header */ |
165 | struct dax_control ctrl;/* Control Word */ |
166 | void *ca; /* Completion Address */ |
167 | void *pri; /* Primary Input Address */ |
168 | struct dax_data_access dac; /* Data Access Control */ |
169 | void *sec; /* Secondary Input Address */ |
170 | u64 dword5; /* depends on opcode */ |
171 | void *out; /* Output Address */ |
172 | void *tbl; /* Table Address or bitmap */ |
173 | }; |
174 | |
175 | struct dax_cca { |
176 | u8 status; /* user may mwait on this address */ |
177 | u8 err; /* user visible error notification */ |
178 | u8 rsvd[2]; /* reserved */ |
179 | u32 n_remaining; /* for QP partial symbol warning */ |
180 | u32 output_sz; /* output in bytes */ |
181 | u32 rsvd2; /* reserved */ |
182 | u64 run_cycles; /* run time in OCND2 cycles */ |
183 | u64 run_stats; /* nothing reported in version 1.0 */ |
184 | u32 n_processed; /* number input elements */ |
185 | u32 rsvd3[5]; /* reserved */ |
186 | u64 retval; /* command return value */ |
187 | u64 rsvd4[8]; /* reserved */ |
188 | }; |
189 | |
190 | /* per thread CCB context */ |
191 | struct dax_ctx { |
192 | struct dax_ccb *ccb_buf; |
193 | u64 ccb_buf_ra; /* cached RA of ccb_buf */ |
194 | struct dax_cca *ca_buf; |
195 | u64 ca_buf_ra; /* cached RA of ca_buf */ |
196 | struct page *pages[DAX_CA_ELEMS][NUM_STREAM_TYPES]; |
197 | /* array of locked pages */ |
198 | struct task_struct *owner; /* thread that owns ctx */ |
199 | struct task_struct *client; /* requesting thread */ |
200 | union ccb_result result; |
201 | u32 ccb_count; |
202 | u32 fail_count; |
203 | }; |
204 | |
205 | /* driver public entry points */ |
206 | static int dax_open(struct inode *inode, struct file *file); |
207 | static ssize_t dax_read(struct file *filp, char __user *buf, |
208 | size_t count, loff_t *ppos); |
209 | static ssize_t dax_write(struct file *filp, const char __user *buf, |
210 | size_t count, loff_t *ppos); |
211 | static int dax_devmap(struct file *f, struct vm_area_struct *vma); |
212 | static int dax_close(struct inode *i, struct file *f); |
213 | |
214 | static const struct file_operations dax_fops = { |
215 | .owner = THIS_MODULE, |
216 | .open = dax_open, |
217 | .read = dax_read, |
218 | .write = dax_write, |
219 | .mmap = dax_devmap, |
220 | .release = dax_close, |
221 | }; |
222 | |
223 | static int dax_ccb_exec(struct dax_ctx *ctx, const char __user *buf, |
224 | size_t count, loff_t *ppos); |
225 | static int dax_ccb_info(u64 ca, struct ccb_info_result *info); |
226 | static int dax_ccb_kill(u64 ca, u16 *kill_res); |
227 | |
228 | static struct cdev c_dev; |
229 | static dev_t first; |
230 | static const struct class cl = { |
231 | .name = DAX_NAME, |
232 | }; |
233 | |
234 | static int max_ccb_version; |
235 | static int dax_debug; |
236 | module_param(dax_debug, int, 0644); |
237 | MODULE_PARM_DESC(dax_debug, "Debug flags" ); |
238 | |
239 | static int __init dax_attach(void) |
240 | { |
241 | unsigned long dummy, hv_rv, major, minor, minor_requested, max_ccbs; |
242 | struct mdesc_handle *hp = mdesc_grab(); |
243 | char *prop, *dax_name; |
244 | bool found = false; |
245 | int len, ret = 0; |
246 | u64 pn; |
247 | |
248 | if (hp == NULL) { |
249 | dax_err("Unable to grab mdesc" ); |
250 | return -ENODEV; |
251 | } |
252 | |
253 | mdesc_for_each_node_by_name(hp, pn, "virtual-device" ) { |
254 | prop = (char *)mdesc_get_property(hp, pn, "name" , &len); |
255 | if (prop == NULL) |
256 | continue; |
257 | if (strncmp(prop, "dax" , strlen("dax" ))) |
258 | continue; |
259 | dax_dbg("Found node 0x%llx = %s" , pn, prop); |
260 | |
261 | prop = (char *)mdesc_get_property(hp, pn, "compatible" , &len); |
262 | if (prop == NULL) |
263 | continue; |
264 | dax_dbg("Found node 0x%llx = %s" , pn, prop); |
265 | found = true; |
266 | break; |
267 | } |
268 | |
269 | if (!found) { |
270 | dax_err("No DAX device found" ); |
271 | ret = -ENODEV; |
272 | goto done; |
273 | } |
274 | |
275 | if (strncmp(prop, DAX2_STR, strlen(DAX2_STR)) == 0) { |
276 | dax_name = DAX_NAME "2" ; |
277 | major = DAX2_MAJOR; |
278 | minor_requested = DAX2_MINOR; |
279 | max_ccb_version = 1; |
280 | dax_dbg("MD indicates DAX2 coprocessor" ); |
281 | } else if (strncmp(prop, DAX1_STR, strlen(DAX1_STR)) == 0) { |
282 | dax_name = DAX_NAME "1" ; |
283 | major = DAX1_MAJOR; |
284 | minor_requested = DAX1_MINOR; |
285 | max_ccb_version = 0; |
286 | dax_dbg("MD indicates DAX1 coprocessor" ); |
287 | } else { |
288 | dax_err("Unknown dax type: %s" , prop); |
289 | ret = -ENODEV; |
290 | goto done; |
291 | } |
292 | |
293 | minor = minor_requested; |
294 | dax_dbg("Registering DAX HV api with major %ld minor %ld" , major, |
295 | minor); |
296 | if (sun4v_hvapi_register(HV_GRP_DAX, major, &minor)) { |
297 | dax_err("hvapi_register failed" ); |
298 | ret = -ENODEV; |
299 | goto done; |
300 | } else { |
301 | dax_dbg("Max minor supported by HV = %ld (major %ld)" , minor, |
302 | major); |
303 | minor = min(minor, minor_requested); |
304 | dax_dbg("registered DAX major %ld minor %ld" , major, minor); |
305 | } |
306 | |
307 | /* submit a zero length ccb array to query coprocessor queue size */ |
308 | hv_rv = sun4v_ccb_submit(0, 0, HV_CCB_QUERY_CMD, 0, &max_ccbs, &dummy); |
309 | if (hv_rv != 0) { |
310 | dax_err("get_hwqueue_size failed with status=%ld and max_ccbs=%ld" , |
311 | hv_rv, max_ccbs); |
312 | ret = -ENODEV; |
313 | goto done; |
314 | } |
315 | |
316 | if (max_ccbs != DAX_MAX_CCBS) { |
317 | dax_err("HV reports unsupported max_ccbs=%ld" , max_ccbs); |
318 | ret = -ENODEV; |
319 | goto done; |
320 | } |
321 | |
322 | if (alloc_chrdev_region(&first, 0, 1, DAX_NAME) < 0) { |
323 | dax_err("alloc_chrdev_region failed" ); |
324 | ret = -ENXIO; |
325 | goto done; |
326 | } |
327 | |
328 | ret = class_register(class: &cl); |
329 | if (ret) |
330 | goto class_error; |
331 | |
332 | if (device_create(cls: &cl, NULL, devt: first, NULL, fmt: dax_name) == NULL) { |
333 | dax_err("device_create failed" ); |
334 | ret = -ENXIO; |
335 | goto device_error; |
336 | } |
337 | |
338 | cdev_init(&c_dev, &dax_fops); |
339 | if (cdev_add(&c_dev, first, 1) == -1) { |
340 | dax_err("cdev_add failed" ); |
341 | ret = -ENXIO; |
342 | goto cdev_error; |
343 | } |
344 | |
345 | pr_info("Attached DAX module\n" ); |
346 | goto done; |
347 | |
348 | cdev_error: |
349 | device_destroy(cls: &cl, devt: first); |
350 | device_error: |
351 | class_unregister(class: &cl); |
352 | class_error: |
353 | unregister_chrdev_region(first, 1); |
354 | done: |
355 | mdesc_release(hp); |
356 | return ret; |
357 | } |
358 | module_init(dax_attach); |
359 | |
360 | static void __exit dax_detach(void) |
361 | { |
362 | pr_info("Cleaning up DAX module\n" ); |
363 | cdev_del(&c_dev); |
364 | device_destroy(cls: &cl, devt: first); |
365 | class_unregister(class: &cl); |
366 | unregister_chrdev_region(first, 1); |
367 | } |
368 | module_exit(dax_detach); |
369 | |
370 | /* map completion area */ |
371 | static int dax_devmap(struct file *f, struct vm_area_struct *vma) |
372 | { |
373 | struct dax_ctx *ctx = (struct dax_ctx *)f->private_data; |
374 | size_t len = vma->vm_end - vma->vm_start; |
375 | |
376 | dax_dbg("len=0x%lx, flags=0x%lx" , len, vma->vm_flags); |
377 | |
378 | if (ctx->owner != current) { |
379 | dax_dbg("devmap called from wrong thread" ); |
380 | return -EINVAL; |
381 | } |
382 | |
383 | if (len != DAX_MMAP_LEN) { |
384 | dax_dbg("len(%lu) != DAX_MMAP_LEN(%d)" , len, DAX_MMAP_LEN); |
385 | return -EINVAL; |
386 | } |
387 | |
388 | /* completion area is mapped read-only for user */ |
389 | if (vma->vm_flags & VM_WRITE) |
390 | return -EPERM; |
391 | vm_flags_clear(vma, VM_MAYWRITE); |
392 | |
393 | if (remap_pfn_range(vma, addr: vma->vm_start, pfn: ctx->ca_buf_ra >> PAGE_SHIFT, |
394 | size: len, vma->vm_page_prot)) |
395 | return -EAGAIN; |
396 | |
397 | dax_dbg("mmapped completion area at uva 0x%lx" , vma->vm_start); |
398 | return 0; |
399 | } |
400 | |
401 | /* Unlock user pages. Called during dequeue or device close */ |
402 | static void dax_unlock_pages(struct dax_ctx *ctx, int ccb_index, int nelem) |
403 | { |
404 | int i, j; |
405 | |
406 | for (i = ccb_index; i < ccb_index + nelem; i++) { |
407 | for (j = 0; j < NUM_STREAM_TYPES; j++) { |
408 | struct page *p = ctx->pages[i][j]; |
409 | |
410 | if (p) { |
411 | dax_dbg("freeing page %p" , p); |
412 | unpin_user_pages_dirty_lock(pages: &p, npages: 1, make_dirty: j == OUT); |
413 | ctx->pages[i][j] = NULL; |
414 | } |
415 | } |
416 | } |
417 | } |
418 | |
419 | static int dax_lock_page(void *va, struct page **p) |
420 | { |
421 | int ret; |
422 | |
423 | dax_dbg("uva %p" , va); |
424 | |
425 | ret = pin_user_pages_fast(start: (unsigned long)va, nr_pages: 1, gup_flags: FOLL_WRITE, pages: p); |
426 | if (ret == 1) { |
427 | dax_dbg("locked page %p, for VA %p" , *p, va); |
428 | return 0; |
429 | } |
430 | |
431 | dax_dbg("pin_user_pages failed, va=%p, ret=%d" , va, ret); |
432 | return -1; |
433 | } |
434 | |
435 | static int dax_lock_pages(struct dax_ctx *ctx, int idx, |
436 | int nelem, u64 *err_va) |
437 | { |
438 | int i; |
439 | |
440 | for (i = 0; i < nelem; i++) { |
441 | struct dax_ccb *ccbp = &ctx->ccb_buf[i]; |
442 | |
443 | /* |
444 | * For each address in the CCB whose type is virtual, |
445 | * lock the page and change the type to virtual alternate |
446 | * context. On error, return the offending address in |
447 | * err_va. |
448 | */ |
449 | if (ccbp->hdr.out_addr_type == DAX_ADDR_TYPE_VA) { |
450 | dax_dbg("output" ); |
451 | if (dax_lock_page(va: ccbp->out, |
452 | p: &ctx->pages[i + idx][OUT]) != 0) { |
453 | *err_va = (u64)ccbp->out; |
454 | goto error; |
455 | } |
456 | ccbp->hdr.out_addr_type = DAX_ADDR_TYPE_VA_ALT; |
457 | } |
458 | |
459 | if (ccbp->hdr.pri_addr_type == DAX_ADDR_TYPE_VA) { |
460 | dax_dbg("input" ); |
461 | if (dax_lock_page(va: ccbp->pri, |
462 | p: &ctx->pages[i + idx][PRI]) != 0) { |
463 | *err_va = (u64)ccbp->pri; |
464 | goto error; |
465 | } |
466 | ccbp->hdr.pri_addr_type = DAX_ADDR_TYPE_VA_ALT; |
467 | } |
468 | |
469 | if (ccbp->hdr.sec_addr_type == DAX_ADDR_TYPE_VA) { |
470 | dax_dbg("sec input" ); |
471 | if (dax_lock_page(va: ccbp->sec, |
472 | p: &ctx->pages[i + idx][SEC]) != 0) { |
473 | *err_va = (u64)ccbp->sec; |
474 | goto error; |
475 | } |
476 | ccbp->hdr.sec_addr_type = DAX_ADDR_TYPE_VA_ALT; |
477 | } |
478 | |
479 | if (ccbp->hdr.table_addr_type == DAX_ADDR_TYPE_VA) { |
480 | dax_dbg("tbl" ); |
481 | if (dax_lock_page(va: ccbp->tbl, |
482 | p: &ctx->pages[i + idx][TBL]) != 0) { |
483 | *err_va = (u64)ccbp->tbl; |
484 | goto error; |
485 | } |
486 | ccbp->hdr.table_addr_type = DAX_ADDR_TYPE_VA_ALT; |
487 | } |
488 | |
489 | /* skip over 2nd 64 bytes of long CCB */ |
490 | if (ccbp->hdr.longccb) |
491 | i++; |
492 | } |
493 | return DAX_SUBMIT_OK; |
494 | |
495 | error: |
496 | dax_unlock_pages(ctx, ccb_index: idx, nelem); |
497 | return DAX_SUBMIT_ERR_NOACCESS; |
498 | } |
499 | |
500 | static void dax_ccb_wait(struct dax_ctx *ctx, int idx) |
501 | { |
502 | int ret, nretries; |
503 | u16 kill_res; |
504 | |
505 | dax_dbg("idx=%d" , idx); |
506 | |
507 | for (nretries = 0; nretries < DAX_CCB_RETRIES; nretries++) { |
508 | if (ctx->ca_buf[idx].status == CCA_STAT_NOT_COMPLETED) |
509 | udelay(DAX_CCB_USEC); |
510 | else |
511 | return; |
512 | } |
513 | dax_dbg("ctx (%p): CCB[%d] timed out, wait usec=%d, retries=%d. Killing ccb" , |
514 | (void *)ctx, idx, DAX_CCB_USEC, DAX_CCB_RETRIES); |
515 | |
516 | ret = dax_ccb_kill(ca: ctx->ca_buf_ra + idx * sizeof(struct dax_cca), |
517 | kill_res: &kill_res); |
518 | dax_dbg("Kill CCB[%d] %s" , idx, ret ? "failed" : "succeeded" ); |
519 | } |
520 | |
521 | static int dax_close(struct inode *ino, struct file *f) |
522 | { |
523 | struct dax_ctx *ctx = (struct dax_ctx *)f->private_data; |
524 | int i; |
525 | |
526 | f->private_data = NULL; |
527 | |
528 | for (i = 0; i < DAX_CA_ELEMS; i++) { |
529 | if (ctx->ca_buf[i].status == CCA_STAT_NOT_COMPLETED) { |
530 | dax_dbg("CCB[%d] not completed" , i); |
531 | dax_ccb_wait(ctx, i); |
532 | } |
533 | dax_unlock_pages(ctx, i, 1); |
534 | } |
535 | |
536 | kfree(objp: ctx->ccb_buf); |
537 | kfree(objp: ctx->ca_buf); |
538 | dax_stat_dbg("CCBs: %d good, %d bad" , ctx->ccb_count, ctx->fail_count); |
539 | kfree(objp: ctx); |
540 | |
541 | return 0; |
542 | } |
543 | |
544 | static ssize_t dax_read(struct file *f, char __user *buf, |
545 | size_t count, loff_t *ppos) |
546 | { |
547 | struct dax_ctx *ctx = f->private_data; |
548 | |
549 | if (ctx->client != current) |
550 | return -EUSERS; |
551 | |
552 | ctx->client = NULL; |
553 | |
554 | if (count != sizeof(union ccb_result)) |
555 | return -EINVAL; |
556 | if (copy_to_user(buf, &ctx->result, sizeof(union ccb_result))) |
557 | return -EFAULT; |
558 | return count; |
559 | } |
560 | |
561 | static ssize_t dax_write(struct file *f, const char __user *buf, |
562 | size_t count, loff_t *ppos) |
563 | { |
564 | struct dax_ctx *ctx = f->private_data; |
565 | struct dax_command hdr; |
566 | unsigned long ca; |
567 | int i, idx, ret; |
568 | |
569 | if (ctx->client != NULL) |
570 | return -EINVAL; |
571 | |
572 | if (count == 0 || count > DAX_MAX_CCBS * sizeof(struct dax_ccb)) |
573 | return -EINVAL; |
574 | |
575 | if (count % sizeof(struct dax_ccb) == 0) |
576 | return dax_ccb_exec(ctx, buf, count, ppos); /* CCB EXEC */ |
577 | |
578 | if (count != sizeof(struct dax_command)) |
579 | return -EINVAL; |
580 | |
581 | /* immediate command */ |
582 | if (ctx->owner != current) |
583 | return -EUSERS; |
584 | |
585 | if (copy_from_user(to: &hdr, from: buf, n: sizeof(hdr))) |
586 | return -EFAULT; |
587 | |
588 | ca = ctx->ca_buf_ra + hdr.ca_offset; |
589 | |
590 | switch (hdr.command) { |
591 | case CCB_KILL: |
592 | if (hdr.ca_offset >= DAX_MMAP_LEN) { |
593 | dax_dbg("invalid ca_offset (%d) >= ca_buflen (%d)" , |
594 | hdr.ca_offset, DAX_MMAP_LEN); |
595 | return -EINVAL; |
596 | } |
597 | |
598 | ret = dax_ccb_kill(ca, kill_res: &ctx->result.kill.action); |
599 | if (ret != 0) { |
600 | dax_dbg("dax_ccb_kill failed (ret=%d)" , ret); |
601 | return ret; |
602 | } |
603 | |
604 | dax_info_dbg("killed (ca_offset %d)" , hdr.ca_offset); |
605 | idx = hdr.ca_offset / sizeof(struct dax_cca); |
606 | ctx->ca_buf[idx].status = CCA_STAT_KILLED; |
607 | ctx->ca_buf[idx].err = CCA_ERR_KILLED; |
608 | ctx->client = current; |
609 | return count; |
610 | |
611 | case CCB_INFO: |
612 | if (hdr.ca_offset >= DAX_MMAP_LEN) { |
613 | dax_dbg("invalid ca_offset (%d) >= ca_buflen (%d)" , |
614 | hdr.ca_offset, DAX_MMAP_LEN); |
615 | return -EINVAL; |
616 | } |
617 | |
618 | ret = dax_ccb_info(ca, info: &ctx->result.info); |
619 | if (ret != 0) { |
620 | dax_dbg("dax_ccb_info failed (ret=%d)" , ret); |
621 | return ret; |
622 | } |
623 | |
624 | dax_info_dbg("info succeeded on ca_offset %d" , hdr.ca_offset); |
625 | ctx->client = current; |
626 | return count; |
627 | |
628 | case CCB_DEQUEUE: |
629 | for (i = 0; i < DAX_CA_ELEMS; i++) { |
630 | if (ctx->ca_buf[i].status != |
631 | CCA_STAT_NOT_COMPLETED) |
632 | dax_unlock_pages(ctx, i, 1); |
633 | } |
634 | return count; |
635 | |
636 | default: |
637 | return -EINVAL; |
638 | } |
639 | } |
640 | |
641 | static int dax_open(struct inode *inode, struct file *f) |
642 | { |
643 | struct dax_ctx *ctx = NULL; |
644 | int i; |
645 | |
646 | ctx = kzalloc(size: sizeof(*ctx), GFP_KERNEL); |
647 | if (ctx == NULL) |
648 | goto done; |
649 | |
650 | ctx->ccb_buf = kcalloc(n: DAX_MAX_CCBS, size: sizeof(struct dax_ccb), |
651 | GFP_KERNEL); |
652 | if (ctx->ccb_buf == NULL) |
653 | goto done; |
654 | |
655 | ctx->ccb_buf_ra = virt_to_phys(ctx->ccb_buf); |
656 | dax_dbg("ctx->ccb_buf=0x%p, ccb_buf_ra=0x%llx" , |
657 | (void *)ctx->ccb_buf, ctx->ccb_buf_ra); |
658 | |
659 | /* allocate CCB completion area buffer */ |
660 | ctx->ca_buf = kzalloc(size: DAX_MMAP_LEN, GFP_KERNEL); |
661 | if (ctx->ca_buf == NULL) |
662 | goto alloc_error; |
663 | for (i = 0; i < DAX_CA_ELEMS; i++) |
664 | ctx->ca_buf[i].status = CCA_STAT_COMPLETED; |
665 | |
666 | ctx->ca_buf_ra = virt_to_phys(ctx->ca_buf); |
667 | dax_dbg("ctx=0x%p, ctx->ca_buf=0x%p, ca_buf_ra=0x%llx" , |
668 | (void *)ctx, (void *)ctx->ca_buf, ctx->ca_buf_ra); |
669 | |
670 | ctx->owner = current; |
671 | f->private_data = ctx; |
672 | return 0; |
673 | |
674 | alloc_error: |
675 | kfree(objp: ctx->ccb_buf); |
676 | done: |
677 | kfree(objp: ctx); |
678 | return -ENOMEM; |
679 | } |
680 | |
681 | static char *dax_hv_errno(unsigned long hv_ret, int *ret) |
682 | { |
683 | switch (hv_ret) { |
684 | case HV_EBADALIGN: |
685 | *ret = -EFAULT; |
686 | return "HV_EBADALIGN" ; |
687 | case HV_ENORADDR: |
688 | *ret = -EFAULT; |
689 | return "HV_ENORADDR" ; |
690 | case HV_EINVAL: |
691 | *ret = -EINVAL; |
692 | return "HV_EINVAL" ; |
693 | case HV_EWOULDBLOCK: |
694 | *ret = -EAGAIN; |
695 | return "HV_EWOULDBLOCK" ; |
696 | case HV_ENOACCESS: |
697 | *ret = -EPERM; |
698 | return "HV_ENOACCESS" ; |
699 | default: |
700 | break; |
701 | } |
702 | |
703 | *ret = -EIO; |
704 | return "UNKNOWN" ; |
705 | } |
706 | |
707 | static int dax_ccb_kill(u64 ca, u16 *kill_res) |
708 | { |
709 | unsigned long hv_ret; |
710 | int count, ret = 0; |
711 | char *err_str; |
712 | |
713 | for (count = 0; count < DAX_CCB_RETRIES; count++) { |
714 | dax_dbg("attempting kill on ca_ra 0x%llx" , ca); |
715 | hv_ret = sun4v_ccb_kill(ca, kill_res); |
716 | |
717 | if (hv_ret == HV_EOK) { |
718 | dax_info_dbg("HV_EOK (ca_ra 0x%llx): %d" , ca, |
719 | *kill_res); |
720 | } else { |
721 | err_str = dax_hv_errno(hv_ret, ret: &ret); |
722 | dax_dbg("%s (ca_ra 0x%llx)" , err_str, ca); |
723 | } |
724 | |
725 | if (ret != -EAGAIN) |
726 | return ret; |
727 | dax_info_dbg("ccb_kill count = %d" , count); |
728 | udelay(DAX_CCB_USEC); |
729 | } |
730 | |
731 | return -EAGAIN; |
732 | } |
733 | |
734 | static int dax_ccb_info(u64 ca, struct ccb_info_result *info) |
735 | { |
736 | unsigned long hv_ret; |
737 | char *err_str; |
738 | int ret = 0; |
739 | |
740 | dax_dbg("attempting info on ca_ra 0x%llx" , ca); |
741 | hv_ret = sun4v_ccb_info(ca, info); |
742 | |
743 | if (hv_ret == HV_EOK) { |
744 | dax_info_dbg("HV_EOK (ca_ra 0x%llx): %d" , ca, info->state); |
745 | if (info->state == DAX_CCB_ENQUEUED) { |
746 | dax_info_dbg("dax_unit %d, queue_num %d, queue_pos %d" , |
747 | info->inst_num, info->q_num, info->q_pos); |
748 | } |
749 | } else { |
750 | err_str = dax_hv_errno(hv_ret, ret: &ret); |
751 | dax_dbg("%s (ca_ra 0x%llx)" , err_str, ca); |
752 | } |
753 | |
754 | return ret; |
755 | } |
756 | |
757 | static void dax_prt_ccbs(struct dax_ccb *ccb, int nelem) |
758 | { |
759 | int i, j; |
760 | u64 *ccbp; |
761 | |
762 | dax_dbg("ccb buffer:" ); |
763 | for (i = 0; i < nelem; i++) { |
764 | ccbp = (u64 *)&ccb[i]; |
765 | dax_dbg(" %sccb[%d]" , ccb[i].hdr.longccb ? "long " : "" , i); |
766 | for (j = 0; j < 8; j++) |
767 | dax_dbg("\tccb[%d].dwords[%d]=0x%llx" , |
768 | i, j, *(ccbp + j)); |
769 | } |
770 | } |
771 | |
772 | /* |
773 | * Validates user CCB content. Also sets completion address and address types |
774 | * for all addresses contained in CCB. |
775 | */ |
776 | static int dax_preprocess_usr_ccbs(struct dax_ctx *ctx, int idx, int nelem) |
777 | { |
778 | int i; |
779 | |
780 | /* |
781 | * The user is not allowed to specify real address types in |
782 | * the CCB header. This must be enforced by the kernel before |
783 | * submitting the CCBs to HV. The only allowed values for all |
784 | * address fields are VA or IMM |
785 | */ |
786 | for (i = 0; i < nelem; i++) { |
787 | struct dax_ccb *ccbp = &ctx->ccb_buf[i]; |
788 | unsigned long ca_offset; |
789 | |
790 | if (ccbp->hdr.ccb_version > max_ccb_version) |
791 | return DAX_SUBMIT_ERR_CCB_INVAL; |
792 | |
793 | switch (ccbp->hdr.opcode) { |
794 | case DAX_OP_SYNC_NOP: |
795 | case DAX_OP_EXTRACT: |
796 | case DAX_OP_SCAN_VALUE: |
797 | case DAX_OP_SCAN_RANGE: |
798 | case DAX_OP_TRANSLATE: |
799 | case DAX_OP_SCAN_VALUE | DAX_OP_INVERT: |
800 | case DAX_OP_SCAN_RANGE | DAX_OP_INVERT: |
801 | case DAX_OP_TRANSLATE | DAX_OP_INVERT: |
802 | case DAX_OP_SELECT: |
803 | break; |
804 | default: |
805 | return DAX_SUBMIT_ERR_CCB_INVAL; |
806 | } |
807 | |
808 | if (ccbp->hdr.out_addr_type != DAX_ADDR_TYPE_VA && |
809 | ccbp->hdr.out_addr_type != DAX_ADDR_TYPE_NONE) { |
810 | dax_dbg("invalid out_addr_type in user CCB[%d]" , i); |
811 | return DAX_SUBMIT_ERR_CCB_INVAL; |
812 | } |
813 | |
814 | if (ccbp->hdr.pri_addr_type != DAX_ADDR_TYPE_VA && |
815 | ccbp->hdr.pri_addr_type != DAX_ADDR_TYPE_NONE) { |
816 | dax_dbg("invalid pri_addr_type in user CCB[%d]" , i); |
817 | return DAX_SUBMIT_ERR_CCB_INVAL; |
818 | } |
819 | |
820 | if (ccbp->hdr.sec_addr_type != DAX_ADDR_TYPE_VA && |
821 | ccbp->hdr.sec_addr_type != DAX_ADDR_TYPE_NONE) { |
822 | dax_dbg("invalid sec_addr_type in user CCB[%d]" , i); |
823 | return DAX_SUBMIT_ERR_CCB_INVAL; |
824 | } |
825 | |
826 | if (ccbp->hdr.table_addr_type != DAX_ADDR_TYPE_VA && |
827 | ccbp->hdr.table_addr_type != DAX_ADDR_TYPE_NONE) { |
828 | dax_dbg("invalid table_addr_type in user CCB[%d]" , i); |
829 | return DAX_SUBMIT_ERR_CCB_INVAL; |
830 | } |
831 | |
832 | /* set completion (real) address and address type */ |
833 | ccbp->hdr.cca_addr_type = DAX_ADDR_TYPE_RA; |
834 | ca_offset = (idx + i) * sizeof(struct dax_cca); |
835 | ccbp->ca = (void *)ctx->ca_buf_ra + ca_offset; |
836 | memset(&ctx->ca_buf[idx + i], 0, sizeof(struct dax_cca)); |
837 | |
838 | dax_dbg("ccb[%d]=%p, ca_offset=0x%lx, compl RA=0x%llx" , |
839 | i, ccbp, ca_offset, ctx->ca_buf_ra + ca_offset); |
840 | |
841 | /* skip over 2nd 64 bytes of long CCB */ |
842 | if (ccbp->hdr.longccb) |
843 | i++; |
844 | } |
845 | |
846 | return DAX_SUBMIT_OK; |
847 | } |
848 | |
849 | static int dax_ccb_exec(struct dax_ctx *ctx, const char __user *buf, |
850 | size_t count, loff_t *ppos) |
851 | { |
852 | unsigned long accepted_len, hv_rv; |
853 | int i, idx, nccbs, naccepted; |
854 | |
855 | ctx->client = current; |
856 | idx = *ppos; |
857 | nccbs = count / sizeof(struct dax_ccb); |
858 | |
859 | if (ctx->owner != current) { |
860 | dax_dbg("wrong thread" ); |
861 | ctx->result.exec.status = DAX_SUBMIT_ERR_THR_INIT; |
862 | return 0; |
863 | } |
864 | dax_dbg("args: ccb_buf_len=%ld, idx=%d" , count, idx); |
865 | |
866 | /* for given index and length, verify ca_buf range exists */ |
867 | if (idx < 0 || idx > (DAX_CA_ELEMS - nccbs)) { |
868 | ctx->result.exec.status = DAX_SUBMIT_ERR_NO_CA_AVAIL; |
869 | return 0; |
870 | } |
871 | |
872 | /* |
873 | * Copy CCBs into kernel buffer to prevent modification by the |
874 | * user in between validation and submission. |
875 | */ |
876 | if (copy_from_user(to: ctx->ccb_buf, from: buf, n: count)) { |
877 | dax_dbg("copyin of user CCB buffer failed" ); |
878 | ctx->result.exec.status = DAX_SUBMIT_ERR_CCB_ARR_MMU_MISS; |
879 | return 0; |
880 | } |
881 | |
882 | /* check to see if ca_buf[idx] .. ca_buf[idx + nccbs] are available */ |
883 | for (i = idx; i < idx + nccbs; i++) { |
884 | if (ctx->ca_buf[i].status == CCA_STAT_NOT_COMPLETED) { |
885 | dax_dbg("CA range not available, dequeue needed" ); |
886 | ctx->result.exec.status = DAX_SUBMIT_ERR_NO_CA_AVAIL; |
887 | return 0; |
888 | } |
889 | } |
890 | dax_unlock_pages(ctx, ccb_index: idx, nelem: nccbs); |
891 | |
892 | ctx->result.exec.status = dax_preprocess_usr_ccbs(ctx, idx, nelem: nccbs); |
893 | if (ctx->result.exec.status != DAX_SUBMIT_OK) |
894 | return 0; |
895 | |
896 | ctx->result.exec.status = dax_lock_pages(ctx, idx, nelem: nccbs, |
897 | err_va: &ctx->result.exec.status_data); |
898 | if (ctx->result.exec.status != DAX_SUBMIT_OK) |
899 | return 0; |
900 | |
901 | if (dax_debug & DAX_DBG_FLG_BASIC) |
902 | dax_prt_ccbs(ccb: ctx->ccb_buf, nelem: nccbs); |
903 | |
904 | hv_rv = sun4v_ccb_submit(ctx->ccb_buf_ra, count, |
905 | HV_CCB_QUERY_CMD | HV_CCB_VA_SECONDARY, 0, |
906 | &accepted_len, &ctx->result.exec.status_data); |
907 | |
908 | switch (hv_rv) { |
909 | case HV_EOK: |
910 | /* |
911 | * Hcall succeeded with no errors but the accepted |
912 | * length may be less than the requested length. The |
913 | * only way the driver can resubmit the remainder is |
914 | * to wait for completion of the submitted CCBs since |
915 | * there is no way to guarantee the ordering semantics |
916 | * required by the client applications. Therefore we |
917 | * let the user library deal with resubmissions. |
918 | */ |
919 | ctx->result.exec.status = DAX_SUBMIT_OK; |
920 | break; |
921 | case HV_EWOULDBLOCK: |
922 | /* |
923 | * This is a transient HV API error. The user library |
924 | * can retry. |
925 | */ |
926 | dax_dbg("hcall returned HV_EWOULDBLOCK" ); |
927 | ctx->result.exec.status = DAX_SUBMIT_ERR_WOULDBLOCK; |
928 | break; |
929 | case HV_ENOMAP: |
930 | /* |
931 | * HV was unable to translate a VA. The VA it could |
932 | * not translate is returned in the status_data param. |
933 | */ |
934 | dax_dbg("hcall returned HV_ENOMAP" ); |
935 | ctx->result.exec.status = DAX_SUBMIT_ERR_NOMAP; |
936 | break; |
937 | case HV_EINVAL: |
938 | /* |
939 | * This is the result of an invalid user CCB as HV is |
940 | * validating some of the user CCB fields. Pass this |
941 | * error back to the user. There is no supporting info |
942 | * to isolate the invalid field. |
943 | */ |
944 | dax_dbg("hcall returned HV_EINVAL" ); |
945 | ctx->result.exec.status = DAX_SUBMIT_ERR_CCB_INVAL; |
946 | break; |
947 | case HV_ENOACCESS: |
948 | /* |
949 | * HV found a VA that did not have the appropriate |
950 | * permissions (such as the w bit). The VA in question |
951 | * is returned in status_data param. |
952 | */ |
953 | dax_dbg("hcall returned HV_ENOACCESS" ); |
954 | ctx->result.exec.status = DAX_SUBMIT_ERR_NOACCESS; |
955 | break; |
956 | case HV_EUNAVAILABLE: |
957 | /* |
958 | * The requested CCB operation could not be performed |
959 | * at this time. Return the specific unavailable code |
960 | * in the status_data field. |
961 | */ |
962 | dax_dbg("hcall returned HV_EUNAVAILABLE" ); |
963 | ctx->result.exec.status = DAX_SUBMIT_ERR_UNAVAIL; |
964 | break; |
965 | default: |
966 | ctx->result.exec.status = DAX_SUBMIT_ERR_INTERNAL; |
967 | dax_dbg("unknown hcall return value (%ld)" , hv_rv); |
968 | break; |
969 | } |
970 | |
971 | /* unlock pages associated with the unaccepted CCBs */ |
972 | naccepted = accepted_len / sizeof(struct dax_ccb); |
973 | dax_unlock_pages(ctx, ccb_index: idx + naccepted, nelem: nccbs - naccepted); |
974 | |
975 | /* mark unaccepted CCBs as not completed */ |
976 | for (i = idx + naccepted; i < idx + nccbs; i++) |
977 | ctx->ca_buf[i].status = CCA_STAT_COMPLETED; |
978 | |
979 | ctx->ccb_count += naccepted; |
980 | ctx->fail_count += nccbs - naccepted; |
981 | |
982 | dax_dbg("hcall rv=%ld, accepted_len=%ld, status_data=0x%llx, ret status=%d" , |
983 | hv_rv, accepted_len, ctx->result.exec.status_data, |
984 | ctx->result.exec.status); |
985 | |
986 | if (count == accepted_len) |
987 | ctx->client = NULL; /* no read needed to complete protocol */ |
988 | return accepted_len; |
989 | } |
990 | |