1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Support for Partition Mobility/Migration |
4 | * |
5 | * Copyright (C) 2010 Nathan Fontenot |
6 | * Copyright (C) 2010 IBM Corporation |
7 | */ |
8 | |
9 | |
10 | #define pr_fmt(fmt) "mobility: " fmt |
11 | |
12 | #include <linux/cpu.h> |
13 | #include <linux/kernel.h> |
14 | #include <linux/kobject.h> |
15 | #include <linux/nmi.h> |
16 | #include <linux/sched.h> |
17 | #include <linux/smp.h> |
18 | #include <linux/stat.h> |
19 | #include <linux/stop_machine.h> |
20 | #include <linux/completion.h> |
21 | #include <linux/device.h> |
22 | #include <linux/delay.h> |
23 | #include <linux/slab.h> |
24 | #include <linux/stringify.h> |
25 | |
26 | #include <asm/machdep.h> |
27 | #include <asm/nmi.h> |
28 | #include <asm/rtas.h> |
29 | #include "pseries.h" |
30 | #include "vas.h" /* vas_migration_handler() */ |
31 | #include "../../kernel/cacheinfo.h" |
32 | |
/* kobject for the "mobility" sysfs directory; set up by mobility_sysfs_init(). */
static struct kobject *mobility_kobj;
34 | |
35 | struct update_props_workarea { |
36 | __be32 phandle; |
37 | __be32 state; |
38 | __be64 reserved; |
39 | __be32 nprops; |
40 | } __packed; |
41 | |
/*
 * Each action word in the ibm,update-nodes work area carries the action
 * code in its top byte and the number of affected nodes in the low
 * 24 bits.
 */
#define NODE_ACTION_MASK	0xff000000
#define NODE_COUNT_MASK		0x00ffffff

#define DELETE_DT_NODE	0x01000000
#define UPDATE_DT_NODE	0x02000000
#define ADD_DT_NODE	0x03000000

/*
 * Scope values passed as the last argument of the RTAS update calls.
 * Negative values are parenthesized so the macro expands safely inside
 * any expression.
 */
#define MIGRATION_SCOPE	(1)
#define PRRN_SCOPE	(-2)
51 | |
#ifdef CONFIG_PPC_WATCHDOG
/*
 * Value handed to watchdog_hardlockup_set_timeout_pct() around a partition
 * suspend (see pseries_migrate_partition()). A value of zero means the
 * watchdog timeout is left untouched.
 */
static unsigned int nmi_wd_lpm_factor = 200;

#ifdef CONFIG_SYSCTL
/* Sysctl entry "nmi_wd_lpm_factor", registered under "kernel". */
static struct ctl_table nmi_wd_lpm_factor_ctl_table[] = {
	{
		.procname	= "nmi_wd_lpm_factor",
		.data		= &nmi_wd_lpm_factor,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec_minmax,
	},
};

/* Register the sysctl table at device_initcall time; always returns 0. */
static int __init register_nmi_wd_lpm_factor_sysctl(void)
{
	register_sysctl("kernel", nmi_wd_lpm_factor_ctl_table);

	return 0;
}
device_initcall(register_nmi_wd_lpm_factor_sysctl);
#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_PPC_WATCHDOG */
75 | |
76 | static int mobility_rtas_call(int token, char *buf, s32 scope) |
77 | { |
78 | int rc; |
79 | |
80 | spin_lock(lock: &rtas_data_buf_lock); |
81 | |
82 | memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE); |
83 | rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope); |
84 | memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE); |
85 | |
86 | spin_unlock(lock: &rtas_data_buf_lock); |
87 | return rc; |
88 | } |
89 | |
90 | static int delete_dt_node(struct device_node *dn) |
91 | { |
92 | struct device_node *pdn; |
93 | bool is_platfac; |
94 | |
95 | pdn = of_get_parent(node: dn); |
96 | is_platfac = of_node_is_type(np: dn, type: "ibm,platform-facilities" ) || |
97 | of_node_is_type(np: pdn, type: "ibm,platform-facilities" ); |
98 | of_node_put(node: pdn); |
99 | |
100 | /* |
101 | * The drivers that bind to nodes in the platform-facilities |
102 | * hierarchy don't support node removal, and the removal directive |
103 | * from firmware is always followed by an add of an equivalent |
104 | * node. The capability (e.g. RNG, encryption, compression) |
105 | * represented by the node is never interrupted by the migration. |
106 | * So ignore changes to this part of the tree. |
107 | */ |
108 | if (is_platfac) { |
109 | pr_notice("ignoring remove operation for %pOFfp\n" , dn); |
110 | return 0; |
111 | } |
112 | |
113 | pr_debug("removing node %pOFfp\n" , dn); |
114 | dlpar_detach_node(dn); |
115 | return 0; |
116 | } |
117 | |
118 | static int update_dt_property(struct device_node *dn, struct property **prop, |
119 | const char *name, u32 vd, char *value) |
120 | { |
121 | struct property *new_prop = *prop; |
122 | int more = 0; |
123 | |
124 | /* A negative 'vd' value indicates that only part of the new property |
125 | * value is contained in the buffer and we need to call |
126 | * ibm,update-properties again to get the rest of the value. |
127 | * |
128 | * A negative value is also the two's compliment of the actual value. |
129 | */ |
130 | if (vd & 0x80000000) { |
131 | vd = ~vd + 1; |
132 | more = 1; |
133 | } |
134 | |
135 | if (new_prop) { |
136 | /* partial property fixup */ |
137 | char *new_data = kzalloc(size: new_prop->length + vd, GFP_KERNEL); |
138 | if (!new_data) |
139 | return -ENOMEM; |
140 | |
141 | memcpy(new_data, new_prop->value, new_prop->length); |
142 | memcpy(new_data + new_prop->length, value, vd); |
143 | |
144 | kfree(objp: new_prop->value); |
145 | new_prop->value = new_data; |
146 | new_prop->length += vd; |
147 | } else { |
148 | new_prop = kzalloc(size: sizeof(*new_prop), GFP_KERNEL); |
149 | if (!new_prop) |
150 | return -ENOMEM; |
151 | |
152 | new_prop->name = kstrdup(s: name, GFP_KERNEL); |
153 | if (!new_prop->name) { |
154 | kfree(objp: new_prop); |
155 | return -ENOMEM; |
156 | } |
157 | |
158 | new_prop->length = vd; |
159 | new_prop->value = kzalloc(size: new_prop->length, GFP_KERNEL); |
160 | if (!new_prop->value) { |
161 | kfree(objp: new_prop->name); |
162 | kfree(objp: new_prop); |
163 | return -ENOMEM; |
164 | } |
165 | |
166 | memcpy(new_prop->value, value, vd); |
167 | *prop = new_prop; |
168 | } |
169 | |
170 | if (!more) { |
171 | pr_debug("updating node %pOF property %s\n" , dn, name); |
172 | of_update_property(np: dn, newprop: new_prop); |
173 | *prop = NULL; |
174 | } |
175 | |
176 | return 0; |
177 | } |
178 | |
179 | static int update_dt_node(struct device_node *dn, s32 scope) |
180 | { |
181 | struct update_props_workarea *upwa; |
182 | struct property *prop = NULL; |
183 | int i, rc, rtas_rc; |
184 | char *prop_data; |
185 | char *rtas_buf; |
186 | int update_properties_token; |
187 | u32 nprops; |
188 | u32 vd; |
189 | |
190 | update_properties_token = rtas_function_token(RTAS_FN_IBM_UPDATE_PROPERTIES); |
191 | if (update_properties_token == RTAS_UNKNOWN_SERVICE) |
192 | return -EINVAL; |
193 | |
194 | rtas_buf = kzalloc(size: RTAS_DATA_BUF_SIZE, GFP_KERNEL); |
195 | if (!rtas_buf) |
196 | return -ENOMEM; |
197 | |
198 | upwa = (struct update_props_workarea *)&rtas_buf[0]; |
199 | upwa->phandle = cpu_to_be32(dn->phandle); |
200 | |
201 | do { |
202 | rtas_rc = mobility_rtas_call(token: update_properties_token, buf: rtas_buf, |
203 | scope); |
204 | if (rtas_rc < 0) |
205 | break; |
206 | |
207 | prop_data = rtas_buf + sizeof(*upwa); |
208 | nprops = be32_to_cpu(upwa->nprops); |
209 | |
210 | /* On the first call to ibm,update-properties for a node the |
211 | * first property value descriptor contains an empty |
212 | * property name, the property value length encoded as u32, |
213 | * and the property value is the node path being updated. |
214 | */ |
215 | if (*prop_data == 0) { |
216 | prop_data++; |
217 | vd = be32_to_cpu(*(__be32 *)prop_data); |
218 | prop_data += vd + sizeof(vd); |
219 | nprops--; |
220 | } |
221 | |
222 | for (i = 0; i < nprops; i++) { |
223 | char *prop_name; |
224 | |
225 | prop_name = prop_data; |
226 | prop_data += strlen(prop_name) + 1; |
227 | vd = be32_to_cpu(*(__be32 *)prop_data); |
228 | prop_data += sizeof(vd); |
229 | |
230 | switch (vd) { |
231 | case 0x00000000: |
232 | /* name only property, nothing to do */ |
233 | break; |
234 | |
235 | case 0x80000000: |
236 | of_remove_property(np: dn, prop: of_find_property(np: dn, |
237 | name: prop_name, NULL)); |
238 | prop = NULL; |
239 | break; |
240 | |
241 | default: |
242 | rc = update_dt_property(dn, prop: &prop, name: prop_name, |
243 | vd, value: prop_data); |
244 | if (rc) { |
245 | pr_err("updating %s property failed: %d\n" , |
246 | prop_name, rc); |
247 | } |
248 | |
249 | prop_data += vd; |
250 | break; |
251 | } |
252 | |
253 | cond_resched(); |
254 | } |
255 | |
256 | cond_resched(); |
257 | } while (rtas_rc == 1); |
258 | |
259 | kfree(objp: rtas_buf); |
260 | return 0; |
261 | } |
262 | |
263 | static int add_dt_node(struct device_node *parent_dn, __be32 drc_index) |
264 | { |
265 | struct device_node *dn; |
266 | int rc; |
267 | |
268 | dn = dlpar_configure_connector(drc_index, parent_dn); |
269 | if (!dn) |
270 | return -ENOENT; |
271 | |
272 | /* |
273 | * Since delete_dt_node() ignores this node type, this is the |
274 | * necessary counterpart. We also know that a platform-facilities |
275 | * node returned from dlpar_configure_connector() has children |
276 | * attached, and dlpar_attach_node() only adds the parent, leaking |
277 | * the children. So ignore these on the add side for now. |
278 | */ |
279 | if (of_node_is_type(np: dn, type: "ibm,platform-facilities" )) { |
280 | pr_notice("ignoring add operation for %pOF\n" , dn); |
281 | dlpar_free_cc_nodes(dn); |
282 | return 0; |
283 | } |
284 | |
285 | rc = dlpar_attach_node(dn, parent_dn); |
286 | if (rc) |
287 | dlpar_free_cc_nodes(dn); |
288 | |
289 | pr_debug("added node %pOFfp\n" , dn); |
290 | |
291 | return rc; |
292 | } |
293 | |
294 | static int pseries_devicetree_update(s32 scope) |
295 | { |
296 | char *rtas_buf; |
297 | __be32 *data; |
298 | int update_nodes_token; |
299 | int rc; |
300 | |
301 | update_nodes_token = rtas_function_token(RTAS_FN_IBM_UPDATE_NODES); |
302 | if (update_nodes_token == RTAS_UNKNOWN_SERVICE) |
303 | return 0; |
304 | |
305 | rtas_buf = kzalloc(size: RTAS_DATA_BUF_SIZE, GFP_KERNEL); |
306 | if (!rtas_buf) |
307 | return -ENOMEM; |
308 | |
309 | do { |
310 | rc = mobility_rtas_call(token: update_nodes_token, buf: rtas_buf, scope); |
311 | if (rc && rc != 1) |
312 | break; |
313 | |
314 | data = (__be32 *)rtas_buf + 4; |
315 | while (be32_to_cpu(*data) & NODE_ACTION_MASK) { |
316 | int i; |
317 | u32 action = be32_to_cpu(*data) & NODE_ACTION_MASK; |
318 | u32 node_count = be32_to_cpu(*data) & NODE_COUNT_MASK; |
319 | |
320 | data++; |
321 | |
322 | for (i = 0; i < node_count; i++) { |
323 | struct device_node *np; |
324 | __be32 phandle = *data++; |
325 | __be32 drc_index; |
326 | |
327 | np = of_find_node_by_phandle(be32_to_cpu(phandle)); |
328 | if (!np) { |
329 | pr_warn("Failed lookup: phandle 0x%x for action 0x%x\n" , |
330 | be32_to_cpu(phandle), action); |
331 | continue; |
332 | } |
333 | |
334 | switch (action) { |
335 | case DELETE_DT_NODE: |
336 | delete_dt_node(dn: np); |
337 | break; |
338 | case UPDATE_DT_NODE: |
339 | update_dt_node(dn: np, scope); |
340 | break; |
341 | case ADD_DT_NODE: |
342 | drc_index = *data++; |
343 | add_dt_node(parent_dn: np, drc_index); |
344 | break; |
345 | } |
346 | |
347 | of_node_put(node: np); |
348 | cond_resched(); |
349 | } |
350 | } |
351 | |
352 | cond_resched(); |
353 | } while (rc == 1); |
354 | |
355 | kfree(objp: rtas_buf); |
356 | return rc; |
357 | } |
358 | |
359 | void post_mobility_fixup(void) |
360 | { |
361 | int rc; |
362 | |
363 | rtas_activate_firmware(); |
364 | |
365 | /* |
366 | * We don't want CPUs to go online/offline while the device |
367 | * tree is being updated. |
368 | */ |
369 | cpus_read_lock(); |
370 | |
371 | /* |
372 | * It's common for the destination firmware to replace cache |
373 | * nodes. Release all of the cacheinfo hierarchy's references |
374 | * before updating the device tree. |
375 | */ |
376 | cacheinfo_teardown(); |
377 | |
378 | rc = pseries_devicetree_update(MIGRATION_SCOPE); |
379 | if (rc) |
380 | pr_err("device tree update failed: %d\n" , rc); |
381 | |
382 | cacheinfo_rebuild(); |
383 | |
384 | cpus_read_unlock(); |
385 | |
386 | /* Possibly switch to a new L1 flush type */ |
387 | pseries_setup_security_mitigations(); |
388 | |
389 | /* Reinitialise system information for hv-24x7 */ |
390 | read_24x7_sys_info(); |
391 | |
392 | return; |
393 | } |
394 | |
395 | static int poll_vasi_state(u64 handle, unsigned long *res) |
396 | { |
397 | unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; |
398 | long hvrc; |
399 | int ret; |
400 | |
401 | hvrc = plpar_hcall(H_VASI_STATE, retbuf, handle); |
402 | switch (hvrc) { |
403 | case H_SUCCESS: |
404 | ret = 0; |
405 | *res = retbuf[0]; |
406 | break; |
407 | case H_PARAMETER: |
408 | ret = -EINVAL; |
409 | break; |
410 | case H_FUNCTION: |
411 | ret = -EOPNOTSUPP; |
412 | break; |
413 | case H_HARDWARE: |
414 | default: |
415 | pr_err("unexpected H_VASI_STATE result %ld\n" , hvrc); |
416 | ret = -EIO; |
417 | break; |
418 | } |
419 | return ret; |
420 | } |
421 | |
422 | static int wait_for_vasi_session_suspending(u64 handle) |
423 | { |
424 | unsigned long state; |
425 | int ret; |
426 | |
427 | /* |
428 | * Wait for transition from H_VASI_ENABLED to |
429 | * H_VASI_SUSPENDING. Treat anything else as an error. |
430 | */ |
431 | while (true) { |
432 | ret = poll_vasi_state(handle, res: &state); |
433 | |
434 | if (ret != 0 || state == H_VASI_SUSPENDING) { |
435 | break; |
436 | } else if (state == H_VASI_ENABLED) { |
437 | ssleep(seconds: 1); |
438 | } else { |
439 | pr_err("unexpected H_VASI_STATE result %lu\n" , state); |
440 | ret = -EIO; |
441 | break; |
442 | } |
443 | } |
444 | |
445 | /* |
446 | * Proceed even if H_VASI_STATE is unavailable. If H_JOIN or |
447 | * ibm,suspend-me are also unimplemented, we'll recover then. |
448 | */ |
449 | if (ret == -EOPNOTSUPP) |
450 | ret = 0; |
451 | |
452 | return ret; |
453 | } |
454 | |
455 | static void wait_for_vasi_session_completed(u64 handle) |
456 | { |
457 | unsigned long state = 0; |
458 | int ret; |
459 | |
460 | pr_info("waiting for memory transfer to complete...\n" ); |
461 | |
462 | /* |
463 | * Wait for transition from H_VASI_RESUMED to H_VASI_COMPLETED. |
464 | */ |
465 | while (true) { |
466 | ret = poll_vasi_state(handle, res: &state); |
467 | |
468 | /* |
469 | * If the memory transfer is already complete and the migration |
470 | * has been cleaned up by the hypervisor, H_PARAMETER is return, |
471 | * which is translate in EINVAL by poll_vasi_state(). |
472 | */ |
473 | if (ret == -EINVAL || (!ret && state == H_VASI_COMPLETED)) { |
474 | pr_info("memory transfer completed.\n" ); |
475 | break; |
476 | } |
477 | |
478 | if (ret) { |
479 | pr_err("H_VASI_STATE return error (%d)\n" , ret); |
480 | break; |
481 | } |
482 | |
483 | if (state != H_VASI_RESUMED) { |
484 | pr_err("unexpected H_VASI_STATE result %lu\n" , state); |
485 | break; |
486 | } |
487 | |
488 | msleep(msecs: 500); |
489 | } |
490 | } |
491 | |
492 | static void prod_single(unsigned int target_cpu) |
493 | { |
494 | long hvrc; |
495 | int hwid; |
496 | |
497 | hwid = get_hard_smp_processor_id(target_cpu); |
498 | hvrc = plpar_hcall_norets(H_PROD, hwid); |
499 | if (hvrc == H_SUCCESS) |
500 | return; |
501 | pr_err_ratelimited("H_PROD of CPU %u (hwid %d) error: %ld\n" , |
502 | target_cpu, hwid, hvrc); |
503 | } |
504 | |
505 | static void prod_others(void) |
506 | { |
507 | unsigned int cpu; |
508 | |
509 | for_each_online_cpu(cpu) { |
510 | if (cpu != smp_processor_id()) |
511 | prod_single(target_cpu: cpu); |
512 | } |
513 | } |
514 | |
515 | static u16 clamp_slb_size(void) |
516 | { |
517 | #ifdef CONFIG_PPC_64S_HASH_MMU |
518 | u16 prev = mmu_slb_size; |
519 | |
520 | slb_set_size(SLB_MIN_SIZE); |
521 | |
522 | return prev; |
523 | #else |
524 | return 0; |
525 | #endif |
526 | } |
527 | |
528 | static int do_suspend(void) |
529 | { |
530 | u16 saved_slb_size; |
531 | int status; |
532 | int ret; |
533 | |
534 | pr_info("calling ibm,suspend-me on CPU %i\n" , smp_processor_id()); |
535 | |
536 | /* |
537 | * The destination processor model may have fewer SLB entries |
538 | * than the source. We reduce mmu_slb_size to a safe minimum |
539 | * before suspending in order to minimize the possibility of |
540 | * programming non-existent entries on the destination. If |
541 | * suspend fails, we restore it before returning. On success |
542 | * the OF reconfig path will update it from the new device |
543 | * tree after resuming on the destination. |
544 | */ |
545 | saved_slb_size = clamp_slb_size(); |
546 | |
547 | ret = rtas_ibm_suspend_me(&status); |
548 | if (ret != 0) { |
549 | pr_err("ibm,suspend-me error: %d\n" , status); |
550 | slb_set_size(saved_slb_size); |
551 | } |
552 | |
553 | return ret; |
554 | } |
555 | |
556 | /** |
557 | * struct pseries_suspend_info - State shared between CPUs for join/suspend. |
558 | * @counter: Threads are to increment this upon resuming from suspend |
559 | * or if an error is received from H_JOIN. The thread which performs |
560 | * the first increment (i.e. sets it to 1) is responsible for |
561 | * waking the other threads. |
562 | * @done: False if join/suspend is in progress. True if the operation is |
563 | * complete (successful or not). |
564 | */ |
565 | struct pseries_suspend_info { |
566 | atomic_t counter; |
567 | bool done; |
568 | }; |
569 | |
570 | static int do_join(void *arg) |
571 | { |
572 | struct pseries_suspend_info *info = arg; |
573 | atomic_t *counter = &info->counter; |
574 | long hvrc; |
575 | int ret; |
576 | |
577 | retry: |
578 | /* Must ensure MSR.EE off for H_JOIN. */ |
579 | hard_irq_disable(); |
580 | hvrc = plpar_hcall_norets(H_JOIN); |
581 | |
582 | switch (hvrc) { |
583 | case H_CONTINUE: |
584 | /* |
585 | * All other CPUs are offline or in H_JOIN. This CPU |
586 | * attempts the suspend. |
587 | */ |
588 | ret = do_suspend(); |
589 | break; |
590 | case H_SUCCESS: |
591 | /* |
592 | * The suspend is complete and this cpu has received a |
593 | * prod, or we've received a stray prod from unrelated |
594 | * code (e.g. paravirt spinlocks) and we need to join |
595 | * again. |
596 | * |
597 | * This barrier orders the return from H_JOIN above vs |
598 | * the load of info->done. It pairs with the barrier |
599 | * in the wakeup/prod path below. |
600 | */ |
601 | smp_mb(); |
602 | if (READ_ONCE(info->done) == false) { |
603 | pr_info_ratelimited("premature return from H_JOIN on CPU %i, retrying" , |
604 | smp_processor_id()); |
605 | goto retry; |
606 | } |
607 | ret = 0; |
608 | break; |
609 | case H_BAD_MODE: |
610 | case H_HARDWARE: |
611 | default: |
612 | ret = -EIO; |
613 | pr_err_ratelimited("H_JOIN error %ld on CPU %i\n" , |
614 | hvrc, smp_processor_id()); |
615 | break; |
616 | } |
617 | |
618 | if (atomic_inc_return(v: counter) == 1) { |
619 | pr_info("CPU %u waking all threads\n" , smp_processor_id()); |
620 | WRITE_ONCE(info->done, true); |
621 | /* |
622 | * This barrier orders the store to info->done vs subsequent |
623 | * H_PRODs to wake the other CPUs. It pairs with the barrier |
624 | * in the H_SUCCESS case above. |
625 | */ |
626 | smp_mb(); |
627 | prod_others(); |
628 | } |
629 | /* |
630 | * Execution may have been suspended for several seconds, so reset |
631 | * the watchdogs. touch_nmi_watchdog() also touches the soft lockup |
632 | * watchdog. |
633 | */ |
634 | rcu_cpu_stall_reset(); |
635 | touch_nmi_watchdog(); |
636 | |
637 | return ret; |
638 | } |
639 | |
/*
 * Values for byte 0 of the abort reason code, identifying the aborting
 * entity. Only MIGRATING_PARTITION is used in this file.
 */
enum vasi_aborting_entity {
	ORCHESTRATOR		= 1,
	VSP_SOURCE		= 2,
	PARTITION_FIRMWARE	= 3,
	PLATFORM_FIRMWARE	= 4,
	VSP_TARGET		= 5,
	MIGRATING_PARTITION	= 6,
};
651 | |
652 | static void pseries_cancel_migration(u64 handle, int err) |
653 | { |
654 | u32 reason_code; |
655 | u32 detail; |
656 | u8 entity; |
657 | long hvrc; |
658 | |
659 | entity = MIGRATING_PARTITION; |
660 | detail = abs(err) & 0xffffff; |
661 | reason_code = (entity << 24) | detail; |
662 | |
663 | hvrc = plpar_hcall_norets(H_VASI_SIGNAL, handle, |
664 | H_VASI_SIGNAL_CANCEL, reason_code); |
665 | if (hvrc) |
666 | pr_err("H_VASI_SIGNAL error: %ld\n" , hvrc); |
667 | } |
668 | |
669 | static int pseries_suspend(u64 handle) |
670 | { |
671 | const unsigned int max_attempts = 5; |
672 | unsigned int retry_interval_ms = 1; |
673 | unsigned int attempt = 1; |
674 | int ret; |
675 | |
676 | while (true) { |
677 | struct pseries_suspend_info info; |
678 | unsigned long vasi_state; |
679 | int vasi_err; |
680 | |
681 | info = (struct pseries_suspend_info) { |
682 | .counter = ATOMIC_INIT(0), |
683 | .done = false, |
684 | }; |
685 | |
686 | ret = stop_machine(fn: do_join, data: &info, cpu_online_mask); |
687 | if (ret == 0) |
688 | break; |
689 | /* |
690 | * Encountered an error. If the VASI stream is still |
691 | * in Suspending state, it's likely a transient |
692 | * condition related to some device in the partition |
693 | * and we can retry in the hope that the cause has |
694 | * cleared after some delay. |
695 | * |
696 | * A better design would allow drivers etc to prepare |
697 | * for the suspend and avoid conditions which prevent |
698 | * the suspend from succeeding. For now, we have this |
699 | * mitigation. |
700 | */ |
701 | pr_notice("Partition suspend attempt %u of %u error: %d\n" , |
702 | attempt, max_attempts, ret); |
703 | |
704 | if (attempt == max_attempts) |
705 | break; |
706 | |
707 | vasi_err = poll_vasi_state(handle, res: &vasi_state); |
708 | if (vasi_err == 0) { |
709 | if (vasi_state != H_VASI_SUSPENDING) { |
710 | pr_notice("VASI state %lu after failed suspend\n" , |
711 | vasi_state); |
712 | break; |
713 | } |
714 | } else if (vasi_err != -EOPNOTSUPP) { |
715 | pr_err("VASI state poll error: %d" , vasi_err); |
716 | break; |
717 | } |
718 | |
719 | pr_notice("Will retry partition suspend after %u ms\n" , |
720 | retry_interval_ms); |
721 | |
722 | msleep(msecs: retry_interval_ms); |
723 | retry_interval_ms *= 10; |
724 | attempt++; |
725 | } |
726 | |
727 | return ret; |
728 | } |
729 | |
730 | static int pseries_migrate_partition(u64 handle) |
731 | { |
732 | int ret; |
733 | unsigned int factor = 0; |
734 | |
735 | #ifdef CONFIG_PPC_WATCHDOG |
736 | factor = nmi_wd_lpm_factor; |
737 | #endif |
738 | /* |
739 | * When the migration is initiated, the hypervisor changes VAS |
740 | * mappings to prepare before OS gets the notification and |
741 | * closes all VAS windows. NX generates continuous faults during |
742 | * this time and the user space can not differentiate these |
743 | * faults from the migration event. So reduce this time window |
744 | * by closing VAS windows at the beginning of this function. |
745 | */ |
746 | vas_migration_handler(action: VAS_SUSPEND); |
747 | |
748 | ret = wait_for_vasi_session_suspending(handle); |
749 | if (ret) |
750 | goto out; |
751 | |
752 | if (factor) |
753 | watchdog_hardlockup_set_timeout_pct(factor); |
754 | |
755 | ret = pseries_suspend(handle); |
756 | if (ret == 0) { |
757 | post_mobility_fixup(); |
758 | /* |
759 | * Wait until the memory transfer is complete, so that the user |
760 | * space process returns from the syscall after the transfer is |
761 | * complete. This allows the user hooks to be executed at the |
762 | * right time. |
763 | */ |
764 | wait_for_vasi_session_completed(handle); |
765 | } else |
766 | pseries_cancel_migration(handle, err: ret); |
767 | |
768 | if (factor) |
769 | watchdog_hardlockup_set_timeout_pct(0); |
770 | |
771 | out: |
772 | vas_migration_handler(action: VAS_RESUME); |
773 | |
774 | return ret; |
775 | } |
776 | |
777 | int rtas_syscall_dispatch_ibm_suspend_me(u64 handle) |
778 | { |
779 | return pseries_migrate_partition(handle); |
780 | } |
781 | |
782 | static ssize_t migration_store(const struct class *class, |
783 | const struct class_attribute *attr, const char *buf, |
784 | size_t count) |
785 | { |
786 | u64 streamid; |
787 | int rc; |
788 | |
789 | rc = kstrtou64(s: buf, base: 0, res: &streamid); |
790 | if (rc) |
791 | return rc; |
792 | |
793 | rc = pseries_migrate_partition(handle: streamid); |
794 | if (rc) |
795 | return rc; |
796 | |
797 | return count; |
798 | } |
799 | |
800 | /* |
801 | * Used by drmgr to determine the kernel behavior of the migration interface. |
802 | * |
803 | * Version 1: Performs all PAPR requirements for migration including |
804 | * firmware activation and device tree update. |
805 | */ |
806 | #define MIGRATION_API_VERSION 1 |
807 | |
808 | static CLASS_ATTR_WO(migration); |
809 | static CLASS_ATTR_STRING(api_version, 0444, __stringify(MIGRATION_API_VERSION)); |
810 | |
811 | static int __init mobility_sysfs_init(void) |
812 | { |
813 | int rc; |
814 | |
815 | mobility_kobj = kobject_create_and_add(name: "mobility" , parent: kernel_kobj); |
816 | if (!mobility_kobj) |
817 | return -ENOMEM; |
818 | |
819 | rc = sysfs_create_file(kobj: mobility_kobj, attr: &class_attr_migration.attr); |
820 | if (rc) |
821 | pr_err("unable to create migration sysfs file (%d)\n" , rc); |
822 | |
823 | rc = sysfs_create_file(kobj: mobility_kobj, attr: &class_attr_api_version.attr.attr); |
824 | if (rc) |
825 | pr_err("unable to create api_version sysfs file (%d)\n" , rc); |
826 | |
827 | return 0; |
828 | } |
829 | machine_device_initcall(pseries, mobility_sysfs_init); |
830 | |