| 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | /* |
| 3 | * Deferred user space unwinding |
| 4 | */ |
| 5 | #include <linux/sched/task_stack.h> |
| 6 | #include <linux/unwind_deferred.h> |
| 7 | #include <linux/sched/clock.h> |
| 8 | #include <linux/task_work.h> |
| 9 | #include <linux/kernel.h> |
| 10 | #include <linux/sched.h> |
| 11 | #include <linux/sizes.h> |
| 12 | #include <linux/slab.h> |
| 13 | #include <linux/mm.h> |
| 14 | |
| 15 | /* |
| 16 | * For requesting a deferred user space stack trace from NMI context |
| 17 | * the architecture must support a safe cmpxchg in NMI context. |
| 18 | * For those architectures that do not have that, then it cannot ask |
| 19 | * for a deferred user space stack trace from an NMI context. If it |
| 20 | * does, then it will get -EINVAL. |
| 21 | */ |
| 22 | #if defined(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) |
| 23 | # define CAN_USE_IN_NMI 1 |
| 24 | static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt) |
| 25 | { |
| 26 | u32 old = 0; |
| 27 | |
| 28 | return try_cmpxchg(&info->id.cnt, &old, cnt); |
| 29 | } |
| 30 | #else |
| 31 | # define CAN_USE_IN_NMI 0 |
| 32 | /* When NMIs are not allowed, this always succeeds */ |
| 33 | static inline bool try_assign_cnt(struct unwind_task_info *info, u32 cnt) |
| 34 | { |
| 35 | info->id.cnt = cnt; |
| 36 | return true; |
| 37 | } |
| 38 | #endif |
| 39 | |
/*
 * Number of entries in the cache, chosen so that the unwind_cache header
 * plus its entries fit in a 4K page.
 */
#define UNWIND_MAX_ENTRIES					\
	((SZ_4K - sizeof(struct unwind_cache)) / sizeof(long))
| 43 | |
| 44 | /* Guards adding to or removing from the list of callbacks */ |
| 45 | static DEFINE_MUTEX(callback_mutex); |
| 46 | static LIST_HEAD(callbacks); |
| 47 | |
| 48 | #define RESERVED_BITS (UNWIND_PENDING | UNWIND_USED) |
| 49 | |
| 50 | /* Zero'd bits are available for assigning callback users */ |
| 51 | static unsigned long unwind_mask = RESERVED_BITS; |
| 52 | DEFINE_STATIC_SRCU(unwind_srcu); |
| 53 | |
| 54 | static inline bool unwind_pending(struct unwind_task_info *info) |
| 55 | { |
| 56 | return atomic_long_read(v: &info->unwind_mask) & UNWIND_PENDING; |
| 57 | } |
| 58 | |
/*
 * Per CPU counter used to build a unique identifier for a task's entry
 * context. Conceptually it advances each time the CPU enters the kernel
 * from user space, giving every "entry context" on that CPU its own ID.
 * As an optimization it is really only advanced on demand, by the first
 * deferred unwind request made after a given entry-from-user.
 *
 * Together with the CPU id it forms a systemwide-unique "context cookie".
 */
static DEFINE_PER_CPU(u32, unwind_ctx_ctr);
| 69 | |
| 70 | /* |
| 71 | * The context cookie is a unique identifier that is assigned to a user |
| 72 | * space stacktrace. As the user space stacktrace remains the same while |
| 73 | * the task is in the kernel, the cookie is an identifier for the stacktrace. |
| 74 | * Although it is possible for the stacktrace to get another cookie if another |
| 75 | * request is made after the cookie was cleared and before reentering user |
| 76 | * space. |
| 77 | */ |
| 78 | static u64 get_cookie(struct unwind_task_info *info) |
| 79 | { |
| 80 | u32 cnt = 1; |
| 81 | |
| 82 | lockdep_assert_irqs_disabled(); |
| 83 | |
| 84 | if (info->id.cpu) |
| 85 | return info->id.id; |
| 86 | |
| 87 | /* LSB is always set to ensure 0 is an invalid value */ |
| 88 | cnt |= __this_cpu_read(unwind_ctx_ctr) + 2; |
| 89 | if (try_assign_cnt(info, cnt)) { |
| 90 | /* Update the per cpu counter */ |
| 91 | __this_cpu_write(unwind_ctx_ctr, cnt); |
| 92 | } |
| 93 | /* Interrupts are disabled, the CPU will always be same */ |
| 94 | info->id.cpu = smp_processor_id() + 1; /* Must be non zero */ |
| 95 | |
| 96 | return info->id.id; |
| 97 | } |
| 98 | |
| 99 | /** |
| 100 | * unwind_user_faultable - Produce a user stacktrace in faultable context |
| 101 | * @trace: The descriptor that will store the user stacktrace |
| 102 | * |
| 103 | * This must be called in a known faultable context (usually when entering |
| 104 | * or exiting user space). Depending on the available implementations |
| 105 | * the @trace will be loaded with the addresses of the user space stacktrace |
| 106 | * if it can be found. |
| 107 | * |
| 108 | * Return: 0 on success and negative on error |
| 109 | * On success @trace will contain the user space stacktrace |
| 110 | */ |
| 111 | int unwind_user_faultable(struct unwind_stacktrace *trace) |
| 112 | { |
| 113 | struct unwind_task_info *info = ¤t->unwind_info; |
| 114 | struct unwind_cache *cache; |
| 115 | |
| 116 | /* Should always be called from faultable context */ |
| 117 | might_fault(); |
| 118 | |
| 119 | if (!current->mm) |
| 120 | return -EINVAL; |
| 121 | |
| 122 | if (!info->cache) { |
| 123 | info->cache = kzalloc(struct_size(cache, entries, UNWIND_MAX_ENTRIES), |
| 124 | GFP_KERNEL); |
| 125 | if (!info->cache) |
| 126 | return -ENOMEM; |
| 127 | } |
| 128 | |
| 129 | cache = info->cache; |
| 130 | trace->entries = cache->entries; |
| 131 | trace->nr = cache->nr_entries; |
| 132 | /* |
| 133 | * The user stack has already been previously unwound in this |
| 134 | * entry context. Skip the unwind and use the cache. |
| 135 | */ |
| 136 | if (trace->nr) |
| 137 | return 0; |
| 138 | |
| 139 | unwind_user(trace, UNWIND_MAX_ENTRIES); |
| 140 | |
| 141 | cache->nr_entries = trace->nr; |
| 142 | |
| 143 | /* Clear nr_entries on way back to user space */ |
| 144 | atomic_long_or(i: UNWIND_USED, v: &info->unwind_mask); |
| 145 | |
| 146 | return 0; |
| 147 | } |
| 148 | |
| 149 | static void process_unwind_deferred(struct task_struct *task) |
| 150 | { |
| 151 | struct unwind_task_info *info = &task->unwind_info; |
| 152 | struct unwind_stacktrace trace; |
| 153 | struct unwind_work *work; |
| 154 | unsigned long bits; |
| 155 | u64 cookie; |
| 156 | |
| 157 | if (WARN_ON_ONCE(!unwind_pending(info))) |
| 158 | return; |
| 159 | |
| 160 | /* Clear pending bit but make sure to have the current bits */ |
| 161 | bits = atomic_long_fetch_andnot(i: UNWIND_PENDING, |
| 162 | v: &info->unwind_mask); |
| 163 | /* |
| 164 | * From here on out, the callback must always be called, even if it's |
| 165 | * just an empty trace. |
| 166 | */ |
| 167 | trace.nr = 0; |
| 168 | trace.entries = NULL; |
| 169 | |
| 170 | unwind_user_faultable(trace: &trace); |
| 171 | |
| 172 | if (info->cache) |
| 173 | bits &= ~(info->cache->unwind_completed); |
| 174 | |
| 175 | cookie = info->id.id; |
| 176 | |
| 177 | guard(srcu)(l: &unwind_srcu); |
| 178 | list_for_each_entry_srcu(work, &callbacks, list, |
| 179 | srcu_read_lock_held(&unwind_srcu)) { |
| 180 | if (test_bit(work->bit, &bits)) { |
| 181 | work->func(work, &trace, cookie); |
| 182 | if (info->cache) |
| 183 | info->cache->unwind_completed |= BIT(work->bit); |
| 184 | } |
| 185 | } |
| 186 | } |
| 187 | |
| 188 | static void unwind_deferred_task_work(struct callback_head *head) |
| 189 | { |
| 190 | process_unwind_deferred(current); |
| 191 | } |
| 192 | |
| 193 | void unwind_deferred_task_exit(struct task_struct *task) |
| 194 | { |
| 195 | struct unwind_task_info *info = ¤t->unwind_info; |
| 196 | |
| 197 | if (!unwind_pending(info)) |
| 198 | return; |
| 199 | |
| 200 | process_unwind_deferred(task); |
| 201 | |
| 202 | task_work_cancel(task, cb: &info->work); |
| 203 | } |
| 204 | |
| 205 | /** |
| 206 | * unwind_deferred_request - Request a user stacktrace on task kernel exit |
| 207 | * @work: Unwind descriptor requesting the trace |
| 208 | * @cookie: The cookie of the first request made for this task |
| 209 | * |
| 210 | * Schedule a user space unwind to be done in task work before exiting the |
| 211 | * kernel. |
| 212 | * |
| 213 | * The returned @cookie output is the generated cookie of the very first |
| 214 | * request for a user space stacktrace for this task since it entered the |
| 215 | * kernel. It can be from a request by any caller of this infrastructure. |
| 216 | * Its value will also be passed to the callback function. It can be |
| 217 | * used to stitch kernel and user stack traces together in post-processing. |
| 218 | * |
| 219 | * It's valid to call this function multiple times for the same @work within |
| 220 | * the same task entry context. Each call will return the same cookie |
| 221 | * while the task hasn't left the kernel. If the callback is not pending |
| 222 | * because it has already been previously called for the same entry context, |
| 223 | * it will be called again with the same stack trace and cookie. |
| 224 | * |
| 225 | * Return: 0 if the callback successfully was queued. |
| 226 | * 1 if the callback is pending or was already executed. |
| 227 | * Negative if there's an error. |
| 228 | * @cookie holds the cookie of the first request by any user |
| 229 | */ |
| 230 | int unwind_deferred_request(struct unwind_work *work, u64 *cookie) |
| 231 | { |
| 232 | struct unwind_task_info *info = ¤t->unwind_info; |
| 233 | int twa_mode = TWA_RESUME; |
| 234 | unsigned long old, bits; |
| 235 | unsigned long bit; |
| 236 | int ret; |
| 237 | |
| 238 | *cookie = 0; |
| 239 | |
| 240 | if ((current->flags & (PF_KTHREAD | PF_EXITING)) || |
| 241 | !user_mode(task_pt_regs(current))) |
| 242 | return -EINVAL; |
| 243 | |
| 244 | /* |
| 245 | * NMI requires having safe cmpxchg operations. |
| 246 | * Trigger a warning to make it obvious that an architecture |
| 247 | * is using this in NMI when it should not be. |
| 248 | */ |
| 249 | if (in_nmi()) { |
| 250 | if (WARN_ON_ONCE(!CAN_USE_IN_NMI)) |
| 251 | return -EINVAL; |
| 252 | twa_mode = TWA_NMI_CURRENT; |
| 253 | } |
| 254 | |
| 255 | /* Do not allow cancelled works to request again */ |
| 256 | bit = READ_ONCE(work->bit); |
| 257 | if (WARN_ON_ONCE(bit < 0)) |
| 258 | return -EINVAL; |
| 259 | |
| 260 | /* Only need the mask now */ |
| 261 | bit = BIT(bit); |
| 262 | |
| 263 | guard(irqsave)(); |
| 264 | |
| 265 | *cookie = get_cookie(info); |
| 266 | |
| 267 | old = atomic_long_read(v: &info->unwind_mask); |
| 268 | |
| 269 | /* Is this already queued or executed */ |
| 270 | if (old & bit) |
| 271 | return 1; |
| 272 | |
| 273 | /* |
| 274 | * This work's bit hasn't been set yet. Now set it with the PENDING |
| 275 | * bit and fetch the current value of unwind_mask. If ether the |
| 276 | * work's bit or PENDING was already set, then this is already queued |
| 277 | * to have a callback. |
| 278 | */ |
| 279 | bits = UNWIND_PENDING | bit; |
| 280 | old = atomic_long_fetch_or(i: bits, v: &info->unwind_mask); |
| 281 | if (old & bits) { |
| 282 | /* |
| 283 | * If the work's bit was set, whatever set it had better |
| 284 | * have also set pending and queued a callback. |
| 285 | */ |
| 286 | WARN_ON_ONCE(!(old & UNWIND_PENDING)); |
| 287 | return old & bit; |
| 288 | } |
| 289 | |
| 290 | /* The work has been claimed, now schedule it. */ |
| 291 | ret = task_work_add(current, twork: &info->work, mode: twa_mode); |
| 292 | |
| 293 | if (WARN_ON_ONCE(ret)) |
| 294 | atomic_long_set(v: &info->unwind_mask, i: 0); |
| 295 | |
| 296 | return ret; |
| 297 | } |
| 298 | |
| 299 | void unwind_deferred_cancel(struct unwind_work *work) |
| 300 | { |
| 301 | struct task_struct *g, *t; |
| 302 | int bit; |
| 303 | |
| 304 | if (!work) |
| 305 | return; |
| 306 | |
| 307 | bit = work->bit; |
| 308 | |
| 309 | /* No work should be using a reserved bit */ |
| 310 | if (WARN_ON_ONCE(BIT(bit) & RESERVED_BITS)) |
| 311 | return; |
| 312 | |
| 313 | guard(mutex)(T: &callback_mutex); |
| 314 | list_del_rcu(entry: &work->list); |
| 315 | |
| 316 | /* Do not allow any more requests and prevent callbacks */ |
| 317 | work->bit = -1; |
| 318 | |
| 319 | __clear_bit(bit, &unwind_mask); |
| 320 | |
| 321 | synchronize_srcu(ssp: &unwind_srcu); |
| 322 | |
| 323 | guard(rcu)(); |
| 324 | /* Clear this bit from all threads */ |
| 325 | for_each_process_thread(g, t) { |
| 326 | atomic_long_andnot(BIT(bit), |
| 327 | v: &t->unwind_info.unwind_mask); |
| 328 | if (t->unwind_info.cache) |
| 329 | clear_bit(nr: bit, addr: &t->unwind_info.cache->unwind_completed); |
| 330 | } |
| 331 | } |
| 332 | |
| 333 | int unwind_deferred_init(struct unwind_work *work, unwind_callback_t func) |
| 334 | { |
| 335 | memset(work, 0, sizeof(*work)); |
| 336 | |
| 337 | guard(mutex)(T: &callback_mutex); |
| 338 | |
| 339 | /* See if there's a bit in the mask available */ |
| 340 | if (unwind_mask == ~0UL) |
| 341 | return -EBUSY; |
| 342 | |
| 343 | work->bit = ffz(unwind_mask); |
| 344 | __set_bit(work->bit, &unwind_mask); |
| 345 | |
| 346 | list_add_rcu(new: &work->list, head: &callbacks); |
| 347 | work->func = func; |
| 348 | return 0; |
| 349 | } |
| 350 | |
| 351 | void unwind_task_init(struct task_struct *task) |
| 352 | { |
| 353 | struct unwind_task_info *info = &task->unwind_info; |
| 354 | |
| 355 | memset(info, 0, sizeof(*info)); |
| 356 | init_task_work(twork: &info->work, func: unwind_deferred_task_work); |
| 357 | atomic_long_set(v: &info->unwind_mask, i: 0); |
| 358 | } |
| 359 | |
| 360 | void unwind_task_free(struct task_struct *task) |
| 361 | { |
| 362 | struct unwind_task_info *info = &task->unwind_info; |
| 363 | |
| 364 | kfree(objp: info->cache); |
| 365 | task_work_cancel(task, cb: &info->work); |
| 366 | } |
| 367 | |