// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright(c) 2020 Cornelis Networks, Inc.
 * Copyright(c) 2016 - 2017 Intel Corporation.
 */

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/mmu_notifier.h>
#include <linux/interval_tree_generic.h>
#include <linux/sched/mm.h>

#include "mmu_rb.h"
#include "trace.h"

static unsigned long mmu_node_start(struct mmu_rb_node *);
static unsigned long mmu_node_last(struct mmu_rb_node *);
static int mmu_notifier_range_start(struct mmu_notifier *,
				    const struct mmu_notifier_range *);
static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
					   unsigned long, unsigned long);
static void release_immediate(struct kref *refcount);
static void handle_remove(struct work_struct *work);

static const struct mmu_notifier_ops mn_opts = {
	.invalidate_range_start = mmu_notifier_range_start,
};

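/*
 * INTERVAL_TREE_DEFINE() below generates the static interval-tree helpers
 * used throughout this file: __mmu_int_rb_insert(), __mmu_int_rb_remove(),
 * __mmu_int_rb_iter_first() and __mmu_int_rb_iter_next(), keyed on the
 * closed interval [mmu_node_start(node), mmu_node_last(node)].
 */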
INTERVAL_TREE_DEFINE(struct mmu_rb_node, node, unsigned long, __last,
		     mmu_node_start, mmu_node_last, static, __mmu_int_rb);

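/*
 * Node intervals are widened to page granularity: the start is rounded
 * down to a page boundary and the last byte is the end of the last page
 * covered by [addr, addr + len), so overlap tests operate on whole pages.
 */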
static unsigned long mmu_node_start(struct mmu_rb_node *node)
{
	return node->addr & PAGE_MASK;
}

static unsigned long mmu_node_last(struct mmu_rb_node *node)
{
	return PAGE_ALIGN(node->addr + node->len) - 1;
}

int hfi1_mmu_rb_register(void *ops_arg,
			 const struct mmu_rb_ops *ops,
			 struct workqueue_struct *wq,
			 struct mmu_rb_handler **handler)
{
	struct mmu_rb_handler *h;
	void *free_ptr;
	int ret;

	free_ptr = kzalloc(sizeof(*h) + cache_line_size() - 1, GFP_KERNEL);
	if (!free_ptr)
		return -ENOMEM;

	h = PTR_ALIGN(free_ptr, cache_line_size());
	h->root = RB_ROOT_CACHED;
	h->ops = ops;
	h->ops_arg = ops_arg;
	INIT_HLIST_NODE(&h->mn.hlist);
	spin_lock_init(&h->lock);
	h->mn.ops = &mn_opts;
	INIT_WORK(&h->del_work, handle_remove);
	INIT_LIST_HEAD(&h->del_list);
	INIT_LIST_HEAD(&h->lru_list);
	h->wq = wq;
	h->free_ptr = free_ptr;

	ret = mmu_notifier_register(&h->mn, current->mm);
	if (ret) {
		kfree(free_ptr);
		return ret;
	}

	*handler = h;
	return 0;
}
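
/*
 * A minimal usage sketch, assuming caller-supplied my_ops, my_wq and
 * my_arg (hypothetical names): register a handler against current->mm,
 * insert a node, then tear everything down. ops->remove() is invoked for
 * any nodes still present at unregister time.
 *
 *	struct mmu_rb_handler *handler;
 *	int ret;
 *
 *	ret = hfi1_mmu_rb_register(my_arg, &my_ops, my_wq, &handler);
 *	if (ret)
 *		return ret;
 *	ret = hfi1_mmu_rb_insert(handler, node);
 *	...
 *	hfi1_mmu_rb_unregister(handler);
 */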

void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler)
{
	struct mmu_rb_node *rbnode;
	struct rb_node *node;
	unsigned long flags;
	struct list_head del_list;

	/* Prevent freeing of mm until we are completely finished. */
	mmgrab(handler->mn.mm);

	/* Unregister first so we don't get any more notifications. */
	mmu_notifier_unregister(&handler->mn, handler->mn.mm);

	/*
	 * Make sure the wq delete handler is finished running. It will not
	 * be triggered once the mmu notifiers are unregistered above.
	 */
	flush_work(&handler->del_work);

	INIT_LIST_HEAD(&del_list);

	spin_lock_irqsave(&handler->lock, flags);
	while ((node = rb_first_cached(&handler->root))) {
		rbnode = rb_entry(node, struct mmu_rb_node, node);
		rb_erase_cached(node, &handler->root);
		/* move from LRU list to delete list */
		list_move(&rbnode->list, &del_list);
	}
	spin_unlock_irqrestore(&handler->lock, flags);

	while (!list_empty(&del_list)) {
		rbnode = list_first_entry(&del_list, struct mmu_rb_node, list);
		list_del(&rbnode->list);
		kref_put(&rbnode->refcount, release_immediate);
	}

	/* Now the mm may be freed. */
	mmdrop(handler->mn.mm);

	kfree(handler->free_ptr);
}

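/*
 * Insert mnode into the interval tree and onto the tail of the LRU list.
 * Returns -EPERM if called from a task whose mm does not match the one
 * the handler was registered against, and -EEXIST if the node's range
 * overlaps an existing entry.
 */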
int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler,
		       struct mmu_rb_node *mnode)
{
	struct mmu_rb_node *node;
	unsigned long flags;
	int ret = 0;

	trace_hfi1_mmu_rb_insert(mnode);

	if (current->mm != handler->mn.mm)
		return -EPERM;

	spin_lock_irqsave(&handler->lock, flags);
	node = __mmu_rb_search(handler, mnode->addr, mnode->len);
	if (node) {
		ret = -EEXIST;
		goto unlock;
	}
	__mmu_int_rb_insert(mnode, &handler->root);
	list_add_tail(&mnode->list, &handler->lru_list);
	mnode->handler = handler;
unlock:
	spin_unlock_irqrestore(&handler->lock, flags);
	return ret;
}

/* Caller must hold handler lock */
struct mmu_rb_node *hfi1_mmu_rb_get_first(struct mmu_rb_handler *handler,
					  unsigned long addr, unsigned long len)
{
	struct mmu_rb_node *node;

	trace_hfi1_mmu_rb_search(addr, len);
	node = __mmu_int_rb_iter_first(&handler->root, addr,
				       (addr + len) - 1);
	if (node)
		list_move_tail(&node->list, &handler->lru_list);
	return node;
}
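
/*
 * A hedged lookup sketch: callers take handler->lock around
 * hfi1_mmu_rb_get_first() and, if they intend to use the node after
 * dropping the lock, take their own reference first. kref_get_unless_zero()
 * is shown as one option, not a requirement of this API:
 *
 *	spin_lock_irqsave(&handler->lock, flags);
 *	node = hfi1_mmu_rb_get_first(handler, addr, len);
 *	if (node && !kref_get_unless_zero(&node->refcount))
 *		node = NULL;
 *	spin_unlock_irqrestore(&handler->lock, flags);
 */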

/* Caller must hold handler lock */
static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
					   unsigned long addr,
					   unsigned long len)
{
	struct mmu_rb_node *node = NULL;

	trace_hfi1_mmu_rb_search(addr, len);
	if (!handler->ops->filter) {
		node = __mmu_int_rb_iter_first(&handler->root, addr,
					       (addr + len) - 1);
	} else {
		for (node = __mmu_int_rb_iter_first(&handler->root, addr,
						    (addr + len) - 1);
		     node;
		     node = __mmu_int_rb_iter_next(node, addr,
						   (addr + len) - 1)) {
			if (handler->ops->filter(node, addr, len))
				return node;
		}
	}
	return node;
}

/*
 * Must NOT call while holding mnode->handler->lock.
 * mnode->handler->ops->remove() may sleep and mnode->handler->lock is a
 * spinlock.
 */
static void release_immediate(struct kref *refcount)
{
	struct mmu_rb_node *mnode =
		container_of(refcount, struct mmu_rb_node, refcount);
	trace_hfi1_mmu_release_node(mnode);
	mnode->handler->ops->remove(mnode->handler->ops_arg, mnode);
}

/* Caller must hold mnode->handler->lock */
static void release_nolock(struct kref *refcount)
{
	struct mmu_rb_node *mnode =
		container_of(refcount, struct mmu_rb_node, refcount);
	list_move(&mnode->list, &mnode->handler->del_list);
	queue_work(mnode->handler->wq, &mnode->handler->del_work);
}

/*
 * struct mmu_rb_node->refcount kref_put() callback.
 * Moves mmu_rb_node from handler->lru_list to handler->del_list and
 * queues handler->del_work on handler->wq.
 * Does not remove mmu_rb_node from handler->root.
 * Acquires mmu_rb_node->handler->lock; do not call while already holding
 * handler->lock.
 */
void hfi1_mmu_rb_release(struct kref *refcount)
{
	struct mmu_rb_node *mnode =
		container_of(refcount, struct mmu_rb_node, refcount);
	struct mmu_rb_handler *handler = mnode->handler;
	unsigned long flags;

	spin_lock_irqsave(&handler->lock, flags);
	list_move(&mnode->list, &mnode->handler->del_list);
	spin_unlock_irqrestore(&handler->lock, flags);
	queue_work(handler->wq, &handler->del_work);
}
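
/*
 * A minimal sketch of the intended drop pattern: a caller releases its
 * reference with kref_put(), and hfi1_mmu_rb_release() defers the actual
 * ops->remove() to the workqueue. It must not be invoked while holding
 * handler->lock, since the callback takes that lock itself:
 *
 *	kref_put(&node->refcount, hfi1_mmu_rb_release);
 */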

void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg)
{
	struct mmu_rb_node *rbnode, *ptr;
	struct list_head del_list;
	unsigned long flags;
	bool stop = false;

	if (current->mm != handler->mn.mm)
		return;

	INIT_LIST_HEAD(&del_list);

	spin_lock_irqsave(&handler->lock, flags);
	list_for_each_entry_safe(rbnode, ptr, &handler->lru_list, list) {
		/* refcount == 1 implies the handler holds the only ref to rbnode */
		if (kref_read(&rbnode->refcount) > 1)
			continue;

		if (handler->ops->evict(handler->ops_arg, rbnode, evict_arg,
					&stop)) {
			__mmu_int_rb_remove(rbnode, &handler->root);
			/* move from LRU list to delete list */
			list_move(&rbnode->list, &del_list);
		}
		if (stop)
			break;
	}
	spin_unlock_irqrestore(&handler->lock, flags);

	list_for_each_entry_safe(rbnode, ptr, &del_list, list) {
		trace_hfi1_mmu_rb_evict(rbnode);
		kref_put(&rbnode->refcount, release_immediate);
	}
}

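/*
 * invalidate_range_start mmu notifier callback. Removes every node that
 * intersects [range->start, range->end) from the interval tree and the
 * LRU list, then drops the handler's reference; the final put runs
 * release_nolock(), which queues the node for deferred ops->remove() on
 * the workqueue.
 */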
static int mmu_notifier_range_start(struct mmu_notifier *mn,
				    const struct mmu_notifier_range *range)
{
	struct mmu_rb_handler *handler =
		container_of(mn, struct mmu_rb_handler, mn);
	struct rb_root_cached *root = &handler->root;
	struct mmu_rb_node *node, *ptr = NULL;
	unsigned long flags;

	spin_lock_irqsave(&handler->lock, flags);
	for (node = __mmu_int_rb_iter_first(root, range->start,
					    range->end - 1);
	     node; node = ptr) {
		/* Guard against node removal. */
		ptr = __mmu_int_rb_iter_next(node, range->start,
					     range->end - 1);
		trace_hfi1_mmu_mem_invalidate(node);
		/* Remove from rb tree and lru_list. */
		__mmu_int_rb_remove(node, root);
		list_del_init(&node->list);
		kref_put(&node->refcount, release_nolock);
	}
	spin_unlock_irqrestore(&handler->lock, flags);

	return 0;
}

/*
 * Work queue function to remove all nodes that have been queued up to
 * be removed. The key feature is that mm->mmap_lock is not being held
 * and the remove callback can sleep while taking it, if needed.
 */
static void handle_remove(struct work_struct *work)
{
	struct mmu_rb_handler *handler = container_of(work,
						      struct mmu_rb_handler,
						      del_work);
	struct list_head del_list;
	unsigned long flags;
	struct mmu_rb_node *node;

	/* remove anything that is queued to get removed */
	spin_lock_irqsave(&handler->lock, flags);
	list_replace_init(&handler->del_list, &del_list);
	spin_unlock_irqrestore(&handler->lock, flags);

	while (!list_empty(&del_list)) {
		node = list_first_entry(&del_list, struct mmu_rb_node, list);
		list_del(&node->list);
		trace_hfi1_mmu_release_node(node);
		handler->ops->remove(handler->ops_arg, node);
	}
}