| 1 | // SPDX-License-Identifier: GPL-2.0-or-later |
| 2 | /* |
| 3 | * Copyright (C) IBM Corporation, 2014, 2017 |
| 4 | * Anton Blanchard, Rashmica Gupta. |
| 5 | */ |
| 6 | |
| 7 | #define pr_fmt(fmt) "memtrace: " fmt |
| 8 | |
| 9 | #include <linux/bitops.h> |
| 10 | #include <linux/string.h> |
| 11 | #include <linux/memblock.h> |
| 12 | #include <linux/init.h> |
| 13 | #include <linux/moduleparam.h> |
| 14 | #include <linux/fs.h> |
| 15 | #include <linux/debugfs.h> |
| 16 | #include <linux/slab.h> |
| 17 | #include <linux/memory.h> |
| 18 | #include <linux/memory_hotplug.h> |
| 19 | #include <linux/numa.h> |
| 20 | #include <asm/machdep.h> |
| 21 | #include <asm/cacheflush.h> |
| 22 | |
| 23 | /* This enables us to keep track of the memory removed from each node. */ |
| 24 | struct memtrace_entry { |
| 25 | void *mem; |
| 26 | u64 start; |
| 27 | u64 size; |
| 28 | u32 nid; |
| 29 | struct dentry *dir; |
| 30 | char name[16]; |
| 31 | }; |
| 32 | |
| 33 | static DEFINE_MUTEX(memtrace_mutex); |
| 34 | static u64 memtrace_size; |
| 35 | |
| 36 | static struct memtrace_entry *memtrace_array; |
| 37 | static unsigned int memtrace_array_nr; |
| 38 | |
| 39 | |
| 40 | static ssize_t memtrace_read(struct file *filp, char __user *ubuf, |
| 41 | size_t count, loff_t *ppos) |
| 42 | { |
| 43 | struct memtrace_entry *ent = filp->private_data; |
| 44 | |
| 45 | return simple_read_from_buffer(to: ubuf, count, ppos, from: ent->mem, available: ent->size); |
| 46 | } |
| 47 | |
| 48 | static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma) |
| 49 | { |
| 50 | struct memtrace_entry *ent = filp->private_data; |
| 51 | unsigned long ent_nrpages = ent->size >> PAGE_SHIFT; |
| 52 | unsigned long vma_nrpages = vma_pages(vma); |
| 53 | |
| 54 | /* The requested page offset should be within object's page count */ |
| 55 | if (vma->vm_pgoff >= ent_nrpages) |
| 56 | return -EINVAL; |
| 57 | |
| 58 | /* The requested mapping range should remain within the bounds */ |
| 59 | if (vma_nrpages > ent_nrpages - vma->vm_pgoff) |
| 60 | return -EINVAL; |
| 61 | |
| 62 | vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); |
| 63 | return remap_pfn_range(vma, addr: vma->vm_start, PHYS_PFN(ent->start) + vma->vm_pgoff, |
| 64 | size: vma->vm_end - vma->vm_start, pgprot: vma->vm_page_prot); |
| 65 | } |
| 66 | |
| 67 | static const struct file_operations memtrace_fops = { |
| 68 | .llseek = default_llseek, |
| 69 | .read = memtrace_read, |
| 70 | .open = simple_open, |
| 71 | .mmap = memtrace_mmap, |
| 72 | }; |
| 73 | |
#define FLUSH_CHUNK_SIZE SZ_1G
/**
 * flush_dcache_range_chunked(): Write any modified data cache blocks out to
 * memory and invalidate them, in chunks of up to FLUSH_CHUNK_SIZE
 * Does not invalidate the corresponding instruction cache blocks.
 *
 * @start: the start address
 * @stop: the stop address (exclusive)
 * @chunk: the max size of the chunks
 *
 * cond_resched() is called after every chunk.
 */
static void flush_dcache_range_chunked(unsigned long start, unsigned long stop,
				       unsigned long chunk)
{
	unsigned long cur = start;

	while (cur < stop) {
		unsigned long next = cur + chunk;

		/* The last chunk is clamped to the end of the range. */
		flush_dcache_range(cur, min(stop, next));
		cond_resched();
		cur = next;
	}
}
| 94 | |
| 95 | static u64 memtrace_alloc_node(u32 nid, u64 size) |
| 96 | { |
| 97 | const unsigned long nr_pages = PHYS_PFN(size); |
| 98 | unsigned long pfn, start_pfn; |
| 99 | struct page *page; |
| 100 | |
| 101 | /* |
| 102 | * Trace memory needs to be aligned to the size, which is guaranteed |
| 103 | * by alloc_contig_pages(). |
| 104 | */ |
| 105 | page = alloc_contig_pages(nr_pages, GFP_KERNEL | __GFP_THISNODE | |
| 106 | __GFP_NOWARN | __GFP_ZERO, nid, NULL); |
| 107 | if (!page) |
| 108 | return 0; |
| 109 | start_pfn = page_to_pfn(page); |
| 110 | |
| 111 | /* |
| 112 | * Before we go ahead and use this range as cache inhibited range |
| 113 | * flush the cache. |
| 114 | */ |
| 115 | flush_dcache_range_chunked(start: (unsigned long)pfn_to_kaddr(pfn: start_pfn), |
| 116 | stop: (unsigned long)pfn_to_kaddr(pfn: start_pfn + nr_pages), |
| 117 | FLUSH_CHUNK_SIZE); |
| 118 | |
| 119 | /* |
| 120 | * Set pages PageOffline(), to indicate that nobody (e.g., hibernation, |
| 121 | * dumping, ...) should be touching these pages. |
| 122 | */ |
| 123 | for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) |
| 124 | __SetPageOffline(pfn_to_page(pfn)); |
| 125 | |
| 126 | arch_remove_linear_mapping(PFN_PHYS(start_pfn), size); |
| 127 | |
| 128 | return PFN_PHYS(start_pfn); |
| 129 | } |
| 130 | |
| 131 | static int memtrace_init_regions_runtime(u64 size) |
| 132 | { |
| 133 | u32 nid; |
| 134 | u64 m; |
| 135 | |
| 136 | memtrace_array = kcalloc(num_online_nodes(), |
| 137 | sizeof(struct memtrace_entry), GFP_KERNEL); |
| 138 | if (!memtrace_array) { |
| 139 | pr_err("Failed to allocate memtrace_array\n" ); |
| 140 | return -EINVAL; |
| 141 | } |
| 142 | |
| 143 | for_each_online_node(nid) { |
| 144 | m = memtrace_alloc_node(nid, size); |
| 145 | |
| 146 | /* |
| 147 | * A node might not have any local memory, so warn but |
| 148 | * continue on. |
| 149 | */ |
| 150 | if (!m) { |
| 151 | pr_err("Failed to allocate trace memory on node %d\n" , nid); |
| 152 | continue; |
| 153 | } |
| 154 | |
| 155 | pr_info("Allocated trace memory on node %d at 0x%016llx\n" , nid, m); |
| 156 | |
| 157 | memtrace_array[memtrace_array_nr].start = m; |
| 158 | memtrace_array[memtrace_array_nr].size = size; |
| 159 | memtrace_array[memtrace_array_nr].nid = nid; |
| 160 | memtrace_array_nr++; |
| 161 | } |
| 162 | |
| 163 | return 0; |
| 164 | } |
| 165 | |
| 166 | static struct dentry *memtrace_debugfs_dir; |
| 167 | |
| 168 | static int memtrace_init_debugfs(void) |
| 169 | { |
| 170 | int ret = 0; |
| 171 | int i; |
| 172 | |
| 173 | for (i = 0; i < memtrace_array_nr; i++) { |
| 174 | struct dentry *dir; |
| 175 | struct memtrace_entry *ent = &memtrace_array[i]; |
| 176 | |
| 177 | ent->mem = ioremap(offset: ent->start, size: ent->size); |
| 178 | /* Warn but continue on */ |
| 179 | if (!ent->mem) { |
| 180 | pr_err("Failed to map trace memory at 0x%llx\n" , |
| 181 | ent->start); |
| 182 | ret = -1; |
| 183 | continue; |
| 184 | } |
| 185 | |
| 186 | snprintf(buf: ent->name, size: 16, fmt: "%08x" , ent->nid); |
| 187 | dir = debugfs_create_dir(name: ent->name, parent: memtrace_debugfs_dir); |
| 188 | |
| 189 | ent->dir = dir; |
| 190 | debugfs_create_file_unsafe(name: "trace" , mode: 0600, parent: dir, data: ent, fops: &memtrace_fops); |
| 191 | debugfs_create_x64(name: "start" , mode: 0400, parent: dir, value: &ent->start); |
| 192 | debugfs_create_x64(name: "size" , mode: 0400, parent: dir, value: &ent->size); |
| 193 | } |
| 194 | |
| 195 | return ret; |
| 196 | } |
| 197 | |
| 198 | static int memtrace_free(int nid, u64 start, u64 size) |
| 199 | { |
| 200 | struct mhp_params params = { .pgprot = PAGE_KERNEL }; |
| 201 | const unsigned long nr_pages = PHYS_PFN(size); |
| 202 | const unsigned long start_pfn = PHYS_PFN(start); |
| 203 | unsigned long pfn; |
| 204 | int ret; |
| 205 | |
| 206 | ret = arch_create_linear_mapping(nid, start, size, params: ¶ms); |
| 207 | if (ret) |
| 208 | return ret; |
| 209 | |
| 210 | for (pfn = start_pfn; pfn < start_pfn + nr_pages; pfn++) |
| 211 | __ClearPageOffline(pfn_to_page(pfn)); |
| 212 | |
| 213 | free_contig_range(pfn: start_pfn, nr_pages); |
| 214 | return 0; |
| 215 | } |
| 216 | |
| 217 | /* |
| 218 | * Iterate through the chunks of memory we allocated and attempt to expose |
| 219 | * them back to the kernel. |
| 220 | */ |
| 221 | static int memtrace_free_regions(void) |
| 222 | { |
| 223 | int i, ret = 0; |
| 224 | struct memtrace_entry *ent; |
| 225 | |
| 226 | for (i = memtrace_array_nr - 1; i >= 0; i--) { |
| 227 | ent = &memtrace_array[i]; |
| 228 | |
| 229 | /* We have freed this chunk previously */ |
| 230 | if (ent->nid == NUMA_NO_NODE) |
| 231 | continue; |
| 232 | |
| 233 | /* Remove from io mappings */ |
| 234 | if (ent->mem) { |
| 235 | iounmap(addr: ent->mem); |
| 236 | ent->mem = 0; |
| 237 | } |
| 238 | |
| 239 | if (memtrace_free(nid: ent->nid, start: ent->start, size: ent->size)) { |
| 240 | pr_err("Failed to free trace memory on node %d\n" , |
| 241 | ent->nid); |
| 242 | ret += 1; |
| 243 | continue; |
| 244 | } |
| 245 | |
| 246 | /* |
| 247 | * Memory was freed successfully so clean up references to it |
| 248 | * so on reentry we can tell that this chunk was freed. |
| 249 | */ |
| 250 | debugfs_remove_recursive(dentry: ent->dir); |
| 251 | pr_info("Freed trace memory back on node %d\n" , ent->nid); |
| 252 | ent->size = ent->start = ent->nid = NUMA_NO_NODE; |
| 253 | } |
| 254 | if (ret) |
| 255 | return ret; |
| 256 | |
| 257 | /* If all chunks of memory were freed successfully, reset globals */ |
| 258 | kfree(objp: memtrace_array); |
| 259 | memtrace_array = NULL; |
| 260 | memtrace_size = 0; |
| 261 | memtrace_array_nr = 0; |
| 262 | return 0; |
| 263 | } |
| 264 | |
| 265 | static int memtrace_enable_set(void *data, u64 val) |
| 266 | { |
| 267 | int rc = -EAGAIN; |
| 268 | u64 bytes; |
| 269 | |
| 270 | /* |
| 271 | * Don't attempt to do anything if size isn't aligned to a memory |
| 272 | * block or equal to zero. |
| 273 | */ |
| 274 | bytes = memory_block_size_bytes(); |
| 275 | if (val & (bytes - 1)) { |
| 276 | pr_err("Value must be aligned with 0x%llx\n" , bytes); |
| 277 | return -EINVAL; |
| 278 | } |
| 279 | |
| 280 | mutex_lock(&memtrace_mutex); |
| 281 | |
| 282 | /* Free all previously allocated memory. */ |
| 283 | if (memtrace_size && memtrace_free_regions()) |
| 284 | goto out_unlock; |
| 285 | |
| 286 | if (!val) { |
| 287 | rc = 0; |
| 288 | goto out_unlock; |
| 289 | } |
| 290 | |
| 291 | /* Allocate memory. */ |
| 292 | if (memtrace_init_regions_runtime(size: val)) |
| 293 | goto out_unlock; |
| 294 | |
| 295 | if (memtrace_init_debugfs()) |
| 296 | goto out_unlock; |
| 297 | |
| 298 | memtrace_size = val; |
| 299 | rc = 0; |
| 300 | out_unlock: |
| 301 | mutex_unlock(lock: &memtrace_mutex); |
| 302 | return rc; |
| 303 | } |
| 304 | |
| 305 | static int memtrace_enable_get(void *data, u64 *val) |
| 306 | { |
| 307 | *val = memtrace_size; |
| 308 | return 0; |
| 309 | } |
| 310 | |
| 311 | DEFINE_SIMPLE_ATTRIBUTE(memtrace_init_fops, memtrace_enable_get, |
| 312 | memtrace_enable_set, "0x%016llx\n" ); |
| 313 | |
| 314 | static int memtrace_init(void) |
| 315 | { |
| 316 | memtrace_debugfs_dir = debugfs_create_dir(name: "memtrace" , |
| 317 | parent: arch_debugfs_dir); |
| 318 | |
| 319 | debugfs_create_file("enable" , 0600, memtrace_debugfs_dir, |
| 320 | NULL, &memtrace_init_fops); |
| 321 | |
| 322 | return 0; |
| 323 | } |
| 324 | machine_device_initcall(powernv, memtrace_init); |
| 325 | |