// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corp. 2006
 * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>
 */

#include <linux/memory_hotplug.h>
#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <asm/cacheflush.h>
#include <asm/nospec-branch.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/set_memory.h>

static DEFINE_MUTEX(vmem_mutex);

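/*
 * Allocate 2^order contiguous pages: from the buddy allocator once the slab
 * allocator is up, from memblock during early boot.
 */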
static void __ref *vmem_alloc_pages(unsigned int order)
{
	unsigned long size = PAGE_SIZE << order;

	if (slab_is_available())
		return (void *)__get_free_pages(GFP_KERNEL, order);
	return memblock_alloc(size, size);
}

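/*
 * Free pages previously allocated with vmem_alloc_pages(). Memory handed out
 * by memblock during early boot is marked reserved and is never freed.
 */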
static void vmem_free_pages(unsigned long addr, int order)
{
	/* We don't expect boot memory to be removed ever. */
	if (!slab_is_available() ||
	    WARN_ON_ONCE(PageReserved(virt_to_page(addr))))
		return;
	free_pages(addr, order);
}

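/*
 * Allocate a region or segment table (CRST) and initialize all of its
 * entries with the given value.
 */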
void *vmem_crst_alloc(unsigned long val)
{
	unsigned long *table;

	table = vmem_alloc_pages(CRST_ALLOC_ORDER);
	if (table)
		crst_table_init(table, val);
	return table;
}

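/*
 * Allocate a page table for the kernel address space and mark all of its
 * entries invalid.
 */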
pte_t __ref *vmem_pte_alloc(void)
{
	unsigned long size = PTRS_PER_PTE * sizeof(pte_t);
	pte_t *pte;

	if (slab_is_available())
		pte = (pte_t *) page_table_alloc(&init_mm);
	else
		pte = (pte_t *) memblock_alloc(size, size);
	if (!pte)
		return NULL;
	memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE);
	return pte;
}

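/* Free a page table, unless it was allocated from boot memory. */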
static void vmem_pte_free(unsigned long *table)
{
	/* We don't expect boot memory to be removed ever. */
	if (!slab_is_available() ||
	    WARN_ON_ONCE(PageReserved(virt_to_page(table))))
		return;
	page_table_free(&init_mm, table);
}

#define PAGE_UNUSED 0xFD

/*
 * The unused vmemmap range, which was not yet memset(PAGE_UNUSED), ranges
 * from unused_sub_pmd_start to the next PMD_SIZE boundary.
 */
static unsigned long unused_sub_pmd_start;

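/*
 * Mark the pending range from unused_sub_pmd_start up to the next PMD_SIZE
 * boundary as PAGE_UNUSED and forget about it.
 */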
static void vmemmap_flush_unused_sub_pmd(void)
{
	if (!unused_sub_pmd_start)
		return;
	memset((void *)unused_sub_pmd_start, PAGE_UNUSED,
	       ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start);
	unused_sub_pmd_start = 0;
}

static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end)
{
	/*
	 * As we expect to add in the same granularity as we remove, it's
	 * sufficient to mark only some piece used to block the memmap page from
	 * getting removed (just in case the memmap never gets initialized,
	 * e.g., because the memory block never gets onlined).
	 */
	memset((void *)start, 0, sizeof(struct page));
}

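/*
 * Mark a sub-PMD range of an already populated vmemmap PMD as used. If it
 * directly follows the range remembered in unused_sub_pmd_start, just move
 * that marker instead of touching memory.
 */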
static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end)
{
	/*
	 * We only optimize if the new used range directly follows the
	 * previously unused range (esp., when populating consecutive sections).
	 */
	if (unused_sub_pmd_start == start) {
		unused_sub_pmd_start = end;
		if (likely(IS_ALIGNED(unused_sub_pmd_start, PMD_SIZE)))
			unused_sub_pmd_start = 0;
		return;
	}
	vmemmap_flush_unused_sub_pmd();
	vmemmap_mark_sub_pmd_used(start, end);
}

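/*
 * A new PMD-sized vmemmap page was just mapped, but only [start, end) of it
 * is used. Mark the leading unused part PAGE_UNUSED right away and remember
 * the trailing unused part in unused_sub_pmd_start.
 */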
static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();

	/* Could be our memmap page is filled with PAGE_UNUSED already ... */
	vmemmap_mark_sub_pmd_used(start, end);

	/* Mark the unused parts of the new memmap page PAGE_UNUSED. */
	if (!IS_ALIGNED(start, PMD_SIZE))
		memset((void *)page, PAGE_UNUSED, start - page);
	/*
	 * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of
	 * consecutive sections. Remember for the last added PMD the last
	 * unused range in the populated PMD.
	 */
	if (!IS_ALIGNED(end, PMD_SIZE))
		unused_sub_pmd_start = end;
}

/* Returns true if the PMD is completely unused and can be freed. */
static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end)
{
	unsigned long page = ALIGN_DOWN(start, PMD_SIZE);

	vmemmap_flush_unused_sub_pmd();
	memset((void *)start, PAGE_UNUSED, end - start);
	return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE);
}

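/*
 * Walk the PTEs covering [addr, end) and either map or unmap 4K pages,
 * depending on "add". For the vmemmap (!direct) the backing pages are
 * allocated and freed here as well; for the identity mapping (direct) the
 * PG_DIRECT_MAP_4K counter is updated.
 */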
/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr,
				  unsigned long end, bool add, bool direct)
{
	unsigned long prot, pages = 0;
	int ret = -ENOMEM;
	pte_t *pte;

	prot = pgprot_val(PAGE_KERNEL);
	if (!MACHINE_HAS_NX)
		prot &= ~_PAGE_NOEXEC;

	pte = pte_offset_kernel(pmd, addr);
	for (; addr < end; addr += PAGE_SIZE, pte++) {
		if (!add) {
			if (pte_none(*pte))
				continue;
			if (!direct)
				vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0);
			pte_clear(&init_mm, addr, pte);
		} else if (pte_none(*pte)) {
			if (!direct) {
				void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE);

				if (!new_page)
					goto out;
				pte_val(*pte) = __pa(new_page) | prot;
			} else {
				pte_val(*pte) = __pa(addr) | prot;
			}
		} else {
			continue;
		}
		pages++;
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_4K, add ? pages : -pages);
	return ret;
}

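/*
 * If every entry in the PTE table referenced by the pmd is empty, free the
 * table and clear the pmd.
 */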
static void try_free_pte_table(pmd_t *pmd, unsigned long start)
{
	pte_t *pte;
	int i;

	/* We can safely assume this is fully in 1:1 mapping & vmemmap area */
	pte = pte_offset_kernel(pmd, start);
	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
		if (!pte_none(*pte))
			return;
	}
	vmem_pte_free((unsigned long *) pmd_deref(*pmd));
	pmd_clear(pmd);
}

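/*
 * PMD-level counterpart of modify_pte_table(). Uses 1MB segment mappings
 * (EDAT1) where the range and machine allow it, otherwise falls back to a
 * PTE table. Updates the PG_DIRECT_MAP_1M counter for the identity mapping.
 */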
/* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */
static int __ref modify_pmd_table(pud_t *pud, unsigned long addr,
				  unsigned long end, bool add, bool direct)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pmd_t *pmd;
	pte_t *pte;

	prot = pgprot_val(SEGMENT_KERNEL);
	if (!MACHINE_HAS_NX)
		prot &= ~_SEGMENT_ENTRY_NOEXEC;

	pmd = pmd_offset(pud, addr);
	for (; addr < end; addr = next, pmd++) {
		next = pmd_addr_end(addr, end);
		if (!add) {
			if (pmd_none(*pmd))
				continue;
			if (pmd_large(*pmd)) {
				if (IS_ALIGNED(addr, PMD_SIZE) &&
				    IS_ALIGNED(next, PMD_SIZE)) {
					if (!direct)
						vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
					pmd_clear(pmd);
					pages++;
				} else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) {
					vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE));
					pmd_clear(pmd);
				}
				continue;
			}
		} else if (pmd_none(*pmd)) {
			if (IS_ALIGNED(addr, PMD_SIZE) &&
			    IS_ALIGNED(next, PMD_SIZE) &&
			    MACHINE_HAS_EDAT1 && addr && direct &&
			    !debug_pagealloc_enabled()) {
				pmd_val(*pmd) = __pa(addr) | prot;
				pages++;
				continue;
			} else if (!direct && MACHINE_HAS_EDAT1) {
				void *new_page;

				/*
				 * Use 1MB frames for vmemmap if available. We
				 * always use large frames even if they are only
				 * partially used. Otherwise we would also have
				 * page tables, since vmemmap_populate gets
				 * called for each section separately.
				 */
				new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE);
				if (new_page) {
					pmd_val(*pmd) = __pa(new_page) | prot;
					if (!IS_ALIGNED(addr, PMD_SIZE) ||
					    !IS_ALIGNED(next, PMD_SIZE)) {
						vmemmap_use_new_sub_pmd(addr, next);
					}
					continue;
				}
			}
			pte = vmem_pte_alloc();
			if (!pte)
				goto out;
			pmd_populate(&init_mm, pmd, pte);
		} else if (pmd_large(*pmd)) {
			if (!direct)
				vmemmap_use_sub_pmd(addr, next);
			continue;
		}
		ret = modify_pte_table(pmd, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_pte_table(pmd, addr & PMD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_1M, add ? pages : -pages);
	return ret;
}

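/*
 * Free the PMD table referenced by the pud if all of its entries are empty
 * and the covered range lies entirely within the 1:1 mapping / vmemmap area.
 */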
static void try_free_pmd_table(pud_t *pud, unsigned long start)
{
	const unsigned long end = start + PUD_SIZE;
	pmd_t *pmd;
	int i;

	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
	if (end > VMALLOC_START)
		return;
#ifdef CONFIG_KASAN
	if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
		return;
#endif
	pmd = pmd_offset(pud, start);
	for (i = 0; i < PTRS_PER_PMD; i++, pmd++)
		if (!pmd_none(*pmd))
			return;
	vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER);
	pud_clear(pud);
}

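/*
 * PUD-level part of the page table walk. Uses 2GB region-third mappings
 * (EDAT2) for the identity mapping where possible and updates the
 * PG_DIRECT_MAP_2G counter.
 */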
static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end,
			    bool add, bool direct)
{
	unsigned long next, prot, pages = 0;
	int ret = -ENOMEM;
	pud_t *pud;
	pmd_t *pmd;

	prot = pgprot_val(REGION3_KERNEL);
	if (!MACHINE_HAS_NX)
		prot &= ~_REGION_ENTRY_NOEXEC;
	pud = pud_offset(p4d, addr);
	for (; addr < end; addr = next, pud++) {
		next = pud_addr_end(addr, end);
		if (!add) {
			if (pud_none(*pud))
				continue;
			if (pud_large(*pud)) {
				if (IS_ALIGNED(addr, PUD_SIZE) &&
				    IS_ALIGNED(next, PUD_SIZE)) {
					pud_clear(pud);
					pages++;
				}
				continue;
			}
		} else if (pud_none(*pud)) {
			if (IS_ALIGNED(addr, PUD_SIZE) &&
			    IS_ALIGNED(next, PUD_SIZE) &&
			    MACHINE_HAS_EDAT2 && addr && direct &&
			    !debug_pagealloc_enabled()) {
				pud_val(*pud) = __pa(addr) | prot;
				pages++;
				continue;
			}
			pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
			if (!pmd)
				goto out;
			pud_populate(&init_mm, pud, pmd);
		} else if (pud_large(*pud)) {
			continue;
		}
		ret = modify_pmd_table(pud, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_pmd_table(pud, addr & PUD_MASK);
	}
	ret = 0;
out:
	if (direct)
		update_page_count(PG_DIRECT_MAP_2G, add ? pages : -pages);
	return ret;
}

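/*
 * Free the PUD table referenced by the p4d if all of its entries are empty
 * and the covered range lies entirely within the 1:1 mapping / vmemmap area.
 */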
static void try_free_pud_table(p4d_t *p4d, unsigned long start)
{
	const unsigned long end = start + P4D_SIZE;
	pud_t *pud;
	int i;

	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
	if (end > VMALLOC_START)
		return;
#ifdef CONFIG_KASAN
	if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
		return;
#endif

	pud = pud_offset(p4d, start);
	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
		if (!pud_none(*pud))
			return;
	}
	vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER);
	p4d_clear(p4d);
}

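/*
 * P4D-level part of the page table walk: allocate PUD tables on demand when
 * adding, and descend to modify_pud_table().
 */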
static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end,
			    bool add, bool direct)
{
	unsigned long next;
	int ret = -ENOMEM;
	p4d_t *p4d;
	pud_t *pud;

	p4d = p4d_offset(pgd, addr);
	for (; addr < end; addr = next, p4d++) {
		next = p4d_addr_end(addr, end);
		if (!add) {
			if (p4d_none(*p4d))
				continue;
		} else if (p4d_none(*p4d)) {
			pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
			if (!pud)
				goto out;
			p4d_populate(&init_mm, p4d, pud);
		}
		ret = modify_pud_table(p4d, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_pud_table(p4d, addr & P4D_MASK);
	}
	ret = 0;
out:
	return ret;
}

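/*
 * Free the P4D table referenced by the pgd if all of its entries are empty
 * and the covered range lies entirely within the 1:1 mapping / vmemmap area.
 */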
static void try_free_p4d_table(pgd_t *pgd, unsigned long start)
{
	const unsigned long end = start + PGDIR_SIZE;
	p4d_t *p4d;
	int i;

	/* Don't mess with any tables not fully in 1:1 mapping & vmemmap area */
	if (end > VMALLOC_START)
		return;
#ifdef CONFIG_KASAN
	if (start < KASAN_SHADOW_END && KASAN_SHADOW_START > end)
		return;
#endif

	p4d = p4d_offset(pgd, start);
	for (i = 0; i < PTRS_PER_P4D; i++, p4d++) {
		if (!p4d_none(*p4d))
			return;
	}
	vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER);
	pgd_clear(pgd);
}

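/*
 * Top-level walker: create (add) or remove kernel mappings for [start, end),
 * either in the identity mapping (direct) or in the vmemmap. The TLB is
 * flushed after mappings have been removed.
 */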
static int modify_pagetable(unsigned long start, unsigned long end, bool add,
			    bool direct)
{
	unsigned long addr, next;
	int ret = -ENOMEM;
	pgd_t *pgd;
	p4d_t *p4d;

	if (WARN_ON_ONCE(!PAGE_ALIGNED(start | end)))
		return -EINVAL;
	for (addr = start; addr < end; addr = next) {
		next = pgd_addr_end(addr, end);
		pgd = pgd_offset_k(addr);

		if (!add) {
			if (pgd_none(*pgd))
				continue;
		} else if (pgd_none(*pgd)) {
			p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
			if (!p4d)
				goto out;
			pgd_populate(&init_mm, pgd, p4d);
		}
		ret = modify_p4d_table(pgd, addr, next, add, direct);
		if (ret)
			goto out;
		if (!add)
			try_free_p4d_table(pgd, addr & PGDIR_MASK);
	}
	ret = 0;
out:
	if (!add)
		flush_tlb_kernel_range(start, end);
	return ret;
}

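/* Convenience wrappers around modify_pagetable(). */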
static int add_pagetable(unsigned long start, unsigned long end, bool direct)
{
	return modify_pagetable(start, end, true, direct);
}

static int remove_pagetable(unsigned long start, unsigned long end, bool direct)
{
	return modify_pagetable(start, end, false, direct);
}

/*
 * Add a physical memory range to the 1:1 mapping.
 */
static int vmem_add_range(unsigned long start, unsigned long size)
{
	return add_pagetable(start, start + size, true);
}

/*
 * Remove a physical memory range from the 1:1 mapping.
 */
static void vmem_remove_range(unsigned long start, unsigned long size)
{
	remove_pagetable(start, start + size, true);
}

/*
 * Add a backed mem_map array to the virtual mem_map array.
 */
int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node,
			       struct vmem_altmap *altmap)
{
	int ret;

	mutex_lock(&vmem_mutex);
	/* We don't care about the node, just use NUMA_NO_NODE on allocations */
	ret = add_pagetable(start, end, false);
	if (ret)
		remove_pagetable(start, end, false);
	mutex_unlock(&vmem_mutex);
	return ret;
}

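/*
 * Remove a vmemmap range, freeing the backing pages and any page tables that
 * become empty.
 */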
void vmemmap_free(unsigned long start, unsigned long end,
		  struct vmem_altmap *altmap)
{
	mutex_lock(&vmem_mutex);
	remove_pagetable(start, end, false);
	mutex_unlock(&vmem_mutex);
}

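/*
 * Remove a physical memory range from the 1:1 mapping, serialized against
 * other mapping changes by vmem_mutex.
 */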
void vmem_remove_mapping(unsigned long start, unsigned long size)
{
	mutex_lock(&vmem_mutex);
	vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
}

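/*
 * Return the physical address range that may be added via memory hotplug:
 * everything up to VMEM_MAX_PHYS.
 */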
struct range arch_get_mappable_range(void)
{
	struct range mhp_range;

	mhp_range.start = 0;
	mhp_range.end = VMEM_MAX_PHYS - 1;
	return mhp_range;
}

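/*
 * Add a physical memory range to the 1:1 mapping after checking that it lies
 * within the mappable range; undo the mapping again if it cannot be created
 * completely.
 */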
int vmem_add_mapping(unsigned long start, unsigned long size)
{
	struct range range = arch_get_mappable_range();
	int ret;

	if (start < range.start ||
	    start + size > range.end + 1 ||
	    start + size < start)
		return -ERANGE;

	mutex_lock(&vmem_mutex);
	ret = vmem_add_range(start, size);
	if (ret)
		vmem_remove_range(start, size);
	mutex_unlock(&vmem_mutex);
	return ret;
}

/*
 * Map the whole physical memory into virtual memory (identity mapping).
 * Enough space is reserved in the vmalloc area for the vmemmap to allow
 * hotplugging additional memory segments.
 */
void __init vmem_map_init(void)
{
	phys_addr_t base, end;
	u64 i;

	for_each_mem_range(i, &base, &end)
		vmem_add_range(base, end - base);
	__set_memory((unsigned long)_stext,
		     (unsigned long)(_etext - _stext) >> PAGE_SHIFT,
		     SET_MEMORY_RO | SET_MEMORY_X);
	__set_memory((unsigned long)_etext,
		     (unsigned long)(__end_rodata - _etext) >> PAGE_SHIFT,
		     SET_MEMORY_RO);
	__set_memory((unsigned long)_sinittext,
		     (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT,
		     SET_MEMORY_RO | SET_MEMORY_X);
	__set_memory(__stext_amode31, (__etext_amode31 - __stext_amode31) >> PAGE_SHIFT,
		     SET_MEMORY_RO | SET_MEMORY_X);

	if (nospec_uses_trampoline() || !static_key_enabled(&cpu_has_bear)) {
		/*
		 * Lowcore must be executable for LPSWE
		 * and expoline trampoline branch instructions.
		 */
		set_memory_x(0, 1);
	}

	pr_info("Write protected kernel read-only data: %luk\n",
		(unsigned long)(__end_rodata - _stext) >> 10);
}