1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Virtual Memory Map support
4 *
5 * (C) 2007 sgi. Christoph Lameter.
6 *
7 * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
8 * virt_to_page, page_address() to be implemented as a base offset
9 * calculation without memory access.
10 *
11 * However, virtual mappings need a page table and TLBs. Many Linux
12 * architectures already map their physical space using 1-1 mappings
13 * via TLBs. For those arches the virtual memory map is essentially
14 * for free if we use the same page size as the 1-1 mappings. In that
15 * case the overhead consists of a few additional pages that are
16 * allocated to create a view of memory for vmemmap.
17 *
18 * The architecture is expected to provide a vmemmap_populate() function
19 * to instantiate the mapping.
20 */
21 #include <linux/mm.h>
22 #include <linux/mmzone.h>
23 #include <linux/memblock.h>
24 #include <linux/memremap.h>
25 #include <linux/highmem.h>
26 #include <linux/slab.h>
27 #include <linux/spinlock.h>
28 #include <linux/vmalloc.h>
29 #include <linux/sched.h>
30 #include <linux/pgtable.h>
31 #include <linux/bootmem_info.h>
32
33 #include <asm/dma.h>
34 #include <asm/pgalloc.h>
35 #include <asm/tlbflush.h>
36
37 #ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
38 /**
39 * struct vmemmap_remap_walk - walk vmemmap page table
40 *
41 * @remap_pte: called for each lowest-level entry (PTE).
42 * @nr_walked: the number of walked pte.
43 * @reuse_page: the page which is reused for the tail vmemmap pages.
44 * @reuse_addr: the virtual address of the @reuse_page page.
45 * @vmemmap_pages: the list head of the vmemmap pages that can be freed
46 * or is mapped from.
47 */
48 struct vmemmap_remap_walk {
49 void (*remap_pte)(pte_t *pte, unsigned long addr,
50 struct vmemmap_remap_walk *walk);
51 unsigned long nr_walked;
52 struct page *reuse_page;
53 unsigned long reuse_addr;
54 struct list_head *vmemmap_pages;
55 };
56
/*
 * Replace the leaf (huge) mapping at @pmd with a newly allocated PTE table
 * whose entries map the same pages, one PAGE_SIZE entry at a time.
 *
 * @pmd:   the PMD entry to split.
 * @start: base virtual address used for the new PTEs and for the TLB flush.
 *
 * Return: 0 on success (also when @pmd turns out not to be a leaf any more
 * once the lock is held), -ENOMEM if the PTE table cannot be allocated.
 */
static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
	struct page *head = pmd_page(*pmd);
	pte_t *pte_table = pte_alloc_one_kernel(&init_mm);
	unsigned long va;
	pmd_t staging;
	int idx;

	if (!pte_table)
		return -ENOMEM;

	/*
	 * Fill the table through an on-stack PMD; the live @pmd is only
	 * switched over further down, under the page table lock.
	 */
	pmd_populate_kernel(&init_mm, &staging, pte_table);

	for (idx = 0, va = start; idx < PMD_SIZE / PAGE_SIZE;
	     idx++, va += PAGE_SIZE) {
		pte_t *slot = pte_offset_kernel(&staging, va);

		set_pte_at(&init_mm, va, slot, mk_pte(head + idx, PAGE_KERNEL));
	}

	spin_lock(&init_mm.page_table_lock);

	if (unlikely(!pmd_leaf(*pmd))) {
		/* @pmd is not a leaf any more, so our table is not needed. */
		pte_free_kernel(&init_mm, pte_table);
		goto unlock;
	}

	/*
	 * A higher order allocation from the buddy allocator has to be
	 * usable as independent small pages, because they can be freed
	 * individually.
	 */
	if (!PageReserved(head))
		split_page(head, get_order(PMD_SIZE));

	/* Make pte visible before pmd. See comment in pmd_install(). */
	smp_wmb();
	pmd_populate_kernel(&init_mm, pmd, pte_table);
	flush_tlb_kernel_range(start, start + PMD_SIZE);

unlock:
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}
100
/*
 * Split @pmd into PTEs if it currently is a leaf entry. The leaf check is
 * sampled under init_mm.page_table_lock; the split itself re-checks it.
 *
 * Return: 0 if nothing had to be done, otherwise whatever
 * __split_vmemmap_huge_pmd() returns.
 */
static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
	int huge;

	spin_lock(&init_mm.page_table_lock);
	huge = pmd_leaf(*pmd);
	spin_unlock(&init_mm.page_table_lock);

	return huge ? __split_vmemmap_huge_pmd(pmd, start) : 0;
}
114
/*
 * Visit every PTE below @pmd in [@addr, @end) and hand it to
 * walk->remap_pte(), bumping walk->nr_walked for each entry.
 */
static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
			      unsigned long end,
			      struct vmemmap_remap_walk *walk)
{
	pte_t *ptep = pte_offset_kernel(pmd, addr);

	if (!walk->reuse_page) {
		/*
		 * The very first PTE of the walk identifies the reuse page.
		 * It belongs to the walked range, so count it, but step
		 * over it instead of remapping it.
		 */
		walk->reuse_page = pte_page(*ptep);
		ptep++;
		addr += PAGE_SIZE;
		walk->nr_walked++;
	}

	while (addr != end) {
		walk->remap_pte(ptep, addr, walk);
		walk->nr_walked++;

		ptep++;
		addr += PAGE_SIZE;
	}
}
141
/*
 * Walk the PMD entries below @pud that cover [@addr, @end). Each entry is
 * split first (if it is a huge mapping) and then walked at PTE level.
 *
 * Return: 0 on success, or the error from split_vmemmap_huge_pmd().
 */
static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t *pmdp = pmd_offset(pud, addr);

	for (;;) {
		unsigned long boundary;
		int err;

		err = split_vmemmap_huge_pmd(pmdp, addr & PMD_MASK);
		if (err)
			return err;

		boundary = pmd_addr_end(addr, end);
		vmemmap_pte_range(pmdp, addr, boundary, walk);

		pmdp++;
		addr = boundary;
		if (addr == end)
			break;
	}

	return 0;
}
163
/*
 * Walk the PUD entries below @p4d that cover [@addr, @end), descending into
 * vmemmap_pmd_range() for each of them.
 *
 * Return: 0 on success, or the first error reported by the PMD level.
 */
static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	pud_t *pudp = pud_offset(p4d, addr);

	for (;;) {
		unsigned long boundary = pud_addr_end(addr, end);
		int err = vmemmap_pmd_range(pudp, addr, boundary, walk);

		if (err)
			return err;

		pudp++;
		addr = boundary;
		if (addr == end)
			break;
	}

	return 0;
}
183
/*
 * Walk the P4D entries below @pgd that cover [@addr, @end), descending into
 * vmemmap_pud_range() for each of them.
 *
 * Return: 0 on success, or the first error reported by the PUD level.
 */
static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	p4d_t *p4dp = p4d_offset(pgd, addr);

	for (;;) {
		unsigned long boundary = p4d_addr_end(addr, end);
		int err = vmemmap_pud_range(p4dp, addr, boundary, walk);

		if (err)
			return err;

		p4dp++;
		addr = boundary;
		if (addr == end)
			break;
	}

	return 0;
}
203
vmemmap_remap_range(unsigned long start,unsigned long end,struct vmemmap_remap_walk * walk)204 static int vmemmap_remap_range(unsigned long start, unsigned long end,
205 struct vmemmap_remap_walk *walk)
206 {
207 unsigned long addr = start;
208 unsigned long next;
209 pgd_t *pgd;
210
211 VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
212 VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
213
214 pgd = pgd_offset_k(addr);
215 do {
216 int ret;
217
218 next = pgd_addr_end(addr, end);
219 ret = vmemmap_p4d_range(pgd, addr, next, walk);
220 if (ret)
221 return ret;
222 } while (pgd++, addr = next, addr != end);
223
224 /*
225 * We only change the mapping of the vmemmap virtual address range
226 * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
227 * belongs to the range.
228 */
229 flush_tlb_kernel_range(start + PAGE_SIZE, end);
230
231 return 0;
232 }
233
/*
 * Release one vmemmap page. Such a page comes either from the memblock
 * allocator or from the buddy allocator. PG_reserved marks the memblock
 * case, which has to go through free_bootmem_page(); everything else is
 * handed back with __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (!PageReserved(page)) {
		__free_page(page);
		return;
	}

	free_bootmem_page(page);
}
247
248 /* Free a list of the vmemmap pages */
free_vmemmap_page_list(struct list_head * list)249 static void free_vmemmap_page_list(struct list_head *list)
250 {
251 struct page *page, *next;
252
253 list_for_each_entry_safe(page, next, list, lru) {
254 list_del(&page->lru);
255 free_vmemmap_page(page);
256 }
257 }
258
/*
 * remap_pte callback used when freeing vmemmap pages: queue the page that
 * @pte currently maps on walk->vmemmap_pages and point @pte at
 * walk->reuse_page instead.
 */
static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	struct page *old = pte_page(*pte);
	/*
	 * The shared mapping of the tail pages is read-only so that an
	 * illegal write to one of them gets caught.
	 */
	pte_t ro_entry = mk_pte(walk->reuse_page, PAGE_KERNEL_RO);

	list_add_tail(&old->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, ro_entry);
}
273
274 /*
275 * How many struct page structs need to be reset. When we reuse the head
276 * struct page, the special metadata (e.g. page->flags or page->mapping)
277 * cannot copy to the tail struct page structs. The invalid value will be
278 * checked in the free_tail_pages_check(). In order to avoid the message
279 * of "corrupted mapping in tail page". We need to reset at least 3 (one
280 * head struct page struct and two tail struct page structs) struct page
281 * structs.
282 */
283 #define NR_RESET_STRUCT_PAGE 3
284
reset_struct_pages(struct page * start)285 static inline void reset_struct_pages(struct page *start)
286 {
287 int i;
288 struct page *from = start + NR_RESET_STRUCT_PAGE;
289
290 for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
291 memcpy(start + i, from, sizeof(*from));
292 }
293
/*
 * remap_pte callback used when restoring vmemmap pages: take the next page
 * from walk->vmemmap_pages, fill it with a copy of the reuse page (with the
 * leading struct pages reset) and map it read-write at @addr.
 *
 * @pte must currently map walk->reuse_page, and walk->vmemmap_pages must
 * hold at least one page; neither is recoverable here.
 */
static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(*pte) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	/*
	 * Make sure that the preceding stores to the page contents become
	 * visible before the set_pte_at() write below publishes the page.
	 */
	smp_wmb();
	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}
311
312 /**
313 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
314 * to the page which @reuse is mapped to, then free vmemmap
315 * which the range are mapped to.
316 * @start: start address of the vmemmap virtual address range that we want
317 * to remap.
318 * @end: end address of the vmemmap virtual address range that we want to
319 * remap.
320 * @reuse: reuse address.
321 *
322 * Return: %0 on success, negative error code otherwise.
323 */
vmemmap_remap_free(unsigned long start,unsigned long end,unsigned long reuse)324 int vmemmap_remap_free(unsigned long start, unsigned long end,
325 unsigned long reuse)
326 {
327 int ret;
328 LIST_HEAD(vmemmap_pages);
329 struct vmemmap_remap_walk walk = {
330 .remap_pte = vmemmap_remap_pte,
331 .reuse_addr = reuse,
332 .vmemmap_pages = &vmemmap_pages,
333 };
334
335 /*
336 * In order to make remapping routine most efficient for the huge pages,
337 * the routine of vmemmap page table walking has the following rules
338 * (see more details from the vmemmap_pte_range()):
339 *
340 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
341 * should be continuous.
342 * - The @reuse address is part of the range [@reuse, @end) that we are
343 * walking which is passed to vmemmap_remap_range().
344 * - The @reuse address is the first in the complete range.
345 *
346 * So we need to make sure that @start and @reuse meet the above rules.
347 */
348 BUG_ON(start - reuse != PAGE_SIZE);
349
350 mmap_read_lock(&init_mm);
351 ret = vmemmap_remap_range(reuse, end, &walk);
352 if (ret && walk.nr_walked) {
353 end = reuse + walk.nr_walked * PAGE_SIZE;
354 /*
355 * vmemmap_pages contains pages from the previous
356 * vmemmap_remap_range call which failed. These
357 * are pages which were removed from the vmemmap.
358 * They will be restored in the following call.
359 */
360 walk = (struct vmemmap_remap_walk) {
361 .remap_pte = vmemmap_restore_pte,
362 .reuse_addr = reuse,
363 .vmemmap_pages = &vmemmap_pages,
364 };
365
366 vmemmap_remap_range(reuse, end, &walk);
367 }
368 mmap_read_unlock(&init_mm);
369
370 free_vmemmap_page_list(&vmemmap_pages);
371
372 return ret;
373 }
374
/*
 * Allocate one order-0 page for every PAGE_SIZE step in [@start, @end) and
 * append them to @list. The pages are requested from the node that
 * page_to_nid() reports for the struct page at @start.
 *
 * Return: 0 on success. On allocation failure every page on @list is
 * unlinked and freed, so @list is left empty, and -ENOMEM is returned.
 */
static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   gfp_t gfp_mask, struct list_head *list)
{
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;

	while (nr_pages--) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add_tail(&page->lru, list);
	}

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru) {
		/* Unlink first so @list never refers to a freed page. */
		list_del(&page->lru);
		__free_pages(page, 0);
	}
	return -ENOMEM;
}
395
396 /**
397 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
398 * to the page which is from the @vmemmap_pages
399 * respectively.
400 * @start: start address of the vmemmap virtual address range that we want
401 * to remap.
402 * @end: end address of the vmemmap virtual address range that we want to
403 * remap.
404 * @reuse: reuse address.
405 * @gfp_mask: GFP flag for allocating vmemmap pages.
406 *
407 * Return: %0 on success, negative error code otherwise.
408 */
vmemmap_remap_alloc(unsigned long start,unsigned long end,unsigned long reuse,gfp_t gfp_mask)409 int vmemmap_remap_alloc(unsigned long start, unsigned long end,
410 unsigned long reuse, gfp_t gfp_mask)
411 {
412 LIST_HEAD(vmemmap_pages);
413 struct vmemmap_remap_walk walk = {
414 .remap_pte = vmemmap_restore_pte,
415 .reuse_addr = reuse,
416 .vmemmap_pages = &vmemmap_pages,
417 };
418
419 /* See the comment in the vmemmap_remap_free(). */
420 BUG_ON(start - reuse != PAGE_SIZE);
421
422 if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
423 return -ENOMEM;
424
425 mmap_read_lock(&init_mm);
426 vmemmap_remap_range(reuse, end, &walk);
427 mmap_read_unlock(&init_mm);
428
429 return 0;
430 }
431 #endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */
432
433 /*
434 * Allocate a block of memory to be used to back the virtual memory map
435 * or to back the page tables that are used to create the mapping.
436 * Uses the main allocators if they are available, else bootmem.
437 */
438
__earlyonly_bootmem_alloc(int node,unsigned long size,unsigned long align,unsigned long goal)439 static void * __ref __earlyonly_bootmem_alloc(int node,
440 unsigned long size,
441 unsigned long align,
442 unsigned long goal)
443 {
444 return memblock_alloc_try_nid_raw(size, align, goal,
445 MEMBLOCK_ALLOC_ACCESSIBLE, node);
446 }
447
vmemmap_alloc_block(unsigned long size,int node)448 void * __meminit vmemmap_alloc_block(unsigned long size, int node)
449 {
450 /* If the main allocator is up use that, fallback to bootmem. */
451 if (slab_is_available()) {
452 gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
453 int order = get_order(size);
454 static bool warned;
455 struct page *page;
456
457 page = alloc_pages_node(node, gfp_mask, order);
458 if (page)
459 return page_address(page);
460
461 if (!warned) {
462 warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL,
463 "vmemmap alloc failure: order:%u", order);
464 warned = true;
465 }
466 return NULL;
467 } else
468 return __earlyonly_bootmem_alloc(node, size, size,
469 __pa(MAX_DMA_ADDRESS));
470 }
471
472 static void * __meminit altmap_alloc_block_buf(unsigned long size,
473 struct vmem_altmap *altmap);
474
475 /* need to make sure size is all the same during early stage */
vmemmap_alloc_block_buf(unsigned long size,int node,struct vmem_altmap * altmap)476 void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
477 struct vmem_altmap *altmap)
478 {
479 void *ptr;
480
481 if (altmap)
482 return altmap_alloc_block_buf(size, altmap);
483
484 ptr = sparse_buffer_alloc(size);
485 if (!ptr)
486 ptr = vmemmap_alloc_block(size, node);
487 return ptr;
488 }
489
vmem_altmap_next_pfn(struct vmem_altmap * altmap)490 static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
491 {
492 return altmap->base_pfn + altmap->reserve + altmap->alloc
493 + altmap->align;
494 }
495
vmem_altmap_nr_free(struct vmem_altmap * altmap)496 static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
497 {
498 unsigned long allocated = altmap->alloc + altmap->align;
499
500 if (altmap->free > allocated)
501 return altmap->free - allocated;
502 return 0;
503 }
504
altmap_alloc_block_buf(unsigned long size,struct vmem_altmap * altmap)505 static void * __meminit altmap_alloc_block_buf(unsigned long size,
506 struct vmem_altmap *altmap)
507 {
508 unsigned long pfn, nr_pfns, nr_align;
509
510 if (size & ~PAGE_MASK) {
511 pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
512 __func__, size);
513 return NULL;
514 }
515
516 pfn = vmem_altmap_next_pfn(altmap);
517 nr_pfns = size >> PAGE_SHIFT;
518 nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
519 nr_align = ALIGN(pfn, nr_align) - pfn;
520 if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
521 return NULL;
522
523 altmap->alloc += nr_pfns;
524 altmap->align += nr_align;
525 pfn += nr_align;
526
527 pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
528 __func__, pfn, altmap->alloc, altmap->align, nr_pfns);
529 return __va(__pfn_to_phys(pfn));
530 }
531
vmemmap_verify(pte_t * pte,int node,unsigned long start,unsigned long end)532 void __meminit vmemmap_verify(pte_t *pte, int node,
533 unsigned long start, unsigned long end)
534 {
535 unsigned long pfn = pte_pfn(*pte);
536 int actual_node = early_pfn_to_nid(pfn);
537
538 if (node_distance(actual_node, node) > LOCAL_DISTANCE)
539 pr_warn("[%lx-%lx] potential offnode page_structs\n",
540 start, end - 1);
541 }
542
vmemmap_pte_populate(pmd_t * pmd,unsigned long addr,int node,struct vmem_altmap * altmap,struct page * reuse)543 pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
544 struct vmem_altmap *altmap,
545 struct page *reuse)
546 {
547 pte_t *pte = pte_offset_kernel(pmd, addr);
548 if (pte_none(*pte)) {
549 pte_t entry;
550 void *p;
551
552 if (!reuse) {
553 p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
554 if (!p)
555 return NULL;
556 } else {
557 /*
558 * When a PTE/PMD entry is freed from the init_mm
559 * there's a a free_pages() call to this page allocated
560 * above. Thus this get_page() is paired with the
561 * put_page_testzero() on the freeing path.
562 * This can only called by certain ZONE_DEVICE path,
563 * and through vmemmap_populate_compound_pages() when
564 * slab is available.
565 */
566 get_page(reuse);
567 p = page_to_virt(reuse);
568 }
569 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
570 set_pte_at(&init_mm, addr, pte, entry);
571 }
572 return pte;
573 }
574
vmemmap_alloc_block_zero(unsigned long size,int node)575 static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
576 {
577 void *p = vmemmap_alloc_block(size, node);
578
579 if (!p)
580 return NULL;
581 memset(p, 0, size);
582
583 return p;
584 }
585
vmemmap_pmd_populate(pud_t * pud,unsigned long addr,int node)586 pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
587 {
588 pmd_t *pmd = pmd_offset(pud, addr);
589 if (pmd_none(*pmd)) {
590 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
591 if (!p)
592 return NULL;
593 pmd_populate_kernel(&init_mm, pmd, p);
594 }
595 return pmd;
596 }
597
vmemmap_pud_populate(p4d_t * p4d,unsigned long addr,int node)598 pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
599 {
600 pud_t *pud = pud_offset(p4d, addr);
601 if (pud_none(*pud)) {
602 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
603 if (!p)
604 return NULL;
605 pud_populate(&init_mm, pud, p);
606 }
607 return pud;
608 }
609
vmemmap_p4d_populate(pgd_t * pgd,unsigned long addr,int node)610 p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
611 {
612 p4d_t *p4d = p4d_offset(pgd, addr);
613 if (p4d_none(*p4d)) {
614 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
615 if (!p)
616 return NULL;
617 p4d_populate(&init_mm, p4d, p);
618 }
619 return p4d;
620 }
621
vmemmap_pgd_populate(unsigned long addr,int node)622 pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
623 {
624 pgd_t *pgd = pgd_offset_k(addr);
625 if (pgd_none(*pgd)) {
626 void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
627 if (!p)
628 return NULL;
629 pgd_populate(&init_mm, pgd, p);
630 }
631 return pgd;
632 }
633
vmemmap_populate_address(unsigned long addr,int node,struct vmem_altmap * altmap,struct page * reuse)634 static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
635 struct vmem_altmap *altmap,
636 struct page *reuse)
637 {
638 pgd_t *pgd;
639 p4d_t *p4d;
640 pud_t *pud;
641 pmd_t *pmd;
642 pte_t *pte;
643
644 pgd = vmemmap_pgd_populate(addr, node);
645 if (!pgd)
646 return NULL;
647 p4d = vmemmap_p4d_populate(pgd, addr, node);
648 if (!p4d)
649 return NULL;
650 pud = vmemmap_pud_populate(p4d, addr, node);
651 if (!pud)
652 return NULL;
653 pmd = vmemmap_pmd_populate(pud, addr, node);
654 if (!pmd)
655 return NULL;
656 pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
657 if (!pte)
658 return NULL;
659 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
660
661 return pte;
662 }
663
vmemmap_populate_range(unsigned long start,unsigned long end,int node,struct vmem_altmap * altmap,struct page * reuse)664 static int __meminit vmemmap_populate_range(unsigned long start,
665 unsigned long end, int node,
666 struct vmem_altmap *altmap,
667 struct page *reuse)
668 {
669 unsigned long addr = start;
670 pte_t *pte;
671
672 for (; addr < end; addr += PAGE_SIZE) {
673 pte = vmemmap_populate_address(addr, node, altmap, reuse);
674 if (!pte)
675 return -ENOMEM;
676 }
677
678 return 0;
679 }
680
vmemmap_populate_basepages(unsigned long start,unsigned long end,int node,struct vmem_altmap * altmap)681 int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
682 int node, struct vmem_altmap *altmap)
683 {
684 return vmemmap_populate_range(start, end, node, altmap, NULL);
685 }
686
687 /*
688 * For compound pages bigger than section size (e.g. x86 1G compound
689 * pages with 2M subsection size) fill the rest of sections as tail
690 * pages.
691 *
692 * Note that memremap_pages() resets @nr_range value and will increment
693 * it after each range successful onlining. Thus the value or @nr_range
694 * at section memmap populate corresponds to the in-progress range
695 * being onlined here.
696 */
reuse_compound_section(unsigned long start_pfn,struct dev_pagemap * pgmap)697 static bool __meminit reuse_compound_section(unsigned long start_pfn,
698 struct dev_pagemap *pgmap)
699 {
700 unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
701 unsigned long offset = start_pfn -
702 PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);
703
704 return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
705 }
706
compound_section_tail_page(unsigned long addr)707 static pte_t * __meminit compound_section_tail_page(unsigned long addr)
708 {
709 pte_t *pte;
710
711 addr -= PAGE_SIZE;
712
713 /*
714 * Assuming sections are populated sequentially, the previous section's
715 * page data can be reused.
716 */
717 pte = pte_offset_kernel(pmd_off_k(addr), addr);
718 if (!pte)
719 return NULL;
720
721 return pte;
722 }
723
vmemmap_populate_compound_pages(unsigned long start_pfn,unsigned long start,unsigned long end,int node,struct dev_pagemap * pgmap)724 static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
725 unsigned long start,
726 unsigned long end, int node,
727 struct dev_pagemap *pgmap)
728 {
729 unsigned long size, addr;
730 pte_t *pte;
731 int rc;
732
733 if (reuse_compound_section(start_pfn, pgmap)) {
734 pte = compound_section_tail_page(start);
735 if (!pte)
736 return -ENOMEM;
737
738 /*
739 * Reuse the page that was populated in the prior iteration
740 * with just tail struct pages.
741 */
742 return vmemmap_populate_range(start, end, node, NULL,
743 pte_page(*pte));
744 }
745
746 size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
747 for (addr = start; addr < end; addr += size) {
748 unsigned long next = addr, last = addr + size;
749
750 /* Populate the head page vmemmap page */
751 pte = vmemmap_populate_address(addr, node, NULL, NULL);
752 if (!pte)
753 return -ENOMEM;
754
755 /* Populate the tail pages vmemmap page */
756 next = addr + PAGE_SIZE;
757 pte = vmemmap_populate_address(next, node, NULL, NULL);
758 if (!pte)
759 return -ENOMEM;
760
761 /*
762 * Reuse the previous page for the rest of tail pages
763 * See layout diagram in Documentation/vm/vmemmap_dedup.rst
764 */
765 next += PAGE_SIZE;
766 rc = vmemmap_populate_range(next, last, node, NULL,
767 pte_page(*pte));
768 if (rc)
769 return -ENOMEM;
770 }
771
772 return 0;
773 }
774
__populate_section_memmap(unsigned long pfn,unsigned long nr_pages,int nid,struct vmem_altmap * altmap,struct dev_pagemap * pgmap)775 struct page * __meminit __populate_section_memmap(unsigned long pfn,
776 unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
777 struct dev_pagemap *pgmap)
778 {
779 unsigned long start = (unsigned long) pfn_to_page(pfn);
780 unsigned long end = start + nr_pages * sizeof(struct page);
781 int r;
782
783 if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
784 !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
785 return NULL;
786
787 if (is_power_of_2(sizeof(struct page)) &&
788 pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
789 r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
790 else
791 r = vmemmap_populate(start, end, nid, altmap);
792
793 if (r < 0)
794 return NULL;
795
796 return pfn_to_page(pfn);
797 }
798