// SPDX-License-Identifier: GPL-2.0
/*
 * Virtual Memory Map support
 *
 * (C) 2007 sgi. Christoph Lameter.
 *
 * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
 * virt_to_page, page_address() to be implemented as a base offset
 * calculation without memory access.
 *
 * However, virtual mappings need a page table and TLBs. Many Linux
 * architectures already map their physical space using 1-1 mappings
 * via TLBs. For those arches the virtual memory map is essentially
 * for free if we use the same page size as the 1-1 mappings. In that
 * case the overhead consists of a few additional pages that are
 * allocated to create a view of memory for vmemmap.
 *
 * The architecture is expected to provide a vmemmap_populate() function
 * to instantiate the mapping.
 */
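
/*
 * Illustrative sketch only (the real definitions live in
 * include/asm-generic/memory_model.h for SPARSEMEM_VMEMMAP):
 *
 *	#define __pfn_to_page(pfn)	(vmemmap + (pfn))
 *	#define __page_to_pfn(page)	(unsigned long)((page) - vmemmap)
 *
 * i.e. converting between a pfn and its struct page is plain pointer
 * arithmetic against the vmemmap base, with no memory access.
 */
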
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/memblock.h>
#include <linux/memremap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
#include <linux/sched.h>
#include <linux/pgtable.h>
#include <linux/bootmem_info.h>

#include <asm/dma.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>

#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
/**
 * struct vmemmap_remap_walk - walk vmemmap page table
 *
 * @remap_pte:		called for each lowest-level entry (PTE).
 * @nr_walked:		the number of walked PTEs.
 * @reuse_page:		the page which is reused for the tail vmemmap pages.
 * @reuse_addr:		the virtual address of the @reuse_page page.
 * @vmemmap_pages:	the list head of the vmemmap pages that can be freed
 *			or that the range is remapped from.
 */
struct vmemmap_remap_walk {
	void (*remap_pte)(pte_t *pte, unsigned long addr,
			  struct vmemmap_remap_walk *walk);
	unsigned long nr_walked;
	struct page *reuse_page;
	unsigned long reuse_addr;
	struct list_head *vmemmap_pages;
};
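
/*
 * A remap is driven roughly like this (illustrative sketch only; the real
 * callers are vmemmap_remap_free() and vmemmap_remap_alloc() below):
 *
 *	LIST_HEAD(vmemmap_pages);
 *	struct vmemmap_remap_walk walk = {
 *		.remap_pte	= vmemmap_remap_pte,
 *		.reuse_addr	= reuse,
 *		.vmemmap_pages	= &vmemmap_pages,
 *	};
 *
 *	vmemmap_remap_range(reuse, end, &walk);
 *
 * The walker records @reuse_page at @reuse_addr and then hands every other
 * PTE in the range to @remap_pte.
 */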

static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
	pmd_t __pmd;
	int i;
	unsigned long addr = start;
	struct page *page = pmd_page(*pmd);
	pte_t *pgtable = pte_alloc_one_kernel(&init_mm);

	if (!pgtable)
		return -ENOMEM;

	pmd_populate_kernel(&init_mm, &__pmd, pgtable);

	for (i = 0; i < PMD_SIZE / PAGE_SIZE; i++, addr += PAGE_SIZE) {
		pte_t entry, *pte;
		pgprot_t pgprot = PAGE_KERNEL;

		entry = mk_pte(page + i, pgprot);
		pte = pte_offset_kernel(&__pmd, addr);
		set_pte_at(&init_mm, addr, pte, entry);
	}

	spin_lock(&init_mm.page_table_lock);
	if (likely(pmd_leaf(*pmd))) {
		/*
		 * Higher order allocations from the buddy allocator must be
		 * able to be treated as independent small pages (as they can
		 * be freed individually).
		 */
		if (!PageReserved(page))
			split_page(page, get_order(PMD_SIZE));

		/* Make pte visible before pmd. See comment in pmd_install(). */
		smp_wmb();
		pmd_populate_kernel(&init_mm, pmd, pgtable);
		flush_tlb_kernel_range(start, start + PMD_SIZE);
	} else {
		pte_free_kernel(&init_mm, pgtable);
	}
	spin_unlock(&init_mm.page_table_lock);

	return 0;
}

static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
	int leaf;

	spin_lock(&init_mm.page_table_lock);
	leaf = pmd_leaf(*pmd);
	spin_unlock(&init_mm.page_table_lock);

	if (!leaf)
		return 0;

	return __split_vmemmap_huge_pmd(pmd, start);
}

static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
			      unsigned long end,
			      struct vmemmap_remap_walk *walk)
{
	pte_t *pte = pte_offset_kernel(pmd, addr);

	/*
	 * The reuse_page is found 'first' in the table walk, before we start
	 * remapping (i.e. before calling @walk->remap_pte).
	 */
	if (!walk->reuse_page) {
		walk->reuse_page = pte_page(*pte);
		/*
		 * Because the reuse address is part of the range that we are
		 * walking, skip the reuse address range.
		 */
		addr += PAGE_SIZE;
		pte++;
		walk->nr_walked++;
	}

	for (; addr != end; addr += PAGE_SIZE, pte++) {
		walk->remap_pte(pte, addr, walk);
		walk->nr_walked++;
	}
}

static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		int ret;

		ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
		if (ret)
			return ret;

		next = pmd_addr_end(addr, end);
		vmemmap_pte_range(pmd, addr, next, walk);
	} while (pmd++, addr = next, addr != end);

	return 0;
}

static int vmemmap_pud_range(p4d_t *p4d, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(p4d, addr);
	do {
		int ret;

		next = pud_addr_end(addr, end);
		ret = vmemmap_pmd_range(pud, addr, next, walk);
		if (ret)
			return ret;
	} while (pud++, addr = next, addr != end);

	return 0;
}

static int vmemmap_p4d_range(pgd_t *pgd, unsigned long addr,
			     unsigned long end,
			     struct vmemmap_remap_walk *walk)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_offset(pgd, addr);
	do {
		int ret;

		next = p4d_addr_end(addr, end);
		ret = vmemmap_pud_range(p4d, addr, next, walk);
		if (ret)
			return ret;
	} while (p4d++, addr = next, addr != end);

	return 0;
}

static int vmemmap_remap_range(unsigned long start, unsigned long end,
			       struct vmemmap_remap_walk *walk)
{
	unsigned long addr = start;
	unsigned long next;
	pgd_t *pgd;

	VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
	VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));

	pgd = pgd_offset_k(addr);
	do {
		int ret;

		next = pgd_addr_end(addr, end);
		ret = vmemmap_p4d_range(pgd, addr, next, walk);
		if (ret)
			return ret;
	} while (pgd++, addr = next, addr != end);

	/*
	 * We only change the mapping of the vmemmap virtual address range
	 * [@start + PAGE_SIZE, end), so we only need to flush the TLB which
	 * belongs to the range.
	 */
	flush_tlb_kernel_range(start + PAGE_SIZE, end);

	return 0;
}

/*
 * Free a vmemmap page. A vmemmap page can be allocated from the memblock
 * allocator or the buddy allocator. If the PG_reserved flag is set, it means
 * that it was allocated from the memblock allocator, so free it via
 * free_bootmem_page(). Otherwise, use __free_page().
 */
static inline void free_vmemmap_page(struct page *page)
{
	if (PageReserved(page))
		free_bootmem_page(page);
	else
		__free_page(page);
}

/* Free a list of the vmemmap pages */
static void free_vmemmap_page_list(struct list_head *list)
{
	struct page *page, *next;

	list_for_each_entry_safe(page, next, list, lru) {
		list_del(&page->lru);
		free_vmemmap_page(page);
	}
}

static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
			      struct vmemmap_remap_walk *walk)
{
	/*
	 * Remap the tail pages as read-only to catch illegal write operations
	 * to the tail pages.
	 */
	pgprot_t pgprot = PAGE_KERNEL_RO;
	pte_t entry = mk_pte(walk->reuse_page, pgprot);
	struct page *page = pte_page(*pte);

	list_add_tail(&page->lru, walk->vmemmap_pages);
	set_pte_at(&init_mm, addr, pte, entry);
}

/*
 * How many struct page structs need to be reset. When we reuse the head
 * struct page, the special metadata (e.g. page->flags or page->mapping)
 * cannot be copied to the tail struct page structs. The invalid value will
 * be checked in free_tail_pages_check(). In order to avoid the message of
 * "corrupted mapping in tail page", we need to reset at least 3 struct
 * pages (one head struct page and two tail struct pages).
 */
#define NR_RESET_STRUCT_PAGE		3

static inline void reset_struct_pages(struct page *start)
{
	int i;
	struct page *from = start + NR_RESET_STRUCT_PAGE;

	for (i = 0; i < NR_RESET_STRUCT_PAGE; i++)
		memcpy(start + i, from, sizeof(*from));
}

static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
				struct vmemmap_remap_walk *walk)
{
	pgprot_t pgprot = PAGE_KERNEL;
	struct page *page;
	void *to;

	BUG_ON(pte_page(*pte) != walk->reuse_page);

	page = list_first_entry(walk->vmemmap_pages, struct page, lru);
	list_del(&page->lru);
	to = page_to_virt(page);
	copy_page(to, (void *)walk->reuse_addr);
	reset_struct_pages(to);

	set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
}

/**
 * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
 *			to the page which @reuse is mapped to, then free the
 *			vmemmap pages which the range was mapped to.
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vmemmap_remap_free(unsigned long start, unsigned long end,
		       unsigned long reuse)
{
	int ret;
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_remap_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
	};

	/*
	 * In order to make the remapping routine most efficient for huge
	 * pages, the vmemmap page table walking routine has the following
	 * rules (see vmemmap_pte_range() for more details):
	 *
	 * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
	 *   should be contiguous.
	 * - The @reuse address is part of the range [@reuse, @end) that we are
	 *   walking, which is passed to vmemmap_remap_range().
	 * - The @reuse address is the first in the complete range.
	 *
	 * So we need to make sure that @start and @reuse meet the above rules.
	 */
	BUG_ON(start - reuse != PAGE_SIZE);

	mmap_read_lock(&init_mm);
	ret = vmemmap_remap_range(reuse, end, &walk);
	if (ret && walk.nr_walked) {
		end = reuse + walk.nr_walked * PAGE_SIZE;
		/*
		 * vmemmap_pages contains pages from the previous
		 * vmemmap_remap_range call which failed.  These
		 * are pages which were removed from the vmemmap.
		 * They will be restored in the following call.
		 */
		walk = (struct vmemmap_remap_walk) {
			.remap_pte	= vmemmap_restore_pte,
			.reuse_addr	= reuse,
			.vmemmap_pages	= &vmemmap_pages,
		};

		vmemmap_remap_range(reuse, end, &walk);
	}
	mmap_read_unlock(&init_mm);

	free_vmemmap_page_list(&vmemmap_pages);

	return ret;
}
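
/*
 * Illustrative usage (a sketch modelled on the HugeTLB optimization; the
 * variable names here are hypothetical):
 *
 *	unsigned long vmemmap_reuse = (unsigned long)head_vmemmap_page;
 *	unsigned long vmemmap_start = vmemmap_reuse + PAGE_SIZE;
 *	unsigned long vmemmap_end   = vmemmap_reuse +
 *				      nr_vmemmap_pages * PAGE_SIZE;
 *
 *	vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse);
 *
 * Every vmemmap page after the first one is remapped (read-only) onto the
 * page backing @vmemmap_reuse, and the now-unused pages are freed.
 */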

static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
				   gfp_t gfp_mask, struct list_head *list)
{
	unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
	int nid = page_to_nid((struct page *)start);
	struct page *page, *next;

	while (nr_pages--) {
		page = alloc_pages_node(nid, gfp_mask, 0);
		if (!page)
			goto out;
		list_add_tail(&page->lru, list);
	}

	return 0;
out:
	list_for_each_entry_safe(page, next, list, lru)
		__free_pages(page, 0);
	return -ENOMEM;
}

/**
 * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
 *			 to pages which are freshly allocated for the range
 *			 (one page per vmemmap page in the range).
 * @start:	start address of the vmemmap virtual address range that we want
 *		to remap.
 * @end:	end address of the vmemmap virtual address range that we want to
 *		remap.
 * @reuse:	reuse address.
 * @gfp_mask:	GFP flag for allocating vmemmap pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int vmemmap_remap_alloc(unsigned long start, unsigned long end,
			unsigned long reuse, gfp_t gfp_mask)
{
	LIST_HEAD(vmemmap_pages);
	struct vmemmap_remap_walk walk = {
		.remap_pte	= vmemmap_restore_pte,
		.reuse_addr	= reuse,
		.vmemmap_pages	= &vmemmap_pages,
	};

	/* See the comment in vmemmap_remap_free(). */
	BUG_ON(start - reuse != PAGE_SIZE);

	if (alloc_vmemmap_page_list(start, end, gfp_mask, &vmemmap_pages))
		return -ENOMEM;

	mmap_read_lock(&init_mm);
	vmemmap_remap_range(reuse, end, &walk);
	mmap_read_unlock(&init_mm);

	return 0;
}
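
/*
 * Illustrative counterpart to the vmemmap_remap_free() example above (the
 * variable names and GFP flags here are a hypothetical sketch): passing the
 * same (start, end, reuse) triple back re-populates the range with freshly
 * allocated pages and undoes the optimization.
 *
 *	vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse,
 *			    GFP_KERNEL | __GFP_NORETRY);
 */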
#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */

/*
 * Allocate a block of memory to be used to back the virtual memory map
 * or to back the page tables that are used to create the mapping.
 * Uses the main allocators if they are available, else bootmem.
 */

static void * __ref __earlyonly_bootmem_alloc(int node,
				unsigned long size,
				unsigned long align,
				unsigned long goal)
{
	return memblock_alloc_try_nid_raw(size, align, goal,
					       MEMBLOCK_ALLOC_ACCESSIBLE, node);
}

void * __meminit vmemmap_alloc_block(unsigned long size, int node)
{
	/* If the main allocator is up, use that; otherwise fall back to bootmem. */
	if (slab_is_available()) {
		gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
		int order = get_order(size);
		static bool warned;
		struct page *page;

		page = alloc_pages_node(node, gfp_mask, order);
		if (page)
			return page_address(page);

		if (!warned) {
			warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL,
				   "vmemmap alloc failure: order:%u", order);
			warned = true;
		}
		return NULL;
	} else
		return __earlyonly_bootmem_alloc(node, size, size,
				__pa(MAX_DMA_ADDRESS));
}

static void * __meminit altmap_alloc_block_buf(unsigned long size,
					       struct vmem_altmap *altmap);

/* need to make sure the size is all the same during the early stage */
void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
					 struct vmem_altmap *altmap)
{
	void *ptr;

	if (altmap)
		return altmap_alloc_block_buf(size, altmap);

	ptr = sparse_buffer_alloc(size);
	if (!ptr)
		ptr = vmemmap_alloc_block(size, node);
	return ptr;
}

static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
{
	return altmap->base_pfn + altmap->reserve + altmap->alloc
		+ altmap->align;
}

static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
{
	unsigned long allocated = altmap->alloc + altmap->align;

	if (altmap->free > allocated)
		return altmap->free - allocated;
	return 0;
}

static void * __meminit altmap_alloc_block_buf(unsigned long size,
					       struct vmem_altmap *altmap)
{
	unsigned long pfn, nr_pfns, nr_align;

	if (size & ~PAGE_MASK) {
		pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
				__func__, size);
		return NULL;
	}

	pfn = vmem_altmap_next_pfn(altmap);
	nr_pfns = size >> PAGE_SHIFT;
	nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
	nr_align = ALIGN(pfn, nr_align) - pfn;
	if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
		return NULL;

	altmap->alloc += nr_pfns;
	altmap->align += nr_align;
	pfn += nr_align;

	pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
			__func__, pfn, altmap->alloc, altmap->align, nr_pfns);
	return __va(__pfn_to_phys(pfn));
}
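
/*
 * Worked example with illustrative numbers: with base_pfn = 0x10000,
 * reserve = 0, alloc = 3 and align = 1, vmem_altmap_next_pfn() returns
 * pfn 0x10004.  A 2MiB request (nr_pfns = 512) is then aligned up to the
 * next 512-pfn boundary (0x10200): the 508 skipped pfns are charged to
 * @align, the 512 backing pfns to @alloc, and the block is returned at
 * pfn 0x10200.
 */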

void __meminit vmemmap_verify(pte_t *pte, int node,
				unsigned long start, unsigned long end)
{
	unsigned long pfn = pte_pfn(*pte);
	int actual_node = early_pfn_to_nid(pfn);

	if (node_distance(actual_node, node) > LOCAL_DISTANCE)
		pr_warn("[%lx-%lx] potential offnode page_structs\n",
			start, end - 1);
}

pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
				       struct vmem_altmap *altmap,
				       struct page *reuse)
{
	pte_t *pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte)) {
		pte_t entry;
		void *p;

		if (!reuse) {
			p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
			if (!p)
				return NULL;
		} else {
			/*
			 * When a PTE/PMD entry is freed from the init_mm
			 * there's a free_pages() call to this page allocated
			 * above. Thus this get_page() is paired with the
			 * put_page_testzero() on the freeing path.
			 * This can only be called by certain ZONE_DEVICE
			 * paths, and through vmemmap_populate_compound_pages()
			 * when slab is available.
			 */
			get_page(reuse);
			p = page_to_virt(reuse);
		}
		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
		set_pte_at(&init_mm, addr, pte, entry);
	}
	return pte;
}

static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
{
	void *p = vmemmap_alloc_block(size, node);

	if (!p)
		return NULL;
	memset(p, 0, size);

	return p;
}

pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
{
	pmd_t *pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
		pmd_populate_kernel(&init_mm, pmd, p);
	}
	return pmd;
}

pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
{
	pud_t *pud = pud_offset(p4d, addr);
	if (pud_none(*pud)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
		pud_populate(&init_mm, pud, p);
	}
	return pud;
}

p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
{
	p4d_t *p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
		p4d_populate(&init_mm, p4d, p);
	}
	return p4d;
}

pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
{
	pgd_t *pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd)) {
		void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
		if (!p)
			return NULL;
		pgd_populate(&init_mm, pgd, p);
	}
	return pgd;
}

static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
					      struct vmem_altmap *altmap,
					      struct page *reuse)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = vmemmap_pgd_populate(addr, node);
	if (!pgd)
		return NULL;
	p4d = vmemmap_p4d_populate(pgd, addr, node);
	if (!p4d)
		return NULL;
	pud = vmemmap_pud_populate(p4d, addr, node);
	if (!pud)
		return NULL;
	pmd = vmemmap_pmd_populate(pud, addr, node);
	if (!pmd)
		return NULL;
	pte = vmemmap_pte_populate(pmd, addr, node, altmap, reuse);
	if (!pte)
		return NULL;
	vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);

	return pte;
}

static int __meminit vmemmap_populate_range(unsigned long start,
					    unsigned long end, int node,
					    struct vmem_altmap *altmap,
					    struct page *reuse)
{
	unsigned long addr = start;
	pte_t *pte;

	for (; addr < end; addr += PAGE_SIZE) {
		pte = vmemmap_populate_address(addr, node, altmap, reuse);
		if (!pte)
			return -ENOMEM;
	}

	return 0;
}

int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
					 int node, struct vmem_altmap *altmap)
{
	return vmemmap_populate_range(start, end, node, altmap, NULL);
}

/*
 * For compound pages bigger than section size (e.g. x86 1G compound
 * pages with 2M subsection size) fill the rest of the sections as tail
 * pages.
 *
 * Note that memremap_pages() resets @nr_range value and will increment
 * it after each range is successfully onlined. Thus the value of
 * @nr_range at section memmap populate time corresponds to the
 * in-progress range being onlined here.
 */
static bool __meminit reuse_compound_section(unsigned long start_pfn,
					     struct dev_pagemap *pgmap)
{
	unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
	unsigned long offset = start_pfn -
		PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);

	return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
}
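
/*
 * For example (a hypothetical sketch assuming 4K base pages): with 1G
 * compound device pages (nr_pages = 262144) and 2M subsections, only the
 * subsection at the start of each 1G range has an aligned offset and
 * populates its own head/tail vmemmap pages; every following subsection of
 * that range reuses the tail page populated there.
 */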

static pte_t * __meminit compound_section_tail_page(unsigned long addr)
{
	pte_t *pte;

	addr -= PAGE_SIZE;

	/*
	 * Assuming sections are populated sequentially, the previous section's
	 * page data can be reused.
	 */
	pte = pte_offset_kernel(pmd_off_k(addr), addr);
	if (!pte)
		return NULL;

	return pte;
}

static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
						     unsigned long start,
						     unsigned long end, int node,
						     struct dev_pagemap *pgmap)
{
	unsigned long size, addr;
	pte_t *pte;
	int rc;

	if (reuse_compound_section(start_pfn, pgmap)) {
		pte = compound_section_tail_page(start);
		if (!pte)
			return -ENOMEM;

		/*
		 * Reuse the page that was populated in the prior iteration
		 * with just tail struct pages.
		 */
		return vmemmap_populate_range(start, end, node, NULL,
					      pte_page(*pte));
	}

	size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
	for (addr = start; addr < end; addr += size) {
		unsigned long next = addr, last = addr + size;

		/* Populate the vmemmap page that holds the head struct page */
		pte = vmemmap_populate_address(addr, node, NULL, NULL);
		if (!pte)
			return -ENOMEM;

		/* Populate the vmemmap page that holds the first tail struct pages */
		next = addr + PAGE_SIZE;
		pte = vmemmap_populate_address(next, node, NULL, NULL);
		if (!pte)
			return -ENOMEM;

		/*
		 * Reuse the previous page for the rest of the tail pages;
		 * see the layout diagram in Documentation/vm/vmemmap_dedup.rst
		 */
		next += PAGE_SIZE;
		rc = vmemmap_populate_range(next, last, node, NULL,
					    pte_page(*pte));
		if (rc)
			return -ENOMEM;
	}

	return 0;
}
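
/*
 * Illustrative layout (assuming 4K base pages and a 64-byte struct page):
 * a 2M compound page needs 512 struct pages, i.e. 8 vmemmap pages.  Page 0
 * (head + first tails) and page 1 (tails) are populated with their own
 * memory; pages 2-7 are PTE-mapped onto the same physical page as page 1.
 * See Documentation/vm/vmemmap_dedup.rst for the full diagram.
 */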

struct page * __meminit __populate_section_memmap(unsigned long pfn,
		unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
		struct dev_pagemap *pgmap)
{
	unsigned long start = (unsigned long) pfn_to_page(pfn);
	unsigned long end = start + nr_pages * sizeof(struct page);
	int r;

	if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
		!IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
		return NULL;

	if (is_power_of_2(sizeof(struct page)) &&
	    pgmap && pgmap_vmemmap_nr(pgmap) > 1 && !altmap)
		r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
	else
		r = vmemmap_populate(start, end, nid, altmap);

	if (r < 0)
		return NULL;

	return pfn_to_page(pfn);
}