| // SPDX-License-Identifier: GPL-2.0-only |
| /* |
| * linux/mm/page_alloc.c |
| * |
| * Manages the free list, the system allocates free pages here. |
| * Note that kmalloc() lives in slab.c |
| * |
| * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
| * Swap reorganised 29.12.95, Stephen Tweedie |
| * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 |
| * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 |
| * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 |
| * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 |
| * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 |
| * (lots of bits borrowed from Ingo Molnar & Andrew Morton) |
| */ |
| |
| #include <linux/stddef.h> |
| #include <linux/mm.h> |
| #include <linux/highmem.h> |
| #include <linux/swap.h> |
| #include <linux/interrupt.h> |
| #include <linux/pagemap.h> |
| #include <linux/jiffies.h> |
| #include <linux/memblock.h> |
| #include <linux/compiler.h> |
| #include <linux/kernel.h> |
| #include <linux/kasan.h> |
| #include <linux/module.h> |
| #include <linux/suspend.h> |
| #include <linux/pagevec.h> |
| #include <linux/blkdev.h> |
| #include <linux/slab.h> |
| #include <linux/ratelimit.h> |
| #include <linux/oom.h> |
| #include <linux/topology.h> |
| #include <linux/sysctl.h> |
| #include <linux/cpu.h> |
| #include <linux/cpuset.h> |
| #include <linux/memory_hotplug.h> |
| #include <linux/nodemask.h> |
| #include <linux/vmalloc.h> |
| #include <linux/vmstat.h> |
| #include <linux/mempolicy.h> |
| #include <linux/memremap.h> |
| #include <linux/stop_machine.h> |
| #include <linux/random.h> |
| #include <linux/sort.h> |
| #include <linux/pfn.h> |
| #include <linux/backing-dev.h> |
| #include <linux/fault-inject.h> |
| #include <linux/page-isolation.h> |
| #include <linux/debugobjects.h> |
| #include <linux/kmemleak.h> |
| #include <linux/compaction.h> |
| #include <trace/events/kmem.h> |
| #include <trace/events/oom.h> |
| #include <linux/prefetch.h> |
| #include <linux/mm_inline.h> |
| #include <linux/migrate.h> |
| #include <linux/hugetlb.h> |
| #include <linux/sched/rt.h> |
| #include <linux/sched/mm.h> |
| #include <linux/page_owner.h> |
| #include <linux/page_pinner.h> |
| #include <linux/kthread.h> |
| #include <linux/memcontrol.h> |
| #include <linux/ftrace.h> |
| #include <linux/lockdep.h> |
| #include <linux/nmi.h> |
| #include <linux/psi.h> |
| #include <linux/padata.h> |
| #include <linux/khugepaged.h> |
| #include <trace/hooks/mm.h> |
| |
| #include <asm/sections.h> |
| #include <asm/tlbflush.h> |
| #include <asm/div64.h> |
| #include "internal.h" |
| #include "shuffle.h" |
| #include "page_reporting.h" |
| |
| /* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */ |
| typedef int __bitwise fpi_t; |
| |
| static inline struct per_cpu_pageset_ext *pcp_to_pageset_ext(struct per_cpu_pages *pcp) |
| { |
| struct per_cpu_pageset *ps = container_of(pcp, struct per_cpu_pageset, pcp); |
| |
| return container_of(ps, struct per_cpu_pageset_ext, pageset); |
| } |
| |
| static inline struct per_cpu_pageset_ext *pageset_to_pageset_ext(struct per_cpu_pageset *ps) |
| { |
| return container_of(ps, struct per_cpu_pageset_ext, pageset); |
| } |
| |
| /* No special request */ |
| #define FPI_NONE ((__force fpi_t)0) |
| |
| /* |
| * Skip free page reporting notification for the (possibly merged) page. |
| * This does not hinder free page reporting from grabbing the page, |
| * reporting it and marking it "reported" - it only skips notifying |
| * the free page reporting infrastructure about a newly freed page. For |
| * example, used when temporarily pulling a page from a freelist and |
| * putting it back unmodified. |
| */ |
| #define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0)) |
| |
| /* |
| * Place the (possibly merged) page to the tail of the freelist. Will ignore |
| * page shuffling (relevant code - e.g., memory onlining - is expected to |
| * shuffle the whole zone). |
| * |
| * Note: No code should rely on this flag for correctness - it's purely |
| * to allow for optimizations when handing back either fresh pages |
| * (memory onlining) or untouched pages (page isolation, free page |
| * reporting). |
| */ |
| #define FPI_TO_TAIL ((__force fpi_t)BIT(1)) |
| |
| /* |
| * Don't poison memory with KASAN (only for the tag-based modes). |
| * During boot, all non-reserved memblock memory is exposed to page_alloc. |
| * Poisoning all that memory lengthens boot time, especially on systems with |
| * large amount of RAM. This flag is used to skip that poisoning. |
| * This is only done for the tag-based KASAN modes, as those are able to |
| * detect memory corruptions with the memory tags assigned by default. |
| * All memory allocated normally after boot gets poisoned as usual. |
| */ |
| #define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2)) |
| |
| /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ |
| static DEFINE_MUTEX(pcp_batch_high_lock); |
| #define MIN_PERCPU_PAGELIST_FRACTION (8) |
| |
#if !defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)
/*
 * UP spin_trylock always succeeds, so disable IRQs around the trylock to
 * prevent re-entrancy.
 */
#define pcp_trylock_prepare(flags)	local_irq_save(flags)
#define pcp_trylock_finish(flags)	local_irq_restore(flags)
#else
/*
 * On SMP, spin_trylock is sufficient protection.
 * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
 * Both helpers therefore expand to nothing.
 */
#define pcp_trylock_prepare(flags)	do { } while (0)
#define pcp_trylock_finish(flags)	do { } while (0)
#endif
| |
| #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID |
| DEFINE_PER_CPU(int, numa_node); |
| EXPORT_PER_CPU_SYMBOL(numa_node); |
| #endif |
| |
| DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); |
| |
| #ifdef CONFIG_HAVE_MEMORYLESS_NODES |
| /* |
| * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. |
| * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. |
| * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() |
| * defined in <linux/topology.h>. |
| */ |
| DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ |
| EXPORT_PER_CPU_SYMBOL(_numa_mem_); |
| #endif |
| |
| static DEFINE_MUTEX(pcpu_drain_mutex); |
| |
| #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY |
| volatile unsigned long latent_entropy __latent_entropy; |
| EXPORT_SYMBOL(latent_entropy); |
| #endif |
| |
| /* |
| * Array of node states. |
| */ |
| nodemask_t node_states[NR_NODE_STATES] __read_mostly = { |
| [N_POSSIBLE] = NODE_MASK_ALL, |
| [N_ONLINE] = { { [0] = 1UL } }, |
| #ifndef CONFIG_NUMA |
| [N_NORMAL_MEMORY] = { { [0] = 1UL } }, |
| #ifdef CONFIG_HIGHMEM |
| [N_HIGH_MEMORY] = { { [0] = 1UL } }, |
| #endif |
| [N_MEMORY] = { { [0] = 1UL } }, |
| [N_CPU] = { { [0] = 1UL } }, |
| #endif /* NUMA */ |
| }; |
| EXPORT_SYMBOL(node_states); |
| |
| atomic_long_t _totalram_pages __read_mostly; |
| EXPORT_SYMBOL(_totalram_pages); |
| unsigned long totalreserve_pages __read_mostly; |
| unsigned long totalcma_pages __read_mostly; |
| |
| int percpu_pagelist_fraction; |
| gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; |
| DEFINE_STATIC_KEY_FALSE(init_on_alloc); |
| EXPORT_SYMBOL(init_on_alloc); |
| |
| DEFINE_STATIC_KEY_FALSE(init_on_free); |
| EXPORT_SYMBOL(init_on_free); |
| |
| static bool _init_on_alloc_enabled_early __read_mostly |
| = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON); |
| static int __init early_init_on_alloc(char *buf) |
| { |
| |
| return kstrtobool(buf, &_init_on_alloc_enabled_early); |
| } |
| early_param("init_on_alloc", early_init_on_alloc); |
| |
| static bool _init_on_free_enabled_early __read_mostly |
| = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON); |
| static int __init early_init_on_free(char *buf) |
| { |
| return kstrtobool(buf, &_init_on_free_enabled_early); |
| } |
| early_param("init_on_free", early_init_on_free); |
| |
| /* |
| * A cached value of the page's pageblock's migratetype, used when the page is |
| * put on a pcplist. Used to avoid the pageblock migratetype lookup when |
| * freeing from pcplists in most cases, at the cost of possibly becoming stale. |
| * Also the migratetype set in the page does not necessarily match the pcplist |
| * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any |
| * other index - this ensures that it will be put on the correct CMA freelist. |
| */ |
| static inline int get_pcppage_migratetype(struct page *page) |
| { |
| return page->index; |
| } |
| |
| static inline void set_pcppage_migratetype(struct page *page, int migratetype) |
| { |
| page->index = migratetype; |
| } |
| |
| #ifdef CONFIG_PM_SLEEP |
| /* |
| * The following functions are used by the suspend/hibernate code to temporarily |
| * change gfp_allowed_mask in order to avoid using I/O during memory allocations |
| * while devices are suspended. To avoid races with the suspend/hibernate code, |
| * they should always be called with system_transition_mutex held |
| * (gfp_allowed_mask also should only be modified with system_transition_mutex |
| * held, unless the suspend/hibernate code is guaranteed not to run in parallel |
| * with that modification). |
| */ |
| |
| static gfp_t saved_gfp_mask; |
| |
| void pm_restore_gfp_mask(void) |
| { |
| WARN_ON(!mutex_is_locked(&system_transition_mutex)); |
| if (saved_gfp_mask) { |
| gfp_allowed_mask = saved_gfp_mask; |
| saved_gfp_mask = 0; |
| } |
| } |
| |
| void pm_restrict_gfp_mask(void) |
| { |
| WARN_ON(!mutex_is_locked(&system_transition_mutex)); |
| WARN_ON(saved_gfp_mask); |
| saved_gfp_mask = gfp_allowed_mask; |
| gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); |
| } |
| |
| bool pm_suspended_storage(void) |
| { |
| if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) |
| return false; |
| return true; |
| } |
| #endif /* CONFIG_PM_SLEEP */ |
| |
| #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
| unsigned int pageblock_order __read_mostly; |
| #endif |
| |
| static void __free_pages_ok(struct page *page, unsigned int order, |
| fpi_t fpi_flags); |
| |
| /* |
| * results with 256, 32 in the lowmem_reserve sysctl: |
| * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) |
| * 1G machine -> (16M dma, 784M normal, 224M high) |
| * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA |
| * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL |
| * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA |
| * |
| * TBD: should special case ZONE_DMA32 machines here - in those we normally |
| * don't need any ZONE_NORMAL reservation |
| */ |
| int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = { |
| #ifdef CONFIG_ZONE_DMA |
| [ZONE_DMA] = 256, |
| #endif |
| #ifdef CONFIG_ZONE_DMA32 |
| [ZONE_DMA32] = 256, |
| #endif |
| [ZONE_NORMAL] = 32, |
| #ifdef CONFIG_HIGHMEM |
| [ZONE_HIGHMEM] = 0, |
| #endif |
| [ZONE_MOVABLE] = 0, |
| }; |
| |
| static char * const zone_names[MAX_NR_ZONES] = { |
| #ifdef CONFIG_ZONE_DMA |
| "DMA", |
| #endif |
| #ifdef CONFIG_ZONE_DMA32 |
| "DMA32", |
| #endif |
| "Normal", |
| #ifdef CONFIG_HIGHMEM |
| "HighMem", |
| #endif |
| "Movable", |
| #ifdef CONFIG_ZONE_DEVICE |
| "Device", |
| #endif |
| }; |
| |
| const char * const migratetype_names[MIGRATE_TYPES] = { |
| "Unmovable", |
| "Movable", |
| "Reclaimable", |
| #ifdef CONFIG_CMA |
| "CMA", |
| #endif |
| "HighAtomic", |
| #ifdef CONFIG_MEMORY_ISOLATION |
| "Isolate", |
| #endif |
| }; |
| |
| compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { |
| [NULL_COMPOUND_DTOR] = NULL, |
| [COMPOUND_PAGE_DTOR] = free_compound_page, |
| #ifdef CONFIG_HUGETLB_PAGE |
| [HUGETLB_PAGE_DTOR] = free_huge_page, |
| #endif |
| #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, |
| #endif |
| }; |
| |
| int min_free_kbytes = 1024; |
| int user_min_free_kbytes = -1; |
| #ifdef CONFIG_DISCONTIGMEM |
| /* |
| * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges |
| * are not on separate NUMA nodes. Functionally this works but with |
| * watermark_boost_factor, it can reclaim prematurely as the ranges can be |
| * quite small. By default, do not boost watermarks on discontigmem as in |
| * many cases very high-order allocations like THP are likely to be |
| * unsupported and the premature reclaim offsets the advantage of long-term |
| * fragmentation avoidance. |
| */ |
| int watermark_boost_factor __read_mostly; |
| #else |
| int watermark_boost_factor __read_mostly = 15000; |
| #endif |
| int watermark_scale_factor = 10; |
| |
| static unsigned long nr_kernel_pages __initdata; |
| static unsigned long nr_all_pages __initdata; |
| static unsigned long dma_reserve __initdata; |
| |
| static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata; |
| static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata; |
| static unsigned long required_kernelcore __initdata; |
| static unsigned long required_kernelcore_percent __initdata; |
| static unsigned long required_movablecore __initdata; |
| static unsigned long required_movablecore_percent __initdata; |
| static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata; |
| static bool mirrored_kernelcore __meminitdata; |
| |
| /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
| int movable_zone; |
| EXPORT_SYMBOL(movable_zone); |
| |
| #if MAX_NUMNODES > 1 |
| unsigned int nr_node_ids __read_mostly = MAX_NUMNODES; |
| unsigned int nr_online_nodes __read_mostly = 1; |
| EXPORT_SYMBOL(nr_node_ids); |
| EXPORT_SYMBOL(nr_online_nodes); |
| #endif |
| |
| int page_group_by_mobility_disabled __read_mostly; |
| |
| #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
| /* |
| * During boot we initialize deferred pages on-demand, as needed, but once |
| * page_alloc_init_late() has finished, the deferred pages are all initialized, |
| * and we can permanently disable that path. |
| */ |
| static DEFINE_STATIC_KEY_TRUE(deferred_pages); |
| |
| static inline bool deferred_pages_enabled(void) |
| { |
| return static_branch_unlikely(&deferred_pages); |
| } |
| |
| /* Returns true if the struct page for the pfn is uninitialised */ |
| static inline bool __meminit early_page_uninitialised(unsigned long pfn) |
| { |
| int nid = early_pfn_to_nid(pfn); |
| |
| if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) |
| return true; |
| |
| return false; |
| } |
| |
| /* |
| * Returns true when the remaining initialisation should be deferred until |
| * later in the boot cycle when it can be parallelised. |
| */ |
| static bool __meminit |
| defer_init(int nid, unsigned long pfn, unsigned long end_pfn) |
| { |
| static unsigned long prev_end_pfn, nr_initialised; |
| |
| /* |
| * prev_end_pfn static that contains the end of previous zone |
| * No need to protect because called very early in boot before smp_init. |
| */ |
| if (prev_end_pfn != end_pfn) { |
| prev_end_pfn = end_pfn; |
| nr_initialised = 0; |
| } |
| |
| /* Always populate low zones for address-constrained allocations */ |
| if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) |
| return false; |
| |
| if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) |
| return true; |
| /* |
| * We start only with one section of pages, more pages are added as |
| * needed until the rest of deferred pages are initialized. |
| */ |
| nr_initialised++; |
| if ((nr_initialised > PAGES_PER_SECTION) && |
| (pfn & (PAGES_PER_SECTION - 1)) == 0) { |
| NODE_DATA(nid)->first_deferred_pfn = pfn; |
| return true; |
| } |
| return false; |
| } |
| #else |
| static inline bool deferred_pages_enabled(void) |
| { |
| return false; |
| } |
| |
| static inline bool early_page_uninitialised(unsigned long pfn) |
| { |
| return false; |
| } |
| |
| static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn) |
| { |
| return false; |
| } |
| #endif |
| |
| /* Return a pointer to the bitmap storing bits affecting a block of pages */ |
| static inline unsigned long *get_pageblock_bitmap(struct page *page, |
| unsigned long pfn) |
| { |
| #ifdef CONFIG_SPARSEMEM |
| return section_to_usemap(__pfn_to_section(pfn)); |
| #else |
| return page_zone(page)->pageblock_flags; |
| #endif /* CONFIG_SPARSEMEM */ |
| } |
| |
| static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) |
| { |
| #ifdef CONFIG_SPARSEMEM |
| pfn &= (PAGES_PER_SECTION-1); |
| #else |
| pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); |
| #endif /* CONFIG_SPARSEMEM */ |
| return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
| } |
| |
| /** |
| * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages |
| * @page: The page within the block of interest |
| * @pfn: The target page frame number |
| * @mask: mask of bits that the caller is interested in |
| * |
| * Return: pageblock_bits flags |
| */ |
| static __always_inline |
| unsigned long __get_pfnblock_flags_mask(struct page *page, |
| unsigned long pfn, |
| unsigned long mask) |
| { |
| unsigned long *bitmap; |
| unsigned long bitidx, word_bitidx; |
| unsigned long word; |
| |
| bitmap = get_pageblock_bitmap(page, pfn); |
| bitidx = pfn_to_bitidx(page, pfn); |
| word_bitidx = bitidx / BITS_PER_LONG; |
| bitidx &= (BITS_PER_LONG-1); |
| /* |
| * This races, without locks, with set_pfnblock_flags_mask(). Ensure |
| * a consistent read of the memory array, so that results, even though |
| * racy, are not corrupted. |
| */ |
| word = READ_ONCE(bitmap[word_bitidx]); |
| return (word >> bitidx) & mask; |
| } |
| |
/*
 * Exported, out-of-line wrapper around __get_pfnblock_flags_mask(); takes the
 * same arguments and returns the same value.
 */
unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
				      unsigned long mask)
{
	unsigned long flags;

	flags = __get_pfnblock_flags_mask(page, pfn, mask);
	return flags;
}
EXPORT_SYMBOL_GPL(get_pfnblock_flags_mask);
| |
| int isolate_anon_lru_page(struct page *page) |
| { |
| int ret; |
| |
| if (!PageLRU(page) || !PageAnon(page)) |
| return -EINVAL; |
| |
| if (!get_page_unless_zero(page)) |
| return -EINVAL; |
| |
| ret = isolate_lru_page(page); |
| put_page(page); |
| |
| return ret; |
| } |
| EXPORT_SYMBOL_GPL(isolate_anon_lru_page); |
| |
| static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) |
| { |
| return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); |
| } |
| |
| /** |
| * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages |
| * @page: The page within the block of interest |
| * @flags: The flags to set |
| * @pfn: The target page frame number |
| * @mask: mask of bits that the caller is interested in |
| */ |
| void set_pfnblock_flags_mask(struct page *page, unsigned long flags, |
| unsigned long pfn, |
| unsigned long mask) |
| { |
| unsigned long *bitmap; |
| unsigned long bitidx, word_bitidx; |
| unsigned long old_word, word; |
| |
| BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); |
| BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits)); |
| |
| bitmap = get_pageblock_bitmap(page, pfn); |
| bitidx = pfn_to_bitidx(page, pfn); |
| word_bitidx = bitidx / BITS_PER_LONG; |
| bitidx &= (BITS_PER_LONG-1); |
| |
| VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); |
| |
| mask <<= bitidx; |
| flags <<= bitidx; |
| |
| word = READ_ONCE(bitmap[word_bitidx]); |
| for (;;) { |
| old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); |
| if (word == old_word) |
| break; |
| word = old_word; |
| } |
| } |
| |
| void set_pageblock_migratetype(struct page *page, int migratetype) |
| { |
| if (unlikely(page_group_by_mobility_disabled && |
| migratetype < MIGRATE_PCPTYPES)) |
| migratetype = MIGRATE_UNMOVABLE; |
| |
| set_pfnblock_flags_mask(page, (unsigned long)migratetype, |
| page_to_pfn(page), MIGRATETYPE_MASK); |
| } |
| |
| #ifdef CONFIG_DEBUG_VM |
| static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
| { |
| int ret = 0; |
| unsigned seq; |
| unsigned long pfn = page_to_pfn(page); |
| unsigned long sp, start_pfn; |
| |
| do { |
| seq = zone_span_seqbegin(zone); |
| start_pfn = zone->zone_start_pfn; |
| sp = zone->spanned_pages; |
| if (!zone_spans_pfn(zone, pfn)) |
| ret = 1; |
| } while (zone_span_seqretry(zone, seq)); |
| |
| if (ret) |
| pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", |
| pfn, zone_to_nid(zone), zone->name, |
| start_pfn, start_pfn + sp); |
| |
| return ret; |
| } |
| |
/*
 * Return 1 if @page's pfn passes pfn_valid_within() and the page belongs to
 * @zone, 0 otherwise.
 */
static int page_is_consistent(struct zone *zone, struct page *page)
{
	return pfn_valid_within(page_to_pfn(page)) &&
	       zone == page_zone(page);
}
| /* |
| * Temporary debugging check for pages not lying within a given zone. |
| */ |
| static int __maybe_unused bad_range(struct zone *zone, struct page *page) |
| { |
| if (page_outside_zone_boundaries(zone, page)) |
| return 1; |
| if (!page_is_consistent(zone, page)) |
| return 1; |
| |
| return 0; |
| } |
| #else |
| static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) |
| { |
| return 0; |
| } |
| #endif |
| |
| static void bad_page(struct page *page, const char *reason) |
| { |
| static unsigned long resume; |
| static unsigned long nr_shown; |
| static unsigned long nr_unshown; |
| |
| /* |
| * Allow a burst of 60 reports, then keep quiet for that minute; |
| * or allow a steady drip of one report per second. |
| */ |
| if (nr_shown == 60) { |
| if (time_before(jiffies, resume)) { |
| nr_unshown++; |
| goto out; |
| } |
| if (nr_unshown) { |
| pr_alert( |
| "BUG: Bad page state: %lu messages suppressed\n", |
| nr_unshown); |
| nr_unshown = 0; |
| } |
| nr_shown = 0; |
| } |
| if (nr_shown++ == 0) |
| resume = jiffies + 60 * HZ; |
| |
| pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", |
| current->comm, page_to_pfn(page)); |
| __dump_page(page, reason); |
| dump_page_owner(page); |
| |
| print_modules(); |
| dump_stack(); |
| out: |
| /* Leave bad fields for debug, except PageBuddy could make trouble */ |
| page_mapcount_reset(page); /* remove PageBuddy */ |
| add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
| } |
| |
| /* |
| * Higher-order pages are called "compound pages". They are structured thusly: |
| * |
| * The first PAGE_SIZE page is called the "head page" and has PG_head set. |
| * |
| * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded |
| * in bit 0 of page->compound_head. The remaining bits are a pointer to the head page. |
| * |
| * The first tail page's ->compound_dtor holds the offset in array of compound |
| * page destructors. See compound_page_dtors. |
| * |
| * The first tail page's ->compound_order holds the order of allocation. |
| * This usage means that zero-order pages may not be compound. |
| */ |
| |
| void free_compound_page(struct page *page) |
| { |
| mem_cgroup_uncharge(page); |
| __free_pages_ok(page, compound_order(page), FPI_NONE); |
| } |
| |
| void prep_compound_page(struct page *page, unsigned int order) |
| { |
| int i; |
| int nr_pages = 1 << order; |
| |
| __SetPageHead(page); |
| for (i = 1; i < nr_pages; i++) { |
| struct page *p = page + i; |
| set_page_count(p, 0); |
| p->mapping = TAIL_MAPPING; |
| set_compound_head(p, page); |
| } |
| |
| set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); |
| set_compound_order(page, order); |
| atomic_set(compound_mapcount_ptr(page), -1); |
| if (hpage_pincount_available(page)) |
| atomic_set(compound_pincount_ptr(page), 0); |
| } |
| |
| #ifdef CONFIG_DEBUG_PAGEALLOC |
| unsigned int _debug_guardpage_minorder; |
| |
| bool _debug_pagealloc_enabled_early __read_mostly |
| = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT); |
| EXPORT_SYMBOL(_debug_pagealloc_enabled_early); |
| DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled); |
| EXPORT_SYMBOL(_debug_pagealloc_enabled); |
| |
| DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled); |
| |
| static int __init early_debug_pagealloc(char *buf) |
| { |
| return kstrtobool(buf, &_debug_pagealloc_enabled_early); |
| } |
| early_param("debug_pagealloc", early_debug_pagealloc); |
| |
| static int __init debug_guardpage_minorder_setup(char *buf) |
| { |
| unsigned long res; |
| |
| if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { |
| pr_err("Bad debug_guardpage_minorder value\n"); |
| return 0; |
| } |
| _debug_guardpage_minorder = res; |
| pr_info("Setting debug_guardpage_minorder to %lu\n", res); |
| return 0; |
| } |
| early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); |
| |
| static inline bool set_page_guard(struct zone *zone, struct page *page, |
| unsigned int order, int migratetype) |
| { |
| if (!debug_guardpage_enabled()) |
| return false; |
| |
| if (order >= debug_guardpage_minorder()) |
| return false; |
| |
| __SetPageGuard(page); |
| INIT_LIST_HEAD(&page->lru); |
| set_page_private(page, order); |
| /* Guard pages are not available for any usage */ |
| __mod_zone_freepage_state(zone, -(1 << order), migratetype); |
| |
| return true; |
| } |
| |
/*
 * Undo set_page_guard(): clear the guard marker and the stored order, and
 * give the pages back to the zone's free page accounting unless
 * @migratetype is an isolate type. No-op when guard pages are disabled.
 */
static inline void clear_page_guard(struct zone *zone, struct page *page,
				    unsigned int order, int migratetype)
{
	if (!debug_guardpage_enabled())
		return;

	__ClearPageGuard(page);
	set_page_private(page, 0);

	if (is_migrate_isolate(migratetype))
		return;

	__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
| #else |
| static inline bool set_page_guard(struct zone *zone, struct page *page, |
| unsigned int order, int migratetype) { return false; } |
| static inline void clear_page_guard(struct zone *zone, struct page *page, |
| unsigned int order, int migratetype) {} |
| #endif |
| |
| /* |
| * Enable static keys related to various memory debugging and hardening options. |
| * Some override others, and depend on early params that are evaluated in the |
| * order of appearance. So we need to first gather the full picture of what was |
| * enabled, and then make decisions. |
| */ |
| void init_mem_debugging_and_hardening(void) |
| { |
| bool page_poisoning_requested = false; |
| |
| #ifdef CONFIG_PAGE_POISONING |
| /* |
| * Page poisoning is debug page alloc for some arches. If |
| * either of those options are enabled, enable poisoning. |
| */ |
| if (page_poisoning_enabled() || |
| (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && |
| debug_pagealloc_enabled())) { |
| static_branch_enable(&_page_poisoning_enabled); |
| page_poisoning_requested = true; |
| } |
| #endif |
| |
| if (_init_on_alloc_enabled_early) { |
| if (page_poisoning_requested) |
| pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " |
| "will take precedence over init_on_alloc\n"); |
| else |
| static_branch_enable(&init_on_alloc); |
| } |
| if (_init_on_free_enabled_early) { |
| if (page_poisoning_requested) |
| pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " |
| "will take precedence over init_on_free\n"); |
| else |
| static_branch_enable(&init_on_free); |
| } |
| |
| #ifdef CONFIG_DEBUG_PAGEALLOC |
| if (!debug_pagealloc_enabled()) |
| return; |
| |
| static_branch_enable(&_debug_pagealloc_enabled); |
| |
| if (!debug_guardpage_minorder()) |
| return; |
| |
| static_branch_enable(&_debug_guardpage_enabled); |
| #endif |
| } |
| |
/*
 * Mark @page as a free buddy block: store @order in page_private() first,
 * then set PageBuddy.
 */
static inline void set_buddy_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);

	__SetPageBuddy(page);
}
| |
| /* |
| * This function checks whether a page is free && is the buddy |
| * we can coalesce a page and its buddy if |
| * (a) the buddy is not in a hole (check before calling!) && |
| * (b) the buddy is in the buddy system && |
| * (c) a page and its buddy have the same order && |
| * (d) a page and its buddy are in the same zone. |
| * |
| * For recording whether a page is in the buddy system, we set PageBuddy. |
| * Setting, clearing, and testing PageBuddy is serialized by zone->lock. |
| * |
| * For recording page's order, we use page_private(page). |
| */ |
| static inline bool page_is_buddy(struct page *page, struct page *buddy, |
| unsigned int order) |
| { |
| if (!page_is_guard(buddy) && !PageBuddy(buddy)) |
| return false; |
| |
| if (buddy_order(buddy) != order) |
| return false; |
| |
| /* |
| * zone check is done late to avoid uselessly calculating |
| * zone/node ids for pages that could never merge. |
| */ |
| if (page_zone_id(page) != page_zone_id(buddy)) |
| return false; |
| |
| VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); |
| |
| return true; |
| } |
| |
| #ifdef CONFIG_COMPACTION |
| static inline struct capture_control *task_capc(struct zone *zone) |
| { |
| struct capture_control *capc = current->capture_control; |
| |
| return unlikely(capc) && |
| !(current->flags & PF_KTHREAD) && |
| !capc->page && |
| capc->cc->zone == zone ? capc : NULL; |
| } |
| |
| static inline bool |
| compaction_capture(struct capture_control *capc, struct page *page, |
| int order, int migratetype) |
| { |
| if (!capc || order != capc->cc->order) |
| return false; |
| |
| /* Do not accidentally pollute CMA or isolated regions*/ |
| if (is_migrate_cma(migratetype) || |
| is_migrate_isolate(migratetype)) |
| return false; |
| |
| /* |
| * Do not let lower order allocations polluate a movable pageblock. |
| * This might let an unmovable request use a reclaimable pageblock |
| * and vice-versa but no more than normal fallback logic which can |
| * have trouble finding a high-order free page. |
| */ |
| if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) |
| return false; |
| |
| capc->page = page; |
| return true; |
| } |
| |
| #else |
| static inline struct capture_control *task_capc(struct zone *zone) |
| { |
| return NULL; |
| } |
| |
| static inline bool |
| compaction_capture(struct capture_control *capc, struct page *page, |
| int order, int migratetype) |
| { |
| return false; |
| } |
| #endif /* CONFIG_COMPACTION */ |
| |
| /* Used for pages not on another list */ |
| static inline void add_to_free_list(struct page *page, struct zone *zone, |
| unsigned int order, int migratetype) |
| { |
| struct free_area *area = &zone->free_area[order]; |
| |
| list_add(&page->lru, &area->free_list[migratetype]); |
| area->nr_free++; |
| } |
| |
| /* Used for pages not on another list */ |
| static inline void add_to_free_list_tail(struct page *page, struct zone *zone, |
| unsigned int order, int migratetype) |
| { |
| struct free_area *area = &zone->free_area[order]; |
| |
| list_add_tail(&page->lru, &area->free_list[migratetype]); |
| area->nr_free++; |
| } |
| |
| /* |
| * Used for pages which are on another list. Move the pages to the tail |
| * of the list - so the moved pages won't immediately be considered for |
| * allocation again (e.g., optimization for memory onlining). |
| */ |
| static inline void move_to_free_list(struct page *page, struct zone *zone, |
| unsigned int order, int migratetype) |
| { |
| struct free_area *area = &zone->free_area[order]; |
| |
| list_move_tail(&page->lru, &area->free_list[migratetype]); |
| } |
| |
| static inline void del_page_from_free_list(struct page *page, struct zone *zone, |
| unsigned int order) |
| { |
| /* clear reported state and update reported page count */ |
| if (page_reported(page)) |
| __ClearPageReported(page); |
| |
| list_del(&page->lru); |
| __ClearPageBuddy(page); |
| set_page_private(page, 0); |
| zone->free_area[order].nr_free--; |
| } |
| |
| /* |
| * If this is not the largest possible page, check if the buddy |
| * of the next-highest order is free. If it is, it's possible |
| * that pages are being freed that will coalesce soon. In case, |
| * that is happening, add the free page to the tail of the list |
| * so it's less likely to be used soon and more likely to be merged |
| * as a higher order page |
| */ |
| static inline bool |
| buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, |
| struct page *page, unsigned int order) |
| { |
| struct page *higher_page, *higher_buddy; |
| unsigned long combined_pfn; |
| |
| if (order >= MAX_ORDER - 2) |
| return false; |
| |
| if (!pfn_valid_within(buddy_pfn)) |
| return false; |
| |
| combined_pfn = buddy_pfn & pfn; |
| higher_page = page + (combined_pfn - pfn); |
| buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); |
| higher_buddy = higher_page + (buddy_pfn - combined_pfn); |
| |
| return pfn_valid_within(buddy_pfn) && |
| page_is_buddy(higher_page, higher_buddy, order + 1); |
| } |
| |
| /* |
| * Freeing function for a buddy system allocator. |
| * |
| * The concept of a buddy system is to maintain direct-mapped table |
| * (containing bit values) for memory blocks of various "orders". |
| * The bottom level table contains the map for the smallest allocatable |
| * units of memory (here, pages), and each level above it describes |
| * pairs of units from the levels below, hence, "buddies". |
| * At a high level, all that happens here is marking the table entry |
| * at the bottom level available, and propagating the changes upward |
| * as necessary, plus some accounting needed to play nicely with other |
| * parts of the VM system. |
| * At each level, we keep a list of pages, which are heads of continuous |
| * free pages of length of (1 << order) and marked with PageBuddy. |
| * Page's order is recorded in page_private(page) field. |
| * So when we are allocating or freeing one, we can derive the state of the |
| * other. That is, if we allocate a small block, and both were |
| * free, the remainder of the region must be split into blocks. |
| * If a block is freed, and its buddy is also free, then this |
| * triggers coalescing into a block of larger size. |
| * |
| * -- nyc |
| */ |
| |
| static inline void __free_one_page(struct page *page, |
| unsigned long pfn, |
| struct zone *zone, unsigned int order, |
| int migratetype, fpi_t fpi_flags) |
| { |
| struct capture_control *capc = task_capc(zone); |
| unsigned long buddy_pfn; |
| unsigned long combined_pfn; |
| unsigned int max_order; |
| struct page *buddy; |
| bool to_tail; |
| |
| max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order); |
| |
| VM_BUG_ON(!zone_is_initialized(zone)); |
| VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); |
| |
| VM_BUG_ON(migratetype == -1); |
| if (likely(!is_migrate_isolate(migratetype))) |
| __mod_zone_freepage_state(zone, 1 << order, migratetype); |
| |
| VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); |
| VM_BUG_ON_PAGE(bad_range(zone, page), page); |
| |
| continue_merging: |
| while (order < max_order) { |
| if (compaction_capture(capc, page, order, migratetype)) { |
| __mod_zone_freepage_state(zone, -(1 << order), |
| migratetype); |
| return; |
| } |
| buddy_pfn = __find_buddy_pfn(pfn, order); |
| buddy = page + (buddy_pfn - pfn); |
| |
| if (!pfn_valid_within(buddy_pfn)) |
| goto done_merging; |
| if (!page_is_buddy(page, buddy, order)) |
| goto done_merging; |
| /* |
| * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, |
| * merge with it and move up one order. |
| */ |
| if (page_is_guard(buddy)) |
| clear_page_guard(zone, buddy, order, migratetype); |
| else |
| del_page_from_free_list(buddy, zone, order); |
| combined_pfn = buddy_pfn & pfn; |
| page = page + (combined_pfn - pfn); |
| pfn = combined_pfn; |
| order++; |
| } |
| if (order < MAX_ORDER - 1) { |
| /* If we are here, it means order is >= pageblock_order. |
| * We want to prevent merge between freepages on isolate |
| * pageblock and normal pageblock. Without this, pageblock |
| * isolation could cause incorrect freepage or CMA accounting. |
| * |
| * We don't want to hit this code for the more frequent |
| * low-order merging. |
| */ |
| if (unlikely(has_isolate_pageblock(zone))) { |
| int buddy_mt; |
| |
| buddy_pfn = __find_buddy_pfn(pfn, order); |
| buddy = page + (buddy_pfn - pfn); |
| buddy_mt = get_pageblock_migratetype(buddy); |
| |
| if (migratetype != buddy_mt |
| && (is_migrate_isolate(migratetype) || |
| is_migrate_isolate(buddy_mt))) |
| goto done_merging; |
| } |
| max_order = order + 1; |
| goto continue_merging; |
| } |
| |
| done_merging: |
| set_buddy_order(page, order); |
| |
| if (fpi_flags & FPI_TO_TAIL) |
| to_tail = true; |
| else if (is_shuffle_order(order)) |
| to_tail = shuffle_pick_tail(); |
| else |
| to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); |
| |
| if (to_tail) |
| add_to_free_list_tail(page, zone, order, migratetype); |
| else |
| add_to_free_list(page, zone, order, migratetype); |
| |
| /* Notify page reporting subsystem of freed page */ |
| if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) |
| page_reporting_notify_free(order); |
| } |
| |
| /* |
| * A bad page could be due to a number of fields. Instead of multiple branches, |
| * try and check multiple fields with one check. The caller must do a detailed |
| * check if necessary. |
| */ |
| static inline bool page_expected_state(struct page *page, |
| unsigned long check_flags) |
| { |
| if (unlikely(atomic_read(&page->_mapcount) != -1)) |
| return false; |
| |
| if (unlikely((unsigned long)page->mapping | |
| page_ref_count(page) | |
| #ifdef CONFIG_MEMCG |
| (unsigned long)page->mem_cgroup | |
| #endif |
| (page->flags & check_flags))) |
| return false; |
| |
| return true; |
| } |
| |
| static const char *page_bad_reason(struct page *page, unsigned long flags) |
| { |
| const char *bad_reason = NULL; |
| |
| if (unlikely(atomic_read(&page->_mapcount) != -1)) |
| bad_reason = "nonzero mapcount"; |
| if (unlikely(page->mapping != NULL)) |
| bad_reason = "non-NULL mapping"; |
| if (unlikely(page_ref_count(page) != 0)) |
| bad_reason = "nonzero _refcount"; |
| if (unlikely(page->flags & flags)) { |
| if (flags == PAGE_FLAGS_CHECK_AT_PREP) |
| bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; |
| else |
| bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; |
| } |
| #ifdef CONFIG_MEMCG |
| if (unlikely(page->mem_cgroup)) |
| bad_reason = "page still charged to cgroup"; |
| #endif |
| return bad_reason; |
| } |
| |
| static void check_free_page_bad(struct page *page) |
| { |
| bad_page(page, |
| page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); |
| } |
| |
| static inline int check_free_page(struct page *page) |
| { |
| if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) |
| return 0; |
| |
| /* Something has gone sideways, find it */ |
| check_free_page_bad(page); |
| return 1; |
| } |
| |
| static int free_tail_pages_check(struct page *head_page, struct page *page) |
| { |
| int ret = 1; |
| |
| /* |
| * We rely page->lru.next never has bit 0 set, unless the page |
| * is PageTail(). Let's make sure that's true even for poisoned ->lru. |
| */ |
| BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); |
| |
| if (!IS_ENABLED(CONFIG_DEBUG_VM)) { |
| ret = 0; |
| goto out; |
| } |
| switch (page - head_page) { |
| case 1: |
| /* the first tail page: ->mapping may be compound_mapcount() */ |
| if (unlikely(compound_mapcount(page))) { |
| bad_page(page, "nonzero compound_mapcount"); |
| goto out; |
| } |
| break; |
| case 2: |
| /* |
| * the second tail page: ->mapping is |
| * deferred_list.next -- ignore value. |
| */ |
| break; |
| default: |
| if (page->mapping != TAIL_MAPPING) { |
| bad_page(page, "corrupted mapping in tail page"); |
| goto out; |
| } |
| break; |
| } |
| if (unlikely(!PageTail(page))) { |
| bad_page(page, "PageTail not set"); |
| goto out; |
| } |
| if (unlikely(compound_head(page) != head_page)) { |
| bad_page(page, "compound_head not consistent"); |
| goto out; |
| } |
| ret = 0; |
| out: |
| page->mapping = NULL; |
| clear_compound_head(page); |
| return ret; |
| } |
| |
| /* |
| * Skip KASAN memory poisoning when either: |
| * |
| * 1. Deferred memory initialization has not yet completed, |
| * see the explanation below. |
| * 2. Skipping poisoning is requested via FPI_SKIP_KASAN_POISON, |
| * see the comment next to it. |
| * 3. Skipping poisoning is requested via __GFP_SKIP_KASAN_POISON, |
| * see the comment next to it. |
| * |
| * Poisoning pages during deferred memory init will greatly lengthen the |
| * process and cause problem in large memory systems as the deferred pages |
| * initialization is done with interrupt disabled. |
| * |
| * Assuming that there will be no reference to those newly initialized |
| * pages before they are ever allocated, this should have no effect on |
| * KASAN memory tracking as the poison will be properly inserted at page |
| * allocation time. The only corner case is when pages are allocated by |
| * on-demand allocation and then freed again before the deferred pages |
| * initialization is done, but this is not likely to happen. |
| */ |
| static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) |
| { |
| return deferred_pages_enabled() || |
| (!IS_ENABLED(CONFIG_KASAN_GENERIC) && |
| (fpi_flags & FPI_SKIP_KASAN_POISON)) || |
| PageSkipKASanPoison(page); |
| } |
| |
| static void kernel_init_free_pages(struct page *page, int numpages) |
| { |
| int i; |
| |
| /* s390's use of memset() could override KASAN redzones. */ |
| kasan_disable_current(); |
| for (i = 0; i < numpages; i++) { |
| u8 tag = page_kasan_tag(page + i); |
| page_kasan_tag_reset(page + i); |
| clear_highpage(page + i); |
| page_kasan_tag_set(page + i, tag); |
| } |
| kasan_enable_current(); |
| } |
| |
| static __always_inline bool free_pages_prepare(struct page *page, |
| unsigned int order, bool check_free, fpi_t fpi_flags) |
| { |
| int bad = 0; |
| bool init = want_init_on_free(); |
| |
| VM_BUG_ON_PAGE(PageTail(page), page); |
| |
| trace_mm_page_free(page, order); |
| |
| if (unlikely(PageHWPoison(page)) && !order) { |
| /* |
| * Do not let hwpoison pages hit pcplists/buddy |
| * Untie memcg state and reset page's owner |
| */ |
| if (memcg_kmem_enabled() && PageKmemcg(page)) |
| __memcg_kmem_uncharge_page(page, order); |
| reset_page_owner(page, order); |
| free_page_pinner(page, order); |
| return false; |
| } |
| |
| /* |
| * Check tail pages before head page information is cleared to |
| * avoid checking PageCompound for order-0 pages. |
| */ |
| if (unlikely(order)) { |
| bool compound = PageCompound(page); |
| int i; |
| |
| VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); |
| |
| if (compound) |
| ClearPageDoubleMap(page); |
| for (i = 1; i < (1 << order); i++) { |
| if (compound) |
| bad += free_tail_pages_check(page, page + i); |
| if (unlikely(check_free_page(page + i))) { |
| bad++; |
| continue; |
| } |
| (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
| } |
| } |
| if (PageMappingFlags(page)) |
| page->mapping = NULL; |
| if (memcg_kmem_enabled() && PageKmemcg(page)) |
| __memcg_kmem_uncharge_page(page, order); |
| if (check_free) |
| bad += check_free_page(page); |
| if (bad) |
| return false; |
| |
| page_cpupid_reset_last(page); |
| page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
| reset_page_owner(page, order); |
| free_page_pinner(page, order); |
| |
| if (!PageHighMem(page)) { |
| debug_check_no_locks_freed(page_address(page), |
| PAGE_SIZE << order); |
| debug_check_no_obj_freed(page_address(page), |
| PAGE_SIZE << order); |
| } |
| |
| kernel_poison_pages(page, 1 << order); |
| |
| /* |
| * As memory initialization might be integrated into KASAN, |
| * KASAN poisoning and memory initialization code must be |
| * kept together to avoid discrepancies in behavior. |
| * |
| * With hardware tag-based KASAN, memory tags must be set before the |
| * page becomes unavailable via debug_pagealloc or arch_free_page. |
| */ |
| if (!should_skip_kasan_poison(page, fpi_flags)) { |
| kasan_poison_pages(page, order, init); |
| |
| /* Memory is already initialized if KASAN did it internally. */ |
| if (kasan_has_integrated_init()) |
| init = false; |
| } |
| if (init) |
| kernel_init_free_pages(page, 1 << order); |
| |
| /* |
| * arch_free_page() can make the page's contents inaccessible. s390 |
| * does this. So nothing which can access the page's contents should |
| * happen after this. |
| */ |
| arch_free_page(page, order); |
| |
| debug_pagealloc_unmap_pages(page, 1 << order); |
| |
| return true; |
| } |
| |
| #ifdef CONFIG_DEBUG_VM |
| /* |
| * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed |
| * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when |
| * moved from pcp lists to free lists. |
| */ |
| static bool free_pcp_prepare(struct page *page) |
| { |
| return free_pages_prepare(page, 0, true, FPI_NONE); |
| } |
| |
| static bool bulkfree_pcp_prepare(struct page *page) |
| { |
| if (debug_pagealloc_enabled_static()) |
| return check_free_page(page); |
| else |
| return false; |
| } |
| #else |
| /* |
| * With DEBUG_VM disabled, order-0 pages being freed are checked only when |
| * moving from pcp lists to free list in order to reduce overhead. With |
| * debug_pagealloc enabled, they are checked also immediately when being freed |
| * to the pcp lists. |
| */ |
| static bool free_pcp_prepare(struct page *page) |
| { |
| if (debug_pagealloc_enabled_static()) |
| return free_pages_prepare(page, 0, true, FPI_NONE); |
| else |
| return free_pages_prepare(page, 0, false, FPI_NONE); |
| } |
| |
| static bool bulkfree_pcp_prepare(struct page *page) |
| { |
| return check_free_page(page); |
| } |
| #endif /* CONFIG_DEBUG_VM */ |
| |
| static inline void prefetch_buddy(struct page *page) |
| { |
| unsigned long pfn = page_to_pfn(page); |
| unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0); |
| struct page *buddy = page + (buddy_pfn - pfn); |
| |
| prefetch(buddy); |
| } |
| |
| /* |
| * Frees a number of pages from the PCP lists |
| * Assumes all pages on list are in same zone, and of same order. |
| * count is the number of pages to free. |
| * |
| * If the zone was previously in an "all pages pinned" state then look to |
| * see if this freeing clears that state. |
| * |
| * And clear the zone's pages_scanned counter, to hold off the "all pages are |
| * pinned" detection logic. |
| */ |
| static void free_pcppages_bulk(struct zone *zone, int count, |
| struct per_cpu_pages *pcp) |
| { |
| int migratetype = 0; |
| int batch_free = 0; |
| int prefetch_nr = 0; |
| bool isolated_pageblocks; |
| struct page *page, *tmp; |
| LIST_HEAD(head); |
| |
| /* |
| * Ensure proper count is passed which otherwise would stuck in the |
| * below while (list_empty(list)) loop. |
| */ |
| count = min(pcp->count, count); |
| while (count) { |
| struct list_head *list; |
| |
| /* |
| * Remove pages from lists in a round-robin fashion. A |
| * batch_free count is maintained that is incremented when an |
| * empty list is encountered. This is so more pages are freed |
| * off fuller lists instead of spinning excessively around empty |
| * lists |
| */ |
| do { |
| batch_free++; |
| if (++migratetype == MIGRATE_PCPTYPES) |
| migratetype = 0; |
| list = &pcp->lists[migratetype]; |
| } while (list_empty(list)); |
| |
| /* This is the only non-empty list. Free them all. */ |
| if (batch_free == MIGRATE_PCPTYPES) |
| batch_free = count; |
| |
| do { |
| page = list_last_entry(list, struct page, lru); |
| /* must delete to avoid corrupting pcp list */ |
| list_del(&page->lru); |
| pcp->count--; |
| |
| if (bulkfree_pcp_prepare(page)) |
| continue; |
| |
| list_add_tail(&page->lru, &head); |
| |
| /* |
| * We are going to put the page back to the global |
| * pool, prefetch its buddy to speed up later access |
| * under zone->lock. It is believed the overhead of |
| * an additional test and calculating buddy_pfn here |
| * can be offset by reduced memory latency later. To |
| * avoid excessive prefetching due to large count, only |
| * prefetch buddy for the first pcp->batch nr of pages. |
| */ |
| if (prefetch_nr++ < pcp->batch) |
| prefetch_buddy(page); |
| } while (--count && --batch_free && !list_empty(list)); |
| } |
| |
| spin_lock(&zone->lock); |
| isolated_pageblocks = has_isolate_pageblock(zone); |
| |
| /* |
| * Use safe version since after __free_one_page(), |
| * page->lru.next will not point to original list. |
| */ |
| list_for_each_entry_safe(page, tmp, &head, lru) { |
| int mt = get_pcppage_migratetype(page); |
| /* MIGRATE_ISOLATE page should not go to pcplists */ |
| VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); |
| /* Pageblock could have been isolated meanwhile */ |
| if (unlikely(isolated_pageblocks)) |
| mt = get_pageblock_migratetype(page); |
| |
| __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); |
| trace_mm_page_pcpu_drain(page, 0, mt); |
| } |
| spin_unlock(&zone->lock); |
| } |
| |
| static void free_one_page(struct zone *zone, |
| struct page *page, unsigned long pfn, |
| unsigned int order, |
| int migratetype, fpi_t fpi_flags) |
| { |
| unsigned long flags; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| if (unlikely(has_isolate_pageblock(zone) || |
| is_migrate_isolate(migratetype))) { |
| migratetype = get_pfnblock_migratetype(page, pfn); |
| } |
| __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); |
| spin_unlock_irqrestore(&zone->lock, flags); |
| } |
| |
| static void __meminit __init_single_page(struct page *page, unsigned long pfn, |
| unsigned long zone, int nid) |
| { |
| mm_zero_struct_page(page); |
| set_page_links(page, zone, nid, pfn); |
| init_page_count(page); |
| page_mapcount_reset(page); |
| page_cpupid_reset_last(page); |
| page_kasan_tag_reset(page); |
| |
| INIT_LIST_HEAD(&page->lru); |
| #ifdef WANT_PAGE_VIRTUAL |
| /* The shift won't overflow because ZONE_NORMAL is below 4G. */ |
| if (!is_highmem_idx(zone)) |
| set_page_address(page, __va(pfn << PAGE_SHIFT)); |
| #endif |
| } |
| |
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
 * Initialise the struct page of a reserved @pfn if early_page_uninitialised()
 * reports it as not yet initialised. The zone index handed to
 * __init_single_page() is the first zone of the pfn's node whose span
 * contains @pfn.
 */
static void __meminit init_reserved_page(unsigned long pfn)
{
	pg_data_t *pgdat;
	int nid, zid;

	if (!early_page_uninitialised(pfn))
		return;

	nid = early_pfn_to_nid(pfn);
	pgdat = NODE_DATA(nid);

	zid = 0;
	while (zid < MAX_NR_ZONES) {
		struct zone *z = pgdat->node_zones + zid;

		if (z->zone_start_pfn <= pfn && pfn < zone_end_pfn(z))
			break;
		zid++;
	}
	__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
}
#else
/* Nothing to do without deferred struct page initialisation. */
static inline void init_reserved_page(unsigned long pfn)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
| |
| /* |
| * Initialised pages do not have PageReserved set. This function is |
| * called for each range allocated by the bootmem allocator and |
| * marks the pages PageReserved. The remaining valid pages are later |
| * sent to the buddy page allocator. |
| */ |
| void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) |
| { |
| unsigned long start_pfn = PFN_DOWN(start); |
| unsigned long end_pfn = PFN_UP(end); |
| |
| for (; start_pfn < end_pfn; start_pfn++) { |
| if (pfn_valid(start_pfn)) { |
| struct page *page = pfn_to_page(start_pfn); |
| |
| init_reserved_page(start_pfn); |
| |
| /* Avoid false-positive PageTail() */ |
| INIT_LIST_HEAD(&page->lru); |
| |
| /* |
| * no need for atomic set_bit because the struct |
| * page is not visible yet so nobody should |
| * access it yet. |
| */ |
| __SetPageReserved(page); |
| } |
| } |
| } |
| |
| static void __free_pages_ok(struct page *page, unsigned int order, |
| fpi_t fpi_flags) |
| { |
| unsigned long flags; |
| int migratetype; |
| unsigned long pfn = page_to_pfn(page); |
| |
| if (!free_pages_prepare(page, order, true, fpi_flags)) |
| return; |
| |
| migratetype = get_pfnblock_migratetype(page, pfn); |
| local_irq_save(flags); |
| __count_vm_events(PGFREE, 1 << order); |
| free_one_page(page_zone(page), page, pfn, order, migratetype, |
| fpi_flags); |
| local_irq_restore(flags); |
| } |
| |
| void __free_pages_core(struct page *page, unsigned int order) |
| { |
| unsigned int nr_pages = 1 << order; |
| struct page *p = page; |
| unsigned int loop; |
| |
| /* |
| * When initializing the memmap, __init_single_page() sets the refcount |
| * of all pages to 1 ("allocated"/"not free"). We have to set the |
| * refcount of all involved pages to 0. |
| */ |
| prefetchw(p); |
| for (loop = 0; loop < (nr_pages - 1); loop++, p++) { |
| prefetchw(p + 1); |
| __ClearPageReserved(p); |
| set_page_count(p, 0); |
| } |
| __ClearPageReserved(p); |
| set_page_count(p, 0); |
| |
| atomic_long_add(nr_pages, &page_zone(page)->managed_pages); |
| |
| /* |
| * Bypass PCP and place fresh pages right to the tail, primarily |
| * relevant for memory onlining. |
| */ |
| __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); |
| } |
| |
#ifdef CONFIG_NEED_MULTIPLE_NODES

/* Result of the most recent lookup done through early_pfn_to_nid() */
static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;

#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID

/*
 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
 *
 * @state caches the pfn range and node of the last successful lookup, so
 * that consecutive queries inside the same range skip the memblock search.
 * Returns NUMA_NO_NODE, leaving the cache untouched, when the search finds
 * no node.
 */
int __meminit __early_pfn_to_nid(unsigned long pfn,
					struct mminit_pfnnid_cache *state)
{
	unsigned long start_pfn, end_pfn;
	int nid;

	/* Fast path: same range as the previous lookup */
	if (pfn >= state->last_start && pfn < state->last_end)
		return state->last_nid;

	nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
	if (nid == NUMA_NO_NODE)
		return nid;

	state->last_start = start_pfn;
	state->last_end = end_pfn;
	state->last_nid = nid;

	return nid;
}
#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */

/*
 * Return the node @pfn belongs to, falling back to the first online node
 * when the lookup yields a negative node id. The shared lookup cache is
 * protected by a function-local spinlock.
 */
int __meminit early_pfn_to_nid(unsigned long pfn)
{
	static DEFINE_SPINLOCK(early_pfn_lock);
	int nid;

	spin_lock(&early_pfn_lock);

	nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
	if (nid < 0)
		nid = first_online_node;

	spin_unlock(&early_pfn_lock);

	return nid;
}
#endif /* CONFIG_NEED_MULTIPLE_NODES */
| |
| void __init memblock_free_pages(struct page *page, unsigned long pfn, |
| unsigned int order) |
| { |
| if (early_page_uninitialised(pfn)) |
| return; |
| __free_pages_core(page, order); |
| } |
| |
| /* |
| * Check that the whole (or subset of) a pageblock given by the interval of |
| * [start_pfn, end_pfn) is valid and within the same zone, before scanning it |
| * with the migration of free compaction scanner. The scanners then need to |
| * use only pfn_valid_within() check for arches that allow holes within |
| * pageblocks. |
| * |
| * Return struct page pointer of start_pfn, or NULL if checks were not passed. |
| * |
| * It's possible on some configurations to have a setup like node0 node1 node0 |
| * i.e. it's possible that all pages within a zones range of pages do not |
| * belong to a single zone. We assume that a border between node0 and node1 |
| * can occur within a single pageblock, but not a node0 node1 node0 |
| * interleaving within a single pageblock. It is therefore sufficient to check |
| * the first and last page of a pageblock and avoid checking each individual |
| * page in a pageblock. |
| */ |
| struct page *__pageblock_pfn_to_page(unsigned long start_pfn, |
| unsigned long end_pfn, struct zone *zone) |
| { |
| struct page *start_page; |
| struct page *end_page; |
| |
| /* end_pfn is one past the range we are checking */ |
| end_pfn--; |
| |
| if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn)) |
| return NULL; |
| |
| start_page = pfn_to_online_page(start_pfn); |
| if (!start_page) |
| return NULL; |
| |
| if (page_zone(start_page) != zone) |
| return NULL; |
| |
| end_page = pfn_to_page(end_pfn); |
| |
| /* This gives a shorter code than deriving page_zone(end_page) */ |
| if (page_zone_id(start_page) != page_zone_id(end_page)) |
| return NULL; |
| |
| return start_page; |
| } |
| |
| void set_zone_contiguous(struct zone *zone) |
| { |
| unsigned long block_start_pfn = zone->zone_start_pfn; |
| unsigned long block_end_pfn; |
| |
| block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); |
| for (; block_start_pfn < zone_end_pfn(zone); |
| block_start_pfn = block_end_pfn, |
| block_end_pfn += pageblock_nr_pages) { |
| |
| block_end_pfn = min(block_end_pfn, zone_end_pfn(zone)); |
| |
| if (!__pageblock_pfn_to_page(block_start_pfn, |
| block_end_pfn, zone)) |
| return; |
| cond_resched(); |
| } |
| |
| /* We confirm that there is no hole */ |
| zone->contiguous = true; |
| } |
| |
| void clear_zone_contiguous(struct zone *zone) |
| { |
| zone->contiguous = false; |
| } |
| |
| #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
| static void __init deferred_free_range(unsigned long pfn, |
| unsigned long nr_pages) |
| { |
| struct page *page; |
| unsigned long i; |
| |
| if (!nr_pages) |
| return; |
| |
| page = pfn_to_page(pfn); |
| |
| /* Free a large naturally-aligned chunk if possible */ |
| if (nr_pages == pageblock_nr_pages && |
| (pfn & (pageblock_nr_pages - 1)) == 0) { |
| set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
| __free_pages_core(page, pageblock_order); |
| return; |
| } |
| |
| for (i = 0; i < nr_pages; i++, page++, pfn++) { |
| if ((pfn & (pageblock_nr_pages - 1)) == 0) |
| set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
| __free_pages_core(page, 0); |
| } |
| } |
| |
| /* Completion tracking for deferred_init_memmap() threads */ |
| static atomic_t pgdat_init_n_undone __initdata; |
| static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); |
| |
| static inline void __init pgdat_init_report_one_done(void) |
| { |
| if (atomic_dec_and_test(&pgdat_init_n_undone)) |
| complete(&pgdat_init_all_done_comp); |
| } |
| |
| /* |
| * Returns true if page needs to be initialized or freed to buddy allocator. |
| * |
| * First we check if pfn is valid on architectures where it is possible to have |
| * holes within pageblock_nr_pages. On systems where it is not possible, this |
| * function is optimized out. |
| * |
| * Then, we check if a current large page is valid by only checking the validity |
| * of the head pfn. |
| */ |
| static inline bool __init deferred_pfn_valid(unsigned long pfn) |
| { |
| if (!pfn_valid_within(pfn)) |
| return false; |
| if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) |
| return false; |
| return true; |
| } |
| |
| /* |
| * Free pages to buddy allocator. Try to free aligned pages in |
| * pageblock_nr_pages sizes. |
| */ |
| static void __init deferred_free_pages(unsigned long pfn, |
| unsigned long end_pfn) |
| { |
| unsigned long nr_pgmask = pageblock_nr_pages - 1; |
| unsigned long nr_free = 0; |
| |
| for (; pfn < end_pfn; pfn++) { |
| if (!deferred_pfn_valid(pfn)) { |
| deferred_free_range(pfn - nr_free, nr_free); |
| nr_free = 0; |
| } else if (!(pfn & nr_pgmask)) { |
| deferred_free_range(pfn - nr_free, nr_free); |
| nr_free = 1; |
| } else { |
| nr_free++; |
| } |
| } |
| /* Free the last block of pages to allocator */ |
| deferred_free_range(pfn - nr_free, nr_free); |
| } |
| |
| /* |
| * Initialize struct pages. We minimize pfn page lookups and scheduler checks |
| * by performing it only once every pageblock_nr_pages. |
| * Return number of pages initialized. |
| */ |
| static unsigned long __init deferred_init_pages(struct zone *zone, |
| unsigned long pfn, |
| unsigned long end_pfn) |
| { |
| unsigned long nr_pgmask = pageblock_nr_pages - 1; |
| int nid = zone_to_nid(zone); |
| unsigned long nr_pages = 0; |
| int zid = zone_idx(zone); |
| struct page *page = NULL; |
| |
| for (; pfn < end_pfn; pfn++) { |
| if (!deferred_pfn_valid(pfn)) { |
| page = NULL; |
| continue; |
| } else if (!page || !(pfn & nr_pgmask)) { |
| page = pfn_to_page(pfn); |
| } else { |
| page++; |
| } |
| __init_single_page(page, pfn, zid, nid); |
| nr_pages++; |
| } |
| return (nr_pages); |
| } |
| |
| /* |
| * This function is meant to pre-load the iterator for the zone init. |
| * Specifically it walks through the ranges until we are caught up to the |
| * first_init_pfn value and exits there. If we never encounter the value we |
| * return false indicating there are no valid ranges left. |
| */ |
| static bool __init |
| deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, |
| unsigned long *spfn, unsigned long *epfn, |
| unsigned long first_init_pfn) |
| { |
| u64 j; |
| |
| /* |
| * Start out by walking through the ranges in this zone that have |
| * already been initialized. We don't need to do anything with them |
| * so we just need to flush them out of the system. |
| */ |
| for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { |
| if (*epfn <= first_init_pfn) |
| continue; |
| if (*spfn < first_init_pfn) |
| *spfn = first_init_pfn; |
| *i = j; |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /* |
| * Initialize and free pages. We do it in two loops: first we initialize |
| * struct page, then free to buddy allocator, because while we are |
| * freeing pages we can access pages that are ahead (computing buddy |
| * page in __free_one_page()). |
| * |
| * In order to try and keep some memory in the cache we have the loop |
| * broken along max page order boundaries. This way we will not cause |
| * any issues with the buddy page computation. |
| */ |
| static unsigned long __init |
| deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, |
| unsigned long *end_pfn) |
| { |
| unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES); |
| unsigned long spfn = *start_pfn, epfn = *end_pfn; |
| unsigned long nr_pages = 0; |
| u64 j = *i; |
| |
| /* First we loop through and initialize the page values */ |
| for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) { |
| unsigned long t; |
| |
| if (mo_pfn <= *start_pfn) |
| break; |
| |
| t = min(mo_pfn, *end_pfn); |
| nr_pages += deferred_init_pages(zone, *start_pfn, t); |
| |
| if (mo_pfn < *end_pfn) { |
| *start_pfn = mo_pfn; |
| break; |
| } |
| } |
| |
| /* Reset values and now loop through freeing pages as needed */ |
| swap(j, *i); |
| |
| for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) { |
| unsigned long t; |
| |
| if (mo_pfn <= spfn) |
| break; |
| |
| t = min(mo_pfn, epfn); |
| deferred_free_pages(spfn, t); |
| |
| if (mo_pfn <= epfn) |
| break; |
| } |
| |
| return nr_pages; |
| } |
| |
| static void __init |
| deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, |
| void *arg) |
| { |
| unsigned long spfn, epfn; |
| struct zone *zone = arg; |
| u64 i; |
| |
| deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); |
| |
| /* |
| * Initialize and free pages in MAX_ORDER sized increments so that we |
| * can avoid introducing any issues with the buddy allocator. |
| */ |
| while (spfn < end_pfn) { |
| deferred_init_maxorder(&i, zone, &spfn, &epfn); |
| cond_resched(); |
| } |
| } |
| |
| /* An arch may override for more concurrency. */ |
| __weak int __init |
| deferred_page_init_max_threads(const struct cpumask *node_cpumask) |
| { |
| return 1; |
| } |
| |
| /* Initialise remaining memory on a node */ |
| static int __init deferred_init_memmap(void *data) |
| { |
| pg_data_t *pgdat = data; |
| const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); |
| unsigned long spfn = 0, epfn = 0; |
| unsigned long first_init_pfn, flags; |
| unsigned long start = jiffies; |
| struct zone *zone; |
| int zid, max_threads; |
| u64 i; |
| |
| /* Bind memory initialisation thread to a local node if possible */ |
| if (!cpumask_empty(cpumask)) |
| set_cpus_allowed_ptr(current, cpumask); |
| |
| pgdat_resize_lock(pgdat, &flags); |
| first_init_pfn = pgdat->first_deferred_pfn; |
| if (first_init_pfn == ULONG_MAX) { |
| pgdat_resize_unlock(pgdat, &flags); |
| pgdat_init_report_one_done(); |
| return 0; |
| } |
| |
| /* Sanity check boundaries */ |
| BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); |
| BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); |
| pgdat->first_deferred_pfn = ULONG_MAX; |
| |
| /* |
| * Once we unlock here, the zone cannot be grown anymore, thus if an |
| * interrupt thread must allocate this early in boot, zone must be |
| * pre-grown prior to start of deferred page initialization. |
| */ |
| pgdat_resize_unlock(pgdat, &flags); |
| |
| /* Only the highest zone is deferred so find it */ |
| for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
| zone = pgdat->node_zones + zid; |
| if (first_init_pfn < zone_end_pfn(zone)) |
| break; |
| } |
| |
| /* If the zone is empty somebody else may have cleared out the zone */ |
| if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, |
| first_init_pfn)) |
| goto zone_empty; |
| |
| max_threads = deferred_page_init_max_threads(cpumask); |
| |
| while (spfn < epfn) { |
| unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION); |
| struct padata_mt_job job = { |
| .thread_fn = deferred_init_memmap_chunk, |
| .fn_arg = zone, |
| .start = spfn, |
| .size = epfn_align - spfn, |
| .align = PAGES_PER_SECTION, |
| .min_chunk = PAGES_PER_SECTION, |
| .max_threads = max_threads, |
| }; |
| |
| padata_do_multithreaded(&job); |
| deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, |
| epfn_align); |
| } |
| zone_empty: |
| /* Sanity check that the next zone really is unpopulated */ |
| WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); |
| |
| pr_info("node %d deferred pages initialised in %ums\n", |
| pgdat->node_id, jiffies_to_msecs(jiffies - start)); |
| |
| pgdat_init_report_one_done(); |
| return 0; |
| } |
| |
| /* |
| * If this zone has deferred pages, try to grow it by initializing enough |
| * deferred pages to satisfy the allocation specified by order, rounded up to |
| * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments |
| * of SECTION_SIZE bytes by initializing struct pages in increments of |
| * PAGES_PER_SECTION * sizeof(struct page) bytes. |
| * |
| * Return true when zone was grown, otherwise return false. We return true even |
| * when we grow less than requested, to let the caller decide if there are |
| * enough pages to satisfy the allocation. |
| * |
| * Note: We use noinline because this function is needed only during boot, and |
| * it is called from a __ref function _deferred_grow_zone. This way we are |
| * making sure that it is not inlined into permanent text section. |
| */ |
| static noinline bool __init |
| deferred_grow_zone(struct zone *zone, unsigned int order) |
| { |
| unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); |
| pg_data_t *pgdat = zone->zone_pgdat; |
| unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; |
| unsigned long spfn, epfn, flags; |
| unsigned long nr_pages = 0; |
| u64 i; |
| |
| /* Only the last zone may have deferred pages */ |
| if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat)) |
| return false; |
| |
| pgdat_resize_lock(pgdat, &flags); |
| |
| /* |
| * If someone grew this zone while we were waiting for spinlock, return |
| * true, as there might be enough pages already. |
| */ |
| if (first_deferred_pfn != pgdat->first_deferred_pfn) { |
| pgdat_resize_unlock(pgdat, &flags); |
| return true; |
| } |
| |
| /* If the zone is empty somebody else may have cleared out the zone */ |
| if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, |
| first_deferred_pfn)) { |
| pgdat->first_deferred_pfn = ULONG_MAX; |
| pgdat_resize_unlock(pgdat, &flags); |
| /* Retry only once. */ |
| return first_deferred_pfn != ULONG_MAX; |
| } |
| |
| /* |
| * Initialize and free pages in MAX_ORDER sized increments so |
| * that we can avoid introducing any issues with the buddy |
| * allocator. |
| */ |
| while (spfn < epfn) { |
| /* update our first deferred PFN for this section */ |
| first_deferred_pfn = spfn; |
| |
| nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); |
| touch_nmi_watchdog(); |
| |
| /* We should only stop along section boundaries */ |
| if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) |
| continue; |
| |
| /* If our quota has been met we can stop here */ |
| if (nr_pages >= nr_pages_needed) |
| break; |
| } |
| |
| pgdat->first_deferred_pfn = spfn; |
| pgdat_resize_unlock(pgdat, &flags); |
| |
| return nr_pages > 0; |
| } |
| |
| /* |
| * deferred_grow_zone() is __init, but it is called from |
| * get_page_from_freelist() during early boot until deferred_pages permanently |
| * disables this call. This is why we have refdata wrapper to avoid warning, |
| * and to ensure that the function body gets unloaded. |
| */ |
| static bool __ref |
| _deferred_grow_zone(struct zone *zone, unsigned int order) |
| { |
| return deferred_grow_zone(zone, order); |
| } |
| |
| #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ |
| |
| void __init page_alloc_init_late(void) |
| { |
| struct zone *zone; |
| int nid; |
| |
| #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
| |
| /* There will be num_node_state(N_MEMORY) threads */ |
| atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); |
| for_each_node_state(nid, N_MEMORY) { |
| kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); |
| } |
| |
| /* Block until all are initialised */ |
| wait_for_completion(&pgdat_init_all_done_comp); |
| |
| /* |
| * The number of managed pages has changed due to the initialisation |
| * so the pcpu batch and high limits needs to be updated or the limits |
| * will be artificially small. |
| */ |
| for_each_populated_zone(zone) |
| zone_pcp_update(zone); |
| |
| /* |
| * We initialized the rest of the deferred pages. Permanently disable |
| * on-demand struct page initialization. |
| */ |
| static_branch_disable(&deferred_pages); |
| |
| /* Reinit limits that are based on free pages after the kernel is up */ |
| files_maxfiles_init(); |
| #endif |
| |
| /* Discard memblock private memory */ |
| memblock_discard(); |
| |
| for_each_node_state(nid, N_MEMORY) |
| shuffle_free_memory(NODE_DATA(nid)); |
| |
| for_each_populated_zone(zone) |
| set_zone_contiguous(zone); |
| } |
| |
| #ifdef CONFIG_CMA |
| /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ |
| void __init init_cma_reserved_pageblock(struct page *page) |
| { |
| unsigned i = pageblock_nr_pages; |
| struct page *p = page; |
| |
| do { |
| __ClearPageReserved(p); |
| set_page_count(p, 0); |
| } while (++p, --i); |
| |
| set_pageblock_migratetype(page, MIGRATE_CMA); |
| |
| if (pageblock_order >= MAX_ORDER) { |
| i = pageblock_nr_pages; |
| p = page; |
| do { |
| set_page_refcounted(p); |
| __free_pages(p, MAX_ORDER - 1); |
| p += MAX_ORDER_NR_PAGES; |
| } while (i -= MAX_ORDER_NR_PAGES); |
| } else { |
| set_page_refcounted(page); |
| __free_pages(page, pageblock_order); |
| } |
| |
| adjust_managed_page_count(page, pageblock_nr_pages); |
| page_zone(page)->cma_pages += pageblock_nr_pages; |
| } |
| #endif |
| |
| /* |
| * The order of subdivision here is critical for the IO subsystem. |
| * Please do not alter this order without good reasons and regression |
| * testing. Specifically, as large blocks of memory are subdivided, |
| * the order in which smaller blocks are delivered depends on the order |
| * they're subdivided in this function. This is the primary factor |
| * influencing the order in which pages are delivered to the IO |
| * subsystem according to empirical testing, and this is also justified |
| * by considering the behavior of a buddy system containing a single |
| * large block of memory acted on by a series of small allocations. |
| * This behavior is a critical factor in sglist merging's success. |
| * |
| * -- nyc |
| */ |
| static inline void expand(struct zone *zone, struct page *page, |
| int low, int high, int migratetype) |
| { |
| unsigned long size = 1 << high; |
| |
| while (high > low) { |
| high--; |
| size >>= 1; |
| VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); |
| |
| /* |
| * Mark as guard pages (or page), that will allow to |
| * merge back to allocator when buddy will be freed. |
| * Corresponding page table entries will not be touched, |
| * pages will stay not present in virtual address space |
| */ |
| if (set_page_guard(zone, &page[size], high, migratetype)) |
| continue; |
| |
| add_to_free_list(&page[size], zone, high, migratetype); |
| set_buddy_order(&page[size], high); |
| } |
| } |
| |
| static void check_new_page_bad(struct page *page) |
| { |
| if (unlikely(page->flags & __PG_HWPOISON)) { |
| /* Don't complain about hwpoisoned pages */ |
| page_mapcount_reset(page); /* remove PageBuddy */ |
| return; |
| } |
| |
| bad_page(page, |
| page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP)); |
| } |
| |
| /* |
| * This page is about to be returned from the page allocator |
| */ |
| static inline int check_new_page(struct page *page) |
| { |
| if (likely(page_expected_state(page, |
| PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) |
| return 0; |
| |
| check_new_page_bad(page); |
| return 1; |
| } |
| |
| #ifdef CONFIG_DEBUG_VM |
| /* |
| * With DEBUG_VM enabled, order-0 pages are checked for expected state when |
| * being allocated from pcp lists. With debug_pagealloc also enabled, they are |
| * also checked when pcp lists are refilled from the free lists. |
| */ |
| static inline bool check_pcp_refill(struct page *page) |
| { |
| if (debug_pagealloc_enabled_static()) |
| return check_new_page(page); |
| else |
| return false; |
| } |
| |
| static inline bool check_new_pcp(struct page *page) |
| { |
| return check_new_page(page); |
| } |
| #else |
| /* |
| * With DEBUG_VM disabled, free order-0 pages are checked for expected state |
| * when pcp lists are being refilled from the free lists. With debug_pagealloc |
| * enabled, they are also checked when being allocated from the pcp lists. |
| */ |
| static inline bool check_pcp_refill(struct page *page) |
| { |
| return check_new_page(page); |
| } |
| static inline bool check_new_pcp(struct page *page) |
| { |
| if (debug_pagealloc_enabled_static()) |
| return check_new_page(page); |
| else |
| return false; |
| } |
| #endif /* CONFIG_DEBUG_VM */ |
| |
| static bool check_new_pages(struct page *page, unsigned int order) |
| { |
| int i; |
| for (i = 0; i < (1 << order); i++) { |
| struct page *p = page + i; |
| |
| if (unlikely(check_new_page(p))) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags) |
| { |
| /* Don't skip if a software KASAN mode is enabled. */ |
| if (IS_ENABLED(CONFIG_KASAN_GENERIC) || |
| IS_ENABLED(CONFIG_KASAN_SW_TAGS)) |
| return false; |
| |
| /* Skip, if hardware tag-based KASAN is not enabled. */ |
| if (!kasan_hw_tags_enabled()) |
| return true; |
| |
| /* |
| * With hardware tag-based KASAN enabled, skip if either: |
| * |
| * 1. Memory tags have already been cleared via tag_clear_highpage(). |
| * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON. |
| */ |
| return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON); |
| } |
| |
| static inline bool should_skip_init(gfp_t flags) |
| { |
| /* Don't skip, if hardware tag-based KASAN is not enabled. */ |
| if (!kasan_hw_tags_enabled()) |
| return false; |
| |
| /* For hardware tag-based KASAN, skip if requested. */ |
| return (flags & __GFP_SKIP_ZERO); |
| } |
| |
| inline void post_alloc_hook(struct page *page, unsigned int order, |
| gfp_t gfp_flags) |
| { |
| bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && |
| !should_skip_init(gfp_flags); |
| bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); |
| |
| set_page_private(page, 0); |
| set_page_refcounted(page); |
| |
| arch_alloc_page(page, order); |
| debug_pagealloc_map_pages(page, 1 << order); |
| |
| /* |
| * Page unpoisoning must happen before memory initialization. |
| * Otherwise, the poison pattern will be overwritten for __GFP_ZERO |
| * allocations and the page unpoisoning code will complain. |
| */ |
| kernel_unpoison_pages(page, 1 << order); |
| |
| /* |
| * As memory initialization might be integrated into KASAN, |
| * KASAN unpoisoning and memory initializion code must be |
| * kept together to avoid discrepancies in behavior. |
| */ |
| |
| /* |
| * If memory tags should be zeroed (which happens only when memory |
| * should be initialized as well). |
| */ |
| if (init_tags) { |
| int i; |
| |
| /* Initialize both memory and tags. */ |
| for (i = 0; i != 1 << order; ++i) |
| tag_clear_highpage(page + i); |
| |
| /* Note that memory is already initialized by the loop above. */ |
| init = false; |
| } |
| if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) { |
| /* Unpoison shadow memory or set memory tags. */ |
| kasan_unpoison_pages(page, order, init); |
| |
| /* Note that memory is already initialized by KASAN. */ |
| if (kasan_has_integrated_init()) |
| init = false; |
| } |
| /* If memory is still not initialized, do it now. */ |
| if (init) |
| kernel_init_free_pages(page, 1 << order); |
| /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ |
| if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON)) |
| SetPageSkipKASanPoison(page); |
| |
| set_page_owner(page, order, gfp_flags); |
| } |
| |
| static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, |
| unsigned int alloc_flags) |
| { |
| post_alloc_hook(page, order, gfp_flags); |
| |
| if (order && (gfp_flags & __GFP_COMP)) |
| prep_compound_page(page, order); |
| |
| /* |
| * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to |
| * allocate the page. The expectation is that the caller is taking |
| * steps that will free more memory. The caller should avoid the page |
| * being used for !PFMEMALLOC purposes. |
| */ |
| if (alloc_flags & ALLOC_NO_WATERMARKS) |
| set_page_pfmemalloc(page); |
| else |
| clear_page_pfmemalloc(page); |
| } |
| |
| /* |
| * Go through the free lists for the given migratetype and remove |
| * the smallest available page from the freelists |
| */ |
| static __always_inline |
| struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, |
| int migratetype) |
| { |
| unsigned int current_order; |
| struct free_area *area; |
| struct page *page; |
| |
| /* Find a page of the appropriate size in the preferred list */ |
| for (current_order = order; current_order < MAX_ORDER; ++current_order) { |
| area = &(zone->free_area[current_order]); |
| page = get_page_from_free_area(area, migratetype); |
| if (!page) |
| continue; |
| del_page_from_free_list(page, zone, current_order); |
| expand(zone, page, order, current_order, migratetype); |
| set_pcppage_migratetype(page, migratetype); |
| return page; |
| } |
| |
| return NULL; |
| } |
| |
| |
| /* |
| * This array describes the order lists are fallen back to when |
| * the free lists for the desirable migrate type are depleted |
| */ |
| static int fallbacks[MIGRATE_TYPES][3] = { |
| [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, |
| [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, |
| [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, |
| #ifdef CONFIG_CMA |
| [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ |
| #endif |
| #ifdef CONFIG_MEMORY_ISOLATION |
| [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ |
| #endif |
| }; |
| |
| #ifdef CONFIG_CMA |
| static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone, |
| unsigned int order) |
| { |
| return __rmqueue_smallest(zone, order, MIGRATE_CMA); |
| } |
| #else |
| static inline struct page *__rmqueue_cma_fallback(struct zone *zone, |
| unsigned int order) { return NULL; } |
| #endif |
| |
| /* |
| * Move the free pages in a range to the freelist tail of the requested type. |
| * Note that start_page and end_pages are not aligned on a pageblock |
| * boundary. If alignment is required, use move_freepages_block() |
| */ |
| static int move_freepages(struct zone *zone, |
| struct page *start_page, struct page *end_page, |
| int migratetype, int *num_movable) |
| { |
| struct page *page; |
| unsigned int order; |
| int pages_moved = 0; |
| |
| for (page = start_page; page <= end_page;) { |
| if (!pfn_valid_within(page_to_pfn(page))) { |
| page++; |
| continue; |
| } |
| |
| if (!PageBuddy(page)) { |
| /* |
| * We assume that pages that could be isolated for |
| * migration are movable. But we don't actually try |
| * isolating, as that would be expensive. |
| */ |
| if (num_movable && |
| (PageLRU(page) || __PageMovable(page))) |
| (*num_movable)++; |
| |
| page++; |
| continue; |
| } |
| |
| /* Make sure we are not inadvertently changing nodes */ |
| VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); |
| VM_BUG_ON_PAGE(page_zone(page) != zone, page); |
| |
| order = buddy_order(page); |
| move_to_free_list(page, zone, order, migratetype); |
| page += 1 << order; |
| pages_moved += 1 << order; |
| } |
| |
| return pages_moved; |
| } |
| |
| int move_freepages_block(struct zone *zone, struct page *page, |
| int migratetype, int *num_movable) |
| { |
| unsigned long start_pfn, end_pfn; |
| struct page *start_page, *end_page; |
| |
| if (num_movable) |
| *num_movable = 0; |
| |
| start_pfn = page_to_pfn(page); |
| start_pfn = start_pfn & ~(pageblock_nr_pages-1); |
| start_page = pfn_to_page(start_pfn); |
| end_page = start_page + pageblock_nr_pages - 1; |
| end_pfn = start_pfn + pageblock_nr_pages - 1; |
| |
| /* Do not cross zone boundaries */ |
| if (!zone_spans_pfn(zone, start_pfn)) |
| start_page = page; |
| if (!zone_spans_pfn(zone, end_pfn)) |
| return 0; |
| |
| return move_freepages(zone, start_page, end_page, migratetype, |
| num_movable); |
| } |
| |
| static void change_pageblock_range(struct page *pageblock_page, |
| int start_order, int migratetype) |
| { |
| int nr_pageblocks = 1 << (start_order - pageblock_order); |
| |
| while (nr_pageblocks--) { |
| set_pageblock_migratetype(pageblock_page, migratetype); |
| pageblock_page += pageblock_nr_pages; |
| } |
| } |
| |
| /* |
| * When we are falling back to another migratetype during allocation, try to |
| * steal extra free pages from the same pageblocks to satisfy further |
| * allocations, instead of polluting multiple pageblocks. |
| * |
| * If we are stealing a relatively large buddy page, it is likely there will |
| * be more free pages in the pageblock, so try to steal them all. For |
| * reclaimable and unmovable allocations, we steal regardless of page size, |
| * as fragmentation caused by those allocations polluting movable pageblocks |
| * is worse than movable allocations stealing from unmovable and reclaimable |
| * pageblocks. |
| */ |
| static bool can_steal_fallback(unsigned int order, int start_mt) |
| { |
| /* |
| * Leaving this order check is intended, although there is |
| * relaxed order check in next check. The reason is that |
| * we can actually steal whole pageblock if this condition met, |
| * but, below check doesn't guarantee it and that is just heuristic |
| * so could be changed anytime. |
| */ |
| if (order >= pageblock_order) |
| return true; |
| |
| if (order >= pageblock_order / 2 || |
| start_mt == MIGRATE_RECLAIMABLE || |
| start_mt == MIGRATE_UNMOVABLE || |
| page_group_by_mobility_disabled) |
| return true; |
| |
| return false; |
| } |
| |
| static inline bool boost_watermark(struct zone *zone) |
| { |
| unsigned long max_boost; |
| |
| if (!watermark_boost_factor) |
| return false; |
| /* |
| * Don't bother in zones that are unlikely to produce results. |
| * On small machines, including kdump capture kernels running |
| * in a small area, boosting the watermark can cause an out of |
| * memory situation immediately. |
| */ |
| if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) |
| return false; |
| |
| max_boost = mult_frac(zone->_watermark[WMARK_HIGH], |
| watermark_boost_factor, 10000); |
| |
| /* |
| * high watermark may be uninitialised if fragmentation occurs |
| * very early in boot so do not boost. We do not fall |
| * through and boost by pageblock_nr_pages as failing |
| * allocations that early means that reclaim is not going |
| * to help and it may even be impossible to reclaim the |
| * boosted watermark resulting in a hang. |
| */ |
| if (!max_boost) |
| return false; |
| |
| max_boost = max(pageblock_nr_pages, max_boost); |
| |
| zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, |
| max_boost); |
| |
| return true; |
| } |
| |
| /* |
| * This function implements actual steal behaviour. If order is large enough, |
| * we can steal whole pageblock. If not, we first move freepages in this |
| * pageblock to our migratetype and determine how many already-allocated pages |
| * are there in the pageblock with a compatible migratetype. If at least half |
| * of pages are free or compatible, we can change migratetype of the pageblock |
| * itself, so pages freed in the future will be put on the correct free list. |
| */ |
| static void steal_suitable_fallback(struct zone *zone, struct page *page, |
| unsigned int alloc_flags, int start_type, bool whole_block) |
| { |
| unsigned int current_order = buddy_order(page); |
| int free_pages, movable_pages, alike_pages; |
| int old_block_type; |
| |
| old_block_type = get_pageblock_migratetype(page); |
| |
| /* |
| * This can happen due to races and we want to prevent broken |
| * highatomic accounting. |
| */ |
| if (is_migrate_highatomic(old_block_type)) |
| goto single_page; |
| |
| /* Take ownership for orders >= pageblock_order */ |
| if (current_order >= pageblock_order) { |
| change_pageblock_range(page, current_order, start_type); |
| goto single_page; |
| } |
| |
| /* |
| * Boost watermarks to increase reclaim pressure to reduce the |
| * likelihood of future fallbacks. Wake kswapd now as the node |
| * may be balanced overall and kswapd will not wake naturally. |
| */ |
| if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) |
| set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); |
| |
| /* We are not allowed to try stealing from the whole block */ |
| if (!whole_block) |
| goto single_page; |
| |
| free_pages = move_freepages_block(zone, page, start_type, |
| &movable_pages); |
| /* |
| * Determine how many pages are compatible with our allocation. |
| * For movable allocation, it's the number of movable pages which |
| * we just obtained. For other types it's a bit more tricky. |
| */ |
| if (start_type == MIGRATE_MOVABLE) { |
| alike_pages = movable_pages; |
| } else { |
| /* |
| * If we are falling back a RECLAIMABLE or UNMOVABLE allocation |
| * to MOVABLE pageblock, consider all non-movable pages as |
| * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or |
| * vice versa, be conservative since we can't distinguish the |
| * exact migratetype of non-movable pages. |
| */ |
| if (old_block_type == MIGRATE_MOVABLE) |
| alike_pages = pageblock_nr_pages |
| - (free_pages + movable_pages); |
| else |
| alike_pages = 0; |
| } |
| |
| /* moving whole block can fail due to zone boundary conditions */ |
| if (!free_pages) |
| goto single_page; |
| |
| /* |
| * If a sufficient number of pages in the block are either free or of |
| * comparable migratability as our allocation, claim the whole block. |
| */ |
| if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || |
| page_group_by_mobility_disabled) |
| set_pageblock_migratetype(page, start_type); |
| |
| return; |
| |
| single_page: |
| move_to_free_list(page, zone, current_order, start_type); |
| } |
| |
| /* |
| * Check whether there is a suitable fallback freepage with requested order. |
| * If only_stealable is true, this function returns fallback_mt only if |
| * we can steal other freepages all together. This would help to reduce |
| * fragmentation due to mixed migratetype pages in one pageblock. |
| */ |
| int find_suitable_fallback(struct free_area *area, unsigned int order, |
| int migratetype, bool only_stealable, bool *can_steal) |
| { |
| int i; |
| int fallback_mt; |
| |
| if (area->nr_free == 0) |
| return -1; |
| |
| *can_steal = false; |
| for (i = 0;; i++) { |
| fallback_mt = fallbacks[migratetype][i]; |
| if (fallback_mt == MIGRATE_TYPES) |
| break; |
| |
| if (free_area_empty(area, fallback_mt)) |
| continue; |
| |
| if (can_steal_fallback(order, migratetype)) |
| *can_steal = true; |
| |
| if (!only_stealable) |
| return fallback_mt; |
| |
| if (*can_steal) |
| return fallback_mt; |
| } |
| |
| return -1; |
| } |
| |
| /* |
| * Reserve a pageblock for exclusive use of high-order atomic allocations if |
| * there are no empty page blocks that contain a page with a suitable order |
| */ |
| static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, |
| unsigned int alloc_order) |
| { |
| int mt; |
| unsigned long max_managed, flags; |
| |
| /* |
| * Limit the number reserved to 1 pageblock or roughly 1% of a zone. |
| * Check is race-prone but harmless. |
| */ |
| max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; |
| if (zone->nr_reserved_highatomic >= max_managed) |
| return; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| |
| /* Recheck the nr_reserved_highatomic limit under the lock */ |
| if (zone->nr_reserved_highatomic >= max_managed) |
| goto out_unlock; |
| |
| /* Yoink! */ |
| mt = get_pageblock_migratetype(page); |
| if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt) |
| && !is_migrate_cma(mt)) { |
| zone->nr_reserved_highatomic += pageblock_nr_pages; |
| set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); |
| move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); |
| } |
| |
| out_unlock: |
| spin_unlock_irqrestore(&zone->lock, flags); |
| } |
| |
| /* |
| * Used when an allocation is about to fail under memory pressure. This |
| * potentially hurts the reliability of high-order allocations when under |
| * intense memory pressure but failed atomic allocations should be easier |
| * to recover from than an OOM. |
| * |
| * If @force is true, try to unreserve a pageblock even though highatomic |
| * pageblock is exhausted. |
| */ |
| static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, |
| bool force) |
| { |
| struct zonelist *zonelist = ac->zonelist; |
| unsigned long flags; |
| struct zoneref *z; |
| struct zone *zone; |
| struct page *page; |
| int order; |
| bool ret; |
| |
| for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, |
| ac->nodemask) { |
| /* |
| * Preserve at least one pageblock unless memory pressure |
| * is really high. |
| */ |
| if (!force && zone->nr_reserved_highatomic <= |
| pageblock_nr_pages) |
| continue; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| for (order = 0; order < MAX_ORDER; order++) { |
| struct free_area *area = &(zone->free_area[order]); |
| |
| page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); |
| if (!page) |
| continue; |
| |
| /* |
| * In page freeing path, migratetype change is racy so |
| * we can counter several free pages in a pageblock |
| * in this loop althoug we changed the pageblock type |
| * from highatomic to ac->migratetype. So we should |
| * adjust the count once. |
| */ |
| if (is_migrate_highatomic_page(page)) { |
| /* |
| * It should never happen but changes to |
| * locking could inadvertently allow a per-cpu |
| * drain to add pages to MIGRATE_HIGHATOMIC |
| * while unreserving so be safe and watch for |
| * underflows. |
| */ |
| zone->nr_reserved_highatomic -= min( |
| pageblock_nr_pages, |
| zone->nr_reserved_highatomic); |
| } |
| |
| /* |
| * Convert to ac->migratetype and avoid the normal |
| * pageblock stealing heuristics. Minimally, the caller |
| * is doing the work and needs the pages. More |
| * importantly, if the block was always converted to |
| * MIGRATE_UNMOVABLE or another type then the number |
| * of pageblocks that cannot be completely freed |
| * may increase. |
| */ |
| set_pageblock_migratetype(page, ac->migratetype); |
| ret = move_freepages_block(zone, page, ac->migratetype, |
| NULL); |
| if (ret) { |
| spin_unlock_irqrestore(&zone->lock, flags); |
| return ret; |
| } |
| } |
| spin_unlock_irqrestore(&zone->lock, flags); |
| } |
| |
| return false; |
| } |
| |
| /* |
| * Try finding a free buddy page on the fallback list and put it on the free |
| * list of requested migratetype, possibly along with other pages from the same |
| * block, depending on fragmentation avoidance heuristics. Returns true if |
| * fallback was found so that __rmqueue_smallest() can grab it. |
| * |
| * The use of signed ints for order and current_order is a deliberate |
| * deviation from the rest of this file, to make the for loop |
| * condition simpler. |
| */ |
| static __always_inline bool |
| __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, |
| unsigned int alloc_flags) |
| { |
| struct free_area *area; |
| int current_order; |
| int min_order = order; |
| struct page *page; |
| int fallback_mt; |
| bool can_steal; |
| |
| /* |
| * Do not steal pages from freelists belonging to other pageblocks |
| * i.e. orders < pageblock_order. If there are no local zones free, |
| * the zonelists will be reiterated without ALLOC_NOFRAGMENT. |
| */ |
| if (alloc_flags & ALLOC_NOFRAGMENT) |
| min_order = pageblock_order; |
| |
| /* |
| * Find the largest available free page in the other list. This roughly |
| * approximates finding the pageblock with the most free pages, which |
| * would be too costly to do exactly. |
| */ |
| for (current_order = MAX_ORDER - 1; current_order >= min_order; |
| --current_order) { |
| area = &(zone->free_area[current_order]); |
| fallback_mt = find_suitable_fallback(area, current_order, |
| start_migratetype, false, &can_steal); |
| if (fallback_mt == -1) |
| continue; |
| |
| /* |
| * We cannot steal all free pages from the pageblock and the |
| * requested migratetype is movable. In that case it's better to |
| * steal and split the smallest available page instead of the |
| * largest available page, because even if the next movable |
| * allocation falls back into a different pageblock than this |
| * one, it won't cause permanent fragmentation. |
| */ |
| if (!can_steal && start_migratetype == MIGRATE_MOVABLE |
| && current_order > order) |
| goto find_smallest; |
| |
| goto do_steal; |
| } |
| |
| return false; |
| |
| find_smallest: |
| for (current_order = order; current_order < MAX_ORDER; |
| current_order++) { |
| area = &(zone->free_area[current_order]); |
| fallback_mt = find_suitable_fallback(area, current_order, |
| start_migratetype, false, &can_steal); |
| if (fallback_mt != -1) |
| break; |
| } |
| |
| /* |
| * This should not happen - we already found a suitable fallback |
| * when looking for the largest page. |
| */ |
| VM_BUG_ON(current_order == MAX_ORDER); |
| |
| do_steal: |
| page = get_page_from_free_area(area, fallback_mt); |
| |
| steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, |
| can_steal); |
| |
| trace_mm_page_alloc_extfrag(page, order, current_order, |
| start_migratetype, fallback_mt); |
| |
| return true; |
| |
| } |
| |
| /* |
| * Do the hard work of removing an element from the buddy allocator. |
| * Call me with the zone->lock already held. |
| */ |
| static __always_inline struct page * |
| __rmqueue(struct zone *zone, unsigned int order, int migratetype, |
| unsigned int alloc_flags) |
| { |
| struct page *page; |
| |
| retry: |
| page = __rmqueue_smallest(zone, order, migratetype); |
| |
| if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype, |
| alloc_flags)) |
| goto retry; |
| |
| trace_mm_page_alloc_zone_locked(page, order, migratetype); |
| return page; |
| } |
| |
| #ifdef CONFIG_CMA |
| static struct page *__rmqueue_cma(struct zone *zone, unsigned int order, |
| int migratetype, |
| unsigned int alloc_flags) |
| { |
| struct page *page = __rmqueue_cma_fallback(zone, order); |
| trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA); |
| return page; |
| } |
| #else |
| static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order, |
| int migratetype, |
| unsigned int alloc_flags) |
| { |
| return NULL; |
| } |
| #endif |
| |
| /* |
| * Obtain a specified number of elements from the buddy allocator, all under |
| * a single hold of the lock, for efficiency. Add them to the supplied list. |
| * Returns the number of new pages which were placed at *list. |
| */ |
| static int rmqueue_bulk(struct zone *zone, unsigned int order, |
| unsigned long count, struct list_head *list, |
| int migratetype, unsigned int alloc_flags) |
| { |
| int i, alloced = 0; |
| |
| spin_lock(&zone->lock); |
| for (i = 0; i < count; ++i) { |
| struct page *page = NULL; |
| |
| if (is_migrate_cma(migratetype)) { |
| bool is_cma_alloc = true; |
| |
| trace_android_vh_cma_alloc_adjust(zone, &is_cma_alloc); |
| if (is_cma_alloc) |
| page = __rmqueue_cma(zone, order, migratetype, |
| alloc_flags); |
| } else |
| page = __rmqueue(zone, order, migratetype, alloc_flags); |
| |
| if (unlikely(page == NULL)) |
| break; |
| |
| if (unlikely(check_pcp_refill(page))) |
| continue; |
| |
| /* |
| * Split buddy pages returned by expand() are received here in |
| * physical page order. The page is added to the tail of |
| * caller's list. From the callers perspective, the linked list |
| * is ordered by page number under some conditions. This is |
| * useful for IO devices that can forward direction from the |
| * head, thus also in the physical page order. This is useful |
| * for IO devices that can merge IO requests if the physical |
| * pages are ordered properly. |
| */ |
| list_add_tail(&page->lru, list); |
| alloced++; |
| if (is_migrate_cma(get_pcppage_migratetype(page))) |
| __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, |
| -(1 << order)); |
| } |
| |
| /* |
| * i pages were removed from the buddy list even if some leak due |
| * to check_pcp_refill failing so adjust NR_FREE_PAGES based |
| * on i. Do not confuse with 'alloced' which is the number of |
| * pages added to the pcp list. |
| */ |
| __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); |
| spin_unlock(&zone->lock); |
| return alloced; |
| } |
| |
| /* |
| * Return the pcp list that corresponds to the migrate type if that list isn't |
| * empty. |
| * If the list is empty return NULL. |
| */ |
| static struct list_head *get_populated_pcp_list(struct zone *zone, |
| unsigned int order, struct per_cpu_pages *pcp, |
| int migratetype, unsigned int alloc_flags) |
| { |
| struct list_head *list = &pcp->lists[migratetype]; |
| |
| if (list_empty(list)) { |
| pcp->count += rmqueue_bulk(zone, order, |
| pcp->batch, list, |
| migratetype, alloc_flags); |
| |
| if (list_empty(list)) |
| list = NULL; |
| } |
| return list; |
| } |
| |
#ifdef CONFIG_NUMA
/*
 * Invoked by the vmstat counter updater to drain pagesets that the
 * currently executing processor holds for remote nodes, once they have
 * expired.
 *
 * Frees at most one batch (pcp->batch) of pages per call, and fewer if the
 * pcp holds less than that.
 */
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
	struct per_cpu_pageset_ext *ps_ext;
	unsigned long flags;
	int batch = READ_ONCE(pcp->batch);
	int to_drain = min(pcp->count, batch);

	if (to_drain <= 0)
		return;

	ps_ext = pcp_to_pageset_ext(pcp);

	/*
	 * The pcp lock is not meant to be IRQ-safe by itself, but
	 * free_pcppages_bulk() expects IRQs to be disabled for zone->lock,
	 * hence the irqsave variant here.
	 */
	spin_lock_irqsave(&ps_ext->lock, flags);
	free_pcppages_bulk(zone, to_drain, pcp);
	spin_unlock_irqrestore(&ps_ext->lock, flags);
}
#endif
| |
| /* |
| * Drain pcplists of the indicated processor and zone. |
| */ |
| static void drain_pages_zone(unsigned int cpu, struct zone *zone) |
| { |
| struct per_cpu_pageset *pset; |
| struct per_cpu_pageset_ext *ps_ext; |
| struct per_cpu_pages *pcp; |
| |
| pset = per_cpu_ptr(zone->pageset, cpu); |
| ps_ext = pageset_to_pageset_ext(pset); |
| |
| pcp = &pset->pcp; |
| if (pcp->count) { |
| unsigned long flags; |
| |
| /* See drain_zone_pages on why this is disabling IRQs */ |
| spin_lock_irqsave(&ps_ext->lock, flags); |
| free_pcppages_bulk(zone, pcp->count, pcp); |
| spin_unlock_irqrestore(&ps_ext->lock, flags); |
| } |
| } |
| |
| /* |
| * Drain pcplists of all zones on the indicated processor. |
| */ |
| static void drain_pages(unsigned int cpu) |
| { |
| struct zone *zone; |
| |
| for_each_populated_zone(zone) { |
| drain_pages_zone(cpu, zone); |
| } |
| } |
| |
/*
 * Return the per-cpu pages of the CPU we are running on to the buddy
 * allocator.
 *
 * With a non-NULL @zone only that zone's pcplists are drained; with NULL,
 * the pcplists of all populated zones are drained.
 */
void drain_local_pages(struct zone *zone)
{
	int cpu = smp_processor_id();

	if (!zone) {
		drain_pages(cpu);
		return;
	}

	drain_pages_zone(cpu, zone);
}
| |
| /* |
| * Spill all the per-cpu pages from all CPUs back into the buddy allocator. |
| * |
| * When zone parameter is non-NULL, spill just the single zone's pages. |
| */ |
| void drain_all_pages(struct zone *zone) |
| { |
| int cpu; |
| |
| /* |
| * Allocate in the BSS so we wont require allocation in |
| * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y |
| */ |
| static cpumask_t cpus_with_pcps; |
| |
| /* |
| * Do not drain if one is already in progress unless it's specific to |
| * a zone. Such callers are primarily CMA and memory hotplug and need |
| * the drain to be complete when the call returns. |
| */ |
| if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { |
| if (!zone) |
| return; |
| mutex_lock(&pcpu_drain_mutex); |
| } |
| |
| /* |
| * We don't care about racing with CPU hotplug event |
| * as offline notification will cause the notified |
| * cpu to drain that CPU pcps and on_each_cpu_mask |
| * disables preemption as part of its processing |
| */ |
| for_each_online_cpu(cpu) { |
| struct per_cpu_pageset *pcp; |
| struct zone *z; |
| bool has_pcps = false; |
| |
| if (zone) { |
| pcp = per_cpu_ptr(zone->pageset, cpu); |
| if (pcp->pcp.count) |
| has_pcps = true; |
| } else { |
| for_each_populated_zone(z) { |
| pcp = per_cpu_ptr(z->pageset, cpu); |
| if (pcp->pcp.count) { |
| has_pcps = true; |
| break; |
| } |
| } |
| } |
| |
| if (has_pcps) |
| cpumask_set_cpu(cpu, &cpus_with_pcps); |
| else |
| cpumask_clear_cpu(cpu, &cpus_with_pcps); |
| } |
| |
| for_each_cpu(cpu, &cpus_with_pcps) { |
| if (zone) { |
| drain_pages_zone(cpu, zone); |
| } else { |
| drain_pages(cpu); |
| } |
| } |
| |
| mutex_unlock(&pcpu_drain_mutex); |
| } |
| |
| #ifdef CONFIG_HIBERNATION |
| |
| /* |
| * Touch the watchdog for every WD_PAGE_COUNT pages. |
| */ |
| #define WD_PAGE_COUNT (128*1024) |
| |
| void mark_free_pages(struct zone *zone) |
| { |
| unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT; |
| unsigned long flags |