blob: 7330061a0501166f96a03a10ea2fa1c3b966e6ed [file] [log] [blame]
// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/page_alloc.c
*
* Manages the free list, the system allocates free pages here.
* Note that kmalloc() lives in slab.c
*
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
* Swap reorganised 29.12.95, Stephen Tweedie
* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
* Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
* Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
* (lots of bits borrowed from Ingo Molnar & Andrew Morton)
*/
#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/oom.h>
#include <linux/topology.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/memory_hotplug.h>
#include <linux/nodemask.h>
#include <linux/vmalloc.h>
#include <linux/vmstat.h>
#include <linux/mempolicy.h>
#include <linux/memremap.h>
#include <linux/stop_machine.h>
#include <linux/random.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
#include <trace/events/kmem.h>
#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/page_pinner.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/nmi.h>
#include <linux/psi.h>
#include <linux/padata.h>
#include <linux/khugepaged.h>
#include <trace/hooks/mm.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
#include "shuffle.h"
#include "page_reporting.h"
/* Free Page Internal flags: for internal, non-pcp variants of free_pages(). */
typedef int __bitwise fpi_t;
/* No special request */
#define FPI_NONE ((__force fpi_t)0)
/*
* Skip free page reporting notification for the (possibly merged) page.
* This does not hinder free page reporting from grabbing the page,
* reporting it and marking it "reported" - it only skips notifying
* the free page reporting infrastructure about a newly freed page. For
* example, used when temporarily pulling a page from a freelist and
* putting it back unmodified.
*/
#define FPI_SKIP_REPORT_NOTIFY ((__force fpi_t)BIT(0))
/*
* Place the (possibly merged) page to the tail of the freelist. Will ignore
* page shuffling (relevant code - e.g., memory onlining - is expected to
* shuffle the whole zone).
*
* Note: No code should rely on this flag for correctness - it's purely
* to allow for optimizations when handing back either fresh pages
* (memory onlining) or untouched pages (page isolation, free page
* reporting).
*/
#define FPI_TO_TAIL ((__force fpi_t)BIT(1))
/*
* Don't poison memory with KASAN (only for the tag-based modes).
* During boot, all non-reserved memblock memory is exposed to page_alloc.
* Poisoning all that memory lengthens boot time, especially on systems with
* large amount of RAM. This flag is used to skip that poisoning.
* This is only done for the tag-based KASAN modes, as those are able to
* detect memory corruptions with the memory tags assigned by default.
* All memory allocated normally after boot gets poisoned as usual.
*/
#define FPI_SKIP_KASAN_POISON ((__force fpi_t)BIT(2))
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_FRACTION (8)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif
DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
* N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
* It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
* Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
* defined in <linux/topology.h>.
*/
DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif
/* work_structs for global per-cpu drains */
struct pcpu_drain {
struct zone *zone;
struct work_struct work;
};
static DEFINE_MUTEX(pcpu_drain_mutex);
static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif
/*
* Array of node states.
*/
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
[N_POSSIBLE] = NODE_MASK_ALL,
[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
[N_MEMORY] = { { [0] = 1UL } },
[N_CPU] = { { [0] = 1UL } },
#endif /* NUMA */
};
EXPORT_SYMBOL(node_states);
atomic_long_t _totalram_pages __read_mostly;
EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
DEFINE_STATIC_KEY_FALSE(init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);
DEFINE_STATIC_KEY_FALSE(init_on_free);
EXPORT_SYMBOL(init_on_free);
static bool _init_on_alloc_enabled_early __read_mostly
= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
static int __init early_init_on_alloc(char *buf)
{
return kstrtobool(buf, &_init_on_alloc_enabled_early);
}
early_param("init_on_alloc", early_init_on_alloc);
static bool _init_on_free_enabled_early __read_mostly
= IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
static int __init early_init_on_free(char *buf)
{
return kstrtobool(buf, &_init_on_free_enabled_early);
}
early_param("init_on_free", early_init_on_free);
/*
* A cached value of the page's pageblock's migratetype, used when the page is
* put on a pcplist. Used to avoid the pageblock migratetype lookup when
* freeing from pcplists in most cases, at the cost of possibly becoming stale.
* Also the migratetype set in the page does not necessarily match the pcplist
* index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
* other index - this ensures that it will be put on the correct CMA freelist.
*/
static inline int get_pcppage_migratetype(struct page *page)
{
return page->index;
}
static inline void set_pcppage_migratetype(struct page *page, int migratetype)
{
page->index = migratetype;
}
#ifdef CONFIG_PM_SLEEP
/*
* The following functions are used by the suspend/hibernate code to temporarily
* change gfp_allowed_mask in order to avoid using I/O during memory allocations
* while devices are suspended. To avoid races with the suspend/hibernate code,
* they should always be called with system_transition_mutex held
* (gfp_allowed_mask also should only be modified with system_transition_mutex
* held, unless the suspend/hibernate code is guaranteed not to run in parallel
* with that modification).
*/
static gfp_t saved_gfp_mask;
void pm_restore_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&system_transition_mutex));
if (saved_gfp_mask) {
gfp_allowed_mask = saved_gfp_mask;
saved_gfp_mask = 0;
}
}
void pm_restrict_gfp_mask(void)
{
WARN_ON(!mutex_is_locked(&system_transition_mutex));
WARN_ON(saved_gfp_mask);
saved_gfp_mask = gfp_allowed_mask;
gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
}
bool pm_suspended_storage(void)
{
if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
return false;
return true;
}
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
unsigned int pageblock_order __read_mostly;
#endif
static void __free_pages_ok(struct page *page, unsigned int order,
fpi_t fpi_flags);
/*
* results with 256, 32 in the lowmem_reserve sysctl:
* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
* 1G machine -> (16M dma, 784M normal, 224M high)
* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
* HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
*
* TBD: should special case ZONE_DMA32 machines here - in those we normally
* don't need any ZONE_NORMAL reservation
*/
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
[ZONE_DMA32] = 256,
#endif
[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
[ZONE_HIGHMEM] = 0,
#endif
[ZONE_MOVABLE] = 0,
};
static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
"DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
"DMA32",
#endif
"Normal",
#ifdef CONFIG_HIGHMEM
"HighMem",
#endif
"Movable",
#ifdef CONFIG_ZONE_DEVICE
"Device",
#endif
};
const char * const migratetype_names[MIGRATE_TYPES] = {
"Unmovable",
"Movable",
"Reclaimable",
#ifdef CONFIG_CMA
"CMA",
#endif
"HighAtomic",
#ifdef CONFIG_MEMORY_ISOLATION
"Isolate",
#endif
};
compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
[NULL_COMPOUND_DTOR] = NULL,
[COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
[HUGETLB_PAGE_DTOR] = free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
[TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
#endif
};
/*
* Try to keep at least this much lowmem free. Do not allow normal
* allocations below this point, only high priority ones. Automatically
* tuned according to the amount of memory in the system.
*/
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
#ifdef CONFIG_DISCONTIGMEM
/*
* DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
* are not on separate NUMA nodes. Functionally this works but with
* watermark_boost_factor, it can reclaim prematurely as the ranges can be
* quite small. By default, do not boost watermarks on discontigmem as in
* many cases very high-order allocations like THP are likely to be
* unsupported and the premature reclaim offsets the advantage of long-term
* fragmentation avoidance.
*/
int watermark_boost_factor __read_mostly;
#else
int watermark_boost_factor __read_mostly = 15000;
#endif
int watermark_scale_factor = 10;
/*
* Extra memory for the system to try freeing. Used to temporarily
* free memory, to make space for new workloads. Anyone can allocate
* down to the min watermarks controlled by min_free_kbytes above.
*/
int extra_free_kbytes = 0;
static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
static unsigned long dma_reserve __initdata;
static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long required_kernelcore __initdata;
static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;
static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
static bool mirrored_kernelcore __meminitdata;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
#if MAX_NUMNODES > 1
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* During boot we initialize deferred pages on-demand, as needed, but once
* page_alloc_init_late() has finished, the deferred pages are all initialized,
* and we can permanently disable that path.
*/
static DEFINE_STATIC_KEY_TRUE(deferred_pages);
/*
* Calling kasan_poison_pages() only after deferred memory initialization
* has completed. Poisoning pages during deferred memory init will greatly
* lengthen the process and cause problem in large memory systems as the
* deferred pages initialization is done with interrupt disabled.
*
* Assuming that there will be no reference to those newly initialized
* pages before they are ever allocated, this should have no effect on
* KASAN memory tracking as the poison will be properly inserted at page
* allocation time. The only corner case is when pages are allocated by
* on-demand allocation and then freed again before the deferred pages
* initialization is done, but this is not likely to happen.
*/
static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
return static_branch_unlikely(&deferred_pages) ||
(!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
(fpi_flags & FPI_SKIP_KASAN_POISON)) ||
PageSkipKASanPoison(page);
}
/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
int nid = early_pfn_to_nid(pfn);
if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
return true;
return false;
}
/*
* Returns true when the remaining initialisation should be deferred until
* later in the boot cycle when it can be parallelised.
*/
static bool __meminit
defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
static unsigned long prev_end_pfn, nr_initialised;
/*
* prev_end_pfn static that contains the end of previous zone
* No need to protect because called very early in boot before smp_init.
*/
if (prev_end_pfn != end_pfn) {
prev_end_pfn = end_pfn;
nr_initialised = 0;
}
/* Always populate low zones for address-constrained allocations */
if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
return false;
if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX)
return true;
/*
* We start only with one section of pages, more pages are added as
* needed until the rest of deferred pages are initialized.
*/
nr_initialised++;
if ((nr_initialised > PAGES_PER_SECTION) &&
(pfn & (PAGES_PER_SECTION - 1)) == 0) {
NODE_DATA(nid)->first_deferred_pfn = pfn;
return true;
}
return false;
}
#else
static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
{
return (!IS_ENABLED(CONFIG_KASAN_GENERIC) &&
(fpi_flags & FPI_SKIP_KASAN_POISON)) ||
PageSkipKASanPoison(page);
}
static inline bool early_page_uninitialised(unsigned long pfn)
{
return false;
}
static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
return false;
}
#endif
/* Return a pointer to the bitmap storing bits affecting a block of pages */
static inline unsigned long *get_pageblock_bitmap(struct page *page,
unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
return section_to_usemap(__pfn_to_section(pfn));
#else
return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
}
static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
pfn &= (PAGES_PER_SECTION-1);
#else
pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
#endif /* CONFIG_SPARSEMEM */
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
/**
* get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @pfn: The target page frame number
* @mask: mask of bits that the caller is interested in
*
* Return: pageblock_bits flags
*/
static __always_inline
unsigned long __get_pfnblock_flags_mask(struct page *page,
unsigned long pfn,
unsigned long mask)
{
unsigned long *bitmap;
unsigned long bitidx, word_bitidx;
unsigned long word;
bitmap = get_pageblock_bitmap(page, pfn);
bitidx = pfn_to_bitidx(page, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
bitidx &= (BITS_PER_LONG-1);
word = bitmap[word_bitidx];
return (word >> bitidx) & mask;
}
unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
unsigned long mask)
{
return __get_pfnblock_flags_mask(page, pfn, mask);
}
EXPORT_SYMBOL_GPL(get_pfnblock_flags_mask);
int isolate_anon_lru_page(struct page *page)
{
int ret;
if (!PageLRU(page) || !PageAnon(page))
return -EINVAL;
if (!get_page_unless_zero(page))
return -EINVAL;
ret = isolate_lru_page(page);
put_page(page);
return ret;
}
EXPORT_SYMBOL_GPL(isolate_anon_lru_page);
static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
{
return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}
/**
* set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @flags: The flags to set
* @pfn: The target page frame number
* @mask: mask of bits that the caller is interested in
*/
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
unsigned long pfn,
unsigned long mask)
{
unsigned long *bitmap;
unsigned long bitidx, word_bitidx;
unsigned long old_word, word;
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
bitmap = get_pageblock_bitmap(page, pfn);
bitidx = pfn_to_bitidx(page, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
bitidx &= (BITS_PER_LONG-1);
VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
mask <<= bitidx;
flags <<= bitidx;
word = READ_ONCE(bitmap[word_bitidx]);
for (;;) {
old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
if (word == old_word)
break;
word = old_word;
}
}
void set_pageblock_migratetype(struct page *page, int migratetype)
{
if (unlikely(page_group_by_mobility_disabled &&
migratetype < MIGRATE_PCPTYPES))
migratetype = MIGRATE_UNMOVABLE;
set_pfnblock_flags_mask(page, (unsigned long)migratetype,
page_to_pfn(page), MIGRATETYPE_MASK);
}
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
int ret = 0;
unsigned seq;
unsigned long pfn = page_to_pfn(page);
unsigned long sp, start_pfn;
do {
seq = zone_span_seqbegin(zone);
start_pfn = zone->zone_start_pfn;
sp = zone->spanned_pages;
if (!zone_spans_pfn(zone, pfn))
ret = 1;
} while (zone_span_seqretry(zone, seq));
if (ret)
pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
pfn, zone_to_nid(zone), zone->name,
start_pfn, start_pfn + sp);
return ret;
}
static int page_is_consistent(struct zone *zone, struct page *page)
{
if (!pfn_valid_within(page_to_pfn(page)))
return 0;
if (zone != page_zone(page))
return 0;
return 1;
}
/*
* Temporary debugging check for pages not lying within a given zone.
*/
static int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
if (page_outside_zone_boundaries(zone, page))
return 1;
if (!page_is_consistent(zone, page))
return 1;
return 0;
}
#else
static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
return 0;
}
#endif
static void bad_page(struct page *page, const char *reason)
{
static unsigned long resume;
static unsigned long nr_shown;
static unsigned long nr_unshown;
/*
* Allow a burst of 60 reports, then keep quiet for that minute;
* or allow a steady drip of one report per second.
*/
if (nr_shown == 60) {
if (time_before(jiffies, resume)) {
nr_unshown++;
goto out;
}
if (nr_unshown) {
pr_alert(
"BUG: Bad page state: %lu messages suppressed\n",
nr_unshown);
nr_unshown = 0;
}
nr_shown = 0;
}
if (nr_shown++ == 0)
resume = jiffies + 60 * HZ;
pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
__dump_page(page, reason);
dump_page_owner(page);
print_modules();
dump_stack();
out:
/* Leave bad fields for debug, except PageBuddy could make trouble */
page_mapcount_reset(page); /* remove PageBuddy */
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
/*
* Higher-order pages are called "compound pages". They are structured thusly:
*
* The first PAGE_SIZE page is called the "head page" and have PG_head set.
*
* The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
* in bit 0 of page->compound_head. The rest of bits is pointer to head page.
*
* The first tail page's ->compound_dtor holds the offset in array of compound
* page destructors. See compound_page_dtors.
*
* The first tail page's ->compound_order holds the order of allocation.
* This usage means that zero-order pages may not be compound.
*/
void free_compound_page(struct page *page)
{
mem_cgroup_uncharge(page);
__free_pages_ok(page, compound_order(page), FPI_NONE);
}
void prep_compound_page(struct page *page, unsigned int order)
{
int i;
int nr_pages = 1 << order;
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
set_page_count(p, 0);
p->mapping = TAIL_MAPPING;
set_compound_head(p, page);
}
set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
set_compound_order(page, order);
atomic_set(compound_mapcount_ptr(page), -1);
if (hpage_pincount_available(page))
atomic_set(compound_pincount_ptr(page), 0);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
bool _debug_pagealloc_enabled_early __read_mostly
= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
EXPORT_SYMBOL(_debug_pagealloc_enabled);
DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
static int __init early_debug_pagealloc(char *buf)
{
return kstrtobool(buf, &_debug_pagealloc_enabled_early);
}
early_param("debug_pagealloc", early_debug_pagealloc);
static int __init debug_guardpage_minorder_setup(char *buf)
{
unsigned long res;
if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) {
pr_err("Bad debug_guardpage_minorder value\n");
return 0;
}
_debug_guardpage_minorder = res;
pr_info("Setting debug_guardpage_minorder to %lu\n", res);
return 0;
}
early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype)
{
if (!debug_guardpage_enabled())
return false;
if (order >= debug_guardpage_minorder())
return false;
__SetPageGuard(page);
INIT_LIST_HEAD(&page->lru);
set_page_private(page, order);
/* Guard pages are not available for any usage */
__mod_zone_freepage_state(zone, -(1 << order), migratetype);
return true;
}
static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype)
{
if (!debug_guardpage_enabled())
return;
__ClearPageGuard(page);
set_page_private(page, 0);
if (!is_migrate_isolate(migratetype))
__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) {}
#endif
/*
* Enable static keys related to various memory debugging and hardening options.
* Some override others, and depend on early params that are evaluated in the
* order of appearance. So we need to first gather the full picture of what was
* enabled, and then make decisions.
*/
void init_mem_debugging_and_hardening(void)
{
bool page_poisoning_requested = false;
#ifdef CONFIG_PAGE_POISONING
/*
* Page poisoning is debug page alloc for some arches. If
* either of those options are enabled, enable poisoning.
*/
if (page_poisoning_enabled() ||
(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
debug_pagealloc_enabled())) {
static_branch_enable(&_page_poisoning_enabled);
page_poisoning_requested = true;
}
#endif
if (_init_on_alloc_enabled_early) {
if (page_poisoning_requested)
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
"will take precedence over init_on_alloc\n");
else
static_branch_enable(&init_on_alloc);
}
if (_init_on_free_enabled_early) {
if (page_poisoning_requested)
pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
"will take precedence over init_on_free\n");
else
static_branch_enable(&init_on_free);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
if (!debug_pagealloc_enabled())
return;
static_branch_enable(&_debug_pagealloc_enabled);
if (!debug_guardpage_minorder())
return;
static_branch_enable(&_debug_guardpage_enabled);
#endif
}
static inline void set_buddy_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
__SetPageBuddy(page);
}
/*
* This function checks whether a page is free && is the buddy
* we can coalesce a page and its buddy if
* (a) the buddy is not in a hole (check before calling!) &&
* (b) the buddy is in the buddy system &&
* (c) a page and its buddy have the same order &&
* (d) a page and its buddy are in the same zone.
*
* For recording whether a page is in the buddy system, we set PageBuddy.
* Setting, clearing, and testing PageBuddy is serialized by zone->lock.
*
* For recording page's order, we use page_private(page).
*/
static inline bool page_is_buddy(struct page *page, struct page *buddy,
unsigned int order)
{
if (!page_is_guard(buddy) && !PageBuddy(buddy))
return false;
if (buddy_order(buddy) != order)
return false;
/*
* zone check is done late to avoid uselessly calculating
* zone/node ids for pages that could never merge.
*/
if (page_zone_id(page) != page_zone_id(buddy))
return false;
VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
return true;
}
#ifdef CONFIG_COMPACTION
static inline struct capture_control *task_capc(struct zone *zone)
{
struct capture_control *capc = current->capture_control;
return unlikely(capc) &&
!(current->flags & PF_KTHREAD) &&
!capc->page &&
capc->cc->zone == zone ? capc : NULL;
}
static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
int order, int migratetype)
{
if (!capc || order != capc->cc->order)
return false;
/* Do not accidentally pollute CMA or isolated regions*/
if (is_migrate_cma(migratetype) ||
is_migrate_isolate(migratetype))
return false;
/*
* Do not let lower order allocations polluate a movable pageblock.
* This might let an unmovable request use a reclaimable pageblock
* and vice-versa but no more than normal fallback logic which can
* have trouble finding a high-order free page.
*/
if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
return false;
capc->page = page;
return true;
}
#else
static inline struct capture_control *task_capc(struct zone *zone)
{
return NULL;
}
static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
int order, int migratetype)
{
return false;
}
#endif /* CONFIG_COMPACTION */
/* Used for pages not on another list */
static inline void add_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
struct free_area *area = &zone->free_area[order];
list_add(&page->lru, &area->free_list[migratetype]);
area->nr_free++;
}
/* Used for pages not on another list */
static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
struct free_area *area = &zone->free_area[order];
list_add_tail(&page->lru, &area->free_list[migratetype]);
area->nr_free++;
}
/*
* Used for pages which are on another list. Move the pages to the tail
* of the list - so the moved pages won't immediately be considered for
* allocation again (e.g., optimization for memory onlining).
*/
static inline void move_to_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
struct free_area *area = &zone->free_area[order];
list_move_tail(&page->lru, &area->free_list[migratetype]);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
unsigned int order)
{
/* clear reported state and update reported page count */
if (page_reported(page))
__ClearPageReported(page);
list_del(&page->lru);
__ClearPageBuddy(page);
set_page_private(page, 0);
zone->free_area[order].nr_free--;
}
/*
* If this is not the largest possible page, check if the buddy
* of the next-highest order is free. If it is, it's possible
* that pages are being freed that will coalesce soon. In case,
* that is happening, add the free page to the tail of the list
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
static inline bool
buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn,
struct page *page, unsigned int order)
{
struct page *higher_page, *higher_buddy;
unsigned long combined_pfn;
if (order >= MAX_ORDER - 2)
return false;
if (!pfn_valid_within(buddy_pfn))
return false;
combined_pfn = buddy_pfn & pfn;
higher_page = page + (combined_pfn - pfn);
buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
higher_buddy = higher_page + (buddy_pfn - combined_pfn);
return pfn_valid_within(buddy_pfn) &&
page_is_buddy(higher_page, higher_buddy, order + 1);
}
/*
* Freeing function for a buddy system allocator.
*
* The concept of a buddy system is to maintain direct-mapped table
* (containing bit values) for memory blocks of various "orders".
* The bottom level table contains the map for the smallest allocatable
* units of memory (here, pages), and each level above it describes
* pairs of units from the levels below, hence, "buddies".
* At a high level, all that happens here is marking the table entry
* at the bottom level available, and propagating the changes upward
* as necessary, plus some accounting needed to play nicely with other
* parts of the VM system.
* At each level, we keep a list of pages, which are heads of continuous
* free pages of length of (1 << order) and marked with PageBuddy.
* Page's order is recorded in page_private(page) field.
* So when we are allocating or freeing one, we can derive the state of the
* other. That is, if we allocate a small block, and both were
* free, the remainder of the region must be split into blocks.
* If a block is freed, and its buddy is also free, then this
* triggers coalescing into a block of larger size.
*
* -- nyc
*/
static inline void __free_one_page(struct page *page,
unsigned long pfn,
struct zone *zone, unsigned int order,
int migratetype, fpi_t fpi_flags)
{
struct capture_control *capc = task_capc(zone);
unsigned long buddy_pfn;
unsigned long combined_pfn;
unsigned int max_order;
struct page *buddy;
bool to_tail;
max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
if (likely(!is_migrate_isolate(migratetype)))
__mod_zone_freepage_state(zone, 1 << order, migratetype);
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
continue_merging:
while (order < max_order) {
if (compaction_capture(capc, page, order, migratetype)) {
__mod_zone_freepage_state(zone, -(1 << order),
migratetype);
return;
}
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
if (!pfn_valid_within(buddy_pfn))
goto done_merging;
if (!page_is_buddy(page, buddy, order))
goto done_merging;
/*
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
*/
if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order, migratetype);
else
del_page_from_free_list(buddy, zone, order);
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
order++;
}
if (order < MAX_ORDER - 1) {
/* If we are here, it means order is >= pageblock_order.
* We want to prevent merge between freepages on isolate
* pageblock and normal pageblock. Without this, pageblock
* isolation could cause incorrect freepage or CMA accounting.
*
* We don't want to hit this code for the more frequent
* low-order merging.
*/
if (unlikely(has_isolate_pageblock(zone))) {
int buddy_mt;
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
buddy_mt = get_pageblock_migratetype(buddy);
if (migratetype != buddy_mt
&& (is_migrate_isolate(migratetype) ||
is_migrate_isolate(buddy_mt)))
goto done_merging;
}
max_order = order + 1;
goto continue_merging;
}
done_merging:
set_buddy_order(page, order);
if (fpi_flags & FPI_TO_TAIL)
to_tail = true;
else if (is_shuffle_order(order))
to_tail = shuffle_pick_tail();
else
to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
if (to_tail)
add_to_free_list_tail(page, zone, order, migratetype);
else
add_to_free_list(page, zone, order, migratetype);
/* Notify page reporting subsystem of freed page */
if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
page_reporting_notify_free(order);
}
/*
* A bad page could be due to a number of fields. Instead of multiple branches,
* try and check multiple fields with one check. The caller must do a detailed
* check if necessary.
*/
static inline bool page_expected_state(struct page *page,
unsigned long check_flags)
{
if (unlikely(atomic_read(&page->_mapcount) != -1))
return false;
if (unlikely((unsigned long)page->mapping |
page_ref_count(page) |
#ifdef CONFIG_MEMCG
(unsigned long)page->mem_cgroup |
#endif
(page->flags & check_flags)))
return false;
return true;
}
static const char *page_bad_reason(struct page *page, unsigned long flags)
{
const char *bad_reason = NULL;
if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _refcount";
if (unlikely(page->flags & flags)) {
if (flags == PAGE_FLAGS_CHECK_AT_PREP)
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
else
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
if (unlikely(page->mem_cgroup))
bad_reason = "page still charged to cgroup";
#endif
return bad_reason;
}
static void check_free_page_bad(struct page *page)
{
bad_page(page,
page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
}
static inline int check_free_page(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
return 0;
/* Something has gone sideways, find it */
check_free_page_bad(page);
return 1;
}
static int free_tail_pages_check(struct page *head_page, struct page *page)
{
int ret = 1;
/*
* We rely page->lru.next never has bit 0 set, unless the page
* is PageTail(). Let's make sure that's true even for poisoned ->lru.
*/
BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
ret = 0;
goto out;
}
switch (page - head_page) {
case 1:
/* the first tail page: ->mapping may be compound_mapcount() */
if (unlikely(compound_mapcount(page))) {
bad_page(page, "nonzero compound_mapcount");
goto out;
}
break;
case 2:
/*
* the second tail page: ->mapping is
* deferred_list.next -- ignore value.
*/
break;
default:
if (page->mapping != TAIL_MAPPING) {
bad_page(page, "corrupted mapping in tail page");
goto out;
}
break;
}
if (unlikely(!PageTail(page))) {
bad_page(page, "PageTail not set");
goto out;
}
if (unlikely(compound_head(page) != head_page)) {
bad_page(page, "compound_head not consistent");
goto out;
}
ret = 0;
out:
page->mapping = NULL;
clear_compound_head(page);
return ret;
}
static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags)
{
int i;
if (zero_tags) {
for (i = 0; i < numpages; i++)
tag_clear_highpage(page + i);
return;
}
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
for (i = 0; i < numpages; i++) {
u8 tag = page_kasan_tag(page + i);
page_kasan_tag_reset(page + i);
clear_highpage(page + i);
page_kasan_tag_set(page + i, tag);
}
kasan_enable_current();
}
static __always_inline bool free_pages_prepare(struct page *page,
unsigned int order, bool check_free, fpi_t fpi_flags)
{
int bad = 0;
bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags);
VM_BUG_ON_PAGE(PageTail(page), page);
trace_mm_page_free(page, order);
if (unlikely(PageHWPoison(page)) && !order) {
/*
* Do not let hwpoison pages hit pcplists/buddy
* Untie memcg state and reset page's owner
*/
if (memcg_kmem_enabled() && PageKmemcg(page))
__memcg_kmem_uncharge_page(page, order);
reset_page_owner(page, order);
free_page_pinner(page, order);
return false;
}
/*
* Check tail pages before head page information is cleared to
* avoid checking PageCompound for order-0 pages.
*/
if (unlikely(order)) {
bool compound = PageCompound(page);
int i;
VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
if (compound)
ClearPageDoubleMap(page);
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_pages_check(page, page + i);
if (unlikely(check_free_page(page + i))) {
bad++;
continue;
}
(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
if (PageMappingFlags(page))
page->mapping = NULL;
if (memcg_kmem_enabled() && PageKmemcg(page))
__memcg_kmem_uncharge_page(page, order);
if (check_free)
bad += check_free_page(page);
if (bad)
return false;
page_cpupid_reset_last(page);
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
free_page_pinner(page, order);
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),
PAGE_SIZE << order);
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
kernel_poison_pages(page, 1 << order);
/*
* As memory initialization might be integrated into KASAN,
* kasan_free_pages and kernel_init_free_pages must be
* kept together to avoid discrepancies in behavior.
*
* With hardware tag-based KASAN, memory tags must be set before the
* page becomes unavailable via debug_pagealloc or arch_free_page.
*/
if (kasan_has_integrated_init()) {
if (!skip_kasan_poison)
kasan_free_pages(page, order);
} else {
bool init = want_init_on_free();
if (init)
kernel_init_free_pages(page, 1 << order, false);
if (!skip_kasan_poison)
kasan_poison_pages(page, order, init);
}
/*
* arch_free_page() can make the page's contents inaccessible. s390
* does this. So nothing which can access the page's contents should
* happen after this.
*/
arch_free_page(page, order);
debug_pagealloc_unmap_pages(page, 1 << order);
return true;
}
#ifdef CONFIG_DEBUG_VM
/*
* With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
* to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
* moved from pcp lists to free lists.
*/
static bool free_pcp_prepare(struct page *page)
{
return free_pages_prepare(page, 0, true, FPI_NONE);
}
static bool bulkfree_pcp_prepare(struct page *page)
{
if (debug_pagealloc_enabled_static())
return check_free_page(page);
else
return false;
}
#else
/*
* With DEBUG_VM disabled, order-0 pages being freed are checked only when
* moving from pcp lists to free list in order to reduce overhead. With
* debug_pagealloc enabled, they are checked also immediately when being freed
* to the pcp lists.
*/
static bool free_pcp_prepare(struct page *page)
{
if (debug_pagealloc_enabled_static())
return free_pages_prepare(page, 0, true, FPI_NONE);
else
return free_pages_prepare(page, 0, false, FPI_NONE);
}
static bool bulkfree_pcp_prepare(struct page *page)
{
return check_free_page(page);
}
#endif /* CONFIG_DEBUG_VM */
static inline void prefetch_buddy(struct page *page)
{
unsigned long pfn = page_to_pfn(page);
unsigned long buddy_pfn = __find_buddy_pfn(pfn, 0);
struct page *buddy = page + (buddy_pfn - pfn);
prefetch(buddy);
}
/*
* Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone, and of same order.
* count is the number of pages to free.
*
* If the zone was previously in an "all pages pinned" state then look to
* see if this freeing clears that state.
*
* And clear the zone's pages_scanned counter, to hold off the "all pages are
* pinned" detection logic.
*/
static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp)
{
int migratetype = 0;
int batch_free = 0;
int prefetch_nr = 0;
bool isolated_pageblocks;
struct page *page, *tmp;
LIST_HEAD(head);
/*
* Ensure proper count is passed which otherwise would stuck in the
* below while (list_empty(list)) loop.
*/
count = min(pcp->count, count);
while (count) {
struct list_head *list;
/*
* Remove pages from lists in a round-robin fashion. A
* batch_free count is maintained that is incremented when an
* empty list is encountered. This is so more pages are freed
* off fuller lists instead of spinning excessively around empty
* lists
*/
do {
batch_free++;
if (++migratetype == MIGRATE_PCPTYPES)
migratetype = 0;
list = &pcp->lists[migratetype];
} while (list_empty(list));
/* This is the only non-empty list. Free them all. */
if (batch_free == MIGRATE_PCPTYPES)
batch_free = count;
do {
page = list_last_entry(list, struct page, lru);
/* must delete to avoid corrupting pcp list */
list_del(&page->lru);
pcp->count--;
if (bulkfree_pcp_prepare(page))
continue;
list_add_tail(&page->lru, &head);
/*
* We are going to put the page back to the global
* pool, prefetch its buddy to speed up later access
* under zone->lock. It is believed the overhead of
* an additional test and calculating buddy_pfn here
* can be offset by reduced memory latency later. To
* avoid excessive prefetching due to large count, only
* prefetch buddy for the first pcp->batch nr of pages.
*/
if (prefetch_nr++ < pcp->batch)
prefetch_buddy(page);
} while (--count && --batch_free && !list_empty(list));
}
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
/*
* Use safe version since after __free_one_page(),
* page->lru.next will not point to original list.
*/
list_for_each_entry_safe(page, tmp, &head, lru) {
int mt = get_pcppage_migratetype(page);
/* MIGRATE_ISOLATE page should not go to pcplists */
VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
/* Pageblock could have been isolated meanwhile */
if (unlikely(isolated_pageblocks))
mt = get_pageblock_migratetype(page);
__free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
trace_mm_page_pcpu_drain(page, 0, mt);
}
spin_unlock(&zone->lock);
}
static void free_one_page(struct zone *zone,
struct page *page, unsigned long pfn,
unsigned int order,
int migratetype, fpi_t fpi_flags)
{
spin_lock(&zone->lock);
if (unlikely(has_isolate_pageblock(zone) ||
is_migrate_isolate(migratetype))) {
migratetype = get_pfnblock_migratetype(page, pfn);
}
__free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
spin_unlock(&zone->lock);
}
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
{
mm_zero_struct_page(page);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
page_cpupid_reset_last(page);
page_kasan_tag_reset(page);
INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
if (!is_highmem_idx(zone))
set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __meminit init_reserved_page(unsigned long pfn)
{
pg_data_t *pgdat;
int nid, zid;
if (!early_page_uninitialised(pfn))
return;
nid = early_pfn_to_nid(pfn);
pgdat = NODE_DATA(nid);
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
struct zone *zone = &pgdat->node_zones[zid];
if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
break;
}
__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
}
#else
static inline void init_reserved_page(unsigned long pfn)
{
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
/*
* Initialised pages do not have PageReserved set. This function is
* called for each range allocated by the bootmem allocator and
* marks the pages PageReserved. The remaining valid pages are later
* sent to the buddy page allocator.
*/
void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);
for (; start_pfn < end_pfn; start_pfn++) {
if (pfn_valid(start_pfn)) {
struct page *page = pfn_to_page(start_pfn);
init_reserved_page(start_pfn);
/* Avoid false-positive PageTail() */
INIT_LIST_HEAD(&page->lru);
/*
* no need for atomic set_bit because the struct
* page is not visible yet so nobody should
* access it yet.
*/
__SetPageReserved(page);
}
}
}
static void __free_pages_ok(struct page *page, unsigned int order,
fpi_t fpi_flags)
{
unsigned long flags;
int migratetype;
unsigned long pfn = page_to_pfn(page);
if (!free_pages_prepare(page, order, true, fpi_flags))
return;
migratetype = get_pfnblock_migratetype(page, pfn);
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, pfn, order, migratetype,
fpi_flags);
local_irq_restore(flags);
}
void __free_pages_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
unsigned int loop;
/*
* When initializing the memmap, __init_single_page() sets the refcount
* of all pages to 1 ("allocated"/"not free"). We have to set the
* refcount of all involved pages to 0.
*/
prefetchw(p);
for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
prefetchw(p + 1);
__ClearPageReserved(p);
set_page_count(p, 0);
}
__ClearPageReserved(p);
set_page_count(p, 0);
atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
/*
* Bypass PCP and place fresh pages right to the tail, primarily
* relevant for memory onlining.
*/
__free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON);
}
#ifdef CONFIG_NEED_MULTIPLE_NODES
static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
/*
* Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
*/
int __meminit __early_pfn_to_nid(unsigned long pfn,
struct mminit_pfnnid_cache *state)
{
unsigned long start_pfn, end_pfn;
int nid;
if (state->last_start <= pfn && pfn < state->last_end)
return state->last_nid;
nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
if (nid != NUMA_NO_NODE) {
state->last_start = start_pfn;
state->last_end = end_pfn;
state->last_nid = nid;
}
return nid;
}
#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
int __meminit early_pfn_to_nid(unsigned long pfn)
{
static DEFINE_SPINLOCK(early_pfn_lock);
int nid;
spin_lock(&early_pfn_lock);
nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
if (nid < 0)
nid = first_online_node;
spin_unlock(&early_pfn_lock);
return nid;
}
#endif /* CONFIG_NEED_MULTIPLE_NODES */
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
{
if (early_page_uninitialised(pfn))
return;
__free_pages_core(page, order);
}
/*
* Check that the whole (or subset of) a pageblock given by the interval of
* [start_pfn, end_pfn) is valid and within the same zone, before scanning it
* with the migration of free compaction scanner. The scanners then need to
* use only pfn_valid_within() check for arches that allow holes within
* pageblocks.
*
* Return struct page pointer of start_pfn, or NULL if checks were not passed.
*
* It's possible on some configurations to have a setup like node0 node1 node0
* i.e. it's possible that all pages within a zones range of pages do not
* belong to a single zone. We assume that a border between node0 and node1
* can occur within a single pageblock, but not a node0 node1 node0
* interleaving within a single pageblock. It is therefore sufficient to check
* the first and last page of a pageblock and avoid checking each individual
* page in a pageblock.
*/
struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
unsigned long end_pfn, struct zone *zone)
{
struct page *start_page;
struct page *end_page;
/* end_pfn is one past the range we are checking */
end_pfn--;
if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
return NULL;
start_page = pfn_to_online_page(start_pfn);
if (!start_page)
return NULL;
if (page_zone(start_page) != zone)
return NULL;
end_page = pfn_to_page(end_pfn);
/* This gives a shorter code than deriving page_zone(end_page) */
if (page_zone_id(start_page) != page_zone_id(end_page))
return NULL;
return start_page;
}
void set_zone_contiguous(struct zone *zone)
{
unsigned long block_start_pfn = zone->zone_start_pfn;
unsigned long block_end_pfn;
block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
for (; block_start_pfn < zone_end_pfn(zone);
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
if (!__pageblock_pfn_to_page(block_start_pfn,
block_end_pfn, zone))
return;
cond_resched();
}
/* We confirm that there is no hole */
zone->contiguous = true;
}
void clear_zone_contiguous(struct zone *zone)
{
zone->contiguous = false;
}
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static void __init deferred_free_range(unsigned long pfn,
unsigned long nr_pages)
{
struct page *page;
unsigned long i;
if (!nr_pages)
return;
page = pfn_to_page(pfn);
/* Free a large naturally-aligned chunk if possible */
if (nr_pages == pageblock_nr_pages &&
(pfn & (pageblock_nr_pages - 1)) == 0) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
__free_pages_core(page, pageblock_order);
return;
}
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if ((pfn & (pageblock_nr_pages - 1)) == 0)
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
__free_pages_core(page, 0);
}
}
/* Completion tracking for deferred_init_memmap() threads */
static atomic_t pgdat_init_n_undone __initdata;
static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
static inline void __init pgdat_init_report_one_done(void)
{
if (atomic_dec_and_test(&pgdat_init_n_undone))
complete(&pgdat_init_all_done_comp);
}
/*
* Returns true if page needs to be initialized or freed to buddy allocator.
*
* First we check if pfn is valid on architectures where it is possible to have
* holes within pageblock_nr_pages. On systems where it is not possible, this
* function is optimized out.
*
* Then, we check if a current large page is valid by only checking the validity
* of the head pfn.
*/
static inline bool __init deferred_pfn_valid(unsigned long pfn)
{
if (!pfn_valid_within(pfn))
return false;
if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
return false;
return true;
}
/*
* Free pages to buddy allocator. Try to free aligned pages in
* pageblock_nr_pages sizes.
*/
static void __init deferred_free_pages(unsigned long pfn,
unsigned long end_pfn)
{
unsigned long nr_pgmask = pageblock_nr_pages - 1;
unsigned long nr_free = 0;
for (; pfn < end_pfn; pfn++) {
if (!deferred_pfn_valid(pfn)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 0;
} else if (!(pfn & nr_pgmask)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 1;
} else {
nr_free++;
}
}
/* Free the last block of pages to allocator */
deferred_free_range(pfn - nr_free, nr_free);
}
/*
* Initialize struct pages. We minimize pfn page lookups and scheduler checks
* by performing it only once every pageblock_nr_pages.
* Return number of pages initialized.
*/
static unsigned long __init deferred_init_pages(struct zone *zone,
unsigned long pfn,
unsigned long end_pfn)
{
unsigned long nr_pgmask = pageblock_nr_pages - 1;
int nid = zone_to_nid(zone);
unsigned long nr_pages = 0;
int zid = zone_idx(zone);
struct page *page = NULL;
for (; pfn < end_pfn; pfn++) {
if (!deferred_pfn_valid(pfn)) {
page = NULL;
continue;
} else if (!page || !(pfn & nr_pgmask)) {
page = pfn_to_page(pfn);
} else {
page++;
}
__init_single_page(page, pfn, zid, nid);
nr_pages++;
}
return (nr_pages);
}
/*
* This function is meant to pre-load the iterator for the zone init.
* Specifically it walks through the ranges until we are caught up to the
* first_init_pfn value and exits there. If we never encounter the value we
* return false indicating there are no valid ranges left.
*/
static bool __init
deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
unsigned long *spfn, unsigned long *epfn,
unsigned long first_init_pfn)
{
u64 j;
/*
* Start out by walking through the ranges in this zone that have
* already been initialized. We don't need to do anything with them
* so we just need to flush them out of the system.
*/
for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
if (*epfn <= first_init_pfn)
continue;
if (*spfn < first_init_pfn)
*spfn = first_init_pfn;
*i = j;
return true;
}
return false;
}
/*
* Initialize and free pages. We do it in two loops: first we initialize
* struct page, then free to buddy allocator, because while we are
* freeing pages we can access pages that are ahead (computing buddy
* page in __free_one_page()).
*
* In order to try and keep some memory in the cache we have the loop
* broken along max page order boundaries. This way we will not cause
* any issues with the buddy page computation.
*/
static unsigned long __init
deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
unsigned long *end_pfn)
{
unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
unsigned long spfn = *start_pfn, epfn = *end_pfn;
unsigned long nr_pages = 0;
u64 j = *i;
/* First we loop through and initialize the page values */
for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
unsigned long t;
if (mo_pfn <= *start_pfn)
break;
t = min(mo_pfn, *end_pfn);
nr_pages += deferred_init_pages(zone, *start_pfn, t);
if (mo_pfn < *end_pfn) {
*start_pfn = mo_pfn;
break;
}
}
/* Reset values and now loop through freeing pages as needed */
swap(j, *i);
for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
unsigned long t;
if (mo_pfn <= spfn)
break;
t = min(mo_pfn, epfn);
deferred_free_pages(spfn, t);
if (mo_pfn <= epfn)
break;
}
return nr_pages;
}
static void __init
deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
void *arg)
{
unsigned long spfn, epfn;
struct zone *zone = arg;
u64 i;
deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
/*
* Initialize and free pages in MAX_ORDER sized increments so that we
* can avoid introducing any issues with the buddy allocator.
*/
while (spfn < end_pfn) {
deferred_init_maxorder(&i, zone, &spfn, &epfn);
cond_resched();
}
}
/* An arch may override for more concurrency. */
__weak int __init
deferred_page_init_max_threads(const struct cpumask *node_cpumask)
{
return 1;
}
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
unsigned long spfn = 0, epfn = 0;
unsigned long first_init_pfn, flags;
unsigned long start = jiffies;
struct zone *zone;
int zid, max_threads;
u64 i;
/* Bind memory initialisation thread to a local node if possible */
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(current, cpumask);
pgdat_resize_lock(pgdat, &flags);
first_init_pfn = pgdat->first_deferred_pfn;
if (first_init_pfn == ULONG_MAX) {
pgdat_resize_unlock(pgdat, &flags);
pgdat_init_report_one_done();
return 0;
}
/* Sanity check boundaries */
BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
pgdat->first_deferred_pfn = ULONG_MAX;
/*
* Once we unlock here, the zone cannot be grown anymore, thus if an
* interrupt thread must allocate this early in boot, zone must be
* pre-grown prior to start of deferred page initialization.
*/
pgdat_resize_unlock(pgdat, &flags);
/* Only the highest zone is deferred so find it */
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
zone = pgdat->node_zones + zid;
if (first_init_pfn < zone_end_pfn(zone))
break;
}
/* If the zone is empty somebody else may have cleared out the zone */
if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
first_init_pfn))
goto zone_empty;
max_threads = deferred_page_init_max_threads(cpumask);
while (spfn < epfn) {
unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
struct padata_mt_job job = {
.thread_fn = deferred_init_memmap_chunk,
.fn_arg = zone,
.start = spfn,
.size = epfn_align - spfn,
.align = PAGES_PER_SECTION,
.min_chunk = PAGES_PER_SECTION,
.max_threads = max_threads,
};
padata_do_multithreaded(&job);
deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
epfn_align);
}
zone_empty:
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
pr_info("node %d deferred pages initialised in %ums\n",
pgdat->node_id, jiffies_to_msecs(jiffies - start));
pgdat_init_report_one_done();
return 0;
}
/*
* If this zone has deferred pages, try to grow it by initializing enough
* deferred pages to satisfy the allocation specified by order, rounded up to
* the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
* of SECTION_SIZE bytes by initializing struct pages in increments of
* PAGES_PER_SECTION * sizeof(struct page) bytes.
*
* Return true when zone was grown, otherwise return false. We return true even
* when we grow less than requested, to let the caller decide if there are
* enough pages to satisfy the allocation.
*
* Note: We use noinline because this function is needed only during boot, and
* it is called from a __ref function _deferred_grow_zone. This way we are
* making sure that it is not inlined into permanent text section.
*/
static noinline bool __init
deferred_grow_zone(struct zone *zone, unsigned int order)
{
unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
pg_data_t *pgdat = zone->zone_pgdat;
unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
unsigned long spfn, epfn, flags;
unsigned long nr_pages = 0;
u64 i;
/* Only the last zone may have deferred pages */
if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat))
return false;
pgdat_resize_lock(pgdat, &flags);
/*
* If someone grew this zone while we were waiting for spinlock, return
* true, as there might be enough pages already.
*/
if (first_deferred_pfn != pgdat->first_deferred_pfn) {
pgdat_resize_unlock(pgdat, &flags);
return true;
}
/* If the zone is empty somebody else may have cleared out the zone */
if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
first_deferred_pfn)) {
pgdat->first_deferred_pfn = ULONG_MAX;
pgdat_resize_unlock(pgdat, &flags);
/* Retry only once. */
return first_deferred_pfn != ULONG_MAX;
}
/*
* Initialize and free pages in MAX_ORDER sized increments so
* that we can avoid introducing any issues with the buddy
* allocator.
*/
while (spfn < epfn) {
/* update our first deferred PFN for this section */
first_deferred_pfn = spfn;
nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
touch_nmi_watchdog();
/* We should only stop along section boundaries */
if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
continue;
/* If our quota has been met we can stop here */
if (nr_pages >= nr_pages_needed)
break;
}
pgdat->first_deferred_pfn = spfn;
pgdat_resize_unlock(pgdat, &flags);
return nr_pages > 0;
}
/*
* deferred_grow_zone() is __init, but it is called from
* get_page_from_freelist() during early boot until deferred_pages permanently
* disables this call. This is why we have refdata wrapper to avoid warning,
* and to ensure that the function body gets unloaded.
*/
static bool __ref
_deferred_grow_zone(struct zone *zone, unsigned int order)
{
return deferred_grow_zone(zone, order);
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
void __init page_alloc_init_late(void)
{
struct zone *zone;
int nid;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* There will be num_node_state(N_MEMORY) threads */
atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
for_each_node_state(nid, N_MEMORY) {
kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
}
/* Block until all are initialised */
wait_for_completion(&pgdat_init_all_done_comp);
/*
* The number of managed pages has changed due to the initialisation
* so the pcpu batch and high limits needs to be updated or the limits
* will be artificially small.
*/
for_each_populated_zone(zone)
zone_pcp_update(zone);
/*
* We initialized the rest of the deferred pages. Permanently disable
* on-demand struct page initialization.
*/
static_branch_disable(&deferred_pages);
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
#endif
/* Discard memblock private memory */
memblock_discard();
for_each_node_state(nid, N_MEMORY)
shuffle_free_memory(NODE_DATA(nid));
for_each_populated_zone(zone)
set_zone_contiguous(zone);
}
#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
{
unsigned i = pageblock_nr_pages;
struct page *p = page;
do {
__ClearPageReserved(p);
set_page_count(p, 0);
} while (++p, --i);
set_pageblock_migratetype(page, MIGRATE_CMA);
if (pageblock_order >= MAX_ORDER) {
i = pageblock_nr_pages;
p = page;
do {
set_page_refcounted(p);
__free_pages(p, MAX_ORDER - 1);
p += MAX_ORDER_NR_PAGES;
} while (i -= MAX_ORDER_NR_PAGES);
} else {
set_page_refcounted(page);
__free_pages(page, pageblock_order);
}
adjust_managed_page_count(page, pageblock_nr_pages);
page_zone(page)->cma_pages += pageblock_nr_pages;
}
#endif
/*
* The order of subdivision here is critical for the IO subsystem.
* Please do not alter this order without good reasons and regression
* testing. Specifically, as large blocks of memory are subdivided,
* the order in which smaller blocks are delivered depends on the order
* they're subdivided in this function. This is the primary factor
* influencing the order in which pages are delivered to the IO
* subsystem according to empirical testing, and this is also justified
* by considering the behavior of a buddy system containing a single
* large block of memory acted on by a series of small allocations.
* This behavior is a critical factor in sglist merging's success.
*
* -- nyc
*/
static inline void expand(struct zone *zone, struct page *page,
int low, int high, int migratetype)
{
unsigned long size = 1 << high;
while (high > low) {
high--;
size >>= 1;
VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
/*
* Mark as guard pages (or page), that will allow to
* merge back to allocator when buddy will be freed.
* Corresponding page table entries will not be touched,
* pages will stay not present in virtual address space
*/
if (set_page_guard(zone, &page[size], high, migratetype))
continue;
add_to_free_list(&page[size], zone, high, migratetype);
set_buddy_order(&page[size], high);
}
}
static void check_new_page_bad(struct page *page)
{
if (unlikely(page->flags & __PG_HWPOISON)) {
/* Don't complain about hwpoisoned pages */
page_mapcount_reset(page); /* remove PageBuddy */
return;
}
bad_page(page,
page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
}
/*
* This page is about to be returned from the page allocator
*/
static inline int check_new_page(struct page *page)
{
if (likely(page_expected_state(page,
PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
return 0;
check_new_page_bad(page);
return 1;
}
#ifdef CONFIG_DEBUG_VM
/*
* With DEBUG_VM enabled, order-0 pages are checked for expected state when
* being allocated from pcp lists. With debug_pagealloc also enabled, they are
* also checked when pcp lists are refilled from the free lists.
*/
static inline bool check_pcp_refill(struct page *page)
{
if (debug_pagealloc_enabled_static())
return check_new_page(page);
else
return false;
}
static inline bool check_new_pcp(struct page *page)
{
return check_new_page(page);
}
#else
/*
* With DEBUG_VM disabled, free order-0 pages are checked for expected state
* when pcp lists are being refilled from the free lists. With debug_pagealloc
* enabled, they are also checked when being allocated from the pcp lists.
*/
static inline bool check_pcp_refill(struct page *page)
{
return check_new_page(page);
}
static inline bool check_new_pcp(struct page *page)
{
if (debug_pagealloc_enabled_static())
return check_new_page(page);
else
return false;
}
#endif /* CONFIG_DEBUG_VM */
static bool check_new_pages(struct page *page, unsigned int order)
{
int i;
for (i = 0; i < (1 << order); i++) {
struct page *p = page + i;
if (unlikely(check_new_page(p)))
return true;
}
return false;
}
inline void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags)
{
set_page_private(page, 0);
set_page_refcounted(page);
arch_alloc_page(page, order);
debug_pagealloc_map_pages(page, 1 << order);
/*
* Page unpoisoning must happen before memory initialization.
* Otherwise, the poison pattern will be overwritten for __GFP_ZERO
* allocations and the page unpoisoning code will complain.
*/
kernel_unpoison_pages(page, 1 << order);
/*
* As memory initialization might be integrated into KASAN,
* kasan_alloc_pages and kernel_init_free_pages must be
* kept together to avoid discrepancies in behavior.
*/
if (kasan_has_integrated_init()) {
kasan_alloc_pages(page, order, gfp_flags);
} else {
bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags);
kasan_unpoison_pages(page, order, init);
if (init)
kernel_init_free_pages(page, 1 << order,
gfp_flags & __GFP_ZEROTAGS);
}
set_page_owner(page, order, gfp_flags);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
unsigned int alloc_flags)
{
post_alloc_hook(page, order, gfp_flags);
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
/*
* page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
* allocate the page. The expectation is that the caller is taking
* steps that will free more memory. The caller should avoid the page
* being used for !PFMEMALLOC purposes.
*/
if (alloc_flags & ALLOC_NO_WATERMARKS)
set_page_pfmemalloc(page);
else
clear_page_pfmemalloc(page);
}
/*
* Go through the free lists for the given migratetype and remove
* the smallest available page from the freelists
*/
static __always_inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
int migratetype)
{
unsigned int current_order;
struct free_area *area;
struct page *page;
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
del_page_from_free_list(page, zone, current_order);
expand(zone, page, order, current_order, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
}
return NULL;
}
/*
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
*/
static int fallbacks[MIGRATE_TYPES][3] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
[MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
#endif
#ifdef CONFIG_MEMORY_ISOLATION
[MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
#endif
};
#ifdef CONFIG_CMA
static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order)
{
return __rmqueue_smallest(zone, order, MIGRATE_CMA);
}
#else
static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
unsigned int order) { return NULL; }
#endif
/*
* Move the free pages in a range to the freelist tail of the requested type.
* Note that start_page and end_pages are not aligned on a pageblock
* boundary. If alignment is required, use move_freepages_block()
*/
static int move_freepages(struct zone *zone,
struct page *start_page, struct page *end_page,
int migratetype, int *num_movable)
{
struct page *page;
unsigned int order;
int pages_moved = 0;
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
}
if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
* migration are movable. But we don't actually try
* isolating, as that would be expensive.
*/
if (num_movable &&
(PageLRU(page) || __PageMovable(page)))
(*num_movable)++;
page++;
continue;
}
/* Make sure we are not inadvertently changing nodes */
VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
order = buddy_order(page);
move_to_free_list(page, zone, order, migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
return pages_moved;
}
int move_freepages_block(struct zone *zone, struct page *page,
int migratetype, int *num_movable)
{
unsigned long start_pfn, end_pfn;
struct page *start_page, *end_page;
if (num_movable)
*num_movable = 0;
start_pfn = page_to_pfn(page);
start_pfn = start_pfn & ~(pageblock_nr_pages-1);
start_page = pfn_to_page(start_pfn);
end_page = start_page + pageblock_nr_pages - 1;
end_pfn = start_pfn + pageblock_nr_pages - 1;
/* Do not cross zone boundaries */
if (!zone_spans_pfn(zone, start_pfn))
start_page = page;
if (!zone_spans_pfn(zone, end_pfn))
return 0;
return move_freepages(zone, start_page, end_page, migratetype,
num_movable);
}
static void change_pageblock_range(struct page *pageblock_page,
int start_order, int migratetype)
{
int nr_pageblocks = 1 << (start_order - pageblock_order);
while (nr_pageblocks--) {
set_pageblock_migratetype(pageblock_page, migratetype);
pageblock_page += pageblock_nr_pages;
}
}
/*
* When we are falling back to another migratetype during allocation, try to
* steal extra free pages from the same pageblocks to satisfy further
* allocations, instead of polluting multiple pageblocks.
*
* If we are stealing a relatively large buddy page, it is likely there will
* be more free pages in the pageblock, so try to steal them all. For
* reclaimable and unmovable allocations, we steal regardless of page size,
* as fragmentation caused by those allocations polluting movable pageblocks
* is worse than movable allocations stealing from unmovable and reclaimable
* pageblocks.
*/
static bool can_steal_fallback(unsigned int order, int start_mt)
{
/*
* Leaving this order check is intended, although there is
* relaxed order check in next check. The reason is that
* we can actually steal whole pageblock if this condition met,
* but, below check doesn't guarantee it and that is just heuristic
* so could be changed anytime.
*/
if (order >= pageblock_order)
return true;
if (order >= pageblock_order / 2 ||
start_mt == MIGRATE_RECLAIMABLE ||
start_mt == MIGRATE_UNMOVABLE ||
page_group_by_mobility_disabled)
return true;
return false;
}
static inline bool boost_watermark(struct zone *zone)
{
unsigned long max_boost;
if (!watermark_boost_factor)
return false;
/*
* Don't bother in zones that are unlikely to produce results.
* On small machines, including kdump capture kernels running
* in a small area, boosting the watermark can cause an out of
* memory situation immediately.
*/
if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
return false;
max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
watermark_boost_factor, 10000);
/*
* high watermark may be uninitialised if fragmentation occurs
* very early in boot so do not boost. We do not fall
* through and boost by pageblock_nr_pages as failing
* allocations that early means that reclaim is not going
* to help and it may even be impossible to reclaim the
* boosted watermark resulting in a hang.
*/
if (!max_boost)
return false;
max_boost = max(pageblock_nr_pages, max_boost);
zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
max_boost);
return true;
}
/*
* This function implements actual steal behaviour. If order is large enough,
* we can steal whole pageblock. If not, we first move freepages in this
* pageblock to our migratetype and determine how many already-allocated pages
* are there in the pageblock with a compatible migratetype. If at least half
* of pages are free or compatible, we can change migratetype of the pageblock
* itself, so pages freed in the future will be put on the correct free list.
*/
static void steal_suitable_fallback(struct zone *zone, struct page *page,
unsigned int alloc_flags, int start_type, bool whole_block)
{
unsigned int current_order = buddy_order(page);
int free_pages, movable_pages, alike_pages;
int old_block_type;
old_block_type = get_pageblock_migratetype(page);
/*
* This can happen due to races and we want to prevent broken
* highatomic accounting.
*/
if (is_migrate_highatomic(old_block_type))
goto single_page;
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
change_pageblock_range(page, current_order, start_type);
goto single_page;
}
/*
* Boost watermarks to increase reclaim pressure to reduce the
* likelihood of future fallbacks. Wake kswapd now as the node
* may be balanced overall and kswapd will not wake naturally.
*/
if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
/* We are not allowed to try stealing from the whole block */
if (!whole_block)
goto single_page;
free_pages = move_freepages_block(zone, page, start_type,
&movable_pages);
/*
* Determine how many pages are compatible with our allocation.
* For movable allocation, it's the number of movable pages which
* we just obtained. For other types it's a bit more tricky.
*/
if (start_type == MIGRATE_MOVABLE) {
alike_pages = movable_pages;
} else {
/*
* If we are falling back a RECLAIMABLE or UNMOVABLE allocation
* to MOVABLE pageblock, consider all non-movable pages as
* compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
* vice versa, be conservative since we can't distinguish the
* exact migratetype of non-movable pages.
*/
if (old_block_type == MIGRATE_MOVABLE)
alike_pages = pageblock_nr_pages
- (free_pages + movable_pages);
else
alike_pages = 0;
}
/* moving whole block can fail due to zone boundary conditions */
if (!free_pages)
goto single_page;
/*
* If a sufficient number of pages in the block are either free or of
* comparable migratability as our allocation, claim the whole block.
*/
if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled)
set_pageblock_migratetype(page, start_type);
return;
single_page:
move_to_free_list(page, zone, current_order, start_type);
}
/*
* Check whether there is a suitable fallback freepage with requested order.
* If only_stealable is true, this function returns fallback_mt only if
* we can steal other freepages all together. This would help to reduce
* fragmentation due to mixed migratetype pages in one pageblock.
*/
int find_suitable_fallback(struct free_area *area, unsigned int order,
int migratetype, bool only_stealable, bool *can_steal)
{
int i;
int fallback_mt;
if (area->nr_free == 0)
return -1;
*can_steal = false;
for (i = 0;; i++) {
fallback_mt = fallbacks[migratetype][i];
if (fallback_mt == MIGRATE_TYPES)
break;
if (free_area_empty(area, fallback_mt))
continue;
if (can_steal_fallback(order, migratetype))
*can_steal = true;
if (!only_stealable)
return fallback_mt;
if (*can_steal)
return fallback_mt;
}
return -1;
}
/*
* Reserve a pageblock for exclusive use of high-order atomic allocations if
* there are no empty page blocks that contain a page with a suitable order
*/
static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
unsigned int alloc_order)
{
int mt;
unsigned long max_managed, flags;
/*
* Limit the number reserved to 1 pageblock or roughly 1% of a zone.
* Check is race-prone but harmless.
*/
max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
if (zone->nr_reserved_highatomic >= max_managed)
return;
spin_lock_irqsave(&zone->lock, flags);
/* Recheck the nr_reserved_highatomic limit under the lock */
if (zone->nr_reserved_highatomic >= max_managed)
goto out_unlock;
/* Yoink! */
mt = get_pageblock_migratetype(page);
if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
&& !is_migrate_cma(mt)) {
zone->nr_reserved_highatomic += pageblock_nr_pages;
set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
}
out_unlock:
spin_unlock_irqrestore(&zone->lock, flags);
}
/*
* Used when an allocation is about to fail under memory pressure. This
* potentially hurts the reliability of high-order allocations when under
* intense memory pressure but failed atomic allocations should be easier
* to recover from than an OOM.
*
* If @force is true, try to unreserve a pageblock even though highatomic
* pageblock is exhausted.
*/
static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
bool force)
{
struct zonelist *zonelist = ac->zonelist;
unsigned long flags;
struct zoneref *z;
struct zone *zone;
struct page *page;
int order;
bool ret;
for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
ac->nodemask) {
/*
* Preserve at least one pageblock unless memory pressure
* is really high.
*/
if (!force && zone->nr_reserved_highatomic <=
pageblock_nr_pages)
continue;
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
struct free_area *area = &(zone->free_area[order]);
page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
if (!page)
continue;
/*
* In page freeing path, migratetype change is racy so
* we can counter several free pages in a pageblock
* in this loop althoug we changed the pageblock type
* from highatomic to ac->migratetype. So we should
* adjust the count once.
*/
if (is_migrate_highatomic_page(page)) {
/*
* It should never happen but changes to
* locking could inadvertently allow a per-cpu
* drain to add pages to MIGRATE_HIGHATOMIC
* while unreserving so be safe and watch for
* underflows.
*/
zone->nr_reserved_highatomic -= min(
pageblock_nr_pages,
zone->nr_reserved_highatomic);
}
/*
* Convert to ac->migratetype and avoid the normal
* pageblock stealing heuristics. Minimally, the caller
* is doing the work and needs the pages. More
* importantly, if the block was always converted to
* MIGRATE_UNMOVABLE or another type then the number
* of pageblocks that cannot be completely freed
* may increase.
*/
set_pageblock_migratetype(page, ac->migratetype);
ret = move_freepages_block(zone, page, ac->migratetype,
NULL);
if (ret) {
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
}
}
spin_unlock_irqrestore(&zone->lock, flags);
}
return false;
}
/*
* Try finding a free buddy page on the fallback list and put it on the free
* list of requested migratetype, possibly along with other pages from the same
* block, depending on fragmentation avoidance heuristics. Returns true if
* fallback was found so that __rmqueue_smallest() can grab it.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
*/
static __always_inline bool
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
int current_order;
int min_order = order;
struct page *page;
int fallback_mt;
bool can_steal;
/*
* Do not steal pages from freelists belonging to other pageblocks
* i.e. orders < pageblock_order. If there are no local zones free,
* the zonelists will be reiterated without ALLOC_NOFRAGMENT.
*/
if (alloc_flags & ALLOC_NOFRAGMENT)
min_order = pageblock_order;
/*
* Find the largest available free page in the other list. This roughly
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
for (current_order = MAX_ORDER - 1; current_order >= min_order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt == -1)
continue;
/*
* We cannot steal all free pages from the pageblock and the
* requested migratetype is movable. In that case it's better to
* steal and split the smallest available page instead of the
* largest available page, because even if the next movable
* allocation falls back into a different pageblock than this
* one, it won't cause permanent fragmentation.
*/
if (!can_steal && start_migratetype == MIGRATE_MOVABLE
&& current_order > order)
goto find_smallest;
goto do_steal;
}
return false;
find_smallest:
for (current_order = order; current_order < MAX_ORDER;
current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
start_migratetype, false, &can_steal);
if (fallback_mt != -1)
break;
}
/*
* This should not happen - we already found a suitable fallback
* when looking for the largest page.
*/
VM_BUG_ON(current_order == MAX_ORDER);
do_steal:
page = get_page_from_free_area(area, fallback_mt);
steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
can_steal);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
return true;
}
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
unsigned int alloc_flags)
{
struct page *page;
retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype,
alloc_flags))
goto retry;
trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
#ifdef CONFIG_CMA
static struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
int migratetype,
unsigned int alloc_flags)
{
struct page *page = __rmqueue_cma_fallback(zone, order);
trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA);
return page;
}
#else
static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order,
int migratetype,
unsigned int alloc_flags)
{
return NULL;
}
#endif
/*
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
* Returns the number of new pages which were placed at *list.
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
int i, alloced = 0;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page;
if (is_migrate_cma(migratetype))
page = __rmqueue_cma(zone, order, migratetype,
alloc_flags);
else
page = __rmqueue(zone, order, migratetype, alloc_flags);
if (unlikely(page == NULL))
break;
if (unlikely(check_pcp_refill(page)))
continue;
/*
* Split buddy pages returned by expand() are received here in
* physical page order. The page is added to the tail of
* caller's list. From the callers perspective, the linked list
* is ordered by page number under some conditions. This is
* useful for IO devices that can forward direction from the
* head, thus also in the physical page order. This is useful
* for IO devices that can merge IO requests if the physical
* pages are ordered properly.
*/
list_add_tail(&page->lru, list);
alloced++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
-(1 << order));
}
/*
* i pages were removed from the buddy list even if some leak due
* to check_pcp_refill failing so adjust NR_FREE_PAGES based
* on i. Do not confuse with 'alloced' which is the number of
* pages added to the pcp list.
*/
__mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock(&zone->lock);
return alloced;
}
/*
* Return the pcp list that corresponds to the migrate type if that list isn't
* empty.
* If the list is empty return NULL.
*/
static struct list_head *get_populated_pcp_list(struct zone *zone,
unsigned int order, struct per_cpu_pages *pcp,
int migratetype, unsigned int alloc_flags)
{
struct list_head *list = &pcp->lists[migratetype];
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, order,
pcp->batch, list,
migratetype, alloc_flags);
if (list_empty(list))
list = NULL;
}
return list;
}
#ifdef CONFIG_NUMA
/*
* Called from the vmstat counter updater to drain pagesets of this
* currently executing processor on remote nodes after they have
* expired.
*
* Note that this function must be called with the thread pinned to
* a single processor.
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
unsigned long flags;
int to_drain, batch;
local_irq_save(flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
if (to_drain > 0)
free_pcppages_bulk(zone, to_drain, pcp);
local_irq_restore(flags);
}
#endif
/*
* Drain pcplists of the indicated processor and zone.
*
* The processor must either be the current processor and the
* thread pinned to the current processor or a processor that
* is not online.
*/
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
unsigned long flags;
struct per_cpu_pageset *pset;
struct per_cpu_pages *pcp;
local_irq_save(flags);
pset = per_cpu_ptr(zone->pageset, cpu);
pcp = &pset->pcp;
if (pcp->count)
free_pcppages_bulk(zone, pcp->count, pcp);
local_irq_restore(flags);
}
/*
* Drain pcplists of all zones on the indicated processor.
*
* The processor must either be the current processor and the
* thread pinned to the current processor or a processor that
* is not online.
*/
static void drain_pages(unsigned int cpu)
{
struct zone *zone;
for_each_populated_zone(zone) {
drain_pages_zone(cpu, zone);
}
}
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
*
* The CPU has to be pinned. When zone parameter is non-NULL, spill just
* the single zone's pages.
*/
void drain_local_pages(struct zone *zone)
{
int cpu = smp_processor_id();
if (zone)
drain_pages_zone(cpu, zone);
else
drain_pages(cpu);
}
static void drain_local_pages_wq(struct work_struct *work)
{
struct pcpu_drain *drain;
drain = container_of(work, struct pcpu_drain, work);
/*
* drain_all_pages doesn't use proper cpu hotplug protection so
* we can race with cpu offline when the WQ can move this from
* a cpu pinned worker to an unbound one. We can operate on a different
* cpu which is allright but we also have to make sure to not move to
* a different one.
*/
preempt_disable();
drain_local_pages(drain->zone);
preempt_enable();
}
/*
* Spill all the per-cpu pages from all CPUs back into the buddy allocator.
*
* When zone parameter is non-NULL, spill just the single zone's pages.
*
* Note that this can be extremely slow as the draining happens in a workqueue.
*/
void drain_all_pages(struct zone *zone)
{
int cpu;
/*
* Allocate in the BSS so we wont require allocation in
* direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
*/
static cpumask_t cpus_with_pcps;
/*
* Make sure nobody triggers this path before mm_percpu_wq is fully
* initialized.
*/
if (WARN_ON_ONCE(!mm_percpu_wq))
return;
/*
* Do not drain if one is already in progress unless it's specific to
* a zone. Such callers are primarily CMA and memory hotplug and need
* the drain to be complete when the call returns.
*/
if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
if (!zone)
return;
mutex_lock(&pcpu_drain_mutex);
}
/*
* We don't care about racing with CPU hotplug event
* as offline notification will cause the notified
* cpu to drain that CPU pcps and on_each_cpu_mask
* disables preemption as part of its processing
*/
for_each_online_cpu(cpu) {
struct per_cpu_pageset *pcp;
struct zone *z;
bool has_pcps = false;
if (zone) {
pcp = per_cpu_ptr(zone->pageset, cpu);
if (pcp->pcp.count)
has_pcps = true;
} else {
for_each_populated_zone(z) {
pcp = per_cpu_ptr(z->pageset, cpu);
if (pcp->pcp.count) {
has_pcps = true;
break;
}
}
}
if (has_pcps)
cpumask_set_cpu(cpu, &cpus_with_pcps);
else
cpumask_clear_cpu(cpu, &cpus_with_pcps);
}
for_each_cpu(cpu, &cpus_with_pcps) {
struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
drain->zone = zone;
INIT_WORK(&drain->work, drain_local_pages_wq);
queue_work_on(cpu, mm_percpu_wq, &drain->work);
}
for_each_cpu(cpu, &cpus_with_pcps)
flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
mutex_unlock(&pcpu_drain_mutex);
}
#ifdef CONFIG_HIBERNATION
/*
* Touch the watchdog for every WD_PAGE_COUNT pages.
*/
#define WD_PAGE_COUNT (128*1024)
void mark_free_pages(struct zone *zone)
{
unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
unsigned long flags;
unsigned int order, t;
struct page *page;
if (zone_is_empty(zone))
return;
spin_lock_irqsave(&zone->lock, flags);
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
if (!--page_count) {
touch_nmi_watchdog();
page_count = WD_PAGE_COUNT;
}
if (page_zone(page) != zone)
continue;
if (!swsusp_page_is_forbidden(page))
swsusp_unset_page_free(page);
}
for_each_migratetype_order(order, t) {
list_for_each_entry(page,
&zone->free_area[order].free_list[t], lru) {
unsigned long i;
pfn = page_to_pfn(page);
for (i = 0; i < (1UL << order); i++) {
if (!--page_count) {
touch_nmi_watchdog();
page_count = WD_PAGE_COUNT;
}
swsusp_set_page_free(pfn_to_page(pfn + i));
}
}
}
spin_unlock_irqrestore(&zone->lock, flags);
}
#endif /* CONFIG_PM */
static bool free_unref_page_prepare(struct page *