| From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 |
| From: Rik van Riel <riel@redhat.com> |
| Date: Thu, 1 Sep 2011 15:26:50 -0400 |
| Subject: ANDROID: add extra free kbytes tunable |
| |
| Add a userspace visible knob to tell the VM to keep an extra amount |
| of memory free, by increasing the gap between each zone's min and |
| low watermarks. |
| |
| This is useful for realtime applications that make system |
| calls and have a bound on the number of allocations that happen |
| in any short time period. In this application, extra_free_kbytes |
| would be left at an amount equal to or larger than the |
| maximum number of allocations that happen in any burst. |
| |
| It may also be useful to reduce the memory use of virtual |
| machines (temporarily?), in a way that does not cause memory |
| fragmentation like ballooning does. |
| |
| [ccross] |
| Revived for use on old kernels where no other solution exists. |
| The tunable will be removed on kernels that do better at avoiding |
| direct reclaim. |
| |
| [surenb] |
| Will be reverted as soon as Android framework is reworked to |
| use upstream-supported watermark_scale_factor instead of |
| extra_free_kbytes. |
| |
| Bug: 86445363 |
| Bug: 109664768 |
| Bug: 120445732 |
| Change-Id: I765a42be8e964bfd3e2886d1ca85a29d60c3bb3e |
| Signed-off-by: Rik van Riel <riel@redhat.com> |
| Signed-off-by: Colin Cross <ccross@android.com> |
| Signed-off-by: Suren Baghdasaryan <surenb@google.com> |
| --- |
| Documentation/admin-guide/sysctl/vm.rst | 16 ++++++++++++++++ |
| kernel/sysctl.c | 9 +++++++++ |
| mm/page_alloc.c | 25 +++++++++++++++++++++---- |
| 3 files changed, 46 insertions(+), 4 deletions(-) |
| |
| diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst |
| index 64aeee1009ca..9e8470008227 100644 |
| --- a/Documentation/admin-guide/sysctl/vm.rst |
| +++ b/Documentation/admin-guide/sysctl/vm.rst |
| @@ -37,6 +37,7 @@ Currently, these files are in /proc/sys/vm: |
| - dirty_writeback_centisecs |
| - drop_caches |
| - extfrag_threshold |
| +- extra_free_kbytes |
| - hugetlb_shm_group |
| - laptop_mode |
| - legacy_va_layout |
| @@ -287,6 +288,21 @@ only use the low memory and they can fill it up with dirty data without |
| any throttling. |
| |
| |
| +extra_free_kbytes |
| +================= |
| + |
| +This parameter tells the VM to keep extra free memory between the threshold |
| +where background reclaim (kswapd) kicks in, and the threshold where direct |
| +reclaim (by allocating processes) kicks in. |
| + |
| +This is useful for workloads that require low latency memory allocations |
| +and have a bounded burstiness in memory allocations. For example, a |
| +realtime application that receives and transmits network traffic |
| +(causing in-kernel memory allocations) with a maximum total message burst |
| +size of 200MB may need 200MB of extra free memory to avoid direct reclaim |
| +related latencies. |
| + |
| + |
| hugetlb_shm_group |
| ================= |
| |
| diff --git a/kernel/sysctl.c b/kernel/sysctl.c |
| index 70665934d53e..93be3f131298 100644 |
| --- a/kernel/sysctl.c |
| +++ b/kernel/sysctl.c |
| @@ -111,6 +111,7 @@ extern char core_pattern[]; |
| extern unsigned int core_pipe_limit; |
| #endif |
| extern int pid_max; |
| +extern int extra_free_kbytes; |
| extern int pid_max_min, pid_max_max; |
| extern int percpu_pagelist_fraction; |
| extern int latencytop_enabled; |
| @@ -1524,6 +1525,14 @@ static struct ctl_table vm_table[] = { |
| .extra1 = SYSCTL_ONE, |
| .extra2 = &one_thousand, |
| }, |
| + { |
| + .procname = "extra_free_kbytes", |
| + .data = &extra_free_kbytes, |
| + .maxlen = sizeof(extra_free_kbytes), |
| + .mode = 0644, |
| + .proc_handler = min_free_kbytes_sysctl_handler, |
| + .extra1 = SYSCTL_ZERO, |
| + }, |
| { |
| .procname = "percpu_pagelist_fraction", |
| .data = &percpu_pagelist_fraction, |
| diff --git a/mm/page_alloc.c b/mm/page_alloc.c |
| index f391c0c4ed1d..c41f10295f57 100644 |
| --- a/mm/page_alloc.c |
| +++ b/mm/page_alloc.c |
| @@ -313,6 +313,11 @@ compound_page_dtor * const compound_page_dtors[] = { |
| #endif |
| }; |
| |
| +/* |
| + * Try to keep at least this much lowmem free. Do not allow normal |
| + * allocations below this point, only high priority ones. Automatically |
| + * tuned according to the amount of memory in the system. |
| + */ |
| int min_free_kbytes = 1024; |
| int user_min_free_kbytes = -1; |
| #ifdef CONFIG_DISCONTIGMEM |
| @@ -331,6 +336,13 @@ int watermark_boost_factor __read_mostly = 15000; |
| #endif |
| int watermark_scale_factor = 10; |
| |
| +/* |
| + * Extra memory for the system to try freeing. Used to temporarily |
| + * free memory, to make space for new workloads. Anyone can allocate |
| + * down to the min watermarks controlled by min_free_kbytes above. |
| + */ |
| +int extra_free_kbytes = 0; |
| + |
| static unsigned long nr_kernel_pages __initdata; |
| static unsigned long nr_all_pages __initdata; |
| static unsigned long dma_reserve __initdata; |
| @@ -7736,6 +7748,7 @@ static void setup_per_zone_lowmem_reserve(void) |
| static void __setup_per_zone_wmarks(void) |
| { |
| unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); |
| + unsigned long pages_low = extra_free_kbytes >> (PAGE_SHIFT - 10); |
| unsigned long lowmem_pages = 0; |
| struct zone *zone; |
| unsigned long flags; |
| @@ -7747,11 +7760,13 @@ static void __setup_per_zone_wmarks(void) |
| } |
| |
| for_each_zone(zone) { |
| - u64 tmp; |
| + u64 tmp, low; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| tmp = (u64)pages_min * zone_managed_pages(zone); |
| do_div(tmp, lowmem_pages); |
| + low = (u64)pages_low * zone_managed_pages(zone); |
| + do_div(low, vm_total_pages); |
| if (is_highmem(zone)) { |
| /* |
| * __GFP_HIGH and PF_MEMALLOC allocations usually don't |
| @@ -7784,8 +7799,10 @@ static void __setup_per_zone_wmarks(void) |
| mult_frac(zone_managed_pages(zone), |
| watermark_scale_factor, 10000)); |
| |
| - zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp; |
| - zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2; |
| + zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + |
| + low + tmp; |
| + zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + |
| + low + tmp * 2; |
| zone->watermark_boost = 0; |
| |
| spin_unlock_irqrestore(&zone->lock, flags); |
| @@ -7869,7 +7886,7 @@ core_initcall(init_per_zone_wmark_min) |
| /* |
| * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so |
| * that we can call two helper functions whenever min_free_kbytes |
| - * changes. |
| + * or extra_free_kbytes changes. |
| */ |
| int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, |
| void __user *buffer, size_t *length, loff_t *ppos) |