Tune jemalloc to rein in PSS.

The tcache in jemalloc can take up quite a bit of extra PSS. Disabling
the tcache can save a lot of PSS, but it radically reduces performance.

Tune the number of small and large objects to store in the tcache.
Immediately force any dirty pages to be purged, rather than keeping
some number of dirty pages around.

Restore the chunk size back to 4MB. Using this chunk size together with
forced purging of dirty pages results in a higher cf-bench native
mallocs score, with about the same amount of PSS use.

Limit the number of arenas to 2. The default is 2 * number of cpus, but
that increases the amount of PSS used. My benchmarking indicates that
more than 2 really doesn't help too much even on a device with 4 cpus.
Nearly all speed-ups come from the tcache.

Bug: 17498287

Change-Id: I23b23dd88288c90e002a0a04684fb06dbf4ee742
diff --git a/Android.mk b/Android.mk
index f9497cb..5acd474 100644
--- a/Android.mk
+++ b/Android.mk
@@ -22,6 +22,30 @@
 	-fvisibility=hidden \
 	-Wno-unused-parameter \
 
+# These parameters change the way jemalloc works.
+#   ANDROID_ALWAYS_PURGE
+#     If defined, always purge immediately when a page is purgeable.
+#   ANDROID_MAX_ARENAS=XX
+#     The total number of arenas will be less than or equal to this number.
+#     The number of arenas will be calculated as 2 * the number of cpus
+#     but no larger than XX.
+#   ANDROID_TCACHE_NSLOTS_SMALL_MAX=XX
+#     The number of small slots held in the tcache. The higher this number
+#     is, the more PSS is consumed. If this number is set too low, then
+#     small allocations will take longer to complete.
+#   ANDROID_TCACHE_NSLOTS_LARGE=XX
+#     The number of large slots held in the tcache. The higher this number
+#     is, the more PSS is consumed. If this number is set too low, then
+#     large allocations will take longer to complete.
+#   ANDROID_LG_TCACHE_MAXCLASS_DEFAULT=XX
+#     1 << XX is the maximum sized allocation that will be in the tcache.
+common_cflags += \
+	-DANDROID_ALWAYS_PURGE \
+	-DANDROID_MAX_ARENAS=2 \
+	-DANDROID_TCACHE_NSLOTS_SMALL_MAX=8 \
+	-DANDROID_TCACHE_NSLOTS_LARGE=16 \
+	-DANDROID_LG_TCACHE_MAXCLASS_DEFAULT=16 \
+
 common_c_includes := \
 	$(LOCAL_PATH)/src \
 	$(LOCAL_PATH)/include \
diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h
index 47bbccd..f3bfbe0 100644
--- a/include/jemalloc/internal/chunk.h
+++ b/include/jemalloc/internal/chunk.h
@@ -5,11 +5,7 @@
  * Size and alignment of memory chunks that are allocated by the OS's virtual
  * memory system.
  */
-#if defined(__ANDROID__)
-#define	LG_CHUNK_DEFAULT	20
-#else
 #define	LG_CHUNK_DEFAULT	22
-#endif
 
 /* Return the chunk address for allocation address a. */
 #define	CHUNK_ADDR2BASE(a)						\
diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h
index c0d48b9..6712341 100644
--- a/include/jemalloc/internal/tcache.h
+++ b/include/jemalloc/internal/tcache.h
@@ -22,13 +22,25 @@
  *
  * This constant must be an even number.
  */
+#if defined(ANDROID_TCACHE_NSLOTS_SMALL_MAX)
+#define	TCACHE_NSLOTS_SMALL_MAX		ANDROID_TCACHE_NSLOTS_SMALL_MAX
+#else
 #define	TCACHE_NSLOTS_SMALL_MAX		200
+#endif
 
 /* Number of cache slots for large size classes. */
+#if defined(ANDROID_TCACHE_NSLOTS_LARGE)
+#define	TCACHE_NSLOTS_LARGE		ANDROID_TCACHE_NSLOTS_LARGE
+#else
 #define	TCACHE_NSLOTS_LARGE		20
+#endif
 
 /* (1U << opt_lg_tcache_max) is used to compute tcache_maxclass. */
+#if defined(ANDROID_LG_TCACHE_MAXCLASS_DEFAULT)
+#define	LG_TCACHE_MAXCLASS_DEFAULT	ANDROID_LG_TCACHE_MAXCLASS_DEFAULT
+#else
 #define	LG_TCACHE_MAXCLASS_DEFAULT	15
+#endif
 
 /*
  * TCACHE_GC_SWEEP is the approximate number of allocation events between
diff --git a/src/arena.c b/src/arena.c
index d3fe0fb..026c74a 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -868,14 +868,17 @@
 static inline void
 arena_maybe_purge(arena_t *arena)
 {
+#if !defined(ANDROID_ALWAYS_PURGE)
 	size_t npurgeable, threshold;
 
 	/* Don't purge if the option is disabled. */
 	if (opt_lg_dirty_mult < 0)
 		return;
+#endif
 	/* Don't purge if all dirty pages are already being purged. */
 	if (arena->ndirty <= arena->npurgatory)
 		return;
+#if !defined(ANDROID_ALWAYS_PURGE)
 	npurgeable = arena->ndirty - arena->npurgatory;
 	threshold = (arena->nactive >> opt_lg_dirty_mult);
 	/*
@@ -884,6 +887,7 @@
 	 */
 	if (npurgeable <= threshold)
 		return;
+#endif
 
 	arena_purge(arena, false);
 }
diff --git a/src/jemalloc.c b/src/jemalloc.c
index baff69d..7e7aaf4 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -826,6 +826,14 @@
 		else
 			opt_narenas = 1;
 	}
+#if defined(ANDROID_MAX_ARENAS)
+	/* Never create more than ANDROID_MAX_ARENAS arenas, regardless of
+	 * num_cpus. Extra arenas use more PSS and are not very useful unless
+	 * lots of threads are allocating/freeing at the same time.
+	 */
+	if (opt_narenas > ANDROID_MAX_ARENAS)
+		opt_narenas = ANDROID_MAX_ARENAS;
+#endif
 	narenas_auto = opt_narenas;
 	/*
 	 * Make sure that the arenas array can be allocated.  In practice, this