Refactor arenas_cache tsd.

Refactor arenas_cache tsd into arenas_tdata, an array of structures of
type arena_tdata_t.  Each element caches an arena pointer, and is used
in conjunction with tsd for fast arena-related context lookup.
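
Condensed into a standalone sketch (stub types and an invented lookup
helper; the authoritative definitions are in the diff below), the shape
change looks like this:

#include <stddef.h>

typedef struct arena_s arena_t;	/* opaque for this sketch */

/* Before: tsd cached raw arena pointers:
 *   arena_t **arenas_cache;  lookup was arenas_cache[ind]. */

/* After: tsd caches an array of per-arena thread-data wrappers. */
typedef struct arena_tdata_s {
	arena_t	*arena;	/* cached arena pointer */
	/* (Presumably leaves room for more per-arena thread state.) */
} arena_tdata_t;

/* Invented helper: the fast path now indexes through the wrapper. */
static arena_t *
arenas_tdata_lookup(arena_tdata_t *arenas_tdata, unsigned narenas_tdata,
    unsigned ind)
{
	if (arenas_tdata == NULL || ind >= narenas_tdata)
		return (NULL);	/* caller falls back to the hard path */
	return (arenas_tdata[ind].arena);
}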
diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h
index 24c4c1d..2750c00 100644
--- a/include/jemalloc/internal/arena.h
+++ b/include/jemalloc/internal/arena.h
@@ -31,6 +31,7 @@
 typedef struct arena_bin_info_s arena_bin_info_t;
 typedef struct arena_bin_s arena_bin_t;
 typedef struct arena_s arena_t;
+typedef struct arena_tdata_s arena_tdata_t;
 
 #endif /* JEMALLOC_H_TYPES */
 /******************************************************************************/
@@ -403,6 +404,11 @@
 	/* bins is used to store trees of free regions. */
 	arena_bin_t		bins[NBINS];
 };
+
+/* Used in conjunction with tsd for fast arena-related context lookup. */
+struct arena_tdata_s {
+	arena_t			*arena;
+};
 #endif /* JEMALLOC_ARENA_STRUCTS_B */
 
 #endif /* JEMALLOC_H_STRUCTS */
diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in
index 616eb9f..760dbdd 100644
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
@@ -459,16 +459,18 @@
 arena_t	*arenas_extend(unsigned ind);
 arena_t	*arena_init(unsigned ind);
 unsigned	narenas_total_get(void);
-arena_t	*arena_get_hard(tsd_t *tsd, unsigned ind, bool init_if_missing);
+arena_tdata_t	*arena_tdata_get_hard(tsd_t *tsd, unsigned ind);
+arena_t	*arena_get_hard(tsd_t *tsd, unsigned ind, bool init_if_missing,
+    arena_tdata_t *tdata);
 arena_t	*arena_choose_hard(tsd_t *tsd);
 void	arena_migrate(tsd_t *tsd, unsigned oldind, unsigned newind);
 unsigned	arena_nbound(unsigned ind);
 void	thread_allocated_cleanup(tsd_t *tsd);
 void	thread_deallocated_cleanup(tsd_t *tsd);
 void	arena_cleanup(tsd_t *tsd);
-void	arenas_cache_cleanup(tsd_t *tsd);
-void	narenas_cache_cleanup(tsd_t *tsd);
-void	arenas_cache_bypass_cleanup(tsd_t *tsd);
+void	arenas_tdata_cleanup(tsd_t *tsd);
+void	narenas_tdata_cleanup(tsd_t *tsd);
+void	arenas_tdata_bypass_cleanup(tsd_t *tsd);
 void	jemalloc_prefork(void);
 void	jemalloc_postfork_parent(void);
 void	jemalloc_postfork_child(void);
@@ -535,6 +537,8 @@
 size_t	s2u(size_t size);
 size_t	sa2u(size_t size, size_t alignment);
 arena_t	*arena_choose(tsd_t *tsd, arena_t *arena);
+arena_tdata_t	*arena_tdata_get(tsd_t *tsd, unsigned ind,
+    bool refresh_if_missing);
 arena_t	*arena_get(tsd_t *tsd, unsigned ind, bool init_if_missing,
     bool refresh_if_missing);
 #endif
@@ -785,32 +789,45 @@
 	return (ret);
 }
 
+JEMALLOC_INLINE arena_tdata_t *
+arena_tdata_get(tsd_t *tsd, unsigned ind, bool refresh_if_missing)
+{
+	arena_tdata_t *tdata;
+	arena_tdata_t *arenas_tdata = tsd_arenas_tdata_get(tsd);
+
+	if (unlikely(arenas_tdata == NULL)) {
+		/* arenas_tdata hasn't been initialized yet. */
+		return (arena_tdata_get_hard(tsd, ind));
+	}
+	if (unlikely(ind >= tsd_narenas_tdata_get(tsd))) {
+		/*
+		 * ind is invalid, the tdata array is stale (too small), or
+		 * the tdata slot has yet to be initialized.
+		 */
+		return (refresh_if_missing ? arena_tdata_get_hard(tsd, ind) :
+		    NULL);
+	}
+
+	tdata = &arenas_tdata[ind];
+	if (likely(tdata != NULL) || !refresh_if_missing)
+		return (tdata);
+	return (arena_tdata_get_hard(tsd, ind));
+}
+
 JEMALLOC_INLINE arena_t *
 arena_get(tsd_t *tsd, unsigned ind, bool init_if_missing,
     bool refresh_if_missing)
 {
-	arena_t *arena;
-	arena_t **arenas_cache = tsd_arenas_cache_get(tsd);
+	arena_tdata_t *tdata;
 
 	/* init_if_missing requires refresh_if_missing. */
 	assert(!init_if_missing || refresh_if_missing);
 
-	if (unlikely(arenas_cache == NULL)) {
-		/* arenas_cache hasn't been initialized yet. */
-		return (arena_get_hard(tsd, ind, init_if_missing));
-	}
-	if (unlikely(ind >= tsd_narenas_cache_get(tsd))) {
-		/*
-		 * ind is invalid, cache is old (too small), or arena to be
-		 * initialized.
-		 */
-		return (refresh_if_missing ? arena_get_hard(tsd, ind,
-		    init_if_missing) : NULL);
-	}
-	arena = arenas_cache[ind];
-	if (likely(arena != NULL) || !refresh_if_missing)
-		return (arena);
-	return (arena_get_hard(tsd, ind, init_if_missing));
+	tdata = arena_tdata_get(tsd, ind, refresh_if_missing);
+	if (unlikely(tdata == NULL || tdata->arena == NULL))
+		return (arena_get_hard(tsd, ind, init_if_missing, tdata));
+
+	return (tdata->arena);
 }
 #endif
 
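
The two inline functions split the lookup into layers: arena_tdata_get()
consults only the tsd-cached array, while arena_get() adds the
arena-initialization fallback on top.  A standalone condensation of that
control flow (thread-local stand-ins for the tsd fields, stubbed hard
paths; names here are invented):

#include <stdbool.h>
#include <stddef.h>

typedef struct arena_s arena_t;
typedef struct { arena_t *arena; } arena_tdata_t;

/* Thread-local stand-ins for the tsd fields. */
static _Thread_local arena_tdata_t *arenas_tdata;
static _Thread_local unsigned narenas_tdata;

/* Stubbed hard paths; the real ones live in src/jemalloc.c below. */
static arena_tdata_t *tdata_hard(unsigned ind) { (void)ind; return (NULL); }
static arena_t *arena_hard(unsigned ind, bool init, arena_tdata_t *tdata)
{ (void)ind; (void)init; (void)tdata; return (NULL); }

static arena_t *
get(unsigned ind, bool init_if_missing, bool refresh_if_missing)
{
	arena_tdata_t *tdata;

	/* Layer 1: locate the tdata slot. */
	if (arenas_tdata == NULL)
		tdata = tdata_hard(ind);	/* array not set up yet */
	else if (ind >= narenas_tdata)
		tdata = refresh_if_missing ? tdata_hard(ind) : NULL;
	else
		tdata = &arenas_tdata[ind];

	/* Layer 2: a present slot may still hold a NULL arena pointer. */
	if (tdata == NULL || tdata->arena == NULL)
		return (arena_hard(ind, init_if_missing, tdata));
	return (tdata->arena);
}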
diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt
index 87b5a91..a0e6d8a 100644
--- a/include/jemalloc/internal/private_symbols.txt
+++ b/include/jemalloc/internal/private_symbols.txt
@@ -93,11 +93,13 @@
 arena_run_regind
 arena_run_to_miscelm
 arena_salloc
-arenas_cache_bypass_cleanup
-arenas_cache_cleanup
+arenas_tdata_bypass_cleanup
+arenas_tdata_cleanup
 arena_sdalloc
 arena_stats_merge
 arena_tcache_fill_small
+arena_tdata_get
+arena_tdata_get_hard
 atomic_add_p
 atomic_add_u
 atomic_add_uint32
@@ -311,7 +313,7 @@
 map_misc_offset
 mb_write
 mutex_boot
-narenas_cache_cleanup
+narenas_tdata_cleanup
 narenas_total_get
 ncpus
 nhbins
diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index eed7aa0..16cc2f1 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -537,9 +537,9 @@
     O(thread_deallocated,	uint64_t)				\
     O(prof_tdata,		prof_tdata_t *)				\
     O(arena,			arena_t *)				\
-    O(arenas_cache,		arena_t **)				\
-    O(narenas_cache,		unsigned)				\
-    O(arenas_cache_bypass,	bool)					\
+    O(arenas_tdata,		arena_tdata_t *)			\
+    O(narenas_tdata,		unsigned)				\
+    O(arenas_tdata_bypass,	bool)					\
     O(tcache_enabled,		tcache_enabled_t)			\
     O(quarantine,		quarantine_t *)				\
 
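
Each O(name, type) row in the MALLOC_TSD table generates tsd accessors.
The shapes below are inferred from this patch's call sites (the exact
definitions come out of the macro machinery in tsd.h):

#include <stdbool.h>

typedef struct tsd_s tsd_t;
typedef struct arena_tdata_s arena_tdata_t;

arena_tdata_t	*tsd_arenas_tdata_get(tsd_t *tsd);
void		tsd_arenas_tdata_set(tsd_t *tsd, arena_tdata_t *val);
unsigned	tsd_narenas_tdata_get(tsd_t *tsd);
void		tsd_narenas_tdata_set(tsd_t *tsd, unsigned val);
bool		*tsd_arenas_tdata_bypassp_get(tsd_t *tsd);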
diff --git a/src/jemalloc.c b/src/jemalloc.c
index 8415c0e..d2b2afc 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -516,74 +516,99 @@
 	tsd_arena_set(tsd, NULL);
 }
 
-arena_t *
-arena_get_hard(tsd_t *tsd, unsigned ind, bool init_if_missing)
+arena_tdata_t *
+arena_tdata_get_hard(tsd_t *tsd, unsigned ind)
 {
-	arena_t *arena;
-	arena_t **arenas_cache = tsd_arenas_cache_get(tsd);
-	unsigned narenas_cache = tsd_narenas_cache_get(tsd);
+	arena_tdata_t *tdata, *arenas_tdata_old;
+	arena_tdata_t *arenas_tdata = tsd_arenas_tdata_get(tsd);
+	unsigned narenas_tdata_old, i;
+	unsigned narenas_tdata = tsd_narenas_tdata_get(tsd);
 	unsigned narenas_actual = narenas_total_get();
 
-	/* Deallocate old cache if it's too small. */
-	if (arenas_cache != NULL && narenas_cache < narenas_actual) {
-		a0dalloc(arenas_cache);
-		arenas_cache = NULL;
-		narenas_cache = 0;
-		tsd_arenas_cache_set(tsd, arenas_cache);
-		tsd_narenas_cache_set(tsd, narenas_cache);
+	/*
+	 * Dissociate old tdata array (and set up for deallocation upon return)
+	 * if it's too small.
+	 */
+	if (arenas_tdata != NULL && narenas_tdata < narenas_actual) {
+		arenas_tdata_old = arenas_tdata;
+		narenas_tdata_old = narenas_tdata;
+		arenas_tdata = NULL;
+		narenas_tdata = 0;
+		tsd_arenas_tdata_set(tsd, arenas_tdata);
+		tsd_narenas_tdata_set(tsd, narenas_tdata);
+	} else {
+		arenas_tdata_old = NULL;
+		narenas_tdata_old = 0;
 	}
 
-	/* Allocate cache if it's missing. */
-	if (arenas_cache == NULL) {
-		bool *arenas_cache_bypassp = tsd_arenas_cache_bypassp_get(tsd);
-		assert(ind < narenas_actual || !init_if_missing);
-		narenas_cache = (ind < narenas_actual) ? narenas_actual : ind+1;
+	/* Allocate tdata array if it's missing. */
+	if (arenas_tdata == NULL) {
+		bool *arenas_tdata_bypassp = tsd_arenas_tdata_bypassp_get(tsd);
+		narenas_tdata = (ind < narenas_actual) ? narenas_actual : ind+1;
 
-		if (tsd_nominal(tsd) && !*arenas_cache_bypassp) {
-			*arenas_cache_bypassp = true;
-			arenas_cache = (arena_t **)a0malloc(sizeof(arena_t *) *
-			    narenas_cache);
-			*arenas_cache_bypassp = false;
+		if (tsd_nominal(tsd) && !*arenas_tdata_bypassp) {
+			*arenas_tdata_bypassp = true;
+			arenas_tdata = (arena_tdata_t *)a0malloc(
+			    sizeof(arena_tdata_t) * narenas_tdata);
+			*arenas_tdata_bypassp = false;
 		}
-		if (arenas_cache == NULL) {
-			/*
-			 * This function must always tell the truth, even if
-			 * it's slow, so don't let OOM, thread cleanup (note
-			 * tsd_nominal check), nor recursive allocation
-			 * avoidance (note arenas_cache_bypass check) get in the
-			 * way.
-			 */
-			if (ind >= narenas_actual)
-				return (NULL);
-			malloc_mutex_lock(&arenas_lock);
-			arena = arenas[ind];
-			malloc_mutex_unlock(&arenas_lock);
-			return (arena);
+		if (arenas_tdata == NULL) {
+			tdata = NULL;
+			goto label_return;
 		}
-		assert(tsd_nominal(tsd) && !*arenas_cache_bypassp);
-		tsd_arenas_cache_set(tsd, arenas_cache);
-		tsd_narenas_cache_set(tsd, narenas_cache);
+		assert(tsd_nominal(tsd) && !*arenas_tdata_bypassp);
+		tsd_arenas_tdata_set(tsd, arenas_tdata);
+		tsd_narenas_tdata_set(tsd, narenas_tdata);
 	}
 
 	/*
-	 * Copy to cache.  It's possible that the actual number of arenas has
-	 * increased since narenas_total_get() was called above, but that causes
-	 * no correctness issues unless two threads concurrently execute the
-	 * arenas.extend mallctl, which we trust mallctl synchronization to
+	 * Copy to tdata array.  It's possible that the actual number of arenas
+	 * has increased since narenas_total_get() was called above, but that
+	 * causes no correctness issues unless two threads concurrently execute
+	 * the arenas.extend mallctl, which we trust mallctl synchronization to
 	 * prevent.
 	 */
 	malloc_mutex_lock(&arenas_lock);
-	memcpy(arenas_cache, arenas, sizeof(arena_t *) * narenas_actual);
+	for (i = 0; i < narenas_actual; i++)
+		arenas_tdata[i].arena = arenas[i];
 	malloc_mutex_unlock(&arenas_lock);
-	if (narenas_cache > narenas_actual) {
-		memset(&arenas_cache[narenas_actual], 0, sizeof(arena_t *) *
-		    (narenas_cache - narenas_actual));
+	if (narenas_tdata > narenas_actual) {
+		memset(&arenas_tdata[narenas_actual], 0, sizeof(arena_tdata_t)
+		    * (narenas_tdata - narenas_actual));
 	}
 
-	/* Read the refreshed cache, and init the arena if necessary. */
-	arena = arenas_cache[ind];
-	if (init_if_missing && arena == NULL)
-		arena = arenas_cache[ind] = arena_init(ind);
+	/* Read the refreshed tdata array. */
+	tdata = &arenas_tdata[ind];
+label_return:
+	if (arenas_tdata_old != NULL)
+		a0dalloc(arenas_tdata_old);
+	return (tdata);
+}
+
+arena_t *
+arena_get_hard(tsd_t *tsd, unsigned ind, bool init_if_missing,
+    arena_tdata_t *tdata)
+{
+	arena_t *arena;
+	unsigned narenas_actual;
+
+	if (init_if_missing && tdata != NULL) {
+		tdata->arena = arena_init(ind);
+		if (tdata->arena != NULL)
+			return (tdata->arena);
+	}
+
+	/*
+	 * This function must always tell the truth, even if it's slow, so don't
+	 * let OOM, thread cleanup (note tsd_nominal check), nor recursive
+	 * allocation avoidance (note arenas_tdata_bypass check) get in the way.
+	 */
+	narenas_actual = narenas_total_get();
+	if (ind >= narenas_actual)
+		return (NULL);
+	malloc_mutex_lock(&arenas_lock);
+	arena = arenas[ind];
+	malloc_mutex_unlock(&arenas_lock);
 	return (arena);
 }
 
@@ -674,26 +699,26 @@
 }
 
 void
-arenas_cache_cleanup(tsd_t *tsd)
+arenas_tdata_cleanup(tsd_t *tsd)
 {
-	arena_t **arenas_cache;
+	arena_tdata_t *arenas_tdata;
 
-	arenas_cache = tsd_arenas_cache_get(tsd);
-	if (arenas_cache != NULL) {
-		tsd_arenas_cache_set(tsd, NULL);
-		a0dalloc(arenas_cache);
+	arenas_tdata = tsd_arenas_tdata_get(tsd);
+	if (arenas_tdata != NULL) {
+		tsd_arenas_tdata_set(tsd, NULL);
+		a0dalloc(arenas_tdata);
 	}
 }
 
 void
-narenas_cache_cleanup(tsd_t *tsd)
+narenas_tdata_cleanup(tsd_t *tsd)
 {
 
 	/* Do nothing. */
 }
 
 void
-arenas_cache_bypass_cleanup(tsd_t *tsd)
+arenas_tdata_bypass_cleanup(tsd_t *tsd)
 {
 
 	/* Do nothing. */
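
arena_tdata_get_hard() replaces the old free-then-reallocate flow with
detach, publish, and a deferred free at a single exit label.  A
standalone sketch of that discipline (invented names; calloc/free stand
in for a0malloc/a0dalloc, and the tail memset plus repopulation from
arenas[] is collapsed into calloc):

#include <stdlib.h>

typedef struct { void *arena; } tdata_t;

static _Thread_local tdata_t *tls_tdata;
static _Thread_local unsigned tls_ntdata;

static tdata_t *
tdata_grow(unsigned ind, unsigned nactual)
{
	tdata_t *result = NULL;
	tdata_t *old = NULL;
	unsigned need;

	/* Detach a too-small array; it is freed only at label_return. */
	if (tls_tdata != NULL && tls_ntdata < nactual) {
		old = tls_tdata;
		tls_tdata = NULL;
		tls_ntdata = 0;
	}
	/* Allocate a zeroed replacement sized for ind and all arenas. */
	if (tls_tdata == NULL) {
		need = (ind < nactual) ? nactual : ind + 1;
		tls_tdata = calloc(need, sizeof(tdata_t));
		if (tls_tdata == NULL)
			goto label_return;	/* caller uses the global arenas[] */
		tls_ntdata = need;
	}
	/* (The real code repopulates the slots from arenas[] under a lock.) */
	result = &tls_tdata[ind];
label_return:
	if (old != NULL)
		free(old);
	return (result);
}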
diff --git a/src/tsd.c b/src/tsd.c
index 9ffe9af..b85b8b9 100644
--- a/src/tsd.c
+++ b/src/tsd.c
@@ -113,7 +113,7 @@
 	ncleanups = 0;
 	if (tsd_boot0())
 		return (true);
-	*tsd_arenas_cache_bypassp_get(tsd_fetch()) = true;
+	*tsd_arenas_tdata_bypassp_get(tsd_fetch()) = true;
 	return (false);
 }
 
@@ -122,7 +122,7 @@
 {
 
 	tsd_boot1();
-	*tsd_arenas_cache_bypassp_get(tsd_fetch()) = false;
+	*tsd_arenas_tdata_bypassp_get(tsd_fetch()) = false;
 }
 
 #ifdef _WIN32
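
The bypass flag that tsd_boot toggles here is the same recursion guard
arena_tdata_get_hard() raises around a0malloc(): while the tdata array
itself is being allocated, a nested arena lookup must not try to
maintain the tdata array.  A standalone sketch of the pattern (invented
names, malloc standing in for a0malloc):

#include <stdbool.h>
#include <stdlib.h>

static _Thread_local bool tdata_bypass;	/* false until raised */

static void *
internal_alloc(size_t size)
{
	/* Stand-in for a0malloc(); may itself look up an arena. */
	return (malloc(size));
}

static void *
guarded_tdata_alloc(size_t size)
{
	void *p;

	if (tdata_bypass)
		return (NULL);	/* nested call: decline, use the slow path */
	tdata_bypass = true;	/* raise the guard around the allocation */
	p = internal_alloc(size);
	tdata_bypass = false;
	return (p);
}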