sys-kernel/spl/files/spl-0.6.0_rc13-fix-soft-lockup.patch - platform/external/gentoo/overlays/gentoo - Git at Google

 From d4899f4747fd03be748fd1a589b9db5786fa1375 Mon Sep 17 00:00:00 2001
 From: Brian Behlendorf <behlendorf1@llnl.gov>
 Date: Fri, 11 Jan 2013 14:29:32 -0800
 Subject: [PATCH] kmem-cache: Fix slab ageing soft lockup

 Commit a10287e00d13c4c4dbbff14f42b00b03da363fcb slightly reworked
 the slab ageing code such that it is no longer dependent on the
 Linux delayed work queue interfaces.

 This was good for portability and performance, but it requires us
 to use the on_each_cpu() function to execute the spl_magazine_age()
 function.  That means that the function is now executing in interrupt
 context whereas before it was scheduled in normal process context.
 And that means we need to be slightly more careful about the locking
 in the interrupt handler.

 With the reworked code it's possible that we'll be holding the
 skc->skc_lock and be interrupted to handle the spl_magazine_age()
 IRQ.  This will result in a deadlock and soft lockup errors unless
 we're careful to detect the contention and avoid taking the lock in
 the interrupt handler.  So that's what this patch does.

 Alternatively (and slightly more conventionally), we could have used
 spin_lock_irqsave() to prevent this race entirely, but I'd prefer to
 avoid disabling interrupts as much as possible due to performance
 concerns.  There is absolutely no penalty for us not aging objects
 out of the magazine due to contention.

 Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
 Signed-off-by: Prakash Surya <surya1@llnl.gov>
 Closes zfsonlinux/zfs#1193
 ---
  module/spl/spl-kmem.c |   94 +++++++++++++++++++++++++++----------------------
  1 file changed, 51 insertions(+), 43 deletions(-)

 diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
 index bc08a55..cc5961e 100644
 --- a/module/spl/spl-kmem.c
 +++ b/module/spl/spl-kmem.c
 @@ -827,8 +827,7 @@ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
  struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
  taskq_t *spl_kmem_cache_taskq;          /* Task queue for ageing / reclaim */

 -static int spl_cache_flush(spl_kmem_cache_t *skc,
 -                           spl_kmem_magazine_t *skm, int flush);
 +static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);

  SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
  SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
 @@ -1244,6 +1243,38 @@ static int spl_cache_flush(spl_kmem_cache_t *skc,
  	SRETURN(0);
  }

 +/*
 + * Release objects from the per-cpu magazine back to their slab.  The flush
 + * argument contains the max number of entries to remove from the magazine.
 + */
 +static void
 +__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
 +{
 +	int i, count = MIN(flush, skm->skm_avail);
 +	SENTRY;
 +
 +	ASSERT(skc->skc_magic == SKC_MAGIC);
 +	ASSERT(skm->skm_magic == SKM_MAGIC);
 +	ASSERT(spin_is_locked(&skc->skc_lock));
 +
 +	for (i = 0; i < count; i++)
 +		spl_cache_shrink(skc, skm->skm_objs[i]);
 +
 +	skm->skm_avail -= count;
 +	memmove(skm->skm_objs, &(skm->skm_objs[count]),
 +	        sizeof(void *) * skm->skm_avail);
 +
 +	SEXIT;
 +}
 +
 +static void
 +spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
 +{
 +	spin_lock(&skc->skc_lock);
 +	__spl_cache_flush(skc, skm, flush);
 +	spin_unlock(&skc->skc_lock);
 +}
 +
  static void
  spl_magazine_age(void *data)
  {
 @@ -1252,10 +1283,23 @@ static int spl_cache_flush(spl_kmem_cache_t *skc,

  	ASSERT(skm->skm_magic == SKM_MAGIC);
  	ASSERT(skm->skm_cpu == smp_processor_id());
 +	ASSERT(irqs_disabled());
 +
 +	/* There are no available objects or they are too young to age out */
 +	if ((skm->skm_avail == 0) ||
 +	    time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
 +		return;

 -	if (skm->skm_avail > 0)
 -		if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
 -			(void) spl_cache_flush(skc, skm, skm->skm_refill);
 +	/*
 +	 * Because we're executing in interrupt context we may have
 +	 * interrupted the holder of this lock.  To avoid a potential
 +	 * deadlock return if the lock is contended.
 +	 */
 +	if (!spin_trylock(&skc->skc_lock))
 +		return;
 +
 +	__spl_cache_flush(skc, skm, skm->skm_refill);
 +	spin_unlock(&skc->skc_lock);
  }

  /*
 @@ -1451,7 +1495,7 @@ static int spl_cache_flush(spl_kmem_cache_t *skc,

          for_each_online_cpu(i) {
  		skm = skc->skc_mag[i];
 -		(void)spl_cache_flush(skc, skm, skm->skm_avail);
 +		spl_cache_flush(skc, skm, skm->skm_avail);
  		spl_magazine_free(skm);
          }

 @@ -1932,42 +1976,6 @@ static int spl_cache_flush(spl_kmem_cache_t *skc,
  }

  /*
 - * Release a batch of objects from a per-cpu magazine back to their
 - * respective slabs.  This occurs when we exceed the magazine size,
 - * are under memory pressure, when the cache is idle, or during
 - * cache cleanup.  The flush argument contains the number of entries
 - * to remove from the magazine.
 - */
 -static int
 -spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
 -{
 -	int i, count = MIN(flush, skm->skm_avail);
 -	SENTRY;
 -
 -	ASSERT(skc->skc_magic == SKC_MAGIC);
 -	ASSERT(skm->skm_magic == SKM_MAGIC);
 -
 -	/*
 -	 * XXX: Currently we simply return objects from the magazine to
 -	 * the slabs in fifo order.  The ideal thing to do from a memory
 -	 * fragmentation standpoint is to cheaply determine the set of
 -	 * objects in the magazine which will result in the largest
 -	 * number of free slabs if released from the magazine.
 -	 */
 -	spin_lock(&skc->skc_lock);
 -	for (i = 0; i < count; i++)
 -		spl_cache_shrink(skc, skm->skm_objs[i]);
 -
 -	skm->skm_avail -= count;
 -	memmove(skm->skm_objs, &(skm->skm_objs[count]),
 -	        sizeof(void *) * skm->skm_avail);
 -
 -	spin_unlock(&skc->skc_lock);
 -
 -	SRETURN(count);
 -}
 -
 -/*
   * Allocate an object from the per-cpu magazine, or if the magazine
   * is empty directly allocate from a slab and repopulate the magazine.
   */
 @@ -2053,7 +2061,7 @@ static int spl_cache_flush(spl_kmem_cache_t *skc,

  	/* Per-CPU cache full, flush it to make space */
  	if (unlikely(skm->skm_avail >= skm->skm_size))
 -		(void)spl_cache_flush(skc, skm, skm->skm_refill);
 +		spl_cache_flush(skc, skm, skm->skm_refill);

  	/* Available space in cache, use it */
  	skm->skm_objs[skm->skm_avail++] = obj;
 --
 1.7.10
	From d4899f4747fd03be748fd1a589b9db5786fa1375 Mon Sep 17 00:00:00 2001
	From: Brian Behlendorf <behlendorf1@llnl.gov>
	Date: Fri, 11 Jan 2013 14:29:32 -0800
	Subject: [PATCH] kmem-cache: Fix slab ageing soft lockup

	Commit a10287e00d13c4c4dbbff14f42b00b03da363fcb slightly reworked
	the slab ageing code such that it is no longer dependent on the
	Linux delayed work queue interfaces.

	This was good for portability and performance, but it requires us
	to use the on_each_cpu() function to execute the spl_magazine_age()
	function. That means that the function is now executing in interrupt
	context whereas before it was scheduled in normal process context.
	And that means we need to be slightly more careful about the locking
	in the interrupt handler.

	With the reworked code it's possible that we'll be holding the
	skc->skc_lock and be interrupted to handle the spl_magazine_age()
	IRQ. This will result in a deadlock and soft lockup errors unless
	we're careful to detect the contention and avoid taking the lock in
	the interrupt handler. So that's what this patch does.

	Alternatively (and slightly more conventionally), we could have used
	spin_lock_irqsave() to prevent this race entirely, but I'd prefer to
	avoid disabling interrupts as much as possible due to performance
	concerns. There is absolutely no penalty for us not aging objects
	out of the magazine due to contention.

	Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
	Signed-off-by: Prakash Surya <surya1@llnl.gov>
	Closes zfsonlinux/zfs#1193
	---
	module/spl/spl-kmem.c | 94 +++++++++++++++++++++++++++----------------------
	1 file changed, 51 insertions(+), 43 deletions(-)

	diff --git a/module/spl/spl-kmem.c b/module/spl/spl-kmem.c
	index bc08a55..cc5961e 100644
	--- a/module/spl/spl-kmem.c
	+++ b/module/spl/spl-kmem.c
	@@ -827,8 +827,7 @@ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
	struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
	taskq_t *spl_kmem_cache_taskq; /* Task queue for ageing / reclaim */

	-static int spl_cache_flush(spl_kmem_cache_t *skc,
	- spl_kmem_magazine_t *skm, int flush);
	+static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);

	SPL_SHRINKER_CALLBACK_FWD_DECLARE(spl_kmem_cache_generic_shrinker);
	SPL_SHRINKER_DECLARE(spl_kmem_cache_shrinker,
	@@ -1244,6 +1243,38 @@ static int spl_cache_flush(spl_kmem_cache_t *skc,
	SRETURN(0);
	}

	+/*
	+ * Release objects from the per-cpu magazine back to their slab. The flush
	+ * argument contains the max number of entries to remove from the magazine.
	+ */
	+static void
	+__spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
	+{
	+ int i, count = MIN(flush, skm->skm_avail);
	+ SENTRY;
	+
	+ ASSERT(skc->skc_magic == SKC_MAGIC);
	+ ASSERT(skm->skm_magic == SKM_MAGIC);
	+ ASSERT(spin_is_locked(&skc->skc_lock));
	+
	+ for (i = 0; i < count; i++)
	+ spl_cache_shrink(skc, skm->skm_objs[i]);
	+
	+ skm->skm_avail -= count;
	+ memmove(skm->skm_objs, &(skm->skm_objs[count]),
	+ sizeof(void *) * skm->skm_avail);
	+
	+ SEXIT;
	+}
	+
	+static void
	+spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
	+{
	+ spin_lock(&skc->skc_lock);
	+ __spl_cache_flush(skc, skm, flush);
	+ spin_unlock(&skc->skc_lock);
	+}
	+
	static void
	spl_magazine_age(void *data)
	{
	@@ -1252,10 +1283,23 @@ static int spl_cache_flush(spl_kmem_cache_t *skc,

	ASSERT(skm->skm_magic == SKM_MAGIC);
	ASSERT(skm->skm_cpu == smp_processor_id());
	+ ASSERT(irqs_disabled());
	+
	+ /* There are no available objects or they are too young to age out */
	+ if ((skm->skm_avail == 0) ||
	+ time_before(jiffies, skm->skm_age + skc->skc_delay * HZ))
	+ return;

	- if (skm->skm_avail > 0)
	- if (time_after(jiffies, skm->skm_age + skc->skc_delay * HZ))
	- (void) spl_cache_flush(skc, skm, skm->skm_refill);
	+ /*
	+ * Because we're executing in interrupt context we may have
	+ * interrupted the holder of this lock. To avoid a potential
	+ * deadlock return if the lock is contended.
	+ */
	+ if (!spin_trylock(&skc->skc_lock))
	+ return;
	+
	+ __spl_cache_flush(skc, skm, skm->skm_refill);
	+ spin_unlock(&skc->skc_lock);
	}

	/*
	@@ -1451,7 +1495,7 @@ static int spl_cache_flush(spl_kmem_cache_t *skc,

	for_each_online_cpu(i) {
	skm = skc->skc_mag[i];
	- (void)spl_cache_flush(skc, skm, skm->skm_avail);
	+ spl_cache_flush(skc, skm, skm->skm_avail);
	spl_magazine_free(skm);
	}

	@@ -1932,42 +1976,6 @@ static int spl_cache_flush(spl_kmem_cache_t *skc,
	}

	/*
	- * Release a batch of objects from a per-cpu magazine back to their
	- * respective slabs. This occurs when we exceed the magazine size,
	- * are under memory pressure, when the cache is idle, or during
	- * cache cleanup. The flush argument contains the number of entries
	- * to remove from the magazine.
	- */
	-static int
	-spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
	-{
	- int i, count = MIN(flush, skm->skm_avail);
	- SENTRY;
	-
	- ASSERT(skc->skc_magic == SKC_MAGIC);
	- ASSERT(skm->skm_magic == SKM_MAGIC);
	-
	- /*
	- * XXX: Currently we simply return objects from the magazine to
	- * the slabs in fifo order. The ideal thing to do from a memory
	- * fragmentation standpoint is to cheaply determine the set of
	- * objects in the magazine which will result in the largest
	- * number of free slabs if released from the magazine.
	- */
	- spin_lock(&skc->skc_lock);
	- for (i = 0; i < count; i++)
	- spl_cache_shrink(skc, skm->skm_objs[i]);
	-
	- skm->skm_avail -= count;
	- memmove(skm->skm_objs, &(skm->skm_objs[count]),
	- sizeof(void *) * skm->skm_avail);
	-
	- spin_unlock(&skc->skc_lock);
	-
	- SRETURN(count);
	-}
	-
	-/*
	* Allocate an object from the per-cpu magazine, or if the magazine
	* is empty directly allocate from a slab and repopulate the magazine.
	*/
	@@ -2053,7 +2061,7 @@ static int spl_cache_flush(spl_kmem_cache_t *skc,

	/* Per-CPU cache full, flush it to make space */
	if (unlikely(skm->skm_avail >= skm->skm_size))
	- (void)spl_cache_flush(skc, skm, skm->skm_refill);
	+ spl_cache_flush(skc, skm, skm->skm_refill);

	/* Available space in cache, use it */
	skm->skm_objs[skm->skm_avail++] = obj;
	--
	1.7.10