Improves stacktrace unwinding on x86

* Other platforms (e.g. amd64) first try to unwind with
  CFI info, then with the fp chain.
* fp unwinding of code compiled without a frame pointer can
  fail and give incomplete stack traces (often terminating
  with a random program counter, causing a huge number of
  recorded stack traces).

This patch improves unwinding on x86 as follows:
* The first time an IP is unwound, the unwind is done both with
  the CFI technique and with the fp technique.
  If the results are identical, the IP is inserted in a cache of
  'fp unwindable' IPs.
* Subsequent unwinds of the same IP are then done directly,
  either with fp unwind or with CFI, depending on the cached
  result of the check done during the first unwind.

The cache is needed to avoid CFI unwinds as much as possible,
as CFI unwinding is significantly slower than fp unwinding.
A sketch of the caching scheme is given below.
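
The following minimal, standalone sketch illustrates the scheme. It is
illustrative only: cache_store and cache_lookup are hypothetical helpers
invented for this example (the patch open-codes the equivalent logic inside
VG_(get_StackTrace_wrk)), but the constants and the IP ^ marker encoding
match the patch.

#include <stdio.h>

/* Markers stored XOR-ed into a cached IP (values match the patch). */
#define FPUNWIND 0  /* fp-chain unwind verified against CFI: use fp unwind   */
#define NOINFO   1  /* no CFI info for this IP: fp unwind is the only option */
#define CFUNWIND 2  /* fp unwind disagreed with CFI: must use CFI unwind     */

#define N_FP_CF_VERIF 1021  /* prime, larger than any marker value */

typedef unsigned long Addr;

static Addr fp_CF_verif_cache[N_FP_CF_VERIF];

/* Record the verification result for 'ip'.  XOR-ing the marker into the
   IP is unambiguous: two IPs differing only in their low bits differ by
   less than N_FP_CF_VERIF and so map to different buckets, hence a lookup
   that XORs the current IP back out yields a value <= CFUNWIND only for
   a genuine hit. */
static void cache_store(Addr ip, Addr marker)
{
   fp_CF_verif_cache[ip % N_FP_CF_VERIF] = ip ^ marker;
}

/* Returns FPUNWIND/NOINFO/CFUNWIND on a hit, or a value > CFUNWIND
   (meaning "not cached") on a miss or a bucket occupied by another IP. */
static Addr cache_lookup(Addr ip)
{
   return ip ^ fp_CF_verif_cache[ip % N_FP_CF_VERIF];
}

int main(void)
{
   cache_store(0x08048a10, FPUNWIND);
   cache_store(0x08049b24, CFUNWIND);

   printf("0x08048a10 -> %lu (expect 0 = FPUNWIND)\n", cache_lookup(0x08048a10));
   printf("0x08049b24 -> %lu (expect 2 = CFUNWIND)\n", cache_lookup(0x08049b24));
   printf("0x08050000 -> %lu (> 2 means not cached)\n", cache_lookup(0x08050000));
   return 0;
}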



git-svn-id: svn://svn.valgrind.org/valgrind/trunk@13280 a5019735-40e9-0310-863c-91ae7b9d1cf9
diff --git a/coregrind/m_debuginfo/debuginfo.c b/coregrind/m_debuginfo/debuginfo.c
index 0a42325..2d7ec82 100644
--- a/coregrind/m_debuginfo/debuginfo.c
+++ b/coregrind/m_debuginfo/debuginfo.c
@@ -100,6 +100,7 @@
 /*--- fwdses                                               ---*/
 /*------------------------------------------------------------*/
 
+static UInt CF_info_generation = 0;
 static void cfsi_cache__invalidate ( void );
 
 
@@ -2256,8 +2257,13 @@
 
 static void cfsi_cache__invalidate ( void ) {
    VG_(memset)(&cfsi_cache, 0, sizeof(cfsi_cache));
+   CF_info_generation++;
 }
 
+UInt VG_(CF_info_generation) (void)
+{
+   return CF_info_generation;
+}
 
 static inline CFSICacheEnt* cfsi_cache__find ( Addr ip )
 {
diff --git a/coregrind/m_stacktrace.c b/coregrind/m_stacktrace.c
index 49964b0..e7db9c5 100644
--- a/coregrind/m_stacktrace.c
+++ b/coregrind/m_stacktrace.c
@@ -81,13 +81,96 @@
 
 #if defined(VGP_x86_linux) || defined(VGP_x86_darwin)
 
+#define N_FP_CF_VERIF 1021
+// prime number so that the size of fp_CF_verif_cache is just below 4K or 8K
+// Note that this prime differs from the one chosen in
+// m_debuginfo/debuginfo.c for the cfsi cache: if two IPs collide
+// here, we expect them to not (often) also collide in the
+// cfsi cache (and vice-versa).
+
+// unwinding with fp chain is ok:
+#define FPUNWIND 0
+// there is no CFI info for this IP:
+#define NOINFO   1
+// Unwind with FP is not ok, must use CF unwind:
+#define CFUNWIND 2
+
+static Addr fp_CF_verif_cache [N_FP_CF_VERIF];
+
+/* An unwind done by following the fp chain technique can be incorrect
+   as not all frames respect the standard bp/sp ABI.
+   The CF information is now generated by default by gcc
+   (as part of the dwarf info). However, unwinding using CF information
+   is significantly slower: a slowdown of 20% has been observed
+   on a helgrind test case.
+   So, by default, the unwinding will be done using the fp chain.
+   But before accepting to unwind an IP with the fp chain, the result
+   of the unwind will be checked against the CF information.
+   This check can give 3 results:
+     FPUNWIND (0): there is CF info, and it gives the same result as fp unwind.
+       => it is assumed that future unwinds for this IP can be done
+          with the fast fp chain, without further CF checking.
+     NOINFO   (1): there is no CF info (so, fp unwind is the only doable thing).
+     CFUNWIND (2): there is CF info, but the unwind result differs.
+       => it is assumed that future unwinds for this IP must be done
+          with the CF info.
+   Of course, if each fp unwind implied a check done with a CF unwind,
+   it would just be slower => we cache the check result in an
+   array of checked Addrs.
+   The check for an IP will be stored at
+    fp_CF_verif_cache[IP % N_FP_CF_VERIF] as one of:
+                     IP ^ FPUNWIND
+                     IP ^ NOINFO
+                     IP ^ CFUNWIND
+
+   Note: we can re-use the last (ROUNDDOWN (log (N_FP_CF_VERIF))) bits
+   to store the check result, as they are guaranteed to be not significant
+   in the comparison between 2 IPs stored in fp_CF_verif_cache.
+   In other words, if two IPs differ only in the last 2 bits,
+   then they will not land in the same cache bucket.
+*/
+
+static UInt fp_CF_verif_generation = 0;
+// Our cache has to be maintained in sync with the CFI cache.
+// Each time the CFI cache is changed, its generation will be incremented.
+// We will clear our cache when our saved generation differs from
+// the CFI cache generation.
+
 UInt VG_(get_StackTrace_wrk) ( ThreadId tid_if_known,
                                /*OUT*/Addr* ips, UInt max_n_ips,
                                /*OUT*/Addr* sps, /*OUT*/Addr* fps,
                                UnwindStartRegs* startRegs,
                                Addr fp_max_orig )
 {
-   Bool  debug = False;
+   const Bool   debug = False; // VG_(debugLog_getLevel) () > 3; // True;
+   const HChar* unwind_case; // used when debug is True.
+   // Debugging this function is not straightforward.
+   // Here is the easiest way I have found:
+   // 1. Change the above to True.
+   // 2. Start your program under Valgrind with --tool=none --vgdb-error=0
+   // 3. Use GDB/vgdb to put a breakpoint where you want to debug the stacktrace
+   // 4. Continue till breakpoint is encountered
+   // 5. From GDB, use 'monitor v.info scheduler' and examine the unwind traces.
+   //    You might have to run 'monitor v.info scheduler' twice to see
+   //    the effect of caching the results of the verification.
+   //    You can also modify the debug level dynamically by using
+   //    'monitor v.set debuglog 4'.
+
+   const Bool do_stats = False; // compute and output some stats regularly.
+   static struct {
+      UInt nr; // nr of stacktraces computed
+      UInt nf; // nr of frames computed
+      UInt Ca; // unwind for which cache indicates CFUnwind must be used.
+      UInt FF; // unwind for which cache indicates FPUnwind can be used.
+      UInt Cf; // unwind at end of stack+store CFUNWIND (xip not end of stack).
+      UInt Fw; // unwind at end of stack+store FPUNWIND
+      UInt FO; // unwind + store FPUNWIND
+      UInt CF; // unwind + store CFUNWIND. Details below.
+      UInt xi; UInt xs; UInt xb; // register(s) which caused a 'store CFUNWIND'.
+      UInt Ck; // unwind fp invalid+store FPUNWIND
+      UInt MS; // microsoft unwind
+   } stats;
+   
    Int   i;
    Addr  fp_max;
    UInt  n_found = 0;
@@ -96,6 +179,9 @@
    vg_assert(sizeof(Addr) == sizeof(UWord));
    vg_assert(sizeof(Addr) == sizeof(void*));
 
+   D3UnwindRegs fpverif_uregs; // result of CF unwind, used to check fp unwind.
+   Addr xip_verified; // xip for which we have computed fpverif_uregs.
+
    D3UnwindRegs uregs;
    uregs.xip = (Addr)startRegs->r_pc;
    uregs.xsp = (Addr)startRegs->r_sp;
@@ -135,12 +221,11 @@
    } 
 #  endif
 
-   /* fp is %ebp.  sp is %esp.  ip is %eip. */
+   if (UNLIKELY (fp_CF_verif_generation != VG_(CF_info_generation)())) {
+      fp_CF_verif_generation = VG_(CF_info_generation)();
+      VG_(memset)(&fp_CF_verif_cache, 0, sizeof(fp_CF_verif_cache));
+   }
 
-   if (sps) sps[0] = uregs.xsp;
-   if (fps) fps[0] = uregs.xbp;
-   ips[0] = uregs.xip;
-   i = 1;
 
    /* Loop unwinding the stack. Note that the IP value we get on
     * each pass (whether from CFI info or a stack frame) is a
@@ -157,20 +242,72 @@
     * a tail call occurs and we wind up using the CFI info for the
     * next function which is completely wrong.
     */
+   if (sps) sps[0] = uregs.xsp;
+   if (fps) fps[0] = uregs.xbp;
+   ips[0] = uregs.xip;
+   i = 1;
+   if (do_stats) stats.nr++;
+
    while (True) {
 
       if (i >= max_n_ips)
          break;
 
-      /* Try to derive a new (ip,sp,fp) triple from the current
-         set. */
+      UWord hash = uregs.xip % N_FP_CF_VERIF;
+      Addr xip_verif = uregs.xip ^ fp_CF_verif_cache [hash];
+      if (debug)
+         VG_(printf)("     uregs.xip 0x%08lx xip_verif[0x%08lx]\n",
+                     uregs.xip, xip_verif);
+      // If xip is in cache, then xip_verif will be <= CFUNWIND.
+      // Otherwise, if not in cache, xip_verif will be > CFUNWIND.
 
-      /* On x86, first try the old-fashioned method of following the
-         %ebp-chain.  Code which doesn't use this (that is, compiled
-         with -fomit-frame-pointer) is not ABI compliant and so
-         relatively rare.  Besides, trying the CFI first almost always
-         fails, and is expensive. */
-      /* Deal with frames resulting from functions which begin "pushl%
+      /* Try to derive a new (ip,sp,fp) triple from the current set. */
+
+      /* Do we have to do CFI unwinding?
+         We do CFI unwinding if one of the following conditions holds:
+         a. fp_CF_verif_cache contains xip but indicates CFUNWIND must
+            be done (i.e. fp unwind check failed when we did the first
+            unwind for this IP).
+         b. fp_CF_verif_cache does not contain xip.
+            We will try CFI unwinding in fpverif_uregs and compare with
+            FP unwind result to insert xip in the cache with the correct
+            indicator. */
+      if (UNLIKELY(xip_verif >= CFUNWIND)) {
+         if (xip_verif == CFUNWIND) {
+            /* case a : do "real" cfi unwind */
+            if ( VG_(use_CF_info)( &uregs, fp_min, fp_max ) ) {
+               if (debug) unwind_case = "Ca";
+               if (do_stats) stats.Ca++;
+               goto unwind_done;
+            }
+            /* ??? The cache indicates we have to do CFI unwind (so, we
+             previously found CFI info, and the fp unwind check failed).
+             Now, we just failed with CFI.  So, sometimes we succeed,
+             sometimes we fail.  No idea what is going on =>
+             clean up the cache entry and fall back to fp unwind (this
+             time). */
+            fp_CF_verif_cache [hash] = 0;
+            if (debug) VG_(printf)("     cache reset as CFI ok then nok\n");
+            //??? stats
+            xip_verif = NOINFO;
+         } else {
+            /* case b : do "verif" cfi unwind in fpverif_uregs */
+            fpverif_uregs = uregs;
+            xip_verified = uregs.xip;
+            if ( !VG_(use_CF_info)( &fpverif_uregs, fp_min, fp_max ) ) {
+               fp_CF_verif_cache [hash] = uregs.xip ^ NOINFO;
+               if (debug) VG_(printf)("     cache NOINFO fpverif_uregs\n");
+               xip_verif = NOINFO;
+            }
+         }
+      }
+
+      /* On x86, try the old-fashioned method of following the
+         %ebp-chain.  This can be done if the fp_CF_verif_cache for xip
+         indicates fp unwind is ok. This must be done if the cache indicates
+         there is no info. This is also done to determine what to put in the
+         cache if xip was not in the cache. */
+      /* This deals with frames resulting from functions which begin "pushl%
          ebp ; movl %esp, %ebp" which is the ABI-mandated preamble. */
       if (fp_min <= uregs.xbp &&
           uregs.xbp <= fp_max - 1 * sizeof(UWord)/*see comment below*/)
@@ -183,54 +320,117 @@
          // entries [1] and above in a stack trace, by subtracting 1 from
          // them.  Hence stacks that used to end with a zero value now end in
          // -1 and so we must detect that too.
-         if (0 == uregs.xip || 1 == uregs.xip) break;
+         if (0 == uregs.xip || 1 == uregs.xip) {
+            if (xip_verif > CFUNWIND) {
+               // Check if we obtain the same result with fp unwind.
+               // If same result, then mark xip as fp unwindable
+               if (uregs.xip == fpverif_uregs.xip) {
+                  fp_CF_verif_cache [hash] = xip_verified ^ FPUNWIND;
+                  if (debug) VG_(printf)("     cache FPUNWIND 0\n");
+                  unwind_case = "Fw";
+                  if (do_stats) stats.Fw++;
+                  break;
+               } else {
+                  fp_CF_verif_cache [hash] = xip_verified ^ CFUNWIND;
+                  uregs = fpverif_uregs;
+                  if (debug) VG_(printf)("     cache CFUNWIND 0\n");
+                  unwind_case = "Cf";
+                  if (do_stats) stats.Cf++;
+                  goto unwind_done;
+               }
+            } else {
+               // end of stack => out of the loop.
+               break;
+            }
+         }
+
          uregs.xsp = uregs.xbp + sizeof(Addr) /*saved %ebp*/ 
                                + sizeof(Addr) /*ra*/;
          uregs.xbp = (((UWord*)uregs.xbp)[0]);
-         if (sps) sps[i] = uregs.xsp;
-         if (fps) fps[i] = uregs.xbp;
-         ips[i++] = uregs.xip - 1; /* -1: refer to calling insn, not the RA */
-         if (debug)
-            VG_(printf)("     ipsF[%d]=0x%08lx\n", i-1, ips[i-1]);
-         uregs.xip = uregs.xip - 1;
-            /* as per comment at the head of this loop */
-         if (UNLIKELY(cmrf > 0)) {RECURSIVE_MERGE(cmrf,ips,i);};
-         continue;
-      }
-
-      /* That didn't work out, so see if there is any CF info to hand
-         which can be used. */
-      if ( VG_(use_CF_info)( &uregs, fp_min, fp_max ) ) {
-         if (0 == uregs.xip || 1 == uregs.xip) break;
-         if (sps) sps[i] = uregs.xsp;
-         if (fps) fps[i] = uregs.xbp;
-         ips[i++] = uregs.xip - 1; /* -1: refer to calling insn, not the RA */
-         if (debug)
-            VG_(printf)("     ipsC[%d]=0x%08lx\n", i-1, ips[i-1]);
-         uregs.xip = uregs.xip - 1;
-            /* as per comment at the head of this loop */
-         if (UNLIKELY(cmrf > 0)) {RECURSIVE_MERGE(cmrf,ips,i);};
-         continue;
+         if (xip_verif > CFUNWIND) {
+            if (uregs.xip == fpverif_uregs.xip
+                && uregs.xsp == fpverif_uregs.xsp
+                && uregs.xbp == fpverif_uregs.xbp) {
+               fp_CF_verif_cache [hash] = xip_verified ^ FPUNWIND;
+               if (debug) VG_(printf)("     cache FPUNWIND >2\n");
+               if (debug) unwind_case = "FO";
+               if (do_stats) stats.FO++;
+            } else {
+               fp_CF_verif_cache [hash] = xip_verified ^ CFUNWIND;
+               if (debug) VG_(printf)("     cache CFUNWIND >2\n");
+               if (do_stats && uregs.xip != fpverif_uregs.xip) stats.xi++;
+               if (do_stats && uregs.xsp != fpverif_uregs.xsp) stats.xs++;
+               if (do_stats && uregs.xbp != fpverif_uregs.xbp) stats.xb++;
+               uregs = fpverif_uregs;
+               if (debug) unwind_case = "CF";
+               if (do_stats) stats.CF++;
+            }
+         } else {
+            if (debug) unwind_case = "FF";
+            if (do_stats) stats.FF++;
+         }
+         goto unwind_done;
+      } else {
+         // fp unwind has failed.
+         // If we were checking the fp unwind against the cfi unwind,
+         // we record in the cache that fp unwind cannot be done, and that
+         // cfi unwind must be used.
+         if (xip_verif > CFUNWIND) {
+            // We know that fpverif_uregs contains valid information,
+            // as a failed cf unwind would have put NOINFO in xip_verif.
+            fp_CF_verif_cache [hash] = xip_verified ^ CFUNWIND;
+            if (debug) VG_(printf)("     cache CFUNWIND as fp failed\n");
+            uregs = fpverif_uregs;
+            if (debug) unwind_case = "Ck";
+            if (do_stats) stats.Ck++;
+            goto unwind_done;
+         }
+         // xip_verif is FPUNWIND or NOINFO.
+         // We failed the cfi unwind and/or the fp unwind.
+         // => fall back to FPO info.
       }
 
       /* And, similarly, try for MSVC FPO unwind info. */
       if ( VG_(use_FPO_info)( &uregs.xip, &uregs.xsp, &uregs.xbp,
                               fp_min, fp_max ) ) {
-         if (0 == uregs.xip || 1 == uregs.xip) break;
-         if (sps) sps[i] = uregs.xsp;
-         if (fps) fps[i] = uregs.xbp;
-         ips[i++] = uregs.xip;
-         if (debug)
-            VG_(printf)("     ipsC[%d]=0x%08lx\n", i-1, ips[i-1]);
-         uregs.xip = uregs.xip - 1;
-         if (UNLIKELY(cmrf > 0)) {RECURSIVE_MERGE(cmrf,ips,i);};
-         continue;
+         if (debug) unwind_case = "MS";
+         if (do_stats) stats.MS++;
+         goto unwind_done;
       }
 
       /* No luck.  We have to give up. */
       break;
+
+   unwind_done:
+      /* Add a frame in ips/sps/fps */
+      /* fp is %ebp.  sp is %esp.  ip is %eip. */
+      if (0 == uregs.xip || 1 == uregs.xip) break;
+      if (sps) sps[i] = uregs.xsp;
+      if (fps) fps[i] = uregs.xbp;
+      ips[i++] = uregs.xip - 1;
+      /* -1: refer to calling insn, not the RA */
+      if (debug)
+         VG_(printf)("     ips%s[%d]=0x%08lx\n", unwind_case, i-1, ips[i-1]);
+      uregs.xip = uregs.xip - 1;
+      /* as per comment at the head of this loop */
+      if (UNLIKELY(cmrf > 0)) {RECURSIVE_MERGE(cmrf,ips,i);};
    }
 
+   if (do_stats) stats.nf += i;
+   if (do_stats && stats.nr % 10000 == 0) {
+     VG_(printf)("nr %u nf %u "
+                 "Ca %u FF %u "
+                 "Cf %u "
+                 "Fw %u FO %u "
+                 "CF %u (xi %u xs %u xb %u) "
+                 "Ck %u MS %u\n",
+                 stats.nr, stats.nf,
+                 stats.Ca, stats.FF, 
+                 stats.Cf,
+                 stats.Fw, stats.FO, 
+                 stats.CF, stats.xi, stats.xs, stats.xb,
+                 stats.Ck, stats.MS);
+   }
    n_found = i;
    return n_found;
 }
diff --git a/coregrind/pub_core_debuginfo.h b/coregrind/pub_core_debuginfo.h
index 95bcd5b..bcf1a74 100644
--- a/coregrind/pub_core_debuginfo.h
+++ b/coregrind/pub_core_debuginfo.h
@@ -132,6 +132,13 @@
                                Addr min_accessible,
                                Addr max_accessible );
 
+/* Returns the "generation" of the CF info.
+   Each time some debuginfo is changed (e.g. loaded or unloaded),
+   the value returned by VG_(CF_info_generation) will be incremented.
+   This can be used to flush cached information derived from the CF info. */
+extern UInt VG_(CF_info_generation) (void);
+
+
 
 /* Use MSVC FPO data to do one step of stack unwinding. */
 extern Bool VG_(use_FPO_info) ( /*MOD*/Addr* ipP,