up to 6% faster encoding with the clang compiler

The gain comes mostly from revamping the main loop of GetResidualCost()
and avoiding some branches: the early-out for zero coefficients is
replaced by a branch-free mask.

Change-Id: Ib05763e18a6bf46c82dc3d5d1d8eb65e99474207
diff --git a/src/enc/frame.c b/src/enc/frame.c
index 4624d91..2963ef6 100644
--- a/src/enc/frame.c
+++ b/src/enc/frame.c
@@ -292,31 +292,20 @@
   if (res->last < 0) {
     return VP8BitCost(0, p0);
   }
-  cost = 0;
-  while (n < res->last) {
-    int v = res->coeffs[n];
+  cost = VP8BitCost(1, p0);
+  for (; n < res->last; ++n) {
+    const int v = abs(res->coeffs[n]);
     const int b = VP8EncBands[n + 1];
-    ++n;
-    if (v == 0) {
-      // short-case for VP8LevelCost(t, 0) (note: VP8LevelFixedCosts[0] == 0):
-      cost += t[0];
-      t = res->cost[b][0];
-      continue;
-    }
-    v = abs(v);
-    cost += VP8BitCost(1, p0);
+    const int ctx = (v >= 2) ? 2 : v;
     cost += VP8LevelCost(t, v);
-    {
-      const int ctx = (v == 1) ? 1 : 2;
-      p0 = res->prob[b][ctx][0];
-      t = res->cost[b][ctx];
-    }
+    t = res->cost[b][ctx];
+    // the masking trick is faster than "if (v) cost += ..." with clang
+    cost += (v ? ~0U : 0) & VP8BitCost(1, res->prob[b][ctx][0]);
   }
   // Last coefficient is always non-zero
   {
     const int v = abs(res->coeffs[n]);
     assert(v != 0);
-    cost += VP8BitCost(1, p0);
     cost += VP8LevelCost(t, v);
     if (n < 15) {
       const int b = VP8EncBands[n + 1];