Fix crash on NULL shapes in pictures (ref/unref them via SkSafeRef/SkSafeUnref).
Fix gradient interpolation with alpha (interpolate in non-premultiplied space, premultiply last).
Optimize the index8 -> 16-bit sprite blit.
diff --git a/include/core/SkTDArray.h b/include/core/SkTDArray.h
index 4d2d7f7..5f6bbd8 100644
--- a/include/core/SkTDArray.h
+++ b/include/core/SkTDArray.h
@@ -251,7 +251,17 @@
         }
         this->reset();
     }
-    
+
+    // Calls SkSafeUnref on every element, then reset()s the array. Meant for
+    // arrays whose entries may be NULL (see the null-shape fix in this patch).
+    void safeUnrefAll() {
+        T* const end = fArray + fCount;
+        for (T* cur = fArray; cur < end; ++cur) {
+            SkSafeUnref(*cur);
+        }
+        this->reset();
+    }
+
 #ifdef SK_DEBUG
     void validate() const {
         SkASSERT((fReserve == 0 && fArray == NULL) ||
diff --git a/src/core/SkPicturePlayback.cpp b/src/core/SkPicturePlayback.cpp
index 77ac912..03cdc16 100644
--- a/src/core/SkPicturePlayback.cpp
+++ b/src/core/SkPicturePlayback.cpp
@@ -125,8 +125,9 @@
     if (fShapeCount > 0) {
         fShapes = SkNEW_ARRAY(SkShape*, fShapeCount);
         for (int i = 0; i < fShapeCount; i++) {
-            fShapes[i] = shapes[i];
-            fShapes[i]->ref();
+            SkShape* s = shapes[i];
+            SkSafeRef(s);
+            fShapes[i] = s;
         }
     }
     
@@ -205,8 +206,9 @@
     fShapeCount = src.fShapeCount;
     fShapes = SkNEW_ARRAY(SkShape*, fShapeCount);
     for (int i = 0; i < fShapeCount; i++) {
-        fShapes[i] = src.fShapes[i];
-        fShapes[i]->ref();
+        SkShape* s = src.fShapes[i];
+        SkSafeRef(s);
+        fShapes[i] = s;
     }
     
     fRegionCount = src.fRegionCount;
@@ -225,7 +227,7 @@
     fShapes = NULL;
     fRegions = NULL;
     fBitmapCount = fMatrixCount = fPaintCount = fPictureCount = 
-    fRegionCount = 0;
+    fRegionCount = fShapeCount = 0;
     
     fFactoryPlayback = NULL;
 }
diff --git a/src/core/SkPictureRecord.cpp b/src/core/SkPictureRecord.cpp
index 4778726..b908670 100644
--- a/src/core/SkPictureRecord.cpp
+++ b/src/core/SkPictureRecord.cpp
@@ -413,7 +413,7 @@
     fPaints.reset();
     fPictureRefs.unrefAll();
     fRegions.reset();
-    fShapes.unrefAll();
+    fShapes.safeUnrefAll();
     fWriter.reset();
     fHeap.reset();
     
diff --git a/src/core/SkSpriteBlitter_RGB16.cpp b/src/core/SkSpriteBlitter_RGB16.cpp
index a158637..3ccc03b 100644
--- a/src/core/SkSpriteBlitter_RGB16.cpp
+++ b/src/core/SkSpriteBlitter_RGB16.cpp
@@ -172,6 +172,68 @@
 
 ///////////////////////////////////////////////////////////////////////////////
 
+static intptr_t asint(const void* ptr) {    // address as integer, for alignment tests
+    return reinterpret_cast<intptr_t>(ptr); // direct cast; no arithmetic on a null pointer
+}
+
+// Expand `count` 8-bit indices from src through ctable into 16-bit dst pixels.
+// Rows longer than 8 read four indices per 32-bit load once src is aligned.
+static void blitrow_d16_si8(uint16_t* SK_RESTRICT dst,
+                            const uint8_t* SK_RESTRICT src, int count,
+                            const uint16_t* SK_RESTRICT ctable) {
+    if (count <= 8) {   // short row: plain loop, and a no-op when count <= 0
+        while (--count >= 0) {
+            *dst++ = ctable[*src++];
+        }
+        return;
+    }
+    // eat src until we're on a 4-byte boundary (at most 3 indices)
+    while (asint(src) & 3) {
+        *dst++ = ctable[*src++];
+        count -= 1;
+    }
+    int qcount = count >> 2;    // number of whole groups of four indices
+    SkASSERT(qcount > 0);       // count was > 8 and at most 3 were eaten above
+    const uint32_t* qsrc = reinterpret_cast<const uint32_t*>(src);
+    if (asint(dst) & 2) {       // dst not on a 4-byte boundary: 16-bit stores
+        do {
+            uint32_t s4 = *qsrc++;
+#ifdef SK_CPU_LENDIAN
+            *dst++ = ctable[s4 & 0xFF];
+            *dst++ = ctable[(s4 >> 8) & 0xFF];
+            *dst++ = ctable[(s4 >> 16) & 0xFF];
+            *dst++ = ctable[s4 >> 24];
+#else   // BENDIAN
+            *dst++ = ctable[s4 >> 24];
+            *dst++ = ctable[(s4 >> 16) & 0xFF];
+            *dst++ = ctable[(s4 >> 8) & 0xFF];
+            *dst++ = ctable[s4 & 0xFF];
+#endif
+        } while (--qcount);
+    } else {    // dst is on a 4-byte boundary: pack two pixels per 32-bit store
+        uint32_t* ddst = reinterpret_cast<uint32_t*>(dst);
+        do {
+            uint32_t s4 = *qsrc++;
+#ifdef SK_CPU_LENDIAN
+            *ddst++ = (ctable[(s4 >> 8) & 0xFF] << 16) | ctable[s4 & 0xFF];
+            *ddst++ = (ctable[s4 >> 24] << 16) | ctable[(s4 >> 16) & 0xFF];
+#else   // BENDIAN
+            *ddst++ = (ctable[s4 >> 24] << 16) | ctable[(s4 >> 16) & 0xFF];
+            *ddst++ = (ctable[(s4 >> 8) & 0xFF] << 16) | ctable[s4 & 0xFF];
+#endif
+        } while (--qcount);
+        dst = reinterpret_cast<uint16_t*>(ddst);
+    }
+    src = reinterpret_cast<const uint8_t*>(qsrc);
+    count &= 3;
+    // catch any remaining (will be < 4)
+    while (--count >= 0) {
+        *dst++ = ctable[*src++];
+    }
+}
+
+#define SkSPRITE_ROW_PROC(d, s, n, x, y)    blitrow_d16_si8(d, s, n, ctable)
+
 #define SkSPRITE_CLASSNAME                  Sprite_D16_SIndex8_Opaque
 #define SkSPRITE_ARGS
 #define SkSPRITE_FIELDS
diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp
index ec42e43..8d1531a 100644
--- a/src/core/SkXfermode.cpp
+++ b/src/core/SkXfermode.cpp
@@ -45,7 +45,7 @@
 }
 #endif
 
-static unsigned saturated_add(unsigned a, unsigned b) {
+static inline unsigned saturated_add(unsigned a, unsigned b) {
     SkASSERT(a <= 255);
     SkASSERT(b <= 255);
     unsigned sum = a + b;
@@ -55,7 +55,7 @@
     return sum;
 }
 
-static int clamp_signed_byte(int n) {
+static inline int clamp_signed_byte(int n) {
     if (n < 0) {
         n = 0;
     } else if (n > 255) {
@@ -64,7 +64,7 @@
     return n;
 }
 
-static int clamp_div255round(int prod) {
+static inline int clamp_div255round(int prod) {
     if (prod <= 0) {
         return 0;
     } else if (prod >= 255*255) {
@@ -74,7 +74,7 @@
     }
 }
 
-static int clamp_max(int value, int max) {
+static inline int clamp_max(int value, int max) {
     if (value > max) {
         value = max;
     }
@@ -470,10 +470,10 @@
 
 // kPlus_Mode
 static SkPMColor plus_modeproc(SkPMColor src, SkPMColor dst) {
-    unsigned a = saturated_add(SkGetPackedA32(src), SkGetPackedA32(dst));
-    unsigned r = saturated_add(SkGetPackedR32(src), SkGetPackedR32(dst));
-    unsigned g = saturated_add(SkGetPackedG32(src), SkGetPackedG32(dst));
     unsigned b = saturated_add(SkGetPackedB32(src), SkGetPackedB32(dst));
+    unsigned g = saturated_add(SkGetPackedG32(src), SkGetPackedG32(dst));
+    unsigned r = saturated_add(SkGetPackedR32(src), SkGetPackedR32(dst));
+    unsigned a = saturated_add(SkGetPackedA32(src), SkGetPackedA32(dst));
     return SkPackARGB32(a, r, g, b);
 }
 
diff --git a/src/effects/SkGradientShader.cpp b/src/effects/SkGradientShader.cpp
index e1a92ba..635e0e5 100644
--- a/src/effects/SkGradientShader.cpp
+++ b/src/effects/SkGradientShader.cpp
@@ -106,7 +106,6 @@
     SkMatrix    fPtsToUnit;     // set by subclass
     SkMatrix    fDstToIndex;
     SkMatrix::MapXYProc fDstToIndexProc;
-    SkPMColor*  fARGB32;
     TileMode    fTileMode;
     TileProc    fTileProc;
     int         fColorCount;
@@ -136,7 +135,7 @@
     enum {
         kColorStorageCount = 4, // more than this many colors, and we'll use sk_malloc for the space
 
-        kStorageSize = kColorStorageCount * (sizeof(SkColor) + sizeof(SkPMColor) + sizeof(Rec))
+        kStorageSize = kColorStorageCount * (sizeof(SkColor) + sizeof(Rec))
     };
     SkColor     fStorage[(kStorageSize + 3) >> 2];
     SkColor*    fOrigColors;
@@ -200,7 +199,7 @@
     }
 
     if (fColorCount > kColorStorageCount) {
-        size_t size = sizeof(SkColor) + sizeof(SkPMColor) + sizeof(Rec);
+        size_t size = sizeof(SkColor) + sizeof(Rec);
         fOrigColors = reinterpret_cast<SkColor*>(
                                         sk_malloc_throw(size * fColorCount));
     }
@@ -221,10 +220,7 @@
         }
     }
 
-    // our premul colors point to the 2nd half of the array
-    // these are assigned each time in setContext
-    fARGB32 = fOrigColors + fColorCount;
-    fRecs = (Rec*)(fARGB32 + fColorCount);
+    fRecs = (Rec*)(fOrigColors + fColorCount);
     if (fColorCount > 2) {
         Rec* recs = fRecs;
         recs->fPos = 0;
@@ -297,11 +293,10 @@
         fOrigColors = fStorage;
     }
     buffer.read(fOrigColors, colorCount * sizeof(SkColor));
-    fARGB32 = fOrigColors + colorCount;
 
     fTileMode = (TileMode)buffer.readU8();
     fTileProc = gTileProcs[fTileMode];
-    fRecs = (Rec*)(fARGB32 + colorCount);
+    fRecs = (Rec*)(fOrigColors + colorCount);
     if (colorCount > 2) {
         Rec* recs = fRecs;
         recs[0].fPos = 0;
@@ -363,15 +358,12 @@
     unsigned paintAlpha = this->getPaintAlpha();
     unsigned colorAlpha = 0xFF;
 
+    // FIXME: record colorAlpha in constructor, since this is not affected
+    // by setContext()
     for (int i = 0; i < fColorCount; i++) {
         SkColor src = fOrigColors[i];
         unsigned sa = SkColorGetA(src);
         colorAlpha &= sa;
-        
-        // now modulate it by the paint for our resulting ARGB32 array
-        sa = SkMulDiv255Round(sa, paintAlpha);
-        fARGB32[i] = SkPreMultiplyARGB(sa, SkColorGetR(src), SkColorGetG(src),
-                                       SkColorGetB(src));
     }
 
     fFlags = this->INHERITED::getFlags();
@@ -466,19 +458,24 @@
     } while (--count != 0);
 }
 
-static void build_32bit_cache(SkPMColor cache[], SkPMColor c0, SkPMColor c1,
-                              int count) {
+static void build_32bit_cache(SkPMColor cache[], SkColor c0, SkColor c1,
+                              int count, U8CPU paintAlpha) {
     SkASSERT(count > 1);
 
-    SkFixed a = SkGetPackedA32(c0);
-    SkFixed r = SkGetPackedR32(c0);
-    SkFixed g = SkGetPackedG32(c0);
-    SkFixed b = SkGetPackedB32(c0);
+    // need to apply paintAlpha to our two endpoints
+    SkFixed a = SkMulDiv255Round(SkColorGetA(c0), paintAlpha);
+    SkFixed da;
+    {
+        int tmp = SkMulDiv255Round(SkColorGetA(c1), paintAlpha);
+        da = SkIntToFixed(tmp - a) / (count - 1);
+    }
 
-    SkFixed da = SkIntToFixed(SkGetPackedA32(c1) - a) / (count - 1);
-    SkFixed dr = SkIntToFixed(SkGetPackedR32(c1) - r) / (count - 1);
-    SkFixed dg = SkIntToFixed(SkGetPackedG32(c1) - g) / (count - 1);
-    SkFixed db = SkIntToFixed(SkGetPackedB32(c1) - b) / (count - 1);
+    SkFixed r = SkColorGetR(c0);
+    SkFixed g = SkColorGetG(c0);
+    SkFixed b = SkColorGetB(c0);
+    SkFixed dr = SkIntToFixed(SkColorGetR(c1) - r) / (count - 1);
+    SkFixed dg = SkIntToFixed(SkColorGetG(c1) - g) / (count - 1);
+    SkFixed db = SkIntToFixed(SkColorGetB(c1) - b) / (count - 1);
 
     a = SkIntToFixed(a) + 0x8000;
     r = SkIntToFixed(r) + 0x8000;
@@ -486,7 +483,7 @@
     b = SkIntToFixed(b) + 0x8000;
 
     do {
-        *cache++ = SkPackARGB32(a >> 16, r >> 16, g >> 16, b >> 16);
+        *cache++ = SkPreMultiplyARGB(a >> 16, r >> 16, g >> 16, b >> 16);
         a += da;
         r += dr;
         g += dg;
@@ -559,7 +556,8 @@
 
         fCache32 = fCache32Storage;
         if (fColorCount == 2) {
-            build_32bit_cache(fCache32, fARGB32[0], fARGB32[1], kCache32Count);
+            build_32bit_cache(fCache32, fOrigColors[0], fOrigColors[1],
+                              kCache32Count, fCacheAlpha);
         } else {
             Rec* rec = fRecs;
             int prevIndex = 0;
@@ -568,7 +566,9 @@
                 SkASSERT(nextIndex < kCache32Count);
 
                 if (nextIndex > prevIndex)
-                    build_32bit_cache(fCache32 + prevIndex, fARGB32[i-1], fARGB32[i], nextIndex - prevIndex + 1);
+                    build_32bit_cache(fCache32 + prevIndex, fOrigColors[i-1],
+                                      fOrigColors[i],
+                                      nextIndex - prevIndex + 1, fCacheAlpha);
                 prevIndex = nextIndex;
             }
             SkASSERT(prevIndex == kCache32Count - 1);
@@ -1563,14 +1563,8 @@
     }
     EXPAND_1_COLOR(colorCount);
 
-    SkScalar posStorage[2];
-    if (colorCount == 2 && pos == NULL) {
-        posStorage[0] = SK_Scalar1/4;
-        posStorage[1] = 3*SK_Scalar1/4;
-        pos = posStorage;
-    }
-
-    return SkNEW_ARGS(Linear_Gradient, (pts, colors, pos, colorCount, mode, mapper));
+    return SkNEW_ARGS(Linear_Gradient,
+                      (pts, colors, pos, colorCount, mode, mapper));
 }
 
 SkShader* SkGradientShader::CreateRadial(   const SkPoint& center, SkScalar radius,
@@ -1582,7 +1576,8 @@
     }
     EXPAND_1_COLOR(colorCount);
 
-    return SkNEW_ARGS(Radial_Gradient, (center, radius, colors, pos, colorCount, mode, mapper));
+    return SkNEW_ARGS(Radial_Gradient,
+                      (center, radius, colors, pos, colorCount, mode, mapper));
 }
 
 SkShader* SkGradientShader::CreateSweep(SkScalar cx, SkScalar cy,