Handle intrinsic 16-bit shaders smarter in blitRect.
Use the expand16 format to speed up blending.
Fix a typo in the 4444 loops: the odd-pixel tail tested `color & 1` instead of `count & 1`.
diff --git a/src/core/SkBlitter_4444.cpp b/src/core/SkBlitter_4444.cpp
index cce94c5..736f8c3 100644
--- a/src/core/SkBlitter_4444.cpp
+++ b/src/core/SkBlitter_4444.cpp
@@ -112,7 +112,7 @@
         *dst = other + SkAlphaMulQ4(*dst, invScale);
         dst++;
     }
-    if (color & 1) {
+    if (count & 1) {
         *dst = color + SkAlphaMulQ4(*dst, invScale);
     }
 }
@@ -134,7 +134,7 @@
         tmp = SkExpand_4444(*dst) * invScale;
         *dst++ = SkCompact_4444((other + tmp) >> 4);
     }
-    if (color & 1) {
+    if (count & 1) {
         tmp = SkExpand_4444(*dst) * invScale;
         *dst = SkCompact_4444((color + tmp) >> 4);
     }
diff --git a/src/core/SkBlitter_RGB16.cpp b/src/core/SkBlitter_RGB16.cpp
index 955f4c1..3111655 100644
--- a/src/core/SkBlitter_RGB16.cpp
+++ b/src/core/SkBlitter_RGB16.cpp
@@ -51,7 +51,7 @@
 ///////////////////////////////////////////////////////////////////////////////
 
 SkRGB16_Black_Blitter::SkRGB16_Black_Blitter(const SkBitmap& device, const SkPaint& paint)
-    : SkRGB16_Blitter(device, paint) {
+    : INHERITED(device, paint) {
     SkASSERT(paint.getShader() == NULL);
     SkASSERT(paint.getColorFilter() == NULL);
     SkASSERT(paint.getXfermode() == NULL);
@@ -342,17 +342,31 @@
     return NULL;
 }
 
+static uint32_t pmcolor_to_expand16(SkPMColor c) {  // pack c's R, G and B into one 32-bit word with G on top; alpha is not included
+    unsigned r = SkGetPackedR32(c);
+    unsigned g = SkGetPackedG32(c);
+    unsigned b = SkGetPackedB32(c);
+    return (g << 24) | (r << 13) | (b << 2);  // NOTE(review): presumably the SkExpand_rgb_16 layout pre-scaled by 32, since the caller adds this to a scaled expanded dst and shifts >> 5 -- confirm
+}
+
+static inline void blend32_16_row(SkPMColor src, uint16_t dst[], int count) {  // blend one 32-bit src color into count 16-bit dst pixels, in place
+    SkASSERT(count > 0);  // the do/while below always executes at least once
+    uint32_t src_expand = pmcolor_to_expand16(src);  // loop-invariant, so computed once per row
+    unsigned scale = SkAlpha255To256(0xFF - SkGetPackedA32(src)) >> 3;  // inverse of src alpha, reduced by 3 bits (presumably to 0..32 -- confirm)
+    do {
+        uint32_t dst_expand = SkExpand_rgb_16(*dst) * scale;  // one multiply on the expanded dst pixel (presumably scales all channels at once -- confirm)
+        *dst = SkCompact_rgb_16((src_expand + dst_expand) >> 5);  // add src, shift out the 5 scale bits, convert back to 16-bit
+        dst += 1;
+    } while (--count != 0);
+}
+
 void SkRGB16_Blitter::blitH(int x, int y, int width) SK_RESTRICT {
     SkASSERT(width > 0);
     SkASSERT(x + width <= fDevice.width());
     uint16_t* SK_RESTRICT device = fDevice.getAddr16(x, y);
 
     // TODO: respect fDoDither
-    SkPMColor src32 = fSrcColor32;
-    do {
-        *device = SkSrcOver32To16(src32, *device);
-        device += 1;
-    } while (--width != 0);
+    blend32_16_row(fSrcColor32, device, width);
 }
 
 void SkRGB16_Blitter::blitAntiH(int x, int y,
@@ -478,9 +492,7 @@
     SkPMColor src32 = fSrcColor32;
 
     while (--height >= 0) {
-        for (int i = width - 1; i >= 0; --i) {
-            device[i] = SkSrcOver32To16(src32, device[i]);
-        }
+        blend32_16_row(src32, device, width);
         device = (uint16_t*)((char*)device + deviceRB);
     }
 }
@@ -509,6 +521,53 @@
     }
 }
 
+void SkRGB16_Shader16_Blitter::blitRect(int x, int y, int width, int height) {  // fill the width x height rect at (x, y) from the shader's 16-bit spans
+    SkShader*   shader = fShader;
+    uint16_t*   dst = fDevice.getAddr16(x, y);
+    size_t      dstRB = fDevice.rowBytes();
+    int         alpha = shader->getSpan16Alpha();
+
+    // when kConstInY_Flag is set, both paths below shade a single row and reuse it for every line
+
+    if (0xFF == alpha) {  // span alpha is 0xFF: shader output is written to the device without blending
+        if (fShaderFlags & SkShader::kConstInY_Flag) {
+            // have the shader write the first row directly into the device
+            shader->shadeSpan16(x, y, dst, width);
+            // and then just memcpy that row onto each subsequent line
+            if (--height > 0) {
+                const uint16_t* orig = dst;
+                do {
+                    dst = (uint16_t*)((char*)dst + dstRB);  // rowBytes is in bytes, hence the char* arithmetic
+                    memcpy(dst, orig, width << 1);  // width pixels, 2 bytes per uint16_t pixel
+                } while (--height);
+            }
+        } else {    // shader varies in Y, so shadeSpan16 must be called for every line
+            do {
+                shader->shadeSpan16(x, y, dst, width);
+                y += 1;
+                dst = (uint16_t*)((char*)dst + dstRB);
+            } while (--height);  // NOTE(review): this and the loops below assume height > 0 on entry
+        }
+    } else {
+        int scale = SkAlpha255To256(alpha);
+        uint16_t* span16 = (uint16_t*)fBuffer;  // scratch row: the shader writes here, then it is blended into dst
+        if (fShaderFlags & SkShader::kConstInY_Flag) {
+            shader->shadeSpan16(x, y, span16, width);  // shade once, outside the loop
+            do {
+                SkBlendRGB16(span16, dst, scale, width);
+                dst = (uint16_t*)((char*)dst + dstRB);
+            } while (--height);
+        } else {
+            do {
+                shader->shadeSpan16(x, y, span16, width);  // re-shade for each y
+                SkBlendRGB16(span16, dst, scale, width);
+                y += 1;
+                dst = (uint16_t*)((char*)dst + dstRB);
+            } while (--height);
+        }
+    }
+}
+
 void SkRGB16_Shader16_Blitter::blitAntiH(int x, int y,
                                          const SkAlpha* SK_RESTRICT antialias,
                                          const int16_t* SK_RESTRICT runs)
diff --git a/src/core/SkCoreBlitters.h b/src/core/SkCoreBlitters.h
index d45467f..5e34685 100644
--- a/src/core/SkCoreBlitters.h
+++ b/src/core/SkCoreBlitters.h
@@ -112,25 +112,25 @@
     typedef SkRasterBlitter INHERITED;
 };
 
-class SkARGB32_Black_Blitter : public SkARGB32_Blitter {
+class SkARGB32_Opaque_Blitter : public SkARGB32_Blitter {
 public:
-    SkARGB32_Black_Blitter(const SkBitmap& device, const SkPaint& paint)
-        : SkARGB32_Blitter(device, paint) {}
+    SkARGB32_Opaque_Blitter(const SkBitmap& device, const SkPaint& paint)
+        : INHERITED(device, paint) { SkASSERT(paint.getAlpha() == 0xFF); }
     virtual void blitMask(const SkMask&, const SkIRect&);
-    virtual void blitAntiH(int x, int y, const SkAlpha antialias[], const int16_t runs[]);
-    
+
 private:
     typedef SkARGB32_Blitter INHERITED;
 };
 
-class SkARGB32_Opaque_Blitter : public SkARGB32_Blitter {
+class SkARGB32_Black_Blitter : public SkARGB32_Opaque_Blitter {
 public:
-    SkARGB32_Opaque_Blitter(const SkBitmap& device, const SkPaint& paint)
-        : SkARGB32_Blitter(device, paint) { SkASSERT(paint.getAlpha() == 0xFF); }
+    SkARGB32_Black_Blitter(const SkBitmap& device, const SkPaint& paint)
+        : INHERITED(device, paint) {}
     virtual void blitMask(const SkMask&, const SkIRect&);
+    virtual void blitAntiH(int x, int y, const SkAlpha antialias[], const int16_t runs[]);
 
 private:
-    typedef SkARGB32_Blitter INHERITED;
+    typedef SkARGB32_Opaque_Blitter INHERITED;
 };
 
 class SkARGB32_Shader_Blitter : public SkShaderBlitter {
@@ -188,14 +188,14 @@
     typedef SkRGB16_Blitter INHERITED;
 };
 
-class SkRGB16_Black_Blitter : public SkRGB16_Blitter {
+class SkRGB16_Black_Blitter : public SkRGB16_Opaque_Blitter {
 public:
     SkRGB16_Black_Blitter(const SkBitmap& device, const SkPaint& paint);
     virtual void blitMask(const SkMask&, const SkIRect&);
     virtual void blitAntiH(int x, int y, const SkAlpha antialias[], const int16_t runs[]);
 
 private:
-    typedef SkRGB16_Blitter INHERITED;
+    typedef SkRGB16_Opaque_Blitter INHERITED;
 };
 
 class SkRGB16_Shader_Blitter : public SkShaderBlitter {
@@ -224,6 +224,7 @@
     SkRGB16_Shader16_Blitter(const SkBitmap& device, const SkPaint& paint);
     virtual void blitH(int x, int y, int width);
     virtual void blitAntiH(int x, int y, const SkAlpha antialias[], const int16_t runs[]);
+    virtual void blitRect(int x, int y, int width, int height);
     
 private:
     typedef SkRGB16_Shader_Blitter INHERITED;
diff --git a/src/core/SkPictureRecord.cpp b/src/core/SkPictureRecord.cpp
index 15ae562..c7f2c6c 100644
--- a/src/core/SkPictureRecord.cpp
+++ b/src/core/SkPictureRecord.cpp
@@ -53,7 +53,7 @@
     if (fRestoreOffsetStack.count() == 0) {
         return;
     }
-    
+
     // patch up the clip offsets
     uint32_t restoreOffset = (uint32_t)fWriter.size();
     uint32_t offset = fRestoreOffsetStack.top();
@@ -63,7 +63,7 @@
         *peek = restoreOffset;
     }
     fRestoreOffsetStack.pop();
-    
+
     addDraw(RESTORE);
     validate();
     return this->INHERITED::restore();
diff --git a/src/utils/mac/SkCreateCGImageRef.cpp b/src/utils/mac/SkCreateCGImageRef.cpp
index be53f00..5c96e21 100644
--- a/src/utils/mac/SkCreateCGImageRef.cpp
+++ b/src/utils/mac/SkCreateCGImageRef.cpp
@@ -16,12 +16,12 @@
 static SkBitmap* prepareForImageRef(const SkBitmap& bm,
                                     size_t* bitsPerComponent,
                                     CGBitmapInfo* info) {
-#if 0
-    SkDebugf("---- %d %d %d %d\n", SK_A32_SHIFT, SK_R32_SHIFT,
-             SK_G32_SHIFT, SK_B32_SHIFT);
-#endif
+    bool upscaleTo32 = false;
 
     switch (bm.config()) {
+        case SkBitmap::kRGB_565_Config:
+            upscaleTo32 = true;
+            // fall through
         case SkBitmap::kARGB_8888_Config:
             *bitsPerComponent = 8;
 #if defined(SK_CPU_LENDIAN) && HAS_ARGB_SHIFTS(24, 0, 8, 16) \
@@ -44,11 +44,13 @@
                     kCGImageAlphaPremultipliedLast;
 #endif
             break;
+#if 0
         case SkBitmap::kRGB_565_Config:
             // doesn't see quite right. Are they thinking 1555?
             *bitsPerComponent = 5;
             *info = kCGBitmapByteOrder16Little;
             break;
+#endif
         case SkBitmap::kARGB_4444_Config:
             *bitsPerComponent = 4;
             *info = kCGBitmapByteOrder16Little | kCGImageAlphaPremultipliedLast;
@@ -57,7 +59,16 @@
             return NULL;
     }
 
-    return new SkBitmap(bm);
+    SkBitmap* copy;
+    if (upscaleTo32) {
+        copy = new SkBitmap;
+        // here we make a deep copy of the pixels, since CG won't take our
+        // 565 directly
+        bm.copyTo(copy, SkBitmap::kARGB_8888_Config);
+    } else {
+        copy = new SkBitmap(bm);
+    }
+    return copy;
 }
 
 #undef HAS_ARGB_SHIFTS