src/opts/SkBlitRow_opts_SSE2.cpp - platform/external/skia - Git at Google

 /*
  * Copyright 2012 The Android Open Source Project
  *
  * Use of this source code is governed by a BSD-style license that can be
  * found in the LICENSE file.
  */


 #include "SkBlitRow_opts_SSE2.h"
 #include "SkColorPriv.h"
 #include "SkUtils.h"

 #include <emmintrin.h>

 /* SSE2 version of S32_Blend_BlitRow32()
  * portable version is in core/SkBlitRow_D32.cpp
  *
  * Blends count pixels of src into dst with a single global alpha:
  *     dst = ((src * src_scale) >> 8) + ((dst * dst_scale) >> 8)   (per channel)
  * where src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale.
  *
  * dst    destination pixels (read and written); asserted to be at least
  *        4-byte aligned when the SIMD path is taken.
  * src    source pixels; the SIMD loop reads them with unaligned loads.
  * count  number of pixels; values <= 0 are a no-op.
  * alpha  global alpha, asserted to be <= 255.
  */
 void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {
     SkASSERT(alpha <= 255);
     if (count <= 0) {
         return;
     }

     uint32_t src_scale = SkAlpha255To256(alpha);
     uint32_t dst_scale = 256 - src_scale;

     // The SIMD loop stores (scale << 8) in 16-bit lanes, so a scale of 256
     // would be truncated to 0 and the vector pixels would disagree with the
     // scalar head/tail pixels. When either scale does not fit in 8 bits the
     // whole row is handled by the scalar loop at the bottom instead.
     const bool scales_fit_in_lane = (src_scale <= 0xFF) && (dst_scale <= 0xFF);

     if (count >= 4 && scales_fit_in_lane) {
         SkASSERT(((size_t)dst & 0x03) == 0);
         // Blend single pixels until dst reaches a 16-byte boundary so the
         // vector loop can use aligned loads/stores on dst.
         while (((size_t)dst & 0x0F) != 0) {
             *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
             src++;
             dst++;
             count--;
         }

         const __m128i *s = reinterpret_cast<const __m128i*>(src);
         __m128i *d = reinterpret_cast<__m128i*>(dst);
         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
         __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);

         // Move scale factors to upper byte of word
         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
         __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
         while (count >= 4) {
             // Load 4 pixels each of src and dest.
             __m128i src_pixel = _mm_loadu_si128(s);
             __m128i dst_pixel = _mm_load_si128(d);

             // Interleave Atom port 0/1 operations based on the execution port
             // constraints that multiply can only be executed on port 0 (while
             // boolean operations can be executed on either port 0 or port 1)
             // because GCC currently doesn't do a good job scheduling
             // instructions based on these constraints.

             // Get red and blue pixels into lower byte of each word.
             // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

             // Multiply by scale.
             // (4 x (0, rs.h, 0, bs.h))
             // where rs.h stands for the higher byte of r * scale, and
             // bs.h the higher byte of b * scale.
             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);

             // Get alpha and green pixels into higher byte of each word.
             // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
             __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);

             // Multiply by scale.
             // (4 x (as.h, as.l, gs.h, gs.l))
             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);

             // Clear the lower byte of the a*scale and g*scale results
             // (4 x (as.h, 0, gs.h, 0))
             src_ag = _mm_and_si128(src_ag, ag_mask);

             // Operations on the destination pixels are the same as on the
             // source pixels. See the comments above.
             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
             dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
             __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
             dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
             dst_ag = _mm_and_si128(dst_ag, ag_mask);

             // Combine back into RGBA.
             // (4 x (as.h, rs.h, gs.h, bs.h))
             src_pixel = _mm_or_si128(src_rb, src_ag);
             dst_pixel = _mm_or_si128(dst_rb, dst_ag);

             // Add result
             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
             _mm_store_si128(d, result);
             s++;
             d++;
             count -= 4;
         }
         src = reinterpret_cast<const SkPMColor*>(s);
         dst = reinterpret_cast<SkPMColor*>(d);
     }

     // Scalar tail (and the whole row when the SIMD path was skipped).
     while (count > 0) {
         *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
         src++;
         dst++;
         count--;
     }
 }

 /* SSE2 blit of src over dst using only the per-pixel source alpha.
  *
  * The scalar head and tail call SkPMSrcOver(); the vector loop computes
  * src + dst * (inverse source alpha), four pixels at a time. The exact
  * rounding depends on SK_USE_ACCURATE_BLENDING (see the two loops below).
  *
  * dst    destination pixels (read and written); asserted 4-byte aligned
  *        when the SIMD path is taken.
  * src    source pixels; read with unaligned loads in the vector loop.
  * count  number of pixels; values <= 0 are a no-op.
  * alpha  global alpha; asserted to be exactly 255.
  */
 void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha) {
     SkASSERT(alpha == 255);
     if (count <= 0) {
         return;
     }

     if (count >= 4) {
         SkASSERT(((size_t)dst & 0x03) == 0);
         // Peel off single pixels until dst sits on a 16-byte boundary.
         for (; ((size_t)dst & 0x0F) != 0; ++src, ++dst, --count) {
             *dst = SkPMSrcOver(*src, *dst);
         }

         const __m128i *s = reinterpret_cast<const __m128i*>(src);
         __m128i *d = reinterpret_cast<__m128i*>(dst);
 #ifdef SK_USE_ACCURATE_BLENDING
         const __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
         const __m128i k128 = _mm_set1_epi16(128);  // rounding bias, 8 x 16-bit
         const __m128i k255 = _mm_set1_epi16(255);  // 8 x 16-bit
         for (; count >= 4; count -= 4, ++s, ++d) {
             const __m128i src_px = _mm_loadu_si128(s);
             const __m128i dst_px = _mm_load_si128(d);

             // Top byte of every 32-bit pixel, moved to the bottom byte and
             // then mirrored into the upper 16-bit lane of the same pixel.
             __m128i inv_a = _mm_srli_epi32(src_px, 24);
             inv_a = _mm_or_si128(inv_a, _mm_slli_epi32(inv_a, 16));

             // 255 - alpha, in the range 0..255.
             inv_a = _mm_sub_epi16(k255, inv_a);

             // Scale the even bytes (red/blue) and odd bytes (alpha/green)
             // of dst by the inverse alpha, one channel per 16-bit lane.
             __m128i rb = _mm_mullo_epi16(_mm_and_si128(rb_mask, dst_px), inv_a);
             __m128i ag = _mm_mullo_epi16(_mm_srli_epi16(dst_px, 8), inv_a);

             // rb = (rb + (rb >> 8) + 128) >> 8
             rb = _mm_add_epi16(rb, _mm_srli_epi16(rb, 8));
             rb = _mm_srli_epi16(_mm_add_epi16(rb, k128), 8);

             // ag = (ag + (ag >> 8) + 128), keeping only the high byte,
             // which is already where the odd channels belong.
             ag = _mm_add_epi16(ag, _mm_srli_epi16(ag, 8));
             ag = _mm_andnot_si128(rb_mask, _mm_add_epi16(ag, k128));

             // Re-pack the scaled dst and add the source on top.
             _mm_store_si128(d, _mm_add_epi8(src_px, _mm_or_si128(rb, ag)));
         }
 #else
         const __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
         const __m128i k256 = _mm_set1_epi16(0x0100);  // 8 x 16-bit
         for (; count >= 4; count -= 4, ++s, ++d) {
             const __m128i src_px = _mm_loadu_si128(s);
             const __m128i dst_px = _mm_load_si128(d);

             // Odd bytes of src (alpha and green) in the low byte of each
             // 16-bit lane ...
             __m128i inv_a = _mm_srli_epi16(src_px, 8);
             // ... then copy the odd lane of every pixel (its alpha) over the
             // even lane, first in the upper half, then in the lower half.
             inv_a = _mm_shufflehi_epi16(inv_a, 0xF5);
             inv_a = _mm_shufflelo_epi16(inv_a, 0xF5);

             // 256 - alpha, in the range 1..256.
             inv_a = _mm_sub_epi16(k256, inv_a);

             // Even bytes (red/blue): scale, then divide by 256.
             __m128i rb = _mm_mullo_epi16(_mm_and_si128(rb_mask, dst_px), inv_a);
             rb = _mm_srli_epi16(rb, 8);

             // Odd bytes (alpha/green): scale; the wanted high byte is already
             // in position, so only the low byte needs clearing.
             __m128i ag = _mm_mullo_epi16(_mm_srli_epi16(dst_px, 8), inv_a);
             ag = _mm_andnot_si128(rb_mask, ag);

             // Re-pack the scaled dst and add the source on top.
             _mm_store_si128(d, _mm_add_epi8(src_px, _mm_or_si128(rb, ag)));
         }
 #endif
         src = reinterpret_cast<const SkPMColor*>(s);
         dst = reinterpret_cast<SkPMColor*>(d);
     }

     // Remaining 0..3 pixels (or the whole row when count < 4).
     for (; count > 0; ++src, ++dst, --count) {
         *dst = SkPMSrcOver(*src, *dst);
     }
 }

 /* SSE2 blend of src into dst using the per-pixel source alpha and a global
  * alpha together.
  *
  * The scalar head and tail call SkBlendARGB32(); the vector loop scales the
  * source by src_scale = SkAlpha255To256(alpha) and the destination by
  * 256 - ((source alpha * src_scale) >> 8), then adds the two.
  *
  * dst    destination pixels (read and written); asserted 4-byte aligned
  *        when the SIMD path is taken.
  * src    source pixels; read with unaligned loads in the vector loop.
  * count  number of pixels; values <= 0 are a no-op.
  * alpha  global alpha, asserted to be <= 255.
  */
 void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
     SkASSERT(alpha <= 255);
     if (count <= 0) {
         return;
     }

     const uint32_t src_scale = SkAlpha255To256(alpha);

     // The SIMD loop stores (src_scale << 8) in 16-bit lanes. A scale of 256
     // would be truncated to 0 there, dropping the source contribution
     // entirely, so such rows are left to the scalar loop at the bottom.
     if (count >= 4 && src_scale <= 0xFF) {
         // Same precondition as the other row blitters in this file: without
         // 4-byte alignment the loop below could never reach a 16-byte
         // boundary.
         SkASSERT(((size_t)dst & 0x03) == 0);
         while (((size_t)dst & 0x0F) != 0) {
             *dst = SkBlendARGB32(*src, *dst, alpha);
             src++;
             dst++;
             count--;
         }

         const __m128i *s = reinterpret_cast<const __m128i*>(src);
         __m128i *d = reinterpret_cast<__m128i*>(dst);
         __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
         __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
         __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
         while (count >= 4) {
             // Load 4 pixels each of src and dest.
             __m128i src_pixel = _mm_loadu_si128(s);
             __m128i dst_pixel = _mm_load_si128(d);

             // Get red and blue pixels into lower byte of each word.
             __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
             __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

             // Get alpha and green into lower byte of each word.
             __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
             __m128i src_ag = _mm_srli_epi16(src_pixel, 8);

             // Put per-pixel alpha in low byte of each word.
             // After the following two statements, the dst_alpha looks like
             // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
             __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
             dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);

             // dst_alpha = dst_alpha * src_scale
             // Because src_scales are in the higher byte of each word and
             // we use mulhi here, the resulting alpha values are already
             // in the right place and don't need to be divided by 256.
             // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
             dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);

             // Subtract alphas from 256, to get 1..256
             dst_alpha = _mm_sub_epi16(c_256, dst_alpha);

             // Multiply red and blue by dst pixel alpha.
             dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
             // Multiply alpha and green by dst pixel alpha.
             dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);

             // Multiply red and blue by global alpha.
             // (4 x (0, rs.h, 0, bs.h))
             // where rs.h stands for the higher byte of r * src_scale,
             // and bs.h the higher byte of b * src_scale.
             // Again, because we use mulhi, the resulting red and blue
             // values are already in the right place and don't need to
             // be divided by 256.
             src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
             // Multiply alpha and green by global alpha.
             // (4 x (0, as.h, 0, gs.h))
             src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);

             // Divide by 256.
             dst_rb = _mm_srli_epi16(dst_rb, 8);

             // Mask out low bits (goodies already in the right place; no need to divide)
             dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
             // Shift alpha and green to higher byte of each word.
             // (4 x (as.h, 0, gs.h, 0))
             src_ag = _mm_slli_epi16(src_ag, 8);

             // Combine back into RGBA.
             dst_pixel = _mm_or_si128(dst_rb, dst_ag);
             src_pixel = _mm_or_si128(src_rb, src_ag);

             // Add two pixels into result.
             __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
             _mm_store_si128(d, result);
             s++;
             d++;
             count -= 4;
         }
         src = reinterpret_cast<const SkPMColor*>(s);
         dst = reinterpret_cast<SkPMColor*>(d);
     }

     // Scalar tail (and the whole row when the SIMD path was skipped).
     while (count > 0) {
         *dst = SkBlendARGB32(*src, *dst, alpha);
         src++;
         dst++;
         count--;
     }
 }

 /* SSE2 version of Color32()
  * portable version is in core/SkBlitRow_D32.cpp
  *
  * Writes dst[i] = color + ((src[i] * scale) >> 8) per channel, with
  * scale = 256 - SkAlpha255To256(alpha of color).
  *
  * Two shortcuts are taken before any blending:
  *   - color == 0:          src is copied to dst unchanged (nothing is done
  *                          when they are the same pointer);
  *   - alpha of color 255:  dst is filled with color.
  *
  * count <= 0 is a no-op.
  */
 void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
                   SkPMColor color) {
     if (count <= 0) {
         return;
     }

     // An all-zero color adds nothing: the result is just the source.
     if (0 == color) {
         if (src != dst) {
             memcpy(dst, src, count * sizeof(SkPMColor));
         }
         return;
     }

     const unsigned colorA = SkGetPackedA32(color);
     if (255 == colorA) {
         // The source would be scaled away completely; plain fill.
         sk_memset32(dst, color, count);
         return;
     }

     const unsigned scale = 256 - SkAlpha255To256(colorA);

     if (count >= 4) {
         SkASSERT(((size_t)dst & 0x03) == 0);
         // One pixel at a time until dst is 16-byte aligned.
         for (; ((size_t)dst & 0x0F) != 0; ++src, ++dst, --count) {
             *dst = color + SkAlphaMulQ(*src, scale);
         }

         const __m128i *s = reinterpret_cast<const __m128i*>(src);
         __m128i *d = reinterpret_cast<__m128i*>(dst);
         const __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
         const __m128i scale_wide = _mm_set1_epi16(scale);
         const __m128i color_wide = _mm_set1_epi32(color);
         for (; count >= 4; count -= 4, ++s, ++d) {
             // Only the source is loaded; dst is write-only here.
             const __m128i px = _mm_loadu_si128(s);

             // Even bytes (red/blue): isolate, scale, divide by 256.
             const __m128i rb = _mm_srli_epi16(
                     _mm_mullo_epi16(_mm_and_si128(rb_mask, px), scale_wide), 8);

             // Odd bytes (alpha/green): shift down, scale, and keep the high
             // byte of the product, which is already in its final position.
             const __m128i ag = _mm_andnot_si128(
                     rb_mask,
                     _mm_mullo_epi16(_mm_srli_epi16(px, 8), scale_wide));

             // Re-pack, add the constant color, and store (aligned).
             _mm_store_si128(d, _mm_add_epi8(color_wide, _mm_or_si128(rb, ag)));
         }
         src = reinterpret_cast<const SkPMColor*>(s);
         dst = reinterpret_cast<SkPMColor*>(d);
     }

     // Leftover pixels.
     for (; count > 0; ++src, ++dst, --count) {
         *dst = color + SkAlphaMulQ(*src, scale);
     }
 }

 /* Blend a single color into a rectangle of 32-bit pixels through an 8-bit
  * coverage mask, row by row.
  *
  * device     first destination pixel of the rectangle.
  * dstRB      byte stride between destination rows.
  * maskPtr    first mask byte; one byte per destination pixel.
  * maskRB     byte stride between mask rows.
  * origColor  color to blend; converted with SkPreMultiplyColor() first.
  * width      pixels per row.
  * height     number of rows.
  *
  * Rectangles with width <= 0 or height <= 0 are a no-op.
  */
 void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
                                size_t maskRB, SkColor origColor,
                                int width, int height) {
     // The row loop below is a do/while that counts height down to exactly 0,
     // so a non-positive height would otherwise run far past the buffers.
     // An empty width never writes anything, so it can return early as well.
     if (width <= 0 || height <= 0) {
         return;
     }

     SkPMColor color = SkPreMultiplyColor(origColor);
     // Distance from the end of one row to the start of the next.
     size_t dstOffset = dstRB - (width << 2);
     size_t maskOffset = maskRB - width;
     SkPMColor* dst = (SkPMColor *)device;
     const uint8_t* mask = (const uint8_t*)maskPtr;
     do {
         int count = width;
         if (count >= 4) {
             // Single pixels until dst is 16-byte aligned.
             while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
                 *dst = SkBlendARGB32(color, *dst, *mask);
                 mask++;
                 dst++;
                 count--;
             }
             __m128i *d = reinterpret_cast<__m128i*>(dst);
             __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
             __m128i c_256 = _mm_set1_epi16(256);
             __m128i c_1 = _mm_set1_epi16(1);
             __m128i src_pixel = _mm_set1_epi32(color);
             while (count >= 4) {
                 // Load 4 destination pixels (the source is the constant color).
                 __m128i dst_pixel = _mm_load_si128(d);

                 // Spread the 4 mask bytes so that each one fills the low byte
                 // of both 16-bit lanes of its pixel.
                 __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),
                                 0, *(mask+3),
                                 0, *(mask+2), 0, *(mask+2),
                                 0, *(mask+1), 0, *(mask+1),
                                 0, *mask, 0, *mask);

                 // Inline equivalent of SkAlpha255To256(): 0..255 -> 1..256.
                 src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);

                 // Get red and blue pixels into lower byte of each word.
                 __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
                 __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

                 // Get alpha and green into lower byte of each word.
                 __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
                 __m128i src_ag = _mm_srli_epi16(src_pixel, 8);

                 // Put per-pixel alpha in low byte of each word.
                 __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
                 dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);

                 // dst_alpha = dst_alpha * src_scale
                 dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);

                 // Divide by 256.
                 dst_alpha = _mm_srli_epi16(dst_alpha, 8);

                 // Subtract alphas from 256, to get 1..256
                 dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
                 // Multiply red and blue by dst pixel alpha.
                 dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
                 // Multiply alpha and green by dst pixel alpha.
                 dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);

                 // Multiply red and blue by the mask coverage.
                 src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
                 // Multiply alpha and green by the mask coverage.
                 src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
                 // Divide by 256.
                 dst_rb = _mm_srli_epi16(dst_rb, 8);
                 src_rb = _mm_srli_epi16(src_rb, 8);

                 // Mask out low bits (goodies already in the right place; no need to divide)
                 dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
                 src_ag = _mm_andnot_si128(rb_mask, src_ag);

                 // Combine back into RGBA.
                 dst_pixel = _mm_or_si128(dst_rb, dst_ag);
                 __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);

                 // Add two pixels into result.
                 __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
                 _mm_store_si128(d, result);
                 // Advance to the next 4 pixels.
                 mask = mask + 4;
                 d++;
                 count -= 4;
             }
             dst = reinterpret_cast<SkPMColor *>(d);
         }
         // Leftover pixels of this row (or the whole row when width < 4).
         while (count > 0) {
             *dst = SkBlendARGB32(color, *dst, *mask);
             dst += 1;
             mask++;
             count--;
         }
         // Step both cursors to the start of the next row.
         dst = (SkPMColor *)((char*)dst + dstOffset);
         mask += maskOffset;
     } while (--height != 0);
 }

 /* Blend four 32-bit dst pixels toward a constant color using four 16-bit
  * LCD mask pixels and a global scale.
  *
  * srci   the color, one channel per 16-bit lane (low byte).
  * dst    four packed 32-bit destination pixels.
  * mask   four 16-bit mask pixels, each zero-extended to 32 bits.
  *        NOTE: overwritten with the repacked 5-bit-per-channel mask.
  * scale  multiplier applied to the mask (then >> 8), one copy per lane.
  *
  * Returns the four blended pixels, packed with unsigned saturation.
  */
 static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst,
                                  __m128i &mask, __m128i &scale) {
     const __m128i zero = _mm_setzero_si128();

     // Pull the three mask channels out of each 16-bit pixel, 5 bits apiece,
     // and park them in bytes 2, 1 and 0 of the 32-bit pixel respectively.
     const __m128i r5 = _mm_and_si128(
             _mm_slli_epi32(mask, 16-SK_R16_SHIFT-(SK_R16_BITS-5)),
             _mm_set1_epi32(0x001F0000));
     const __m128i g5 = _mm_and_si128(
             _mm_slli_epi32(mask, 8-SK_G16_SHIFT-(SK_G16_BITS-5)),
             _mm_set1_epi32(0x00001F00));
     const __m128i b5 = _mm_and_si128(
             _mm_slli_epi32(mask, SK_B16_BITS-5),
             _mm_set1_epi32(0x0000001F));

     // Four repacked mask pixels (p0, p1, p2, p3); the top byte stays 0.
     mask = _mm_or_si128(_mm_or_si128(r5, g5), b5);

     // Widen every byte to a 16-bit lane: pixels 0-1 in Lo, 2-3 in Hi.
     __m128i coverLo = _mm_unpacklo_epi8(mask, zero);
     __m128i coverHi = _mm_unpackhi_epi8(mask, zero);

     // 0..31 -> 0..32
     coverLo = _mm_add_epi16(coverLo, _mm_srli_epi16(coverLo, 4));
     coverHi = _mm_add_epi16(coverHi, _mm_srli_epi16(coverHi, 4));

     // Apply the global scale: (coverage * scale) >> 8
     coverLo = _mm_srli_epi16(_mm_mullo_epi16(coverLo, scale), 8);
     coverHi = _mm_srli_epi16(_mm_mullo_epi16(coverHi, scale), 8);

     // Widen dst the same way.
     const __m128i dstLo = _mm_unpacklo_epi8(dst, zero);
     const __m128i dstHi = _mm_unpackhi_epi8(dst, zero);

     // delta = (coverage * (color - dst)) >> 5, arithmetic shift because the
     // difference may be negative.
     const __m128i deltaLo = _mm_srai_epi16(
             _mm_mullo_epi16(coverLo, _mm_sub_epi16(srci, dstLo)), 5);
     const __m128i deltaHi = _mm_srai_epi16(
             _mm_mullo_epi16(coverHi, _mm_sub_epi16(srci, dstHi)), 5);

     // dst + delta, narrowed back to four 32-bit pixels.
     return _mm_packus_epi16(_mm_add_epi16(dstLo, deltaLo),
                             _mm_add_epi16(dstHi, deltaHi));
 }

 /* Blend four 32-bit dst pixels toward a constant color using four 16-bit
  * LCD mask pixels, with no additional global scale.
  *
  * srci  the color, one channel per 16-bit lane (low byte).
  * dst   four packed 32-bit destination pixels.
  * mask  four 16-bit mask pixels, each zero-extended to 32 bits.
  *       NOTE: overwritten with the repacked 5-bit-per-channel mask.
  *
  * Returns the four blended pixels, packed with unsigned saturation.
  */
 static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst,
                                        __m128i &mask) {
     const __m128i zero = _mm_setzero_si128();

     // Pull the three mask channels out of each 16-bit pixel, 5 bits apiece,
     // and park them in bytes 2, 1 and 0 of the 32-bit pixel respectively.
     const __m128i r5 = _mm_and_si128(
             _mm_slli_epi32(mask, 16-SK_R16_SHIFT-(SK_R16_BITS-5)),
             _mm_set1_epi32(0x001F0000));
     const __m128i g5 = _mm_and_si128(
             _mm_slli_epi32(mask, 8-SK_G16_SHIFT-(SK_G16_BITS-5)),
             _mm_set1_epi32(0x00001F00));
     const __m128i b5 = _mm_and_si128(
             _mm_slli_epi32(mask, SK_B16_BITS-5),
             _mm_set1_epi32(0x0000001F));

     // Four repacked mask pixels (p0, p1, p2, p3); the top byte stays 0.
     mask = _mm_or_si128(_mm_or_si128(r5, g5), b5);

     // Widen every byte to a 16-bit lane: pixels 0-1 in Lo, 2-3 in Hi.
     __m128i coverLo = _mm_unpacklo_epi8(mask, zero);
     __m128i coverHi = _mm_unpackhi_epi8(mask, zero);

     // 0..31 -> 0..32
     coverLo = _mm_add_epi16(coverLo, _mm_srli_epi16(coverLo, 4));
     coverHi = _mm_add_epi16(coverHi, _mm_srli_epi16(coverHi, 4));

     // Widen dst the same way.
     const __m128i dstLo = _mm_unpacklo_epi8(dst, zero);
     const __m128i dstHi = _mm_unpackhi_epi8(dst, zero);

     // delta = (coverage * (color - dst)) >> 5, arithmetic shift because the
     // difference may be negative.
     const __m128i deltaLo = _mm_srai_epi16(
             _mm_mullo_epi16(coverLo, _mm_sub_epi16(srci, dstLo)), 5);
     const __m128i deltaHi = _mm_srai_epi16(
             _mm_mullo_epi16(coverHi, _mm_sub_epi16(srci, dstHi)), 5);

     // dst + delta, narrowed back to four 32-bit pixels.
     return _mm_packus_epi16(_mm_add_epi16(dstLo, deltaLo),
                             _mm_add_epi16(dstHi, deltaHi));
 }

 /* Blend a color into one row of 32-bit pixels through a row of 16-bit LCD
  * mask pixels, taking the color's alpha into account.
  *
  * dst    destination row (read and written); asserted 4-byte aligned when
  *        the SIMD path is taken.
  * src    the 16-bit mask row, one entry per destination pixel.
  * color  color to blend.
  * width  number of pixels; values <= 0 are a no-op.
  * The trailing unnamed SkPMColor parameter is not used.
  */
 void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
                          SkColor color, int width, SkPMColor) {
     if (width <= 0) {
         return;
     }

     const int srcA = SkAlpha255To256(SkColorGetA(color));
     const int srcR = SkColorGetR(color);
     const int srcG = SkColorGetG(color);
     const int srcB = SkColorGetB(color);

     if (width >= 4) {
         SkASSERT(((size_t)dst & 0x03) == 0);
         // Scalar blends until dst is 16-byte aligned.
         for (; ((size_t)dst & 0x0F) != 0; ++src, ++dst, --width) {
             *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
         }

         __m128i *d = reinterpret_cast<__m128i*>(dst);
         // The color, packed with alpha 0xFF and then widened to one channel
         // per 16-bit lane.
         __m128i srci = _mm_unpacklo_epi8(
                 _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)),
                 _mm_setzero_si128());
         __m128i scale = _mm_set1_epi16(srcA);

         for (; width >= 4; width -= 4, src += 4, ++d) {
             __m128i dst_pixel = _mm_load_si128(d);
             // Four 16-bit mask pixels in the low 64 bits; upper half is zero.
             __m128i mask_pixel = _mm_loadl_epi64(
                                      reinterpret_cast<const __m128i*>(src));

             // Compare every 16-bit lane against zero and collect the top bit
             // of each result byte: 0xFFFF means all four mask pixels are 0.
             const int pack_cmp = _mm_movemask_epi8(
                     _mm_cmpeq_epi16(mask_pixel, _mm_setzero_si128()));
             if (pack_cmp == 0xFFFF) {
                 // Nothing covered: leave these four dst pixels untouched.
                 continue;
             }

             // Zero-extend the mask pixels to 32 bits: (p0, 0, p1, 0, p2, 0, p3, 0)
             mask_pixel = _mm_unpacklo_epi16(mask_pixel, _mm_setzero_si128());

             // Blend and write back four dst pixels.
             _mm_store_si128(d, SkBlendLCD16_SSE2(srci, dst_pixel,
                                                  mask_pixel, scale));
         }

         dst = reinterpret_cast<SkPMColor*>(d);
     }

     // Leftover pixels.
     for (; width > 0; ++src, ++dst, --width) {
         *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
     }
 }

 /* Blend a color into one row of 32-bit pixels through a row of 16-bit LCD
  * mask pixels, ignoring the color's alpha.
  *
  * dst        destination row (read and written); asserted 4-byte aligned
  *            when the SIMD path is taken.
  * src        the 16-bit mask row, one entry per destination pixel.
  * color      color to blend; only its R, G and B are read.
  * width      number of pixels; values <= 0 are a no-op.
  * opaqueDst  forwarded to SkBlendLCD16Opaque() for the scalar pixels; the
  *            SIMD loop does not use it.
  */
 void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
                                SkColor color, int width, SkPMColor opaqueDst) {
     if (width <= 0) {
         return;
     }

     const int srcR = SkColorGetR(color);
     const int srcG = SkColorGetG(color);
     const int srcB = SkColorGetB(color);

     if (width >= 4) {
         SkASSERT(((size_t)dst & 0x03) == 0);
         // Scalar blends until dst is 16-byte aligned.
         for (; ((size_t)dst & 0x0F) != 0; ++src, ++dst, --width) {
             *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
         }

         __m128i *d = reinterpret_cast<__m128i*>(dst);
         // The color, packed with alpha 0xFF and then widened to one channel
         // per 16-bit lane.
         __m128i srci = _mm_unpacklo_epi8(
                 _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB)),
                 _mm_setzero_si128());

         for (; width >= 4; width -= 4, src += 4, ++d) {
             __m128i dst_pixel = _mm_load_si128(d);
             // Four 16-bit mask pixels in the low 64 bits; upper half is zero.
             __m128i mask_pixel = _mm_loadl_epi64(
                                      reinterpret_cast<const __m128i*>(src));

             // Compare every 16-bit lane against zero and collect the top bit
             // of each result byte: 0xFFFF means all four mask pixels are 0.
             const int pack_cmp = _mm_movemask_epi8(
                     _mm_cmpeq_epi16(mask_pixel, _mm_setzero_si128()));
             if (pack_cmp == 0xFFFF) {
                 // Nothing covered: leave these four dst pixels untouched.
                 continue;
             }

             // Zero-extend the mask pixels to 32 bits: (p0, 0, p1, 0, p2, 0, p3, 0)
             mask_pixel = _mm_unpacklo_epi16(mask_pixel, _mm_setzero_si128());

             // Blend and write back four dst pixels.
             _mm_store_si128(d, SkBlendLCD16Opaque_SSE2(srci, dst_pixel,
                                                        mask_pixel));
         }

         dst = reinterpret_cast<SkPMColor*>(d);
     }

     // Leftover pixels.
     for (; width > 0; ++src, ++dst, --width) {
         *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
     }
 }
	/*
	* Copyright 2012 The Android Open Source Project
	*
	* Use of this source code is governed by a BSD-style license that can be
	* found in the LICENSE file.
	*/


	#include "SkBlitRow_opts_SSE2.h"
	#include "SkColorPriv.h"
	#include "SkUtils.h"

	#include <emmintrin.h>

	/* SSE2 version of S32_Blend_BlitRow32()
	* portable version is in core/SkBlitRow_D32.cpp
	*/
	// NOTE(review): this is a second definition of S32_Blend_BlitRow32_SSE2;
	// an identical function already appears earlier in this file. This copy
	// had lost its pointer/dereference '*' tokens and its indentation; they
	// are restored here, but one of the two definitions should be removed.
	//
	// Blends count pixels of src into dst with a single global alpha:
	//     dst = ((src * src_scale) >> 8) + ((dst * dst_scale) >> 8)
	// where src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale.
	// count <= 0 is a no-op; alpha is asserted to be <= 255.
	void S32_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
	                              const SkPMColor* SK_RESTRICT src,
	                              int count, U8CPU alpha) {
	    SkASSERT(alpha <= 255);
	    if (count <= 0) {
	        return;
	    }

	    uint32_t src_scale = SkAlpha255To256(alpha);
	    uint32_t dst_scale = 256 - src_scale;

	    // The SIMD loop stores (scale << 8) in 16-bit lanes, so a scale of 256
	    // would be truncated to 0 and the vector pixels would disagree with
	    // the scalar head/tail pixels. Such rows use the scalar loop only.
	    const bool scales_fit_in_lane = (src_scale <= 0xFF) && (dst_scale <= 0xFF);

	    if (count >= 4 && scales_fit_in_lane) {
	        SkASSERT(((size_t)dst & 0x03) == 0);
	        // Blend single pixels until dst reaches a 16-byte boundary.
	        while (((size_t)dst & 0x0F) != 0) {
	            *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
	            src++;
	            dst++;
	            count--;
	        }

	        const __m128i *s = reinterpret_cast<const __m128i*>(src);
	        __m128i *d = reinterpret_cast<__m128i*>(dst);
	        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
	        __m128i ag_mask = _mm_set1_epi32(0xFF00FF00);

	        // Move scale factors to upper byte of word
	        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
	        __m128i dst_scale_wide = _mm_set1_epi16(dst_scale << 8);
	        while (count >= 4) {
	            // Load 4 pixels each of src and dest.
	            __m128i src_pixel = _mm_loadu_si128(s);
	            __m128i dst_pixel = _mm_load_si128(d);

	            // Interleave Atom port 0/1 operations based on the execution port
	            // constraints that multiply can only be executed on port 0 (while
	            // boolean operations can be executed on either port 0 or port 1)
	            // because GCC currently doesn't do a good job scheduling
	            // instructions based on these constraints.

	            // Get red and blue pixels into lower byte of each word.
	            // (0, r, 0, b, 0, r, 0, b, 0, r, 0, b, 0, r, 0, b)
	            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

	            // Multiply by scale.
	            // (4 x (0, rs.h, 0, bs.h))
	            // where rs.h stands for the higher byte of r * scale, and
	            // bs.h the higher byte of b * scale.
	            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);

	            // Get alpha and green pixels into higher byte of each word.
	            // (a, 0, g, 0, a, 0, g, 0, a, 0, g, 0, a, 0, g, 0)
	            __m128i src_ag = _mm_and_si128(ag_mask, src_pixel);

	            // Multiply by scale.
	            // (4 x (as.h, as.l, gs.h, gs.l))
	            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);

	            // Clear the lower byte of the a*scale and g*scale results
	            // (4 x (as.h, 0, gs.h, 0))
	            src_ag = _mm_and_si128(src_ag, ag_mask);

	            // Operations on the destination pixels are the same as on the
	            // source pixels. See the comments above.
	            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
	            dst_rb = _mm_mulhi_epu16(dst_rb, dst_scale_wide);
	            __m128i dst_ag = _mm_and_si128(ag_mask, dst_pixel);
	            dst_ag = _mm_mulhi_epu16(dst_ag, dst_scale_wide);
	            dst_ag = _mm_and_si128(dst_ag, ag_mask);

	            // Combine back into RGBA.
	            // (4 x (as.h, rs.h, gs.h, bs.h))
	            src_pixel = _mm_or_si128(src_rb, src_ag);
	            dst_pixel = _mm_or_si128(dst_rb, dst_ag);

	            // Add result
	            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
	            _mm_store_si128(d, result);
	            s++;
	            d++;
	            count -= 4;
	        }
	        src = reinterpret_cast<const SkPMColor*>(s);
	        dst = reinterpret_cast<SkPMColor*>(d);
	    }

	    // Scalar tail (and the whole row when the SIMD path was skipped).
	    while (count > 0) {
	        *dst = SkAlphaMulQ(*src, src_scale) + SkAlphaMulQ(*dst, dst_scale);
	        src++;
	        dst++;
	        count--;
	    }
	}

	/* SSE2 row blitter that composites `count` premultiplied 32-bit pixels
	 * from `src` over `dst`, using each source pixel's own alpha.
	 *
	 * dst   - destination row, updated in place. Asserted to be 4-byte
	 *         aligned when count >= 4; it is then advanced pixel by pixel
	 *         to a 16-byte boundary so aligned loads/stores can be used.
	 * src   - source row; read with unaligned loads in the SIMD loop.
	 * count - number of pixels; nothing happens when count <= 0.
	 * alpha - global alpha; asserted to be 255 and otherwise unused.
	 *
	 * Leading (unaligned) and trailing (< 4) pixels go through SkPMSrcOver().
	 */
	void S32A_Opaque_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
	                                const SkPMColor* SK_RESTRICT src,
	                                int count, U8CPU alpha) {
	    SkASSERT(alpha == 255);
	    if (count <= 0) {
	        return;
	    }

	    if (count >= 4) {
	        SkASSERT(((size_t)dst & 0x03) == 0);
	        // Blend single pixels until dst reaches a 16-byte boundary.
	        // With 4-byte aligned dst this takes at most 3 iterations.
	        while (((size_t)dst & 0x0F) != 0) {
	            *dst = SkPMSrcOver(*src, *dst);
	            src++;
	            dst++;
	            count--;
	        }

	        // Walk both rows four pixels (one 128-bit register) at a time.
	        const __m128i *s = reinterpret_cast<const __m128i*>(src);
	        __m128i *d = reinterpret_cast<__m128i*>(dst);
	#ifdef SK_USE_ACCURATE_BLENDING
	        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
	        __m128i c_128 = _mm_set1_epi16(128);  // 8 copies of 128 (16-bit)
	        __m128i c_255 = _mm_set1_epi16(255);  // 8 copies of 255 (16-bit)
	        while (count >= 4) {
	            // Load 4 pixels
	            __m128i src_pixel = _mm_loadu_si128(s);
	            __m128i dst_pixel = _mm_load_si128(d);

	            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
	            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
	            // Shift alphas down to lower 8 bits of each quad.
	            __m128i alpha = _mm_srli_epi32(src_pixel, 24);

	            // Copy alpha to upper 3rd byte of each quad
	            alpha = _mm_or_si128(alpha, _mm_slli_epi32(alpha, 16));

	            // Subtract alphas from 255, to get 0..255
	            alpha = _mm_sub_epi16(c_255, alpha);

	            // Multiply red and blue by the inverse src alpha.
	            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
	            // Multiply alpha and green by the inverse src alpha.
	            dst_ag = _mm_mullo_epi16(dst_ag, alpha);

	            // dst_rb_low = (dst_rb >> 8)
	            __m128i dst_rb_low = _mm_srli_epi16(dst_rb, 8);
	            __m128i dst_ag_low = _mm_srli_epi16(dst_ag, 8);

	            // dst_rb = (dst_rb + dst_rb_low + 128) >> 8
	            dst_rb = _mm_add_epi16(dst_rb, dst_rb_low);
	            dst_rb = _mm_add_epi16(dst_rb, c_128);
	            dst_rb = _mm_srli_epi16(dst_rb, 8);

	            // dst_ag = (dst_ag + dst_ag_low + 128) & ag_mask
	            dst_ag = _mm_add_epi16(dst_ag, dst_ag_low);
	            dst_ag = _mm_add_epi16(dst_ag, c_128);
	            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

	            // Combine back into RGBA.
	            dst_pixel = _mm_or_si128(dst_rb, dst_ag);

	            // Add result
	            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
	            _mm_store_si128(d, result);
	            s++;
	            d++;
	            count -= 4;
	        }
	#else
	        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
	        __m128i c_256 = _mm_set1_epi16(0x0100);  // 8 copies of 256 (16-bit)
	        while (count >= 4) {
	            // Load 4 pixels
	            __m128i src_pixel = _mm_loadu_si128(s);
	            __m128i dst_pixel = _mm_load_si128(d);

	            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
	            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);

	            // (a0, g0, a1, g1, a2, g2, a3, g3)  (low byte of each word)
	            __m128i alpha = _mm_srli_epi16(src_pixel, 8);

	            // (a0, a0, a1, a1, a2, g2, a3, g3)
	            alpha = _mm_shufflehi_epi16(alpha, 0xF5);

	            // (a0, a0, a1, a1, a2, a2, a3, a3)
	            alpha = _mm_shufflelo_epi16(alpha, 0xF5);

	            // Subtract alphas from 256, to get 1..256
	            alpha = _mm_sub_epi16(c_256, alpha);

	            // Multiply red and blue by the inverse src alpha.
	            dst_rb = _mm_mullo_epi16(dst_rb, alpha);
	            // Multiply alpha and green by the inverse src alpha.
	            dst_ag = _mm_mullo_epi16(dst_ag, alpha);

	            // Divide by 256.
	            dst_rb = _mm_srli_epi16(dst_rb, 8);

	            // Mask out low bits (the high byte of each product is
	            // already in the alpha/green position, so no shift is needed)
	            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);

	            // Combine back into RGBA.
	            dst_pixel = _mm_or_si128(dst_rb, dst_ag);

	            // Add result
	            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
	            _mm_store_si128(d, result);
	            s++;
	            d++;
	            count -= 4;
	        }
	#endif
	        // Hand the advanced positions back to the scalar tail loop.
	        src = reinterpret_cast<const SkPMColor*>(s);
	        dst = reinterpret_cast<SkPMColor*>(d);
	    }

	    // Remaining pixels (fewer than 4, or the whole row when count < 4).
	    while (count > 0) {
	        *dst = SkPMSrcOver(*src, *dst);
	        src++;
	        dst++;
	        count--;
	    }
	}

	/* SSE2 row blitter that blends `count` 32-bit pixels from `src` onto
	 * `dst`, scaling the source by the global `alpha` as well as by each
	 * source pixel's own alpha.
	 *
	 * dst   - destination row, updated in place; advanced pixel by pixel to
	 *         a 16-byte boundary before the SIMD loop.
	 * src   - source row; read with unaligned loads in the SIMD loop.
	 * count - number of pixels; nothing happens when count <= 0.
	 * alpha - global alpha, asserted to be <= 255.
	 *
	 * Leading (unaligned) and trailing (< 4) pixels go through
	 * SkBlendARGB32().
	 */
	void S32A_Blend_BlitRow32_SSE2(SkPMColor* SK_RESTRICT dst,
	                               const SkPMColor* SK_RESTRICT src,
	                               int count, U8CPU alpha) {
	    SkASSERT(alpha <= 255);
	    if (count <= 0) {
	        return;
	    }

	    if (count >= 4) {
	        // Blend single pixels until dst reaches a 16-byte boundary.
	        while (((size_t)dst & 0x0F) != 0) {
	            *dst = SkBlendARGB32(*src, *dst, alpha);
	            src++;
	            dst++;
	            count--;
	        }

	        uint32_t src_scale = SkAlpha255To256(alpha);

	        // Walk both rows four pixels (one 128-bit register) at a time.
	        const __m128i *s = reinterpret_cast<const __m128i*>(src);
	        __m128i *d = reinterpret_cast<__m128i*>(dst);
	        // Scale lives in the upper byte of each word so that mulhi
	        // yields (value * src_scale) >> 8 directly.
	        __m128i src_scale_wide = _mm_set1_epi16(src_scale << 8);
	        __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
	        __m128i c_256 = _mm_set1_epi16(256);  // 8 copies of 256 (16-bit)
	        while (count >= 4) {
	            // Load 4 pixels each of src and dest.
	            __m128i src_pixel = _mm_loadu_si128(s);
	            __m128i dst_pixel = _mm_load_si128(d);

	            // Get red and blue pixels into lower byte of each word.
	            __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
	            __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

	            // Get alpha and green into lower byte of each word.
	            __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
	            __m128i src_ag = _mm_srli_epi16(src_pixel, 8);

	            // Put per-pixel alpha in low byte of each word.
	            // After the following two statements, the dst_alpha looks like
	            // (0, a0, 0, a0, 0, a1, 0, a1, 0, a2, 0, a2, 0, a3, 0, a3)
	            __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
	            dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);

	            // dst_alpha = dst_alpha * src_scale
	            // Because src_scales are in the higher byte of each word and
	            // we use mulhi here, the resulting alpha values are already
	            // in the right place and don't need to be divided by 256.
	            // (0, sa0, 0, sa0, 0, sa1, 0, sa1, 0, sa2, 0, sa2, 0, sa3, 0, sa3)
	            dst_alpha = _mm_mulhi_epu16(dst_alpha, src_scale_wide);

	            // Subtract alphas from 256, to get 1..256
	            dst_alpha = _mm_sub_epi16(c_256, dst_alpha);

	            // Multiply red and blue by dst pixel alpha.
	            dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
	            // Multiply alpha and green by dst pixel alpha.
	            dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);

	            // Multiply red and blue by global alpha.
	            // (4 x (0, rs.h, 0, bs.h))
	            // where rs.h stands for the higher byte of r * src_scale,
	            // and bs.h the higher byte of b * src_scale.
	            // Again, because we use mulhi, the resulting red and blue
	            // values are already in the right place and don't need to
	            // be divided by 256.
	            src_rb = _mm_mulhi_epu16(src_rb, src_scale_wide);
	            // Multiply alpha and green by global alpha.
	            // (4 x (0, as.h, 0, gs.h))
	            src_ag = _mm_mulhi_epu16(src_ag, src_scale_wide);

	            // Divide by 256.
	            dst_rb = _mm_srli_epi16(dst_rb, 8);

	            // Mask out low bits (goodies already in the right place; no need to divide)
	            dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
	            // Shift alpha and green to higher byte of each word.
	            // (4 x (as.h, 0, gs.h, 0))
	            src_ag = _mm_slli_epi16(src_ag, 8);

	            // Combine back into RGBA.
	            dst_pixel = _mm_or_si128(dst_rb, dst_ag);
	            src_pixel = _mm_or_si128(src_rb, src_ag);

	            // Add two pixels into result.
	            __m128i result = _mm_add_epi8(src_pixel, dst_pixel);
	            _mm_store_si128(d, result);
	            s++;
	            d++;
	            count -= 4;
	        }
	        // Hand the advanced positions back to the scalar tail loop.
	        src = reinterpret_cast<const SkPMColor*>(s);
	        dst = reinterpret_cast<SkPMColor*>(d);
	    }

	    // Remaining pixels (fewer than 4, or the whole row when count < 4).
	    while (count > 0) {
	        *dst = SkBlendARGB32(*src, *dst, alpha);
	        src++;
	        dst++;
	        count--;
	    }
	}

	/* SSE2 version of Color32()
	 * portable version is in core/SkBlitRow_D32.cpp
	 *
	 * Writes dst[i] = color + src[i] * (256 - SkAlpha255To256(alpha of color)) / 256
	 * per channel for `count` pixels.
	 *
	 * dst   - destination row; asserted to be 4-byte aligned when the SIMD
	 *         path is taken. May be the same buffer as src.
	 * src   - source row; read with unaligned loads in the SIMD loop.
	 * count - number of pixels; nothing happens when count <= 0.
	 * color - premultiplied color to add. A color of 0 degenerates to a copy
	 *         of src into dst, and an alpha of 255 to a fill of dst.
	 */
	void Color32_SSE2(SkPMColor dst[], const SkPMColor src[], int count,
	                  SkPMColor color) {

	    if (count <= 0) {
	        return;
	    }

	    if (0 == color) {
	        // Nothing to add and the scale would be a no-op: plain copy.
	        if (src != dst) {
	            memcpy(dst, src, count * sizeof(SkPMColor));
	        }
	        return;
	    }

	    unsigned colorA = SkGetPackedA32(color);
	    if (255 == colorA) {
	        // Opaque color fully replaces the source.
	        sk_memset32(dst, color, count);
	    } else {
	        unsigned scale = 256 - SkAlpha255To256(colorA);

	        if (count >= 4) {
	            SkASSERT(((size_t)dst & 0x03) == 0);
	            // Handle single pixels until dst reaches a 16-byte boundary.
	            while (((size_t)dst & 0x0F) != 0) {
	                *dst = color + SkAlphaMulQ(*src, scale);
	                src++;
	                dst++;
	                count--;
	            }

	            // Walk both rows four pixels (one 128-bit register) at a time.
	            const __m128i *s = reinterpret_cast<const __m128i*>(src);
	            __m128i *d = reinterpret_cast<__m128i*>(dst);
	            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
	            __m128i src_scale_wide = _mm_set1_epi16(scale);
	            __m128i color_wide = _mm_set1_epi32(color);
	            while (count >= 4) {
	                // Load 4 src pixels.
	                __m128i src_pixel = _mm_loadu_si128(s);

	                // Get red and blue pixels into lower byte of each word.
	                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

	                // Get alpha and green into lower byte of each word.
	                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);

	                // Multiply by scale.
	                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
	                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);

	                // Divide by 256. For alpha/green the high byte of each
	                // product is already in place, so it is only masked.
	                src_rb = _mm_srli_epi16(src_rb, 8);
	                src_ag = _mm_andnot_si128(rb_mask, src_ag);

	                // Combine back into RGBA.
	                src_pixel = _mm_or_si128(src_rb, src_ag);

	                // Add color to result.
	                __m128i result = _mm_add_epi8(color_wide, src_pixel);

	                // Store result.
	                _mm_store_si128(d, result);
	                s++;
	                d++;
	                count -= 4;
	            }
	            // Hand the advanced positions back to the scalar tail loop.
	            src = reinterpret_cast<const SkPMColor*>(s);
	            dst = reinterpret_cast<SkPMColor*>(d);
	        }

	        // Remaining pixels (fewer than 4, or the whole row when count < 4).
	        while (count > 0) {
	            *dst = color + SkAlphaMulQ(*src, scale);
	            src += 1;
	            dst += 1;
	            count--;
	        }
	    }
	}

	/* SSE2 blitter that blends a single color into a 32-bit device buffer
	 * through an 8-bit coverage mask.
	 *
	 * device    - first destination pixel (treated as SkPMColor*).
	 * dstRB     - destination row stride in bytes.
	 * maskPtr   - first mask byte; one byte per pixel.
	 * maskRB    - mask row stride in bytes.
	 * origColor - color to blend; premultiplied here via SkPreMultiplyColor().
	 * width     - pixels per row.
	 * height    - number of rows.
	 *
	 * Each row is processed as: scalar pixels until dst is 16-byte aligned,
	 * then 4 pixels per iteration with SSE2, then a scalar tail. Scalar
	 * pixels go through SkBlendARGB32(color, dst, mask).
	 */
	void SkARGB32_A8_BlitMask_SSE2(void* device, size_t dstRB, const void* maskPtr,
	                               size_t maskRB, SkColor origColor,
	                               int width, int height) {
	    // The row loop below is a do/while that counts height down to zero,
	    // so a non-positive height would never terminate normally. An empty
	    // width touches no pixels either way.
	    if (width <= 0 || height <= 0) {
	        return;
	    }

	    SkPMColor color = SkPreMultiplyColor(origColor);
	    // Byte distance from the end of one row to the start of the next.
	    size_t dstOffset = dstRB - (width << 2);
	    size_t maskOffset = maskRB - width;
	    SkPMColor* dst = (SkPMColor *)device;
	    const uint8_t* mask = (const uint8_t*)maskPtr;
	    do {
	        int count = width;
	        if (count >= 4) {
	            // Blend single pixels until dst reaches a 16-byte boundary.
	            while (((size_t)dst & 0x0F) != 0 && (count > 0)) {
	                *dst = SkBlendARGB32(color, *dst, *mask);
	                mask++;
	                dst++;
	                count--;
	            }
	            __m128i *d = reinterpret_cast<__m128i*>(dst);
	            __m128i rb_mask = _mm_set1_epi32(0x00FF00FF);
	            __m128i c_256 = _mm_set1_epi16(256);
	            __m128i c_1 = _mm_set1_epi16(1);
	            __m128i src_pixel = _mm_set1_epi32(color);
	            while (count >= 4) {
	                // Load 4 dest pixels (the source is the constant color).
	                __m128i dst_pixel = _mm_load_si128(d);

	                // Set the alpha value: each pixel's mask byte goes into
	                // the low byte of both 16-bit words of that pixel.
	                // (_mm_set_epi8 takes its arguments from byte 15 down
	                // to byte 0.)
	                __m128i src_scale_wide = _mm_set_epi8(0, *(mask+3),
	                                                      0, *(mask+3),
	                                                      0, *(mask+2),
	                                                      0, *(mask+2),
	                                                      0, *(mask+1),
	                                                      0, *(mask+1),
	                                                      0, *mask,
	                                                      0, *mask);

	                // Equivalent of SkAlpha255To256(): 0..255 -> 1..256
	                src_scale_wide = _mm_add_epi16(src_scale_wide, c_1);

	                // Get red and blue pixels into lower byte of each word.
	                __m128i dst_rb = _mm_and_si128(rb_mask, dst_pixel);
	                __m128i src_rb = _mm_and_si128(rb_mask, src_pixel);

	                // Get alpha and green into lower byte of each word.
	                __m128i dst_ag = _mm_srli_epi16(dst_pixel, 8);
	                __m128i src_ag = _mm_srli_epi16(src_pixel, 8);

	                // Put per-pixel alpha in low byte of each word.
	                __m128i dst_alpha = _mm_shufflehi_epi16(src_ag, 0xF5);
	                dst_alpha = _mm_shufflelo_epi16(dst_alpha, 0xF5);

	                // dst_alpha = dst_alpha * src_scale
	                dst_alpha = _mm_mullo_epi16(dst_alpha, src_scale_wide);

	                // Divide by 256.
	                dst_alpha = _mm_srli_epi16(dst_alpha, 8);

	                // Subtract alphas from 256, to get 1..256
	                dst_alpha = _mm_sub_epi16(c_256, dst_alpha);
	                // Multiply red and blue by dst pixel alpha.
	                dst_rb = _mm_mullo_epi16(dst_rb, dst_alpha);
	                // Multiply alpha and green by dst pixel alpha.
	                dst_ag = _mm_mullo_epi16(dst_ag, dst_alpha);

	                // Multiply red and blue by the mask scale.
	                src_rb = _mm_mullo_epi16(src_rb, src_scale_wide);
	                // Multiply alpha and green by the mask scale.
	                src_ag = _mm_mullo_epi16(src_ag, src_scale_wide);
	                // Divide by 256.
	                dst_rb = _mm_srli_epi16(dst_rb, 8);
	                src_rb = _mm_srli_epi16(src_rb, 8);

	                // Mask out low bits (goodies already in the right place; no need to divide)
	                dst_ag = _mm_andnot_si128(rb_mask, dst_ag);
	                src_ag = _mm_andnot_si128(rb_mask, src_ag);

	                // Combine back into RGBA.
	                dst_pixel = _mm_or_si128(dst_rb, dst_ag);
	                __m128i tmp_src_pixel = _mm_or_si128(src_rb, src_ag);

	                // Add two pixels into result.
	                __m128i result = _mm_add_epi8(tmp_src_pixel, dst_pixel);
	                _mm_store_si128(d, result);
	                // Move on to the next 4 pixels.
	                mask = mask + 4;
	                d++;
	                count -= 4;
	            }
	            dst = reinterpret_cast<SkPMColor *>(d);
	        }
	        // Remaining pixels (fewer than 4, or the whole row when width < 4).
	        while (count > 0) {
	            *dst = SkBlendARGB32(color, *dst, *mask);
	            dst += 1;
	            mask++;
	            count--;
	        }
	        // Skip the row padding to reach the start of the next row.
	        dst = (SkPMColor *)((char*)dst + dstOffset);
	        mask += maskOffset;
	    } while (--height != 0);
	}

	/* Blends four 32-bit dst pixels toward a source color using four LCD16
	 * coverage values, with the coverage additionally scaled by `scale`.
	 *
	 * srci  - source color, one channel per 16-bit word.
	 * dst   - four packed 32-bit destination pixels.
	 * mask  - four 16-bit mask pixels, each zero-extended to 32 bits.
	 *         Overwritten with the repacked 5-bit-per-channel form.
	 * scale - per-word multiplier applied to the coverage (then >> 8).
	 *
	 * Returns the four blended pixels: dst + ((coverage * (src - dst)) >> 5)
	 * per channel, packed with unsigned saturation.
	 */
	static __m128i SkBlendLCD16_SSE2(__m128i &srci, __m128i &dst,
	                                 __m128i &mask, __m128i &scale) {
	    const __m128i zero = _mm_setzero_si128();

	    // Move each 16-bit mask channel into its own byte of the 32-bit
	    // lane, keeping 5 bits per channel.
	    const __m128i redShifted =
	            _mm_slli_epi32(mask, 16-SK_R16_SHIFT-(SK_R16_BITS-5));
	    const __m128i red5 =
	            _mm_and_si128(redShifted, _mm_set1_epi32(0x001F0000));

	    const __m128i greenShifted =
	            _mm_slli_epi32(mask, 8-SK_G16_SHIFT-(SK_G16_BITS-5));
	    const __m128i green5 =
	            _mm_and_si128(greenShifted, _mm_set1_epi32(0x00001F00));

	    const __m128i blueShifted = _mm_slli_epi32(mask, SK_B16_BITS-5);
	    const __m128i blue5 =
	            _mm_and_si128(blueShifted, _mm_set1_epi32(0x0000001F));

	    // Four repacked mask pixels, one per 32-bit lane: (p0, p1, p2, p3)
	    mask = _mm_or_si128(_mm_or_si128(red5, green5), blue5);

	    // Widen every coverage byte to a 16-bit word (pixels 0-1 / 2-3).
	    __m128i coverageLo = _mm_unpacklo_epi8(mask, zero);
	    __m128i coverageHi = _mm_unpackhi_epi8(mask, zero);

	    // 0..31 -> 0..32 by adding the top bit back in.
	    coverageLo = _mm_add_epi16(coverageLo, _mm_srli_epi16(coverageLo, 4));
	    coverageHi = _mm_add_epi16(coverageHi, _mm_srli_epi16(coverageHi, 4));

	    // Apply the caller's scale: (coverage * scale) >> 8.
	    coverageLo = _mm_srli_epi16(_mm_mullo_epi16(coverageLo, scale), 8);
	    coverageHi = _mm_srli_epi16(_mm_mullo_epi16(coverageHi, scale), 8);

	    // Widen the destination channels the same way.
	    const __m128i dstLo = _mm_unpacklo_epi8(dst, zero);
	    const __m128i dstHi = _mm_unpackhi_epi8(dst, zero);

	    // (coverage * (src - dst)) >> 5; the difference may be negative,
	    // hence the arithmetic shift.
	    const __m128i deltaLo = _mm_sub_epi16(srci, dstLo);
	    const __m128i deltaHi = _mm_sub_epi16(srci, dstHi);
	    const __m128i stepLo =
	            _mm_srai_epi16(_mm_mullo_epi16(coverageLo, deltaLo), 5);
	    const __m128i stepHi =
	            _mm_srai_epi16(_mm_mullo_epi16(coverageHi, deltaHi), 5);

	    // dst + step, then narrow back to four 32-bit pixels.
	    const __m128i blendedLo = _mm_add_epi16(dstLo, stepLo);
	    const __m128i blendedHi = _mm_add_epi16(dstHi, stepHi);
	    return _mm_packus_epi16(blendedLo, blendedHi);
	}

	/* Blends four 32-bit dst pixels toward a source color using four LCD16
	 * coverage values (no extra coverage scaling).
	 *
	 * srci - source color, one channel per 16-bit word.
	 * dst  - four packed 32-bit destination pixels.
	 * mask - four 16-bit mask pixels, each zero-extended to 32 bits.
	 *        Overwritten with the repacked 5-bit-per-channel form.
	 *
	 * Returns the four blended pixels: dst + ((coverage * (src - dst)) >> 5)
	 * per channel, packed with unsigned saturation.
	 */
	static __m128i SkBlendLCD16Opaque_SSE2(__m128i &srci, __m128i &dst,
	                                       __m128i &mask) {
	    const __m128i zero = _mm_setzero_si128();

	    // Move each 16-bit mask channel into its own byte of the 32-bit
	    // lane, keeping 5 bits per channel.
	    const __m128i redShifted =
	            _mm_slli_epi32(mask, 16-SK_R16_SHIFT-(SK_R16_BITS-5));
	    const __m128i red5 =
	            _mm_and_si128(redShifted, _mm_set1_epi32(0x001F0000));

	    const __m128i greenShifted =
	            _mm_slli_epi32(mask, 8-SK_G16_SHIFT-(SK_G16_BITS-5));
	    const __m128i green5 =
	            _mm_and_si128(greenShifted, _mm_set1_epi32(0x00001F00));

	    const __m128i blueShifted = _mm_slli_epi32(mask, SK_B16_BITS-5);
	    const __m128i blue5 =
	            _mm_and_si128(blueShifted, _mm_set1_epi32(0x0000001F));

	    // Four repacked mask pixels, one per 32-bit lane: (p0, p1, p2, p3)
	    mask = _mm_or_si128(_mm_or_si128(red5, green5), blue5);

	    // Widen every coverage byte to a 16-bit word (pixels 0-1 / 2-3).
	    __m128i coverageLo = _mm_unpacklo_epi8(mask, zero);
	    __m128i coverageHi = _mm_unpackhi_epi8(mask, zero);

	    // 0..31 -> 0..32 by adding the top bit back in.
	    coverageLo = _mm_add_epi16(coverageLo, _mm_srli_epi16(coverageLo, 4));
	    coverageHi = _mm_add_epi16(coverageHi, _mm_srli_epi16(coverageHi, 4));

	    // Widen the destination channels the same way.
	    const __m128i dstLo = _mm_unpacklo_epi8(dst, zero);
	    const __m128i dstHi = _mm_unpackhi_epi8(dst, zero);

	    // (coverage * (src - dst)) >> 5; the difference may be negative,
	    // hence the arithmetic shift.
	    const __m128i deltaLo = _mm_sub_epi16(srci, dstLo);
	    const __m128i deltaHi = _mm_sub_epi16(srci, dstHi);
	    const __m128i stepLo =
	            _mm_srai_epi16(_mm_mullo_epi16(coverageLo, deltaLo), 5);
	    const __m128i stepHi =
	            _mm_srai_epi16(_mm_mullo_epi16(coverageHi, deltaHi), 5);

	    // dst + step, then narrow back to four 32-bit pixels.
	    const __m128i blendedLo = _mm_add_epi16(dstLo, stepLo);
	    const __m128i blendedHi = _mm_add_epi16(dstHi, stepHi);
	    return _mm_packus_epi16(blendedLo, blendedHi);
	}

	/* SSE2 row blitter that blends `color` into `dst` through a row of
	 * 16-bit LCD coverage values, scaling the coverage by the color's alpha.
	 *
	 * dst   - destination row, updated in place. Asserted to be 4-byte
	 *         aligned when width >= 4; it is then advanced pixel by pixel
	 *         to a 16-byte boundary before the SIMD loop.
	 * src   - one 16-bit mask value per pixel.
	 * color - non-premultiplied color; its A/R/G/B components are extracted
	 *         here and alpha is widened with SkAlpha255To256().
	 * width - number of pixels; nothing happens when width <= 0.
	 * (last parameter is unused)
	 *
	 * Leading (unaligned) and trailing (< 4) pixels go through
	 * SkBlendLCD16().
	 */
	void SkBlitLCD16Row_SSE2(SkPMColor dst[], const uint16_t src[],
	                         SkColor color, int width, SkPMColor) {
	    if (width <= 0) {
	        return;
	    }

	    int srcA = SkColorGetA(color);
	    int srcR = SkColorGetR(color);
	    int srcG = SkColorGetG(color);
	    int srcB = SkColorGetB(color);

	    srcA = SkAlpha255To256(srcA);

	    if (width >= 4) {
	        SkASSERT(((size_t)dst & 0x03) == 0);
	        // Blend single pixels until dst reaches a 16-byte boundary.
	        while (((size_t)dst & 0x0F) != 0) {
	            *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
	            src++;
	            dst++;
	            width--;
	        }

	        __m128i *d = reinterpret_cast<__m128i*>(dst);
	        // Source color with one channel per 16-bit word, as expected by
	        // SkBlendLCD16_SSE2().
	        __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
	        srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
	        __m128i scale = _mm_set1_epi16(srcA);
	        while (width >= 4) {
	            __m128i dst_pixel = _mm_load_si128(d);
	            // Loads 4 x 16-bit mask pixels into the low 64 bits; the
	            // upper 64 bits are zeroed.
	            __m128i mask_pixel = _mm_loadl_epi64(
	                                     reinterpret_cast<const __m128i*>(src));

	            // Check whether mask_pixels are equal to 0 and get the highest bit
	            // of each byte of result, if mask pixels are all zero, we will get
	            // pack_cmp to 0xFFFF
	            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
	                                             _mm_setzero_si128()));

	            // if mask pixels are not all zero, we will blend the dst pixels
	            if (pack_cmp != 0xFFFF) {
	                // Unpack 4 16bit mask pixels to
	                // (p0, 0, p1, 0, p2, 0, p3, 0)
	                mask_pixel = _mm_unpacklo_epi16(mask_pixel,
	                                                _mm_setzero_si128());

	                // Process 4 32bit dst pixels
	                __m128i result = SkBlendLCD16_SSE2(srci, dst_pixel,
	                                                   mask_pixel, scale);
	                _mm_store_si128(d, result);
	            }

	            d++;
	            src += 4;
	            width -= 4;
	        }

	        // Hand the advanced position back to the scalar tail loop.
	        dst = reinterpret_cast<SkPMColor*>(d);
	    }

	    // Remaining pixels (fewer than 4, or the whole row when width < 4).
	    while (width > 0) {
	        *dst = SkBlendLCD16(srcA, srcR, srcG, srcB, *dst, *src);
	        src++;
	        dst++;
	        width--;
	    }
	}

	/* SSE2 row blitter that blends `color` into `dst` through a row of
	 * 16-bit LCD coverage values, without scaling the coverage by the
	 * color's alpha (the alpha component of `color` is not read).
	 *
	 * dst       - destination row, updated in place. Asserted to be 4-byte
	 *             aligned when width >= 4; it is then advanced pixel by
	 *             pixel to a 16-byte boundary before the SIMD loop.
	 * src       - one 16-bit mask value per pixel.
	 * color     - non-premultiplied color; only R/G/B are extracted.
	 * width     - number of pixels; nothing happens when width <= 0.
	 * opaqueDst - forwarded to SkBlendLCD16Opaque() in the scalar paths;
	 *             not used by the SIMD loop.
	 */
	void SkBlitLCD16OpaqueRow_SSE2(SkPMColor dst[], const uint16_t src[],
	                               SkColor color, int width, SkPMColor opaqueDst) {
	    if (width <= 0) {
	        return;
	    }

	    int srcR = SkColorGetR(color);
	    int srcG = SkColorGetG(color);
	    int srcB = SkColorGetB(color);

	    if (width >= 4) {
	        SkASSERT(((size_t)dst & 0x03) == 0);
	        // Blend single pixels until dst reaches a 16-byte boundary.
	        while (((size_t)dst & 0x0F) != 0) {
	            *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
	            src++;
	            dst++;
	            width--;
	        }

	        __m128i *d = reinterpret_cast<__m128i*>(dst);
	        // Source color with one channel per 16-bit word, as expected by
	        // SkBlendLCD16Opaque_SSE2().
	        __m128i srci = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
	        srci = _mm_unpacklo_epi8(srci, _mm_setzero_si128());
	        while (width >= 4) {
	            __m128i dst_pixel = _mm_load_si128(d);
	            // Loads 4 x 16-bit mask pixels into the low 64 bits; the
	            // upper 64 bits are zeroed.
	            __m128i mask_pixel = _mm_loadl_epi64(
	                                     reinterpret_cast<const __m128i*>(src));

	            // Check whether mask_pixels are equal to 0 and get the highest bit
	            // of each byte of result, if mask pixels are all zero, we will get
	            // pack_cmp to 0xFFFF
	            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_pixel,
	                                             _mm_setzero_si128()));

	            // if mask pixels are not all zero, we will blend the dst pixels
	            if (pack_cmp != 0xFFFF) {
	                // Unpack 4 16bit mask pixels to
	                // (p0, 0, p1, 0, p2, 0, p3, 0)
	                mask_pixel = _mm_unpacklo_epi16(mask_pixel,
	                                                _mm_setzero_si128());

	                // Process 4 32bit dst pixels
	                __m128i result = SkBlendLCD16Opaque_SSE2(srci, dst_pixel,
	                                                         mask_pixel);
	                _mm_store_si128(d, result);
	            }

	            d++;
	            src += 4;
	            width -= 4;
	        }

	        // Hand the advanced position back to the scalar tail loop.
	        dst = reinterpret_cast<SkPMColor*>(d);
	    }

	    // Remaining pixels (fewer than 4, or the whole row when width < 4).
	    while (width > 0) {
	        *dst = SkBlendLCD16Opaque(srcR, srcG, srcB, *dst, *src, opaqueDst);
	        src++;
	        dst++;
	        width--;
	    }
	}