First pass at a blitter for R11 EAC alpha masks. This shaves 10ms off
the polygon GPU benchmark on the Nexus 7v2 (about a 6.7% speedup).

R=robertphillips@google.com

Author: krajcevski@google.com

Review URL: https://codereview.chromium.org/406693002
diff --git a/src/utils/SkTextureCompressor.cpp b/src/utils/SkTextureCompressor.cpp
index 2b33a13..a593b36 100644
--- a/src/utils/SkTextureCompressor.cpp
+++ b/src/utils/SkTextureCompressor.cpp
@@ -732,6 +732,69 @@
 }
 #endif // COMPRESS_R11_EAC_FASTEST
 
+// The R11 EAC format expects that indices are given in column-major order. Since
+// we receive alpha values in raster order, this usually means that we have to use
+// pack6 above to properly pack our indices. However, if our indices come from the
+// blitter, then each integer will be a column of indices, and hence can be efficiently
+// packed. This function takes the bottom three bits of each byte and places them in
+// the least significant 12 bits of the resulting integer.
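+// For example, on a little-endian machine the input 0x04030201 (index bytes
+// 1, 2, 3, 4 in memory order) packs to 0x29C: the first byte lands in the
+// most significant three-bit slot (bits 9-11) and the last byte in bits 0-2.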
+static inline uint32_t pack_indices_vertical(uint32_t x) {
+#if defined (SK_CPU_BENDIAN)
+    return 
+        (x & 7) |
+        ((x >> 5) & (7 << 3)) |
+        ((x >> 10) & (7 << 6)) |
+        ((x >> 15) & (7 << 9));
+#else
+    return 
+        ((x >> 24) & 7) |
+        ((x >> 13) & (7 << 3)) |
+        ((x >> 2) & (7 << 6)) |
+        ((x << 9) & (7 << 9));
+#endif
+}
+
+// This function returns the compressed format of a block given as four columns of
+// alpha values. Each column is assumed to be loaded from top to bottom, and hence
+// must first be converted to indices and then packed into the resulting 64-bit
+// integer.
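+//
+// For reference, the 64-bit R11 EAC block (per the EAC format, before the
+// byte swap at the end of this function) is laid out as:
+//
+//   | base codeword (8 bits) | multiplier (4) | table index (4) | indices (48) |
+//
+// so the constant 0x8490000000000000 below supplies a base codeword of 0x84
+// with multiplier 9 and table index 0, and the packed, column-major indices
+// are OR'd into the low 48 bits.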
+static inline uint64_t compress_block_vertical(const uint32_t alphaColumn0,
+                                               const uint32_t alphaColumn1,
+                                               const uint32_t alphaColumn2,
+                                               const uint32_t alphaColumn3) {
+
+    if (alphaColumn0 == alphaColumn1 &&
+        alphaColumn2 == alphaColumn3 &&
+        alphaColumn0 == alphaColumn2) {
+
+        if (0 == alphaColumn0) {
+            // Transparent
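+            // (This constant is byte-palindromic, so it reads the same on
+            // either endianness and needs no swap.)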
+            return 0x0020000000002000ULL;
+        } else if (0xFFFFFFFF == alphaColumn0) {
+            // Opaque
+            return 0xFFFFFFFFFFFFFFFFULL;
+        }
+    }
+
+    const uint32_t indexColumn0 = convert_indices(alphaColumn0);
+    const uint32_t indexColumn1 = convert_indices(alphaColumn1);
+    const uint32_t indexColumn2 = convert_indices(alphaColumn2);
+    const uint32_t indexColumn3 = convert_indices(alphaColumn3);
+
+    const uint32_t packedIndexColumn0 = pack_indices_vertical(indexColumn0);
+    const uint32_t packedIndexColumn1 = pack_indices_vertical(indexColumn1);
+    const uint32_t packedIndexColumn2 = pack_indices_vertical(indexColumn2);
+    const uint32_t packedIndexColumn3 = pack_indices_vertical(indexColumn3);
+
+    return SkEndian_SwapBE64(0x8490000000000000ULL |
+                             (static_cast<uint64_t>(packedIndexColumn0) << 36) |
+                             (static_cast<uint64_t>(packedIndexColumn1) << 24) |
+                             (static_cast<uint64_t>(packedIndexColumn2) << 12) |
+                             static_cast<uint64_t>(packedIndexColumn3));
+}
+
 static inline bool compress_a8_to_r11eac(uint8_t* dst, const uint8_t* src,
                                          int width, int height, int rowBytes) {
 #if (COMPRESS_R11_EAC_SLOW) || (COMPRESS_R11_EAC_FAST)
@@ -743,6 +806,35 @@
 #endif
 }
 
+// Updates the block whose columns are stored in blockCol1 through blockCol4.
+// curAlphai packs, into a single integer, the four alpha values that will be
+// written to each of the columns in the range [col, col+colsLeft).
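+// For example, col == 1 with colsLeft == 2 writes curAlphai into *blockCol2
+// and *blockCol3, leaving the other two columns untouched.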
+static inline void update_block_columns(
+    uint32_t* blockCol1, uint32_t* blockCol2, uint32_t* blockCol3, uint32_t* blockCol4,
+    const int col, const int colsLeft, const uint32_t curAlphai) {
+    SkASSERT(NULL != blockCol1);
+    SkASSERT(NULL != blockCol2);
+    SkASSERT(NULL != blockCol3);
+    SkASSERT(NULL != blockCol4);
+    SkASSERT(col + colsLeft <= 4);
+    for (int i = col; i < (col + colsLeft); ++i) {
+        switch (i) {
+        case 0:
+            *blockCol1 = curAlphai;
+            break;
+        case 1:
+            *blockCol2 = curAlphai;
+            break;
+        case 2:
+            *blockCol3 = curAlphai;
+            break;
+        case 3:
+            *blockCol4 = curAlphai;
+            break;
+        }
+    }
+}
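+
+// An equivalent formulation would index a small pointer table instead of
+// switching (a hypothetical sketch; the switch above presumably lets the
+// compiler keep the four columns in registers):
+//
+//   uint32_t* cols[4] = { blockCol1, blockCol2, blockCol3, blockCol4 };
+//   for (int i = col; i < col + colsLeft; ++i) {
+//       *cols[i] = curAlphai;
+//   }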
+
 ////////////////////////////////////////////////////////////////////////////////
 
 namespace SkTextureCompressor {
@@ -820,4 +912,221 @@
     return NULL;
 }
 
+R11_EACBlitter::R11_EACBlitter(int width, int height, void* compressedBuffer)
+    // 0x7FFE is one less than the largest positive 16-bit int (0x7FFF). We
+    // use it for debugging to make sure that we're properly setting the
+    // nextX distance in flushRuns().
+    : kLongestRun(0x7FFE), kZeroAlpha(0)
+    , fNextRun(0)
+    , fWidth(width)
+    , fHeight(height)
+    , fBuffer(reinterpret_cast<uint64_t*>(compressedBuffer))
+{
+    SkASSERT((width % kR11_EACBlockSz) == 0);
+    SkASSERT((height % kR11_EACBlockSz) == 0);
+}
+
+void R11_EACBlitter::blitAntiH(int x, int y,
+                               const SkAlpha* antialias,
+                               const int16_t* runs) {
+    // Make sure that the new row to blit is either the first row that
+    // we're blitting, or exactly the scanline after the last row that
+    // we blitted. This ensures that when we flush the runs, they all
+    // belong to the same block of four rows.
+    if (fNextRun > 0 &&
+        ((x != fBufferedRuns[fNextRun-1].fX) ||
+         (y-1 != fBufferedRuns[fNextRun-1].fY))) {
+        this->flushRuns();
+    }
+
+    // Align the rows to a block boundary. If we receive rows that are not
+    // on a block boundary, then fill in the preceding runs with zeros. We
+    // do this by producing a single RLE of 0x7FFE (i.e. 32766) zero-alpha
+    // pixels. For example, if the first blit lands on y == 6, rows 4 and 5
+    // are each padded with this zero run.
+    const int row = y & ~3;
+    while ((row + fNextRun) < y) {
+        fBufferedRuns[fNextRun].fAlphas = &kZeroAlpha;
+        fBufferedRuns[fNextRun].fRuns = &kLongestRun;
+        fBufferedRuns[fNextRun].fX = 0;
+        fBufferedRuns[fNextRun].fY = row + fNextRun;
+        ++fNextRun;
+    }
+
+    // Make sure that our assumptions aren't violated...
+    SkASSERT(fNextRun == (y & 3));
+    SkASSERT(fNextRun == 0 || fBufferedRuns[fNextRun - 1].fY < y);
+
+    // Set the values of the next run
+    fBufferedRuns[fNextRun].fAlphas = antialias;
+    fBufferedRuns[fNextRun].fRuns = runs;
+    fBufferedRuns[fNextRun].fX = x;
+    fBufferedRuns[fNextRun].fY = y;
+
+    // Once we've buffered four consecutive scanlines that don't violate our
+    // assumptions, it's time to flush them...
+    if (kR11_EACBlockSz == ++fNextRun) {
+        this->flushRuns();
+    }
+}
+
+void R11_EACBlitter::flushRuns() {
+
+    // If we don't have any runs, then just return.
+    if (0 == fNextRun) {
+        return;
+    }
+
+#ifndef NDEBUG
+    // Make sure that the buffered runs are consecutive rows that all start
+    // at the same x...
+    for (int i = 1; i < fNextRun; ++i) {
+        SkASSERT(fBufferedRuns[i].fY == fBufferedRuns[i-1].fY + 1);
+        SkASSERT(fBufferedRuns[i].fX == fBufferedRuns[i-1].fX);
+    }
+#endif
+
+    // If we don't have as many runs as we have rows, fill in the remaining
+    // runs with constant zeros.
+    for (int i = fNextRun; i < kR11_EACBlockSz; ++i) {
+        fBufferedRuns[i].fY = fBufferedRuns[0].fY + i;
+        fBufferedRuns[i].fX = fBufferedRuns[0].fX;
+        fBufferedRuns[i].fAlphas = &kZeroAlpha;
+        fBufferedRuns[i].fRuns = &kLongestRun;
+    }
+
+    // Make sure that our assumptions aren't violated.
+    SkASSERT(fNextRun > 0 && fNextRun <= 4);
+    SkASSERT((fBufferedRuns[0].fY & 3) == 0);
+
+    // The following logic walks four rows at a time and outputs compressed
+    // blocks to the buffer passed into the constructor.
+    // We do the following:
+    //
+    //      c1 c2 c3 c4
+    // -----------------------------------------------------------------------
+    // ... |  |  |  |  |  ----> fBufferedRuns[0]
+    // -----------------------------------------------------------------------
+    // ... |  |  |  |  |  ----> fBufferedRuns[1]
+    // -----------------------------------------------------------------------
+    // ... |  |  |  |  |  ----> fBufferedRuns[2]
+    // -----------------------------------------------------------------------
+    // ... |  |  |  |  |  ----> fBufferedRuns[3]
+    // -----------------------------------------------------------------------
+    // 
+    // curX -- the x position, in pixels, that we have advanced to so far.
+    // c1, c2, c3, c4 -- the integers that represent the columns of the current block
+    //                   that we're operating on
+    // curAlphaColumn -- integer containing the column of alpha values from fBufferedRuns.
+    // nextX -- for each run, the next point at which we need to update curAlphaColumn
+    //          after the value of curX.
+    // finalX -- the minimum of all the nextX values.
+    //
+    // curX advances to finalX outputting any blocks that it passes along
+    // the way. Since finalX will not change when we reach the end of a
+    // run, the termination criteria will be whenever curX == finalX at the
+    // end of a loop.
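+    //
+    // For example, if all four buffered rows consist of a single opaque run
+    // covering 16 pixels, the first pass below writes one block and the
+    // "memset" fast path then writes the remaining three identical blocks
+    // before the loop terminates.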
+
+    // Setup:
+    uint32_t c1 = 0;
+    uint32_t c2 = 0;
+    uint32_t c3 = 0;
+    uint32_t c4 = 0;
+
+    uint32_t curAlphaColumn = 0;
+    SkAlpha* curAlpha = reinterpret_cast<SkAlpha*>(&curAlphaColumn);
+
+    int nextX[kR11_EACBlockSz];
+    for (int i = 0; i < kR11_EACBlockSz; ++i) {
+        nextX[i] = 0x7FFFFF;
+    }
+
+    uint64_t* outPtr = this->getBlock(fBufferedRuns[0].fX, fBufferedRuns[0].fY);
+
+    // Populate the first set of runs and figure out how far we need to
+    // advance on the first step
+    int curX = 0;
+    int finalX = 0xFFFFF;
+    for (int i = 0; i < kR11_EACBlockSz; ++i) {
+        nextX[i] = *(fBufferedRuns[i].fRuns);
+        curAlpha[i] = *(fBufferedRuns[i].fAlphas);
+
+        finalX = SkMin32(nextX[i], finalX);
+    }
+
+    // Make sure that we have a valid right-bound X value
+    SkASSERT(finalX < 0xFFFFF);
+
+    // Run the blitter...
+    while (curX != finalX) {
+        SkASSERT(finalX >= curX);
+
+        // Do we need to populate the rest of the block?
+        if ((finalX - (curX & ~3)) >= kR11_EACBlockSz) {
+            const int col = curX & 3;
+            const int colsLeft = 4 - col;
+            SkASSERT(curX + colsLeft <= finalX);
+
+            update_block_columns(&c1, &c2, &c3, &c4, col, colsLeft, curAlphaColumn);
+
+            // Write this block
+            *outPtr = compress_block_vertical(c1, c2, c3, c4);
+            ++outPtr;
+            curX += colsLeft;
+        }
+
+        // If we can advance even further, then just keep memsetting the block
+        if ((finalX - curX) >= kR11_EACBlockSz) {
+            SkASSERT((curX & 3) == 0);
+
+            const int col = 0;
+            const int colsLeft = kR11_EACBlockSz;
+
+            update_block_columns(&c1, &c2, &c3, &c4, col, colsLeft, curAlphaColumn);
+
+            // While we can keep advancing, just keep writing the block.
+            uint64_t lastBlock = compress_block_vertical(c1, c2, c3, c4);
+            while ((finalX - curX) >= kR11_EACBlockSz) {
+                *outPtr = lastBlock;
+                ++outPtr;
+                curX += kR11_EACBlockSz;
+            }
+        }
+
+        // If we haven't advanced within the block then do so.
+        if (curX < finalX) {
+            const int col = curX & 3;
+            const int colsLeft = finalX - curX;
+
+            update_block_columns(&c1, &c2, &c3, &c4, col, colsLeft, curAlphaColumn);
+
+            curX += colsLeft;
+        }
+
+        SkASSERT(curX == finalX);
+
+        // Figure out what the next advancement is...
+        for (int i = 0; i < kR11_EACBlockSz; ++i) {
+            if (nextX[i] == finalX) {
+                const int16_t run = *(fBufferedRuns[i].fRuns);
+                fBufferedRuns[i].fRuns += run;
+                fBufferedRuns[i].fAlphas += run;
+                curAlpha[i] = *(fBufferedRuns[i].fAlphas);
+                nextX[i] += *(fBufferedRuns[i].fRuns);
+            }
+        }
+
+        finalX = 0xFFFFF;
+        for (int i = 0; i < kR11_EACBlockSz; ++i) {
+            finalX = SkMin32(nextX[i], finalX);
+        }
+    }
+
+    // If we didn't land on a block boundary, output the partial block...
+    if ((curX & 3) > 0) {
+        *outPtr = compress_block_vertical(c1, c2, c3, c4);
+    }
+
+    fNextRun = 0;
+}
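+
+// A minimal usage sketch (hypothetical driver code -- the blitter is normally
+// handed to a scan converter rather than called directly). R11 EAC stores
+// 8 bytes per 4x4 block, so a width x height mask needs
+// (width / 4) * (height / 4) * 8 bytes of output:
+//
+//   SkAutoMalloc storage((width / 4) * (height / 4) * 8);
+//   R11_EACBlitter blitter(width, height, storage.get());
+//   // ... the scan converter calls blitter.blitAntiH() per scanline ...
+//   // the destructor flushes any partially buffered rows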
+
 }  // namespace SkTextureCompressor
diff --git a/src/utils/SkTextureCompressor.h b/src/utils/SkTextureCompressor.h
index ec6153a..ea57934 100644
--- a/src/utils/SkTextureCompressor.h
+++ b/src/utils/SkTextureCompressor.h
@@ -9,6 +9,7 @@
 #define SkTextureCompressor_DEFINED
 
 #include "SkImageInfo.h"
+#include "SkBlitter.h"
 
 class SkBitmap;
 class SkData;
@@ -42,6 +43,167 @@
     // allows SIMD optimized compression functions to be implemented.
     typedef bool (*CompressionProc)(uint8_t* dst, const uint8_t* src,
                                     int width, int height, int rowBytes);
+
+    // This class implements a blitter that blits directly into a buffer that will
+    // be used as an R11 EAC compressed texture. We compute this buffer by
+    // buffering four scan lines and then outputting them all at once. This blitter
+    // is only expected to be used with alpha masks, i.e. kAlpha_8_SkColorType.
+    class R11_EACBlitter : public SkBlitter {
+    public:
+        R11_EACBlitter(int width, int height, void *compressedBuffer);
+        virtual ~R11_EACBlitter() { this->flushRuns(); }
+
+        // Blit a horizontal run of one or more pixels.
+        virtual void blitH(int x, int y, int width) SK_OVERRIDE {
+            // This function is intended for blitting opaque runs into
+            // standard RGB targets, so this blitter should never encounter
+            // it. However, if some code path does end up here, then it
+            // needs to be investigated.
+            SkFAIL("Not implemented!");
+        }
+
+        /// Blit a horizontal run of antialiased pixels; runs[] is a *sparse*
+        /// zero-terminated run-length encoding of spans of constant alpha values.
+        virtual void blitAntiH(int x, int y,
+                               const SkAlpha antialias[],
+                               const int16_t runs[]) SK_OVERRIDE;
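+
+        // For illustration (hypothetical values): a scanline with three
+        // pixels at alpha 0x80 followed by five opaque pixels arrives as
+        //     runs[0] == 3, runs[3] == 5, runs[8] == 0
+        //     antialias[0] == 0x80, antialias[3] == 0xFF
+        // i.e. each run length also says how far ahead the next entry
+        // lives, and a zero-length run terminates the scanline.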
+
+        // Blit a vertical run of pixels with a constant alpha value.
+        virtual void blitV(int x, int y, int height, SkAlpha alpha) SK_OVERRIDE {
+            // This function is currently not implemented. It is not explicitly
+            // required by the contract, but if at some time a code path runs into
+            // this function (which is entirely possible), it needs to be implemented.
+            //
+            // TODO (krajcevski):
+            // This function will be most easily implemented in one of two ways:
+            // 1. Buffer each vertical column value and then construct a list
+            //    of alpha values and output all of the blocks at once. This only
+            //    requires a write to the compressed buffer
+            // 2. Replace the indices of each block with the proper indices based
+            //    on the alpha value. This requires a read and write of the compressed
+            //    buffer, but much less overhead.
+            SkFAIL("Not implemented!");
+        }
+
+        // Blit a solid rectangle one or more pixels wide.
+        virtual void blitRect(int x, int y, int width, int height) SK_OVERRIDE {
+            // Analogous to blitH, this function is intended for RGB targets
+            // and should never be called by this blitter. Any calls to this
+            // function are probably a bug and should be investigated.
+            SkFAIL("Not implemented!");
+        }
+
+        // Blit a rectangle with one alpha-blended column on the left,
+        // width (zero or more) opaque pixels, and one alpha-blended column
+        // on the right. The result will always be at least two pixels wide.
+        virtual void blitAntiRect(int x, int y, int width, int height,
+                                  SkAlpha leftAlpha, SkAlpha rightAlpha) SK_OVERRIDE {
+            // This function is currently not implemented. It is not explicitly
+            // required by the contract, but if at some time a code path runs into
+            // this function (which is entirely possible), it needs to be implemented.
+            //
+            // TODO (krajcevski):
+            // This function will be most easily implemented as follows:
+            // 1. If width/height are smaller than a block, then update the
+            //    indices of the affected blocks.
+            // 2. If width/height are larger than a block, then construct a 9-patch
+            //    of block encodings that represent the rectangle, and write them
+            //    to the compressed buffer as necessary. Whether or not the blocks
+            //    are overwritten with zeros or just have their indices updated
+            //    is up for debate.
+            SkFAIL("Not implemented!");
+        }
+
+        // Blit a pattern of pixels defined by a rectangle-clipped mask;
+        // typically used for text.
+        virtual void blitMask(const SkMask&, const SkIRect& clip) SK_OVERRIDE {
+            // This function is currently not implemented. It is not explicitly
+            // required by the contract, but if at some time a code path runs into
+            // this function (which is entirely possible), it needs to be implemented.
+            //
+            // TODO (krajcevski):
+            // This function will be most easily implemented in the same way as
+            // blitAntiRect above.
+            SkFAIL("Not implemented!");
+        }
+
+        // If the blitter just sets a single value for each pixel, return the
+        // bitmap it draws into, and assign value. If not, return NULL and ignore
+        // the value parameter.
+        virtual const SkBitmap* justAnOpaqueColor(uint32_t* value) SK_OVERRIDE {
+            return NULL;
+        }
+
+        /**
+         * Compressed texture blitters only really work correctly if they get
+         * four scanlines at a time. That being said, this blitter tries its
+         * best to preserve semantics if blitAntiH doesn't get called in too
+         * many weird ways...
+         */
+        virtual int requestRowsPreserved() const { return kR11_EACBlockSz; }
+
+    protected:
+        virtual void onNotifyFinished() { this->flushRuns(); }
+
+    private:
+        static const int kR11_EACBlockSz = 4;
+        static const int kPixelsPerBlock = kR11_EACBlockSz * kR11_EACBlockSz;
+
+        // The longest possible run of pixels that this blitter will receive.
+        // This is initialized in the constructor to 0x7FFE, which is one less
+        // than the largest positive 16-bit integer. Keeping it one below the
+        // maximum makes overruns easier to spot while debugging. We also
+        // don't make this variable static so that we can construct a valid
+        // pointer to it.
+        const int16_t kLongestRun;
+
+        // Usually used in conjunction with kLongestRun. This is initialized to
+        // zero.
+        const SkAlpha kZeroAlpha;
+
+        // This is the information that we buffer whenever we're asked to blit
+        // a row with this blitter.
+        struct BufferedRun {
+            const SkAlpha* fAlphas;
+            const int16_t* fRuns;
+            int fX, fY;
+        } fBufferedRuns[kR11_EACBlockSz];
+
+        // The next row (0-3) that we need to blit. This value should never
+        // exceed the number of rows that we buffer (kR11_EACBlockSz).
+        int fNextRun;
+
+        // The width and height of the image that we're blitting
+        const int fWidth;
+        const int fHeight;
+
+        // The R11 EAC buffer that we're blitting into. It is assumed that the buffer
+        // is large enough to store a compressed image of size fWidth*fHeight.
+        uint64_t* const fBuffer;
+
+        // Various utility functions
+        int blocksWide() const { return fWidth / kR11_EACBlockSz; }
+        int blocksTall() const { return fHeight / kR11_EACBlockSz; }
+        int totalBlocks() const { return (fWidth * fHeight) / kPixelsPerBlock; }
+
+        // Returns the block index for the block containing pixel (x, y). Block
+        // indices start at zero and proceed in raster order.
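+        // For example, with fWidth == 16 (four blocks across), pixel (5, 9)
+        // falls in block column 1 and block row 2, giving offset 2*4 + 1 == 9.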
+        int getBlockOffset(int x, int y) const {
+            SkASSERT(x < fWidth);
+            SkASSERT(y < fHeight);
+            const int blockCol = x / kR11_EACBlockSz;
+            const int blockRow = y / kR11_EACBlockSz;
+            return blockRow * this->blocksWide() + blockCol;
+        }
+
+        // Returns a pointer to the block containing pixel (x, y)
+        uint64_t* getBlock(int x, int y) const {
+            return fBuffer + this->getBlockOffset(x, y);
+        }
+
+        // The following function writes the buffered runs to compressed blocks.
+        // If fNextRun < 4, then we fill the runs that we haven't buffered with
+        // the constant zero buffer.
+        void flushRuns();
+    };
 }
 
 #endif