Optimize getbits and peekbits functions in libmix

BZ: 146000

Optimize getbits and peekbits functions in libmix.
The getbits and peekbits are divided into emulation
prevention and non-emulation prevention versions.

Change-Id: I206fa743cd132b3005d096ea9bb8ff57466fb333
Signed-off-by: wfeng6 <wei.feng@intel.com>
Reviewed-on: http://android.intel.com:8080/140126
Reviewed-by: Shi, PingX <pingx.shi@intel.com>
Tested-by: Shi, PingX <pingx.shi@intel.com>
Reviewed-by: cactus <cactus@intel.com>
Tested-by: cactus <cactus@intel.com>
diff --git a/mixvbp/vbp_manager/include/viddec_pm_utils_bstream.h b/mixvbp/vbp_manager/include/viddec_pm_utils_bstream.h
index ab2569f..3bf1857 100755
--- a/mixvbp/vbp_manager/include/viddec_pm_utils_bstream.h
+++ b/mixvbp/vbp_manager/include/viddec_pm_utils_bstream.h
@@ -49,7 +49,8 @@
 
 int32_t viddec_pm_utils_bstream_skipbits(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t num_bits);
 
-int32_t viddec_pm_utils_bstream_peekbits(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t *out, uint32_t num_bits, uint8_t skip);
+int32_t viddec_pm_utils_bstream_peekbits(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t *out, uint32_t num_bits);
+int32_t viddec_pm_utils_bstream_getbits(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t *out, uint32_t num_bits);
 
 int32_t viddec_pm_utils_bstream_get_current_byte(viddec_pm_utils_bstream_cxt_t *cxt, uint8_t *byte);
 
diff --git a/mixvbp/vbp_manager/viddec_pm_parser_ops.c b/mixvbp/vbp_manager/viddec_pm_parser_ops.c
index 8c0a1ec..b10eb2d 100755
--- a/mixvbp/vbp_manager/viddec_pm_parser_ops.c
+++ b/mixvbp/vbp_manager/viddec_pm_parser_ops.c
@@ -12,10 +12,10 @@
     viddec_pm_cxt_t *cxt;
 
     cxt = (viddec_pm_cxt_t *)parent;
-    ret = viddec_pm_utils_bstream_peekbits(&(cxt->getbits), data, num_bits, 1);
+    ret = viddec_pm_utils_bstream_getbits(&(cxt->getbits), data, num_bits);
     if (ret == -1)
     {
-        VTRACE("FAILURE? getbits returned %d", ret);
+        VTRACE("FAILURE: getbits returned %d", ret);
     }
 
     return ret;
@@ -27,7 +27,11 @@
     viddec_pm_cxt_t *cxt;
 
     cxt = (viddec_pm_cxt_t *)parent;
-    ret = viddec_pm_utils_bstream_peekbits(&(cxt->getbits), data, num_bits, 0);
+    ret = viddec_pm_utils_bstream_peekbits(&(cxt->getbits), data, num_bits);
+    if (ret == -1)
+    {
+        VTRACE("FAILURE: peekbits returned %d", ret);
+    }
     return ret;
 }
 
diff --git a/mixvbp/vbp_manager/viddec_pm_utils_bstream.c b/mixvbp/vbp_manager/viddec_pm_utils_bstream.c
index 72d210b..edc8061 100755
--- a/mixvbp/vbp_manager/viddec_pm_utils_bstream.c
+++ b/mixvbp/vbp_manager/viddec_pm_utils_bstream.c
@@ -13,6 +13,12 @@
 uint32_t viddec_pm_utils_bstream_getphys(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t pos, uint32_t lst_index);
 extern uint32_t cp_using_dma(uint32_t ddr_addr, uint32_t local_addr, uint32_t size, char to_ddr, char swap);
 
+static int32_t viddec_pm_utils_bstream_peekbits_noemul(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t *out, uint32_t num_bits);
+static int32_t viddec_pm_utils_bstream_peekbits_emul(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t *out, uint32_t num_bits);
+static int32_t viddec_pm_utils_bstream_getbits_noemul(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t *out, uint32_t num_bits);
+static int32_t viddec_pm_utils_bstream_getbits_emul(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t *out, uint32_t num_bits);
+
+
 /* Bytes left in cubby buffer which were not consumed yet */
 static inline uint32_t viddec_pm_utils_bstream_bytesincubby(viddec_pm_utils_bstream_buf_cxt_t *cxt)
 {
@@ -233,44 +239,154 @@
 }
 
 /*
-  Function to get N bits ( N<= 32).
+  Function to get N bits (N<= 32). This function will update the bitstream position.
 */
-int32_t viddec_pm_utils_bstream_peekbits(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t *out, uint32_t num_bits, uint8_t skip)
+int32_t viddec_pm_utils_bstream_getbits(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t *out, uint32_t num_bits)
+{ /* Public entry point: forwards to one of the two static getbits variants and returns its result unchanged. */
+    if (cxt->is_emul_reqd) { /* context flag selects the variant that skips 0x03 emulation prevention bytes */
+        return viddec_pm_utils_bstream_getbits_emul(cxt, out, num_bits);
+    } else { /* otherwise use the variant that copies bytes verbatim */
+        return viddec_pm_utils_bstream_getbits_noemul(cxt, out, num_bits);
+    }
+}
+
+/*
+  Function to peek at N bits (N <= 32). This function will NOT update the bitstream position.
+*/
+int32_t viddec_pm_utils_bstream_peekbits(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t *out, uint32_t num_bits)
+{ /* Forwards to the emulation or non-emulation peek variant and returns its result unchanged. */
+    if (cxt->is_emul_reqd) { /* context flag selects the variant that skips 0x03 emulation prevention bytes */
+        return viddec_pm_utils_bstream_peekbits_emul(cxt, out, num_bits);
+    } else { /* otherwise use the variant that copies bytes verbatim */
+        return viddec_pm_utils_bstream_peekbits_noemul(cxt, out, num_bits);
+    }
+}
+
+static inline int32_t getbytes_noemul(viddec_pm_utils_bstream_buf_cxt_t *bstream,
+        viddec_pm_utils_getbits_t *data,/* gets populated with the bytes read, in stream order */
+        uint32_t *act_bytes, /* out: number of bytes read; always equals num_bytes here since nothing is skipped */
+        uint32_t *phase,    /* unused in this variant; kept so the signature matches getbytes_emul */
+        uint32_t num_bytes,/* requested number of bytes */
+        uint8_t is_offset_zero /* unused in this variant; kept so the signature matches getbytes_emul */)
+{ /* Copies num_bytes bytes starting at buf[buf_index] into data->byte[]; returns 1, or -1 if the last byte read is at or past buf_end. */
+    int32_t ret = 1;
+    uint8_t cur_byte = 0, valid_bytes_read = 0; /* NOTE(review): 8-bit counter compared with 32-bit num_bytes; visible callers pass at most 5 - confirm */
+    *act_bytes = 0;
+    while (valid_bytes_read < num_bytes)
+    {
+        cur_byte = bstream->buf[bstream->buf_index + *act_bytes];
+        data->byte[valid_bytes_read] = cur_byte;
+        valid_bytes_read++;
+        *act_bytes +=1;
+    }
+    /* Check to see if we reached the end during the above operation. The reads may have gone out of range, but the author
+       states this is safe: the array has at least MIN_DATA extra bytes and the maximum overrun is 5 bytes */
+    if ((bstream->buf_index + *act_bytes -1) >= bstream->buf_end)
+    {
+        ret = -1;
+    }
+    return ret;
+}
+
+
+/* Populates data with the requested number of payload bytes, dropping each 0x03 byte seen while *phase == 2. Returns 1, or -1 if the last byte read is at or past buf_end. */
+static inline int32_t getbytes_emul(viddec_pm_utils_bstream_buf_cxt_t *bstream,
+        viddec_pm_utils_getbits_t *data,/* gets populated with the bytes kept, in stream order */
+        uint32_t *act_bytes, /* out: bytes consumed from the buffer; exceeds num_bytes when emulation bytes were skipped */
+        uint32_t *phase,    /* in/out: count of consecutive zero bytes seen so far, saturating at 2 */
+        uint32_t num_bytes,/* requested number of payload bytes */
+        uint8_t is_offset_zero /* non-zero when the first byte starts at bit offset 0 */)
+{
+    int32_t ret = 1;
+    uint8_t cur_byte = 0, valid_bytes_read = 0;
+    uint32_t actual_bytes = 0;
+    *act_bytes = 0;
+
+    uint8_t *curr_pos = (uint8_t *)(bstream->buf + bstream->buf_index);
+
+    while (valid_bytes_read < num_bytes)
+    {
+        cur_byte = *curr_pos++;
+   //     ITRACE("getbytes_emul cur_byte = 0x%x", cur_byte);
+        if ((cur_byte == 0x3) && (*phase == 2))
+        {/* 0x03 after two zero bytes: drop it (not stored, not counted as valid) and restart the zero-run count */
+            *phase = 0;
+        }
+        else
+        {
+            data->byte[valid_bytes_read] = cur_byte;
+            /*
+                    We only update phase for the first byte if the bit offset is 0. If it's not 0 then that byte was already accounted for in the past.
+                    From the second byte onwards we always look to update phase.
+                    */
+            if ((actual_bytes != 0) || (is_offset_zero))
+            {
+                if (cur_byte == 0)
+                {
+                    /* Another zero byte: bump the zero-run count, saturating at 2 */
+                    *phase += (*phase < 2 ? 1:0 );
+                }
+                else
+                {
+                    *phase=0;
+                }
+            }
+            valid_bytes_read++;
+        }
+        actual_bytes++;
+    }
+    /*
+        Check to see if we reached the end during the above operation. The reads may have gone out of range, but the author
+        states this is safe: the array has at least MIN_DATA extra bytes and the maximum overrun is 5 bytes
+       */
+
+    if ((bstream->buf_index + actual_bytes -1) >= bstream->buf_end)
+    {
+        ret = -1;
+    }
+    *act_bytes = actual_bytes;
+    return ret;
+}
+
+static int32_t viddec_pm_utils_bstream_getbits_emul(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t *out, uint32_t num_bits)
+{
+
     uint32_t data_left=0;
     int32_t ret = -1;
+
     /* STEP 1: Make sure that we have at least minimum data before we calculate bits */
-    viddec_pm_utils_check_bstream_reload(cxt, &data_left);
+    viddec_pm_utils_bstream_buf_cxt_t *bstream;
+
+    bstream = &(cxt->bstrm_buf);
+    data_left = bstream->buf_end - bstream->buf_index;
+
+    uint32_t bytes_required=0;
+    uint32_t act_bytes = 0;
+    uint32_t phase;
+    viddec_pm_utils_getbits_t data;
 
     if ((num_bits <= 32) && (num_bits > 0) && (data_left != 0))
     {
-        uint32_t bytes_required = 0;
-        viddec_pm_utils_bstream_buf_cxt_t *bstream;
-
-        bstream = &(cxt->bstrm_buf);
-        bytes_required = (bstream->buf_bitoff + num_bits + 7) >> 3;
+        bytes_required = (bstream->buf_bitoff + num_bits + 7)>>3;
 
         /* Step 2: Make sure we have bytes for requested bits */
         if (bytes_required <= data_left)
         {
-            uint32_t act_bytes, phase;
-            viddec_pm_utils_getbits_t data;
             phase = cxt->phase;
-            /* Step 3: Due to emualtion prevention bytes sometimes the bytes_required < actual_required bytes */
-            if (viddec_pm_utils_getbytes(bstream, &data, &act_bytes, &phase, bytes_required, cxt->is_emul_reqd, (bstream->buf_bitoff == 0)) != -1)
+            /* Step 3: Due to emualtion prevention bytes sometimes the bytes_required > actual_required bytes */
+            if (getbytes_emul(bstream, &data, &act_bytes, &phase, bytes_required, (bstream->buf_bitoff == 0)) != -1)
             {
-                uint32_t total_bits = 0;
-                uint32_t shift_by = 0;
+                uint32_t total_bits=0;
+                uint32_t shift_by=0;
                 /* zero out upper bits */
                 /* LIMITATION:For some reason compiler is optimizing it to NOP if i do both shifts
                    in single statement */
                 data.byte[0] <<= bstream->buf_bitoff;
                 data.byte[0] >>= bstream->buf_bitoff;
-
                 data.word[0] = SWAP_WORD(data.word[0]);
                 data.word[1] = SWAP_WORD(data.word[1]);
+                total_bits = num_bits+bstream->buf_bitoff;
 
-                total_bits = num_bits + bstream->buf_bitoff;
                 if (total_bits > 32)
                 {
                     /* We have to use both the words to get required data */
@@ -283,17 +399,218 @@
                     data.word[0] = data.word[0] >> shift_by;
                 }
                 *out = data.word[0];
-                if (skip)
-                {
-                    /* update au byte position if needed */
-                    viddec_pm_utils_update_skipoffsets(bstream, total_bits, act_bytes);
-                    cxt->phase = phase;
 
-                    if (act_bytes > bytes_required)
-                    {
-                        cxt->emulation_byte_counter += act_bytes - bytes_required;
-                    }
+                /* update au byte position if needed */
+                if ((total_bits & 0x7) == 0)
+                {
+                    bstream->buf_bitoff = 0;
+                    bstream->buf_index +=act_bytes;
                 }
+                else
+                {
+                    bstream->buf_bitoff = total_bits & 0x7;
+                    bstream->buf_index +=(act_bytes - 1);
+                }
+                cxt->phase = phase;
+                if (act_bytes > bytes_required)
+                {
+                    cxt->emulation_byte_counter += act_bytes - bytes_required;
+                }
+
+                ret=1;
+            }
+        }
+    }
+    return ret;
+
+}
+
+static int32_t viddec_pm_utils_bstream_getbits_noemul(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t *out, uint32_t num_bits)
+{ /* Reads num_bits (1..32) into *out without emulation handling and advances the position; returns 1 on success, -1 otherwise. */
+    uint32_t data_left=0;
+    int32_t ret = -1;
+    /* STEP 1: Compute how many bytes remain in the current buffer (no reload is attempted here) */
+
+    viddec_pm_utils_bstream_buf_cxt_t *bstream;
+
+    bstream = &(cxt->bstrm_buf);
+    data_left = bstream->buf_end - bstream->buf_index;
+    uint32_t bytes_required=0;
+    if ((num_bits <= 32) && (num_bits > 0) && (data_left != 0))
+    {
+        bytes_required = (bstream->buf_bitoff + num_bits + 7)>>3;
+
+        /* Step 2: Make sure we have bytes for requested bits */
+        if (bytes_required <= data_left)
+        {
+            uint32_t act_bytes, phase;
+            viddec_pm_utils_getbits_t data;
+            phase = cxt->phase;
+            /* Step 3: Copy the bytes; in this variant act_bytes comes back equal to bytes_required and phase is left untouched */
+            if (getbytes_noemul(bstream, &data, &act_bytes, &phase, bytes_required, (bstream->buf_bitoff == 0)) != -1)
+            {
+                uint32_t total_bits=0;
+                uint32_t shift_by=0;
+                /* zero out the top buf_bitoff bits of the first byte (assumes byte[] holds 8-bit unsigned values - TODO confirm) */
+                /* LIMITATION: For some reason the compiler optimizes this to a NOP if both shifts are done
+                   in a single statement */
+                data.byte[0] <<= bstream->buf_bitoff;
+                data.byte[0] >>= bstream->buf_bitoff;
+                data.word[0] = SWAP_WORD(data.word[0]);
+                data.word[1] = SWAP_WORD(data.word[1]);
+                total_bits = num_bits+bstream->buf_bitoff;
+                if (total_bits > 32)
+                {
+                    /* We have to use both the words to get required data */
+                    shift_by = total_bits - 32;
+                    data.word[0] = (data.word[0] << shift_by) | ( data.word[1] >> (32 - shift_by));
+                }
+                else
+                {
+                    shift_by = 32 - total_bits;
+                    data.word[0] = data.word[0] >> shift_by;
+                }
+                *out = data.word[0];
+
+                /* advance the byte index and bit offset past the bits just read */
+                if ((total_bits & 0x7) == 0)
+                {
+                    bstream->buf_bitoff = 0;
+                    bstream->buf_index +=act_bytes;
+                }
+                else
+                {
+                    bstream->buf_bitoff = total_bits & 0x7;
+                    bstream->buf_index +=(act_bytes - 1); /* last byte is only partly consumed, so stay on it */
+                }
+                cxt->phase = phase; /* NOTE(review): getbytes_noemul never writes phase, so this stores back the value just read */
+                if (act_bytes > bytes_required) /* NOTE(review): looks unreachable here, act_bytes == bytes_required in this variant */
+                {
+                    cxt->emulation_byte_counter += act_bytes - bytes_required;
+                }
+
+                ret =1;
+            }
+        }
+    }
+    return ret;
+
+}
+
+static int32_t viddec_pm_utils_bstream_peekbits_emul(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t *out, uint32_t num_bits)
+{ /* Reads num_bits (1..32) into *out, skipping emulation bytes, without changing position or cxt->phase; returns 1 on success, -1 otherwise. */
+    uint32_t data_left=0;
+    int32_t ret = -1;
+    /* STEP 1: Compute how many bytes remain in the current buffer (no reload is attempted here) */
+
+    viddec_pm_utils_bstream_buf_cxt_t *bstream;
+
+    bstream = &(cxt->bstrm_buf);
+    data_left = bstream->buf_end - bstream->buf_index;
+
+    uint32_t act_bytes = 0, phase;
+    viddec_pm_utils_getbits_t data;
+    uint32_t bytes_required=0; /* NOTE(review): never used; the inner declaration below shadows it */
+
+    if ((num_bits <= 32) && (num_bits > 0) && (data_left != 0))
+    {
+        uint32_t bytes_required=0; /* NOTE(review): shadows the outer bytes_required */
+        viddec_pm_utils_bstream_buf_cxt_t *bstream; /* NOTE(review): shadows the outer bstream; re-assigned to the same address below */
+
+        bstream = &(cxt->bstrm_buf);
+        bytes_required = (bstream->buf_bitoff + num_bits + 7)>>3;
+
+        /* Step 2: Make sure we have bytes for requested bits */
+        if (bytes_required <= data_left)
+        {
+            phase = cxt->phase; /* work on a local copy so the context's phase is not modified by a peek */
+            /* Step 3: Due to emulation prevention bytes, the bytes actually consumed (act_bytes) can exceed bytes_required */
+            if (getbytes_emul(bstream, &data, &act_bytes, &phase, bytes_required, (bstream->buf_bitoff == 0)) != -1)
+            {
+                uint32_t total_bits=0;
+                uint32_t shift_by=0;
+                /* zero out the top buf_bitoff bits of the first byte (assumes byte[] holds 8-bit unsigned values - TODO confirm) */
+                /* LIMITATION: For some reason the compiler optimizes this to a NOP if both shifts are done
+                   in a single statement */
+                data.byte[0] <<= bstream->buf_bitoff;
+                data.byte[0] >>= bstream->buf_bitoff;
+
+                data.word[0] = SWAP_WORD(data.word[0]);
+                data.word[1] = SWAP_WORD(data.word[1]);
+                total_bits = num_bits+bstream->buf_bitoff;
+                if (total_bits > 32)
+                {
+                    /* We have to use both the words to get required data */
+                    shift_by = total_bits - 32;
+                    data.word[0] = (data.word[0] << shift_by) | ( data.word[1] >> (32 - shift_by));
+                }
+                else
+                {
+                    shift_by = 32 - total_bits;
+                    data.word[0] = data.word[0] >> shift_by;
+                }
+                *out = data.word[0];
+
+                ret =1;
+            }
+        }
+    }
+    return ret;
+}
+
+static int32_t viddec_pm_utils_bstream_peekbits_noemul(viddec_pm_utils_bstream_cxt_t *cxt, uint32_t *out, uint32_t num_bits)
+{
+    uint32_t data_left=0;
+    int32_t ret = -1;
+    /* STEP 1: Make sure that we have at least minimum data before we calculate bits */
+    //viddec_pm_utils_check_bstream_reload(cxt, &data_left);
+
+    viddec_pm_utils_bstream_buf_cxt_t *bstream;
+
+    bstream = &(cxt->bstrm_buf);
+    data_left = bstream->buf_end - bstream->buf_index;
+    uint32_t bytes_required=0;
+
+    if ((num_bits <= 32) && (num_bits > 0) && (data_left != 0))
+    {
+        uint32_t bytes_required=0;
+        viddec_pm_utils_bstream_buf_cxt_t *bstream;
+
+        bstream = &(cxt->bstrm_buf);
+        bytes_required = (bstream->buf_bitoff + num_bits + 7)>>3;
+
+        /* Step 2: Make sure we have bytes for requested bits */
+        if (bytes_required <= data_left)
+        {
+            uint32_t act_bytes, phase;
+            viddec_pm_utils_getbits_t data;
+            phase = cxt->phase;
+            /* Step 3: Due to emualtion prevention bytes sometimes the bytes_required > actual_required bytes */
+            if (getbytes_noemul(bstream, &data, &act_bytes, &phase, bytes_required, (bstream->buf_bitoff == 0)) != -1)
+            {
+                uint32_t total_bits=0;
+                uint32_t shift_by=0;
+                /* zero out upper bits */
+                /* LIMITATION:For some reason compiler is optimizing it to NOP if i do both shifts
+                   in single statement */
+                data.byte[0] <<= bstream->buf_bitoff;
+                data.byte[0] >>= bstream->buf_bitoff;
+
+                data.word[0] = SWAP_WORD(data.word[0]);
+                data.word[1] = SWAP_WORD(data.word[1]);
+                total_bits = num_bits+bstream->buf_bitoff;
+                if (total_bits > 32)
+                {
+                    /* We have to use both the words to get required data */
+                    shift_by = total_bits - 32;
+                    data.word[0] = (data.word[0] << shift_by) | ( data.word[1] >> (32 - shift_by));
+                }
+                else
+                {
+                    shift_by = 32 - total_bits;
+                    data.word[0] = data.word[0] >> shift_by;
+                }
+                *out = data.word[0];
 
                 ret = 1;
             }