Improve tile-based decoding performance

Change-Id: I4043edff510ffe50fdf47a342d6a95e879918737
diff --git a/jdapistd.c b/jdapistd.c
index 8d6e591..e1233df 100644
--- a/jdapistd.c
+++ b/jdapistd.c
@@ -221,6 +221,9 @@
   int col_right_boundary =
                   jdiv_round_up(*start_x + *width, lines_per_iMCU_col);
 
+  cinfo->coef->MCU_columns_to_skip =
+      *start_x / lines_per_iMCU_col - col_left_boundary;
+
   *height = (*start_y - row_offset * lines_per_iMCU_row) + *height;
   *start_x = col_left_boundary * lines_per_iMCU_col;
   *start_y = row_offset * lines_per_iMCU_row;
diff --git a/jdcoefct.c b/jdcoefct.c
index b10f9bc..9e8040b 100644
--- a/jdcoefct.c
+++ b/jdcoefct.c
@@ -170,13 +170,18 @@
     for (MCU_col_num = coef->MCU_ctr; MCU_col_num <= last_MCU_col;
 	 MCU_col_num++) {
       /* Try to fetch an MCU.  Entropy decoder expects buffer to be zeroed. */
-      jzero_far((void FAR *) coef->MCU_buffer[0],
+      if (MCU_col_num < coef->pub.MCU_columns_to_skip) {
+        (*cinfo->entropy->decode_mcu_discard_coef) (cinfo);
+        continue;
+      } else {
+        jzero_far((void FAR *) coef->MCU_buffer[0],
 		(size_t) (cinfo->blocks_in_MCU * SIZEOF(JBLOCK)));
-      if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
-	/* Suspension forced; update state counters and exit */
-	coef->MCU_vert_offset = yoffset;
-	coef->MCU_ctr = MCU_col_num;
-	return JPEG_SUSPENDED;
+        if (! (*cinfo->entropy->decode_mcu) (cinfo, coef->MCU_buffer)) {
+	  /* Suspension forced; update state counters and exit */
+	  coef->MCU_vert_offset = yoffset;
+	  coef->MCU_ctr = MCU_col_num;
+	  return JPEG_SUSPENDED;
+        }
       }
       /* Determine where data should go in output_buf and do the IDCT thing.
        * We skip dummy blocks at the right and bottom edges (but blkn gets
@@ -203,8 +208,8 @@
 	    output_col = start_col;
 	    for (xindex = 0; xindex < useful_width; xindex++) {
 	      (*inverse_DCT) (cinfo, compptr,
-			      (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
-			      output_ptr, output_col);
+		        (JCOEFPTR) coef->MCU_buffer[blkn+xindex],
+		        output_ptr, output_col);
 	      output_col += compptr->DCT_scaled_size;
 	    }
 	  }
@@ -575,11 +580,25 @@
     }
     inverse_DCT = cinfo->idct->inverse_DCT[ci];
     output_ptr = output_buf[ci];
+    int width_in_blocks = compptr->width_in_blocks;
+    int start_block = 0;
+#if ANDROID_TILE_BASED_DECODE
+    if (cinfo->tile_decode) {
+      width_in_blocks = jmin(width_in_blocks,
+        (cinfo->coef->MCU_column_right_boundary -
+         cinfo->coef->MCU_column_left_boundary) *
+         cinfo->max_h_samp_factor /
+         compptr->h_samp_factor);
+      start_block = coef->pub.MCU_columns_to_skip *
+        cinfo->max_h_samp_factor / compptr->h_samp_factor;
+    }
+#endif
     /* Loop over all DCT blocks to be processed. */
     for (block_row = 0; block_row < block_rows; block_row++) {
       buffer_ptr = buffer[block_row];
-      output_col = 0;
-      for (block_num = 0; block_num < compptr->width_in_blocks; block_num++) {
+      output_col = start_block * compptr->DCT_scaled_size;
+      buffer_ptr += start_block;
+      for (block_num = start_block; block_num < width_in_blocks; block_num++) {
 	(*inverse_DCT) (cinfo, compptr, (JCOEFPTR) buffer_ptr,
 			output_ptr, output_col);
 	buffer_ptr++;
@@ -906,6 +925,7 @@
   coef->pub.start_output_pass = start_output_pass;
   coef->pub.column_left_boundary = 0;
   coef->pub.column_right_boundary = 0;
+  coef->pub.MCU_columns_to_skip = 0;
 #ifdef BLOCK_SMOOTHING_SUPPORTED
   coef->coef_bits_latch = NULL;
 #endif
diff --git a/jpegint.h b/jpegint.h
index fd6d8bb..3b5511e 100644
--- a/jpegint.h
+++ b/jpegint.h
@@ -177,10 +177,20 @@
 				 JSAMPIMAGE output_buf));
   /* Pointer to array of coefficient virtual arrays, or NULL if none */
   jvirt_barray_ptr *coef_arrays;
+
+  /* column number of the first and last tile, respectively */
   int column_left_boundary;
   int column_right_boundary;
+
+  /* column number of the first and last MCU, respectively */
   int MCU_column_left_boundary;
   int MCU_column_right_boundary;
+
+  /* The number of MCU columns to skip between the indexed MCU (iM) and
+   * the requested MCU boundary (rM), where iM is the MCU sampled into our
+   * index that lies nearest to the left of rM.
+   */
+  int MCU_columns_to_skip;
 };
 
 /* Decompression postprocessing (color quantization buffer control) */