libjpeg: Remove the old assembly code for ARM.

A much better one is coming.

Change-Id: I60d8c227d573fcbff10af363d69405e9fbd0c147
diff --git a/Android.mk b/Android.mk
index 9e1c42e..1b2399d 100644
--- a/Android.mk
+++ b/Android.mk
@@ -10,8 +10,8 @@
 	jdatadst.c jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c \
 	jdinput.c jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c \
 	jdpostct.c jdsample.c jdtrans.c jerror.c jfdctflt.c jfdctfst.c \
-	jfdctint.c jidctflt.c jidctred.c jquant1.c \
-	jquant2.c jutils.c jmemmgr.c \
+	jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c jquant1.c \
+	jquant2.c jutils.c jmemmgr.c
 
 # use ashmem as libjpeg decoder's backing store
 LOCAL_CFLAGS += -DUSE_ANDROID_ASHMEM
@@ -23,21 +23,6 @@
 #LOCAL_SRC_FILES += \
 #	jmem-android.c
 
-
-# the assembler is only for the ARM version, don't break the Linux sim
-ifneq ($(TARGET_ARCH),arm)
-ANDROID_JPEG_NO_ASSEMBLER := true
-endif
-
-# temp fix until we understand why this broke cnn.com
-#ANDROID_JPEG_NO_ASSEMBLER := true
-
-ifeq ($(strip $(ANDROID_JPEG_NO_ASSEMBLER)),true)
-LOCAL_SRC_FILES += jidctint.c jidctfst.c
-else
-LOCAL_SRC_FILES += jidctint.c jidctfst.S
-endif
-
 LOCAL_CFLAGS += -DAVOID_TABLES 
 LOCAL_CFLAGS += -O3 -fstrict-aliasing -fprefetch-loop-arrays
 #LOCAL_CFLAGS += -march=armv6j
diff --git a/jidctfst.S b/jidctfst.S
deleted file mode 100644
index 34e1c24..0000000
--- a/jidctfst.S
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Copyright (C) 2008 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <machine/cpu-features.h>
-
-    .text
-    .align
-
-    .global jpeg_idct_ifast
-    .func   jpeg_idct_ifast
-
-// NOTE: sb=r9, fp=r11 ip=r12, sp=r13, lr=r14, pc=r15
-
-// jpeg_idct_ifast (j_decompress_ptr       cinfo,
-//                 jpeg_component_info *   compptr,
-//                 short*                  coef_block,
-//                 unsigned char*          output_buf,
-//                 int                     output_col)
-
-#define  local_TMP0123       sp
-#define  local_TMP0          [sp, #0]
-#define  local_TMP1          [sp, #4]
-#define  local_TMP2          [sp, #8]
-#define  local_TMP3          [sp, #12]
-#define  local_RANGE_TABLE   [sp, #16]
-#define  local_OUTPUT_COL    [sp, #20]
-#define  local_OUTPUT_BUF    [sp, #24]
-#define  local_UNUSED        [sp, #28]
-#define  off_WORKSPACE       32
-#define  local_WORKSPACE     [sp, #offWORKSPACE]
-#define  local_SIZE          (off_WORKSPACE + 8*8*4)
-
-#define  off_DECOMPRESS_range_limit_base  324
-#define  off_COMPINFO_quanttable          80
-
-#define  DCTSIZE   8
-#define  VY(x)   ((x)*DCTSIZE*2)
-#define  QY(x)   ((x)*DCTSIZE*4)
-
-#define  VX(x)   ((x)*2)
-#define  QX(x)   ((x)*4)
-
-#define  FIX_1_414213562    #362
-#define  FIX_1_082392200    #277
-#define  FIX_1_847759065    #473
-#define  FIX_2_613125930    #669
-
-#define  RANGE_MASK   1023
-
-
-
-jpeg_idct_ifast:
-    PLD     (r2, #0)
-    stmdb   sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
-    ldr     r4, [sp, #4*10]
-    sub     sp, #local_SIZE
-
-    ldr     r10,[r1, #off_COMPINFO_quanttable]         // r10 = quanttable
-    str     r4, local_OUTPUT_COL
-    str     r3, local_OUTPUT_BUF
-    ldr     r5, [r0, #off_DECOMPRESS_range_limit_base]
-    add     r5, r5, #128
-    str     r5, local_RANGE_TABLE
-    mov     fp, r2                                      // fp = coef_block
-    add     ip, sp, #off_WORKSPACE
-
-VLoopTail:
-    ldrsh    r0, [fp, #VY(0)]
-    ldrsh    r1, [fp, #VY(1)]
-    ldrsh    r2, [fp, #VY(2)]
-    ldrsh    r3, [fp, #VY(3)]
-    ldrsh    r4, [fp, #VY(4)]
-    ldrsh    r5, [fp, #VY(5)]
-    ldrsh    r6, [fp, #VY(6)]
-    ldrsh    r7, [fp, #VY(7)]
-
-    cmp      r1, #0
-    orreqs   r8, r2, r3
-    orreqs   r8, r4, r5
-    orreqs   r8, r6, r7
-    beq      VLoopHeadZero
-
-VLoopHead:
-    // tmp0 = DEQUANTIZE(in[DCTSIZE*0], quant[DCTSIZE*0]   (r0)
-    // tmp2 = DEQUANTIZE(in[DCTSIZE*4], quant[DCTSIZE*4]   (r4)
-    // tmp1 = DEQUANTIZE(in[DCTSIZE*2], quant[DCTSIZE*2]   (r2)
-    // tmp3 = DEQUANTIZE(in[DCTSIZE*6], quant[DCTSIZE*6]   (r6)
-    // tmp10 = tmp0 + tmp2   (r0)
-    // tmp11 = tmp0 - tmp2   (r4)
-
-    ldr      r9, [r10, #QY(4)]
-    ldr      r8, [r10, #QY(0)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
-    smulbb   r4, r9, r4
-    smlabb   r0, r8, r0, r4
-#else
-    mul      r4, r9, r4
-    mul      r0, r8, r0
-    add      r0, r4
-#endif
-    ldr      r9, [r10, #QY(6)]
-    ldr      r8, [r10, #QY(2)]
-    sub      r4, r0, r4, lsl #1
-#if __ARM_HAVE_HALFWORD_MULTIPLY
-    smulbb   r6, r9, r6
-    smlabb   r2, r8, r2, r6
-#else
-    mul      r6, r9, r6
-    mul      r2, r8, r2
-    add      r2, r6
-#endif
-
-    // tmp13 = tmp1 + tmp3                                       (r2)
-    // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13    (r6)
-    // FIX_1_4142... = 362 = 45*8 + 2
-    sub      r6, r2, r6, lsl #1
-    mov      r8, #360
-    add      r8, r8, #2
-    mul      r9, r6, r8
-
-    // tmp0 = tmp10 + tmp13;   (r0)
-    // tmp3 = tmp10 - tmp13;   (r8)
-    // tmp1 = tmp11 + tmp12;   (r4)
-    // tmp2 = tmp11 - tmp12;   (r6)
-    add     r0, r0, r2
-    rsb     r6, r2, r9, asr #8
-    sub     r8, r0, r2, lsl #1
-    add     r4, r4, r6
-    sub     r6, r4, r6, lsl #1
-
-    stmia   local_TMP0123, {r0, r4, r6, r8}
-
-    // NOTE: be sure to not user r0,r4,r6,r8 soon after stm above
-
-    // odd part
-    // tmp4 = DEQUANTIZE( in[DCTSIZE*1], quant[DCTSIZE*1] )   (r1)
-    // tmp6 = DEQUANTIZE( in[DCTSIZE*5], quant[DCTSIZE*5] )   (r5)
-    // tmp5 = DEQUANTIZE( in[DCTSIZE*3], quant[DCTSIZE*3] )   (r3)
-    // tmp7 = DEQUANTIZE( in[DCTSIZE*7], quant[DCTSIZE*7] )   (r7)
-    // z13 = tmp6 + tmp5;  (r0)
-    // z10 = tmp6 - tmp5;  (r2)
-    // z11 = tmp4 + tmp7;  (r4)
-    // z12 = tmp4 - tmp7;  (r6)
-
-    ldr     r2, [r10, #QY(1)]
-    ldr     r9, [r10, #QY(5)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
-    smulbb  r1, r2, r1
-#else
-    mul     r1, r2, r1
-#endif
-    ldr     r2, [r10, #QY(3)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
-    smulbb  r5, r9, r5
-#else
-    mul     r5, r9, r5
-#endif
-    ldr     r9, [r10, #QY(7)]
-#if __ARM_HAVE_HALFWORD_MULTIPLY
-    smlabb  r0, r2, r3, r5
-    smlabb  r4, r9, r7, r1
-#else
-    mul     r0, r2, r3
-    add     r0, r5
-    mul     r4, r9, r7
-    add     r4, r1
-#endif
-    rsb  r2, r0, r5, lsl #1
-    rsb  r6, r4, r1, lsl #1
-
-    // tmp7 = z11 + z13;                             (r7)
-    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
-    // FIX_... = 360 + 2
-    add   r7, r4, r0
-    sub   r1, r4, r0
-    mov   r8, #360
-    add   r8, r8, #2
-    mul   r1, r8, r1
-
-    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
-    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
-    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
-    // FIX_1_8477... = 473 = 472 + 1
-    // FIX_1_082...  = 277 = 276 + 1
-    // FIX_2_...     = 669 = 668 + 1
-    add     r8, r2, r6
-    mov     r9, #472
-    mla     r8, r9, r8, r8
-    mov     r9, #276
-    mla     r0, r6, r9, r6
-    mov     r9, #668
-    mla     r2, r9, r2, r2
-    sub     r0, r0, r8
-    rsb     r2, r2, r8
-
-    // tmp6 = tmp12 - tmp7;  (r6)
-    // tmp5 = tmp11 - tmp6;  (r5)
-    // tmp4 = tmp10 + tmp5;  (r4)
-    rsb  r6, r7, r2, asr #8
-    rsb  r5, r6, r1, asr #8
-    add  r4, r5, r0, asr #8
-
-    ldmia local_TMP0123, {r0, r1, r2, r3}
-
-    // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
-    // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
-    // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
-    // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
-    // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5);
-    // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
-    // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
-    // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
-
-    add   r0, r0, r7
-    sub   r7, r0, r7, lsl #1
-    add   r1, r1, r6
-    sub   r6, r1, r6, lsl #1
-    add   r2, r2, r5
-    sub   r5, r2, r5, lsl #1
-    sub   r3, r3, r4
-    add   r4, r3, r4, lsl #1
-
-    str   r0, [ip, #QY(0)]
-    str   r1, [ip, #QY(1)]
-    str   r2, [ip, #QY(2)]
-    str   r3, [ip, #QY(3)]
-    str   r4, [ip, #QY(4)]
-    str   r5, [ip, #QY(5)]
-    str   r6, [ip, #QY(6)]
-    str   r7, [ip, #QY(7)]
-
-    // inptr++;                    /* advance pointers to next column */
-    // quantptr++;
-    // wsptr++;
-    add  fp, fp, #2
-    add  r10, r10, #4
-    add  ip, ip, #4
-    add  r0, sp, #(off_WORKSPACE + 4*8)
-    cmp  ip, r0
-    bne  VLoopTail
-
-
-
-HLoopStart:
-    // reset pointers
-    PLD     (sp, #off_WORKSPACE)
-    add     ip, sp, #off_WORKSPACE
-    ldr     r10, local_RANGE_TABLE
-
-HLoopTail:
-    // output = *output_buf++ + output_col
-    ldr      r0, local_OUTPUT_BUF
-    ldr      r1, local_OUTPUT_COL
-    ldr      r2, [r0], #4
-    str      r0, local_OUTPUT_BUF
-    add      fp, r2, r1
-
-    PLD      (ip, #32)
-    ldmia    ip!, {r0-r7}
-
-    cmp      r1, #0
-    orreqs   r8, r2, r3
-    orreqs   r8, r4, r5
-    orreqs   r8, r6, r7
-    beq      HLoopTailZero
-
-HLoopHead:
-    // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);    (r0)
-    // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);    (r4)
-    add     r0, r0, r4
-    sub     r4, r0, r4, lsl #1
-
-    // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);                                   (r2)
-    // tmp12 = MULTIPLY((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6], FIX_1_414213562) - tmp13;  (r6)
-    // FIX_... = 360 + 2
-    add     r2, r2, r6
-    sub     r6, r2, r6, lsl #1
-    mov     r8, #360
-    add     r8, r8, #2
-    mul     r6, r8, r6
-
-    // tmp0 = tmp10 + tmp13;   (r0)
-    // tmp3 = tmp10 - tmp13;   (r8)
-    // tmp1 = tmp11 + tmp12;   (r4)
-    // tmp2 = tmp11 - tmp12;   (r6)
-    add     r0, r0, r2
-    rsb     r6, r2, r6, asr #8
-    sub     r8, r0, r2, lsl #1
-    add     r4, r4, r6
-    sub     r6, r4, r6, lsl #1
-
-    stmia   local_TMP0123, {r0, r4, r6, r8}
-
-    // Odd part
-
-    // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];  (r0)
-    // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];  (r2)
-    // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];  (r4)
-    // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];  (r6)
-    add  r0, r5, r3
-    sub  r2, r5, r3
-    add  r4, r1, r7
-    sub  r6, r1, r7
-
-    // tmp7 = z11 + z13;                             (r7)
-    // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); (r1)
-    // FIX_... = 360 + 2
-    add   r7, r4, r0
-    sub   r1, r4, r0
-    mov   r8, #360
-    add   r8, r8, #2
-    mul   r1, r8, r1
-
-    // z5 = MULTIPLY(z10 + z12, FIX_1_847759065);        (r8)
-    // tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;      (r0)
-    // tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;    (r2)
-    // FIX_1_8477... = 473 = 472 + 1
-    // FIX_1_082...  = 277 = 276 + 1
-    // FIX_2_...     = 669 = 668 + 1
-    add  r8, r2, r6
-    mov  r9, #472
-    mla  r8, r9, r8, r8
-    mov  r9, #276
-    mla  r0, r6, r9, r6
-    mov  r9, #668
-    mla  r2, r9, r2, r2
-    sub  r0, r0, r8
-    sub  r2, r8, r2
-
-    // tmp6 = tmp12 - tmp7;  (r6)
-    // tmp5 = tmp11 - tmp6;  (r5)
-    // tmp4 = tmp10 + tmp5;  (r4)
-    rsb  r6, r7, r2, asr #8
-    rsb  r5, r6, r1, asr #8
-    add  r4, r5, r0, asr #8
-
-    ldmia local_TMP0123, {r0, r1, r2, r3}
-
-    // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3) & RANGE_MASK];
-    // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3) & RANGE_MASK];
-
-    mov    r8, #128
-    add    r0, r0, r7
-    sub    r7, r0, r7, lsl #1
-    add    r0, r8, r0, asr #5
-    add    r7, r8, r7, asr #5
-    add    r1, r1, r6
-    sub    r6, r1, r6, lsl #1
-    add    r1, r8, r1, asr #5
-    add    r6, r8, r6, asr #5
-    add    r2, r2, r5
-    sub    r5, r2, r5, lsl #1
-    add    r2, r8, r2, asr #5
-    add    r5, r8, r5, asr #5
-    sub    r3, r3, r4
-    add    r4, r3, r4, lsl #1
-    add    r3, r8, r3, asr #5
-    add    r4, r8, r4, asr #5
-
-#if __ARM_ARCH__ >= 6
-    usat   r0, #8, r0
-    usat   r1, #8, r1
-    usat   r2, #8, r2
-    usat   r3, #8, r3
-    usat   r4, #8, r4
-    usat   r5, #8, r5
-    usat   r6, #8, r6
-    usat   r7, #8, r7
-#else
-    cmp    r0, #255
-    mvnhi  r0, r0, asr #31
-    andhi  r0, #255
-    cmp    r7, #255
-    mvnhi  r7, r7, asr #31
-    cmp    r1, #255
-    mvnhi  r1, r1, asr #31
-    andhi  r1, #255
-    cmp    r6, #255
-    mvnhi  r6, r6, asr #31
-    andhi  r6, #255
-    cmp    r2, #255
-    mvnhi  r2, r2, asr #31
-    andhi  r2, #255
-    cmp    r5, #255
-    mvnhi  r5, r5, asr #31
-    andhi  r5, #255
-    cmp    r3, #255
-    mvnhi  r3, r3, asr #31
-    cmp    r4, #255
-    mvnhi  r4, r4, asr #31
-    andhi  r4, #255
-#endif
-
-    // r3 r2 r1 r0
-    orr    r0, r0, r1, lsl #8
-    orr    r0, r0, r2, lsl #16
-    orr    r0, r0, r3, lsl #24
-
-    // r7 r6 r5 r4
-    orr    r1, r4, r5, lsl #8
-    orr    r1, r1, r6, lsl #16
-    orr    r1, r1, r7, lsl #24
-    stmia  fp, {r0, r1}
-
-    add    r0, sp, #(off_WORKSPACE + 8*8*4)
-    cmp    ip, r0
-    bne    HLoopTail
-
-Exit:
-    add    sp, sp, #local_SIZE
-    ldmia  sp!, {r4,r5, r6,r7, r8,r9, r10,r11, r12,lr}
-    bx     lr
-
-
-VLoopHeadZero:
-// ok, all AC coefficients are 0
-    ldr      r1, [r10, #QY(0)]
-    add      fp, fp, #2
-    add      r10, r10, #4
-    mul      r0, r1, r0
-    str      r0, [ip, #QY(0)]
-    str      r0, [ip, #QY(1)]
-    str      r0, [ip, #QY(2)]
-    str      r0, [ip, #QY(3)]
-    str      r0, [ip, #QY(4)]
-    str      r0, [ip, #QY(5)]
-    str      r0, [ip, #QY(6)]
-    str      r0, [ip, #QY(7)]
-    add      ip, ip, #4
-    add      r0, sp, #(off_WORKSPACE + 4*8)
-    cmp      ip, r0
-    beq      HLoopStart
-    b        VLoopTail
-
-HLoopTailZero:
-    mov      r0, r0, asr #5
-    add      r0, #128
-
-#if __ARM_ARCH__ >= 6
-    usat     r0, #8, r0
-#else
-    cmp      r0, #255
-    mvnhi    r0, r0, asr #31
-    andhi    r0, r0, #255
-#endif
-
-    orr      r0, r0, lsl #8
-    orr      r0, r0, lsl #16
-    mov      r1, r0
-    stmia    fp, {r0, r1}
-
-    add      r0, sp, #(off_WORKSPACE + 64*4)
-    cmp      ip, r0
-    beq      Exit
-    b        HLoopTail
-
-    .endfunc