Bug fix on shift that exceeds "lane width".

Rationale:
ARM is a bit less forgiving on shifting more than
the lane width of the SIMD instruction (rejecting
such cases is no loss, since it yields 0 anyway
and should be optimized differently).

Bug: 37776122
Test: test-art-target, test-art-host
Change-Id: I22d04afbfce82b4593f17c2f48c1fd5a0805d305
diff --git a/compiler/optimizing/loop_optimization.cc b/compiler/optimizing/loop_optimization.cc
index c783dde..bbc55dd 100644
--- a/compiler/optimizing/loop_optimization.cc
+++ b/compiler/optimizing/loop_optimization.cc
@@ -833,17 +833,22 @@
     // TODO: accept symbolic, albeit loop invariant shift factors.
     HInstruction* opa = instruction->InputAt(0);
     HInstruction* opb = instruction->InputAt(1);
-    if (VectorizeUse(node, opa, generate_code, type, restrictions) && opb->IsIntConstant()) {
-      if (generate_code) {
-        // Make sure shift factor only looks at lower bits, as defined for sequential shifts.
-        // Note that even the narrower SIMD shifts do the right thing after that.
-        int32_t mask = (instruction->GetType() == Primitive::kPrimLong)
-            ? kMaxLongShiftDistance
-            : kMaxIntShiftDistance;
-        HInstruction* s = graph_->GetIntConstant(opb->AsIntConstant()->GetValue() & mask);
-        GenerateVecOp(instruction, vector_map_->Get(opa), s, type);
+    int64_t value = 0;
+    if (VectorizeUse(node, opa, generate_code, type, restrictions) && IsInt64AndGet(opb, &value)) {
+      // Make sure shift distance only looks at lower bits, as defined for sequential shifts.
+      int64_t mask = (instruction->GetType() == Primitive::kPrimLong)
+          ? kMaxLongShiftDistance
+          : kMaxIntShiftDistance;
+      int64_t distance = value & mask;
+      // Restrict shift distance to packed data type width.
+      int64_t max_distance = Primitive::ComponentSize(type) * 8;
+      if (0 <= distance && distance < max_distance) {
+        if (generate_code) {
+          HInstruction* s = graph_->GetIntConstant(distance);
+          GenerateVecOp(instruction, vector_map_->Get(opa), s, type);
+        }
+        return true;
       }
-      return true;
     }
   } else if (instruction->IsInvokeStaticOrDirect()) {
     // Accept particular intrinsics.
diff --git a/test/640-checker-byte-simd/src/Main.java b/test/640-checker-byte-simd/src/Main.java
index 0f7452b..10b20b8 100644
--- a/test/640-checker-byte-simd/src/Main.java
+++ b/test/640-checker-byte-simd/src/Main.java
@@ -179,6 +179,11 @@
       a[i] >>>= 33;  // 1, since & 31
   }
 
+  static void shl9() {
+    for (int i = 0; i < 128; i++)
+      a[i] <<= 9;  // yields all-zeros
+  }
+
   //
   // Loop bounds.
   //
@@ -259,6 +264,10 @@
     shr33();
     for (int i = 0; i < 128; i++) {
       expectEquals((byte) 0x09, a[i], "shr33");
+    }
+    shl9();
+    for (int i = 0; i < 128; i++) {
+      expectEquals((byte) 0x00, a[i], "shl9");
       a[i] = (byte) 0xf0;  // reset
     }
     not();