Merge branch 'jb-mr1-dev' of https://googleplex-android.googlesource.com/a/platform/external/llvm into merge-2012_09_10
diff --git a/CREDITS.TXT b/CREDITS.TXT
index f090ad7..3362b61 100644
--- a/CREDITS.TXT
+++ b/CREDITS.TXT
@@ -5,8 +5,8 @@
The list is sorted by surname and formatted to allow easy grepping and
beautification by scripts. The fields are: name (N), email (E), web-address
-(W), PGP key ID and fingerprint (P), description (D), and snail-mail address
-(S).
+(W), PGP key ID and fingerprint (P), description (D), snail-mail address
+(S), and IRC handle (I).
N: Vikram Adve
@@ -328,10 +328,6 @@
D: GCC PCH Integration (llvm-gcc), llvm-gcc improvements
D: Optimizer improvements, Loop Index Split
-N: Sandeep Patel
-E: deeppatel1987@gmail.com
-D: ARM calling conventions rewrite, hard float support
-
N: Wesley Peck
E: peckw@wesleypeck.com
W: http://wesleypeck.com/
@@ -354,6 +350,11 @@
E: xerxes@zafena.se
D: Cmake dependency chain and various bug fixes
+N: Alex Rosenberg
+E: alexr@leftfield.org
+I: arosenberg
+D: ARM calling conventions rewrite, hard float support
+
N: Chad Rosier
E: mcrosier@apple.com
D: ARM fast-isel improvements
diff --git a/Makefile b/Makefile
index 604696a..a935df4 100644
--- a/Makefile
+++ b/Makefile
@@ -111,6 +111,7 @@
cd BuildTools ; \
unset CFLAGS ; \
unset CXXFLAGS ; \
+ unset SDKROOT ; \
$(PROJ_SRC_DIR)/configure --build=$(BUILD_TRIPLE) \
--host=$(BUILD_TRIPLE) --target=$(BUILD_TRIPLE) \
--disable-polly ; \
diff --git a/bindings/ocaml/executionengine/executionengine_ocaml.c b/bindings/ocaml/executionengine/executionengine_ocaml.c
index 5b1e32e..02e0306 100644
--- a/bindings/ocaml/executionengine/executionengine_ocaml.c
+++ b/bindings/ocaml/executionengine/executionengine_ocaml.c
@@ -75,6 +75,9 @@
custom_hash_default,
custom_serialize_default,
custom_deserialize_default
+#ifdef custom_compare_ext_default
+ , custom_compare_ext_default
+#endif
};
static value alloc_generic_value(LLVMGenericValueRef Ref) {
diff --git a/bindings/ocaml/llvm/llvm_ocaml.c b/bindings/ocaml/llvm/llvm_ocaml.c
index a5985d9..c984bd1 100644
--- a/bindings/ocaml/llvm/llvm_ocaml.c
+++ b/bindings/ocaml/llvm/llvm_ocaml.c
@@ -1277,6 +1277,9 @@
custom_hash_default,
custom_serialize_default,
custom_deserialize_default
+#ifdef custom_compare_ext_default
+ , custom_compare_ext_default
+#endif
};
static value alloc_builder(LLVMBuilderRef B) {
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index 94cc555..fcd5dd5 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -85,13 +85,25 @@
# library checks
if( NOT PURE_WINDOWS )
check_library_exists(pthread pthread_create "" HAVE_LIBPTHREAD)
- check_library_exists(pthread pthread_getspecific "" HAVE_PTHREAD_GETSPECIFIC)
- check_library_exists(pthread pthread_rwlock_init "" HAVE_PTHREAD_RWLOCK_INIT)
+ if (HAVE_LIBPTHREAD)
+ check_library_exists(pthread pthread_getspecific "" HAVE_PTHREAD_GETSPECIFIC)
+ check_library_exists(pthread pthread_rwlock_init "" HAVE_PTHREAD_RWLOCK_INIT)
+ check_library_exists(pthread pthread_mutex_lock "" HAVE_PTHREAD_MUTEX_LOCK)
+ else()
+ # this could be Android
+ check_library_exists(c pthread_create "" PTHREAD_IN_LIBC)
+ if (PTHREAD_IN_LIBC)
+ check_library_exists(c pthread_getspecific "" HAVE_PTHREAD_GETSPECIFIC)
+ check_library_exists(c pthread_rwlock_init "" HAVE_PTHREAD_RWLOCK_INIT)
+ check_library_exists(c pthread_mutex_lock "" HAVE_PTHREAD_MUTEX_LOCK)
+ endif()
+ endif()
check_library_exists(dl dlopen "" HAVE_LIBDL)
endif()
# function checks
check_symbol_exists(arc4random "stdlib.h" HAVE_ARC4RANDOM)
+check_symbol_exists(backtrace "execinfo.h" HAVE_BACKTRACE)
check_symbol_exists(getpagesize unistd.h HAVE_GETPAGESIZE)
check_symbol_exists(getrusage sys/resource.h HAVE_GETRUSAGE)
check_symbol_exists(setrlimit sys/resource.h HAVE_SETRLIMIT)
@@ -134,9 +146,6 @@
check_symbol_exists(strcmp string.h HAVE_STRCMP)
check_symbol_exists(strdup string.h HAVE_STRDUP)
check_symbol_exists(strrchr string.h HAVE_STRRCHR)
-if( NOT PURE_WINDOWS )
- check_symbol_exists(pthread_mutex_lock pthread.h HAVE_PTHREAD_MUTEX_LOCK)
-endif()
check_symbol_exists(sbrk unistd.h HAVE_SBRK)
check_symbol_exists(srand48 stdlib.h HAVE_RAND48_SRAND48)
if( HAVE_RAND48_SRAND48 )
diff --git a/docs/LangRef.html b/docs/LangRef.html
index 810fce5..4daab59 100644
--- a/docs/LangRef.html
+++ b/docs/LangRef.html
@@ -1207,13 +1207,6 @@
may make calls to the function faster, at the cost of extra program
startup time if the function is not called during program startup.</dd>
- <dt><tt><b>ia_nsdialect</b></tt></dt>
- <dd>This attribute indicates the associated inline assembly call is using a
- non-standard assembly dialect. The standard dialect is ATT, which is
- assumed when this attribute is not present. When present, the dialect
- is assumed to be Intel. Currently, ATT and Intel are the only supported
- dialects.</dd>
-
<dt><tt><b>inlinehint</b></tt></dt>
<dd>This attribute indicates that the source code contained a hint that inlining
this function is desirable (such as the "inline" keyword in C/C++). It
@@ -2901,8 +2894,18 @@
call void asm alignstack "eieio", ""()
</pre>
-<p>If both keywords appear the '<tt>sideeffect</tt>' keyword must come
- first.</p>
+<p>Inline asms also support using non-standard assembly dialects. The assumed
+ dialect is ATT. When the '<tt>inteldialect</tt>' keyword is present, the
+ inline asm is using the Intel dialect. Currently, ATT and Intel are the
+ only supported dialects. An example is:</p>
+
+<pre class="doc_code">
+call void asm inteldialect "eieio", ""()
+</pre>
+
+<p>If multiple keywords appear, the '<tt>sideeffect</tt>' keyword must come
+   first, the '<tt>alignstack</tt>' keyword second, and the
+   '<tt>inteldialect</tt>' keyword last.</p>
<!--
<p>TODO: The format of the asm and constraints string still need to be
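A minimal sketch (not part of this patch) of how the three keywords combine when the call is built through the C++ API; it assumes the InlineAsm::AsmDialect enum and the extra InlineAsm::get() parameter introduced alongside this keyword:

#include "llvm/DerivedTypes.h"
#include "llvm/InlineAsm.h"
#include "llvm/LLVMContext.h"

// Produces the textual form:
//   call void asm sideeffect alignstack inteldialect "nop", ""()
llvm::InlineAsm *makeIntelNop(llvm::LLVMContext &Ctx) {
  llvm::FunctionType *FTy =
      llvm::FunctionType::get(llvm::Type::getVoidTy(Ctx), /*isVarArg=*/false);
  return llvm::InlineAsm::get(FTy, "nop", "",
                              /*hasSideEffects=*/true,
                              /*isAlignStack=*/true,
                              llvm::InlineAsm::AD_Intel);
}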
diff --git a/docs/Makefile.sphinx b/docs/Makefile.sphinx
index 21f6648..81c13de 100644
--- a/docs/Makefile.sphinx
+++ b/docs/Makefile.sphinx
@@ -46,6 +46,10 @@
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
+ @# FIXME: Remove this `cp` once HTML->Sphinx transition is completed.
+ @# Kind of a hack, but HTML-formatted docs are on the way out anyway.
+ @echo "Copying legacy HTML-formatted docs into $(BUILDDIR)/html"
+ @cp -a *.html tutorial $(BUILDDIR)/html
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
diff --git a/docs/ReleaseNotes.html b/docs/ReleaseNotes.html
index 1ab2b70..75a6fd1 100644
--- a/docs/ReleaseNotes.html
+++ b/docs/ReleaseNotes.html
@@ -499,13 +499,12 @@
<div>
-<p>We have changed the way that the Type Legalizer legalizes vectors. The type
- legalizer now attempts to promote integer elements. This enabled the
- implementation of vector-select. Additionally, we see a performance boost on
- workloads which use vectors of chars and shorts, since they are now promoted
- to 32-bit types, which are better supported by the SIMD instruction set.
- Floating point types are still widened as before.</p>
-
+<p>Stack Coloring - We have implemented a new optimization pass
+  to merge stack objects which are used in disjoint areas of the code.
+ This optimization reduces the required stack space significantly, in cases
+ where it is clear to the optimizer that the stack slot is not shared.
+ We use the lifetime markers to tell the codegen that a certain alloca
+ is used within a region.</p>
<p>We have put a significant amount of work into the code generator
infrastructure, which allows us to implement more aggressive algorithms and
diff --git a/docs/llvm-theme/layout.html b/docs/_themes/llvm-theme/layout.html
similarity index 100%
rename from docs/llvm-theme/layout.html
rename to docs/_themes/llvm-theme/layout.html
diff --git a/docs/llvm-theme/static/contents.png b/docs/_themes/llvm-theme/static/contents.png
similarity index 100%
rename from docs/llvm-theme/static/contents.png
rename to docs/_themes/llvm-theme/static/contents.png
Binary files differ
diff --git a/docs/llvm-theme/static/llvm-theme.css b/docs/_themes/llvm-theme/static/llvm-theme.css
similarity index 100%
rename from docs/llvm-theme/static/llvm-theme.css
rename to docs/_themes/llvm-theme/static/llvm-theme.css
diff --git a/docs/llvm-theme/static/logo.png b/docs/_themes/llvm-theme/static/logo.png
similarity index 100%
rename from docs/llvm-theme/static/logo.png
rename to docs/_themes/llvm-theme/static/logo.png
Binary files differ
diff --git a/docs/llvm-theme/static/navigation.png b/docs/_themes/llvm-theme/static/navigation.png
similarity index 100%
rename from docs/llvm-theme/static/navigation.png
rename to docs/_themes/llvm-theme/static/navigation.png
Binary files differ
diff --git a/docs/llvm-theme/theme.conf b/docs/_themes/llvm-theme/theme.conf
similarity index 100%
rename from docs/llvm-theme/theme.conf
rename to docs/_themes/llvm-theme/theme.conf
diff --git a/docs/conf.py b/docs/conf.py
index de0585d..a1e9b5f 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -98,7 +98,7 @@
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
-html_theme_path = ["."]
+html_theme_path = ["_themes"]
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
@@ -134,18 +134,7 @@
# Additional templates that should be rendered to pages, maps page names to
# template names.
-#
-# We load all the old-school HTML documentation pages into Sphinx here.
-basedir = os.path.dirname(__file__)
-html_additional_pages = {}
-for directory in ('', 'tutorial'):
- for file in os.listdir(os.path.join(basedir, directory)):
- if not file.endswith('.html'):
- continue
-
- subpath = os.path.join(directory, file)
- name,_ = os.path.splitext(subpath)
- html_additional_pages[name] = subpath
+#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
@@ -226,6 +215,7 @@
# Automatically derive the list of man pages from the contents of the command
# guide subdirectory.
+basedir = os.path.dirname(__file__)
man_page_authors = "Maintained by The LLVM Team (http://llvm.org/)."
command_guide_subpath = 'CommandGuide'
command_guide_path = os.path.join(basedir, command_guide_subpath)
@@ -237,9 +227,8 @@
# Otherwise, automatically extract the description.
file_subpath = os.path.join(command_guide_subpath, name)
with open(os.path.join(command_guide_path, name)) as f:
- it = iter(f)
- title = it.next()[:-1]
- header = it.next()[:-1]
+ title = f.readline().rstrip('\n')
+ header = f.readline().rstrip('\n')
if len(header) != len(title):
print >>sys.stderr, (
diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index eceae5c..6587e77 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -173,10 +173,11 @@
LLVMUWTable = 1 << 30,
LLVMNonLazyBind = 1 << 31
- // FIXME: This attribute is currently not included in the C API as
- // a temporary measure until the API/ABI impact to the C API is understood
- // and the path forward agreed upon.
- //LLVMAddressSafety = 1ULL << 32
+ /* FIXME: This attribute is currently not included in the C API as
+ a temporary measure until the API/ABI impact to the C API is understood
+ and the path forward agreed upon.
+ LLVMAddressSafety = 1ULL << 32
+ */
} LLVMAttribute;
typedef enum {
@@ -2687,7 +2688,7 @@
template<typename T>
inline T **unwrap(LLVMValueRef *Vals, unsigned Length) {
- #if DEBUG
+ #ifdef DEBUG
for (LLVMValueRef *I = Vals, *E = Vals + Length; I != E; ++I)
cast<T>(*I);
#endif
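The switch from '#if DEBUG' to '#ifdef DEBUG' matters because '#if' evaluates the macro's value (an undefined DEBUG counts as 0, and a definition that expands to nothing is a preprocessing error), while '#ifdef' only asks whether the macro is defined. A small standalone sketch:

#define DEBUG            // defined, but expands to nothing
#ifdef DEBUG             // definedness test: this branch is taken
static const int DebugChecksEnabled = 1;
#endif
// "#if DEBUG" would be a preprocessing error here: after macro expansion
// the directive is left with no expression to evaluate.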
diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h
index f30a6e3..7f20e33 100644
--- a/include/llvm/ADT/APInt.h
+++ b/include/llvm/ADT/APInt.h
@@ -1238,8 +1238,8 @@
/// countLeadingZeros - This function is an APInt version of the
/// countLeadingZeros_{32,64} functions in MathExtras.h. It counts the number
/// of zeros from the most significant bit to the first one bit.
- /// @returns BitWidth if the value is zero.
- /// @returns the number of zeros from the most significant bit to the first
+ /// @returns BitWidth if the value is zero, otherwise
+ /// returns the number of zeros from the most significant bit to the first
   /// one bit.
unsigned countLeadingZeros() const {
if (isSingleWord()) {
@@ -1252,8 +1252,8 @@
/// countLeadingOnes - This function is an APInt version of the
/// countLeadingOnes_{32,64} functions in MathExtras.h. It counts the number
/// of ones from the most significant bit to the first zero bit.
- /// @returns 0 if the high order bit is not set
- /// @returns the number of 1 bits from the most significant to the least
+ /// @returns 0 if the high order bit is not set, otherwise
+ /// returns the number of 1 bits from the most significant to the least
/// @brief Count the number of leading one bits.
unsigned countLeadingOnes() const;
@@ -1266,8 +1266,8 @@
/// countTrailingZeros - This function is an APInt version of the
/// countTrailingZeros_{32,64} functions in MathExtras.h. It counts
/// the number of zeros from the least significant bit to the first set bit.
- /// @returns BitWidth if the value is zero.
- /// @returns the number of zeros from the least significant bit to the first
+ /// @returns BitWidth if the value is zero, otherwise
+ /// returns the number of zeros from the least significant bit to the first
/// one bit.
/// @brief Count the number of trailing zero bits.
unsigned countTrailingZeros() const;
@@ -1275,8 +1275,8 @@
/// countTrailingOnes - This function is an APInt version of the
/// countTrailingOnes_{32,64} functions in MathExtras.h. It counts
/// the number of ones from the least significant bit to the first zero bit.
- /// @returns BitWidth if the value is all ones.
- /// @returns the number of ones from the least significant bit to the first
+ /// @returns BitWidth if the value is all ones, otherwise
+ /// returns the number of ones from the least significant bit to the first
/// zero bit.
/// @brief Count the number of trailing one bits.
unsigned countTrailingOnes() const {
@@ -1288,8 +1288,8 @@
/// countPopulation - This function is an APInt version of the
/// countPopulation_{32,64} functions in MathExtras.h. It counts the number
/// of 1 bits in the APInt value.
- /// @returns 0 if the value is zero.
- /// @returns the number of set bits.
+ /// @returns 0 if the value is zero, otherwise returns the number of set
+ /// bits.
/// @brief Count the number of bits set.
unsigned countPopulation() const {
if (isSingleWord())
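The @returns rewording above is documentation-only; a small usage sketch of the documented behavior:

#include "llvm/ADT/APInt.h"
#include <cassert>

void apintCountDemo() {
  llvm::APInt Zero(16, 0), One(16, 1);
  assert(Zero.countLeadingZeros() == 16); // zero value: returns BitWidth
  assert(One.countLeadingZeros() == 15);  // zeros before the single 1 bit
  assert(One.countTrailingZeros() == 0);  // lowest bit is set
  assert(One.countPopulation() == 1);     // exactly one set bit
}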
diff --git a/include/llvm/ADT/ArrayRef.h b/include/llvm/ADT/ArrayRef.h
index cf55aad..1e35d62 100644
--- a/include/llvm/ADT/ArrayRef.h
+++ b/include/llvm/ADT/ArrayRef.h
@@ -59,12 +59,17 @@
ArrayRef(const T *begin, const T *end)
: Data(begin), Length(end - begin) {}
- /// Construct an ArrayRef from a SmallVector.
- /*implicit*/ ArrayRef(const SmallVectorTemplateCommon<T> &Vec)
- : Data(Vec.data()), Length(Vec.size()) {}
+ /// Construct an ArrayRef from a SmallVector. This is templated in order to
+ /// avoid instantiating SmallVectorTemplateCommon<T> whenever we
+ /// copy-construct an ArrayRef.
+ template<typename U>
+ /*implicit*/ ArrayRef(const SmallVectorTemplateCommon<T, U> &Vec)
+ : Data(Vec.data()), Length(Vec.size()) {
+ }
/// Construct an ArrayRef from a std::vector.
- /*implicit*/ ArrayRef(const std::vector<T> &Vec)
+ template<typename A>
+ /*implicit*/ ArrayRef(const std::vector<T, A> &Vec)
: Data(Vec.empty() ? (T*)0 : &Vec[0]), Length(Vec.size()) {}
/// Construct an ArrayRef from a C array.
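A short sketch of what the templated constructors buy (names here are illustrative): a std::vector with any allocator now converts implicitly, and converting a SmallVector no longer forces SmallVectorTemplateCommon<T> to be instantiated when copy-constructing an ArrayRef:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include <vector>

static int sum(llvm::ArrayRef<int> Vals) {
  int S = 0;
  for (unsigned i = 0, e = Vals.size(); i != e; ++i)
    S += Vals[i];
  return S;
}

int arrayRefDemo() {
  std::vector<int> V(3, 7);      // std::vector<T, A> for any allocator A
  llvm::SmallVector<int, 4> SV;
  SV.push_back(21);
  return sum(V) + sum(SV);       // both convert implicitly; returns 42
}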
diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h
index 3e2e5f2..26ec346 100644
--- a/include/llvm/ADT/BitVector.h
+++ b/include/llvm/ADT/BitVector.h
@@ -172,7 +172,7 @@
unsigned BitPos = Prev % BITWORD_SIZE;
BitWord Copy = Bits[WordPos];
// Mask off previous bits.
- Copy &= ~0L << BitPos;
+ Copy &= ~0UL << BitPos;
if (Copy != 0) {
if (sizeof(BitWord) == 4)
@@ -311,7 +311,7 @@
return !(*this == RHS);
}
- // Intersection, union, disjoint union.
+ /// Intersection, union, disjoint union.
BitVector &operator&=(const BitVector &RHS) {
unsigned ThisWords = NumBitWords(size());
unsigned RHSWords = NumBitWords(RHS.size());
@@ -328,7 +328,7 @@
return *this;
}
- // reset - Reset bits that are set in RHS. Same as *this &= ~RHS.
+ /// reset - Reset bits that are set in RHS. Same as *this &= ~RHS.
BitVector &reset(const BitVector &RHS) {
unsigned ThisWords = NumBitWords(size());
unsigned RHSWords = NumBitWords(RHS.size());
@@ -338,6 +338,23 @@
return *this;
}
+  /// test - Return true if (This - RHS) is nonzero. This is equivalent to
+  /// reset(RHS) followed by any(): some bit is set here but not in RHS.
+ bool test(const BitVector &RHS) const {
+ unsigned ThisWords = NumBitWords(size());
+ unsigned RHSWords = NumBitWords(RHS.size());
+ unsigned i;
+ for (i = 0; i != std::min(ThisWords, RHSWords); ++i)
+ if ((Bits[i] & ~RHS.Bits[i]) != 0)
+ return true;
+
+ for (; i != ThisWords ; ++i)
+ if (Bits[i] != 0)
+ return true;
+
+ return false;
+ }
+
BitVector &operator|=(const BitVector &RHS) {
if (size() < RHS.size())
resize(RHS.size());
@@ -451,8 +468,11 @@
// Then set any stray high bits of the last used word.
unsigned ExtraBits = Size % BITWORD_SIZE;
if (ExtraBits) {
- Bits[UsedWords-1] &= ~(~0L << ExtraBits);
- Bits[UsedWords-1] |= (0 - (BitWord)t) << ExtraBits;
+ BitWord ExtraBitMask = ~0UL << ExtraBits;
+ if (t)
+ Bits[UsedWords-1] |= ExtraBitMask;
+ else
+ Bits[UsedWords-1] &= ~ExtraBitMask;
}
}
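A small sketch of the new BitVector::test(RHS), which per the implementation above returns true iff some bit is set here but not in RHS:

#include "llvm/ADT/BitVector.h"
#include <cassert>

void bitVectorTestDemo() {
  llvm::BitVector A(8), B(8);
  A.set(1);
  A.set(3);
  B.set(1);
  assert(A.test(B));   // bit 3 is in A but not in B: (A - B) is nonzero
  B.set(3);
  assert(!A.test(B));  // now every bit of A is covered by B
}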
diff --git a/include/llvm/ADT/DenseMapInfo.h b/include/llvm/ADT/DenseMapInfo.h
index 1559a35..6f17a64 100644
--- a/include/llvm/ADT/DenseMapInfo.h
+++ b/include/llvm/ADT/DenseMapInfo.h
@@ -31,12 +31,12 @@
template<typename T>
struct DenseMapInfo<T*> {
static inline T* getEmptyKey() {
- intptr_t Val = -1;
+ uintptr_t Val = static_cast<uintptr_t>(-1);
Val <<= PointerLikeTypeTraits<T*>::NumLowBitsAvailable;
return reinterpret_cast<T*>(Val);
}
static inline T* getTombstoneKey() {
- intptr_t Val = -2;
+ uintptr_t Val = static_cast<uintptr_t>(-2);
Val <<= PointerLikeTypeTraits<T*>::NumLowBitsAvailable;
return reinterpret_cast<T*>(Val);
}
@@ -105,7 +105,7 @@
// Provide DenseMapInfo for longs.
template<> struct DenseMapInfo<long> {
static inline long getEmptyKey() {
- return (1UL << (sizeof(long) * 8 - 1)) - 1L;
+ return (1UL << (sizeof(long) * 8 - 1)) - 1UL;
}
static inline long getTombstoneKey() { return getEmptyKey() - 1L; }
static unsigned getHashValue(const long& Val) {
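The intptr_t to uintptr_t changes above avoid left-shifting a negative signed value, which is undefined behavior in C++; the unsigned form computes the same bit pattern portably. A sketch of the pattern in isolation:

#include <cstdint>

void *emptyKeyBits(unsigned NumLowBitsAvailable) {
  uintptr_t Val = static_cast<uintptr_t>(-1); // all bits set, well defined
  Val <<= NumLowBitsAvailable;                // unsigned shift: no UB
  return reinterpret_cast<void *>(Val);
}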
diff --git a/include/llvm/ADT/FoldingSet.h b/include/llvm/ADT/FoldingSet.h
index ba415ac..375d84a 100644
--- a/include/llvm/ADT/FoldingSet.h
+++ b/include/llvm/ADT/FoldingSet.h
@@ -278,6 +278,10 @@
bool operator==(FoldingSetNodeIDRef) const;
+  /// Used to compare the "ordering" of two nodes, as defined by the
+  /// profiled bits and memcmp() over them.
+ bool operator<(FoldingSetNodeIDRef) const;
+
const unsigned *getData() const { return Data; }
size_t getSize() const { return Size; }
};
@@ -327,6 +331,11 @@
bool operator==(const FoldingSetNodeID &RHS) const;
bool operator==(const FoldingSetNodeIDRef RHS) const;
+  /// Used to compare the "ordering" of two nodes, as defined by the
+  /// profiled bits and memcmp() over them.
+ bool operator<(const FoldingSetNodeID &RHS) const;
+ bool operator<(const FoldingSetNodeIDRef RHS) const;
+
/// Intern - Copy this node's data to a memory region allocated from the
/// given allocator and return a FoldingSetNodeIDRef describing the
/// interned data.
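A minimal sketch of what the new operator< enables: FoldingSetNodeIDs become usable as keys in ordered containers, compared memcmp-style over their profiled words:

#include "llvm/ADT/FoldingSet.h"

bool nodeIDOrderDemo() {
  llvm::FoldingSetNodeID A, B;
  A.AddInteger(1u);
  B.AddInteger(2u);
  return A < B; // orders by the profiled bits
}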
diff --git a/include/llvm/ADT/Hashing.h b/include/llvm/ADT/Hashing.h
index 6ab0725..2363304 100644
--- a/include/llvm/ADT/Hashing.h
+++ b/include/llvm/ADT/Hashing.h
@@ -409,7 +409,6 @@
/// combining them, this (as an optimization) directly combines the integers.
template <typename InputIteratorT>
hash_code hash_combine_range_impl(InputIteratorT first, InputIteratorT last) {
- typedef typename std::iterator_traits<InputIteratorT>::value_type ValueT;
const size_t seed = get_execution_seed();
char buffer[64], *buffer_ptr = buffer;
char *const buffer_end = buffer_ptr + array_lengthof(buffer);
diff --git a/include/llvm/ADT/PointerIntPair.h b/include/llvm/ADT/PointerIntPair.h
index fcc758b..71c379b 100644
--- a/include/llvm/ADT/PointerIntPair.h
+++ b/include/llvm/ADT/PointerIntPair.h
@@ -135,12 +135,12 @@
struct DenseMapInfo<PointerIntPair<PointerTy, IntBits, IntType> > {
typedef PointerIntPair<PointerTy, IntBits, IntType> Ty;
static Ty getEmptyKey() {
- intptr_t Val = -1;
+ uintptr_t Val = static_cast<uintptr_t>(-1);
Val <<= PointerLikeTypeTraits<PointerTy>::NumLowBitsAvailable;
return Ty(reinterpret_cast<PointerTy>(Val), IntType((1 << IntBits)-1));
}
static Ty getTombstoneKey() {
- intptr_t Val = -2;
+ uintptr_t Val = static_cast<uintptr_t>(-2);
Val <<= PointerLikeTypeTraits<PointerTy>::NumLowBitsAvailable;
return Ty(reinterpret_cast<PointerTy>(Val), IntType(0));
}
diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h
index 9fbbbe4..769b7dc 100644
--- a/include/llvm/ADT/SmallVector.h
+++ b/include/llvm/ADT/SmallVector.h
@@ -14,6 +14,7 @@
#ifndef LLVM_ADT_SMALLVECTOR_H
#define LLVM_ADT_SMALLVECTOR_H
+#include "llvm/Support/AlignOf.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/type_traits.h"
#include <algorithm>
@@ -32,22 +33,52 @@
protected:
void *BeginX, *EndX, *CapacityX;
+protected:
+ SmallVectorBase(void *FirstEl, size_t Size)
+ : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {}
+
+ /// grow_pod - This is an implementation of the grow() method which only works
+ /// on POD-like data types and is out of line to reduce code duplication.
+ void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize);
+
+public:
+ /// size_in_bytes - This returns size()*sizeof(T).
+ size_t size_in_bytes() const {
+ return size_t((char*)EndX - (char*)BeginX);
+ }
+
+ /// capacity_in_bytes - This returns capacity()*sizeof(T).
+ size_t capacity_in_bytes() const {
+ return size_t((char*)CapacityX - (char*)BeginX);
+ }
+
+ bool empty() const { return BeginX == EndX; }
+};
+
+template <typename T, unsigned N> struct SmallVectorStorage;
+
+/// SmallVectorTemplateCommon - This is the part of SmallVectorTemplateBase
+/// which does not depend on whether the type T is a POD. The extra dummy
+/// template argument is used by ArrayRef to avoid unnecessarily requiring T
+/// to be complete.
+template <typename T, typename = void>
+class SmallVectorTemplateCommon : public SmallVectorBase {
+private:
+ template <typename, unsigned> friend struct SmallVectorStorage;
+
// Allocate raw space for N elements of type T. If T has a ctor or dtor, we
// don't want it to be automatically run, so we need to represent the space as
- // something else. An array of char would work great, but might not be
- // aligned sufficiently. Instead we use some number of union instances for
- // the space, which guarantee maximal alignment.
- union U {
- double D;
- long double LD;
- long long L;
- void *P;
- } FirstEl;
+ // something else. Use an array of char of sufficient alignment.
+ typedef llvm::AlignedCharArrayUnion<T> U;
+ U FirstEl;
// Space after 'FirstEl' is clobbered, do not add any instance vars after it.
protected:
- SmallVectorBase(size_t Size)
- : BeginX(&FirstEl), EndX(&FirstEl), CapacityX((char*)&FirstEl+Size) {}
+ SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {}
+
+ void grow_pod(size_t MinSizeInBytes, size_t TSize) {
+ SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize);
+ }
/// isSmall - Return true if this is a smallvector which has not had dynamic
/// memory allocated for it.
@@ -60,30 +91,6 @@
BeginX = EndX = CapacityX = &FirstEl;
}
- /// grow_pod - This is an implementation of the grow() method which only works
- /// on POD-like data types and is out of line to reduce code duplication.
- void grow_pod(size_t MinSizeInBytes, size_t TSize);
-
-public:
- /// size_in_bytes - This returns size()*sizeof(T).
- size_t size_in_bytes() const {
- return size_t((char*)EndX - (char*)BeginX);
- }
-
- /// capacity_in_bytes - This returns capacity()*sizeof(T).
- size_t capacity_in_bytes() const {
- return size_t((char*)CapacityX - (char*)BeginX);
- }
-
- bool empty() const { return BeginX == EndX; }
-};
-
-
-template <typename T>
-class SmallVectorTemplateCommon : public SmallVectorBase {
-protected:
- SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(Size) {}
-
void setEnd(T *P) { this->EndX = P; }
public:
typedef size_t size_type;
@@ -844,6 +851,17 @@
}
#endif
+/// Storage for the SmallVector elements which aren't contained in
+/// SmallVectorTemplateCommon. There are 'N-1' elements here. The remaining '1'
+/// element is in the base class. This is specialized for the N=1 and N=0 cases
+/// to avoid allocating unnecessary storage.
+template <typename T, unsigned N>
+struct SmallVectorStorage {
+ typename SmallVectorTemplateCommon<T>::U InlineElts[N - 1];
+};
+template <typename T> struct SmallVectorStorage<T, 1> {};
+template <typename T> struct SmallVectorStorage<T, 0> {};
+
/// SmallVector - This is a 'vector' (really, a variable-sized array), optimized
/// for the case when the array is small. It contains some number of elements
/// in-place, which allows it to avoid heap allocation when the actual number of
@@ -854,41 +872,23 @@
///
template <typename T, unsigned N>
class SmallVector : public SmallVectorImpl<T> {
- /// InlineElts - These are 'N-1' elements that are stored inline in the body
- /// of the vector. The extra '1' element is stored in SmallVectorImpl.
- typedef typename SmallVectorImpl<T>::U U;
- enum {
- // MinUs - The number of U's require to cover N T's.
- MinUs = (static_cast<unsigned int>(sizeof(T))*N +
- static_cast<unsigned int>(sizeof(U)) - 1) /
- static_cast<unsigned int>(sizeof(U)),
-
- // NumInlineEltsElts - The number of elements actually in this array. There
- // is already one in the parent class, and we have to round up to avoid
- // having a zero-element array.
- NumInlineEltsElts = MinUs > 1 ? (MinUs - 1) : 1,
-
- // NumTsAvailable - The number of T's we actually have space for, which may
- // be more than N due to rounding.
- NumTsAvailable = (NumInlineEltsElts+1)*static_cast<unsigned int>(sizeof(U))/
- static_cast<unsigned int>(sizeof(T))
- };
- U InlineElts[NumInlineEltsElts];
+ /// Storage - Inline space for elements which aren't stored in the base class.
+ SmallVectorStorage<T, N> Storage;
public:
- SmallVector() : SmallVectorImpl<T>(NumTsAvailable) {
+ SmallVector() : SmallVectorImpl<T>(N) {
}
explicit SmallVector(unsigned Size, const T &Value = T())
- : SmallVectorImpl<T>(NumTsAvailable) {
+ : SmallVectorImpl<T>(N) {
this->assign(Size, Value);
}
template<typename ItTy>
- SmallVector(ItTy S, ItTy E) : SmallVectorImpl<T>(NumTsAvailable) {
+ SmallVector(ItTy S, ItTy E) : SmallVectorImpl<T>(N) {
this->append(S, E);
}
- SmallVector(const SmallVector &RHS) : SmallVectorImpl<T>(NumTsAvailable) {
+ SmallVector(const SmallVector &RHS) : SmallVectorImpl<T>(N) {
if (!RHS.empty())
SmallVectorImpl<T>::operator=(RHS);
}
@@ -899,7 +899,7 @@
}
#if LLVM_USE_RVALUE_REFERENCES
- SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(NumTsAvailable) {
+ SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(N) {
if (!RHS.empty())
SmallVectorImpl<T>::operator=(::std::move(RHS));
}
@@ -912,48 +912,6 @@
};
-/// Specialize SmallVector at N=0. This specialization guarantees
-/// that it can be instantiated at an incomplete T if none of its
-/// members are required.
-template <typename T>
-class SmallVector<T,0> : public SmallVectorImpl<T> {
-public:
- SmallVector() : SmallVectorImpl<T>(0) {
- }
-
- explicit SmallVector(unsigned Size, const T &Value = T())
- : SmallVectorImpl<T>(0) {
- this->assign(Size, Value);
- }
-
- template<typename ItTy>
- SmallVector(ItTy S, ItTy E) : SmallVectorImpl<T>(0) {
- this->append(S, E);
- }
-
- SmallVector(const SmallVector &RHS) : SmallVectorImpl<T>(0) {
- if (!RHS.empty())
- SmallVectorImpl<T>::operator=(RHS);
- }
-
- const SmallVector &operator=(const SmallVector &RHS) {
- SmallVectorImpl<T>::operator=(RHS);
- return *this;
- }
-
-#if LLVM_USE_RVALUE_REFERENCES
- SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(0) {
- if (!RHS.empty())
- SmallVectorImpl<T>::operator=(::std::move(RHS));
- }
-
- const SmallVector &operator=(SmallVector &&RHS) {
- SmallVectorImpl<T>::operator=(::std::move(RHS));
- return *this;
- }
-#endif
-};
-
template<typename T, unsigned N>
static inline size_t capacity_in_bytes(const SmallVector<T, N> &X) {
return X.capacity_in_bytes();
diff --git a/include/llvm/ADT/SparseBitVector.h b/include/llvm/ADT/SparseBitVector.h
index 89774c3..791f108 100644
--- a/include/llvm/ADT/SparseBitVector.h
+++ b/include/llvm/ADT/SparseBitVector.h
@@ -158,7 +158,7 @@
&& "Word Position outside of element");
// Mask off previous bits.
- Copy &= ~0L << BitPos;
+ Copy &= ~0UL << BitPos;
if (Copy != 0) {
if (sizeof(BitWord) == 4)
diff --git a/include/llvm/ADT/SparseSet.h b/include/llvm/ADT/SparseSet.h
index 5569633..dc3db4c 100644
--- a/include/llvm/ADT/SparseSet.h
+++ b/include/llvm/ADT/SparseSet.h
@@ -110,9 +110,9 @@
/// For sets that may grow to thousands of elements, SparseT should be set to
/// uint16_t or uint32_t.
///
-/// @param ValueT The type of objects in the set.
-/// @param KeyFunctorT A functor that computes an unsigned index from KeyT.
-/// @param SparseT An unsigned integer type. See above.
+/// @tparam ValueT The type of objects in the set.
+/// @tparam KeyFunctorT A functor that computes an unsigned index from KeyT.
+/// @tparam SparseT An unsigned integer type. See above.
///
template<typename ValueT,
typename KeyFunctorT = llvm::identity<unsigned>,
diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h
index 655d884..36df5ac 100644
--- a/include/llvm/ADT/StringExtras.h
+++ b/include/llvm/ADT/StringExtras.h
@@ -125,7 +125,7 @@
// X*33+c -> X*33^c
static inline unsigned HashString(StringRef Str, unsigned Result = 0) {
for (unsigned i = 0, e = Str.size(); i != e; ++i)
- Result = Result * 33 + Str[i];
+ Result = Result * 33 + (unsigned char)Str[i];
return Result;
}
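Why the cast matters: where plain char is signed, bytes >= 0x80 were sign-extended and mixed large negative values into the hash, so the same string could hash differently across platforms. A sketch of the fixed kernel in isolation:

static inline unsigned hashBytes33(const char *S, unsigned N,
                                   unsigned Result = 0) {
  for (unsigned i = 0; i != N; ++i)
    Result = Result * 33 + (unsigned char)S[i]; // always 0..255
  return Result;
}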
diff --git a/include/llvm/ADT/Trie.h b/include/llvm/ADT/Trie.h
deleted file mode 100644
index 845af01..0000000
--- a/include/llvm/ADT/Trie.h
+++ /dev/null
@@ -1,334 +0,0 @@
-//===- llvm/ADT/Trie.h ---- Generic trie structure --------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class defines a generic trie structure. The trie structure
-// is immutable after creation, but the payload contained within it is not.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ADT_TRIE_H
-#define LLVM_ADT_TRIE_H
-
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/Support/DOTGraphTraits.h"
-
-#include <cassert>
-#include <vector>
-
-namespace llvm {
-
-// FIXME:
-// - Labels are usually small, maybe it's better to use SmallString
-// - Should we use char* during construction?
-// - Should we templatize Empty with traits-like interface?
-
-template<class Payload>
-class Trie {
- friend class GraphTraits<Trie<Payload> >;
- friend class DOTGraphTraits<Trie<Payload> >;
-public:
- class Node {
- friend class Trie;
-
- public:
- typedef std::vector<Node*> NodeVectorType;
- typedef typename NodeVectorType::iterator iterator;
- typedef typename NodeVectorType::const_iterator const_iterator;
-
- private:
- enum QueryResult {
- Same = -3,
- StringIsPrefix = -2,
- LabelIsPrefix = -1,
- DontMatch = 0,
- HaveCommonPart
- };
-
- struct NodeCmp {
- bool operator() (Node* N1, Node* N2) {
- return (N1->Label[0] < N2->Label[0]);
- }
- bool operator() (Node* N, char Id) {
- return (N->Label[0] < Id);
- }
- };
-
- std::string Label;
- Payload Data;
- NodeVectorType Children;
-
- // Do not implement
- Node(const Node&);
- Node& operator=(const Node&);
-
- inline void addEdge(Node* N) {
- if (Children.empty())
- Children.push_back(N);
- else {
- iterator I = std::lower_bound(Children.begin(), Children.end(),
- N, NodeCmp());
- // FIXME: no dups are allowed
- Children.insert(I, N);
- }
- }
-
- inline void setEdge(Node* N) {
- char Id = N->Label[0];
- iterator I = std::lower_bound(Children.begin(), Children.end(),
- Id, NodeCmp());
- assert(I != Children.end() && "Node does not exists!");
- *I = N;
- }
-
- QueryResult query(const std::string& s) const {
- unsigned i, l;
- unsigned l1 = s.length();
- unsigned l2 = Label.length();
-
- // Find the length of common part
- l = std::min(l1, l2);
- i = 0;
- while ((i < l) && (s[i] == Label[i]))
- ++i;
-
- if (i == l) { // One is prefix of another, find who is who
- if (l1 == l2)
- return Same;
- else if (i == l1)
- return StringIsPrefix;
- else
- return LabelIsPrefix;
- } else // s and Label have common (possible empty) part, return its length
- return (QueryResult)i;
- }
-
- public:
- inline explicit Node(const Payload& data, const std::string& label = ""):
- Label(label), Data(data) { }
-
- inline const Payload& data() const { return Data; }
- inline void setData(const Payload& data) { Data = data; }
-
- inline const std::string& label() const { return Label; }
-
-#if 0
- inline void dump() {
- llvm::cerr << "Node: " << this << "\n"
- << "Label: " << Label << "\n"
- << "Children:\n";
-
- for (iterator I = Children.begin(), E = Children.end(); I != E; ++I)
- llvm::cerr << (*I)->Label << "\n";
- }
-#endif
-
- inline Node* getEdge(char Id) {
- Node* fNode = NULL;
- iterator I = std::lower_bound(Children.begin(), Children.end(),
- Id, NodeCmp());
- if (I != Children.end() && (*I)->Label[0] == Id)
- fNode = *I;
-
- return fNode;
- }
-
- inline iterator begin() { return Children.begin(); }
- inline const_iterator begin() const { return Children.begin(); }
- inline iterator end () { return Children.end(); }
- inline const_iterator end () const { return Children.end(); }
-
- inline size_t size () const { return Children.size(); }
- inline bool empty() const { return Children.empty(); }
- inline const Node* &front() const { return Children.front(); }
- inline Node* &front() { return Children.front(); }
- inline const Node* &back() const { return Children.back(); }
- inline Node* &back() { return Children.back(); }
-
- };
-
-private:
- std::vector<Node*> Nodes;
- Payload Empty;
-
- inline Node* addNode(const Payload& data, const std::string label = "") {
- Node* N = new Node(data, label);
- Nodes.push_back(N);
- return N;
- }
-
- inline Node* splitEdge(Node* N, char Id, size_t index) {
- Node* eNode = N->getEdge(Id);
- assert(eNode && "Node doesn't exist");
-
- const std::string &l = eNode->Label;
- assert(index > 0 && index < l.length() && "Trying to split too far!");
- std::string l1 = l.substr(0, index);
- std::string l2 = l.substr(index);
-
- Node* nNode = addNode(Empty, l1);
- N->setEdge(nNode);
-
- eNode->Label = l2;
- nNode->addEdge(eNode);
-
- return nNode;
- }
-
- // Do not implement
- Trie(const Trie&);
- Trie& operator=(const Trie&);
-
-public:
- inline explicit Trie(const Payload& empty):Empty(empty) {
- addNode(Empty);
- }
- inline ~Trie() {
- for (unsigned i = 0, e = Nodes.size(); i != e; ++i)
- delete Nodes[i];
- }
-
- inline Node* getRoot() const { return Nodes[0]; }
-
- bool addString(const std::string& s, const Payload& data);
- const Payload& lookup(const std::string& s) const;
-
-};
-
-// Define this out-of-line to dissuade the C++ compiler from inlining it.
-template<class Payload>
-bool Trie<Payload>::addString(const std::string& s, const Payload& data) {
- Node* cNode = getRoot();
- Node* tNode = NULL;
- std::string s1(s);
-
- while (tNode == NULL) {
- char Id = s1[0];
- if (Node* nNode = cNode->getEdge(Id)) {
- typename Node::QueryResult r = nNode->query(s1);
-
- switch (r) {
- case Node::Same:
- case Node::StringIsPrefix:
- // Currently we don't allow to have two strings in the trie one
- // being a prefix of another. This should be fixed.
- assert(0 && "FIXME!");
- return false;
- case Node::DontMatch:
- llvm_unreachable("Impossible!");
- case Node::LabelIsPrefix:
- s1 = s1.substr(nNode->label().length());
- cNode = nNode;
- break;
- default:
- nNode = splitEdge(cNode, Id, r);
- tNode = addNode(data, s1.substr(r));
- nNode->addEdge(tNode);
- }
- } else {
- tNode = addNode(data, s1);
- cNode->addEdge(tNode);
- }
- }
-
- return true;
-}
-
-template<class Payload>
-const Payload& Trie<Payload>::lookup(const std::string& s) const {
- Node* cNode = getRoot();
- Node* tNode = NULL;
- std::string s1(s);
-
- while (tNode == NULL) {
- char Id = s1[0];
- if (Node* nNode = cNode->getEdge(Id)) {
- typename Node::QueryResult r = nNode->query(s1);
-
- switch (r) {
- case Node::Same:
- tNode = nNode;
- break;
- case Node::StringIsPrefix:
- return Empty;
- case Node::DontMatch:
- llvm_unreachable("Impossible!");
- case Node::LabelIsPrefix:
- s1 = s1.substr(nNode->label().length());
- cNode = nNode;
- break;
- default:
- return Empty;
- }
- } else
- return Empty;
- }
-
- return tNode->data();
-}
-
-template<class Payload>
-struct GraphTraits<Trie<Payload> > {
- typedef Trie<Payload> TrieType;
- typedef typename TrieType::Node NodeType;
- typedef typename NodeType::iterator ChildIteratorType;
-
- static inline NodeType *getEntryNode(const TrieType& T) {
- return T.getRoot();
- }
-
- static inline ChildIteratorType child_begin(NodeType *N) {
- return N->begin();
- }
- static inline ChildIteratorType child_end(NodeType *N) { return N->end(); }
-
- typedef typename std::vector<NodeType*>::const_iterator nodes_iterator;
-
- static inline nodes_iterator nodes_begin(const TrieType& G) {
- return G.Nodes.begin();
- }
- static inline nodes_iterator nodes_end(const TrieType& G) {
- return G.Nodes.end();
- }
-
-};
-
-template<class Payload>
-struct DOTGraphTraits<Trie<Payload> > : public DefaultDOTGraphTraits {
- typedef typename Trie<Payload>::Node NodeType;
- typedef typename GraphTraits<Trie<Payload> >::ChildIteratorType EdgeIter;
-
- static std::string getGraphName(const Trie<Payload>& T) {
- return "Trie";
- }
-
- static std::string getNodeLabel(NodeType* Node, const Trie<Payload>& T) {
- if (T.getRoot() == Node)
- return "<Root>";
- else
- return Node->label();
- }
-
- static std::string getEdgeSourceLabel(NodeType* Node, EdgeIter I) {
- NodeType* N = *I;
- return N->label().substr(0, 1);
- }
-
- static std::string getNodeAttributes(const NodeType* Node,
- const Trie<Payload>& T) {
- if (Node->data() != T.Empty)
- return "color=blue";
-
- return "";
- }
-
-};
-
-} // end of llvm namespace
-
-#endif // LLVM_ADT_TRIE_H
diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
index 7f7061a..ab1f0da 100644
--- a/include/llvm/ADT/Triple.h
+++ b/include/llvm/ADT/Triple.h
@@ -74,7 +74,8 @@
PC,
SCEI,
BGP,
- BGQ
+ BGQ,
+ Freescale
};
enum OSType {
UnknownOS,
@@ -109,7 +110,7 @@
GNUEABIHF,
EABI,
MachO,
- ANDROIDEABI
+ Android
};
private:
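A small sketch against the renamed enumerator (the helper name is illustrative); code that previously compared against Triple::ANDROIDEABI now uses Triple::Android:

#include "llvm/ADT/Triple.h"

bool isAndroid(const llvm::Triple &T) {
  return T.getEnvironment() == llvm::Triple::Android;
}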
diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index 674868a..f587220 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -46,6 +46,7 @@
class StoreInst;
class VAArgInst;
class TargetData;
+class TargetLibraryInfo;
class Pass;
class AnalysisUsage;
class MemTransferInst;
@@ -55,6 +56,7 @@
class AliasAnalysis {
protected:
const TargetData *TD;
+ const TargetLibraryInfo *TLI;
private:
AliasAnalysis *AA; // Previous Alias Analysis to chain to.
@@ -73,7 +75,7 @@
public:
static char ID; // Class identification, replacement for typeinfo
- AliasAnalysis() : TD(0), AA(0) {}
+ AliasAnalysis() : TD(0), TLI(0), AA(0) {}
virtual ~AliasAnalysis(); // We want to be subclassed
/// UnknownSize - This is a special value which can be used with the
@@ -86,6 +88,11 @@
///
const TargetData *getTargetData() const { return TD; }
+ /// getTargetLibraryInfo - Return a pointer to the current TargetLibraryInfo
+ /// object, or null if no TargetLibraryInfo object is available.
+ ///
+ const TargetLibraryInfo *getTargetLibraryInfo() const { return TLI; }
+
/// getTypeStoreSize - Return the TargetData store size for the given type,
/// if known, or a conservative value otherwise.
///
@@ -187,6 +194,11 @@
return isNoAlias(Location(V1, V1Size), Location(V2, V2Size));
}
+ /// isNoAlias - A convenience wrapper.
+ bool isNoAlias(const Value *V1, const Value *V2) {
+ return isNoAlias(Location(V1), Location(V2));
+ }
+
/// isMustAlias - A convenience wrapper.
bool isMustAlias(const Location &LocA, const Location &LocB) {
return alias(LocA, LocB) == MustAlias;
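A sketch of the new convenience wrapper: two raw Values can now be compared directly, each implicitly wrapped in a Location of unknown size:

#include "llvm/Analysis/AliasAnalysis.h"

bool provablyDistinct(llvm::AliasAnalysis &AA,
                      const llvm::Value *A, const llvm::Value *B) {
  return AA.isNoAlias(A, B); // Location(V): whole object, size unknown
}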
diff --git a/include/llvm/Analysis/BranchProbabilityInfo.h b/include/llvm/Analysis/BranchProbabilityInfo.h
index 006daa0..c0567da 100644
--- a/include/llvm/Analysis/BranchProbabilityInfo.h
+++ b/include/llvm/Analysis/BranchProbabilityInfo.h
@@ -28,11 +28,14 @@
///
/// This is a function analysis pass which provides information on the relative
/// probabilities of each "edge" in the function's CFG where such an edge is
-/// defined by a pair of basic blocks. The probability for a given block and
-/// a successor block are always relative to the probabilities of the other
-/// successor blocks. Another way of looking at it is that the probabilities
-/// for a given block B and each of its successors should sum to exactly
-/// one (100%).
+/// defined by a pair (PredBlock and an index in the successors). The
+/// probability of an edge from one block is always relative to the
+/// probabilities of other edges from the block. The probabilities of all edges
+/// from a block sum to exactly one (100%).
+/// We use a pair (PredBlock and an index in the successors) to uniquely
+/// identify an edge, since we can have multiple edges from Src to Dst.
+/// As an example, we can have a switch which jumps to Dst with value 0 and
+/// value 10.
class BranchProbabilityInfo : public FunctionPass {
public:
static char ID;
@@ -52,6 +55,12 @@
/// leaving the 'Src' block. The returned probability is never zero, and can
/// only be one if the source block has only one successor.
BranchProbability getEdgeProbability(const BasicBlock *Src,
+ unsigned IndexInSuccessors) const;
+
+ /// \brief Get the probability of going from Src to Dst.
+ ///
+ /// It returns the sum of all probabilities for edges from Src to Dst.
+ BranchProbability getEdgeProbability(const BasicBlock *Src,
const BasicBlock *Dst) const;
/// \brief Test if an edge is hot relative to other out-edges of the Src.
@@ -74,25 +83,34 @@
raw_ostream &printEdgeProbability(raw_ostream &OS, const BasicBlock *Src,
const BasicBlock *Dst) const;
- /// \brief Get the raw edge weight calculated for the block pair.
+ /// \brief Get the raw edge weight calculated for the edge.
///
/// This returns the raw edge weight. It is guaranteed to fall between 1 and
/// UINT32_MAX. Note that the raw edge weight is not meaningful in isolation.
 /// This interface should be used very carefully, and primarily by routines
 /// that are updating the analysis by later calling setEdgeWeight.
+ uint32_t getEdgeWeight(const BasicBlock *Src,
+ unsigned IndexInSuccessors) const;
+
+ /// \brief Get the raw edge weight calculated for the block pair.
+ ///
+ /// This returns the sum of all raw edge weights from Src to Dst.
+ /// It is guaranteed to fall between 1 and UINT32_MAX.
uint32_t getEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst) const;
- /// \brief Set the raw edge weight for the block pair.
+ /// \brief Set the raw edge weight for a given edge.
///
- /// This allows a pass to explicitly set the edge weight for a block. It can
+ /// This allows a pass to explicitly set the edge weight for an edge. It can
/// be used when updating the CFG to update and preserve the branch
/// probability information. Read the implementation of how these edge
/// weights are calculated carefully before using!
- void setEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst,
+ void setEdgeWeight(const BasicBlock *Src, unsigned IndexInSuccessors,
uint32_t Weight);
private:
- typedef std::pair<const BasicBlock *, const BasicBlock *> Edge;
+ // Since we allow duplicate edges from one basic block to another, we use
+ // a pair (PredBlock and an index in the successors) to specify an edge.
+ typedef std::pair<const BasicBlock *, unsigned> Edge;
// Default weight value. Used when we don't have information about the edge.
// TODO: DEFAULT_WEIGHT makes sense during static predication, when none of
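A sketch of the index-based interface (BPI is assumed to come from a surrounding pass): with duplicate edges, e.g. a switch where two case values jump to the same block, each (Src, successor index) pair names one edge, and the (Src, Dst) overload sums over them:

#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/BasicBlock.h"
#include "llvm/InstrTypes.h"
#include "llvm/Support/BranchProbability.h"

void inspectOutEdges(llvm::BranchProbabilityInfo &BPI,
                     const llvm::BasicBlock *Src) {
  const llvm::TerminatorInst *TI = Src->getTerminator();
  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
    llvm::BranchProbability P = BPI.getEdgeProbability(Src, i);
    (void)P; // one probability per outgoing edge, duplicates included
  }
}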
diff --git a/include/llvm/Analysis/LoopInfoImpl.h b/include/llvm/Analysis/LoopInfoImpl.h
index c07fbf7..3bb96f9 100644
--- a/include/llvm/Analysis/LoopInfoImpl.h
+++ b/include/llvm/Analysis/LoopInfoImpl.h
@@ -145,7 +145,6 @@
// Loop over the predecessors of the header node...
BlockT *Header = getHeader();
- typedef GraphTraits<BlockT*> BlockTraits;
typedef GraphTraits<Inverse<BlockT*> > InvBlockTraits;
for (typename InvBlockTraits::ChildIteratorType PI =
InvBlockTraits::child_begin(Header),
diff --git a/include/llvm/Analysis/MemoryBuiltins.h b/include/llvm/Analysis/MemoryBuiltins.h
index e16f389..c3ae603 100644
--- a/include/llvm/Analysis/MemoryBuiltins.h
+++ b/include/llvm/Analysis/MemoryBuiltins.h
@@ -28,6 +28,7 @@
class CallInst;
class PointerType;
class TargetData;
+class TargetLibraryInfo;
class Type;
class Value;
@@ -35,27 +36,33 @@
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates or reallocates memory (either malloc, calloc, realloc, or strdup
/// like).
-bool isAllocationFn(const Value *V, bool LookThroughBitCast = false);
+bool isAllocationFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast = false);
/// \brief Tests if a value is a call or invoke to a function that returns a
/// NoAlias pointer (including malloc/calloc/realloc/strdup-like functions).
-bool isNoAliasFn(const Value *V, bool LookThroughBitCast = false);
+bool isNoAliasFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast = false);
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates uninitialized memory (such as malloc).
-bool isMallocLikeFn(const Value *V, bool LookThroughBitCast = false);
+bool isMallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast = false);
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates zero-filled memory (such as calloc).
-bool isCallocLikeFn(const Value *V, bool LookThroughBitCast = false);
+bool isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast = false);
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates memory (either malloc, calloc, or strdup like).
-bool isAllocLikeFn(const Value *V, bool LookThroughBitCast = false);
+bool isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast = false);
/// \brief Tests if a value is a call or invoke to a library function that
/// reallocates memory (such as realloc).
-bool isReallocLikeFn(const Value *V, bool LookThroughBitCast = false);
+bool isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast = false);
//===----------------------------------------------------------------------===//
@@ -65,29 +72,31 @@
/// extractMallocCall - Returns the corresponding CallInst if the instruction
/// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we
/// ignore InvokeInst here.
-const CallInst *extractMallocCall(const Value *I);
-static inline CallInst *extractMallocCall(Value *I) {
- return const_cast<CallInst*>(extractMallocCall((const Value*)I));
+const CallInst *extractMallocCall(const Value *I, const TargetLibraryInfo *TLI);
+static inline CallInst *extractMallocCall(Value *I,
+ const TargetLibraryInfo *TLI) {
+ return const_cast<CallInst*>(extractMallocCall((const Value*)I, TLI));
}
/// isArrayMalloc - Returns the corresponding CallInst if the instruction
/// is a call to malloc whose array size can be determined and the array size
/// is not constant 1. Otherwise, return NULL.
-const CallInst *isArrayMalloc(const Value *I, const TargetData *TD);
+const CallInst *isArrayMalloc(const Value *I, const TargetData *TD,
+ const TargetLibraryInfo *TLI);
/// getMallocType - Returns the PointerType resulting from the malloc call.
/// The PointerType depends on the number of bitcast uses of the malloc call:
/// 0: PointerType is the malloc calls' return type.
/// 1: PointerType is the bitcast's result type.
/// >1: Unique PointerType cannot be determined, return NULL.
-PointerType *getMallocType(const CallInst *CI);
+PointerType *getMallocType(const CallInst *CI, const TargetLibraryInfo *TLI);
/// getMallocAllocatedType - Returns the Type allocated by malloc call.
/// The Type depends on the number of bitcast uses of the malloc call:
/// 0: PointerType is the malloc calls' return type.
/// 1: PointerType is the bitcast's result type.
/// >1: Unique PointerType cannot be determined, return NULL.
-Type *getMallocAllocatedType(const CallInst *CI);
+Type *getMallocAllocatedType(const CallInst *CI, const TargetLibraryInfo *TLI);
/// getMallocArraySize - Returns the array size of a malloc call. If the
/// argument passed to malloc is a multiple of the size of the malloced type,
@@ -95,6 +104,7 @@
/// constant 1. Otherwise, return NULL for mallocs whose array size cannot be
/// determined.
Value *getMallocArraySize(CallInst *CI, const TargetData *TD,
+ const TargetLibraryInfo *TLI,
bool LookThroughSExt = false);
@@ -104,9 +114,10 @@
/// extractCallocCall - Returns the corresponding CallInst if the instruction
/// is a calloc call.
-const CallInst *extractCallocCall(const Value *I);
-static inline CallInst *extractCallocCall(Value *I) {
- return const_cast<CallInst*>(extractCallocCall((const Value*)I));
+const CallInst *extractCallocCall(const Value *I, const TargetLibraryInfo *TLI);
+static inline CallInst *extractCallocCall(Value *I,
+ const TargetLibraryInfo *TLI) {
+ return const_cast<CallInst*>(extractCallocCall((const Value*)I, TLI));
}
@@ -115,10 +126,10 @@
//
/// isFreeCall - Returns non-null if the value is a call to the builtin free()
-const CallInst *isFreeCall(const Value *I);
+const CallInst *isFreeCall(const Value *I, const TargetLibraryInfo *TLI);
-static inline CallInst *isFreeCall(Value *I) {
- return const_cast<CallInst*>(isFreeCall((const Value*)I));
+static inline CallInst *isFreeCall(Value *I, const TargetLibraryInfo *TLI) {
+ return const_cast<CallInst*>(isFreeCall((const Value*)I, TLI));
}
@@ -131,7 +142,7 @@
/// If RoundToAlign is true, then Size is rounded up to the aligment of allocas,
/// byval arguments, and global variables.
bool getObjectSize(const Value *Ptr, uint64_t &Size, const TargetData *TD,
- bool RoundToAlign = false);
+ const TargetLibraryInfo *TLI, bool RoundToAlign = false);
@@ -143,6 +154,7 @@
: public InstVisitor<ObjectSizeOffsetVisitor, SizeOffsetType> {
const TargetData *TD;
+ const TargetLibraryInfo *TLI;
bool RoundToAlign;
unsigned IntTyBits;
APInt Zero;
@@ -155,8 +167,8 @@
}
public:
- ObjectSizeOffsetVisitor(const TargetData *TD, LLVMContext &Context,
- bool RoundToAlign = false);
+ ObjectSizeOffsetVisitor(const TargetData *TD, const TargetLibraryInfo *TLI,
+ LLVMContext &Context, bool RoundToAlign = false);
SizeOffsetType compute(Value *V);
@@ -202,6 +214,7 @@
typedef SmallPtrSet<const Value*, 8> PtrSetTy;
const TargetData *TD;
+ const TargetLibraryInfo *TLI;
LLVMContext &Context;
BuilderTy Builder;
IntegerType *IntTy;
@@ -215,7 +228,8 @@
SizeOffsetEvalType compute_(Value *V);
public:
- ObjectSizeOffsetEvaluator(const TargetData *TD, LLVMContext &Context);
+ ObjectSizeOffsetEvaluator(const TargetData *TD, const TargetLibraryInfo *TLI,
+ LLVMContext &Context);
SizeOffsetEvalType compute(Value *V);
bool knownSize(SizeOffsetEvalType SizeOffset) {
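A sketch of the new calling convention (TLI would typically come from getAnalysis<TargetLibraryInfo>() in a pass): the recognition helpers now consult TargetLibraryInfo, so per-target library availability is respected:

#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Target/TargetLibraryInfo.h"

bool isHeapAllocation(const llvm::Value *V,
                      const llvm::TargetLibraryInfo *TLI) {
  return llvm::isAllocationFn(V, TLI, /*LookThroughBitCast=*/true);
}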
diff --git a/include/llvm/Analysis/Passes.h b/include/llvm/Analysis/Passes.h
index a22bd12..c52f846 100644
--- a/include/llvm/Analysis/Passes.h
+++ b/include/llvm/Analysis/Passes.h
@@ -103,6 +103,14 @@
//===--------------------------------------------------------------------===//
//
+ // createProfileMetadataLoaderPass - This pass loads information from a
+ // profile dump file and sets branch weight metadata.
+ //
+ ModulePass *createProfileMetadataLoaderPass();
+ extern char &ProfileMetadataLoaderPassID;
+
+ //===--------------------------------------------------------------------===//
+ //
// createNoProfileInfoPass - This pass implements the default "no profile".
//
ImmutablePass *createNoProfileInfoPass();
diff --git a/include/llvm/Analysis/ProfileDataLoader.h b/include/llvm/Analysis/ProfileDataLoader.h
new file mode 100644
index 0000000..bec9fac
--- /dev/null
+++ b/include/llvm/Analysis/ProfileDataLoader.h
@@ -0,0 +1,142 @@
+//===- ProfileDataLoader.h - Load & convert profile info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The ProfileDataLoader class is used to load profiling data from a dump file.
+// The ProfileDataT<FType, BType> class is used to store the mapping of this
+// data to control flow edges.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_PROFILEDATALOADER_H
+#define LLVM_ANALYSIS_PROFILEDATALOADER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <string>
+
+namespace llvm {
+
+class ModulePass;
+class Function;
+class BasicBlock;
+
+// Helper for dumping edges to dbgs().
+raw_ostream& operator<<(raw_ostream &O, std::pair<const BasicBlock *,
+ const BasicBlock *> E);
+
+/// \brief The ProfileDataT<FType, BType> class is used to store the mapping of
+/// profiling data to control flow edges.
+///
+/// An edge is defined by its source and sink basic blocks.
+template<class FType, class BType>
+class ProfileDataT {
+public:
+ // The profiling information defines an Edge by its source and sink basic
+ // blocks.
+ typedef std::pair<const BType*, const BType*> Edge;
+
+private:
+ typedef DenseMap<Edge, unsigned> EdgeWeights;
+
+ /// \brief Count the number of times a transition between two blocks is
+ /// executed.
+ ///
+ /// As a special case, we also hold an edge from the null BasicBlock to the
+ /// entry block to indicate how many times the function was entered.
+ DenseMap<const FType*, EdgeWeights> EdgeInformation;
+
+public:
+ /// getFunction() - Returns the Function for an Edge.
+ static const FType *getFunction(Edge e) {
+ // e.first may be NULL
+ assert(((!e.first) || (e.first->getParent() == e.second->getParent()))
+ && "A ProfileData::Edge can not be between two functions");
+ assert(e.second && "A ProfileData::Edge must have a real sink");
+ return e.second->getParent();
+ }
+
+ /// getEdge() - Creates an Edge between two BasicBlocks.
+ static Edge getEdge(const BType *Src, const BType *Dest) {
+ return Edge(Src, Dest);
+ }
+
+ /// getEdgeWeight - Return the number of times that a given edge was
+ /// executed.
+ unsigned getEdgeWeight(Edge e) const {
+ const FType *f = getFunction(e);
+ assert((EdgeInformation.find(f) != EdgeInformation.end())
+ && "No profiling information for function");
+ EdgeWeights weights = EdgeInformation.find(f)->second;
+
+ assert((weights.find(e) != weights.end())
+ && "No profiling information for edge");
+ return weights.find(e)->second;
+ }
+
+ /// addEdgeWeight - Add 'weight' to the already stored execution count for
+ /// this edge.
+ void addEdgeWeight(Edge e, unsigned weight) {
+ EdgeInformation[getFunction(e)][e] += weight;
+ }
+};
+
+typedef ProfileDataT<Function, BasicBlock> ProfileData;
+//typedef ProfileDataT<MachineFunction, MachineBasicBlock> MachineProfileData;
+
+/// The ProfileDataLoader class is used to load raw profiling data from the
+/// dump file.
+class ProfileDataLoader {
+private:
+ /// The name of the file where the raw profiling data is stored.
+ const std::string &Filename;
+
+ /// A vector of the command line arguments used when the target program was
+ /// run to generate profiling data. One entry per program run.
+ SmallVector<std::string, 1> CommandLines;
+
+ /// The raw values for how many times each edge was traversed, values from
+ /// multiple program runs are accumulated.
+ SmallVector<unsigned, 32> EdgeCounts;
+
+public:
+ /// ProfileDataLoader ctor - Read the specified profiling data file, exiting
+ /// the program if the file is invalid or broken.
+ ProfileDataLoader(const char *ToolName, const std::string &Filename);
+
+ /// A special value used to represent the weight of an edge which has not
+ /// been counted yet.
+ static const unsigned Uncounted;
+
+ /// The maximum value that can be stored in a profiling counter.
+ static const unsigned MaxCount;
+
+ /// getNumExecutions - Return the number of times the target program was run
+ /// to generate this profiling data.
+ unsigned getNumExecutions() const { return CommandLines.size(); }
+
+ /// getExecution - Return the command line parameters used to generate the
+ /// i'th set of profiling data.
+ const std::string &getExecution(unsigned i) const { return CommandLines[i]; }
+
+ const std::string &getFileName() const { return Filename; }
+
+ /// getRawEdgeCounts - Return the raw profiling data. This is just a list of
+ /// numbers with no mapping to edges.
+ ArrayRef<unsigned> getRawEdgeCounts() const { return EdgeCounts; }
+};
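+
+// Usage sketch (illustrative only; "llvmprof.out" is an assumed dump file):
+//
+//   std::string File("llvmprof.out");
+//   ProfileDataLoader PDL("my-tool", File);
+//   for (unsigned i = 0, e = PDL.getNumExecutions(); i != e; ++i)
+//     errs() << "run " << i << ": " << PDL.getExecution(i) << "\n";
+//   ArrayRef<unsigned> Counts = PDL.getRawEdgeCounts(); // unmapped counts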
+
+/// createProfileMetadataLoaderPass - This function returns a Pass that loads
+/// the profiling information for the module from the specified filename.
+ModulePass *createProfileMetadataLoaderPass(const std::string &Filename);
+
+} // End llvm namespace
+
+#endif
diff --git a/include/llvm/Analysis/ProfileDataTypes.h b/include/llvm/Analysis/ProfileDataTypes.h
new file mode 100644
index 0000000..1be15e0
--- /dev/null
+++ b/include/llvm/Analysis/ProfileDataTypes.h
@@ -0,0 +1,39 @@
+/*===-- ProfileDataTypes.h - Profiling info shared constants --------------===*\
+|*
+|* The LLVM Compiler Infrastructure
+|*
+|* This file is distributed under the University of Illinois Open Source
+|* License. See LICENSE.TXT for details.
+|*
+|*===----------------------------------------------------------------------===*|
+|*
+|* This file defines constants shared by the various different profiling
+|* runtime libraries and the LLVM C++ profile metadata loader. It must be a
+|* C header because, at present, the profiling runtimes are written in C.
+|*
+\*===----------------------------------------------------------------------===*/
+
+#ifndef LLVM_ANALYSIS_PROFILEDATATYPES_H
+#define LLVM_ANALYSIS_PROFILEDATATYPES_H
+
+/* Included by libprofile. */
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* TODO: Strip out unused entries once ProfileInfo etc has been removed. */
+enum ProfilingType {
+ ArgumentInfo = 1, /* The command line argument block */
+ FunctionInfo = 2, /* Function profiling information */
+ BlockInfo = 3, /* Block profiling information */
+ EdgeInfo = 4, /* Edge profiling information */
+ PathInfo = 5, /* Path profiling information */
+ BBTraceInfo = 6, /* Basic block trace information */
+ OptEdgeInfo = 7 /* Edge profiling information, optimal version */
+};
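+
+/* Sketch only: one plausible way a runtime could tag a block of counters
+ * (F is an assumed FILE*, Counters an assumed array of NumCounters values;
+ * the real record layout is defined by the profiling runtime, not here):
+ *
+ *   unsigned Tag = EdgeInfo;
+ *   fwrite(&Tag, sizeof(Tag), 1, F);
+ *   fwrite(&NumCounters, sizeof(unsigned), 1, F);
+ *   fwrite(Counters, sizeof(unsigned), NumCounters, F);
+ */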
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* LLVM_ANALYSIS_PROFILEDATATYPES_H */
diff --git a/include/llvm/Analysis/ProfileInfoTypes.h b/include/llvm/Analysis/ProfileInfoTypes.h
index 6b4ac85..45aab5b 100644
--- a/include/llvm/Analysis/ProfileInfoTypes.h
+++ b/include/llvm/Analysis/ProfileInfoTypes.h
@@ -27,15 +27,7 @@
ProfilingHash = 2
};
-enum ProfilingType {
- ArgumentInfo = 1, /* The command line argument block */
- FunctionInfo = 2, /* Function profiling information */
- BlockInfo = 3, /* Block profiling information */
- EdgeInfo = 4, /* Edge profiling information */
- PathInfo = 5, /* Path profiling information */
- BBTraceInfo = 6, /* Basic block trace information */
- OptEdgeInfo = 7 /* Edge profiling information, optimal version */
-};
+#include "llvm/Analysis/ProfileDataTypes.h"
/*
* The header for tables that map path numbers to path counters.
diff --git a/include/llvm/Analysis/RegionInfo.h b/include/llvm/Analysis/RegionInfo.h
index 188d11c..e62040e 100644
--- a/include/llvm/Analysis/RegionInfo.h
+++ b/include/llvm/Analysis/RegionInfo.h
@@ -473,27 +473,6 @@
const_iterator end() const { return children.end(); }
//@}
- /// @name BasicBlock Node Iterators
- ///
- /// These iterators iterate over all BasicBlock RegionNodes that are
- /// contained in this Region. The iterator also iterates over BasicBlock
- /// RegionNodes that are elements of a subregion of this Region. It is
- /// therefore called a flat iterator.
- //@{
- typedef df_iterator<RegionNode*, SmallPtrSet<RegionNode*, 8>, false,
- GraphTraits<FlatIt<RegionNode*> > > block_node_iterator;
-
- typedef df_iterator<const RegionNode*, SmallPtrSet<const RegionNode*, 8>,
- false, GraphTraits<FlatIt<const RegionNode*> > >
- const_block_node_iterator;
-
- block_node_iterator block_node_begin();
- block_node_iterator block_node_end();
-
- const_block_node_iterator block_node_begin() const;
- const_block_node_iterator block_node_end() const;
- //@}
-
/// @name BasicBlock Iterators
///
/// These iterators iterate over all BasicBlocks that are contained in this
diff --git a/include/llvm/Attributes.h b/include/llvm/Attributes.h
index 223aa00..0228d86 100644
--- a/include/llvm/Attributes.h
+++ b/include/llvm/Attributes.h
@@ -134,9 +134,6 @@
/// often, so lazy binding isn't
/// worthwhile.
DECLARE_LLVM_ATTRIBUTE(AddressSafety,1ULL<<32) ///< Address safety checking is on.
-DECLARE_LLVM_ATTRIBUTE(IANSDialect,1ULL<<33) ///< Inline asm non-standard dialect.
- /// When not set, ATT dialect assumed.
- /// When set implies the Intel dialect.
#undef DECLARE_LLVM_ATTRIBUTE
@@ -162,8 +159,7 @@
ReadOnly_i | NoInline_i | AlwaysInline_i | OptimizeForSize_i |
StackProtect_i | StackProtectReq_i | NoRedZone_i | NoImplicitFloat_i |
Naked_i | InlineHint_i | StackAlignment_i |
- UWTable_i | NonLazyBind_i | ReturnsTwice_i | AddressSafety_i |
- IANSDialect_i};
+ UWTable_i | NonLazyBind_i | ReturnsTwice_i | AddressSafety_i};
/// @brief Parameter attributes that do not apply to vararg call arguments.
const AttrConst VarArgsIncompatible = {StructRet_i};
diff --git a/include/llvm/Bitcode/Archive.h b/include/llvm/Bitcode/Archive.h
index 3c75e58..c1c3ec0 100644
--- a/include/llvm/Bitcode/Archive.h
+++ b/include/llvm/Bitcode/Archive.h
@@ -415,8 +415,8 @@
/// name will be truncated at 15 characters. If \p Compress is specified,
/// all archive members will be compressed before being written. If
/// \p PrintSymTab is true, the symbol table will be printed to std::cout.
- /// @returns true if an error occurred, \p error set to error message
- /// @returns false if the writing succeeded.
+ /// @returns true if an error occurred, \p error set to error message;
+ /// returns false if the writing succeeded.
/// @brief Write (possibly modified) archive contents to disk
bool writeToDisk(
bool CreateSymbolTable=false, ///< Create Symbol table
@@ -480,8 +480,8 @@
/// Writes one ArchiveMember to an ofstream. If an error occurs, returns
/// false, otherwise true. If an error occurs and error is non-null then
/// it will be set to an error message.
- /// @returns false Writing member succeeded
- /// @returns true Writing member failed, \p error set to error message
+ /// @returns false if writing member succeeded,
+ /// returns true if writing member failed, \p error set to error message.
bool writeMember(
const ArchiveMember& member, ///< The member to be written
std::ofstream& ARFile, ///< The file to write member onto
diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h
index a8c34cb..c1dc190 100644
--- a/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/include/llvm/Bitcode/LLVMBitCodes.h
@@ -161,11 +161,14 @@
CST_CODE_CE_INSERTELT = 15, // CE_INSERTELT: [opval, opval, opval]
CST_CODE_CE_SHUFFLEVEC = 16, // CE_SHUFFLEVEC: [opval, opval, opval]
CST_CODE_CE_CMP = 17, // CE_CMP: [opty, opval, opval, pred]
- CST_CODE_INLINEASM = 18, // INLINEASM: [sideeffect,asmstr,conststr]
+ CST_CODE_INLINEASM_OLD = 18, // INLINEASM: [sideeffect|alignstack,
+ // asmstr,conststr]
CST_CODE_CE_SHUFVEC_EX = 19, // SHUFVEC_EX: [opty, opval, opval, opval]
CST_CODE_CE_INBOUNDS_GEP = 20,// INBOUNDS_GEP: [n x operands]
CST_CODE_BLOCKADDRESS = 21, // CST_CODE_BLOCKADDRESS [fnty, fnval, bb#]
- CST_CODE_DATA = 22 // DATA: [n x elements]
+ CST_CODE_DATA = 22, // DATA: [n x elements]
+ CST_CODE_INLINEASM = 23 // INLINEASM: [sideeffect|alignstack|
+ // asmdialect,asmstr,conststr]
};
/// CastOpcodes - These are values used in the bitcode files to encode which
diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h
index 170a528..2920675 100644
--- a/include/llvm/CodeGen/AsmPrinter.h
+++ b/include/llvm/CodeGen/AsmPrinter.h
@@ -17,6 +17,7 @@
#define LLVM_CODEGEN_ASMPRINTER_H
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InlineAsm.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/ErrorHandling.h"
@@ -460,7 +461,8 @@
mutable unsigned SetCounter;
/// EmitInlineAsm - Emit a blob of inline asm to the output streamer.
- void EmitInlineAsm(StringRef Str, const MDNode *LocMDNode = 0) const;
+ void EmitInlineAsm(StringRef Str, const MDNode *LocMDNode = 0,
+ InlineAsm::AsmDialect AsmDialect = InlineAsm::AD_ATT) const;
/// EmitInlineAsm - This method formats and emits the specified machine
/// instruction that is an inline asm.
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index f387bd5..5d0a3b4 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -637,6 +637,10 @@
ATOMIC_LOAD_UMIN,
ATOMIC_LOAD_UMAX,
+ /// This corresponds to the llvm.lifetime.* intrinsics. The first operand
+ /// is the chain and the second operand is the alloca pointer.
+ LIFETIME_START, LIFETIME_END,
+
/// BUILTIN_OP_END - This must be the last enum value in this list.
/// The target-specific pre-isel opcode values start here.
BUILTIN_OP_END
diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h
index a3ce47c..5aeb1a8 100644
--- a/include/llvm/CodeGen/LiveInterval.h
+++ b/include/llvm/CodeGen/LiveInterval.h
@@ -29,6 +29,7 @@
#include <climits>
namespace llvm {
+ class CoalescerPair;
class LiveIntervals;
class MachineInstr;
class MachineRegisterInfo;
@@ -366,6 +367,14 @@
return overlapsFrom(other, other.begin());
}
+ /// overlaps - Return true if the two intervals have overlapping segments
+ /// that are not coalescable according to CP.
+ ///
+ /// Overlapping segments where one interval is defined by a coalescable
+ /// copy are allowed.
+ bool overlaps(const LiveInterval &Other, const CoalescerPair &CP,
+ const SlotIndexes&) const;
+
/// overlaps - Return true if the live interval overlaps a range specified
/// by [Start, End).
bool overlaps(SlotIndex Start, SlotIndex End) const;
diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h
index da521db..bf74690 100644
--- a/include/llvm/CodeGen/LiveIntervalAnalysis.h
+++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h
@@ -252,7 +252,7 @@
/// addKillFlags - Add kill flags to any instruction that kills a virtual
/// register.
- void addKillFlags();
+ void addKillFlags(const VirtRegMap*);
/// handleMove - call this method to notify LiveIntervals that
/// instruction 'mi' has been moved within a basic block. This will update
diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h
index 8b958e4..3c07ceb 100644
--- a/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/include/llvm/CodeGen/MachineFrameInfo.h
@@ -28,6 +28,7 @@
class MachineBasicBlock;
class TargetFrameLowering;
class BitVector;
+class Value;
/// The CalleeSavedInfo class tracks the information need to locate where a
/// callee saved register is in the current frame.
@@ -103,14 +104,18 @@
// protector.
bool MayNeedSP;
+ /// Alloca - If this stack object originated from an Alloca instruction,
+ /// this value saves the original IR allocation. Can be NULL.
+ const Value *Alloca;
+
// PreAllocated - If true, the object was mapped into the local frame
// block and doesn't need additional handling for allocation beyond that.
bool PreAllocated;
StackObject(uint64_t Sz, unsigned Al, int64_t SP, bool IM,
- bool isSS, bool NSP)
+ bool isSS, bool NSP, const Value *Val)
: SPOffset(SP), Size(Sz), Alignment(Al), isImmutable(IM),
- isSpillSlot(isSS), MayNeedSP(NSP), PreAllocated(false) {}
+ isSpillSlot(isSS), MayNeedSP(NSP), Alloca(Val), PreAllocated(false) {}
};
/// Objects - The list of stack objects allocated...
@@ -362,6 +367,14 @@
ensureMaxAlignment(Align);
}
+ /// getObjectAllocation - Return the underlying Alloca of the specified
+ /// stack object if it exists. Returns 0 if none exists.
+ const Value* getObjectAllocation(int ObjectIdx) const {
+ assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
+ "Invalid Object Idx!");
+ return Objects[ObjectIdx+NumFixedObjects].Alloca;
+ }
+
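+ /// Usage sketch (illustrative only; AI is an assumed AllocaInst* being
+ /// given a frame slot):
+ ///
+ ///   int FI = MFI.CreateStackObject(Size, Align, /*isSS=*/false,
+ ///                                  /*MayNeedSP=*/false, AI);
+ ///   const Value *V = MFI.getObjectAllocation(FI); // V == AI
+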
/// NeedsStackProtector - Returns true if the object may need stack
/// protectors.
bool MayNeedStackProtector(int ObjectIdx) const {
@@ -482,9 +495,10 @@
/// a nonnegative identifier to represent it.
///
int CreateStackObject(uint64_t Size, unsigned Alignment, bool isSS,
- bool MayNeedSP = false) {
+ bool MayNeedSP = false, const Value *Alloca = 0) {
assert(Size != 0 && "Cannot allocate zero size stack objects!");
- Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, MayNeedSP));
+ Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, MayNeedSP,
+ Alloca));
int Index = (int)Objects.size() - NumFixedObjects - 1;
assert(Index >= 0 && "Bad frame index!");
ensureMaxAlignment(Alignment);
@@ -516,7 +530,7 @@
///
int CreateVariableSizedObject(unsigned Alignment) {
HasVarSizedObjects = true;
- Objects.push_back(StackObject(0, Alignment, 0, false, false, true));
+ Objects.push_back(StackObject(0, Alignment, 0, false, false, true, 0));
ensureMaxAlignment(Alignment);
return (int)Objects.size()-NumFixedObjects-1;
}
diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h
index 062c750..0eb9d0e 100644
--- a/include/llvm/CodeGen/MachineFunction.h
+++ b/include/llvm/CodeGen/MachineFunction.h
@@ -138,15 +138,19 @@
MachineModuleInfo &getMMI() const { return MMI; }
GCModuleInfo *getGMI() const { return GMI; }
MCContext &getContext() const { return Ctx; }
-
+
/// getFunction - Return the LLVM function that this machine code represents
///
const Function *getFunction() const { return Fn; }
+ /// getName - Return the name of the corresponding LLVM function.
+ ///
+ StringRef getName() const;
+
/// getFunctionNumber - Return a unique ID for the current function.
///
unsigned getFunctionNumber() const { return FunctionNumber; }
-
+
/// getTarget - Return the target machine this machine code is compiled with
///
const TargetMachine &getTarget() const { return Target; }
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index 27756ab..4e1533a 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -25,6 +25,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/InlineAsm.h"
#include "llvm/Support/DebugLoc.h"
#include <vector>
@@ -610,6 +611,7 @@
bool isImplicitDef() const { return getOpcode()==TargetOpcode::IMPLICIT_DEF; }
bool isInlineAsm() const { return getOpcode() == TargetOpcode::INLINEASM; }
bool isStackAligningInlineAsm() const;
+ InlineAsm::AsmDialect getInlineAsmDialect() const;
bool isInsertSubreg() const {
return getOpcode() == TargetOpcode::INSERT_SUBREG;
}
@@ -782,16 +784,43 @@
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) const;
+ /// tieOperands - Add a tie between the register operands at DefIdx and
+ /// UseIdx. The tie will cause the register allocator to ensure that the two
+ /// operands are assigned the same physical register.
+ ///
+ /// Tied operands are managed automatically for explicit operands in the
+ /// MCInstrDesc. This method is for exceptional cases like inline asm.
+ void tieOperands(unsigned DefIdx, unsigned UseIdx);
+
+ /// findTiedOperandIdx - Given the index of a tied register operand, find the
+ /// operand it is tied to. Defs are tied to uses and vice versa. Returns the
+ /// index of the tied operand which must exist.
+ unsigned findTiedOperandIdx(unsigned OpIdx) const;
+
/// isRegTiedToUseOperand - Given the index of a register def operand,
/// check if the register def is tied to a source operand, due to either
/// two-address elimination or inline assembly constraints. Returns the
/// first tied use operand index by reference if UseOpIdx is not null.
- bool isRegTiedToUseOperand(unsigned DefOpIdx, unsigned *UseOpIdx = 0) const;
+ bool isRegTiedToUseOperand(unsigned DefOpIdx, unsigned *UseOpIdx = 0) const {
+ const MachineOperand &MO = getOperand(DefOpIdx);
+ if (!MO.isReg() || !MO.isDef() || !MO.isTied())
+ return false;
+ if (UseOpIdx)
+ *UseOpIdx = findTiedOperandIdx(DefOpIdx);
+ return true;
+ }
/// isRegTiedToDefOperand - Return true if the use operand of the specified
/// index is tied to a def operand. It also returns the def operand index by
/// reference if DefOpIdx is not null.
- bool isRegTiedToDefOperand(unsigned UseOpIdx, unsigned *DefOpIdx = 0) const;
+ bool isRegTiedToDefOperand(unsigned UseOpIdx, unsigned *DefOpIdx = 0) const {
+ const MachineOperand &MO = getOperand(UseOpIdx);
+ if (!MO.isReg() || !MO.isUse() || !MO.isTied())
+ return false;
+ if (DefOpIdx)
+ *DefOpIdx = findTiedOperandIdx(UseOpIdx);
+ return true;
+ }
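+
+ /// Usage sketch (illustrative only; MI is an assumed inline-asm
+ /// MachineInstr with a def at index 0 tied to a use):
+ ///
+ ///   unsigned UseIdx;
+ ///   if (MI->isRegTiedToUseOperand(0, &UseIdx))
+ ///     assert(MI->findTiedOperandIdx(UseIdx) == 0 && "ties are symmetric");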
/// clearKillInfo - Clears kill flags on all operands.
///
@@ -852,11 +881,11 @@
bool isSafeToReMat(const TargetInstrInfo *TII, AliasAnalysis *AA,
unsigned DstReg) const;
- /// hasVolatileMemoryRef - Return true if this instruction may have a
- /// volatile memory reference, or if the information describing the
- /// memory reference is not available. Return false if it is known to
- /// have no volatile memory references.
- bool hasVolatileMemoryRef() const;
+ /// hasOrderedMemoryRef - Return true if this instruction may have an ordered
+ /// or volatile memory reference, or if the information describing the memory
+ /// reference is not available. Return false if it is known to have no
+ /// ordered or volatile memory references.
+ bool hasOrderedMemoryRef() const;
/// isInvariantLoad - Return true if this instruction is loading from a
/// location whose value is invariant across the function. For example,
@@ -935,6 +964,15 @@
/// return null.
MachineRegisterInfo *getRegInfo();
+ /// untieRegOperand - Break any tie involving OpIdx.
+ void untieRegOperand(unsigned OpIdx) {
+ MachineOperand &MO = getOperand(OpIdx);
+ if (MO.isReg() && MO.isTied()) {
+ getOperand(findTiedOperandIdx(OpIdx)).TiedTo = 0;
+ MO.TiedTo = 0;
+ }
+ }
+
/// addImplicitDefUseOperands - Add all implicit def and use operands to
/// this instruction.
void addImplicitDefUseOperands();
diff --git a/include/llvm/CodeGen/MachineMemOperand.h b/include/llvm/CodeGen/MachineMemOperand.h
index 1ac9080..ddb1271 100644
--- a/include/llvm/CodeGen/MachineMemOperand.h
+++ b/include/llvm/CodeGen/MachineMemOperand.h
@@ -151,6 +151,15 @@
bool isNonTemporal() const { return Flags & MONonTemporal; }
bool isInvariant() const { return Flags & MOInvariant; }
+ /// isUnordered - Returns true if this memory operation doesn't have any
+ /// ordering constraints other than normal aliasing. Volatile and atomic
+ /// memory operations can't be reordered.
+ ///
+ /// Currently, we don't model the difference between volatile and atomic
+ /// operations. They should retain their ordering relative to all memory
+ /// operations.
+ bool isUnordered() const { return !isVolatile(); }
+
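+ /// Usage sketch (illustrative only): a pass deciding whether two memory
+ /// operations may be reordered could require both to be unordered:
+ ///
+ ///   bool CanReorder = MMOa->isUnordered() && MMOb->isUnordered() &&
+ ///                     !alias(MMOa, MMOb); // alias() is hypothetical here
+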
/// refineAlignment - Update this MachineMemOperand to reflect the alignment
/// of MMO, if it has a greater alignment. This must only be used when the
/// new alignment applies to all users of this MachineMemOperand.
diff --git a/include/llvm/CodeGen/MachineOperand.h b/include/llvm/CodeGen/MachineOperand.h
index 37d42b3..0b9d67f 100644
--- a/include/llvm/CodeGen/MachineOperand.h
+++ b/include/llvm/CodeGen/MachineOperand.h
@@ -14,7 +14,6 @@
#ifndef LLVM_CODEGEN_MACHINEOPERAND_H
#define LLVM_CODEGEN_MACHINEOPERAND_H
-#include "llvm/ADT/Hashing.h"
#include "llvm/Support/DataTypes.h"
#include <cassert>
@@ -30,6 +29,7 @@
class MDNode;
class TargetMachine;
class TargetRegisterInfo;
+class hash_code;
class raw_ostream;
class MCSymbol;
@@ -60,12 +60,20 @@
/// union.
unsigned char OpKind; // MachineOperandType
- /// SubReg - Subregister number, only valid for MO_Register. A value of 0
- /// indicates the MO_Register has no subReg.
- unsigned char SubReg;
+ // This union is discriminated by OpKind.
+ union {
+ /// SubReg - Subregister number, only valid for MO_Register. A value of 0
+ /// indicates the MO_Register has no subReg.
+ unsigned char SubReg;
- /// TargetFlags - This is a set of target-specific operand flags.
- unsigned char TargetFlags;
+ /// TargetFlags - This is a set of target-specific operand flags.
+ unsigned char TargetFlags;
+ };
+
+ /// TiedTo - Non-zero when this register operand is tied to another register
+ /// operand. The encoding of this field is described in the block comment
+ /// before MachineInstr::tieOperands().
+ unsigned char TiedTo : 4;
/// IsDef/IsImp/IsKill/IsDead flags - These are only valid for MO_Register
/// operands.
@@ -176,9 +184,17 @@
///
MachineOperandType getType() const { return (MachineOperandType)OpKind; }
- unsigned char getTargetFlags() const { return TargetFlags; }
- void setTargetFlags(unsigned char F) { TargetFlags = F; }
- void addTargetFlag(unsigned char F) { TargetFlags |= F; }
+ unsigned char getTargetFlags() const {
+ return isReg() ? 0 : TargetFlags;
+ }
+ void setTargetFlags(unsigned char F) {
+ assert(!isReg() && "Register operands can't have target flags");
+ TargetFlags = F;
+ }
+ void addTargetFlag(unsigned char F) {
+ assert(!isReg() && "Register operands can't have target flags");
+ TargetFlags |= F;
+ }
/// getParent - Return the instruction that this operand belongs to.
@@ -288,6 +304,11 @@
return IsEarlyClobber;
}
+ bool isTied() const {
+ assert(isReg() && "Wrong MachineOperand accessor");
+ return TiedTo;
+ }
+
bool isDebug() const {
assert(isReg() && "Wrong MachineOperand accessor");
return IsDebug;
@@ -421,7 +442,7 @@
int64_t getOffset() const {
assert((isGlobal() || isSymbol() || isCPI() || isTargetIndex() ||
isBlockAddress()) && "Wrong MachineOperand accessor");
- return (int64_t(Contents.OffsetedInfo.OffsetHi) << 32) |
+ return int64_t(uint64_t(Contents.OffsetedInfo.OffsetHi) << 32) |
SmallContents.OffsetLo;
}
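+ // Worked example for the cast above (assuming OffsetHi holds the signed
+ // high half as an int): for OffsetHi == -1 and OffsetLo == 0x10, shifting
+ // in uint64_t yields 0xFFFFFFFF00000010, the sign-correct 64-bit offset;
+ // left-shifting the negative int64_t directly is undefined behavior.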
@@ -548,6 +569,7 @@
Op.IsUndef = isUndef;
Op.IsInternalRead = isInternalRead;
Op.IsEarlyClobber = isEarlyClobber;
+ Op.TiedTo = 0;
Op.IsDebug = isDebug;
Op.SmallContents.RegNo = Reg;
Op.Contents.Reg.Prev = 0;
diff --git a/include/llvm/CodeGen/PBQP/HeuristicBase.h b/include/llvm/CodeGen/PBQP/HeuristicBase.h
index 3fee18c..0c1fcb7 100644
--- a/include/llvm/CodeGen/PBQP/HeuristicBase.h
+++ b/include/llvm/CodeGen/PBQP/HeuristicBase.h
@@ -113,7 +113,7 @@
}
/// \brief Add the given node to the list of nodes to be optimally reduced.
- /// @return nItr Node iterator to be added.
+ /// @param nItr Node iterator to be added.
///
/// You probably don't want to override this, except perhaps to record
/// statistics before calling this implementation. HeuristicBase relies on
@@ -193,8 +193,9 @@
/// reduce list.
/// @return True if a reduction takes place, false if the heuristic reduce
/// list is empty.
- void heuristicReduce() {
+ bool heuristicReduce() {
llvm_unreachable("Must be implemented in derived class.");
+ return false;
}
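+
+ /// Override sketch (illustrative only) for a derived heuristic:
+ ///
+ ///   bool heuristicReduce() {
+ ///     if (/* the heuristic reduce list is empty */)
+ ///       return false;
+ ///     // ...pick a node from the list and reduce it...
+ ///     return true;
+ ///   }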
/// \brief Prepare a change in the costs on the given edge.
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index 07b3b45..7bd5764 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -404,6 +404,10 @@
/// inserting cmov instructions.
extern char &EarlyIfConverterID;
+ /// StackColoring - This pass performs stack coloring and merging.
+ /// It merges disjoint allocas to reduce the stack size.
+ extern char &StackColoringID;
+
/// IfConverter - This pass performs machine code if conversion.
extern char &IfConverterID;
diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h
index 85ab47b..2567a65 100644
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -85,6 +85,8 @@
/// the value of the Latency field of the predecessor, however advanced
/// models may provide additional information about specific edges.
unsigned Latency;
+ /// Record MinLatency separately from "expected" Latency.
+ unsigned MinLatency;
public:
/// SDep - Construct a null SDep. This is only for use by container
@@ -96,7 +98,7 @@
SDep(SUnit *S, Kind kind, unsigned latency = 1, unsigned Reg = 0,
bool isNormalMemory = false, bool isMustAlias = false,
bool isArtificial = false)
- : Dep(S, kind), Contents(), Latency(latency) {
+ : Dep(S, kind), Contents(), Latency(latency), MinLatency(latency) {
switch (kind) {
case Anti:
case Output:
@@ -135,7 +137,8 @@
}
bool operator==(const SDep &Other) const {
- return overlaps(Other) && Latency == Other.Latency;
+ return overlaps(Other)
+ && Latency == Other.Latency && MinLatency == Other.MinLatency;
}
bool operator!=(const SDep &Other) const {
@@ -155,6 +158,18 @@
Latency = Lat;
}
+ /// getMinLatency - Return the minimum latency for this edge. Minimum
+ /// latency is used for scheduling groups, while normal (expected) latency
+ /// is for instruction cost and critical path.
+ unsigned getMinLatency() const {
+ return MinLatency;
+ }
+
+ /// setMinLatency - Set the minimum latency for this edge.
+ void setMinLatency(unsigned Lat) {
+ MinLatency = Lat;
+ }
+
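+ /// Usage sketch (illustrative only): a DAG builder that computes both
+ /// latencies for an edge might do:
+ ///
+ ///   SDep D(DefSU, SDep::Data, /*latency=*/ExpectedLat);
+ ///   D.setMinLatency(MinLat); // tighter bound for scheduling groups
+ ///   UseSU->addPred(D);       // assumes the usual SUnit::addPred interface
+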
/// getSUnit - Return the SUnit to which this edge points.
SUnit *getSUnit() const {
return Dep.getPointer();
diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 1bde942..8b52b5a 100644
--- a/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -108,6 +108,15 @@
}
};
+ /// Record a physical register access.
+ /// For non-data-dependent uses, OpIdx == -1.
+ struct PhysRegSUOper {
+ SUnit *SU;
+ int OpIdx;
+
+ PhysRegSUOper(SUnit *su, int op): SU(su), OpIdx(op) {}
+ };
+
/// Combine a SparseSet with a 1x1 vector to track physical registers.
/// The SparseSet allows iterating over the (few) live registers for quickly
/// comparing against a regmask or clearing the set.
@@ -116,7 +125,7 @@
/// cleared between scheduling regions without freeing unused entries.
class Reg2SUnitsMap {
SparseSet<unsigned> PhysRegSet;
- std::vector<std::vector<SUnit*> > SUnits;
+ std::vector<std::vector<PhysRegSUOper> > SUnits;
public:
typedef SparseSet<unsigned>::const_iterator const_iterator;
@@ -140,7 +149,7 @@
/// If this register is mapped, return its existing SUnits vector.
/// Otherwise map the register and return an empty SUnits vector.
- std::vector<SUnit *> &operator[](unsigned Reg) {
+ std::vector<PhysRegSUOper> &operator[](unsigned Reg) {
bool New = PhysRegSet.insert(Reg).second;
assert((!New || SUnits[Reg].empty()) && "stale SUnits vector");
(void)New;
@@ -288,16 +297,6 @@
///
virtual void computeLatency(SUnit *SU);
- /// computeOperandLatency - Return dependence edge latency using
- /// operand use/def information
- ///
- /// FindMin may be set to get the minimum vs. expected latency. Minimum
- /// latency is used for scheduling groups, while expected latency is for
- /// instruction cost and critical path.
- virtual unsigned computeOperandLatency(SUnit *Def, SUnit *Use,
- const SDep& dep,
- bool FindMin = false) const;
-
/// schedule - Order nodes according to selected style, filling
/// in the Sequence member.
///
@@ -319,7 +318,7 @@
protected:
void initSUnits();
- void addPhysRegDataDeps(SUnit *SU, const MachineOperand &MO);
+ void addPhysRegDataDeps(SUnit *SU, unsigned OperIdx);
void addPhysRegDeps(SUnit *SU, unsigned OperIdx);
void addVRegDefDeps(SUnit *SU, unsigned OperIdx);
void addVRegUseDeps(SUnit *SU, unsigned OperIdx);
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index db361ee..3bea2de 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -1011,11 +1011,6 @@
SubclassData |= SynchScope << 12;
assert(getOrdering() == Ordering && "Ordering encoding error!");
assert(getSynchScope() == SynchScope && "Synch-scope encoding error!");
-
- assert((readMem() || getOrdering() <= Monotonic) &&
- "Acquire/Release MachineMemOperand must be a load!");
- assert((writeMem() || getOrdering() <= Monotonic) &&
- "Acquire/Release MachineMemOperand must be a store!");
}
public:
@@ -1750,10 +1745,10 @@
class SDNodeIterator : public std::iterator<std::forward_iterator_tag,
SDNode, ptrdiff_t> {
- SDNode *Node;
+ const SDNode *Node;
unsigned Operand;
- SDNodeIterator(SDNode *N, unsigned Op) : Node(N), Operand(Op) {}
+ SDNodeIterator(const SDNode *N, unsigned Op) : Node(N), Operand(Op) {}
public:
bool operator==(const SDNodeIterator& x) const {
return Operand == x.Operand;
@@ -1784,8 +1779,8 @@
return Operand - Other.Operand;
}
- static SDNodeIterator begin(SDNode *N) { return SDNodeIterator(N, 0); }
- static SDNodeIterator end (SDNode *N) {
+ static SDNodeIterator begin(const SDNode *N) { return SDNodeIterator(N, 0); }
+ static SDNodeIterator end (const SDNode *N) {
return SDNodeIterator(N, N->getNumOperands());
}
diff --git a/include/llvm/Config/AsmParsers.def.in b/include/llvm/Config/AsmParsers.def.in
index 041af83..d636753 100644
--- a/include/llvm/Config/AsmParsers.def.in
+++ b/include/llvm/Config/AsmParsers.def.in
@@ -1,24 +1,24 @@
-//===- llvm/Config/AsmParsers.def - LLVM Assembly Parsers -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file enumerates all of the assembly-language parsers
-// supported by this build of LLVM. Clients of this file should define
-// the LLVM_ASM_PARSER macro to be a function-like macro with a
-// single parameter (the name of the target whose assembly can be
-// generated); including this file will then enumerate all of the
-// targets with assembly parsers.
-//
-// The set of targets supported by LLVM is generated at configuration
-// time, at which point this header is generated. Do not modify this
-// header directly.
-//
-//===----------------------------------------------------------------------===//
+/*===- llvm/Config/AsmParsers.def - LLVM Assembly Parsers -------*- C++ -*-===*\
+|* *|
+|* The LLVM Compiler Infrastructure *|
+|* *|
+|* This file is distributed under the University of Illinois Open Source *|
+|* License. See LICENSE.TXT for details. *|
+|* *|
+|*===----------------------------------------------------------------------===*|
+|* *|
+|* This file enumerates all of the assembly-language parsers *|
+|* supported by this build of LLVM. Clients of this file should define *|
+|* the LLVM_ASM_PARSER macro to be a function-like macro with a *|
+|* single parameter (the name of the target whose assembly can be *|
+|* generated); including this file will then enumerate all of the *|
+|* targets with assembly parsers. *|
+|* *|
+|* The set of targets supported by LLVM is generated at configuration *|
+|* time, at which point this header is generated. Do not modify this *|
+|* header directly. *|
+|* *|
+\*===----------------------------------------------------------------------===*/
#ifndef LLVM_ASM_PARSER
# error Please define the macro LLVM_ASM_PARSER(TargetName)
diff --git a/include/llvm/Config/AsmPrinters.def.in b/include/llvm/Config/AsmPrinters.def.in
index 9729bd7..f0152a4 100644
--- a/include/llvm/Config/AsmPrinters.def.in
+++ b/include/llvm/Config/AsmPrinters.def.in
@@ -1,24 +1,24 @@
-//===- llvm/Config/AsmPrinters.def - LLVM Assembly Printers -----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file enumerates all of the assembly-language printers
-// supported by this build of LLVM. Clients of this file should define
-// the LLVM_ASM_PRINTER macro to be a function-like macro with a
-// single parameter (the name of the target whose assembly can be
-// generated); including this file will then enumerate all of the
-// targets with assembly printers.
-//
-// The set of targets supported by LLVM is generated at configuration
-// time, at which point this header is generated. Do not modify this
-// header directly.
-//
-//===----------------------------------------------------------------------===//
+/*===- llvm/Config/AsmPrinters.def - LLVM Assembly Printers -----*- C++ -*-===*\
+|* *|
+|* The LLVM Compiler Infrastructure *|
+|* *|
+|* This file is distributed under the University of Illinois Open Source *|
+|* License. See LICENSE.TXT for details. *|
+|* *|
+|*===----------------------------------------------------------------------===*|
+|* *|
+|* This file enumerates all of the assembly-language printers *|
+|* supported by this build of LLVM. Clients of this file should define *|
+|* the LLVM_ASM_PRINTER macro to be a function-like macro with a *|
+|* single parameter (the name of the target whose assembly can be *|
+|* generated); including this file will then enumerate all of the *|
+|* targets with assembly printers. *|
+|* *|
+|* The set of targets supported by LLVM is generated at configuration *|
+|* time, at which point this header is generated. Do not modify this *|
+|* header directly. *|
+|* *|
+\*===----------------------------------------------------------------------===*/
#ifndef LLVM_ASM_PRINTER
# error Please define the macro LLVM_ASM_PRINTER(TargetName)
diff --git a/include/llvm/Config/Disassemblers.def.in b/include/llvm/Config/Disassemblers.def.in
index 1e6281d..d3a9bbd 100644
--- a/include/llvm/Config/Disassemblers.def.in
+++ b/include/llvm/Config/Disassemblers.def.in
@@ -1,24 +1,24 @@
-//===- llvm/Config/Disassemblers.def - LLVM Assembly Parsers ----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file enumerates all of the assembly-language parsers
-// supported by this build of LLVM. Clients of this file should define
-// the LLVM_DISASSEMBLER macro to be a function-like macro with a
-// single parameter (the name of the target whose assembly can be
-// generated); including this file will then enumerate all of the
-// targets with assembly parsers.
-//
-// The set of targets supported by LLVM is generated at configuration
-// time, at which point this header is generated. Do not modify this
-// header directly.
-//
-//===----------------------------------------------------------------------===//
+/*===- llvm/Config/Disassemblers.def - LLVM Assembly Parsers ----*- C++ -*-===*\
+|* *|
+|* The LLVM Compiler Infrastructure *|
+|* *|
+|* This file is distributed under the University of Illinois Open Source *|
+|* License. See LICENSE.TXT for details. *|
+|* *|
+|*===----------------------------------------------------------------------===*|
+|* *|
+|* This file enumerates all of the assembly-language parsers *|
+|* supported by this build of LLVM. Clients of this file should define *|
+|* the LLVM_DISASSEMBLER macro to be a function-like macro with a *|
+|* single parameter (the name of the target whose assembly can be *|
+|* generated); including this file will then enumerate all of the *|
+|* targets with assembly parsers. *|
+|* *|
+|* The set of targets supported by LLVM is generated at configuration *|
+|* time, at which point this header is generated. Do not modify this *|
+|* header directly. *|
+|* *|
+\*===----------------------------------------------------------------------===*/
#ifndef LLVM_DISASSEMBLER
# error Please define the macro LLVM_DISASSEMBLER(TargetName)
diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake
index b912251..eb20b64 100644
--- a/include/llvm/Config/config.h.cmake
+++ b/include/llvm/Config/config.h.cmake
@@ -51,7 +51,7 @@
#cmakedefine HAVE_ASSERT_H ${HAVE_ASSERT_H}
/* Define to 1 if you have the `backtrace' function. */
-#undef HAVE_BACKTRACE
+#cmakedefine HAVE_BACKTRACE ${HAVE_BACKTRACE}
/* Define to 1 if you have the `bcopy' function. */
#undef HAVE_BCOPY
diff --git a/include/llvm/DIBuilder.h b/include/llvm/DIBuilder.h
index 2ed48a9..dd4ea96 100644
--- a/include/llvm/DIBuilder.h
+++ b/include/llvm/DIBuilder.h
@@ -179,8 +179,10 @@
/// @param Ty Parent type.
/// @param PropertyName Name of the Objective C property associated with
/// this ivar.
- /// @param GetterName Name of the Objective C property getter selector.
- /// @param SetterName Name of the Objective C property setter selector.
+ /// @param PropertyGetterName Name of the Objective C property getter
+ /// selector.
+ /// @param PropertySetterName Name of the Objective C property setter
+ /// selector.
/// @param PropertyAttributes Objective C property attributes.
DIType createObjCIVar(StringRef Name, DIFile File,
unsigned LineNo, uint64_t SizeInBits,
@@ -201,7 +203,7 @@
/// @param OffsetInBits Member offset.
/// @param Flags Flags to encode member attribute, e.g. private
/// @param Ty Parent type.
- /// @param Property Property associated with this ivar.
+ /// @param PropertyNode Property associated with this ivar.
DIType createObjCIVar(StringRef Name, DIFile File,
unsigned LineNo, uint64_t SizeInBits,
uint64_t AlignInBits, uint64_t OffsetInBits,
@@ -228,7 +230,7 @@
/// @param Scope Scope in which this class is defined.
/// @param Name class name.
/// @param File File where this member is defined.
- /// @param LineNo Line number.
+ /// @param LineNumber Line number.
/// @param SizeInBits Member size.
/// @param AlignInBits Member alignment.
/// @param OffsetInBits Member offset.
@@ -250,7 +252,7 @@
/// @param Scope Scope in which this struct is defined.
/// @param Name Struct name.
/// @param File File where this member is defined.
- /// @param LineNo Line number.
+ /// @param LineNumber Line number.
/// @param SizeInBits Member size.
/// @param AlignInBits Member alignment.
/// @param Flags Flags to encode member attribute, e.g. private
@@ -265,7 +267,7 @@
/// @param Scope Scope in which this union is defined.
/// @param Name Union name.
/// @param File File where this member is defined.
- /// @param LineNo Line number.
+ /// @param LineNumber Line number.
/// @param SizeInBits Member size.
/// @param AlignInBits Member alignment.
/// @param Flags Flags to encode member attribute, e.g. private
@@ -325,7 +327,7 @@
/// @param Scope Scope in which this enumeration is defined.
/// @param Name Union name.
/// @param File File where this member is defined.
- /// @param LineNo Line number.
+ /// @param LineNumber Line number.
/// @param SizeInBits Member size.
/// @param AlignInBits Member alignment.
/// @param Elements Enumeration elements.
@@ -337,9 +339,9 @@
unsigned Flags);
/// createSubroutineType - Create subroutine type.
- /// @param File File in which this subroutine is defined.
- /// @param ParamterTypes An array of subroutine parameter types. This
- /// includes return type at 0th index.
+ /// @param File File in which this subroutine is defined.
+ /// @param ParameterTypes An array of subroutine parameter types. This
+ /// includes return type at 0th index.
DIType createSubroutineType(DIFile File, DIArray ParameterTypes);
/// createArtificialType - Create a new DIType with "artificial" flag set.
@@ -383,9 +385,9 @@
/// createStaticVariable - Create a new descriptor for the specified
/// variable.
- /// @param Conext Variable scope.
+ /// @param Context Variable scope.
/// @param Name Name of the variable.
- /// @param LinakgeName Mangled name of the variable.
+ /// @param LinkageName Mangled name of the variable.
/// @param File File where this variable is defined.
/// @param LineNo Line number.
/// @param Ty Variable Type.
@@ -426,7 +428,7 @@
/// DW_TAG_arg_variable.
/// @param Scope Variable scope.
/// @param Name Variable name.
- /// @param File File where this variable is defined.
+ /// @param F File where this variable is defined.
/// @param LineNo Line number.
/// @param Ty Variable Type
/// @param Addr An array of complex address operations.
diff --git a/include/llvm/DebugInfo/DIContext.h b/include/llvm/DebugInfo/DIContext.h
index cfdeb46..8d6054a 100644
--- a/include/llvm/DebugInfo/DIContext.h
+++ b/include/llvm/DebugInfo/DIContext.h
@@ -15,6 +15,7 @@
#ifndef LLVM_DEBUGINFO_DICONTEXT_H
#define LLVM_DEBUGINFO_DICONTEXT_H
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataTypes.h"
@@ -54,6 +55,23 @@
}
};
+/// DIInliningInfo - a format-neutral container for inlined code description.
+class DIInliningInfo {
+ SmallVector<DILineInfo, 4> Frames;
+ public:
+ DIInliningInfo() {}
+ DILineInfo getFrame(unsigned Index) const {
+ assert(Index < Frames.size());
+ return Frames[Index];
+ }
+ uint32_t getNumberOfFrames() const {
+ return Frames.size();
+ }
+ void addFrame(const DILineInfo &Frame) {
+ Frames.push_back(Frame);
+ }
+};
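+
+/// Usage sketch (illustrative only; Ctx is an assumed DIContext* and PC an
+/// address inside inlined code):
+///
+///   DIInliningInfo II = Ctx->getInliningInfoForAddress(PC);
+///   for (uint32_t i = 0, e = II.getNumberOfFrames(); i != e; ++i) {
+///     DILineInfo Frame = II.getFrame(i);
+///     // ...report Frame...
+///   }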
+
/// DILineInfoSpecifier - controls which fields of DILineInfo container
/// should be filled with data.
class DILineInfoSpecifier {
@@ -81,12 +99,15 @@
StringRef abbrevSection,
StringRef aRangeSection = StringRef(),
StringRef lineSection = StringRef(),
- StringRef stringSection = StringRef());
+ StringRef stringSection = StringRef(),
+ StringRef rangeSection = StringRef());
virtual void dump(raw_ostream &OS) = 0;
- virtual DILineInfo getLineInfoForAddress(uint64_t address,
- DILineInfoSpecifier specifier = DILineInfoSpecifier()) = 0;
+ virtual DILineInfo getLineInfoForAddress(uint64_t Address,
+ DILineInfoSpecifier Specifier = DILineInfoSpecifier()) = 0;
+ virtual DIInliningInfo getInliningInfoForAddress(uint64_t Address,
+ DILineInfoSpecifier Specifier = DILineInfoSpecifier()) = 0;
};
}
diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h
index a5c9272..9e5ad2f 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyld.h
@@ -73,6 +73,10 @@
/// and resolve relocations based on where they put it).
void *getSymbolAddress(StringRef Name);
+ /// Get the address of the target copy of the symbol. This is the address
+ /// used for relocation.
+ uint64_t getSymbolLoadAddress(StringRef Name);
+
/// Resolve the relocations for all symbols we currently know about.
void resolveRelocations();
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index de97957..bede4eb 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -141,6 +141,7 @@
void initializeLiveStacksPass(PassRegistry&);
void initializeLiveVariablesPass(PassRegistry&);
void initializeLoaderPassPass(PassRegistry&);
+void initializeProfileMetadataLoaderPassPass(PassRegistry&);
void initializePathProfileLoaderPassPass(PassRegistry&);
void initializeLocalStackSlotPassPass(PassRegistry&);
void initializeLoopDeletionPass(PassRegistry&);
@@ -231,6 +232,7 @@
void initializeSlotIndexesPass(PassRegistry&);
void initializeSpillPlacementPass(PassRegistry&);
void initializeStackProtectorPass(PassRegistry&);
+void initializeStackColoringPass(PassRegistry&);
void initializeStackSlotColoringPass(PassRegistry&);
void initializeStripDeadDebugInfoPass(PassRegistry&);
void initializeStripDeadPrototypesPassPass(PassRegistry&);
diff --git a/include/llvm/InlineAsm.h b/include/llvm/InlineAsm.h
index 37aa18b..a0aecfd 100644
--- a/include/llvm/InlineAsm.h
+++ b/include/llvm/InlineAsm.h
@@ -33,6 +33,13 @@
struct ConstantCreator;
class InlineAsm : public Value {
+public:
+ enum AsmDialect {
+ AD_ATT,
+ AD_Intel
+ };
+
+private:
friend struct ConstantCreator<InlineAsm, PointerType, InlineAsmKeyType>;
friend class ConstantUniqueMap<InlineAsmKeyType, const InlineAsmKeyType&,
PointerType, InlineAsm, false>;
@@ -43,10 +50,11 @@
std::string AsmString, Constraints;
bool HasSideEffects;
bool IsAlignStack;
-
+ AsmDialect Dialect;
+
InlineAsm(PointerType *Ty, const std::string &AsmString,
const std::string &Constraints, bool hasSideEffects,
- bool isAlignStack);
+ bool isAlignStack, AsmDialect asmDialect);
virtual ~InlineAsm();
/// When the ConstantUniqueMap merges two types and makes two InlineAsms
@@ -58,11 +66,13 @@
///
static InlineAsm *get(FunctionType *Ty, StringRef AsmString,
StringRef Constraints, bool hasSideEffects,
- bool isAlignStack = false);
+ bool isAlignStack = false,
+ AsmDialect asmDialect = AD_ATT);
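+
+ /// Usage sketch (illustrative only; FTy is an assumed FunctionType*):
+ ///
+ ///   InlineAsm *IA = InlineAsm::get(FTy, "nop", "",
+ ///                                  /*hasSideEffects=*/true,
+ ///                                  /*isAlignStack=*/false, AD_Intel);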
bool hasSideEffects() const { return HasSideEffects; }
bool isAlignStack() const { return IsAlignStack; }
-
+ AsmDialect getDialect() const { return Dialect; }
+
/// getType - InlineAsm's are always pointers.
///
PointerType *getType() const {
@@ -193,17 +203,18 @@
Op_InputChain = 0,
Op_AsmString = 1,
Op_MDNode = 2,
- Op_ExtraInfo = 3, // HasSideEffects, IsAlignStack
+ Op_ExtraInfo = 3, // HasSideEffects, IsAlignStack, AsmDialect.
Op_FirstOperand = 4,
// Fixed operands on an INLINEASM MachineInstr.
MIOp_AsmString = 0,
- MIOp_ExtraInfo = 1, // HasSideEffects, IsAlignStack
+ MIOp_ExtraInfo = 1, // HasSideEffects, IsAlignStack, AsmDialect.
MIOp_FirstOperand = 2,
// Interpretation of the MIOp_ExtraInfo bit field.
Extra_HasSideEffects = 1,
Extra_IsAlignStack = 2,
+ Extra_AsmDialect = 4,
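+
+ // Decoding sketch (illustrative only) for the three flags above, given an
+ // INLINEASM MachineInstr MI:
+ //   unsigned EI = MI->getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ //   bool HasSideEffects = EI & InlineAsm::Extra_HasSideEffects;
+ //   bool IsAlignStack = EI & InlineAsm::Extra_IsAlignStack;
+ //   AsmDialect Dialect = (EI & InlineAsm::Extra_AsmDialect) ? AD_Intel
+ //                                                           : AD_ATT;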
// Inline asm operands map to multiple SDNode / MachineInstr operands.
// The first operand is an immediate describing the asm operand, the low
diff --git a/include/llvm/InstrTypes.h b/include/llvm/InstrTypes.h
index 2529f24..6291a6d 100644
--- a/include/llvm/InstrTypes.h
+++ b/include/llvm/InstrTypes.h
@@ -581,8 +581,8 @@
/// Determine how a pair of casts can be eliminated, if they can be at all.
/// This is a helper function for both CastInst and ConstantExpr.
- /// @returns 0 if the CastInst pair can't be eliminated
- /// @returns Instruction::CastOps value for a cast that can replace
+ /// @returns 0 if the CastInst pair can't be eliminated, otherwise
+ /// returns Instruction::CastOps value for a cast that can replace
/// the pair, casting SrcTy to DstTy.
/// @brief Determine if a cast pair is eliminable
static unsigned isEliminableCastPair(
diff --git a/include/llvm/IntrinsicsMips.td b/include/llvm/IntrinsicsMips.td
index 4375ac2..e40e162 100644
--- a/include/llvm/IntrinsicsMips.td
+++ b/include/llvm/IntrinsicsMips.td
@@ -14,11 +14,15 @@
//===----------------------------------------------------------------------===//
// MIPS DSP data types
def mips_v2q15_ty: LLVMType<v2i16>;
+def mips_v4q7_ty: LLVMType<v4i8>;
def mips_q31_ty: LLVMType<i32>;
let TargetPrefix = "mips" in { // All intrinsics start with "llvm.mips.".
//===----------------------------------------------------------------------===//
+// MIPS DSP Rev 1
+
+//===----------------------------------------------------------------------===//
// Addition/subtraction
def int_mips_addu_qb : GCCBuiltin<"__builtin_mips_addu_qb">,
@@ -261,4 +265,125 @@
Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadArgMem]>;
def int_mips_lwx: GCCBuiltin<"__builtin_mips_lwx">,
Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadArgMem]>;
+
+//===----------------------------------------------------------------------===//
+// MIPS DSP Rev 2
+
+def int_mips_absq_s_qb: GCCBuiltin<"__builtin_mips_absq_s_qb">,
+ Intrinsic<[mips_v4q7_ty], [mips_v4q7_ty], []>;
+
+def int_mips_addqh_ph: GCCBuiltin<"__builtin_mips_addqh_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty],
+ [IntrNoMem, Commutative]>;
+def int_mips_addqh_r_ph: GCCBuiltin<"__builtin_mips_addqh_r_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty],
+ [IntrNoMem, Commutative]>;
+def int_mips_addqh_w: GCCBuiltin<"__builtin_mips_addqh_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty],
+ [IntrNoMem, Commutative]>;
+def int_mips_addqh_r_w: GCCBuiltin<"__builtin_mips_addqh_r_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty],
+ [IntrNoMem, Commutative]>;
+
+def int_mips_addu_ph: GCCBuiltin<"__builtin_mips_addu_ph">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], [Commutative]>;
+def int_mips_addu_s_ph: GCCBuiltin<"__builtin_mips_addu_s_ph">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], [Commutative]>;
+
+def int_mips_adduh_qb: GCCBuiltin<"__builtin_mips_adduh_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty],
+ [IntrNoMem, Commutative]>;
+def int_mips_adduh_r_qb: GCCBuiltin<"__builtin_mips_adduh_r_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty],
+ [IntrNoMem, Commutative]>;
+
+def int_mips_append: GCCBuiltin<"__builtin_mips_append">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+def int_mips_balign: GCCBuiltin<"__builtin_mips_balign">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+
+def int_mips_cmpgdu_eq_qb: GCCBuiltin<"__builtin_mips_cmpgdu_eq_qb">,
+ Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>;
+def int_mips_cmpgdu_lt_qb: GCCBuiltin<"__builtin_mips_cmpgdu_lt_qb">,
+ Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>;
+def int_mips_cmpgdu_le_qb: GCCBuiltin<"__builtin_mips_cmpgdu_le_qb">,
+ Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>;
+
+def int_mips_dpa_w_ph: GCCBuiltin<"__builtin_mips_dpa_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v2i16_ty, llvm_v2i16_ty],
+ [IntrNoMem]>;
+def int_mips_dps_w_ph: GCCBuiltin<"__builtin_mips_dps_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v2i16_ty, llvm_v2i16_ty],
+ [IntrNoMem]>;
+
+def int_mips_dpaqx_s_w_ph: GCCBuiltin<"__builtin_mips_dpaqx_s_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>;
+def int_mips_dpaqx_sa_w_ph: GCCBuiltin<"__builtin_mips_dpaqx_sa_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>;
+def int_mips_dpax_w_ph: GCCBuiltin<"__builtin_mips_dpax_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v2i16_ty, llvm_v2i16_ty],
+ [IntrNoMem]>;
+def int_mips_dpsx_w_ph: GCCBuiltin<"__builtin_mips_dpsx_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v2i16_ty, llvm_v2i16_ty],
+ [IntrNoMem]>;
+def int_mips_dpsqx_s_w_ph: GCCBuiltin<"__builtin_mips_dpsqx_s_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>;
+def int_mips_dpsqx_sa_w_ph: GCCBuiltin<"__builtin_mips_dpsqx_sa_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>;
+
+def int_mips_mul_ph: GCCBuiltin<"__builtin_mips_mul_ph">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], [Commutative]>;
+def int_mips_mul_s_ph: GCCBuiltin<"__builtin_mips_mul_s_ph">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], [Commutative]>;
+
+def int_mips_mulq_rs_w: GCCBuiltin<"__builtin_mips_mulq_rs_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [Commutative]>;
+def int_mips_mulq_s_ph: GCCBuiltin<"__builtin_mips_mulq_s_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [Commutative]>;
+def int_mips_mulq_s_w: GCCBuiltin<"__builtin_mips_mulq_s_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [Commutative]>;
+def int_mips_mulsa_w_ph: GCCBuiltin<"__builtin_mips_mulsa_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v2i16_ty, llvm_v2i16_ty],
+ [IntrNoMem]>;
+
+def int_mips_precr_qb_ph: GCCBuiltin<"__builtin_mips_precr_qb_ph">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v2i16_ty, llvm_v2i16_ty], []>;
+def int_mips_precr_sra_ph_w: GCCBuiltin<"__builtin_mips_precr_sra_ph_w">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+def int_mips_precr_sra_r_ph_w: GCCBuiltin<"__builtin_mips_precr_sra_r_ph_w">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+
+def int_mips_prepend: GCCBuiltin<"__builtin_mips_prepend">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+
+def int_mips_shra_qb: GCCBuiltin<"__builtin_mips_shra_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_shra_r_qb: GCCBuiltin<"__builtin_mips_shra_r_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_shrl_ph: GCCBuiltin<"__builtin_mips_shrl_ph">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_subqh_ph: GCCBuiltin<"__builtin_mips_subqh_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrNoMem]>;
+def int_mips_subqh_r_ph: GCCBuiltin<"__builtin_mips_subqh_r_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrNoMem]>;
+def int_mips_subqh_w: GCCBuiltin<"__builtin_mips_subqh_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [IntrNoMem]>;
+def int_mips_subqh_r_w: GCCBuiltin<"__builtin_mips_subqh_r_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [IntrNoMem]>;
+
+def int_mips_subu_ph: GCCBuiltin<"__builtin_mips_subu_ph">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], []>;
+def int_mips_subu_s_ph: GCCBuiltin<"__builtin_mips_subu_s_ph">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], []>;
+
+def int_mips_subuh_qb: GCCBuiltin<"__builtin_mips_subuh_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem]>;
+def int_mips_subuh_r_qb: GCCBuiltin<"__builtin_mips_subuh_r_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem]>;
}
diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td
index e8039f2..5ff0856 100644
--- a/include/llvm/IntrinsicsX86.td
+++ b/include/llvm/IntrinsicsX86.td
@@ -219,7 +219,7 @@
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_sse_storeu_ps : GCCBuiltin<"__builtin_ia32_storeups">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v4f32_ty], []>;
+ llvm_v4f32_ty], [IntrReadWriteArgMem]>;
}
// Cacheability support ops
@@ -502,13 +502,13 @@
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_sse2_storeu_pd : GCCBuiltin<"__builtin_ia32_storeupd">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v2f64_ty], []>;
+ llvm_v2f64_ty], [IntrReadWriteArgMem]>;
def int_x86_sse2_storeu_dq : GCCBuiltin<"__builtin_ia32_storedqu">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v16i8_ty], []>;
+ llvm_v16i8_ty], [IntrReadWriteArgMem]>;
def int_x86_sse2_storel_dq : GCCBuiltin<"__builtin_ia32_storelv4si">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v4i32_ty], []>;
+ llvm_v4i32_ty], [IntrReadWriteArgMem]>;
}
// Misc.
@@ -1270,19 +1270,19 @@
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_vbroadcast_ss :
GCCBuiltin<"__builtin_ia32_vbroadcastss">,
- Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_x86_avx_vbroadcast_sd_256 :
GCCBuiltin<"__builtin_ia32_vbroadcastsd256">,
- Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_x86_avx_vbroadcast_ss_256 :
GCCBuiltin<"__builtin_ia32_vbroadcastss256">,
- Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_x86_avx_vbroadcastf128_pd_256 :
GCCBuiltin<"__builtin_ia32_vbroadcastf128_pd256">,
- Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_x86_avx_vbroadcastf128_ps_256 :
GCCBuiltin<"__builtin_ia32_vbroadcastf128_ps256">,
- Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
}
// SIMD load ops
@@ -1294,41 +1294,45 @@
// SIMD store ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_storeu_pd_256 : GCCBuiltin<"__builtin_ia32_storeupd256">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty], []>;
+ Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty], [IntrReadWriteArgMem]>;
def int_x86_avx_storeu_ps_256 : GCCBuiltin<"__builtin_ia32_storeups256">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty], []>;
+ Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty], [IntrReadWriteArgMem]>;
def int_x86_avx_storeu_dq_256 : GCCBuiltin<"__builtin_ia32_storedqu256">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v32i8_ty], []>;
+ Intrinsic<[], [llvm_ptr_ty, llvm_v32i8_ty], [IntrReadWriteArgMem]>;
}
// Conditional load ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_maskload_pd : GCCBuiltin<"__builtin_ia32_maskloadpd">,
- Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty, llvm_v2f64_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty, llvm_v2f64_ty],
+ [IntrReadArgMem]>;
def int_x86_avx_maskload_ps : GCCBuiltin<"__builtin_ia32_maskloadps">,
- Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty, llvm_v4f32_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty, llvm_v4f32_ty],
+ [IntrReadArgMem]>;
def int_x86_avx_maskload_pd_256 : GCCBuiltin<"__builtin_ia32_maskloadpd256">,
- Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty, llvm_v4f64_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty, llvm_v4f64_ty],
+ [IntrReadArgMem]>;
def int_x86_avx_maskload_ps_256 : GCCBuiltin<"__builtin_ia32_maskloadps256">,
- Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8f32_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8f32_ty],
+ [IntrReadArgMem]>;
}
// Conditional store ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_maskstore_pd : GCCBuiltin<"__builtin_ia32_maskstorepd">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v2f64_ty, llvm_v2f64_ty], []>;
+ llvm_v2f64_ty, llvm_v2f64_ty], [IntrReadWriteArgMem]>;
def int_x86_avx_maskstore_ps : GCCBuiltin<"__builtin_ia32_maskstoreps">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v4f32_ty, llvm_v4f32_ty], []>;
+ llvm_v4f32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>;
def int_x86_avx_maskstore_pd_256 :
GCCBuiltin<"__builtin_ia32_maskstorepd256">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v4f64_ty, llvm_v4f64_ty], []>;
+ llvm_v4f64_ty, llvm_v4f64_ty], [IntrReadWriteArgMem]>;
def int_x86_avx_maskstore_ps_256 :
GCCBuiltin<"__builtin_ia32_maskstoreps256">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v8f32_ty, llvm_v8f32_ty], []>;
+ llvm_v8f32_ty, llvm_v8f32_ty], [IntrReadWriteArgMem]>;
}
//===----------------------------------------------------------------------===//
@@ -1632,7 +1636,7 @@
Intrinsic<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_x86_avx2_vbroadcasti128 :
GCCBuiltin<"__builtin_ia32_vbroadcastsi256">,
- Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_x86_avx2_pbroadcastb_128 :
GCCBuiltin<"__builtin_ia32_pbroadcastb128">,
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
@@ -1685,27 +1689,35 @@
// Conditional load ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_maskload_d : GCCBuiltin<"__builtin_ia32_maskloadd">,
- Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_v4i32_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_v4i32_ty],
+ [IntrReadArgMem]>;
def int_x86_avx2_maskload_q : GCCBuiltin<"__builtin_ia32_maskloadq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_v2i64_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_v2i64_ty],
+ [IntrReadArgMem]>;
def int_x86_avx2_maskload_d_256 : GCCBuiltin<"__builtin_ia32_maskloadd256">,
- Intrinsic<[llvm_v8i32_ty], [llvm_ptr_ty, llvm_v8i32_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v8i32_ty], [llvm_ptr_ty, llvm_v8i32_ty],
+ [IntrReadArgMem]>;
def int_x86_avx2_maskload_q_256 : GCCBuiltin<"__builtin_ia32_maskloadq256">,
- Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty],
+ [IntrReadArgMem]>;
}
// Conditional store ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_maskstore_d : GCCBuiltin<"__builtin_ia32_maskstored">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i32_ty], []>;
+ Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [IntrReadWriteArgMem]>;
def int_x86_avx2_maskstore_q : GCCBuiltin<"__builtin_ia32_maskstoreq">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty], []>;
+ Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty],
+ [IntrReadWriteArgMem]>;
def int_x86_avx2_maskstore_d_256 :
GCCBuiltin<"__builtin_ia32_maskstored256">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty], []>;
+ Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [IntrReadWriteArgMem]>;
def int_x86_avx2_maskstore_q_256 :
GCCBuiltin<"__builtin_ia32_maskstoreq256">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty], []>;
+ Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty],
+ [IntrReadWriteArgMem]>;
}
// Variable bit shift ops
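The practical payoff of these attribute changes: with an empty attribute list, LLVM had to treat each store intrinsic as potentially clobbering arbitrary memory, whereas IntrReadWriteArgMem/IntrReadArgMem tell the optimizer the intrinsic only touches memory reachable through its pointer argument. A minimal sketch of the kind of code that benefits (illustrative only; not part of this patch):

```cpp
#include <immintrin.h>

// With llvm.x86.sse.storeu.ps marked IntrReadWriteArgMem, alias analysis
// can reason about the intrinsic against other memory operations instead
// of treating it as a clobber of all memory, enabling reordering and dead
// store elimination around it when the pointers are provably distinct.
void store_vec_and_flag(float *dst, __m128 v, int *done) {
  _mm_storeu_ps(dst, v); // lowers to the llvm.x86.sse.storeu.ps intrinsic
  *done = 1;             // independent store, now analyzable separately
}
```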
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 697c94c..fe4c92a 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -107,6 +107,7 @@
(void) llvm::createProfileVerifierPass();
(void) llvm::createPathProfileVerifierPass();
(void) llvm::createProfileLoaderPass();
+ (void) llvm::createProfileMetadataLoaderPass();
(void) llvm::createPathProfileLoaderPass();
(void) llvm::createPromoteMemoryToRegisterPass();
(void) llvm::createDemoteRegisterToMemoryPass();
diff --git a/include/llvm/MC/MCAsmBackend.h b/include/llvm/MC/MCAsmBackend.h
index 05e6286..1c45090 100644
--- a/include/llvm/MC/MCAsmBackend.h
+++ b/include/llvm/MC/MCAsmBackend.h
@@ -30,8 +30,8 @@
/// MCAsmBackend - Generic interface to target specific assembler backends.
class MCAsmBackend {
- MCAsmBackend(const MCAsmBackend &); // DO NOT IMPLEMENT
- void operator=(const MCAsmBackend &); // DO NOT IMPLEMENT
+ MCAsmBackend(const MCAsmBackend &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCAsmBackend &) LLVM_DELETED_FUNCTION;
protected: // Can only create subclasses.
MCAsmBackend();
@@ -133,6 +133,13 @@
/// @}
+ /// getMinimumNopSize - Returns the minimum size of a nop in bytes on this
+ /// target. The assembler will use this to emit excess padding in situations
+ /// where the padding required for simple alignment would be less than the
+ /// minimum nop size.
+ ///
+ virtual unsigned getMinimumNopSize() const { return 1; }
+
/// writeNopData - Write an (optimal) nop sequence of Count bytes to the given
/// output. If the target cannot generate such a sequence, it should return an
/// error.
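A hedged sketch of how a backend might use the new hook; the target and its nop encoding are hypothetical, and only getMinimumNopSize and the writeNopData signature come from this header:

```cpp
// Hypothetical backend whose shortest encodable nop is one 4-byte word;
// getMinimumNopSize() lets the assembler round padding up accordingly.
class FixedWidthAsmBackend : public MCAsmBackend {
public:
  virtual unsigned getMinimumNopSize() const { return 4; }
  virtual bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
    if (Count % 4 != 0)
      return false;              // cannot encode sub-word padding
    for (uint64_t i = 0; i != Count; i += 4)
      OW->Write32(0x00000013);   // assumed nop encoding for this target
    return true;
  }
  // (remaining pure-virtual hooks omitted from this sketch)
};
```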
diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h
index 9f5230b..97aad71 100644
--- a/include/llvm/MC/MCAsmInfo.h
+++ b/include/llvm/MC/MCAsmInfo.h
@@ -33,7 +33,7 @@
}
namespace LCOMM {
- enum LCOMMType { None, NoAlignment, ByteAlignment };
+ enum LCOMMType { NoAlignment, ByteAlignment, Log2Alignment };
}
/// MCAsmInfo - This class is intended to be used as a base class for asm
@@ -247,14 +247,14 @@
/// .long a - b
bool HasAggressiveSymbolFolding; // Defaults to true.
- /// LCOMMDirectiveType - Describes if the target supports the .lcomm
- /// directive and whether it has an alignment parameter.
- LCOMM::LCOMMType LCOMMDirectiveType; // Defaults to LCOMM::None.
-
- /// COMMDirectiveAlignmentIsInBytes - True is COMMDirective's optional
+ /// COMMDirectiveAlignmentIsInBytes - True if .comm's and .lcomm's optional
/// alignment is to be specified in bytes instead of log2(n).
bool COMMDirectiveAlignmentIsInBytes; // Defaults to true;
+ /// LCOMMDirectiveAlignmentType - Describes whether the .lcomm directive
+ /// for this target supports an alignment argument and how it is interpreted.
+ LCOMM::LCOMMType LCOMMDirectiveAlignmentType; // Defaults to NoAlignment.
+
/// HasDotTypeDotSizeDirective - True if the target has .type and .size
/// directives, this is true for most ELF targets.
bool HasDotTypeDotSizeDirective; // Defaults to true.
@@ -496,13 +496,13 @@
bool hasAggressiveSymbolFolding() const {
return HasAggressiveSymbolFolding;
}
- LCOMM::LCOMMType getLCOMMDirectiveType() const {
- return LCOMMDirectiveType;
- }
- bool hasDotTypeDotSizeDirective() const {return HasDotTypeDotSizeDirective;}
bool getCOMMDirectiveAlignmentIsInBytes() const {
return COMMDirectiveAlignmentIsInBytes;
}
+ LCOMM::LCOMMType getLCOMMDirectiveAlignmentType() const {
+ return LCOMMDirectiveAlignmentType;
+ }
+ bool hasDotTypeDotSizeDirective() const {return HasDotTypeDotSizeDirective;}
bool hasSingleParameterDotFile() const { return HasSingleParameterDotFile; }
bool hasNoDeadStrip() const { return HasNoDeadStrip; }
bool hasSymbolResolver() const { return HasSymbolResolver; }
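To make the reshuffled accessors concrete, here is a minimal sketch of an emitter consuming the new alignment-type query; the helper function and the Log2_32 use are assumptions, while the enum values and accessor name come from the patch:

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Emit a ".lcomm" directive honoring how this target interprets the
// optional alignment argument.
static void emitLComm(raw_ostream &OS, const MCAsmInfo &MAI,
                      StringRef Name, uint64_t Size, unsigned ByteAlign) {
  OS << "\t.lcomm\t" << Name << ',' << Size;
  switch (MAI.getLCOMMDirectiveAlignmentType()) {
  case LCOMM::NoAlignment:                                    break;
  case LCOMM::ByteAlignment: OS << ',' << ByteAlign;          break;
  case LCOMM::Log2Alignment: OS << ',' << Log2_32(ByteAlign); break;
  }
  OS << '\n';
}
```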
diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h
index ace09a4..1858349 100644
--- a/include/llvm/MC/MCAssembler.h
+++ b/include/llvm/MC/MCAssembler.h
@@ -45,8 +45,8 @@
friend class MCAsmLayout;
friend class mcld::Layout;
- MCFragment(const MCFragment&); // DO NOT IMPLEMENT
- void operator=(const MCFragment&); // DO NOT IMPLEMENT
+ MCFragment(const MCFragment&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCFragment&) LLVM_DELETED_FUNCTION;
public:
enum FragmentType {
@@ -184,7 +184,7 @@
typedef SmallVectorImpl<MCFixup>::iterator fixup_iterator;
public:
- MCInstFragment(MCInst _Inst, MCSectionData *SD = 0)
+ MCInstFragment(const MCInst &_Inst, MCSectionData *SD = 0)
: MCFragment(FT_Inst, SD), Inst(_Inst) {
}
@@ -199,7 +199,7 @@
MCInst &getInst() { return Inst; }
const MCInst &getInst() const { return Inst; }
- void setInst(MCInst Value) { Inst = Value; }
+ void setInst(const MCInst& Value) { Inst = Value; }
/// @}
/// @name Fixup Access
diff --git a/include/llvm/MC/MCCodeEmitter.h b/include/llvm/MC/MCCodeEmitter.h
index 934ef69..4f7d103 100644
--- a/include/llvm/MC/MCCodeEmitter.h
+++ b/include/llvm/MC/MCCodeEmitter.h
@@ -10,6 +10,8 @@
#ifndef LLVM_MC_MCCODEEMITTER_H
#define LLVM_MC_MCCODEEMITTER_H
+#include "llvm/Support/Compiler.h"
+
namespace llvm {
class MCFixup;
class MCInst;
@@ -19,8 +21,8 @@
/// MCCodeEmitter - Generic instruction encoding interface.
class MCCodeEmitter {
private:
- MCCodeEmitter(const MCCodeEmitter &); // DO NOT IMPLEMENT
- void operator=(const MCCodeEmitter &); // DO NOT IMPLEMENT
+ MCCodeEmitter(const MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCCodeEmitter &) LLVM_DELETED_FUNCTION;
protected: // Can only create subclasses.
MCCodeEmitter();
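For readers tracking the many DO-NOT-IMPLEMENT conversions in this patch, a plausible definition of the macro (paraphrased from llvm/Support/Compiler.h, which is not shown in this diff):

```cpp
#ifndef __has_feature
#define __has_feature(x) 0        // portability shim for non-Clang hosts
#endif

// On C++11 compilers the copy constructor/assignment above become truly
// deleted; elsewhere the macro vanishes and the old private-and-never-
// defined idiom still applies.
#if __has_feature(cxx_deleted_functions)
#define LLVM_DELETED_FUNCTION = delete
#else
#define LLVM_DELETED_FUNCTION
#endif
```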
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index 59545d3..23652f0 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -40,8 +40,8 @@
/// of the sections that it creates.
///
class MCContext {
- MCContext(const MCContext&); // DO NOT IMPLEMENT
- MCContext &operator=(const MCContext&); // DO NOT IMPLEMENT
+ MCContext(const MCContext&) LLVM_DELETED_FUNCTION;
+ MCContext &operator=(const MCContext&) LLVM_DELETED_FUNCTION;
public:
typedef StringMap<MCSymbol*, BumpPtrAllocator&> SymbolTable;
private:
diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h
index fdb7ab2..9e09ddf 100644
--- a/include/llvm/MC/MCDwarf.h
+++ b/include/llvm/MC/MCDwarf.h
@@ -19,6 +19,7 @@
#include "llvm/MC/MachineLocation.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Compiler.h"
#include <vector>
namespace llvm {
@@ -48,8 +49,8 @@
MCDwarfFile(StringRef name, unsigned dirIndex)
: Name(name), DirIndex(dirIndex) {}
- MCDwarfFile(const MCDwarfFile&); // DO NOT IMPLEMENT
- void operator=(const MCDwarfFile&); // DO NOT IMPLEMENT
+ MCDwarfFile(const MCDwarfFile&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCDwarfFile&) LLVM_DELETED_FUNCTION;
public:
/// getName - Get the base name of this MCDwarfFile.
StringRef getName() const { return Name; }
diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h
index aa62eb2..f36db3c 100644
--- a/include/llvm/MC/MCExpr.h
+++ b/include/llvm/MC/MCExpr.h
@@ -41,8 +41,8 @@
private:
ExprKind Kind;
- MCExpr(const MCExpr&); // DO NOT IMPLEMENT
- void operator=(const MCExpr&); // DO NOT IMPLEMENT
+ MCExpr(const MCExpr&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCExpr&) LLVM_DELETED_FUNCTION;
bool EvaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm,
const MCAsmLayout *Layout,
@@ -78,11 +78,11 @@
/// values. If not given, then only non-symbolic expressions will be
/// evaluated.
/// @result - True on success.
+ bool EvaluateAsAbsolute(int64_t &Res, const MCAsmLayout &Layout,
+ const SectionAddrMap &Addrs) const;
bool EvaluateAsAbsolute(int64_t &Res) const;
bool EvaluateAsAbsolute(int64_t &Res, const MCAssembler &Asm) const;
bool EvaluateAsAbsolute(int64_t &Res, const MCAsmLayout &Layout) const;
- bool EvaluateAsAbsolute(int64_t &Res, const MCAsmLayout &Layout,
- const SectionAddrMap &Addrs) const;
/// EvaluateAsRelocatable - Try to evaluate the expression to a relocatable
/// value, i.e. an expression of the fixed form (a - b + constant).
@@ -171,7 +171,8 @@
VK_ARM_GOTTPOFF,
VK_ARM_TARGET1,
- VK_PPC_TOC,
+ VK_PPC_TOC, // TOC base
+ VK_PPC_TOC_ENTRY, // TOC entry
VK_PPC_DARWIN_HA16, // ha16(symbol)
VK_PPC_DARWIN_LO16, // lo16(symbol)
VK_PPC_GAS_HA16, // symbol@ha
diff --git a/include/llvm/MC/MCLabel.h b/include/llvm/MC/MCLabel.h
index 727520d..c72aabd 100644
--- a/include/llvm/MC/MCLabel.h
+++ b/include/llvm/MC/MCLabel.h
@@ -14,6 +14,8 @@
#ifndef LLVM_MC_MCLABEL_H
#define LLVM_MC_MCLABEL_H
+#include "llvm/Support/Compiler.h"
+
namespace llvm {
class MCContext;
class raw_ostream;
@@ -30,8 +32,8 @@
MCLabel(unsigned instance)
: Instance(instance) {}
- MCLabel(const MCLabel&); // DO NOT IMPLEMENT
- void operator=(const MCLabel&); // DO NOT IMPLEMENT
+ MCLabel(const MCLabel&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCLabel&) LLVM_DELETED_FUNCTION;
public:
/// getInstance - Get the current instance of this Directional Local Label.
unsigned getInstance() const { return Instance; }
diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h
index a69075d..b59b76c 100644
--- a/include/llvm/MC/MCObjectStreamer.h
+++ b/include/llvm/MC/MCObjectStreamer.h
@@ -80,6 +80,7 @@
virtual void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
const MCSymbol *Label);
virtual void EmitGPRel32Value(const MCExpr *Value);
+ virtual void EmitGPRel64Value(const MCExpr *Value);
virtual void FinishImpl();
/// @}
diff --git a/include/llvm/MC/MCObjectWriter.h b/include/llvm/MC/MCObjectWriter.h
index 9591a00..14fe75f 100644
--- a/include/llvm/MC/MCObjectWriter.h
+++ b/include/llvm/MC/MCObjectWriter.h
@@ -11,6 +11,7 @@
#define LLVM_MC_MCOBJECTWRITER_H
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
#include <cassert>
@@ -35,8 +36,8 @@
/// The object writer also contains a number of helper methods for writing
/// binary data to the output stream.
class MCObjectWriter {
- MCObjectWriter(const MCObjectWriter &); // DO NOT IMPLEMENT
- void operator=(const MCObjectWriter &); // DO NOT IMPLEMENT
+ MCObjectWriter(const MCObjectWriter &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCObjectWriter &) LLVM_DELETED_FUNCTION;
protected:
raw_ostream &OS;
diff --git a/include/llvm/MC/MCParser/AsmLexer.h b/include/llvm/MC/MCParser/AsmLexer.h
index 9a8735f..e102dfb 100644
--- a/include/llvm/MC/MCParser/AsmLexer.h
+++ b/include/llvm/MC/MCParser/AsmLexer.h
@@ -31,8 +31,8 @@
const MemoryBuffer *CurBuf;
bool isAtStartOfLine;
- void operator=(const AsmLexer&); // DO NOT IMPLEMENT
- AsmLexer(const AsmLexer&); // DO NOT IMPLEMENT
+ void operator=(const AsmLexer&) LLVM_DELETED_FUNCTION;
+ AsmLexer(const AsmLexer&) LLVM_DELETED_FUNCTION;
protected:
/// LexToken - Read the next token and return its code.
diff --git a/include/llvm/MC/MCParser/MCAsmLexer.h b/include/llvm/MC/MCParser/MCAsmLexer.h
index 5e29ad4..ca163c5 100644
--- a/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/include/llvm/MC/MCParser/MCAsmLexer.h
@@ -11,6 +11,7 @@
#define LLVM_MC_MCASMLEXER_H
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/SMLoc.h"
@@ -121,8 +122,8 @@
SMLoc ErrLoc;
std::string Err;
- MCAsmLexer(const MCAsmLexer &); // DO NOT IMPLEMENT
- void operator=(const MCAsmLexer &); // DO NOT IMPLEMENT
+ MCAsmLexer(const MCAsmLexer &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCAsmLexer &) LLVM_DELETED_FUNCTION;
protected: // Can only create subclasses.
const char *TokStart;
diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h
index 793c709..c673a79 100644
--- a/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/include/llvm/MC/MCParser/MCAsmParser.h
@@ -35,8 +35,8 @@
typedef bool (*DirectiveHandler)(MCAsmParserExtension*, StringRef, SMLoc);
private:
- MCAsmParser(const MCAsmParser &); // DO NOT IMPLEMENT
- void operator=(const MCAsmParser &); // DO NOT IMPLEMENT
+ MCAsmParser(const MCAsmParser &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCAsmParser &) LLVM_DELETED_FUNCTION;
MCTargetAsmParser *TargetParser;
diff --git a/include/llvm/MC/MCParser/MCAsmParserExtension.h b/include/llvm/MC/MCParser/MCAsmParserExtension.h
index 4e2aee9..59593a8 100644
--- a/include/llvm/MC/MCParser/MCAsmParserExtension.h
+++ b/include/llvm/MC/MCParser/MCAsmParserExtension.h
@@ -21,8 +21,8 @@
/// which is implemented by target and object file assembly parser
/// implementations.
class MCAsmParserExtension {
- MCAsmParserExtension(const MCAsmParserExtension &); // DO NOT IMPLEMENT
- void operator=(const MCAsmParserExtension &); // DO NOT IMPLEMENT
+ MCAsmParserExtension(const MCAsmParserExtension &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCAsmParserExtension &) LLVM_DELETED_FUNCTION;
MCAsmParser *Parser;
diff --git a/include/llvm/MC/MCSection.h b/include/llvm/MC/MCSection.h
index a708edc..f90ad6a 100644
--- a/include/llvm/MC/MCSection.h
+++ b/include/llvm/MC/MCSection.h
@@ -15,7 +15,7 @@
#define LLVM_MC_MCSECTION_H
#include "llvm/MC/SectionKind.h"
-#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
namespace llvm {
class MCAsmInfo;
@@ -34,8 +34,8 @@
};
private:
- MCSection(const MCSection&); // DO NOT IMPLEMENT
- void operator=(const MCSection&); // DO NOT IMPLEMENT
+ MCSection(const MCSection&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCSection&) LLVM_DELETED_FUNCTION;
protected:
MCSection(SectionVariant V, SectionKind K) : Variant(V), Kind(K) {}
SectionVariant Variant;
diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index e8c3e59..91593b9 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -47,8 +47,8 @@
class MCStreamer {
MCContext &Context;
- MCStreamer(const MCStreamer&); // DO NOT IMPLEMENT
- MCStreamer &operator=(const MCStreamer&); // DO NOT IMPLEMENT
+ MCStreamer(const MCStreamer&) LLVM_DELETED_FUNCTION;
+ MCStreamer &operator=(const MCStreamer&) LLVM_DELETED_FUNCTION;
bool EmitEHFrame;
bool EmitDebugFrame;
@@ -581,9 +581,6 @@
///
/// \param ShowInst - Whether to show the MCInst representation inline with
/// the assembly.
- ///
- /// \param DecodeLSDA - If true, emit comments that translates the LSDA into a
- /// human readable format. Only usable with CFI.
MCStreamer *createAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
bool isVerboseAsm,
bool useLoc,
diff --git a/include/llvm/MC/MCSubtargetInfo.h b/include/llvm/MC/MCSubtargetInfo.h
index 31d632d..6c96f49 100644
--- a/include/llvm/MC/MCSubtargetInfo.h
+++ b/include/llvm/MC/MCSubtargetInfo.h
@@ -72,7 +72,7 @@
/// getSchedModelForCPU - Get the machine model of a CPU.
///
- MCSchedModel *getSchedModelForCPU(StringRef CPU) const;
+ const MCSchedModel *getSchedModelForCPU(StringRef CPU) const;
/// getInstrItineraryForCPU - Get scheduling itinerary of a CPU.
///
diff --git a/include/llvm/MC/MCSymbol.h b/include/llvm/MC/MCSymbol.h
index 0583ce5..4c9e7f5 100644
--- a/include/llvm/MC/MCSymbol.h
+++ b/include/llvm/MC/MCSymbol.h
@@ -15,6 +15,7 @@
#define LLVM_MC_MCSYMBOL_H
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
namespace llvm {
class MCExpr;
@@ -62,8 +63,8 @@
: Name(name), Section(0), Value(0),
IsTemporary(isTemporary), IsUsed(false) {}
- MCSymbol(const MCSymbol&); // DO NOT IMPLEMENT
- void operator=(const MCSymbol&); // DO NOT IMPLEMENT
+ MCSymbol(const MCSymbol&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCSymbol&) LLVM_DELETED_FUNCTION;
public:
/// getName - Get the symbol name.
StringRef getName() const { return Name; }
diff --git a/include/llvm/MC/MCTargetAsmLexer.h b/include/llvm/MC/MCTargetAsmLexer.h
index f5c8c09..d09fe04 100644
--- a/include/llvm/MC/MCTargetAsmLexer.h
+++ b/include/llvm/MC/MCTargetAsmLexer.h
@@ -24,8 +24,8 @@
SMLoc ErrLoc;
std::string Err;
- MCTargetAsmLexer(const MCTargetAsmLexer &); // DO NOT IMPLEMENT
- void operator=(const MCTargetAsmLexer &); // DO NOT IMPLEMENT
+ MCTargetAsmLexer(const MCTargetAsmLexer &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCTargetAsmLexer &) LLVM_DELETED_FUNCTION;
protected: // Can only create subclasses.
MCTargetAsmLexer(const Target &);
diff --git a/include/llvm/MC/MCTargetAsmParser.h b/include/llvm/MC/MCTargetAsmParser.h
index 91b604b..709c2d2 100644
--- a/include/llvm/MC/MCTargetAsmParser.h
+++ b/include/llvm/MC/MCTargetAsmParser.h
@@ -25,7 +25,6 @@
class MCTargetAsmParser : public MCAsmParserExtension {
public:
enum MatchResultTy {
- Match_ConversionFail,
Match_InvalidOperand,
Match_MissingFeature,
Match_MnemonicFail,
@@ -34,8 +33,8 @@
};
private:
- MCTargetAsmParser(const MCTargetAsmParser &); // DO NOT IMPLEMENT
- void operator=(const MCTargetAsmParser &); // DO NOT IMPLEMENT
+ MCTargetAsmParser(const MCTargetAsmParser &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCTargetAsmParser &) LLVM_DELETED_FUNCTION;
protected: // Can only create subclasses.
MCTargetAsmParser();
@@ -86,7 +85,7 @@
/// On failure, the target parser is responsible for emitting a diagnostic
/// explaining the match failure.
virtual bool
- MatchInstruction(SMLoc IDLoc,
+ MatchInstruction(SMLoc IDLoc, unsigned &Kind,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
SmallVectorImpl<MCInst> &MCInsts,
unsigned &OrigErrorInfo,
@@ -112,6 +111,10 @@
return Match_Success;
}
+ virtual unsigned getMCInstOperandNum(unsigned Kind, MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ unsigned OperandNum,
+ unsigned &NumMCOperands) = 0;
};
} // End llvm namespace
diff --git a/include/llvm/MC/SubtargetFeature.h b/include/llvm/MC/SubtargetFeature.h
index 507d882..87c5fd3 100644
--- a/include/llvm/MC/SubtargetFeature.h
+++ b/include/llvm/MC/SubtargetFeature.h
@@ -50,7 +50,7 @@
//
struct SubtargetInfoKV {
const char *Key; // K-V key string
- void *Value; // K-V pointer value
+ const void *Value; // K-V pointer value
// Compare routine for std binary search
bool operator<(const SubtargetInfoKV &S) const {
@@ -96,8 +96,8 @@
size_t FeatureTableSize);
/// Get scheduling itinerary of a CPU.
- void *getItinerary(const StringRef CPU,
- const SubtargetInfoKV *Table, size_t TableSize);
+ const void *getItinerary(const StringRef CPU,
+ const SubtargetInfoKV *Table, size_t TableSize);
/// Print feature string.
void print(raw_ostream &OS) const;
diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h
index ba0030d..9c4afed 100644
--- a/include/llvm/Object/ELF.h
+++ b/include/llvm/Object/ELF.h
@@ -389,11 +389,36 @@
}
};
+template<support::endianness target_endianness, bool is64Bits>
+struct Elf_Ehdr_Impl {
+ LLVM_ELF_IMPORT_TYPES(target_endianness, is64Bits)
+ unsigned char e_ident[ELF::EI_NIDENT]; // ELF Identification bytes
+ Elf_Half e_type; // Type of file (see ET_*)
+ Elf_Half e_machine; // Required architecture for this file (see EM_*)
+ Elf_Word e_version; // Must be equal to 1
+ Elf_Addr e_entry; // Address to jump to in order to start program
+ Elf_Off e_phoff; // Program header table's file offset, in bytes
+ Elf_Off e_shoff; // Section header table's file offset, in bytes
+ Elf_Word e_flags; // Processor-specific flags
+ Elf_Half e_ehsize; // Size of ELF header, in bytes
+ Elf_Half e_phentsize;// Size of an entry in the program header table
+ Elf_Half e_phnum; // Number of entries in the program header table
+ Elf_Half e_shentsize;// Size of an entry in the section header table
+ Elf_Half e_shnum; // Number of entries in the section header table
+ Elf_Half e_shstrndx; // Section header table index of section name
+ // string table
+ bool checkMagic() const {
+ return (memcmp(e_ident, ELF::ElfMagic, strlen(ELF::ElfMagic))) == 0;
+ }
+ unsigned char getFileClass() const { return e_ident[ELF::EI_CLASS]; }
+ unsigned char getDataEncoding() const { return e_ident[ELF::EI_DATA]; }
+};
template<support::endianness target_endianness, bool is64Bits>
class ELFObjectFile : public ObjectFile {
LLVM_ELF_IMPORT_TYPES(target_endianness, is64Bits)
+ typedef Elf_Ehdr_Impl<target_endianness, is64Bits> Elf_Ehdr;
typedef Elf_Shdr_Impl<target_endianness, is64Bits> Elf_Shdr;
typedef Elf_Sym_Impl<target_endianness, is64Bits> Elf_Sym;
typedef Elf_Dyn_Impl<target_endianness, is64Bits> Elf_Dyn;
@@ -408,28 +433,6 @@
typedef content_iterator<DynRef> dyn_iterator;
protected:
- struct Elf_Ehdr {
- unsigned char e_ident[ELF::EI_NIDENT]; // ELF Identification bytes
- Elf_Half e_type; // Type of file (see ET_*)
- Elf_Half e_machine; // Required architecture for this file (see EM_*)
- Elf_Word e_version; // Must be equal to 1
- Elf_Addr e_entry; // Address to jump to in order to start program
- Elf_Off e_phoff; // Program header table's file offset, in bytes
- Elf_Off e_shoff; // Section header table's file offset, in bytes
- Elf_Word e_flags; // Processor-specific flags
- Elf_Half e_ehsize; // Size of ELF header, in bytes
- Elf_Half e_phentsize;// Size of an entry in the program header table
- Elf_Half e_phnum; // Number of entries in the program header table
- Elf_Half e_shentsize;// Size of an entry in the section header table
- Elf_Half e_shnum; // Number of entries in the section header table
- Elf_Half e_shstrndx; // Section header table index of section name
- // string table
- bool checkMagic() const {
- return (memcmp(e_ident, ELF::ElfMagic, strlen(ELF::ElfMagic))) == 0;
- }
- unsigned char getFileClass() const { return e_ident[ELF::EI_CLASS]; }
- unsigned char getDataEncoding() const { return e_ident[ELF::EI_DATA]; }
- };
// This flag is used for classof, to distinguish ELFObjectFile from
// its subclass. If more subclasses will be created, this flag will
// have to become an enum.
@@ -1446,6 +1449,143 @@
res = "Unknown";
}
break;
+ case ELF::EM_ARM:
+ switch (type) {
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_NONE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PC24);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_REL32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_ABS5);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_SBREL32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_PC8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BREL_ADJ);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DESC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_SWI8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_XPC25);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_XPC22);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DTPMOD32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DTPOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_TPOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_COPY);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GLOB_DAT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_JUMP_SLOT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_RELATIVE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BASE_PREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_BREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PLT32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_JUMP24);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP24);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BASE_ABS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_7_0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_15_8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_23_15);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SBREL_11_0_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SBREL_19_12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SBREL_27_20_CK);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TARGET1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_SBREL31);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_V4BX);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TARGET2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PREL31);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_ABS_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_ABS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_PREL_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_PREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_ABS_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_ABS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_PREL_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_PREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP19);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP6);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_ALU_PREL_11_0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_PC12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS32_NOI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_REL32_NOI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G0_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G1_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G0_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G1_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_BREL_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_BREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_BREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_BREL_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_BREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_BREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_GOTDESC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DESCSEQ);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PLT32_ABS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_ABS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_PREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_BREL12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTOFF12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTRELAX);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GNU_VTENTRY);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GNU_VTINHERIT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP11);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_GD32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDM32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDO32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_IE32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LE32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDO12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LE12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_IE12GP);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_3);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_4);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_5);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_6);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_7);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_9);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_10);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_11);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_13);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_14);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_15);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ME_TOO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_DESCSEQ16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_DESCSEQ32);
+ default:
+ res = "Unknown";
+ }
+ break;
case ELF::EM_HEXAGON:
switch (type) {
LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_NONE);
@@ -1576,15 +1716,15 @@
int64_t addend = 0;
uint16_t symbol_index = 0;
switch (sec->sh_type) {
- default :
+ default:
return object_error::parse_failed;
- case ELF::SHT_REL : {
+ case ELF::SHT_REL: {
type = getRel(Rel)->getType();
symbol_index = getRel(Rel)->getSymbol();
// TODO: Read implicit addend from section data.
break;
}
- case ELF::SHT_RELA : {
+ case ELF::SHT_RELA: {
type = getRela(Rel)->getType();
symbol_index = getRela(Rel)->getSymbol();
addend = getRela(Rel)->r_addend;
@@ -1598,9 +1738,8 @@
switch (Header->e_machine) {
case ELF::EM_X86_64:
switch (type) {
- case ELF::R_X86_64_32S:
- res = symname;
- break;
+ case ELF::R_X86_64_PC8:
+ case ELF::R_X86_64_PC16:
case ELF::R_X86_64_PC32: {
std::string fmtbuf;
raw_string_ostream fmt(fmtbuf);
@@ -1609,10 +1748,23 @@
Result.append(fmtbuf.begin(), fmtbuf.end());
}
break;
+ case ELF::R_X86_64_8:
+ case ELF::R_X86_64_16:
+ case ELF::R_X86_64_32:
+ case ELF::R_X86_64_32S:
+ case ELF::R_X86_64_64: {
+ std::string fmtbuf;
+ raw_string_ostream fmt(fmtbuf);
+ fmt << symname << (addend < 0 ? "" : "+") << addend;
+ fmt.flush();
+ Result.append(fmtbuf.begin(), fmtbuf.end());
+ }
+ break;
default:
res = "Unknown";
}
break;
+ case ELF::EM_ARM:
case ELF::EM_HEXAGON:
res = symname;
break;
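A usage sketch for the newly extracted header type; the Buffer variable is an assumption, while the type, its members, and the ELF enum values are real:

```cpp
// Validate an ELF header straight out of a MemoryBuffer using the now
// free-standing Elf_Ehdr_Impl instead of the nested struct it replaces.
typedef Elf_Ehdr_Impl<support::little, false> Ehdr32LE;
const Ehdr32LE *Hdr =
    reinterpret_cast<const Ehdr32LE *>(Buffer->getBufferStart());
if (!Hdr->checkMagic())
  report_fatal_error("invalid ELF magic");
if (Hdr->getFileClass() != ELF::ELFCLASS32 ||
    Hdr->getDataEncoding() != ELF::ELFDATA2LSB)
  report_fatal_error("expected a 32-bit little-endian ELF file");
```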
diff --git a/include/llvm/Support/AlignOf.h b/include/llvm/Support/AlignOf.h
index cf71251..8c389af 100644
--- a/include/llvm/Support/AlignOf.h
+++ b/include/llvm/Support/AlignOf.h
@@ -72,6 +72,10 @@
template <> struct AlignedCharArrayImpl<0> {
typedef char type;
};
+
+// MSVC requires special handling here.
+#ifndef _MSC_VER
+
#if __has_feature(cxx_alignas)
#define LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \
template <> struct AlignedCharArrayImpl<x> { \
@@ -82,11 +86,6 @@
template <> struct AlignedCharArrayImpl<x> { \
typedef char type __attribute__((aligned(x))); \
}
-#elif defined(_MSC_VER)
-#define LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \
- template <> struct AlignedCharArrayImpl<x> { \
- typedef __declspec(align(x)) char type; \
- }
#else
# error No supported align as directive.
#endif
@@ -104,9 +103,38 @@
LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(2048);
LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(4096);
LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(8192);
+
+#undef LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT
+
+#else // _MSC_VER
+
+// We provide special variations of this template for the most common
+// alignments because __declspec(align(...)) doesn't actually work when it is
+// a member of a by-value function argument in MSVC, even when the requested
+// alignment is as modest as 8 or 16 bytes.
+template <> struct AlignedCharArrayImpl<1> { typedef char type; };
+template <> struct AlignedCharArrayImpl<2> { typedef short type; };
+template <> struct AlignedCharArrayImpl<4> { typedef int type; };
+template <> struct AlignedCharArrayImpl<8> { typedef double type; };
+
+#define LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \
+ template <> struct AlignedCharArrayImpl<x> { \
+ typedef __declspec(align(x)) char type; \
+ }
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(16);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(32);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(64);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(128);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(256);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(512);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(1024);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(2048);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(4096);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(8192);
// Any larger and MSVC complains.
#undef LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT
+#endif // _MSC_VER
+
/// \brief This union template exposes a suitably aligned and sized character
/// array member which can hold elements of any of up to four types.
///
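A hedged sketch of the intended use; the `buffer` member name is assumed from the doc comment above rather than shown in this hunk:

```cpp
// Raw storage aligned and sized for any of the listed types, suitable
// for placement new without a heap allocation.
llvm::AlignedCharArray<double, void*> Storage;
double *D = new (Storage.buffer) double(3.14);
```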
diff --git a/include/llvm/Support/CallSite.h b/include/llvm/Support/CallSite.h
index c23bb6a..8905e1e 100644
--- a/include/llvm/Support/CallSite.h
+++ b/include/llvm/Support/CallSite.h
@@ -81,7 +81,7 @@
InstrTy *operator->() const { return I.getPointer(); }
operator bool() const { return I.getPointer(); }
- /// getCalledValue - Return the pointer to function that is being called...
+ /// getCalledValue - Return the pointer to function that is being called.
///
ValTy *getCalledValue() const {
assert(getInstruction() && "Not a call or invoke instruction!");
@@ -95,7 +95,7 @@
return dyn_cast<FunTy>(getCalledValue());
}
- /// setCalledFunction - Set the callee to the specified value...
+ /// setCalledFunction - Set the callee to the specified value.
///
void setCalledFunction(Value *V) {
assert(getInstruction() && "Not a call or invoke instruction!");
@@ -130,7 +130,7 @@
}
/// arg_iterator - The type of iterator to use when looping over actual
- /// arguments at this call site...
+ /// arguments at this call site.
typedef IterTy arg_iterator;
/// arg_begin/arg_end - Return iterators corresponding to the actual argument
diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h
index ea0a4da..1136ff7 100644
--- a/include/llvm/Support/Compiler.h
+++ b/include/llvm/Support/Compiler.h
@@ -24,7 +24,7 @@
/// does not imply the existence of any other C++ library features.
#if (__has_feature(cxx_rvalue_references) \
|| defined(__GXX_EXPERIMENTAL_CXX0X__) \
- || _MSC_VER >= 1600)
+ || (defined(_MSC_VER) && _MSC_VER >= 1600))
#define LLVM_USE_RVALUE_REFERENCES 1
#else
#define LLVM_USE_RVALUE_REFERENCES 0
@@ -106,9 +106,11 @@
#endif
#if (__GNUC__ >= 4)
-#define BUILTIN_EXPECT(EXPR, VALUE) __builtin_expect((EXPR), (VALUE))
+#define LLVM_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
+#define LLVM_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
#else
-#define BUILTIN_EXPECT(EXPR, VALUE) (EXPR)
+#define LLVM_LIKELY(EXPR) (EXPR)
+#define LLVM_UNLIKELY(EXPR) (EXPR)
#endif
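Usage sketch for the renamed hint macros (the surrounding names are stand-ins). Note the rename also folds the (bool) cast into the macro, so any contextually boolean expression works:

```cpp
if (LLVM_UNLIKELY(NewSize > Capacity))
  grow(NewSize);   // cold path: kept out of the predicted fall-through
if (LLVM_LIKELY(Ptr != 0))
  use(Ptr);        // hot path: laid out as the fall-through on GCC/Clang
```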
diff --git a/include/llvm/Support/DataExtractor.h b/include/llvm/Support/DataExtractor.h
index 506ec96..8d880fd 100644
--- a/include/llvm/Support/DataExtractor.h
+++ b/include/llvm/Support/DataExtractor.h
@@ -99,8 +99,8 @@
/// enough bytes to extract this value, the offset will be left
/// unmodified.
///
- /// @param[in] byte_size
- /// The size in byte of the integer to extract.
+ /// @param[in] size
+ /// The size in bytes of the integer to extract.
///
/// @return
/// The sign extended signed integer value that was extracted,
diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h
index f4a9aa0..5d60205 100644
--- a/include/llvm/Support/FileSystem.h
+++ b/include/llvm/Support/FileSystem.h
@@ -40,7 +40,7 @@
#include <string>
#include <vector>
-#if HAVE_SYS_STAT_H
+#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
@@ -432,7 +432,7 @@
/// @brief Does status represent a symlink?
///
/// @param status A file_status previously returned from stat.
-/// @param result status.type() == symlink_file.
+/// @returns status.type() == symlink_file.
bool is_symlink(file_status status);
/// @brief Is path a symlink?
@@ -461,7 +461,7 @@
/// @brief Is status available?
///
-/// @param path Input path.
+/// @param s Input file status.
/// @results True if status() != status_error.
bool status_known(file_status s);
@@ -486,7 +486,7 @@
/// clang-%%-%%-%%-%%-%%.s => /tmp/clang-a0-b1-c2-d3-e4.s
///
/// @param model Name to base unique path off of.
-/// @param result_fs Set to the opened file's file descriptor.
+/// @param result_fd Set to the opened file's file descriptor.
/// @param result_path Set to the opened file's absolute path.
/// @param makeAbsolute If true and @model is not an absolute path, a temp
/// directory will be prepended.
@@ -586,9 +586,9 @@
public:
enum mapmode {
- readonly, //< May only access map via const_data as read only.
- readwrite, //< May access map via data and modify it. Written to path.
- priv //< May modify via data, but changes are lost on destruction.
+ readonly, ///< May only access map via const_data as read only.
+ readwrite, ///< May access map via data and modify it. Written to path.
+ priv ///< May modify via data, but changes are lost on destruction.
};
private:
@@ -596,7 +596,7 @@
mapmode Mode;
uint64_t Size;
void *Mapping;
-#if LLVM_ON_WIN32
+#ifdef LLVM_ON_WIN32
int FileDescriptor;
void *FileHandle;
void *FileMappingHandle;
@@ -658,7 +658,7 @@
///
/// @param path Path to file to map.
/// @param file_offset Byte offset in file where mapping should begin.
-/// @param size_t Byte length of range of the file to map.
+/// @param size Byte length of range of the file to map.
/// @param map_writable If true, the file will be mapped in r/w such
/// that changes to the mapped buffer will be flushed back
/// to the file. If false, the file will be mapped read-only
diff --git a/include/llvm/Support/GCOV.h b/include/llvm/Support/GCOV.h
index 19e1ce8..e552315 100644
--- a/include/llvm/Support/GCOV.h
+++ b/include/llvm/Support/GCOV.h
@@ -27,13 +27,15 @@
class GCOVLines;
class FileInfo;
-enum GCOVFormat {
- InvalidGCOV,
- GCNO_402,
- GCNO_404,
- GCDA_402,
- GCDA_404
-};
+namespace GCOV {
+ enum GCOVFormat {
+ InvalidGCOV,
+ GCNO_402,
+ GCNO_404,
+ GCDA_402,
+ GCDA_404
+ };
+} // end GCOV namespace
/// GCOVBuffer - A wrapper around MemoryBuffer to provide GCOV specific
/// read operations.
@@ -42,20 +44,20 @@
GCOVBuffer(MemoryBuffer *B) : Buffer(B), Cursor(0) {}
/// readGCOVFormat - Read GCOV signature at the beginning of buffer.
- enum GCOVFormat readGCOVFormat() {
+ GCOV::GCOVFormat readGCOVFormat() {
StringRef Magic = Buffer->getBuffer().slice(0, 12);
Cursor = 12;
if (Magic == "oncg*404MVLL")
- return GCNO_404;
+ return GCOV::GCNO_404;
else if (Magic == "oncg*204MVLL")
- return GCNO_402;
+ return GCOV::GCNO_402;
else if (Magic == "adcg*404MVLL")
- return GCDA_404;
+ return GCOV::GCDA_404;
else if (Magic == "adcg*204MVLL")
- return GCDA_402;
+ return GCOV::GCDA_402;
Cursor = 0;
- return InvalidGCOV;
+ return GCOV::InvalidGCOV;
}
/// readFunctionTag - If cursor points to a function tag then increment the
@@ -128,7 +130,7 @@
StringRef Str = Buffer->getBuffer().slice(Cursor, Cursor+4);
assert (Str.empty() == false && "Unexpected memory buffer end!");
Cursor += 4;
- Result = *(uint32_t *)(Str.data());
+ Result = *(const uint32_t *)(Str.data());
return Result;
}
@@ -170,7 +172,7 @@
public:
GCOVFunction() : Ident(0), LineNumber(0) {}
~GCOVFunction();
- bool read(GCOVBuffer &Buffer, GCOVFormat Format);
+ bool read(GCOVBuffer &Buffer, GCOV::GCOVFormat Format);
void dump();
void collectLineCounts(FileInfo &FI);
private:
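Usage sketch with the now-scoped format enum (the MemoryBuffer setup is assumed):

```cpp
GCOVBuffer GB(Buf);                        // Buf: MemoryBuffer* (assumed)
GCOV::GCOVFormat Format = GB.readGCOVFormat();
if (Format == GCOV::InvalidGCOV)
  return false;                            // not a .gcno/.gcda file
bool IsGCDA = Format == GCOV::GCDA_402 || Format == GCOV::GCDA_404;
```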
diff --git a/include/llvm/Support/IntegersSubsetMapping.h b/include/llvm/Support/IntegersSubsetMapping.h
index cab18dc..7635d5e 100644
--- a/include/llvm/Support/IntegersSubsetMapping.h
+++ b/include/llvm/Support/IntegersSubsetMapping.h
@@ -42,6 +42,7 @@
struct RangeEx : public RangeTy {
RangeEx() : Weight(1) {}
RangeEx(const RangeTy &R) : RangeTy(R), Weight(1) {}
+ RangeEx(const RangeTy &R, unsigned W) : RangeTy(R), Weight(W) {}
RangeEx(const IntTy &C) : RangeTy(C), Weight(1) {}
RangeEx(const IntTy &L, const IntTy &H) : RangeTy(L, H), Weight(1) {}
RangeEx(const IntTy &L, const IntTy &H, unsigned W) :
@@ -316,13 +317,13 @@
Items.clear();
const IntTy *Low = &OldItems.begin()->first.getLow();
const IntTy *High = &OldItems.begin()->first.getHigh();
- unsigned Weight = 1;
+ unsigned Weight = OldItems.begin()->first.Weight;
SuccessorClass *Successor = OldItems.begin()->second;
for (CaseItemIt j = OldItems.begin(), i = j++, e = OldItems.end();
j != e; i = j++) {
if (isJoinable(i, j)) {
const IntTy *CurHigh = &j->first.getHigh();
- ++Weight;
+ Weight += j->first.Weight;
if (*CurHigh > *High)
High = CurHigh;
} else {
@@ -330,7 +331,7 @@
add(R, Successor);
Low = &j->first.getLow();
High = &j->first.getHigh();
- Weight = 1;
+ Weight = j->first.Weight;
Successor = j->second;
}
}
@@ -362,10 +363,17 @@
/// Adds all ranges and values from given ranges set to the current
/// mapping.
- void add(const IntegersSubsetTy &CRS, SuccessorClass *S = 0) {
+ void add(const IntegersSubsetTy &CRS, SuccessorClass *S = 0,
+ unsigned Weight = 0) {
+ unsigned ItemWeight = 1;
+ if (Weight)
+ // Weight is associated with the whole CRS; for now we divide it
+ // evenly to get the weight for each item.
+ ItemWeight = Weight / CRS.getNumItems();
for (unsigned i = 0, e = CRS.getNumItems(); i < e; ++i) {
RangeTy R = CRS.getItem(i);
- add(R, S);
+ RangeEx REx(R, ItemWeight);
+ add(REx, S);
}
}
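The division above spreads one weight evenly across every range in the set; note the integer truncation, so the per-item weights can sum to less than the original. A worked instance (values illustrative):

```cpp
unsigned Weight = 10;
unsigned NumItems = 4;                    // CRS.getNumItems()
unsigned ItemWeight = Weight / NumItems;  // == 2; 4 * 2 == 8 < 10
```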
diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h
index 4005161..35c2694 100644
--- a/include/llvm/Support/MathExtras.h
+++ b/include/llvm/Support/MathExtras.h
@@ -463,12 +463,24 @@
return int32_t(x << (32 - B)) >> (32 - B);
}
+/// \brief Sign extend number in the bottom B bits of X to a 32-bit int.
+/// Requires 0 < B <= 32.
+inline int32_t SignExtend32(uint32_t X, unsigned B) {
+ return int32_t(X << (32 - B)) >> (32 - B);
+}
+
/// SignExtend64 - Sign extend B-bit number x to 64-bit int.
/// Usage int64_t r = SignExtend64<5>(x);
template <unsigned B> inline int64_t SignExtend64(uint64_t x) {
return int64_t(x << (64 - B)) >> (64 - B);
}
+/// \brief Sign extend number in the bottom B bits of X to a 64-bit int.
+/// Requires 0 < B <= 64.
+inline int64_t SignExtend64(uint64_t X, unsigned B) {
+ return int64_t(X << (64 - B)) >> (64 - B);
+}
+
} // End llvm namespace
#endif
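A quick sanity check of the new runtime-width overloads next to their compile-time template counterparts:

```cpp
#include <cassert>
#include "llvm/Support/MathExtras.h"

int main() {
  assert(llvm::SignExtend32(0xAu, 4) == -6);      // 0b1010 as 4 bits => -6
  assert(llvm::SignExtend64(0x80ull, 8) == -128);
  assert(llvm::SignExtend32<4>(0xAu) == -6);      // template equivalent
  return 0;
}
```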
diff --git a/include/llvm/Support/PathV1.h b/include/llvm/Support/PathV1.h
index f4bedf9..643ee8c 100644
--- a/include/llvm/Support/PathV1.h
+++ b/include/llvm/Support/PathV1.h
@@ -683,8 +683,8 @@
/// This function returns status information about the file. The type of
/// path (file or directory) is updated to reflect the actual contents
/// of the file system.
- /// @returns 0 on failure, with Error explaining why (if non-zero)
- /// @returns a pointer to a FileStatus structure on success.
+ /// @returns 0 on failure, with Error explaining why (if non-zero),
+ /// otherwise returns a pointer to a FileStatus structure on success.
/// @brief Get file status.
const FileStatus *getFileStatus(
bool forceUpdate = false, ///< Force an update from the file system
diff --git a/include/llvm/Support/PathV2.h b/include/llvm/Support/PathV2.h
index 8d79709..967ea1e 100644
--- a/include/llvm/Support/PathV2.h
+++ b/include/llvm/Support/PathV2.h
@@ -133,7 +133,7 @@
/// foo + bar/f => foo/bar/f
///
/// @param path Set to \a path + \a component.
-/// @param component The component to be appended to \a path.
+/// @param a The component to be appended to \a path.
void append(SmallVectorImpl<char> &path, const Twine &a,
const Twine &b = "",
const Twine &c = "",
@@ -272,7 +272,7 @@
/// ignored if the user or system has set the typical environment variable
/// (e.g., TEMP on Windows, TMPDIR on *nix) to specify a temporary directory.
///
-/// @param Result Holds the resulting path name.
+/// @param result Holds the resulting path name.
void system_temp_directory(bool erasedOnReboot, SmallVectorImpl<char> &result);
/// @brief Has root name?
diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h
index 8949a3a..3835e84 100644
--- a/include/llvm/Support/SourceMgr.h
+++ b/include/llvm/Support/SourceMgr.h
@@ -145,7 +145,7 @@
/// GetMessage - Return an SMDiagnostic at the specified location with the
/// specified string.
///
- /// @param Type - If non-null, the kind of message (e.g., "error") which is
+ /// @param Kind The kind of message (e.g., "error") which is
/// prefixed to the message.
SMDiagnostic GetMessage(SMLoc Loc, DiagKind Kind, const Twine &Msg,
ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) const;
diff --git a/include/llvm/Support/TimeValue.h b/include/llvm/Support/TimeValue.h
index 94f132a..e780b50 100644
--- a/include/llvm/Support/TimeValue.h
+++ b/include/llvm/Support/TimeValue.h
@@ -153,7 +153,6 @@
/// Determine if \p this is greater than or equal to \p that.
/// @returns True iff *this >= that.
- /// @brief True if this >= that.
int operator >= (const TimeValue &that) const {
if ( this->seconds_ > that.seconds_ ) {
return 1;
@@ -164,8 +163,7 @@
}
/// Determines if two TimeValue objects represent the same moment in time.
- /// @brief True iff *this == that.
- /// @brief True if this == that.
+ /// @returns True iff *this == that.
int operator == (const TimeValue &that) const {
return (this->seconds_ == that.seconds_) &&
(this->nanos_ == that.nanos_);
@@ -173,8 +171,7 @@
/// Determines if two TimeValue objects represent times that are not the
/// same.
- /// @return True iff *this != that.
- /// @brief True if this != that.
+ /// @returns True iff *this != that.
int operator != (const TimeValue &that) const { return !(*this == that); }
/// Adds two TimeValue objects together.
diff --git a/include/llvm/Support/raw_ostream.h b/include/llvm/Support/raw_ostream.h
index 5de749a..9913f98 100644
--- a/include/llvm/Support/raw_ostream.h
+++ b/include/llvm/Support/raw_ostream.h
@@ -210,13 +210,16 @@
/// Changes the foreground color of text that will be output from this point
/// forward.
- /// @param colors ANSI color to use, the special SAVEDCOLOR can be used to
+ /// @param Color ANSI color to use, the special SAVEDCOLOR can be used to
/// change only the bold attribute, and keep colors untouched
- /// @param bold bold/brighter text, default false
- /// @param bg if true change the background, default: change foreground
+ /// @param Bold bold/brighter text, default false
+ /// @param BG if true change the background, default: change foreground
/// @returns itself so it can be used within << invocations
- virtual raw_ostream &changeColor(enum Colors, bool = false, bool = false) {
- return *this; }
+ virtual raw_ostream &changeColor(enum Colors Color,
+ bool Bold = false,
+ bool BG = false) {
+ return *this;
+ }
/// Resets the colors to terminal defaults. Call this when you are done
/// outputting colored text, or before program exit.
diff --git a/include/llvm/SymbolTableListTraits.h b/include/llvm/SymbolTableListTraits.h
index 91a4eb9..ec5c88f 100644
--- a/include/llvm/SymbolTableListTraits.h
+++ b/include/llvm/SymbolTableListTraits.h
@@ -46,7 +46,6 @@
/// getListOwner - Return the object that owns this list. If this is a list
/// of instructions, it returns the BasicBlock that owns them.
ItemParentClass *getListOwner() {
- typedef iplist<ValueSubClass> ItemParentClass::*Sublist;
size_t Offset(size_t(&((ItemParentClass*)0->*ItemParentClass::
getSublistAccess(static_cast<ValueSubClass*>(0)))));
iplist<ValueSubClass>* Anchor(static_cast<iplist<ValueSubClass>*>(this));
diff --git a/include/llvm/TableGen/Error.h b/include/llvm/TableGen/Error.h
index fd5f805..5c1c3ad 100644
--- a/include/llvm/TableGen/Error.h
+++ b/include/llvm/TableGen/Error.h
@@ -20,21 +20,22 @@
namespace llvm {
class TGError {
- SMLoc Loc;
+ SmallVector<SMLoc, 4> Locs;
std::string Message;
public:
- TGError(SMLoc loc, const std::string &message) : Loc(loc), Message(message) {}
+ TGError(ArrayRef<SMLoc> locs, const std::string &message)
+ : Locs(locs.begin(), locs.end()), Message(message) {}
- SMLoc getLoc() const { return Loc; }
+ ArrayRef<SMLoc> getLoc() const { return Locs; }
const std::string &getMessage() const { return Message; }
};
-void PrintWarning(SMLoc WarningLoc, const Twine &Msg);
+void PrintWarning(ArrayRef<SMLoc> WarningLoc, const Twine &Msg);
void PrintWarning(const char *Loc, const Twine &Msg);
void PrintWarning(const Twine &Msg);
void PrintWarning(const TGError &Warning);
-void PrintError(SMLoc ErrorLoc, const Twine &Msg);
+void PrintError(ArrayRef<SMLoc> ErrorLoc, const Twine &Msg);
void PrintError(const char *Loc, const Twine &Msg);
void PrintError(const Twine &Msg);
void PrintError(const TGError &Error);
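Sketch of the widened interface (the location variables are stand-ins): a diagnostic can now carry a whole location stack, e.g. the offending use plus the multiclass instantiation that produced it.

```cpp
SmallVector<SMLoc, 2> Locs;
Locs.push_back(UseLoc);    // where the bad value appears
Locs.push_back(DefLoc);    // e.g., the multiclass it was expanded from
PrintError(Locs, "expected a bits<8> value");  // ArrayRef binds to the vector
```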
diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h
index a8256b7..adb1a77 100644
--- a/include/llvm/TableGen/Record.h
+++ b/include/llvm/TableGen/Record.h
@@ -509,6 +509,18 @@
virtual Init *resolveReferences(Record &R, const RecordVal *RV) const {
return const_cast<Init *>(this);
}
+
+ /// getBit - This method is used to return the initializer for the specified
+ /// bit.
+ virtual Init *getBit(unsigned Bit) const = 0;
+
+ /// getBitVar - This method is used to retrieve the initializer for a bit
+ /// reference. For non-VarBitInit, it simply returns itself.
+ virtual Init *getBitVar() const { return const_cast<Init*>(this); }
+
+ /// getBitNum - This method is used to retrieve the bit number of a bit
+ /// reference. For non-VarBitInit, it simply returns 0.
+ virtual unsigned getBitNum() const { return 0; }
};
inline raw_ostream &operator<<(raw_ostream &OS, const Init &I) {
@@ -541,13 +553,6 @@
///
virtual RecTy *getFieldType(const std::string &FieldName) const;
- /// resolveBitReference - This method is used to implement
- /// VarBitInit::resolveReferences. If the bit is able to be resolved, we
- /// simply return the resolved value, otherwise we return null.
- ///
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const = 0;
-
/// resolveListElementReference - This method is used to implement
/// VarListElementInit::resolveReferences. If the list element is resolvable
/// now, we return the resolved value, otherwise we return null.
@@ -571,6 +576,10 @@
return Ty->convertValue(const_cast<UnsetInit *>(this));
}
+ virtual Init *getBit(unsigned Bit) const {
+ return const_cast<UnsetInit*>(this);
+ }
+
virtual bool isComplete() const { return false; }
virtual std::string getAsString() const { return "?"; }
};
@@ -595,6 +604,11 @@
return Ty->convertValue(const_cast<BitInit *>(this));
}
+ virtual Init *getBit(unsigned Bit) const {
+ assert(Bit < 1 && "Bit index out of range!");
+ return const_cast<BitInit*>(this);
+ }
+
virtual std::string getAsString() const { return Value ? "1" : "0"; }
};
@@ -616,11 +630,6 @@
unsigned getNumBits() const { return Bits.size(); }
- Init *getBit(unsigned Bit) const {
- assert(Bit < Bits.size() && "Bit index out of range!");
- return Bits[Bit];
- }
-
virtual Init *convertInitializerTo(RecTy *Ty) const {
return Ty->convertValue(const_cast<BitsInit *>(this));
}
@@ -640,6 +649,11 @@
virtual std::string getAsString() const;
virtual Init *resolveReferences(Record &R, const RecordVal *RV) const;
+
+ virtual Init *getBit(unsigned Bit) const {
+ assert(Bit < Bits.size() && "Bit index out of range!");
+ return Bits[Bit];
+ }
};
@@ -666,15 +680,6 @@
virtual std::string getAsString() const;
- /// resolveBitReference - This method is used to implement
- /// VarBitInit::resolveReferences. If the bit is able to be resolved, we
- /// simply return the resolved value, otherwise we return null.
- ///
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const {
- llvm_unreachable("Illegal bit reference off int");
- }
-
/// resolveListElementReference - This method is used to implement
/// VarListElementInit::resolveReferences. If the list element is resolvable
/// now, we return the resolved value, otherwise we return null.
@@ -682,6 +687,10 @@
unsigned Elt) const {
llvm_unreachable("Illegal element reference off int");
}
+
+ virtual Init *getBit(unsigned Bit) const {
+ return BitInit::get((Value & (1ULL << Bit)) != 0);
+ }
};
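A worked example of the uniform getBit() interface that replaces resolveBitReference (IntInit::get is the usual uniquing factory in Record.h):

```cpp
// For an IntInit holding 6 (binary 110):
Init *I = IntInit::get(6);
Init *B0 = I->getBit(0);   // BitInit "0"
Init *B1 = I->getBit(1);   // BitInit "1"
Init *B2 = I->getBit(2);   // BitInit "1"
```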
@@ -709,15 +718,6 @@
virtual std::string getAsString() const { return "\"" + Value + "\""; }
virtual std::string getAsUnquotedString() const { return Value; }
- /// resolveBitReference - This method is used to implement
- /// VarBitInit::resolveReferences. If the bit is able to be resolved, we
- /// simply return the resolved value, otherwise we return null.
- ///
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const {
- llvm_unreachable("Illegal bit reference off string");
- }
-
/// resolveListElementReference - This method is used to implement
/// VarListElementInit::resolveReferences. If the list element is resolvable
/// now, we return the resolved value, otherwise we return null.
@@ -725,6 +725,10 @@
unsigned Elt) const {
llvm_unreachable("Illegal element reference off string");
}
+
+ virtual Init *getBit(unsigned Bit) const {
+ llvm_unreachable("Illegal bit reference off string");
+ }
};
/// ListInit - [AL, AH, CL] - Represent a list of defs
@@ -777,20 +781,15 @@
inline size_t size () const { return Values.size(); }
inline bool empty() const { return Values.empty(); }
- /// resolveBitReference - This method is used to implement
- /// VarBitInit::resolveReferences. If the bit is able to be resolved, we
- /// simply return the resolved value, otherwise we return null.
- ///
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const {
- llvm_unreachable("Illegal bit reference off list");
- }
-
/// resolveListElementReference - This method is used to implement
/// VarListElementInit::resolveReferences. If the list element is resolvable
/// now, we return the resolved value, otherwise we return null.
virtual Init *resolveListElementReference(Record &R, const RecordVal *RV,
unsigned Elt) const;
+
+ virtual Init *getBit(unsigned Bit) const {
+ llvm_unreachable("Illegal bit reference off list");
+ }
};
@@ -818,10 +817,10 @@
return Ty->convertValue(const_cast<OpInit *>(this));
}
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const;
virtual Init *resolveListElementReference(Record &R, const RecordVal *RV,
unsigned Elt) const;
+
+ virtual Init *getBit(unsigned Bit) const;
};
@@ -1003,8 +1002,6 @@
return getNameInit()->getAsUnquotedString();
}
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const;
virtual Init *resolveListElementReference(Record &R, const RecordVal *RV,
unsigned Elt) const;
@@ -1019,6 +1016,8 @@
///
virtual Init *resolveReferences(Record &R, const RecordVal *RV) const;
+ virtual Init *getBit(unsigned Bit) const;
+
virtual std::string getAsString() const { return getName(); }
};
@@ -1030,8 +1029,10 @@
unsigned Bit;
VarBitInit(TypedInit *T, unsigned B) : TI(T), Bit(B) {
- assert(T->getType() && dynamic_cast<BitsRecTy*>(T->getType()) &&
- ((BitsRecTy*)T->getType())->getNumBits() > B &&
+ assert(T->getType() &&
+ (dynamic_cast<IntRecTy*>(T->getType()) ||
+ (dynamic_cast<BitsRecTy*>(T->getType()) &&
+ dynamic_cast<BitsRecTy*>(T->getType())->getNumBits() > B)) &&
"Illegal VarBitInit expression!");
}
@@ -1045,11 +1046,16 @@
return Ty->convertValue(const_cast<VarBitInit *>(this));
}
- TypedInit *getVariable() const { return TI; }
- unsigned getBitNum() const { return Bit; }
+ virtual Init *getBitVar() const { return TI; }
+ virtual unsigned getBitNum() const { return Bit; }
virtual std::string getAsString() const;
virtual Init *resolveReferences(Record &R, const RecordVal *RV) const;
+
+ virtual Init *getBit(unsigned B) const {
+ assert(B < 1 && "Bit index out of range!");
+ return const_cast<VarBitInit*>(this);
+ }
};
/// VarListElementInit - List[4] - Represent access to one element of a var or
@@ -1080,9 +1086,6 @@
TypedInit *getVariable() const { return TI; }
unsigned getElementNum() const { return Element; }
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const;
-
/// resolveListElementReference - This method is used to implement
/// VarListElementInit::resolveReferences. If the list element is resolvable
/// now, we return the resolved value, otherwise we return null.
@@ -1092,6 +1095,8 @@
virtual std::string getAsString() const;
virtual Init *resolveReferences(Record &R, const RecordVal *RV) const;
+
+ virtual Init *getBit(unsigned Bit) const;
};
/// DefInit - AL - Represent a reference to a 'def' in the description
@@ -1122,12 +1127,7 @@
virtual std::string getAsString() const;
- /// resolveBitReference - This method is used to implement
- /// VarBitInit::resolveReferences. If the bit is able to be resolved, we
- /// simply return the resolved value, otherwise we return null.
- ///
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const {
+ virtual Init *getBit(unsigned Bit) const {
llvm_unreachable("Illegal bit reference off def");
}
@@ -1163,8 +1163,8 @@
return Ty->convertValue(const_cast<FieldInit *>(this));
}
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const;
+ virtual Init *getBit(unsigned Bit) const;
+
virtual Init *resolveListElementReference(Record &R,
const RecordVal *RV,
unsigned Elt) const;
@@ -1243,8 +1243,7 @@
inline size_t name_size () const { return ArgNames.size(); }
inline bool name_empty() const { return ArgNames.empty(); }
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const {
+ virtual Init *getBit(unsigned Bit) const {
llvm_unreachable("Illegal bit reference off dag");
}
@@ -1301,7 +1300,9 @@
// Unique record ID.
unsigned ID;
Init *Name;
- SMLoc Loc;
+ // Location where record was instantiated, followed by the locations of
+ // any multiclass prototypes used.
+ SmallVector<SMLoc, 4> Locs;
std::vector<Init *> TemplateArgs;
std::vector<RecordVal> Values;
std::vector<Record*> SuperClasses;
@@ -1317,13 +1318,15 @@
public:
// Constructs a record.
- explicit Record(const std::string &N, SMLoc loc, RecordKeeper &records) :
- ID(LastID++), Name(StringInit::get(N)), Loc(loc), TrackedRecords(records),
- TheInit(0) {
+ explicit Record(const std::string &N, ArrayRef<SMLoc> locs,
+ RecordKeeper &records) :
+ ID(LastID++), Name(StringInit::get(N)), Locs(locs.begin(), locs.end()),
+ TrackedRecords(records), TheInit(0) {
init();
}
- explicit Record(Init *N, SMLoc loc, RecordKeeper &records) :
- ID(LastID++), Name(N), Loc(loc), TrackedRecords(records), TheInit(0) {
+ explicit Record(Init *N, ArrayRef<SMLoc> locs, RecordKeeper &records) :
+ ID(LastID++), Name(N), Locs(locs.begin(), locs.end()),
+ TrackedRecords(records), TheInit(0) {
init();
}
~Record() {}
@@ -1345,7 +1348,7 @@
void setName(Init *Name); // Also updates RecordKeeper.
void setName(const std::string &Name); // Also updates RecordKeeper.
- SMLoc getLoc() const { return Loc; }
+ ArrayRef<SMLoc> getLoc() const { return Locs; }
/// get the corresponding DefInit.
DefInit *getDefInit();
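
Since getLoc() now returns the whole location stack, diagnostics can point at the multiclass expansion chain rather than a single site. A sketch of a consumer, assuming the surrounding LLVM headers (printRecordLocs is illustrative, not part of this patch):

    #include "llvm/TableGen/Record.h"
    #include "llvm/Support/SourceMgr.h"
    using namespace llvm;

    // Illustrative helper: the first location is the instantiation site,
    // any following ones are the multiclass prototypes it came through.
    static void printRecordLocs(SourceMgr &SrcMgr, const Record &R) {
      ArrayRef<SMLoc> Locs = R.getLoc();
      SrcMgr.PrintMessage(Locs.front(), SourceMgr::DK_Note,
                          "record instantiated here");
      for (unsigned i = 1, e = Locs.size(); i != e; ++i)
        SrcMgr.PrintMessage(Locs[i], SourceMgr::DK_Note,
                            "via multiclass prototype here");
    }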
@@ -1507,6 +1510,12 @@
///
bool getValueAsBit(StringRef FieldName) const;
+ /// getValueAsBitOrUnset - This method looks up the specified field and
+ /// returns its value as a bit. If the field is unset, sets Unset to true
+ /// and returns false.
+ ///
+ bool getValueAsBitOrUnset(StringRef FieldName, bool &Unset) const;
+
/// getValueAsInt - This method looks up the specified field and returns its
/// value as an int64_t, throwing an exception if the field does not exist or
/// if the value is not the right type.
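
A sketch of the intended call pattern for the new accessor, with illustrative names (InstRec is assumed to be a Record* for an instruction definition):

    bool Unset = false;
    bool MayLoad = InstRec->getValueAsBitOrUnset("mayLoad", Unset);
    if (Unset) {
      // Field was written as '?': infer the property from the pattern.
    } else if (MayLoad) {
      // Explicitly set to 1 in the .td file.
    }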
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index c406bca..87bd84e 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -343,8 +343,8 @@
bit isBarrier = 0; // Can control flow fall through this instruction?
bit isCall = 0; // Is this instruction a call instruction?
bit canFoldAsLoad = 0; // Can this be folded as a simple memory operand?
- bit mayLoad = 0; // Is it possible for this inst to read memory?
- bit mayStore = 0; // Is it possible for this inst to write memory?
+ bit mayLoad = ?; // Is it possible for this inst to read memory?
+ bit mayStore = ?; // Is it possible for this inst to write memory?
bit isConvertibleToThreeAddress = 0; // Can this 2-addr instruction promote?
bit isCommutable = 0; // Is this 3 operand instruction commutable?
bit isTerminator = 0; // Is this part of the terminator for a basic block?
@@ -369,7 +369,7 @@
//
// neverHasSideEffects - Set on an instruction with no pattern if it has no
// side effects.
- bit hasSideEffects = 0;
+ bit hasSideEffects = ?;
bit neverHasSideEffects = 0;
// Is this instruction a "real" instruction (with a distinct machine
@@ -602,23 +602,31 @@
///
def zero_reg;
+/// OperandWithDefaultOps - This Operand class can be used as the parent class
+/// for an Operand that needs to be initialized with a default value if
+/// no value is supplied in a pattern. This class can be used to simplify the
+/// pattern definitions for instructions that have target specific flags
+/// encoded as immediate operands.
+class OperandWithDefaultOps<ValueType ty, dag defaultops>
+ : Operand<ty> {
+ dag DefaultOps = defaultops;
+}
+
/// PredicateOperand - This can be used to define a predicate operand for an
/// instruction. OpTypes specifies the MIOperandInfo for the operand, and
/// AlwaysVal specifies the value of this predicate when set to "always
/// execute".
class PredicateOperand<ValueType ty, dag OpTypes, dag AlwaysVal>
- : Operand<ty> {
+ : OperandWithDefaultOps<ty, AlwaysVal> {
let MIOperandInfo = OpTypes;
- dag DefaultOps = AlwaysVal;
}
/// OptionalDefOperand - This is used to define a optional definition operand
/// for an instruction. DefaultOps is the register the operand represents if
/// none is supplied, e.g. zero_reg.
class OptionalDefOperand<ValueType ty, dag OpTypes, dag defaultops>
- : Operand<ty> {
+ : OperandWithDefaultOps<ty, defaultops> {
let MIOperandInfo = OpTypes;
- dag DefaultOps = defaultops;
}
@@ -631,6 +639,17 @@
// Sparc manual specifies its instructions in the format [31..0] (big), while
// PowerPC specifies them using the format [0..31] (little).
bit isLittleEndianEncoding = 0;
+
+ // The instruction properties mayLoad, mayStore, and hasSideEffects are unset
+ // by default, and TableGen will infer their value from the instruction
+ // pattern when possible.
+ //
+ // Normally, TableGen will issue an error if it can't infer the value of a
+ // property that hasn't been set explicitly. When guessInstructionProperties
+ // is set, it will guess a safe value instead.
+ //
+ // This option is a temporary migration aid. It will go away.
+ bit guessInstructionProperties = 1;
}
// Standard Pseudo Instructions.
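
On the C++ side nothing changes for consumers: TableGen resolves the '?' values (by inference or, under guessInstructionProperties, by guessing) before emitting MCInstrDesc, so the usual flag queries keep working. A small sketch of such a consumer:

    #include "llvm/CodeGen/MachineInstr.h"
    using namespace llvm;

    // Whether MI may touch memory; by this point the mayLoad/mayStore/
    // hasSideEffects bits are concrete, whether set explicitly or inferred.
    static bool mayAccessMemory(const MachineInstr &MI) {
      return MI.mayLoad() || MI.mayStore() || MI.hasUnmodeledSideEffects();
    }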
@@ -734,6 +753,18 @@
let InOperandList = (ins variable_ops);
let AsmString = "BUNDLE";
}
+def LIFETIME_START : Instruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins i32imm:$id);
+ let AsmString = "LIFETIME_START";
+ let neverHasSideEffects = 1;
+}
+def LIFETIME_END : Instruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins i32imm:$id);
+ let AsmString = "LIFETIME_END";
+ let neverHasSideEffects = 1;
+}
}
//===----------------------------------------------------------------------===//
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index da30ab8..d7cc1cf 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -459,6 +459,13 @@
}
/// copyPhysReg - Emit instructions to copy a pair of physical registers.
+ ///
+ /// This function should support copies within any legal register class as
+ /// well as any cross-class copies created during instruction selection.
+ ///
+ /// The source and destination registers may overlap, which may require a
+ /// careful implementation when multiple copy instructions are required for
+ /// large registers. See for example the ARM target.
virtual void copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
@@ -794,20 +801,6 @@
const MachineInstr *UseMI, unsigned UseIdx,
bool FindMin = false) const;
- /// computeOperandLatency - Compute and return the latency of the given data
- /// dependent def and use. DefMI must be a valid def. UseMI may be NULL for
- /// an unknown use. If the subtarget allows, this may or may not need to call
- /// getOperandLatency().
- ///
- /// FindMin may be set to get the minimum vs. expected latency. Minimum
- /// latency is used for scheduling groups, while expected latency is for
- /// instruction cost and critical path.
- unsigned computeOperandLatency(const InstrItineraryData *ItinData,
- const TargetRegisterInfo *TRI,
- const MachineInstr *DefMI,
- const MachineInstr *UseMI,
- unsigned Reg, bool FindMin) const;
-
/// getOutputLatency - Compute and return the output dependency latency of
/// a given pair of defs which both target the same register. This is usually
/// one.
diff --git a/include/llvm/Target/TargetLibraryInfo.h b/include/llvm/Target/TargetLibraryInfo.h
index ea2874f..2a0a432 100644
--- a/include/llvm/Target/TargetLibraryInfo.h
+++ b/include/llvm/Target/TargetLibraryInfo.h
@@ -18,6 +18,26 @@
namespace LibFunc {
enum Func {
+ /// void operator delete[](void*);
+ ZdaPv,
+ /// void operator delete(void*);
+ ZdlPv,
+ /// void *new[](unsigned int);
+ Znaj,
+ /// void *new[](unsigned int, nothrow);
+ ZnajRKSt9nothrow_t,
+ /// void *new[](unsigned long);
+ Znam,
+ /// void *new[](unsigned long, nothrow);
+ ZnamRKSt9nothrow_t,
+ /// void *new(unsigned int);
+ Znwj,
+ /// void *new(unsigned int, nothrow);
+ ZnwjRKSt9nothrow_t,
+ /// void *new(unsigned long);
+ Znwm,
+ /// void *new(unsigned long, nothrow);
+ ZnwmRKSt9nothrow_t,
/// int __cxa_atexit(void (*f)(void *), void *p, void *d);
cxa_atexit,
/// void __cxa_guard_abort(guard_t *guard);
@@ -33,12 +53,24 @@
acos,
/// float acosf(float x);
acosf,
+ /// double acosh(double x);
+ acosh,
+ /// float acoshf(float x);
+ acoshf,
+ /// long double acoshl(long double x);
+ acoshl,
/// long double acosl(long double x);
acosl,
/// double asin(double x);
asin,
/// float asinf(float x);
asinf,
+ /// double asinh(double x);
+ asinh,
+ /// float asinhf(float x);
+ asinhf,
+ /// long double asinhl(long double x);
+ asinhl,
/// long double asinl(long double x);
asinl,
/// double atan(double x);
@@ -51,8 +83,22 @@
atan2l,
/// float atanf(float x);
atanf,
+ /// double atanh(double x);
+ atanh,
+ /// float atanhf(float x);
+ atanhf,
+ /// long double atanhl(long double x);
+ atanhl,
/// long double atanl(long double x);
atanl,
+ /// void *calloc(size_t count, size_t size);
+ calloc,
+ /// double cbrt(double x);
+ cbrt,
+ /// float cbrtf(float x);
+ cbrtf,
+ /// long double cbrtl(long double x);
+ cbrtl,
/// double ceil(double x);
ceil,
/// float ceilf(float x);
@@ -79,6 +125,12 @@
cosl,
/// double exp(double x);
exp,
+ /// double exp10(double x);
+ exp10,
+ /// float exp10f(float x);
+ exp10f,
+ /// long double exp10l(long double x);
+ exp10l,
/// double exp2(double x);
exp2,
/// float exp2f(float x);
@@ -119,6 +171,8 @@
fputc,
/// int fputs(const char *s, FILE *stream);
fputs,
+ /// void free(void *ptr);
+ free,
/// size_t fwrite(const void *ptr, size_t size, size_t nitems,
/// FILE *stream);
fwrite,
@@ -144,10 +198,18 @@
log2f,
/// long double log2l(long double x);
log2l,
+ /// double logb(double x);
+ logb,
+ /// float logbf(float x);
+ logbf,
+ /// long double logbl(long double x);
+ logbl,
/// float logf(float x);
logf,
/// long double logl(long double x);
logl,
+ /// void *malloc(size_t size);
+ malloc,
/// void *memchr(const void *s, int c, size_t n);
memchr,
/// int memcmp(const void *s1, const void *s2, size_t n);
@@ -166,6 +228,8 @@
nearbyintf,
/// long double nearbyintl(long double x);
nearbyintl,
+ /// int posix_memalign(void **memptr, size_t alignment, size_t size);
+ posix_memalign,
/// double pow(double x, double y);
pow,
/// float powf(float x, float y);
@@ -176,6 +240,10 @@
putchar,
/// int puts(const char *s);
puts,
+ /// void *realloc(void *ptr, size_t size);
+ realloc,
+ /// void *reallocf(void *ptr, size_t size);
+ reallocf,
/// double rint(double x);
rint,
/// float rintf(float x);
@@ -214,6 +282,8 @@
strchr,
/// char *strcpy(char *s1, const char *s2);
strcpy,
+ /// char *strdup(const char *s1);
+ strdup,
/// size_t strlen(const char *s);
strlen,
/// char *strncat(char *s1, const char *s2, size_t n);
@@ -222,6 +292,8 @@
strncmp,
/// char *strncpy(char *s1, const char *s2, size_t n);
strncpy,
+ /// char *strndup(const char *s1, size_t n);
+ strndup,
/// size_t strnlen(const char *s, size_t maxlen);
strnlen,
/// double tan(double x);
@@ -242,6 +314,8 @@
truncf,
/// long double truncl(long double x);
truncl,
+ /// void *valloc(size_t size);
+ valloc,
NumLibFuncs
};
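
A sketch of how the new entries are meant to be consulted, mirroring the TargetLibraryInfo API this patch threads through the analyses (isKnownMalloc is illustrative):

    #include "llvm/Function.h"
    #include "llvm/Target/TargetLibraryInfo.h"
    using namespace llvm;

    // True only if F is recognized as malloc *and* the target actually
    // provides it; TLI.has() is what lets -fno-builtin-style configurations
    // opt functions out.
    static bool isKnownMalloc(const Function &F, const TargetLibraryInfo &TLI) {
      LibFunc::Func TLIFn;
      return TLI.getLibFunc(F.getName(), TLIFn) && TLI.has(TLIFn) &&
             TLIFn == LibFunc::malloc;
    }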
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index acf0419..ef63422 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -25,6 +25,7 @@
#include "llvm/CallingConv.h"
#include "llvm/InlineAsm.h"
#include "llvm/Attributes.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/CallSite.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
@@ -107,6 +108,14 @@
ZeroOrNegativeOneBooleanContent // All bits equal to bit 0.
};
+ enum SelectSupportKind {
+ ScalarValSelect, // The target supports scalar selects (ex: cmov).
+ ScalarCondVectorVal, // The target supports selects with a scalar condition
+ // and vector values (ex: cmov).
+ VectorMaskSelect // The target supports vector selects with a vector
+ // mask (ex: x86 blends).
+ };
+
static ISD::NodeType getExtendForContent(BooleanContent Content) {
switch (Content) {
case UndefinedBooleanContent:
@@ -140,10 +149,22 @@
/// this target.
bool isSelectExpensive() const { return SelectIsExpensive; }
+ virtual bool isSelectSupported(SelectSupportKind kind) const { return true; }
+
/// isIntDivCheap() - Return true if integer divide is usually cheaper than
/// a sequence of several shifts, adds, and multiplies for this target.
bool isIntDivCheap() const { return IntDivIsCheap; }
+ /// isSlowDivBypassed - Returns true if target has indicated at least one
+ /// type should be bypassed.
+ bool isSlowDivBypassed() const { return !BypassSlowDivTypes.empty(); }
+
+ /// getBypassSlowDivTypes - Returns the map of slow types for division or
+ /// remainder, keyed to their corresponding fast types.
+ const DenseMap<Type *, Type *> &getBypassSlowDivTypes() const {
+ return BypassSlowDivTypes;
+ }
+
/// isPow2DivCheap() - Return true if pow2 div is cheaper than a chain of
/// srl/add/sra.
bool isPow2DivCheap() const { return Pow2DivIsCheap; }
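
A hypothetical target override of the new hook (a sketch only; MyTargetLowering is illustrative, and a real subclass also needs the usual constructor plumbing):

    class MyTargetLowering : public TargetLowering {
    public:
      // Scalar selects and scalar-condition selects are fine; vector-mask
      // selects (x86-blend style) would have to be expanded.
      virtual bool isSelectSupported(SelectSupportKind Kind) const {
        return Kind != VectorMaskSelect;
      }
      // ... usual constructor and lowering plumbing elided ...
    };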
@@ -1045,6 +1066,11 @@
/// of instructions not containing an integer divide.
void setIntDivIsCheap(bool isCheap = true) { IntDivIsCheap = isCheap; }
+ /// addBypassSlowDivType - Tells the code generator which types to bypass.
+ void addBypassSlowDivType(Type *slow_type, Type *fast_type) {
+ BypassSlowDivTypes[slow_type] = fast_type;
+ }
+
/// setPow2DivIsCheap - Tells the code generator that it shouldn't generate
/// srl/add/sra for a signed divide by power of two, and let the target handle
/// it.
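
A sketch of the registration side, as a target's TargetLowering constructor might invoke it (illustrative: Ctx is an available LLVMContext; the Atom case motivates bypassing i32 div/rem with i8 div/rem):

    // Bypass slow 32-bit signed div/rem with 8-bit unsigned div/rem when
    // both operands are small enough.
    addBypassSlowDivType(Type::getInt32Ty(Ctx), Type::getInt8Ty(Ctx));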
@@ -1762,6 +1788,12 @@
/// set to true unconditionally.
bool IntDivIsCheap;
+ /// BypassSlowDivTypes - Tells the code generator to bypass slow divide or
+ /// remainder instructions. For example, BypassSlowDivTypes[i32] = u8 tells
+ /// the code generator to bypass 32-bit signed integer div/rem with an 8-bit
+ /// unsigned integer div/rem when the operands are positive and less than 256.
+ DenseMap<Type *, Type *> BypassSlowDivTypes;
+
/// Pow2DivIsCheap - Tells the code generator that it shouldn't generate
/// srl/add/sra for a signed divide by power of two, and let the target handle
/// it.
diff --git a/include/llvm/Target/TargetOpcodes.h b/include/llvm/Target/TargetOpcodes.h
index f0b181e..516e070 100644
--- a/include/llvm/Target/TargetOpcodes.h
+++ b/include/llvm/Target/TargetOpcodes.h
@@ -87,7 +87,11 @@
/// BUNDLE - This instruction represents an instruction bundle. Instructions
/// that immediately follow a BUNDLE instruction and are marked with the
/// 'InsideBundle' flag are inside the bundle.
- BUNDLE
+ BUNDLE = 14,
+
+ /// Lifetime markers.
+ LIFETIME_START = 15,
+ LIFETIME_END = 16
};
} // end namespace TargetOpcode
} // end namespace llvm
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index 3f81c06..83bd787 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -445,9 +445,9 @@
def atomic_load_umax : SDNode<"ISD::ATOMIC_LOAD_UMAX", SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load : SDNode<"ISD::ATOMIC_LOAD", SDTAtomicLoad,
- [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def atomic_store : SDNode<"ISD::ATOMIC_STORE", SDTAtomicStore,
- [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
// Do not use ld, st directly. Use load, extload, sextload, zextload, store,
// and truncst (see below).
diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h
index 8a939cc..2510aec 100644
--- a/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -27,6 +27,7 @@
class Instruction;
class Pass;
class ReturnInst;
+class TargetLibraryInfo;
/// DeleteDeadBlock - Delete the specified block, which must have no
/// predecessors.
@@ -44,7 +45,7 @@
/// a result. This includes tracing the def-use list from the PHI to see if
/// it is ultimately unused or if it reaches an unused cycle. Return true
/// if any PHIs were deleted.
-bool DeleteDeadPHIs(BasicBlock *BB);
+bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI = 0);
/// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor,
/// if possible. The return value indicates success or failure.
diff --git a/include/llvm/Transforms/Utils/BypassSlowDivision.h b/include/llvm/Transforms/Utils/BypassSlowDivision.h
new file mode 100644
index 0000000..c262434
--- /dev/null
+++ b/include/llvm/Transforms/Utils/BypassSlowDivision.h
@@ -0,0 +1,33 @@
+//===- llvm/Transforms/Utils/BypassSlowDivision.h --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an optimization for div and rem on architectures that
+// execute short instructions significantly faster than longer instructions.
+// For example, on Intel Atom 32-bit divides are slow enough that during
+// runtime it is profitable to check the value of the operands, and if they are
+// positive and less than 256 use an unsigned 8-bit divide.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H
+#define TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H
+
+#include "llvm/Function.h"
+
+namespace llvm {
+
+/// This optimization identifies DIV instructions that can be
+/// profitably bypassed and carried out with a shorter, faster divide.
+bool bypassSlowDivision(Function &F,
+ Function::iterator &I,
+ const DenseMap<Type *, Type *> &BypassTypeMap);
+
+} // End llvm namespace
+
+#endif
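
And a sketch of the intended driver loop on the other end (CodeGenPrepare-style), assuming a TargetLowering *TLI whose bypass map has been populated as above:

    // Illustrative driver: the block iterator is passed by reference
    // because the transform may split the current block.
    bool MadeChange = false;
    if (TLI && TLI->isSlowDivBypassed()) {
      const DenseMap<Type *, Type *> &BypassTypeMap =
          TLI->getBypassSlowDivTypes();
      for (Function::iterator I = F.begin(); I != F.end(); ++I)
        MadeChange |= bypassSlowDivision(F, I, BypassTypeMap);
    }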
diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index 495eab7..83f0e7a 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h
@@ -36,6 +36,7 @@
class AllocaInst;
class ConstantExpr;
class TargetData;
+class TargetLibraryInfo;
class DIBuilder;
template<typename T> class SmallVectorImpl;
@@ -51,7 +52,8 @@
/// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch
/// conditions and indirectbr addresses this might make dead if
/// DeleteDeadConditions is true.
-bool ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions = false);
+bool ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions = false,
+ const TargetLibraryInfo *TLI = 0);
//===----------------------------------------------------------------------===//
// Local dead code elimination.
@@ -60,20 +62,21 @@
/// isInstructionTriviallyDead - Return true if the result produced by the
/// instruction is not used, and the instruction has no side effects.
///
-bool isInstructionTriviallyDead(Instruction *I);
+bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=0);
/// RecursivelyDeleteTriviallyDeadInstructions - If the specified value is a
/// trivially dead instruction, delete it. If that makes any of its operands
/// trivially dead, delete them too, recursively. Return true if any
/// instructions were deleted.
-bool RecursivelyDeleteTriviallyDeadInstructions(Value *V);
+bool RecursivelyDeleteTriviallyDeadInstructions(Value *V,
+ const TargetLibraryInfo *TLI=0);
/// RecursivelyDeleteDeadPHINode - If the specified value is an effectively
/// dead PHI node, due to being a def-use chain of single-use nodes that
/// either forms a cycle or is terminated by a trivially dead instruction,
/// delete it. If that makes any of its operands trivially dead, delete them
/// too, recursively. Return true if a change was made.
-bool RecursivelyDeleteDeadPHINode(PHINode *PN);
+bool RecursivelyDeleteDeadPHINode(PHINode *PN, const TargetLibraryInfo *TLI=0);
/// SimplifyInstructionsInBlock - Scan the specified basic block and try to
@@ -81,7 +84,8 @@
///
/// This returns true if it changed the code, note that it can delete
/// instructions in other blocks as well in this block.
-bool SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD = 0);
+bool SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD = 0,
+ const TargetLibraryInfo *TLI = 0);
//===----------------------------------------------------------------------===//
// Control Flow Graph Restructuring.
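
A sketch of what the threaded-through parameter buys: with a TargetLibraryInfo supplied, a dead call to a recognized allocation function can now count as trivially dead, while the default TLI = 0 keeps the old conservative behavior.

    // Delete trivially dead instructions in BB, allocator calls included.
    static bool deleteDeadInBlock(BasicBlock *BB,
                                  const TargetLibraryInfo *TLI) {
      bool Changed = false;
      for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ) {
        Instruction *Inst = BI++; // advance first: we may erase Inst
        if (isInstructionTriviallyDead(Inst, TLI)) {
          Inst->eraseFromParent();
          Changed = true;
        }
      }
      return Changed;
    }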
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index 3b6aab1..f768eec 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -36,6 +36,7 @@
#include "llvm/LLVMContext.h"
#include "llvm/Type.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
using namespace llvm;
// Register the AliasAnalysis interface, providing a nice name to refer to.
@@ -452,6 +453,7 @@
///
void AliasAnalysis::InitializeAliasAnalysis(Pass *P) {
TD = P->getAnalysisIfAvailable<TargetData>();
+ TLI = P->getAnalysisIfAvailable<TargetLibraryInfo>();
AA = &P->getAnalysis<AliasAnalysis>();
}
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index 92e8906..e9dcb37 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -550,7 +550,7 @@
//===----------------------------------------------------------------------===//
void AliasSet::print(raw_ostream &OS) const {
- OS << " AliasSet[" << (void*)this << ", " << RefCount << "] ";
+ OS << " AliasSet[" << (const void*)this << ", " << RefCount << "] ";
OS << (AliasTy == MustAlias ? "must" : "may") << " alias, ";
switch (AccessTy) {
case NoModRef: OS << "No access "; break;
@@ -590,8 +590,10 @@
OS << "\n";
}
+#ifndef NDEBUG
void AliasSet::dump() const { print(dbgs()); }
void AliasSetTracker::dump() const { print(dbgs()); }
+#endif
//===----------------------------------------------------------------------===//
// ASTCallbackVH Class Implementation
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index 0ba6af9..87a75fd 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp
@@ -61,6 +61,7 @@
initializePathProfileLoaderPassPass(Registry);
initializeProfileVerifierPassPass(Registry);
initializePathProfileVerifierPass(Registry);
+ initializeProfileMetadataLoaderPassPass(Registry);
initializeRegionInfoPass(Registry);
initializeRegionViewerPass(Registry);
initializeRegionPrinterPass(Registry);
diff --git a/lib/Analysis/Android.mk b/lib/Analysis/Android.mk
index 19c58f3..2ebf091 100644
--- a/lib/Analysis/Android.mk
+++ b/lib/Analysis/Android.mk
@@ -41,6 +41,8 @@
PathProfileInfo.cpp \
PathProfileVerifier.cpp \
PostDominators.cpp \
+ ProfileDataLoader.cpp \
+ ProfileDataLoaderPass.cpp \
ProfileEstimatorPass.cpp \
ProfileInfo.cpp \
ProfileInfoLoader.cpp \
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 1d028c2..a3bc06a 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -85,9 +85,10 @@
/// getObjectSize - Return the size of the object specified by V, or
/// UnknownSize if unknown.
static uint64_t getObjectSize(const Value *V, const TargetData &TD,
+ const TargetLibraryInfo &TLI,
bool RoundToAlign = false) {
uint64_t Size;
- if (getObjectSize(V, Size, &TD, RoundToAlign))
+ if (getObjectSize(V, Size, &TD, &TLI, RoundToAlign))
return Size;
return AliasAnalysis::UnknownSize;
}
@@ -95,10 +96,11 @@
/// isObjectSmallerThan - Return true if we can prove that the object specified
/// by V is smaller than Size.
static bool isObjectSmallerThan(const Value *V, uint64_t Size,
- const TargetData &TD) {
+ const TargetData &TD,
+ const TargetLibraryInfo &TLI) {
// This function needs to use the aligned object size because we allow
// reads a bit past the end given sufficient alignment.
- uint64_t ObjectSize = getObjectSize(V, TD, /*RoundToAlign*/true);
+ uint64_t ObjectSize = getObjectSize(V, TD, TLI, /*RoundToAlign*/true);
return ObjectSize != AliasAnalysis::UnknownSize && ObjectSize < Size;
}
@@ -106,8 +108,8 @@
/// isObjectSize - Return true if we can prove that the object specified
/// by V has size Size.
static bool isObjectSize(const Value *V, uint64_t Size,
- const TargetData &TD) {
- uint64_t ObjectSize = getObjectSize(V, TD);
+ const TargetData &TD, const TargetLibraryInfo &TLI) {
+ uint64_t ObjectSize = getObjectSize(V, TD, TLI);
return ObjectSize != AliasAnalysis::UnknownSize && ObjectSize == Size;
}
@@ -126,6 +128,15 @@
const Value *V;
ExtensionKind Extension;
int64_t Scale;
+
+ bool operator==(const VariableGEPIndex &Other) const {
+ return V == Other.V && Extension == Other.Extension &&
+ Scale == Other.Scale;
+ }
+
+ bool operator!=(const VariableGEPIndex &Other) const {
+ return !operator==(Other);
+ }
};
}
@@ -417,13 +428,7 @@
/// BasicAliasAnalysis - This is the primary alias analysis implementation.
struct BasicAliasAnalysis : public ImmutablePass, public AliasAnalysis {
static char ID; // Class identification, replacement for typeinfo
- BasicAliasAnalysis() : ImmutablePass(ID),
- // AliasCache rarely has more than 1 or 2 elements,
- // so start it off fairly small so that clear()
- // doesn't have to tromp through 64 (the default)
- // elements on each alias query. This really wants
- // something like a SmallDenseMap.
- AliasCache(8) {
+ BasicAliasAnalysis() : ImmutablePass(ID) {
initializeBasicAliasAnalysisPass(*PassRegistry::getPassRegistry());
}
@@ -443,7 +448,11 @@
"BasicAliasAnalysis doesn't support interprocedural queries.");
AliasResult Alias = aliasCheck(LocA.Ptr, LocA.Size, LocA.TBAATag,
LocB.Ptr, LocB.Size, LocB.TBAATag);
- AliasCache.clear();
+ // AliasCache rarely has more than 1 or 2 elements; always use
+ // shrink_and_clear so it quickly returns to the inline capacity of the
+ // SmallDenseMap if it ever grows larger.
+ // FIXME: This should really be shrink_to_inline_capacity_and_clear().
+ AliasCache.shrink_and_clear();
return Alias;
}
@@ -481,7 +490,7 @@
private:
// AliasCache - Track alias queries to guard against recursion.
typedef std::pair<Location, Location> LocPair;
- typedef DenseMap<LocPair, AliasResult> AliasCacheTy;
+ typedef SmallDenseMap<LocPair, AliasResult, 8> AliasCacheTy;
AliasCacheTy AliasCache;
// Visited - Track instructions visited by pointsToConstantMemory.
@@ -490,6 +499,7 @@
// aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP
// instruction against another.
AliasResult aliasGEP(const GEPOperator *V1, uint64_t V1Size,
+ const MDNode *V1TBAAInfo,
const Value *V2, uint64_t V2Size,
const MDNode *V2TBAAInfo,
const Value *UnderlyingV1, const Value *UnderlyingV2);
@@ -807,6 +817,21 @@
return ModRefResult(AliasAnalysis::getModRefInfo(CS, Loc) & Min);
}
+static bool areVarIndicesEqual(SmallVector<VariableGEPIndex, 4> &Indices1,
+ SmallVector<VariableGEPIndex, 4> &Indices2) {
+ unsigned Size1 = Indices1.size();
+ unsigned Size2 = Indices2.size();
+
+ if (Size1 != Size2)
+ return false;
+
+ for (unsigned I = 0; I != Size1; ++I)
+ if (Indices1[I] != Indices2[I])
+ return false;
+
+ return true;
+}
+
/// aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP instruction
/// against another pointer. We know that V1 is a GEP, but we don't know
/// anything about V2. UnderlyingV1 is GetUnderlyingObject(GEP1, TD),
@@ -814,6 +839,7 @@
///
AliasAnalysis::AliasResult
BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
+ const MDNode *V1TBAAInfo,
const Value *V2, uint64_t V2Size,
const MDNode *V2TBAAInfo,
const Value *UnderlyingV1,
@@ -821,9 +847,41 @@
int64_t GEP1BaseOffset;
SmallVector<VariableGEPIndex, 4> GEP1VariableIndices;
- // If we have two gep instructions with must-alias'ing base pointers, figure
- // out if the indexes to the GEP tell us anything about the derived pointer.
+ // If we have two gep instructions with must-aliasing or non-aliasing base
+ // pointers, figure out if the indexes to the GEP tell us anything about the
+ // derived pointer.
if (const GEPOperator *GEP2 = dyn_cast<GEPOperator>(V2)) {
+ // Check for geps of non-aliasing underlying pointers where the offsets are
+ // identical.
+ if (V1Size == V2Size) {
+ // Do the base pointers alias when type and size are taken into account?
+ AliasResult PreciseBaseAlias = aliasCheck(UnderlyingV1, V1Size,
+ V1TBAAInfo, UnderlyingV2,
+ V2Size, V2TBAAInfo);
+ if (PreciseBaseAlias == NoAlias) {
+ // See if the computed offset from the common pointer tells us about the
+ // relation of the resulting pointer.
+ int64_t GEP2BaseOffset;
+ SmallVector<VariableGEPIndex, 4> GEP2VariableIndices;
+ const Value *GEP2BasePtr =
+ DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices, TD);
+ const Value *GEP1BasePtr =
+ DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, TD);
+ // DecomposeGEPExpression and GetUnderlyingObject should return the
+ // same result except when DecomposeGEPExpression has no TargetData.
+ if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) {
+ assert(TD == 0 &&
+ "DecomposeGEPExpression and GetUnderlyingObject disagree!");
+ return MayAlias;
+ }
+ // Same offsets.
+ if (GEP1BaseOffset == GEP2BaseOffset &&
+ areVarIndicesEqual(GEP1VariableIndices, GEP2VariableIndices))
+ return NoAlias;
+ GEP1VariableIndices.clear();
+ }
+ }
+
// Do the base pointers alias?
AliasResult BaseAlias = aliasCheck(UnderlyingV1, UnknownSize, 0,
UnderlyingV2, UnknownSize, 0);
@@ -843,9 +901,8 @@
const Value *GEP2BasePtr =
DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices, TD);
- // If DecomposeGEPExpression isn't able to look all the way through the
- // addressing operation, we must not have TD and this is too complex for us
- // to handle without it.
+ // DecomposeGEPExpression and GetUnderlyingObject should return the
+ // same result except when DecomposeGEPExpression has no TargetData.
if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) {
assert(TD == 0 &&
"DecomposeGEPExpression and GetUnderlyingObject disagree!");
@@ -879,9 +936,8 @@
const Value *GEP1BasePtr =
DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, TD);
- // If DecomposeGEPExpression isn't able to look all the way through the
- // addressing operation, we must not have TD and this is too complex for us
- // to handle without it.
+ // DecomposeGEPExpression and GetUnderlyingObject should return the
+ // same result except when DecomposeGEPExpression has no TargetData.
if (GEP1BasePtr != UnderlyingV1) {
assert(TD == 0 &&
"DecomposeGEPExpression and GetUnderlyingObject disagree!");
@@ -1004,12 +1060,42 @@
// on corresponding edges.
if (const PHINode *PN2 = dyn_cast<PHINode>(V2))
if (PN2->getParent() == PN->getParent()) {
+ LocPair Locs(Location(PN, PNSize, PNTBAAInfo),
+ Location(V2, V2Size, V2TBAAInfo));
+ if (PN > V2)
+ std::swap(Locs.first, Locs.second);
+
AliasResult Alias =
aliasCheck(PN->getIncomingValue(0), PNSize, PNTBAAInfo,
PN2->getIncomingValueForBlock(PN->getIncomingBlock(0)),
V2Size, V2TBAAInfo);
if (Alias == MayAlias)
return MayAlias;
+
+ // If the first sources of the PHI nodes are NoAlias and the other inputs
+ // are the PHI nodes themselves through some amount of recursion, this does
+ // not add any new information, so just return NoAlias.
+ // bb:
+ // ptr = ptr2 + 1
+ // loop:
+ // ptr_phi = phi [bb, ptr], [loop, ptr_plus_one]
+ // ptr2_phi = phi [bb, ptr2], [loop, ptr2_plus_one]
+ // ...
+ // ptr_plus_one = gep ptr_phi, 1
+ // ptr2_plus_one = gep ptr2_phi, 1
+ // We assume for the recursion that the phis (ptr_phi, ptr2_phi) do
+ // not alias each other.
+ bool ArePhisAssumedNoAlias = false;
+ AliasResult OrigAliasResult;
+ if (Alias == NoAlias) {
+ // Pretend the phis do not alias.
+ assert(AliasCache.count(Locs) &&
+ "There must exist an entry for the phi node");
+ OrigAliasResult = AliasCache[Locs];
+ AliasCache[Locs] = NoAlias;
+ ArePhisAssumedNoAlias = true;
+ }
+
for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) {
AliasResult ThisAlias =
aliasCheck(PN->getIncomingValue(i), PNSize, PNTBAAInfo,
@@ -1019,6 +1105,11 @@
if (Alias == MayAlias)
break;
}
+
+ // Reset if speculation failed.
+ if (ArePhisAssumedNoAlias && Alias != NoAlias)
+ AliasCache[Locs] = OrigAliasResult;
+
return Alias;
}
@@ -1133,8 +1224,8 @@
// If the size of one access is larger than the entire object on the other
// side, then we know such behavior is undefined and can assume no alias.
if (TD)
- if ((V1Size != UnknownSize && isObjectSmallerThan(O2, V1Size, *TD)) ||
- (V2Size != UnknownSize && isObjectSmallerThan(O1, V2Size, *TD)))
+ if ((V1Size != UnknownSize && isObjectSmallerThan(O2, V1Size, *TD, *TLI)) ||
+ (V2Size != UnknownSize && isObjectSmallerThan(O1, V2Size, *TD, *TLI)))
return NoAlias;
// Check the cache before climbing up use-def chains. This also terminates
@@ -1156,7 +1247,7 @@
std::swap(O1, O2);
}
if (const GEPOperator *GV1 = dyn_cast<GEPOperator>(V1)) {
- AliasResult Result = aliasGEP(GV1, V1Size, V2, V2Size, V2TBAAInfo, O1, O2);
+ AliasResult Result = aliasGEP(GV1, V1Size, V1TBAAInfo, V2, V2Size, V2TBAAInfo, O1, O2);
if (Result != MayAlias) return AliasCache[Locs] = Result;
}
@@ -1184,8 +1275,8 @@
// accesses is accessing the entire object, then the accesses must
// overlap in some way.
if (TD && O1 == O2)
- if ((V1Size != UnknownSize && isObjectSize(O1, V1Size, *TD)) ||
- (V2Size != UnknownSize && isObjectSize(O2, V2Size, *TD)))
+ if ((V1Size != UnknownSize && isObjectSize(O1, V1Size, *TD, *TLI)) ||
+ (V2Size != UnknownSize && isObjectSize(O2, V2Size, *TD, *TLI)))
return AliasCache[Locs] = PartialAlias;
AliasResult Result =
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index b255ce6..04a6560 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -115,14 +115,14 @@
return false;
}
- SmallPtrSet<BasicBlock *, 4> UnreachableEdges;
- SmallPtrSet<BasicBlock *, 4> ReachableEdges;
+ SmallVector<unsigned, 4> UnreachableEdges;
+ SmallVector<unsigned, 4> ReachableEdges;
for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
if (PostDominatedByUnreachable.count(*I))
- UnreachableEdges.insert(*I);
+ UnreachableEdges.push_back(I.getSuccessorIndex());
else
- ReachableEdges.insert(*I);
+ ReachableEdges.push_back(I.getSuccessorIndex());
}
// If all successors are in the set of blocks post-dominated by unreachable,
@@ -136,18 +136,19 @@
return false;
uint32_t UnreachableWeight =
- std::max(UR_TAKEN_WEIGHT / UnreachableEdges.size(), MIN_WEIGHT);
- for (SmallPtrSet<BasicBlock *, 4>::iterator I = UnreachableEdges.begin(),
- E = UnreachableEdges.end();
+ std::max(UR_TAKEN_WEIGHT / (unsigned)UnreachableEdges.size(), MIN_WEIGHT);
+ for (SmallVector<unsigned, 4>::iterator I = UnreachableEdges.begin(),
+ E = UnreachableEdges.end();
I != E; ++I)
setEdgeWeight(BB, *I, UnreachableWeight);
if (ReachableEdges.empty())
return true;
uint32_t ReachableWeight =
- std::max(UR_NONTAKEN_WEIGHT / ReachableEdges.size(), NORMAL_WEIGHT);
- for (SmallPtrSet<BasicBlock *, 4>::iterator I = ReachableEdges.begin(),
- E = ReachableEdges.end();
+ std::max(UR_NONTAKEN_WEIGHT / (unsigned)ReachableEdges.size(),
+ NORMAL_WEIGHT);
+ for (SmallVector<unsigned, 4>::iterator I = ReachableEdges.begin(),
+ E = ReachableEdges.end();
I != E; ++I)
setEdgeWeight(BB, *I, ReachableWeight);
@@ -187,7 +188,7 @@
}
assert(Weights.size() == TI->getNumSuccessors() && "Checked above");
for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- setEdgeWeight(BB, TI->getSuccessor(i), Weights[i]);
+ setEdgeWeight(BB, i, Weights[i]);
return true;
}
@@ -211,19 +212,17 @@
assert(CI->getOperand(1)->getType()->isPointerTy());
- BasicBlock *Taken = BI->getSuccessor(0);
- BasicBlock *NonTaken = BI->getSuccessor(1);
-
// p != 0 -> isProb = true
// p == 0 -> isProb = false
// p != q -> isProb = true
// p == q -> isProb = false;
+ unsigned TakenIdx = 0, NonTakenIdx = 1;
bool isProb = CI->getPredicate() == ICmpInst::ICMP_NE;
if (!isProb)
- std::swap(Taken, NonTaken);
+ std::swap(TakenIdx, NonTakenIdx);
- setEdgeWeight(BB, Taken, PH_TAKEN_WEIGHT);
- setEdgeWeight(BB, NonTaken, PH_NONTAKEN_WEIGHT);
+ setEdgeWeight(BB, TakenIdx, PH_TAKEN_WEIGHT);
+ setEdgeWeight(BB, NonTakenIdx, PH_NONTAKEN_WEIGHT);
return true;
}
@@ -234,17 +233,17 @@
if (!L)
return false;
- SmallPtrSet<BasicBlock *, 8> BackEdges;
- SmallPtrSet<BasicBlock *, 8> ExitingEdges;
- SmallPtrSet<BasicBlock *, 8> InEdges; // Edges from header to the loop.
+ SmallVector<unsigned, 8> BackEdges;
+ SmallVector<unsigned, 8> ExitingEdges;
+ SmallVector<unsigned, 8> InEdges; // Edges from header to the loop.
for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
if (!L->contains(*I))
- ExitingEdges.insert(*I);
+ ExitingEdges.push_back(I.getSuccessorIndex());
else if (L->getHeader() == *I)
- BackEdges.insert(*I);
+ BackEdges.push_back(I.getSuccessorIndex());
else
- InEdges.insert(*I);
+ InEdges.push_back(I.getSuccessorIndex());
}
if (uint32_t numBackEdges = BackEdges.size()) {
@@ -252,10 +251,9 @@
if (backWeight < NORMAL_WEIGHT)
backWeight = NORMAL_WEIGHT;
- for (SmallPtrSet<BasicBlock *, 8>::iterator EI = BackEdges.begin(),
+ for (SmallVector<unsigned, 8>::iterator EI = BackEdges.begin(),
EE = BackEdges.end(); EI != EE; ++EI) {
- BasicBlock *Back = *EI;
- setEdgeWeight(BB, Back, backWeight);
+ setEdgeWeight(BB, *EI, backWeight);
}
}
@@ -264,10 +262,9 @@
if (inWeight < NORMAL_WEIGHT)
inWeight = NORMAL_WEIGHT;
- for (SmallPtrSet<BasicBlock *, 8>::iterator EI = InEdges.begin(),
+ for (SmallVector<unsigned, 8>::iterator EI = InEdges.begin(),
EE = InEdges.end(); EI != EE; ++EI) {
- BasicBlock *Back = *EI;
- setEdgeWeight(BB, Back, inWeight);
+ setEdgeWeight(BB, *EI, inWeight);
}
}
@@ -276,10 +273,9 @@
if (exitWeight < MIN_WEIGHT)
exitWeight = MIN_WEIGHT;
- for (SmallPtrSet<BasicBlock *, 8>::iterator EI = ExitingEdges.begin(),
+ for (SmallVector<unsigned, 8>::iterator EI = ExitingEdges.begin(),
EE = ExitingEdges.end(); EI != EE; ++EI) {
- BasicBlock *Exiting = *EI;
- setEdgeWeight(BB, Exiting, exitWeight);
+ setEdgeWeight(BB, *EI, exitWeight);
}
}
@@ -335,14 +331,13 @@
return false;
}
- BasicBlock *Taken = BI->getSuccessor(0);
- BasicBlock *NonTaken = BI->getSuccessor(1);
+ unsigned TakenIdx = 0, NonTakenIdx = 1;
if (!isProb)
- std::swap(Taken, NonTaken);
+ std::swap(TakenIdx, NonTakenIdx);
- setEdgeWeight(BB, Taken, ZH_TAKEN_WEIGHT);
- setEdgeWeight(BB, NonTaken, ZH_NONTAKEN_WEIGHT);
+ setEdgeWeight(BB, TakenIdx, ZH_TAKEN_WEIGHT);
+ setEdgeWeight(BB, NonTakenIdx, ZH_NONTAKEN_WEIGHT);
return true;
}
@@ -372,14 +367,13 @@
return false;
}
- BasicBlock *Taken = BI->getSuccessor(0);
- BasicBlock *NonTaken = BI->getSuccessor(1);
+ unsigned TakenIdx = 0, NonTakenIdx = 1;
if (!isProb)
- std::swap(Taken, NonTaken);
+ std::swap(TakenIdx, NonTakenIdx);
- setEdgeWeight(BB, Taken, FPH_TAKEN_WEIGHT);
- setEdgeWeight(BB, NonTaken, FPH_NONTAKEN_WEIGHT);
+ setEdgeWeight(BB, TakenIdx, FPH_TAKEN_WEIGHT);
+ setEdgeWeight(BB, NonTakenIdx, FPH_NONTAKEN_WEIGHT);
return true;
}
@@ -389,11 +383,8 @@
if (!II)
return false;
- BasicBlock *Normal = II->getNormalDest();
- BasicBlock *Unwind = II->getUnwindDest();
-
- setEdgeWeight(BB, Normal, IH_TAKEN_WEIGHT);
- setEdgeWeight(BB, Unwind, IH_NONTAKEN_WEIGHT);
+ setEdgeWeight(BB, 0/*Index for Normal*/, IH_TAKEN_WEIGHT);
+ setEdgeWeight(BB, 1/*Index for Unwind*/, IH_NONTAKEN_WEIGHT);
return true;
}
@@ -450,8 +441,7 @@
uint32_t Sum = 0;
for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
- const BasicBlock *Succ = *I;
- uint32_t Weight = getEdgeWeight(BB, Succ);
+ uint32_t Weight = getEdgeWeight(BB, I.getSuccessorIndex());
uint32_t PrevSum = Sum;
Sum += Weight;
@@ -494,11 +484,13 @@
return 0;
}
-// Return edge's weight. If can't find it, return DEFAULT_WEIGHT value.
+/// Get the raw edge weight for the edge. If the weight is not found, return
+/// the DEFAULT_WEIGHT value. Here an edge is specified using its source block
+/// and an index into its successors.
uint32_t BranchProbabilityInfo::
-getEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst) const {
- Edge E(Src, Dst);
- DenseMap<Edge, uint32_t>::const_iterator I = Weights.find(E);
+getEdgeWeight(const BasicBlock *Src, unsigned IndexInSuccessors) const {
+ DenseMap<Edge, uint32_t>::const_iterator I =
+ Weights.find(std::make_pair(Src, IndexInSuccessors));
if (I != Weights.end())
return I->second;
@@ -506,15 +498,43 @@
return DEFAULT_WEIGHT;
}
-void BranchProbabilityInfo::
-setEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst, uint32_t Weight) {
- Weights[std::make_pair(Src, Dst)] = Weight;
- DEBUG(dbgs() << "set edge " << Src->getName() << " -> "
- << Dst->getName() << " weight to " << Weight
- << (isEdgeHot(Src, Dst) ? " [is HOT now]\n" : "\n"));
+/// Get the raw edge weight calculated for the block pair. This returns the sum
+/// of all raw edge weights from Src to Dst.
+uint32_t BranchProbabilityInfo::
+getEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst) const {
+ uint32_t Weight = 0;
+ DenseMap<Edge, uint32_t>::const_iterator MapI;
+ for (succ_const_iterator I = succ_begin(Src), E = succ_end(Src); I != E; ++I)
+ if (*I == Dst) {
+ MapI = Weights.find(std::make_pair(Src, I.getSuccessorIndex()));
+ if (MapI != Weights.end())
+ Weight += MapI->second;
+ }
+ return (Weight == 0) ? DEFAULT_WEIGHT : Weight;
}
+/// Set the edge weight for a given edge specified by its source block and an
+/// index into its successors.
+void BranchProbabilityInfo::
+setEdgeWeight(const BasicBlock *Src, unsigned IndexInSuccessors,
+ uint32_t Weight) {
+ Weights[std::make_pair(Src, IndexInSuccessors)] = Weight;
+ DEBUG(dbgs() << "set edge " << Src->getName() << " -> "
+ << IndexInSuccessors << " successor weight to "
+ << Weight << "\n");
+}
+/// Get an edge's probability, relative to other out-edges from Src.
+BranchProbability BranchProbabilityInfo::
+getEdgeProbability(const BasicBlock *Src, unsigned IndexInSuccessors) const {
+ uint32_t N = getEdgeWeight(Src, IndexInSuccessors);
+ uint32_t D = getSumForBlock(Src);
+
+ return BranchProbability(N, D);
+}
+
+/// Get the probability of going from Src to Dst. It returns the sum of all
+/// probabilities for edges from Src to Dst.
BranchProbability BranchProbabilityInfo::
getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const {
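
A sketch of a consumer of the index-based API; keying weights by (block, successor index) keeps duplicate successors (a conditional branch whose arms both reach the same block) from colliding in the map:

    // Sum of raw out-edge weights from BB, one term per successor slot.
    static uint32_t sumOutWeights(const BranchProbabilityInfo &BPI,
                                  const BasicBlock *BB) {
      uint32_t Sum = 0;
      for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB);
           I != E; ++I)
        Sum += BPI.getEdgeWeight(BB, I.getSuccessorIndex());
      return Sum;
    }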
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 96e68b4..e461848 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -44,6 +44,8 @@
ProfileInfoLoader.cpp
ProfileInfoLoaderPass.cpp
ProfileVerifierPass.cpp
+ ProfileDataLoader.cpp
+ ProfileDataLoaderPass.cpp
RegionInfo.cpp
RegionPass.cpp
RegionPrinter.cpp
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index f5e619c..4ad613c 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -659,7 +659,8 @@
unsigned BitWidth = TD->getTypeSizeInBits(IntPtrTy);
APInt Offset =
APInt(BitWidth, TD->getIndexedOffset(Ptr->getType(),
- makeArrayRef((Value **)Ops.data() + 1,
+ makeArrayRef((Value *const*)
+ Ops.data() + 1,
Ops.size() - 1)));
Ptr = StripPtrCastKeepAS(Ptr);
diff --git a/lib/Analysis/DominanceFrontier.cpp b/lib/Analysis/DominanceFrontier.cpp
index 1604576..5536a9b 100644
--- a/lib/Analysis/DominanceFrontier.cpp
+++ b/lib/Analysis/DominanceFrontier.cpp
@@ -133,7 +133,9 @@
}
}
+#ifndef NDEBUG
void DominanceFrontierBase::dump() const {
print(dbgs());
}
+#endif
diff --git a/lib/Analysis/IPA/CallGraph.cpp b/lib/Analysis/IPA/CallGraph.cpp
index 0df3e8a..947ad51 100644
--- a/lib/Analysis/IPA/CallGraph.cpp
+++ b/lib/Analysis/IPA/CallGraph.cpp
@@ -198,9 +198,11 @@
for (CallGraph::const_iterator I = begin(), E = end(); I != E; ++I)
I->second->print(OS);
}
+#ifndef NDEBUG
void CallGraph::dump() const {
print(dbgs(), 0);
}
+#endif
//===----------------------------------------------------------------------===//
// Implementations of public modification methods
@@ -267,7 +269,9 @@
OS << '\n';
}
+#ifndef NDEBUG
void CallGraphNode::dump() const { print(dbgs()); }
+#endif
/// removeCallEdgeFor - This method removes the edge in the node for the
/// specified call site. Note that this method takes linear time, so it
diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp
index 22f6e96..990caa8 100644
--- a/lib/Analysis/IPA/GlobalsModRef.cpp
+++ b/lib/Analysis/IPA/GlobalsModRef.cpp
@@ -263,7 +263,7 @@
} else if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
if (AnalyzeUsesOfPointer(BCI, Readers, Writers, OkayStoreDest))
return true;
- } else if (isFreeCall(U)) {
+ } else if (isFreeCall(U, TLI)) {
Writers.push_back(cast<Instruction>(U)->getParent()->getParent());
} else if (CallInst *CI = dyn_cast<CallInst>(U)) {
// Make sure that this is just the function being called, not that it is
@@ -329,7 +329,7 @@
// Check the value being stored.
Value *Ptr = GetUnderlyingObject(SI->getOperand(0));
- if (!isAllocLikeFn(Ptr))
+ if (!isAllocLikeFn(Ptr, TLI))
return false; // Too hard to analyze.
// Analyze all uses of the allocation. If any of them are used in a
@@ -458,7 +458,7 @@
if (SI->isVolatile())
// Treat volatile stores as reading memory somewhere.
FunctionEffect |= Ref;
- } else if (isAllocationFn(&*II) || isFreeCall(&*II)) {
+ } else if (isAllocationFn(&*II, TLI) || isFreeCall(&*II, TLI)) {
FunctionEffect |= ModRef;
} else if (IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(&*II)) {
// The callgraph doesn't include intrinsic calls.
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index 0a6682a..f705181 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -273,9 +273,11 @@
}
}
+#ifndef NDEBUG
void IVUsers::dump() const {
print(dbgs());
}
+#endif
void IVUsers::releaseMemory() {
Processed.clear();
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index bc1ecd2..12be7fd 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -974,6 +974,7 @@
return AlwaysInline || Cost < Threshold;
}
+#ifndef NDEBUG
/// \brief Dump stats about this call's analysis.
void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) llvm::dbgs() << " " #x ": " << x << "\n"
@@ -987,6 +988,7 @@
DEBUG_PRINT_STAT(SROACostSavingsLost);
#undef DEBUG_PRINT_STAT
}
+#endif
InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, int Threshold) {
return getInlineCost(CS, CS.getCalledFunction(), Threshold);
diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp
index 9140786..ec618fa 100644
--- a/lib/Analysis/LazyValueInfo.cpp
+++ b/lib/Analysis/LazyValueInfo.cpp
@@ -470,8 +470,10 @@
return true;
LVIValueHandle ValHandle(Val, this);
- if (!ValueCache.count(ValHandle)) return false;
- return ValueCache[ValHandle].count(BB);
+ std::map<LVIValueHandle, ValueCacheEntryTy>::iterator I =
+ ValueCache.find(ValHandle);
+ if (I == ValueCache.end()) return false;
+ return I->second.count(BB);
}
LVILatticeVal LazyValueInfoCache::getBlockValue(Value *Val, BasicBlock *BB) {
@@ -845,9 +847,12 @@
for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
i != e; ++i) {
ConstantRange EdgeVal(i.getCaseValue()->getValue());
- if (DefaultCase)
- EdgesVals = EdgesVals.difference(EdgeVal);
- else if (i.getCaseSuccessor() == BBTo)
+ if (DefaultCase) {
+ // It is possible that the default destination is the destination of
+ // some cases. There is no need to perform the difference for those cases.
+ if (i.getCaseSuccessor() != BBTo)
+ EdgesVals = EdgesVals.difference(EdgeVal);
+ } else if (i.getCaseSuccessor() == BBTo)
EdgesVals = EdgesVals.unionWith(EdgeVal);
}
Result = LVILatticeVal::getRange(EdgesVals);
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index 20c33a3..4a18104 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -306,9 +306,11 @@
return 0;
}
+#ifndef NDEBUG
void Loop::dump() const {
print(dbgs());
}
+#endif
//===----------------------------------------------------------------------===//
// UnloopUpdater implementation
@@ -429,8 +431,8 @@
Unloop->removeChildLoop(llvm::prior(Unloop->end()));
assert(SubloopParents.count(Subloop) && "DFS failed to visit subloop");
- if (SubloopParents[Subloop])
- SubloopParents[Subloop]->addChildLoop(Subloop);
+ if (Loop *Parent = SubloopParents[Subloop])
+ Parent->addChildLoop(Subloop);
else
LI->addTopLevelLoop(Subloop);
}
@@ -456,9 +458,8 @@
assert(Subloop && "subloop is not an ancestor of the original loop");
}
// Get the current nearest parent of the Subloop exits, initially Unloop.
- if (!SubloopParents.count(Subloop))
- SubloopParents[Subloop] = Unloop;
- NearLoop = SubloopParents[Subloop];
+ NearLoop =
+ SubloopParents.insert(std::make_pair(Subloop, Unloop)).first->second;
}
succ_iterator I = succ_begin(BB), E = succ_end(BB);
diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp
index b986b32..5b2313e 100644
--- a/lib/Analysis/MemoryBuiltins.cpp
+++ b/lib/Analysis/MemoryBuiltins.cpp
@@ -26,6 +26,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -39,7 +40,7 @@
};
struct AllocFnsTy {
- const char *Name;
+ LibFunc::Func Func;
AllocType AllocTy;
unsigned char NumParams;
// First and Second size parameters (or -1 if unused)
@@ -49,22 +50,22 @@
// FIXME: certain users need more information. E.g., SimplifyLibCalls needs to
// know which functions are nounwind, noalias, nocapture parameters, etc.
static const AllocFnsTy AllocationFnData[] = {
- {"malloc", MallocLike, 1, 0, -1},
- {"valloc", MallocLike, 1, 0, -1},
- {"_Znwj", MallocLike, 1, 0, -1}, // new(unsigned int)
- {"_ZnwjRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new(unsigned int, nothrow)
- {"_Znwm", MallocLike, 1, 0, -1}, // new(unsigned long)
- {"_ZnwmRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new(unsigned long, nothrow)
- {"_Znaj", MallocLike, 1, 0, -1}, // new[](unsigned int)
- {"_ZnajRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow)
- {"_Znam", MallocLike, 1, 0, -1}, // new[](unsigned long)
- {"_ZnamRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new[](unsigned long, nothrow)
- {"posix_memalign", MallocLike, 3, 2, -1},
- {"calloc", CallocLike, 2, 0, 1},
- {"realloc", ReallocLike, 2, 1, -1},
- {"reallocf", ReallocLike, 2, 1, -1},
- {"strdup", StrDupLike, 1, -1, -1},
- {"strndup", StrDupLike, 2, 1, -1}
+ {LibFunc::malloc, MallocLike, 1, 0, -1},
+ {LibFunc::valloc, MallocLike, 1, 0, -1},
+ {LibFunc::Znwj, MallocLike, 1, 0, -1}, // new(unsigned int)
+ {LibFunc::ZnwjRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new(unsigned int, nothrow)
+ {LibFunc::Znwm, MallocLike, 1, 0, -1}, // new(unsigned long)
+ {LibFunc::ZnwmRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new(unsigned long, nothrow)
+ {LibFunc::Znaj, MallocLike, 1, 0, -1}, // new[](unsigned int)
+ {LibFunc::ZnajRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow)
+ {LibFunc::Znam, MallocLike, 1, 0, -1}, // new[](unsigned long)
+ {LibFunc::ZnamRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned long, nothrow)
+ {LibFunc::posix_memalign, MallocLike, 3, 2, -1},
+ {LibFunc::calloc, CallocLike, 2, 0, 1},
+ {LibFunc::realloc, ReallocLike, 2, 1, -1},
+ {LibFunc::reallocf, ReallocLike, 2, 1, -1},
+ {LibFunc::strdup, StrDupLike, 1, -1, -1},
+ {LibFunc::strndup, StrDupLike, 2, 1, -1}
};
@@ -85,15 +86,22 @@
/// \brief Returns the allocation data for the given value if it is a call to a
/// known allocation function, and NULL otherwise.
static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy,
+ const TargetLibraryInfo *TLI,
bool LookThroughBitCast = false) {
Function *Callee = getCalledFunction(V, LookThroughBitCast);
if (!Callee)
return 0;
+ // Make sure that the function is available.
+ StringRef FnName = Callee->getName();
+ LibFunc::Func TLIFn;
+ if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn))
+ return 0;
+
unsigned i = 0;
bool found = false;
for ( ; i < array_lengthof(AllocationFnData); ++i) {
- if (Callee->getName() == AllocationFnData[i].Name) {
+ if (AllocationFnData[i].Func == TLIFn) {
found = true;
break;
}
@@ -106,7 +114,6 @@
return 0;
// Check function prototype.
- // FIXME: Check the nobuiltin metadata?? (PR5130)
int FstParam = FnData->FstParam;
int SndParam = FnData->SndParam;
FunctionType *FTy = Callee->getFunctionType();
@@ -132,57 +139,65 @@
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates or reallocates memory (either malloc, calloc, realloc, or strdup
/// like).
-bool llvm::isAllocationFn(const Value *V, bool LookThroughBitCast) {
- return getAllocationData(V, AnyAlloc, LookThroughBitCast);
+bool llvm::isAllocationFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast) {
+ return getAllocationData(V, AnyAlloc, TLI, LookThroughBitCast);
}
/// \brief Tests if a value is a call or invoke to a function that returns a
/// NoAlias pointer (including malloc/calloc/realloc/strdup-like functions).
-bool llvm::isNoAliasFn(const Value *V, bool LookThroughBitCast) {
+bool llvm::isNoAliasFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast) {
// it's safe to consider realloc as noalias since accessing the original
// pointer is undefined behavior
- return isAllocationFn(V, LookThroughBitCast) ||
+ return isAllocationFn(V, TLI, LookThroughBitCast) ||
hasNoAliasAttr(V, LookThroughBitCast);
}
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates uninitialized memory (such as malloc).
-bool llvm::isMallocLikeFn(const Value *V, bool LookThroughBitCast) {
- return getAllocationData(V, MallocLike, LookThroughBitCast);
+bool llvm::isMallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast) {
+ return getAllocationData(V, MallocLike, TLI, LookThroughBitCast);
}
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates zero-filled memory (such as calloc).
-bool llvm::isCallocLikeFn(const Value *V, bool LookThroughBitCast) {
- return getAllocationData(V, CallocLike, LookThroughBitCast);
+bool llvm::isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast) {
+ return getAllocationData(V, CallocLike, TLI, LookThroughBitCast);
}
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates memory (either malloc, calloc, or strdup like).
-bool llvm::isAllocLikeFn(const Value *V, bool LookThroughBitCast) {
- return getAllocationData(V, AllocLike, LookThroughBitCast);
+bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast) {
+ return getAllocationData(V, AllocLike, TLI, LookThroughBitCast);
}
/// \brief Tests if a value is a call or invoke to a library function that
/// reallocates memory (such as realloc).
-bool llvm::isReallocLikeFn(const Value *V, bool LookThroughBitCast) {
- return getAllocationData(V, ReallocLike, LookThroughBitCast);
+bool llvm::isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast) {
+ return getAllocationData(V, ReallocLike, TLI, LookThroughBitCast);
}
/// extractMallocCall - Returns the corresponding CallInst if the instruction
/// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we
/// ignore InvokeInst here.
-const CallInst *llvm::extractMallocCall(const Value *I) {
- return isMallocLikeFn(I) ? dyn_cast<CallInst>(I) : 0;
+const CallInst *llvm::extractMallocCall(const Value *I,
+ const TargetLibraryInfo *TLI) {
+ return isMallocLikeFn(I, TLI) ? dyn_cast<CallInst>(I) : 0;
}
static Value *computeArraySize(const CallInst *CI, const TargetData *TD,
+ const TargetLibraryInfo *TLI,
bool LookThroughSExt = false) {
if (!CI)
return NULL;
// The size of the malloc's result type must be known to determine array size.
- Type *T = getMallocAllocatedType(CI);
+ Type *T = getMallocAllocatedType(CI, TLI);
if (!T || !T->isSized() || !TD)
return NULL;
@@ -204,9 +219,11 @@
/// isArrayMalloc - Returns the corresponding CallInst if the instruction
/// is a call to malloc whose array size can be determined and the array size
/// is not constant 1. Otherwise, return NULL.
-const CallInst *llvm::isArrayMalloc(const Value *I, const TargetData *TD) {
- const CallInst *CI = extractMallocCall(I);
- Value *ArraySize = computeArraySize(CI, TD);
+const CallInst *llvm::isArrayMalloc(const Value *I,
+ const TargetData *TD,
+ const TargetLibraryInfo *TLI) {
+ const CallInst *CI = extractMallocCall(I, TLI);
+ Value *ArraySize = computeArraySize(CI, TD, TLI);
if (ArraySize &&
ArraySize != ConstantInt::get(CI->getArgOperand(0)->getType(), 1))
@@ -221,8 +238,9 @@
/// 0: PointerType is the calls' return type.
/// 1: PointerType is the bitcast's result type.
/// >1: Unique PointerType cannot be determined, return NULL.
-PointerType *llvm::getMallocType(const CallInst *CI) {
- assert(isMallocLikeFn(CI) && "getMallocType and not malloc call");
+PointerType *llvm::getMallocType(const CallInst *CI,
+ const TargetLibraryInfo *TLI) {
+ assert(isMallocLikeFn(CI, TLI) && "getMallocType and not malloc call");
PointerType *MallocType = NULL;
unsigned NumOfBitCastUses = 0;
@@ -252,8 +270,9 @@
/// 0: PointerType is the malloc calls' return type.
/// 1: PointerType is the bitcast's result type.
/// >1: Unique PointerType cannot be determined, return NULL.
-Type *llvm::getMallocAllocatedType(const CallInst *CI) {
- PointerType *PT = getMallocType(CI);
+Type *llvm::getMallocAllocatedType(const CallInst *CI,
+ const TargetLibraryInfo *TLI) {
+ PointerType *PT = getMallocType(CI, TLI);
return PT ? PT->getElementType() : NULL;
}
@@ -263,21 +282,23 @@
/// constant 1. Otherwise, return NULL for mallocs whose array size cannot be
/// determined.
Value *llvm::getMallocArraySize(CallInst *CI, const TargetData *TD,
+ const TargetLibraryInfo *TLI,
bool LookThroughSExt) {
- assert(isMallocLikeFn(CI) && "getMallocArraySize and not malloc call");
- return computeArraySize(CI, TD, LookThroughSExt);
+ assert(isMallocLikeFn(CI, TLI) && "getMallocArraySize and not malloc call");
+ return computeArraySize(CI, TD, TLI, LookThroughSExt);
}
/// extractCallocCall - Returns the corresponding CallInst if the instruction
/// is a calloc call.
-const CallInst *llvm::extractCallocCall(const Value *I) {
- return isCallocLikeFn(I) ? cast<CallInst>(I) : 0;
+const CallInst *llvm::extractCallocCall(const Value *I,
+ const TargetLibraryInfo *TLI) {
+ return isCallocLikeFn(I, TLI) ? cast<CallInst>(I) : 0;
}
/// isFreeCall - Returns non-null if the value is a call to the builtin free()
-const CallInst *llvm::isFreeCall(const Value *I) {
+const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
const CallInst *CI = dyn_cast<CallInst>(I);
if (!CI)
return 0;
@@ -285,9 +306,14 @@
if (Callee == 0 || !Callee->isDeclaration())
return 0;
- if (Callee->getName() != "free" &&
- Callee->getName() != "_ZdlPv" && // operator delete(void*)
- Callee->getName() != "_ZdaPv") // operator delete[](void*)
+ StringRef FnName = Callee->getName();
+ LibFunc::Func TLIFn;
+ if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn))
+ return 0;
+
+ if (TLIFn != LibFunc::free &&
+ TLIFn != LibFunc::ZdlPv && // operator delete(void*)
+ TLIFn != LibFunc::ZdaPv) // operator delete[](void*)
return 0;
// Check free prototype.
@@ -316,11 +342,11 @@
/// If RoundToAlign is true, then Size is rounded up to the alignment of allocas,
/// byval arguments, and global variables.
bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const TargetData *TD,
- bool RoundToAlign) {
+ const TargetLibraryInfo *TLI, bool RoundToAlign) {
if (!TD)
return false;
- ObjectSizeOffsetVisitor Visitor(TD, Ptr->getContext(), RoundToAlign);
+ ObjectSizeOffsetVisitor Visitor(TD, TLI, Ptr->getContext(), RoundToAlign);
SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr));
if (!Visitor.bothKnown(Data))
return false;
@@ -348,9 +374,10 @@
}
ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const TargetData *TD,
+ const TargetLibraryInfo *TLI,
LLVMContext &Context,
bool RoundToAlign)
-: TD(TD), RoundToAlign(RoundToAlign) {
+: TD(TD), TLI(TLI), RoundToAlign(RoundToAlign) {
IntegerType *IntTy = TD->getIntPtrType(Context);
IntTyBits = IntTy->getBitWidth();
Zero = APInt::getNullValue(IntTyBits);
@@ -416,7 +443,8 @@
}
SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) {
- const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc);
+ const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc,
+ TLI);
if (!FnData)
return unknown();
@@ -532,8 +560,9 @@
ObjectSizeOffsetEvaluator::ObjectSizeOffsetEvaluator(const TargetData *TD,
+ const TargetLibraryInfo *TLI,
LLVMContext &Context)
-: TD(TD), Context(Context), Builder(Context, TargetFolder(TD)) {
+: TD(TD), TLI(TLI), Context(Context), Builder(Context, TargetFolder(TD)) {
IntTy = TD->getIntPtrType(Context);
Zero = ConstantInt::get(IntTy, 0);
}
@@ -558,7 +587,7 @@
}
SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
- ObjectSizeOffsetVisitor Visitor(TD, Context);
+ ObjectSizeOffsetVisitor Visitor(TD, TLI, Context);
SizeOffsetType Const = Visitor.compute(V);
if (Visitor.bothKnown(Const))
return std::make_pair(ConstantInt::get(Context, Const.first),
@@ -621,7 +650,8 @@
}
SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitCallSite(CallSite CS) {
- const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc);
+ const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc,
+ TLI);
if (!FnData)
return unknown();
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index 059e574..5736c35 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -148,7 +148,7 @@
return AliasAnalysis::ModRef;
}
- if (const CallInst *CI = isFreeCall(Inst)) {
+ if (const CallInst *CI = isFreeCall(Inst, AA->getTargetLibraryInfo())) {
// calls to free() deallocate the entire structure
Loc = AliasAnalysis::Location(CI->getArgOperand(0));
return AliasAnalysis::Mod;
@@ -479,12 +479,20 @@
// a subsequent bitcast of the malloc call result. There can be stores to
// the malloced memory between the malloc call and its bitcast uses, and we
// need to continue scanning until the malloc call.
- if (isa<AllocaInst>(Inst) || isNoAliasFn(Inst)) {
+ const TargetLibraryInfo *TLI = AA->getTargetLibraryInfo();
+ if (isa<AllocaInst>(Inst) || isNoAliasFn(Inst, TLI)) {
const Value *AccessPtr = GetUnderlyingObject(MemLoc.Ptr, TD);
if (AccessPtr == Inst || AA->isMustAlias(Inst, AccessPtr))
return MemDepResult::getDef(Inst);
- continue;
+ // Be conservative if the accessed pointer may alias the allocation.
+ if (AA->alias(Inst, AccessPtr) != AliasAnalysis::NoAlias)
+ return MemDepResult::getClobber(Inst);
+ // If the allocation is not aliased and does not read memory (like
+ // strdup), it is safe to ignore.
+ if (isa<AllocaInst>(Inst) ||
+ isMallocLikeFn(Inst, TLI) || isCallocLikeFn(Inst, TLI))
+ continue;
}
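
For reference, an illustrative decision ladder for the allocation case added above (stand-in booleans, not LLVM API; this sketches the control flow, not the implementation):

enum DepKind { Def, Clobber, Skip, FallThrough };

// How an allocation-like instruction found during the scan is classified.
static DepKind classifyAllocation(bool ProvidesObject, bool MayAlias,
                                  bool ReadsNoMemory) {
  if (ProvidesObject) return Def;    // the allocation is the accessed object
  if (MayAlias) return Clobber;      // conservatively stop the scan
  if (ReadsNoMemory) return Skip;    // alloca/malloc/calloc: safe to ignore
  return FallThrough;                // strdup-like calls read memory, so fall
                                     // through to the generic mod/ref check
}
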
// See if this instruction (e.g. a call or vaarg) mod/ref's the pointer.
diff --git a/lib/Analysis/PHITransAddr.cpp b/lib/Analysis/PHITransAddr.cpp
index 38cb1c9..d6a17ca 100644
--- a/lib/Analysis/PHITransAddr.cpp
+++ b/lib/Analysis/PHITransAddr.cpp
@@ -41,6 +41,7 @@
return false;
}
+#ifndef NDEBUG
void PHITransAddr::dump() const {
if (Addr == 0) {
dbgs() << "PHITransAddr: null\n";
@@ -50,6 +51,7 @@
for (unsigned i = 0, e = InstInputs.size(); i != e; ++i)
dbgs() << " Input #" << i << " is " << *InstInputs[i] << "\n";
}
+#endif
static bool VerifySubExpr(Value *Expr,
diff --git a/lib/Analysis/ProfileDataLoader.cpp b/lib/Analysis/ProfileDataLoader.cpp
new file mode 100644
index 0000000..69286ef
--- /dev/null
+++ b/lib/Analysis/ProfileDataLoader.cpp
@@ -0,0 +1,162 @@
+//===- ProfileDataLoader.cpp - Load profile information from disk ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The ProfileDataLoader class is used to load raw profiling data from the dump
+// file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Module.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Analysis/ProfileDataLoader.h"
+#include "llvm/Analysis/ProfileDataTypes.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+#include <cstdio>
+#include <cstdlib>
+using namespace llvm;
+
+raw_ostream &llvm::operator<<(raw_ostream &O, std::pair<const BasicBlock *,
+ const BasicBlock *> E) {
+ O << "(";
+
+ if (E.first)
+ O << E.first->getName();
+ else
+ O << "0";
+
+ O << ",";
+
+ if (E.second)
+ O << E.second->getName();
+ else
+ O << "0";
+
+ return O << ")";
+}
+
+/// AddCounts - Add 'A' and 'B', accounting for the fact that the value of one
+/// (or both) may not be defined.
+static unsigned AddCounts(unsigned A, unsigned B) {
+ // If either value is undefined, use the other.
+ // Undefined + undefined = undefined.
+ if (A == ProfileDataLoader::Uncounted) return B;
+ if (B == ProfileDataLoader::Uncounted) return A;
+
+ // Saturate to the maximum storable value. This could change taken/nottaken
+ // ratios, but is presumably better than wrapping and thus potentially
+ // inverting ratios.
+ uint64_t tmp = (uint64_t)A + (uint64_t)B;
+ if (tmp > (uint64_t)ProfileDataLoader::MaxCount)
+ tmp = ProfileDataLoader::MaxCount;
+ return (unsigned)tmp;
+}
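
A quick standalone check of the saturation arithmetic above (illustrative only; the constants mirror Uncounted and MaxCount defined later in this file):

#include <cassert>
#include <cstdint>

static const unsigned Uncounted = ~0U;
static const unsigned MaxCount = ~0U - 1U;

static unsigned addCounts(unsigned A, unsigned B) {
  if (A == Uncounted) return B;
  if (B == Uncounted) return A;
  uint64_t Tmp = (uint64_t)A + (uint64_t)B;
  return Tmp > (uint64_t)MaxCount ? MaxCount : (unsigned)Tmp;
}

int main() {
  assert(addCounts(Uncounted, 7) == 7);       // undefined + defined = defined
  assert(addCounts(MaxCount, 5) == MaxCount); // saturates instead of wrapping
  assert(addCounts(Uncounted, Uncounted) == Uncounted);
  return 0;
}
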
+
+/// ReadProfilingData - Load 'NumEntries' items of type 'T' from file 'F'
+template <typename T>
+static void ReadProfilingData(const char *ToolName, FILE *F,
+ T *Data, size_t NumEntries) {
+ // Read in the block of data...
+ if (fread(Data, sizeof(T), NumEntries, F) != NumEntries)
+ report_fatal_error(Twine(ToolName) + ": Profiling data truncated");
+}
+
+/// ReadProfilingNumEntries - Read how many entries are in this profiling data
+/// packet.
+static unsigned ReadProfilingNumEntries(const char *ToolName, FILE *F,
+ bool ShouldByteSwap) {
+ unsigned Entry;
+ ReadProfilingData<unsigned>(ToolName, F, &Entry, 1);
+ return ShouldByteSwap ? ByteSwap_32(Entry) : Entry;
+}
+
+/// ReadProfilingBlock - Read the number of entries in the next profiling data
+/// packet and then accumulate the entries into 'Data'.
+static void ReadProfilingBlock(const char *ToolName, FILE *F,
+ bool ShouldByteSwap,
+ SmallVector<unsigned, 32> &Data) {
+ // Read the number of entries...
+ unsigned NumEntries = ReadProfilingNumEntries(ToolName, F, ShouldByteSwap);
+
+ // Read in the data.
+ SmallVector<unsigned, 8> TempSpace(NumEntries);
+ ReadProfilingData<unsigned>(ToolName, F, TempSpace.data(), NumEntries);
+
+ // Make sure we have enough space ...
+ if (Data.size() < NumEntries)
+ Data.resize(NumEntries, ProfileDataLoader::Uncounted);
+
+ // Accumulate the data we just read into the existing data.
+ for (unsigned i = 0; i < NumEntries; ++i) {
+ unsigned Entry = ShouldByteSwap ? ByteSwap_32(TempSpace[i]) : TempSpace[i];
+ Data[i] = AddCounts(Entry, Data[i]);
+ }
+}
+
+/// ReadProfilingArgBlock - Read the command line arguments that the program was
+/// run with when the current profiling data packet(s) were generated.
+static void ReadProfilingArgBlock(const char *ToolName, FILE *F,
+ bool ShouldByteSwap,
+ SmallVector<std::string, 1> &CommandLines) {
+ // Read the number of bytes ...
+ unsigned ArgLength = ReadProfilingNumEntries(ToolName, F, ShouldByteSwap);
+
+ // Read in the arguments (if there are any to read). Round up the length to
+ // the nearest 4-byte multiple.
+ SmallVector<char, 8> Args(ArgLength+4);
+ if (ArgLength)
+ ReadProfilingData<char>(ToolName, F, Args.data(), (ArgLength+3) & ~3);
+
+ // Store the arguments.
+ CommandLines.push_back(std::string(&Args[0], &Args[ArgLength]));
+}
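
The (ArgLength+3) & ~3 read length matches the writer's padding of the argument block to a 4-byte boundary. A standalone check of that round-up arithmetic (illustrative):

#include <cassert>

static unsigned roundUpTo4(unsigned N) { return (N + 3) & ~3u; }

int main() {
  assert(roundUpTo4(0) == 0);
  assert(roundUpTo4(1) == 4); // pad a 1-byte block out to a full word
  assert(roundUpTo4(4) == 4);
  assert(roundUpTo4(5) == 8);
  return 0;
}
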
+
+const unsigned ProfileDataLoader::Uncounted = ~0U;
+const unsigned ProfileDataLoader::MaxCount = ~0U - 1U;
+
+/// ProfileDataLoader ctor - Read the specified profiling data file, reporting
+/// a fatal error if the file is invalid or broken.
+ProfileDataLoader::ProfileDataLoader(const char *ToolName,
+ const std::string &Filename)
+ : Filename(Filename) {
+ FILE *F = fopen(Filename.c_str(), "rb");
+ if (F == 0)
+ report_fatal_error(Twine(ToolName) + ": Error opening '" +
+ Filename + "': ");
+
+ // Keep reading packets until we run out of them.
+ unsigned PacketType;
+ while (fread(&PacketType, sizeof(unsigned), 1, F) == 1) {
+ // If the low eight bits of the packet are zero, we must be dealing with an
+ // endianness mismatch. Byteswap all words read from the profiling
+ // information. This can happen when the compiler host and target have
+ // different endianness.
+ bool ShouldByteSwap = (char)PacketType == 0;
+ PacketType = ShouldByteSwap ? ByteSwap_32(PacketType) : PacketType;
+
+ switch (PacketType) {
+ case ArgumentInfo:
+ ReadProfilingArgBlock(ToolName, F, ShouldByteSwap, CommandLines);
+ break;
+
+ case EdgeInfo:
+ ReadProfilingBlock(ToolName, F, ShouldByteSwap, EdgeCounts);
+ break;
+
+ default:
+ report_fatal_error(std::string(ToolName)
+ + ": Unknown profiling packet type");
+ break;
+ }
+ }
+
+ fclose(F);
+}
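
The byte-swap test works because packet tags are small integers whose low byte is never zero, and the (char) cast truncates to the low 8 bits of the value regardless of host byte order. A self-contained sketch (plain C++; byteSwap32 is a stand-in for ByteSwap_32):

#include <cstdint>
#include <cstdio>

static uint32_t byteSwap32(uint32_t V) {
  return (V << 24) | ((V & 0xff00u) << 8) | ((V >> 8) & 0xff00u) | (V >> 24);
}

int main() {
  // An EdgeInfo tag written by an opposite-endian host reads back as
  // 0x01000000: its low byte is zero, which real tags never are.
  uint32_t PacketType = 0x01000000;
  bool ShouldByteSwap = (char)PacketType == 0; // value cast: low 8 bits
  if (ShouldByteSwap)
    PacketType = byteSwap32(PacketType);
  std::printf("packet type = %u\n", PacketType); // prints 1
  return 0;
}
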
diff --git a/lib/Analysis/ProfileDataLoaderPass.cpp b/lib/Analysis/ProfileDataLoaderPass.cpp
new file mode 100644
index 0000000..c43cff0
--- /dev/null
+++ b/lib/Analysis/ProfileDataLoaderPass.cpp
@@ -0,0 +1,188 @@
+//===- ProfileDataLoaderPass.cpp - Set branch weight metadata from prof ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loads profiling data from a dump file and sets branch weight
+// metadata.
+//
+// TODO: Replace all "profile-metadata-loader" strings with "profile-loader"
+// once ProfileInfo etc. has been removed.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "profile-metadata-loader"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Module.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/MDBuilder.h"
+#include "llvm/Metadata.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ProfileDataLoader.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Format.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumEdgesRead, "The # of edges read.");
+STATISTIC(NumTermsAnnotated, "The # of terminator instructions annotated.");
+
+static cl::opt<std::string>
+ProfileMetadataFilename("profile-file", cl::init("llvmprof.out"),
+ cl::value_desc("filename"),
+ cl::desc("Profile file loaded by -profile-metadata-loader"));
+
+namespace {
+ /// This pass loads profiling data from a dump file and sets branch weight
+ /// metadata.
+ class ProfileMetadataLoaderPass : public ModulePass {
+ std::string Filename;
+ public:
+ static char ID; // Class identification, replacement for typeinfo
+ explicit ProfileMetadataLoaderPass(const std::string &filename = "")
+ : ModulePass(ID), Filename(filename) {
+ initializeProfileMetadataLoaderPassPass(*PassRegistry::getPassRegistry());
+ if (filename.empty()) Filename = ProfileMetadataFilename;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+
+ virtual const char *getPassName() const {
+ return "Profile loader";
+ }
+
+ virtual void readEdge(unsigned, ProfileData&, ProfileData::Edge,
+ ArrayRef<unsigned>);
+ virtual unsigned matchEdges(Module&, ProfileData&, ArrayRef<unsigned>);
+ virtual void setBranchWeightMetadata(Module&, ProfileData&);
+
+ virtual bool runOnModule(Module &M);
+ };
+} // End of anonymous namespace
+
+char ProfileMetadataLoaderPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ProfileMetadataLoaderPass, "profile-metadata-loader",
+ "Load profile information from llvmprof.out", false, true)
+INITIALIZE_PASS_END(ProfileMetadataLoaderPass, "profile-metadata-loader",
+ "Load profile information from llvmprof.out", false, true)
+
+char &llvm::ProfileMetadataLoaderPassID = ProfileMetadataLoaderPass::ID;
+
+/// createProfileMetadataLoaderPass - This function returns a Pass that loads
+/// the profiling information for the module from the specified filename,
+/// making it available to the optimizers.
+ModulePass *llvm::createProfileMetadataLoaderPass() {
+ return new ProfileMetadataLoaderPass();
+}
+ModulePass *llvm::createProfileMetadataLoaderPass(const std::string &Filename) {
+ return new ProfileMetadataLoaderPass(Filename);
+}
+
+/// readEdge - Take the value from a profile counter and assign it to an edge.
+void ProfileMetadataLoaderPass::readEdge(unsigned ReadCount,
+ ProfileData &PB, ProfileData::Edge e,
+ ArrayRef<unsigned> Counters) {
+ if (ReadCount >= Counters.size()) return;
+
+ unsigned weight = Counters[ReadCount];
+ assert(weight != ProfileDataLoader::Uncounted);
+ PB.addEdgeWeight(e, weight);
+
+ DEBUG(dbgs() << "-- Read Edge Counter for " << e
+ << " (# "<< (ReadCount) << "): "
+ << PB.getEdgeWeight(e) << "\n");
+}
+
+/// matchEdges - Link every profile counter with an edge.
+unsigned ProfileMetadataLoaderPass::matchEdges(Module &M, ProfileData &PB,
+ ArrayRef<unsigned> Counters) {
+ if (Counters.size() == 0) return 0;
+
+ unsigned ReadCount = 0;
+
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+ if (F->isDeclaration()) continue;
+ DEBUG(dbgs() << "Loading edges in '" << F->getName() << "'\n");
+ readEdge(ReadCount++, PB, PB.getEdge(0, &F->getEntryBlock()), Counters);
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ TerminatorInst *TI = BB->getTerminator();
+ for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
+ readEdge(ReadCount++, PB, PB.getEdge(BB,TI->getSuccessor(s)),
+ Counters);
+ }
+ }
+ }
+
+ return ReadCount;
+}
+
+/// setBranchWeightMetadata - Translate the counter values associated with each
+/// edge into branch weights for each conditional branch (a branch with 2 or
+/// more destinations).
+void ProfileMetadataLoaderPass::setBranchWeightMetadata(Module &M,
+ ProfileData &PB) {
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+ if (F->isDeclaration()) continue;
+ DEBUG(dbgs() << "Setting branch metadata in '" << F->getName() << "'\n");
+
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ TerminatorInst *TI = BB->getTerminator();
+ unsigned NumSuccessors = TI->getNumSuccessors();
+
+ // If there is only one successor then we cannot set a branch
+ // probability as the target is certain.
+ if (NumSuccessors < 2) continue;
+
+ // Load the weights of all edges leading from this terminator.
+ DEBUG(dbgs() << "-- Terminator with " << NumSuccessors
+ << " successors:\n");
+ SmallVector<uint32_t, 4> Weights(NumSuccessors);
+ for (unsigned s = 0 ; s < NumSuccessors ; ++s) {
+ ProfileData::Edge edge = PB.getEdge(BB, TI->getSuccessor(s));
+ Weights[s] = (uint32_t)PB.getEdgeWeight(edge);
+ DEBUG(dbgs() << "---- Edge '" << edge << "' has weight "
+ << Weights[s] << "\n");
+ }
+
+ // Set branch weight metadata. This will set branch probabilities of
+ // 100%/0% if that is true of the dynamic execution.
+ // BranchProbabilityInfo can account for this when it loads this metadata
+ // (it gives the unexecuted branch a weight of 1 for the purposes of
+ // probability calculations).
+ MDBuilder MDB(TI->getContext());
+ MDNode *Node = MDB.createBranchWeights(Weights);
+ TI->setMetadata(LLVMContext::MD_prof, Node);
+ NumTermsAnnotated++;
+ }
+ }
+}
+
+bool ProfileMetadataLoaderPass::runOnModule(Module &M) {
+ ProfileDataLoader PDL("profile-data-loader", Filename);
+ ProfileData PB;
+
+ ArrayRef<unsigned> Counters = PDL.getRawEdgeCounts();
+
+ unsigned ReadCount = matchEdges(M, PB, Counters);
+
+ if (ReadCount != Counters.size()) {
+ errs() << "WARNING: profile information is inconsistent with "
+ << "the current program!\n";
+ }
+ NumEdgesRead = ReadCount;
+
+ setBranchWeightMetadata(M, PB);
+
+ return ReadCount > 0;
+}
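
The matchEdges/readEdge pair above consumes counters in a fixed order: one for each function's entry edge, then one per (block, successor) edge in iteration order. A self-contained model of that ordering for a single three-block function (plain C++, not the LLVM API):

#include <cstdio>
#include <vector>

struct Block { std::vector<int> Succs; };

int main() {
  // One function with three blocks: 0 -> {1, 2}, 1 -> {2}, 2 -> {}.
  std::vector<Block> F;
  F.resize(3);
  F[0].Succs.push_back(1); F[0].Succs.push_back(2);
  F[1].Succs.push_back(2);

  unsigned ReadCount = 0;
  std::printf("counter %u: entry edge (0, block 0)\n", ReadCount++);
  for (unsigned B = 0; B != F.size(); ++B)
    for (unsigned S = 0; S != F[B].Succs.size(); ++S)
      std::printf("counter %u: edge (block %u, block %d)\n",
                  ReadCount++, B, F[B].Succs[S]);
  // Four counters total: exactly what the profile file must contain for this
  // function, in this order, or the pass reports an inconsistent profile.
  return 0;
}
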
diff --git a/lib/Analysis/ProfileEstimatorPass.cpp b/lib/Analysis/ProfileEstimatorPass.cpp
index 63468f8..12b59e0 100644
--- a/lib/Analysis/ProfileEstimatorPass.cpp
+++ b/lib/Analysis/ProfileEstimatorPass.cpp
@@ -286,7 +286,7 @@
}
}
- double fraction = floor(BBWeight/Edges.size());
+ double fraction = Edges.size() ? floor(BBWeight/Edges.size()) : 0.0;
// Finally we know what flow is still not leaving the block, distribute this
// flow onto the empty edges.
for (SmallVector<Edge, 8>::iterator ei = Edges.begin(), ee = Edges.end();
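
A standalone check of the divide-by-zero guard added above (illustrative):

#include <cassert>
#include <cmath>

static double fractionOf(double BBWeight, unsigned NumEdges) {
  return NumEdges ? std::floor(BBWeight / NumEdges) : 0.0;
}

int main() {
  assert(fractionOf(10.0, 4) == 2.0);
  assert(fractionOf(10.0, 0) == 0.0); // the old code divided by zero here
  return 0;
}
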
diff --git a/lib/Analysis/ProfileInfo.cpp b/lib/Analysis/ProfileInfo.cpp
index 173de2c..b5b7ac1 100644
--- a/lib/Analysis/ProfileInfo.cpp
+++ b/lib/Analysis/ProfileInfo.cpp
@@ -1016,40 +1016,14 @@
}
}
-raw_ostream& operator<<(raw_ostream &O, const Function *F) {
- return O << F->getName();
-}
-
raw_ostream& operator<<(raw_ostream &O, const MachineFunction *MF) {
return O << MF->getFunction()->getName() << "(MF)";
}
-raw_ostream& operator<<(raw_ostream &O, const BasicBlock *BB) {
- return O << BB->getName();
-}
-
raw_ostream& operator<<(raw_ostream &O, const MachineBasicBlock *MBB) {
return O << MBB->getBasicBlock()->getName() << "(MB)";
}
-raw_ostream& operator<<(raw_ostream &O, std::pair<const BasicBlock *, const BasicBlock *> E) {
- O << "(";
-
- if (E.first)
- O << E.first;
- else
- O << "0";
-
- O << ",";
-
- if (E.second)
- O << E.second;
- else
- O << "0";
-
- return O << ")";
-}
-
raw_ostream& operator<<(raw_ostream &O, std::pair<const MachineBasicBlock *, const MachineBasicBlock *> E) {
O << "(";
diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp
index 868f483..0f9a8b3 100644
--- a/lib/Analysis/RegionInfo.cpp
+++ b/lib/Analysis/RegionInfo.cpp
@@ -47,7 +47,7 @@
cl::values(
clEnumValN(Region::PrintNone, "none", "print no details"),
clEnumValN(Region::PrintBB, "bb",
- "print regions in detail with block_node_iterator"),
+ "print regions in detail with block_iterator"),
clEnumValN(Region::PrintRN, "rn",
"print regions in detail with element_iterator"),
clEnumValEnd));
@@ -246,22 +246,6 @@
verifyRegion();
}
-Region::block_node_iterator Region::block_node_begin() {
- return GraphTraits<FlatIt<Region*> >::nodes_begin(this);
-}
-
-Region::block_node_iterator Region::block_node_end() {
- return GraphTraits<FlatIt<Region*> >::nodes_end(this);
-}
-
-Region::const_block_node_iterator Region::block_node_begin() const {
- return GraphTraits<FlatIt<const Region*> >::nodes_begin(this);
-}
-
-Region::const_block_node_iterator Region::block_node_end() const {
- return GraphTraits<FlatIt<const Region*> >::nodes_end(this);
-}
-
Region::element_iterator Region::element_begin() {
return GraphTraits<Region*>::nodes_begin(this);
}
@@ -425,10 +409,8 @@
OS.indent(level*2 + 2);
if (Style == PrintBB) {
- for (const_block_node_iterator I = block_node_begin(),
- E = block_node_end();
- I != E; ++I)
- OS << **I << ", "; // TODO: remove the last ","
+ for (const_block_iterator I = block_begin(), E = block_end(); I != E; ++I)
+ OS << (*I)->getName() << ", "; // TODO: remove the last ","
} else if (Style == PrintRN) {
for (const_element_iterator I = element_begin(), E = element_end(); I!=E; ++I)
OS << **I << ", "; // TODO: remove the last ",
@@ -445,9 +427,11 @@
OS.indent(level*2) << "} \n";
}
+#ifndef NDEBUG
void Region::dump() const {
print(dbgs(), true, getDepth(), printStyle.getValue());
}
+#endif
void Region::clearNodeCache() {
// Free the cached nodes.
diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp
index c97b5eb..9208fa2 100644
--- a/lib/Analysis/RegionPass.cpp
+++ b/lib/Analysis/RegionPass.cpp
@@ -195,10 +195,9 @@
virtual bool runOnRegion(Region *R, RGPassManager &RGM) {
Out << Banner;
- for (Region::block_node_iterator I = R->block_node_begin(),
- E = R->block_node_end();
+ for (Region::block_iterator I = R->block_begin(), E = R->block_end();
I != E; ++I)
- (*I)->getEntry()->print(Out);
+ (*I)->print(Out);
return false;
}
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index a654648..84e147b 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -122,10 +122,12 @@
// Implementation of the SCEV class.
//
+#ifndef NDEBUG
void SCEV::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
void SCEV::print(raw_ostream &OS) const {
switch (getSCEVType()) {
diff --git a/lib/Analysis/Trace.cpp b/lib/Analysis/Trace.cpp
index ff5010b..dbb9535 100644
--- a/lib/Analysis/Trace.cpp
+++ b/lib/Analysis/Trace.cpp
@@ -43,9 +43,11 @@
O << "; Trace parent function: \n" << *F;
}
+#ifndef NDEBUG
/// dump - Debugger convenience method; writes trace to standard error
/// output stream.
///
void Trace::dump() const {
print(dbgs());
}
+#endif
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index cea34e1..491224a 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -1614,7 +1614,7 @@
// right.
unsigned PtrSize = TD.getPointerSizeInBits();
if (PtrSize < 64)
- Offset = (Offset << (64-PtrSize)) >> (64-PtrSize);
+ Offset = SignExtend64(Offset, PtrSize);
return GetPointerBaseWithConstantOffset(GEP->getPointerOperand(), Offset, TD);
}
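
SignExtend64 (from llvm/Support/MathExtras.h) expresses the shift pair the old code spelled out. A minimal sketch of what it computes (not LLVM's implementation):

#include <cassert>
#include <cstdint>

// Sign-extend the low 'Bits' bits of X to a full 64-bit integer.
static int64_t signExtend64(uint64_t X, unsigned Bits) {
  return (int64_t)(X << (64 - Bits)) >> (64 - Bits);
}

int main() {
  // A 32-bit pointer offset of -8 stored in the low 32 bits of a 64-bit word.
  assert(signExtend64(0xFFFFFFF8u, 32) == -8);
  assert(signExtend64(0x7u, 32) == 7);
  return 0;
}
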
diff --git a/lib/Archive/ArchiveInternals.h b/lib/Archive/ArchiveInternals.h
index 55684f7..639f5ac 100644
--- a/lib/Archive/ArchiveInternals.h
+++ b/lib/Archive/ArchiveInternals.h
@@ -66,7 +66,7 @@
fmag[1] = '\n';
}
- bool checkSignature() {
+ bool checkSignature() const {
return 0 == memcmp(fmag, ARFILE_MEMBER_MAGIC,2);
}
};
diff --git a/lib/Archive/ArchiveReader.cpp b/lib/Archive/ArchiveReader.cpp
index 5cfc810..5052495 100644
--- a/lib/Archive/ArchiveReader.cpp
+++ b/lib/Archive/ArchiveReader.cpp
@@ -79,7 +79,7 @@
}
// Cast archive member header
- ArchiveMemberHeader* Hdr = (ArchiveMemberHeader*)At;
+ const ArchiveMemberHeader* Hdr = (const ArchiveMemberHeader*)At;
At += sizeof(ArchiveMemberHeader);
int flags = 0;
@@ -196,7 +196,7 @@
/* FALL THROUGH */
default:
- char* slash = (char*) memchr(Hdr->name, '/', 16);
+ const char* slash = (const char*) memchr(Hdr->name, '/', 16);
if (slash == 0)
slash = Hdr->name + 16;
pathname.assign(Hdr->name, slash - Hdr->name);
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index e0458045..6e61665 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -510,6 +510,7 @@
KEYWORD(asm);
KEYWORD(sideeffect);
KEYWORD(alignstack);
+ KEYWORD(inteldialect);
KEYWORD(gc);
KEYWORD(ccc);
@@ -554,7 +555,6 @@
KEYWORD(naked);
KEYWORD(nonlazybind);
KEYWORD(address_safety);
- KEYWORD(ia_nsdialect);
KEYWORD(type);
KEYWORD(opaque);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index a9c7e98..b0b64d8 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -962,7 +962,6 @@
case lltok::kw_naked: Attrs |= Attribute::Naked; break;
case lltok::kw_nonlazybind: Attrs |= Attribute::NonLazyBind; break;
case lltok::kw_address_safety: Attrs |= Attribute::AddressSafety; break;
- case lltok::kw_ia_nsdialect: Attrs |= Attribute::IANSDialect; break;
case lltok::kw_alignstack: {
unsigned Alignment;
@@ -2070,16 +2069,18 @@
case lltok::kw_asm: {
// ValID ::= 'asm' SideEffect? AlignStack? STRINGCONSTANT ',' STRINGCONSTANT
- bool HasSideEffect, AlignStack;
+ bool HasSideEffect, AlignStack, AsmDialect;
Lex.Lex();
if (ParseOptionalToken(lltok::kw_sideeffect, HasSideEffect) ||
ParseOptionalToken(lltok::kw_alignstack, AlignStack) ||
+ ParseOptionalToken(lltok::kw_inteldialect, AsmDialect) ||
ParseStringConstant(ID.StrVal) ||
ParseToken(lltok::comma, "expected comma in inline asm expression") ||
ParseToken(lltok::StringConstant, "expected constraint string"))
return true;
ID.StrVal2 = Lex.getStrVal();
- ID.UIntVal = unsigned(HasSideEffect) | (unsigned(AlignStack)<<1);
+ ID.UIntVal = unsigned(HasSideEffect) | (unsigned(AlignStack)<<1) |
+ (unsigned(AsmDialect)<<2);
ID.Kind = ValID::t_InlineAsm;
return false;
}
@@ -2496,7 +2497,8 @@
PTy ? dyn_cast<FunctionType>(PTy->getElementType()) : 0;
if (!FTy || !InlineAsm::Verify(FTy, ID.StrVal2))
return Error(ID.Loc, "invalid type for inline asm constraint string");
- V = InlineAsm::get(FTy, ID.StrVal, ID.StrVal2, ID.UIntVal&1, ID.UIntVal>>1);
+ V = InlineAsm::get(FTy, ID.StrVal, ID.StrVal2, ID.UIntVal&1,
+ (ID.UIntVal>>1)&1, (InlineAsm::AsmDialect(ID.UIntVal>>2)));
return false;
}
case ValID::t_MDNode:
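
The packed word round-trips through the parser, bitcode writer, and reader: bit 0 carries sideeffect, bit 1 alignstack, and bits 2 and up the asm dialect. A standalone check of that packing (illustrative helper, not LLVM code):

#include <cassert>

static unsigned packAsmFlags(bool HasSideEffect, bool AlignStack,
                             unsigned Dialect) {
  return unsigned(HasSideEffect) | (unsigned(AlignStack) << 1) | (Dialect << 2);
}

int main() {
  unsigned Flags = packAsmFlags(true, false, 1); // sideeffect + inteldialect
  assert((Flags & 1) == 1);        // bit 0: has side effects
  assert(((Flags >> 1) & 1) == 0); // bit 1: not alignstack
  assert((Flags >> 2) == 1);       // bits 2+: dialect (1 = Intel)
  return 0;
}
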
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index 9fd63f2..37cbf30 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -72,6 +72,7 @@
kw_asm,
kw_sideeffect,
kw_alignstack,
+ kw_inteldialect,
kw_gc,
kw_c,
@@ -107,7 +108,6 @@
kw_naked,
kw_nonlazybind,
kw_address_safety,
- kw_ia_nsdialect,
kw_type,
kw_opaque,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 65fd52e..f242df4 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -1245,7 +1245,9 @@
V = ConstantExpr::getICmp(Record[3], Op0, Op1);
break;
}
- case bitc::CST_CODE_INLINEASM: {
+ // This maintains backward compatibility with pre-asm-dialect bitcode.
+ // FIXME: Remove with the 4.0 release.
+ case bitc::CST_CODE_INLINEASM_OLD: {
if (Record.size() < 2) return Error("Invalid INLINEASM record");
std::string AsmStr, ConstrStr;
bool HasSideEffects = Record[0] & 1;
@@ -1266,6 +1268,31 @@
AsmStr, ConstrStr, HasSideEffects, IsAlignStack);
break;
}
+ // This version adds support for the asm dialect keywords (e.g.,
+ // inteldialect).
+ case bitc::CST_CODE_INLINEASM: {
+ if (Record.size() < 2) return Error("Invalid INLINEASM record");
+ std::string AsmStr, ConstrStr;
+ bool HasSideEffects = Record[0] & 1;
+ bool IsAlignStack = (Record[0] >> 1) & 1;
+ unsigned AsmDialect = Record[0] >> 2;
+ unsigned AsmStrSize = Record[1];
+ if (2+AsmStrSize >= Record.size())
+ return Error("Invalid INLINEASM record");
+ unsigned ConstStrSize = Record[2+AsmStrSize];
+ if (3+AsmStrSize+ConstStrSize > Record.size())
+ return Error("Invalid INLINEASM record");
+
+ for (unsigned i = 0; i != AsmStrSize; ++i)
+ AsmStr += (char)Record[2+i];
+ for (unsigned i = 0; i != ConstStrSize; ++i)
+ ConstrStr += (char)Record[3+AsmStrSize+i];
+ PointerType *PTy = cast<PointerType>(CurTy);
+ V = InlineAsm::get(cast<FunctionType>(PTy->getElementType()),
+ AsmStr, ConstrStr, HasSideEffects, IsAlignStack,
+ InlineAsm::AsmDialect(AsmDialect));
+ break;
+ }
case bitc::CST_CODE_BLOCKADDRESS:{
if (Record.size() < 3) return Error("Invalid CE_BLOCKADDRESS record");
Type *FnTy = getTypeByID(Record[0]);
@@ -2837,7 +2864,7 @@
}
bool BitcodeReader::InitStreamFromBuffer() {
- const unsigned char *BufPtr = (unsigned char *)Buffer->getBufferStart();
+ const unsigned char *BufPtr = (const unsigned char*)Buffer->getBufferStart();
const unsigned char *BufEnd = BufPtr+Buffer->getBufferSize();
if (Buffer->getBufferSize() & 3) {
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 1d2dfc3..94ebe19 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -814,7 +814,8 @@
if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
Record.push_back(unsigned(IA->hasSideEffects()) |
- unsigned(IA->isAlignStack()) << 1);
+ unsigned(IA->isAlignStack()) << 1 |
+ unsigned(IA->getDialect()&1) << 2);
// Add the asm string.
const std::string &AsmStr = IA->getAsmString();
diff --git a/lib/CodeGen/Android.mk b/lib/CodeGen/Android.mk
index 03cf03b..57c4a9b 100644
--- a/lib/CodeGen/Android.mk
+++ b/lib/CodeGen/Android.mk
@@ -93,6 +93,7 @@
SpillPlacement.cpp \
Spiller.cpp \
SplitKit.cpp \
+ StackColoring.cpp \
StackProtector.cpp \
StackSlotColoring.cpp \
StrongPHIElimination.cpp \
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 7364f42..23d9222 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -319,8 +319,8 @@
return;
}
- if (MAI->getLCOMMDirectiveType() != LCOMM::None &&
- (MAI->getLCOMMDirectiveType() != LCOMM::NoAlignment || Align == 1)) {
+ if (Align == 1 ||
+ MAI->getLCOMMDirectiveAlignmentType() != LCOMM::NoAlignment) {
// .lcomm _foo, 42
OutStreamer.EmitLocalCommonSymbol(GVSym, Size, Align);
return;
@@ -491,9 +491,8 @@
"' label emitted multiple times to assembly file");
}
-
-/// EmitComments - Pretty-print comments for instructions.
-static void EmitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
+/// emitComments - Pretty-print comments for instructions.
+static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
const MachineFunction *MF = MI.getParent()->getParent();
const TargetMachine &TM = MF->getTarget();
@@ -528,16 +527,16 @@
CommentOS << " Reload Reuse\n";
}
-/// EmitImplicitDef - This method emits the specified machine instruction
+/// emitImplicitDef - This method emits the specified machine instruction
/// that is an implicit def.
-static void EmitImplicitDef(const MachineInstr *MI, AsmPrinter &AP) {
+static void emitImplicitDef(const MachineInstr *MI, AsmPrinter &AP) {
unsigned RegNo = MI->getOperand(0).getReg();
AP.OutStreamer.AddComment(Twine("implicit-def: ") +
AP.TM.getRegisterInfo()->getName(RegNo));
AP.OutStreamer.AddBlankLine();
}
-static void EmitKill(const MachineInstr *MI, AsmPrinter &AP) {
+static void emitKill(const MachineInstr *MI, AsmPrinter &AP) {
std::string Str = "kill:";
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &Op = MI->getOperand(i);
@@ -550,10 +549,10 @@
AP.OutStreamer.AddBlankLine();
}
-/// EmitDebugValueComment - This method handles the target-independent form
+/// emitDebugValueComment - This method handles the target-independent form
/// of DBG_VALUE, returning true if it was able to do so. A false return
/// means the target will need to handle MI in EmitInstruction.
-static bool EmitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
+static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
// This code handles only the 3-operand target-independent form.
if (MI->getNumOperands() != 3)
return false;
@@ -685,7 +684,7 @@
#endif // !ANDROID_TARGET_BUILD || ANDROID_ENGINEERING_BUILD
if (isVerbose())
- EmitComments(*II, OutStreamer.GetCommentOS());
+ emitComments(*II, OutStreamer.GetCommentOS());
switch (II->getOpcode()) {
case TargetOpcode::PROLOG_LABEL:
@@ -701,15 +700,15 @@
break;
case TargetOpcode::DBG_VALUE:
if (isVerbose()) {
- if (!EmitDebugValueComment(II, *this))
+ if (!emitDebugValueComment(II, *this))
EmitInstruction(II);
}
break;
case TargetOpcode::IMPLICIT_DEF:
- if (isVerbose()) EmitImplicitDef(II, *this);
+ if (isVerbose()) emitImplicitDef(II, *this);
break;
case TargetOpcode::KILL:
- if (isVerbose()) EmitKill(II, *this);
+ if (isVerbose()) emitKill(II, *this);
break;
default:
if (!TM.hasMCUseLoc())
@@ -1439,9 +1438,9 @@
// Constant emission.
//===----------------------------------------------------------------------===//
-/// LowerConstant - Lower the specified LLVM Constant to an MCExpr.
+/// lowerConstant - Lower the specified LLVM Constant to an MCExpr.
///
-static const MCExpr *LowerConstant(const Constant *CV, AsmPrinter &AP) {
+static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
MCContext &Ctx = AP.OutContext;
if (CV->isNullValue() || isa<UndefValue>(CV))
@@ -1469,7 +1468,7 @@
if (Constant *C =
ConstantFoldConstantExpression(CE, AP.TM.getTargetData()))
if (C != CE)
- return LowerConstant(C, AP);
+ return lowerConstant(C, AP);
// Otherwise report the problem to the user.
{
@@ -1487,15 +1486,14 @@
SmallVector<Value*, 8> IdxVec(CE->op_begin()+1, CE->op_end());
int64_t Offset = TD.getIndexedOffset(PtrVal->getType(), IdxVec);
- const MCExpr *Base = LowerConstant(CE->getOperand(0), AP);
+ const MCExpr *Base = lowerConstant(CE->getOperand(0), AP);
if (Offset == 0)
return Base;
// Truncate/sext the offset to the pointer size.
- if (TD.getPointerSizeInBits() != 64) {
- int SExtAmount = 64-TD.getPointerSizeInBits();
- Offset = (Offset << SExtAmount) >> SExtAmount;
- }
+ unsigned Width = TD.getPointerSizeInBits();
+ if (Width < 64)
+ Offset = SignExtend64(Offset, Width);
return MCBinaryExpr::CreateAdd(Base, MCConstantExpr::Create(Offset, Ctx),
Ctx);
@@ -1508,7 +1506,7 @@
// is reasonable to treat their delta as a 32-bit value.
// FALL THROUGH.
case Instruction::BitCast:
- return LowerConstant(CE->getOperand(0), AP);
+ return lowerConstant(CE->getOperand(0), AP);
case Instruction::IntToPtr: {
const TargetData &TD = *AP.TM.getTargetData();
@@ -1517,7 +1515,7 @@
Constant *Op = CE->getOperand(0);
Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CV->getContext()),
false/*ZExt*/);
- return LowerConstant(Op, AP);
+ return lowerConstant(Op, AP);
}
case Instruction::PtrToInt: {
@@ -1527,7 +1525,7 @@
Constant *Op = CE->getOperand(0);
Type *Ty = CE->getType();
- const MCExpr *OpExpr = LowerConstant(Op, AP);
+ const MCExpr *OpExpr = lowerConstant(Op, AP);
// We can emit the pointer value into this slot if the slot is an
// integer slot equal to the size of the pointer.
@@ -1553,8 +1551,8 @@
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- const MCExpr *LHS = LowerConstant(CE->getOperand(0), AP);
- const MCExpr *RHS = LowerConstant(CE->getOperand(1), AP);
+ const MCExpr *LHS = lowerConstant(CE->getOperand(0), AP);
+ const MCExpr *RHS = lowerConstant(CE->getOperand(1), AP);
switch (CE->getOpcode()) {
default: llvm_unreachable("Unknown binary operator constant cast expr");
case Instruction::Add: return MCBinaryExpr::CreateAdd(LHS, RHS, Ctx);
@@ -1571,7 +1569,7 @@
}
}
-static void EmitGlobalConstantImpl(const Constant *C, unsigned AddrSpace,
+static void emitGlobalConstantImpl(const Constant *C, unsigned AddrSpace,
AsmPrinter &AP);
/// isRepeatedByteSequence - Determine whether the given value is
@@ -1633,7 +1631,7 @@
return -1;
}
-static void EmitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
+static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
unsigned AddrSpace,AsmPrinter &AP){
// See if we can aggregate this into a .fill, if so, emit it as such.
@@ -1698,7 +1696,7 @@
}
-static void EmitGlobalConstantArray(const ConstantArray *CA, unsigned AddrSpace,
+static void emitGlobalConstantArray(const ConstantArray *CA, unsigned AddrSpace,
AsmPrinter &AP) {
// See if we can aggregate some values. Make sure it can be
// represented as a series of bytes of the constant value.
@@ -1710,14 +1708,14 @@
}
else {
for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i)
- EmitGlobalConstantImpl(CA->getOperand(i), AddrSpace, AP);
+ emitGlobalConstantImpl(CA->getOperand(i), AddrSpace, AP);
}
}
-static void EmitGlobalConstantVector(const ConstantVector *CV,
+static void emitGlobalConstantVector(const ConstantVector *CV,
unsigned AddrSpace, AsmPrinter &AP) {
for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i)
- EmitGlobalConstantImpl(CV->getOperand(i), AddrSpace, AP);
+ emitGlobalConstantImpl(CV->getOperand(i), AddrSpace, AP);
const TargetData &TD = *AP.TM.getTargetData();
unsigned Size = TD.getTypeAllocSize(CV->getType());
@@ -1727,7 +1725,7 @@
AP.OutStreamer.EmitZeros(Padding, AddrSpace);
}
-static void EmitGlobalConstantStruct(const ConstantStruct *CS,
+static void emitGlobalConstantStruct(const ConstantStruct *CS,
unsigned AddrSpace, AsmPrinter &AP) {
// Print the fields in successive locations. Pad to align if needed!
const TargetData *TD = AP.TM.getTargetData();
@@ -1744,7 +1742,7 @@
SizeSoFar += FieldSize + PadSize;
// Now print the actual field value.
- EmitGlobalConstantImpl(Field, AddrSpace, AP);
+ emitGlobalConstantImpl(Field, AddrSpace, AP);
// Insert padding - this may include padding to increase the size of the
// current field up to the ABI size (if the struct is not packed) as well
@@ -1755,7 +1753,7 @@
"Layout of constant struct may be incorrect!");
}
-static void EmitGlobalConstantFP(const ConstantFP *CFP, unsigned AddrSpace,
+static void emitGlobalConstantFP(const ConstantFP *CFP, unsigned AddrSpace,
AsmPrinter &AP) {
if (CFP->getType()->isHalfTy()) {
if (AP.isVerbose()) {
@@ -1840,7 +1838,7 @@
}
}
-static void EmitGlobalConstantLargeInt(const ConstantInt *CI,
+static void emitGlobalConstantLargeInt(const ConstantInt *CI,
unsigned AddrSpace, AsmPrinter &AP) {
const TargetData *TD = AP.TM.getTargetData();
unsigned BitWidth = CI->getBitWidth();
@@ -1856,7 +1854,7 @@
}
}
-static void EmitGlobalConstantImpl(const Constant *CV, unsigned AddrSpace,
+static void emitGlobalConstantImpl(const Constant *CV, unsigned AddrSpace,
AsmPrinter &AP) {
const TargetData *TD = AP.TM.getTargetData();
uint64_t Size = TD->getTypeAllocSize(CV->getType());
@@ -1875,13 +1873,13 @@
AP.OutStreamer.EmitIntValue(CI->getZExtValue(), Size, AddrSpace);
return;
default:
- EmitGlobalConstantLargeInt(CI, AddrSpace, AP);
+ emitGlobalConstantLargeInt(CI, AddrSpace, AP);
return;
}
}
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV))
- return EmitGlobalConstantFP(CFP, AddrSpace, AP);
+ return emitGlobalConstantFP(CFP, AddrSpace, AP);
if (isa<ConstantPointerNull>(CV)) {
AP.OutStreamer.EmitIntValue(0, Size, AddrSpace);
@@ -1889,19 +1887,19 @@
}
if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(CV))
- return EmitGlobalConstantDataSequential(CDS, AddrSpace, AP);
+ return emitGlobalConstantDataSequential(CDS, AddrSpace, AP);
if (const ConstantArray *CVA = dyn_cast<ConstantArray>(CV))
- return EmitGlobalConstantArray(CVA, AddrSpace, AP);
+ return emitGlobalConstantArray(CVA, AddrSpace, AP);
if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV))
- return EmitGlobalConstantStruct(CVS, AddrSpace, AP);
+ return emitGlobalConstantStruct(CVS, AddrSpace, AP);
if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
// Look through bitcasts, which might not be able to be MCExpr'ized (e.g. of
// vectors).
if (CE->getOpcode() == Instruction::BitCast)
- return EmitGlobalConstantImpl(CE->getOperand(0), AddrSpace, AP);
+ return emitGlobalConstantImpl(CE->getOperand(0), AddrSpace, AP);
if (Size > 8) {
// If the constant expression's size is greater than 64-bits, then we have
@@ -1909,23 +1907,23 @@
// that way.
Constant *New = ConstantFoldConstantExpression(CE, TD);
if (New && New != CE)
- return EmitGlobalConstantImpl(New, AddrSpace, AP);
+ return emitGlobalConstantImpl(New, AddrSpace, AP);
}
}
if (const ConstantVector *V = dyn_cast<ConstantVector>(CV))
- return EmitGlobalConstantVector(V, AddrSpace, AP);
+ return emitGlobalConstantVector(V, AddrSpace, AP);
// Otherwise, it must be a ConstantExpr. Lower it to an MCExpr, then emit it
// thread the streamer with EmitValue.
- AP.OutStreamer.EmitValue(LowerConstant(CV, AP), Size, AddrSpace);
+ AP.OutStreamer.EmitValue(lowerConstant(CV, AP), Size, AddrSpace);
}
/// EmitGlobalConstant - Print a general LLVM constant to the .s file.
void AsmPrinter::EmitGlobalConstant(const Constant *CV, unsigned AddrSpace) {
uint64_t Size = TM.getTargetData()->getTypeAllocSize(CV->getType());
if (Size)
- EmitGlobalConstantImpl(CV, AddrSpace, *this);
+ emitGlobalConstantImpl(CV, AddrSpace, *this);
else if (MAI->hasSubsectionsViaSymbols()) {
// If the global has zero size, emit a single byte so that two labels don't
// look like they are at the same location.
@@ -2040,8 +2038,8 @@
}
}
-/// EmitBasicBlockLoopComments - Pretty-print comments for basic blocks.
-static void EmitBasicBlockLoopComments(const MachineBasicBlock &MBB,
+/// emitBasicBlockLoopComments - Pretty-print comments for basic blocks.
+static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB,
const MachineLoopInfo *LI,
const AsmPrinter &AP) {
// Add loop depth information
@@ -2107,7 +2105,7 @@
if (const BasicBlock *BB = MBB->getBasicBlock())
if (BB->hasName())
OutStreamer.AddComment("%" + BB->getName());
- EmitBasicBlockLoopComments(*MBB, LI, *this);
+ emitBasicBlockLoopComments(*MBB, LI, *this);
}
// Print the main label for the block.
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index 711375b..b26ffeb 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -43,10 +43,10 @@
};
}
-/// SrcMgrDiagHandler - This callback is invoked when the SourceMgr for an
+/// srcMgrDiagHandler - This callback is invoked when the SourceMgr for an
/// inline asm has an error in it. diagInfo is a pointer to the SrcMgrDiagInfo
/// struct above.
-static void SrcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
+static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
SrcMgrDiagInfo *DiagInfo = static_cast<SrcMgrDiagInfo *>(diagInfo);
assert(DiagInfo && "Diagnostic context not passed down?");
@@ -68,7 +68,8 @@
}
/// EmitInlineAsm - Emit a blob of inline asm to the output streamer.
-void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode) const {
+void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode,
+ InlineAsm::AsmDialect Dialect) const {
#ifndef ANDROID_TARGET_BUILD
assert(!Str.empty() && "Can't emit empty inline asm block");
@@ -92,12 +93,12 @@
LLVMContext &LLVMCtx = MMI->getModule()->getContext();
bool HasDiagHandler = false;
if (LLVMCtx.getInlineAsmDiagnosticHandler() != 0) {
- // If the source manager has an issue, we arrange for SrcMgrDiagHandler
+ // If the source manager has an issue, we arrange for srcMgrDiagHandler
// to be invoked, getting DiagInfo passed into it.
DiagInfo.LocInfo = LocMDNode;
DiagInfo.DiagHandler = LLVMCtx.getInlineAsmDiagnosticHandler();
DiagInfo.DiagContext = LLVMCtx.getInlineAsmDiagnosticContext();
- SrcMgr.setDiagHandler(SrcMgrDiagHandler, &DiagInfo);
+ SrcMgr.setDiagHandler(srcMgrDiagHandler, &DiagInfo);
HasDiagHandler = true;
}
@@ -127,6 +128,7 @@
if (!TAP)
report_fatal_error("Inline asm not supported by this streamer because"
" we don't have an asm parser for this target\n");
+ Parser->setAssemblerDialect(Dialect);
Parser->setTargetParser(*TAP.get());
// Don't implicitly switch to the text section before the asm.
@@ -200,6 +202,15 @@
// The variant of the current asmprinter.
int AsmPrinterVariant = MAI->getAssemblerDialect();
+ int InlineAsmVariant = MI->getInlineAsmDialect();
+
+ // Switch to the inline assembly variant.
+ if (AsmPrinterVariant != InlineAsmVariant) {
+ if (InlineAsmVariant == 0)
+ OS << ".att_syntax\n\t";
+ else
+ OS << ".intel_syntax\n\t";
+ }
int CurVariant = -1; // The number of the {.|.|.} region we are in.
const char *LastEmitted = AsmStr; // One past the last character emitted.
@@ -345,11 +356,11 @@
else {
AsmPrinter *AP = const_cast<AsmPrinter*>(this);
if (InlineAsm::isMemKind(OpFlags)) {
- Error = AP->PrintAsmMemoryOperand(MI, OpNo, AsmPrinterVariant,
+ Error = AP->PrintAsmMemoryOperand(MI, OpNo, InlineAsmVariant,
Modifier[0] ? Modifier : 0,
OS);
} else {
- Error = AP->PrintAsmOperand(MI, OpNo, AsmPrinterVariant,
+ Error = AP->PrintAsmOperand(MI, OpNo, InlineAsmVariant,
Modifier[0] ? Modifier : 0, OS);
}
}
@@ -365,8 +376,16 @@
}
}
}
+ // Switch to the AsmPrinter variant.
+ if (AsmPrinterVariant != InlineAsmVariant) {
+ if (AsmPrinterVariant == 0)
+ OS << "\n\t.att_syntax";
+ else
+ OS << "\n\t.intel_syntax";
+ }
+
OS << '\n' << (char)0; // null terminate string.
- EmitInlineAsm(OS.str(), LocMD);
+ EmitInlineAsm(OS.str(), LocMD, MI->getInlineAsmDialect());
// Emit the #NOAPP end marker. This has to happen even if verbose-asm isn't
// enabled, so we use EmitRawText.
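
When the printer's dialect differs from the inline asm's, the blob is bracketed with .att_syntax/.intel_syntax directives so the assembler parses it in the right mode. An illustrative standalone sketch of that wrapping (plain C++, not the AsmPrinter code):

#include <cstdio>
#include <string>

int main() {
  int AsmPrinterVariant = 0; // 0 = AT&T (the module-level default on X86)
  int InlineAsmVariant = 1;  // 1 = Intel (the call site used 'inteldialect')
  std::string OS;
  if (AsmPrinterVariant != InlineAsmVariant)
    OS += InlineAsmVariant == 0 ? ".att_syntax\n\t" : ".intel_syntax\n\t";
  OS += "mov eax, ebx"; // the user's Intel-syntax blob
  if (AsmPrinterVariant != InlineAsmVariant)
    OS += AsmPrinterVariant == 0 ? "\n\t.att_syntax" : "\n\t.intel_syntax";
  std::printf("%s\n", OS.c_str());
  return 0;
}
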
@@ -413,8 +432,8 @@
/// instruction, using the specified assembler variant. Targets should
/// override this to format as appropriate.
bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) {
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) {
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0]) {
if (ExtraCode[1] != 0) return true; // Unknown modifier.
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index 3776848..0885285 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -182,6 +182,12 @@
void DIEInteger::EmitValue(AsmPrinter *Asm, unsigned Form) const {
unsigned Size = ~0U;
switch (Form) {
+ case dwarf::DW_FORM_flag_present:
+ // Emit something to keep the lines and comments in sync.
+ // FIXME: Is there a better way to do this?
+ if (Asm->OutStreamer.hasRawTextSupport())
+ Asm->OutStreamer.EmitRawText(StringRef(""));
+ return;
case dwarf::DW_FORM_flag: // Fall thru
case dwarf::DW_FORM_ref1: // Fall thru
case dwarf::DW_FORM_data1: Size = 1; break;
@@ -203,6 +209,7 @@
///
unsigned DIEInteger::SizeOf(AsmPrinter *AP, unsigned Form) const {
switch (Form) {
+ case dwarf::DW_FORM_flag_present: return 0;
case dwarf::DW_FORM_flag: // Fall thru
case dwarf::DW_FORM_ref1: // Fall thru
case dwarf::DW_FORM_data1: return sizeof(int8_t);
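
DW_FORM_flag_present (DWARF 4) encodes a true flag entirely in the abbreviation, so it contributes zero bytes per DIE where DW_FORM_flag costs one; that is why SizeOf returns 0 and EmitValue emits nothing. A toy illustration of the space accounting (assumption-level sketch, not LLVM code):

#include <cstdio>

// Each flag attribute adds one byte with DW_FORM_flag, zero bytes with
// DW_FORM_flag_present.
static unsigned sizeOfFlag(bool UsePresent) { return UsePresent ? 0 : 1; }

int main() {
  unsigned NumFlagAttrs = 5; // e.g. DW_AT_prototyped, DW_AT_external, ...
  std::printf("DW_FORM_flag: %u bytes, DW_FORM_flag_present: %u bytes\n",
              NumFlagAttrs * sizeOfFlag(false),
              NumFlagAttrs * sizeOfFlag(true));
  return 0;
}
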
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index d30e5bb..e585897 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -51,6 +51,15 @@
return Value;
}
+/// addFlag - Add a flag that is true.
+void CompileUnit::addFlag(DIE *Die, unsigned Attribute) {
+ if (!DD->useDarwinGDBCompat())
+ Die->addValue(Attribute, dwarf::DW_FORM_flag_present,
+ DIEIntegerOne);
+ else
+ addUInt(Die, Attribute, dwarf::DW_FORM_flag, 1);
+}
+
/// addUInt - Add an unsigned integer attribute data and value.
///
void CompileUnit::addUInt(DIE *Die, unsigned Attribute,
@@ -794,7 +803,7 @@
(Language == dwarf::DW_LANG_C89 ||
Language == dwarf::DW_LANG_C99 ||
Language == dwarf::DW_LANG_ObjC))
- addUInt(&Buffer, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
+ addFlag(&Buffer, dwarf::DW_AT_prototyped);
}
break;
case dwarf::DW_TAG_structure_type:
@@ -825,15 +834,15 @@
addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
dwarf::DW_ACCESS_public);
if (SP.isExplicit())
- addUInt(ElemDie, dwarf::DW_AT_explicit, dwarf::DW_FORM_flag, 1);
+ addFlag(ElemDie, dwarf::DW_AT_explicit);
}
else if (Element.isVariable()) {
DIVariable DV(Element);
ElemDie = new DIE(dwarf::DW_TAG_variable);
addString(ElemDie, dwarf::DW_AT_name, DV.getName());
addType(ElemDie, DV.getType());
- addUInt(ElemDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
- addUInt(ElemDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+ addFlag(ElemDie, dwarf::DW_AT_declaration);
+ addFlag(ElemDie, dwarf::DW_AT_external);
addSourceLine(ElemDie, DV);
} else if (Element.isDerivedType()) {
DIDerivedType DDTy(Element);
@@ -883,7 +892,7 @@
}
if (CTy.isAppleBlockExtension())
- addUInt(&Buffer, dwarf::DW_AT_APPLE_block, dwarf::DW_FORM_flag, 1);
+ addFlag(&Buffer, dwarf::DW_AT_APPLE_block);
DICompositeType ContainingType = CTy.getContainingType();
if (DIDescriptor(ContainingType).isCompositeType())
@@ -895,8 +904,7 @@
}
if (CTy.isObjcClassComplete())
- addUInt(&Buffer, dwarf::DW_AT_APPLE_objc_complete_type,
- dwarf::DW_FORM_flag, 1);
+ addFlag(&Buffer, dwarf::DW_AT_APPLE_objc_complete_type);
// Add template parameters to a class, structure or union types.
// FIXME: The support isn't in the metadata for this yet.
@@ -929,7 +937,7 @@
// If we're a forward decl, say so.
if (CTy.isForwardDecl())
- addUInt(&Buffer, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+ addFlag(&Buffer, dwarf::DW_AT_declaration);
// Add source line info if available.
if (!CTy.isForwardDecl())
@@ -1028,8 +1036,10 @@
// AT_specification code in order to work around a bug in older
// gdbs that requires the linkage name to resolve multiple template
// functions.
+ // TODO: Remove this set of code when we get rid of the old gdb
+ // compatibility.
StringRef LinkageName = SP.getLinkageName();
- if (!LinkageName.empty())
+ if (!LinkageName.empty() && DD->useDarwinGDBCompat())
addString(SPDie, dwarf::DW_AT_MIPS_linkage_name,
getRealLinkageName(LinkageName));
@@ -1043,6 +1053,11 @@
return SPDie;
}
+ // Add the linkage name if we have one.
+ if (!LinkageName.empty() && !DD->useDarwinGDBCompat())
+ addString(SPDie, dwarf::DW_AT_MIPS_linkage_name,
+ getRealLinkageName(LinkageName));
+
// Constructors and operators for anonymous aggregates do not have names.
if (!SP.getName().empty())
addString(SPDie, dwarf::DW_AT_name, SP.getName());
@@ -1055,7 +1070,7 @@
(Language == dwarf::DW_LANG_C89 ||
Language == dwarf::DW_LANG_C99 ||
Language == dwarf::DW_LANG_ObjC))
- addUInt(SPDie, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
+ addFlag(SPDie, dwarf::DW_AT_prototyped);
// Add Return Type.
DICompositeType SPTy = SP.getType();
@@ -1079,7 +1094,7 @@
}
if (!SP.isDefinition()) {
- addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+ addFlag(SPDie, dwarf::DW_AT_declaration);
// Add arguments. Do not add arguments for subprogram definition. They will
// be handled while processing variables.
@@ -1093,19 +1108,19 @@
DIType ATy = DIType(DIType(Args.getElement(i)));
addType(Arg, ATy);
if (ATy.isArtificial())
- addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
+ addFlag(Arg, dwarf::DW_AT_artificial);
SPDie->addChild(Arg);
}
}
if (SP.isArtificial())
- addUInt(SPDie, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
+ addFlag(SPDie, dwarf::DW_AT_artificial);
if (!SP.isLocalToUnit())
- addUInt(SPDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+ addFlag(SPDie, dwarf::DW_AT_external);
if (SP.isOptimized())
- addUInt(SPDie, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1);
+ addFlag(SPDie, dwarf::DW_AT_APPLE_optimized);
if (unsigned isa = Asm->getISAEncoding()) {
addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa);
@@ -1168,7 +1183,7 @@
// Add scoping info.
if (!GV.isLocalToUnit())
- addUInt(VariableDIE, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+ addFlag(VariableDIE, dwarf::DW_AT_external);
// Add line number info.
addSourceLine(VariableDIE, GV);
@@ -1193,8 +1208,7 @@
addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification,
dwarf::DW_FORM_ref4, VariableDIE);
addBlock(VariableSpecDIE, dwarf::DW_AT_location, 0, Block);
- addUInt(VariableDIE, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag,
- 1);
+ addFlag(VariableDIE, dwarf::DW_AT_declaration);
addDie(VariableSpecDIE);
} else {
addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
@@ -1260,7 +1274,7 @@
DICompositeType *CTy) {
Buffer.setTag(dwarf::DW_TAG_array_type);
if (CTy->getTag() == dwarf::DW_TAG_vector_type)
- addUInt(&Buffer, dwarf::DW_AT_GNU_vector, dwarf::DW_FORM_flag, 1);
+ addFlag(&Buffer, dwarf::DW_AT_GNU_vector);
// Emit derived type.
addType(&Buffer, CTy->getTypeDerivedFrom());
@@ -1333,8 +1347,7 @@
}
if (DV->isArtificial())
- addUInt(VariableDie, dwarf::DW_AT_artificial,
- dwarf::DW_FORM_flag, 1);
+ addFlag(VariableDie, dwarf::DW_AT_artificial);
if (isScopeAbstract) {
DV->setDIE(VariableDie);
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index b4ff9e8..22401fe 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -176,6 +176,9 @@
}
public:
+ /// addFlag - Add a flag that is true to the DIE.
+ void addFlag(DIE *Die, unsigned Attribute);
+
/// addUInt - Add an unsigned integer attribute data and value.
///
void addUInt(DIE *Die, unsigned Attribute, unsigned Form, uint64_t Integer);
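The hunks above and below replace every addUInt(..., dwarf::DW_FORM_flag, 1) call with this new addFlag() helper. Its definition is not part of this excerpt; a minimal sketch that simply preserves the old encoding follows, though the real definition may instead emit the DWARF 4 DW_FORM_flag_present form when gdb compatibility is off.

/// addFlag - Add a flag that is true to the DIE (sketch only, not the
/// actual definition from DwarfCompileUnit.cpp).
void CompileUnit::addFlag(DIE *Die, unsigned Attribute) {
  // A flag attribute needs no payload beyond its presence; value 1 in
  // DW_FORM_flag form marks it as set.
  addUInt(Die, Attribute, dwarf::DW_FORM_flag, 1);
}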
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 649684a..946ac35 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -54,9 +54,29 @@
cl::desc("Make an absence of debug location information explicit."),
cl::init(false));
-static cl::opt<bool> DwarfAccelTables("dwarf-accel-tables", cl::Hidden,
+namespace {
+ enum DefaultOnOff {
+ Default, Enable, Disable
+ };
+}
+
+static cl::opt<DefaultOnOff> DwarfAccelTables("dwarf-accel-tables", cl::Hidden,
cl::desc("Output prototype dwarf accelerator tables."),
- cl::init(false));
+ cl::values(
+ clEnumVal(Default, "Default for platform"),
+ clEnumVal(Enable, "Enabled"),
+ clEnumVal(Disable, "Disabled"),
+ clEnumValEnd),
+ cl::init(Default));
+
+static cl::opt<DefaultOnOff> DarwinGDBCompat("darwin-gdb-compat", cl::Hidden,
+ cl::desc("Compatibility with Darwin gdb."),
+ cl::values(
+ clEnumVal(Default, "Default for platform"),
+ clEnumVal(Enable, "Enabled"),
+ clEnumVal(Disable, "Disabled"),
+ clEnumValEnd),
+ cl::init(Default));
namespace {
const char *DWARFGroupName = "DWARF Emission";
@@ -135,10 +155,25 @@
DwarfDebugRangeSectionSym = DwarfDebugLocSectionSym = 0;
FunctionBeginSym = FunctionEndSym = 0;
- // Turn on accelerator tables for Darwin.
- if (Triple(M->getTargetTriple()).isOSDarwin())
- DwarfAccelTables = true;
-
+ // Turn on accelerator tables and older gdb compatibility
+ // for Darwin.
+ bool isDarwin = Triple(M->getTargetTriple()).isOSDarwin();
+  if (DarwinGDBCompat == Default)
+    isDarwinGDBCompat = isDarwin;
+  else
+    isDarwinGDBCompat = DarwinGDBCompat == Enable;
+
+  if (DwarfAccelTables == Default)
+    hasDwarfAccelTables = isDarwin;
+  else
+    hasDwarfAccelTables = DwarfAccelTables == Enable;
+
{
NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
beginModule(M);
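Both assignments above resolve the same tri-state: an explicit Enable/Disable wins, and Default defers to the platform. The pattern, factored into a helper for illustration (the patch itself inlines the logic):

enum DefaultOnOff { Default, Enable, Disable }; // as introduced above

static bool resolveDefaultOnOff(DefaultOnOff Opt, bool PlatformDefault) {
  if (Opt == Default)
    return PlatformDefault;
  return Opt == Enable;
}

// e.g. isDarwinGDBCompat = resolveDefaultOnOff(DarwinGDBCompat, isDarwin);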
@@ -282,7 +317,7 @@
if (SP.isDefinition() && !SP.getContext().isCompileUnit() &&
!SP.getContext().isFile() &&
!isSubprogramContext(SP.getContext())) {
- SPCU->addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+ SPCU->addFlag(SPDie, dwarf::DW_AT_declaration);
// Add arguments.
DICompositeType SPTy = SP.getType();
@@ -294,7 +329,7 @@
DIType ATy = DIType(DIType(Args.getElement(i)));
SPCU->addType(Arg, ATy);
if (ATy.isArtificial())
- SPCU->addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
+ SPCU->addFlag(Arg, dwarf::DW_AT_artificial);
SPDie->addChild(Arg);
}
DIE *SPDeclDie = SPDie;
@@ -575,7 +610,7 @@
if (!CompilationDir.empty())
NewCU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
if (DIUnit.isOptimized())
- NewCU->addUInt(Die, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1);
+ NewCU->addFlag(Die, dwarf::DW_AT_APPLE_optimized);
StringRef Flags = DIUnit.getFlags();
if (!Flags.empty())
@@ -816,8 +851,8 @@
// Corresponding abbreviations into a abbrev section.
emitAbbreviations();
- // Emit info into a dwarf accelerator table sections.
- if (DwarfAccelTables) {
+ // Emit info into the dwarf accelerator table sections.
+ if (useDwarfAccelTables()) {
emitAccelNames();
emitAccelObjC();
emitAccelNamespaces();
@@ -825,7 +860,10 @@
}
// Emit info into a debug pubtypes section.
- emitDebugPubTypes();
+ // TODO: When we don't need the option anymore we can
+ // remove all of the code that adds to the table.
+ if (useDarwinGDBCompat())
+ emitDebugPubTypes();
// Emit info into a debug loc section.
emitDebugLoc();
@@ -840,7 +878,11 @@
emitDebugMacInfo();
// Emit inline info.
- emitDebugInlineInfo();
+ // TODO: When we don't need the option anymore we
+ // can remove all of the code that this section
+ // depends upon.
+ if (useDarwinGDBCompat())
+ emitDebugInlineInfo();
// Emit info into a debug str section.
emitDebugStr();
@@ -1439,8 +1481,7 @@
DIE *CurFnDIE = constructScopeDIE(TheCU, FnScope);
if (!MF->getTarget().Options.DisableFramePointerElim(*MF))
- TheCU->addUInt(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr,
- dwarf::DW_FORM_flag, 1);
+ TheCU->addFlag(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr);
DebugFrames.push_back(FunctionDebugFrameInfo(Asm->getFunctionNumber(),
MMI->getFrameMoves()));
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index d1d6512..f94c9d0 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -307,6 +307,9 @@
// table for the same directory as DW_at_comp_dir.
StringRef CompilationDir;
+ // A holder for the DarwinGDBCompat flag so that the compile unit can use it.
+ bool isDarwinGDBCompat;
+ bool hasDwarfAccelTables;
private:
/// assignAbbrevNumber - Define a unique number for the abbreviation.
@@ -520,6 +523,11 @@
/// getStringPoolEntry - returns an entry into the string pool with the given
/// string text.
MCSymbol *getStringPoolEntry(StringRef Str);
+
+ /// useDarwinGDBCompat - returns whether to restrict some of our debug
+ /// output to what older versions of darwin gdb can consume.
+ bool useDarwinGDBCompat() { return isDarwinGDBCompat; }
+ bool useDwarfAccelTables() { return hasDwarfAccelTables; }
};
} // End of namespace llvm
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h
index 75f6056..fe9e493 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -43,26 +43,6 @@
/// MMI - Collected machine module information.
MachineModuleInfo *MMI;
- /// EmitExceptionTable - Emit landing pads and actions.
- ///
- /// The general organization of the table is complex, but the basic concepts
- /// are easy. First there is a header which describes the location and
- /// organization of the three components that follow.
- /// 1. The landing pad site information describes the range of code covered
- /// by the try. In our case it's an accumulation of the ranges covered
- /// by the invokes in the try. There is also a reference to the landing
- /// pad that handles the exception once processed. Finally an index into
- /// the actions table.
- /// 2. The action table, in our case, is composed of pairs of type ids
- /// and next action offset. Starting with the action index from the
- /// landing pad site, each type Id is checked for a match to the current
- /// exception. If it matches then the exception and type id are passed
- /// on to the landing pad. Otherwise the next action is looked up. This
- /// chain is terminated with a next action of zero. If no type id is
- /// found the frame is unwound and handling continues.
- /// 3. Type id table contains references to all the C++ typeinfo for all
- /// catches in the function. This tables is reversed indexed base 1.
-
/// SharedTypeIds - How many leading type ids two landing pads have in common.
static unsigned SharedTypeIds(const LandingPadInfo *L,
const LandingPadInfo *R);
@@ -119,6 +99,26 @@
const RangeMapType &PadMap,
const SmallVectorImpl<const LandingPadInfo *> &LPs,
const SmallVectorImpl<unsigned> &FirstActions);
+
+ /// EmitExceptionTable - Emit landing pads and actions.
+ ///
+ /// The general organization of the table is complex, but the basic concepts
+ /// are easy. First there is a header which describes the location and
+ /// organization of the three components that follow.
+ /// 1. The landing pad site information describes the range of code covered
+ /// by the try. In our case it's an accumulation of the ranges covered
+ /// by the invokes in the try. There is also a reference to the landing
+ /// pad that handles the exception once processed. Finally an index into
+ /// the actions table.
+ /// 2. The action table, in our case, is composed of pairs of type ids
+ /// and next action offset. Starting with the action index from the
+ /// landing pad site, each type Id is checked for a match to the current
+ /// exception. If it matches then the exception and type id are passed
+ /// on to the landing pad. Otherwise the next action is looked up. This
+ /// chain is terminated with a next action of zero. If no type id is
+ /// found the frame is unwound and handling continues.
+ /// 3. The type id table contains references to all the C++ typeinfo for all
+ ///    catches in the function. This table is reverse indexed, base 1.
void EmitExceptionTable();
public:
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index fb65bb7..7df0e15 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -1554,8 +1554,7 @@
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
Uses.insert(*AI);
} else {
- if (Uses.count(Reg)) {
- Uses.erase(Reg);
+ if (Uses.erase(Reg)) {
for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
Uses.erase(*SubRegs); // Use sub-registers to be conservative
}
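The rewrite leans on SmallSet::erase() returning whether an element was removed, folding the count() lookup and the erase() into a single operation; the IfConversion.cpp hunk further down uses the matching insert() idiom. The standard containers offer the same contract, shown here with std::set:

#include <set>

// erase() returns the number of elements removed: test-and-remove in one op.
static bool removeIfPresent(std::set<unsigned> &S, unsigned Reg) {
  return S.erase(Reg) != 0;
}

// insert() reports whether the element was newly added: test-and-add in one.
static bool addIfAbsent(std::set<unsigned> &S, unsigned Reg) {
  return S.insert(Reg).second;
}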
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 2e189ad..386509b 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -95,6 +95,7 @@
SplitKit.cpp
StackProtector.cpp
StackSlotColoring.cpp
+ StackColoring.cpp
StrongPHIElimination.cpp
TailDuplication.cpp
TargetFrameLoweringImpl.cpp
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index 939af3f..bc5258e 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -9,7 +9,6 @@
#define DEBUG_TYPE "calcspillweights"
-#include "llvm/Function.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
@@ -42,8 +41,7 @@
bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** Compute Spill Weights **********\n"
- << "********** Function: "
- << MF.getFunction()->getName() << '\n');
+ << "********** Function: " << MF.getName() << '\n');
LiveIntervals &LIS = getAnalysis<LiveIntervals>();
MachineRegisterInfo &MRI = MF.getRegInfo();
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index fb2c2e8..65f0941 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -56,6 +56,7 @@
initializeRegisterCoalescerPass(Registry);
initializeSlotIndexesPass(Registry);
initializeStackProtectorPass(Registry);
+ initializeStackColoringPass(Registry);
initializeStackSlotColoringPass(Registry);
initializeStrongPHIEliminationPass(Registry);
initializeTailDuplicatePassPass(Registry);
diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp
index f9347ef..c40c5ac 100644
--- a/lib/CodeGen/EarlyIfConversion.cpp
+++ b/lib/CodeGen/EarlyIfConversion.cpp
@@ -18,7 +18,6 @@
#define DEBUG_TYPE "early-ifcvt"
#include "MachineTraceMetrics.h"
-#include "llvm/Function.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
@@ -775,8 +774,7 @@
bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
- << "********** Function: "
- << ((Value*)MF.getFunction())->getName() << '\n');
+ << "********** Function: " << MF.getName() << '\n');
TII = MF.getTarget().getInstrInfo();
TRI = MF.getTarget().getRegisterInfo();
SchedModel = MF.getTarget().getInstrItineraryData()->SchedModel;
diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp
index 7a17331..ffe4b63 100644
--- a/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -14,7 +14,6 @@
#define DEBUG_TYPE "postrapseudos"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Function.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -190,8 +189,7 @@
bool ExpandPostRA::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "Machine Function\n"
<< "********** EXPANDING POST-RA PSEUDO INSTRS **********\n"
- << "********** Function: "
- << MF.getFunction()->getName() << '\n');
+ << "********** Function: " << MF.getName() << '\n');
TRI = MF.getTarget().getRegisterInfo();
TII = MF.getTarget().getInstrInfo();
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 4214ba1..31e36f0 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -13,7 +13,6 @@
#define DEBUG_TYPE "ifcvt"
#include "BranchFolding.h"
-#include "llvm/Function.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
@@ -282,7 +281,7 @@
}
DEBUG(dbgs() << "\nIfcvt: function (" << ++FnNum << ") \'"
- << MF.getFunction()->getName() << "\'");
+ << MF.getName() << "\'");
if (FnNum < IfCvtFnStart || (IfCvtFnStop != -1 && FnNum > IfCvtFnStop)) {
DEBUG(dbgs() << " skipped\n");
@@ -997,14 +996,13 @@
}
for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
unsigned Reg = Defs[i];
- if (Redefs.count(Reg)) {
+ if (!Redefs.insert(Reg)) {
if (AddImpUse)
// Treat predicated update as read + write.
MI->addOperand(MachineOperand::CreateReg(Reg, false/*IsDef*/,
true/*IsImp*/,false/*IsKill*/,
false/*IsDead*/,true/*IsUndef*/));
} else {
- Redefs.insert(Reg);
for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
Redefs.insert(*SubRegs);
}
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 07e37af..622127c 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -613,7 +613,7 @@
propagateSiblingValue(SVI);
} while (!WorkList.empty());
- // Look up the value we were looking for. We already did this lokup at the
+ // Look up the value we were looking for. We already did this lookup at the
// top of the function, but SibValues may have been invalidated.
SVI = SibValues.find(UseVNI);
assert(SVI != SibValues.end() && "Didn't compute requested info");
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index d631726..defc127 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -687,8 +687,7 @@
clear();
LS.initialize(mf);
DEBUG(dbgs() << "********** COMPUTING LIVE DEBUG VARIABLES: "
- << ((Value*)mf.getFunction())->getName()
- << " **********\n");
+ << mf.getName() << " **********\n");
bool Changed = collectDebugValues(mf);
computeIntervals();
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index 0a795e6..3e9b485 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -27,6 +27,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "RegisterCoalescer.h"
#include <algorithm>
using namespace llvm;
@@ -142,6 +143,48 @@
return false;
}
+bool LiveInterval::overlaps(const LiveInterval &Other,
+ const CoalescerPair &CP,
+ const SlotIndexes &Indexes) const {
+ assert(!empty() && "empty interval");
+ if (Other.empty())
+ return false;
+
+ // Use binary searches to find initial positions.
+ const_iterator I = find(Other.beginIndex());
+ const_iterator IE = end();
+ if (I == IE)
+ return false;
+ const_iterator J = Other.find(I->start);
+ const_iterator JE = Other.end();
+ if (J == JE)
+ return false;
+
+ for (;;) {
+ // J has just been advanced to satisfy:
+ assert(J->end >= I->start);
+ // Check for an overlap.
+ if (J->start < I->end) {
+ // I and J are overlapping. Find the later start.
+ SlotIndex Def = std::max(I->start, J->start);
+ // Allow the overlap if Def is a coalescable copy.
+ if (Def.isBlock() ||
+ !CP.isCoalescable(Indexes.getInstructionFromIndex(Def)))
+ return true;
+ }
+ // Advance the iterator that ends first to check for more overlaps.
+ if (J->end > I->end) {
+ std::swap(I, J);
+ std::swap(IE, JE);
+ }
+ // Advance J until J->end >= I->start.
+ do
+ if (++J == JE)
+ return false;
+ while (J->end < I->start);
+ }
+}
+
/// overlaps - Return true if the live interval overlaps a range specified
/// by [Start, End).
bool LiveInterval::overlaps(SlotIndex Start, SlotIndex End) const {
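The new overload above is a two-cursor sweep over sorted, disjoint ranges, with one twist: an intersection is forgiven when it starts at a coalescable copy. Stripped of the SlotIndex machinery and the copy exception, the kernel of the sweep looks like this (self-contained illustration over plain integer ranges):

#include <vector>

struct Range { int start, end; };        // half-open [start, end)

static bool rangesOverlap(const std::vector<Range> &A,
                          const std::vector<Range> &B) {
  size_t i = 0, j = 0;
  while (i < A.size() && j < B.size()) {
    if (A[i].start < B[j].end && B[j].start < A[i].end)
      return true;                       // current pair intersects
    // Advance whichever range ends first; both lists are sorted.
    if (A[i].end < B[j].end) ++i; else ++j;
  }
  return false;
}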
@@ -705,9 +748,11 @@
return os << '[' << LR.start << ',' << LR.end << ':' << LR.valno->id << ")";
}
+#ifndef NDEBUG
void LiveRange::dump() const {
dbgs() << *this << "\n";
}
+#endif
void LiveInterval::print(raw_ostream &OS) const {
if (empty())
@@ -740,9 +785,11 @@
}
}
+#ifndef NDEBUG
void LiveInterval::dump() const {
dbgs() << *this << "\n";
}
+#endif
#ifndef NDEBUG
void LiveInterval::verify() const {
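This is the first of many hunks in the patch that compile dump() helpers out of release builds. The idiom, reduced to a standalone example:

#include <iostream>

struct Interval { int Lo, Hi; };

#ifndef NDEBUG
// Present only in asserts-enabled builds; release (NDEBUG) binaries omit it.
static void dump(const Interval &I) {
  std::cerr << '[' << I.Lo << ',' << I.Hi << ")\n";
}
#endif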
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index d0f8ae1..17f9d9e 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -34,6 +34,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "LiveRangeCalc.h"
+#include "VirtRegMap.h"
#include <algorithm>
#include <limits>
#include <cmath>
@@ -155,9 +156,11 @@
MF->print(OS, Indexes);
}
+#ifndef NDEBUG
void LiveIntervals::dumpInstrs() const {
printInstrs(dbgs());
}
+#endif
static
bool MultipleDefsBySameMI(const MachineInstr &MI, unsigned MOIdx) {
@@ -382,8 +385,7 @@
/// which a variable is live
void LiveIntervals::computeIntervals() {
DEBUG(dbgs() << "********** COMPUTING LIVE INTERVALS **********\n"
- << "********** Function: "
- << ((Value*)MF->getFunction())->getName() << '\n');
+ << "********** Function: " << MF->getName() << '\n');
RegMaskBlocks.resize(MF->getNumBlockIDs());
@@ -440,7 +442,7 @@
// Compute the number of register mask instructions in this block.
std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB->getNumber()];
- RMB.second = RegMaskSlots.size() - RMB.first;;
+ RMB.second = RegMaskSlots.size() - RMB.first;
}
// Create empty intervals for registers defined by implicit_def's (except
@@ -497,7 +499,7 @@
RegMaskBits.push_back(MO->getRegMask());
}
// Compute the number of register mask instructions in this block.
- RMB.second = RegMaskSlots.size() - RMB.first;;
+ RMB.second = RegMaskSlots.size() - RMB.first;
}
}
@@ -734,12 +736,28 @@
// Register allocator hooks.
//
-void LiveIntervals::addKillFlags() {
+void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
+ // Keep track of regunit ranges.
+ SmallVector<std::pair<LiveInterval*, LiveInterval::iterator>, 8> RU;
+
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
if (MRI->reg_nodbg_empty(Reg))
continue;
LiveInterval *LI = &getInterval(Reg);
+ if (LI->empty())
+ continue;
+
+ // Find the regunit intervals for the assigned register. They may overlap
+ // the virtual register live range, cancelling any kills.
+ RU.clear();
+ for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid();
+ ++Units) {
+ LiveInterval *RUInt = &getRegUnit(*Units);
+ if (RUInt->empty())
+ continue;
+ RU.push_back(std::make_pair(RUInt, RUInt->find(LI->begin()->end)));
+ }
// Every instruction that kills Reg corresponds to a live range end point.
for (LiveInterval::iterator RI = LI->begin(), RE = LI->end(); RI != RE;
@@ -750,7 +768,32 @@
MachineInstr *MI = getInstructionFromIndex(RI->end);
if (!MI)
continue;
- MI->addRegisterKilled(Reg, NULL);
+
+ // Check if any of the regunits are live beyond the end of RI. That could
+ // happen when a physreg is defined as a copy of a virtreg:
+ //
+ // %EAX = COPY %vreg5
+ // FOO %vreg5 <--- MI, cancel kill because %EAX is live.
+ // BAR %EAX<kill>
+ //
+ // There should be no kill flag on FOO when %vreg5 is rewritten as %EAX.
+ bool CancelKill = false;
+ for (unsigned u = 0, e = RU.size(); u != e; ++u) {
+ LiveInterval *RInt = RU[u].first;
+ LiveInterval::iterator &I = RU[u].second;
+ if (I == RInt->end())
+ continue;
+ I = RInt->advanceTo(I, RI->end);
+ if (I == RInt->end() || I->start >= RI->end)
+ continue;
+ // I is overlapping RI.
+ CancelKill = true;
+ break;
+ }
+ if (CancelKill)
+ MI->clearRegisterKills(Reg, NULL);
+ else
+ MI->addRegisterKilled(Reg, NULL);
}
}
}
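Note the cost model behind RU: each regunit interval carries a resume cursor (RU[u].second) that only moves forward via advanceTo(), so probing every kill point of LI amounts to one linear pass per regunit instead of a fresh binary search per kill. A toy rendering of the cursor, assuming sorted, disjoint, half-open segments:

#include <vector>

struct Seg { int start, end; };

// Return the first segment at or after Cursor that ends after Pos.
// Repeated calls with non-decreasing Pos touch each segment at most once.
static size_t advanceTo(const std::vector<Seg> &Segs, size_t Cursor, int Pos) {
  while (Cursor != Segs.size() && Segs[Cursor].end <= Pos)
    ++Cursor;
  return Cursor;
}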
@@ -1174,7 +1217,7 @@
SlotIndex LastUse = findLastUseBefore(LI->reg, OldIdx);
if (LastUse != NewIdx)
moveKillFlags(LI->reg, NewIdx, LastUse);
- LR->end = LastUse.getRegSlot();
+ LR->end = LastUse.getRegSlot(LR->end.isEarlyClobber());
}
void moveEnteringDownFrom(SlotIndex OldIdx, IntRangePair& P) {
@@ -1188,7 +1231,7 @@
assert(LR->end > OldIdx && "LiveRange does not cover original slot");
moveKillFlags(LI->reg, LR->end, NewIdx);
}
- LR->end = NewIdx.getRegSlot();
+ LR->end = NewIdx.getRegSlot(LR->end.isEarlyClobber());
}
}
diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
index d828f25..c3ff4f1 100644
--- a/lib/CodeGen/LiveRangeCalc.cpp
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -65,7 +65,11 @@
// Visit all operands that read Reg. This may include partial defs.
for (MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(Reg),
E = MRI->reg_nodbg_end(); I != E; ++I) {
- const MachineOperand &MO = I.getOperand();
+ MachineOperand &MO = I.getOperand();
+ // Clear all kill flags. They will be reinserted after register allocation
+ // by LiveIntervalAnalysis::addKillFlags().
+ if (MO.isUse())
+ MO.setIsKill(false);
if (!MO.readsReg())
continue;
// MI is reading Reg. We may have visited MI before if it happens to be
diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp
index cdb1776..7f22478 100644
--- a/lib/CodeGen/LiveRegMatrix.cpp
+++ b/lib/CodeGen/LiveRegMatrix.cpp
@@ -13,6 +13,7 @@
#define DEBUG_TYPE "regalloc"
#include "LiveRegMatrix.h"
+#include "RegisterCoalescer.h"
#include "VirtRegMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -117,8 +118,9 @@
unsigned PhysReg) {
if (VirtReg.empty())
return false;
+ CoalescerPair CP(VirtReg.reg, PhysReg, *TRI);
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
- if (VirtReg.overlaps(LIS->getRegUnit(*Units)))
+ if (VirtReg.overlaps(LIS->getRegUnit(*Units), CP, *LIS->getSlotIndexes()))
return true;
return false;
}
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 348ed3a..f294124 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -65,6 +65,7 @@
}
void LiveVariables::VarInfo::dump() const {
+#ifndef NDEBUG
dbgs() << " Alive in blocks: ";
for (SparseBitVector<>::iterator I = AliveBlocks.begin(),
E = AliveBlocks.end(); I != E; ++I)
@@ -77,6 +78,7 @@
dbgs() << "\n #" << i << ": " << *Kills[i];
dbgs() << "\n";
}
+#endif
}
/// getVarInfo - Get (possibly creating) a VarInfo object for the given vreg.
@@ -806,18 +808,44 @@
MachineBasicBlock *SuccBB) {
const unsigned NumNew = BB->getNumber();
- // All registers used by PHI nodes in SuccBB must be live through BB.
- for (MachineBasicBlock::iterator BBI = SuccBB->begin(),
- BBE = SuccBB->end(); BBI != BBE && BBI->isPHI(); ++BBI)
+ SmallSet<unsigned, 16> Defs, Kills;
+
+ MachineBasicBlock::iterator BBI = SuccBB->begin(), BBE = SuccBB->end();
+ for (; BBI != BBE && BBI->isPHI(); ++BBI) {
+ // Record the def of the PHI node.
+ Defs.insert(BBI->getOperand(0).getReg());
+
+ // All registers used by PHI nodes in SuccBB must be live through BB.
for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2)
if (BBI->getOperand(i+1).getMBB() == BB)
getVarInfo(BBI->getOperand(i).getReg()).AliveBlocks.set(NumNew);
+ }
+
+ // Record all vreg defs and kills of all instructions in SuccBB.
+ for (; BBI != BBE; ++BBI) {
+ for (MachineInstr::mop_iterator I = BBI->operands_begin(),
+ E = BBI->operands_end(); I != E; ++I) {
+ if (I->isReg() && TargetRegisterInfo::isVirtualRegister(I->getReg())) {
+ if (I->isDef())
+ Defs.insert(I->getReg());
+ else if (I->isKill())
+ Kills.insert(I->getReg());
+ }
+ }
+ }
// Update info for all live variables
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+
+ // If the register is defined in the successor it can't be live in BB.
+ if (Defs.count(Reg))
+ continue;
+
+ // If the register is either killed in or live through SuccBB it's also live
+ // through BB.
VarInfo &VI = getVarInfo(Reg);
- if (!VI.AliveBlocks.test(NumNew) && VI.isLiveIn(*SuccBB, Reg, *MRI))
+ if (Kills.count(Reg) || VI.AliveBlocks.test(SuccBB->getNumber()))
VI.AliveBlocks.set(NumNew);
}
}
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index cf13dbd..9250577 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -228,9 +228,11 @@
return 0;
}
+#ifndef NDEBUG
void MachineBasicBlock::dump() const {
print(dbgs());
}
+#endif
StringRef MachineBasicBlock::getName() const {
if (const BasicBlock *LBB = getBasicBlock())
@@ -243,7 +245,7 @@
std::string MachineBasicBlock::getFullName() const {
std::string Name;
if (getParent())
- Name = (getParent()->getFunction()->getName() + ":").str();
+ Name = (getParent()->getName() + ":").str();
if (getBasicBlock())
Name += getBasicBlock()->getName();
else
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index d4aede8..c282332 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -284,12 +284,19 @@
return std::make_pair(Result, Result + Num);
}
+#ifndef NDEBUG
void MachineFunction::dump() const {
print(dbgs());
}
+#endif
+
+StringRef MachineFunction::getName() const {
+ assert(getFunction() && "No function!");
+ return getFunction()->getName();
+}
void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const {
- OS << "# Machine code for function " << Fn->getName() << ": ";
+ OS << "# Machine code for function " << getName() << ": ";
if (RegInfo) {
OS << (RegInfo->isSSA() ? "SSA" : "Post SSA");
if (!RegInfo->tracksLiveness())
@@ -334,7 +341,7 @@
BB->print(OS, Indexes);
}
- OS << "\n# End machine code for function " << Fn->getName() << ".\n\n";
+ OS << "\n# End machine code for function " << getName() << ".\n\n";
}
namespace llvm {
@@ -344,7 +351,7 @@
DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
static std::string getGraphName(const MachineFunction *F) {
- return "CFG for '" + F->getFunction()->getName().str() + "' function";
+ return "CFG for '" + F->getName().str() + "' function";
}
std::string getNodeLabel(const MachineBasicBlock *Node,
@@ -377,7 +384,7 @@
void MachineFunction::viewCFG() const
{
#ifndef NDEBUG
- ViewGraph(this, "mf" + getFunction()->getName());
+ ViewGraph(this, "mf" + getName());
#else
errs() << "MachineFunction::viewCFG is only available in debug builds on "
<< "systems with Graphviz or gv!\n";
@@ -387,7 +394,7 @@
void MachineFunction::viewCFGOnly() const
{
#ifndef NDEBUG
- ViewGraph(this, "mf" + getFunction()->getName(), true);
+ ViewGraph(this, "mf" + getName(), true);
#else
errs() << "MachineFunction::viewCFGOnly is only available in debug builds on "
<< "systems with Graphviz or gv!\n";
@@ -453,7 +460,9 @@
unsigned StackAlign = TFI.getStackAlignment();
unsigned Align = MinAlign(SPOffset, StackAlign);
Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable,
- /*isSS*/false, false));
+ /*isSS*/ false,
+ /*NeedSP*/ false,
+ /*Alloca*/ 0));
return -++NumFixedObjects;
}
@@ -525,9 +534,11 @@
}
}
+#ifndef NDEBUG
void MachineFrameInfo::dump(const MachineFunction &MF) const {
print(MF, dbgs());
}
+#endif
//===----------------------------------------------------------------------===//
// MachineJumpTableInfo implementation
@@ -622,7 +633,9 @@
OS << '\n';
}
+#ifndef NDEBUG
void MachineJumpTableInfo::dump() const { print(dbgs()); }
+#endif
//===----------------------------------------------------------------------===//
@@ -749,10 +762,12 @@
if (Constants[i].isMachineConstantPoolEntry())
Constants[i].Val.MachineCPVal->print(OS);
else
- OS << *(Value*)Constants[i].Val.ConstVal;
+ OS << *(const Value*)Constants[i].Val.ConstVal;
OS << ", align=" << Constants[i].getAlignment();
OS << "\n";
}
}
+#ifndef NDEBUG
void MachineConstantPool::dump() const { print(dbgs()); }
+#endif
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index b166849..0508b9f 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -111,6 +111,7 @@
/// the specified value. If an operand is known to be an immediate already,
/// the setImm method should be used.
void MachineOperand::ChangeToImmediate(int64_t ImmVal) {
+ assert((!isReg() || !isTied()) && "Cannot change a tied operand into an imm");
// If this operand is currently a register operand, and if this is in a
// function, deregister the operand from the register's use/def list.
if (isReg() && isOnRegUseList())
@@ -136,7 +137,8 @@
RegInfo = &MF->getRegInfo();
// If this operand is already a register operand, remove it from the
// register's use/def lists.
- if (RegInfo && isReg())
+ bool WasReg = isReg();
+ if (RegInfo && WasReg)
RegInfo->removeRegOperandFromUseList(this);
// Change this to a register and set the reg#.
@@ -153,6 +155,9 @@
IsDebug = isDebug;
// Ensure isOnRegUseList() returns false.
Contents.Reg.Prev = 0;
+ // Preserve the tie when the operand was already a register.
+ if (!WasReg)
+ TiedTo = 0;
// If this operand is embedded in a function, add the operand to the
// register's use/def list.
@@ -208,8 +213,8 @@
hash_code llvm::hash_value(const MachineOperand &MO) {
switch (MO.getType()) {
case MachineOperand::MO_Register:
- return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getReg(),
- MO.getSubReg(), MO.isDef());
+ // Register operands don't have target flags.
+ return hash_combine(MO.getType(), MO.getReg(), MO.getSubReg(), MO.isDef());
case MachineOperand::MO_Immediate:
return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getImm());
case MachineOperand::MO_CImmediate:
@@ -262,7 +267,7 @@
OS << PrintReg(getReg(), TRI, getSubReg());
if (isDef() || isKill() || isDead() || isImplicit() || isUndef() ||
- isInternalRead() || isEarlyClobber()) {
+ isInternalRead() || isEarlyClobber() || isTied()) {
OS << '<';
bool NeedComma = false;
if (isDef()) {
@@ -282,27 +287,32 @@
NeedComma = true;
}
- if (isKill() || isDead() || (isUndef() && isUse()) || isInternalRead()) {
+ if (isKill()) {
if (NeedComma) OS << ',';
- NeedComma = false;
- if (isKill()) {
- OS << "kill";
- NeedComma = true;
- }
- if (isDead()) {
- OS << "dead";
- NeedComma = true;
- }
- if (isUndef() && isUse()) {
- if (NeedComma) OS << ',';
- OS << "undef";
- NeedComma = true;
- }
- if (isInternalRead()) {
- if (NeedComma) OS << ',';
- OS << "internal";
- NeedComma = true;
- }
+ OS << "kill";
+ NeedComma = true;
+ }
+ if (isDead()) {
+ if (NeedComma) OS << ',';
+ OS << "dead";
+ NeedComma = true;
+ }
+ if (isUndef() && isUse()) {
+ if (NeedComma) OS << ',';
+ OS << "undef";
+ NeedComma = true;
+ }
+ if (isInternalRead()) {
+ if (NeedComma) OS << ',';
+ OS << "internal";
+ NeedComma = true;
+ }
+ if (isTied()) {
+ if (NeedComma) OS << ',';
+ OS << "tied";
+ if (TiedTo != 15)
+ OS << unsigned(TiedTo - 1);
+ NeedComma = true;
}
OS << '>';
}
@@ -673,6 +683,7 @@
if (!isImpReg && !isInlineAsm()) {
while (OpNo && Operands[OpNo-1].isReg() && Operands[OpNo-1].isImplicit()) {
--OpNo;
+ assert(!Operands[OpNo].isTied() && "Cannot move tied operands");
if (RegInfo)
RegInfo->removeRegOperandFromUseList(&Operands[OpNo]);
}
@@ -708,12 +719,25 @@
if (Operands[OpNo].isReg()) {
// Ensure isOnRegUseList() returns false, regardless of Op's status.
Operands[OpNo].Contents.Reg.Prev = 0;
+ // Ignore existing ties. This is not a property that can be copied.
+ Operands[OpNo].TiedTo = 0;
// Add the new operand to RegInfo.
if (RegInfo)
RegInfo->addRegOperandToUseList(&Operands[OpNo]);
- // If the register operand is flagged as early, mark the operand as such.
- if (MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1)
- Operands[OpNo].setIsEarlyClobber(true);
+ // The MCID operand information isn't accurate until we start adding
+ // explicit operands. The implicit operands are added first, then the
+ // explicits are inserted before them.
+ if (!isImpReg) {
+ // Tie uses to defs as indicated in MCInstrDesc.
+ if (Operands[OpNo].isUse()) {
+ int DefIdx = MCID->getOperandConstraint(OpNo, MCOI::TIED_TO);
+ if (DefIdx != -1)
+ tieOperands(DefIdx, OpNo);
+ }
+ // If the register operand is flagged as early, mark the operand as such.
+ if (MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1)
+ Operands[OpNo].setIsEarlyClobber(true);
+ }
}
// Re-add all the implicit ops.
@@ -730,6 +754,7 @@
///
void MachineInstr::RemoveOperand(unsigned OpNo) {
assert(OpNo < Operands.size() && "Invalid operand number");
+ untieRegOperand(OpNo);
MachineRegisterInfo *RegInfo = getRegInfo();
// Special case removing the last one.
@@ -752,6 +777,13 @@
}
}
+#ifndef NDEBUG
+ // Moving tied operands would break the ties.
+ for (unsigned i = OpNo + 1, e = Operands.size(); i != e; ++i)
+ if (Operands[i].isReg())
+ assert(!Operands[i].isTied() && "Cannot move tied operands");
+#endif
+
Operands.erase(Operands.begin()+OpNo);
if (RegInfo) {
@@ -935,6 +967,12 @@
return false;
}
+InlineAsm::AsmDialect MachineInstr::getInlineAsmDialect() const {
+ assert(isInlineAsm() && "getInlineAsmDialect() only works for inline asms!");
+ unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ return InlineAsm::AsmDialect((ExtraInfo & InlineAsm::Extra_AsmDialect) != 0);
+}
+
int MachineInstr::findInlineAsmFlagIdx(unsigned OpIdx,
unsigned *GroupNo) const {
assert(isInlineAsm() && "Expected an inline asm instruction");
@@ -1114,107 +1152,99 @@
return -1;
}
-/// isRegTiedToUseOperand - Given the index of a register def operand,
-/// check if the register def is tied to a source operand, due to either
-/// two-address elimination or inline assembly constraints. Returns the
-/// first tied use operand index by reference is UseOpIdx is not null.
-bool MachineInstr::
-isRegTiedToUseOperand(unsigned DefOpIdx, unsigned *UseOpIdx) const {
- if (isInlineAsm()) {
- assert(DefOpIdx > InlineAsm::MIOp_FirstOperand);
- const MachineOperand &MO = getOperand(DefOpIdx);
- if (!MO.isReg() || !MO.isDef() || MO.getReg() == 0)
- return false;
- // Determine the actual operand index that corresponds to this index.
- unsigned DefNo = 0;
- int FlagIdx = findInlineAsmFlagIdx(DefOpIdx, &DefNo);
- if (FlagIdx < 0)
- return false;
+// MachineOperand::TiedTo is 4 bits wide.
+const unsigned TiedMax = 15;
- // Which part of the group is DefOpIdx?
- unsigned DefPart = DefOpIdx - (FlagIdx + 1);
+/// tieOperands - Mark operands at DefIdx and UseIdx as tied to each other.
+///
+/// Use and def operands can be tied together, indicated by a non-zero TiedTo
+/// field. TiedTo can have these values:
+///
+/// 0: Operand is not tied to anything.
+/// 1 to TiedMax-1: Tied to getOperand(TiedTo-1).
+/// TiedMax: Tied to an operand >= TiedMax-1.
+///
+/// The tied def must be one of the first TiedMax operands on a normal
+/// instruction. INLINEASM instructions allow more tied defs.
+///
+void MachineInstr::tieOperands(unsigned DefIdx, unsigned UseIdx) {
+ MachineOperand &DefMO = getOperand(DefIdx);
+ MachineOperand &UseMO = getOperand(UseIdx);
+ assert(DefMO.isDef() && "DefIdx must be a def operand");
+ assert(UseMO.isUse() && "UseIdx must be a use operand");
+ assert(!DefMO.isTied() && "Def is already tied to another use");
+ assert(!UseMO.isTied() && "Use is already tied to another def");
- for (unsigned i = InlineAsm::MIOp_FirstOperand, e = getNumOperands();
- i != e; ++i) {
- const MachineOperand &FMO = getOperand(i);
- if (!FMO.isImm())
- continue;
- if (i+1 >= e || !getOperand(i+1).isReg() || !getOperand(i+1).isUse())
- continue;
- unsigned Idx;
- if (InlineAsm::isUseOperandTiedToDef(FMO.getImm(), Idx) &&
- Idx == DefNo) {
- if (UseOpIdx)
- *UseOpIdx = (unsigned)i + 1 + DefPart;
- return true;
- }
- }
- return false;
+ if (DefIdx < TiedMax)
+ UseMO.TiedTo = DefIdx + 1;
+ else {
+ // Inline asm can use the group descriptors to find tied operands, but on
+ // a normal instruction, the tied def must be within the first TiedMax
+ // operands.
+ assert(isInlineAsm() && "DefIdx out of range");
+ UseMO.TiedTo = TiedMax;
}
- assert(getOperand(DefOpIdx).isDef() && "DefOpIdx is not a def!");
- const MCInstrDesc &MCID = getDesc();
- for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = getOperand(i);
- if (MO.isReg() && MO.isUse() &&
- MCID.getOperandConstraint(i, MCOI::TIED_TO) == (int)DefOpIdx) {
- if (UseOpIdx)
- *UseOpIdx = (unsigned)i;
- return true;
- }
- }
- return false;
+ // UseIdx can be out of range, we'll search for it in findTiedOperandIdx().
+ DefMO.TiedTo = std::min(UseIdx + 1, TiedMax);
}
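The 4-bit encoding trades field width for an occasional search: index 0 is reserved for "not tied", small partner indices are stored inline as index+1, and anything larger saturates to TiedMax. A standalone model of the scheme (the real field lives in MachineOperand):

#include <algorithm>
#include <cassert>

static const unsigned TiedMax = 15;      // 4-bit field, as in the patch

// Encode "tied to operand Idx"; 0 stays reserved for "not tied".
static unsigned encodeTiedTo(unsigned Idx) {
  return std::min(Idx + 1, TiedMax);
}

// Decode. Returns true when the index was stored exactly, false when the
// encoding saturated and the partner must be found by searching, as
// findTiedOperandIdx() does below.
static bool decodeTiedTo(unsigned TiedTo, unsigned &Idx) {
  assert(TiedTo != 0 && "operand is not tied");
  if (TiedTo == TiedMax)
    return false;
  Idx = TiedTo - 1;
  return true;
}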
-/// isRegTiedToDefOperand - Return true if the operand of the specified index
-/// is a register use and it is tied to an def operand. It also returns the def
-/// operand index by reference.
-bool MachineInstr::
-isRegTiedToDefOperand(unsigned UseOpIdx, unsigned *DefOpIdx) const {
- if (isInlineAsm()) {
- const MachineOperand &MO = getOperand(UseOpIdx);
- if (!MO.isReg() || !MO.isUse() || MO.getReg() == 0)
- return false;
+/// Given the index of a tied register operand, find the operand it is tied to.
+/// Defs are tied to uses and vice versa. Returns the index of the tied operand
+/// which must exist.
+unsigned MachineInstr::findTiedOperandIdx(unsigned OpIdx) const {
+ const MachineOperand &MO = getOperand(OpIdx);
+ assert(MO.isTied() && "Operand isn't tied");
- // Find the flag operand corresponding to UseOpIdx
- int FlagIdx = findInlineAsmFlagIdx(UseOpIdx);
- if (FlagIdx < 0)
- return false;
+ // Normally TiedTo is in range.
+ if (MO.TiedTo < TiedMax)
+ return MO.TiedTo - 1;
- const MachineOperand &UFMO = getOperand(FlagIdx);
- unsigned DefNo;
- if (InlineAsm::isUseOperandTiedToDef(UFMO.getImm(), DefNo)) {
- if (!DefOpIdx)
- return true;
-
- unsigned DefIdx = InlineAsm::MIOp_FirstOperand;
- // Remember to adjust the index. First operand is asm string, second is
- // the HasSideEffects and AlignStack bits, then there is a flag for each.
- while (DefNo) {
- const MachineOperand &FMO = getOperand(DefIdx);
- assert(FMO.isImm());
- // Skip over this def.
- DefIdx += InlineAsm::getNumOperandRegisters(FMO.getImm()) + 1;
- --DefNo;
- }
- *DefOpIdx = DefIdx + UseOpIdx - FlagIdx;
- return true;
+ // Uses on normal instructions can be out of range.
+ if (!isInlineAsm()) {
+ // Normal tied defs must be in the 0..TiedMax-1 range.
+ if (MO.isUse())
+ return TiedMax - 1;
+ // MO is a def. Search for the tied use.
+ for (unsigned i = TiedMax - 1, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &UseMO = getOperand(i);
+ if (UseMO.isReg() && UseMO.isUse() && UseMO.TiedTo == OpIdx + 1)
+ return i;
}
- return false;
+ llvm_unreachable("Can't find tied use");
}
- const MCInstrDesc &MCID = getDesc();
- if (UseOpIdx >= MCID.getNumOperands())
- return false;
- const MachineOperand &MO = getOperand(UseOpIdx);
- if (!MO.isReg() || !MO.isUse())
- return false;
- int DefIdx = MCID.getOperandConstraint(UseOpIdx, MCOI::TIED_TO);
- if (DefIdx == -1)
- return false;
- if (DefOpIdx)
- *DefOpIdx = (unsigned)DefIdx;
- return true;
+ // Now deal with inline asm by parsing the operand group descriptor flags.
+ // Find the beginning of each operand group.
+ SmallVector<unsigned, 8> GroupIdx;
+ unsigned OpIdxGroup = ~0u;
+ unsigned NumOps;
+ for (unsigned i = InlineAsm::MIOp_FirstOperand, e = getNumOperands(); i < e;
+ i += NumOps) {
+ const MachineOperand &FlagMO = getOperand(i);
+ assert(FlagMO.isImm() && "Invalid tied operand on inline asm");
+ unsigned CurGroup = GroupIdx.size();
+ GroupIdx.push_back(i);
+ NumOps = 1 + InlineAsm::getNumOperandRegisters(FlagMO.getImm());
+ // OpIdx belongs to this operand group.
+ if (OpIdx > i && OpIdx < i + NumOps)
+ OpIdxGroup = CurGroup;
+ unsigned TiedGroup;
+ if (!InlineAsm::isUseOperandTiedToDef(FlagMO.getImm(), TiedGroup))
+ continue;
+ // Operands in this group are tied to operands in TiedGroup which must be
+ // earlier. Find the number of operands between the two groups.
+ unsigned Delta = i - GroupIdx[TiedGroup];
+
+ // OpIdx is a use tied to TiedGroup.
+ if (OpIdxGroup == CurGroup)
+ return OpIdx - Delta;
+
+ // OpIdx is a def tied to this use group.
+ if (OpIdxGroup == TiedGroup)
+ return OpIdx + Delta;
+ }
+ llvm_unreachable("Invalid tied operand on inline asm");
}
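For inline asm, the search exploits the fact that a tied use group mirrors the shape of its def group: once GroupIdx records where every flag word sits, the distance Delta between the two flag words translates an operand index from one group into the other. A simplified walk over the same layout, using a toy operand model rather than the MachineOperand API:

#include <vector>

// Toy INLINEASM group: one flag word followed by NumRegs register slots.
// TiedToGroup is -1 when the group is not tied.
struct Group { unsigned NumRegs; int TiedToGroup; };

// Map a use operand's index to its tied def operand's index, or -1.
static int tiedDefIdx(const std::vector<Group> &Groups, unsigned FirstOperand,
                      unsigned OpIdx) {
  std::vector<unsigned> Starts;
  unsigned i = FirstOperand;
  for (unsigned g = 0; g != Groups.size(); ++g) {
    Starts.push_back(i);
    i += 1 + Groups[g].NumRegs;          // flag word + its registers
  }
  for (unsigned g = 0; g != Groups.size(); ++g) {
    if (Groups[g].TiedToGroup < 0)
      continue;
    // Tied groups have identical shapes, so OpIdx's offset inside the use
    // group is preserved when stepping back Delta operands.
    unsigned Delta = Starts[g] - Starts[Groups[g].TiedToGroup];
    if (OpIdx > Starts[g] && OpIdx < Starts[g] + 1 + Groups[g].NumRegs)
      return OpIdx - Delta;
  }
  return -1;
}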
/// clearKillInfo - Clears kill flags on all operands.
@@ -1292,7 +1322,12 @@
AliasAnalysis *AA,
bool &SawStore) const {
// Ignore stuff that we obviously can't move.
- if (mayStore() || isCall()) {
+ //
+ // Treat volatile loads as stores. This is not strictly necessary for
+ // volatiles, but it is required for atomic loads. It is not allowed to move
+ // a load across an atomic load with Ordering > Monotonic.
+ if (mayStore() || isCall() ||
+ (mayLoad() && hasOrderedMemoryRef())) {
SawStore = true;
return false;
}
@@ -1308,8 +1343,8 @@
// load.
if (mayLoad() && !isInvariantLoad(AA))
// Otherwise, this is a real load. If there is a store between the load and
- // end of block, or if the load is volatile, we can't move it.
- return !SawStore && !hasVolatileMemoryRef();
+ // end of block, we can't move it.
+ return !SawStore;
return true;
}
@@ -1340,11 +1375,11 @@
return true;
}
-/// hasVolatileMemoryRef - Return true if this instruction may have a
-/// volatile memory reference, or if the information describing the
-/// memory reference is not available. Return false if it is known to
-/// have no volatile memory references.
-bool MachineInstr::hasVolatileMemoryRef() const {
+/// hasOrderedMemoryRef - Return true if this instruction may have an ordered
+/// or volatile memory reference, or if the information describing the memory
+/// reference is not available. Return false if it is known to have no ordered
+/// memory references.
+bool MachineInstr::hasOrderedMemoryRef() const {
// An instruction known never to access memory won't have a volatile access.
if (!mayStore() &&
!mayLoad() &&
@@ -1357,9 +1392,9 @@
if (memoperands_empty())
return true;
- // Check the memory reference information for volatile references.
+ // Check the memory reference information for ordered references.
for (mmo_iterator I = memoperands_begin(), E = memoperands_end(); I != E; ++I)
- if ((*I)->isVolatile())
+ if (!(*I)->isUnordered())
return true;
return false;
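The predicate now asks "is anything not unordered?" rather than "is anything volatile?", which additionally catches atomic references; that is what lets isSafeToMove() refuse to hoist a load across an acquire load. A toy version of the classification, assuming a simplified ordering enum:

enum Ordering { NotAtomic, Unordered, Monotonic, Acquire, Release, SeqCst };

struct MemRef {
  bool Volatile;
  Ordering Ord;
};

// Sketch of the complement of MachineMemOperand::isUnordered(): volatile or
// any atomic ordering counts as ordered (conservative for Monotonic, which
// the comment earlier singles out as the boundary).
static bool isOrderedRef(const MemRef &M) {
  return M.Volatile || M.Ord > Unordered;
}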
@@ -1461,7 +1496,9 @@
}
void MachineInstr::dump() const {
+#ifndef NDEBUG
dbgs() << " " << *this;
+#endif
}
static void printDebugLoc(DebugLoc DL, const MachineFunction *MF,
@@ -1540,6 +1577,10 @@
OS << " [sideeffect]";
if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
OS << " [alignstack]";
+ if (getInlineAsmDialect() == InlineAsm::AD_ATT)
+ OS << " [attdialect]";
+ if (getInlineAsmDialect() == InlineAsm::AD_Intel)
+ OS << " [inteldialect]";
StartOp = AsmDescOp = InlineAsm::MIOp_FirstOperand;
FirstOp = false;
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index efec481..169443e 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -334,7 +334,7 @@
DEBUG(dbgs() << "******** Pre-regalloc Machine LICM: ");
else
DEBUG(dbgs() << "******** Post-regalloc Machine LICM: ");
- DEBUG(dbgs() << MF.getFunction()->getName() << " ********\n");
+ DEBUG(dbgs() << MF.getName() << " ********\n");
if (PreRegAlloc) {
// Estimate register pressure during pre-regalloc pass.
diff --git a/lib/CodeGen/MachineLoopInfo.cpp b/lib/CodeGen/MachineLoopInfo.cpp
index 9f3829e..05d2f2a 100644
--- a/lib/CodeGen/MachineLoopInfo.cpp
+++ b/lib/CodeGen/MachineLoopInfo.cpp
@@ -74,6 +74,8 @@
return BotMBB;
}
+#ifndef NDEBUG
void MachineLoop::dump() const {
print(dbgs());
}
+#endif
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index a1dc948..4704dae 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -252,7 +252,7 @@
continue;
}
DEBUG(dbgs() << "********** MI Scheduling **********\n");
- DEBUG(dbgs() << MF->getFunction()->getName()
+ DEBUG(dbgs() << MF->getName()
<< ":BB#" << MBB->getNumber() << "\n From: " << *I << " To: ";
if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
else dbgs() << "End";
@@ -764,12 +764,14 @@
Queue.pop_back();
}
+#ifndef NDEBUG
void dump() {
dbgs() << Name << ": ";
for (unsigned i = 0, e = Queue.size(); i < e; ++i)
dbgs() << Queue[i]->NodeNum << " ";
dbgs() << "\n";
}
+#endif
};
/// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance
@@ -905,13 +907,12 @@
for (SUnit::succ_iterator I = SU->Preds.begin(), E = SU->Preds.end();
I != E; ++I) {
unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
- unsigned Latency =
- DAG->computeOperandLatency(I->getSUnit(), SU, *I, /*FindMin=*/true);
+ unsigned MinLatency = I->getMinLatency();
#ifndef NDEBUG
- Top.MaxMinLatency = std::max(Latency, Top.MaxMinLatency);
+ Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency);
#endif
- if (SU->TopReadyCycle < PredReadyCycle + Latency)
- SU->TopReadyCycle = PredReadyCycle + Latency;
+ if (SU->TopReadyCycle < PredReadyCycle + MinLatency)
+ SU->TopReadyCycle = PredReadyCycle + MinLatency;
}
Top.releaseNode(SU, SU->TopReadyCycle);
}
@@ -925,13 +926,12 @@
for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
I != E; ++I) {
unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
- unsigned Latency =
- DAG->computeOperandLatency(SU, I->getSUnit(), *I, /*FindMin=*/true);
+ unsigned MinLatency = I->getMinLatency();
#ifndef NDEBUG
- Bot.MaxMinLatency = std::max(Latency, Bot.MaxMinLatency);
+ Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency);
#endif
- if (SU->BotReadyCycle < SuccReadyCycle + Latency)
- SU->BotReadyCycle = SuccReadyCycle + Latency;
+ if (SU->BotReadyCycle < SuccReadyCycle + MinLatency)
+ SU->BotReadyCycle = SuccReadyCycle + MinLatency;
}
Bot.releaseNode(SU, SU->BotReadyCycle);
}
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index 852c169..181e09e 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -23,8 +23,9 @@
// the verifier errors.
//===----------------------------------------------------------------------===//
+#include "llvm/BasicBlock.h"
+#include "llvm/InlineAsm.h"
#include "llvm/Instructions.h"
-#include "llvm/Function.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/LiveStackAnalysis.h"
@@ -213,6 +214,8 @@
void report(const char *msg, const MachineBasicBlock *MBB,
const LiveInterval &LI);
+ void verifyInlineAsm(const MachineInstr *MI);
+
void checkLiveness(const MachineOperand *MO, unsigned MONum);
void markReachable(const MachineBasicBlock *MBB);
void calcRegsPassed();
@@ -357,7 +360,7 @@
MF->print(*OS, Indexes);
}
*OS << "*** Bad machine code: " << msg << " ***\n"
- << "- function: " << MF->getFunction()->getName() << "\n";
+ << "- function: " << MF->getName() << "\n";
}
void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB) {
@@ -365,7 +368,7 @@
report(msg, MBB->getParent());
*OS << "- basic block: BB#" << MBB->getNumber()
<< ' ' << MBB->getName()
- << " (" << (void*)MBB << ')';
+ << " (" << (const void*)MBB << ')';
if (Indexes)
*OS << " [" << Indexes->getMBBStartIdx(MBB)
<< ';' << Indexes->getMBBEndIdx(MBB) << ')';
@@ -695,6 +698,49 @@
}
}
+// The operands on an INLINEASM instruction must follow a template.
+// Verify that the flag operands make sense.
+void MachineVerifier::verifyInlineAsm(const MachineInstr *MI) {
+ // The first two operands on INLINEASM are the asm string and global flags.
+ if (MI->getNumOperands() < 2) {
+ report("Too few operands on inline asm", MI);
+ return;
+ }
+ if (!MI->getOperand(0).isSymbol())
+ report("Asm string must be an external symbol", MI);
+ if (!MI->getOperand(1).isImm())
+ report("Asm flags must be an immediate", MI);
+ // Allowed flags are Extra_HasSideEffects = 1, and Extra_IsAlignStack = 2.
+ if (!isUInt<2>(MI->getOperand(1).getImm()))
+ report("Unknown asm flags", &MI->getOperand(1), 1);
+
+ assert(InlineAsm::MIOp_FirstOperand == 2 && "Asm format changed");
+
+ unsigned OpNo = InlineAsm::MIOp_FirstOperand;
+ unsigned NumOps;
+ for (unsigned e = MI->getNumOperands(); OpNo < e; OpNo += NumOps) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ // There may be implicit ops after the fixed operands.
+ if (!MO.isImm())
+ break;
+ NumOps = 1 + InlineAsm::getNumOperandRegisters(MO.getImm());
+ }
+
+ if (OpNo > MI->getNumOperands())
+ report("Missing operands in last group", MI);
+
+ // An optional MDNode follows the groups.
+ if (OpNo < MI->getNumOperands() && MI->getOperand(OpNo).isMetadata())
+ ++OpNo;
+
+ // All trailing operands must be implicit registers.
+ for (unsigned e = MI->getNumOperands(); OpNo < e; ++OpNo) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ if (!MO.isReg() || !MO.isImplicit())
+ report("Expected implicit register after groups", &MO, OpNo);
+ }
+}
+
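Spelled out, the operand template enforced above is: the asm string, the extra-info immediate, a run of (flag word, registers...) groups, at most one metadata operand, then implicit registers only. A toy checker over a simplified operand model (illustrative, not the MachineOperand API):

#include <vector>

enum Kind { Symbol, Imm, Reg, Metadata };
struct TOp { Kind K; unsigned NumRegs; bool Implicit; }; // toy operand

static bool wellFormedInlineAsm(const std::vector<TOp> &Ops) {
  if (Ops.size() < 2 || Ops[0].K != Symbol || Ops[1].K != Imm)
    return false;                        // asm string + extra-info flags
  size_t i = 2;
  while (i < Ops.size() && Ops[i].K == Imm)
    i += 1 + Ops[i].NumRegs;             // each group: flag + its registers
  if (i > Ops.size())
    return false;                        // last group ran off the end
  if (i < Ops.size() && Ops[i].K == Metadata)
    ++i;                                 // optional MDNode
  for (; i < Ops.size(); ++i)            // trailing implicit registers only
    if (Ops[i].K != Reg || !Ops[i].Implicit)
      return false;
  return true;
}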
void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
const MCInstrDesc &MCID = MI->getDesc();
if (MI->getNumOperands() < MCID.getNumOperands()) {
@@ -703,6 +749,10 @@
<< MI->getNumExplicitOperands() << " given.\n";
}
+ // Check the tied operands.
+ if (MI->isInlineAsm())
+ verifyInlineAsm(MI);
+
// Check the MachineMemOperands for basic consistency.
for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
E = MI->memoperands_end(); I != E; ++I) {
@@ -758,6 +808,17 @@
if (MO->isImplicit())
report("Explicit operand marked as implicit", MO, MONum);
}
+
+ int TiedTo = MCID.getOperandConstraint(MONum, MCOI::TIED_TO);
+ if (TiedTo != -1) {
+ if (!MO->isReg())
+ report("Tied use must be a register", MO, MONum);
+ else if (!MO->isTied())
+ report("Operand should be tied", MO, MONum);
+ else if (unsigned(TiedTo) != MI->findTiedOperandIdx(MONum))
+ report("Tied def doesn't match MCInstrDesc", MO, MONum);
+ } else if (MO->isReg() && MO->isTied())
+ report("Explicit operand should not be tied", MO, MONum);
} else {
// ARM adds %reg0 operands to indicate predicates. We'll allow that.
if (MO->isReg() && !MO->isImplicit() && !MI->isVariadic() && MO->getReg())
@@ -772,6 +833,28 @@
if (MRI->tracksLiveness() && !MI->isDebugValue())
checkLiveness(MO, MONum);
+ // Verify the consistency of tied operands.
+ if (MO->isTied()) {
+ unsigned OtherIdx = MI->findTiedOperandIdx(MONum);
+ const MachineOperand &OtherMO = MI->getOperand(OtherIdx);
+ if (!OtherMO.isReg())
+ report("Must be tied to a register", MO, MONum);
+ if (!OtherMO.isTied())
+ report("Missing tie flags on tied operand", MO, MONum);
+ if (MI->findTiedOperandIdx(OtherIdx) != MONum)
+ report("Inconsistent tie links", MO, MONum);
+ if (MONum < MCID.getNumDefs()) {
+ if (OtherIdx < MCID.getNumOperands()) {
+ if (-1 == MCID.getOperandConstraint(OtherIdx, MCOI::TIED_TO))
+ report("Explicit def tied to explicit use without tie constraint",
+ MO, MONum);
+ } else {
+ if (!OtherMO.isImplicit())
+ report("Explicit def should be tied to implicit use", MO, MONum);
+ }
+ }
+ }
+
// Verify two-address constraints after leaving SSA form.
unsigned DefIdx;
if (!MRI->isSSA() && MO->isUse() &&
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index 56526f2..a6dd5dec 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -447,8 +447,8 @@
const PassInfo *TPI = PR->getPassInfo(PrintMachineInstrs.getValue());
const PassInfo *IPI = PR->getPassInfo(StringRef("print-machineinstrs"));
assert (TPI && IPI && "Pass ID not registered!");
- const char *TID = (char *)(TPI->getTypeInfo());
- const char *IID = (char *)(IPI->getTypeInfo());
+ const char *TID = (const char *)(TPI->getTypeInfo());
+ const char *IID = (const char *)(IPI->getTypeInfo());
insertPass(TID, IID);
}
@@ -529,6 +529,10 @@
// instructions dead.
addPass(&OptimizePHIsID);
+ // This pass merges large allocas. StackSlotColoring is a different pass
+ // which merges spill slots.
+ addPass(&StackColoringID);
+
// If the target requests it, assign local variables to stack slots relative
// to one another and simplify frame index references where possible.
addPass(&LocalStackSlotAllocationID);
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 7449ff5..6090752 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -240,6 +240,7 @@
ScheduleDAGInstrs::exitRegion();
}
+#ifndef NDEBUG
/// dumpSchedule - dump the scheduled Sequence.
void SchedulePostRATDList::dumpSchedule() const {
for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
@@ -249,6 +250,7 @@
dbgs() << "**** NOOP ****\n";
}
}
+#endif
bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
TII = Fn.getTarget().getInstrInfo();
@@ -298,7 +300,7 @@
static int bbcnt = 0;
if (bbcnt++ % DebugDiv != DebugMod)
continue;
- dbgs() << "*** DEBUG scheduling " << Fn.getFunction()->getName()
+ dbgs() << "*** DEBUG scheduling " << Fn.getName()
<< ":BB#" << MBB->getNumber() << " ***\n";
}
#endif
diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp
index 34d075c..e4e18c3 100644
--- a/lib/CodeGen/ProcessImplicitDefs.cpp
+++ b/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -137,8 +137,7 @@
bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** PROCESS IMPLICIT DEFS **********\n"
- << "********** Function: "
- << ((Value*)MF.getFunction())->getName() << '\n');
+ << "********** Function: " << MF.getName() << '\n');
bool Changed = false;
diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp
index 3a03807..8a49609 100644
--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp
@@ -20,7 +20,6 @@
#include "VirtRegMap.h"
#include "LiveRegMatrix.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Function.h"
#include "llvm/PassAnalysisSupport.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
@@ -273,7 +272,7 @@
bool RABasic::runOnMachineFunction(MachineFunction &mf) {
DEBUG(dbgs() << "********** BASIC REGISTER ALLOCATION **********\n"
<< "********** Function: "
- << ((Value*)mf.getFunction())->getName() << '\n');
+ << mf.getName() << '\n');
MF = &mf;
RegAllocBase::init(getAnalysis<VirtRegMap>(),
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 6b3a48e..f573d41 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -1110,8 +1110,7 @@
///
bool RAFast::runOnMachineFunction(MachineFunction &Fn) {
DEBUG(dbgs() << "********** FAST REGISTER ALLOCATION **********\n"
- << "********** Function: "
- << ((Value*)Fn.getFunction())->getName() << '\n');
+ << "********** Function: " << Fn.getName() << '\n');
MF = &Fn;
MRI = &MF->getRegInfo();
TM = &Fn.getTarget();
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index d0cff48..c021a93 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -24,7 +24,6 @@
#include "VirtRegMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Function.h"
#include "llvm/PassAnalysisSupport.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/EdgeBundles.h"
@@ -1746,8 +1745,7 @@
bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n"
- << "********** Function: "
- << mf.getFunction()->getName() << '\n');
+ << "********** Function: " << mf.getName() << '\n');
MF = &mf;
if (VerifyEnabled)
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index d0db26b..fcdbce7 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -192,7 +192,6 @@
const MachineLoopInfo *loopInfo,
const RegSet &vregs) {
- typedef std::vector<const LiveInterval*> LIVector;
LiveIntervals *LIS = const_cast<LiveIntervals*>(lis);
MachineRegisterInfo *mri = &mf->getRegInfo();
const TargetRegisterInfo *tri = mf->getTarget().getRegisterInfo();
@@ -556,7 +555,7 @@
mri->freezeReservedRegs(MF);
- DEBUG(dbgs() << "PBQP Register Allocating for " << mf->getFunction()->getName() << "\n");
+ DEBUG(dbgs() << "PBQP Register Allocating for " << mf->getName() << "\n");
// Allocator main loop:
//
@@ -570,11 +569,12 @@
// Find the vreg intervals in need of allocation.
findVRegIntervalsToAlloc();
+#ifndef NDEBUG
const Function* func = mf->getFunction();
std::string fqn =
func->getParent()->getModuleIdentifier() + "." +
func->getName().str();
- (void)fqn;
+#endif
// If there are non-empty intervals allocate them using pbqp.
if (!vregsToAlloc.empty()) {
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index 9906334..d018835 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -1564,8 +1564,7 @@
Loops = &getAnalysis<MachineLoopInfo>();
DEBUG(dbgs() << "********** SIMPLE REGISTER COALESCING **********\n"
- << "********** Function: "
- << ((Value*)MF->getFunction())->getName() << '\n');
+ << "********** Function: " << MF->getName() << '\n');
if (VerifyCoalescing)
MF->verify(this, "Before register coalescing");
diff --git a/lib/CodeGen/RegisterCoalescer.h b/lib/CodeGen/RegisterCoalescer.h
index 8a6df98..47c3df1 100644
--- a/lib/CodeGen/RegisterCoalescer.h
+++ b/lib/CodeGen/RegisterCoalescer.h
@@ -63,6 +63,13 @@
: TRI(tri), DstReg(0), SrcReg(0), DstIdx(0), SrcIdx(0),
Partial(false), CrossClass(false), Flipped(false), NewRC(0) {}
+ /// Create a CoalescerPair representing a virtreg-to-physreg copy.
+ /// No need to call setRegisters().
+ CoalescerPair(unsigned VirtReg, unsigned PhysReg,
+ const TargetRegisterInfo &tri)
+ : TRI(tri), DstReg(PhysReg), SrcReg(VirtReg), DstIdx(0), SrcIdx(0),
+ Partial(false), CrossClass(false), Flipped(false), NewRC(0) {}
+
/// setRegisters - set registers to match the copy instruction MI. Return
/// false if MI is not a coalescable copy instruction.
bool setRegisters(const MachineInstr*);
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index 43448c8..6cdfe7cd 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -63,6 +63,7 @@
decreaseSetPressure(MaxSetPressure, RC, TRI);
}
+#ifndef NDEBUG
void RegisterPressure::dump(const TargetRegisterInfo *TRI) {
dbgs() << "Live In: ";
for (unsigned i = 0, e = LiveInRegs.size(); i < e; ++i)
@@ -78,6 +79,7 @@
<< '\n';
}
}
+#endif
/// Increase the current pressure as impacted by these physical registers and
/// bump the high water mark if needed.
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 752f8e4..af8cd8f 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -279,6 +279,7 @@
} while (!WorkList.empty());
}
+#ifndef NDEBUG
/// SUnit - Scheduling unit. It's an wrapper around either a single SDNode or
/// a group of nodes flagged together.
void SUnit::dump(const ScheduleDAG *G) const {
@@ -336,6 +337,7 @@
}
dbgs() << "\n";
}
+#endif
#ifndef NDEBUG
/// VerifyScheduledDAG - Verify that all SUnits were scheduled and that
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 9c1dba3..2d8f235 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -209,7 +209,7 @@
if (Reg == 0) continue;
if (TRI->isPhysicalRegister(Reg))
- Uses[Reg].push_back(&ExitSU);
+ Uses[Reg].push_back(PhysRegSUOper(&ExitSU, -1));
else {
assert(!IsPostRA && "Virtual register encountered after regalloc.");
addVRegUseDeps(&ExitSU, i);
@@ -225,15 +225,15 @@
E = (*SI)->livein_end(); I != E; ++I) {
unsigned Reg = *I;
if (!Uses.contains(Reg))
- Uses[Reg].push_back(&ExitSU);
+ Uses[Reg].push_back(PhysRegSUOper(&ExitSU, -1));
}
}
}
/// MO is an operand of SU's instruction that defines a physical register. Add
/// data dependencies from SU to any uses of the physical register.
-void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU,
- const MachineOperand &MO) {
+void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
+ const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx);
assert(MO.isDef() && "expect physreg def");
// Ask the target if address-backscheduling is desirable, and if so how much.
@@ -245,11 +245,13 @@
Alias.isValid(); ++Alias) {
if (!Uses.contains(*Alias))
continue;
- std::vector<SUnit*> &UseList = Uses[*Alias];
+ std::vector<PhysRegSUOper> &UseList = Uses[*Alias];
for (unsigned i = 0, e = UseList.size(); i != e; ++i) {
- SUnit *UseSU = UseList[i];
+ SUnit *UseSU = UseList[i].SU;
if (UseSU == SU)
continue;
+ MachineInstr *UseMI = UseSU->getInstr();
+ int UseOp = UseList[i].OpIdx;
unsigned LDataLatency = DataLatency;
// Optionally add in a special extra latency for nodes that
// feed addresses.
@@ -258,7 +260,6 @@
// adjustSchedDependency for the targets that care about it.
if (SpecialAddressLatency != 0 && !UnitLatencies &&
UseSU != &ExitSU) {
- MachineInstr *UseMI = UseSU->getInstr();
const MCInstrDesc &UseMCID = UseMI->getDesc();
int RegUseIndex = UseMI->findRegisterUseOperandIdx(*Alias);
assert(RegUseIndex >= 0 && "UseMI doesn't use register!");
@@ -273,8 +274,15 @@
// perform its own adjustments.
SDep dep(SU, SDep::Data, LDataLatency, *Alias);
if (!UnitLatencies) {
- unsigned Latency = computeOperandLatency(SU, UseSU, dep);
+ unsigned Latency =
+ TII->computeOperandLatency(InstrItins, SU->getInstr(), OperIdx,
+ (UseOp < 0 ? 0 : UseMI), UseOp);
dep.setLatency(Latency);
+ unsigned MinLatency =
+ TII->computeOperandLatency(InstrItins, SU->getInstr(), OperIdx,
+ (UseOp < 0 ? 0 : UseMI), UseOp,
+ /*FindMin=*/true);
+ dep.setMinLatency(MinLatency);
ST.adjustSchedDependency(SU, UseSU, dep);
}
@@ -301,9 +309,9 @@
Alias.isValid(); ++Alias) {
if (!Defs.contains(*Alias))
continue;
- std::vector<SUnit *> &DefList = Defs[*Alias];
+ std::vector<PhysRegSUOper> &DefList = Defs[*Alias];
for (unsigned i = 0, e = DefList.size(); i != e; ++i) {
- SUnit *DefSU = DefList[i];
+ SUnit *DefSU = DefList[i].SU;
if (DefSU == &ExitSU)
continue;
if (DefSU != SU &&
@@ -324,14 +332,14 @@
// Either insert a new Reg2SUnits entry with an empty SUnits list, or
// retrieve the existing SUnits list for this register's uses.
// Push this SUnit on the use list.
- Uses[MO.getReg()].push_back(SU);
+ Uses[MO.getReg()].push_back(PhysRegSUOper(SU, OperIdx));
}
else {
- addPhysRegDataDeps(SU, MO);
+ addPhysRegDataDeps(SU, OperIdx);
// Either insert a new Reg2SUnits entry with an empty SUnits list, or
// retrieve the existing SUnits list for this register's defs.
- std::vector<SUnit *> &DefList = Defs[MO.getReg()];
+ std::vector<PhysRegSUOper> &DefList = Defs[MO.getReg()];
// If a def is going to wrap back around to the top of the loop,
// backschedule it.
@@ -393,11 +401,11 @@
// the block. Instead, we leave only one call at the back of the
// DefList.
if (SU->isCall) {
- while (!DefList.empty() && DefList.back()->isCall)
+ while (!DefList.empty() && DefList.back().SU->isCall)
DefList.pop_back();
}
// Defs are pushed in the order they are visited and never reordered.
- DefList.push_back(SU);
+ DefList.push_back(PhysRegSUOper(SU, OperIdx));
}
}
@@ -468,8 +476,14 @@
if (!UnitLatencies) {
// Adjust the dependence latency using operand def/use information, then
// allow the target to perform its own adjustments.
- unsigned Latency = computeOperandLatency(DefSU, SU, const_cast<SDep &>(dep));
+ int DefOp = Def->findRegisterDefOperandIdx(Reg);
+ unsigned Latency =
+ TII->computeOperandLatency(InstrItins, Def, DefOp, MI, OperIdx);
dep.setLatency(Latency);
+ unsigned MinLatency =
+ TII->computeOperandLatency(InstrItins, Def, DefOp, MI, OperIdx,
+ /*FindMin=*/true);
+ dep.setMinLatency(MinLatency);
const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep));
@@ -488,7 +502,7 @@
/// (like a call or something with unmodeled side effects).
static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {
if (MI->isCall() || MI->hasUnmodeledSideEffects() ||
- (MI->hasVolatileMemoryRef() &&
+ (MI->hasOrderedMemoryRef() &&
(!MI->mayLoad() || !MI->isInvariantLoad(AA))))
return true;
return false;
@@ -997,19 +1011,10 @@
}
}
-unsigned ScheduleDAGInstrs::computeOperandLatency(SUnit *Def, SUnit *Use,
- const SDep& dep,
- bool FindMin) const {
- // For a data dependency with a known register...
- if ((dep.getKind() != SDep::Data) || (dep.getReg() == 0))
- return 1;
-
- return TII->computeOperandLatency(InstrItins, TRI, Def->getInstr(),
- Use->getInstr(), dep.getReg(), FindMin);
-}
-
void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
+#ifndef NDEBUG
SU->getInstr()->dump();
+#endif
}
std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
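
The Uses and Defs maps above now track (SUnit, operand index) pairs instead of bare SUnit pointers, with -1 marking uses that have no concrete operand (block live-outs into ExitSU), so latency can later be computed per operand. A rough standalone sketch of the record, with field names taken from the usage above (the real definition lives in the corresponding header):

  #include <cstdio>
  #include <vector>

  struct SUnit { unsigned NodeNum; };

  // Pairs a scheduling unit with the operand index that touches the
  // physical register; OpIdx == -1 means no concrete operand.
  struct PhysRegSUOper {
    SUnit *SU;
    int OpIdx;
    PhysRegSUOper(SUnit *su, int op) : SU(su), OpIdx(op) {}
  };

  int main() {
    SUnit SU1 = {1}, ExitSU = {0};
    std::vector<PhysRegSUOper> UseList;
    UseList.push_back(PhysRegSUOper(&SU1, 2));      // use via operand 2
    UseList.push_back(PhysRegSUOper(&ExitSU, -1));  // block live-out
    for (unsigned i = 0, e = UseList.size(); i != e; ++i)
      std::printf("SU%u, op %d\n", UseList[i].SU->NodeNum, UseList[i].OpIdx);
  }
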
diff --git a/lib/CodeGen/ScheduleDAGPrinter.cpp b/lib/CodeGen/ScheduleDAGPrinter.cpp
index 38feee9..6e781b1 100644
--- a/lib/CodeGen/ScheduleDAGPrinter.cpp
+++ b/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Constants.h"
-#include "llvm/Function.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -35,7 +34,7 @@
DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
static std::string getGraphName(const ScheduleDAG *G) {
- return G->MF.getFunction()->getName();
+ return G->MF.getName();
}
static bool renderGraphFromBottomUp() {
diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index e675366..5ca22b2 100644
--- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -89,6 +89,7 @@
ReservedScoreboard.reset();
}
+#ifndef NDEBUG
void ScoreboardHazardRecognizer::Scoreboard::dump() const {
dbgs() << "Scoreboard:\n";
@@ -104,6 +105,7 @@
dbgs() << '\n';
}
}
+#endif
bool ScoreboardHazardRecognizer::atIssueLimit() const {
if (IssueWidth == 0)
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1c485a0..d7fa009 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -413,7 +413,7 @@
!TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType()))
return 0;
- // fold (fsub (fadd A, B)) -> (fsub (fneg A), B)
+ // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
Options, Depth + 1))
return V;
@@ -2496,8 +2496,18 @@
// lanes of the constant together.
EVT VT = Vector->getValueType(0);
unsigned BitWidth = VT.getVectorElementType().getSizeInBits();
+
+ // If the splat value has been compressed to a bit width smaller
+ // than the size of the vector lane, we need to re-expand it to
+ // the lane size.
+ if (BitWidth > SplatBitSize)
+ for (SplatValue = SplatValue.zextOrTrunc(BitWidth);
+ SplatBitSize < BitWidth;
+ SplatBitSize = SplatBitSize * 2)
+ SplatValue |= SplatValue.shl(SplatBitSize);
+
Constant = APInt::getAllOnesValue(BitWidth);
- for (unsigned i = 0, n = VT.getVectorNumElements(); i < n; ++i)
+ for (unsigned i = 0, n = SplatBitSize/BitWidth; i < n; ++i)
Constant &= SplatValue.lshr(i*BitWidth).zextOrTrunc(BitWidth);
}
}
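
A standalone sketch of the shift-and-OR doubling used above to re-expand a splat constant stored at a narrower bit width than the vector lane (plain integers stand in for APInt; the values are invented):

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint32_t Splat = 0xAB;            // SplatValue with SplatBitSize == 8
    for (unsigned Bits = 8; Bits < 32; Bits *= 2)
      Splat |= Splat << Bits;         // 0xAB -> 0xABAB -> 0xABABABAB
    std::printf("0x%08x\n", Splat);   // lane-width (32-bit) splat
  }
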
@@ -5681,6 +5691,127 @@
DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
N0.getOperand(1), N1));
+ // In unsafe-math mode, we can fold chains of FADDs of the same value
+ // into multiplications. This transform is not safe in general because
+ // we are reducing the number of rounding steps.
+ if (DAG.getTarget().Options.UnsafeFPMath &&
+ TLI.isOperationLegalOrCustom(ISD::FMUL, VT) &&
+ !N0CFP && !N1CFP) {
+ if (N0.getOpcode() == ISD::FMUL) {
+ ConstantFPSDNode *CFP00 = dyn_cast<ConstantFPSDNode>(N0.getOperand(0));
+ ConstantFPSDNode *CFP01 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+
+ // (fadd (fmul c, x), x) -> (fmul c+1, x)
+ if (CFP00 && !CFP01 && N0.getOperand(1) == N1) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP00, 0),
+ DAG.getConstantFP(1.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N1, NewCFP);
+ }
+
+ // (fadd (fmul x, c), x) -> (fmul c+1, x)
+ if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP01, 0),
+ DAG.getConstantFP(1.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N1, NewCFP);
+ }
+
+ // (fadd (fadd x, x), x) -> (fmul 3.0, x)
+ if (!CFP00 && !CFP01 && N0.getOperand(0) == N0.getOperand(1) &&
+ N0.getOperand(0) == N1) {
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N1, DAG.getConstantFP(3.0, VT));
+ }
+
+ // (fadd (fmul c, x), (fadd x, x)) -> (fmul c+2, x)
+ if (CFP00 && !CFP01 && N1.getOpcode() == ISD::FADD &&
+ N1.getOperand(0) == N1.getOperand(1) &&
+ N0.getOperand(1) == N1.getOperand(0)) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP00, 0),
+ DAG.getConstantFP(2.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0.getOperand(1), NewCFP);
+ }
+
+ // (fadd (fmul x, c), (fadd x, x)) -> (fmul c+2, x)
+ if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
+ N1.getOperand(0) == N1.getOperand(1) &&
+ N0.getOperand(0) == N1.getOperand(0)) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP01, 0),
+ DAG.getConstantFP(2.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0.getOperand(0), NewCFP);
+ }
+ }
+
+ if (N1.getOpcode() == ISD::FMUL) {
+ ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0));
+ ConstantFPSDNode *CFP11 = dyn_cast<ConstantFPSDNode>(N1.getOperand(1));
+
+ // (fadd x, (fmul c, x)) -> (fmul c+1, x)
+ if (CFP10 && !CFP11 && N1.getOperand(1) == N0) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP10, 0),
+ DAG.getConstantFP(1.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0, NewCFP);
+ }
+
+ // (fadd x, (fmul x, c)) -> (fmul c+1, x)
+ if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP11, 0),
+ DAG.getConstantFP(1.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0, NewCFP);
+ }
+
+ // (fadd x, (fadd x, x)) -> (fmul 3.0, x)
+ if (!CFP10 && !CFP11 && N1.getOperand(0) == N1.getOperand(1) &&
+ N1.getOperand(0) == N0) {
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0, DAG.getConstantFP(3.0, VT));
+ }
+
+ // (fadd (fadd x, x), (fmul c, x)) -> (fmul c+2, x)
+ if (CFP10 && !CFP11 && N1.getOpcode() == ISD::FADD &&
+ N1.getOperand(0) == N1.getOperand(1) &&
+ N0.getOperand(1) == N1.getOperand(0)) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP10, 0),
+ DAG.getConstantFP(2.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0.getOperand(1), NewCFP);
+ }
+
+ // (fadd (fadd x, x), (fmul x, c)) -> (fmul c+2, x)
+ if (CFP11 && !CFP10 && N1.getOpcode() == ISD::FADD &&
+ N1.getOperand(0) == N1.getOperand(1) &&
+ N0.getOperand(0) == N1.getOperand(0)) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP11, 0),
+ DAG.getConstantFP(2.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0.getOperand(0), NewCFP);
+ }
+ }
+
+ // (fadd (fadd x, x), (fadd x, x)) -> (fmul 4.0, x)
+ if (N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
+ N0.getOperand(0) == N0.getOperand(1) &&
+ N1.getOperand(0) == N1.getOperand(1) &&
+ N0.getOperand(0) == N1.getOperand(0)) {
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0.getOperand(0),
+ DAG.getConstantFP(4.0, VT));
+ }
+ }
+
// FADD -> FMA combines:
if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
DAG.getTarget().Options.UnsafeFPMath) &&
@@ -5692,8 +5823,8 @@
return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT,
N0.getOperand(0), N0.getOperand(1), N1);
}
-
- // fold (fadd x, (fmul y, z)) -> (fma x, y, z)
+
+ // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
// Note: Commutes FADD operands.
if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) {
return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT,
@@ -5867,6 +5998,7 @@
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
EVT VT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
if (N0CFP && N0CFP->isExactlyValue(1.0))
return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N1, N2);
@@ -5877,6 +6009,58 @@
if (N0CFP && !N1CFP)
return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, N1, N0, N2);
+ // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
+ if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
+ N2.getOpcode() == ISD::FMUL &&
+ N0 == N2.getOperand(0) &&
+ N2.getOperand(1).getOpcode() == ISD::ConstantFP) {
+ return DAG.getNode(ISD::FMUL, dl, VT, N0,
+ DAG.getNode(ISD::FADD, dl, VT, N1, N2.getOperand(1)));
+ }
+
+
+ // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
+ if (DAG.getTarget().Options.UnsafeFPMath &&
+ N0.getOpcode() == ISD::FMUL && N1CFP &&
+ N0.getOperand(1).getOpcode() == ISD::ConstantFP) {
+ return DAG.getNode(ISD::FMA, dl, VT,
+ N0.getOperand(0),
+ DAG.getNode(ISD::FMUL, dl, VT, N1, N0.getOperand(1)),
+ N2);
+ }
+
+ // (fma x, 1, y) -> (fadd x, y)
+ // (fma x, -1, y) -> (fadd (fneg x), y)
+ if (N1CFP) {
+ if (N1CFP->isExactlyValue(1.0))
+ return DAG.getNode(ISD::FADD, dl, VT, N0, N2);
+
+ if (N1CFP->isExactlyValue(-1.0) &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
+ SDValue RHSNeg = DAG.getNode(ISD::FNEG, dl, VT, N0);
+ AddToWorkList(RHSNeg.getNode());
+ return DAG.getNode(ISD::FADD, dl, VT, N2, RHSNeg);
+ }
+ }
+
+ // (fma x, c, x) -> (fmul x, (c+1))
+ if (DAG.getTarget().Options.UnsafeFPMath && N1CFP && N0 == N2) {
+ return DAG.getNode(ISD::FMUL, dl, VT,
+ N0,
+ DAG.getNode(ISD::FADD, dl, VT,
+ N1, DAG.getConstantFP(1.0, VT)));
+ }
+
+ // (fma x, c, (fneg x)) -> (fmul x, (c-1))
+ if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
+ N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) {
+ return DAG.getNode(ISD::FMUL, dl, VT,
+ N0,
+ DAG.getNode(ISD::FADD, dl, VT,
+ N1, DAG.getConstantFP(-1.0, VT)));
+ }
+
+
return SDValue();
}
@@ -6225,6 +6409,30 @@
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ if (VT.isVector() && !LegalOperations) {
+ // If the operand is a BUILD_VECTOR node, see if we can constant-fold it.
+ if (N0.getOpcode() == ISD::BUILD_VECTOR) {
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
+ SDValue Op = N0.getOperand(i);
+ if (Op.getOpcode() != ISD::UNDEF &&
+ Op.getOpcode() != ISD::ConstantFP)
+ break;
+ EVT EltVT = Op.getValueType();
+ SDValue FoldOp = DAG.getNode(ISD::FNEG, N0.getDebugLoc(), EltVT, Op);
+ if (FoldOp.getOpcode() != ISD::UNDEF &&
+ FoldOp.getOpcode() != ISD::ConstantFP)
+ break;
+ Ops.push_back(FoldOp);
+ AddToWorkList(FoldOp.getNode());
+ }
+
+ if (Ops.size() == N0.getNumOperands())
+ return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+ VT, &Ops[0], Ops.size());
+ }
+ }
+
if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(),
&DAG.getTarget().Options))
return GetNegatedExpression(N0, DAG, LegalOperations);
@@ -6246,6 +6454,17 @@
}
}
+ // (fneg (fmul c, x)) -> (fmul -c, x)
+ if (N0.getOpcode() == ISD::FMUL) {
+ ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+ if (CFP1) {
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0.getOperand(0),
+ DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT,
+ N0.getOperand(1)));
+ }
+ }
+
return SDValue();
}
@@ -7876,29 +8095,15 @@
if (VecIn1.getValueType().getSizeInBits()*2 != VT.getSizeInBits())
return SDValue();
- // If the element type of the input vector is not the same as
- // the output element type, make concat_vectors based on input element
- // type and then bitcast it to the output vector type.
- //
- // In another words avoid nodes like this:
- // <NODE> v16i8 = concat_vectors v4i16 v4i16
- // Replace it with this one:
- // <NODE0> v8i16 = concat_vectors v4i16 v4i16
- // <NODE1> v16i8 = bitcast NODE0
- EVT ItemType = VecIn1.getValueType().getVectorElementType();
- if (ItemType != VT.getVectorElementType()) {
- EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(),
- ItemType,
- VecIn1.getValueType().getVectorNumElements()*2);
- // Widen the input vector by adding undef values.
- VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT,
- VecIn1, DAG.getUNDEF(VecIn1.getValueType()));
- VecIn1 = DAG.getNode(ISD::BITCAST, dl, VT, VecIn1);
- } else
- // Widen the input vector by adding undef values.
- VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- VecIn1, DAG.getUNDEF(VecIn1.getValueType()));
+ // If the input vector type has a different element type from the output
+ // vector type, bail out.
+ if (VecIn1.getValueType().getVectorElementType() !=
+ VT.getVectorElementType())
+ return SDValue();
+ // Widen the input vector by adding undef values.
+ VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
+ VecIn1, DAG.getUNDEF(VecIn1.getValueType()));
}
// If VecIn2 is unused then change it to undef.
@@ -8749,7 +8954,7 @@
// to alias with anything but itself. Provides base object and offset as
// results.
static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
- const GlobalValue *&GV, void *&CV) {
+ const GlobalValue *&GV, const void *&CV) {
// Assume it is a primitive operation.
Base = Ptr; Offset = 0; GV = 0; CV = 0;
@@ -8774,8 +8979,8 @@
// for ConstantSDNodes since the same constant pool entry may be represented
// by multiple nodes with different offsets.
if (ConstantPoolSDNode *C = dyn_cast<ConstantPoolSDNode>(Base)) {
- CV = C->isMachineConstantPoolEntry() ? (void *)C->getMachineCPVal()
- : (void *)C->getConstVal();
+ CV = C->isMachineConstantPoolEntry() ? (const void *)C->getMachineCPVal()
+ : (const void *)C->getConstVal();
Offset += C->getOffset();
return false;
}
@@ -8800,7 +9005,7 @@
SDValue Base1, Base2;
int64_t Offset1, Offset2;
const GlobalValue *GV1, *GV2;
- void *CV1, *CV2;
+ const void *CV1, *CV2;
bool isFrameIndex1 = FindBaseOffset(Ptr1, Base1, Offset1, GV1, CV1);
bool isFrameIndex2 = FindBaseOffset(Ptr2, Base2, Offset2, GV2, CV2);
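
The FADD-chain folds added above are gated on UnsafeFPMath because merging additions into one multiply changes where rounding happens. A standalone demonstration with an arbitrary constant (compile without FP contraction; the exact mismatch count is platform-dependent):

  #include <cstdio>
  #include <cstdlib>

  int main() {
    const double c = 0.3;                // arbitrary non-special constant
    unsigned Mismatches = 0;
    std::srand(42);
    for (int i = 0; i < 1000000; ++i) {
      double x = std::rand() / (double)RAND_MAX;
      volatile double t = c * x;         // (fmul c, x): first rounding
      double Original = t + x;           // (fadd ..., x): second rounding
      double Folded = (c + 1.0) * x;     // (fmul c+1, x): single rounding
      if (Original != Folded)
        ++Mismatches;
    }
    std::printf("%u of 1000000 inputs round differently\n", Mismatches);
  }
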
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 3e18ea7..b2a2a5c 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -97,7 +97,7 @@
cast<ArrayType>(Ty)->getElementType()->isIntegerTy(8)));
StaticAllocaMap[AI] =
MF->getFrameInfo()->CreateStackObject(TySize, Align, false,
- MayNeedSP);
+ MayNeedSP, AI);
}
for (; BB != EB; ++BB)
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 4488d27..6d2cdea 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -55,7 +55,8 @@
///
/// Also count physreg RegisterSDNode and RegisterMaskSDNode operands preceding
/// the chain and glue. These operands may be implicit on the machine instr.
-static unsigned countOperands(SDNode *Node, unsigned &NumImpUses) {
+static unsigned countOperands(SDNode *Node, unsigned NumExpUses,
+ unsigned &NumImpUses) {
unsigned N = Node->getNumOperands();
while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
--N;
@@ -63,7 +64,8 @@
--N; // Ignore chain if it exists.
// Count RegisterSDNode and RegisterMaskSDNode operands for NumImpUses.
- for (unsigned I = N; I; --I) {
+ NumImpUses = N - NumExpUses;
+ for (unsigned I = N; I > NumExpUses; --I) {
if (isa<RegisterMaskSDNode>(Node->getOperand(I - 1)))
continue;
if (RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Node->getOperand(I - 1)))
@@ -720,7 +722,8 @@
const MCInstrDesc &II = TII->get(Opc);
unsigned NumResults = CountResults(Node);
unsigned NumImpUses = 0;
- unsigned NodeOperands = countOperands(Node, NumImpUses);
+ unsigned NodeOperands =
+ countOperands(Node, II.getNumOperands() - II.getNumDefs(), NumImpUses);
bool HasPhysRegOuts = NumResults > II.getNumDefs() && II.getImplicitDefs()!=0;
#ifndef NDEBUG
unsigned NumMIOperands = NodeOperands + NumResults;
@@ -870,6 +873,17 @@
break;
}
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END: {
+ unsigned TarOp = (Node->getOpcode() == ISD::LIFETIME_START) ?
+ TargetOpcode::LIFETIME_START : TargetOpcode::LIFETIME_END;
+
+ FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Node->getOperand(1));
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TarOp))
+ .addFrameIndex(FI->getIndex());
+ break;
+ }
+
case ISD::INLINEASM: {
unsigned NumOps = Node->getNumOperands();
if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
@@ -890,19 +904,23 @@
getZExtValue();
MI->addOperand(MachineOperand::CreateImm(ExtraInfo));
+ // Remember the operand index of the group flags.
+ SmallVector<unsigned, 8> GroupIdx;
+
// Add all of the operand registers to the instruction.
for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
unsigned Flags =
cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
- unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+ const unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+ GroupIdx.push_back(MI->getNumOperands());
MI->addOperand(MachineOperand::CreateImm(Flags));
++i; // Skip the ID value.
switch (InlineAsm::getKind(Flags)) {
default: llvm_unreachable("Bad flags!");
case InlineAsm::Kind_RegDef:
- for (; NumVals; --NumVals, ++i) {
+ for (unsigned j = 0; j != NumVals; ++j, ++i) {
unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
// FIXME: Add dead flags for physical and virtual registers defined.
// For now, mark physical register defs as implicit to help fast
@@ -913,7 +931,7 @@
break;
case InlineAsm::Kind_RegDefEarlyClobber:
case InlineAsm::Kind_Clobber:
- for (; NumVals; --NumVals, ++i) {
+ for (unsigned j = 0; j != NumVals; ++j, ++i) {
unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
MI->addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/ true,
/*isImp=*/ TargetRegisterInfo::isPhysicalRegister(Reg),
@@ -928,9 +946,20 @@
case InlineAsm::Kind_Mem: // Addressing mode.
// The addressing mode has been selected, just add all of the
// operands to the machine instruction.
- for (; NumVals; --NumVals, ++i)
+ for (unsigned j = 0; j != NumVals; ++j, ++i)
AddOperand(MI, Node->getOperand(i), 0, 0, VRBaseMap,
/*IsDebug=*/false, IsClone, IsCloned);
+
+ // Manually set isTied bits.
+ if (InlineAsm::getKind(Flags) == InlineAsm::Kind_RegUse) {
+ unsigned DefGroup = 0;
+ if (InlineAsm::isUseOperandTiedToDef(Flags, DefGroup)) {
+ unsigned DefIdx = GroupIdx[DefGroup] + 1;
+ unsigned UseIdx = GroupIdx.back() + 1;
+ for (unsigned j = 0; j != NumVals; ++j)
+ MI->tieOperands(DefIdx + j, UseIdx + j);
+ }
+ }
break;
}
}
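
GroupIdx above records the operand index of each inline-asm group's flag word so that matching use registers can later be tied to their defs; the "+ 1" steps past the flag word to the group's first register. A contrived standalone sketch of that index arithmetic (the operand layout is invented for illustration):

  #include <cstdio>
  #include <vector>

  int main() {
    // Invented layout: [asm string, extra info,
    //   def-group flag, %r1, %r2, use-group flag, %r3, %r4]
    std::vector<unsigned> GroupIdx;
    GroupIdx.push_back(2);                     // def group's flag word
    GroupIdx.push_back(5);                     // use group's flag word
    unsigned DefGroup = 0, NumVals = 2;
    unsigned DefIdx = GroupIdx[DefGroup] + 1;  // + 1 skips the flag word
    unsigned UseIdx = GroupIdx.back() + 1;
    for (unsigned j = 0; j != NumVals; ++j)
      std::printf("tieOperands(%u, %u)\n", DefIdx + j, UseIdx + j);
  }
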
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 908ebb9..7b34170 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -2042,7 +2042,7 @@
SDValue Op0,
EVT DestVT,
DebugLoc dl) {
- if (Op0.getValueType() == MVT::i32) {
+ if (Op0.getValueType() == MVT::i32 && TLI.isTypeLegal(MVT::f64)) {
// simple 32-bit [signed|unsigned] integer to float/double expansion
// Get the stack frame index of a 8 byte buffer.
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 94fc976..37f0e60 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -625,6 +625,7 @@
SDValue WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N);
SDValue WidenVecRes_VSETCC(SDNode* N);
+ SDValue WidenVecRes_Ternary(SDNode *N);
SDValue WidenVecRes_Binary(SDNode *N);
SDValue WidenVecRes_Convert(SDNode *N);
SDValue WidenVecRes_POWI(SDNode *N);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 704f99b..22f8d51 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -64,6 +64,7 @@
// Implement vselect in terms of XOR, AND, OR when blend is not supported
// by the target.
SDValue ExpandVSELECT(SDValue Op);
+ SDValue ExpandSELECT(SDValue Op);
SDValue ExpandLoad(SDValue Op);
SDValue ExpandStore(SDValue Op);
SDValue ExpandFNEG(SDValue Op);
@@ -220,6 +221,7 @@
case ISD::FRINT:
case ISD::FNEARBYINT:
case ISD::FFLOOR:
+ case ISD::FMA:
case ISD::SIGN_EXTEND_INREG:
QueryType = Node->getValueType(0);
break;
@@ -260,6 +262,8 @@
case TargetLowering::Expand:
if (Node->getOpcode() == ISD::VSELECT)
Result = ExpandVSELECT(Op);
+ else if (Node->getOpcode() == ISD::SELECT)
+ Result = ExpandSELECT(Op);
else if (Node->getOpcode() == ISD::UINT_TO_FP)
Result = ExpandUINT_TO_FLOAT(Op);
else if (Node->getOpcode() == ISD::FNEG)
@@ -435,6 +439,66 @@
return TF;
}
+SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {
+ // Lower a select instruction where the condition is a scalar and the
+ // operands are vectors. Lower this select to VSELECT and implement it
+ // using XOR, AND and OR. The selector bit is broadcast.
+ EVT VT = Op.getValueType();
+ DebugLoc DL = Op.getDebugLoc();
+
+ SDValue Mask = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+
+ assert(VT.isVector() && !Mask.getValueType().isVector()
+ && Op1.getValueType() == Op2.getValueType() && "Invalid type");
+
+ unsigned NumElem = VT.getVectorNumElements();
+
+ // If we can't even use the basic vector operations of
+ // AND, OR, XOR, we will have to scalarize the op.
+ // Notice that the operation may be 'promoted' which means that it is
+ // 'bitcasted' to another type which is handled.
+ // Also, we need to be able to construct a splat vector using BUILD_VECTOR.
+ if (TLI.getOperationAction(ISD::AND, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::XOR, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::OR, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::BUILD_VECTOR, VT) == TargetLowering::Expand)
+ return DAG.UnrollVectorOp(Op.getNode());
+
+ // Generate a mask operand.
+ EVT MaskTy = TLI.getSetCCResultType(VT);
+ assert(MaskTy.isVector() && "Invalid CC type");
+ assert(MaskTy.getSizeInBits() == Op1.getValueType().getSizeInBits()
+ && "Invalid mask size");
+
+ // The size of each element in the vector mask.
+ EVT BitTy = MaskTy.getScalarType();
+
+ Mask = DAG.getNode(ISD::SELECT, DL, BitTy, Mask,
+ DAG.getConstant(APInt::getAllOnesValue(BitTy.getSizeInBits()), BitTy),
+ DAG.getConstant(0, BitTy));
+
+ // Broadcast the mask so that the entire vector is all ones or all zeros.
+ SmallVector<SDValue, 8> Ops(NumElem, Mask);
+ Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskTy, &Ops[0], Ops.size());
+
+ // Bitcast the operands to be the same type as the mask.
+ // This is needed when we select between FP types because
+ // the mask is a vector of integers.
+ Op1 = DAG.getNode(ISD::BITCAST, DL, MaskTy, Op1);
+ Op2 = DAG.getNode(ISD::BITCAST, DL, MaskTy, Op2);
+
+ SDValue AllOnes = DAG.getConstant(
+ APInt::getAllOnesValue(BitTy.getSizeInBits()), MaskTy);
+ SDValue NotMask = DAG.getNode(ISD::XOR, DL, MaskTy, Mask, AllOnes);
+
+ Op1 = DAG.getNode(ISD::AND, DL, MaskTy, Op1, Mask);
+ Op2 = DAG.getNode(ISD::AND, DL, MaskTy, Op2, NotMask);
+ SDValue Val = DAG.getNode(ISD::OR, DL, MaskTy, Op1, Op2);
+ return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Val);
+}
+
SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
// Implement VSELECT in terms of XOR, AND, OR
// on platforms which do not support blend natively.
@@ -449,12 +513,17 @@
// AND,OR,XOR, we will have to scalarize the op.
// Notice that the operation may be 'promoted' which means that it is
// 'bitcasted' to another type which is handled.
+ // This operation also isn't safe with AND, OR, XOR when the boolean
+ // type is 0/1, as we need an all-ones vector constant to mask with.
+ // FIXME: Sign extend 1 to all ones if that's legal on the target.
if (TLI.getOperationAction(ISD::AND, VT) == TargetLowering::Expand ||
TLI.getOperationAction(ISD::XOR, VT) == TargetLowering::Expand ||
- TLI.getOperationAction(ISD::OR, VT) == TargetLowering::Expand)
+ TLI.getOperationAction(ISD::OR, VT) == TargetLowering::Expand ||
+ TLI.getBooleanContents(true) !=
+ TargetLowering::ZeroOrNegativeOneBooleanContent)
return DAG.UnrollVectorOp(Op.getNode());
- assert(VT.getSizeInBits() == Op.getOperand(1).getValueType().getSizeInBits()
+ assert(VT.getSizeInBits() == Op1.getValueType().getSizeInBits()
&& "Invalid mask size");
// Bitcast the operands to be the same type as the mask.
// This is needed when we select between FP types because
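
Both ExpandSELECT and ExpandVSELECT reduce a select to XOR/AND/OR against a mask that is all ones or all zeros per lane. A standalone scalar sketch of one lane (not from this patch):

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint32_t Op1 = 0x11111111, Op2 = 0x22222222;
    for (int Cond = 0; Cond <= 1; ++Cond) {
      uint32_t Mask = Cond ? ~0u : 0u;        // broadcast selector bit
      uint32_t NotMask = Mask ^ ~0u;          // XOR with AllOnes
      uint32_t Val = (Op1 & Mask) | (Op2 & NotMask);
      std::printf("Cond=%d -> 0x%08x\n", Cond, Val);
    }
  }
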
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 4709202..4095728 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1366,6 +1366,9 @@
case ISD::FTRUNC:
Res = WidenVecRes_Unary(N);
break;
+ case ISD::FMA:
+ Res = WidenVecRes_Ternary(N);
+ break;
}
// If Res is null, the sub-method took care of registering the result.
@@ -1373,6 +1376,16 @@
SetWidenedVector(SDValue(N, ResNo), Res);
}
+SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) {
+ // Ternary op widening.
+ DebugLoc dl = N->getDebugLoc();
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+ SDValue InOp3 = GetWidenedVector(N->getOperand(2));
+ return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3);
+}
+
SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
// Binary op widening.
unsigned Opcode = N->getOpcode();
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index bf0a437..2b86e36 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -656,6 +656,8 @@
break;
case ISD::MERGE_VALUES:
case ISD::TokenFactor:
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END:
case ISD::CopyToReg:
case ISD::CopyFromReg:
case ISD::EH_LABEL:
@@ -1756,6 +1758,7 @@
return V;
}
+#ifndef NDEBUG
void dump(ScheduleDAG *DAG) const {
// Emulate pop() without clobbering NodeQueueIds.
std::vector<SUnit*> DumpQueue = Queue;
@@ -1766,6 +1769,7 @@
SU->dump(DAG);
}
}
+#endif
};
typedef RegReductionPriorityQueue<bu_ls_rr_sort>
@@ -1893,6 +1897,7 @@
//===----------------------------------------------------------------------===//
void RegReductionPQBase::dumpRegPressure() const {
+#ifndef NDEBUG
for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
E = TRI->regclass_end(); I != E; ++I) {
const TargetRegisterClass *RC = *I;
@@ -1902,6 +1907,7 @@
DEBUG(dbgs() << RC->getName() << ": " << RP << " / " << RegLimit[Id]
<< '\n');
}
+#endif
}
bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const {
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 748668c..222dc55 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -643,6 +643,7 @@
}
void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const {
+#ifndef NDEBUG
if (!SU->getNode()) {
dbgs() << "PHYS REG COPY\n";
return;
@@ -659,8 +660,10 @@
dbgs() << "\n";
GluedNodes.pop_back();
}
+#endif
}
+#ifndef NDEBUG
void ScheduleDAGSDNodes::dumpSchedule() const {
for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
if (SUnit *SU = Sequence[i])
@@ -669,6 +672,7 @@
dbgs() << "**** NOOP ****\n";
}
}
+#endif
#ifndef NDEBUG
/// VerifyScheduledSequence - Verify that all SUnits were scheduled and that
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f4fe892..d85d41b 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1097,10 +1097,9 @@
"Cannot set target flags on target-independent globals");
// Truncate (with sign-extension) the offset value to the pointer size.
- EVT PTy = TLI.getPointerTy();
- unsigned BitWidth = PTy.getSizeInBits();
+ unsigned BitWidth = TLI.getPointerTy().getSizeInBits();
if (BitWidth < 64)
- Offset = (Offset << (64 - BitWidth) >> (64 - BitWidth));
+ Offset = SignExtend64(Offset, BitWidth);
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
if (!GVar) {
@@ -2817,6 +2816,24 @@
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N2))
if (CFP->getValueAPF().isZero())
return N1;
+ } else if (Opcode == ISD::FMUL) {
+ ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1);
+ SDValue V = N2;
+
+ // If the first operand isn't the constant, try the second
+ if (!CFP) {
+ CFP = dyn_cast<ConstantFPSDNode>(N2);
+ V = N1;
+ }
+
+ if (CFP) {
+ // 0*x --> 0
+ if (CFP->isZero())
+ return SDValue(CFP,0);
+ // 1*x --> x
+ if (CFP->isExactlyValue(1.0))
+ return V;
+ }
}
}
assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
@@ -2935,17 +2952,13 @@
// expanding large vector constants.
if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) {
SDValue Elt = N1.getOperand(N2C->getZExtValue());
- EVT VEltTy = N1.getValueType().getVectorElementType();
- if (Elt.getValueType() != VEltTy) {
+
+ if (VT != Elt.getValueType())
// If the vector element type is not legal, the BUILD_VECTOR operands
- // are promoted and implicitly truncated. Make that explicit here.
- Elt = getNode(ISD::TRUNCATE, DL, VEltTy, Elt);
- }
- if (VT != VEltTy) {
- // If the vector element type is not legal, the EXTRACT_VECTOR_ELT
- // result is implicitly extended.
- Elt = getNode(ISD::ANY_EXTEND, DL, VT, Elt);
- }
+ // are promoted and implicitly truncated, and the result implicitly
+ // extended. Make that explicit here.
+ Elt = getAnyExtOrTrunc(Elt, DL, VT);
+
return Elt;
}
@@ -3923,17 +3936,21 @@
SDValue Swp, MachinePointerInfo PtrInfo,
unsigned Alignment,
AtomicOrdering Ordering,
- SynchronizationScope SynchScope) {
+ SynchronizationScope SynchScope) {
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(MemVT);
MachineFunction &MF = getMachineFunction();
- unsigned Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ // All atomics are load and store, except for ATOMIC_LOAD and ATOMIC_STORE.
// For now, atomics are considered to be volatile always.
// FIXME: Volatile isn't really correct; we should keep track of atomic
// orderings in the memoperand.
- Flags |= MachineMemOperand::MOVolatile;
+ unsigned Flags = MachineMemOperand::MOVolatile;
+ if (Opcode != ISD::ATOMIC_STORE)
+ Flags |= MachineMemOperand::MOLoad;
+ if (Opcode != ISD::ATOMIC_LOAD)
+ Flags |= MachineMemOperand::MOStore;
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment);
@@ -3983,17 +4000,17 @@
Alignment = getEVTAlignment(MemVT);
MachineFunction &MF = getMachineFunction();
- // A monotonic store does not load; a release store "loads" in the sense
- // that other stores cannot be sunk past it.
+ // An atomic store does not load. An atomic load does not store.
// (An atomicrmw obviously both loads and stores.)
- unsigned Flags = MachineMemOperand::MOStore;
- if (Opcode != ISD::ATOMIC_STORE || Ordering > Monotonic)
- Flags |= MachineMemOperand::MOLoad;
-
- // For now, atomics are considered to be volatile always.
+ // For now, atomics are considered to be volatile always, and they are
+ // chained as such.
// FIXME: Volatile isn't really correct; we should keep track of atomic
// orderings in the memoperand.
- Flags |= MachineMemOperand::MOVolatile;
+ unsigned Flags = MachineMemOperand::MOVolatile;
+ if (Opcode != ISD::ATOMIC_STORE)
+ Flags |= MachineMemOperand::MOLoad;
+ if (Opcode != ISD::ATOMIC_LOAD)
+ Flags |= MachineMemOperand::MOStore;
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags,
@@ -4056,16 +4073,17 @@
Alignment = getEVTAlignment(MemVT);
MachineFunction &MF = getMachineFunction();
- // A monotonic load does not store; an acquire load "stores" in the sense
- // that other loads cannot be hoisted past it.
- unsigned Flags = MachineMemOperand::MOLoad;
- if (Ordering > Monotonic)
- Flags |= MachineMemOperand::MOStore;
-
- // For now, atomics are considered to be volatile always.
+ // An atomic store does not load. An atomic load does not store.
+ // (An atomicrmw obviously both loads and stores.)
+ // For now, atomics are considered to be volatile always, and they are
+ // chained as such.
// FIXME: Volatile isn't really correct; we should keep track of atomic
// orderings in the memoperand.
- Flags |= MachineMemOperand::MOVolatile;
+ unsigned Flags = MachineMemOperand::MOVolatile;
+ if (Opcode != ISD::ATOMIC_STORE)
+ Flags |= MachineMemOperand::MOLoad;
+ if (Opcode != ISD::ATOMIC_LOAD)
+ Flags |= MachineMemOperand::MOStore;
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags,
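
The flag computation repeated at each atomic-node constructor above can be read as: every atomic both loads and stores, except a plain atomic load (no store) or a plain atomic store (no load). A sketch with stand-in enum values (LLVM's actual flag and opcode values differ):

  #include <cstdio>

  enum { MOVolatile = 1, MOLoad = 2, MOStore = 4 };
  enum Op { ATOMIC_LOAD, ATOMIC_STORE, ATOMIC_CMP_SWAP };

  static unsigned memFlags(Op Opcode) {
    unsigned Flags = MOVolatile;             // atomics stay volatile for now
    if (Opcode != ATOMIC_STORE) Flags |= MOLoad;
    if (Opcode != ATOMIC_LOAD)  Flags |= MOStore;
    return Flags;
  }

  int main() {
    std::printf("load=%u store=%u rmw=%u\n", memFlags(ATOMIC_LOAD),
                memFlags(ATOMIC_STORE), memFlags(ATOMIC_CMP_SWAP));
  }
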
@@ -4157,6 +4175,8 @@
assert((Opcode == ISD::INTRINSIC_VOID ||
Opcode == ISD::INTRINSIC_W_CHAIN ||
Opcode == ISD::PREFETCH ||
+ Opcode == ISD::LIFETIME_START ||
+ Opcode == ISD::LIFETIME_END ||
(Opcode <= INT_MAX &&
(int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) &&
"Opcode is not a memory-accessing opcode!");
@@ -4226,7 +4246,7 @@
bool isVolatile, bool isNonTemporal, bool isInvariant,
unsigned Alignment, const MDNode *TBAAInfo,
const MDNode *Ranges) {
- assert(Chain.getValueType() == MVT::Other &&
+ assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(VT);
@@ -4284,7 +4304,7 @@
AddNodeIDNode(ID, ISD::LOAD, VTs, Ops, 3);
ID.AddInteger(MemVT.getRawBits());
ID.AddInteger(encodeMemSDNodeFlags(ExtType, AM, MMO->isVolatile(),
- MMO->isNonTemporal(),
+ MMO->isNonTemporal(),
MMO->isInvariant()));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = 0;
@@ -4303,7 +4323,7 @@
SDValue Chain, SDValue Ptr,
MachinePointerInfo PtrInfo,
bool isVolatile, bool isNonTemporal,
- bool isInvariant, unsigned Alignment,
+ bool isInvariant, unsigned Alignment,
const MDNode *TBAAInfo,
const MDNode *Ranges) {
SDValue Undef = getUNDEF(Ptr.getValueType());
@@ -4332,7 +4352,7 @@
"Load is already a indexed load!");
return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl,
LD->getChain(), Base, Offset, LD->getPointerInfo(),
- LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(),
+ LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(),
false, LD->getAlignment());
}
@@ -4340,7 +4360,7 @@
SDValue Ptr, MachinePointerInfo PtrInfo,
bool isVolatile, bool isNonTemporal,
unsigned Alignment, const MDNode *TBAAInfo) {
- assert(Chain.getValueType() == MVT::Other &&
+ assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(Val.getValueType());
@@ -4365,7 +4385,7 @@
SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
SDValue Ptr, MachineMemOperand *MMO) {
- assert(Chain.getValueType() == MVT::Other &&
+ assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
EVT VT = Val.getValueType();
SDVTList VTs = getVTList(MVT::Other);
@@ -4394,7 +4414,7 @@
EVT SVT,bool isVolatile, bool isNonTemporal,
unsigned Alignment,
const MDNode *TBAAInfo) {
- assert(Chain.getValueType() == MVT::Other &&
+ assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(SVT);
@@ -4421,7 +4441,7 @@
MachineMemOperand *MMO) {
EVT VT = Val.getValueType();
- assert(Chain.getValueType() == MVT::Other &&
+ assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
if (VT == SVT)
return getStore(Chain, dl, Val, Ptr, MMO);
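
SignExtend64 (from Support/MathExtras.h) replaces the hand-written shift pair for sign-extending the truncated offset. Roughly what it computes, for 1 <= B <= 64 (a sketch, shifting through uint64_t to avoid the undefined behavior a signed left shift risks):

  #include <cstdint>
  #include <cstdio>

  static int64_t signExtend64(uint64_t X, unsigned B) {
    // Move bit B-1 into the sign position, then arithmetic-shift back.
    return (int64_t)(X << (64 - B)) >> (64 - B);
  }

  int main() {
    std::printf("%lld\n", (long long)signExtend64(0xFFFFFFF0u, 32));  // -16
  }
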
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index f3cf758..483b051 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Constants.h"
#include "llvm/CallingConv.h"
#include "llvm/DebugInfo.h"
@@ -825,6 +826,7 @@
GFI = gfi;
LibInfo = li;
TD = DAG.getTarget().getTargetData();
+ Context = DAG.getContext();
LPadToCallSiteMap.clear();
}
@@ -1765,6 +1767,7 @@
/// visitBitTestCase - this function produces one "bit test"
void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
MachineBasicBlock* NextMBB,
+ uint32_t BranchWeightToNext,
unsigned Reg,
BitTestCase &B,
MachineBasicBlock *SwitchBB) {
@@ -1802,8 +1805,10 @@
ISD::SETNE);
}
- addSuccessorWithWeight(SwitchBB, B.TargetBB);
- addSuccessorWithWeight(SwitchBB, NextMBB);
+ // The branch weight from SwitchBB to B.TargetBB is B.ExtraWeight.
+ addSuccessorWithWeight(SwitchBB, B.TargetBB, B.ExtraWeight);
+ // The branch weight from SwitchBB to NextMBB is BranchWeightToNext.
+ addSuccessorWithWeight(SwitchBB, NextMBB, BranchWeightToNext);
SDValue BrAnd = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
MVT::Other, getControlRoot(),
@@ -1926,6 +1931,7 @@
if (++BBI != FuncInfo.MF->end())
NextBlock = BBI;
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
// If any two of the cases has the same destination, and if one value
// is the same as the other, but has one bit unset that the other has set,
// use bit manipulation to do two compares at once. For example:
@@ -1959,8 +1965,12 @@
ISD::SETEQ);
// Update successor info.
- addSuccessorWithWeight(SwitchBB, Small.BB);
- addSuccessorWithWeight(SwitchBB, Default);
+ // Both Small and Big will jump to Small.BB, so we sum up the weights.
+ addSuccessorWithWeight(SwitchBB, Small.BB,
+ Small.ExtraWeight + Big.ExtraWeight);
+ addSuccessorWithWeight(SwitchBB, Default,
+ // The default destination is the first successor in IR.
+ BPI ? BPI->getEdgeWeight(SwitchBB->getBasicBlock(), (unsigned)0) : 0);
// Insert the true branch.
SDValue BrCond = DAG.getNode(ISD::BRCOND, DL, MVT::Other,
@@ -1978,14 +1988,13 @@
}
// Order cases by weight so the most likely case will be checked first.
- BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ uint32_t UnhandledWeights = 0;
if (BPI) {
for (CaseItr I = CR.Range.first, IE = CR.Range.second; I != IE; ++I) {
- uint32_t IWeight = BPI->getEdgeWeight(SwitchBB->getBasicBlock(),
- I->BB->getBasicBlock());
+ uint32_t IWeight = I->ExtraWeight;
+ UnhandledWeights += IWeight;
for (CaseItr J = CR.Range.first; J < I; ++J) {
- uint32_t JWeight = BPI->getEdgeWeight(SwitchBB->getBasicBlock(),
- J->BB->getBasicBlock());
+ uint32_t JWeight = J->ExtraWeight;
if (IWeight > JWeight)
std::swap(*I, *J);
}
@@ -2034,10 +2043,12 @@
LHS = I->Low; MHS = SV; RHS = I->High;
}
- uint32_t ExtraWeight = I->ExtraWeight;
+ // The false weight is the sum of all unhandled cases.
+ UnhandledWeights -= I->ExtraWeight;
CaseBlock CB(CC, LHS, RHS, MHS, /* truebb */ I->BB, /* falsebb */ FallThrough,
/* me */ CurBlock,
- /* trueweight */ ExtraWeight / 2, /* falseweight */ ExtraWeight / 2);
+ /* trueweight */ I->ExtraWeight,
+ /* falseweight */ UnhandledWeights);
// If emitting the first comparison, just call visitSwitchCase to emit the
// code into the current block. Otherwise, push the CaseBlock onto the
@@ -2137,13 +2148,28 @@
}
}
+ // Calculate weight for each unique destination in CR.
+ DenseMap<MachineBasicBlock*, uint32_t> DestWeights;
+ if (FuncInfo.BPI)
+ for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) {
+ DenseMap<MachineBasicBlock*, uint32_t>::iterator Itr =
+ DestWeights.find(I->BB);
+ if (Itr != DestWeights.end())
+ Itr->second += I->ExtraWeight;
+ else
+ DestWeights[I->BB] = I->ExtraWeight;
+ }
+
// Update successor info. Add one edge to each unique successor.
BitVector SuccsHandled(CR.CaseBB->getParent()->getNumBlockIDs());
for (std::vector<MachineBasicBlock*>::iterator I = DestBBs.begin(),
E = DestBBs.end(); I != E; ++I) {
if (!SuccsHandled[(*I)->getNumber()]) {
SuccsHandled[(*I)->getNumber()] = true;
- addSuccessorWithWeight(JumpTableBB, *I);
+ DenseMap<MachineBasicBlock*, uint32_t>::iterator Itr =
+ DestWeights.find(*I);
+ addSuccessorWithWeight(JumpTableBB, *I,
+ Itr != DestWeights.end() ? Itr->second : 0);
}
}
@@ -2374,7 +2400,7 @@
if (i == count) {
assert((count < 3) && "Too much destinations to test!");
- CasesBits.push_back(CaseBits(0, Dest, 0));
+ CasesBits.push_back(CaseBits(0, Dest, 0, 0/*Weight*/));
count++;
}
@@ -2383,6 +2409,7 @@
uint64_t lo = (lowValue - lowBound).getZExtValue();
uint64_t hi = (highValue - lowBound).getZExtValue();
+ CasesBits[i].ExtraWeight += I->ExtraWeight;
for (uint64_t j = lo; j <= hi; j++) {
CasesBits[i].Mask |= 1ULL << j;
@@ -2410,7 +2437,7 @@
CurMF->insert(BBI, CaseBB);
BTC.push_back(BitTestCase(CasesBits[i].Mask,
CaseBB,
- CasesBits[i].BB));
+ CasesBits[i].BB, CasesBits[i].ExtraWeight));
// Put SV in a virtual register to make it available from the new blocks.
ExportFromCurrentBlock(SV);
@@ -2438,30 +2465,25 @@
Clusterifier TheClusterifier;
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
// Start with "simple" cases
for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
i != e; ++i) {
const BasicBlock *SuccBB = i.getCaseSuccessor();
MachineBasicBlock *SMBB = FuncInfo.MBBMap[SuccBB];
- TheClusterifier.add(i.getCaseValueEx(), SMBB);
+ TheClusterifier.add(i.getCaseValueEx(), SMBB,
+ BPI ? BPI->getEdgeWeight(SI.getParent(), i.getSuccessorIndex()) : 0);
}
TheClusterifier.optimize();
- BranchProbabilityInfo *BPI = FuncInfo.BPI;
size_t numCmps = 0;
for (Clusterifier::RangeIterator i = TheClusterifier.begin(),
e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
Clusterifier::Cluster &C = *i;
- unsigned W = 0;
- if (BPI) {
- W = BPI->getEdgeWeight(SI.getParent(), C.second->getBasicBlock());
- if (!W)
- W = 16;
- W *= C.first.Weight;
- BPI->setEdgeWeight(SI.getParent(), C.second->getBasicBlock(), W);
- }
+ // Update edge weight for the cluster.
+ unsigned W = C.first.Weight;
// FIXME: Currently work with ConstantInt based numbers.
// Changing it to APInt based is a pretty heavy for this commit.
@@ -4853,7 +4875,21 @@
Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, DestVT,
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)),
- DAG.getConstant(Idx, MVT::i32));
+ DAG.getIntPtrConstant(Idx));
+ setValue(&I, Res);
+ return 0;
+ }
+ case Intrinsic::x86_avx_vextractf128_pd_256:
+ case Intrinsic::x86_avx_vextractf128_ps_256:
+ case Intrinsic::x86_avx_vextractf128_si_256:
+ case Intrinsic::x86_avx2_vextracti128: {
+ DebugLoc dl = getCurDebugLoc();
+ EVT DestVT = TLI.getValueType(I.getType());
+ uint64_t Idx = (cast<ConstantInt>(I.getArgOperand(1))->getZExtValue() & 1) *
+ DestVT.getVectorNumElements();
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
+ getValue(I.getArgOperand(0)),
+ DAG.getIntPtrConstant(Idx));
setValue(&I, Res);
return 0;
}
@@ -5180,14 +5216,40 @@
rw==1)); /* write */
return 0;
}
-
- case Intrinsic::invariant_start:
case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end: {
+ bool IsStart = (Intrinsic == Intrinsic::lifetime_start);
+ // Stack coloring is not enabled at -O0; discard region information.
+ if (TM.getOptLevel() == CodeGenOpt::None)
+ return 0;
+
+ SmallVector<Value *, 4> Allocas;
+ GetUnderlyingObjects(I.getArgOperand(1), Allocas, TD);
+
+ for (SmallVector<Value*, 4>::iterator Object = Allocas.begin(),
+ E = Allocas.end(); Object != E; ++Object) {
+ AllocaInst *LifetimeObject = dyn_cast_or_null<AllocaInst>(*Object);
+
+ // Could not find an Alloca.
+ if (!LifetimeObject)
+ continue;
+
+ int FI = FuncInfo.StaticAllocaMap[LifetimeObject];
+
+ SDValue Ops[2];
+ Ops[0] = getRoot();
+ Ops[1] = DAG.getFrameIndex(FI, TLI.getPointerTy(), true);
+ unsigned Opcode = (IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END);
+
+ Res = DAG.getNode(Opcode, dl, MVT::Other, Ops, 2);
+ DAG.setRoot(Res);
+ }
+ }
+ case Intrinsic::invariant_start:
// Discard region information.
setValue(&I, DAG.getUNDEF(TLI.getPointerTy()));
return 0;
case Intrinsic::invariant_end:
- case Intrinsic::lifetime_end:
// Discard region information.
return 0;
case Intrinsic::donothing:
@@ -6043,12 +6105,14 @@
const MDNode *SrcLoc = CS.getInstruction()->getMetadata("srcloc");
AsmNodeOperands.push_back(DAG.getMDNode(SrcLoc));
- // Remember the HasSideEffect and AlignStack bits as operand 3.
+ // Remember the HasSideEffect, AlignStack and AsmDialect bits as operand 3.
unsigned ExtraInfo = 0;
if (IA->hasSideEffects())
ExtraInfo |= InlineAsm::Extra_HasSideEffects;
if (IA->isAlignStack())
ExtraInfo |= InlineAsm::Extra_IsAlignStack;
+ // Set the asm dialect.
+ ExtraInfo |= IA->getDialect() * InlineAsm::Extra_AsmDialect;
AsmNodeOperands.push_back(DAG.getTargetConstant(ExtraInfo,
TLI.getPointerTy()));
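
The weight bookkeeping added above (and in the SelectionDAGISel.cpp hunks below) gives each comparison's false edge the summed weight of every case not yet tested, instead of halving a single weight. A standalone sketch:

  #include <cstdio>

  int main() {
    unsigned W[] = {8, 4, 2, 1};             // per-case profile weights
    unsigned Unhandled = 8 + 4 + 2 + 1;
    for (unsigned i = 0; i != 4; ++i) {
      Unhandled -= W[i];                     // this case is now handled
      std::printf("case %u: true=%u false=%u\n", i, W[i], Unhandled);
    }
  }
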
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 4090002..3b7615a 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -150,9 +150,11 @@
uint64_t Mask;
MachineBasicBlock* BB;
unsigned Bits;
+ uint32_t ExtraWeight;
- CaseBits(uint64_t mask, MachineBasicBlock* bb, unsigned bits):
- Mask(mask), BB(bb), Bits(bits) { }
+ CaseBits(uint64_t mask, MachineBasicBlock* bb, unsigned bits,
+ uint32_t Weight):
+ Mask(mask), BB(bb), Bits(bits), ExtraWeight(Weight) { }
};
typedef std::vector<Case> CaseVector;
@@ -247,11 +249,13 @@
typedef std::pair<JumpTableHeader, JumpTable> JumpTableBlock;
struct BitTestCase {
- BitTestCase(uint64_t M, MachineBasicBlock* T, MachineBasicBlock* Tr):
- Mask(M), ThisBB(T), TargetBB(Tr) { }
+ BitTestCase(uint64_t M, MachineBasicBlock* T, MachineBasicBlock* Tr,
+ uint32_t Weight):
+ Mask(M), ThisBB(T), TargetBB(Tr), ExtraWeight(Weight) { }
uint64_t Mask;
MachineBasicBlock *ThisBB;
MachineBasicBlock *TargetBB;
+ uint32_t ExtraWeight;
};
typedef SmallVector<BitTestCase, 3> BitTestInfo;
@@ -325,7 +329,7 @@
CodeGenOpt::Level ol)
: SDNodeOrder(0), TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()),
DAG(dag), FuncInfo(funcinfo), OptLevel(ol),
- HasTailCall(false), Context(dag.getContext()) {
+ HasTailCall(false) {
}
void init(GCFunctionInfo *gfi, AliasAnalysis &aa,
@@ -452,6 +456,7 @@
void visitBitTestHeader(BitTestBlock &B, MachineBasicBlock *SwitchBB);
void visitBitTestCase(BitTestBlock &BB,
MachineBasicBlock* NextMBB,
+ uint32_t BranchWeightToNext,
unsigned Reg,
BitTestCase &B,
MachineBasicBlock *SwitchBB);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 13cd011..75989ad 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -267,6 +267,8 @@
case ISD::STACKRESTORE: return "stackrestore";
case ISD::TRAP: return "trap";
case ISD::DEBUGTRAP: return "debugtrap";
+ case ISD::LIFETIME_START: return "lifetime.start";
+ case ISD::LIFETIME_END: return "lifetime.end";
// Bit manipulation
case ISD::BSWAP: return "bswap";
@@ -331,7 +333,7 @@
}
void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const {
- OS << (void*)this << ": ";
+ OS << (const void*)this << ": ";
for (unsigned i = 0, e = getNumValues(); i != e; ++i) {
if (i) OS << ",";
@@ -559,7 +561,7 @@
child->printr(OS, G);
once.insert(child);
} else { // Just the address. FIXME: also print the child's opcode.
- OS << (void*)child;
+ OS << (const void*)child;
if (unsigned RN = N->getOperand(i).getResNo())
OS << ":" << RN;
}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 4e5e3ba..7542941 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -554,7 +554,7 @@
#endif
{
BlockNumber = FuncInfo->MBB->getNumber();
- BlockName = MF->getFunction()->getName().str() + ":" +
+ BlockName = MF->getName().str() + ":" +
FuncInfo->MBB->getBasicBlock()->getName().str();
}
DEBUG(dbgs() << "Initial selection DAG: BB#" << BlockNumber
@@ -1209,7 +1209,12 @@
CodeGenAndEmitDAG();
}
+ uint32_t UnhandledWeight = 0;
+ for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size(); j != ej; ++j)
+ UnhandledWeight += SDB->BitTestCases[i].Cases[j].ExtraWeight;
+
for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size(); j != ej; ++j) {
+ UnhandledWeight -= SDB->BitTestCases[i].Cases[j].ExtraWeight;
// Set the current basic block to the mbb we wish to insert the code into
FuncInfo->MBB = SDB->BitTestCases[i].Cases[j].ThisBB;
FuncInfo->InsertPt = FuncInfo->MBB->end();
@@ -1217,12 +1222,14 @@
if (j+1 != ej)
SDB->visitBitTestCase(SDB->BitTestCases[i],
SDB->BitTestCases[i].Cases[j+1].ThisBB,
+ UnhandledWeight,
SDB->BitTestCases[i].Reg,
SDB->BitTestCases[i].Cases[j],
FuncInfo->MBB);
else
SDB->visitBitTestCase(SDB->BitTestCases[i],
SDB->BitTestCases[i].Default,
+ UnhandledWeight,
SDB->BitTestCases[i].Reg,
SDB->BitTestCases[i].Cases[j],
FuncInfo->MBB);
@@ -1794,10 +1801,13 @@
User->getOpcode() == ISD::HANDLENODE) // Root of the graph.
continue;
- if (User->getOpcode() == ISD::CopyToReg ||
- User->getOpcode() == ISD::CopyFromReg ||
- User->getOpcode() == ISD::INLINEASM ||
- User->getOpcode() == ISD::EH_LABEL) {
+ unsigned UserOpcode = User->getOpcode();
+ if (UserOpcode == ISD::CopyToReg ||
+ UserOpcode == ISD::CopyFromReg ||
+ UserOpcode == ISD::INLINEASM ||
+ UserOpcode == ISD::EH_LABEL ||
+ UserOpcode == ISD::LIFETIME_START ||
+ UserOpcode == ISD::LIFETIME_END) {
// If their node ID got reset to -1 then they've already been selected.
// Treat them like a MachineOpcode.
if (User->getNodeId() == -1)
@@ -2213,6 +2223,8 @@
case ISD::CopyFromReg:
case ISD::CopyToReg:
case ISD::EH_LABEL:
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END:
NodeToMatch->setNodeId(-1); // Mark selected.
return 0;
case ISD::AssertSext:
@@ -2981,7 +2993,7 @@
N->getOpcode() != ISD::INTRINSIC_WO_CHAIN &&
N->getOpcode() != ISD::INTRINSIC_VOID) {
N->printrFull(Msg, CurDAG);
- Msg << "\nIn function: " << MF->getFunction()->getName();
+ Msg << "\nIn function: " << MF->getName();
} else {
bool HasInputChain = N->getOperand(0).getValueType() == MVT::Other;
unsigned iid =
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
index 173ffac..3921635 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
@@ -14,7 +14,6 @@
#include "ScheduleDAGSDNodes.h"
#include "llvm/Constants.h"
#include "llvm/DebugInfo.h"
-#include "llvm/Function.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -50,7 +49,7 @@
template<typename EdgeIter>
static std::string getEdgeSourceLabel(const void *Node, EdgeIter I) {
- return itostr(I - SDNodeIterator::begin((SDNode *) Node));
+ return itostr(I - SDNodeIterator::begin((const SDNode *) Node));
}
/// edgeTargetsEdgeSource - This method returns true if this outgoing edge
@@ -73,7 +72,7 @@
}
static std::string getGraphName(const SelectionDAG *G) {
- return G->getMachineFunction().getFunction()->getName();
+ return G->getMachineFunction().getName();
}
static bool renderGraphFromBottomUp() {
@@ -146,7 +145,7 @@
void SelectionDAG::viewGraph(const std::string &Title) {
// This code is only for debugging!
#ifndef NDEBUG
- ViewGraph(this, "dag." + getMachineFunction().getFunction()->getName(),
+ ViewGraph(this, "dag." + getMachineFunction().getName(),
false, Title);
#else
errs() << "SelectionDAG::viewGraph is only available in debug builds on "
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 6820175..dcaa9ba 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -772,7 +772,7 @@
LegalIntReg = IntReg;
} else {
RegisterTypeForVT[IntReg] = TransformToType[IntReg] =
- (MVT::SimpleValueType)LegalIntReg;
+ (const MVT::SimpleValueType)LegalIntReg;
ValueTypeActions.setTypeAction(IVT, TypePromoteInteger);
}
}
@@ -898,7 +898,6 @@
return NULL;
}
-
EVT TargetLowering::getSetCCResultType(EVT VT) const {
assert(!VT.isVector() && "No default SetCC type for vectors!");
return PointerTy.SimpleTy;
@@ -2441,7 +2440,7 @@
if (N0 == N1) {
// The sext(setcc()) => setcc() optimization relies on the appropriate
// constant being emitted.
- uint64_t EqVal;
+ uint64_t EqVal = 0;
switch (getBooleanContents(N0.getValueType().isVector())) {
case UndefinedBooleanContent:
case ZeroOrOneBooleanContent:
diff --git a/lib/CodeGen/ShrinkWrapping.cpp b/lib/CodeGen/ShrinkWrapping.cpp
index 21ae2f5..4fbe1b3 100644
--- a/lib/CodeGen/ShrinkWrapping.cpp
+++ b/lib/CodeGen/ShrinkWrapping.cpp
@@ -159,7 +159,7 @@
// via --shrink-wrap-func=<funcname>.
#ifndef NDEBUG
if (ShrinkWrapFunc != "") {
- std::string MFName = MF->getFunction()->getName().str();
+ std::string MFName = MF->getName().str();
ShrinkWrapThisFunction = (MFName == ShrinkWrapFunc);
}
#endif
@@ -187,7 +187,7 @@
DEBUG(if (ShrinkWrapThisFunction) {
dbgs() << "Place CSR spills/restores for "
- << MF->getFunction()->getName() << "\n";
+ << MF->getName() << "\n";
});
if (calculateSets(Fn))
@@ -364,7 +364,7 @@
// If no CSRs used, we are done.
if (CSI.empty()) {
DEBUG(if (ShrinkWrapThisFunction)
- dbgs() << "DISABLED: " << Fn.getFunction()->getName()
+ dbgs() << "DISABLED: " << Fn.getName()
<< ": uses no callee-saved registers\n");
return false;
}
@@ -384,7 +384,7 @@
// implementation to functions with <= 500 MBBs.
if (Fn.size() > 500) {
DEBUG(if (ShrinkWrapThisFunction)
- dbgs() << "DISABLED: " << Fn.getFunction()->getName()
+ dbgs() << "DISABLED: " << Fn.getName()
<< ": too large (" << Fn.size() << " MBBs)\n");
ShrinkWrapThisFunction = false;
}
@@ -466,7 +466,7 @@
}
if (allCSRUsesInEntryBlock) {
- DEBUG(dbgs() << "DISABLED: " << Fn.getFunction()->getName()
+ DEBUG(dbgs() << "DISABLED: " << Fn.getName()
<< ": all CSRs used in EntryBlock\n");
ShrinkWrapThisFunction = false;
} else {
@@ -478,7 +478,7 @@
allCSRsUsedInEntryFanout = false;
}
if (allCSRsUsedInEntryFanout) {
- DEBUG(dbgs() << "DISABLED: " << Fn.getFunction()->getName()
+ DEBUG(dbgs() << "DISABLED: " << Fn.getName()
<< ": all CSRs used in imm successors of EntryBlock\n");
ShrinkWrapThisFunction = false;
}
@@ -505,7 +505,7 @@
if (dominatesExitNodes) {
CSRUsedInChokePoints |= CSRUsed[MBB];
if (CSRUsedInChokePoints == UsedCSRegs) {
- DEBUG(dbgs() << "DISABLED: " << Fn.getFunction()->getName()
+ DEBUG(dbgs() << "DISABLED: " << Fn.getName()
<< ": all CSRs used in choke point(s) at "
<< getBasicBlockName(MBB) << "\n");
ShrinkWrapThisFunction = false;
@@ -521,7 +521,7 @@
return false;
DEBUG({
- dbgs() << "ENABLED: " << Fn.getFunction()->getName();
+ dbgs() << "ENABLED: " << Fn.getName();
if (HasFastExitPath)
dbgs() << " (fast exit path)";
dbgs() << "\n";
@@ -861,7 +861,7 @@
DEBUG(if (ShrinkWrapDebugging >= BasicInfo) {
dbgs() << "-----------------------------------------------------------\n";
dbgs() << "total iterations = " << iterations << " ( "
- << Fn.getFunction()->getName()
+ << Fn.getName()
<< " " << numSRReducedThisFunc
<< " " << Fn.size()
<< " )\n";
@@ -984,7 +984,7 @@
if (isReturnBlock(SBB) || SBB->succ_size() == 0) {
if (restored != spilled) {
CSRegSet notRestored = (spilled - restored);
- DEBUG(dbgs() << MF->getFunction()->getName() << ": "
+ DEBUG(dbgs() << MF->getName() << ": "
<< stringifyCSRegSet(notRestored)
<< " spilled at " << getBasicBlockName(MBB)
<< " are never restored on path to return "
@@ -1032,7 +1032,7 @@
}
if (spilled != restored) {
CSRegSet notSpilled = (restored - spilled);
- DEBUG(dbgs() << MF->getFunction()->getName() << ": "
+ DEBUG(dbgs() << MF->getName() << ": "
<< stringifyCSRegSet(notSpilled)
<< " restored at " << getBasicBlockName(MBB)
<< " are never spilled\n");
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index 980bd74..7f46a06 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -196,53 +196,38 @@
new AllocaInst(FunctionContextTy, 0, Align, "fn_context", EntryBB->begin());
// Fill in the function context structure.
- Type *Int32Ty = Type::getInt32Ty(F.getContext());
- Value *Zero = ConstantInt::get(Int32Ty, 0);
- Value *One = ConstantInt::get(Int32Ty, 1);
- Value *Two = ConstantInt::get(Int32Ty, 2);
- Value *Three = ConstantInt::get(Int32Ty, 3);
- Value *Four = ConstantInt::get(Int32Ty, 4);
-
- Value *Idxs[2] = { Zero, 0 };
-
for (unsigned I = 0, E = LPads.size(); I != E; ++I) {
LandingPadInst *LPI = LPads[I];
IRBuilder<> Builder(LPI->getParent()->getFirstInsertionPt());
// Reference the __data field.
- Idxs[1] = Two;
- Value *FCData = Builder.CreateGEP(FuncCtx, Idxs, "__data");
+ Value *FCData = Builder.CreateConstGEP2_32(FuncCtx, 0, 2, "__data");
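+ // (CreateConstGEP2_32(P, 0, N) forms the address of field N of the struct
+ // that P points to, replacing the hand-built Idxs arrays used before.)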
// The exception values come back in context->__data[0].
- Idxs[1] = Zero;
- Value *ExceptionAddr = Builder.CreateGEP(FCData, Idxs, "exception_gep");
+ Value *ExceptionAddr = Builder.CreateConstGEP2_32(FCData, 0, 0,
+ "exception_gep");
Value *ExnVal = Builder.CreateLoad(ExceptionAddr, true, "exn_val");
- ExnVal = Builder.CreateIntToPtr(ExnVal, Type::getInt8PtrTy(F.getContext()));
+ ExnVal = Builder.CreateIntToPtr(ExnVal, Builder.getInt8PtrTy());
- Idxs[1] = One;
- Value *SelectorAddr = Builder.CreateGEP(FCData, Idxs, "exn_selector_gep");
+ Value *SelectorAddr = Builder.CreateConstGEP2_32(FCData, 0, 1,
+ "exn_selector_gep");
Value *SelVal = Builder.CreateLoad(SelectorAddr, true, "exn_selector_val");
substituteLPadValues(LPI, ExnVal, SelVal);
}
// Personality function
- Idxs[1] = Three;
+ IRBuilder<> Builder(EntryBB->getTerminator());
if (!PersonalityFn)
PersonalityFn = LPads[0]->getPersonalityFn();
- Value *PersonalityFieldPtr =
- GetElementPtrInst::Create(FuncCtx, Idxs, "pers_fn_gep",
- EntryBB->getTerminator());
- new StoreInst(PersonalityFn, PersonalityFieldPtr, true,
- EntryBB->getTerminator());
+ Value *PersonalityFieldPtr = Builder.CreateConstGEP2_32(FuncCtx, 0, 3,
+ "pers_fn_gep");
+ Builder.CreateStore(PersonalityFn, PersonalityFieldPtr, /*isVolatile=*/true);
// LSDA address
- Value *LSDA = CallInst::Create(LSDAAddrFn, "lsda_addr",
- EntryBB->getTerminator());
- Idxs[1] = Four;
- Value *LSDAFieldPtr = GetElementPtrInst::Create(FuncCtx, Idxs, "lsda_gep",
- EntryBB->getTerminator());
- new StoreInst(LSDA, LSDAFieldPtr, true, EntryBB->getTerminator());
+ Value *LSDA = Builder.CreateCall(LSDAAddrFn, "lsda_addr");
+ Value *LSDAFieldPtr = Builder.CreateConstGEP2_32(FuncCtx, 0, 4, "lsda_gep");
+ Builder.CreateStore(LSDA, LSDAFieldPtr, /*isVolatile=*/true);
return FuncCtx;
}
@@ -417,48 +402,31 @@
Value *FuncCtx =
setupFunctionContext(F, makeArrayRef(LPads.begin(), LPads.end()));
BasicBlock *EntryBB = F.begin();
- Type *Int32Ty = Type::getInt32Ty(F.getContext());
-
- Value *Idxs[2] = {
- ConstantInt::get(Int32Ty, 0), 0
- };
+ IRBuilder<> Builder(EntryBB->getTerminator());
// Get a reference to the jump buffer.
- Idxs[1] = ConstantInt::get(Int32Ty, 5);
- Value *JBufPtr = GetElementPtrInst::Create(FuncCtx, Idxs, "jbuf_gep",
- EntryBB->getTerminator());
+ Value *JBufPtr = Builder.CreateConstGEP2_32(FuncCtx, 0, 5, "jbuf_gep");
// Save the frame pointer.
- Idxs[1] = ConstantInt::get(Int32Ty, 0);
- Value *FramePtr = GetElementPtrInst::Create(JBufPtr, Idxs, "jbuf_fp_gep",
- EntryBB->getTerminator());
+ Value *FramePtr = Builder.CreateConstGEP2_32(JBufPtr, 0, 0, "jbuf_fp_gep");
- Value *Val = CallInst::Create(FrameAddrFn,
- ConstantInt::get(Int32Ty, 0),
- "fp",
- EntryBB->getTerminator());
- new StoreInst(Val, FramePtr, true, EntryBB->getTerminator());
+ Value *Val = Builder.CreateCall(FrameAddrFn, Builder.getInt32(0), "fp");
+ Builder.CreateStore(Val, FramePtr, /*isVolatile=*/true);
// Save the stack pointer.
- Idxs[1] = ConstantInt::get(Int32Ty, 2);
- Value *StackPtr = GetElementPtrInst::Create(JBufPtr, Idxs, "jbuf_sp_gep",
- EntryBB->getTerminator());
+ Value *StackPtr = Builder.CreateConstGEP2_32(JBufPtr, 0, 2, "jbuf_sp_gep");
- Val = CallInst::Create(StackAddrFn, "sp", EntryBB->getTerminator());
- new StoreInst(Val, StackPtr, true, EntryBB->getTerminator());
+ Val = Builder.CreateCall(StackAddrFn, "sp");
+ Builder.CreateStore(Val, StackPtr, /*isVolatile=*/true);
// Call the setjmp intrinsic. It fills in the rest of the jmpbuf.
- Value *SetjmpArg = CastInst::Create(Instruction::BitCast, JBufPtr,
- Type::getInt8PtrTy(F.getContext()), "",
- EntryBB->getTerminator());
- CallInst::Create(BuiltinSetjmpFn, SetjmpArg, "", EntryBB->getTerminator());
+ Value *SetjmpArg = Builder.CreateBitCast(JBufPtr, Builder.getInt8PtrTy());
+ Builder.CreateCall(BuiltinSetjmpFn, SetjmpArg);
// Store a pointer to the function context so that the back-end will know
// where to look for it.
- Value *FuncCtxArg = CastInst::Create(Instruction::BitCast, FuncCtx,
- Type::getInt8PtrTy(F.getContext()), "",
- EntryBB->getTerminator());
- CallInst::Create(FuncCtxFn, FuncCtxArg, "", EntryBB->getTerminator());
+ Value *FuncCtxArg = Builder.CreateBitCast(FuncCtx, Builder.getInt8PtrTy());
+ Builder.CreateCall(FuncCtxFn, FuncCtxArg);
// At this point, we are all set up, update the invoke instructions to mark
// their call_site values.
diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp
index c8c3fb3..c98efb4 100644
--- a/lib/CodeGen/SlotIndexes.cpp
+++ b/lib/CodeGen/SlotIndexes.cpp
@@ -143,6 +143,7 @@
}
+#ifndef NDEBUG
void SlotIndexes::dump() const {
for (IndexList::const_iterator itr = indexList.begin();
itr != indexList.end(); ++itr) {
@@ -159,6 +160,7 @@
dbgs() << "BB#" << i << "\t[" << MBBRanges[i].first << ';'
<< MBBRanges[i].second << ")\n";
}
+#endif
// Print a SlotIndex to a raw_ostream.
void SlotIndex::print(raw_ostream &os) const {
@@ -168,9 +170,11 @@
os << "invalid";
}
+#ifndef NDEBUG
// Dump a SlotIndex to stderr.
void SlotIndex::dump() const {
print(dbgs());
dbgs() << "\n";
}
+#endif
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 4a2b7ec..96151b6 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -356,6 +356,7 @@
Edit->anyRematerializable(0);
}
+#ifndef NDEBUG
void SplitEditor::dump() const {
if (RegAssign.empty()) {
dbgs() << " empty\n";
@@ -366,6 +367,7 @@
dbgs() << " [" << I.start() << ';' << I.stop() << "):" << I.value();
dbgs() << '\n';
}
+#endif
VNInfo *SplitEditor::defValue(unsigned RegIdx,
const VNInfo *ParentVNI,
diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp
new file mode 100644
index 0000000..dbfa4bb
--- /dev/null
+++ b/lib/CodeGen/StackColoring.cpp
@@ -0,0 +1,696 @@
+//===-- StackColoring.cpp -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements the stack-coloring optimization that looks for
+// lifetime marker machine instructions (LIFETIME_START and LIFETIME_END),
+// which represent the possible lifetime of stack slots. It attempts to
+// merge disjoint stack slots and reduce the used stack space.
+// NOTE: This pass is not StackSlotColoring, which optimizes spill slots.
+//
+// TODO: In the future we plan to improve stack coloring in the following ways:
+// 1. Allow merging multiple small slots into a single larger slot at different
+// offsets.
+// 2. Merge this pass with StackSlotColoring and allow merging of allocas with
+// spill slots.
+//
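+// For illustration (a sketch; the IR below is an assumption about typical
+// frontend output, not taken from this patch): at the IR level a slot's
+// lifetime is bracketed by the lifetime intrinsics:
+//
+//   %buf = alloca [512 x i8]
+//   %p = bitcast [512 x i8]* %buf to i8*
+//   call void @llvm.lifetime.start(i64 512, i8* %p)
+//   ...                                     ; %buf is in use here
+//   call void @llvm.lifetime.end(i64 512, i8* %p)
+//
+// After instruction selection these become the LIFETIME_START and
+// LIFETIME_END machine opcodes consumed below; two allocas whose bracketed
+// ranges never overlap may share a single stack slot.
+//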
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "stackcoloring"
+#include "MachineTraceMetrics.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+DisableColoring("no-stack-coloring",
+ cl::init(true), cl::Hidden,
+ cl::desc("Suppress stack coloring"));
+
+STATISTIC(NumMarkerSeen, "Number of lifetime markers found.");
+STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
+STATISTIC(StackSlotMerged, "Number of stack slots merged.");
+
+//===----------------------------------------------------------------------===//
+// StackColoring Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// StackColoring - A machine pass for merging disjoint stack allocations,
+/// marked by the LIFETIME_START and LIFETIME_END pseudo instructions.
+class StackColoring : public MachineFunctionPass {
+ MachineFrameInfo *MFI;
+ MachineFunction *MF;
+
+ /// A class representing liveness information for a single basic block.
+ /// Each bit in the BitVector represents the liveness property
+ /// for a different stack slot.
+ struct BlockLifetimeInfo {
+ /// Which slots BEGIN in this basic block.
+ BitVector Begin;
+ /// Which slots END in this basic block.
+ BitVector End;
+ /// Which slots are marked as LIVE_IN, coming into this basic block.
+ BitVector LiveIn;
+ /// Which slots are marked as LIVE_OUT, coming out of this basic block.
+ BitVector LiveOut;
+ };
+
+ /// Maps active slots (per bit) for each basic block.
+ DenseMap<MachineBasicBlock*, BlockLifetimeInfo> BlockLiveness;
+
+ /// Maps basic blocks to their serial numbers.
+ DenseMap<MachineBasicBlock*, int> BasicBlocks;
+ /// Maps serial numbers back to basic blocks.
+ SmallVector<MachineBasicBlock*, 8> BasicBlockNumbering;
+
+ /// Liveness intervals for each stack slot.
+ SmallVector<LiveInterval*, 16> Intervals;
+ /// VNInfo is used for the construction of LiveIntervals.
+ VNInfo::Allocator VNInfoAllocator;
+ /// SlotIndex analysis object.
+ SlotIndexes* Indexes;
+
+ /// The list of lifetime markers found. These markers are to be removed
+ /// once the coloring is done.
+ SmallVector<MachineInstr*, 8> Markers;
+
+ /// SlotSizeSorter - A Sort utility for arranging stack slots according
+ /// to their size.
+ struct SlotSizeSorter {
+ MachineFrameInfo *MFI;
+ SlotSizeSorter(MachineFrameInfo *mfi) : MFI(mfi) { }
+ bool operator()(int LHS, int RHS) {
+ // We use -1 to denote an uninteresting slot. Place these slots at the end.
+ if (LHS == -1) return false;
+ if (RHS == -1) return true;
+ // Sort according to size.
+ return MFI->getObjectSize(LHS) > MFI->getObjectSize(RHS);
+ }
+};
+
+public:
+ static char ID;
+ StackColoring() : MachineFunctionPass(ID) {
+ initializeStackColoringPass(*PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const;
+ bool runOnMachineFunction(MachineFunction &MF);
+
+private:
+ /// Debug.
+ void dump();
+
+ /// Removes all of the lifetime marker instructions from the function.
+ /// \returns true if any markers were removed.
+ bool removeAllMarkers();
+
+ /// Scan the machine function and find all of the lifetime markers.
+ /// Record the findings in the BEGIN and END vectors.
+ /// \returns the number of markers found.
+ unsigned collectMarkers(unsigned NumSlot);
+
+ /// Perform the dataflow calculation and calculate the lifetime for each of
+ /// the slots, based on the BEGIN/END vectors. Set the LIVE_IN and
+ /// LIVE_OUT bitvectors that represent which stack slots are live coming
+ /// into and out of blocks.
+ void calculateLocalLiveness();
+
+ /// Construct the LiveIntervals for the slots.
+ void calculateLiveIntervals(unsigned NumSlots);
+
+ /// Go over the machine function and change instructions which use stack
+ /// slots to use the joint slots.
+ void remapInstructions(DenseMap<int, int> &SlotRemap);
+
+ /// Map entries which point to other entries to their destination.
+ /// A->B->C becomes A->C.
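+ /// For example (hypothetical values), the remap {2->1, 3->2} is flattened
+ /// to {2->1, 3->1}, so every slot points directly at its final slot.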
+ void expungeSlotMap(DenseMap<int, int> &SlotRemap, unsigned NumSlots);
+};
+} // end anonymous namespace
+
+char StackColoring::ID = 0;
+char &llvm::StackColoringID = StackColoring::ID;
+
+INITIALIZE_PASS_BEGIN(StackColoring,
+ "stack-coloring", "Merge disjoint stack slots", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(StackColoring,
+ "stack-coloring", "Merge disjoint stack slots", false, false)
+
+void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<SlotIndexes>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void StackColoring::dump() {
+ for (df_iterator<MachineFunction*> FI = df_begin(MF), FE = df_end(MF);
+ FI != FE; ++FI) {
+ unsigned Num = BasicBlocks[*FI];
+ DEBUG(dbgs()<<"Inspecting block #"<<Num<<" ["<<FI->getName()<<"]\n");
+ Num = 0;
+ DEBUG(dbgs()<<"BEGIN : {");
+ for (unsigned i=0; i < BlockLiveness[*FI].Begin.size(); ++i)
+ DEBUG(dbgs()<<BlockLiveness[*FI].Begin.test(i)<<" ");
+ DEBUG(dbgs()<<"}\n");
+
+ DEBUG(dbgs()<<"END : {");
+ for (unsigned i=0; i < BlockLiveness[*FI].End.size(); ++i)
+ DEBUG(dbgs()<<BlockLiveness[*FI].End.test(i)<<" ");
+
+ DEBUG(dbgs()<<"}\n");
+
+ DEBUG(dbgs()<<"LIVE_IN: {");
+ for (unsigned i=0; i < BlockLiveness[*FI].LiveIn.size(); ++i)
+ DEBUG(dbgs()<<BlockLiveness[*FI].LiveIn.test(i)<<" ");
+
+ DEBUG(dbgs()<<"}\n");
+ DEBUG(dbgs()<<"LIVEOUT: {");
+ for (unsigned i=0; i < BlockLiveness[*FI].LiveOut.size(); ++i)
+ DEBUG(dbgs()<<BlockLiveness[*FI].LiveOut.test(i)<<" ");
+ DEBUG(dbgs()<<"}\n");
+ }
+}
+
+unsigned StackColoring::collectMarkers(unsigned NumSlot) {
+ unsigned MarkersFound = 0;
+ // Scan the function to find all lifetime markers.
+ // NOTE: We use a reverse-post-order iteration to ensure that we obtain a
+ // deterministic numbering, and because we'll need a post-order iteration
+ // later for solving the liveness dataflow problem.
+ for (df_iterator<MachineFunction*> FI = df_begin(MF), FE = df_end(MF);
+ FI != FE; ++FI) {
+
+ // Assign a serial number to this basic block.
+ BasicBlocks[*FI] = BasicBlockNumbering.size();
+ BasicBlockNumbering.push_back(*FI);
+
+ BlockLiveness[*FI].Begin.resize(NumSlot);
+ BlockLiveness[*FI].End.resize(NumSlot);
+
+ for (MachineBasicBlock::iterator BI = (*FI)->begin(), BE = (*FI)->end();
+ BI != BE; ++BI) {
+
+ if (BI->getOpcode() != TargetOpcode::LIFETIME_START &&
+ BI->getOpcode() != TargetOpcode::LIFETIME_END)
+ continue;
+
+ Markers.push_back(BI);
+
+ bool IsStart = BI->getOpcode() == TargetOpcode::LIFETIME_START;
+ MachineOperand &MI = BI->getOperand(0);
+ unsigned Slot = MI.getIndex();
+
+ MarkersFound++;
+
+ const Value *Allocation = MFI->getObjectAllocation(Slot);
+ if (Allocation) {
+ DEBUG(dbgs()<<"Found lifetime marker for allocation: "<<
+ Allocation->getName()<<"\n");
+ }
+
+ if (IsStart) {
+ BlockLiveness[*FI].Begin.set(Slot);
+ } else {
+ if (BlockLiveness[*FI].Begin.test(Slot)) {
+ // Allocas that start and end within a single block are handled
+ // specially when computing the LiveIntervals to avoid pessimizing
+ // the liveness propagation.
+ BlockLiveness[*FI].Begin.reset(Slot);
+ } else {
+ BlockLiveness[*FI].End.set(Slot);
+ }
+ }
+ }
+ }
+
+ // Update statistics.
+ NumMarkerSeen += MarkersFound;
+ return MarkersFound;
+}
+
+void StackColoring::calculateLocalLiveness() {
+ // Perform a standard reverse dataflow computation to solve for
+ // global liveness. The BEGIN set here is equivalent to KILL in the standard
+ // formulation, and END is equivalent to GEN. The result of this computation
+ // is a map from blocks to bitvectors where the bitvectors represent which
+ // allocas are live in/out of that block.
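+ // As a sketch of the update performed below (the code additionally merges
+ // the two sets and then re-applies the kills):
+ //   LiveIn(B)  |= union(LiveOut(P) : P in preds(B)) | End(B),   minus Begin(B)
+ //   LiveOut(B) |= union(LiveIn(S)  : S in succs(B)) | Begin(B), minus End(B)
+ // iterated until no block's sets change.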
+ SmallPtrSet<MachineBasicBlock*, 8> BBSet(BasicBlockNumbering.begin(),
+ BasicBlockNumbering.end());
+ unsigned NumSSMIters = 0;
+ bool changed = true;
+ while (changed) {
+ changed = false;
+ ++NumSSMIters;
+
+ SmallPtrSet<MachineBasicBlock*, 8> NextBBSet;
+
+ for (SmallVector<MachineBasicBlock*, 8>::iterator
+ PI = BasicBlockNumbering.begin(), PE = BasicBlockNumbering.end();
+ PI != PE; ++PI) {
+
+ MachineBasicBlock *BB = *PI;
+ if (!BBSet.count(BB)) continue;
+
+ BitVector LocalLiveIn;
+ BitVector LocalLiveOut;
+
+ // Forward propagation from begins to ends.
+ for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+ PE = BB->pred_end(); PI != PE; ++PI)
+ LocalLiveIn |= BlockLiveness[*PI].LiveOut;
+ LocalLiveIn |= BlockLiveness[BB].End;
+ LocalLiveIn.reset(BlockLiveness[BB].Begin);
+
+ // Reverse propagation from ends to begins.
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+ SE = BB->succ_end(); SI != SE; ++SI)
+ LocalLiveOut |= BlockLiveness[*SI].LiveIn;
+ LocalLiveOut |= BlockLiveness[BB].Begin;
+ LocalLiveOut.reset(BlockLiveness[BB].End);
+
+ LocalLiveIn |= LocalLiveOut;
+ LocalLiveOut |= LocalLiveIn;
+
+ // After adopting the live bits, we need to turn off the bits which
+ // are deactivated in this block.
+ LocalLiveOut.reset(BlockLiveness[BB].End);
+ LocalLiveIn.reset(BlockLiveness[BB].Begin);
+
+ // If we have both BEGIN and END markers in the same basic block then
+ // we know that the BEGIN marker comes after the END, because we already
+ // handle the case where the BEGIN comes before the END when collecting
+ // the markers (and building the BEGIN/END vectors).
+ // We want to enable the LIVE_IN and LIVE_OUT bits of slots that have both
+ // BEGIN and END markers, because it means that the value lives before and
+ // after this basic block.
+ BitVector LocalEndBegin = BlockLiveness[BB].End;
+ LocalEndBegin &= BlockLiveness[BB].Begin;
+ LocalLiveIn |= LocalEndBegin;
+ LocalLiveOut |= LocalEndBegin;
+
+ if (LocalLiveIn.test(BlockLiveness[BB].LiveIn)) {
+ changed = true;
+ BlockLiveness[BB].LiveIn |= LocalLiveIn;
+
+ for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+ PE = BB->pred_end(); PI != PE; ++PI)
+ NextBBSet.insert(*PI);
+ }
+
+ if (LocalLiveOut.test(BlockLiveness[BB].LiveOut)) {
+ changed = true;
+ BlockLiveness[BB].LiveOut |= LocalLiveOut;
+
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+ SE = BB->succ_end(); SI != SE; ++SI)
+ NextBBSet.insert(*SI);
+ }
+ }
+
+ BBSet = NextBBSet;
+ }// while changed.
+}
+
+void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
+ SmallVector<SlotIndex, 16> Starts;
+ SmallVector<SlotIndex, 16> Finishes;
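+ // Starts/Finishes record, per slot, the earliest LIFETIME_START index and
+ // the latest LIFETIME_END index seen within the block being scanned.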
+
+ // For each block, find which slots are active within this block
+ // and update the live intervals.
+ for (MachineFunction::iterator MBB = MF->begin(), MBBe = MF->end();
+ MBB != MBBe; ++MBB) {
+ Starts.clear();
+ Starts.resize(NumSlots);
+ Finishes.clear();
+ Finishes.resize(NumSlots);
+
+ // Create the interval for the basic blocks with lifetime markers in them.
+ for (SmallVector<MachineInstr*, 8>::iterator it = Markers.begin(),
+ e = Markers.end(); it != e; ++it) {
+ MachineInstr *MI = *it;
+ if (MI->getParent() != MBB)
+ continue;
+
+ assert((MI->getOpcode() == TargetOpcode::LIFETIME_START ||
+ MI->getOpcode() == TargetOpcode::LIFETIME_END) &&
+ "Invalid Lifetime marker");
+
+ bool IsStart = MI->getOpcode() == TargetOpcode::LIFETIME_START;
+ MachineOperand &Mo = MI->getOperand(0);
+ int Slot = Mo.getIndex();
+ assert(Slot >= 0 && "Invalid slot");
+
+ SlotIndex ThisIndex = Indexes->getInstructionIndex(MI);
+
+ if (IsStart) {
+ if (!Starts[Slot].isValid() || Starts[Slot] > ThisIndex)
+ Starts[Slot] = ThisIndex;
+ } else {
+ if (!Finishes[Slot].isValid() || Finishes[Slot] < ThisIndex)
+ Finishes[Slot] = ThisIndex;
+ }
+ }
+
+ // Create the interval of the blocks that we previously found to be 'alive'.
+ BitVector Alive = BlockLiveness[MBB].LiveIn;
+ Alive |= BlockLiveness[MBB].LiveOut;
+
+ if (Alive.any()) {
+ for (int pos = Alive.find_first(); pos != -1;
+ pos = Alive.find_next(pos)) {
+ if (!Starts[pos].isValid())
+ Starts[pos] = Indexes->getMBBStartIdx(MBB);
+ if (!Finishes[pos].isValid())
+ Finishes[pos] = Indexes->getMBBEndIdx(MBB);
+ }
+ }
+
+ for (unsigned i = 0; i < NumSlots; ++i) {
+ assert(Starts[i].isValid() == Finishes[i].isValid() && "Unmatched range");
+ if (!Starts[i].isValid())
+ continue;
+
+ assert(Starts[i] && Finishes[i] && "Invalid interval");
+ VNInfo *ValNum = Intervals[i]->getValNumInfo(0);
+ SlotIndex S = Starts[i];
+ SlotIndex F = Finishes[i];
+ if (S < F) {
+ // We have a single consecutive region.
+ Intervals[i]->addRange(LiveRange(S, F, ValNum));
+ } else {
+ // We have two non-consecutive regions. This happens when
+ // LIFETIME_START appears after the LIFETIME_END marker.
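+ // For example, the slot is live from the block start up to the end
+ // marker at F, and again from the start marker at S to the block end,
+ // which is exactly the pair of ranges added below:
+ //   [BlockStart, F) and [S, BlockEnd)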
+ SlotIndex NewStart = Indexes->getMBBStartIdx(MBB);
+ SlotIndex NewFin = Indexes->getMBBEndIdx(MBB);
+ Intervals[i]->addRange(LiveRange(NewStart, F, ValNum));
+ Intervals[i]->addRange(LiveRange(S, NewFin, ValNum));
+ }
+ }
+ }
+}
+
+bool StackColoring::removeAllMarkers() {
+ unsigned Count = 0;
+ for (unsigned i = 0; i < Markers.size(); ++i) {
+ Markers[i]->eraseFromParent();
+ Count++;
+ }
+ Markers.clear();
+
+ DEBUG(dbgs()<<"Removed "<<Count<<" markers.\n");
+ return Count;
+}
+
+void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
+ unsigned FixedInstr = 0;
+ unsigned FixedMemOp = 0;
+ unsigned FixedDbg = 0;
+ MachineModuleInfo *MMI = &MF->getMMI();
+
+ // Remap debug information that refers to stack slots.
+ MachineModuleInfo::VariableDbgInfoMapTy &VMap = MMI->getVariableDbgInfo();
+ for (MachineModuleInfo::VariableDbgInfoMapTy::iterator VI = VMap.begin(),
+ VE = VMap.end(); VI != VE; ++VI) {
+ const MDNode *Var = VI->first;
+ if (!Var) continue;
+ std::pair<unsigned, DebugLoc> &VP = VI->second;
+ if (SlotRemap.count(VP.first)) {
+ DEBUG(dbgs()<<"Remapping debug info for ["<<Var->getName()<<"].\n");
+ VP.first = SlotRemap[VP.first];
+ FixedDbg++;
+ }
+ }
+
+ // Keep a list of *allocas* which need to be remapped.
+ DenseMap<const Value*, const Value*> Allocas;
+ for (DenseMap<int, int>::iterator it = SlotRemap.begin(),
+ e = SlotRemap.end(); it != e; ++it) {
+ const Value *From = MFI->getObjectAllocation(it->first);
+ const Value *To = MFI->getObjectAllocation(it->second);
+ assert(To && From && "Invalid allocation object");
+ Allocas[From] = To;
+ }
+
+ // Remap all instructions to the new stack slots.
+ MachineFunction::iterator BB, BBE;
+ MachineBasicBlock::iterator I, IE;
+ for (BB = MF->begin(), BBE = MF->end(); BB != BBE; ++BB)
+ for (I = BB->begin(), IE = BB->end(); I != IE; ++I) {
+
+ // Skip lifetime markers. We'll remove them soon.
+ if (I->getOpcode() == TargetOpcode::LIFETIME_START ||
+ I->getOpcode() == TargetOpcode::LIFETIME_END)
+ continue;
+
+ // Update the MachineMemOperand to use the new alloca.
+ for (MachineInstr::mmo_iterator MM = I->memoperands_begin(),
+ E = I->memoperands_end(); MM != E; ++MM) {
+ MachineMemOperand *MMO = *MM;
+
+ const Value *V = MMO->getValue();
+
+ if (!V)
+ continue;
+
+ // Climb up and find the original alloca.
+ V = GetUnderlyingObject(V);
+ // If we did not find one, or if the one that we found is not in our
+ // map, then move on.
+ if (!V || !Allocas.count(V))
+ continue;
+
+ MMO->setValue(Allocas[V]);
+ FixedMemOp++;
+ }
+
+ // Update all of the machine instruction operands.
+ for (unsigned i = 0 ; i < I->getNumOperands(); ++i) {
+ MachineOperand &MO = I->getOperand(i);
+
+ if (!MO.isFI())
+ continue;
+ int FromSlot = MO.getIndex();
+
+ // Don't touch arguments.
+ if (FromSlot<0)
+ continue;
+
+ // Only look at mapped slots.
+ if (!SlotRemap.count(FromSlot))
+ continue;
+
+ // In a debug build, check that the instruction that we are modifying is
+ // inside the expected live range. If the instruction is not inside
+ // the calculated range then it means that the alloca usage moved
+ // outside of the lifetime markers.
+#ifndef NDEBUG
+ SlotIndex Index = Indexes->getInstructionIndex(I);
+ LiveInterval* Interval = Intervals[FromSlot];
+ assert(Interval->find(Index) != Interval->end() &&
+ "Found instruction usage outside of live range.");
+#endif
+
+ // Fix the machine instructions.
+ int ToSlot = SlotRemap[FromSlot];
+ MO.setIndex(ToSlot);
+ FixedInstr++;
+ }
+ }
+
+ DEBUG(dbgs()<<"Fixed "<<FixedMemOp<<" machine memory operands.\n");
+ DEBUG(dbgs()<<"Fixed "<<FixedDbg<<" debug locations.\n");
+ DEBUG(dbgs()<<"Fixed "<<FixedInstr<<" machine instructions.\n");
+}
+
+void StackColoring::expungeSlotMap(DenseMap<int, int> &SlotRemap,
+ unsigned NumSlots) {
+ // Expunge slot remap map.
+ for (unsigned i=0; i < NumSlots; ++i) {
+ // If slot i is remapped, follow the chain of remappings to its end.
+ if (SlotRemap.count(i)) {
+ int Target = SlotRemap[i];
+ // As long as our target is mapped to something else, follow it.
+ while (SlotRemap.count(Target)) {
+ Target = SlotRemap[Target];
+ SlotRemap[i] = Target;
+ }
+ }
+ }
+}
+
+bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
+ DEBUG(dbgs() << "********** Stack Coloring **********\n"
+ << "********** Function: "
+ << ((const Value*)Func.getFunction())->getName() << '\n');
+ MF = &Func;
+ MFI = MF->getFrameInfo();
+ Indexes = &getAnalysis<SlotIndexes>();
+ BlockLiveness.clear();
+ BasicBlocks.clear();
+ BasicBlockNumbering.clear();
+ Markers.clear();
+ Intervals.clear();
+ VNInfoAllocator.Reset();
+
+ unsigned NumSlots = MFI->getObjectIndexEnd();
+
+ // If there are no stack slots then there are no markers to remove.
+ if (!NumSlots)
+ return false;
+
+ SmallVector<int, 8> SortedSlots;
+
+ SortedSlots.reserve(NumSlots);
+ Intervals.reserve(NumSlots);
+
+ unsigned NumMarkers = collectMarkers(NumSlots);
+
+ unsigned TotalSize = 0;
+ DEBUG(dbgs()<<"Found "<<NumMarkers<<" markers and "<<NumSlots<<" slots\n");
+ DEBUG(dbgs()<<"Slot structure:\n");
+
+ for (int i=0; i < MFI->getObjectIndexEnd(); ++i) {
+ DEBUG(dbgs()<<"Slot #"<<i<<" - "<<MFI->getObjectSize(i)<<" bytes.\n");
+ TotalSize += MFI->getObjectSize(i);
+ }
+
+ DEBUG(dbgs()<<"Total Stack size: "<<TotalSize<<" bytes\n\n");
+
+ // Don't continue if there are not enough lifetime markers, the stack is
+ // too small, or we are told not to optimize the slots.
+ if (NumMarkers < 2 || TotalSize < 16 || DisableColoring) {
+ DEBUG(dbgs()<<"Will not try to merge slots.\n");
+ return removeAllMarkers();
+ }
+
+ for (unsigned i=0; i < NumSlots; ++i) {
+ LiveInterval *LI = new LiveInterval(i, 0);
+ Intervals.push_back(LI);
+ LI->getNextValue(Indexes->getZeroIndex(), VNInfoAllocator);
+ SortedSlots.push_back(i);
+ }
+
+ // Calculate the liveness of each block.
+ calculateLocalLiveness();
+
+ // Propagate the liveness information.
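+ // UnhandledWeight starts as the sum of all case weights; each iteration
+ // below subtracts the case it emits, so the value passed to
+ // visitBitTestCase is the combined weight of the cases not yet handled.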
+ calculateLiveIntervals(NumSlots);
+
+ // Maps old slots to new slots.
+ DenseMap<int, int> SlotRemap;
+ unsigned RemovedSlots = 0;
+ unsigned ReducedSize = 0;
+
+ // Do not bother looking at empty intervals.
+ for (unsigned I = 0; I < NumSlots; ++I) {
+ if (Intervals[SortedSlots[I]]->empty())
+ SortedSlots[I] = -1;
+ }
+
+ // This is a simple greedy algorithm for merging allocas. First, sort the
+ // slots, placing the largest slots first. Next, perform an n^2 scan and look
+ // for disjoint slots. When you find disjoint slots, merge the smaller one
+ // into the bigger one and update the live interval. Remove the small alloca
+ // and continue.
+
+ // Sort the slots according to their size. Place unused slots at the end.
+ std::sort(SortedSlots.begin(), SortedSlots.end(), SlotSizeSorter(MFI));
+
+ bool Changed = true;
+ while (Changed) {
+ Changed = false;
+ for (unsigned I = 0; I < NumSlots; ++I) {
+ if (SortedSlots[I] == -1)
+ continue;
+
+ for (unsigned J=I+1; J < NumSlots; ++J) {
+ if (SortedSlots[J] == -1)
+ continue;
+
+ int FirstSlot = SortedSlots[I];
+ int SecondSlot = SortedSlots[J];
+ LiveInterval *First = Intervals[FirstSlot];
+ LiveInterval *Second = Intervals[SecondSlot];
+ assert (!First->empty() && !Second->empty() && "Found an empty range");
+
+ // Merge disjoint slots.
+ if (!First->overlaps(*Second)) {
+ Changed = true;
+ First->MergeRangesInAsValue(*Second, First->getValNumInfo(0));
+ SlotRemap[SecondSlot] = FirstSlot;
+ SortedSlots[J] = -1;
+ DEBUG(dbgs()<<"Merging #"<<FirstSlot<<" and slots #"<<
+ SecondSlot<<" together.\n");
+ unsigned MaxAlignment = std::max(MFI->getObjectAlignment(FirstSlot),
+ MFI->getObjectAlignment(SecondSlot));
+
+ assert(MFI->getObjectSize(FirstSlot) >=
+ MFI->getObjectSize(SecondSlot) &&
+ "Merging a small object into a larger one");
+
+ RemovedSlots+=1;
+ ReducedSize += MFI->getObjectSize(SecondSlot);
+ MFI->setObjectAlignment(FirstSlot, MaxAlignment);
+ MFI->RemoveStackObject(SecondSlot);
+ }
+ }
+ }
+ }// While changed.
+
+ // Record statistics.
+ StackSpaceSaved += ReducedSize;
+ StackSlotMerged += RemovedSlots;
+ DEBUG(dbgs()<<"Merge "<<RemovedSlots<<" slots. Saved "<<
+ ReducedSize<<" bytes\n");
+
+ // Scan the entire function and update all machine operands that use frame
+ // indices to use the remapped frame index.
+ expungeSlotMap(SlotRemap, NumSlots);
+ remapInstructions(SlotRemap);
+
+ // Release the intervals.
+ for (unsigned I = 0; I < NumSlots; ++I) {
+ delete Intervals[I];
+ }
+
+ return removeAllMarkers();
+}
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index 20da36e..9d0fd0a 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "stackcoloring"
-#include "llvm/Function.h"
#include "llvm/Module.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
@@ -391,8 +390,7 @@
bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
DEBUG({
dbgs() << "********** Stack Slot Coloring **********\n"
- << "********** Function: "
- << MF.getFunction()->getName() << '\n';
+ << "********** Function: " << MF.getName() << '\n';
});
MFI = MF.getFrameInfo();
diff --git a/lib/CodeGen/StrongPHIElimination.cpp b/lib/CodeGen/StrongPHIElimination.cpp
index 5b06195..39fd600 100644
--- a/lib/CodeGen/StrongPHIElimination.cpp
+++ b/lib/CodeGen/StrongPHIElimination.cpp
@@ -404,9 +404,9 @@
}
void StrongPHIElimination::addReg(unsigned Reg) {
- if (RegNodeMap.count(Reg))
- return;
- RegNodeMap[Reg] = new (Allocator) Node(Reg);
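+ // Single hash lookup: operator[] default-constructs a null pointer on
+ // first access, so a Node is allocated only when none exists yet.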
+ Node *&N = RegNodeMap[Reg];
+ if (!N)
+ N = new (Allocator) Node(Reg);
}
StrongPHIElimination::Node*
@@ -714,8 +714,9 @@
assert(getRegColor(CopyReg) == CopyReg);
}
- if (!InsertedSrcCopyMap.count(std::make_pair(PredBB, PHIColor)))
- InsertedSrcCopyMap[std::make_pair(PredBB, PHIColor)] = CopyInstr;
+ // Insert into map if not already there.
+ InsertedSrcCopyMap.insert(std::make_pair(std::make_pair(PredBB, PHIColor),
+ CopyInstr));
}
SrcMO.setReg(CopyReg);
diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp
index ddee6b2..7e7f835 100644
--- a/lib/CodeGen/TargetInstrInfoImpl.cpp
+++ b/lib/CodeGen/TargetInstrInfoImpl.cpp
@@ -99,17 +99,8 @@
if (NewMI) {
// Create a new instruction.
- bool Reg0IsDead = HasDef ? MI->getOperand(0).isDead() : false;
MachineFunction &MF = *MI->getParent()->getParent();
- if (HasDef)
- return BuildMI(MF, MI->getDebugLoc(), MI->getDesc())
- .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead), SubReg0)
- .addReg(Reg2, getKillRegState(Reg2IsKill), SubReg2)
- .addReg(Reg1, getKillRegState(Reg1IsKill), SubReg1);
- else
- return BuildMI(MF, MI->getDebugLoc(), MI->getDesc())
- .addReg(Reg2, getKillRegState(Reg2IsKill), SubReg2)
- .addReg(Reg1, getKillRegState(Reg1IsKill), SubReg1);
+ MI = MF.CloneMachineInstr(MI);
}
if (HasDef) {
@@ -645,9 +636,16 @@
}
/// computeOperandLatency - Compute and return the latency of the given data
-/// dependent def and use when the operand indices are already known.
+/// dependent def and use when the operand indices are already known. UseMI may
+/// be NULL for an unknown use.
///
-/// FindMin may be set to get the minimum vs. expected latency.
+/// FindMin may be set to get the minimum vs. expected latency. Minimum
+/// latency is used for scheduling groups, while expected latency is for
+/// instruction cost and critical path.
+///
+/// Depending on the subtarget's itinerary properties, this may or may not need
+/// to call getOperandLatency(). For most subtargets, we don't need DefIdx or
+/// UseIdx to compute min latency.
unsigned TargetInstrInfo::
computeOperandLatency(const InstrItineraryData *ItinData,
const MachineInstr *DefMI, unsigned DefIdx,
@@ -660,7 +658,13 @@
assert(ItinData && !ItinData->isEmpty() && "computeDefOperandLatency fail");
- int OperLatency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx);
+ int OperLatency = 0;
+ if (UseMI)
+ OperLatency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx);
+ else {
+ unsigned DefClass = DefMI->getDesc().getSchedClass();
+ OperLatency = ItinData->getOperandCycle(DefClass, DefIdx);
+ }
if (OperLatency >= 0)
return OperLatency;
@@ -673,77 +677,3 @@
defaultDefLatency(ItinData->SchedModel, DefMI));
return InstrLatency;
}
-
-/// computeOperandLatency - Compute and return the latency of the given data
-/// dependent def and use. DefMI must be a valid def. UseMI may be NULL for an
-/// unknown use. Depending on the subtarget's itinerary properties, this may or
-/// may not need to call getOperandLatency().
-///
-/// FindMin may be set to get the minimum vs. expected latency. Minimum
-/// latency is used for scheduling groups, while expected latency is for
-/// instruction cost and critical path.
-///
-/// For most subtargets, we don't need DefIdx or UseIdx to compute min latency.
-/// DefMI must be a valid definition, but UseMI may be NULL for an unknown use.
-unsigned TargetInstrInfo::
-computeOperandLatency(const InstrItineraryData *ItinData,
- const TargetRegisterInfo *TRI,
- const MachineInstr *DefMI, const MachineInstr *UseMI,
- unsigned Reg, bool FindMin) const {
-
- int DefLatency = computeDefOperandLatency(this, ItinData, DefMI, FindMin);
- if (DefLatency >= 0)
- return DefLatency;
-
- assert(ItinData && !ItinData->isEmpty() && "computeDefOperandLatency fail");
-
- // Find the definition of the register in the defining instruction.
- int DefIdx = DefMI->findRegisterDefOperandIdx(Reg);
- if (DefIdx != -1) {
- const MachineOperand &MO = DefMI->getOperand(DefIdx);
- if (MO.isReg() && MO.isImplicit() &&
- DefIdx >= (int)DefMI->getDesc().getNumOperands()) {
- // This is an implicit def, getOperandLatency() won't return the correct
- // latency. e.g.
- // %D6<def>, %D7<def> = VLD1q16 %R2<kill>, 0, ..., %Q3<imp-def>
- // %Q1<def> = VMULv8i16 %Q1<kill>, %Q3<kill>, ...
- // What we want is to compute latency between def of %D6/%D7 and use of
- // %Q3 instead.
- unsigned Op2 = DefMI->findRegisterDefOperandIdx(Reg, false, true, TRI);
- if (DefMI->getOperand(Op2).isReg())
- DefIdx = Op2;
- }
- // For all uses of the register, calculate the maxmimum latency
- int OperLatency = -1;
-
- // UseMI is null, then it must be a scheduling barrier.
- if (!UseMI) {
- unsigned DefClass = DefMI->getDesc().getSchedClass();
- OperLatency = ItinData->getOperandCycle(DefClass, DefIdx);
- }
- else {
- for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = UseMI->getOperand(i);
- if (!MO.isReg() || !MO.isUse())
- continue;
- unsigned MOReg = MO.getReg();
- if (MOReg != Reg)
- continue;
-
- int UseCycle = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, i);
- OperLatency = std::max(OperLatency, UseCycle);
- }
- }
- // If we found an operand latency, we're done.
- if (OperLatency >= 0)
- return OperLatency;
- }
- // No operand latency was found.
- unsigned InstrLatency = getInstrLatency(ItinData, DefMI);
-
- // Expected latency is the max of the stage latency and itinerary props.
- if (!FindMin)
- InstrLatency = std::max(InstrLatency,
- defaultDefLatency(ItinData->SchedModel, DefMI));
- return InstrLatency;
-}
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index aa601af..bd12f92 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1202,8 +1202,7 @@
collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) {
const MCInstrDesc &MCID = MI->getDesc();
bool AnyOps = false;
- unsigned NumOps = MI->isInlineAsm() ?
- MI->getNumOperands() : MCID.getNumOperands();
+ unsigned NumOps = MI->getNumOperands();
for (unsigned SrcIdx = 0; SrcIdx < NumOps; ++SrcIdx) {
unsigned DstIdx = 0;
@@ -1373,7 +1372,7 @@
DEBUG(dbgs() << "********** REWRITING TWO-ADDR INSTRS **********\n");
DEBUG(dbgs() << "********** Function: "
- << MF->getFunction()->getName() << '\n');
+ << MF->getName() << '\n');
// This pass takes the function out of SSA form.
MRI->leaveSSA();
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 93840f0..bd10a4b 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -19,7 +19,6 @@
#define DEBUG_TYPE "regalloc"
#include "VirtRegMap.h"
#include "LiveDebugVariables.h"
-#include "llvm/Function.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -127,9 +126,11 @@
OS << '\n';
}
+#ifndef NDEBUG
void VirtRegMap::dump() const {
print(dbgs());
}
+#endif
//===----------------------------------------------------------------------===//
// VirtRegRewriter
@@ -197,11 +198,11 @@
VRM = &getAnalysis<VirtRegMap>();
DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n"
<< "********** Function: "
- << MF->getFunction()->getName() << '\n');
+ << MF->getName() << '\n');
DEBUG(VRM->dump());
// Add kill flags while we still have virtual registers.
- LIS->addKillFlags();
+ LIS->addKillFlags(VRM);
// Live-in lists on basic blocks are required for physregs.
addMBBLiveIns();
diff --git a/lib/DebugInfo/CMakeLists.txt b/lib/DebugInfo/CMakeLists.txt
index 441f1e8..1e9e509 100644
--- a/lib/DebugInfo/CMakeLists.txt
+++ b/lib/DebugInfo/CMakeLists.txt
@@ -8,5 +8,6 @@
DWARFDebugAranges.cpp
DWARFDebugInfoEntry.cpp
DWARFDebugLine.cpp
+ DWARFDebugRangeList.cpp
DWARFFormValue.cpp
)
diff --git a/lib/DebugInfo/DIContext.cpp b/lib/DebugInfo/DIContext.cpp
index e2fd55f..ead57f9 100644
--- a/lib/DebugInfo/DIContext.cpp
+++ b/lib/DebugInfo/DIContext.cpp
@@ -18,7 +18,9 @@
StringRef abbrevSection,
StringRef aRangeSection,
StringRef lineSection,
- StringRef stringSection) {
+ StringRef stringSection,
+ StringRef rangeSection) {
return new DWARFContextInMemory(isLittleEndian, infoSection, abbrevSection,
- aRangeSection, lineSection, stringSection);
+ aRangeSection, lineSection, stringSection,
+ rangeSection);
}
diff --git a/lib/DebugInfo/DWARFCompileUnit.cpp b/lib/DebugInfo/DWARFCompileUnit.cpp
index b27d57b..bdd65b7 100644
--- a/lib/DebugInfo/DWARFCompileUnit.cpp
+++ b/lib/DebugInfo/DWARFCompileUnit.cpp
@@ -63,7 +63,7 @@
Version = debug_info_data.getU16(&offset);
bool abbrevsOK = debug_info_data.getU32(&offset) == abbrevs->getOffset();
Abbrevs = abbrevs;
- AddrSize = debug_info_data.getU8 (&offset);
+ AddrSize = debug_info_data.getU8(&offset);
bool versionOK = DWARFContext::isSupportedVersion(Version);
bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
@@ -75,6 +75,15 @@
return 0;
}
+bool DWARFCompileUnit::extractRangeList(uint32_t RangeListOffset,
+ DWARFDebugRangeList &RangeList) const {
+ // Require that compile unit is extracted.
+ assert(DieArray.size() > 0);
+ DataExtractor RangesData(Context.getRangeSection(),
+ Context.isLittleEndian(), AddrSize);
+ return RangeList.extract(RangesData, &RangeListOffset);
+}
+
void DWARFCompileUnit::clear() {
Offset = 0;
Length = 0;
@@ -94,7 +103,9 @@
<< " (next CU at " << format("0x%08x", getNextCompileUnitOffset())
<< ")\n";
- getCompileUnitDIE(false)->dump(OS, this, -1U);
+ const DWARFDebugInfoEntryMinimal *CU = getCompileUnitDIE(false);
+ assert(CU && "Null Compile Unit?");
+ CU->dump(OS, this, -1U);
}
const char *DWARFCompileUnit::getCompilationDir() {
@@ -174,11 +185,11 @@
addDIE(die);
return 1;
}
- else if (depth == 0 && initial_die_array_size == 1) {
+ else if (depth == 0 && initial_die_array_size == 1)
// Don't append the CU die as we already did that
- } else {
- addDIE (die);
- }
+ ;
+ else
+ addDIE(die);
const DWARFAbbreviationDeclaration *abbrDecl =
die.getAbbreviationDeclarationPtr();
@@ -199,9 +210,9 @@
// Give a little bit of info if we encounter corrupt DWARF (our offset
// should always terminate at or before the start of the next compilation
// unit header).
- if (offset > next_cu_offset) {
- fprintf (stderr, "warning: DWARF compile unit extends beyond its bounds cu 0x%8.8x at 0x%8.8x'\n", getOffset(), offset);
- }
+ if (offset > next_cu_offset)
+ fprintf(stderr, "warning: DWARF compile unit extends beyond its"
+ "bounds cu 0x%8.8x at 0x%8.8x'\n", getOffset(), offset);
setDIERelations();
return DieArray.size();
@@ -244,12 +255,21 @@
clearDIEs(true);
}
-const DWARFDebugInfoEntryMinimal*
-DWARFCompileUnit::getFunctionDIEForAddress(int64_t address) {
+DWARFDebugInfoEntryMinimal::InlinedChain
+DWARFCompileUnit::getInlinedChainForAddress(uint64_t Address) {
+ // First, find a subprogram that contains the given address (the root
+ // of inlined chain).
extractDIEsIfNeeded(false);
+ const DWARFDebugInfoEntryMinimal *SubprogramDIE = 0;
for (size_t i = 0, n = DieArray.size(); i != n; i++) {
- if (DieArray[i].addressRangeContainsAddress(this, address))
- return &DieArray[i];
+ if (DieArray[i].isSubprogramDIE() &&
+ DieArray[i].addressRangeContainsAddress(this, Address)) {
+ SubprogramDIE = &DieArray[i];
+ break;
+ }
}
- return 0;
+ // Get inlined chain rooted at this subprogram DIE.
+ if (!SubprogramDIE)
+ return DWARFDebugInfoEntryMinimal::InlinedChain();
+ return SubprogramDIE->getInlinedChainForAddress(this, Address);
}
diff --git a/lib/DebugInfo/DWARFCompileUnit.h b/lib/DebugInfo/DWARFCompileUnit.h
index b34a596..03e2862 100644
--- a/lib/DebugInfo/DWARFCompileUnit.h
+++ b/lib/DebugInfo/DWARFCompileUnit.h
@@ -12,6 +12,7 @@
#include "DWARFDebugAbbrev.h"
#include "DWARFDebugInfoEntry.h"
+#include "DWARFDebugRangeList.h"
#include <vector>
namespace llvm {
@@ -45,6 +46,11 @@
/// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it
/// hasn't already been done. Returns the number of DIEs parsed at this call.
size_t extractDIEsIfNeeded(bool cu_die_only);
+ /// extractRangeList - extracts the range list referenced by this compile
+ /// unit from .debug_ranges section. Returns true on success.
+ /// Requires that compile unit is already extracted.
+ bool extractRangeList(uint32_t RangeListOffset,
+ DWARFDebugRangeList &RangeList) const;
void clear();
void dump(raw_ostream &OS);
uint32_t getOffset() const { return Offset; }
@@ -106,11 +112,11 @@
void buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
bool clear_dies_if_already_not_parsed);
- /// getFunctionDIEForAddress - Returns pointer to parsed subprogram DIE,
- /// address ranges of which contain the provided address,
- /// or NULL if there is no such subprogram. The pointer
- /// is valid until DWARFCompileUnit::clear() or clearDIEs() is called.
- const DWARFDebugInfoEntryMinimal *getFunctionDIEForAddress(int64_t address);
+
+ /// getInlinedChainForAddress - fetches inlined chain for a given address.
+ /// Returns empty chain if there is no subprogram containing address.
+ DWARFDebugInfoEntryMinimal::InlinedChain getInlinedChainForAddress(
+ uint64_t Address);
};
}
diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp
index 797662b..241f55e 100644
--- a/lib/DebugInfo/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARFContext.cpp
@@ -32,15 +32,17 @@
while (set.extract(arangesData, &offset))
set.dump(OS);
+ uint8_t savedAddressByteSize = 0;
OS << "\n.debug_lines contents:\n";
for (unsigned i = 0, e = getNumCompileUnits(); i != e; ++i) {
DWARFCompileUnit *cu = getCompileUnitAtIndex(i);
+ savedAddressByteSize = cu->getAddressByteSize();
unsigned stmtOffset =
cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_stmt_list,
-1U);
if (stmtOffset != -1U) {
DataExtractor lineData(getLineSection(), isLittleEndian(),
- cu->getAddressByteSize());
+ savedAddressByteSize);
DWARFDebugLine::DumpingState state(OS);
DWARFDebugLine::parseStatementTable(lineData, &stmtOffset, state);
}
@@ -54,6 +56,18 @@
OS << format("0x%8.8x: \"%s\"\n", lastOffset, s);
lastOffset = offset;
}
+
+ OS << "\n.debug_ranges contents:\n";
+ // In fact, different compile units may have different address byte
+ // sizes, but for simplicity we just use the address byte size of the last
+ // compile unit (there is no easy and fast way to associate address range
+ // list and the compile unit it describes).
+ DataExtractor rangesData(getRangeSection(), isLittleEndian(),
+ savedAddressByteSize);
+ offset = 0;
+ DWARFDebugRangeList rangeList;
+ while (rangeList.extract(rangesData, &offset))
+ rangeList.dump(OS);
}
const DWARFDebugAbbrev *DWARFContext::getDebugAbbrev() {
@@ -131,75 +145,152 @@
};
}
-DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t offset) {
+DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t Offset) {
if (CUs.empty())
parseCompileUnits();
- DWARFCompileUnit *i = std::lower_bound(CUs.begin(), CUs.end(), offset,
- OffsetComparator());
- if (i != CUs.end())
- return &*i;
+ DWARFCompileUnit *CU = std::lower_bound(CUs.begin(), CUs.end(), Offset,
+ OffsetComparator());
+ if (CU != CUs.end())
+ return &*CU;
return 0;
}
-DILineInfo DWARFContext::getLineInfoForAddress(uint64_t address,
- DILineInfoSpecifier specifier) {
+DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) {
// First, get the offset of the compile unit.
- uint32_t cuOffset = getDebugAranges()->findAddress(address);
+ uint32_t CUOffset = getDebugAranges()->findAddress(Address);
// Retrieve the compile unit.
- DWARFCompileUnit *cu = getCompileUnitForOffset(cuOffset);
- if (!cu)
+ return getCompileUnitForOffset(CUOffset);
+}
+
+static bool getFileNameForCompileUnit(
+ DWARFCompileUnit *CU, const DWARFDebugLine::LineTable *LineTable,
+ uint64_t FileIndex, bool NeedsAbsoluteFilePath, std::string &FileName) {
+ if (CU == 0 ||
+ LineTable == 0 ||
+ !LineTable->getFileNameByIndex(FileIndex, NeedsAbsoluteFilePath,
+ FileName))
+ return false;
+ if (NeedsAbsoluteFilePath && sys::path::is_relative(FileName)) {
+ // We may still need to append compilation directory of compile unit.
+ SmallString<16> AbsolutePath;
+ if (const char *CompilationDir = CU->getCompilationDir()) {
+ sys::path::append(AbsolutePath, CompilationDir);
+ }
+ sys::path::append(AbsolutePath, FileName);
+ FileName = AbsolutePath.str();
+ }
+ return true;
+}
+
+static bool getFileLineInfoForCompileUnit(
+ DWARFCompileUnit *CU, const DWARFDebugLine::LineTable *LineTable,
+ uint64_t Address, bool NeedsAbsoluteFilePath, std::string &FileName,
+ uint32_t &Line, uint32_t &Column) {
+ if (CU == 0 || LineTable == 0)
+ return false;
+ // Get the index of row we're looking for in the line table.
+ uint32_t RowIndex = LineTable->lookupAddress(Address);
+ if (RowIndex == -1U)
+ return false;
+ // Take file number and line/column from the row.
+ const DWARFDebugLine::Row &Row = LineTable->Rows[RowIndex];
+ if (!getFileNameForCompileUnit(CU, LineTable, Row.File,
+ NeedsAbsoluteFilePath, FileName))
+ return false;
+ Line = Row.Line;
+ Column = Row.Column;
+ return true;
+}
+
+DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
+ DILineInfoSpecifier Specifier) {
+ DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
+ if (!CU)
return DILineInfo();
- SmallString<16> fileName("<invalid>");
- SmallString<16> functionName("<invalid>");
- uint32_t line = 0;
- uint32_t column = 0;
- if (specifier.needs(DILineInfoSpecifier::FunctionName)) {
- const DWARFDebugInfoEntryMinimal *function_die =
- cu->getFunctionDIEForAddress(address);
- if (function_die) {
- if (const char *name = function_die->getSubprogramName(cu))
- functionName = name;
+ std::string FileName = "<invalid>";
+ std::string FunctionName = "<invalid>";
+ uint32_t Line = 0;
+ uint32_t Column = 0;
+ if (Specifier.needs(DILineInfoSpecifier::FunctionName)) {
+ // The address may correspond to an instruction in some inlined function,
+ // so we have to build the chain of inlined functions and take the
+ // name of the topmost function in it.
+ const DWARFDebugInfoEntryMinimal::InlinedChain &InlinedChain =
+ CU->getInlinedChainForAddress(Address);
+ if (InlinedChain.size() > 0) {
+ const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain[0];
+ if (const char *Name = TopFunctionDIE.getSubroutineName(CU))
+ FunctionName = Name;
}
}
- if (specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
- // Get the line table for this compile unit.
- const DWARFDebugLine::LineTable *lineTable = getLineTableForCompileUnit(cu);
- if (lineTable) {
- // Get the index of the row we're looking for in the line table.
- uint32_t rowIndex = lineTable->lookupAddress(address);
- if (rowIndex != -1U) {
- const DWARFDebugLine::Row &row = lineTable->Rows[rowIndex];
- // Take file/line info from the line table.
- const DWARFDebugLine::FileNameEntry &fileNameEntry =
- lineTable->Prologue.FileNames[row.File - 1];
- fileName = fileNameEntry.Name;
- if (specifier.needs(DILineInfoSpecifier::AbsoluteFilePath) &&
- sys::path::is_relative(fileName.str())) {
- // Append include directory of file (if it is present in line table)
- // and compilation directory of compile unit to make path absolute.
- const char *includeDir = 0;
- if (uint64_t includeDirIndex = fileNameEntry.DirIdx) {
- includeDir = lineTable->Prologue
- .IncludeDirectories[includeDirIndex - 1];
- }
- SmallString<16> absFileName;
- if (includeDir == 0 || sys::path::is_relative(includeDir)) {
- if (const char *compilationDir = cu->getCompilationDir())
- sys::path::append(absFileName, compilationDir);
- }
- if (includeDir) {
- sys::path::append(absFileName, includeDir);
- }
- sys::path::append(absFileName, fileName.str());
- fileName = absFileName;
- }
- line = row.Line;
- column = row.Column;
+ if (Specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
+ const DWARFDebugLine::LineTable *LineTable =
+ getLineTableForCompileUnit(CU);
+ const bool NeedsAbsoluteFilePath =
+ Specifier.needs(DILineInfoSpecifier::AbsoluteFilePath);
+ getFileLineInfoForCompileUnit(CU, LineTable, Address,
+ NeedsAbsoluteFilePath,
+ FileName, Line, Column);
+ }
+ return DILineInfo(StringRef(FileName), StringRef(FunctionName),
+ Line, Column);
+}
+
+DIInliningInfo DWARFContext::getInliningInfoForAddress(uint64_t Address,
+ DILineInfoSpecifier Specifier) {
+ DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
+ if (!CU)
+ return DIInliningInfo();
+
+ const DWARFDebugInfoEntryMinimal::InlinedChain &InlinedChain =
+ CU->getInlinedChainForAddress(Address);
+ if (InlinedChain.size() == 0)
+ return DIInliningInfo();
+
+ DIInliningInfo InliningInfo;
+ uint32_t CallFile = 0, CallLine = 0, CallColumn = 0;
+ const DWARFDebugLine::LineTable *LineTable = 0;
+ for (uint32_t i = 0, n = InlinedChain.size(); i != n; i++) {
+ const DWARFDebugInfoEntryMinimal &FunctionDIE = InlinedChain[i];
+ std::string FileName = "<invalid>";
+ std::string FunctionName = "<invalid>";
+ uint32_t Line = 0;
+ uint32_t Column = 0;
+ // Get function name if necessary.
+ if (Specifier.needs(DILineInfoSpecifier::FunctionName)) {
+ if (const char *Name = FunctionDIE.getSubroutineName(CU))
+ FunctionName = Name;
+ }
+ if (Specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
+ const bool NeedsAbsoluteFilePath =
+ Specifier.needs(DILineInfoSpecifier::AbsoluteFilePath);
+ if (i == 0) {
+ // For the topmost frame, initialize the line table of this
+ // compile unit and fetch file/line info from it.
+ LineTable = getLineTableForCompileUnit(CU);
+ // For the topmost routine, get file/line info from line table.
+ getFileLineInfoForCompileUnit(CU, LineTable, Address,
+ NeedsAbsoluteFilePath,
+ FileName, Line, Column);
+ } else {
+ // Otherwise, use call file, call line and call column from
+ // the previous DIE in the inlined chain.
+ getFileNameForCompileUnit(CU, LineTable, CallFile,
+ NeedsAbsoluteFilePath, FileName);
+ Line = CallLine;
+ Column = CallColumn;
+ }
+ // Get the call file/line/column of the current DIE.
+ if (i + 1 < n) {
+ FunctionDIE.getCallerFrame(CU, CallFile, CallLine, CallColumn);
}
}
+ DILineInfo Frame(StringRef(FileName), StringRef(FunctionName),
+ Line, Column);
+ InliningInfo.addFrame(Frame);
}
- return DILineInfo(fileName, functionName, line, column);
+ return InliningInfo;
}
void DWARFContextInMemory::anchor() { }
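Taken together, the two queries differ only in how much of the inlined chain they surface: getLineInfoForAddress reports just the deepest frame (InlinedChain[0]), while getInliningInfoForAddress emits one DILineInfo per frame, pulling file/line for the topmost frame from the line table and for each caller frame from the previous DIE's DW_AT_call_file/line/column. A minimal consumer sketch, assuming DIInliningInfo exposes getNumberOfFrames()/getFrame() accessors to mirror the addFrame() calls above:

#include "llvm/DebugInfo/DIContext.h"
#include "llvm/Support/raw_ostream.h"

// Hypothetical helper: print every inline frame that covers a PC.
static void printInlineStack(llvm::DIContext &Ctx, uint64_t PC) {
  llvm::DIInliningInfo Info = Ctx.getInliningInfoForAddress(PC);
  for (uint32_t i = 0, n = Info.getNumberOfFrames(); i != n; ++i) {
    llvm::DILineInfo Frame = Info.getFrame(i);
    llvm::outs() << Frame.getFunctionName() << " at " << Frame.getFileName()
                 << ":" << Frame.getLine() << "\n";
  }
}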
diff --git a/lib/DebugInfo/DWARFContext.h b/lib/DebugInfo/DWARFContext.h
index e55a27e..7633997 100644
--- a/lib/DebugInfo/DWARFContext.h
+++ b/lib/DebugInfo/DWARFContext.h
@@ -13,6 +13,7 @@
#include "DWARFCompileUnit.h"
#include "DWARFDebugAranges.h"
#include "DWARFDebugLine.h"
+#include "DWARFDebugRangeList.h"
#include "llvm/DebugInfo/DIContext.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/SmallVector.h"
@@ -53,9 +54,6 @@
return &CUs[index];
}
- /// Return the compile unit that includes an offset (relative to .debug_info).
- DWARFCompileUnit *getCompileUnitForOffset(uint32_t offset);
-
/// Get a pointer to the parsed DebugAbbrev object.
const DWARFDebugAbbrev *getDebugAbbrev();
@@ -66,8 +64,10 @@
const DWARFDebugLine::LineTable *
getLineTableForCompileUnit(DWARFCompileUnit *cu);
- virtual DILineInfo getLineInfoForAddress(uint64_t address,
- DILineInfoSpecifier specifier = DILineInfoSpecifier());
+ virtual DILineInfo getLineInfoForAddress(uint64_t Address,
+ DILineInfoSpecifier Specifier = DILineInfoSpecifier());
+ virtual DIInliningInfo getInliningInfoForAddress(uint64_t Address,
+ DILineInfoSpecifier Specifier = DILineInfoSpecifier());
bool isLittleEndian() const { return IsLittleEndian; }
@@ -76,12 +76,19 @@
virtual StringRef getARangeSection() = 0;
virtual StringRef getLineSection() = 0;
virtual StringRef getStringSection() = 0;
+ virtual StringRef getRangeSection() = 0;
static bool isSupportedVersion(unsigned version) {
return version == 2 || version == 3;
}
-};
+private:
+ /// Return the compile unit that includes an offset (relative to .debug_info).
+ DWARFCompileUnit *getCompileUnitForOffset(uint32_t Offset);
+ /// Return the compile unit which contains the instruction with the
+ /// provided address.
+ DWARFCompileUnit *getCompileUnitForAddress(uint64_t Address);
+};
/// DWARFContextInMemory is the simplest possible implementation of a
/// DWARFContext. It assumes all content is available in memory and stores
@@ -93,19 +100,22 @@
StringRef ARangeSection;
StringRef LineSection;
StringRef StringSection;
+ StringRef RangeSection;
public:
DWARFContextInMemory(bool isLittleEndian,
StringRef infoSection,
StringRef abbrevSection,
StringRef aRangeSection,
StringRef lineSection,
- StringRef stringSection)
+ StringRef stringSection,
+ StringRef rangeSection)
: DWARFContext(isLittleEndian),
InfoSection(infoSection),
AbbrevSection(abbrevSection),
ARangeSection(aRangeSection),
LineSection(lineSection),
- StringSection(stringSection)
+ StringSection(stringSection),
+ RangeSection(rangeSection)
{}
virtual StringRef getInfoSection() { return InfoSection; }
@@ -113,6 +123,7 @@
virtual StringRef getARangeSection() { return ARangeSection; }
virtual StringRef getLineSection() { return LineSection; }
virtual StringRef getStringSection() { return StringSection; }
+ virtual StringRef getRangeSection() { return RangeSection; }
};
}
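The widened constructor threads .debug_ranges through as just another StringRef. A construction sketch, where the *Section variables are assumed to have been sliced out of an object file beforehand:

// Minimal wiring sketch; the section StringRefs are assumptions, not shown.
llvm::DWARFContextInMemory Ctx(/*isLittleEndian=*/true,
                               InfoSection, AbbrevSection, ARangeSection,
                               LineSection, StringSection, RangeSection);
llvm::DILineInfo Info = Ctx.getLineInfoForAddress(PC);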
diff --git a/lib/DebugInfo/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARFDebugAranges.cpp
index ef470e5..f9a34c9 100644
--- a/lib/DebugInfo/DWARFDebugAranges.cpp
+++ b/lib/DebugInfo/DWARFDebugAranges.cpp
@@ -62,7 +62,6 @@
uint32_t offset = 0;
typedef std::vector<DWARFDebugArangeSet> SetCollection;
- typedef SetCollection::const_iterator SetCollectionIter;
SetCollection sets;
DWARFDebugArangeSet set;
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
index 429a36c..1bfd126 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.cpp
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugInfoEntry.cpp --------------------------------------------===//
+//===-- DWARFDebugInfoEntry.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -101,7 +101,7 @@
DataExtractor debug_info_data = cu->getDebugInfoExtractor();
uint64_t abbrCode = debug_info_data.getULEB128(offset_ptr);
- assert (fixed_form_sizes); // For best performance this should be specified!
+ assert(fixed_form_sizes); // For best performance this should be specified!
if (abbrCode) {
uint32_t offset = *offset_ptr;
@@ -126,6 +126,7 @@
switch (form) {
// Blocks if inlined data that have a length field and the data bytes
// inlined in the .debug_info.
+ case DW_FORM_exprloc:
case DW_FORM_block:
form_size = debug_info_data.getULEB128(&offset);
break;
@@ -150,6 +151,11 @@
form_size = cu->getAddressByteSize();
break;
+ // 0 sized form.
+ case DW_FORM_flag_present:
+ form_size = 0;
+ break;
+
// 1 byte values
case DW_FORM_data1:
case DW_FORM_flag:
@@ -173,6 +179,7 @@
// 8 byte values
case DW_FORM_data8:
case DW_FORM_ref8:
+ case DW_FORM_ref_sig8:
form_size = 8;
break;
@@ -188,6 +195,13 @@
form = debug_info_data.getULEB128(&offset);
break;
+ case DW_FORM_sec_offset:
+ if (cu->getAddressByteSize() == 4)
+ debug_info_data.getU32(offset_ptr);
+ else
+ debug_info_data.getU64(offset_ptr);
+ break;
+
default:
*offset_ptr = Offset;
return false;
@@ -249,6 +263,7 @@
switch (form) {
// Blocks if inlined data that have a length field and the data
// bytes // inlined in the .debug_info
+ case DW_FORM_exprloc:
case DW_FORM_block:
form_size = debug_info_data.getULEB128(&offset);
break;
@@ -273,6 +288,11 @@
form_size = cu_addr_size;
break;
+ // 0 byte value
+ case DW_FORM_flag_present:
+ form_size = 0;
+ break;
+
// 1 byte values
case DW_FORM_data1:
case DW_FORM_flag:
@@ -299,6 +319,7 @@
// 8 byte values
case DW_FORM_data8:
case DW_FORM_ref8:
+ case DW_FORM_ref_sig8:
form_size = 8;
break;
@@ -314,6 +335,13 @@
form_is_indirect = true;
break;
+ case DW_FORM_sec_offset:
+ if (cu->getAddressByteSize() == 4)
+ debug_info_data.getU32(offset_ptr);
+ else
+ debug_info_data.getU64(offset_ptr);
+ break;
+
default:
*offset_ptr = offset;
return false;
@@ -336,6 +364,16 @@
return false;
}
+bool DWARFDebugInfoEntryMinimal::isSubprogramDIE() const {
+ return getTag() == DW_TAG_subprogram;
+}
+
+bool DWARFDebugInfoEntryMinimal::isSubroutineDIE() const {
+ uint32_t Tag = getTag();
+ return Tag == DW_TAG_subprogram ||
+ Tag == DW_TAG_inlined_subroutine;
+}
+
uint32_t
DWARFDebugInfoEntryMinimal::getAttributeValue(const DWARFCompileUnit *cu,
const uint16_t attr,
@@ -418,24 +456,31 @@
return fail_value;
}
+bool DWARFDebugInfoEntryMinimal::getLowAndHighPC(const DWARFCompileUnit *CU,
+ uint64_t &LowPC, uint64_t &HighPC) const {
+ HighPC = -1ULL;
+ LowPC = getAttributeValueAsUnsigned(CU, DW_AT_low_pc, -1ULL);
+ if (LowPC != -1ULL)
+ HighPC = getAttributeValueAsUnsigned(CU, DW_AT_high_pc, -1ULL);
+ return (HighPC != -1ULL);
+}
+
void
-DWARFDebugInfoEntryMinimal::buildAddressRangeTable(const DWARFCompileUnit *cu,
- DWARFDebugAranges *debug_aranges)
+DWARFDebugInfoEntryMinimal::buildAddressRangeTable(const DWARFCompileUnit *CU,
+ DWARFDebugAranges *DebugAranges)
const {
if (AbbrevDecl) {
- uint16_t tag = AbbrevDecl->getTag();
- if (tag == DW_TAG_subprogram) {
- uint64_t hi_pc = -1ULL;
- uint64_t lo_pc = getAttributeValueAsUnsigned(cu, DW_AT_low_pc, -1ULL);
- if (lo_pc != -1ULL)
- hi_pc = getAttributeValueAsUnsigned(cu, DW_AT_high_pc, -1ULL);
- if (hi_pc != -1ULL)
- debug_aranges->appendRange(cu->getOffset(), lo_pc, hi_pc);
+ if (isSubprogramDIE()) {
+ uint64_t LowPC, HighPC;
+ if (getLowAndHighPC(CU, LowPC, HighPC)) {
+ DebugAranges->appendRange(CU->getOffset(), LowPC, HighPC);
+ }
+ // FIXME: try to append ranges from .debug_ranges section.
}
const DWARFDebugInfoEntryMinimal *child = getFirstChild();
while (child) {
- child->buildAddressRangeTable(cu, debug_aranges);
+ child->buildAddressRangeTable(CU, DebugAranges);
child = child->getSibling();
}
}
@@ -443,51 +488,90 @@
bool
DWARFDebugInfoEntryMinimal::addressRangeContainsAddress(
- const DWARFCompileUnit *cu, const uint64_t address) const {
- if (!isNULL() && getTag() == DW_TAG_subprogram) {
- uint64_t hi_pc = -1ULL;
- uint64_t lo_pc = getAttributeValueAsUnsigned(cu, DW_AT_low_pc, -1ULL);
- if (lo_pc != -1ULL)
- hi_pc = getAttributeValueAsUnsigned(cu, DW_AT_high_pc, -1ULL);
- if (hi_pc != -1ULL) {
- return (lo_pc <= address && address < hi_pc);
- }
+ const DWARFCompileUnit *CU, const uint64_t Address) const {
+ if (isNULL())
+ return false;
+ uint64_t LowPC, HighPC;
+ if (getLowAndHighPC(CU, LowPC, HighPC))
+ return (LowPC <= Address && Address <= HighPC);
+ // Try to get address ranges from .debug_ranges section.
+ uint32_t RangesOffset = getAttributeValueAsReference(CU, DW_AT_ranges, -1U);
+ if (RangesOffset != -1U) {
+ DWARFDebugRangeList RangeList;
+ if (CU->extractRangeList(RangesOffset, RangeList))
+ return RangeList.containsAddress(CU->getBaseAddress(), Address);
}
return false;
}
const char*
-DWARFDebugInfoEntryMinimal::getSubprogramName(
- const DWARFCompileUnit *cu) const {
- if (isNULL() || getTag() != DW_TAG_subprogram)
+DWARFDebugInfoEntryMinimal::getSubroutineName(
+ const DWARFCompileUnit *CU) const {
+ if (!isSubroutineDIE())
return 0;
// Try to get mangled name if possible.
if (const char *name =
- getAttributeValueAsString(cu, DW_AT_MIPS_linkage_name, 0))
+ getAttributeValueAsString(CU, DW_AT_MIPS_linkage_name, 0))
return name;
- if (const char *name = getAttributeValueAsString(cu, DW_AT_linkage_name, 0))
+ if (const char *name = getAttributeValueAsString(CU, DW_AT_linkage_name, 0))
return name;
- if (const char *name = getAttributeValueAsString(cu, DW_AT_name, 0))
+ if (const char *name = getAttributeValueAsString(CU, DW_AT_name, 0))
return name;
// Try to get name from specification DIE.
uint32_t spec_ref =
- getAttributeValueAsReference(cu, DW_AT_specification, -1U);
+ getAttributeValueAsReference(CU, DW_AT_specification, -1U);
if (spec_ref != -1U) {
DWARFDebugInfoEntryMinimal spec_die;
- if (spec_die.extract(cu, &spec_ref)) {
- if (const char *name = spec_die.getSubprogramName(cu))
+ if (spec_die.extract(CU, &spec_ref)) {
+ if (const char *name = spec_die.getSubroutineName(CU))
return name;
}
}
// Try to get name from abstract origin DIE.
uint32_t abs_origin_ref =
- getAttributeValueAsReference(cu, DW_AT_abstract_origin, -1U);
+ getAttributeValueAsReference(CU, DW_AT_abstract_origin, -1U);
if (abs_origin_ref != -1U) {
DWARFDebugInfoEntryMinimal abs_origin_die;
- if (abs_origin_die.extract(cu, &abs_origin_ref)) {
- if (const char *name = abs_origin_die.getSubprogramName(cu))
+ if (abs_origin_die.extract(CU, &abs_origin_ref)) {
+ if (const char *name = abs_origin_die.getSubroutineName(CU))
return name;
}
}
return 0;
}
+
+void DWARFDebugInfoEntryMinimal::getCallerFrame(
+ const DWARFCompileUnit *CU, uint32_t &CallFile, uint32_t &CallLine,
+ uint32_t &CallColumn) const {
+ CallFile = getAttributeValueAsUnsigned(CU, DW_AT_call_file, 0);
+ CallLine = getAttributeValueAsUnsigned(CU, DW_AT_call_line, 0);
+ CallColumn = getAttributeValueAsUnsigned(CU, DW_AT_call_column, 0);
+}
+
+DWARFDebugInfoEntryMinimal::InlinedChain
+DWARFDebugInfoEntryMinimal::getInlinedChainForAddress(
+ const DWARFCompileUnit *CU, const uint64_t Address) const {
+ DWARFDebugInfoEntryMinimal::InlinedChain InlinedChain;
+ if (isNULL())
+ return InlinedChain;
+ for (const DWARFDebugInfoEntryMinimal *DIE = this; DIE; ) {
+ // Append the current DIE to the inlined chain only if it has the right
+ // tag (e.g., it is not a lexical block).
+ if (DIE->isSubroutineDIE()) {
+ InlinedChain.push_back(*DIE);
+ }
+ // Try to get a child which also contains the provided address.
+ const DWARFDebugInfoEntryMinimal *Child = DIE->getFirstChild();
+ while (Child) {
+ if (Child->addressRangeContainsAddress(CU, Address)) {
+ // Assume there is only one such child.
+ break;
+ }
+ Child = Child->getSibling();
+ }
+ DIE = Child;
+ }
+ // Reverse the obtained chain to make the root of the inlined chain last.
+ std::reverse(InlinedChain.begin(), InlinedChain.end());
+ return InlinedChain;
+}
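The walk descends from the DIE it starts at, keeping only subprogram/inlined_subroutine DIEs while stepping through wrappers such as lexical blocks. A self-contained toy model of the same traversal (ToyDIE and inlinedChain are illustrative names, not LLVM API):

#include <algorithm>
#include <stdint.h>
#include <vector>

struct ToyDIE {
  bool IsSubroutine;            // subprogram or inlined_subroutine tag
  uint64_t LowPC, HighPC;       // covers [LowPC, HighPC)
  std::vector<ToyDIE> Children;
  bool contains(uint64_t A) const { return LowPC <= A && A < HighPC; }
};

std::vector<const ToyDIE*> inlinedChain(const ToyDIE &Root, uint64_t A) {
  std::vector<const ToyDIE*> Chain;
  for (const ToyDIE *DIE = &Root; DIE; ) {
    if (DIE->IsSubroutine)
      Chain.push_back(DIE);     // lexical blocks etc. are skipped
    const ToyDIE *Next = 0;
    for (size_t i = 0, n = DIE->Children.size(); i != n; ++i) {
      if (DIE->Children[i].contains(A)) {
        Next = &DIE->Children[i]; // assume at most one child matches
        break;
      }
    }
    DIE = Next;
  }
  std::reverse(Chain.begin(), Chain.end()); // deepest inline frame first
  return Chain;
}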
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.h b/lib/DebugInfo/DWARFDebugInfoEntry.h
index d5d86b9..9c1b2be 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.h
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.h
@@ -11,6 +11,7 @@
#define LLVM_DEBUGINFO_DWARFDEBUGINFOENTRY_H
#include "DWARFAbbreviationDeclaration.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/DataTypes.h"
namespace llvm {
@@ -19,6 +20,7 @@
class DWARFCompileUnit;
class DWARFContext;
class DWARFFormValue;
+class DWARFInlinedSubroutineChain;
/// DWARFDebugInfoEntryMinimal - A DIE with only the minimum required data.
class DWARFDebugInfoEntryMinimal {
@@ -52,6 +54,13 @@
uint32_t getTag() const { return AbbrevDecl ? AbbrevDecl->getTag() : 0; }
bool isNULL() const { return AbbrevDecl == 0; }
+
+ /// Returns true if DIE represents a subprogram (not inlined).
+ bool isSubprogramDIE() const;
+ /// Returns true if DIE represents a subprogram or an inlined
+ /// subroutine.
+ bool isSubroutineDIE() const;
+
uint32_t getOffset() const { return Offset; }
uint32_t getNumAttributes() const {
return !isNULL() ? AbbrevDecl->getNumAttributes() : 0;
@@ -126,17 +135,40 @@
const uint16_t attr,
int64_t fail_value) const;
- void buildAddressRangeTable(const DWARFCompileUnit *cu,
- DWARFDebugAranges *debug_aranges) const;
+ /// Retrieves DW_AT_low_pc and DW_AT_high_pc from CU.
+ /// Returns true if both attributes are present.
+ bool getLowAndHighPC(const DWARFCompileUnit *CU,
+ uint64_t &LowPC, uint64_t &HighPC) const;
- bool addressRangeContainsAddress(const DWARFCompileUnit *cu,
- const uint64_t address) const;
+ void buildAddressRangeTable(const DWARFCompileUnit *CU,
+ DWARFDebugAranges *DebugAranges) const;
- // If a DIE represents a subprogram, returns its mangled name
- // (or short name, if mangled is missing). This name may be fetched
- // from specification or abstract origin for this subprogram.
- // Returns null if no name is found.
- const char* getSubprogramName(const DWARFCompileUnit *cu) const;
+ bool addressRangeContainsAddress(const DWARFCompileUnit *CU,
+ const uint64_t Address) const;
+
+ /// If a DIE represents a subprogram (or inlined subroutine),
+ /// returns its mangled name (or short name, if mangled is missing).
+ /// This name may be fetched from specification or abstract origin
+ /// for this subprogram. Returns null if no name is found.
+ const char* getSubroutineName(const DWARFCompileUnit *CU) const;
+
+ /// Retrieves values of DW_AT_call_file, DW_AT_call_line and
+ /// DW_AT_call_column from DIE (or zeroes if they are missing).
+ void getCallerFrame(const DWARFCompileUnit *CU, uint32_t &CallFile,
+ uint32_t &CallLine, uint32_t &CallColumn) const;
+
+ /// InlinedChain - represents a chain of inlined_subroutine DIEs
+ /// (possibly ending with a subprogram DIE), all of which are contained
+ /// in some concrete inlined instance tree. The address range of each
+ /// DIE in this chain (except the last one) is contained in the address
+ /// range of the next DIE in the chain.
+ typedef SmallVector<DWARFDebugInfoEntryMinimal, 4> InlinedChain;
+
+ /// Get the inlined chain for a given address, rooted at the current DIE.
+ /// Returns an empty chain if the address is not contained in the address
+ /// range of the current DIE.
+ InlinedChain getInlinedChainForAddress(const DWARFCompileUnit *CU,
+ const uint64_t Address) const;
};
}
diff --git a/lib/DebugInfo/DWARFDebugLine.cpp b/lib/DebugInfo/DWARFDebugLine.cpp
index d99575d..267364a 100644
--- a/lib/DebugInfo/DWARFDebugLine.cpp
+++ b/lib/DebugInfo/DWARFDebugLine.cpp
@@ -10,6 +10,7 @@
#include "DWARFDebugLine.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
using namespace llvm;
@@ -513,3 +514,29 @@
}
return index;
}
+
+bool
+DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex,
+ bool NeedsAbsoluteFilePath,
+ std::string &Result) const {
+ if (FileIndex == 0 || FileIndex > Prologue.FileNames.size())
+ return false;
+ const FileNameEntry &Entry = Prologue.FileNames[FileIndex - 1];
+ const char *FileName = Entry.Name;
+ if (!NeedsAbsoluteFilePath ||
+ sys::path::is_absolute(FileName)) {
+ Result = FileName;
+ return true;
+ }
+ SmallString<16> FilePath;
+ uint64_t IncludeDirIndex = Entry.DirIdx;
+ // Be defensive about the contents of Entry.
+ if (IncludeDirIndex > 0 &&
+ IncludeDirIndex <= Prologue.IncludeDirectories.size()) {
+ const char *IncludeDir = Prologue.IncludeDirectories[IncludeDirIndex - 1];
+ sys::path::append(FilePath, IncludeDir);
+ }
+ sys::path::append(FilePath, FileName);
+ Result = FilePath.str();
+ return true;
+}
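A worked example of the index lookup, under an assumed prologue with include_directories = { "include" } and file_names = { ("a.c", dir 0), ("b.h", dir 1) }; LT is a hypothetical pointer to this LineTable:

std::string Name;
if (LT->getFileNameByIndex(2, /*NeedsAbsoluteFilePath=*/true, Name)) {
  // Name == "include/b.h". Still relative, so getFileNameForCompileUnit()
  // in DWARFContext.cpp goes on to prepend the unit's DW_AT_comp_dir.
}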
diff --git a/lib/DebugInfo/DWARFDebugLine.h b/lib/DebugInfo/DWARFDebugLine.h
index 6382b45..586dd7e 100644
--- a/lib/DebugInfo/DWARFDebugLine.h
+++ b/lib/DebugInfo/DWARFDebugLine.h
@@ -12,6 +12,7 @@
#include "llvm/Support/DataExtractor.h"
#include <map>
+#include <string>
#include <vector>
namespace llvm {
@@ -174,6 +175,13 @@
// Returns the index of the row with file/line info for a given address,
// or -1 if there is no such row.
uint32_t lookupAddress(uint64_t address) const;
+
+ // Extracts a file name by its index in the prologue's file name table.
+ // Returns true on success.
+ bool getFileNameByIndex(uint64_t FileIndex,
+ bool NeedsAbsoluteFilePath,
+ std::string &Result) const;
+
void dump(raw_ostream &OS) const;
struct Prologue Prologue;
diff --git a/lib/DebugInfo/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARFDebugRangeList.cpp
new file mode 100644
index 0000000..1806bee
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugRangeList.cpp
@@ -0,0 +1,67 @@
+//===-- DWARFDebugRangeList.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFDebugRangeList.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+void DWARFDebugRangeList::clear() {
+ Offset = -1U;
+ AddressSize = 0;
+ Entries.clear();
+}
+
+bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr) {
+ clear();
+ if (!data.isValidOffset(*offset_ptr))
+ return false;
+ AddressSize = data.getAddressSize();
+ if (AddressSize != 4 && AddressSize != 8)
+ return false;
+ Offset = *offset_ptr;
+ while (true) {
+ RangeListEntry entry;
+ uint32_t prev_offset = *offset_ptr;
+ entry.StartAddress = data.getAddress(offset_ptr);
+ entry.EndAddress = data.getAddress(offset_ptr);
+ // Check that both values were extracted correctly.
+ if (*offset_ptr != prev_offset + 2 * AddressSize) {
+ clear();
+ return false;
+ }
+ if (entry.isEndOfListEntry())
+ break;
+ Entries.push_back(entry);
+ }
+ return true;
+}
+
+void DWARFDebugRangeList::dump(raw_ostream &OS) const {
+ for (int i = 0, n = Entries.size(); i != n; ++i) {
+ const char *format_str = (AddressSize == 4
+ ? "%08x %08" PRIx64 " %08" PRIx64 "\n"
+ : "%08x %016" PRIx64 " %016" PRIx64 "\n");
+ OS << format(format_str, Offset, Entries[i].StartAddress,
+ Entries[i].EndAddress);
+ }
+ OS << format("%08x <End of list>\n", Offset);
+}
+
+bool DWARFDebugRangeList::containsAddress(uint64_t BaseAddress,
+ uint64_t Address) const {
+ for (int i = 0, n = Entries.size(); i != n; ++i) {
+ if (Entries[i].isBaseAddressSelectionEntry(AddressSize))
+ BaseAddress = Entries[i].EndAddress;
+ else if (Entries[i].containsAddress(BaseAddress, Address))
+ return true;
+ }
+ return false;
+}
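To make the base-address handling concrete: a selection entry carries the new base in its end-address slot, and every following entry is a pair of offsets relative to that base. A self-contained model with made-up values (ToyEntry stands in for RangeListEntry):

#include <cassert>
#include <stdint.h>
#include <vector>

struct ToyEntry { uint64_t Start, End; };

bool containsAddr(const std::vector<ToyEntry> &List, uint64_t Base,
                  uint64_t A) {
  for (size_t i = 0, n = List.size(); i != n; ++i) {
    if (List[i].Start == ~0ULL)    // base address selection entry
      Base = List[i].End;          // switch to the new base
    else if (Base + List[i].Start <= A && A < Base + List[i].End)
      return true;
  }
  return false;
}

int main() {
  ToyEntry Select = { ~0ULL, 0x401000 }; // base becomes 0x401000
  ToyEntry Range  = { 0x10, 0x20 };      // covers [0x401010, 0x401020)
  std::vector<ToyEntry> L;
  L.push_back(Select);
  L.push_back(Range);
  assert(containsAddr(L, /*CUBase=*/0, 0x401014)); // inside the range
  assert(!containsAddr(L, 0, 0x401020));           // end is one past the end
  return 0;
}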
diff --git a/lib/DebugInfo/DWARFDebugRangeList.h b/lib/DebugInfo/DWARFDebugRangeList.h
new file mode 100644
index 0000000..4e34a91
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugRangeList.h
@@ -0,0 +1,78 @@
+//===-- DWARFDebugRangeList.h -----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGRANGELIST_H
+#define LLVM_DEBUGINFO_DWARFDEBUGRANGELIST_H
+
+#include "llvm/Support/DataExtractor.h"
+#include <vector>
+
+namespace llvm {
+
+class raw_ostream;
+
+class DWARFDebugRangeList {
+public:
+ struct RangeListEntry {
+ // A beginning address offset. This address offset has the size of an
+ // address and is relative to the applicable base address of the
+ // compilation unit referencing this range list. It marks the beginning
+ // of an address range.
+ uint64_t StartAddress;
+ // An ending address offset. This address offset again has the size of
+ // an address and is relative to the applicable base address of the
+ // compilation unit referencing this range list. It marks the first
+ // address past the end of the address range. The ending address must
+ // be greater than or equal to the beginning address.
+ uint64_t EndAddress;
+ // The end of any given range list is marked by an end of list entry,
+ // which consists of a 0 for the beginning address offset
+ // and a 0 for the ending address offset.
+ bool isEndOfListEntry() const {
+ return (StartAddress == 0) && (EndAddress == 0);
+ }
+ // A base address selection entry consists of:
+ // 1. The value of the largest representable address offset
+ // (for example, 0xffffffff when the size of an address is 32 bits).
+ // 2. An address, which defines the appropriate base address for
+ // use in interpreting the beginning and ending address offsets of
+ // subsequent entries of the range list.
+ bool isBaseAddressSelectionEntry(uint8_t AddressSize) const {
+ assert(AddressSize == 4 || AddressSize == 8);
+ if (AddressSize == 4)
+ return StartAddress == -1U;
+ else
+ return StartAddress == -1ULL;
+ }
+ bool containsAddress(uint64_t BaseAddress, uint64_t Address) const {
+ return (BaseAddress + StartAddress <= Address) &&
+ (Address < BaseAddress + EndAddress);
+ }
+ };
+
+private:
+ // Offset in .debug_ranges section.
+ uint32_t Offset;
+ uint8_t AddressSize;
+ std::vector<RangeListEntry> Entries;
+
+public:
+ DWARFDebugRangeList() { clear(); }
+ void clear();
+ void dump(raw_ostream &OS) const;
+ bool extract(DataExtractor data, uint32_t *offset_ptr);
+ /// containsAddress - Returns true if the range list contains the given
+ /// address. It must be passed the base address of the compile unit that
+ /// references this range list.
+ bool containsAddress(uint64_t BaseAddress, uint64_t Address) const;
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_DWARFDEBUGRANGELIST_H
diff --git a/lib/DebugInfo/DWARFFormValue.cpp b/lib/DebugInfo/DWARFFormValue.cpp
index ee2a3ab..c9ecbbb 100644
--- a/lib/DebugInfo/DWARFFormValue.cpp
+++ b/lib/DebugInfo/DWARFFormValue.cpp
@@ -41,6 +41,10 @@
8, // 0x14 DW_FORM_ref8
0, // 0x15 DW_FORM_ref_udata
0, // 0x16 DW_FORM_indirect
+ 4, // 0x17 DW_FORM_sec_offset
+ 0, // 0x18 DW_FORM_exprloc
+ 0, // 0x19 DW_FORM_flag_present
+ 8, // 0x20 DW_FORM_ref_sig8
};
static const uint8_t form_sizes_addr8[] = {
@@ -67,6 +71,10 @@
8, // 0x14 DW_FORM_ref8
0, // 0x15 DW_FORM_ref_udata
0, // 0x16 DW_FORM_indirect
+ 8, // 0x17 DW_FORM_sec_offset
+ 0, // 0x18 DW_FORM_exprloc
+ 0, // 0x19 DW_FORM_flag_present
+ 8, // 0x20 DW_FORM_ref_sig8
};
const uint8_t *
@@ -93,6 +101,7 @@
case DW_FORM_ref_addr:
Value.uval = data.getUnsigned(offset_ptr, cu->getAddressByteSize());
break;
+ case DW_FORM_exprloc:
case DW_FORM_block:
Value.uval = data.getULEB128(offset_ptr);
is_block = true;
@@ -141,12 +150,24 @@
// Set the string value to also be the data for inlined cstr form
// values only so we can tell the differnence between DW_FORM_string
// and DW_FORM_strp form values
- Value.data = (uint8_t*)Value.cstr;
+ Value.data = (const uint8_t*)Value.cstr;
break;
case DW_FORM_indirect:
Form = data.getULEB128(offset_ptr);
indirect = true;
break;
+ case DW_FORM_sec_offset:
+ if (cu->getAddressByteSize() == 4)
+ Value.uval = data.getU32(offset_ptr);
+ else
+ Value.uval = data.getU64(offset_ptr);
+ break;
+ case DW_FORM_flag_present:
+ Value.uval = 1;
+ break;
+ case DW_FORM_ref_sig8:
+ Value.uval = data.getU64(offset_ptr);
+ break;
default:
return false;
}
@@ -179,6 +200,7 @@
switch (form) {
// Blocks if inlined data that have a length field and the data bytes
// inlined in the .debug_info
+ case DW_FORM_exprloc:
case DW_FORM_block: {
uint64_t size = debug_info_data.getULEB128(offset_ptr);
*offset_ptr += size;
@@ -211,6 +233,10 @@
*offset_ptr += cu->getAddressByteSize();
return true;
+ // 0 byte values - implied from the form.
+ case DW_FORM_flag_present:
+ return true;
+
// 1 byte values
case DW_FORM_data1:
case DW_FORM_flag:
@@ -234,6 +260,7 @@
// 8 byte values
case DW_FORM_data8:
case DW_FORM_ref8:
+ case DW_FORM_ref_sig8:
*offset_ptr += 8;
return true;
@@ -249,6 +276,15 @@
indirect = true;
form = debug_info_data.getULEB128(offset_ptr);
break;
+
+ // 4 for DWARF32, 8 for DWARF64.
+ case DW_FORM_sec_offset:
+ if (cu->getAddressByteSize() == 4)
+ *offset_ptr += 4;
+ else
+ *offset_ptr += 8;
+ return true;
+
default:
return false;
}
@@ -264,22 +300,26 @@
switch (Form) {
case DW_FORM_addr: OS << format("0x%016" PRIx64, uvalue); break;
+ case DW_FORM_flag_present: OS << "true"; break;
case DW_FORM_flag:
case DW_FORM_data1: OS << format("0x%02x", (uint8_t)uvalue); break;
case DW_FORM_data2: OS << format("0x%04x", (uint16_t)uvalue); break;
case DW_FORM_data4: OS << format("0x%08x", (uint32_t)uvalue); break;
+ case DW_FORM_ref_sig8:
case DW_FORM_data8: OS << format("0x%016" PRIx64, uvalue); break;
case DW_FORM_string:
OS << '"';
OS.write_escaped(getAsCString(NULL));
OS << '"';
break;
+ case DW_FORM_exprloc:
case DW_FORM_block:
case DW_FORM_block1:
case DW_FORM_block2:
case DW_FORM_block4:
if (uvalue > 0) {
switch (Form) {
+ case DW_FORM_exprloc:
case DW_FORM_block: OS << format("<0x%" PRIx64 "> ", uvalue); break;
case DW_FORM_block1: OS << format("<0x%2.2x> ", (uint8_t)uvalue); break;
case DW_FORM_block2: OS << format("<0x%4.4x> ", (uint16_t)uvalue); break;
@@ -342,6 +382,14 @@
case DW_FORM_indirect:
OS << "DW_FORM_indirect";
break;
+
+ case DW_FORM_sec_offset:
+ if (cu->getAddressByteSize() == 4)
+ OS << format("0x%08x", (uint32_t)uvalue);
+ else
+ OS << format("0x%016" PRIx64, uvalue);
+ break;
+
default:
OS << format("DW_FORM(0x%4.4x)", Form);
break;
@@ -404,6 +452,7 @@
bool DWARFFormValue::isBlockForm(uint16_t form) {
switch (form) {
+ case DW_FORM_exprloc:
case DW_FORM_block:
case DW_FORM_block1:
case DW_FORM_block2:
diff --git a/lib/DebugInfo/DWARFFormValue.h b/lib/DebugInfo/DWARFFormValue.h
index 22ac011..c5b590d 100644
--- a/lib/DebugInfo/DWARFFormValue.h
+++ b/lib/DebugInfo/DWARFFormValue.h
@@ -52,7 +52,7 @@
bool extractValue(DataExtractor data, uint32_t *offset_ptr,
const DWARFCompileUnit *cu);
bool isInlinedCStr() const {
- return Value.data != NULL && Value.data == (uint8_t*)Value.cstr;
+ return Value.data != NULL && Value.data == (const uint8_t*)Value.cstr;
}
const uint8_t *BlockData() const;
uint64_t getReference(const DWARFCompileUnit* cu) const;
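For reference, the four DWARF 4 forms wired through these files decode as follows (DW_FORM_sec_offset is 4 bytes in DWARF32 and 8 in DWARF64, which the code above approximates by the unit's address byte size):

  DW_FORM_sec_offset   (0x17)  4- or 8-byte offset into another section
  DW_FORM_exprloc      (0x18)  ULEB128 length, then that many expression bytes
  DW_FORM_flag_present (0x19)  zero bytes; the attribute's presence means true
  DW_FORM_ref_sig8     (0x20)  8-byte type-unit signature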
diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
index 4afc900..ba0aeca 100644
--- a/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -833,7 +833,7 @@
static void StoreIntToMemory(const APInt &IntVal, uint8_t *Dst,
unsigned StoreBytes) {
assert((IntVal.getBitWidth()+7)/8 >= StoreBytes && "Integer too small!");
- uint8_t *Src = (uint8_t *)IntVal.getRawData();
+ const uint8_t *Src = (const uint8_t *)IntVal.getRawData();
if (sys::isLittleEndianHost()) {
// Little-endian host - the source is ordered from LSB to MSB. Order the
diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp
index ff3a9dc..3bf6db8 100644
--- a/lib/ExecutionEngine/JIT/JITEmitter.cpp
+++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp
@@ -780,7 +780,7 @@
void JITEmitter::startFunction(MachineFunction &F) {
DEBUG(dbgs() << "JIT: Starting CodeGen of Function "
- << F.getFunction()->getName() << "\n");
+ << F.getName() << "\n");
uintptr_t ActualSize = 0;
// Set the memory writable, if it's not already
@@ -929,7 +929,7 @@
PrevDL = DebugLoc();
DEBUG(dbgs() << "JIT: Finished CodeGen of [" << (void*)FnStart
- << "] Function: " << F.getFunction()->getName()
+ << "] Function: " << F.getName()
<< ": " << (FnEnd-FnStart) << " bytes of text, "
<< Relocations.size() << " relocations\n");
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 99c65ec..fa7130514 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -113,6 +113,11 @@
}
void *MCJIT::getPointerToFunction(Function *F) {
+ // FIXME: This should really return a uint64_t since it's a pointer in the
+ // target address space, not our local address space. That's part of the
+ // ExecutionEngine interface, though. Fix that when the old JIT finally
+ // dies.
+
// FIXME: Add support for per-module compilation state
if (!isCompiled)
emitObject(M);
@@ -126,10 +131,13 @@
// FIXME: Should the Dyld be retaining module information? Probably not.
// FIXME: Should we be using the mangler for this? Probably.
+ //
+ // This is the accessor for the target address, so make sure to check the
+ // load address of the symbol, not the local address.
StringRef BaseName = F->getName();
if (BaseName[0] == '\1')
- return (void*)Dyld.getSymbolAddress(BaseName.substr(1));
- return (void*)Dyld.getSymbolAddress((TM->getMCAsmInfo()->getGlobalPrefix()
+ return (void*)Dyld.getSymbolLoadAddress(BaseName.substr(1));
+ return (void*)Dyld.getSymbolLoadAddress((TM->getMCAsmInfo()->getGlobalPrefix()
+ BaseName).str());
}
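The swap from getSymbolAddress() to getSymbolLoadAddress() matters once code is linked for a remote target: the former returns a pointer into the host-side buffer holding the emitted bytes, the latter the address the symbol will occupy in the target. A short sketch of the distinction, with Dyld and "foo" as stand-ins:

void *HostPtr   = Dyld.getSymbolAddress("foo");      // where the bytes live now
uint64_t Target = Dyld.getSymbolLoadAddress("foo");  // where they will execute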
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index a98ddc0..d47287b 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -479,6 +479,10 @@
return Dyld->getSymbolAddress(Name);
}
+uint64_t RuntimeDyld::getSymbolLoadAddress(StringRef Name) {
+ return Dyld->getSymbolLoadAddress(Name);
+}
+
void RuntimeDyld::resolveRelocations() {
Dyld->resolveRelocations();
}
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 0aea598..a1c0e40 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -36,8 +36,7 @@
typedef Elf_Rel_Impl<target_endianness, is64Bits, false> Elf_Rel;
typedef Elf_Rel_Impl<target_endianness, is64Bits, true> Elf_Rela;
- typedef typename ELFObjectFile<target_endianness, is64Bits>::
- Elf_Ehdr Elf_Ehdr;
+ typedef Elf_Ehdr_Impl<target_endianness, is64Bits> Elf_Ehdr;
typedef typename ELFDataTypeTypedefHelper<
target_endianness, is64Bits>::value_type addr_type;
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index 3d89994..d5df732 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -177,6 +177,10 @@
return true;
}
+ uint64_t getSectionLoadAddress(unsigned SectionID) {
+ return Sections[SectionID].LoadAddress;
+ }
+
uint8_t *getSectionAddress(unsigned SectionID) {
return (uint8_t*)Sections[SectionID].Address;
}
@@ -223,7 +227,10 @@
void resolveRelocationEntry(const RelocationEntry &RE, uint64_t Value);
/// \brief A object file specific relocation resolver
- /// \param Address Address to apply the relocation action
+ /// \param LocalAddress The address to apply the relocation action
+ /// \param FinalAddress If the linker prepares code for remote execution
+ /// then FinalAddress holds the remote address at which to
+ /// apply the relocation action; otherwise it is the same
+ /// as LocalAddress
/// \param Value Target symbol address to apply the relocation action
/// \param Type object file specific relocation type
/// \param Addend A constant addend used to compute the value to be stored
@@ -267,6 +274,15 @@
return getSectionAddress(Loc.first) + Loc.second;
}
+ uint64_t getSymbolLoadAddress(StringRef Name) {
+ // FIXME: Just look up as a function for now. Overly simple of course.
+ // Work in progress.
+ if (GlobalSymbolTable.find(Name) == GlobalSymbolTable.end())
+ return 0;
+ SymbolLoc Loc = GlobalSymbolTable.lookup(Name);
+ return getSectionLoadAddress(Loc.first) + Loc.second;
+ }
+
void resolveRelocations();
void reassignSectionAddress(unsigned SectionID, uint64_t Addr);
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index 7203b9a..6e37b5c 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -270,9 +270,10 @@
/// ComputeSymbolTable - Compute the symbol table data
///
- /// \param StringTable [out] - The string table data.
- /// \param StringIndexMap [out] - Map from symbol names to offsets in the
- /// string table.
+ /// \param Asm - The assembler.
+ /// \param SectionIndexMap - Maps a section to its index.
+ /// \param RevGroupMap - Maps a signature symbol to the group section.
+ /// \param NumRegularSections - Number of non-relocation sections.
void ComputeSymbolTable(MCAssembler &Asm,
const SectionIndexMapTy &SectionIndexMap,
RevGroupMapTy RevGroupMap,
diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp
index 8da2e0e..7ea0f3b 100644
--- a/lib/MC/MCAsmInfo.cpp
+++ b/lib/MC/MCAsmInfo.cpp
@@ -68,8 +68,8 @@
GlobalDirective = "\t.globl\t";
HasSetDirective = true;
HasAggressiveSymbolFolding = true;
- LCOMMDirectiveType = LCOMM::None;
COMMDirectiveAlignmentIsInBytes = true;
+ LCOMMDirectiveAlignmentType = LCOMM::NoAlignment;
HasDotTypeDotSizeDirective = true;
HasSingleParameterDotFile = true;
HasNoDeadStrip = false;
diff --git a/lib/MC/MCAsmInfoCOFF.cpp b/lib/MC/MCAsmInfoCOFF.cpp
index 678e75a..fd79193 100644
--- a/lib/MC/MCAsmInfoCOFF.cpp
+++ b/lib/MC/MCAsmInfoCOFF.cpp
@@ -19,8 +19,10 @@
MCAsmInfoCOFF::MCAsmInfoCOFF() {
GlobalPrefix = "_";
+ // MinGW 4.5 and later support .comm with log2 alignment, but .lcomm uses byte
+ // alignment.
COMMDirectiveAlignmentIsInBytes = false;
- LCOMMDirectiveType = LCOMM::ByteAlignment;
+ LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
HasDotTypeDotSizeDirective = false;
HasSingleParameterDotFile = false;
PrivateGlobalPrefix = "L"; // Prefix for private global symbols
diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp
index 8e0ac23..a0e3eba 100644
--- a/lib/MC/MCAsmInfoDarwin.cpp
+++ b/lib/MC/MCAsmInfoDarwin.cpp
@@ -32,6 +32,7 @@
AlignmentIsInBytes = false;
COMMDirectiveAlignmentIsInBytes = false;
+ LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment;
InlineAsmStart = " InlineAsm Start";
InlineAsmEnd = " InlineAsm End";
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index 373df4b..b0bc290 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -166,7 +166,7 @@
///
/// @param Symbol - The common symbol to emit.
/// @param Size - The size of the common symbol.
- /// @param Size - The alignment of the common symbol in bytes.
+ /// @param ByteAlignment - The alignment of the common symbol in bytes.
virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment);
@@ -517,13 +517,19 @@
/// @param Size - The size of the common symbol.
void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlign) {
- assert(MAI.getLCOMMDirectiveType() != LCOMM::None &&
- "Doesn't have .lcomm, can't emit it!");
OS << "\t.lcomm\t" << *Symbol << ',' << Size;
if (ByteAlign > 1) {
- assert(MAI.getLCOMMDirectiveType() == LCOMM::ByteAlignment &&
- "Alignment not supported on .lcomm!");
- OS << ',' << ByteAlign;
+ switch (MAI.getLCOMMDirectiveAlignmentType()) {
+ case LCOMM::NoAlignment:
+ llvm_unreachable("alignment not supported on .lcomm!");
+ case LCOMM::ByteAlignment:
+ OS << ',' << ByteAlign;
+ break;
+ case LCOMM::Log2Alignment:
+ assert(isPowerOf2_32(ByteAlign) && "alignment must be a power of 2");
+ OS << ',' << Log2_32(ByteAlign);
+ break;
+ }
}
EmitEOL();
}
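A worked example of the three LCOMMType cases, for a textual stream emitting EmitLocalCommonSymbol(Sym, /*Size=*/64, /*ByteAlign=*/16):

  LCOMM::ByteAlignment (COFF):    .lcomm sym,64,16
  LCOMM::Log2Alignment (Darwin):  .lcomm sym,64,4    // Log2_32(16) == 4
  LCOMM::NoAlignment:             unreachable; the AsmParser change below
                                  rejects the alignment before emission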
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index b7d2c28..926d39b 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -331,6 +331,12 @@
const MCAlignFragment &AF = cast<MCAlignFragment>(F);
unsigned Offset = Layout.getFragmentOffset(&AF);
unsigned Size = OffsetToAlignment(Offset, AF.getAlignment());
+ // If we are padding with nops, force the padding to be larger than the
+ // minimum nop size.
+ if (Size > 0 && AF.hasEmitNops()) {
+ while (Size % getBackend().getMinimumNopSize())
+ Size += AF.getAlignment();
+ }
if (Size > AF.getMaxBytesToEmit())
return 0;
return Size;
@@ -830,6 +836,7 @@
}
+#ifndef NDEBUG
void MCFragment::dump() {
raw_ostream &OS = llvm::errs();
@@ -970,6 +977,7 @@
}
OS << "]>\n";
}
+#endif
// anchors for MC*Fragment vtables
void MCDataFragment::anchor() { }
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index 4c63e434..96938f7 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -425,9 +425,11 @@
OS << '"' << getName() << '"';
}
+#ifndef NDEBUG
void MCDwarfFile::dump() const {
print(dbgs());
}
+#endif
// Utility function to write a tuple for .debug_abbrev.
static void EmitAbbrev(MCStreamer *MCOS, uint64_t Name, uint64_t Form) {
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index 0eb7fcc..b196659 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -136,10 +136,12 @@
llvm_unreachable("Invalid expression kind!");
}
+#ifndef NDEBUG
void MCExpr::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
/* *** */
@@ -197,7 +199,8 @@
case VK_ARM_GOTTPOFF: return "(gottpoff)";
case VK_ARM_TLSGD: return "(tlsgd)";
case VK_ARM_TARGET1: return "(target1)";
- case VK_PPC_TOC: return "toc";
+ case VK_PPC_TOC: return "tocbase";
+ case VK_PPC_TOC_ENTRY: return "toc";
case VK_PPC_DARWIN_HA16: return "ha16";
case VK_PPC_DARWIN_LO16: return "lo16";
case VK_PPC_GAS_HA16: return "ha";
diff --git a/lib/MC/MCInst.cpp b/lib/MC/MCInst.cpp
index 7bbfd2e..e96010b 100644
--- a/lib/MC/MCInst.cpp
+++ b/lib/MC/MCInst.cpp
@@ -32,10 +32,12 @@
OS << ">";
}
+#ifndef NDEBUG
void MCOperand::dump() const {
print(dbgs(), 0);
dbgs() << "\n";
}
+#endif
void MCInst::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
OS << "<MCInst " << getOpcode();
@@ -62,7 +64,9 @@
OS << ">";
}
+#ifndef NDEBUG
void MCInst::dump() const {
print(dbgs(), 0);
dbgs() << "\n";
}
+#endif
diff --git a/lib/MC/MCLabel.cpp b/lib/MC/MCLabel.cpp
index 9c0fc92..95d7d16 100644
--- a/lib/MC/MCLabel.cpp
+++ b/lib/MC/MCLabel.cpp
@@ -16,6 +16,8 @@
OS << '"' << getInstance() << '"';
}
+#ifndef NDEBUG
void MCLabel::dump() const {
print(dbgs());
}
+#endif
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index b75fe2c..74f6dc6 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -70,9 +70,7 @@
llvm_unreachable("macho doesn't support this directive");
}
virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
- unsigned ByteAlignment) {
- llvm_unreachable("macho doesn't support this directive");
- }
+ unsigned ByteAlignment);
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
uint64_t Size = 0, unsigned ByteAlignment = 0);
virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
@@ -325,6 +323,15 @@
SD.setCommon(Size, ByteAlignment);
}
+void MCMachOStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) {
+ // '.lcomm' is equivalent to '.zerofill'.
+ return EmitZerofill(getContext().getMachOSection("__DATA", "__bss",
+ MCSectionMachO::S_ZEROFILL,
+ 0, SectionKind::getBSS()),
+ Symbol, Size, ByteAlignment);
+}
+
void MCMachOStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {
MCSectionData &SectData = getAssembler().getOrCreateSectionData(*Section);
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index bad7cfe..21756cd 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -258,12 +258,18 @@
void MCObjectStreamer::EmitGPRel32Value(const MCExpr *Value) {
MCDataFragment *DF = getOrCreateDataFragment();
- DF->addFixup(MCFixup::Create(DF->getContents().size(),
- Value,
- FK_GPRel_4));
+ DF->addFixup(MCFixup::Create(DF->getContents().size(), Value, FK_GPRel_4));
DF->getContents().resize(DF->getContents().size() + 4, 0);
}
+// Associate a GPRel64 fixup with data and resize the data area
+void MCObjectStreamer::EmitGPRel64Value(const MCExpr *Value) {
+ MCDataFragment *DF = getOrCreateDataFragment();
+
+ DF->addFixup(MCFixup::Create(DF->getContents().size(), Value, FK_GPRel_4));
+ DF->getContents().resize(DF->getContents().size() + 8, 0);
+}
+
void MCObjectStreamer::FinishImpl() {
// Dump out the dwarf file & directory tables and line tables.
const MCSymbol *LineSectionSymbol = NULL;
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 240c10b..55ef01c 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -133,13 +133,13 @@
public:
AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
const MCAsmInfo &MAI);
- ~AsmParser();
+ virtual ~AsmParser();
virtual bool Run(bool NoInitialTextSection, bool NoFinalize = false);
- void AddDirectiveHandler(MCAsmParserExtension *Object,
- StringRef Directive,
- DirectiveHandler Handler) {
+ virtual void AddDirectiveHandler(MCAsmParserExtension *Object,
+ StringRef Directive,
+ DirectiveHandler Handler) {
DirectiveMap[Directive] = std::make_pair(Object, Handler);
}
@@ -166,7 +166,7 @@
virtual bool Error(SMLoc L, const Twine &Msg,
ArrayRef<SMRange> Ranges = ArrayRef<SMRange>());
- const AsmToken &Lex();
+ virtual const AsmToken &Lex();
bool ParseExpression(const MCExpr *&Res);
virtual bool ParseExpression(const MCExpr *&Res, SMLoc &EndLoc);
@@ -207,7 +207,7 @@
/// subsequently.
void JumpToLoc(SMLoc Loc);
- void EatToEndOfStatement();
+ virtual void EatToEndOfStatement();
bool ParseMacroArgument(MacroArgument &MA);
bool ParseMacroArguments(const Macro *M, MacroArguments &A);
@@ -215,7 +215,7 @@
/// \brief Parse up to the end of statement and a return the contents from the
/// current token until the end of the statement; the current token on exit
/// will be either the EndOfStatement or EOF.
- StringRef ParseStringToEndOfStatement();
+ virtual StringRef ParseStringToEndOfStatement();
/// \brief Parse until the end of a statement or a comma is encountered,
/// return the contents from the current token up to the end or comma.
@@ -230,7 +230,7 @@
/// ParseIdentifier - Parse an identifier or string (as a quoted identifier)
/// and set \arg Res to the identifier contents.
- bool ParseIdentifier(StringRef &Res);
+ virtual bool ParseIdentifier(StringRef &Res);
// Directive Parsing.
@@ -2280,8 +2280,13 @@
if (ParseAbsoluteExpression(Pow2Alignment))
return true;
+ LCOMM::LCOMMType LCOMM = Lexer.getMAI().getLCOMMDirectiveAlignmentType();
+ if (IsLocal && LCOMM == LCOMM::NoAlignment)
+ return Error(Pow2AlignmentLoc, "alignment not supported on this target");
+
// If this target takes alignments in bytes (not log) validate and convert.
- if (Lexer.getMAI().getAlignmentIsInBytes()) {
+ if ((!IsLocal && Lexer.getMAI().getCOMMDirectiveAlignmentIsInBytes()) ||
+ (IsLocal && LCOMM == LCOMM::ByteAlignment)) {
if (!isPowerOf2_64(Pow2Alignment))
return Error(Pow2AlignmentLoc, "alignment must be a power of 2");
Pow2Alignment = Log2_64(Pow2Alignment);
@@ -2309,13 +2314,9 @@
if (!Sym->isUndefined())
return Error(IDLoc, "invalid symbol redefinition");
- // '.lcomm' is equivalent to '.zerofill'.
// Create the Symbol as a common or local common with Size and Pow2Alignment
if (IsLocal) {
- getStreamer().EmitZerofill(Ctx.getMachOSection(
- "__DATA", "__bss", MCSectionMachO::S_ZEROFILL,
- 0, SectionKind::getBSS()),
- Sym, Size, 1 << Pow2Alignment);
+ getStreamer().EmitLocalCommonSymbol(Sym, Size, 1 << Pow2Alignment);
return false;
}
diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp
index 9316bb1..d55de1f 100644
--- a/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/lib/MC/MCParser/ELFAsmParser.cpp
@@ -203,7 +203,7 @@
StringRef Name;
if (getParser().ParseIdentifier(Name))
return TokError("expected identifier in directive");
- MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);;
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
if (getLexer().isNot(AsmToken::Comma))
return TokError("unexpected token in directive");
diff --git a/lib/MC/MCParser/MCAsmParser.cpp b/lib/MC/MCParser/MCAsmParser.cpp
index 3a825f0..93ee2dd 100644
--- a/lib/MC/MCParser/MCAsmParser.cpp
+++ b/lib/MC/MCParser/MCAsmParser.cpp
@@ -44,5 +44,7 @@
}
void MCParsedAsmOperand::dump() const {
+#ifndef NDEBUG
dbgs() << " " << *this;
+#endif
}
diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp
index 05c83f7..cbf853c 100644
--- a/lib/MC/MCSubtargetInfo.cpp
+++ b/lib/MC/MCSubtargetInfo.cpp
@@ -70,7 +70,7 @@
}
-MCSchedModel *
+const MCSchedModel *
MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
assert(ProcSchedModel && "Processor machine model not available!");
@@ -93,11 +93,11 @@
return &MCSchedModel::DefaultSchedModel;
}
assert(Found->Value && "Missing processor SchedModel value");
- return (MCSchedModel *)Found->Value;
+ return (const MCSchedModel *)Found->Value;
}
InstrItineraryData
MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const {
- MCSchedModel *SchedModel = getSchedModelForCPU(CPU);
+ const MCSchedModel *SchedModel = getSchedModelForCPU(CPU);
return InstrItineraryData(SchedModel, Stages, OperandCycles, ForwardingPaths);
}
diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp
index f7f9184..f60126b 100644
--- a/lib/MC/MCSymbol.cpp
+++ b/lib/MC/MCSymbol.cpp
@@ -76,6 +76,8 @@
OS << '"' << getName() << '"';
}
+#ifndef NDEBUG
void MCSymbol::dump() const {
print(dbgs());
}
+#endif
diff --git a/lib/MC/MCValue.cpp b/lib/MC/MCValue.cpp
index c6ea16c..a37149d 100644
--- a/lib/MC/MCValue.cpp
+++ b/lib/MC/MCValue.cpp
@@ -31,6 +31,8 @@
OS << " + " << getConstant();
}
+#ifndef NDEBUG
void MCValue::dump() const {
print(dbgs(), 0);
}
+#endif
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index 5820a22..c57b0d6 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -396,8 +396,7 @@
continue;
// Initialize the section indirect symbol base, if necessary.
- if (!IndirectSymBase.count(it->SectionData))
- IndirectSymBase[it->SectionData] = IndirectIndex;
+ IndirectSymBase.insert(std::make_pair(it->SectionData, IndirectIndex));
Asm.getOrCreateSymbolData(*it->Symbol);
}
@@ -414,8 +413,7 @@
continue;
// Initialize the section indirect symbol base, if necessary.
- if (!IndirectSymBase.count(it->SectionData))
- IndirectSymBase[it->SectionData] = IndirectIndex;
+ IndirectSymBase.insert(std::make_pair(it->SectionData, IndirectIndex));
// Set the symbol type to undefined lazy, but only on construction.
//
diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp
index 0a44e77..317a48e 100644
--- a/lib/MC/SubtargetFeature.cpp
+++ b/lib/MC/SubtargetFeature.cpp
@@ -337,9 +337,9 @@
}
/// Get scheduling itinerary of a CPU.
-void *SubtargetFeatures::getItinerary(const StringRef CPU,
- const SubtargetInfoKV *Table,
- size_t TableSize) {
+const void *SubtargetFeatures::getItinerary(const StringRef CPU,
+ const SubtargetInfoKV *Table,
+ size_t TableSize) {
assert(Table && "missing table");
#ifndef NDEBUG
for (size_t i = 1; i < TableSize; i++) {
@@ -368,11 +368,13 @@
OS << "\n";
}
+#ifndef NDEBUG
/// dump - Dump feature info.
///
void SubtargetFeatures::dump() const {
print(dbgs());
}
+#endif
/// getDefaultSubtargetFeatures - Return a string listing the features
/// associated with the target triple.
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index ed261a4..f143e6d 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -196,8 +196,10 @@
assert(value < 10U && "Invalid character in exponent");
unsignedExponent = unsignedExponent * 10 + value;
- if (unsignedExponent > 32767)
+ if (unsignedExponent > 32767) {
overflow = true;
+ break;
+ }
}
if (exponentAdjustment > 32767 || exponentAdjustment < -32768)
diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp
index 2d8ddd9..45fec36 100644
--- a/lib/Support/DynamicLibrary.cpp
+++ b/lib/Support/DynamicLibrary.cpp
@@ -160,7 +160,7 @@
// On linux we have a weird situation. The stderr/out/in symbols are both
// macros and global variables because of standards requirements. So, we
// boldly use the EXPLICIT_SYMBOL macro without checking for a #define first.
-#if defined(__linux__) && !defined(__android__)
+#if defined(__linux__) and !defined(__ANDROID__)
{
EXPLICIT_SYMBOL(stderr);
EXPLICIT_SYMBOL(stdout);
diff --git a/lib/Support/FoldingSet.cpp b/lib/Support/FoldingSet.cpp
index c6282c6..4d489a8 100644
--- a/lib/Support/FoldingSet.cpp
+++ b/lib/Support/FoldingSet.cpp
@@ -38,6 +38,14 @@
return memcmp(Data, RHS.Data, Size*sizeof(*Data)) == 0;
}
+/// Used to compare the "ordering" of two nodes as defined by the
+/// profiled bits and their ordering defined by memcmp().
+bool FoldingSetNodeIDRef::operator<(FoldingSetNodeIDRef RHS) const {
+ if (Size != RHS.Size)
+ return Size < RHS.Size;
+ return memcmp(Data, RHS.Data, Size*sizeof(*Data)) < 0;
+}
+
//===----------------------------------------------------------------------===//
// FoldingSetNodeID Implementation
@@ -152,6 +160,16 @@
return FoldingSetNodeIDRef(Bits.data(), Bits.size()) == RHS;
}
+/// Used to compare the "ordering" of two nodes as defined by the
+/// profiled bits and their ordering defined by memcmp().
+bool FoldingSetNodeID::operator<(const FoldingSetNodeID &RHS)const{
+ return *this < FoldingSetNodeIDRef(RHS.Bits.data(), RHS.Bits.size());
+}
+
+bool FoldingSetNodeID::operator<(FoldingSetNodeIDRef RHS) const {
+ return FoldingSetNodeIDRef(Bits.data(), Bits.size()) < RHS;
+}
+
/// Intern - Copy this node's data to a memory region allocated from the
/// given allocator and return a FoldingSetNodeIDRef describing the
/// interned data.
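Size-then-memcmp gives node IDs a strict weak ordering, so profiled shapes can key ordered containers. An illustrative (hypothetical) use, not taken from this patch:

#include "llvm/ADT/FoldingSet.h"
#include <map>

// Count how often each profiled shape occurs; std::map relies on the
// memcmp-based operator< defined above.
void record(const llvm::FoldingSetNodeID &ID,
            std::map<llvm::FoldingSetNodeID, unsigned> &Hist) {
  ++Hist[ID];
}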
diff --git a/lib/Support/LockFileManager.cpp b/lib/Support/LockFileManager.cpp
index 8cb9857..59bfcfc 100644
--- a/lib/Support/LockFileManager.cpp
+++ b/lib/Support/LockFileManager.cpp
@@ -49,8 +49,7 @@
}
bool LockFileManager::processStillExecuting(StringRef Hostname, int PID) {
-// getsid not supported in Android bionic library
-#if LLVM_ON_UNIX && !defined(ANDROID_TARGET_BUILD)
+#if LLVM_ON_UNIX && !defined(__ANDROID__)
char MyHostname[256];
MyHostname[255] = 0;
MyHostname[0] = 0;
diff --git a/lib/Support/SmallVector.cpp b/lib/Support/SmallVector.cpp
index a89f149..f9c0e78 100644
--- a/lib/Support/SmallVector.cpp
+++ b/lib/Support/SmallVector.cpp
@@ -16,14 +16,15 @@
/// grow_pod - This is an implementation of the grow() method which only works
/// on POD-like datatypes and is out of line to reduce code duplication.
-void SmallVectorBase::grow_pod(size_t MinSizeInBytes, size_t TSize) {
+void SmallVectorBase::grow_pod(void *FirstEl, size_t MinSizeInBytes,
+ size_t TSize) {
size_t CurSizeBytes = size_in_bytes();
size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow.
if (NewCapacityInBytes < MinSizeInBytes)
NewCapacityInBytes = MinSizeInBytes;
void *NewElts;
- if (this->isSmall()) {
+ if (BeginX == FirstEl) {
NewElts = malloc(NewCapacityInBytes);
// Copy the elements over. No need to run dtors on PODs.
@@ -37,4 +38,3 @@
this->BeginX = NewElts;
this->CapacityX = (char*)this->BeginX + NewCapacityInBytes;
}
-
diff --git a/lib/Support/StringMap.cpp b/lib/Support/StringMap.cpp
index c2fc261..9ac1f86 100644
--- a/lib/Support/StringMap.cpp
+++ b/lib/Support/StringMap.cpp
@@ -13,6 +13,7 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Compiler.h"
#include <cassert>
using namespace llvm;
@@ -69,7 +70,7 @@
while (1) {
StringMapEntryBase *BucketItem = TheTable[BucketNo];
// If we found an empty bucket, this key isn't in the table yet, return it.
- if (BucketItem == 0) {
+ if (LLVM_LIKELY(BucketItem == 0)) {
// If we found a tombstone, we want to reuse the tombstone instead of an
// empty bucket. This reduces probing.
if (FirstTombstone != -1) {
@@ -84,7 +85,7 @@
if (BucketItem == getTombstoneVal()) {
// Skip over tombstones. However, remember the first one we see.
if (FirstTombstone == -1) FirstTombstone = BucketNo;
- } else if (HashTable[BucketNo] == FullHashValue) {
+ } else if (LLVM_LIKELY(HashTable[BucketNo] == FullHashValue)) {
// If the full hash value matches, check deeply for a match. The common
// case here is that we are only looking at the buckets (for item info
// being non-null and for the full hash value) not at the items. This
@@ -123,12 +124,12 @@
while (1) {
StringMapEntryBase *BucketItem = TheTable[BucketNo];
// If we found an empty bucket, this key isn't in the table yet, return.
- if (BucketItem == 0)
+ if (LLVM_LIKELY(BucketItem == 0))
return -1;
if (BucketItem == getTombstoneVal()) {
// Ignore tombstones.
- } else if (HashTable[BucketNo] == FullHashValue) {
+ } else if (LLVM_LIKELY(HashTable[BucketNo] == FullHashValue)) {
// If the full hash value matches, check deeply for a match. The common
// case here is that we are only looking at the buckets (for item info
// being non-null and for the full hash value) not at the items. This
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index cca549d..d1dc7c8 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -95,6 +95,7 @@
case SCEI: return "scei";
case BGP: return "bgp";
case BGQ: return "bgq";
+ case Freescale: return "fsl";
}
llvm_unreachable("Invalid VendorType!");
@@ -138,7 +139,7 @@
case GNUEABI: return "gnueabi";
case EABI: return "eabi";
case MachO: return "macho";
- case ANDROIDEABI: return "androideabi";
+ case Android: return "android";
}
llvm_unreachable("Invalid EnvironmentType!");
@@ -269,6 +270,7 @@
.Case("scei", Triple::SCEI)
.Case("bgp", Triple::BGP)
.Case("bgq", Triple::BGQ)
+ .Case("fsl", Triple::Freescale)
.Default(Triple::UnknownVendor);
}
@@ -305,7 +307,7 @@
.StartsWith("gnueabi", Triple::GNUEABI)
.StartsWith("gnu", Triple::GNU)
.StartsWith("macho", Triple::MachO)
- .StartsWith("androideabi", Triple::ANDROIDEABI)
+ .StartsWith("android", Triple::Android)
.Default(Triple::UnknownEnvironment);
}
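Two separate changes above: a new Freescale vendor spelled "fsl", and the ANDROIDEABI environment generalized to Android, with StartsWith keeping the old "androideabi" suffix parsing correctly. A self-contained sketch of that prefix dispatch (the Env enum is a hypothetical stand-in for Triple::EnvironmentType):

    #include <string>

    enum class Env { Unknown, GNUEABI, GNU, Android };

    Env parseEnv(const std::string &S) {
      auto startsWith = [&](const char *P) { return S.rfind(P, 0) == 0; };
      if (startsWith("gnueabi")) return Env::GNUEABI; // longer keys first
      if (startsWith("gnu"))     return Env::GNU;
      if (startsWith("android")) return Env::Android; // matches "androideabi" too
      return Env::Unknown;
    }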
diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index 1d667ab..2f1e382 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -293,7 +293,7 @@
#endif
}
-/// PrintStackTraceOnErrorSignal - When an error signal (such as SIBABRT or
+/// PrintStackTraceOnErrorSignal - When an error signal (such as SIGABRT or
/// SIGSEGV) is delivered to the process, print a stack trace and then exit.
void llvm::sys::PrintStackTraceOnErrorSignal() {
AddSignalHandler(PrintStackTrace, 0);
@@ -305,10 +305,10 @@
exception_mask_t mask = EXC_MASK_CRASH;
- kern_return_t ret = task_set_exception_ports(self,
+ kern_return_t ret = task_set_exception_ports(self,
mask,
MACH_PORT_NULL,
- EXCEPTION_STATE_IDENTITY | MACH_EXCEPTION_CODES,
+ EXCEPTION_STATE_IDENTITY | MACH_EXCEPTION_CODES,
THREAD_STATE_NONE);
(void)ret;
}
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index fa69c2d..7cd5364 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -266,8 +266,8 @@
raw_ostream &raw_ostream::write(unsigned char C) {
// Group exceptional cases into a single branch.
- if (BUILTIN_EXPECT(OutBufCur >= OutBufEnd, false)) {
- if (BUILTIN_EXPECT(!OutBufStart, false)) {
+ if (LLVM_UNLIKELY(OutBufCur >= OutBufEnd)) {
+ if (LLVM_UNLIKELY(!OutBufStart)) {
if (BufferMode == Unbuffered) {
write_impl(reinterpret_cast<char*>(&C), 1);
return *this;
@@ -286,8 +286,8 @@
raw_ostream &raw_ostream::write(const char *Ptr, size_t Size) {
// Group exceptional cases into a single branch.
- if (BUILTIN_EXPECT(size_t(OutBufEnd - OutBufCur) < Size, false)) {
- if (BUILTIN_EXPECT(!OutBufStart, false)) {
+ if (LLVM_UNLIKELY(size_t(OutBufEnd - OutBufCur) < Size)) {
+ if (LLVM_UNLIKELY(!OutBufStart)) {
if (BufferMode == Unbuffered) {
write_impl(Ptr, Size);
return *this;
@@ -302,7 +302,7 @@
// If the buffer is empty at this point we have a string that is larger
// than the buffer. Directly write the chunk that is a multiple of the
// preferred buffer size and put the remainder in the buffer.
- if (BUILTIN_EXPECT(OutBufCur == OutBufStart, false)) {
+ if (LLVM_UNLIKELY(OutBufCur == OutBufStart)) {
size_t BytesToWrite = Size - (Size % NumBytes);
write_impl(Ptr, BytesToWrite);
copy_to_buffer(Ptr + BytesToWrite, Size - BytesToWrite);
@@ -523,7 +523,7 @@
ssize_t ret;
// Check whether we should attempt to use atomic writes.
- if (BUILTIN_EXPECT(!UseAtomicWrites, true)) {
+ if (LLVM_LIKELY(!UseAtomicWrites)) {
ret = ::write(FD, Ptr, Size);
} else {
// Use ::writev() where available.
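The raw_ostream edits are a mechanical move from BUILTIN_EXPECT(x, false) to the LLVM_UNLIKELY spelling; the code they annotate is a classic buffered writer whose hot path is a copy plus pointer bump. A stand-alone sketch under that assumption, with stdout standing in for write_impl:

    #include <cstddef>
    #include <cstdio>
    #include <cstring>

    struct BufOut {
      char Buf[4096];
      char *Cur = Buf, *End = Buf + sizeof(Buf);

      void flush() { std::fwrite(Buf, 1, Cur - Buf, stdout); Cur = Buf; }

      void write(const char *Ptr, std::size_t Size) {
        if (std::size_t(End - Cur) < Size) {   // rare path: LLVM_UNLIKELY's slot
          flush();
          if (Size >= sizeof(Buf)) {           // larger than the whole buffer:
            std::fwrite(Ptr, 1, Size, stdout); // write through directly
            return;
          }
        }
        std::memcpy(Cur, Ptr, Size);           // hot path: buffer the bytes
        Cur += Size;
      }
    };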
diff --git a/lib/Support/regexec.c b/lib/Support/regexec.c
index 0078616..bd5e72d 100644
--- a/lib/Support/regexec.c
+++ b/lib/Support/regexec.c
@@ -69,7 +69,7 @@
#define SETUP(v) ((v) = 0)
#define onestate long
#define INIT(o, n) ((o) = (unsigned long)1 << (n))
-#define INC(o) ((o) <<= 1)
+#define INC(o) ((o) = (unsigned long)(o) << 1)
#define ISSTATEIN(v, o) (((v) & (o)) != 0)
/* some abbreviations; note that some of these know variable names! */
/* do "if I'm here, I can also be there" etc without branches */
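The INC change above is an undefined-behavior fix rather than a functional one: onestate is a signed long, and left-shifting a set bit into (or past) the sign bit of a signed integer is undefined. Performing the shift on unsigned long is fully defined, and the conversion back on assignment is merely implementation-defined. Minimal illustration:

    long inc_state(long o) {
      // was: o <<= 1;   // undefined once the set bit reaches the sign bit
      return (long)((unsigned long)o << 1); // unsigned shift: well defined
    }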
diff --git a/lib/TableGen/Error.cpp b/lib/TableGen/Error.cpp
index 1463b68..5dd688cb 100644
--- a/lib/TableGen/Error.cpp
+++ b/lib/TableGen/Error.cpp
@@ -20,8 +20,19 @@
SourceMgr SrcMgr;
-void PrintWarning(SMLoc WarningLoc, const Twine &Msg) {
- SrcMgr.PrintMessage(WarningLoc, SourceMgr::DK_Warning, Msg);
+static void PrintMessage(ArrayRef<SMLoc> Loc, SourceMgr::DiagKind Kind,
+ const Twine &Msg) {
+ SMLoc NullLoc;
+ if (Loc.empty())
+ Loc = NullLoc;
+ SrcMgr.PrintMessage(Loc.front(), Kind, Msg);
+ for (unsigned i = 1; i < Loc.size(); ++i)
+ SrcMgr.PrintMessage(Loc[i], SourceMgr::DK_Note,
+ "instantiated from multiclass");
+}
+
+void PrintWarning(ArrayRef<SMLoc> WarningLoc, const Twine &Msg) {
+ PrintMessage(WarningLoc, SourceMgr::DK_Warning, Msg);
}
void PrintWarning(const char *Loc, const Twine &Msg) {
@@ -36,8 +47,8 @@
PrintWarning(Warning.getLoc(), Warning.getMessage());
}
-void PrintError(SMLoc ErrorLoc, const Twine &Msg) {
- SrcMgr.PrintMessage(ErrorLoc, SourceMgr::DK_Error, Msg);
+void PrintError(ArrayRef<SMLoc> ErrorLoc, const Twine &Msg) {
+ PrintMessage(ErrorLoc, SourceMgr::DK_Error, Msg);
}
void PrintError(const char *Loc, const Twine &Msg) {
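The new PrintMessage helper treats the first SMLoc as the diagnostic site and each subsequent one as an "instantiated from multiclass" note, so nested defm expansions print as a trail back to the user's code. A stand-alone analogue (Loc and printDiag are hypothetical names):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct Loc { const char *File; int Line; };

    void printDiag(const std::vector<Loc> &Trail, const char *Msg) {
      if (Trail.empty()) return;
      std::printf("%s:%d: error: %s\n", Trail[0].File, Trail[0].Line, Msg);
      for (std::size_t i = 1; i < Trail.size(); ++i) // the instantiation chain
        std::printf("%s:%d: note: instantiated from multiclass\n",
                    Trail[i].File, Trail[i].Line);
    }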
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index 99fdc1f..b2a7b62 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@@ -112,7 +112,10 @@
}
Init *BitRecTy::convertValue(TypedInit *VI) {
- if (dynamic_cast<BitRecTy*>(VI->getType()))
+ RecTy *Ty = VI->getType();
+ if (dynamic_cast<BitRecTy*>(Ty) ||
+ dynamic_cast<BitsRecTy*>(Ty) ||
+ dynamic_cast<IntRecTy*>(Ty))
return VI; // Accept variable if it is already of bit type!
return 0;
}
@@ -178,60 +181,15 @@
}
Init *BitsRecTy::convertValue(TypedInit *VI) {
- if (BitsRecTy *BRT = dynamic_cast<BitsRecTy*>(VI->getType()))
- if (BRT->Size == Size) {
- SmallVector<Init *, 16> NewBits(Size);
-
- for (unsigned i = 0; i != Size; ++i)
- NewBits[i] = VarBitInit::get(VI, i);
- return BitsInit::get(NewBits);
- }
-
if (Size == 1 && dynamic_cast<BitRecTy*>(VI->getType()))
return BitsInit::get(VI);
- if (TernOpInit *Tern = dynamic_cast<TernOpInit*>(VI)) {
- if (Tern->getOpcode() == TernOpInit::IF) {
- Init *LHS = Tern->getLHS();
- Init *MHS = Tern->getMHS();
- Init *RHS = Tern->getRHS();
+ if (VI->getType()->typeIsConvertibleTo(this)) {
+ SmallVector<Init *, 16> NewBits(Size);
- IntInit *MHSi = dynamic_cast<IntInit*>(MHS);
- IntInit *RHSi = dynamic_cast<IntInit*>(RHS);
-
- if (MHSi && RHSi) {
- int64_t MHSVal = MHSi->getValue();
- int64_t RHSVal = RHSi->getValue();
-
- if (canFitInBitfield(MHSVal, Size) && canFitInBitfield(RHSVal, Size)) {
- SmallVector<Init *, 16> NewBits(Size);
-
- for (unsigned i = 0; i != Size; ++i)
- NewBits[i] =
- TernOpInit::get(TernOpInit::IF, LHS,
- IntInit::get((MHSVal & (1LL << i)) ? 1 : 0),
- IntInit::get((RHSVal & (1LL << i)) ? 1 : 0),
- VI->getType());
-
- return BitsInit::get(NewBits);
- }
- } else {
- BitsInit *MHSbs = dynamic_cast<BitsInit*>(MHS);
- BitsInit *RHSbs = dynamic_cast<BitsInit*>(RHS);
-
- if (MHSbs && RHSbs) {
- SmallVector<Init *, 16> NewBits(Size);
-
- for (unsigned i = 0; i != Size; ++i)
- NewBits[i] = TernOpInit::get(TernOpInit::IF, LHS,
- MHSbs->getBit(i),
- RHSbs->getBit(i),
- VI->getType());
-
- return BitsInit::get(NewBits);
- }
- }
- }
+ for (unsigned i = 0; i != Size; ++i)
+ NewBits[i] = VarBitInit::get(VI, i);
+ return BitsInit::get(NewBits);
}
return 0;
@@ -519,6 +477,15 @@
return Result + " }";
}
+// Fix the bit initializer to preserve the behavior that a bit reference from
+// an unset bits initializer resolves into VarBitInit, keeping the field name
+// and bit number used in targets with a fixed insn length.
+static Init *fixBitInit(const RecordVal *RV, Init *Before, Init *After) {
+ if (RV || After != UnsetInit::get())
+ return After;
+ return Before;
+}
+
// resolveReferences - If there are any field references that refer to fields
// that have been filled in, we can propagate the values now.
//
@@ -526,16 +493,39 @@
bool Changed = false;
SmallVector<Init *, 16> NewBits(getNumBits());
- for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
- Init *B;
- Init *CurBit = getBit(i);
+ Init *CachedInit = 0;
+ Init *CachedBitVar = 0;
+ bool CachedBitVarChanged = false;
- do {
- B = CurBit;
- CurBit = CurBit->resolveReferences(R, RV);
- Changed |= B != CurBit;
- } while (B != CurBit);
+ for (unsigned i = 0, e = getNumBits(); i != e; ++i) {
+ Init *CurBit = Bits[i];
+ Init *CurBitVar = CurBit->getBitVar();
+
NewBits[i] = CurBit;
+
+ if (CurBitVar == CachedBitVar) {
+ if (CachedBitVarChanged) {
+ Init *Bit = CachedInit->getBit(CurBit->getBitNum());
+ NewBits[i] = fixBitInit(RV, CurBit, Bit);
+ }
+ continue;
+ }
+ CachedBitVar = CurBitVar;
+ CachedBitVarChanged = false;
+
+ Init *B;
+ do {
+ B = CurBitVar;
+ CurBitVar = CurBitVar->resolveReferences(R, RV);
+ CachedBitVarChanged |= B != CurBitVar;
+ Changed |= B != CurBitVar;
+ } while (B != CurBitVar);
+ CachedInit = CurBitVar;
+
+ if (CachedBitVarChanged) {
+ Init *Bit = CurBitVar->getBit(CurBit->getBitNum());
+ NewBits[i] = fixBitInit(RV, CurBit, Bit);
+ }
}
if (Changed)
@@ -682,20 +672,6 @@
return Result + "]";
}
-Init *OpInit::resolveBitReference(Record &R, const RecordVal *IRV,
- unsigned Bit) const {
- Init *Folded = Fold(&R, 0);
-
- if (Folded != this) {
- TypedInit *Typed = dynamic_cast<TypedInit *>(Folded);
- if (Typed) {
- return Typed->resolveBitReference(R, IRV, Bit);
- }
- }
-
- return 0;
-}
-
Init *OpInit::resolveListElementReference(Record &R, const RecordVal *IRV,
unsigned Elt) const {
Init *Resolved = resolveReferences(R, IRV);
@@ -718,6 +694,12 @@
return 0;
}
+Init *OpInit::getBit(unsigned Bit) const {
+ if (getType() == BitRecTy::get())
+ return const_cast<OpInit*>(this);
+ return VarBitInit::get(const_cast<OpInit*>(this), Bit);
+}
+
UnOpInit *UnOpInit::get(UnaryOp opc, Init *lhs, RecTy *Type) {
typedef std::pair<std::pair<unsigned, Init *>, RecTy *> Key;
@@ -922,9 +904,9 @@
case EQ: {
// try to fold eq comparison for 'bit' and 'int', otherwise fallback
// to string objects.
- IntInit* L =
+ IntInit *L =
dynamic_cast<IntInit*>(LHS->convertInitializerTo(IntRecTy::get()));
- IntInit* R =
+ IntInit *R =
dynamic_cast<IntInit*>(RHS->convertInitializerTo(IntRecTy::get()));
if (L && R)
@@ -1324,25 +1306,10 @@
return NameString->getValue();
}
-Init *VarInit::resolveBitReference(Record &R, const RecordVal *IRV,
- unsigned Bit) const {
- if (R.isTemplateArg(getNameInit())) return 0;
- if (IRV && IRV->getNameInit() != getNameInit()) return 0;
-
- RecordVal *RV = R.getValue(getNameInit());
- assert(RV && "Reference to a non-existent variable?");
- assert(dynamic_cast<BitsInit*>(RV->getValue()));
- BitsInit *BI = (BitsInit*)RV->getValue();
-
- assert(Bit < BI->getNumBits() && "Bit reference out of range!");
- Init *B = BI->getBit(Bit);
-
- // If the bit is set to some value, or if we are resolving a reference to a
- // specific variable and that variable is explicitly unset, then replace the
- // VarBitInit with it.
- if (IRV || !dynamic_cast<UnsetInit*>(B))
- return B;
- return 0;
+Init *VarInit::getBit(unsigned Bit) const {
+ if (getType() == BitRecTy::get())
+ return const_cast<VarInit*>(this);
+ return VarBitInit::get(const_cast<VarInit*>(this), Bit);
}
Init *VarInit::resolveListElementReference(Record &R,
@@ -1425,9 +1392,11 @@
}
Init *VarBitInit::resolveReferences(Record &R, const RecordVal *RV) const {
- if (Init *I = getVariable()->resolveBitReference(R, RV, getBitNum()))
- return I;
- return const_cast<VarBitInit *>(this);
+ Init *I = TI->resolveReferences(R, RV);
+ if (TI != I)
+ return I->getBit(getBitNum());
+
+ return const_cast<VarBitInit*>(this);
}
VarListElementInit *VarListElementInit::get(TypedInit *T,
@@ -1456,11 +1425,10 @@
return const_cast<VarListElementInit *>(this);
}
-Init *VarListElementInit::resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const {
- // FIXME: This should be implemented, to support references like:
- // bit B = AA[0]{1};
- return 0;
+Init *VarListElementInit::getBit(unsigned Bit) const {
+ if (getType() == BitRecTy::get())
+ return const_cast<VarListElementInit*>(this);
+ return VarBitInit::get(const_cast<VarListElementInit*>(this), Bit);
}
Init *VarListElementInit:: resolveListElementReference(Record &R,
@@ -1513,17 +1481,10 @@
return I;
}
-Init *FieldInit::resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const {
- if (Init *BitsVal = Rec->getFieldInit(R, RV, FieldName))
- if (BitsInit *BI = dynamic_cast<BitsInit*>(BitsVal)) {
- assert(Bit < BI->getNumBits() && "Bit reference out of range!");
- Init *B = BI->getBit(Bit);
-
- if (dynamic_cast<BitInit*>(B)) // If the bit is set.
- return B; // Replace the VarBitInit with it.
- }
- return 0;
+Init *FieldInit::getBit(unsigned Bit) const {
+ if (getType() == BitRecTy::get())
+ return const_cast<FieldInit*>(this);
+ return VarBitInit::get(const_cast<FieldInit*>(this), Bit);
}
Init *FieldInit::resolveListElementReference(Record &R, const RecordVal *RV,
@@ -1751,7 +1712,15 @@
if (RV == &Values[i]) // Skip resolve the same field as the given one
continue;
if (Init *V = Values[i].getValue())
- Values[i].setValue(V->resolveReferences(*this, RV));
+ if (Values[i].setValue(V->resolveReferences(*this, RV)))
+ throw TGError(getLoc(), "Invalid value is found when setting '"
+ + Values[i].getNameInitAsString()
+ + "' after resolving references"
+ + (RV ? " against '" + RV->getNameInitAsString()
+ + "' of ("
+ + RV->getValue()->getAsUnquotedString() + ")"
+ : "")
+ + "\n");
}
Init *OldName = getNameInit();
Init *NewName = Name->resolveReferences(*this, RV);
@@ -1963,6 +1932,23 @@
"' does not have a bit initializer!";
}
+bool Record::getValueAsBitOrUnset(StringRef FieldName, bool &Unset) const {
+ const RecordVal *R = getValue(FieldName);
+ if (R == 0 || R->getValue() == 0)
+ throw "Record `" + getName() + "' does not have a field named `" +
+ FieldName.str() + "'!\n";
+
+ if (R->getValue() == UnsetInit::get()) {
+ Unset = true;
+ return false;
+ }
+ Unset = false;
+ if (BitInit *BI = dynamic_cast<BitInit*>(R->getValue()))
+ return BI->getValue();
+ throw "Record `" + getName() + "', field `" + FieldName.str() +
+ "' does not have a bit initializer!";
+}
+
/// getValueAsDag - This method looks up the specified field and returns its
/// value as an Dag, throwing an exception if the field does not exist or if
/// the value is not the right type.
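Usage shape for the new accessor: unlike getValueAsBit, it lets a backend tell an explicit 0/1 apart from a field left as '?'. A hedged sketch (Rec and the "isPredicable" field name are hypothetical):

    bool Unset;
    bool V = Rec->getValueAsBitOrUnset("isPredicable", Unset);
    if (Unset) {
      // Field was left as '?': the backend chooses its own default.
    } else {
      // Field was written explicitly; V carries the 0/1 value.
    }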
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index b9c7ff6..aee93e76 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@@ -1044,35 +1044,28 @@
switch (LexCode) {
default: llvm_unreachable("Unhandled code!");
case tgtok::XIf: {
- // FIXME: The `!if' operator doesn't handle non-TypedInit well at
- // all. This can be made much more robust.
- TypedInit *MHSt = dynamic_cast<TypedInit*>(MHS);
- TypedInit *RHSt = dynamic_cast<TypedInit*>(RHS);
-
RecTy *MHSTy = 0;
RecTy *RHSTy = 0;
- if (MHSt == 0 && RHSt == 0) {
- BitsInit *MHSbits = dynamic_cast<BitsInit*>(MHS);
- BitsInit *RHSbits = dynamic_cast<BitsInit*>(RHS);
-
- if (MHSbits && RHSbits &&
- MHSbits->getNumBits() == RHSbits->getNumBits()) {
- Type = BitRecTy::get();
- break;
- } else {
- BitInit *MHSbit = dynamic_cast<BitInit*>(MHS);
- BitInit *RHSbit = dynamic_cast<BitInit*>(RHS);
-
- if (MHSbit && RHSbit) {
- Type = BitRecTy::get();
- break;
- }
- }
- } else if (MHSt != 0 && RHSt != 0) {
+ if (TypedInit *MHSt = dynamic_cast<TypedInit*>(MHS))
MHSTy = MHSt->getType();
+ if (BitsInit *MHSbits = dynamic_cast<BitsInit*>(MHS))
+ MHSTy = BitsRecTy::get(MHSbits->getNumBits());
+ if (dynamic_cast<BitInit*>(MHS))
+ MHSTy = BitRecTy::get();
+
+ if (TypedInit *RHSt = dynamic_cast<TypedInit*>(RHS))
RHSTy = RHSt->getType();
- }
+ if (BitsInit *RHSbits = dynamic_cast<BitsInit*>(RHS))
+ RHSTy = BitsRecTy::get(RHSbits->getNumBits());
+ if (dynamic_cast<BitInit*>(RHS))
+ RHSTy = BitRecTy::get();
+
+ // An UnsetInit operand takes its type from the other operand.
+ if (dynamic_cast<UnsetInit*>(MHS))
+ MHSTy = RHSTy;
+ if (dynamic_cast<UnsetInit*>(RHS))
+ RHSTy = MHSTy;
if (!MHSTy || !RHSTy) {
TokError("could not get type for !if");
@@ -2277,7 +2270,10 @@
DefName, StringRecTy::get())->Fold(DefProto, &MC);
}
- Record *CurRec = new Record(DefName, DefmPrefixLoc, Records);
+ // Make a trail of SMLocs from the multiclass instantiations.
+ SmallVector<SMLoc, 4> Locs(1, DefmPrefixLoc);
+ Locs.append(DefProto->getLoc().begin(), DefProto->getLoc().end());
+ Record *CurRec = new Record(DefName, Locs, Records);
SubClassReference Ref;
Ref.RefLoc = DefmPrefixLoc;
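The rewritten !if typing above derives a type for each arm independently (from a TypedInit, a BitsInit, or a BitInit) and then lets an unset arm borrow the other's type. Reduced to a stand-alone shape (Ty and Arm are hypothetical stand-ins for the RecTy machinery):

    struct Ty;
    struct Arm { Ty *KnownTy; bool IsUnset; };

    Ty *typeOfIf(Arm MHS, Arm RHS) {
      Ty *MHSTy = MHS.IsUnset ? nullptr : MHS.KnownTy;
      Ty *RHSTy = RHS.IsUnset ? nullptr : RHS.KnownTy;
      if (!MHSTy) MHSTy = RHSTy;  // '?' is typed from the other arm
      if (!RHSTy) RHSTy = MHSTy;
      if (!MHSTy || !RHSTy)
        return nullptr;           // "could not get type for !if"
      return MHSTy;               // callers still reconcile MHSTy vs RHSTy
    }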
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 29033e5..e2f0d7d 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -683,7 +683,7 @@
// Handle register classes that require multiple instructions.
unsigned BeginIdx = 0;
unsigned SubRegs = 0;
- unsigned Spacing = 1;
+ int Spacing = 1;
// Use VORRq when possible.
if (ARM::QQPRRegClass.contains(DestReg, SrcReg))
@@ -705,27 +705,38 @@
else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg))
Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4, Spacing = 2;
- if (Opc) {
- const TargetRegisterInfo *TRI = &getRegisterInfo();
- MachineInstrBuilder Mov;
- for (unsigned i = 0; i != SubRegs; ++i) {
- unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i*Spacing);
- unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i*Spacing);
- assert(Dst && Src && "Bad sub-register");
- Mov = AddDefaultPred(BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst)
- .addReg(Src));
- // VORR takes two source operands.
- if (Opc == ARM::VORRq)
- Mov.addReg(Src);
- }
- // Add implicit super-register defs and kills to the last instruction.
- Mov->addRegisterDefined(DestReg, TRI);
- if (KillSrc)
- Mov->addRegisterKilled(SrcReg, TRI);
- return;
- }
+ assert(Opc && "Impossible reg-to-reg copy");
- llvm_unreachable("Impossible reg-to-reg copy");
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MachineInstrBuilder Mov;
+
+ // Copy register tuples backward when the first Dest reg overlaps with SrcReg.
+ if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
+ BeginIdx = BeginIdx + ((SubRegs-1)*Spacing);
+ Spacing = -Spacing;
+ }
+#ifndef NDEBUG
+ SmallSet<unsigned, 4> DstRegs;
+#endif
+ for (unsigned i = 0; i != SubRegs; ++i) {
+ unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i*Spacing);
+ unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i*Spacing);
+ assert(Dst && Src && "Bad sub-register");
+#ifndef NDEBUG
+ assert(!DstRegs.count(Src) && "destructive vector copy");
+ DstRegs.insert(Dst);
+#endif
+ Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst)
+ .addReg(Src);
+ // VORR takes two source operands.
+ if (Opc == ARM::VORRq)
+ Mov.addReg(Src);
+ Mov = AddDefaultPred(Mov);
+ }
+ // Add implicit super-register defs and kills to the last instruction.
+ Mov->addRegisterDefined(DestReg, TRI);
+ if (KillSrc)
+ Mov->addRegisterKilled(SrcReg, TRI);
}
static const
@@ -1569,16 +1580,20 @@
}
/// Identify instructions that can be folded into a MOVCC instruction, and
-/// return the corresponding opcode for the predicated pseudo-instruction.
-static unsigned canFoldIntoMOVCC(unsigned Reg, MachineInstr *&MI,
- const MachineRegisterInfo &MRI) {
+/// return the defining instruction.
+static MachineInstr *canFoldIntoMOVCC(unsigned Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII) {
if (!TargetRegisterInfo::isVirtualRegister(Reg))
return 0;
if (!MRI.hasOneNonDBGUse(Reg))
return 0;
- MI = MRI.getVRegDef(Reg);
+ MachineInstr *MI = MRI.getVRegDef(Reg);
if (!MI)
return 0;
+ // MI is folded into the MOVCC by predicating it.
+ if (!MI->isPredicable())
+ return 0;
// Check if MI has any non-dead defs or physreg uses. This also detects
// predicated instructions which will be reading CPSR.
for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
@@ -1588,55 +1603,18 @@
return 0;
if (!MO.isReg())
continue;
+ // MI can't have any tied operands, that would conflict with predication.
+ if (MO.isTied())
+ return 0;
if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
return 0;
if (MO.isDef() && !MO.isDead())
return 0;
}
- switch (MI->getOpcode()) {
- default: return 0;
- case ARM::ANDri: return ARM::ANDCCri;
- case ARM::ANDrr: return ARM::ANDCCrr;
- case ARM::ANDrsi: return ARM::ANDCCrsi;
- case ARM::ANDrsr: return ARM::ANDCCrsr;
- case ARM::t2ANDri: return ARM::t2ANDCCri;
- case ARM::t2ANDrr: return ARM::t2ANDCCrr;
- case ARM::t2ANDrs: return ARM::t2ANDCCrs;
- case ARM::EORri: return ARM::EORCCri;
- case ARM::EORrr: return ARM::EORCCrr;
- case ARM::EORrsi: return ARM::EORCCrsi;
- case ARM::EORrsr: return ARM::EORCCrsr;
- case ARM::t2EORri: return ARM::t2EORCCri;
- case ARM::t2EORrr: return ARM::t2EORCCrr;
- case ARM::t2EORrs: return ARM::t2EORCCrs;
- case ARM::ORRri: return ARM::ORRCCri;
- case ARM::ORRrr: return ARM::ORRCCrr;
- case ARM::ORRrsi: return ARM::ORRCCrsi;
- case ARM::ORRrsr: return ARM::ORRCCrsr;
- case ARM::t2ORRri: return ARM::t2ORRCCri;
- case ARM::t2ORRrr: return ARM::t2ORRCCrr;
- case ARM::t2ORRrs: return ARM::t2ORRCCrs;
-
- // ARM ADD/SUB
- case ARM::ADDri: return ARM::ADDCCri;
- case ARM::ADDrr: return ARM::ADDCCrr;
- case ARM::ADDrsi: return ARM::ADDCCrsi;
- case ARM::ADDrsr: return ARM::ADDCCrsr;
- case ARM::SUBri: return ARM::SUBCCri;
- case ARM::SUBrr: return ARM::SUBCCrr;
- case ARM::SUBrsi: return ARM::SUBCCrsi;
- case ARM::SUBrsr: return ARM::SUBCCrsr;
-
- // Thumb2 ADD/SUB
- case ARM::t2ADDri: return ARM::t2ADDCCri;
- case ARM::t2ADDri12: return ARM::t2ADDCCri12;
- case ARM::t2ADDrr: return ARM::t2ADDCCrr;
- case ARM::t2ADDrs: return ARM::t2ADDCCrs;
- case ARM::t2SUBri: return ARM::t2SUBCCri;
- case ARM::t2SUBri12: return ARM::t2SUBCCri12;
- case ARM::t2SUBrr: return ARM::t2SUBCCrr;
- case ARM::t2SUBrs: return ARM::t2SUBCCrs;
- }
+ bool DontMoveAcrossStores = true;
+ if (!MI->isSafeToMove(TII, /* AliasAnalysis = */ 0, DontMoveAcrossStores))
+ return 0;
+ return MI;
}
bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr *MI,
@@ -1665,19 +1643,18 @@
assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) &&
"Unknown select instruction");
const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
- MachineInstr *DefMI = 0;
- unsigned Opc = canFoldIntoMOVCC(MI->getOperand(2).getReg(), DefMI, MRI);
- bool Invert = !Opc;
- if (!Opc)
- Opc = canFoldIntoMOVCC(MI->getOperand(1).getReg(), DefMI, MRI);
- if (!Opc)
+ MachineInstr *DefMI = canFoldIntoMOVCC(MI->getOperand(2).getReg(), MRI, this);
+ bool Invert = !DefMI;
+ if (!DefMI)
+ DefMI = canFoldIntoMOVCC(MI->getOperand(1).getReg(), MRI, this);
+ if (!DefMI)
return 0;
// Create a new predicated version of DefMI.
// Rfalse is the first use.
MachineInstrBuilder NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- get(Opc), MI->getOperand(0).getReg())
- .addOperand(MI->getOperand(Invert ? 2 : 1));
+ DefMI->getDesc(),
+ MI->getOperand(0).getReg());
// Copy all the DefMI operands, excluding its (null) predicate.
const MCInstrDesc &DefDesc = DefMI->getDesc();
@@ -1696,6 +1673,15 @@
if (NewMI->hasOptionalDef())
AddDefaultCC(NewMI);
+ // The output register value when the predicate is false is an implicit
+ // register operand tied to the first def.
+ // The tie makes the register allocator ensure the FalseReg is allocated the
+ // same register as operand 0.
+ MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1);
+ FalseReg.setImplicit();
+ NewMI->addOperand(FalseReg);
+ NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
+
// The caller will erase MI, but not DefMI.
DefMI->eraseFromParent();
return NewMI;
@@ -2042,13 +2028,14 @@
// Masked compares sometimes use the same register as the corresponding 'and'.
if (CmpMask != ~0) {
- if (!isSuitableForMask(MI, SrcReg, CmpMask, false)) {
+ if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(MI)) {
MI = 0;
for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(SrcReg),
UE = MRI->use_end(); UI != UE; ++UI) {
if (UI->getParent() != CmpInstr->getParent()) continue;
MachineInstr *PotentialAND = &*UI;
- if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true))
+ if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true) ||
+ isPredicated(PotentialAND))
continue;
MI = PotentialAND;
break;
@@ -2114,6 +2101,10 @@
// The single candidate is called MI.
if (!MI) MI = Sub;
+ // We can't use a predicated instruction - it doesn't always write the flags.
+ if (isPredicated(MI))
+ return false;
+
switch (MI->getOpcode()) {
default: break;
case ARM::RSBrr:
@@ -2220,6 +2211,7 @@
// Toggle the optional operand to CPSR.
MI->getOperand(5).setReg(ARM::CPSR);
MI->getOperand(5).setIsDef(true);
+ assert(!isPredicated(MI) && "Can't use flags from predicated instruction");
CmpInstr->eraseFromParent();
// Modify the condition code of operands in OperandsToUpdate.
@@ -3366,7 +3358,8 @@
// converted.
if (Subtarget.isCortexA9() && !isPredicated(MI) &&
(MI->getOpcode() == ARM::VMOVRS ||
- MI->getOpcode() == ARM::VMOVSR))
+ MI->getOpcode() == ARM::VMOVSR ||
+ MI->getOpcode() == ARM::VMOVS))
return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON));
// No other instructions can be swizzled, so just determine their domain.
@@ -3386,13 +3379,28 @@
return std::make_pair(ExeGeneric, 0);
}
+static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI,
+ unsigned SReg, unsigned &Lane) {
+ unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_0, &ARM::DPRRegClass);
+ Lane = 0;
+
+ if (DReg != ARM::NoRegister)
+ return DReg;
+
+ Lane = 1;
+ DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1, &ARM::DPRRegClass);
+
+ assert(DReg && "S-register with no D super-register?");
+ return DReg;
+}
+
void
ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
unsigned DstReg, SrcReg, DReg;
unsigned Lane;
MachineInstrBuilder MIB(MI);
const TargetRegisterInfo *TRI = &getRegisterInfo();
- bool isKill;
switch (MI->getOpcode()) {
default:
llvm_unreachable("cannot handle opcode!");
@@ -3403,78 +3411,175 @@
// Zap the predicate operands.
assert(!isPredicated(MI) && "Cannot predicate a VORRd");
- MI->RemoveOperand(3);
- MI->RemoveOperand(2);
- // Change to a VORRd which requires two identical use operands.
+ // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits)
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+
+ for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+ MI->RemoveOperand(i-1);
+
+ // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits)
MI->setDesc(get(ARM::VORRd));
-
- // Add the extra source operand and new predicates.
- // This will go before any implicit ops.
- AddDefaultPred(MachineInstrBuilder(MI).addOperand(MI->getOperand(1)));
+ AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
+ .addReg(SrcReg)
+ .addReg(SrcReg));
break;
case ARM::VMOVRS:
if (Domain != ExeNEON)
break;
assert(!isPredicated(MI) && "Cannot predicate a VGETLN");
+ // Source instruction is %RDst = VMOVRS %SSrc, 14, %noreg (; implicits)
DstReg = MI->getOperand(0).getReg();
SrcReg = MI->getOperand(1).getReg();
- DReg = TRI->getMatchingSuperReg(SrcReg, ARM::ssub_0, &ARM::DPRRegClass);
- Lane = 0;
- if (DReg == ARM::NoRegister) {
- DReg = TRI->getMatchingSuperReg(SrcReg, ARM::ssub_1, &ARM::DPRRegClass);
- Lane = 1;
- assert(DReg && "S-register with no D super-register?");
- }
+ for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+ MI->RemoveOperand(i-1);
- MI->RemoveOperand(3);
- MI->RemoveOperand(2);
- MI->RemoveOperand(1);
+ DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane);
+ // Convert to %RDst = VGETLNi32 %DSrc, Lane, 14, %noreg (; imps)
+ // Note that DSrc has been widened and the other lane may be undef, which
+ // contaminates the entire register.
MI->setDesc(get(ARM::VGETLNi32));
- MIB.addReg(DReg);
- MIB.addImm(Lane);
+ AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
+ .addReg(DReg, RegState::Undef)
+ .addImm(Lane));
- MIB->getOperand(1).setIsUndef();
+ // The old source should be an implicit use, otherwise we might think it
+ // was dead before here.
MIB.addReg(SrcReg, RegState::Implicit);
-
- AddDefaultPred(MIB);
break;
case ARM::VMOVSR:
if (Domain != ExeNEON)
break;
assert(!isPredicated(MI) && "Cannot predicate a VSETLN");
+ // Source instruction is %SDst = VMOVSR %RSrc, 14, %noreg (; implicits)
DstReg = MI->getOperand(0).getReg();
SrcReg = MI->getOperand(1).getReg();
- DReg = TRI->getMatchingSuperReg(DstReg, ARM::ssub_0, &ARM::DPRRegClass);
- Lane = 0;
- if (DReg == ARM::NoRegister) {
- DReg = TRI->getMatchingSuperReg(DstReg, ARM::ssub_1, &ARM::DPRRegClass);
- Lane = 1;
- assert(DReg && "S-register with no D super-register?");
- }
- isKill = MI->getOperand(0).isKill();
- MI->RemoveOperand(3);
- MI->RemoveOperand(2);
- MI->RemoveOperand(1);
- MI->RemoveOperand(0);
+ DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane);
+ // If we insert both a novel <def> and an <undef> on the DReg, we break
+ // any existing dependency chain on the unused lane. Either already being
+ // present means this instruction is in that chain anyway so we can make
+ // the transformation.
+ if (!MI->definesRegister(DReg, TRI) && !MI->readsRegister(DReg, TRI))
+ break;
+
+ for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+ MI->RemoveOperand(i-1);
+
+ // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps)
+ // Again DDst may be undefined at the beginning of this instruction.
MI->setDesc(get(ARM::VSETLNi32));
- MIB.addReg(DReg, RegState::Define);
- MIB.addReg(DReg, RegState::Undef);
- MIB.addReg(SrcReg);
- MIB.addImm(Lane);
-
- if (isKill)
- MIB->addRegisterKilled(DstReg, TRI, true);
- MIB->addRegisterDefined(DstReg, TRI);
-
+ MIB.addReg(DReg, RegState::Define)
+ .addReg(DReg, getUndefRegState(!MI->readsRegister(DReg, TRI)))
+ .addReg(SrcReg)
+ .addImm(Lane);
AddDefaultPred(MIB);
+
+ // The narrower destination must be marked as set to keep previous chains
+ // in place.
+ MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
break;
+ case ARM::VMOVS: {
+ if (Domain != ExeNEON)
+ break;
+
+ // Source instruction is %SDst = VMOVS %SSrc, 14, %noreg (; implicits)
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+
+ unsigned DstLane = 0, SrcLane = 0, DDst, DSrc;
+ DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane);
+ DSrc = getCorrespondingDRegAndLane(TRI, SrcReg, SrcLane);
+
+ // If we insert both a novel <def> and an <undef> on the DReg, we break
+ // any existing dependency chain on the unused lane. Either already being
+ // present means this instruction is in that chain anyway so we can make
+ // the transformation.
+ if (!MI->definesRegister(DDst, TRI) && !MI->readsRegister(DDst, TRI))
+ break;
+
+ for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+ MI->RemoveOperand(i-1);
+
+ if (DSrc == DDst) {
+ // Destination can be:
+ // %DDst = VDUPLN32d %DDst, Lane, 14, %noreg (; implicits)
+ MI->setDesc(get(ARM::VDUPLN32d));
+ MIB.addReg(DDst, RegState::Define)
+ .addReg(DDst, getUndefRegState(!MI->readsRegister(DDst, TRI)))
+ .addImm(SrcLane);
+ AddDefaultPred(MIB);
+
+ // Neither the source nor the destination is naturally represented any
+ // more, so add them in manually.
+ MIB.addReg(DstReg, RegState::Implicit | RegState::Define);
+ MIB.addReg(SrcReg, RegState::Implicit);
+ break;
+ }
+
+ // In general there's no single instruction that can perform an S <-> S
+ // move in NEON space, but a pair of VEXT instructions *can* do the
+ // job. It turns out that the VEXTs needed will only use DSrc once, with
+ // the position based purely on the combination of lane-0 and lane-1
+ // involved. For example
+ // vmov s0, s2 -> vext.32 d0, d0, d1, #1 vext.32 d0, d0, d0, #1
+ // vmov s1, s3 -> vext.32 d0, d1, d0, #1 vext.32 d0, d0, d0, #1
+ // vmov s0, s3 -> vext.32 d0, d0, d0, #1 vext.32 d0, d1, d0, #1
+ // vmov s1, s2 -> vext.32 d0, d0, d0, #1 vext.32 d0, d0, d1, #1
+ //
+ // Pattern of the MachineInstrs is:
+ // %DDst = VEXTd32 %DSrc1, %DSrc2, Lane, 14, %noreg (;implicits)
+ MachineInstrBuilder NewMIB;
+ NewMIB = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ get(ARM::VEXTd32), DDst);
+
+ // On the first instruction, both DSrc and DDst may be <undef> if present.
+ // Specifically when the original instruction didn't have them as an
+ // <imp-use>.
+ unsigned CurReg = SrcLane == 1 && DstLane == 1 ? DSrc : DDst;
+ bool CurUndef = !MI->readsRegister(CurReg, TRI);
+ NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+ CurReg = SrcLane == 0 && DstLane == 0 ? DSrc : DDst;
+ CurUndef = !MI->readsRegister(CurReg, TRI);
+ NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+ NewMIB.addImm(1);
+ AddDefaultPred(NewMIB);
+
+ if (SrcLane == DstLane)
+ NewMIB.addReg(SrcReg, RegState::Implicit);
+
+ MI->setDesc(get(ARM::VEXTd32));
+ MIB.addReg(DDst, RegState::Define);
+
+ // On the second instruction, DDst has definitely been defined above, so
+ // it is not <undef>. DSrc, if present, can be <undef> as above.
+ CurReg = SrcLane == 1 && DstLane == 0 ? DSrc : DDst;
+ CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI);
+ MIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+ CurReg = SrcLane == 0 && DstLane == 1 ? DSrc : DDst;
+ CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI);
+ MIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+ MIB.addImm(1);
+ AddDefaultPred(MIB);
+
+ if (SrcLane != DstLane)
+ MIB.addReg(SrcReg, RegState::Implicit);
+
+ // As before, the original destination is no longer represented, add it
+ // implicitly.
+ MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+ break;
+ }
}
}
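The overlap handling added to the tuple copy is the memmove direction trick: if the first destination sub-register would clobber a source register that has yet to be read, the loop starts at the other end of the tuple (BeginIdx moved to the last element, Spacing negated). Plain-array analogue:

    #include <cstddef>

    void copyTuple(int *Dst, const int *Src, std::size_t N) {
      std::ptrdiff_t Begin = 0, Step = 1;
      if (Dst > Src && Dst < Src + N) { // first write would hit unread input,
        Begin = std::ptrdiff_t(N) - 1;  // so walk the tuple backward
        Step = -1;
      }
      for (std::size_t i = 0; i != N; ++i) {
        std::ptrdiff_t Idx = Begin + std::ptrdiff_t(i) * Step;
        Dst[Idx] = Src[Idx];
      }
    }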
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index 132b81f..89cbc84 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -410,7 +410,7 @@
do {
DEBUG(errs() << "JITTing function '"
- << MF.getFunction()->getName() << "'\n");
+ << MF.getName() << "'\n");
MCE.startFunction(MF);
for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
MBB != E; ++MBB) {
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index a953985..dd05f0c 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -1388,10 +1388,9 @@
// If the original WaterList entry was "new water" on this iteration,
// propagate that to the new island. This is just keeping NewWaterList
// updated to match the WaterList, which will be updated below.
- if (NewWaterList.count(WaterBB)) {
- NewWaterList.erase(WaterBB);
+ if (NewWaterList.erase(WaterBB))
NewWaterList.insert(NewIsland);
- }
+
// The new CPE goes before the following block (NewMBB).
NewMBB = llvm::next(MachineFunction::iterator(WaterBB));
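The NewWaterList tweak leans on set erase(Key) returning how many elements were removed (a bool for LLVM's pointer sets), merging the count-then-erase pair into one lookup. The same idiom with std::set:

    #include <set>

    void moveMembership(std::set<int> &S, int Old, int New) {
      if (S.erase(Old)) // nonzero iff Old was present; it is removed either way
        S.insert(New);
    }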
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 15bb32e..8ed6b75 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -1208,6 +1208,57 @@
ExpandLaneOp(MBBI);
return true;
+ case ARM::VSETLNi8Q:
+ case ARM::VSETLNi16Q: {
+ // Expand VSETLNs acting on a Q register to equivalent VSETLNs acting
+ // on the respective D register.
+
+ unsigned QReg = MI.getOperand(1).getReg();
+ unsigned QLane = MI.getOperand(3).getImm();
+
+ unsigned NewOpcode, DLane, DSubReg;
+ switch (Opcode) {
+ default: llvm_unreachable("Invalid opcode!");
+ case ARM::VSETLNi8Q:
+ // 4 possible 8-bit lanes per DPR:
+ NewOpcode = ARM::VSETLNi8;
+ DLane = QLane % 8;
+ DSubReg = (QLane / 8) ? ARM::dsub_1 : ARM::dsub_0;
+ break;
+ case ARM::VSETLNi16Q:
+ // 4 possible 16-bit lanes per DPR.
+ NewOpcode = ARM::VSETLNi16;
+ DLane = QLane % 4;
+ DSubReg = (QLane / 4) ? ARM::dsub_1 : ARM::dsub_0;
+ break;
+ }
+
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(NewOpcode));
+
+ unsigned DReg = TRI->getSubReg(QReg, DSubReg);
+
+ MIB.addReg(DReg, RegState::Define); // Output DPR
+ MIB.addReg(DReg); // Input DPR
+ MIB.addOperand(MI.getOperand(2)); // Input GPR
+ MIB.addImm(DLane); // Lane
+
+ // Add the predicate operands.
+ MIB.addOperand(MI.getOperand(4));
+ MIB.addOperand(MI.getOperand(5));
+
+ if (MI.getOperand(1).isKill()) // Add an implicit kill for the Q register.
+ MIB->addRegisterKilled(QReg, TRI, true);
+ // And an implicit def of the output register (which should always be the
+ // same as the input register).
+ MIB->addRegisterDefined(QReg, TRI);
+
+ TransferImpOps(MI, MIB, MIB);
+
+ MI.eraseFromParent();
+ return true;
+ }
+
case ARM::VTBL3Pseudo: ExpandVTBL(MBBI, ARM::VTBL3, false); return true;
case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true;
case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true;
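The lane arithmetic behind the VSETLN expansion: a Q register is a pair of D registers, so a Q lane index decomposes into a D half plus a lane within that half: 8 lanes per D for 8-bit elements, 4 for 16-bit. Stand-alone sketch of the 16-bit case:

    #include <cassert>

    void splitQLane16(unsigned QLane, unsigned &DHalf, unsigned &DLane) {
      assert(QLane < 8 && "a v8i16 Q register has 8 lanes");
      DHalf = QLane / 4; // 0 selects dsub_0, 1 selects dsub_1
      DLane = QLane % 4; // lane within the chosen D register
    }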
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 5a5ca1b..045d904 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -617,10 +617,7 @@
if (VT != MVT::i32) return 0;
Reloc::Model RelocM = TM.getRelocationModel();
-
- // TODO: Need more magic for ARM PIC.
- if (!isThumb2 && (RelocM == Reloc::PIC_)) return 0;
-
+ bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM);
unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
// Use movw+movt when possible, it avoids constant pool entries.
@@ -668,17 +665,30 @@
.addConstantPoolIndex(Idx);
if (RelocM == Reloc::PIC_)
MIB.addImm(Id);
+ AddOptionalDefs(MIB);
} else {
// The extra immediate is for addrmode2.
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRcp),
DestReg)
.addConstantPoolIndex(Idx)
.addImm(0);
+ AddOptionalDefs(MIB);
+
+ if (RelocM == Reloc::PIC_) {
+ unsigned Opc = IsIndirect ? ARM::PICLDR : ARM::PICADD;
+ unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+ DL, TII.get(Opc), NewDestReg)
+ .addReg(DestReg)
+ .addImm(Id);
+ AddOptionalDefs(MIB);
+ return NewDestReg;
+ }
}
- AddOptionalDefs(MIB);
}
- if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) {
+ if (IsIndirect) {
MachineInstrBuilder MIB;
unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
if (isThumb2)
@@ -2212,25 +2222,17 @@
unsigned CallOpc = ARMSelectCallOp(EnableARMLongCalls);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
DL, TII.get(CallOpc));
- if (isThumb2) {
- // Explicitly adding the predicate here.
+ // BL / BLX don't take a predicate, but tBL / tBLX do.
+ if (isThumb2)
AddDefaultPred(MIB);
- if (EnableARMLongCalls)
- MIB.addReg(CalleeReg);
- else
- MIB.addExternalSymbol(TLI.getLibcallName(Call));
- } else {
- if (EnableARMLongCalls)
- MIB.addReg(CalleeReg);
- else
- MIB.addExternalSymbol(TLI.getLibcallName(Call));
+ if (EnableARMLongCalls)
+ MIB.addReg(CalleeReg);
+ else
+ MIB.addExternalSymbol(TLI.getLibcallName(Call));
- // Explicitly adding the predicate here.
- AddDefaultPred(MIB);
- }
// Add implicit physical register uses to the call.
for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
- MIB.addReg(RegArgs[i]);
+ MIB.addReg(RegArgs[i], RegState::Implicit);
// Add a register mask with the call-preserved registers.
// Proper defs for return values will be added by setPhysRegsDeadExcept().
@@ -2358,30 +2360,20 @@
unsigned CallOpc = ARMSelectCallOp(UseReg);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
DL, TII.get(CallOpc));
- if(isThumb2) {
- // Explicitly adding the predicate here.
- AddDefaultPred(MIB);
- if (UseReg)
- MIB.addReg(CalleeReg);
- else if (!IntrMemName)
- MIB.addGlobalAddress(GV, 0, 0);
- else
- MIB.addExternalSymbol(IntrMemName, 0);
- } else {
- if (UseReg)
- MIB.addReg(CalleeReg);
- else if (!IntrMemName)
- MIB.addGlobalAddress(GV, 0, 0);
- else
- MIB.addExternalSymbol(IntrMemName, 0);
- // Explicitly adding the predicate here.
+ // ARM calls don't take a predicate, but tBL / tBLX do.
+ if (isThumb2)
AddDefaultPred(MIB);
- }
+ if (UseReg)
+ MIB.addReg(CalleeReg);
+ else if (!IntrMemName)
+ MIB.addGlobalAddress(GV, 0, 0);
+ else
+ MIB.addExternalSymbol(IntrMemName, 0);
// Add implicit physical register uses to the call.
for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
- MIB.addReg(RegArgs[i]);
+ MIB.addReg(RegArgs[i], RegState::Implicit);
// Add a register mask with the call-preserved registers.
// Proper defs for return values will be added by setPhysRegsDeadExcept().
@@ -2650,7 +2642,7 @@
unsigned Reg1 = getRegForValue(Src1Value);
if (Reg1 == 0) return false;
- unsigned Reg2;
+ unsigned Reg2 = 0;
if (Opc == ARM::MOVsr) {
Reg2 = getRegForValue(Src2Value);
if (Reg2 == 0) return false;
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index c6f9d15..1406620 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -2637,6 +2637,38 @@
dl, MVT::i32, MVT::i32, Ops, 5);
}
}
+ case ARMISD::UMLAL: {
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), getAL(CurDAG),
+ CurDAG->getRegister(0, MVT::i32)};
+ return CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops, 6);
+ } else {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), getAL(CurDAG),
+ CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+ ARM::UMLAL : ARM::UMLALv5,
+ dl, MVT::i32, MVT::i32, Ops, 7);
+ }
+ }
+ case ARMISD::SMLAL: {
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), getAL(CurDAG),
+ CurDAG->getRegister(0, MVT::i32)};
+ return CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops, 6);
+ } else {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), getAL(CurDAG),
+ CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+ ARM::SMLAL : ARM::SMLALv5,
+ dl, MVT::i32, MVT::i32, Ops, 7);
+ }
+ }
case ISD::LOAD: {
SDNode *ResNode = 0;
if (Subtarget->isThumb() && Subtarget->hasThumb2())
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index df4039b..e51315e 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -514,6 +514,7 @@
setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
// Neon does not support some operations on v1i64 and v2i64 types.
setOperationAction(ISD::MUL, MVT::v1i64, Expand);
@@ -566,6 +567,11 @@
}
}
+ // ARM and Thumb2 support UMLAL/SMLAL.
+ if (!Subtarget->isThumb1Only())
+ setTargetDAGCombine(ISD::ADDC);
+
computeRegisterProperties();
// ARM does not have f32 extending load.
@@ -791,12 +797,9 @@
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::MUL);
-
- if (Subtarget->hasV6T2Ops() || Subtarget->hasNEON()) {
- setTargetDAGCombine(ISD::AND);
- setTargetDAGCombine(ISD::OR);
- setTargetDAGCombine(ISD::XOR);
- }
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::XOR);
if (Subtarget->hasV6Ops())
setTargetDAGCombine(ISD::SRL);
@@ -981,6 +984,8 @@
case ARMISD::VTBL2: return "ARMISD::VTBL2";
case ARMISD::VMULLs: return "ARMISD::VMULLs";
case ARMISD::VMULLu: return "ARMISD::VMULLu";
+ case ARMISD::UMLAL: return "ARMISD::UMLAL";
+ case ARMISD::SMLAL: return "ARMISD::SMLAL";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
case ARMISD::FMAX: return "ARMISD::FMAX";
case ARMISD::FMIN: return "ARMISD::FMIN";
@@ -4154,10 +4159,21 @@
}
// Scan through the operands to see if only one value is used.
+ //
+ // As an optimisation, even if more than one value is used it may be more
+ // profitable to splat with one value then change some lanes.
+ //
+ // Heuristically we decide to do this if the vector has a "dominant" value,
+ // defined as splatted to more than half of the lanes.
unsigned NumElts = VT.getVectorNumElements();
bool isOnlyLowElement = true;
bool usesOnlyOneValue = true;
+ bool hasDominantValue = false;
bool isConstant = true;
+
+ // Map of the number of times a particular SDValue appears in the
+ // element list.
+ DenseMap<SDValue, unsigned> ValueCounts;
SDValue Value;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
@@ -4168,13 +4184,21 @@
if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
isConstant = false;
- if (!Value.getNode())
+ ValueCounts.insert(std::make_pair(V, 0));
+ unsigned &Count = ValueCounts[V];
+
+ // Is this value dominant? (takes up more than half of the lanes)
+ if (++Count > (NumElts / 2)) {
+ hasDominantValue = true;
Value = V;
- else if (V != Value)
- usesOnlyOneValue = false;
+ }
}
+ if (ValueCounts.size() != 1)
+ usesOnlyOneValue = false;
+ if (!Value.getNode() && ValueCounts.size() > 0)
+ Value = ValueCounts.begin()->first;
- if (!Value.getNode())
+ if (ValueCounts.size() == 0)
return DAG.getUNDEF(VT);
if (isOnlyLowElement)
@@ -4184,9 +4208,34 @@
// Use VDUP for non-constant splats. For f32 constant splats, reduce to
// i32 and try again.
- if (usesOnlyOneValue && EltSize <= 32) {
- if (!isConstant)
- return DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+ if (hasDominantValue && EltSize <= 32) {
+ if (!isConstant) {
+ SDValue N;
+
+ // If we are VDUPing a value that comes directly from a vector, that will
+ // cause an unnecessary move to and from a GPR, where instead we could
+ // just use VDUPLANE.
+ if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT)
+ N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+ Value->getOperand(0), Value->getOperand(1));
+ else
+ N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+
+ if (!usesOnlyOneValue) {
+ // The dominant value was splatted as 'N', but we now have to insert
+ // all differing elements.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ if (Op.getOperand(I) == Value)
+ continue;
+ SmallVector<SDValue, 3> Ops;
+ Ops.push_back(N);
+ Ops.push_back(Op.getOperand(I));
+ Ops.push_back(DAG.getConstant(I, MVT::i32));
+ N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, &Ops[0], 3);
+ }
+ }
+ return N;
+ }
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < NumElts; ++i)
@@ -4198,9 +4247,11 @@
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
}
- SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
- if (Val.getNode())
- return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
+ if (usesOnlyOneValue) {
+ SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
+ if (isConstant && Val.getNode())
+ return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
+ }
}
// If all elements are constants and the case above didn't get hit, fall back
@@ -5418,7 +5469,7 @@
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
const TargetRegisterClass *TRC = isThumb2 ?
- (const TargetRegisterClass*)&ARM::tGPRRegClass :
+ (const TargetRegisterClass*)&ARM::rGPRRegClass :
(const TargetRegisterClass*)&ARM::GPRRegClass;
unsigned scratch = MRI.createVirtualRegister(TRC);
unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
@@ -5529,7 +5580,7 @@
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
const TargetRegisterClass *TRC = isThumb2 ?
- (const TargetRegisterClass*)&ARM::tGPRRegClass :
+ (const TargetRegisterClass*)&ARM::rGPRRegClass :
(const TargetRegisterClass*)&ARM::GPRRegClass;
unsigned scratch = MRI.createVirtualRegister(TRC);
unsigned scratch2 = MRI.createVirtualRegister(TRC);
@@ -7193,6 +7244,154 @@
return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp);
}
+static SDValue findMUL_LOHI(SDValue V) {
+ if (V->getOpcode() == ISD::UMUL_LOHI ||
+ V->getOpcode() == ISD::SMUL_LOHI)
+ return V;
+ return SDValue();
+}
+
+static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+
+ if (Subtarget->isThumb1Only()) return SDValue();
+
+ // Only perform the checks after legalize when the pattern is available.
+ if (DCI.isBeforeLegalize()) return SDValue();
+
+ // Look for multiply add opportunities.
+ // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
+ // each add node consumes a value from ISD::UMUL_LOHI and there is
+ // a glue link from the first add to the second add.
+ // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
+ // a S/UMLAL instruction.
+ //      loAdd   UMUL_LOHI
+ //        \    / :lo    \ :hi
+ //         \  /          \          [no multiline comment]
+ //          ADDC         |  hiAdd
+ //             \ :glue  /  /
+ //              \      /  /
+ //               ADDE
+ //
+ assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
+ SDValue AddcOp0 = AddcNode->getOperand(0);
+ SDValue AddcOp1 = AddcNode->getOperand(1);
+
+ // Check if the two operands are from the same mul_lohi node.
+ if (AddcOp0.getNode() == AddcOp1.getNode())
+ return SDValue();
+
+ assert(AddcNode->getNumValues() == 2 &&
+ AddcNode->getValueType(0) == MVT::i32 &&
+ AddcNode->getValueType(1) == MVT::Glue &&
+ "Expect ADDC with two result values: i32, glue");
+
+ // Check that the ADDC adds the low result of the S/UMUL_LOHI.
+ if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
+ AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
+ AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
+ AddcOp1->getOpcode() != ISD::SMUL_LOHI)
+ return SDValue();
+
+ // Look for the glued ADDE.
+ SDNode* AddeNode = AddcNode->getGluedUser();
+ if (AddeNode == NULL)
+ return SDValue();
+
+ // Make sure it is really an ADDE.
+ if (AddeNode->getOpcode() != ISD::ADDE)
+ return SDValue();
+
+ assert(AddeNode->getNumOperands() == 3 &&
+ AddeNode->getOperand(2).getValueType() == MVT::Glue &&
+ "ADDE node has the wrong inputs");
+
+ // Check for the triangle shape.
+ SDValue AddeOp0 = AddeNode->getOperand(0);
+ SDValue AddeOp1 = AddeNode->getOperand(1);
+
+ // Make sure that the ADDE operands are not coming from the same node.
+ if (AddeOp0.getNode() == AddeOp1.getNode())
+ return SDValue();
+
+ // Find the MUL_LOHI node walking up ADDE's operands.
+ bool IsLeftOperandMUL = false;
+ SDValue MULOp = findMUL_LOHI(AddeOp0);
+ if (MULOp == SDValue())
+ MULOp = findMUL_LOHI(AddeOp1);
+ else
+ IsLeftOperandMUL = true;
+ if (MULOp == SDValue())
+ return SDValue();
+
+ // Figure out the right opcode.
+ unsigned Opc = MULOp->getOpcode();
+ unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
+
+ // Figure out the high and low input values to the MLAL node.
+ SDValue* HiMul = &MULOp;
+ SDValue* HiAdd = NULL;
+ SDValue* LoMul = NULL;
+ SDValue* LowAdd = NULL;
+
+ if (IsLeftOperandMUL)
+ HiAdd = &AddeOp1;
+ else
+ HiAdd = &AddeOp0;
+
+
+ if (AddcOp0->getOpcode() == Opc) {
+ LoMul = &AddcOp0;
+ LowAdd = &AddcOp1;
+ }
+ if (AddcOp1->getOpcode() == Opc) {
+ LoMul = &AddcOp1;
+ LowAdd = &AddcOp0;
+ }
+
+ if (LoMul == NULL)
+ return SDValue();
+
+ if (LoMul->getNode() != HiMul->getNode())
+ return SDValue();
+
+ // Create the merged node.
+ SelectionDAG &DAG = DCI.DAG;
+
+ // Build operand list.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(LoMul->getOperand(0));
+ Ops.push_back(LoMul->getOperand(1));
+ Ops.push_back(*LowAdd);
+ Ops.push_back(*HiAdd);
+
+ SDValue MLALNode = DAG.getNode(FinalOpc, AddcNode->getDebugLoc(),
+ DAG.getVTList(MVT::i32, MVT::i32),
+ &Ops[0], Ops.size());
+
+ // Replace the ADD nodes' uses with the MLAL node's values.
+ SDValue HiMLALResult(MLALNode.getNode(), 1);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+
+ SDValue LoMLALResult(MLALNode.getNode(), 0);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+
+ // Return original node to notify the driver to stop replacing.
+ SDValue resNode(AddcNode, 0);
+ return resNode;
+}
+
+/// PerformADDCCombine - Target-specific dag combine transform from
+/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL.
+static SDValue PerformADDCCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ return AddCombineTo64bitMLAL(N, DCI, Subtarget);
+}
+
/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
@@ -8764,6 +8963,7 @@
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
default: break;
+ case ISD::ADDC: return PerformADDCCombine(N, DCI, Subtarget);
case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
case ISD::SUB: return PerformSUBCombine(N, DCI);
case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
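What the new ISD::ADDC entry ultimately recognizes, written as source: a 64-bit accumulation of a 32x32 widening multiply, where the low half feeds the ADDC and the high half the glued ADDE. With the combine in place this should select a single UMLAL (SMLAL for the signed form) instead of a separate multiply and two adds:

    #include <cstdint>

    uint64_t umlal(uint32_t A, uint32_t B, uint64_t Acc) {
      // lo(A*B) + lo(Acc) -> ADDC;  hi(A*B) + hi(Acc) + carry -> ADDE
      return Acc + (uint64_t)A * B;
    }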
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 13b83de..2b8f382 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -173,6 +173,9 @@
VMULLs, // ...signed
VMULLu, // ...unsigned
+ UMLAL, // 64bit Unsigned Accumulate Multiply
+ SMLAL, // 64bit Signed Accumulate Multiply
+
// Operands of the standard BUILD_VECTOR node are not legalized, which
// is fine if BUILD_VECTORs are always lowered to shuffles or other
// operations, but for ARM some BUILD_VECTORs are legal as-is and their
@@ -257,6 +260,11 @@
virtual const char *getTargetNodeName(unsigned Opcode) const;
+ virtual bool isSelectSupported(SelectSupportKind Kind) const {
+ // ARM does not support scalar condition selects on vectors.
+ return (Kind != ScalarCondVectorVal);
+ }
+
/// getSetCCResultType - Return the value type to use for ISD::SETCC.
virtual EVT getSetCCResultType(EVT VT) const;
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 992aba5..e23989e 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -83,6 +83,13 @@
SDTCisInt<0>,
SDTCisVT<1, i32>,
SDTCisVT<4, i32>]>;
+
+def SDT_ARM64bitmlal : SDTypeProfile<2,4, [ SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>,
+ SDTCisVT<4, i32>, SDTCisVT<5, i32> ] >;
+def ARMUmlal : SDNode<"ARMISD::UMLAL", SDT_ARM64bitmlal>;
+def ARMSmlal : SDNode<"ARMISD::SMLAL", SDT_ARM64bitmlal>;
+
// Node definitions.
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>;
def ARMWrapperDYN : SDNode<"ARMISD::WrapperDYN", SDTIntUnaryOp>;
@@ -90,9 +97,10 @@
def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntBinOp>;
def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
- [SDNPHasChain, SDNPOutGlue]>;
+ [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+ [SDNPHasChain, SDNPSideEffect,
+ SDNPOptInGlue, SDNPOutGlue]>;
def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" ,
SDT_ARMStructByVal,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue,
@@ -148,14 +156,16 @@
def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP",
- SDT_ARMEH_SJLJ_Setjmp, [SDNPHasChain]>;
+ SDT_ARMEH_SJLJ_Setjmp,
+ [SDNPHasChain, SDNPSideEffect]>;
def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP",
- SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain]>;
+ SDT_ARMEH_SJLJ_Longjmp,
+ [SDNPHasChain, SDNPSideEffect]>;
def ARMMemBarrier : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER,
- [SDNPHasChain]>;
+ [SDNPHasChain, SDNPSideEffect]>;
def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER,
- [SDNPHasChain]>;
+ [SDNPHasChain, SDNPSideEffect]>;
def ARMPreload : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH,
[SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
@@ -275,7 +285,7 @@
def so_imm_neg_asmoperand : AsmOperandClass { let Name = "ARMSOImmNeg"; }
def so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
- int64_t Value = -(int)N->getZExtValue();
+ unsigned Value = -(unsigned)N->getZExtValue();
return Value && ARM_AM::getSOImmVal(Value) != -1;
}], imm_neg_XFORM> {
let ParserMatchClass = so_imm_neg_asmoperand;
@@ -1791,12 +1801,15 @@
let Inst{15-12} = Rd;
let Inst{11-0} = label{11-0};
}
+
+let hasSideEffects = 1 in {
def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p),
4, IIC_iALUi, []>;
def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd),
(ins i32imm:$label, nohash_imm:$id, pred:$p),
4, IIC_iALUi, []>;
+}
//===----------------------------------------------------------------------===//
// Control Flow Instructions.
@@ -3399,6 +3412,18 @@
let Inst{11-8} = Rm;
let Inst{3-0} = Rn;
}
+class AsMla1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+ bits<4> RdLo;
+ bits<4> RdHi;
+ bits<4> Rm;
+ bits<4> Rn;
+ let Inst{19-16} = RdHi;
+ let Inst{15-12} = RdLo;
+ let Inst{11-8} = Rm;
+ let Inst{3-0} = Rn;
+}
// FIXME: The v5 pseudos are only necessary for the additional Constraint
// property. Remove them when it's possible to add those properties
@@ -3481,14 +3506,14 @@
}
// Multiply + accumulate
-def SMLAL : AsMul1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
+def SMLAL : AsMla1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
"smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV6]>;
-def UMLAL : AsMul1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
"umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV6]>;
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
@@ -3504,17 +3529,22 @@
let Inst{3-0} = Rn;
}
-let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
+let Constraints = "$RLo = $RdLo,$RHi = $RdHi" in {
def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
4, IIC_iMAC64, [],
- (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
+ pred:$p, cc_out:$s)>,
Requires<[IsARM, NoV6]>;
def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
4, IIC_iMAC64, [],
- (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
+ pred:$p, cc_out:$s)>,
Requires<[IsARM, NoV6]>;
+}
+
+let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
def UMAALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, pred:$p),
4, IIC_iMAC64, [],
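Note: SMLAL/UMLAL read the 64-bit accumulator they write, which is why the hunks above grow explicit $RLo/$RHi inputs tied to $RdLo/$RdHi instead of the old @earlyclobber outputs (the UMAALv5 pseudo keeps the earlyclobber form). A rough C++ rendering of the SMLAL semantics (illustrative names, not LLVM API):

  #include <cstdint>

  // {RdHi:RdLo} += Rn * Rm -- the accumulator is both read and written.
  int64_t smlal(int64_t acc, int32_t rn, int32_t rm) {
    return acc + (int64_t)rn * (int64_t)rm;
  }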
@@ -3986,48 +4016,6 @@
[/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>,
RegConstraint<"$false = $Rd">;
-// Conditional instructions
-multiclass AsI1_bincc_irs<Instruction iri, Instruction irr, Instruction irsi,
- Instruction irsr,
- InstrItinClass iii, InstrItinClass iir,
- InstrItinClass iis> {
- def ri : ARMPseudoExpand<(outs GPR:$Rd),
- (ins GPR:$Rfalse, GPR:$Rn, so_imm:$imm,
- pred:$p, cc_out:$s),
- 4, iii, [],
- (iri GPR:$Rd, GPR:$Rn, so_imm:$imm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
- def rr : ARMPseudoExpand<(outs GPR:$Rd),
- (ins GPR:$Rfalse, GPR:$Rn, GPR:$Rm,
- pred:$p, cc_out:$s),
- 4, iir, [],
- (irr GPR:$Rd, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
- def rsi : ARMPseudoExpand<(outs GPR:$Rd),
- (ins GPR:$Rfalse, GPR:$Rn, so_reg_imm:$shift,
- pred:$p, cc_out:$s),
- 4, iis, [],
- (irsi GPR:$Rd, GPR:$Rn, so_reg_imm:$shift, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
- def rsr : ARMPseudoExpand<(outs GPRnopc:$Rd),
- (ins GPRnopc:$Rfalse, GPRnopc:$Rn, so_reg_reg:$shift,
- pred:$p, cc_out:$s),
- 4, iis, [],
- (irsr GPR:$Rd, GPR:$Rn, so_reg_reg:$shift, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
-}
-
-defm ANDCC : AsI1_bincc_irs<ANDri, ANDrr, ANDrsi, ANDrsr,
- IIC_iBITi, IIC_iBITr, IIC_iBITsr>;
-defm ORRCC : AsI1_bincc_irs<ORRri, ORRrr, ORRrsi, ORRrsr,
- IIC_iBITi, IIC_iBITr, IIC_iBITsr>;
-defm EORCC : AsI1_bincc_irs<EORri, EORrr, EORrsi, EORrsr,
- IIC_iBITi, IIC_iBITr, IIC_iBITsr>;
-defm ADDCC : AsI1_bincc_irs<ADDri, ADDrr, ADDrsi, ADDrsr,
- IIC_iBITi, IIC_iBITr, IIC_iBITsr>;
-defm SUBCC : AsI1_bincc_irs<SUBri, SUBrr, SUBrsi, SUBrsr,
- IIC_iBITi, IIC_iBITr, IIC_iBITsr>;
-
} // neverHasSideEffects
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 048d340..8158a11 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -1980,7 +1980,7 @@
def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16,
NEONvgetlaneu, addrmode6> {
let Inst{7-6} = lane{1-0};
- let Inst{4} = Rn{5};
+ let Inst{4} = Rn{4};
}
def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt,
@@ -2023,7 +2023,7 @@
def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16,
NEONvgetlaneu, addrmode6> {
let Inst{7-6} = lane{1-0};
- let Inst{4} = Rn{5};
+ let Inst{4} = Rn{4};
}
def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store,
extractelt, addrmode6oneL32> {
@@ -5045,25 +5045,23 @@
GPR:$R, imm:$lane))]> {
let Inst{21} = lane{0};
}
+
+def VSETLNi8Q : PseudoNeonI<(outs QPR:$V),
+ (ins QPR:$src1, GPR:$R, VectorIndex8:$lane),
+ IIC_VMOVISL, "",
+ [(set QPR:$V, (vector_insert (v16i8 QPR:$src1),
+ GPR:$R, imm:$lane))]>;
+def VSETLNi16Q : PseudoNeonI<(outs QPR:$V),
+ (ins QPR:$src1, GPR:$R, VectorIndex16:$lane),
+ IIC_VMOVISL, "",
+ [(set QPR:$V, (vector_insert (v8i16 QPR:$src1),
+ GPR:$R, imm:$lane))]>;
}
-def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
- (v16i8 (INSERT_SUBREG QPR:$src1,
- (v8i8 (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1,
- (DSubReg_i8_reg imm:$lane))),
- GPR:$src2, (SubReg_i8_lane imm:$lane))),
- (DSubReg_i8_reg imm:$lane)))>;
-def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane),
- (v8i16 (INSERT_SUBREG QPR:$src1,
- (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
- (DSubReg_i16_reg imm:$lane))),
- GPR:$src2, (SubReg_i16_lane imm:$lane))),
- (DSubReg_i16_reg imm:$lane)))>;
+
def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane),
- (v4i32 (INSERT_SUBREG QPR:$src1,
- (v2i32 (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1,
- (DSubReg_i32_reg imm:$lane))),
- GPR:$src2, (SubReg_i32_lane imm:$lane))),
- (DSubReg_i32_reg imm:$lane)))>;
+ (v4i32 (INSERT_SUBREG QPR:$src1,
+ GPR:$src2,
+ (SSubReg_f32_reg imm:$lane)))>;
def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)),
(INSERT_SUBREG (v2f32 (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2)),
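Note: the deleted patterns above synthesized an 8- or 16-bit lane insert by extracting a D subregister, running VSETLN on it, and inserting it back; those cases now go through the new VSETLNi8Q/VSETLNi16Q pseudos, while the 32-bit case writes the lane directly because every 32-bit lane of a Q register is exactly one S subregister. A plain-C++ sketch of that last idea (illustrative, not LLVM code):

  #include <cstdint>

  // A Q register aliases four S registers (Q0 = {S0,S1,S2,S3}), so writing
  // 32-bit lane N is just a store into the N-th word.
  struct QReg { uint32_t s[4]; };

  void insert_lane_i32(QReg &q, unsigned lane, uint32_t r) {
    q.s[lane] = r;   // maps to INSERT_SUBREG with SSubReg_f32_reg
  }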
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 554f6d9..e171f8b 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -1200,6 +1200,7 @@
def tLEApcrel : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p),
2, IIC_iALUi, []>;
+let hasSideEffects = 1 in
def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
(ins i32imm:$label, nohash_imm:$id, pred:$p),
2, IIC_iALUi, []>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 8ecf009..f1a6cce 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -523,6 +523,23 @@
let Inst{7-4} = opc7_4;
let Inst{3-0} = Rm;
}
+class T2MlaLong<bits<3> opc22_20, bits<4> opc7_4,
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern> {
+ bits<4> RdLo;
+ bits<4> RdHi;
+ bits<4> Rn;
+ bits<4> Rm;
+
+ let Inst{31-23} = 0b111110111;
+ let Inst{22-20} = opc22_20;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = RdLo;
+ let Inst{11-8} = RdHi;
+ let Inst{7-4} = opc7_4;
+ let Inst{3-0} = Rm;
+}
/// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
@@ -757,33 +774,6 @@
let Inst{24} = 1;
let Inst{23-21} = op23_21;
}
-
- // Predicated versions.
- def CCri : t2PseudoExpand<(outs GPRnopc:$Rd),
- (ins GPRnopc:$Rfalse, GPRnopc:$Rn, t2_so_imm:$imm,
- pred:$p, cc_out:$s), 4, IIC_iALUi, [],
- (!cast<Instruction>(NAME#ri) GPRnopc:$Rd,
- GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
- def CCri12 : t2PseudoExpand<(outs GPRnopc:$Rd),
- (ins GPRnopc:$Rfalse, GPR:$Rn, imm0_4095:$imm,
- pred:$p),
- 4, IIC_iALUi, [],
- (!cast<Instruction>(NAME#ri12) GPRnopc:$Rd,
- GPR:$Rn, imm0_4095:$imm, pred:$p)>,
- RegConstraint<"$Rfalse = $Rd">;
- def CCrr : t2PseudoExpand<(outs GPRnopc:$Rd),
- (ins GPRnopc:$Rfalse, GPRnopc:$Rn, rGPR:$Rm,
- pred:$p, cc_out:$s), 4, IIC_iALUr, [],
- (!cast<Instruction>(NAME#rr) GPRnopc:$Rd,
- GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
- def CCrs : t2PseudoExpand<(outs GPRnopc:$Rd),
- (ins GPRnopc:$Rfalse, GPRnopc:$Rn, t2_so_reg:$Rm,
- pred:$p, cc_out:$s), 4, IIC_iALUsi, [],
- (!cast<Instruction>(NAME#rs) GPRnopc:$Rd,
- GPRnopc:$Rn, t2_so_reg:$Rm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
}
/// T2I_adde_sube_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns
@@ -1200,6 +1190,7 @@
let neverHasSideEffects = 1, isReMaterializable = 1 in
def t2LEApcrel : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, pred:$p),
4, IIC_iALUi, []>;
+let hasSideEffects = 1 in
def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd),
(ins i32imm:$label, nohash_imm:$id, pred:$p),
4, IIC_iALUi,
@@ -2437,15 +2428,17 @@
} // isCommutable
// Multiply + accumulate
-def t2SMLAL : T2MulLong<0b100, 0b0000,
+def t2SMLAL : T2MlaLong<0b100, 0b0000,
(outs rGPR:$RdLo, rGPR:$RdHi),
- (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64,
- "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
+ (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
+ "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">;
-def t2UMLAL : T2MulLong<0b110, 0b0000,
+def t2UMLAL : T2MlaLong<0b110, 0b0000,
(outs rGPR:$RdLo, rGPR:$RdHi),
- (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64,
- "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
+ (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
+ "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">;
def t2UMAAL : T2MulLong<0b110, 0b0110,
(outs rGPR:$RdLo, rGPR:$RdHi),
@@ -3049,37 +3042,6 @@
RegConstraint<"$false = $Rd">;
} // isCodeGenOnly = 1
-multiclass T2I_bincc_irs<Instruction iri, Instruction irr, Instruction irs,
- InstrItinClass iii, InstrItinClass iir, InstrItinClass iis> {
- // shifted imm
- def ri : t2PseudoExpand<(outs rGPR:$Rd),
- (ins rGPR:$Rfalse, rGPR:$Rn, t2_so_imm:$imm,
- pred:$p, cc_out:$s),
- 4, iii, [],
- (iri rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
- // register
- def rr : t2PseudoExpand<(outs rGPR:$Rd),
- (ins rGPR:$Rfalse, rGPR:$Rn, rGPR:$Rm,
- pred:$p, cc_out:$s),
- 4, iir, [],
- (irr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
- // shifted register
- def rs : t2PseudoExpand<(outs rGPR:$Rd),
- (ins rGPR:$Rfalse, rGPR:$Rn, t2_so_reg:$ShiftedRm,
- pred:$p, cc_out:$s),
- 4, iis, [],
- (irs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
-} // T2I_bincc_irs
-
-defm t2ANDCC : T2I_bincc_irs<t2ANDri, t2ANDrr, t2ANDrs,
- IIC_iBITi, IIC_iBITr, IIC_iBITsi>;
-defm t2ORRCC : T2I_bincc_irs<t2ORRri, t2ORRrr, t2ORRrs,
- IIC_iBITi, IIC_iBITr, IIC_iBITsi>;
-defm t2EORCC : T2I_bincc_irs<t2EORri, t2EORrr, t2EORrs,
- IIC_iBITi, IIC_iBITr, IIC_iBITsi>;
} // neverHasSideEffects
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp
index a812e21..2dc49a9 100644
--- a/lib/Target/ARM/ARMJITInfo.cpp
+++ b/lib/Target/ARM/ARMJITInfo.cpp
@@ -169,7 +169,7 @@
intptr_t LazyPtr = getIndirectSymAddr(Fn);
if (!LazyPtr) {
// In PIC mode, the function stub is loading a lazy-ptr.
- LazyPtr= (intptr_t)emitGlobalValueIndirectSym((GlobalValue*)F, Fn, JCE);
+ LazyPtr= (intptr_t)emitGlobalValueIndirectSym((const GlobalValue*)F, Fn, JCE);
DEBUG(if (F)
errs() << "JIT: Indirect symbol emitted at [" << LazyPtr
<< "] for GV '" << F->getName() << "'\n";
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 3a5957b..e1e2f6e 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -181,49 +181,44 @@
OperandMatchResultTy parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index);
// Asm Match Converter Methods
- bool cvtT2LdrdPre(MCInst &Inst, unsigned Opcode,
- const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtT2StrdPre(MCInst &Inst, unsigned Opcode,
- const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+ void cvtT2LdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+ void cvtT2StrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+ void cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+ void cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+ void cvtLdWriteBackRegAddrMode2(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+ void cvtLdWriteBackRegAddrModeImm12(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtStWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+ void cvtStWriteBackRegAddrModeImm12(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+ void cvtStWriteBackRegAddrMode2(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+ void cvtStWriteBackRegAddrMode3(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+ void cvtLdExtTWriteBackImm(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+ void cvtLdExtTWriteBackReg(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+ void cvtStExtTWriteBackImm(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+ void cvtStExtTWriteBackReg(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtLdrdPre(MCInst &Inst, unsigned Opcode,
- const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtStrdPre(MCInst &Inst, unsigned Opcode,
- const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+ void cvtLdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+ void cvtStrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+ void cvtLdWriteBackRegAddrMode3(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtThumbMultiply(MCInst &Inst, unsigned Opcode,
+ void cvtThumbMultiply(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtVLDwbFixed(MCInst &Inst, unsigned Opcode,
+ void cvtVLDwbFixed(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtVLDwbRegister(MCInst &Inst, unsigned Opcode,
+ void cvtVLDwbRegister(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtVSTwbFixed(MCInst &Inst, unsigned Opcode,
+ void cvtVSTwbFixed(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtVSTwbRegister(MCInst &Inst, unsigned Opcode,
+ void cvtVSTwbRegister(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
-
bool validateInstruction(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
bool processInstruction(MCInst &Inst,
@@ -267,6 +262,12 @@
bool MatchAndEmitInstruction(SMLoc IDLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out);
+
+ unsigned getMCInstOperandNum(unsigned Kind, MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ unsigned OperandNum, unsigned &NumMCOperands) {
+ return getMCInstOperandNumImpl(Kind, Inst, Operands, OperandNum, NumMCOperands);
+ }
};
} // end anonymous namespace
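Note: every converter above now returns void instead of bool and drops the unused Opcode parameter. Conversion can no longer fail, so the diagnostic previously emitted from cvtThumbMultiply moves into validateInstruction (see the new ARM::tMUL case further down), the Match_ConversionFail handling disappears from MatchAndEmitInstruction, and the new getMCInstOperandNum hook plus the extra Kind argument to MatchInstructionImpl come from the updated TableGen'd matcher interface.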
@@ -3880,8 +3881,8 @@
/// cvtT2LdrdPre - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtT2LdrdPre(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtT2LdrdPre(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Rt, Rt2
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -3892,14 +3893,13 @@
((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtT2StrdPre - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtT2StrdPre(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtT2StrdPre(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateReg(0));
@@ -3910,14 +3910,13 @@
((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtLdWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -3926,28 +3925,26 @@
((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtStWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtLdWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdWriteBackRegAddrMode2(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -3956,14 +3953,13 @@
((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtLdWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdWriteBackRegAddrModeImm12(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -3972,57 +3968,53 @@
((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtStWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStWriteBackRegAddrModeImm12(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtStWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStWriteBackRegAddrMode2(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtStWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStWriteBackRegAddrMode3(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtLdExtTWriteBackImm - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdExtTWriteBackImm(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Rt
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -4034,14 +4026,13 @@
((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtLdExtTWriteBackReg - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdExtTWriteBackReg(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Rt
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -4053,14 +4044,13 @@
((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtStExtTWriteBackImm - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStExtTWriteBackImm(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
@@ -4072,14 +4062,13 @@
((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtStExtTWriteBackReg - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStExtTWriteBackReg(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
@@ -4091,14 +4080,13 @@
((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtLdrdPre - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdrdPre(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdrdPre(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Rt, Rt2
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -4109,14 +4097,13 @@
((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtStrdPre - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStrdPre(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStrdPre(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
@@ -4127,40 +4114,27 @@
((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtLdWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdWriteBackRegAddrMode3(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
-/// cvtThumbMultiple- Convert parsed operands to MCInst.
+/// cvtThumbMultiply - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtThumbMultiply(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtThumbMultiply(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- // The second source operand must be the same register as the destination
- // operand.
- if (Operands.size() == 6 &&
- (((ARMOperand*)Operands[3])->getReg() !=
- ((ARMOperand*)Operands[5])->getReg()) &&
- (((ARMOperand*)Operands[3])->getReg() !=
- ((ARMOperand*)Operands[4])->getReg())) {
- Error(Operands[3]->getStartLoc(),
- "destination register must match source register");
- return false;
- }
((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
((ARMOperand*)Operands[1])->addCCOutOperands(Inst, 1);
// If we have a three-operand form, make sure to set Rn to be the operand
@@ -4173,12 +4147,10 @@
((ARMOperand*)Operands[RegOp])->addRegOperands(Inst, 1);
Inst.addOperand(Inst.getOperand(0));
((ARMOperand*)Operands[2])->addCondCodeOperands(Inst, 2);
-
- return true;
}
-bool ARMAsmParser::
-cvtVLDwbFixed(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtVLDwbFixed(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Vd
((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
@@ -4188,11 +4160,10 @@
((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
-bool ARMAsmParser::
-cvtVLDwbRegister(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtVLDwbRegister(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Vd
((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
@@ -4204,11 +4175,10 @@
((ARMOperand*)Operands[5])->addRegOperands(Inst, 1);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
-bool ARMAsmParser::
-cvtVSTwbFixed(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtVSTwbFixed(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
@@ -4218,11 +4188,10 @@
((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
-bool ARMAsmParser::
-cvtVSTwbRegister(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtVSTwbRegister(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
@@ -4234,7 +4203,6 @@
((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// Parse an ARM memory expression; return false if successful, true otherwise
@@ -5377,6 +5345,25 @@
"in register list");
break;
}
+ case ARM::tMUL: {
+ // The second source operand must be the same register as the destination
+ // operand.
+ //
+ // In this case, we must directly check the parsed operands because the
+ // cvtThumbMultiply() function is written in such a way that it guarantees
+ // this first statement is always true for the new Inst. Essentially, the
+ // destination is unconditionally copied into the second source operand
+ // without checking to see if it matches what we actually parsed.
+ if (Operands.size() == 6 &&
+ (((ARMOperand*)Operands[3])->getReg() !=
+ ((ARMOperand*)Operands[5])->getReg()) &&
+ (((ARMOperand*)Operands[3])->getReg() !=
+ ((ARMOperand*)Operands[4])->getReg())) {
+ return Error(Operands[3]->getStartLoc(),
+ "destination register must match source register");
+ }
+ break;
+ }
// Like for ldm/stm, push and pop have hi-reg handling version in Thumb2,
// so only issue a diagnostic for thumb1. The instructions will be
// switched to the t2 encodings in processInstruction() if necessary.
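Note: to make the new tMUL case concrete, Thumb1 MUL is destructive, so e.g. "muls r0, r1, r0" is accepted (the destination matches a source register), while "muls r0, r1, r2" now draws the "destination register must match source register" diagnostic at validation time rather than during operand conversion.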
@@ -7475,9 +7462,11 @@
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out) {
MCInst Inst;
+ unsigned Kind;
unsigned ErrorInfo;
unsigned MatchResult;
- MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo);
+
+ MatchResult = MatchInstructionImpl(Operands, Kind, Inst, ErrorInfo);
switch (MatchResult) {
default: break;
case Match_Success:
@@ -7540,9 +7529,6 @@
case Match_MnemonicFail:
return Error(IDLoc, "invalid instruction",
((ARMOperand*)Operands[0])->getLocRange());
- case Match_ConversionFail:
- // The converter function will have already emitted a diagnostic.
- return true;
case Match_RequiresNotITBlock:
return Error(IDLoc, "flag setting instruction only valid outside IT block");
case Match_RequiresITBlock:
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index c90751d0..57642e1 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -2701,6 +2701,8 @@
unsigned align = fieldFromInstruction(Insn, 4, 1);
unsigned size = fieldFromInstruction(Insn, 6, 2);
+ if (size == 0 && align == 1)
+ return MCDisassembler::Fail;
align *= (1 << size);
switch (Inst.getOpcode()) {
@@ -2831,6 +2833,8 @@
unsigned align = fieldFromInstruction(Insn, 4, 1);
if (size == 0x3) {
+ if (align == 0)
+ return MCDisassembler::Fail;
size = 4;
align = 16;
} else {
@@ -3170,7 +3174,7 @@
int imm = Val & 0xFF;
if (!(Val & 0x100)) imm *= -1;
- Inst.addOperand(MCOperand::CreateImm(imm << 2));
+ Inst.addOperand(MCOperand::CreateImm(imm * 4));
}
return MCDisassembler::Success;
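Note: the imm * 4 change above avoids left-shifting a negative value, which is undefined behavior in C++, whereas the multiplication is well-defined over the whole decoded range. A standalone sketch:

  #include <cassert>

  int main() {
    int imm = -5;            // the decoded offset can be negative here
    int scaled = imm * 4;    // well-defined: -20
    // int bad = imm << 2;   // undefined behavior for negative imm
    assert(scaled == -20);
    return 0;
  }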
@@ -3710,8 +3714,16 @@
if (fieldFromInstruction(Insn, 6, 1))
return MCDisassembler::Fail; // UNDEFINED
index = fieldFromInstruction(Insn, 7, 1);
- if (fieldFromInstruction(Insn, 4, 2) != 0)
- align = 4;
+
+ switch (fieldFromInstruction(Insn, 4, 2)) {
+ case 0 :
+ align = 0; break;
+ case 3:
+ align = 4; break;
+ default:
+ return MCDisassembler::Fail;
+ }
+ break;
}
if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
@@ -3769,8 +3781,16 @@
if (fieldFromInstruction(Insn, 6, 1))
return MCDisassembler::Fail; // UNDEFINED
index = fieldFromInstruction(Insn, 7, 1);
- if (fieldFromInstruction(Insn, 4, 2) != 0)
- align = 4;
+
+ switch (fieldFromInstruction(Insn, 4, 2)) {
+ case 0:
+ align = 0; break;
+ case 3:
+ align = 4; break;
+ default:
+ return MCDisassembler::Fail;
+ }
+ break;
}
if (Rm != 0xF) { // Writeback
@@ -4090,8 +4110,15 @@
inc = 2;
break;
case 2:
- if (fieldFromInstruction(Insn, 4, 2))
- align = 4 << fieldFromInstruction(Insn, 4, 2);
+ switch (fieldFromInstruction(Insn, 4, 2)) {
+ case 0:
+ align = 0; break;
+ case 3:
+ return MCDisassembler::Fail;
+ default:
+ align = 4 << fieldFromInstruction(Insn, 4, 2); break;
+ }
+
index = fieldFromInstruction(Insn, 7, 1);
if (fieldFromInstruction(Insn, 6, 1))
inc = 2;
@@ -4164,8 +4191,15 @@
inc = 2;
break;
case 2:
- if (fieldFromInstruction(Insn, 4, 2))
- align = 4 << fieldFromInstruction(Insn, 4, 2);
+ switch (fieldFromInstruction(Insn, 4, 2)) {
+ case 0:
+ align = 0; break;
+ case 3:
+ return MCDisassembler::Fail;
+ default:
+ align = 4 << fieldFromInstruction(Insn, 4, 2); break;
+ }
+
index = fieldFromInstruction(Insn, 7, 1);
if (fieldFromInstruction(Insn, 6, 1))
inc = 2;
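Note: in both hunks above (the 32-bit element case), an index_align field of 3 is a reserved encoding, so the decoder now returns Fail for it instead of computing an alignment of 4 << 3 = 32, and a field of 0 now explicitly selects no alignment.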
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 7d6acbc..b53da3b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -194,6 +194,10 @@
case ARM::fixup_arm_uncondbranch:
Type = ELF::R_ARM_JUMP24;
break;
+ case ARM::fixup_t2_condbranch:
+ case ARM::fixup_t2_uncondbranch:
+ Type = ELF::R_ARM_THM_JUMP24;
+ break;
case ARM::fixup_arm_movt_hi16:
case ARM::fixup_arm_movt_hi16_pcrel:
Type = ELF::R_ARM_MOVT_PREL;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index d32805e..c1aab9c 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -50,7 +50,6 @@
Code32Directive = ".code\t32";
WeakRefDirective = "\t.weak\t";
- LCOMMDirectiveType = LCOMM::NoAlignment;
HasLEB128 = true;
SupportsDebugInformation = true;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 94f1082..1917564 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -783,7 +783,7 @@
// Immediate is always encoded as positive. The 'U' bit controls add vs sub.
if (Imm8 < 0)
- Imm8 = -Imm8;
+ Imm8 = -(uint32_t)Imm8;
// Scaled by 4.
Imm8 /= 4;
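Note: this mirrors the so_imm_neg fix earlier in the patch -- negating in unsigned arithmetic keeps the Imm8 == INT32_MIN case free of signed-overflow undefined behavior.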
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index a51e0fa..95640f7 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -410,7 +410,7 @@
if (Type == macho::RIT_ARM_Half) {
// The other-half value only gets populated for the movt and movw
// relocation entries.
- uint32_t Value = 0;;
+ uint32_t Value = 0;
switch ((unsigned)Fixup.getKind()) {
default: break;
case ARM::fixup_arm_movw_lo16:
diff --git a/lib/Target/CellSPU/SPUAsmPrinter.cpp b/lib/Target/CellSPU/SPUAsmPrinter.cpp
index 03d5a9a..3396e8b 100644
--- a/lib/Target/CellSPU/SPUAsmPrinter.cpp
+++ b/lib/Target/CellSPU/SPUAsmPrinter.cpp
@@ -130,8 +130,7 @@
void
printS10ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O)
{
- short value = (short) (((int) MI->getOperand(OpNo).getImm() << 16)
- >> 16);
+ short value = MI->getOperand(OpNo).getImm();
assert((value >= -(1 << 9) && value <= (1 << 9) - 1)
&& "Invalid s10 argument");
O << value;
@@ -140,8 +139,7 @@
void
printU10ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O)
{
- short value = (short) (((int) MI->getOperand(OpNo).getImm() << 16)
- >> 16);
+ short value = MI->getOperand(OpNo).getImm();
assert((value <= (1 << 10) - 1) && "Invalid u10 argument");
O << value;
}
diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
index c27caea..425371d 100644
--- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
+++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
@@ -83,12 +83,10 @@
return true;
} else if (vt == MVT::i32) {
int32_t i_val = (int32_t) CN->getZExtValue();
- short s_val = (short) i_val;
- return i_val == s_val;
+ return i_val == SignExtend32<16>(i_val);
} else {
int64_t i_val = (int64_t) CN->getZExtValue();
- short s_val = (short) i_val;
- return i_val == s_val;
+ return i_val == SignExtend64<16>(i_val);
}
}
@@ -99,9 +97,10 @@
EVT vt = FPN->getValueType(0);
if (vt == MVT::f32) {
int val = FloatToBits(FPN->getValueAPF().convertToFloat());
- int sval = (int) ((val << 16) >> 16);
- Imm = (short) val;
- return val == sval;
+ if (val == SignExtend32<16>(val)) {
+ Imm = (short) val;
+ return true;
+ }
}
return false;
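Note: both CellSPU hunks replace the hand-rolled "(x << 16) >> 16" idiom with LLVM's SignExtend32/SignExtend64 helpers from Support/MathExtras.h, which are essentially the following (a sketch, modulo the exact LLVM source):

  #include <cstdint>

  // Sign-extend the bottom B bits of X to a full 32-bit signed value.
  template <unsigned B> int32_t signExtend32(uint32_t X) {
    return int32_t(X << (32 - B)) >> (32 - B);
  }

  // A value fits a signed 16-bit field iff it round-trips:
  //   i_val == signExtend32<16>(i_val)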
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index 1f2d8ac..306084b 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -16,6 +16,7 @@
HexagonExpandPredSpillCode.cpp
HexagonFrameLowering.cpp
HexagonHardwareLoops.cpp
+ HexagonMachineScheduler.cpp
HexagonMCInstLower.cpp
HexagonInstrInfo.cpp
HexagonISelDAGToDAG.cpp
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
new file mode 100644
index 0000000..b131a8f
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -0,0 +1,952 @@
+//===- HexagonMachineScheduler.cpp - MI Scheduler for Hexagon -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineScheduler schedules machine instructions after phi elimination. It
+// preserves LiveIntervals so it can be invoked before register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "misched"
+
+#include "HexagonMachineScheduler.h"
+
+#include <queue>
+
+using namespace llvm;
+
+static cl::opt<bool> ForceTopDown("vliw-misched-topdown", cl::Hidden,
+ cl::desc("Force top-down list scheduling"));
+static cl::opt<bool> ForceBottomUp("vliw-misched-bottomup", cl::Hidden,
+ cl::desc("Force bottom-up list scheduling"));
+
+#ifndef NDEBUG
+static cl::opt<bool> ViewMISchedDAGs("vliw-view-misched-dags", cl::Hidden,
+ cl::desc("Pop up a window to show MISched dags after they are processed"));
+
+static cl::opt<unsigned> MISchedCutoff("vliw-misched-cutoff", cl::Hidden,
+ cl::desc("Stop scheduling after N instructions"), cl::init(~0U));
+#else
+static bool ViewMISchedDAGs = false;
+#endif // NDEBUG
+
+/// Decrement this iterator until reaching the top or a non-debug instr.
+static MachineBasicBlock::iterator
+priorNonDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Beg) {
+ assert(I != Beg && "reached the top of the region, cannot decrement");
+ while (--I != Beg) {
+ if (!I->isDebugValue())
+ break;
+ }
+ return I;
+}
+
+/// If this iterator is a debug value, increment until reaching the End or a
+/// non-debug instruction.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I, MachineBasicBlock::iterator End) {
+ for(; I != End; ++I) {
+ if (!I->isDebugValue())
+ break;
+ }
+ return I;
+}
+
+/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
+/// NumPredsLeft reaches zero, release the successor node.
+///
+/// FIXME: Adjust SuccSU height based on MinLatency.
+void VLIWMachineScheduler::releaseSucc(SUnit *SU, SDep *SuccEdge) {
+ SUnit *SuccSU = SuccEdge->getSUnit();
+
+#ifndef NDEBUG
+ if (SuccSU->NumPredsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ SuccSU->dump(this);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(0);
+ }
+#endif
+ --SuccSU->NumPredsLeft;
+ if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
+ SchedImpl->releaseTopNode(SuccSU);
+}
+
+/// releaseSuccessors - Call releaseSucc on each of SU's successors.
+void VLIWMachineScheduler::releaseSuccessors(SUnit *SU) {
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ releaseSucc(SU, &*I);
+ }
+}
+
+/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
+/// NumSuccsLeft reaches zero, release the predecessor node.
+///
+/// FIXME: Adjust PredSU height based on MinLatency.
+void VLIWMachineScheduler::releasePred(SUnit *SU, SDep *PredEdge) {
+ SUnit *PredSU = PredEdge->getSUnit();
+
+#ifndef NDEBUG
+ if (PredSU->NumSuccsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ PredSU->dump(this);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(0);
+ }
+#endif
+ --PredSU->NumSuccsLeft;
+ if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU)
+ SchedImpl->releaseBottomNode(PredSU);
+}
+
+/// releasePredecessors - Call releasePred on each of SU's predecessors.
+void VLIWMachineScheduler::releasePredecessors(SUnit *SU) {
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ releasePred(SU, &*I);
+ }
+}
+
+void VLIWMachineScheduler::moveInstruction(MachineInstr *MI,
+ MachineBasicBlock::iterator InsertPos) {
+ // Advance RegionBegin if the first instruction moves down.
+ if (&*RegionBegin == MI)
+ ++RegionBegin;
+
+ // Update the instruction stream.
+ BB->splice(InsertPos, BB, MI);
+
+ // Update LiveIntervals
+ LIS->handleMove(MI);
+
+ // Recede RegionBegin if an instruction moves above the first.
+ if (RegionBegin == InsertPos)
+ RegionBegin = MI;
+}
+
+bool VLIWMachineScheduler::checkSchedLimit() {
+#ifndef NDEBUG
+ if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) {
+ CurrentTop = CurrentBottom;
+ return false;
+ }
+ ++NumInstrsScheduled;
+#endif
+ return true;
+}
+
+/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
+/// crossing a scheduling boundary. [begin, end) includes all instructions in
+/// the region, including the boundary itself and single-instruction regions
+/// that don't get scheduled.
+void VLIWMachineScheduler::enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned endcount)
+{
+ ScheduleDAGInstrs::enterRegion(bb, begin, end, endcount);
+
+ // For convenience remember the end of the liveness region.
+ LiveRegionEnd =
+ (RegionEnd == bb->end()) ? RegionEnd : llvm::next(RegionEnd);
+}
+
+// Set up the register pressure trackers for the top and bottom scheduled
+// regions.
+void VLIWMachineScheduler::initRegPressure() {
+ TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
+ BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+
+ // Close the RPTracker to finalize live ins.
+ RPTracker.closeRegion();
+
+ DEBUG(RPTracker.getPressure().dump(TRI));
+
+ // Initialize the live ins and live outs.
+ TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+ BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+ // Close one end of the tracker so we can call
+ // getMaxUpward/DownwardPressureDelta before advancing across any
+ // instructions. This converts currently live regs into live ins/outs.
+ TopRPTracker.closeTop();
+ BotRPTracker.closeBottom();
+
+ // Account for liveness generated by the region boundary.
+ if (LiveRegionEnd != RegionEnd)
+ BotRPTracker.recede();
+
+ assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");
+
+ // Cache the list of excess pressure sets in this region. This will also track
+ // the max pressure in the scheduled code for these sets.
+ RegionCriticalPSets.clear();
+ std::vector<unsigned> RegionPressure = RPTracker.getPressure().MaxSetPressure;
+ for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
+ unsigned Limit = TRI->getRegPressureSetLimit(i);
+ DEBUG(dbgs() << TRI->getRegPressureSetName(i)
+ << "Limit " << Limit
+ << " Actual " << RegionPressure[i] << "\n");
+ if (RegionPressure[i] > Limit)
+ RegionCriticalPSets.push_back(PressureElement(i, 0));
+ }
+ DEBUG(dbgs() << "Excess PSets: ";
+ for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
+ dbgs() << TRI->getRegPressureSetName(
+ RegionCriticalPSets[i].PSetID) << " ";
+ dbgs() << "\n");
+
+ TotalPackets = 0;
+}
+
+// FIXME: When the pressure tracker deals in pressure differences, we won't
+// need to iterate over all RegionCriticalPSets[i].
+void VLIWMachineScheduler::
+updateScheduledPressure(std::vector<unsigned> NewMaxPressure) {
+ for (unsigned i = 0, e = RegionCriticalPSets.size(); i < e; ++i) {
+ unsigned ID = RegionCriticalPSets[i].PSetID;
+ int &MaxUnits = RegionCriticalPSets[i].UnitIncrease;
+ if ((int)NewMaxPressure[ID] > MaxUnits)
+ MaxUnits = NewMaxPressure[ID];
+ }
+}
+
+/// Check whether this SU can be scheduled in the current packet.
+/// The check is _not_ precise (it is stateful); it behaves more like
+/// another heuristic, and many corner cases were tuned empirically.
+bool VLIWResourceModel::isResourceAvailable(SUnit *SU) {
+ if (!SU || !SU->getInstr())
+ return false;
+
+ // First see if the pipeline could receive this instruction
+ // in the current cycle.
+ switch (SU->getInstr()->getOpcode()) {
+ default:
+ if (!ResourcesModel->canReserveResources(SU->getInstr()))
+ return false;
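+    // FALLTHROUGH - a reservable real instruction proceeds just like the
+    // pseudo opcodes below.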
+ case TargetOpcode::EXTRACT_SUBREG:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::COPY:
+ case TargetOpcode::INLINEASM:
+ break;
+ }
+
+ // Now see if there are no other dependencies to instructions already
+ // in the packet.
+ for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
+ if (Packet[i]->Succs.size() == 0)
+ continue;
+ for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(),
+ E = Packet[i]->Succs.end(); I != E; ++I) {
+ // Since we do not add pseudos to packets, might as well
+ // ignore order dependencies.
+ if (I->isCtrl())
+ continue;
+
+ if (I->getSUnit() == SU)
+ return false;
+ }
+ }
+ return true;
+}
+
+/// Reserve resources for SU; return true when this forces the start of a
+/// new packet/cycle.
+bool VLIWResourceModel::reserveResources(SUnit *SU) {
+ bool startNewCycle = false;
+  // If this SU does not fit in the packet, start a new one.
+ if (!isResourceAvailable(SU)) {
+ ResourcesModel->clearResources();
+ Packet.clear();
+ TotalPackets++;
+ startNewCycle = true;
+ }
+
+ switch (SU->getInstr()->getOpcode()) {
+ default:
+ ResourcesModel->reserveResources(SU->getInstr());
+ break;
+ case TargetOpcode::EXTRACT_SUBREG:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::PROLOG_LABEL:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::COPY:
+ case TargetOpcode::INLINEASM:
+ break;
+ }
+ Packet.push_back(SU);
+
+#ifndef NDEBUG
+ DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n");
+ for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
+ DEBUG(dbgs() << "\t[" << i << "] SU(");
+ DEBUG(dbgs() << Packet[i]->NodeNum << ")\t");
+ DEBUG(Packet[i]->getInstr()->dump());
+ }
+#endif
+
+ // If packet is now full, reset the state so in the next cycle
+ // we start fresh.
+ if (Packet.size() >= InstrItins->SchedModel->IssueWidth) {
+ ResourcesModel->clearResources();
+ Packet.clear();
+ TotalPackets++;
+ startNewCycle = true;
+ }
+
+ return startNewCycle;
+}
+
+// Release all DAG roots for scheduling.
+void VLIWMachineScheduler::releaseRoots() {
+ SmallVector<SUnit*, 16> BotRoots;
+
+ for (std::vector<SUnit>::iterator
+ I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
+ // A SUnit is ready to top schedule if it has no predecessors.
+ if (I->Preds.empty())
+ SchedImpl->releaseTopNode(&(*I));
+ // A SUnit is ready to bottom schedule if it has no successors.
+ if (I->Succs.empty())
+ BotRoots.push_back(&(*I));
+ }
+ // Release bottom roots in reverse order so the higher priority nodes appear
+ // first. This is more natural and slightly more efficient.
+ for (SmallVectorImpl<SUnit*>::const_reverse_iterator
+ I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I)
+ SchedImpl->releaseBottomNode(*I);
+}
+
+/// schedule - Called back from MachineScheduler::runOnMachineFunction
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
+void VLIWMachineScheduler::schedule() {
+ DEBUG(dbgs()
+ << "********** MI Converging Scheduling VLIW BB#" << BB->getNumber()
+ << " " << BB->getName()
+ << " in_func " << BB->getParent()->getFunction()->getName()
+ << " at loop depth " << MLI->getLoopDepth(BB)
+ << " \n");
+
+ // Initialize the register pressure tracker used by buildSchedGraph.
+ RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+
+  // Account for liveness generated by the region boundary.
+ if (LiveRegionEnd != RegionEnd)
+ RPTracker.recede();
+
+ // Build the DAG, and compute current register pressure.
+ buildSchedGraph(AA, &RPTracker);
+
+ // Initialize top/bottom trackers after computing region pressure.
+ initRegPressure();
+
+ // To view Height/Depth correctly, they should be accessed at least once.
+ DEBUG(unsigned maxH = 0;
+ for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+ if (SUnits[su].getHeight() > maxH)
+ maxH = SUnits[su].getHeight();
+ dbgs() << "Max Height " << maxH << "\n";);
+ DEBUG(unsigned maxD = 0;
+ for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+ if (SUnits[su].getDepth() > maxD)
+ maxD = SUnits[su].getDepth();
+ dbgs() << "Max Depth " << maxD << "\n";);
+ DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+ SUnits[su].dumpAll(this));
+
+ if (ViewMISchedDAGs) viewGraph();
+
+ SchedImpl->initialize(this);
+
+ // Release edges from the special Entry node or to the special Exit node.
+ releaseSuccessors(&EntrySU);
+ releasePredecessors(&ExitSU);
+
+ // Release all DAG roots for scheduling.
+ releaseRoots();
+
+ CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
+ CurrentBottom = RegionEnd;
+ bool IsTopNode = false;
+ while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
+ if (!checkSchedLimit())
+ break;
+
+ // Move the instruction to its new location in the instruction stream.
+ MachineInstr *MI = SU->getInstr();
+
+ if (IsTopNode) {
+ assert(SU->isTopReady() && "node still has unscheduled dependencies");
+ if (&*CurrentTop == MI)
+ CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
+ else {
+ moveInstruction(MI, CurrentTop);
+ TopRPTracker.setPos(MI);
+ }
+
+ // Update top scheduled pressure.
+ TopRPTracker.advance();
+ assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
+ updateScheduledPressure(TopRPTracker.getPressure().MaxSetPressure);
+
+ // Release dependent instructions for scheduling.
+ releaseSuccessors(SU);
+ } else {
+ assert(SU->isBottomReady() && "node still has unscheduled dependencies");
+ MachineBasicBlock::iterator priorII =
+ priorNonDebug(CurrentBottom, CurrentTop);
+ if (&*priorII == MI)
+ CurrentBottom = priorII;
+ else {
+ if (&*CurrentTop == MI) {
+ CurrentTop = nextIfDebug(++CurrentTop, priorII);
+ TopRPTracker.setPos(CurrentTop);
+ }
+ moveInstruction(MI, CurrentBottom);
+ CurrentBottom = MI;
+ }
+ // Update bottom scheduled pressure.
+ BotRPTracker.recede();
+ assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
+ updateScheduledPressure(BotRPTracker.getPressure().MaxSetPressure);
+
+ // Release dependent instructions for scheduling.
+ releasePredecessors(SU);
+ }
+ SU->isScheduled = true;
+ SchedImpl->schedNode(SU, IsTopNode);
+ }
+ assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+ placeDebugValues();
+}
+
+/// Reinsert any remaining debug_values, just like the PostRA scheduler.
+void VLIWMachineScheduler::placeDebugValues() {
+ // If first instruction was a DBG_VALUE then put it back.
+ if (FirstDbgValue) {
+ BB->splice(RegionBegin, BB, FirstDbgValue);
+ RegionBegin = FirstDbgValue;
+ }
+
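+  // Walk the (DBG_VALUE, previous-instruction) pairs from last to first,
+  // splicing each debug value back in immediately after the instruction it
+  // originally followed.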
+ for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+ DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
+ std::pair<MachineInstr *, MachineInstr *> P = *prior(DI);
+ MachineInstr *DbgValue = P.first;
+ MachineBasicBlock::iterator OrigPrevMI = P.second;
+ BB->splice(++OrigPrevMI, BB, DbgValue);
+ if (OrigPrevMI == llvm::prior(RegionEnd))
+ RegionEnd = DbgValue;
+ }
+ DbgValues.clear();
+ FirstDbgValue = NULL;
+}
+
+void ConvergingVLIWScheduler::initialize(VLIWMachineScheduler *dag) {
+ DAG = dag;
+ TRI = DAG->TRI;
+ Top.DAG = dag;
+ Bot.DAG = dag;
+
+ // Initialize the HazardRecognizers.
+ const TargetMachine &TM = DAG->MF.getTarget();
+ const InstrItineraryData *Itin = TM.getInstrItineraryData();
+ Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+ Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+
+ Top.ResourceModel = new VLIWResourceModel(TM);
+ Bot.ResourceModel = new VLIWResourceModel(TM);
+
+ assert((!ForceTopDown || !ForceBottomUp) &&
+ "-misched-topdown incompatible with -misched-bottomup");
+}
+
+void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) {
+ if (SU->isScheduled)
+ return;
+
+  for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
+ unsigned MinLatency = I->getMinLatency();
+#ifndef NDEBUG
+ Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency);
+#endif
+ if (SU->TopReadyCycle < PredReadyCycle + MinLatency)
+ SU->TopReadyCycle = PredReadyCycle + MinLatency;
+ }
+ Top.releaseNode(SU, SU->TopReadyCycle);
+}
+
+void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) {
+ if (SU->isScheduled)
+ return;
+
+ assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
+ unsigned MinLatency = I->getMinLatency();
+#ifndef NDEBUG
+ Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency);
+#endif
+ if (SU->BotReadyCycle < SuccReadyCycle + MinLatency)
+ SU->BotReadyCycle = SuccReadyCycle + MinLatency;
+ }
+ Bot.releaseNode(SU, SU->BotReadyCycle);
+}
+
+/// Does this SU have a hazard within the current instruction group.
+///
+/// The scheduler supports two modes of hazard recognition. The first is the
+/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
+/// supports highly complicated in-order reservation tables
+/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
+///
+/// The second is a streamlined mechanism that checks for hazards based on
+/// simple counters that the scheduler itself maintains. It explicitly checks
+/// for instruction dispatch limitations, including the number of micro-ops that
+/// can dispatch per cycle.
+///
+/// TODO: Also check whether the SU must start a new group.
+bool ConvergingVLIWScheduler::SchedBoundary::checkHazard(SUnit *SU) {
+ if (HazardRec->isEnabled())
+ return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
+
+ if (IssueCount + DAG->getNumMicroOps(SU->getInstr()) > DAG->getIssueWidth())
+ return true;
+
+ return false;
+}
+
+void ConvergingVLIWScheduler::SchedBoundary::releaseNode(SUnit *SU,
+ unsigned ReadyCycle) {
+ if (ReadyCycle < MinReadyCycle)
+ MinReadyCycle = ReadyCycle;
+
+ // Check for interlocks first. For the purpose of other heuristics, an
+ // instruction that cannot issue appears as if it's not in the ReadyQueue.
+  if (ReadyCycle > CurrCycle || checkHazard(SU))
+    Pending.push(SU);
+ else
+ Available.push(SU);
+}
+
+/// Move the boundary of scheduled code by one cycle.
+void ConvergingVLIWScheduler::SchedBoundary::bumpCycle() {
+ unsigned Width = DAG->getIssueWidth();
+ IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
+
+ assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+ unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle);
+
+ if (!HazardRec->isEnabled()) {
+ // Bypass HazardRec virtual calls.
+ CurrCycle = NextCycle;
+ } else {
+ // Bypass getHazardType calls in case of long latency.
+ for (; CurrCycle != NextCycle; ++CurrCycle) {
+ if (isTop())
+ HazardRec->AdvanceCycle();
+ else
+ HazardRec->RecedeCycle();
+ }
+ }
+ CheckPending = true;
+
+ DEBUG(dbgs() << "*** " << Available.getName() << " cycle "
+ << CurrCycle << '\n');
+}
+
+/// Move the boundary of scheduled code by one SUnit.
+void ConvergingVLIWScheduler::SchedBoundary::bumpNode(SUnit *SU) {
+ bool startNewCycle = false;
+
+ // Update the reservation table.
+ if (HazardRec->isEnabled()) {
+ if (!isTop() && SU->isCall) {
+ // Calls are scheduled with their preceding instructions. For bottom-up
+ // scheduling, clear the pipeline state before emitting.
+ HazardRec->Reset();
+ }
+ HazardRec->EmitInstruction(SU);
+ }
+
+ // Update DFA model.
+ startNewCycle = ResourceModel->reserveResources(SU);
+
+ // Check the instruction group dispatch limit.
+ // TODO: Check if this SU must end a dispatch group.
+ IssueCount += DAG->getNumMicroOps(SU->getInstr());
+ if (startNewCycle) {
+ DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
+ bumpCycle();
+ }
+ else
+ DEBUG(dbgs() << "*** IssueCount " << IssueCount
+ << " at cycle " << CurrCycle << '\n');
+}
+
+/// Release pending ready nodes into the available queue. This makes them
+/// visible to heuristics.
+void ConvergingVLIWScheduler::SchedBoundary::releasePending() {
+ // If the available queue is empty, it is safe to reset MinReadyCycle.
+ if (Available.empty())
+ MinReadyCycle = UINT_MAX;
+
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
+ SUnit *SU = *(Pending.begin()+i);
+ unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+
+ if (ReadyCycle < MinReadyCycle)
+ MinReadyCycle = ReadyCycle;
+
+ if (ReadyCycle > CurrCycle)
+ continue;
+
+ if (checkHazard(SU))
+ continue;
+
+ Available.push(SU);
+ Pending.remove(Pending.begin()+i);
+ --i; --e;
+ }
+ CheckPending = false;
+}
+
+/// Remove SU from the ready set for this boundary.
+void ConvergingVLIWScheduler::SchedBoundary::removeReady(SUnit *SU) {
+ if (Available.isInQueue(SU))
+ Available.remove(Available.find(SU));
+ else {
+ assert(Pending.isInQueue(SU) && "bad ready count");
+ Pending.remove(Pending.find(SU));
+ }
+}
+
+/// If this queue only has one ready candidate, return it. As a side effect,
+/// advance the cycle until at least one node is ready. If multiple instructions
+/// are ready, return NULL.
+SUnit *ConvergingVLIWScheduler::SchedBoundary::pickOnlyChoice() {
+ if (CheckPending)
+ releasePending();
+
+ for (unsigned i = 0; Available.empty(); ++i) {
+ assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
+ "permanent hazard"); (void)i;
+ bumpCycle();
+ releasePending();
+ }
+ if (Available.size() == 1)
+ return *Available.begin();
+ return NULL;
+}
+
+#ifndef NDEBUG
+void ConvergingVLIWScheduler::traceCandidate(const char *Label,
+ const ReadyQueue &Q,
+ SUnit *SU, PressureElement P) {
+ dbgs() << Label << " " << Q.getName() << " ";
+ if (P.isValid())
+ dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
+ << " ";
+ else
+ dbgs() << " ";
+ SU->dump(DAG);
+}
+#endif
+
+/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
+/// of SU, return it; otherwise return null.
+static SUnit *getSingleUnscheduledPred(SUnit *SU) {
+ SUnit *OnlyAvailablePred = 0;
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ SUnit &Pred = *I->getSUnit();
+ if (!Pred.isScheduled) {
+ // We found an available, but not scheduled, predecessor. If it's the
+ // only one we have found, keep track of it... otherwise give up.
+ if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
+ return 0;
+ OnlyAvailablePred = &Pred;
+ }
+ }
+ return OnlyAvailablePred;
+}
+
+/// getSingleUnscheduledSucc - If there is exactly one unscheduled successor
+/// of SU, return it; otherwise return null.
+static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
+ SUnit *OnlyAvailableSucc = 0;
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ SUnit &Succ = *I->getSUnit();
+ if (!Succ.isScheduled) {
+ // We found an available, but not scheduled, successor. If it's the
+ // only one we have found, keep track of it... otherwise give up.
+ if (OnlyAvailableSucc && OnlyAvailableSucc != &Succ)
+ return 0;
+ OnlyAvailableSucc = &Succ;
+ }
+ }
+ return OnlyAvailableSucc;
+}
+
+// Constants used to denote relative importance of
+// heuristic components for cost computation.
+static const unsigned PriorityOne = 200;
+static const unsigned PriorityTwo = 100;
+static const unsigned PriorityThree = 50;
+static const unsigned PriorityFour = 20;
+static const unsigned ScaleTwo = 10;
+static const unsigned FactorOne = 2;
+
+/// Single point to compute overall scheduling cost.
+/// TODO: More heuristics will be used soon.
+int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
+ SchedCandidate &Candidate,
+ RegPressureDelta &Delta,
+ bool verbose) {
+ // Initial trivial priority.
+ int ResCount = 1;
+
+ // Do not waste time on a node that is already scheduled.
+ if (!SU || SU->isScheduled)
+ return ResCount;
+
+ // Forced priority is high.
+ if (SU->isScheduleHigh)
+ ResCount += PriorityOne;
+
+ // Critical path first.
+ if (Q.getID() == TopQID) {
+ ResCount += (SU->getHeight() * ScaleTwo);
+
+ // If resources are available for it, multiply the
+ // chance of scheduling.
+ if (Top.ResourceModel->isResourceAvailable(SU))
+ ResCount <<= FactorOne;
+ } else {
+ ResCount += (SU->getDepth() * ScaleTwo);
+
+ // If resources are available for it, multiply the
+ // chance of scheduling.
+ if (Bot.ResourceModel->isResourceAvailable(SU))
+ ResCount <<= FactorOne;
+ }
+
+ unsigned NumNodesBlocking = 0;
+ if (Q.getID() == TopQID) {
+    // How many SUs does it block from scheduling?
+    // Look at all of the successors of this node and count those for which
+    // this node is the sole unscheduled predecessor.
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I)
+ if (getSingleUnscheduledPred(I->getSUnit()) == SU)
+ ++NumNodesBlocking;
+ } else {
+    // Count the predecessors for which this node is the sole unscheduled
+    // successor.
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I)
+ if (getSingleUnscheduledSucc(I->getSUnit()) == SU)
+ ++NumNodesBlocking;
+ }
+ ResCount += (NumNodesBlocking * ScaleTwo);
+
+ // Factor in reg pressure as a heuristic.
+ ResCount -= (Delta.Excess.UnitIncrease*PriorityThree);
+ ResCount -= (Delta.CriticalMax.UnitIncrease*PriorityThree);
+
+ DEBUG(if (verbose) dbgs() << " Total(" << ResCount << ")");
+
+ return ResCount;
+}
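+
+// Worked example with the constants above (numbers are illustrative): a
+// top-queue SU with height 3, free resources, one sole-dependent successor,
+// and no pressure increase scores ResCount = (1 + 3*ScaleTwo) << FactorOne,
+// i.e. 31 << 2 = 124, plus 1*ScaleTwo = 134. A unit increase in an excess
+// pressure set would subtract PriorityThree (50), dropping it to 84.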
+
+/// Pick the best candidate from the top queue.
+///
+/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
+/// DAG building. To adjust for the current scheduling location we need to
+/// maintain the number of vreg uses remaining to be top-scheduled.
+ConvergingVLIWScheduler::CandResult ConvergingVLIWScheduler::
+pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
+ SchedCandidate &Candidate) {
+ DEBUG(Q.dump());
+
+ // getMaxPressureDelta temporarily modifies the tracker.
+ RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+  // Candidate.SU remains NULL if no candidate beats the best existing one.
+ CandResult FoundCandidate = NoCand;
+ for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+ RegPressureDelta RPDelta;
+ TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+
+ int CurrentCost = SchedulingCost(Q, *I, Candidate, RPDelta, false);
+
+ // Initialize the candidate if needed.
+ if (!Candidate.SU) {
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = NodeOrder;
+ continue;
+ }
+
+ // Best cost.
+ if (CurrentCost > Candidate.SCost) {
+ DEBUG(traceCandidate("CCAND", Q, *I));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = BestCost;
+ continue;
+ }
+
+ // Fall through to original instruction order.
+ // Only consider node order if Candidate was chosen from this Q.
+ if (FoundCandidate == NoCand)
+ continue;
+ }
+ return FoundCandidate;
+}
+
+/// Pick the best candidate node from either the top or bottom queue.
+SUnit *ConvergingVLIWScheduler::pickNodeBidirectional(bool &IsTopNode) {
+ // Schedule as far as possible in the direction of no choice. This is most
+ // efficient, but also provides the best heuristics for CriticalPSets.
+ if (SUnit *SU = Bot.pickOnlyChoice()) {
+ IsTopNode = false;
+ return SU;
+ }
+ if (SUnit *SU = Top.pickOnlyChoice()) {
+ IsTopNode = true;
+ return SU;
+ }
+ SchedCandidate BotCand;
+ // Prefer bottom scheduling when heuristics are silent.
+ CandResult BotResult = pickNodeFromQueue(Bot.Available,
+ DAG->getBotRPTracker(), BotCand);
+ assert(BotResult != NoCand && "failed to find the first candidate");
+
+ // If either Q has a single candidate that provides the least increase in
+ // Excess pressure, we can immediately schedule from that Q.
+ //
+ // RegionCriticalPSets summarizes the pressure within the scheduled region and
+ // affects picking from either Q. If scheduling in one direction must
+ // increase pressure for one of the excess PSets, then schedule in that
+ // direction first to provide more freedom in the other direction.
+ if (BotResult == SingleExcess || BotResult == SingleCritical) {
+ IsTopNode = false;
+ return BotCand.SU;
+ }
+ // Check if the top Q has a better candidate.
+ SchedCandidate TopCand;
+ CandResult TopResult = pickNodeFromQueue(Top.Available,
+ DAG->getTopRPTracker(), TopCand);
+ assert(TopResult != NoCand && "failed to find the first candidate");
+
+ if (TopResult == SingleExcess || TopResult == SingleCritical) {
+ IsTopNode = true;
+ return TopCand.SU;
+ }
+ // If either Q has a single candidate that minimizes pressure above the
+  // original region's pressure, pick it.
+ if (BotResult == SingleMax) {
+ IsTopNode = false;
+ return BotCand.SU;
+ }
+ if (TopResult == SingleMax) {
+ IsTopNode = true;
+ return TopCand.SU;
+ }
+ if (TopCand.SCost > BotCand.SCost) {
+ IsTopNode = true;
+ return TopCand.SU;
+ }
+ // Otherwise prefer the bottom candidate in node order.
+ IsTopNode = false;
+ return BotCand.SU;
+}
+
+/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
+SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
+ if (DAG->top() == DAG->bottom()) {
+ assert(Top.Available.empty() && Top.Pending.empty() &&
+ Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+ return NULL;
+ }
+ SUnit *SU;
+ if (ForceTopDown) {
+ SU = Top.pickOnlyChoice();
+ if (!SU) {
+ SchedCandidate TopCand;
+ CandResult TopResult =
+ pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
+ assert(TopResult != NoCand && "failed to find the first candidate");
+ (void)TopResult;
+ SU = TopCand.SU;
+ }
+ IsTopNode = true;
+ } else if (ForceBottomUp) {
+ SU = Bot.pickOnlyChoice();
+ if (!SU) {
+ SchedCandidate BotCand;
+ CandResult BotResult =
+ pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
+ assert(BotResult != NoCand && "failed to find the first candidate");
+ (void)BotResult;
+ SU = BotCand.SU;
+ }
+ IsTopNode = false;
+ } else {
+    SU = pickNodeBidirectional(IsTopNode);
+ }
+ if (SU->isTopReady())
+ Top.removeReady(SU);
+ if (SU->isBottomReady())
+ Bot.removeReady(SU);
+
+ DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
+ << " Scheduling Instruction in cycle "
+ << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << '\n';
+ SU->dump(DAG));
+ return SU;
+}
+
+/// Update the scheduler's state after scheduling a node. This is the same node
+/// that was just returned by pickNode(). However, VLIWMachineScheduler needs
+/// to update its state based on the current cycle before MachineSchedStrategy
+/// does.
+void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+ if (IsTopNode) {
+ SU->TopReadyCycle = Top.CurrCycle;
+ Top.bumpNode(SU);
+ } else {
+ SU->BotReadyCycle = Bot.CurrCycle;
+ Bot.bumpNode(SU);
+ }
+}
+
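The cycle accounting in bumpNode()/bumpCycle() above reduces to two counters
once the hazard recognizer and the DFA model are stripped away. The sketch
below is ours, not part of the patch; all names are illustrative, and the real
bumpNode() lets ResourceModel->reserveResources() decide when a group ends
rather than the raw issue width.

    #include <algorithm>
    #include <cassert>
    #include <climits>

    // Illustrative stand-in for SchedBoundary's counter bookkeeping.
    struct IssueCounters {
      unsigned CurrCycle;     // current cycle at this boundary
      unsigned IssueCount;    // micro-ops issued in the current group
      unsigned MinReadyCycle; // soonest cycle with a pending instruction
      unsigned IssueWidth;    // DAG->getIssueWidth() in the patch

      explicit IssueCounters(unsigned Width)
          : CurrCycle(0), IssueCount(0), MinReadyCycle(UINT_MAX),
            IssueWidth(Width) {}

      // Mirrors bumpCycle(): carry any overflow into the next group, then
      // advance one cycle or jump ahead to the soonest ready instruction.
      void bumpCycle() {
        IssueCount = (IssueCount <= IssueWidth) ? 0 : IssueCount - IssueWidth;
        assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
        CurrCycle = std::max(CurrCycle + 1, MinReadyCycle);
      }

      // Simplified bumpNode(): here a group ends when the width is exhausted.
      void bumpNode(unsigned MicroOps) {
        IssueCount += MicroOps;
        if (IssueCount >= IssueWidth)
          bumpCycle();
      }
    };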
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
new file mode 100644
index 0000000..f3643d6
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -0,0 +1,437 @@
+//===-- HexagonMachineScheduler.h - Custom Hexagon MI scheduler. ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Custom Hexagon MI scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMACHINESCHEDULER_H
+#define HEXAGONMACHINESCHEDULER_H
+
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/PriorityQueue.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MachineSchedStrategy - Interface to a machine scheduling algorithm.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+class VLIWMachineScheduler;
+
+/// MachineSchedStrategy - Interface used by VLIWMachineScheduler to drive
+/// the selected scheduling algorithm.
+///
+/// TODO: Move this to ScheduleDAGInstrs.h
+class MachineSchedStrategy {
+public:
+ virtual ~MachineSchedStrategy() {}
+
+ /// Initialize the strategy after building the DAG for a new region.
+ virtual void initialize(VLIWMachineScheduler *DAG) = 0;
+
+ /// Pick the next node to schedule, or return NULL. Set IsTopNode to true to
+ /// schedule the node at the top of the unscheduled region. Otherwise it will
+ /// be scheduled at the bottom.
+ virtual SUnit *pickNode(bool &IsTopNode) = 0;
+
+ /// Notify MachineSchedStrategy that VLIWMachineScheduler has
+ /// scheduled a node.
+ virtual void schedNode(SUnit *SU, bool IsTopNode) = 0;
+
+ /// When all predecessor dependencies have been resolved, free this node for
+ /// top-down scheduling.
+ virtual void releaseTopNode(SUnit *SU) = 0;
+ /// When all successor dependencies have been resolved, free this node for
+ /// bottom-up scheduling.
+ virtual void releaseBottomNode(SUnit *SU) = 0;
+};
+
+//===----------------------------------------------------------------------===//
+// ConvergingVLIWScheduler - Implementation of the standard
+// MachineSchedStrategy.
+//===----------------------------------------------------------------------===//
+
+/// ReadyQueue encapsulates vector of "ready" SUnits with basic convenience
+/// methods for pushing and removing nodes. ReadyQueues are uniquely identified
+/// by an ID. SUnit::NodeQueueId is a mask of the ReadyQueues the SUnit is in.
+class ReadyQueue {
+ unsigned ID;
+ std::string Name;
+ std::vector<SUnit*> Queue;
+
+public:
+ ReadyQueue(unsigned id, const Twine &name): ID(id), Name(name.str()) {}
+
+ unsigned getID() const { return ID; }
+
+ StringRef getName() const { return Name; }
+
+  // SU is in this queue if its NodeQueueId is a superset of this ID.
+ bool isInQueue(SUnit *SU) const { return (SU->NodeQueueId & ID); }
+
+ bool empty() const { return Queue.empty(); }
+
+ unsigned size() const { return Queue.size(); }
+
+ typedef std::vector<SUnit*>::iterator iterator;
+
+ iterator begin() { return Queue.begin(); }
+
+ iterator end() { return Queue.end(); }
+
+ iterator find(SUnit *SU) {
+ return std::find(Queue.begin(), Queue.end(), SU);
+ }
+
+ void push(SUnit *SU) {
+ Queue.push_back(SU);
+ SU->NodeQueueId |= ID;
+ }
+
+ void remove(iterator I) {
+ (*I)->NodeQueueId &= ~ID;
+ *I = Queue.back();
+ Queue.pop_back();
+ }
+
+ void dump() {
+ dbgs() << Name << ": ";
+ for (unsigned i = 0, e = Queue.size(); i < e; ++i)
+ dbgs() << Queue[i]->NodeNum << " ";
+ dbgs() << "\n";
+ }
+};
+
+class VLIWResourceModel {
+ /// ResourcesModel - Represents VLIW state.
+  /// Not limited to VLIW targets per se, but assumes
+  /// that the target defines a DFA.
+ DFAPacketizer *ResourcesModel;
+
+ const InstrItineraryData *InstrItins;
+
+ /// Local packet/bundle model. Purely
+  /// internal to the MI scheduler for now.
+ std::vector<SUnit*> Packet;
+
+ /// Total packets created.
+ unsigned TotalPackets;
+
+public:
+ VLIWResourceModel(MachineSchedContext *C, const InstrItineraryData *IID) :
+ InstrItins(IID), TotalPackets(0) {
+ const TargetMachine &TM = C->MF->getTarget();
+    ResourcesModel = TM.getInstrInfo()->CreateTargetScheduleState(&TM, NULL);
+
+ // This hard requirement could be relaxed,
+ // but for now do not let it proceed.
+ assert(ResourcesModel && "Unimplemented CreateTargetScheduleState.");
+
+ Packet.resize(InstrItins->SchedModel->IssueWidth);
+ Packet.clear();
+ ResourcesModel->clearResources();
+ }
+
+ VLIWResourceModel(const TargetMachine &TM) :
+ InstrItins(TM.getInstrItineraryData()), TotalPackets(0) {
+    ResourcesModel = TM.getInstrInfo()->CreateTargetScheduleState(&TM, NULL);
+
+ // This hard requirement could be relaxed,
+ // but for now do not let it proceed.
+ assert(ResourcesModel && "Unimplemented CreateTargetScheduleState.");
+
+ Packet.resize(InstrItins->SchedModel->IssueWidth);
+ Packet.clear();
+ ResourcesModel->clearResources();
+ }
+
+ ~VLIWResourceModel() {
+ delete ResourcesModel;
+ }
+
+ void resetPacketState() {
+ Packet.clear();
+ }
+
+ void resetDFA() {
+ ResourcesModel->clearResources();
+ }
+
+ void reset() {
+ Packet.clear();
+ ResourcesModel->clearResources();
+ }
+
+ bool isResourceAvailable(SUnit *SU);
+ bool reserveResources(SUnit *SU);
+ unsigned getTotalPackets() const { return TotalPackets; }
+};
+
+class VLIWMachineScheduler : public ScheduleDAGInstrs {
+ /// AA - AliasAnalysis for making memory reference queries.
+ AliasAnalysis *AA;
+
+ RegisterClassInfo *RegClassInfo;
+ MachineSchedStrategy *SchedImpl;
+
+ MachineBasicBlock::iterator LiveRegionEnd;
+
+ /// Register pressure in this region computed by buildSchedGraph.
+ IntervalPressure RegPressure;
+ RegPressureTracker RPTracker;
+
+ /// List of pressure sets that exceed the target's pressure limit before
+ /// scheduling, listed in increasing set ID order. Each pressure set is paired
+ /// with its max pressure in the currently scheduled regions.
+ std::vector<PressureElement> RegionCriticalPSets;
+
+ /// The top of the unscheduled zone.
+ MachineBasicBlock::iterator CurrentTop;
+ IntervalPressure TopPressure;
+ RegPressureTracker TopRPTracker;
+
+ /// The bottom of the unscheduled zone.
+ MachineBasicBlock::iterator CurrentBottom;
+ IntervalPressure BotPressure;
+ RegPressureTracker BotRPTracker;
+
+#ifndef NDEBUG
+ /// The number of instructions scheduled so far. Used to cut off the
+ /// scheduler at the point determined by misched-cutoff.
+ unsigned NumInstrsScheduled;
+#endif
+
+ /// Total packets in the region.
+ unsigned TotalPackets;
+
+ const MachineLoopInfo *MLI;
+public:
+ VLIWMachineScheduler(MachineSchedContext *C, MachineSchedStrategy *S):
+ ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, /*IsPostRA=*/false, C->LIS),
+ AA(C->AA), RegClassInfo(C->RegClassInfo), SchedImpl(S),
+ RPTracker(RegPressure), CurrentTop(), TopRPTracker(TopPressure),
+ CurrentBottom(), BotRPTracker(BotPressure), MLI(C->MLI) {
+#ifndef NDEBUG
+ NumInstrsScheduled = 0;
+#endif
+ TotalPackets = 0;
+ }
+
+ virtual ~VLIWMachineScheduler() {
+ delete SchedImpl;
+ }
+
+ MachineBasicBlock::iterator top() const { return CurrentTop; }
+ MachineBasicBlock::iterator bottom() const { return CurrentBottom; }
+
+ /// Implement the ScheduleDAGInstrs interface for handling the next scheduling
+ /// region. This covers all instructions in a block, while schedule() may only
+ /// cover a subset.
+ void enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned endcount);
+
+ /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
+ /// time to do some work.
+ void schedule();
+
+ unsigned CurCycle;
+
+ /// Get current register pressure for the top scheduled instructions.
+ const IntervalPressure &getTopPressure() const { return TopPressure; }
+ const RegPressureTracker &getTopRPTracker() const { return TopRPTracker; }
+
+ /// Get current register pressure for the bottom scheduled instructions.
+ const IntervalPressure &getBotPressure() const { return BotPressure; }
+ const RegPressureTracker &getBotRPTracker() const { return BotRPTracker; }
+
+ /// Get register pressure for the entire scheduling region before scheduling.
+ const IntervalPressure &getRegPressure() const { return RegPressure; }
+
+ const std::vector<PressureElement> &getRegionCriticalPSets() const {
+ return RegionCriticalPSets;
+ }
+
+ /// getIssueWidth - Return the max instructions per scheduling group.
+ unsigned getIssueWidth() const {
+ return (InstrItins && InstrItins->SchedModel)
+ ? InstrItins->SchedModel->IssueWidth : 1;
+ }
+
+ /// getNumMicroOps - Return the number of issue slots required for this MI.
+ unsigned getNumMicroOps(MachineInstr *MI) const {
+    // TODO: Compute micro-ops from the itineraries once they are reliable,
+    // e.g.:
+    //   if (!InstrItins) return 1;
+    //   int UOps = InstrItins->getNumMicroOps(MI->getDesc().getSchedClass());
+    //   return (UOps >= 0) ? UOps : TII->getNumMicroOps(InstrItins, MI);
+    return 1;
+ }
+
+private:
+ void scheduleNodeTopDown(SUnit *SU);
+ void listScheduleTopDown();
+
+ void initRegPressure();
+ void updateScheduledPressure(std::vector<unsigned> NewMaxPressure);
+
+ void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos);
+ bool checkSchedLimit();
+
+ void releaseRoots();
+
+ void releaseSucc(SUnit *SU, SDep *SuccEdge);
+ void releaseSuccessors(SUnit *SU);
+ void releasePred(SUnit *SU, SDep *PredEdge);
+ void releasePredecessors(SUnit *SU);
+
+ void placeDebugValues();
+};
+
+/// ConvergingVLIWScheduler shrinks the unscheduled zone using heuristics
+/// to balance the schedule.
+class ConvergingVLIWScheduler : public MachineSchedStrategy {
+
+ /// Store the state used by ConvergingVLIWScheduler heuristics, required
+ /// for the lifetime of one invocation of pickNode().
+ struct SchedCandidate {
+ // The best SUnit candidate.
+ SUnit *SU;
+
+ // Register pressure values for the best candidate.
+ RegPressureDelta RPDelta;
+
+ // Best scheduling cost.
+ int SCost;
+
+ SchedCandidate(): SU(NULL), SCost(0) {}
+ };
+ /// Represent the type of SchedCandidate found within a single queue.
+ enum CandResult {
+ NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure,
+    BestCost
+  };
+
+  /// Each scheduling boundary is associated with ready queues. It tracks the
+  /// current cycle in whichever direction it has moved, and maintains the state
+ /// of "hazards" and other interlocks at the current cycle.
+ struct SchedBoundary {
+ VLIWMachineScheduler *DAG;
+
+ ReadyQueue Available;
+ ReadyQueue Pending;
+ bool CheckPending;
+
+ ScheduleHazardRecognizer *HazardRec;
+ VLIWResourceModel *ResourceModel;
+
+ unsigned CurrCycle;
+ unsigned IssueCount;
+
+ /// MinReadyCycle - Cycle of the soonest available instruction.
+ unsigned MinReadyCycle;
+
+ // Remember the greatest min operand latency.
+ unsigned MaxMinLatency;
+
+ /// Pending queues extend the ready queues with the same ID and the
+ /// PendingFlag set.
+ SchedBoundary(unsigned ID, const Twine &Name):
+ DAG(0), Available(ID, Name+".A"),
+ Pending(ID << ConvergingVLIWScheduler::LogMaxQID, Name+".P"),
+ CheckPending(false), HazardRec(0), ResourceModel(0),
+ CurrCycle(0), IssueCount(0),
+ MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
+
+ ~SchedBoundary() {
+ delete ResourceModel;
+ delete HazardRec;
+ }
+
+ bool isTop() const {
+ return Available.getID() == ConvergingVLIWScheduler::TopQID;
+ }
+
+ bool checkHazard(SUnit *SU);
+
+ void releaseNode(SUnit *SU, unsigned ReadyCycle);
+
+ void bumpCycle();
+
+ void bumpNode(SUnit *SU);
+
+ void releasePending();
+
+ void removeReady(SUnit *SU);
+
+ SUnit *pickOnlyChoice();
+ };
+
+ VLIWMachineScheduler *DAG;
+ const TargetRegisterInfo *TRI;
+
+ // State of the top and bottom scheduled instruction boundaries.
+ SchedBoundary Top;
+ SchedBoundary Bot;
+
+public:
+ /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
+ enum {
+ TopQID = 1,
+ BotQID = 2,
+ LogMaxQID = 2
+ };
+
+ ConvergingVLIWScheduler():
+ DAG(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+
+ virtual void initialize(VLIWMachineScheduler *dag);
+
+ virtual SUnit *pickNode(bool &IsTopNode);
+
+ virtual void schedNode(SUnit *SU, bool IsTopNode);
+
+ virtual void releaseTopNode(SUnit *SU);
+
+ virtual void releaseBottomNode(SUnit *SU);
+
+protected:
+  SUnit *pickNodeBidirectional(bool &IsTopNode);
+
+ int SchedulingCost(ReadyQueue &Q,
+ SUnit *SU, SchedCandidate &Candidate,
+ RegPressureDelta &Delta, bool verbose);
+
+ CandResult pickNodeFromQueue(ReadyQueue &Q,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Candidate);
+#ifndef NDEBUG
+ void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
+ PressureElement P = PressureElement());
+#endif
+};
+
+} // namespace llvm
+
+
+#endif // HEXAGONMACHINESCHEDULER_H
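
The ReadyQueue mechanics above are compact enough to exercise standalone. The
toy below is ours; the IDs mirror TopQID = 1 and a pending-queue ID of
TopQID << LogMaxQID from the header, and the asserts show one node sitting in
two queues at once via its NodeQueueId-style mask.

    #include <cassert>
    #include <vector>

    struct Node { unsigned QueueBits; Node() : QueueBits(0) {} };

    struct ToyReadyQueue {
      unsigned ID;
      std::vector<Node*> Queue;
      explicit ToyReadyQueue(unsigned id) : ID(id) {}

      void push(Node *N) {
        Queue.push_back(N);
        N->QueueBits |= ID; // record membership in the node itself
      }

      // O(1) removal: clear the bit, then overwrite the slot with the back.
      void remove(std::vector<Node*>::iterator I) {
        (*I)->QueueBits &= ~ID;
        *I = Queue.back();
        Queue.pop_back();
      }
    };

    int main() {
      ToyReadyQueue Avail(1);        // TopQID
      ToyReadyQueue Pending(1 << 2); // TopQID << LogMaxQID
      Node N;
      Avail.push(&N);
      Pending.push(&N);
      assert(N.QueueBits == 5); // bit 0 (available) plus bit 2 (pending)
      Avail.remove(Avail.Queue.begin());
      assert(N.QueueBits == 4); // only the pending bit remains
      return 0;
    }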
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index 7ece408..1e91c39 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -337,7 +337,7 @@
DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
<< "********** Function: "
- << MF.getFunction()->getName() << "\n");
+ << MF.getName() << "\n");
#if 0
// for now disable this, if we move NewValueJump before register
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 55cbc09..a295015 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -109,6 +109,7 @@
MRI = &MF.getRegInfo();
DenseMap<unsigned, unsigned> PeepholeMap;
+ DenseMap<unsigned, std::pair<unsigned, unsigned> > PeepholeDoubleRegsMap;
if (DisableHexagonPeephole) return false;
@@ -117,6 +118,7 @@
MBBb != MBBe; ++MBBb) {
MachineBasicBlock* MBB = MBBb;
PeepholeMap.clear();
+ PeepholeDoubleRegsMap.clear();
// Traverse the basic block.
for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
@@ -140,6 +142,24 @@
}
}
+ // Look for this sequence below
+ // %vregDoubleReg1 = LSRd_ri %vregDoubleReg0, 32
+ // %vregIntReg = COPY %vregDoubleReg1:subreg_loreg.
+ // and convert into
+ // %vregIntReg = COPY %vregDoubleReg0:subreg_hireg.
+ if (MI->getOpcode() == Hexagon::LSRd_ri) {
+ assert(MI->getNumOperands() == 3);
+ MachineOperand &Dst = MI->getOperand(0);
+ MachineOperand &Src1 = MI->getOperand(1);
+ MachineOperand &Src2 = MI->getOperand(2);
+ if (Src2.getImm() != 32)
+ continue;
+ unsigned DstReg = Dst.getReg();
+ unsigned SrcReg = Src1.getReg();
+ PeepholeDoubleRegsMap[DstReg] =
+ std::make_pair(*&SrcReg, 1/*Hexagon::subreg_hireg*/);
+ }
+
// Look for P=NOT(P).
if (!DisablePNotP &&
(MI->getOpcode() == Hexagon::NOT_p)) {
@@ -178,6 +198,21 @@
// Change the 1st operand.
MI->RemoveOperand(1);
MI->addOperand(MachineOperand::CreateReg(PeepholeSrc, false));
+ } else {
+ DenseMap<unsigned, std::pair<unsigned, unsigned> >::iterator DI =
+ PeepholeDoubleRegsMap.find(SrcReg);
+ if (DI != PeepholeDoubleRegsMap.end()) {
+ std::pair<unsigned,unsigned> PeepholeSrc = DI->second;
+ MI->RemoveOperand(1);
+ MI->addOperand(MachineOperand::CreateReg(PeepholeSrc.first,
+ false /*isDef*/,
+ false /*isImp*/,
+ false /*isKill*/,
+ false /*isDead*/,
+ false /*isUndef*/,
+ false /*isEarlyClobber*/,
+ PeepholeSrc.second));
+ }
}
}
}
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 2c23674..3742486 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -310,6 +310,58 @@
Moves.push_back(MachineMove(0, Dst, Src));
}
+// Get the weight in units of pressure for this register class.
+const RegClassWeight &
+HexagonRegisterInfo::getRegClassWeight(const TargetRegisterClass *RC) const {
+ // Each TargetRegisterClass has a per register weight, and weight
+ // limit which must be less than the limits of its pressure sets.
+ static const RegClassWeight RCWeightTable[] = {
+ {1, 32}, // IntRegs
+ {1, 8}, // CRRegs
+ {1, 4}, // PredRegs
+ {2, 16}, // DoubleRegs
+ {0, 0} };
+ return RCWeightTable[RC->getID()];
+}
+
+/// Get the number of dimensions of register pressure.
+unsigned HexagonRegisterInfo::getNumRegPressureSets() const {
+ return 4;
+}
+
+/// Get the name of this register unit pressure set.
+const char *HexagonRegisterInfo::getRegPressureSetName(unsigned Idx) const {
+ static const char *const RegPressureSetName[] = {
+ "IntRegsRegSet",
+ "CRRegsRegSet",
+ "PredRegsRegSet",
+ "DoubleRegsRegSet"
+ };
+ assert((Idx < 4) && "Index out of bounds");
+ return RegPressureSetName[Idx];
+}
+
+/// Get the register unit pressure limit for this dimension.
+/// This limit must be adjusted dynamically for reserved registers.
+unsigned HexagonRegisterInfo::getRegPressureSetLimit(unsigned Idx) const {
+ static const int RegPressureLimit [] = { 16, 4, 2, 8 };
+ assert((Idx < 4) && "Index out of bounds");
+ return RegPressureLimit[Idx];
+}
+
+const int*
+HexagonRegisterInfo::getRegClassPressureSets(const TargetRegisterClass *RC)
+ const {
+ static const int RCSetsTable[] = {
+ 0, -1, // IntRegs
+ 1, -1, // CRRegs
+ 2, -1, // PredRegs
+ 0, -1, // DoubleRegs
+ -1 };
+ static const unsigned RCSetStartTable[] = { 0, 2, 4, 6, 0 };
+ unsigned SetListStart = RCSetStartTable[RC->getID()];
+ return &RCSetsTable[SetListStart];
+}
unsigned HexagonRegisterInfo::getEHExceptionRegister() const {
llvm_unreachable("What is the exception register");
}
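
The lists returned by getRegClassPressureSets() above are -1-terminated slices
of RCSetsTable, so a consumer walks to the sentinel. A sketch of that walk
(the helper name is ours): assuming register class IDs follow the table order,
DoubleRegs starts at index 6 and therefore counts only against set 0, the
IntRegsRegSet.

    // Walk a -1-terminated pressure-set list as returned by
    // getRegClassPressureSets(); Visit receives each pressure-set index.
    static void forEachPressureSet(const int *PSets, void (*Visit)(unsigned)) {
      for (; *PSets != -1; ++PSets)
        Visit(static_cast<unsigned>(*PSets));
    }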
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 85355ae..8820d13 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -87,6 +87,11 @@
// Exception handling queries.
unsigned getEHExceptionRegister() const;
unsigned getEHHandlerRegister() const;
+ const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const;
+ unsigned getNumRegPressureSets() const;
+ const char *getRegPressureSetName(unsigned Idx) const;
+ unsigned getRegPressureSetLimit(unsigned Idx) const;
+ const int* getRegClassPressureSets(const TargetRegisterClass *RC) const;
};
} // end namespace llvm
diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td
index d1076b8..b5ff69a7 100644
--- a/lib/Target/Hexagon/HexagonSchedule.td
+++ b/lib/Target/Hexagon/HexagonSchedule.td
@@ -47,6 +47,7 @@
// Max issue per cycle == bundle width.
let IssueWidth = 4;
let Itineraries = HexagonItineraries;
+ let LoadLatency = 1;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV4.td
index 9b41126..5668ae81 100644
--- a/lib/Target/Hexagon/HexagonScheduleV4.td
+++ b/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -58,6 +58,7 @@
// Max issue per cycle == bundle width.
let IssueWidth = 4;
let Itineraries = HexagonItinerariesV4;
+ let LoadLatency = 1;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index a7b291f..5688e9c 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -14,6 +14,7 @@
#include "HexagonTargetMachine.h"
#include "Hexagon.h"
#include "HexagonISelLowering.h"
+#include "HexagonMachineScheduler.h"
#include "llvm/Module.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/PassManager.h"
@@ -29,6 +30,11 @@
"disable-hexagon-hwloops", cl::Hidden,
cl::desc("Disable Hardware Loops for Hexagon target"));
+static cl::
+opt<bool> DisableHexagonMISched("disable-hexagon-misched",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable Hexagon MI Scheduling"));
+
/// HexagonTargetMachineModule - Note that this is used on hosts that
/// cannot link in a library unless there are references into the
/// library. In particular, it seems that it is not possible to get
@@ -42,6 +48,13 @@
RegisterTargetMachine<HexagonTargetMachine> X(TheHexagonTarget);
}
+static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) {
+ return new VLIWMachineScheduler(C, new ConvergingVLIWScheduler());
+}
+
+static MachineSchedRegistry
+SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
+ createVLIWMachineSched);
/// HexagonTargetMachine ctor - Create an ILP32 architecture model.
///
@@ -83,7 +96,13 @@
class HexagonPassConfig : public TargetPassConfig {
public:
HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {
+ // Enable MI scheduler.
+ if (!DisableHexagonMISched) {
+ enablePass(&MachineSchedulerID);
+ MachineSchedRegistry::setDefault(createVLIWMachineSched);
+ }
+ }
HexagonTargetMachine &getHexagonTargetMachine() const {
return getTM<HexagonTargetMachine>();
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index a03ed03..3d5f685 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -3474,8 +3474,8 @@
// 1. Two loads unless they are volatile.
// 2. Two stores in V4 unless they are volatile.
else if ((DepType == SDep::Order) &&
- !I->hasVolatileMemoryRef() &&
- !J->hasVolatileMemoryRef()) {
+ !I->hasOrderedMemoryRef() &&
+ !J->hasOrderedMemoryRef()) {
if (QRI->Subtarget.hasV4TOps() &&
// hexagonv4 allows dual store.
MCIDI.mayStore() && MCIDJ.mayStore()) {
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index d6e6c36..86f75d1 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -24,7 +24,7 @@
HasLEB128 = true;
PrivateGlobalPrefix = ".L";
- LCOMMDirectiveType = LCOMM::ByteAlignment;
+ LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
InlineAsmStart = "# InlineAsm Start";
InlineAsmEnd = "# InlineAsm End";
ZeroDirective = "\t.space\t";
diff --git a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
index 38fb0e8..4059403 100644
--- a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
+++ b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
@@ -56,6 +56,12 @@
/// }
+ unsigned getMCInstOperandNum(unsigned Kind, MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ unsigned OperandNum, unsigned &NumMCOperands) {
+ return getMCInstOperandNumImpl(Kind, Inst, Operands, OperandNum,
+ NumMCOperands);
+ }
public:
MBlazeAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
@@ -317,10 +323,10 @@
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out) {
MCInst Inst;
- SMLoc ErrorLoc;
+ unsigned Kind;
unsigned ErrorInfo;
- switch (MatchInstructionImpl(Operands, Inst, ErrorInfo)) {
+ switch (MatchInstructionImpl(Operands, Kind, Inst, ErrorInfo)) {
default: break;
case Match_Success:
Out.EmitInstruction(Inst);
@@ -329,10 +335,8 @@
return Error(IDLoc, "instruction use requires an option to be enabled");
case Match_MnemonicFail:
return Error(IDLoc, "unrecognized instruction mnemonic");
- case Match_ConversionFail:
- return Error(IDLoc, "unable to convert operands to instruction");
- case Match_InvalidOperand:
- ErrorLoc = IDLoc;
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
if (ErrorInfo != ~0U) {
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction");
@@ -343,6 +347,7 @@
return Error(ErrorLoc, "invalid operand for instruction");
}
+ }
llvm_unreachable("Implement any new match types added!");
}
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
index 46f5207..daa76e8 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
@@ -140,7 +140,7 @@
unsigned oi = i == 2 ? 1 : 2;
- DEBUG(dbgs() << "\nFunction : " << MF.getFunction()->getName() << "\n";
+ DEBUG(dbgs() << "\nFunction : " << MF.getName() << "\n";
dbgs() << "<--------->\n" << MI);
int FrameIndex = MI.getOperand(i).getIndex();
diff --git a/lib/Target/Mips/Android.mk b/lib/Target/Mips/Android.mk
index 11e8997..af24a5a 100644
--- a/lib/Target/Mips/Android.mk
+++ b/lib/Target/Mips/Android.mk
@@ -11,10 +11,14 @@
MipsGenSubtargetInfo.inc
mips_codegen_SRC_FILES := \
+ Mips16FrameLowering.cpp \
+ Mips16InstrInfo.cpp \
+ Mips16RegisterInfo.cpp \
MipsAnalyzeImmediate.cpp \
MipsAsmPrinter.cpp \
MipsCodeEmitter.cpp \
MipsDelaySlotFiller.cpp \
+ MipsDirectObjLower.cpp \
MipsELFWriterInfo.cpp \
MipsFrameLowering.cpp \
MipsInstrInfo.cpp \
@@ -25,16 +29,13 @@
MipsMachineFunction.cpp \
MipsMCInstLower.cpp \
MipsRegisterInfo.cpp \
- MipsSubtarget.cpp \
MipsSEFrameLowering.cpp \
MipsSEInstrInfo.cpp \
MipsSERegisterInfo.cpp \
- MipsTargetMachine.cpp \
- MipsTargetObjectFile.cpp \
MipsSelectionDAGInfo.cpp \
- Mips16FrameLowering.cpp \
- Mips16InstrInfo.cpp \
- Mips16RegisterInfo.cpp
+ MipsSubtarget.cpp \
+ MipsTargetMachine.cpp \
+ MipsTargetObjectFile.cpp
# For the host
# =====================================================
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 43bd345..8418b75 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -8,20 +8,36 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
-#include "llvm/MC/MCTargetAsmParser.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCTargetAsmParser.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
namespace {
+
class MipsAsmParser : public MCTargetAsmParser {
+ enum FpFormatTy {
+ FP_FORMAT_NONE = -1,
+ FP_FORMAT_S,
+ FP_FORMAT_D,
+ FP_FORMAT_L,
+ FP_FORMAT_W
+ } FpFormat;
+
+ MCSubtargetInfo &STI;
+ MCAsmParser &Parser;
+
#define GET_ASSEMBLER_HEADER
#include "MipsGenAsmMatcher.inc"
@@ -34,14 +50,67 @@
bool ParseInstruction(StringRef Name, SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ bool parseMathOperation(StringRef Name, SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
bool ParseDirective(AsmToken DirectiveID);
- OperandMatchResultTy parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*>&);
+ MipsAsmParser::OperandMatchResultTy
+ parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*>&);
+
+ unsigned
+ getMCInstOperandNum(unsigned Kind, MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ unsigned OperandNum, unsigned &NumMCOperands);
+
+ bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &,
+ StringRef Mnemonic);
+
+ int tryParseRegister(StringRef Mnemonic);
+
+ bool tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ StringRef Mnemonic);
+
+ bool parseMemOffset(const MCExpr *&Res);
+ bool parseRelocOperand(const MCExpr *&Res);
+ MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol);
+
+ bool isMips64() const {
+ return (STI.getFeatureBits() & Mips::FeatureMips64) != 0;
+ }
+
+ bool isFP64() const {
+ return (STI.getFeatureBits() & Mips::FeatureFP64Bit) != 0;
+ }
+
+ int matchRegisterName(StringRef Symbol);
+
+ int matchRegisterByNumber(unsigned RegNum, StringRef Mnemonic);
+
+ void setFpFormat(FpFormatTy Format) {
+ FpFormat = Format;
+ }
+
+ void setDefaultFpFormat();
+
+ void setFpFormat(StringRef Format);
+
+ FpFormatTy getFpFormat() {return FpFormat;}
+
+ bool requestsDoubleOperand(StringRef Mnemonic);
+
+ unsigned getReg(int RC,int RegNo);
+
public:
MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
- : MCTargetAsmParser() {
+ : MCTargetAsmParser(), STI(sti), Parser(parser) {
+ // Initialize the set of available features.
+ setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
}
+ MCAsmParser &getParser() const { return Parser; }
+ MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
};
}
@@ -50,6 +119,7 @@
/// MipsOperand - Instances of this class represent a parsed Mips machine
/// instruction.
class MipsOperand : public MCParsedAsmOperand {
+
enum KindTy {
k_CondCode,
k_CoprocNum,
@@ -61,18 +131,58 @@
} Kind;
MipsOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+ union {
+ struct {
+ const char *Data;
+ unsigned Length;
+ } Tok;
+
+ struct {
+ unsigned RegNum;
+ } Reg;
+
+ struct {
+ const MCExpr *Val;
+ } Imm;
+
+ struct {
+ unsigned Base;
+ const MCExpr *Off;
+ } Mem;
+ };
+
+ SMLoc StartLoc, EndLoc;
+
public:
void addRegOperands(MCInst &Inst, unsigned N) const {
- llvm_unreachable("unimplemented!");
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getReg()));
}
+
void addExpr(MCInst &Inst, const MCExpr *Expr) const{
- llvm_unreachable("unimplemented!");
+ // Add as immediate when possible. Null MCExpr = 0.
+ if (Expr == 0)
+ Inst.addOperand(MCOperand::CreateImm(0));
+ else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::CreateExpr(Expr));
}
+
void addImmOperands(MCInst &Inst, unsigned N) const {
- llvm_unreachable("unimplemented!");
+ assert(N == 1 && "Invalid number of operands!");
+ const MCExpr *Expr = getImm();
+ addExpr(Inst,Expr);
}
+
void addMemOperands(MCInst &Inst, unsigned N) const {
- llvm_unreachable("unimplemented!");
+ assert(N == 2 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::CreateReg(getMemBase()));
+
+ const MCExpr *Expr = getMemOff();
+ addExpr(Inst,Expr);
}
bool isReg() const { return Kind == k_Register; }
@@ -82,46 +192,751 @@
StringRef getToken() const {
assert(Kind == k_Token && "Invalid access!");
- return "";
+ return StringRef(Tok.Data, Tok.Length);
}
unsigned getReg() const {
assert((Kind == k_Register) && "Invalid access!");
- return 0;
+ return Reg.RegNum;
}
+ const MCExpr *getImm() const {
+ assert((Kind == k_Immediate) && "Invalid access!");
+ return Imm.Val;
+ }
+
+ unsigned getMemBase() const {
+ assert((Kind == k_Memory) && "Invalid access!");
+ return Mem.Base;
+ }
+
+ const MCExpr *getMemOff() const {
+ assert((Kind == k_Memory) && "Invalid access!");
+ return Mem.Off;
+ }
+
+ static MipsOperand *CreateToken(StringRef Str, SMLoc S) {
+ MipsOperand *Op = new MipsOperand(k_Token);
+ Op->Tok.Data = Str.data();
+ Op->Tok.Length = Str.size();
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static MipsOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) {
+ MipsOperand *Op = new MipsOperand(k_Register);
+ Op->Reg.RegNum = RegNum;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static MipsOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+ MipsOperand *Op = new MipsOperand(k_Immediate);
+ Op->Imm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static MipsOperand *CreateMem(unsigned Base, const MCExpr *Off,
+ SMLoc S, SMLoc E) {
+ MipsOperand *Op = new MipsOperand(k_Memory);
+ Op->Mem.Base = Base;
+ Op->Mem.Off = Off;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const { return StartLoc; }
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const { return EndLoc; }
+
virtual void print(raw_ostream &OS) const {
llvm_unreachable("unimplemented!");
}
};
}
+unsigned MipsAsmParser::
+getMCInstOperandNum(unsigned Kind, MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ unsigned OperandNum, unsigned &NumMCOperands) {
+  assert(0 && "getMCInstOperandNum() not supported by the Mips target.");
+  // The Mips backend doesn't currently include the matcher implementation, so
+  // getMCInstOperandNumImpl() is undefined. This is a temporary workaround.
+ NumMCOperands = 0;
+ return 0;
+}
+
bool MipsAsmParser::
MatchAndEmitInstruction(SMLoc IDLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out) {
+ MCInst Inst;
+ unsigned ErrorInfo;
+ unsigned Kind;
+ unsigned MatchResult = MatchInstructionImpl(Operands, Kind, Inst, ErrorInfo);
+
+ switch (MatchResult) {
+ default: break;
+ case Match_Success: {
+ Inst.setLoc(IDLoc);
+ Out.EmitInstruction(Inst);
+ return false;
+ }
+ case Match_MissingFeature:
+ Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+ return true;
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0U) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
+ ErrorLoc = ((MipsOperand*)Operands[ErrorInfo])->getStartLoc();
+ if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+ }
+
+ return Error(ErrorLoc, "invalid operand for instruction");
+ }
+ case Match_MnemonicFail:
+ return Error(IDLoc, "invalid instruction");
+ }
return true;
}
+int MipsAsmParser::matchRegisterName(StringRef Name) {
+
+ int CC = StringSwitch<unsigned>(Name)
+ .Case("zero", Mips::ZERO)
+ .Case("a0", Mips::A0)
+ .Case("a1", Mips::A1)
+ .Case("a2", Mips::A2)
+ .Case("a3", Mips::A3)
+ .Case("v0", Mips::V0)
+ .Case("v1", Mips::V1)
+ .Case("s0", Mips::S0)
+ .Case("s1", Mips::S1)
+ .Case("s2", Mips::S2)
+ .Case("s3", Mips::S3)
+ .Case("s4", Mips::S4)
+ .Case("s5", Mips::S5)
+ .Case("s6", Mips::S6)
+ .Case("s7", Mips::S7)
+ .Case("k0", Mips::K0)
+ .Case("k1", Mips::K1)
+ .Case("sp", Mips::SP)
+ .Case("fp", Mips::FP)
+ .Case("gp", Mips::GP)
+ .Case("ra", Mips::RA)
+ .Case("t0", Mips::T0)
+ .Case("t1", Mips::T1)
+ .Case("t2", Mips::T2)
+ .Case("t3", Mips::T3)
+ .Case("t4", Mips::T4)
+ .Case("t5", Mips::T5)
+ .Case("t6", Mips::T6)
+ .Case("t7", Mips::T7)
+ .Case("t8", Mips::T8)
+ .Case("t9", Mips::T9)
+ .Case("at", Mips::AT)
+ .Case("fcc0", Mips::FCC0)
+ .Default(-1);
+
+ if (CC != -1) {
+    // The 64-bit register enums directly follow the 32-bit definitions.
+ if (isMips64())
+ CC++;
+ return CC;
+ }
+
+ if (Name[0] == 'f') {
+ StringRef NumString = Name.substr(1);
+ unsigned IntVal;
+    if (NumString.getAsInteger(10, IntVal))
+      return -1; // not an integer
+ if (IntVal > 31)
+ return -1;
+
+ FpFormatTy Format = getFpFormat();
+
+ if (Format == FP_FORMAT_S || Format == FP_FORMAT_W)
+ return getReg(Mips::FGR32RegClassID, IntVal);
+ if (Format == FP_FORMAT_D) {
+      if (isFP64()) {
+        return getReg(Mips::FGR64RegClassID, IntVal);
+      }
+      // Only even-numbered registers are available as 64-bit register pairs.
+      if ((IntVal > 31) || (IntVal % 2 != 0))
+        return -1;
+      return getReg(Mips::AFGR64RegClassID, IntVal / 2);
+ }
+ }
+
+ return -1;
+}
+void MipsAsmParser::setDefaultFpFormat() {
+
+ if (isMips64() || isFP64())
+ FpFormat = FP_FORMAT_D;
+ else
+ FpFormat = FP_FORMAT_S;
+}
+
+bool MipsAsmParser::requestsDoubleOperand(StringRef Mnemonic) {
+
+ bool IsDouble = StringSwitch<bool>(Mnemonic.lower())
+ .Case("ldxc1", true)
+ .Case("ldc1", true)
+ .Case("sdxc1", true)
+ .Case("sdc1", true)
+ .Default(false);
+
+ return IsDouble;
+}
+void MipsAsmParser::setFpFormat(StringRef Format) {
+
+ FpFormat = StringSwitch<FpFormatTy>(Format.lower())
+ .Case(".s", FP_FORMAT_S)
+ .Case(".d", FP_FORMAT_D)
+ .Case(".l", FP_FORMAT_L)
+ .Case(".w", FP_FORMAT_W)
+ .Default(FP_FORMAT_NONE);
+}
+
+unsigned MipsAsmParser::getReg(int RC, int RegNo) {
+ return *(getContext().getRegisterInfo().getRegClass(RC).begin() + RegNo);
+}
+
+int MipsAsmParser::matchRegisterByNumber(unsigned RegNum, StringRef Mnemonic) {
+
+  if (Mnemonic.lower() == "rdhwr") {
+    // At the moment only hwreg29 is supported.
+ if (RegNum != 29)
+ return -1;
+ return Mips::HWR29;
+ }
+
+ if (RegNum > 31)
+ return -1;
+
+ return getReg(Mips::CPURegsRegClassID,RegNum);
+}
+
+int MipsAsmParser::tryParseRegister(StringRef Mnemonic) {
+ const AsmToken &Tok = Parser.getTok();
+ int RegNum = -1;
+
+ if (Tok.is(AsmToken::Identifier)) {
+ std::string lowerCase = Tok.getString().lower();
+ RegNum = matchRegisterName(lowerCase);
+ } else if (Tok.is(AsmToken::Integer))
+    RegNum = matchRegisterByNumber(static_cast<unsigned>(Tok.getIntVal()),
+                                   Mnemonic.lower());
+  else
+    return RegNum; // error
+  // 64-bit div operations require Mips::ZERO instead of Mips::ZERO_64.
+ if (isMips64() && RegNum == Mips::ZERO_64) {
+ if (Mnemonic.find("ddiv") != StringRef::npos)
+ RegNum = Mips::ZERO;
+ }
+ return RegNum;
+}
+
bool MipsAsmParser::
-ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+ tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ StringRef Mnemonic){
+
+ SMLoc S = Parser.getTok().getLoc();
+ int RegNo = -1;
+
+  // FIXME: we should make a more generic method for CCR.
+  if ((Mnemonic == "cfc1" || Mnemonic == "ctc1")
+      && Operands.size() == 2 && Parser.getTok().is(AsmToken::Integer)) {
+    RegNo = Parser.getTok().getIntVal(); // Get the integer value.
+    // At the moment only fcc0 is supported.
+    if (RegNo == 0)
+      RegNo = Mips::FCC0;
+ } else
+ RegNo = tryParseRegister(Mnemonic);
+ if (RegNo == -1)
+ return true;
+
+ Operands.push_back(MipsOperand::CreateReg(RegNo, S,
+ Parser.getTok().getLoc()));
+ Parser.Lex(); // Eat register token.
+ return false;
+}
+
+bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands,
+ StringRef Mnemonic) {
+  // Check if the current operand has a custom associated parser; if so, try to
+  // custom parse the operand, or fall back to the general approach.
+ OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+ if (ResTy == MatchOperand_Success)
+ return false;
+ // If there wasn't a custom match, try the generic matcher below. Otherwise,
+ // there was a match, but an error occurred, in which case, just return that
+ // the operand parsing failed.
+ if (ResTy == MatchOperand_ParseFail)
+ return true;
+
+ switch (getLexer().getKind()) {
+ default:
+ Error(Parser.getTok().getLoc(), "unexpected token in operand");
+ return true;
+ case AsmToken::Dollar: {
+    // Parse the register.
+    SMLoc S = Parser.getTok().getLoc();
+    Parser.Lex(); // Eat dollar token.
+    // Parse the register operand.
+    if (!tryParseRegisterOperand(Operands, Mnemonic)) {
+      if (getLexer().is(AsmToken::LParen)) {
+        // Check if it is an indexed addressing operand.
+        Operands.push_back(MipsOperand::CreateToken("(", S));
+        Parser.Lex(); // Eat the parenthesis.
+        if (getLexer().isNot(AsmToken::Dollar))
+          return true;
+
+        Parser.Lex(); // Eat the dollar.
+        if (tryParseRegisterOperand(Operands, Mnemonic))
+          return true;
+
+        if (!getLexer().is(AsmToken::RParen))
+          return true;
+
+        S = Parser.getTok().getLoc();
+        Operands.push_back(MipsOperand::CreateToken(")", S));
+        Parser.Lex();
+      }
+      return false;
+    }
+    // Maybe it is a symbol reference.
+ StringRef Identifier;
+ if (Parser.ParseIdentifier(Identifier))
+ return true;
+
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+ MCSymbol *Sym = getContext().GetOrCreateSymbol("$" + Identifier);
+
+ // Otherwise create a symbol ref.
+ const MCExpr *Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
+ getContext());
+
+ Operands.push_back(MipsOperand::CreateImm(Res, S, E));
+ return false;
+ }
+ case AsmToken::Identifier:
+ case AsmToken::LParen:
+ case AsmToken::Minus:
+ case AsmToken::Plus:
+ case AsmToken::Integer:
+ case AsmToken::String: {
+    // Quoted label names.
+ const MCExpr *IdVal;
+ SMLoc S = Parser.getTok().getLoc();
+ if (getParser().ParseExpression(IdVal))
+ return true;
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
+ return false;
+ }
+ case AsmToken::Percent: {
+    // It is a symbol reference or a constant expression.
+    const MCExpr *IdVal;
+    SMLoc S = Parser.getTok().getLoc(); // Start location of the operand.
+ if (parseRelocOperand(IdVal))
+ return true;
+
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+ Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
+ return false;
+  } // case AsmToken::Percent
+  } // switch (getLexer().getKind())
return true;
}
+bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res) {
+
+  Parser.Lex(); // Eat the % token.
+  const AsmToken &Tok = Parser.getTok(); // Get the next token, the operation.
+  if (Tok.isNot(AsmToken::Identifier))
+    return true;
+
+  std::string Str = Tok.getIdentifier().str();
+
+  Parser.Lex(); // Eat the identifier.
+  // Now make an expression from the rest of the operand.
+ const MCExpr *IdVal;
+ SMLoc EndLoc;
+
+ if (getLexer().getKind() == AsmToken::LParen) {
+ while (1) {
+      Parser.Lex(); // Eat the '(' token.
+      if (getLexer().getKind() == AsmToken::Percent) {
+        Parser.Lex(); // Eat the % token.
+        const AsmToken &nextTok = Parser.getTok();
+        if (nextTok.isNot(AsmToken::Identifier))
+          return true;
+        Str += "(%";
+        Str += nextTok.getIdentifier();
+        Parser.Lex(); // Eat the identifier.
+        if (getLexer().getKind() != AsmToken::LParen)
+          return true;
+      } else
+        break;
+    }
+    if (getParser().ParseParenExpression(IdVal, EndLoc))
+      return true;
+
+    while (getLexer().getKind() == AsmToken::RParen)
+      Parser.Lex(); // Eat the ')' token.
+
+  } else
+    return true; // Parenthesis must follow the reloc operand.
+
+  // Check the type of the expression.
+  if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(IdVal)) {
+    // It is a constant; evaluate the lo or hi value.
+ int Val = MCE->getValue();
+ if (Str == "lo") {
+ Val = Val & 0xffff;
+ } else if (Str == "hi") {
+ Val = (Val & 0xffff0000) >> 16;
+ }
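+    // For example, %lo(0x12345678) folds to 0x5678 and %hi(0x12345678) to
+    // 0x1234; note that no carry from the low half is applied here.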
+ Res = MCConstantExpr::Create(Val, getContext());
+ return false;
+ }
+
+ if (const MCSymbolRefExpr *MSRE = dyn_cast<MCSymbolRefExpr>(IdVal)) {
+    // It is a symbol; create a symbolic expression from the symbol.
+    StringRef Symbol = MSRE->getSymbol().getName();
+    MCSymbolRefExpr::VariantKind VK = getVariantKind(Str);
+    Res = MCSymbolRefExpr::Create(Symbol, VK, getContext());
+ return false;
+ }
+ return true;
+}
+
+bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+
+ StartLoc = Parser.getTok().getLoc();
+ RegNo = tryParseRegister("");
+ EndLoc = Parser.getTok().getLoc();
+ return (RegNo == (unsigned)-1);
+}
+
+bool MipsAsmParser::parseMemOffset(const MCExpr *&Res) {
+
+ SMLoc S;
+
+  switch (getLexer().getKind()) {
+ default:
+ return true;
+ case AsmToken::Integer:
+ case AsmToken::Minus:
+ case AsmToken::Plus:
+ return (getParser().ParseExpression(Res));
+ case AsmToken::Percent:
+ return parseRelocOperand(Res);
+ case AsmToken::LParen:
+    return false; // The offset defaults to 0; '(' starts the register part.
+ }
+ return true;
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
+ SmallVectorImpl<MCParsedAsmOperand*>&Operands) {
+
+ const MCExpr *IdVal = 0;
+ SMLoc S;
+  // The first operand is the offset.
+ S = Parser.getTok().getLoc();
+
+ if (parseMemOffset(IdVal))
+ return MatchOperand_ParseFail;
+
+  const AsmToken &Tok = Parser.getTok(); // Get the next token.
+ if (Tok.isNot(AsmToken::LParen)) {
+ Error(Parser.getTok().getLoc(), "'(' expected");
+ return MatchOperand_ParseFail;
+ }
+
+ Parser.Lex(); // Eat '(' token.
+
+  const AsmToken &Tok1 = Parser.getTok(); // Get the next token.
+ if (Tok1.is(AsmToken::Dollar)) {
+ Parser.Lex(); // Eat '$' token.
+ if (tryParseRegisterOperand(Operands,"")) {
+ Error(Parser.getTok().getLoc(), "unexpected token in operand");
+ return MatchOperand_ParseFail;
+ }
+
+ } else {
+    Error(Parser.getTok().getLoc(), "unexpected token in operand");
+ return MatchOperand_ParseFail;
+ }
+
+  const AsmToken &Tok2 = Parser.getTok(); // Get the next token.
+ if (Tok2.isNot(AsmToken::RParen)) {
+ Error(Parser.getTok().getLoc(), "')' expected");
+ return MatchOperand_ParseFail;
+ }
+
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+ Parser.Lex(); // Eat ')' token.
+
+ if (IdVal == 0)
+ IdVal = MCConstantExpr::Create(0, getContext());
+
+  // Now replace the register operand with the memory operand.
+  MipsOperand* op = static_cast<MipsOperand*>(Operands.back());
+  int RegNo = op->getReg();
+  // Remove the register from the operands...
+  Operands.pop_back();
+  // ...and add the memory operand.
+ Operands.push_back(MipsOperand::CreateMem(RegNo, IdVal, S, E));
+ delete op;
+ return MatchOperand_Success;
+}
+
+MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
+
+ MCSymbolRefExpr::VariantKind VK
+ = StringSwitch<MCSymbolRefExpr::VariantKind>(Symbol)
+ .Case("hi", MCSymbolRefExpr::VK_Mips_ABS_HI)
+ .Case("lo", MCSymbolRefExpr::VK_Mips_ABS_LO)
+ .Case("gp_rel", MCSymbolRefExpr::VK_Mips_GPREL)
+ .Case("call16", MCSymbolRefExpr::VK_Mips_GOT_CALL)
+ .Case("got", MCSymbolRefExpr::VK_Mips_GOT)
+ .Case("tlsgd", MCSymbolRefExpr::VK_Mips_TLSGD)
+ .Case("tlsldm", MCSymbolRefExpr::VK_Mips_TLSLDM)
+ .Case("dtprel_hi", MCSymbolRefExpr::VK_Mips_DTPREL_HI)
+ .Case("dtprel_lo", MCSymbolRefExpr::VK_Mips_DTPREL_LO)
+ .Case("gottprel", MCSymbolRefExpr::VK_Mips_GOTTPREL)
+ .Case("tprel_hi", MCSymbolRefExpr::VK_Mips_TPREL_HI)
+ .Case("tprel_lo", MCSymbolRefExpr::VK_Mips_TPREL_LO)
+ .Case("got_disp", MCSymbolRefExpr::VK_Mips_GOT_DISP)
+ .Case("got_page", MCSymbolRefExpr::VK_Mips_GOT_PAGE)
+ .Case("got_ofst", MCSymbolRefExpr::VK_Mips_GOT_OFST)
+ .Case("hi(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_HI)
+ .Case("lo(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_LO)
+ .Default(MCSymbolRefExpr::VK_None);
+
+ return VK;
+}
+
+static int ConvertCcString(StringRef CondString) {
+  int CC = StringSwitch<int>(CondString)
+ .Case(".f", 0)
+ .Case(".un", 1)
+ .Case(".eq", 2)
+ .Case(".ueq", 3)
+ .Case(".olt", 4)
+ .Case(".ult", 5)
+ .Case(".ole", 6)
+ .Case(".ule", 7)
+ .Case(".sf", 8)
+ .Case(".ngle", 9)
+ .Case(".seq", 10)
+ .Case(".ngl", 11)
+ .Case(".lt", 12)
+ .Case(".nge", 13)
+ .Case(".le", 14)
+ .Case(".ngt", 15)
+ .Default(-1);
+
+ return CC;
+}
+
+bool MipsAsmParser::
+parseMathOperation(StringRef Name, SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  // Split off the format specifiers.
+  size_t Start = Name.find('.'), Next = Name.rfind('.');
+  StringRef Format1 = Name.slice(Start, Next);
+  // Add the first format to the operands,
+  Operands.push_back(MipsOperand::CreateToken(Format1, NameLoc));
+  // then the second format.
+  StringRef Format2 = Name.slice(Next, StringRef::npos);
+  Operands.push_back(MipsOperand::CreateToken(Format2, NameLoc));
+
+  // Set the format for the first register.
+ setFpFormat(Format1);
+
+ // Read the remaining operands.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ // Read the first operand.
+ if (ParseOperand(Operands, Name)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+
+ if (getLexer().isNot(AsmToken::Comma)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+    }
+ Parser.Lex(); // Eat the comma.
+
+    // Set the format for the second register.
+ setFpFormat(Format2);
+
+ // Parse and remember the operand.
+ if (ParseOperand(Operands, Name)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+
+ Parser.Lex(); // Consume the EndOfStatement
+ return false;
+}
+
bool MipsAsmParser::
ParseInstruction(StringRef Name, SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- return true;
+  // Floating-point instructions: should the registers be treated as doubles?
+ if (requestsDoubleOperand(Name)) {
+ setFpFormat(FP_FORMAT_D);
+ Operands.push_back(MipsOperand::CreateToken(Name, NameLoc));
+  } else {
+ setDefaultFpFormat();
+ // Create the leading tokens for the mnemonic, split by '.' characters.
+ size_t Start = 0, Next = Name.find('.');
+ StringRef Mnemonic = Name.slice(Start, Next);
+
+ Operands.push_back(MipsOperand::CreateToken(Mnemonic, NameLoc));
+
+ if (Next != StringRef::npos) {
+      // There is a format token in the mnemonic.
+ size_t Dot = Name.find('.', Next+1);
+ StringRef Format = Name.slice(Next, Dot);
+      if (Dot == StringRef::npos) // Only one '.' in the string; it is a format.
+ Operands.push_back(MipsOperand::CreateToken(Format, NameLoc));
+ else {
+        if (Name.startswith("c.")) {
+          // Floating-point compare: add '.' and an immediate representing
+          // the condition code.
+ Operands.push_back(MipsOperand::CreateToken(".", NameLoc));
+ int Cc = ConvertCcString(Format);
+ if (Cc == -1) {
+            return Error(NameLoc, "invalid condition code");
+ }
+          SMLoc E = SMLoc::getFromPointer(
+              Parser.getTok().getLoc().getPointer() - 1);
+ Operands.push_back(MipsOperand::CreateImm(
+ MCConstantExpr::Create(Cc, getContext()), NameLoc, E));
+ } else {
+          // trunc, ceil, floor, etc.
+ return parseMathOperation(Name, NameLoc, Operands);
+ }
+
+        // The rest is a format.
+ Format = Name.slice(Dot, StringRef::npos);
+ Operands.push_back(MipsOperand::CreateToken(Format, NameLoc));
+ }
+
+ setFpFormat(Format);
+ }
+ }
+
+ // Read the remaining operands.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ // Read the first operand.
+ if (ParseOperand(Operands, Name)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+
+    while (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat the comma.
+
+ // Parse and remember the operand.
+ if (ParseOperand(Operands, Name)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+ }
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+
+ Parser.Lex(); // Consume the EndOfStatement
+ return false;
}
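+
+// Worked example of the mnemonic split above: for "c.eq.d", Next == 1 and
+// Dot == 4, so the operand list becomes the token "c", the token ".", the
+// immediate 2 (ConvertCcString(".eq")) and the format token ".d", which
+// also selects the double-precision register format via setFpFormat.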
bool MipsAsmParser::
ParseDirective(AsmToken DirectiveID) {
- return true;
-}
-MipsAsmParser::OperandMatchResultTy MipsAsmParser::
- parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*>&) {
- return MatchOperand_ParseFail;
+ if (DirectiveID.getString() == ".ent") {
+    // Ignore this directive for now.
+ Parser.Lex();
+ return false;
+ }
+
+ if (DirectiveID.getString() == ".end") {
+    // Ignore this directive for now.
+ Parser.Lex();
+ return false;
+ }
+
+ if (DirectiveID.getString() == ".frame") {
+    // Ignore this directive for now.
+ Parser.EatToEndOfStatement();
+ return false;
+ }
+
+ if (DirectiveID.getString() == ".set") {
+    // Ignore this directive for now.
+ Parser.EatToEndOfStatement();
+ return false;
+ }
+
+ if (DirectiveID.getString() == ".fmask") {
+    // Ignore this directive for now.
+ Parser.EatToEndOfStatement();
+ return false;
+ }
+
+ if (DirectiveID.getString() == ".mask") {
+    // Ignore this directive for now.
+ Parser.EatToEndOfStatement();
+ return false;
+ }
+
+ if (DirectiveID.getString() == ".gpword") {
+    // Ignore this directive for now.
+ Parser.EatToEndOfStatement();
+ return false;
+ }
+
+ return true;
}
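+
+// A possible tightening of the chain above (a sketch, same behavior):
+//   StringRef ID = DirectiveID.getString();
+//   if (ID == ".ent" || ID == ".end") { // Consume only the symbol token.
+//     Parser.Lex();
+//     return false;
+//   }
+//   if (ID == ".frame" || ID == ".set" || ID == ".fmask" || ID == ".mask" ||
+//       ID == ".gpword") { // Skip the whole statement.
+//     Parser.EatToEndOfStatement();
+//     return false;
+//   }
+//   return true; // Unknown directive.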
extern "C" void LLVMInitializeMipsAsmParser() {
@@ -130,3 +945,7 @@
RegisterMCAsmParser<MipsAsmParser> A(TheMips64Target);
RegisterMCAsmParser<MipsAsmParser> B(TheMips64elTarget);
}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "MipsGenAsmMatcher.inc"
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index f535c50..0f84358 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -21,6 +21,7 @@
MipsAsmPrinter.cpp
MipsCodeEmitter.cpp
MipsDelaySlotFiller.cpp
+ MipsDirectObjLower.cpp
MipsELFWriterInfo.cpp
MipsJITInfo.cpp
MipsInstrInfo.cpp
diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
index 234455e..9603327 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
@@ -122,7 +122,7 @@
{
switch (RegEnum) {
case Mips::ZERO: case Mips::ZERO_64: case Mips::F0: case Mips::D0_64:
- case Mips::D0:
+ case Mips::D0: case Mips::FCC0:
return 0;
case Mips::AT: case Mips::AT_64: case Mips::F1: case Mips::D1_64:
return 1;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index b8489ca..5d240fe 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -56,7 +56,7 @@
MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
bool _isN64, bool IsLittleEndian)
: MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS,
- /*HasRelocationAddend*/ false,
+                              /*HasRelocationAddend*/ _isN64,
/*IsN64*/ _isN64) {}
MipsELFObjectWriter::~MipsELFObjectWriter() {}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 8dab62d..1d7370a 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -143,7 +143,11 @@
SmallVectorImpl<MCFixup> &Fixups) const {
const MCOperand &MO = MI.getOperand(OpNo);
- assert(MO.isExpr() && "getBranchTargetOpValue expects only expressions");
+
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm()) return MO.getImm();
+ assert(MO.isExpr() &&
+ "getBranchTargetOpValue expects only expressions or immediates");
const MCExpr *Expr = MO.getExpr();
Fixups.push_back(MCFixup::Create(0, Expr,
@@ -159,7 +163,10 @@
SmallVectorImpl<MCFixup> &Fixups) const {
const MCOperand &MO = MI.getOperand(OpNo);
- assert(MO.isExpr() && "getJumpTargetOpValue expects only expressions");
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm()) return MO.getImm();
+ assert(MO.isExpr() &&
+ "getJumpTargetOpValue expects only expressions or an immediate");
const MCExpr *Expr = MO.getExpr();
Fixups.push_back(MCFixup::Create(0, Expr,
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index 2bc286b..ec84ad8 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -26,7 +26,7 @@
Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm)
: MipsInstrInfo(tm, /* FIXME: set mips16 unconditional br */ 0),
- RI(*tm.getSubtargetImpl(), *this) {}
+ RI(*tm.getSubtargetImpl()) {}
const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const {
return RI;
diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp
index c15d1bf..106e82f 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -38,9 +38,8 @@
using namespace llvm;
-Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST,
- const TargetInstrInfo &TII)
- : MipsRegisterInfo(ST, TII) {}
+Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST)
+ : MipsRegisterInfo(ST) {}
// This function eliminate ADJCALLSTACKDOWN,
// ADJCALLSTACKUP pseudo instructions
diff --git a/lib/Target/Mips/Mips16RegisterInfo.h b/lib/Target/Mips/Mips16RegisterInfo.h
index 3f4b3a7..c702a15 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.h
+++ b/lib/Target/Mips/Mips16RegisterInfo.h
@@ -17,11 +17,11 @@
#include "MipsRegisterInfo.h"
namespace llvm {
+class Mips16InstrInfo;
class Mips16RegisterInfo : public MipsRegisterInfo {
public:
- Mips16RegisterInfo(const MipsSubtarget &Subtarget,
- const TargetInstrInfo &TII);
+ Mips16RegisterInfo(const MipsSubtarget &Subtarget);
void eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 20fc178..147be5d 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -110,9 +110,9 @@
def DSRLV : shift_rotate_reg<0x16, 0x00, "dsrlv", srl, CPU64Regs>;
def DSRAV : shift_rotate_reg<0x17, 0x00, "dsrav", sra, CPU64Regs>;
let Pattern = []<dag> in {
-def DSLL32 : shift_rotate_imm64<0x3c, 0x00, "dsll32", shl>;
-def DSRL32 : shift_rotate_imm64<0x3e, 0x00, "dsrl32", srl>;
-def DSRA32 : shift_rotate_imm64<0x3f, 0x00, "dsra32", sra>;
+ def DSLL32 : shift_rotate_imm64<0x3c, 0x00, "dsll32", shl>;
+ def DSRL32 : shift_rotate_imm64<0x3e, 0x00, "dsrl32", srl>;
+ def DSRA32 : shift_rotate_imm64<0x3f, 0x00, "dsra32", sra>;
}
}
// Rotate Instructions
@@ -217,7 +217,15 @@
def RDHWR64 : ReadHardware<CPU64Regs, HWRegs64>;
def DEXT : ExtBase<3, "dext", CPU64Regs>;
+let Pattern = []<dag> in {
+ def DEXTU : ExtBase<2, "dextu", CPU64Regs>;
+ def DEXTM : ExtBase<1, "dextm", CPU64Regs>;
+}
def DINS : InsBase<7, "dins", CPU64Regs>;
+let Pattern = []<dag> in {
+ def DINSU : InsBase<6, "dinsu", CPU64Regs>;
+ def DINSM : InsBase<5, "dinsm", CPU64Regs>;
+}
let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
def DSLL64_32 : FR<0x00, 0x3c, (outs CPU64Regs:$rd), (ins CPURegs:$rt),
diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/lib/Target/Mips/MipsAnalyzeImmediate.cpp
index dc8fbd0..99b163e 100644
--- a/lib/Target/Mips/MipsAnalyzeImmediate.cpp
+++ b/lib/Target/Mips/MipsAnalyzeImmediate.cpp
@@ -91,7 +91,7 @@
// Sign-extend and shift operand of ADDiu and see if it still fits in 16-bit.
int64_t Imm = SignExtend64<16>(Seq[0].ImmOpnd);
- int64_t ShiftedImm = Imm << (Seq[1].ImmOpnd - 16);
+ int64_t ShiftedImm = (uint64_t)Imm << (Seq[1].ImmOpnd - 16);
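+  // Shifting a negative int64_t left is undefined behavior in C++, so the
+  // detour through uint64_t makes the shift well-defined while keeping the
+  // same bit pattern on two's-complement targets,
+  // e.g. (int64_t)((uint64_t)-4 << 2) == -16.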
if (!isInt<16>(ShiftedImm))
return;
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 00ff754..e780134 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -15,6 +15,7 @@
#define DEBUG_TYPE "mips-asm-printer"
#include "Mips.h"
#include "MipsAsmPrinter.h"
+#include "MipsDirectObjLower.h"
#include "MipsInstrInfo.h"
#include "MipsMCInstLower.h"
#include "InstPrinter/MipsInstPrinter.h"
@@ -58,33 +59,31 @@
return;
}
- // Direct object specific instruction lowering
- if (!OutStreamer.hasRawTextSupport())
- switch (MI->getOpcode()) {
- case Mips::DSLL:
- case Mips::DSRL:
- case Mips::DSRA:
- assert(MI->getNumOperands() == 3 &&
- "Invalid no. of machine operands for shift!");
- assert(MI->getOperand(2).isImm());
- int64_t Shift = MI->getOperand(2).getImm();
- if (Shift > 31) {
- MCInst TmpInst0;
- MCInstLowering.LowerLargeShift(MI, TmpInst0, Shift - 32);
- OutStreamer.EmitInstruction(TmpInst0);
- return;
- }
- break;
- }
-
MachineBasicBlock::const_instr_iterator I = MI;
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
do {
MCInst TmpInst0;
MCInstLowering.Lower(I++, TmpInst0);
+
+ // Direct object specific instruction lowering
+    if (!OutStreamer.hasRawTextSupport()) {
+ switch (TmpInst0.getOpcode()) {
+      // If the shift amount is >= 32, the instruction needs further lowering.
+ case Mips::DSLL:
+ case Mips::DSRL:
+ case Mips::DSRA:
+ Mips::LowerLargeShift(TmpInst0);
+ break;
+      // The DEXT/DINS variant is chosen by the pos and size operands.
+ case Mips::DEXT:
+ case Mips::DINS:
+ Mips::LowerDextDins(TmpInst0);
+ }
+ }
+
OutStreamer.EmitInstruction(TmpInst0);
- } while ((I != E) && I->isInsideBundle());
+ } while ((I != E) && I->isInsideBundle()); // Delay slot check
}
//===----------------------------------------------------------------------===//
@@ -214,7 +213,7 @@
case MipsSubtarget::N32: return "abiN32";
case MipsSubtarget::N64: return "abi64";
case MipsSubtarget::EABI: return "eabi32"; // TODO: handle eabi64
- default: llvm_unreachable("Unknown Mips ABI");;
+ default: llvm_unreachable("Unknown Mips ABI");
}
}
diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp
index cb7022b..5433295 100644
--- a/lib/Target/Mips/MipsCodeEmitter.cpp
+++ b/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -30,7 +30,6 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
#include "llvm/PassManager.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -139,7 +138,7 @@
do {
DEBUG(errs() << "JITTing function '"
- << MF.getFunction()->getName() << "'\n");
+ << MF.getName() << "'\n");
MCE.startFunction(MF);
for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index 2bba8a3..e3c8ed7 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -30,10 +30,11 @@
STATISTIC(UsefulSlots, "Number of delay slots filled with instructions that"
" are not NOP.");
-static cl::opt<bool> EnableDelaySlotFiller(
- "enable-mips-delay-filler",
+static cl::opt<bool> DisableDelaySlotFiller(
+ "disable-mips-delay-filler",
cl::init(false),
- cl::desc("Fill the Mips delay slots useful instructions."),
+  cl::desc("Disable the delay slot filler, which attempts to fill the Mips "
+ "delay slots with useful instructions."),
cl::Hidden);
// This option can be used to silence complaints by machine verifier passes.
@@ -114,7 +115,9 @@
InstrIter D;
- if (EnableDelaySlotFiller && findDelayInstr(MBB, I, D)) {
+ // Delay slot filling is disabled at -O0.
+ if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None) &&
+ findDelayInstr(MBB, I, D)) {
MBB.splice(llvm::next(I), &MBB, D);
++UsefulSlots;
} else
diff --git a/lib/Target/Mips/MipsDirectObjLower.cpp b/lib/Target/Mips/MipsDirectObjLower.cpp
new file mode 100644
index 0000000..0d74db8
--- /dev/null
+++ b/lib/Target/Mips/MipsDirectObjLower.cpp
@@ -0,0 +1,86 @@
+//===-- MipsDirectObjLower.cpp - Mips LLVM direct object lowering -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Mips MCInst records that the assembler
+// would normally lower itself, such as large shifts.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsDirectObjLower.h"
+#include "MipsInstrInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+// If the D<shift> instruction has a shift amount that is greater
+// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
+void Mips::LowerLargeShift(MCInst& Inst) {
+
+ assert(Inst.getNumOperands() == 3 && "Invalid no. of operands for shift!");
+ assert(Inst.getOperand(2).isImm());
+
+ bool isLarge = false;
+ int64_t Shift;
+ Shift = Inst.getOperand(2).getImm();
+ if (Shift > 31) {
+ Shift -= 32;
+ isLarge = true;
+ }
+
+  // Set the saminus32 (shift amount minus 32) operand.
+ (Inst.getOperand(2)).setImm(Shift);
+
+ if (isLarge)
+ switch (Inst.getOpcode()) {
+ default:
+      // The caller is expected to pass only shift opcodes.
+ llvm_unreachable("Unexpected shift instruction");
+ case Mips::DSLL:
+ Inst.setOpcode(Mips::DSLL32);
+ return;
+ case Mips::DSRL:
+ Inst.setOpcode(Mips::DSRL32);
+ return;
+ case Mips::DSRA:
+ Inst.setOpcode(Mips::DSRA32);
+ return;
+ }
+}
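+
+// Worked example (a sketch of the observable effect): "dsll $2, $3, 36"
+// cannot encode 36 in the 5-bit shift field, so the MCInst is rewritten to
+// DSLL32 with saminus32 == 36 - 32 == 4, i.e. "dsll32 $2, $3, 4".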
+
+// Pick a DEXT or DINS instruction variant based on the pos and size operands
+void Mips::LowerDextDins(MCInst& InstIn) {
+ int Opcode = InstIn.getOpcode();
+
+ if (Opcode == Mips::DEXT)
+ assert(InstIn.getNumOperands() == 4 &&
+ "Invalid no. of machine operands for DEXT!");
+ else // Only DEXT and DINS are possible
+ assert(InstIn.getNumOperands() == 5 &&
+ "Invalid no. of machine operands for DINS!");
+
+ assert(InstIn.getOperand(2).isImm());
+ int64_t pos = InstIn.getOperand(2).getImm();
+ assert(InstIn.getOperand(3).isImm());
+ int64_t size = InstIn.getOperand(3).getImm();
+
+ if (size <= 32) {
+    if (pos < 32) { // DEXT/DINS, do nothing
+ return;
+ } else { // DEXTU/DINSU
+ InstIn.getOperand(2).setImm(pos - 32);
+ InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTU : Mips::DINSU);
+ return;
+ }
+ } else { // DEXTM/DINSM
+ assert(pos < 32 && "DEXT/DINS cannot have both size and pos > 32");
+ InstIn.getOperand(3).setImm(size - 32);
+ InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTM : Mips::DINSM);
+ return;
+ }
+}
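+
+// Worked examples of the selection rule above (sketches): pos == 36,
+// size == 16 picks DEXTU/DINSU with pos' == 4; pos == 4, size == 40 picks
+// DEXTM/DINSM with size' == 8. Both pos and size > 32 is rejected because
+// the extracted field would extend past bit 63.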
diff --git a/lib/Target/Mips/MipsDirectObjLower.h b/lib/Target/Mips/MipsDirectObjLower.h
new file mode 100644
index 0000000..8813cc9
--- /dev/null
+++ b/lib/Target/Mips/MipsDirectObjLower.h
@@ -0,0 +1,28 @@
+//===-- MipsDirectObjLower.h - Mips LLVM direct object lowering *- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSDIRECTOBJLOWER_H
+#define MIPSDIRECTOBJLOWER_H
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+ class MCInst;
+ class MCStreamer;
+
+ namespace Mips {
+  /// MipsDirectObjLower - This namespace is used to lower MCInsts in cases
+  /// where the assembler would usually finish the lowering, such as large
+  /// shifts.
+ void LowerLargeShift(MCInst &Inst);
+ void LowerDextDins(MCInst &Inst);
+ }
+}
+
+#endif
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 5a97c17..4205223 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -337,8 +337,9 @@
// Generate:
// lui $2, %hi($CPI1_0)
// lwc1 $f0, %lo($CPI1_0)($2)
- if (Addr.getOperand(1).getOpcode() == MipsISD::Lo) {
- SDValue LoVal = Addr.getOperand(1), Opnd0 = LoVal.getOperand(0);
+ if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
+ Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
+ SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
isa<JumpTableSDNode>(Opnd0)) {
Base = Addr.getOperand(0);
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index c5207c6..aa7b459 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -1571,15 +1571,15 @@
if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) {
SDVTList VTs = DAG.getVTList(MVT::i32);
- MipsTargetObjectFile &TLOF = (MipsTargetObjectFile&)getObjFileLowering();
+    const MipsTargetObjectFile &TLOF =
+        (const MipsTargetObjectFile &)getObjFileLowering();
// %gp_rel relocation
if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) {
SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
MipsII::MO_GPREL);
SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, dl, VTs, &GA, 1);
- SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
- return DAG.getNode(ISD::ADD, dl, MVT::i32, GOT, GPRelNode);
+ SDValue GPReg = DAG.getRegister(Mips::GP, MVT::i32);
+ return DAG.getNode(ISD::ADD, dl, MVT::i32, GPReg, GPRelNode);
}
// %hi/%lo relocation
SDValue GAHi = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index 50e3eb5..8ade891 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -262,46 +262,3 @@
}
}
}
-
-unsigned
-llvm::Mips::loadImmediate(int64_t Imm, bool IsN64, const TargetInstrInfo &TII,
- MachineBasicBlock& MBB,
- MachineBasicBlock::iterator II, DebugLoc DL,
- bool LastInstrIsADDiu,
- MipsAnalyzeImmediate::Inst *LastInst) {
- MipsAnalyzeImmediate AnalyzeImm;
- unsigned Size = IsN64 ? 64 : 32;
- unsigned LUi = IsN64 ? Mips::LUi64 : Mips::LUi;
- unsigned ZEROReg = IsN64 ? Mips::ZERO_64 : Mips::ZERO;
- unsigned ATReg = IsN64 ? Mips::AT_64 : Mips::AT;
-
- const MipsAnalyzeImmediate::InstSeq &Seq =
- AnalyzeImm.Analyze(Imm, Size, LastInstrIsADDiu);
- MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
-
- if (LastInst && (Seq.size() == 1)) {
- *LastInst = *Inst;
- return 0;
- }
-
- // The first instruction can be a LUi, which is different from other
- // instructions (ADDiu, ORI and SLL) in that it does not have a register
- // operand.
- if (Inst->Opc == LUi)
- BuildMI(MBB, II, DL, TII.get(LUi), ATReg)
- .addImm(SignExtend64<16>(Inst->ImmOpnd));
- else
- BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ZEROReg)
- .addImm(SignExtend64<16>(Inst->ImmOpnd));
-
- // Build the remaining instructions in Seq. Skip the last instruction if
- // LastInst is not 0.
- for (++Inst; Inst != Seq.end() - !!LastInst; ++Inst)
- BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ATReg)
- .addImm(SignExtend64<16>(Inst->ImmOpnd));
-
- if (LastInst)
- *LastInst = *Inst;
-
- return Seq.size() - !!LastInst;
-}
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 7d56259..aca2bc7 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -88,18 +88,6 @@
const SmallVectorImpl<MachineOperand>& Cond) const;
};
-namespace Mips {
- /// Emit a series of instructions to load an immediate. All instructions
- /// except for the last one are emitted. The function returns the number of
- /// MachineInstrs generated. The opcode-immediate pair of the last
- /// instruction is returned in LastInst, if it is not 0.
- unsigned
- loadImmediate(int64_t Imm, bool IsN64, const TargetInstrInfo &TII,
- MachineBasicBlock& MBB, MachineBasicBlock::iterator II,
- DebugLoc DL, bool LastInstrIsADDiu,
- MipsAnalyzeImmediate::Inst *LastInst);
-}
-
/// Create MipsInstrInfo objects.
const MipsInstrInfo *createMips16InstrInfo(MipsTargetMachine &TM);
const MipsInstrInfo *createMipsSEInstrInfo(MipsTargetMachine &TM);
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index fd952ef..6b6005f 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -74,9 +74,10 @@
// These are target-independent nodes, but have target-specific formats.
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart,
- [SDNPHasChain, SDNPOutGlue]>;
+ [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeqEnd,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+ [SDNPHasChain, SDNPSideEffect,
+ SDNPOptInGlue, SDNPOutGlue]>;
// MAdd*/MSub* nodes
def MipsMAdd : SDNode<"MipsISD::MAdd", SDT_MipsMAddMSub,
@@ -110,7 +111,7 @@
def MipsDynAlloc : SDNode<"MipsISD::DynAlloc", SDT_MipsDynAlloc,
[SDNPHasChain, SDNPInGlue]>;
-def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain]>;
+def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain,SDNPSideEffect]>;
def MipsExt : SDNode<"MipsISD::Ext", SDT_Ext>;
def MipsIns : SDNode<"MipsISD::Ins", SDT_Ins>;
@@ -1079,6 +1080,26 @@
def INS : InsBase<4, "ins", CPURegs>;
//===----------------------------------------------------------------------===//
+// Instruction aliases
+//===----------------------------------------------------------------------===//
+def : InstAlias<"move $dst,$src", (ADD CPURegs:$dst,CPURegs:$src,ZERO)>;
+def : InstAlias<"bal $offset", (BGEZAL RA,brtarget:$offset)>;
+def : InstAlias<"addu $rs,$rt,$imm",
+ (ADDiu CPURegs:$rs,CPURegs:$rt,simm16:$imm)>;
+def : InstAlias<"add $rs,$rt,$imm",
+ (ADDi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>;
+def : InstAlias<"and $rs,$rt,$imm",
+ (ANDi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>;
+def : InstAlias<"j $rs", (JR CPURegs:$rs)>;
+def : InstAlias<"not $rt,$rs", (NOR CPURegs:$rt,CPURegs:$rs,ZERO)>;
+def : InstAlias<"neg $rt,$rs", (SUB CPURegs:$rt,ZERO,CPURegs:$rs)>;
+def : InstAlias<"negu $rt,$rs", (SUBu CPURegs:$rt,ZERO,CPURegs:$rs)>;
+def : InstAlias<"slt $rs,$rt,$imm",
+ (SLTi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>;
+def : InstAlias<"xor $rs,$rt,$imm",
+ (XORi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>;
+
+//===----------------------------------------------------------------------===//
// Arbitrary patterns that map to one or more instructions
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index f78203f..b9dbd52 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -10,6 +10,10 @@
// This pass expands a branch or jump instruction into a long branch if its
// offset is too large to fit into its immediate field.
//
+// FIXME:
+// 1. Fix pc-region jump instructions which cross 256MB segment boundaries.
+// 2. If the program has inline assembly statements whose size cannot be
+// determined accurately, load branch target addresses from the GOT.
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "mips-long-branch"
@@ -48,7 +52,7 @@
typedef MachineBasicBlock::reverse_iterator ReverseIter;
struct MBBInfo {
- uint64_t Size;
+ uint64_t Size, Address;
bool HasLongBranch;
MachineInstr *Br;
@@ -61,7 +65,10 @@
static char ID;
MipsLongBranch(TargetMachine &tm)
: MachineFunctionPass(ID), TM(tm),
- TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())) {}
+ TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_),
+ ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
+ LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 13 : 9)) {}
virtual const char *getPassName() const {
return "Mips Long Branch";
@@ -81,6 +88,9 @@
const MipsInstrInfo *TII;
MachineFunction *MF;
SmallVector<MBBInfo, 16> MBBInfos;
+ bool IsPIC;
+ unsigned ABI;
+ unsigned LongBranchSeqSize;
};
char MipsLongBranch::ID = 0;
@@ -230,12 +240,6 @@
// Expand branch instructions to long branches.
void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
- I.HasLongBranch = true;
-
- bool IsPIC = TM.getRelocationModel() == Reloc::PIC_;
- unsigned ABI = TM.getSubtarget<MipsSubtarget>().getTargetABI();
- bool N64 = ABI == MipsSubtarget::N64;
-
MachineBasicBlock::iterator Pos;
MachineBasicBlock *MBB = I.Br->getParent(), *TgtMBB = getTargetMBB(*I.Br);
DebugLoc DL = I.Br->getDebugLoc();
@@ -248,101 +252,105 @@
MBB->addSuccessor(LongBrMBB);
if (IsPIC) {
- // $longbr:
- // addiu $sp, $sp, -regsize * 2
- // sw $ra, 0($sp)
- // bal $baltgt
- // sw $a3, regsize($sp)
- // $baltgt:
- // lui $a3, %hi($baltgt)
- // lui $at, %hi($tgt)
- // addiu $a3, $a3, %lo($baltgt)
- // addiu $at, $at, %lo($tgt)
- // subu $at, $at, $a3
- // addu $at, $ra, $at
- //
- // if n64:
- // lui $a3, %highest($baltgt)
- // lui $ra, %highest($tgt)
- // addiu $a3, $a3, %higher($baltgt)
- // addiu $ra, $ra, %higher($tgt)
- // dsll $a3, $a3, 32
- // dsll $ra, $ra, 32
- // subu $at, $at, $a3
- // addu $at, $at, $ra
- //
- // lw $ra, 0($sp)
- // lw $a3, regsize($sp)
- // jr $at
- // addiu $sp, $sp, regsize * 2
- // $fallthrough:
- //
- MF->getInfo<MipsFunctionInfo>()->setEmitNOAT();
MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(FallThroughMBB, BalTgtMBB);
LongBrMBB->addSuccessor(BalTgtMBB);
BalTgtMBB->addSuccessor(TgtMBB);
- int RegSize = N64 ? 8 : 4;
- unsigned AT = N64 ? Mips::AT_64 : Mips::AT;
- unsigned A3 = N64 ? Mips::A3_64 : Mips::A3;
- unsigned SP = N64 ? Mips::SP_64 : Mips::SP;
- unsigned RA = N64 ? Mips::RA_64 : Mips::RA;
- unsigned Load = N64 ? Mips::LD_P8 : Mips::LW;
- unsigned Store = N64 ? Mips::SD_P8 : Mips::SW;
- unsigned LUi = N64 ? Mips::LUi64 : Mips::LUi;
- unsigned ADDiu = N64 ? Mips::DADDiu : Mips::ADDiu;
- unsigned ADDu = N64 ? Mips::DADDu : Mips::ADDu;
- unsigned SUBu = N64 ? Mips::SUBu : Mips::SUBu;
- unsigned JR = N64 ? Mips::JR64 : Mips::JR;
+ int64_t TgtAddress = MBBInfos[TgtMBB->getNumber()].Address;
+ int64_t Offset = TgtAddress - (I.Address + I.Size - 20);
+ int64_t Lo = SignExtend64<16>(Offset & 0xffff);
+ int64_t Hi = SignExtend64<16>(((Offset + 0x8000) >> 16) & 0xffff);
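+    // Adding 0x8000 rounds Hi up when Lo's sign bit is set, so that
+    // (Hi << 16) + SignExtend64<16>(Lo) == Offset for any 32-bit offset;
+    // e.g. Offset == 0x18000 yields Hi == 2 and Lo == -32768.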
- Pos = LongBrMBB->begin();
+ if (ABI != MipsSubtarget::N64) {
+ // $longbr:
+ // addiu $sp, $sp, -8
+ // sw $ra, 0($sp)
+ // bal $baltgt
+ // lui $at, %hi($tgt - $baltgt)
+ // $baltgt:
+ // addiu $at, $at, %lo($tgt - $baltgt)
+ // addu $at, $ra, $at
+ // lw $ra, 0($sp)
+ // jr $at
+ // addiu $sp, $sp, 8
+ // $fallthrough:
+ //
- BuildMI(*LongBrMBB, Pos, DL, TII->get(ADDiu), SP).addReg(SP)
- .addImm(-RegSize * 2);
- BuildMI(*LongBrMBB, Pos, DL, TII->get(Store)).addReg(RA).addReg(SP)
- .addImm(0);
- BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB);
- BuildMI(*LongBrMBB, Pos, DL, TII->get(Store)).addReg(A3).addReg(SP)
- .addImm(RegSize)->setIsInsideBundle();
+ Pos = LongBrMBB->begin();
- Pos = BalTgtMBB->begin();
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+ .addReg(Mips::SP).addImm(-8);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SW)).addReg(Mips::RA)
+ .addReg(Mips::SP).addImm(0);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LUi), Mips::AT).addImm(Hi)
+ ->setIsInsideBundle();
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), A3)
- .addMBB(BalTgtMBB, MipsII::MO_ABS_HI);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), AT)
- .addMBB(TgtMBB, MipsII::MO_ABS_HI);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), A3).addReg(A3)
- .addMBB(BalTgtMBB, MipsII::MO_ABS_LO);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), AT).addReg(AT)
- .addMBB(TgtMBB, MipsII::MO_ABS_LO);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(SUBu), AT).addReg(AT).addReg(A3);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDu), AT).addReg(RA).addReg(AT);
+ Pos = BalTgtMBB->begin();
- if (N64) {
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), A3)
- .addMBB(BalTgtMBB, MipsII::MO_HIGHEST);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), RA)
- .addMBB(TgtMBB, MipsII::MO_HIGHEST);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), A3).addReg(A3)
- .addMBB(BalTgtMBB, MipsII::MO_HIGHER);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), RA).addReg(RA)
- .addMBB(TgtMBB, MipsII::MO_HIGHER);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DSLL), A3).addReg(A3)
- .addImm(32);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DSLL), RA).addReg(RA)
- .addImm(32);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(SUBu), AT).addReg(AT).addReg(A3);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDu), AT).addReg(AT).addReg(RA);
- I.Size += 4 * 8;
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::AT)
+ .addReg(Mips::AT).addImm(Lo);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDu), Mips::AT)
+ .addReg(Mips::RA).addReg(Mips::AT);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA)
+ .addReg(Mips::SP).addImm(0);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR)).addReg(Mips::AT);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+ .addReg(Mips::SP).addImm(8)->setIsInsideBundle();
+ } else {
+ // $longbr:
+ // daddiu $sp, $sp, -16
+ // sd $ra, 0($sp)
+ // lui64 $at, %highest($tgt - $baltgt)
+ // daddiu $at, $at, %higher($tgt - $baltgt)
+ // dsll $at, $at, 16
+ // daddiu $at, $at, %hi($tgt - $baltgt)
+ // bal $baltgt
+ // dsll $at, $at, 16
+ // $baltgt:
+ // daddiu $at, $at, %lo($tgt - $baltgt)
+ // daddu $at, $ra, $at
+ // ld $ra, 0($sp)
+ // jr64 $at
+ // daddiu $sp, $sp, 16
+ // $fallthrough:
+ //
+
+ int64_t Higher = SignExtend64<16>(((Offset + 0x80008000) >> 32) & 0xffff);
+ int64_t Highest =
+ SignExtend64<16>(((Offset + 0x800080008000LL) >> 48) & 0xffff);
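+      // Each rounding constant folds in the carries from the lower
+      // sign-extended pieces: 0x8000 covers %lo, 0x80008000 covers %lo and
+      // %hi, and 0x800080008000 additionally covers %higher, so the four
+      // 16-bit pieces recombine to the full 64-bit Offset.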
+
+ Pos = LongBrMBB->begin();
+
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
+ .addReg(Mips::SP_64).addImm(-16);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SD)).addReg(Mips::RA_64)
+ .addReg(Mips::SP_64).addImm(0);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LUi64), Mips::AT_64)
+ .addImm(Highest);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64)
+ .addReg(Mips::AT_64).addImm(Higher);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
+ .addReg(Mips::AT_64).addImm(16);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64)
+ .addReg(Mips::AT_64).addImm(Hi);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
+ .addReg(Mips::AT_64).addImm(16)->setIsInsideBundle();
+
+ Pos = BalTgtMBB->begin();
+
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64)
+ .addReg(Mips::AT_64).addImm(Lo);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDu), Mips::AT_64)
+ .addReg(Mips::RA_64).addReg(Mips::AT_64);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LD), Mips::RA_64)
+ .addReg(Mips::SP_64).addImm(0);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
+ .addReg(Mips::SP_64).addImm(16)->setIsInsideBundle();
}
-
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(Load), RA).addReg(SP).addImm(0);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(Load), A3).addReg(SP).addImm(RegSize);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(JR)).addReg(AT);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), SP).addReg(SP)
- .addImm(RegSize * 2)->setIsInsideBundle();
- I.Size += 4 * 14;
} else {
// $longbr:
// j $tgt
@@ -353,7 +361,6 @@
LongBrMBB->addSuccessor(TgtMBB);
BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::J)).addMBB(TgtMBB);
BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::NOP))->setIsInsideBundle();
- I.Size += 4 * 2;
}
if (I.Br->isUnconditionalBranch()) {
@@ -401,19 +408,36 @@
if (!I->Br || I->HasLongBranch)
continue;
- if (!ForceLongBranch)
- // Check if offset fits into 16-bit immediate field of branches.
- if (isInt<16>(computeOffset(I->Br) / 4))
- continue;
+ // Check if offset fits into 16-bit immediate field of branches.
+ if (!ForceLongBranch && isInt<16>(computeOffset(I->Br) / 4))
+ continue;
- expandToLongBranch(*I);
+ I->HasLongBranch = true;
+ I->Size += LongBranchSeqSize * 4;
++LongBranches;
EverMadeChange = MadeChange = true;
}
}
- if (EverMadeChange)
- MF->RenumberBlocks();
+ if (!EverMadeChange)
+ return true;
+
+ // Compute basic block addresses.
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ MF->getInfo<MipsFunctionInfo>()->setEmitNOAT();
+
+ uint64_t Address = 0;
+
+ for (I = MBBInfos.begin(); I != E; Address += I->Size, ++I)
+ I->Address = Address;
+ }
+
+ // Do the expansion.
+ for (I = MBBInfos.begin(); I != E; ++I)
+ if (I->HasLongBranch)
+ expandToLongBranch(*I);
+
+ MF->RenumberBlocks();
return true;
}
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index d4c5e6d..5fa6339 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -11,7 +11,6 @@
// MCInst records.
//
//===----------------------------------------------------------------------===//
-
#include "MipsMCInstLower.h"
#include "MipsAsmPrinter.h"
#include "MipsInstrInfo.h"
@@ -161,31 +160,3 @@
}
}
-// If the D<shift> instruction has a shift amount that is greater
-// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
-void MipsMCInstLower::LowerLargeShift(const MachineInstr *MI,
- MCInst& Inst,
- int64_t Shift) {
- // rt
- Inst.addOperand(LowerOperand(MI->getOperand(0)));
- // rd
- Inst.addOperand(LowerOperand(MI->getOperand(1)));
- // saminus32
- Inst.addOperand(MCOperand::CreateImm(Shift));
-
- switch (MI->getOpcode()) {
- default:
- // Calling function is not synchronized
- llvm_unreachable("Unexpected shift instruction");
- break;
- case Mips::DSLL:
- Inst.setOpcode(Mips::DSLL32);
- break;
- case Mips::DSRL:
- Inst.setOpcode(Mips::DSRL32);
- break;
- case Mips::DSRA:
- Inst.setOpcode(Mips::DSRA32);
- break;
- }
-}
diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index 0abb996..3eab5a4 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h
@@ -33,7 +33,6 @@
MipsMCInstLower(MipsAsmPrinter &asmprinter);
void Initialize(Mangler *mang, MCContext *C);
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
- void LowerLargeShift(const MachineInstr *MI, MCInst &Inst, int64_t Shift);
private:
MCOperand LowerSymbolOperand(const MachineOperand &MO,
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index ae6ae3a..79a142a 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -22,7 +22,6 @@
#include "llvm/Constants.h"
#include "llvm/DebugInfo.h"
#include "llvm/Type.h"
-#include "llvm/Function.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -43,9 +42,8 @@
using namespace llvm;
-MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST,
- const TargetInstrInfo &tii)
- : MipsGenRegisterInfo(Mips::RA), Subtarget(ST), TII(tii) {}
+MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST)
+ : MipsGenRegisterInfo(Mips::RA), Subtarget(ST) {}
unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; }
@@ -131,6 +129,12 @@
Reserved.set(Mips::RA_64);
}
+ // Reserve GP if small section is used.
+ if (Subtarget.useSmallSection()) {
+ Reserved.set(Mips::GP);
+ Reserved.set(Mips::GP_64);
+ }
+
return Reserved;
}
@@ -160,7 +164,7 @@
"Instr doesn't have FrameIndex operand!");
}
- DEBUG(errs() << "\nFunction : " << MF.getFunction()->getName() << "\n";
+ DEBUG(errs() << "\nFunction : " << MF.getName() << "\n";
errs() << "<--------->\n" << MI);
int FrameIndex = MI.getOperand(i).getIndex();
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 9a05e94..78adf7f 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -22,16 +22,14 @@
namespace llvm {
class MipsSubtarget;
-class TargetInstrInfo;
class Type;
class MipsRegisterInfo : public MipsGenRegisterInfo {
protected:
const MipsSubtarget &Subtarget;
- const TargetInstrInfo &TII;
public:
- MipsRegisterInfo(const MipsSubtarget &Subtarget, const TargetInstrInfo &tii);
+ MipsRegisterInfo(const MipsSubtarget &Subtarget);
/// getRegisterNumbering - Given the enum value for some register, e.g.
/// Mips::RA, return the number that it corresponds to (e.g. 31).
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index eeb1de3..e4b44ef 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -260,14 +260,53 @@
if (isInt<16>(Amount))// addi sp, sp, amount
BuildMI(MBB, I, DL, get(ADDiu), SP).addReg(SP).addImm(Amount);
else { // Expand immediate that doesn't fit in 16-bit.
- unsigned ATReg = STI.isABI_N64() ? Mips::AT_64 : Mips::AT;
-
MBB.getParent()->getInfo<MipsFunctionInfo>()->setEmitNOAT();
- Mips::loadImmediate(Amount, STI.isABI_N64(), *this, MBB, I, DL, false, 0);
- BuildMI(MBB, I, DL, get(ADDu), SP).addReg(SP).addReg(ATReg);
+ unsigned Reg = loadImmediate(Amount, MBB, I, DL, 0);
+ BuildMI(MBB, I, DL, get(ADDu), SP).addReg(SP).addReg(Reg);
}
}
+/// Emit the sequence of instructions needed to load the immediate Imm into
+/// a scratch register, which is returned. If NewImm is non-null, the last
+/// instruction of the sequence is not emitted; its immediate operand is
+/// returned in *NewImm instead.
+unsigned
+MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator II, DebugLoc DL,
+ unsigned *NewImm) const {
+ MipsAnalyzeImmediate AnalyzeImm;
+ const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>();
+ unsigned Size = STI.isABI_N64() ? 64 : 32;
+ unsigned LUi = STI.isABI_N64() ? Mips::LUi64 : Mips::LUi;
+ unsigned ZEROReg = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
+ unsigned ATReg = STI.isABI_N64() ? Mips::AT_64 : Mips::AT;
+ bool LastInstrIsADDiu = NewImm;
+
+ const MipsAnalyzeImmediate::InstSeq &Seq =
+ AnalyzeImm.Analyze(Imm, Size, LastInstrIsADDiu);
+ MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
+
+ assert(Seq.size() && (!LastInstrIsADDiu || (Seq.size() > 1)));
+
+ // The first instruction can be a LUi, which is different from other
+ // instructions (ADDiu, ORI and SLL) in that it does not have a register
+ // operand.
+ if (Inst->Opc == LUi)
+ BuildMI(MBB, II, DL, get(LUi), ATReg)
+ .addImm(SignExtend64<16>(Inst->ImmOpnd));
+ else
+ BuildMI(MBB, II, DL, get(Inst->Opc), ATReg).addReg(ZEROReg)
+ .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+ // Build the remaining instructions in Seq.
+ for (++Inst; Inst != Seq.end() - LastInstrIsADDiu; ++Inst)
+ BuildMI(MBB, II, DL, get(Inst->Opc), ATReg).addReg(ATReg)
+ .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+ if (LastInstrIsADDiu)
+ *NewImm = Inst->ImmOpnd;
+
+ return ATReg;
+}
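+
+// Two usage modes (a sketch): with NewImm == 0 the full sequence is emitted
+// and the returned scratch register holds Imm (see adjustStackPtr above);
+// with a non-null NewImm the final ADDiu is withheld so the caller can fold
+// its immediate elsewhere, as MipsSERegisterInfo::eliminateFrameIndex does.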
+
unsigned MipsSEInstrInfo::GetAnalyzableBrOpc(unsigned Opc) const {
return (Opc == Mips::BEQ || Opc == Mips::BNE || Opc == Mips::BGTZ ||
Opc == Mips::BGEZ || Opc == Mips::BLTZ || Opc == Mips::BLEZ ||
diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
index 346e74d..55b78b2 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/lib/Target/Mips/MipsSEInstrInfo.h
@@ -15,7 +15,6 @@
#define MIPSSEINSTRUCTIONINFO_H
#include "MipsInstrInfo.h"
-#include "MipsAnalyzeImmediate.h"
#include "MipsSERegisterInfo.h"
namespace llvm {
@@ -70,6 +69,13 @@
void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
+ /// Emit a series of instructions to load an immediate. If NewImm is a
+ /// non-NULL parameter, the last instruction is not emitted, but instead
+ /// its immediate operand is returned in NewImm.
+ unsigned loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator II, DebugLoc DL,
+ unsigned *NewImm) const;
+
private:
virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const;
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index 043a1ef..d868f73 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -40,8 +40,8 @@
using namespace llvm;
MipsSERegisterInfo::MipsSERegisterInfo(const MipsSubtarget &ST,
- const TargetInstrInfo &TII)
- : MipsRegisterInfo(ST, TII) {}
+ const MipsSEInstrInfo &I)
+ : MipsRegisterInfo(ST), TII(I) {}
// This function eliminate ADJCALLSTACKDOWN,
// ADJCALLSTACKUP pseudo instructions
@@ -122,15 +122,14 @@
DebugLoc DL = II->getDebugLoc();
unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
unsigned ATReg = Subtarget.isABI_N64() ? Mips::AT_64 : Mips::AT;
- MipsAnalyzeImmediate::Inst LastInst(0, 0);
+ unsigned NewImm;
MipsFI->setEmitNOAT();
- Mips::loadImmediate(Offset, Subtarget.isABI_N64(), TII, MBB, II, DL, true,
- &LastInst);
- BuildMI(MBB, II, DL, TII.get(ADDu), ATReg).addReg(FrameReg).addReg(ATReg);
+ unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL, &NewImm);
+ BuildMI(MBB, II, DL, TII.get(ADDu), ATReg).addReg(FrameReg).addReg(Reg);
FrameReg = ATReg;
- Offset = SignExtend64<16>(LastInst.ImmOpnd);
+ Offset = SignExtend64<16>(NewImm);
}
MI.getOperand(OpNo).ChangeToRegister(FrameReg, false);
diff --git a/lib/Target/Mips/MipsSERegisterInfo.h b/lib/Target/Mips/MipsSERegisterInfo.h
index 4b17b33..b4eab65 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.h
+++ b/lib/Target/Mips/MipsSERegisterInfo.h
@@ -18,11 +18,14 @@
#include "MipsRegisterInfo.h"
namespace llvm {
+class MipsSEInstrInfo;
class MipsSERegisterInfo : public MipsRegisterInfo {
+ const MipsSEInstrInfo &TII;
+
public:
MipsSERegisterInfo(const MipsSubtarget &Subtarget,
- const TargetInstrInfo &TII);
+ const MipsSEInstrInfo &TII);
void eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 11ff809..ac83d83 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -25,7 +25,8 @@
void MipsSubtarget::anchor() { }
MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool little) :
+ const std::string &FS, bool little,
+ Reloc::Model RM) :
MipsGenSubtargetInfo(TT, CPU, FS),
MipsArchVersion(Mips32), MipsABI(UnknownABI), IsLittle(little),
IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false),
@@ -54,6 +55,9 @@
// Is the target system Linux ?
if (TT.find("linux") == std::string::npos)
IsLinux = false;
+
+ // Set UseSmallSection.
+ UseSmallSection = !IsLinux && (RM == Reloc::Static);
}
bool
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index ba15362..0595e8d 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -65,6 +65,9 @@
// isLinux - Target system is Linux. Is false we consider ELFOS for now.
bool IsLinux;
+ // UseSmallSection - Small section is used.
+ bool UseSmallSection;
+
/// Features related to the presence of specific instructions.
// HasSEInReg - SEB and SEH (signext in register) instructions.
@@ -109,7 +112,7 @@
/// This constructor initializes the data members to match that
/// of the specified triple.
MipsSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool little);
+ const std::string &FS, bool little, Reloc::Model RM);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
@@ -133,6 +136,7 @@
bool inMips16Mode() const { return InMips16Mode; }
bool isAndroid() const { return IsAndroid; }
bool isLinux() const { return IsLinux; }
+ bool useSmallSection() const { return UseSmallSection; }
bool hasStandardEncoding() const { return !inMips16Mode(); }
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 2928a73..b70542b 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -42,7 +42,7 @@
CodeGenOpt::Level OL,
bool isLittle)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, isLittle),
+ Subtarget(TT, CPU, FS, isLittle, RM),
DataLayout(isLittle ?
(Subtarget.isABI_N64() ?
"e-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-n32" :
diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
index 04dc60a..1f5e34f 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -26,6 +26,7 @@
void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
SmallDataSection =
getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
@@ -60,9 +61,10 @@
IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
SectionKind Kind) const {
- // Only use small section for non linux targets.
const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
- if (Subtarget.isLinux())
+
+  // Bail out early if the small section is not in use.
+ if (!Subtarget.useSmallSection())
return false;
// Only global variables, not functions.
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index d175e3e..413142e 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -137,7 +137,7 @@
void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
char Value = MI->getOperand(OpNo).getImm();
- Value = (Value << (32-5)) >> (32-5);
+ Value = SignExtend32<5>(Value);
O << (int)Value;
}
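+  // SignExtend32<5> states the intent directly and avoids the
+  // implementation-defined shift pair; e.g. SignExtend32<5>(0x1f) == -1
+  // while SignExtend32<5>(0x0f) == 15.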
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 245b457..b9ea8b5 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -64,7 +64,6 @@
ZeroDirective = "\t.space\t";
Data64bitsDirective = is64Bit ? "\t.quad\t" : 0;
- LCOMMDirectiveType = LCOMM::NoAlignment;
AssemblerDialect = 0; // Old-Style mnemonics.
}
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index b7f1688..cb15dad 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -35,6 +35,10 @@
def Directive32 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">;
def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">;
def DirectiveA2 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">;
+def DirectiveE500mc : SubtargetFeature<"", "DarwinDirective",
+ "PPC::DIR_E500mc", "">;
+def DirectiveE5500 : SubtargetFeature<"", "DarwinDirective",
+ "PPC::DIR_E5500", "">;
def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">;
def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">;
@@ -94,6 +98,12 @@
[Directive970, FeatureAltivec,
FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
Feature64Bit /*, Feature64BitRegs */]>;
+def : ProcessorModel<"e500mc", PPCE500mcModel,
+ [DirectiveE500mc, FeatureMFOCRF,
+ FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
+def : ProcessorModel<"e5500", PPCE5500Model,
+ [DirectiveE5500, FeatureMFOCRF, Feature64Bit,
+ FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
def : Processor<"a2", PPCA2Itineraries, [DirectiveA2, FeatureBookE,
FeatureMFOCRF, FeatureFSqrt,
FeatureSTFIWX, FeatureISEL,
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index f76b89c..6e0e8bb 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -109,6 +109,8 @@
bool doFinalization(Module &M);
virtual void EmitFunctionEntryLabel();
+
+ void EmitFunctionBodyEnd();
};
/// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
@@ -345,23 +347,32 @@
OutStreamer.EmitLabel(PICBase);
return;
}
+ case PPC::LDtocJTI:
+ case PPC::LDtocCPT:
case PPC::LDtoc: {
// Transform %X3 = LDtoc <ga:@min1>, %X2
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
-
+
// Change the opcode to LD, and the global address operand to be a
// reference to the TOC entry we will synthesize later.
TmpInst.setOpcode(PPC::LD);
const MachineOperand &MO = MI->getOperand(1);
- assert(MO.isGlobal());
-
- // Map symbol -> label of TOC entry.
- MCSymbol *&TOCEntry = TOC[Mang->getSymbol(MO.getGlobal())];
+
+ // Map symbol -> label of TOC entry
+ assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
+ MCSymbol *MOSymbol = 0;
+ if (MO.isGlobal())
+ MOSymbol = Mang->getSymbol(MO.getGlobal());
+ else if (MO.isCPI())
+ MOSymbol = GetCPISymbol(MO.getIndex());
+ else if (MO.isJTI())
+ MOSymbol = GetJTISymbol(MO.getIndex());
+ MCSymbol *&TOCEntry = TOC[MOSymbol];
if (TOCEntry == 0)
TOCEntry = GetTempSymbol("C", TOCLabelID++);
-
+
const MCExpr *Exp =
- MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC,
+ MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC_ENTRY,
OutContext);
TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
OutStreamer.EmitInstruction(TmpInst);
@@ -406,9 +417,9 @@
OutContext.GetOrCreateSymbol(".L." + Twine(CurrentFnSym->getName()));
MCSymbol *Symbol2 = OutContext.GetOrCreateSymbol(StringRef(".TOC.@tocbase"));
OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol1, OutContext),
- Subtarget.isPPC64() ? 8 : 4/*size*/, 0/*addrspace*/);
+ 8/*size*/, 0/*addrspace*/);
OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol2, OutContext),
- Subtarget.isPPC64() ? 8 : 4/*size*/, 0/*addrspace*/);
+ 8/*size*/, 0/*addrspace*/);
OutStreamer.SwitchSection(Current);
MCSymbol *RealFnSym = OutContext.GetOrCreateSymbol(
@@ -441,6 +452,23 @@
return AsmPrinter::doFinalization(M);
}
+/// EmitFunctionBodyEnd - Print the traceback table before the .size
+/// directive.
+///
+void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() {
+ // Only the 64-bit target requires a traceback table. For now,
+ // we only emit the word of zeroes that GDB requires to find
+ // the end of the function, and zeroes for the eight-byte
+ // mandatory fields.
+ // FIXME: We should fill in the eight-byte mandatory fields as described in
+ // the PPC64 ELF ABI (this is a low-priority item because GDB does not
+ // currently make use of these fields).
+ if (Subtarget.isPPC64()) {
+ OutStreamer.EmitIntValue(0, 4/*size*/);
+ OutStreamer.EmitIntValue(0, 8/*size*/);
+ }
+}
+
void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
static const char *const CPUDirectives[] = {
"",
@@ -453,6 +481,8 @@
"ppc750",
"ppc970",
"ppcA2",
+ "ppce500mc",
+ "ppce5500",
"power6",
"power7",
"ppc64"
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index a00f686..e8f4d16 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -975,6 +975,7 @@
case ISD::AND: {
unsigned Imm, Imm2, SH, MB, ME;
+ uint64_t Imm64;
// If this is an and of a value rotated between 0 and 31 bits and then and'd
// with a mask, emit rlwinm
@@ -993,6 +994,14 @@
SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB), getI32Imm(ME) };
return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
}
+ // If this is a 64-bit zero-extension mask, emit rldicl.
+ if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
+ isMask_64(Imm64)) {
+ SDValue Val = N->getOperand(0);
+ MB = 64 - CountTrailingOnes_64(Imm64);
+ SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB) };
+ return CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops, 3);
+ }
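+      // Note on the block above: isMask_64 accepts a run of n ones starting
+      // at bit 0, so MB == 64 - n and "rldicl Val, 0, MB" clears the top MB
+      // bits, exactly the zero-extension the AND performed (e.g. Imm64 ==
+      // 0xffffffff gives n == 32, MB == 32).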
// AND X, 0 -> 0, not "rlwinm 32".
if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
ReplaceUses(SDValue(N, 0), N->getOperand(1));
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 61d44c5..dbb3b14 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -449,6 +449,21 @@
setSchedulingPreference(Sched::Hybrid);
computeRegisterProperties();
+
+  // The Freescale cores do better with aggressive inlining of memcpy and
+  // friends. GCC uses the same threshold of 128 bytes (= 32 word stores).
+ if (Subtarget->getDarwinDirective() == PPC::DIR_E500mc ||
+ Subtarget->getDarwinDirective() == PPC::DIR_E5500) {
+ maxStoresPerMemset = 32;
+ maxStoresPerMemsetOptSize = 16;
+ maxStoresPerMemcpy = 32;
+ maxStoresPerMemcpyOptSize = 8;
+ maxStoresPerMemmove = 32;
+ maxStoresPerMemmoveOptSize = 8;
+
+ setPrefFunctionAlignment(4);
+ benefitFromCodePlacementOpt = true;
+ }
}
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
@@ -517,6 +532,8 @@
case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
case PPCISD::MTFSF: return "PPCISD::MTFSF";
case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
+ case PPCISD::CR6SET: return "PPCISD::CR6SET";
+ case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET";
}
}
@@ -811,14 +828,13 @@
}
// Properly sign extend the value.
- int ShAmt = (4-ByteSize)*8;
- int MaskVal = ((int)Value << ShAmt) >> ShAmt;
+ int MaskVal = SignExtend32(Value, ByteSize * 8);
// If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
if (MaskVal == 0) return SDValue();
// Finally, if this value fits in a 5 bit sext field, return it
- if (((MaskVal << (32-5)) >> (32-5)) == MaskVal)
+ if (SignExtend32<5>(MaskVal) == MaskVal)
return DAG.getTargetConstant(MaskVal, MVT::i32);
return SDValue();
}
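SignExtend32, which replaces the hand-written shift pairs here, comes from llvm/Support/MathExtras.h; a minimal re-implementation (assumed to match the real signature) shows the equivalence:

    #include <cstdint>

    // Sign-extend the low B bits of X to a full 32-bit signed value.
    template <unsigned B> int32_t SignExtend32(uint32_t X) {
      return int32_t(X << (32 - B)) >> (32 - B);
    }

    // SignExtend32<5>(0x1F) == -1 and SignExtend32<5>(0x0F) == 15, so a
    // value fits the 5-bit signed splat field iff SignExtend32<5>(V) == V.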
@@ -1204,6 +1220,14 @@
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
const Constant *C = CP->getConstVal();
+ // 64-bit SVR4 ABI code is always position-independent.
+ // The actual address of the constant-pool entry is stored in the TOC.
+ if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+ SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
+ return DAG.getNode(PPCISD::TOC_ENTRY, CP->getDebugLoc(), MVT::i64, GA,
+ DAG.getRegister(PPC::X2, MVT::i64));
+ }
+
unsigned MOHiFlag, MOLoFlag;
bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
SDValue CPIHi =
@@ -1217,6 +1241,14 @@
EVT PtrVT = Op.getValueType();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ // 64-bit SVR4 ABI code is always position-independent.
+ // The actual address of the jump table is stored in the TOC.
+ if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+ SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+ return DAG.getNode(PPCISD::TOC_ENTRY, JT->getDebugLoc(), MVT::i64, GA,
+ DAG.getRegister(PPC::X2, MVT::i64));
+ }
+
unsigned MOHiFlag, MOLoFlag;
bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
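A conceptual model of the TOC indirection these two hunks introduce (illustration only; the array name and slot index are invented): constant-pool and jump-table addresses are each reached by a single load through the TOC pointer in r2, so the code never embeds an absolute address:

    #include <cstdint>

    extern intptr_t TOC[];             // hypothetical: the module's TOC, anchored in r2
    constexpr unsigned CPoolSlot = 3;  // hypothetical slot for one constant-pool entry

    // Morally what a TOC_ENTRY node (later the LDtocCPT/LDtocJTI pseudos)
    // expands to: one load, roughly "ld rD, entry@toc(r2)".
    void *constantPoolAddress() {
      return reinterpret_cast<void *>(TOC[CPoolSlot]);
    }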
@@ -1441,7 +1473,7 @@
MachinePointerInfo(),
MVT::i32, false, false, 0);
- return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(),
+ return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(),
false, false, false, 0);
}
@@ -2408,7 +2440,7 @@
int Addr = C->getZExtValue();
if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero.
- (Addr << 6 >> 6) != Addr)
+ SignExtend32<26>(Addr) != Addr)
return 0; // Top 6 bits have to be sext of immediate.
return DAG.getConstant((int)C->getZExtValue() >> 2,
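A standalone version of the encodability check above (hypothetical name), written in the shift-on-unsigned form this patch adopts elsewhere to avoid left-shifting negative values:

    #include <cstdint>

    // An absolute branch target is encodable iff it is 4-byte aligned (low
    // 2 bits implicitly zero) and survives sign-extension from 26 bits; the
    // instruction then stores Addr >> 2.
    bool isEncodableBranchTarget(int32_t Addr) {
      int32_t Sext = int32_t(uint32_t(Addr) << 6) >> 6; // SignExtend32<26>(Addr)
      return (Addr & 3) == 0 && Sext == Addr;
    }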
@@ -2819,6 +2851,10 @@
isTailCall, RegsToPass, Ops, NodeTys,
PPCSubTarget);
+ // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
+ if (isVarArg && PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64())
+ Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
+
// When performing tail call optimization the callee pops its arguments off
// the stack. Account for this here so these bytes can be pushed back on in
// PPCRegisterInfo::eliminateCallFramePseudoInstr.
@@ -3116,14 +3152,6 @@
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
&MemOpChains[0], MemOpChains.size());
- // Set CR6 to true if this is a vararg call with floating args passed in
- // registers.
- if (isVarArg) {
- SDValue SetCR(DAG.getMachineNode(seenFloatArg ? PPC::CRSET : PPC::CRUNSET,
- dl, MVT::i32), 0);
- RegsToPass.push_back(std::make_pair(unsigned(PPC::CR1EQ), SetCR));
- }
-
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
@@ -3133,6 +3161,18 @@
InFlag = Chain.getValue(1);
}
+ // Set CR bit 6 to true if this is a vararg call with floating args passed in
+ // registers.
+ if (isVarArg) {
+ SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Chain, InFlag };
+
+ Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
+ dl, VTs, Ops, InFlag.getNode() ? 2 : 1);
+
+ InFlag = Chain.getValue(1);
+ }
+
if (isTailCall)
PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
false, TailCallArguments);
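Background for the CR6 plumbing, per the 32-bit SVR4 ABI: before a call to a variadic function the caller sets CR bit 6 when floating-point arguments are passed in FPRs and clears it otherwise, so the callee's va_arg machinery knows whether the FPR save area is live. A minimal caller exercising both cases:

    #include <cstdio>

    void demo() {
      std::printf("%f\n", 1.0); // FP vararg in an FPR: "creqv 6, 6, 6" (CR6SET)
      std::printf("%d\n", 42);  // no FP args:          "crxor 6, 6, 6" (CR6UNSET)
    }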
@@ -4126,7 +4166,7 @@
unsigned TypeShiftAmt = i & (SplatBitSize-1);
// vsplti + shl self.
- if (SextVal == (i << (int)TypeShiftAmt)) {
+ if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
@@ -4171,17 +4211,17 @@
}
// t = vsplti c, result = vsldoi t, t, 1
- if (SextVal == ((i << 8) | (i < 0 ? 0xFF : 0))) {
+ if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 2
- if (SextVal == ((i << 16) | (i < 0 ? 0xFFFF : 0))) {
+ if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 3
- if (SextVal == ((i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
+ if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl);
}
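The casts in this hunk fix undefined behavior: left-shifting a negative signed int is UB in C++, so each shift is now performed on an unsigned copy and the result reinterpreted. The well-defined form, in isolation:

    #include <cstdint>

    // Shift a possibly negative 32-bit value left without UB.
    int32_t shlDefined(int32_t I, unsigned Amt) {
      return int32_t(uint32_t(I) << Amt); // defined for Amt < 32
    }

    // shlDefined(-1, 8) == -256 (0xFFFFFF00); OR-ing in 0xFF reproduces the
    // "(i << 8) | 0xFF" splat value the code checks for negative i.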
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index b0a013b..902b188 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -174,6 +174,10 @@
/// operand #3 optional in flag
TC_RETURN,
+ /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls
+ CR6SET,
+ CR6UNSET,
+
/// STD_32 - This is the STD instruction for use with "32-bit" registers.
STD_32 = ISD::FIRST_TARGET_MEMORY_OPCODE,
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 39778a5..cfe71d17 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -29,6 +29,9 @@
let PrintMethod = "printSymbolLo";
let EncoderMethod = "getLO16Encoding";
}
+def tocentry : Operand<iPTR> {
+ let MIOperandInfo = (ops i32imm:$imm);
+}
//===----------------------------------------------------------------------===//
// 64-bit transformation functions.
@@ -296,12 +299,14 @@
let PPC970_Unit = 1 in { // FXU Operations.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
def LI8 : DForm_2_r0<14, (outs G8RC:$rD), (ins symbolLo64:$imm),
"li $rD, $imm", IntSimple,
[(set G8RC:$rD, immSExt16:$imm)]>;
def LIS8 : DForm_2_r0<15, (outs G8RC:$rD), (ins symbolHi64:$imm),
"lis $rD, $imm", IntSimple,
[(set G8RC:$rD, imm16ShiftedSExt:$imm)]>;
+}
// Logical ops.
def NAND8: XForm_6<31, 476, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
@@ -459,7 +464,7 @@
let Defs = [CARRY] in {
def SRADI : XSForm_1<31, 413, (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH),
- "sradi $rA, $rS, $SH", IntRotateD,
+ "sradi $rA, $rS, $SH", IntRotateDI,
[(set G8RC:$rA, (sra G8RC:$rS, (i32 imm:$SH)))]>, isPPC64;
}
def CNTLZD : XForm_11<31, 58, (outs G8RC:$rA), (ins G8RC:$rS),
@@ -482,7 +487,7 @@
let isCommutable = 1 in {
def RLDIMI : MDForm_1<30, 3,
(outs G8RC:$rA), (ins G8RC:$rSi, G8RC:$rS, u6imm:$SH, u6imm:$MB),
- "rldimi $rA, $rS, $SH, $MB", IntRotateD,
+ "rldimi $rA, $rS, $SH, $MB", IntRotateDI,
[]>, isPPC64, RegConstraint<"$rSi = $rA">,
NoEncode<"$rSi">;
}
@@ -494,11 +499,11 @@
[]>, isPPC64;
def RLDICL : MDForm_1<30, 0,
(outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$MB),
- "rldicl $rA, $rS, $SH, $MB", IntRotateD,
+ "rldicl $rA, $rS, $SH, $MB", IntRotateDI,
[]>, isPPC64;
def RLDICR : MDForm_1<30, 1,
(outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$ME),
- "rldicr $rA, $rS, $SH, $ME", IntRotateD,
+ "rldicr $rA, $rS, $SH, $ME", IntRotateDI,
[]>, isPPC64;
def RLWINM8 : MForm_2<21,
@@ -541,19 +546,19 @@
let mayLoad = 1 in
def LHAU8 : DForm_1a<43, (outs G8RC:$rD, ptr_rc:$ea_result), (ins symbolLo:$disp,
ptr_rc:$rA),
- "lhau $rD, $disp($rA)", LdStLoad,
+ "lhau $rD, $disp($rA)", LdStLHAU,
[]>, RegConstraint<"$rA = $ea_result">,
NoEncode<"$ea_result">;
// NO LWAU!
def LHAUX8 : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lhaux $rD, $addr", LdStLoad,
+ "lhaux $rD, $addr", LdStLHAU,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
def LWAUX : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lwaux $rD, $addr", LdStLoad,
+ "lwaux $rD, $addr", LdStLHAU,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">, isPPC64;
}
@@ -584,31 +589,31 @@
// Update forms.
let mayLoad = 1 in {
def LBZU8 : DForm_1<35, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lbzu $rD, $addr", LdStLoad,
+ "lbzu $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LHZU8 : DForm_1<41, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lhzu $rD, $addr", LdStLoad,
+ "lhzu $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LWZU8 : DForm_1<33, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lwzu $rD, $addr", LdStLoad,
+ "lwzu $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LBZUX8 : XForm_1<31, 119, (outs G8RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lbzux $rD, $addr", LdStLoad,
+ "lbzux $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
def LHZUX8 : XForm_1<31, 331, (outs G8RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lhzux $rD, $addr", LdStLoad,
+ "lhzux $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
def LWZUX8 : XForm_1<31, 55, (outs G8RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lwzux $rD, $addr", LdStLoad,
+ "lwzux $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
}
@@ -624,6 +629,14 @@
"",
[(set G8RC:$rD,
(PPCtoc_entry tglobaladdr:$disp, G8RC:$reg))]>, isPPC64;
+def LDtocJTI: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
+ "",
+ [(set G8RC:$rD,
+ (PPCtoc_entry tjumptable:$disp, G8RC:$reg))]>, isPPC64;
+def LDtocCPT: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
+ "",
+ [(set G8RC:$rD,
+ (PPCtoc_entry tconstpool:$disp, G8RC:$reg))]>, isPPC64;
let hasSideEffects = 1 in {
let RST = 2, DS_RA = 0 in // FIXME: Should be a pseudo.
@@ -642,13 +655,13 @@
let mayLoad = 1 in
def LDU : DSForm_1<58, 1, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrix:$addr),
- "ldu $rD, $addr", LdStLD,
+ "ldu $rD, $addr", LdStLDU,
[]>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
NoEncode<"$ea_result">;
def LDUX : XForm_1<31, 53, (outs G8RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "ldux $rD, $addr", LdStLoad,
+ "ldux $rD, $addr", LdStLDU,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">, isPPC64;
}
@@ -695,14 +708,14 @@
def STBU8 : DForm_1a<38, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "stbu $rS, $ptroff($ptrreg)", LdStStore,
+ "stbu $rS, $ptroff($ptrreg)", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti8 G8RC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
def STHU8 : DForm_1a<45, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "sthu $rS, $ptroff($ptrreg)", LdStStore,
+ "sthu $rS, $ptroff($ptrreg)", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti16 G8RC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
@@ -710,7 +723,7 @@
def STWU8 : DForm_1a<37, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "stwu $rS, $ptroff($ptrreg)", LdStStore,
+ "stwu $rS, $ptroff($ptrreg)", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti32 G8RC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
@@ -718,7 +731,7 @@
def STDU : DSForm_1a<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
s16immX4:$ptroff, ptr_rc:$ptrreg),
- "stdu $rS, $ptroff($ptrreg)", LdStSTD,
+ "stdu $rS, $ptroff($ptrreg)", LdStSTDU,
[(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">,
@@ -727,7 +740,7 @@
def STBUX8 : XForm_8<31, 247, (outs ptr_rc:$ea_res),
(ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "stbux $rS, $ptroff, $ptrreg", LdStStore,
+ "stbux $rS, $ptroff, $ptrreg", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti8 G8RC:$rS,
ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -736,7 +749,7 @@
def STHUX8 : XForm_8<31, 439, (outs ptr_rc:$ea_res),
(ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "sthux $rS, $ptroff, $ptrreg", LdStStore,
+ "sthux $rS, $ptroff, $ptrreg", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti16 G8RC:$rS,
ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -745,7 +758,7 @@
def STWUX8 : XForm_8<31, 183, (outs ptr_rc:$ea_res),
(ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "stwux $rS, $ptroff, $ptrreg", LdStStore,
+ "stwux $rS, $ptroff, $ptrreg", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti32 G8RC:$rS,
ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -754,7 +767,7 @@
def STDUX : XForm_8<31, 181, (outs ptr_rc:$ea_res),
(ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "stdux $rS, $ptroff, $ptrreg", LdStStore,
+ "stdux $rS, $ptroff, $ptrreg", LdStSTDU,
[(set ptr_rc:$ea_res,
(pre_store G8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 47f09dc..d2df664 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -54,7 +54,8 @@
const TargetMachine *TM,
const ScheduleDAG *DAG) const {
unsigned Directive = TM->getSubtarget<PPCSubtarget>().getDarwinDirective();
- if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2) {
+ if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 ||
+ Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) {
const InstrItineraryData *II = TM->getInstrItineraryData();
return new PPCScoreboardHazardRecognizer(II, DAG);
}
@@ -70,7 +71,8 @@
unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
// Most subtargets use a PPC970 recognizer.
- if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2) {
+ if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 &&
+ Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) {
const TargetInstrInfo *TII = TM.getInstrInfo();
assert(TII && "No InstrInfo?");
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index f57f0c9..a503908 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -123,9 +123,11 @@
def PPCload : SDNode<"PPCISD::LOAD", SDTypeProfile<1, 1, []>,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def PPCload_toc : SDNode<"PPCISD::LOAD_TOC", SDTypeProfile<0, 1, []>,
- [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+ [SDNPHasChain, SDNPSideEffect,
+ SDNPInGlue, SDNPOutGlue]>;
def PPCtoc_restore : SDNode<"PPCISD::TOC_RESTORE", SDTypeProfile<0, 0, []>,
- [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+ [SDNPHasChain, SDNPSideEffect,
+ SDNPInGlue, SDNPOutGlue]>;
def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def PPCbctrl_Darwin : SDNode<"PPCISD::BCTRL_Darwin", SDTNone,
@@ -153,6 +155,12 @@
def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
[SDNPHasChain, SDNPMayStore]>;
+// Instructions to set/unset CR bit 6 for SVR4 vararg calls
+def PPCcr6set : SDNode<"PPCISD::CR6SET", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
// Instructions to support atomic operations
def PPClarx : SDNode<"PPCISD::LARX", SDT_PPClarx,
[SDNPHasChain, SDNPMayLoad]>;
@@ -330,9 +338,6 @@
let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg);
let EncoderMethod = "getMemRIXEncoding";
}
-def tocentry : Operand<iPTR> {
- let MIOperandInfo = (ops i32imm:$imm);
-}
// PowerPC Predicate operand. 20 = (0<<5)|20 = always, CR0 is a dummy reg
// that doesn't matter.
@@ -673,7 +678,7 @@
[(set GPRC:$rD, (load iaddr:$src))]>;
def LFS : DForm_1<48, (outs F4RC:$rD), (ins memri:$src),
- "lfs $rD, $src", LdStLFDU,
+ "lfs $rD, $src", LdStLFD,
[(set F4RC:$rD, (load iaddr:$src))]>;
def LFD : DForm_1<50, (outs F8RC:$rD), (ins memri:$src),
"lfd $rD, $src", LdStLFD,
@@ -683,32 +688,32 @@
// Unindexed (r+i) Loads with Update (preinc).
let mayLoad = 1 in {
def LBZU : DForm_1<35, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lbzu $rD, $addr", LdStLoad,
+ "lbzu $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LHAU : DForm_1<43, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lhau $rD, $addr", LdStLoad,
+ "lhau $rD, $addr", LdStLHAU,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LHZU : DForm_1<41, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lhzu $rD, $addr", LdStLoad,
+ "lhzu $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LWZU : DForm_1<33, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lwzu $rD, $addr", LdStLoad,
+ "lwzu $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LFSU : DForm_1<49, (outs F4RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lfs $rD, $addr", LdStLFDU,
+ "lfsu $rD, $addr", LdStLFDU,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LFDU : DForm_1<51, (outs F8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lfd $rD, $addr", LdStLFD,
+ "lfdu $rD, $addr", LdStLFDU,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
@@ -716,37 +721,37 @@
// Indexed (r+r) Loads with Update (preinc).
def LBZUX : XForm_1<31, 119, (outs GPRC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lbzux $rD, $addr", LdStLoad,
+ "lbzux $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
def LHAUX : XForm_1<31, 375, (outs GPRC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lhaux $rD, $addr", LdStLoad,
+ "lhaux $rD, $addr", LdStLHAU,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
def LHZUX : XForm_1<31, 331, (outs GPRC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lhzux $rD, $addr", LdStLoad,
+ "lhzux $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
def LWZUX : XForm_1<31, 55, (outs GPRC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lwzux $rD, $addr", LdStLoad,
+ "lwzux $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
def LFSUX : XForm_1<31, 567, (outs F4RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lfsux $rD, $addr", LdStLoad,
+ "lfsux $rD, $addr", LdStLFDU,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
def LFDUX : XForm_1<31, 631, (outs F8RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lfdux $rD, $addr", LdStLoad,
+ "lfdux $rD, $addr", LdStLFDU,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
}
@@ -778,10 +783,10 @@
[(set GPRC:$rD, (PPClbrx xoaddr:$src, i32))]>;
def LFSX : XForm_25<31, 535, (outs F4RC:$frD), (ins memrr:$src),
- "lfsx $frD, $src", LdStLFDU,
+ "lfsx $frD, $src", LdStLFD,
[(set F4RC:$frD, (load xaddr:$src))]>;
def LFDX : XForm_25<31, 599, (outs F8RC:$frD), (ins memrr:$src),
- "lfdx $frD, $src", LdStLFDU,
+ "lfdx $frD, $src", LdStLFD,
[(set F8RC:$frD, (load xaddr:$src))]>;
}
@@ -801,10 +806,10 @@
"stw $rS, $src", LdStStore,
[(store GPRC:$rS, iaddr:$src)]>;
def STFS : DForm_1<52, (outs), (ins F4RC:$rS, memri:$dst),
- "stfs $rS, $dst", LdStUX,
+ "stfs $rS, $dst", LdStSTFD,
[(store F4RC:$rS, iaddr:$dst)]>;
def STFD : DForm_1<54, (outs), (ins F8RC:$rS, memri:$dst),
- "stfd $rS, $dst", LdStUX,
+ "stfd $rS, $dst", LdStSTFD,
[(store F8RC:$rS, iaddr:$dst)]>;
}
@@ -812,33 +817,33 @@
let PPC970_Unit = 2 in {
def STBU : DForm_1a<39, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "stbu $rS, $ptroff($ptrreg)", LdStStore,
+ "stbu $rS, $ptroff($ptrreg)", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti8 GPRC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
def STHU : DForm_1a<45, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "sthu $rS, $ptroff($ptrreg)", LdStStore,
+ "sthu $rS, $ptroff($ptrreg)", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti16 GPRC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
def STWU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "stwu $rS, $ptroff($ptrreg)", LdStStore,
+ "stwu $rS, $ptroff($ptrreg)", LdStStoreUpd,
[(set ptr_rc:$ea_res, (pre_store GPRC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
def STFSU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins F4RC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "stfsu $rS, $ptroff($ptrreg)", LdStStore,
+ "stfsu $rS, $ptroff($ptrreg)", LdStSTFDU,
[(set ptr_rc:$ea_res, (pre_store F4RC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
def STFDU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins F8RC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "stfdu $rS, $ptroff($ptrreg)", LdStStore,
+ "stfdu $rS, $ptroff($ptrreg)", LdStSTFDU,
[(set ptr_rc:$ea_res, (pre_store F8RC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
@@ -863,7 +868,7 @@
def STBUX : XForm_8<31, 247, (outs ptr_rc:$ea_res),
(ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "stbux $rS, $ptroff, $ptrreg", LdStStore,
+ "stbux $rS, $ptroff, $ptrreg", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti8 GPRC:$rS,
ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -872,7 +877,7 @@
def STHUX : XForm_8<31, 439, (outs ptr_rc:$ea_res),
(ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "sthux $rS, $ptroff, $ptrreg", LdStStore,
+ "sthux $rS, $ptroff, $ptrreg", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti16 GPRC:$rS,
ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -881,7 +886,7 @@
def STWUX : XForm_8<31, 183, (outs ptr_rc:$ea_res),
(ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "stwux $rS, $ptroff, $ptrreg", LdStStore,
+ "stwux $rS, $ptroff, $ptrreg", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_store GPRC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
@@ -889,7 +894,7 @@
def STFSUX : XForm_8<31, 695, (outs ptr_rc:$ea_res),
(ins F4RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "stfsux $rS, $ptroff, $ptrreg", LdStStore,
+ "stfsux $rS, $ptroff, $ptrreg", LdStSTFDU,
[(set ptr_rc:$ea_res,
(pre_store F4RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
@@ -897,7 +902,7 @@
def STFDUX : XForm_8<31, 759, (outs ptr_rc:$ea_res),
(ins F8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "stfdux $rS, $ptroff, $ptrreg", LdStStore,
+ "stfdux $rS, $ptroff, $ptrreg", LdStSTFDU,
[(set ptr_rc:$ea_res,
(pre_store F8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
@@ -913,14 +918,14 @@
PPC970_DGroup_Cracked;
def STFIWX: XForm_28<31, 983, (outs), (ins F8RC:$frS, memrr:$dst),
- "stfiwx $frS, $dst", LdStUX,
+ "stfiwx $frS, $dst", LdStSTFD,
[(PPCstfiwx F8RC:$frS, xoaddr:$dst)]>;
def STFSX : XForm_28<31, 663, (outs), (ins F4RC:$frS, memrr:$dst),
- "stfsx $frS, $dst", LdStUX,
+ "stfsx $frS, $dst", LdStSTFD,
[(store F4RC:$frS, xaddr:$dst)]>;
def STFDX : XForm_28<31, 727, (outs), (ins F8RC:$frS, memrr:$dst),
- "stfdx $frS, $dst", LdStUX,
+ "stfdx $frS, $dst", LdStSTFD,
[(store F8RC:$frS, xaddr:$dst)]>;
}
@@ -964,7 +969,7 @@
[(set GPRC:$rD, (subc immSExt16:$imm, GPRC:$rA))]>;
}
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
def LI : DForm_2_r0<14, (outs GPRC:$rD), (ins symbolLo:$imm),
"li $rD, $imm", IntSimple,
[(set GPRC:$rD, immSExt16:$imm)]>;
@@ -1143,6 +1148,16 @@
"crxor $dst, $dst, $dst", BrCR,
[]>;
+let Defs = [CR1EQ], CRD = 6 in {
+def CR6SET : XLForm_1_ext<19, 289, (outs), (ins),
+ "creqv 6, 6, 6", BrCR,
+ [(PPCcr6set)]>;
+
+def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins),
+ "crxor 6, 6, 6", BrCR,
+ [(PPCcr6unset)]>;
+}
+
// XFX-Form instructions. Instructions that deal with SPRs.
//
let Uses = [CTR] in {
@@ -1233,7 +1248,7 @@
PPC970_DGroup_Single, PPC970_Unit_FPU;
def FADDrtz: AForm_2<63, 21,
(outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
- "fadd $FRT, $FRA, $FRB", FPGeneral,
+ "fadd $FRT, $FRA, $FRB", FPAddSub,
[(set F8RC:$FRT, (PPCfaddrtz F8RC:$FRA, F8RC:$FRB))]>,
PPC970_DGroup_Single, PPC970_Unit_FPU;
}
@@ -1364,7 +1379,7 @@
let Uses = [RM] in {
def FADD : AForm_2<63, 21,
(outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
- "fadd $FRT, $FRA, $FRB", FPGeneral,
+ "fadd $FRT, $FRA, $FRB", FPAddSub,
[(set F8RC:$FRT, (fadd F8RC:$FRA, F8RC:$FRB))]>;
def FADDS : AForm_2<59, 21,
(outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
@@ -1388,7 +1403,7 @@
[(set F4RC:$FRT, (fmul F4RC:$FRA, F4RC:$FRB))]>;
def FSUB : AForm_2<63, 20,
(outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
- "fsub $FRT, $FRA, $FRB", FPGeneral,
+ "fsub $FRT, $FRA, $FRB", FPAddSub,
[(set F8RC:$FRT, (fsub F8RC:$FRA, F8RC:$FRB))]>;
def FSUBS : AForm_2<59, 20,
(outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
index 6a6ccb9..660c0c3 100644
--- a/lib/Target/PowerPC/PPCSchedule.td
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -40,6 +40,7 @@
def IntMulLI : InstrItinClass;
def IntRFID : InstrItinClass;
def IntRotateD : InstrItinClass;
+def IntRotateDI : InstrItinClass;
def IntRotate : InstrItinClass;
def IntShift : InstrItinClass;
def IntTrapD : InstrItinClass;
@@ -52,15 +53,18 @@
def LdStDCBF : InstrItinClass;
def LdStDCBI : InstrItinClass;
def LdStLoad : InstrItinClass;
+def LdStLoadUpd : InstrItinClass;
def LdStStore : InstrItinClass;
+def LdStStoreUpd : InstrItinClass;
def LdStDSS : InstrItinClass;
def LdStICBI : InstrItinClass;
-def LdStUX : InstrItinClass;
def LdStLD : InstrItinClass;
+def LdStLDU : InstrItinClass;
def LdStLDARX : InstrItinClass;
def LdStLFD : InstrItinClass;
def LdStLFDU : InstrItinClass;
def LdStLHA : InstrItinClass;
+def LdStLHAU : InstrItinClass;
def LdStLMW : InstrItinClass;
def LdStLVecX : InstrItinClass;
def LdStLWA : InstrItinClass;
@@ -69,6 +73,9 @@
def LdStSLBIE : InstrItinClass;
def LdStSTD : InstrItinClass;
def LdStSTDCX : InstrItinClass;
+def LdStSTDU : InstrItinClass;
+def LdStSTFD : InstrItinClass;
+def LdStSTFDU : InstrItinClass;
def LdStSTVEBX : InstrItinClass;
def LdStSTWCX : InstrItinClass;
def LdStSync : InstrItinClass;
@@ -86,6 +93,7 @@
def SprRFI : InstrItinClass;
def SprSC : InstrItinClass;
def FPGeneral : InstrItinClass;
+def FPAddSub : InstrItinClass;
def FPCompare : InstrItinClass;
def FPDivD : InstrItinClass;
def FPDivS : InstrItinClass;
@@ -110,6 +118,8 @@
include "PPCScheduleG4Plus.td"
include "PPCScheduleG5.td"
include "PPCScheduleA2.td"
+include "PPCScheduleE500mc.td"
+include "PPCScheduleE5500.td"
//===----------------------------------------------------------------------===//
// Instruction to itinerary class map - When add new opcodes to the supported
@@ -171,7 +181,7 @@
// extsh IntSimple
// extsw IntSimple
// fabs FPGeneral
-// fadd FPGeneral
+// fadd FPAddSub
// fadds FPGeneral
// fcfid FPGeneral
// fcmpo FPCompare
@@ -201,35 +211,35 @@
// fsel FPGeneral
// fsqrt FPSqrt
// fsqrts FPSqrt
-// fsub FPGeneral
+// fsub FPAddSub
// fsubs FPGeneral
// icbi LdStICBI
// isync SprISYNC
// lbz LdStLoad
-// lbzu LdStLoad
-// lbzux LdStUX
+// lbzu LdStLoadUpd
+// lbzux LdStLoadUpd
// lbzx LdStLoad
// ld LdStLD
// ldarx LdStLDARX
-// ldu LdStLD
-// ldux LdStLD
+// ldu LdStLDU
+// ldux LdStLDU
// ldx LdStLD
// lfd LdStLFD
// lfdu LdStLFDU
// lfdux LdStLFDU
-// lfdx LdStLFDU
-// lfs LdStLFDU
+// lfdx LdStLFD
+// lfs LdStLFD
// lfsu LdStLFDU
// lfsux LdStLFDU
-// lfsx LdStLFDU
+// lfsx LdStLFD
// lha LdStLHA
-// lhau LdStLHA
-// lhaux LdStLHA
+// lhau LdStLHAU
+// lhaux LdStLHAU
// lhax LdStLHA
// lhbrx LdStLoad
// lhz LdStLoad
-// lhzu LdStLoad
-// lhzux LdStUX
+// lhzu LdStLoadUpd
+// lhzux LdStLoadUpd
// lhzx LdStLoad
// lmw LdStLMW
// lswi LdStLMW
@@ -243,12 +253,12 @@
// lvxl LdStLVecX
// lwa LdStLWA
// lwarx LdStLWARX
-// lwaux LdStLHA
+// lwaux LdStLHAU
// lwax LdStLHA
// lwbrx LdStLoad
// lwz LdStLoad
-// lwzu LdStLoad
-// lwzux LdStUX
+// lwzu LdStLoadUpd
+// lwzux LdStLoadUpd
// lwzx LdStLoad
// mcrf BrMCR
// mcrfs FPGeneral
@@ -292,10 +302,10 @@
// rfid IntRFID
// rldcl IntRotateD
// rldcr IntRotateD
-// rldic IntRotateD
-// rldicl IntRotateD
-// rldicr IntRotateD
-// rldimi IntRotateD
+// rldic IntRotateDI
+// rldicl IntRotateDI
+// rldicr IntRotateDI
+// rldimi IntRotateDI
// rlwimi IntRotate
// rlwinm IntGeneral
// rlwnm IntGeneral
@@ -305,33 +315,33 @@
// sld IntRotateD
// slw IntGeneral
// srad IntRotateD
-// sradi IntRotateD
+// sradi IntRotateDI
// sraw IntShift
// srawi IntShift
// srd IntRotateD
// srw IntGeneral
// stb LdStStore
-// stbu LdStStore
-// stbux LdStStore
+// stbu LdStStoreUpd
+// stbux LdStStoreUpd
// stbx LdStStore
// std LdStSTD
// stdcx. LdStSTDCX
-// stdu LdStSTD
-// stdux LdStSTD
+// stdu LdStSTDU
+// stdux LdStSTDU
// stdx LdStSTD
-// stfd LdStUX
-// stfdu LdStUX
-// stfdux LdStUX
-// stfdx LdStUX
-// stfiwx LdStUX
-// stfs LdStUX
-// stfsu LdStUX
-// stfsux LdStUX
-// stfsx LdStUX
+// stfd LdStSTFD
+// stfdu LdStSTFDU
+// stfdux LdStSTFDU
+// stfdx LdStSTFD
+// stfiwx LdStSTFD
+// stfs LdStSTFD
+// stfsu LdStSTFDU
+// stfsux LdStSTFDU
+// stfsx LdStSTFD
// sth LdStStore
// sthbrx LdStStore
-// sthu LdStStore
-// sthux LdStStore
+// sthu LdStStoreUpd
+// sthux LdStStoreUpd
// sthx LdStStore
// stmw LdStLMW
// stswi LdStLMW
@@ -344,8 +354,8 @@
// stw LdStStore
// stwbrx LdStStore
// stwcx. LdStSTWCX
-// stwu LdStStore
-// stwux LdStStore
+// stwu LdStStoreUpd
+// stwux LdStStoreUpd
// stwx LdStStore
// subf IntGeneral
// subfc IntGeneral
diff --git a/lib/Target/PowerPC/PPCSchedule440.td b/lib/Target/PowerPC/PPCSchedule440.td
index cd0fb70..37b6eac 100644
--- a/lib/Target/PowerPC/PPCSchedule440.td
+++ b/lib/Target/PowerPC/PPCSchedule440.td
@@ -288,6 +288,15 @@
InstrStage<2, [LWB]>],
[9, 5],
[GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<1, [IFTH1, IFTH2]>,
+ InstrStage<1, [PDCD1, PDCD2]>,
+ InstrStage<1, [DISS1, DISS2]>,
+ InstrStage<1, [LRACC]>,
+ InstrStage<1, [AGEN]>,
+ InstrStage<1, [CRD]>,
+ InstrStage<2, [LWB]>],
+ [9, 5],
+ [GPR_Bypass, GPR_Bypass]>,
InstrItinData<LdStStore , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
@@ -297,6 +306,15 @@
InstrStage<2, [LWB]>],
[8, 5],
[NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<1, [IFTH1, IFTH2]>,
+ InstrStage<1, [PDCD1, PDCD2]>,
+ InstrStage<1, [DISS1, DISS2]>,
+ InstrStage<1, [LRACC]>,
+ InstrStage<1, [AGEN]>,
+ InstrStage<1, [CRD]>,
+ InstrStage<2, [LWB]>],
+ [8, 5],
+ [NoBypass, GPR_Bypass]>,
InstrItinData<LdStICBI , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
@@ -306,7 +324,7 @@
InstrStage<1, [LWB]>],
[8, 5],
[NoBypass, GPR_Bypass]>,
- InstrItinData<LdStUX , [InstrStage<1, [IFTH1, IFTH2]>,
+ InstrItinData<LdStSTFD , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
InstrStage<1, [LRACC]>,
@@ -315,6 +333,15 @@
InstrStage<1, [LWB]>],
[8, 5, 5],
[NoBypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStSTFDU , [InstrStage<1, [IFTH1, IFTH2]>,
+ InstrStage<1, [PDCD1, PDCD2]>,
+ InstrStage<1, [DISS1, DISS2]>,
+ InstrStage<1, [LRACC]>,
+ InstrStage<1, [AGEN]>,
+ InstrStage<1, [CRD]>,
+ InstrStage<1, [LWB]>],
+ [8, 5, 5],
+ [NoBypass, GPR_Bypass, GPR_Bypass]>,
InstrItinData<LdStLFD , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
@@ -342,6 +369,15 @@
InstrStage<1, [LWB]>],
[8, 5],
[NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStLHAU , [InstrStage<1, [IFTH1, IFTH2]>,
+ InstrStage<1, [PDCD1, PDCD2]>,
+ InstrStage<1, [DISS1, DISS2]>,
+ InstrStage<1, [LRACC]>,
+ InstrStage<1, [AGEN]>,
+ InstrStage<1, [CRD]>,
+ InstrStage<1, [LWB]>],
+ [8, 5],
+ [NoBypass, GPR_Bypass]>,
InstrItinData<LdStLMW , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
@@ -371,6 +407,15 @@
InstrStage<2, [LWB]>],
[8, 5],
[NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStSTDU , [InstrStage<1, [IFTH1, IFTH2]>,
+ InstrStage<1, [PDCD1, PDCD2]>,
+ InstrStage<1, [DISS1, DISS2]>,
+ InstrStage<1, [LRACC]>,
+ InstrStage<1, [AGEN]>,
+ InstrStage<1, [CRD]>,
+ InstrStage<2, [LWB]>],
+ [8, 5],
+ [NoBypass, GPR_Bypass]>,
InstrItinData<LdStSTDCX , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1]>,
@@ -537,6 +582,19 @@
InstrStage<1, [FWB]>],
[10, 4, 4],
[FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPAddSub , [InstrStage<1, [IFTH1, IFTH2]>,
+ InstrStage<1, [PDCD1, PDCD2]>,
+ InstrStage<1, [DISS1, DISS2]>,
+ InstrStage<1, [FRACC]>,
+ InstrStage<1, [FEXE1]>,
+ InstrStage<1, [FEXE2]>,
+ InstrStage<1, [FEXE3]>,
+ InstrStage<1, [FEXE4]>,
+ InstrStage<1, [FEXE5]>,
+ InstrStage<1, [FEXE6]>,
+ InstrStage<1, [FWB]>],
+ [10, 4, 4],
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
InstrItinData<FPCompare , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td
index 4d4a5d0..ba63b5c 100644
--- a/lib/Target/PowerPC/PPCScheduleA2.td
+++ b/lib/Target/PowerPC/PPCScheduleA2.td
@@ -181,6 +181,17 @@
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[10, 7, 7],
[GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntRotateDI , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [10, 7, 7],
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
InstrItinData<IntShift , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -302,7 +313,18 @@
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[14, 7],
[GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLD , [InstrStage<4,
+ InstrItinData<LdStLoadUpd , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [14, 7],
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLDU , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
IU4_4, IU4_5, IU4_6, IU4_7]>,
@@ -324,6 +346,17 @@
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[13, 7],
[GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [13, 7],
+ [GPR_Bypass, GPR_Bypass]>,
InstrItinData<LdStICBI , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -335,7 +368,7 @@
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[14, 7],
[NoBypass, GPR_Bypass]>,
- InstrItinData<LdStUX , [InstrStage<4,
+ InstrItinData<LdStSTFD , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
IU4_4, IU4_5, IU4_6, IU4_7]>,
@@ -346,6 +379,17 @@
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[14, 7, 7],
[NoBypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<LdStSTFDU , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [14, 7, 7],
+ [NoBypass, FPR_Bypass, FPR_Bypass]>,
InstrItinData<LdStLFD , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -379,6 +423,17 @@
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[14, 7],
[NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStLHAU , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [14, 7],
+ [NoBypass, GPR_Bypass]>,
InstrItinData<LdStLMW , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -412,6 +467,17 @@
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[13, 7],
[GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStSTDU , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [13, 7],
+ [GPR_Bypass, GPR_Bypass]>,
InstrItinData<LdStSTDCX , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -593,6 +659,17 @@
InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
[15, 7, 7],
[FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPAddSub , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
+ InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
+ InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
+ InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
+ [15, 7, 7],
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
InstrItinData<FPCompare , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
diff --git a/lib/Target/PowerPC/PPCScheduleE500mc.td b/lib/Target/PowerPC/PPCScheduleE500mc.td
new file mode 100644
index 0000000..9bb779a
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -0,0 +1,265 @@
+//===-- PPCScheduleE500mc.td - e500mc Scheduling Defs ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e500mc 32-bit
+// Power processor.
+//
+// All information is derived from the "e500mc Core Reference Manual",
+// Freescale Document Number E500MCRM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e500mc core:
+//
+// * Decode & Dispatch
+// Can dispatch up to 2 instructions per clock cycle to the GPR issue
+// queues (GIQx), the FP issue queue (FIQ), or the branch issue queue (BIQ).
+def DIS0 : FuncUnit; // Dispatch stage - insn 1
+def DIS1 : FuncUnit; // Dispatch stage - insn 2
+
+// * Execute
+// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
+// Some instructions can execute only in SFX0, not in SFX1.
+// The CFX has a bypass path, allowing non-divide instructions to execute
+// while a divide instruction is being executed.
+def SFX0 : FuncUnit; // Simple unit 0
+def SFX1 : FuncUnit; // Simple unit 1
+def BU : FuncUnit; // Branch unit
+def CFX_DivBypass
+ : FuncUnit; // CFX divide bypass path
+def CFX_0 : FuncUnit; // CFX pipeline
+def LSU_0 : FuncUnit; // LSU pipeline
+def FPU_0 : FuncUnit; // FPU pipeline
+
+def PPCE500mcItineraries : ProcessorItineraries<
+ [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, LSU_0, FPU_0],
+ [CR_Bypass, GPR_Bypass, FPR_Bypass], [
+ InstrItinData<IntSimple , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntGeneral , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntCompare , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [5, 1, 1], // Latency = 1 or 2
+ [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntDivW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0], 0>,
+ InstrStage<14, [CFX_DivBypass]>],
+ [17, 1, 1], // Latency = 4..35, Repeat rate = 4..35
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntMFFS , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<8, [FPU_0]>],
+ [11], // Latency = 8
+ [FPR_Bypass]>,
+ InstrItinData<IntMTFSB0 , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<8, [FPU_0]>],
+ [11, 1, 1], // Latency = 8
+ [NoBypass, NoBypass, NoBypass]>,
+ InstrItinData<IntMulHW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntMulHWU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntMulLI , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntRotate , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntShift , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntTrapW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [SFX0]>],
+ [5, 1], // Latency = 2, Repeat rate = 2
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<BrB , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [BU]>],
+ [4, 1], // Latency = 1
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<BrCR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [BU]>],
+ [4, 1, 1], // Latency = 1
+ [CR_Bypass, CR_Bypass, CR_Bypass]>,
+ InstrItinData<BrMCR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [BU]>],
+ [4, 1], // Latency = 1
+ [CR_Bypass, CR_Bypass]>,
+ InstrItinData<BrMCRX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [CR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStDCBA , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStDCBF , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStDCBI , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLoad , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStStore , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStICBI , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStSTFD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStSTFDU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStLFD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 1, 1], // Latency = 4
+ [FPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLFDU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 1, 1], // Latency = 4
+ [FPR_Bypass, GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStLHA , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLHAU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLMW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 1], // Latency = r+3
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStLWARX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<3, [LSU_0]>],
+ [6, 1, 1], // Latency = 3, Repeat rate = 3
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStSTWCX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStSync , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>]>,
+ InstrItinData<SprMFSR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [SFX0]>],
+ [7, 1],
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<SprMTMSR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [SFX0, SFX1]>],
+ [5, 1], // Latency = 2, Repeat rate = 4
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<SprMTSR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0]>],
+ [5, 1],
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<SprTLBSYNC , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0], 0>]>,
+ InstrItinData<SprMFCR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<5, [SFX0]>],
+ [8, 1],
+ [GPR_Bypass, CR_Bypass]>,
+ InstrItinData<SprMFMSR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [SFX0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<SprMFSPR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [GPR_Bypass, CR_Bypass]>,
+ InstrItinData<SprMFTB , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [SFX0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<SprMTSPR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [CR_Bypass, GPR_Bypass]>,
+ InstrItinData<SprMTSRIN , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0]>],
+ [4, 1],
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<FPGeneral , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [FPU_0]>],
+ [11, 1, 1], // Latency = 8, Repeat rate = 2
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPAddSub , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [FPU_0]>],
+ [13, 1, 1], // Latency = 10, Repeat rate = 4
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPCompare , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [FPU_0]>],
+ [11, 1, 1], // Latency = 8, Repeat rate = 2
+ [CR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPDivD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<68, [FPU_0]>],
+ [71, 1, 1], // Latency = 68, Repeat rate = 68
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPDivS , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<38, [FPU_0]>],
+ [41, 1, 1], // Latency = 38, Repeat rate = 38
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPFused , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [FPU_0]>],
+ [13, 1, 1, 1], // Latency = 10, Repeat rate = 4
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPRes , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<38, [FPU_0]>],
+ [41, 1], // Latency = 38, Repeat rate = 38
+ [FPR_Bypass, FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e500mc machine model for scheduling and other instruction cost heuristics.
+
+def PPCE500mcModel : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+ let LoadLatency = 5; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+
+ let Itineraries = PPCE500mcItineraries;
+}
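One aid for reading both new itinerary files (an inference from the numbers, not something the reference manuals state): the operand-cycle lists appear to fold a fixed front-end depth into the documented latencies, 3 cycles on the e500mc and 4 on the e5500:

    // operandCycle = documented latency + assumed front-end depth.
    constexpr unsigned operandCycle(unsigned Latency, unsigned FrontEnd) {
      return Latency + FrontEnd;
    }
    static_assert(operandCycle(3, 3) == 6, "e500mc LdStLoad: [6, 1], Latency = 3");
    static_assert(operandCycle(1, 4) == 5, "e5500 IntSimple: [5, 2, 2], Latency = 1");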
diff --git a/lib/Target/PowerPC/PPCScheduleE5500.td b/lib/Target/PowerPC/PPCScheduleE5500.td
new file mode 100644
index 0000000..d7e11ac
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -0,0 +1,309 @@
+//===-- PPCScheduleE5500.td - e5500 Scheduling Defs --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e5500 64-bit
+// Power processor.
+//
+// All information is derived from the "e5500 Core Reference Manual",
+// Freescale Document Number e5500RM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e5500 core
+// (These are the same as for the e500mc)
+//
+// * Decode & Dispatch
+// Can dispatch up to 2 instructions per clock cycle to the GPR issue
+// queues (GIQx), the FP issue queue (FIQ), or the branch issue queue (BIQ).
+// def DIS0 : FuncUnit;
+// def DIS1 : FuncUnit;
+
+// * Execute
+// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
+// The CFX has a bypass path, allowing non-divide instructions to execute
+// while a divide instruction is being executed.
+// def SFX0 : FuncUnit; // Simple unit 0
+// def SFX1 : FuncUnit; // Simple unit 1
+// def BU : FuncUnit; // Branch unit
+// def CFX_DivBypass
+// : FuncUnit; // CFX divide bypass path
+// def CFX_0 : FuncUnit; // CFX pipeline stage 0
+
+def CFX_1 : FuncUnit; // CFX pipeline stage 1
+
+// def LSU_0 : FuncUnit; // LSU pipeline
+// def FPU_0 : FuncUnit; // FPU pipeline
+
+
+def PPCE5500Itineraries : ProcessorItineraries<
+ [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, CFX_1,
+ LSU_0, FPU_0],
+ [CR_Bypass, GPR_Bypass, FPR_Bypass], [
+ InstrItinData<IntSimple , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [5, 2, 2], // Latency = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntGeneral , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [5, 2, 2], // Latency = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntCompare , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [6, 2, 2], // Latency = 1 or 2
+ [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntDivD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0], 0>,
+ InstrStage<26, [CFX_DivBypass]>],
+ [30, 2, 2], // Latency = 4..26, Repeat rate = 4..26
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntDivW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0], 0>,
+ InstrStage<16, [CFX_DivBypass]>],
+ [20, 2, 2], // Latency = 4..16, Repeat rate = 4..16
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntMFFS , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [FPU_0]>],
+ [11], // Latency = 7, Repeat rate = 1
+ [FPR_Bypass]>,
+ InstrItinData<IntMTFSB0 , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<7, [FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 7
+ [NoBypass, NoBypass, NoBypass]>,
+ InstrItinData<IntMulHD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0], 0>,
+ InstrStage<2, [CFX_1]>],
+ [9, 2, 2], // Latency = 4..7, Repeat rate = 2..4
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntMulHW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0], 0>,
+ InstrStage<1, [CFX_1]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntMulHWU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0], 0>,
+ InstrStage<1, [CFX_1]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntMulLI , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0], 0>,
+ InstrStage<2, [CFX_1]>],
+ [8, 2, 2], // Latency = 4 or 5, Repeat rate = 2
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntRotate , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [5, 2, 2], // Latency = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntRotateD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [SFX0, SFX1]>],
+ [6, 2, 2], // Latency = 2, Repeat rate = 2
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntRotateDI , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [5, 2, 2], // Latency = 1, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntShift , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [SFX0, SFX1]>],
+ [6, 2, 2], // Latency = 2, Repeat rate = 2
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntTrapW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [SFX0]>],
+ [6, 2], // Latency = 2, Repeat rate = 2
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<BrB , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [BU]>],
+ [5, 2], // Latency = 1
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<BrCR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [BU]>],
+ [5, 2, 2], // Latency = 1
+ [CR_Bypass, CR_Bypass, CR_Bypass]>,
+ InstrItinData<BrMCR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [BU]>],
+ [5, 2], // Latency = 1
+ [CR_Bypass, CR_Bypass]>,
+ InstrItinData<BrMCRX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0]>],
+ [5, 2, 2], // Latency = 1
+ [CR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStDCBA , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStDCBF , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStDCBI , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLoad , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStLD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLDARX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<3, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLDU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStStore , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStICBI , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStSTFD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStSTFDU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStLFD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [FPR_Bypass, GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStLFDU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [FPR_Bypass, GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStLHA , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLHAU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStLMW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [LSU_0]>],
+ [8, 2], // Latency = r+3, Repeat rate = r+3
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStLWARX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<3, [LSU_0]>],
+ [7, 2, 2], // Latency = 3, Repeat rate = 3
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStSTD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStSTDCX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStSTDU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStSTWCX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStSync , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>]>,
+ InstrItinData<SprMTMSR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [CFX_0]>],
+ [6, 2], // Latency = 2, Repeat rate = 4
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<SprTLBSYNC , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0], 0>]>,
+ InstrItinData<SprMFCR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<5, [CFX_0]>],
+ [9, 2], // Latency = 5, Repeat rate = 5
+ [GPR_Bypass, CR_Bypass]>,
+ InstrItinData<SprMFMSR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [SFX0]>],
+ [8, 2], // Latency = 4, Repeat rate = 4
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<SprMFSPR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0]>],
+ [5], // Latency = 1, Repeat rate = 1
+ [GPR_Bypass]>,
+ InstrItinData<SprMFTB , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [CFX_0]>],
+ [8, 2], // Latency = 4, Repeat rate = 4
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<SprMTSPR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [5], // Latency = 1, Repeat rate = 1
+ [GPR_Bypass]>,
+ InstrItinData<FPGeneral , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 1
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPAddSub , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 1
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPCompare , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 1
+ [CR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPDivD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<31, [FPU_0]>],
+ [39, 2, 2], // Latency = 35, Repeat rate = 31
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPDivS , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<16, [FPU_0]>],
+ [24, 2, 2], // Latency = 20, Repeat rate = 16
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPFused , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [FPU_0]>],
+ [11, 2, 2, 2], // Latency = 7, Repeat rate = 1
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPRes , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [FPU_0]>],
+ [12, 2], // Latency = 8, Repeat rate = 2
+ [FPR_Bypass, FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e5500 machine model for scheduling and other instruction cost heuristics.
+
+def PPCE5500Model : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+ let LoadLatency = 6; // Optimistic load latency assuming bypass.
+ // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+
+ let Itineraries = PPCE5500Itineraries;
+}
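An observation on the operand-cycle encoding above (inferred from the table itself, not stated in the patch): each def-operand cycle equals the commented latency plus a fixed offset of 4, presumably the dispatch/decode stages ahead of execute, while the trailing 2s give the cycle at which source operands are read:

    OperandCycle = Latency + 4
    e.g.  LdStLoad: 7 = 3 + 4,   SprMFCR: 9 = 5 + 4,   FPDivD: 39 = 35 + 4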
diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td
index 61e89ed..72a0a39 100644
--- a/lib/Target/PowerPC/PPCScheduleG3.td
+++ b/lib/Target/PowerPC/PPCScheduleG3.td
@@ -34,12 +34,16 @@
InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStDCBI , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLoad , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStStore , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<2, [SLU]>]>,
InstrItinData<LdStICBI , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStUX , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStSTFD , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStSTFDU , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLFD , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLFDU , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLHA , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLHAU , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLMW , [InstrStage<34, [SLU]>]>,
InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStSTWCX , [InstrStage<8, [SLU]>]>,
@@ -58,6 +62,7 @@
InstrItinData<SprRFI , [InstrStage<2, [SRU]>]>,
InstrItinData<SprSC , [InstrStage<2, [SRU]>]>,
InstrItinData<FPGeneral , [InstrStage<1, [FPU1]>]>,
+ InstrItinData<FPAddSub , [InstrStage<1, [FPU1]>]>,
InstrItinData<FPCompare , [InstrStage<1, [FPU1]>]>,
InstrItinData<FPDivD , [InstrStage<31, [FPU1]>]>,
InstrItinData<FPDivS , [InstrStage<17, [FPU1]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td
index e19ddfa..fc9120d 100644
--- a/lib/Target/PowerPC/PPCScheduleG4.td
+++ b/lib/Target/PowerPC/PPCScheduleG4.td
@@ -33,13 +33,17 @@
InstrItinData<LdStDCBF , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStDCBI , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLoad , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStStore , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<2, [SLU]>]>,
InstrItinData<LdStDSS , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStICBI , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStUX , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStSTFD , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStSTFDU , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLFD , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLFDU , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLHA , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLHAU , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLMW , [InstrStage<34, [SLU]>]>,
InstrItinData<LdStLVecX , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>,
@@ -60,6 +64,7 @@
InstrItinData<SprRFI , [InstrStage<2, [SRU]>]>,
InstrItinData<SprSC , [InstrStage<2, [SRU]>]>,
InstrItinData<FPGeneral , [InstrStage<1, [FPU1]>]>,
+ InstrItinData<FPAddSub , [InstrStage<1, [FPU1]>]>,
InstrItinData<FPCompare , [InstrStage<1, [FPU1]>]>,
InstrItinData<FPDivD , [InstrStage<31, [FPU1]>]>,
InstrItinData<FPDivS , [InstrStage<17, [FPU1]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td
index e7446cb..a4e82ce 100644
--- a/lib/Target/PowerPC/PPCScheduleG4Plus.td
+++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td
@@ -36,19 +36,24 @@
InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStDCBI , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLoad , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStStore , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<3, [SLU]>]>,
InstrItinData<LdStDSS , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStICBI , [InstrStage<3, [IU2]>]>,
- InstrItinData<LdStUX , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSTFD , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSTFDU , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLFD , [InstrStage<4, [SLU]>]>,
InstrItinData<LdStLFDU , [InstrStage<4, [SLU]>]>,
InstrItinData<LdStLHA , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStLHAU , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLMW , [InstrStage<37, [SLU]>]>,
InstrItinData<LdStLVecX , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLWA , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStSTD , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStSTDCX , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSTDU , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStSTVEBX , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStSTWCX , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStSync , [InstrStage<35, [SLU]>]>,
@@ -66,6 +71,7 @@
InstrItinData<SprRFI , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
InstrItinData<SprSC , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>,
InstrItinData<FPGeneral , [InstrStage<5, [FPU1]>]>,
+ InstrItinData<FPAddSub , [InstrStage<5, [FPU1]>]>,
InstrItinData<FPCompare , [InstrStage<5, [FPU1]>]>,
InstrItinData<FPDivD , [InstrStage<35, [FPU1]>]>,
InstrItinData<FPDivS , [InstrStage<21, [FPU1]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td
index 1371499..7c02ea099 100644
--- a/lib/Target/PowerPC/PPCScheduleG5.td
+++ b/lib/Target/PowerPC/PPCScheduleG5.td
@@ -27,6 +27,7 @@
InstrItinData<IntMulLI , [InstrStage<4, [IU1, IU2]>]>,
InstrItinData<IntRFID , [InstrStage<1, [IU2]>]>,
InstrItinData<IntRotateD , [InstrStage<2, [IU1, IU2]>]>,
+ InstrItinData<IntRotateDI , [InstrStage<2, [IU1, IU2]>]>,
InstrItinData<IntRotate , [InstrStage<4, [IU1, IU2]>]>,
InstrItinData<IntShift , [InstrStage<2, [IU1, IU2]>]>,
InstrItinData<IntTrapD , [InstrStage<1, [IU1, IU2]>]>,
@@ -37,15 +38,20 @@
InstrItinData<BrMCRX , [InstrStage<3, [BPU]>]>,
InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLoad , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStStore , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<3, [SLU]>]>,
InstrItinData<LdStDSS , [InstrStage<10, [SLU]>]>,
InstrItinData<LdStICBI , [InstrStage<40, [SLU]>]>,
- InstrItinData<LdStUX , [InstrStage<4, [SLU]>]>,
+ InstrItinData<LdStSTFD , [InstrStage<4, [SLU]>]>,
+ InstrItinData<LdStSTFDU , [InstrStage<4, [SLU]>]>,
InstrItinData<LdStLD , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStLDU , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLDARX , [InstrStage<11, [SLU]>]>,
InstrItinData<LdStLFD , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLFDU , [InstrStage<5, [SLU]>]>,
InstrItinData<LdStLHA , [InstrStage<5, [SLU]>]>,
+ InstrItinData<LdStLHAU , [InstrStage<5, [SLU]>]>,
InstrItinData<LdStLMW , [InstrStage<64, [SLU]>]>,
InstrItinData<LdStLVecX , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLWA , [InstrStage<5, [SLU]>]>,
@@ -53,6 +59,7 @@
InstrItinData<LdStSLBIA , [InstrStage<40, [SLU]>]>, // needs work
InstrItinData<LdStSLBIE , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStSTD , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSTDU , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStSTDCX , [InstrStage<11, [SLU]>]>,
InstrItinData<LdStSTVEBX , [InstrStage<5, [SLU]>]>,
InstrItinData<LdStSTWCX , [InstrStage<11, [SLU]>]>,
@@ -69,6 +76,7 @@
InstrItinData<SprMTSPR , [InstrStage<8, [IU2]>]>,
InstrItinData<SprSC , [InstrStage<1, [IU2]>]>,
InstrItinData<FPGeneral , [InstrStage<6, [FPU1, FPU2]>]>,
+ InstrItinData<FPAddSub , [InstrStage<6, [FPU1, FPU2]>]>,
InstrItinData<FPCompare , [InstrStage<8, [FPU1, FPU2]>]>,
InstrItinData<FPDivD , [InstrStage<33, [FPU1, FPU2]>]>,
InstrItinData<FPDivS , [InstrStage<33, [FPU1, FPU2]>]>,
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 0207c83..b8b1614 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -41,6 +41,8 @@
DIR_750,
DIR_970,
DIR_A2,
+ DIR_E500mc,
+ DIR_E5500,
DIR_PWR6,
DIR_PWR7,
DIR_64
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index 15541ef..e64c140 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -129,7 +129,7 @@
[SDNPHasChain, SDNPOptInGlue]>;
def flushw : SDNode<"SPISD::FLUSHW", SDTNone,
- [SDNPHasChain]>;
+ [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
def getPCX : Operand<i32> {
let PrintMethod = "printGetPCX";
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index 8e215a7..62f973e 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp
@@ -24,6 +24,16 @@
const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
{
+ "_ZdaPv",
+ "_ZdlPv",
+ "_Znaj",
+ "_ZnajRKSt9nothrow_t",
+ "_Znam",
+ "_ZnamRKSt9nothrow_t",
+ "_Znwj",
+ "_ZnwjRKSt9nothrow_t",
+ "_Znwm",
+ "_ZnwmRKSt9nothrow_t",
"__cxa_atexit",
"__cxa_guard_abort",
"__cxa_guard_acquire",
@@ -31,16 +41,29 @@
"__memcpy_chk",
"acos",
"acosf",
+ "acosh",
+ "acoshf",
+ "acoshl",
"acosl",
"asin",
"asinf",
+ "asinh",
+ "asinhf",
+ "asinhl",
"asinl",
"atan",
"atan2",
"atan2f",
"atan2l",
"atanf",
+ "atanh",
+ "atanhf",
+ "atanhl",
"atanl",
+ "calloc",
+ "cbrt",
+ "cbrtf",
+ "cbrtl",
"ceil",
"ceilf",
"ceill",
@@ -54,6 +77,9 @@
"coshl",
"cosl",
"exp",
+ "exp10",
+ "exp10f",
+ "exp10l",
"exp2",
"exp2f",
"exp2l",
@@ -74,6 +100,7 @@
"fmodl",
"fputc",
"fputs",
+ "free",
"fwrite",
"iprintf",
"log",
@@ -86,8 +113,12 @@
"log2",
"log2f",
"log2l",
+ "logb",
+ "logbf",
+ "logbl",
"logf",
"logl",
+ "malloc",
"memchr",
"memcmp",
"memcpy",
@@ -97,11 +128,14 @@
"nearbyint",
"nearbyintf",
"nearbyintl",
+ "posix_memalign",
"pow",
"powf",
"powl",
"putchar",
"puts",
+ "realloc",
+ "reallocf",
"rint",
"rintf",
"rintl",
@@ -121,10 +155,12 @@
"strcat",
"strchr",
"strcpy",
+ "strdup",
"strlen",
"strncat",
"strncmp",
"strncpy",
+ "strndup",
"strnlen",
"tan",
"tanf",
@@ -134,7 +170,8 @@
"tanl",
"trunc",
"truncf",
- "truncl"
+ "truncl",
+ "valloc"
};
/// initialize - Initialize the set of available library functions based on the
@@ -205,6 +242,21 @@
TLI.setUnavailable(LibFunc::tanhl);
// Win32 only has C89 math
+ TLI.setUnavailable(LibFunc::acosh);
+ TLI.setUnavailable(LibFunc::acoshf);
+ TLI.setUnavailable(LibFunc::acoshl);
+ TLI.setUnavailable(LibFunc::asinh);
+ TLI.setUnavailable(LibFunc::asinhf);
+ TLI.setUnavailable(LibFunc::asinhl);
+ TLI.setUnavailable(LibFunc::atanh);
+ TLI.setUnavailable(LibFunc::atanhf);
+ TLI.setUnavailable(LibFunc::atanhl);
+ TLI.setUnavailable(LibFunc::cbrt);
+ TLI.setUnavailable(LibFunc::cbrtf);
+ TLI.setUnavailable(LibFunc::cbrtl);
+ TLI.setUnavailable(LibFunc::exp10);
+ TLI.setUnavailable(LibFunc::exp10f);
+ TLI.setUnavailable(LibFunc::exp10l);
TLI.setUnavailable(LibFunc::exp2);
TLI.setUnavailable(LibFunc::exp2f);
TLI.setUnavailable(LibFunc::exp2l);
@@ -217,6 +269,9 @@
TLI.setUnavailable(LibFunc::log1p);
TLI.setUnavailable(LibFunc::log1pf);
TLI.setUnavailable(LibFunc::log1pl);
+ TLI.setUnavailable(LibFunc::logb);
+ TLI.setUnavailable(LibFunc::logbf);
+ TLI.setUnavailable(LibFunc::logbl);
TLI.setUnavailable(LibFunc::nearbyint);
TLI.setUnavailable(LibFunc::nearbyintf);
TLI.setUnavailable(LibFunc::nearbyintl);
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 73a0095..c89e738 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -67,12 +67,19 @@
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out);
- bool MatchInstruction(SMLoc IDLoc,
+ bool MatchInstruction(SMLoc IDLoc, unsigned &Kind,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
SmallVectorImpl<MCInst> &MCInsts,
unsigned &OrigErrorInfo,
bool matchingInlineAsm = false);
+ unsigned getMCInstOperandNum(unsigned Kind, MCInst &Inst,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ unsigned OperandNum, unsigned &NumMCOperands) {
+ return getMCInstOperandNumImpl(Kind, Inst, Operands, OperandNum,
+ NumMCOperands);
+ }
+
/// isSrcOp - Returns true if operand is either (%rsi) or %ds:(%rsi)
/// in 64bit mode or (%esi) or %es:(%esi) in 32bit mode.
bool isSrcOp(X86Operand &Op);
@@ -514,12 +521,13 @@
bool X86AsmParser::ParseRegister(unsigned &RegNo,
SMLoc &StartLoc, SMLoc &EndLoc) {
RegNo = 0;
- if (!isParsingIntelSyntax()) {
- const AsmToken &TokPercent = Parser.getTok();
- assert(TokPercent.is(AsmToken::Percent) && "Invalid token kind!");
- StartLoc = TokPercent.getLoc();
+ const AsmToken &PercentTok = Parser.getTok();
+ StartLoc = PercentTok.getLoc();
+
+ // If we encounter a %, ignore it. This code handles registers with and
+ // without the prefix; unprefixed registers can occur in cfi directives.
+ if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent))
Parser.Lex(); // Eat percent token.
- }
const AsmToken &Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Identifier)) {
@@ -1516,9 +1524,12 @@
MatchAndEmitInstruction(SMLoc IDLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out) {
- SmallVector<MCInst, 2> Insts;
+ unsigned Kind;
unsigned ErrorInfo;
- bool Error = MatchInstruction(IDLoc, Operands, Insts, ErrorInfo);
+ SmallVector<MCInst, 2> Insts;
+
+ bool Error = MatchInstruction(IDLoc, Kind, Operands, Insts,
+ ErrorInfo);
if (!Error)
for (unsigned i = 0, e = Insts.size(); i != e; ++i)
Out.EmitInstruction(Insts[i]);
@@ -1526,7 +1537,7 @@
}
bool X86AsmParser::
-MatchInstruction(SMLoc IDLoc,
+MatchInstruction(SMLoc IDLoc, unsigned &Kind,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
SmallVectorImpl<MCInst> &MCInsts, unsigned &OrigErrorInfo,
bool matchingInlineAsm) {
@@ -1537,7 +1548,7 @@
// First, handle aliases that expand to multiple instructions.
// FIXME: This should be replaced with a real .td file alias mechanism.
- // Also, MatchInstructionImpl should do actually *do* the EmitInstruction
+ // Also, MatchInstructionImpl should actually *do* the EmitInstruction
// call.
if (Op->getToken() == "fstsw" || Op->getToken() == "fstcw" ||
Op->getToken() == "fstsww" || Op->getToken() == "fstcww" ||
@@ -1568,7 +1579,7 @@
MCInst Inst;
// First, try a direct match.
- switch (MatchInstructionImpl(Operands, Inst, OrigErrorInfo,
+ switch (MatchInstructionImpl(Operands, Kind, Inst, OrigErrorInfo,
isParsingIntelSyntax())) {
default: break;
case Match_Success:
@@ -1585,9 +1596,6 @@
Error(IDLoc, "instruction requires a CPU feature not currently enabled",
EmptyRanges, matchingInlineAsm);
return true;
- case Match_ConversionFail:
- return Error(IDLoc, "unable to convert operands to instruction",
- EmptyRanges, matchingInlineAsm);
case Match_InvalidOperand:
WasOriginallyInvalidOperand = true;
break;
@@ -1619,14 +1627,19 @@
Tmp[Base.size()] = Suffixes[0];
unsigned ErrorInfoIgnore;
unsigned Match1, Match2, Match3, Match4;
+ unsigned tKind;
- Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+ Match1 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore);
+ if (Match1 == Match_Success) Kind = tKind;
Tmp[Base.size()] = Suffixes[1];
- Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+ Match2 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore);
+ if (Match2 == Match_Success) Kind = tKind;
Tmp[Base.size()] = Suffixes[2];
- Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+ Match3 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore);
+ if (Match3 == Match_Success) Kind = tKind;
Tmp[Base.size()] = Suffixes[3];
- Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+ Match4 = MatchInstructionImpl(Operands, tKind, Inst, ErrorInfoIgnore);
+ if (Match4 == Match_Success) Kind = tKind;
// Restore the old token.
Op->setTokenValue(Base);
@@ -1677,8 +1690,10 @@
if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) &&
(Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) {
if (!WasOriginallyInvalidOperand) {
+ ArrayRef<SMRange> Ranges = matchingInlineAsm ? EmptyRanges :
+ Op->getLocRange();
return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
- Op->getLocRange(), matchingInlineAsm);
+ Ranges, matchingInlineAsm);
}
// Recover location info for the operand if we know which was the problem.
@@ -1730,7 +1745,10 @@
return ParseDirectiveWord(2, DirectiveID.getLoc());
else if (IDVal.startswith(".code"))
return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
- else if (IDVal.startswith(".intel_syntax")) {
+ else if (IDVal.startswith(".att_syntax")) {
+ getParser().setAssemblerDialect(0);
+ return false;
+ } else if (IDVal.startswith(".intel_syntax")) {
getParser().setAssemblerDialect(1);
if (getLexer().isNot(AsmToken::EndOfStatement)) {
if(Parser.getTok().getString() == "noprefix") {
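To see why the new ".att_syntax" branch matters, here is a minimal sketch (GCC-style inline asm; a hypothetical function) that switches the integrated assembler into Intel syntax and back, which only parses once the directive is recognized:

    int one(void) {
      int r;
      __asm__(".intel_syntax noprefix\n\t"
              "mov eax, 1\n\t"
              ".att_syntax prefix\n\t"
              "movl %%eax, %0"
              : "=r"(r) : : "eax");
      return r;
    }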
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 5039887..f136927 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -44,7 +44,7 @@
dbgs() << file << ":" << line << ": " << s;
}
-const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii) {
+const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii) {
const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii);
return MII->getName(Opcode);
}
@@ -95,8 +95,8 @@
/// be a pointer to a MemoryObject.
/// @param byte - A pointer to the byte to be read.
/// @param address - The address to be read.
-static int regionReader(void* arg, uint8_t* byte, uint64_t address) {
- MemoryObject* region = static_cast<MemoryObject*>(arg);
+static int regionReader(const void* arg, uint8_t* byte, uint64_t address) {
+ const MemoryObject* region = static_cast<const MemoryObject*>(arg);
return region->readByte(address, byte);
}
@@ -135,10 +135,10 @@
int ret = decodeInstruction(&internalInstr,
regionReader,
- (void*)&region,
+ (const void*)&region,
loggerFn,
(void*)&vStream,
- (void*)MII,
+ (const void*)MII,
address,
fMode);
@@ -379,6 +379,8 @@
}
switch (type) {
+ case TYPE_XMM32:
+ case TYPE_XMM64:
case TYPE_XMM128:
mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4)));
return;
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
index 0c92912..af444d1 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -200,7 +200,7 @@
insn->readerCursor + offset); \
if (ret) \
return ret; \
- combined = combined | ((type)byte << ((type)offset * 8)); \
+ combined = combined | ((uint64_t)byte << (offset * 8)); \
} \
*ptr = combined; \
insn->readerCursor += sizeof(type); \
@@ -719,7 +719,7 @@
* @return - 0 if the ModR/M could be read when needed or was not needed;
* nonzero otherwise.
*/
-static int getID(struct InternalInstruction* insn, void *miiArg) {
+static int getID(struct InternalInstruction* insn, const void *miiArg) {
uint8_t attrMask;
uint16_t instructionID;
@@ -1621,10 +1621,10 @@
*/
int decodeInstruction(struct InternalInstruction* insn,
byteReader_t reader,
- void* readerArg,
+ const void* readerArg,
dlog_t logger,
void* loggerArg,
- void* miiArg,
+ const void* miiArg,
uint64_t startLoc,
DisassemblerMode mode) {
memset(insn, 0, sizeof(struct InternalInstruction));
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 797703f..05cbb4c 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -403,7 +403,7 @@
* be read from.
* @return - -1 if the byte cannot be read for any reason; 0 otherwise.
*/
-typedef int (*byteReader_t)(void* arg, uint8_t* byte, uint64_t address);
+typedef int (*byteReader_t)(const void* arg, uint8_t* byte, uint64_t address);
/*
* dlog_t - Type for the logging function that the consumer can provide to
@@ -422,7 +422,7 @@
/* Reader interface (C) */
byteReader_t reader;
/* Opaque value passed to the reader */
- void* readerArg;
+ const void* readerArg;
/* The address of the next byte to read via the reader */
uint64_t readerCursor;
@@ -561,10 +561,10 @@
*/
int decodeInstruction(struct InternalInstruction* insn,
byteReader_t reader,
- void* readerArg,
+ const void* readerArg,
dlog_t logger,
void* loggerArg,
- void* miiArg,
+ const void* miiArg,
uint64_t startLoc,
DisassemblerMode mode);
@@ -579,7 +579,7 @@
unsigned line,
const char *s);
-const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii);
+const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii);
#ifdef __cplusplus
}
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index 624e56f..4011035 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -941,3 +941,15 @@
cost of reduced accuracy.
//===---------------------------------------------------------------------===//
+
+This function should be matched to haddpd when the appropriate CPU is enabled:
+
+#include <x86intrin.h>
+double f (__m128d p) {
+ return p[0] + p[1];
+}
+
+Similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
+turn into hsubpd as well.
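For concreteness, a sketch of the scalar hsubpd case in the same style as the example above (untested):

    #include <x86intrin.h>
    double g (__m128d v) {
      return v[0] - v[1];
    }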
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 18e6b7c..d078a7b 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -120,6 +120,9 @@
"Support BMI2 instructions">;
def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
"Use LEA for adjusting the stack pointer">;
+def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
+ "HasSlowDivide", "true",
+ "Use small divide for positive values less than 256">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
@@ -160,7 +163,8 @@
def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
def : AtomProc<"atom", [ProcIntelAtom, FeatureSSE3, FeatureCMPXCHG16B,
- FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP]>;
+ FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
+ FeatureSlowDivide]>;
// "Arrandale" along with corei3 and corei5
def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B,
FeatureSlowBTMem, FeatureFastUAMem,
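A rough illustration of what the "idiv-to-divb" bypass does at runtime (a sketch, not the compiler's actual output; the function name is made up): the 32-bit divide is guarded by a cheap range check so operands below 256 take the much faster 8-bit divide.

    unsigned div_bypass(unsigned a, unsigned b) {
      if (((a | b) >> 8) == 0)                      /* both fit in 8 bits */
        return (unsigned char)a / (unsigned char)b; /* 8-bit divide */
      return a / b;                                 /* full 32-bit divide */
    }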
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index db71e27..a4785c9 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -233,12 +233,14 @@
void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
- raw_ostream &O, const char *Modifier) {
+ raw_ostream &O, const char *Modifier,
+ unsigned AsmVariant) {
const MachineOperand &MO = MI->getOperand(OpNo);
switch (MO.getType()) {
default: llvm_unreachable("unknown operand type!");
case MachineOperand::MO_Register: {
- O << '%';
+ // FIXME: Enumerate AsmVariant so we can remove this magic number.
+ if (AsmVariant == 0) O << '%';
unsigned Reg = MO.getReg();
if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
EVT VT = (strcmp(Modifier+6,"64") == 0) ?
@@ -471,7 +473,7 @@
}
}
- printOperand(MI, OpNo, O);
+ printOperand(MI, OpNo, O, /*Modifier*/ 0, AsmVariant);
return false;
}
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 35386cd..0062387 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -50,7 +50,7 @@
// These methods are used by the tablegen'erated instruction printer.
void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
- const char *Modifier = 0);
+ const char *Modifier = 0, unsigned AsmVariant = 0);
void print_pcrel_imm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index d7050495..e202321 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -26,7 +26,6 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Function.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCExpr.h"
@@ -134,8 +133,7 @@
IsPIC = TM.getRelocationModel() == Reloc::PIC_;
do {
- DEBUG(dbgs() << "JITTing function '"
- << MF.getFunction()->getName() << "'\n");
+ DEBUG(dbgs() << "JITTing function '" << MF.getName() << "'\n");
MCE.startFunction(MF);
for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
MBB != E; ++MBB) {
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index e5952aa..54704d8 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -2014,13 +2014,17 @@
unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
MVT VT;
if (!isTypeLegal(C->getType(), VT))
- return false;
+ return 0;
+
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return 0;
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
const TargetRegisterClass *RC = NULL;
switch (VT.SimpleTy) {
- default: return false;
+ default: return 0;
case MVT::i8:
Opc = X86::MOV8rm;
RC = &X86::GR8RegClass;
@@ -2058,7 +2062,7 @@
break;
case MVT::f80:
// No f80 support yet.
- return false;
+ return 0;
}
// Materialize addresses with LEA instructions.
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 955c75a..9d5de81 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -171,6 +171,7 @@
// Shuffle live registers to match the expectations of successor blocks.
void finishBlockStack();
+#ifndef NDEBUG
void dumpStack() const {
dbgs() << "Stack contents:";
for (unsigned i = 0; i != StackTop; ++i) {
@@ -181,6 +182,7 @@
dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]);
dbgs() << "\n";
}
+#endif
/// getSlot - Return the stack slot number a particular register number is
/// in.
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 27195b4..5fdc61e 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -100,6 +100,7 @@
Base_Reg = Reg;
}
+#ifndef NDEBUG
void dump() {
dbgs() << "X86ISelAddressMode " << this << '\n';
dbgs() << "Base_Reg ";
@@ -133,6 +134,7 @@
dbgs() << "nul";
dbgs() << " JT" << JT << " Align" << Align << '\n';
}
+#endif
};
}
@@ -1011,7 +1013,7 @@
AM.IndexReg = ShVal.getNode()->getOperand(0);
ConstantSDNode *AddVal =
cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
- uint64_t Disp = AddVal->getSExtValue() << Val;
+ uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
if (!FoldOffsetIntoAddress(Disp, AM))
return false;
}
@@ -2116,7 +2118,8 @@
// Make sure that we don't change the operation by removing bits.
// This only matters for OR and XOR, AND is unaffected.
- if (Opcode != ISD::AND && ((Val >> ShlVal) << ShlVal) != Val)
+ uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
+ if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
break;
unsigned ShlOp, Op;
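A standalone sanity check that the rewritten guard is equivalent to the old shift-round-trip test (a sketch; compiles and runs as-is):

    #include <assert.h>
    #include <stdint.h>
    int main(void) {
      uint64_t Val = 0x101;
      unsigned ShlVal = 4;
      uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
      /* Both tests flag exactly the values whose low ShlVal bits are set. */
      assert((((Val >> ShlVal) << ShlVal) != Val) ==
             ((Val & RemovedBitsMask) != 0));
      return 0;
    }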
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7954170..5c525ae 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -85,7 +85,7 @@
unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
* ElemsPerChunk);
- SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+ SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
VecIdx);
@@ -118,7 +118,7 @@
unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
* ElemsPerChunk);
- SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+ SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
VecIdx);
}
@@ -182,6 +182,10 @@
setSchedulingPreference(Sched::RegPressure);
setStackPointerRegisterToSaveRestore(X86StackPtr);
+ // Bypass i32 with i8 on Atom when compiling with O2
+ if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default)
+    addBypassSlowDivType(Type::getInt32Ty(getGlobalContext()),
+                         Type::getInt8Ty(getGlobalContext()));
+
if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
// Setup Windows compiler runtime calls.
setLibcallName(RTLIB::SDIV_I64, "_alldiv");
@@ -735,6 +739,7 @@
setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FFLOOR, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
@@ -824,6 +829,7 @@
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f32, Custom);
setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
@@ -857,6 +863,7 @@
setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
@@ -925,6 +932,8 @@
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
}
if (Subtarget->hasSSE41()) {
@@ -939,6 +948,9 @@
setOperationAction(ISD::FRINT, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
+
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
@@ -1016,19 +1028,25 @@
setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v8f32, Custom);
setOperationAction(ISD::FADD, MVT::v4f64, Legal);
setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal);
+
setOperationAction(ISD::SRL, MVT::v16i16, Custom);
setOperationAction(ISD::SRL, MVT::v32i8, Custom);
@@ -1052,7 +1070,7 @@
setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
- if (Subtarget->hasFMA()) {
+ if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
setOperationAction(ISD::FMA, MVT::v8f32, Custom);
setOperationAction(ISD::FMA, MVT::v4f64, Custom);
setOperationAction(ISD::FMA, MVT::v4f32, Custom);
@@ -2832,7 +2850,7 @@
MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const X86InstrInfo *TII =
- ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
+ ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
@@ -3506,25 +3524,26 @@
if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
MatchOddMask = false;
}
- static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1};
- static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1};
- const int *CompactionMask;
- if (MatchEvenMask)
- CompactionMask = CompactionMaskEven;
- else if (MatchOddMask)
- CompactionMask = CompactionMaskOdd;
- else
+ if (!MatchEvenMask && !MatchOddMask)
return SDValue();
-
+
SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
- SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0),
- UndefNode, CompactionMask);
- SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1),
- UndefNode, CompactionMask);
- static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13};
- return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask);
+ SDValue Op0 = SVOp->getOperand(0);
+ SDValue Op1 = SVOp->getOperand(1);
+
+ if (MatchEvenMask) {
+ // Shift the second operand right to 32 bits.
+ static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
+ Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
+ } else {
+ // Shift the first operand left to 32 bits.
+ static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
+ Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
+ }
+ static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
+ return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
}
/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
@@ -4977,6 +4996,18 @@
LDBase->getAlignment(),
false/*isVolatile*/, true/*ReadMem*/,
false/*WriteMem*/);
+
+ // Make sure the newly-created LOAD is in the same position as LDBase in
+ // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
+ // update uses of LDBase's output chain to use the TokenFactor.
+ if (LDBase->hasAnyUseOfValue(1)) {
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+ SDValue(ResNode.getNode(), 1));
+ }
+
return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
}
return SDValue();
@@ -5881,8 +5912,6 @@
DebugLoc dl = SVOp->getDebugLoc();
ArrayRef<int> MaskVals = SVOp->getMask();
- bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
-
// If we have SSSE3, case 1 is generated when all result bytes come from
// one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
// present, fall back to case 3.
@@ -5906,7 +5935,11 @@
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
- if (V2IsUndef)
+
+ // As PSHUFB will zero elements with negative indices, it's safe to ignore
+ // the 2nd operand if it's undefined or zero.
+ if (V2.getOpcode() == ISD::UNDEF ||
+ ISD::isBuildVectorAllZeros(V2.getNode()))
return V1;
// Calculate the shuffle mask for the second input, shuffle it, and
@@ -5992,6 +6025,51 @@
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
}
+// v32i8 shuffles - Translate to VPSHUFB if possible.
+static
+SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
+ SelectionDAG &DAG,
+ const X86TargetLowering &TLI) {
+ EVT VT = SVOp->getValueType(0);
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
+
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+ bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ // VPSHUFB may be generated if:
+ // (1) one of the input vectors is undefined or a zeroinitializer; the
+ //     mask value 0x80 puts 0 in the corresponding slot of the vector.
+ // (2) the mask indices do not cross the 128-bit lane.
+ if (VT != MVT::v32i8 || !TLI.getSubtarget()->hasAVX2() ||
+ (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
+ return SDValue();
+
+ if (V1IsAllZero && !V2IsAllZero) {
+ CommuteVectorShuffleMask(MaskVals, 32);
+ V1 = V2;
+ }
+ SmallVector<SDValue, 32> pshufbMask;
+ for (unsigned i = 0; i != 32; i++) {
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 0 || EltIdx >= 32)
+ EltIdx = 0x80;
+ else {
+ if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
+ // Cross-lane moves are not allowed.
+ return SDValue();
+ EltIdx &= 0xf;
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+ }
+ return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v32i8, &pshufbMask[0], 32));
+}
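A source-level sketch (clang vector extensions; a hypothetical function) of a shuffle the new lowering accepts: only one input is used and no index selects across the 128-bit lane boundary, so a single VPSHUFB suffices under AVX2.

    typedef char v32i8 __attribute__((vector_size(32)));
    v32i8 reverse_within_lanes(v32i8 a) {
      /* Reverse each 16-byte lane independently; no cross-lane indices. */
      return __builtin_shufflevector(a, a,
          15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
          31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16);
    }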
+
/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements point to elements in
@@ -6818,6 +6896,12 @@
return NewOp;
}
+ if (VT == MVT::v32i8) {
+ SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, DAG, *this);
+ if (NewOp.getNode())
+ return NewOp;
+ }
+
// Handle all 128-bit wide vectors with 4 elements, and match them with
// several different shuffle types.
if (NumElems == 4 && VT.is128BitVector())
@@ -8115,26 +8199,35 @@
return FIST;
}
-SDValue X86TargetLowering::LowerFABS(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
LLVMContext *Context = DAG.getContext();
DebugLoc dl = Op.getDebugLoc();
EVT VT = Op.getValueType();
EVT EltVT = VT;
- if (VT.isVector())
+ unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+ if (VT.isVector()) {
EltVT = VT.getVectorElementType();
- Constant *C;
- if (EltVT == MVT::f64) {
- C = ConstantVector::getSplat(2,
- ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
- } else {
- C = ConstantVector::getSplat(4,
- ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
+ NumElts = VT.getVectorNumElements();
}
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ Constant *C;
+ if (EltVT == MVT::f64)
+ C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
+ else
+ C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
+ C = ConstantVector::getSplat(NumElts, C);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
+ false, false, false, Alignment);
+ if (VT.isVector()) {
+ MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getNode(ISD::AND, dl, ANDVT,
+ DAG.getNode(ISD::BITCAST, dl, ANDVT,
+ Op.getOperand(0)),
+ DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
+ }
return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
}
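For background on the mask constant (a sketch of the scalar identity, not the lowering itself): fabs just clears the sign bit, which the splatted ~(1ULL << 63) constant-pool vector implements lane-wise via AND.

    #include <stdint.h>
    double fabs_bits(double x) {
      union { double d; uint64_t u; } v = { x };
      v.u &= ~(1ULL << 63);   /* clear the sign bit */
      return v.d;
    }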
@@ -8154,10 +8247,11 @@
else
C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
C = ConstantVector::getSplat(NumElts, C);
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
+ false, false, false, Alignment);
if (VT.isVector()) {
MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
return DAG.getNode(ISD::BITCAST, dl, VT,
@@ -9943,62 +10037,6 @@
Op.getOperand(1), Op.getOperand(2), DAG);
}
- // Fix vector shift instructions where the last operand is a non-immediate
- // i32 value.
- case Intrinsic::x86_mmx_pslli_w:
- case Intrinsic::x86_mmx_pslli_d:
- case Intrinsic::x86_mmx_pslli_q:
- case Intrinsic::x86_mmx_psrli_w:
- case Intrinsic::x86_mmx_psrli_d:
- case Intrinsic::x86_mmx_psrli_q:
- case Intrinsic::x86_mmx_psrai_w:
- case Intrinsic::x86_mmx_psrai_d: {
- SDValue ShAmt = Op.getOperand(2);
- if (isa<ConstantSDNode>(ShAmt))
- return SDValue();
-
- unsigned NewIntNo;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_mmx_pslli_w:
- NewIntNo = Intrinsic::x86_mmx_psll_w;
- break;
- case Intrinsic::x86_mmx_pslli_d:
- NewIntNo = Intrinsic::x86_mmx_psll_d;
- break;
- case Intrinsic::x86_mmx_pslli_q:
- NewIntNo = Intrinsic::x86_mmx_psll_q;
- break;
- case Intrinsic::x86_mmx_psrli_w:
- NewIntNo = Intrinsic::x86_mmx_psrl_w;
- break;
- case Intrinsic::x86_mmx_psrli_d:
- NewIntNo = Intrinsic::x86_mmx_psrl_d;
- break;
- case Intrinsic::x86_mmx_psrli_q:
- NewIntNo = Intrinsic::x86_mmx_psrl_q;
- break;
- case Intrinsic::x86_mmx_psrai_w:
- NewIntNo = Intrinsic::x86_mmx_psra_w;
- break;
- case Intrinsic::x86_mmx_psrai_d:
- NewIntNo = Intrinsic::x86_mmx_psra_d;
- break;
- }
-
- // The vector shift intrinsics with scalars uses 32b shift amounts but
- // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
- // to be zero.
- ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt,
- DAG.getConstant(0, MVT::i32));
-// FIXME this must be lowered to get rid of the invalid type.
-
- EVT VT = Op.getValueType();
- ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(NewIntNo, MVT::i32),
- Op.getOperand(1), ShAmt);
- }
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
case Intrinsic::x86_sse42_pcmpistric128:
@@ -10077,6 +10115,74 @@
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
}
+ case Intrinsic::x86_fma_vfmadd_ps:
+ case Intrinsic::x86_fma_vfmadd_pd:
+ case Intrinsic::x86_fma_vfmsub_ps:
+ case Intrinsic::x86_fma_vfmsub_pd:
+ case Intrinsic::x86_fma_vfnmadd_ps:
+ case Intrinsic::x86_fma_vfnmadd_pd:
+ case Intrinsic::x86_fma_vfnmsub_ps:
+ case Intrinsic::x86_fma_vfnmsub_pd:
+ case Intrinsic::x86_fma_vfmaddsub_ps:
+ case Intrinsic::x86_fma_vfmaddsub_pd:
+ case Intrinsic::x86_fma_vfmsubadd_ps:
+ case Intrinsic::x86_fma_vfmsubadd_pd:
+ case Intrinsic::x86_fma_vfmadd_ps_256:
+ case Intrinsic::x86_fma_vfmadd_pd_256:
+ case Intrinsic::x86_fma_vfmsub_ps_256:
+ case Intrinsic::x86_fma_vfmsub_pd_256:
+ case Intrinsic::x86_fma_vfnmadd_ps_256:
+ case Intrinsic::x86_fma_vfnmadd_pd_256:
+ case Intrinsic::x86_fma_vfnmsub_ps_256:
+ case Intrinsic::x86_fma_vfnmsub_pd_256:
+ case Intrinsic::x86_fma_vfmaddsub_ps_256:
+ case Intrinsic::x86_fma_vfmaddsub_pd_256:
+ case Intrinsic::x86_fma_vfmsubadd_ps_256:
+ case Intrinsic::x86_fma_vfmsubadd_pd_256: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_fma_vfmadd_ps:
+ case Intrinsic::x86_fma_vfmadd_pd:
+ case Intrinsic::x86_fma_vfmadd_ps_256:
+ case Intrinsic::x86_fma_vfmadd_pd_256:
+ Opc = X86ISD::FMADD;
+ break;
+ case Intrinsic::x86_fma_vfmsub_ps:
+ case Intrinsic::x86_fma_vfmsub_pd:
+ case Intrinsic::x86_fma_vfmsub_ps_256:
+ case Intrinsic::x86_fma_vfmsub_pd_256:
+ Opc = X86ISD::FMSUB;
+ break;
+ case Intrinsic::x86_fma_vfnmadd_ps:
+ case Intrinsic::x86_fma_vfnmadd_pd:
+ case Intrinsic::x86_fma_vfnmadd_ps_256:
+ case Intrinsic::x86_fma_vfnmadd_pd_256:
+ Opc = X86ISD::FNMADD;
+ break;
+ case Intrinsic::x86_fma_vfnmsub_ps:
+ case Intrinsic::x86_fma_vfnmsub_pd:
+ case Intrinsic::x86_fma_vfnmsub_ps_256:
+ case Intrinsic::x86_fma_vfnmsub_pd_256:
+ Opc = X86ISD::FNMSUB;
+ break;
+ case Intrinsic::x86_fma_vfmaddsub_ps:
+ case Intrinsic::x86_fma_vfmaddsub_pd:
+ case Intrinsic::x86_fma_vfmaddsub_ps_256:
+ case Intrinsic::x86_fma_vfmaddsub_pd_256:
+ Opc = X86ISD::FMADDSUB;
+ break;
+ case Intrinsic::x86_fma_vfmsubadd_ps:
+ case Intrinsic::x86_fma_vfmsubadd_pd:
+ case Intrinsic::x86_fma_vfmsubadd_ps_256:
+ case Intrinsic::x86_fma_vfmsubadd_pd_256:
+ Opc = X86ISD::FMSUBADD;
+ break;
+ }
+
+ return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
+ }
}
}
@@ -10918,7 +11024,7 @@
LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);;
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
}
// fall through
case MVT::v4i32:
@@ -14020,7 +14126,7 @@
//
// where Op could be BRCOND or CMOV.
//
-static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
// Quit if not CMP and SUB with its value result used.
if (Cmp.getOpcode() != X86ISD::CMP &&
(Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
@@ -14056,40 +14162,133 @@
if (SetCC.getOpcode() == ISD::ZERO_EXTEND)
SetCC = SetCC.getOperand(0);
- // Quit if not SETCC.
- // FIXME: So far we only handle the boolean value generated from SETCC. If
- // there is other ways to generate boolean values, we need handle them here
- // as well.
- if (SetCC.getOpcode() != X86ISD::SETCC)
- return SDValue();
+ switch (SetCC.getOpcode()) {
+ case X86ISD::SETCC:
+ // Set the condition code or opposite one if necessary.
+ CC = X86::CondCode(SetCC.getConstantOperandVal(0));
+ if (needOppositeCond)
+ CC = X86::GetOppositeBranchCondition(CC);
+ return SetCC.getOperand(1);
+ case X86ISD::CMOV: {
+ // Check whether false/true value has canonical one, i.e. 0 or 1.
+ ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
+ ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
+ // Quit if true value is not a constant.
+ if (!TVal)
+ return SDValue();
+ // Quit if false value is not a constant.
+ if (!FVal) {
+ // A special case for rdrand, where 0 is set if false cond is found.
+ SDValue Op = SetCC.getOperand(0);
+ if (Op.getOpcode() != X86ISD::RDRAND)
+ return SDValue();
+ }
+ // Quit if false value is not the constant 0 or 1.
+ bool FValIsFalse = true;
+ if (FVal && FVal->getZExtValue() != 0) {
+ if (FVal->getZExtValue() != 1)
+ return SDValue();
+ // If FVal is 1, opposite cond is needed.
+ needOppositeCond = !needOppositeCond;
+ FValIsFalse = false;
+ }
+ // Quit if TVal is not the constant opposite of FVal.
+ if (FValIsFalse && TVal->getZExtValue() != 1)
+ return SDValue();
+ if (!FValIsFalse && TVal->getZExtValue() != 0)
+ return SDValue();
+ CC = X86::CondCode(SetCC.getConstantOperandVal(2));
+ if (needOppositeCond)
+ CC = X86::GetOppositeBranchCondition(CC);
+ return SetCC.getOperand(3);
+ }
+ }
- // Set the condition code or opposite one if necessary.
- CC = X86::CondCode(SetCC.getConstantOperandVal(0));
- if (needOppositeCond)
- CC = X86::GetOppositeBranchCondition(CC);
-
- return SetCC.getOperand(1);
+ return SDValue();
}
-static bool IsValidFCMOVCondition(X86::CondCode CC) {
- switch (CC) {
- default:
- return false;
- case X86::COND_B:
- case X86::COND_BE:
- case X86::COND_E:
- case X86::COND_P:
- case X86::COND_AE:
- case X86::COND_A:
- case X86::COND_NE:
- case X86::COND_NP:
- return true;
+/// checkFlaggedOrCombine - DAG combination on X86ISD::OR, i.e. with EFLAGS
+/// updated. If only the flag result is used and that result is computed from
+/// a series of element extractions, try to combine it into a PTEST.
+static SDValue checkFlaggedOrCombine(SDValue Or, X86::CondCode &CC,
+ SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDNode *N = Or.getNode();
+ DebugLoc DL = N->getDebugLoc();
+
+ // Only SSE4.1 and beyond supports PTEST or like.
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ if (N->getOpcode() != X86ISD::OR)
+ return SDValue();
+
+ // Quit if the value result of OR is used.
+ if (N->hasAnyUseOfValue(0))
+ return SDValue();
+
+ // Quit if not used as a boolean value.
+ if (CC != X86::COND_E && CC != X86::COND_NE)
+ return SDValue();
+
+ SmallVector<SDValue, 8> Opnds;
+ SDValue VecIn;
+ EVT VT = MVT::Other;
+ unsigned Mask = 0;
+
+ // Recognize a special case where a vector is cast into a wide integer to
+ // test for all 0s.
+ Opnds.push_back(N->getOperand(0));
+ Opnds.push_back(N->getOperand(1));
+
+ for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+ SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
+ // BFS traverse all OR'd operands.
+ if (I->getOpcode() == ISD::OR) {
+ Opnds.push_back(I->getOperand(0));
+ Opnds.push_back(I->getOperand(1));
+ // Re-evaluate the number of nodes to be traversed.
+ e += 2; // 2 more nodes (LHS and RHS) are pushed.
+ continue;
+ }
+
+ // Quit on anything that is not an EXTRACT_VECTOR_ELT.
+ if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // Quit if the extract index is not a constant.
+ SDValue Idx = I->getOperand(1);
+ if (!isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ // Check if all elements are extracted from the same vector.
+ SDValue ExtractedFromVec = I->getOperand(0);
+ if (VecIn.getNode() == 0) {
+ VT = ExtractedFromVec.getValueType();
+ // FIXME: only 128-bit vector is supported so far.
+ if (!VT.is128BitVector())
+ return SDValue();
+ VecIn = ExtractedFromVec;
+ } else if (VecIn != ExtractedFromVec)
+ return SDValue();
+
+ // Record the constant index.
+ Mask |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
}
+
+ assert(VT.is128BitVector() &&
+        "Only 128-bit vector PTEST is supported so far.");
+
+ // Quit if not all elements are used.
+ if (Mask != (1U << VT.getVectorNumElements()) - 1U)
+ return SDValue();
+
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIn, VecIn);
}
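A sketch of the kind of source this combine targets (assuming the usual legalization of the union read into EXTRACT_VECTOR_ELTs): testing a 128-bit vector for all-zero via an OR over its pieces, which can now fold to PTEST on SSE4.1.

    #include <x86intrin.h>
    int all_zero(__m128i v) {
      union { __m128i v; unsigned long long q[2]; } u = { v };
      return (u.q[0] | u.q[1]) == 0;   /* flag-only OR of extracted halves */
    }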
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
DebugLoc DL = N->getDebugLoc();
// If the flag operand isn't dead, don't touch this CMOV.
@@ -14114,10 +14313,18 @@
SDValue Flags;
- Flags = BoolTestSetCCCombine(Cond, CC);
+ Flags = checkBoolTestSetCCCombine(Cond, CC);
if (Flags.getNode() &&
// Extra check as FCMOV only supports a subset of X86 cond.
- (FalseOp.getValueType() != MVT::f80 || IsValidFCMOVCondition(CC))) {
+ (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
+ SDValue Ops[] = { FalseOp, TrueOp,
+ DAG.getConstant(CC, MVT::i8), Flags };
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
+ Ops, array_lengthof(Ops));
+ }
+
+ Flags = checkFlaggedOrCombine(Cond, CC, DAG, Subtarget);
+ if (Flags.getNode()) {
SDValue Ops[] = { FalseOp, TrueOp,
DAG.getConstant(CC, MVT::i8), Flags };
return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
@@ -15384,7 +15591,7 @@
return SDValue();
// If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
- // into FMINC and MMAXC, which are Commutative operations.
+ // into FMINC and FMAXC, which are Commutative operations.
unsigned NewOp = 0;
switch (N->getOpcode()) {
default: llvm_unreachable("unknown opcode");
@@ -15502,8 +15709,13 @@
DebugLoc dl = N->getDebugLoc();
EVT VT = N->getValueType(0);
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
EVT ScalarVT = VT.getScalarType();
- if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA())
+ if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
+ (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
return SDValue();
SDValue A = N->getOperand(0);
@@ -15525,9 +15737,10 @@
unsigned Opcode;
if (!NegMul)
- Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB;
+ Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
else
- Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB;
+ Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+
return DAG.getNode(Opcode, dl, VT, A, B, C);
}
@@ -15625,7 +15838,9 @@
}
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
-static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
DebugLoc DL = N->getDebugLoc();
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
@@ -15641,7 +15856,13 @@
SDValue Flags;
- Flags = BoolTestSetCCCombine(EFLAGS, CC);
+ Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+ if (Flags.getNode()) {
+ SDValue Cond = DAG.getConstant(CC, MVT::i8);
+ return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
+ }
+
+ Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
if (Flags.getNode()) {
SDValue Cond = DAG.getConstant(CC, MVT::i8);
return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
@@ -15663,7 +15884,14 @@
SDValue Flags;
- Flags = BoolTestSetCCCombine(EFLAGS, CC);
+ Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
+ if (Flags.getNode()) {
+ SDValue Cond = DAG.getConstant(CC, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
+ Flags);
+ }
+
+ Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
if (Flags.getNode()) {
SDValue Cond = DAG.getConstant(CC, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
@@ -15858,7 +16086,7 @@
return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
case ISD::VSELECT:
case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget);
- case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
+ case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
@@ -15888,7 +16116,7 @@
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI);
case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
- case X86ISD::SETCC: return PerformSETCCCombine(N, DAG);
+ case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::PALIGN:
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index b0c27c8..bfe954114 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -16,15 +16,18 @@
//
// Return instructions.
+//
+// The X86retflag return instructions are variadic because we may add ST0 and
+// ST1 arguments when returning values on the x87 stack.
let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, FPForm = SpecialFP in {
- def RET : I <0xC3, RawFrm, (outs), (ins),
+ def RET : I <0xC3, RawFrm, (outs), (ins variable_ops),
"ret",
[(X86retflag 0)], IIC_RET>;
def RETW : I <0xC3, RawFrm, (outs), (ins),
"ret{w}",
[], IIC_RET>, OpSize;
- def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
+ def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
"ret\t$amt",
[(X86retflag timm:$amt)], IIC_RET_IMM>;
def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
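
A minimal trigger for the variable_ops change, as a sketch: returning a value on the x87 stack leaves it in st(0) (st(1) as well for _Complex long double), and codegen attaches those registers to the return so liveness sees the use. The assembly shown is illustrative 32-bit output, not verbatim:

    // Compile for x86-32: the result of an x87 return lives in st(0).
    long double twice(long double x) { return x + x; }
    // roughly:
    //   fldt 4(%esp)
    //   fadd %st(0)   ; st(0) += st(0)
    //   ret           ; ST0 rides along as an extra operand of X86retflag
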
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 95ee7e5..5663800 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -19,7 +19,8 @@
multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
PatFrag MemFrag128, PatFrag MemFrag256,
ValueType OpVT128, ValueType OpVT256,
- SDPatternOperator Op = null_frag, bit MayLoad = 1> {
+ SDPatternOperator Op = null_frag> {
+ let isCommutable = 1 in
def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -27,7 +28,7 @@
[(set VR128:$dst, (OpVT128 (Op VR128:$src2,
VR128:$src1, VR128:$src3)))]>;
- let mayLoad = MayLoad in
+ let mayLoad = 1 in
def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, f128mem:$src3),
!strconcat(OpcodeStr,
@@ -35,6 +36,7 @@
[(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
(MemFrag128 addr:$src3))))]>;
+ let isCommutable = 1 in
def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
@@ -42,7 +44,7 @@
[(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
VR256:$src3)))]>;
- let mayLoad = MayLoad in
+ let mayLoad = 1 in
def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
!strconcat(OpcodeStr,
@@ -59,7 +61,7 @@
SDNode Op, ValueType OpTy128, ValueType OpTy256> {
defm r213 : fma3p_rm<opc213,
!strconcat(OpcodeStr, !strconcat("213", PackTy)),
- MemFrag128, MemFrag256, OpTy128, OpTy256, Op, 0>;
+ MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
let neverHasSideEffects = 1 in {
defm r132 : fma3p_rm<opc132,
!strconcat(OpcodeStr, !strconcat("132", PackTy)),
@@ -112,148 +114,18 @@
v4f64>, VEX_W;
}
-let Predicates = [HasFMA] in {
- def : Pat<(int_x86_fma_vfmadd_ps VR128:$src2, VR128:$src1, VR128:$src3),
- (VFMADDPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(int_x86_fma_vfmadd_ps VR128:$src2, VR128:$src1,
- (memopv4f32 addr:$src3)),
- (VFMADDPSr213m VR128:$src1, VR128:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfmsub_ps VR128:$src2, VR128:$src1, VR128:$src3),
- (VFMSUBPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(int_x86_fma_vfmsub_ps VR128:$src2, VR128:$src1,
- (memopv4f32 addr:$src3)),
- (VFMSUBPSr213m VR128:$src1, VR128:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfmaddsub_ps VR128:$src2, VR128:$src1, VR128:$src3),
- (VFMADDSUBPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(int_x86_fma_vfmaddsub_ps VR128:$src2, VR128:$src1,
- (memopv4f32 addr:$src3)),
- (VFMADDSUBPSr213m VR128:$src1, VR128:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfmsubadd_ps VR128:$src2, VR128:$src1, VR128:$src3),
- (VFMSUBADDPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(int_x86_fma_vfmsubadd_ps VR128:$src2, VR128:$src1,
- (memopv4f32 addr:$src3)),
- (VFMSUBADDPSr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-
- def : Pat<(int_x86_fma_vfmadd_ps_256 VR256:$src2, VR256:$src1, VR256:$src3),
- (VFMADDPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
- def : Pat<(int_x86_fma_vfmadd_ps_256 VR256:$src2, VR256:$src1,
- (memopv8f32 addr:$src3)),
- (VFMADDPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfmsub_ps_256 VR256:$src2, VR256:$src1, VR256:$src3),
- (VFMSUBPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
- def : Pat<(int_x86_fma_vfmsub_ps_256 VR256:$src2, VR256:$src1,
- (memopv8f32 addr:$src3)),
- (VFMSUBPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfmaddsub_ps_256 VR256:$src2, VR256:$src1, VR256:$src3),
- (VFMADDSUBPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
- def : Pat<(int_x86_fma_vfmaddsub_ps_256 VR256:$src2, VR256:$src1,
- (memopv8f32 addr:$src3)),
- (VFMADDSUBPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfmsubadd_ps_256 VR256:$src2, VR256:$src1, VR256:$src3),
- (VFMSUBADDPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
- def : Pat<(int_x86_fma_vfmsubadd_ps_256 VR256:$src2, VR256:$src1,
- (memopv8f32 addr:$src3)),
- (VFMSUBADDPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-
- def : Pat<(int_x86_fma_vfmadd_pd VR128:$src2, VR128:$src1, VR128:$src3),
- (VFMADDPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(int_x86_fma_vfmadd_pd VR128:$src2, VR128:$src1,
- (memopv2f64 addr:$src3)),
- (VFMADDPDr213m VR128:$src1, VR128:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfmsub_pd VR128:$src2, VR128:$src1, VR128:$src3),
- (VFMSUBPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(int_x86_fma_vfmsub_pd VR128:$src2, VR128:$src1,
- (memopv2f64 addr:$src3)),
- (VFMSUBPDr213m VR128:$src1, VR128:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfmaddsub_pd VR128:$src2, VR128:$src1, VR128:$src3),
- (VFMADDSUBPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(int_x86_fma_vfmaddsub_pd VR128:$src2, VR128:$src1,
- (memopv2f64 addr:$src3)),
- (VFMADDSUBPDr213m VR128:$src1, VR128:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfmsubadd_pd VR128:$src2, VR128:$src1, VR128:$src3),
- (VFMSUBADDPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(int_x86_fma_vfmsubadd_pd VR128:$src2, VR128:$src1,
- (memopv2f64 addr:$src3)),
- (VFMSUBADDPDr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-
- def : Pat<(int_x86_fma_vfmadd_pd_256 VR256:$src2, VR256:$src1, VR256:$src3),
- (VFMADDPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
- def : Pat<(int_x86_fma_vfmadd_pd_256 VR256:$src2, VR256:$src1,
- (memopv4f64 addr:$src3)),
- (VFMADDPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfmsub_pd_256 VR256:$src2, VR256:$src1, VR256:$src3),
- (VFMSUBPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
- def : Pat<(int_x86_fma_vfmsub_pd_256 VR256:$src2, VR256:$src1,
- (memopv4f64 addr:$src3)),
- (VFMSUBPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfmaddsub_pd_256 VR256:$src2, VR256:$src1, VR256:$src3),
- (VFMADDSUBPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
- def : Pat<(int_x86_fma_vfmaddsub_pd_256 VR256:$src2, VR256:$src1,
- (memopv4f64 addr:$src3)),
- (VFMADDSUBPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfmsubadd_pd_256 VR256:$src2, VR256:$src1, VR256:$src3),
- (VFMSUBADDPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
- def : Pat<(int_x86_fma_vfmsubadd_pd_256 VR256:$src2, VR256:$src1,
- (memopv4f64 addr:$src3)),
- (VFMSUBADDPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-
- def : Pat<(int_x86_fma_vfnmadd_ps VR128:$src2, VR128:$src1, VR128:$src3),
- (VFNMADDPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(int_x86_fma_vfnmadd_ps VR128:$src2, VR128:$src1,
- (memopv4f32 addr:$src3)),
- (VFNMADDPSr213m VR128:$src1, VR128:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfnmsub_ps VR128:$src2, VR128:$src1, VR128:$src3),
- (VFNMSUBPSr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(int_x86_fma_vfnmsub_ps VR128:$src2, VR128:$src1,
- (memopv4f32 addr:$src3)),
- (VFNMSUBPSr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-
- def : Pat<(int_x86_fma_vfnmadd_ps_256 VR256:$src2, VR256:$src1, VR256:$src3),
- (VFNMADDPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
- def : Pat<(int_x86_fma_vfnmadd_ps_256 VR256:$src2, VR256:$src1,
- (memopv8f32 addr:$src3)),
- (VFNMADDPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfnmsub_ps_256 VR256:$src2, VR256:$src1, VR256:$src3),
- (VFNMSUBPSr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
- def : Pat<(int_x86_fma_vfnmsub_ps_256 VR256:$src2, VR256:$src1,
- (memopv8f32 addr:$src3)),
- (VFNMSUBPSr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-
- def : Pat<(int_x86_fma_vfnmadd_pd VR128:$src2, VR128:$src1, VR128:$src3),
- (VFNMADDPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(int_x86_fma_vfnmadd_pd VR128:$src2, VR128:$src1,
- (memopv2f64 addr:$src3)),
- (VFNMADDPDr213m VR128:$src1, VR128:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfnmsub_pd VR128:$src2, VR128:$src1, VR128:$src3),
- (VFNMSUBPDr213r VR128:$src1, VR128:$src2, VR128:$src3)>;
- def : Pat<(int_x86_fma_vfnmsub_pd VR128:$src2, VR128:$src1,
- (memopv2f64 addr:$src3)),
- (VFNMSUBPDr213m VR128:$src1, VR128:$src2, addr:$src3)>;
-
- def : Pat<(int_x86_fma_vfnmadd_pd_256 VR256:$src2, VR256:$src1, VR256:$src3),
- (VFNMADDPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
- def : Pat<(int_x86_fma_vfnmadd_pd_256 VR256:$src2, VR256:$src1,
- (memopv4f64 addr:$src3)),
- (VFNMADDPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
- def : Pat<(int_x86_fma_vfnmsub_pd_256 VR256:$src2, VR256:$src1, VR256:$src3),
- (VFNMSUBPDr213rY VR256:$src1, VR256:$src2, VR256:$src3)>;
- def : Pat<(int_x86_fma_vfnmsub_pd_256 VR256:$src2, VR256:$src1,
- (memopv4f64 addr:$src3)),
- (VFNMSUBPDr213mY VR256:$src1, VR256:$src2, addr:$src3)>;
-
-} // Predicates = [HasFMA]
-
let Constraints = "$src1 = $dst" in {
multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
RegisterClass RC, ValueType OpVT, PatFrag mem_frag,
- SDPatternOperator OpNode = null_frag, bit MayLoad = 1> {
+ SDPatternOperator OpNode = null_frag> {
+ let isCommutable = 1 in
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set RC:$dst,
(OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
- let mayLoad = MayLoad in
+ let mayLoad = 1 in
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
@@ -266,6 +138,7 @@
multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memop,
ComplexPattern mem_cpat, Intrinsic IntId,
RegisterClass RC> {
+ let isCommutable = 1 in
def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -294,7 +167,7 @@
}
defm r213 : fma3s_rm<opc213, !strconcat(OpStr, !strconcat("213", PackTy)),
- x86memop, RC, OpVT, mem_frag, OpNode, 0>,
+ x86memop, RC, OpVT, mem_frag, OpNode>,
fma3s_rm_int<opc213, !strconcat(OpStr, !strconcat("213", PackTy)),
memop, mem_cpat, Int, RC>;
}
@@ -324,73 +197,102 @@
//===----------------------------------------------------------------------===//
-multiclass fma4s<bits<8> opc, string OpcodeStr, Operand memop,
- ComplexPattern mem_cpat, Intrinsic Int> {
- def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
+multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
+ PatFrag mem_frag> {
+ let isCommutable = 1 in
+ def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
- def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, memop:$src3),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, MemOp4;
+ def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (Int VR128:$src1, VR128:$src2, mem_cpat:$src3))]>, VEX_W, MemOp4;
- def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, memop:$src2, VR128:$src3),
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
+ (mem_frag addr:$src3)))]>, VEX_W, MemOp4;
+ def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>;
+ [(set RC:$dst,
+ (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>;
// For disassembler
let isCodeGenOnly = 1 in
- def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
}
-multiclass fma4p<bits<8> opc, string OpcodeStr,
- Intrinsic Int128, Intrinsic Int256,
+multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
+ ComplexPattern mem_cpat, Intrinsic Int> {
+ let isCommutable = 1 in
+ def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
+ def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
+ mem_cpat:$src3))]>, VEX_W, MemOp4;
+ def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, memop:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>;
+}
+
+multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT128, ValueType OpVT256,
PatFrag ld_frag128, PatFrag ld_frag256> {
+ let isCommutable = 1 in
def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
+ (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
+ VEX_W, MemOp4;
def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, f128mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2,
+ [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
(ld_frag128 addr:$src3)))]>, VEX_W, MemOp4;
def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int128 VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
+ (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
+ let isCommutable = 1 in
def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst,
- (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>, VEX_W, MemOp4;
+ (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
+ VEX_W, MemOp4;
def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2,
+ [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
(ld_frag256 addr:$src3)))]>, VEX_W, MemOp4;
def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst,
- (Int256 VR256:$src1, (ld_frag256 addr:$src2), VR256:$src3))]>;
+ (OpNode VR256:$src1, (ld_frag256 addr:$src2), VR256:$src3))]>;
// For disassembler
let isCodeGenOnly = 1 in {
def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
@@ -406,45 +308,58 @@
let Predicates = [HasFMA4] in {
-defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", ssmem, sse_load_f32,
- int_x86_fma_vfmadd_ss>;
-defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", sdmem, sse_load_f64,
- int_x86_fma_vfmadd_sd>;
-defm VFMADDPS4 : fma4p<0x68, "vfmaddps", int_x86_fma_vfmadd_ps,
- int_x86_fma_vfmadd_ps_256, memopv4f32, memopv8f32>;
-defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", int_x86_fma_vfmadd_pd,
- int_x86_fma_vfmadd_pd_256, memopv2f64, memopv4f64>;
-defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", ssmem, sse_load_f32,
- int_x86_fma_vfmsub_ss>;
-defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", sdmem, sse_load_f64,
- int_x86_fma_vfmsub_sd>;
-defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", int_x86_fma_vfmsub_ps,
- int_x86_fma_vfmsub_ps_256, memopv4f32, memopv8f32>;
-defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", int_x86_fma_vfmsub_pd,
- int_x86_fma_vfmsub_pd_256, memopv2f64, memopv4f64>;
-defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", ssmem, sse_load_f32,
- int_x86_fma_vfnmadd_ss>;
-defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
- int_x86_fma_vfnmadd_sd>;
-defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", int_x86_fma_vfnmadd_ps,
- int_x86_fma_vfnmadd_ps_256, memopv4f32, memopv8f32>;
-defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", int_x86_fma_vfnmadd_pd,
- int_x86_fma_vfnmadd_pd_256, memopv2f64, memopv4f64>;
-defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", ssmem, sse_load_f32,
- int_x86_fma_vfnmsub_ss>;
-defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
- int_x86_fma_vfnmsub_sd>;
-defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", int_x86_fma_vfnmsub_ps,
- int_x86_fma_vfnmsub_ps_256, memopv4f32, memopv8f32>;
-defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", int_x86_fma_vfnmsub_pd,
- int_x86_fma_vfnmsub_pd_256, memopv2f64, memopv4f64>;
-defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma_vfmaddsub_ps,
- int_x86_fma_vfmaddsub_ps_256, memopv4f32, memopv8f32>;
-defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma_vfmaddsub_pd,
- int_x86_fma_vfmaddsub_pd_256, memopv2f64, memopv4f64>;
-defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma_vfmsubadd_ps,
- int_x86_fma_vfmsubadd_ps_256, memopv4f32, memopv8f32>;
-defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma_vfmsubadd_pd,
- int_x86_fma_vfmsubadd_pd_256, memopv2f64, memopv4f64>;
+defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
+ fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
+ int_x86_fma_vfmadd_ss>;
+defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
+ fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
+ int_x86_fma_vfmadd_sd>;
+defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
+ fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
+ int_x86_fma_vfmsub_ss>;
+defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
+ fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
+ int_x86_fma_vfmsub_sd>;
+defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
+ X86Fnmadd, loadf32>,
+ fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
+ int_x86_fma_vfnmadd_ss>;
+defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
+ X86Fnmadd, loadf64>,
+ fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
+ int_x86_fma_vfnmadd_sd>;
+defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
+ X86Fnmsub, loadf32>,
+ fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
+ int_x86_fma_vfnmsub_ss>;
+defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
+ X86Fnmsub, loadf64>,
+ fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
+ int_x86_fma_vfnmsub_sd>;
+
+defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
} // HasFMA4
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 81b4f81..55ad2ec 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -287,12 +287,14 @@
let CodeSize = 3;
}
+def __xs : XS;
+
// SI - SSE 1 & 2 scalar instructions
class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin> {
let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
- !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2]));
+ !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
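
The new __xs helper is a small TableGen trick: rather than repeating the magic number 12, the predicate compares against the Prefix field of a throwaway XS instance, so the check follows the class definition. The same idea in C++ terms, as a sketch:

    // Name the constant once; every use then tracks the definition.
    struct XS { static constexpr unsigned Prefix = 12; }; // value per the diff

    static constexpr bool isXSPrefixed(unsigned P) { return P == XS::Prefix; }
    static_assert(isXSPrefixed(12), "tracks the XS definition");

    int main() {}
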
@@ -303,7 +305,7 @@
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin> {
let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
- !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2]));
+ !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -314,18 +316,25 @@
InstrItinClass itin, Domain d>
: I<o, F, outs, ins, asm, pattern, itin, d> {
let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
- !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]));
+ !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
}
+// MMXPI - SSE 1 & 2 packed instructions with MMX operands
+class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+ InstrItinClass itin, Domain d>
+ : I<o, F, outs, ins, asm, pattern, itin, d> {
+ let Predicates = !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]);
+}
+
// PIi8 - SSE 1 & 2 packed instructions with immediate
class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin, Domain d>
: Ii8<o, F, outs, ins, asm, pattern, itin, d> {
let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX],
- !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]));
+ !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm);
@@ -341,18 +350,18 @@
class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
- : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>;
+ : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>;
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
- Requires<[HasSSE1]>;
+ Requires<[UseSSE1]>;
class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
- Requires<[HasSSE1]>;
+ Requires<[UseSSE1]>;
class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
@@ -372,27 +381,31 @@
// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
// VSDI - SSE2 instructions with XD prefix in AVX form.
// VPDI - SSE2 instructions with TB and OpSize prefixes in AVX form.
+// MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
+// MMX operands.
+// MMXS2SIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
+// MMX operands.
class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
- : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+ : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
- : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE2]>;
+ : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
- : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
- Requires<[HasSSE2]>;
+ Requires<[UseSSE2]>;
class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
- Requires<[HasSSE2]>;
+ Requires<[UseSSE2]>;
class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
@@ -405,6 +418,12 @@
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB,
OpSize, Requires<[HasAVX]>;
+class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
// SSE3 Instruction Templates:
//
@@ -415,21 +434,23 @@
class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
- Requires<[HasSSE3]>;
+ Requires<[UseSSE3]>;
class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
- Requires<[HasSSE3]>;
+ Requires<[UseSSE3]>;
class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
- Requires<[HasSSE3]>;
+ Requires<[UseSSE3]>;
// SSSE3 Instruction Templates:
//
// SS38I - SSSE3 instructions with T8 prefix.
// SS3AI - SSSE3 instructions with TA prefix.
+// MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands.
+// MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands.
//
// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version
// uses the MMX registers. The 64-bit versions are grouped with the MMX
@@ -438,10 +459,18 @@
class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
- Requires<[HasSSSE3]>;
+ Requires<[UseSSSE3]>;
class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ Requires<[UseSSSE3]>;
+class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+ Requires<[HasSSSE3]>;
+class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
Requires<[HasSSSE3]>;
// SSE4.1 Instruction Templates:
@@ -452,11 +481,11 @@
class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
- Requires<[HasSSE41]>;
+ Requires<[UseSSE41]>;
class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
- Requires<[HasSSE41]>;
+ Requires<[UseSSE41]>;
// SSE4.2 Instruction Templates:
//
@@ -464,9 +493,10 @@
class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
- Requires<[HasSSE42]>;
+ Requires<[UseSSE42]>;
// SS42FI - SSE 4.2 instructions with T8XD prefix.
+// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns.
class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>;
@@ -475,7 +505,7 @@
class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
- Requires<[HasSSE42]>;
+ Requires<[UseSSE42]>;
// AVX Instruction Templates:
// Instructions introduced in AVX (no SSE equivalent forms)
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index ee2d3c4..9035435 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -183,8 +183,8 @@
def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;
def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>;
def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>;
-def X86Fmaddsub : SDNode<"X86ISD::FMSUBADD", SDTFma>;
-def X86Fmsubadd : SDNode<"X86ISD::FMADDSUB", SDTFma>;
+def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>;
+def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>;
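
This two-line fix is easy to misread: the node names were crossed, so FMADDSUB lowered as FMSUBADD and vice versa. Per the ISA definition, vfmaddsub subtracts in even lanes and adds in odd lanes, and vfmsubadd does the opposite. A scalar reference implementation:

    #include <cstdio>

    // Reference semantics of fmaddsub for n lanes.
    static void fmaddsub(const double *a, const double *b, const double *c,
                         double *r, int n) {
      for (int i = 0; i < n; ++i)
        r[i] = (i % 2 == 0) ? a[i] * b[i] - c[i]   // even lane: subtract
                            : a[i] * b[i] + c[i];  // odd lane: add
    }

    int main() {
      double a[] = {1, 2, 3, 4}, b[] = {1, 1, 1, 1}, c[] = {10, 10, 10, 10};
      double r[4];
      fmaddsub(a, b, c, r, 4);
      std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // -9 12 -7 14
    }
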
def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
@@ -240,6 +240,10 @@
def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
+// 128-/256-bit extload pattern fragments
+def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
+def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
+
// Like 'store', but always requires 128-bit vector alignment.
def alignedstore : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 459f01a..4f3d824 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1110,6 +1110,36 @@
{ X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, TB_ALIGN_32 },
{ X86::VPXORYrr, X86::VPXORYrm, TB_ALIGN_32 },
// FIXME: add AVX 256-bit foldable instructions
+
+ // FMA4 foldable patterns
+ { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_16 },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_16 },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_16 },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_16 },
+ { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_32 },
+ { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_32 },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_16 },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_16 },
+ { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_32 },
+ { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_32 },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_16 },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_16 },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_16 },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_16 },
+ { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_32 },
+ { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_32 },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_16 },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_16 },
+ { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_32 },
+ { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_16 },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_16 },
+ { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_16 },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_16 },
+ { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_32 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
@@ -1237,6 +1267,36 @@
{ X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_32 },
{ X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_32 },
{ X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 },
+
+ // FMA4 foldable patterns
+ { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_16 },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_16 },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_16 },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_16 },
+ { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_32 },
+ { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_32 },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_16 },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_16 },
+ { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_32 },
+ { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_32 },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_16 },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_16 },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_16 },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_16 },
+ { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_32 },
+ { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_32 },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_16 },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_16 },
+ { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_32 },
+ { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_16 },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_16 },
+ { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_16 },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 },
+ { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
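
Both of the new blocks feed the register-to-memory folding tables: each entry maps a register-form opcode to its memory-form twin plus flags such as the alignment a folded load must satisfy (TB_ALIGN_16/32 for the 128-/256-bit variants). The OpNum == 3 case added further down then points the folding dispatch at RegOp2MemOpTable3, so the third source of an FMA4 can be folded as well. The shape of the table, as a toy model with placeholder opcode numbers:

    #include <cstdio>
    #include <unordered_map>

    // Reg-form opcode -> (mem-form opcode, flags). Values are placeholders.
    struct MemForm { unsigned MemOpc; unsigned Flags; };
    enum : unsigned { TB_ALIGN_16 = 1u << 0, TB_ALIGN_32 = 1u << 1 };

    int main() {
      std::unordered_map<unsigned, MemForm> RegOp2MemOpTable3 = {
        { /*VFMADDPS4rr*/  1001, { /*VFMADDPS4rm*/  2001, TB_ALIGN_16 } },
        { /*VFMADDPS4rrY*/ 1002, { /*VFMADDPS4rmY*/ 2002, TB_ALIGN_32 } },
      };
      auto It = RegOp2MemOpTable3.find(1001);
      if (It != RegOp2MemOpTable3.end())
        std::printf("fold -> opcode %u (flags %u)\n",
                    It->second.MemOpc, It->second.Flags);
    }
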
@@ -1786,10 +1846,8 @@
MachineInstr *MI = MBBI;
MachineFunction &MF = *MI->getParent()->getParent();
// All instructions input are two-addr instructions. Get the known operands.
- unsigned Dest = MI->getOperand(0).getReg();
- unsigned Src = MI->getOperand(1).getReg();
- bool isDead = MI->getOperand(0).isDead();
- bool isKill = MI->getOperand(1).isKill();
+ const MachineOperand &Dest = MI->getOperand(0);
+ const MachineOperand &Src = MI->getOperand(1);
MachineInstr *NewMI = NULL;
// FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When
@@ -1807,11 +1865,9 @@
unsigned B = MI->getOperand(1).getReg();
unsigned C = MI->getOperand(2).getReg();
if (B != C) return 0;
- unsigned A = MI->getOperand(0).getReg();
unsigned M = MI->getOperand(3).getImm();
NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
- .addReg(A, RegState::Define | getDeadRegState(isDead))
- .addReg(B, getKillRegState(isKill)).addImm(M);
+ .addOperand(Dest).addOperand(Src).addImm(M);
break;
}
case X86::SHUFPDrri: {
@@ -1821,15 +1877,13 @@
unsigned B = MI->getOperand(1).getReg();
unsigned C = MI->getOperand(2).getReg();
if (B != C) return 0;
- unsigned A = MI->getOperand(0).getReg();
unsigned M = MI->getOperand(3).getImm();
// Convert to PSHUFD mask.
M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6)| 0x44;
NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
- .addReg(A, RegState::Define | getDeadRegState(isDead))
- .addReg(B, getKillRegState(isKill)).addImm(M);
+ .addOperand(Dest).addOperand(Src).addImm(M);
break;
}
case X86::SHL64ri: {
@@ -1840,15 +1894,14 @@
if (ShAmt == 0 || ShAmt >= 4) return 0;
// LEA can't handle RSP.
- if (TargetRegisterInfo::isVirtualRegister(Src) &&
- !MF.getRegInfo().constrainRegClass(Src, &X86::GR64_NOSPRegClass))
+ if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(),
+ &X86::GR64_NOSPRegClass))
return 0;
NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
- .addReg(Dest, RegState::Define | getDeadRegState(isDead))
- .addReg(0).addImm(1 << ShAmt)
- .addReg(Src, getKillRegState(isKill))
- .addImm(0).addReg(0);
+ .addOperand(Dest)
+ .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
break;
}
case X86::SHL32ri: {
@@ -1859,15 +1912,15 @@
if (ShAmt == 0 || ShAmt >= 4) return 0;
// LEA can't handle ESP.
- if (TargetRegisterInfo::isVirtualRegister(Src) &&
- !MF.getRegInfo().constrainRegClass(Src, &X86::GR32_NOSPRegClass))
+ if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(),
+ &X86::GR32_NOSPRegClass))
return 0;
unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addReg(Dest, RegState::Define | getDeadRegState(isDead))
- .addReg(0).addImm(1 << ShAmt)
- .addReg(Src, getKillRegState(isKill)).addImm(0).addReg(0);
+ .addOperand(Dest)
+ .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
break;
}
case X86::SHL16ri: {
@@ -1880,10 +1933,8 @@
if (DisableLEA16)
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
- .addReg(Dest, RegState::Define | getDeadRegState(isDead))
- .addReg(0).addImm(1 << ShAmt)
- .addReg(Src, getKillRegState(isKill))
- .addImm(0).addReg(0);
+ .addOperand(Dest)
+ .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
break;
}
default: {
@@ -1906,14 +1957,12 @@
(const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
// LEA can't handle RSP.
- if (TargetRegisterInfo::isVirtualRegister(Src) &&
- !MF.getRegInfo().constrainRegClass(Src, RC))
+ if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
return 0;
- NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, 1);
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest).addOperand(Src), 1);
break;
}
case X86::INC16r:
@@ -1921,10 +1970,8 @@
if (DisableLEA16)
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
- NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, 1);
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest).addOperand(Src), 1);
break;
case X86::DEC64r:
case X86::DEC32r:
@@ -1936,14 +1983,12 @@
(const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
(const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
// LEA can't handle RSP.
- if (TargetRegisterInfo::isVirtualRegister(Src) &&
- !MF.getRegInfo().constrainRegClass(Src, RC))
+ if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
return 0;
- NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, -1);
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest).addOperand(Src), -1);
break;
}
case X86::DEC16r:
@@ -1951,10 +1996,8 @@
if (DisableLEA16)
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
- NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, -1);
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest).addOperand(Src), -1);
break;
case X86::ADD64rr:
case X86::ADD64rr_DB:
@@ -1981,9 +2024,8 @@
return 0;
NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, Src2, isKill2);
+ .addOperand(Dest),
+ Src.getReg(), Src.isKill(), Src2, isKill2);
// Preserve undefness of the operands.
bool isUndef = MI->getOperand(1).isUndef();
@@ -2003,9 +2045,15 @@
unsigned Src2 = MI->getOperand(2).getReg();
bool isKill2 = MI->getOperand(2).isKill();
NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, Src2, isKill2);
+ .addOperand(Dest),
+ Src.getReg(), Src.isKill(), Src2, isKill2);
+
+ // Preserve undefness of the operands.
+ bool isUndef = MI->getOperand(1).isUndef();
+ bool isUndef2 = MI->getOperand(2).isUndef();
+ NewMI->getOperand(1).setIsUndef(isUndef);
+ NewMI->getOperand(3).setIsUndef(isUndef2);
+
if (LV && isKill2)
LV->replaceKillInstruction(Src2, MI, NewMI);
break;
@@ -2015,10 +2063,9 @@
case X86::ADD64ri32_DB:
case X86::ADD64ri8_DB:
assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
- NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, MI->getOperand(2).getImm());
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+ .addOperand(Dest).addOperand(Src),
+ MI->getOperand(2).getImm());
break;
case X86::ADD32ri:
case X86::ADD32ri8:
@@ -2026,10 +2073,9 @@
case X86::ADD32ri8_DB: {
assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
- NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, MI->getOperand(2).getImm());
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest).addOperand(Src),
+ MI->getOperand(2).getImm());
break;
}
case X86::ADD16ri:
@@ -2039,10 +2085,9 @@
if (DisableLEA16)
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
- NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, MI->getOperand(2).getImm());
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest).addOperand(Src),
+ MI->getOperand(2).getImm());
break;
}
}
@@ -2051,10 +2096,10 @@
if (!NewMI) return 0;
if (LV) { // Update live variables
- if (isKill)
- LV->replaceKillInstruction(Src, MI, NewMI);
- if (isDead)
- LV->replaceKillInstruction(Dest, MI, NewMI);
+ if (Src.isKill())
+ LV->replaceKillInstruction(Src.getReg(), MI, NewMI);
+ if (Dest.isDead())
+ LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);
}
MFI->insert(MBBI, NewMI); // Insert the new inst
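
The convertToThreeAddress changes running through this hunk all make the same trade: instead of re-deriving the kill/dead bits by hand at every BuildMI call, the whole MachineOperand is transplanted with addOperand, which carries kill, dead, and undef along (the ADDrr cases still patch undef explicitly where a raw register is re-added). A toy contrast of the two styles:

    #include <vector>

    // Toy operand: the flags travel with it.
    struct Op { unsigned Reg; bool Kill = false, Dead = false, Undef = false; };

    struct Builder {
      std::vector<Op> Ops;
      // Old style: flags re-derived at the call site (easy to drop one).
      Builder &addReg(unsigned R, bool Kill) {
        Ops.push_back({R, Kill});
        return *this;
      }
      // New style: copy the whole operand; nothing to forget.
      Builder &addOperand(const Op &O) { Ops.push_back(O); return *this; }
    };

    int main() {
      Op Src{5, /*Kill=*/true, /*Dead=*/false, /*Undef=*/true};
      Builder B;
      B.addOperand(Src);            // Kill and Undef both preserved
      B.addReg(Src.Reg, Src.Kill);  // Undef silently lost
    }
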
@@ -3444,6 +3489,13 @@
case X86::FsFLD0SS:
case X86::FsFLD0SD:
return Expand2AddrUndef(MI, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
+ case X86::AVX_SET0:
+ assert(HasAVX && "AVX not supported");
+ return Expand2AddrUndef(MI, get(X86::VXORPSYrr));
+ case X86::V_SETALLONES:
+ return Expand2AddrUndef(MI, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
+ case X86::AVX2_SETALLONES:
+ return Expand2AddrUndef(MI, get(X86::VPCMPEQDYrr));
case X86::TEST8ri_NOREX:
MI->setDesc(get(X86::TEST8ri));
return true;
@@ -3557,14 +3609,16 @@
OpcodeTablePtr = &RegOp2MemOpTable2Addr;
isTwoAddrFold = true;
} else if (i == 0) { // If operand 0
- if (MI->getOpcode() == X86::MOV64r0)
- NewMI = MakeM0Inst(*this, X86::MOV64mi32, MOs, MI);
- else if (MI->getOpcode() == X86::MOV32r0)
- NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
- else if (MI->getOpcode() == X86::MOV16r0)
- NewMI = MakeM0Inst(*this, X86::MOV16mi, MOs, MI);
- else if (MI->getOpcode() == X86::MOV8r0)
- NewMI = MakeM0Inst(*this, X86::MOV8mi, MOs, MI);
+ unsigned Opc = 0;
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::MOV64r0: Opc = X86::MOV64mi32; break;
+ case X86::MOV32r0: Opc = X86::MOV32mi; break;
+ case X86::MOV16r0: Opc = X86::MOV16mi; break;
+ case X86::MOV8r0: Opc = X86::MOV8mi; break;
+ }
+ if (Opc)
+ NewMI = MakeM0Inst(*this, Opc, MOs, MI);
if (NewMI)
return NewMI;
@@ -3793,15 +3847,12 @@
Alignment = (*LoadMI->memoperands_begin())->getAlignment();
else
switch (LoadMI->getOpcode()) {
- case X86::AVX_SET0PSY:
- case X86::AVX_SET0PDY:
case X86::AVX2_SETALLONES:
- case X86::AVX2_SET0:
+ case X86::AVX_SET0:
Alignment = 32;
break;
case X86::V_SET0:
case X86::V_SETALLONES:
- case X86::AVX_SETALLONES:
Alignment = 16;
break;
case X86::FsFLD0SD:
@@ -3837,11 +3888,8 @@
switch (LoadMI->getOpcode()) {
case X86::V_SET0:
case X86::V_SETALLONES:
- case X86::AVX_SET0PSY:
- case X86::AVX_SET0PDY:
- case X86::AVX_SETALLONES:
case X86::AVX2_SETALLONES:
- case X86::AVX2_SET0:
+ case X86::AVX_SET0:
case X86::FsFLD0SD:
case X86::FsFLD0SS: {
// Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
@@ -3873,15 +3921,12 @@
Ty = Type::getFloatTy(MF.getFunction()->getContext());
else if (Opc == X86::FsFLD0SD)
Ty = Type::getDoubleTy(MF.getFunction()->getContext());
- else if (Opc == X86::AVX_SET0PSY || Opc == X86::AVX_SET0PDY)
- Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8);
- else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX2_SET0)
+ else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
- bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES ||
- Opc == X86::AVX2_SETALLONES);
+ bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES);
const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
Constant::getNullValue(Ty);
unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
@@ -3956,6 +4001,8 @@
OpcodeTablePtr = &RegOp2MemOpTable1;
} else if (OpNum == 2) {
OpcodeTablePtr = &RegOp2MemOpTable2;
+ } else if (OpNum == 3) {
+ OpcodeTablePtr = &RegOp2MemOpTable3;
}
if (OpcodeTablePtr && OpcodeTablePtr->count(Opc))
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index d293156..304676d 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -114,7 +114,7 @@
def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
- [SDNPHasChain]>;
+ [SDNPHasChain,SDNPSideEffect]>;
def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
[SDNPHasChain]>;
def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER,
@@ -552,14 +552,21 @@
def Has3DNow : Predicate<"Subtarget->has3DNow()">;
def Has3DNowA : Predicate<"Subtarget->has3DNowA()">;
def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
+def UseSSE1 : Predicate<"Subtarget->hasSSE1() && Subtarget->hasNoAVX()">;
def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
+def UseSSE2 : Predicate<"Subtarget->hasSSE2() && Subtarget->hasNoAVX()">;
def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
+def UseSSE3 : Predicate<"Subtarget->hasSSE3() && Subtarget->hasNoAVX()">;
def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
+def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && Subtarget->hasNoAVX()">;
def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
+def UseSSE41 : Predicate<"Subtarget->hasSSE41() && Subtarget->hasNoAVX()">;
def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
+def UseSSE42 : Predicate<"Subtarget->hasSSE42() && Subtarget->hasNoAVX()">;
def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
def HasAVX : Predicate<"Subtarget->hasAVX()">;
def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
+def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
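
These predicate definitions carry the main idea behind the many HasSSEn -> UseSSEn swaps in X86InstrFormats.td: HasSSEn still means "the feature exists" (what legalization and intrinsics care about), while UseSSEn means "emit the legacy SSE encoding", which is only wanted when AVX is absent, since the VEX-encoded form should win whenever AVX is on. The MMX-operand templates keep plain HasSSEn, apparently because they have no VEX counterparts to defer to, and HasAVX1Only covers patterns that are valid on AVX but superseded on AVX2. A sketch of the split (Subtarget here is a stand-in, with hasNoAVX assumed to be the obvious negation):

    struct Subtarget {
      bool SSE2 = true, AVX = false;
      bool hasSSE2() const { return SSE2; }
      bool hasNoAVX() const { return !AVX; }
    };

    // HasSSE2: the ISA level is available at all.
    static bool HasSSE2(const Subtarget &S) { return S.hasSSE2(); }
    // UseSSE2: select the legacy encoding only when no AVX form exists.
    static bool UseSSE2(const Subtarget &S) {
      return S.hasSSE2() && S.hasNoAVX();
    }

    int main() {
      Subtarget AVXMachine{true, true};
      // On an AVX machine SSE2 is present but legacy encodings are not used.
      return (HasSSE2(AVXMachine) && !UseSSE2(AVXMachine)) ? 0 : 1;
    }
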
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index c8f40bb..bd54858 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -118,11 +118,11 @@
/// Unary MMX instructions requiring SSSE3.
multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
Intrinsic IntId64, OpndItins itins> {
- def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>;
- def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR64:$dst,
(IntId64 (bitconvert (memopmmx addr:$src))))],
@@ -134,11 +134,11 @@
multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
Intrinsic IntId64, OpndItins itins> {
let isCommutable = 0 in
- def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>;
- def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst,
@@ -149,11 +149,11 @@
/// PALIGN MMX instructions (require SSSE3).
multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
- def R64irr : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+ def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2, i8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>;
- def R64irm : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+ def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2, i8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR64:$dst, (IntId VR64:$src1,
@@ -163,12 +163,10 @@
multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
string asm, OpndItins itins, Domain d> {
- def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- [(set DstRC:$dst, (Int SrcRC:$src))],
- itins.rr, d>;
- def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- [(set DstRC:$dst, (Int (ld_frag addr:$src)))],
- itins.rm, d>;
+ def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+ [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>;
+ def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+ [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>;
}
multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
@@ -243,29 +241,30 @@
[(store (x86mmx VR64:$src), addr:$dst)],
IIC_MMX_MOVQ_RM>;
-def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
- (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst,
- (x86mmx (bitconvert
- (i64 (vector_extract (v2i64 VR128:$src),
- (iPTR 0))))))],
- IIC_MMX_MOVQ_RR>;
+def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+ (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (bitconvert
+ (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))))],
+ IIC_MMX_MOVQ_RR>;
-def MMX_MOVQ2DQrr : S2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
- (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v2i64 (scalar_to_vector
- (i64 (bitconvert (x86mmx VR64:$src))))))],
- IIC_MMX_MOVQ_RR>;
+def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64
+ (scalar_to_vector
+ (i64 (bitconvert (x86mmx VR64:$src))))))],
+ IIC_MMX_MOVQ_RR>;
let neverHasSideEffects = 1 in
-def MMX_MOVQ2FR64rr: S2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
- (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", [],
- IIC_MMX_MOVQ_RR>;
+def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+ [], IIC_MMX_MOVQ_RR>;
-def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
- (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", [],
- IIC_MMX_MOVQ_RR>;
+def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+ (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+ [], IIC_MMX_MOVQ_RR>;
def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movntq\t{$src, $dst|$dst, $src}",
@@ -577,6 +576,7 @@
IIC_MMX_MASKMOV>;
// 64-bit bit convert.
+let Predicates = [HasSSE2] in {
def : Pat<(x86mmx (bitconvert (i64 GR64:$src))),
(MMX_MOVD64to64rr GR64:$src)>;
def : Pat<(i64 (bitconvert (x86mmx VR64:$src))),
@@ -585,5 +585,6 @@
(MMX_MOVQ2FR64rr VR64:$src)>;
def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
(MMX_MOVFR642Qrr FR64:$src)>;
+}
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 220c06d..17e91a6 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -251,35 +251,37 @@
// A 128-bit subvector extract from the first 256-bit vector position
// is a subregister copy that needs no instruction.
-def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (i32 0))),
+def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
-def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (i32 0))),
+def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
(v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;
-def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (i32 0))),
+def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
(v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
-def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (i32 0))),
+def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
(v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;
-def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (i32 0))),
+def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
(v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
-def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (i32 0))),
+def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
(v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
// A 128-bit subvector insert to the first 256-bit vector position
// is a subregister copy that needs no instruction.
-def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)),
+let AddedComplexity = 25 in { // to give priority over vinsertf128rm
+def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
(INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
(INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
(INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
(INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
(INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+}
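
A recurring change in this file: constant subvector indices switch from (i32 0) to (iPTR 0). SelectionDAG builds the index operand of extract_subvector/insert_subvector as a pointer-sized constant, so an i32-typed immediate in a pattern fails to match on 64-bit targets. A fragment of how such a node is typically created (standard SelectionDAG API of this vintage, with DAG, dl, and WideVec assumed in scope):

// The index comes from getIntPtrConstant, so only (iPTR 0) can match it.
SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32,
                         WideVec, DAG.getIntPtrConstant(0));
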
// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
@@ -362,7 +364,7 @@
def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>;
}
-// Alias instructions that map fld0 to pxor for sse.
+// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1 in {
@@ -382,11 +384,11 @@
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, neverHasSideEffects = 1 in {
-def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", []>;
+ isPseudo = 1 in {
+def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}
-def : Pat<(v4f32 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
@@ -394,35 +396,29 @@
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
-// The same as done above but for AVX. The 256-bit ISA does not support PI,
+// The same as above, but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on sandy bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty
-// FIXME: Change encoding to pseudo! This is blocked right now by the x86
-// JIT implementatioan, it does not expand the instructions below like
-// X86MCInstLower does.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isCodeGenOnly = 1 in {
-let Predicates = [HasAVX] in {
-def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
- [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V;
-def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
- [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V;
-}
-let Predicates = [HasAVX2], neverHasSideEffects = 1 in
-def AVX2_SET0 : PDI<0xef, MRMInitReg, (outs VR256:$dst), (ins), "",
- []>, VEX_4V;
+ isPseudo = 1, Predicates = [HasAVX] in {
+def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8f32 immAllZerosV))]>;
}
-let Predicates = [HasAVX2], AddedComplexity = 5 in {
- def : Pat<(v4i64 immAllZerosV), (AVX2_SET0)>;
- def : Pat<(v8i32 immAllZerosV), (AVX2_SET0)>;
- def : Pat<(v16i16 immAllZerosV), (AVX2_SET0)>;
- def : Pat<(v32i8 immAllZerosV), (AVX2_SET0)>;
+let Predicates = [HasAVX] in
+ def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
+
+let Predicates = [HasAVX2] in {
+ def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
+ def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
+ def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+ def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
}
-// AVX has no support for 256-bit integer instructions, but since the 128-bit
+// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
 // VPXOR instruction writes zero to its upper part, it's safe to build zeros.
+let Predicates = [HasAVX1Only] in {
def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
(SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
@@ -438,22 +434,17 @@
def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
(SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+}
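
Two new predicate families drive most of this file's churn. HasAVX1Only guards patterns that should apply only when AVX is available but AVX2 is not, and the UseSSE1/UseSSE2/UseSSE3/UseSSSE3/UseSSE41/UseSSE42 predicates appearing below keep legacy SSE patterns from competing with their VEX-encoded AVX counterparts, replacing the old AddedComplexity juggling. In subtarget terms the intent is presumably this (a sketch of the logic; the actual Predicate definitions live in X86InstrInfo.td, outside this hunk):

// Hypothetical helpers mirroring the new .td predicates. hasNoAVX() is the
// accessor this patch adds to X86Subtarget.h near the end of the diff.
static bool useSSE2(const X86Subtarget &ST)     { return ST.hasSSE2() && ST.hasNoAVX(); }
static bool hasAVX1Only(const X86Subtarget &ST) { return ST.hasAVX() && !ST.hasAVX2(); }
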
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
-// FIXME: Change encoding to pseudo! This is blocked right now by the x86
-// JIT implementation, it does not expand the instructions below like
-// X86MCInstLower does.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isCodeGenOnly = 1, ExeDomain = SSEPackedInt in {
- let Predicates = [HasAVX] in
- def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
- [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V;
- def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
- [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+ isPseudo = 1 in {
+ def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4i32 immAllOnesV))]>;
let Predicates = [HasAVX2] in
- def AVX2_SETALLONES : PDI<0x76, MRMInitReg, (outs VR256:$dst), (ins), "",
- [(set VR256:$dst, (v8i32 immAllOnesV))]>, VEX_4V;
+ def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
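
With V_SET0, AVX_SET0, V_SETALLONES, and AVX2_SETALLONES turned into genuine pseudos, their lowering leaves X86MCInstLower (see the cases deleted there later in this patch) and moves to post-RA pseudo expansion. A sketch of the matching X86InstrInfo::expandPostRAPseudo cases, under the assumption that the existing Expand2AddrUndef helper is reused:

bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
  bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
  switch (MI->getOpcode()) {
  case X86::V_SET0:
    // xorps (or vxorps) of a register with itself yields all zeros.
    return Expand2AddrUndef(MI, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
  case X86::AVX_SET0:
    assert(HasAVX && "AVX not supported");
    return Expand2AddrUndef(MI, get(X86::VXORPSYrr));
  case X86::V_SETALLONES:
    // pcmpeqd of a register with itself sets every bit.
    return Expand2AddrUndef(MI, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
  case X86::AVX2_SETALLONES:
    return Expand2AddrUndef(MI, get(X86::VPCMPEQDYrr));
  }
  return false;
}
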
@@ -605,27 +596,27 @@
// Represent the same patterns above but in the form they appear for
// 256-bit types
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))), (i32 0)))),
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))),
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))),
+ (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector FR32:$src)), (i32 0)))),
+ (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
(SUBREG_TO_REG (i32 0),
(v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector FR64:$src)), (i32 0)))),
+ (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
(SUBREG_TO_REG (i64 0),
(v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector (loadi64 addr:$src))), (i32 0)))),
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;
// Move low f64 and clear high bits.
@@ -704,7 +695,7 @@
(VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
let AddedComplexity = 15 in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
@@ -738,7 +729,7 @@
(MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
let AddedComplexity = 15 in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSD to the lower bits.
@@ -916,16 +907,16 @@
let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86vzmovl
- (insert_subvector undef, (v4i32 VR128:$src), (i32 0)))),
+ (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl
- (insert_subvector undef, (v2i64 VR128:$src), (i32 0)))),
+ (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl
- (insert_subvector undef, (v4f32 VR128:$src), (i32 0)))),
+ (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl
- (insert_subvector undef, (v2f64 VR128:$src), (i32 0)))),
+ (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
}
@@ -975,10 +966,10 @@
(VMOVUPDmr addr:$dst, VR128:$src)>;
}
-let Predicates = [HasSSE1] in
+let Predicates = [UseSSE1] in
def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
(MOVUPSmr addr:$dst, VR128:$src)>;
-let Predicates = [HasSSE2] in
+let Predicates = [UseSSE2] in
def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
(MOVUPDmr addr:$dst, VR128:$src)>;
@@ -1028,12 +1019,52 @@
(VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v32i8 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+ // Special patterns for storing subvector extracts of the lower 128 bits
+ // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
+ def : Pat<(alignedstore (v2f64 (extract_subvector
+ (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4f32 (extract_subvector
+ (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v2i64 (extract_subvector
+ (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4i32 (extract_subvector
+ (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v8i16 (extract_subvector
+ (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v16i8 (extract_subvector
+ (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+
+ def : Pat<(store (v2f64 (extract_subvector
+ (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v4f32 (extract_subvector
+ (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v2i64 (extract_subvector
+ (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v4i32 (extract_subvector
+ (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v8i16 (extract_subvector
+ (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v16i8 (extract_subvector
+ (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}
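
These patterns exist because the low 128 bits of a YMM register are already an XMM register; storing them needs no extract at all. The expected effect on user code, sketched with intrinsics (an expectation, not a guarantee):

#include <immintrin.h>

// Should compile to one vmovups of the xmm subregister rather than a
// vextractf128 with a memory operand.
static void store_low(float *dst, __m256 v) {
  _mm_storeu_ps(dst, _mm256_castps256_ps128(v));
}
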
// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
def : Pat<(alignedloadv2i64 addr:$src),
(MOVAPSrm addr:$src)>;
def : Pat<(loadv2i64 addr:$src),
@@ -1180,7 +1211,7 @@
(VMOVLPDmr addr:$src1, VR128:$src2)>;
}
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
(iPTR 0))), addr:$src1),
@@ -1205,7 +1236,7 @@
(MOVLPSmr addr:$src1, VR128:$src2)>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
// Shuffle with MOVLPD
def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
(MOVLPDrm VR128:$src1, addr:$src2)>;
@@ -1279,7 +1310,7 @@
(VMOVHPDrm VR128:$src1, addr:$src2)>;
}
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
// MOVHPS patterns
def : Pat<(X86Movlhps VR128:$src1,
(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
@@ -1289,7 +1320,7 @@
(MOVHPSrm VR128:$src1, addr:$src2)>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
// FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
// is during lowering, where it's not possible to recognize the load fold
// cause it has two uses through a bitcast. One use disappears at isel time
@@ -1346,7 +1377,7 @@
(VMOVHLPSrr VR128:$src1, VR128:$src2)>;
}
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
// MOVLHPS patterns
def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
(MOVLHPSrr VR128:$src1, VR128:$src2)>;
@@ -1456,7 +1487,7 @@
def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;
-let Predicates = [HasAVX], AddedComplexity = 1 in {
+let Predicates = [HasAVX] in {
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
@@ -1633,7 +1664,7 @@
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
- TB, Requires<[HasSSE2]>;
+ TB, Requires<[UseSSE2]>;
/// SSE 2 Only
@@ -1663,7 +1694,7 @@
[(set FR32:$dst, (fround (loadf64 addr:$src)))],
IIC_SSE_CVT_Scalar_RM>,
XD,
- Requires<[HasSSE2, OptForSize]>;
+ Requires<[UseSSE2, OptForSize]>;
def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
@@ -1684,13 +1715,13 @@
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XD, Requires<[HasSSE2]>;
+ IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>;
def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
VR128:$src1, sse_load_f64:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XD, Requires<[HasSSE2]>;
+ IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>;
}
// Convert scalar single to scalar double
@@ -1709,30 +1740,28 @@
XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>;
}
-let AddedComplexity = 1 in { // give AVX priority
- def : Pat<(f64 (fextend FR32:$src)),
- (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>;
- def : Pat<(fextend (loadf32 addr:$src)),
- (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>;
+def : Pat<(f64 (fextend FR32:$src)),
+ (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(fextend (loadf32 addr:$src)),
+ (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>;
- def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[HasAVX, OptForSize]>;
- def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
- Requires<[HasAVX, OptForSpeed]>;
-} // AddedComplexity = 1
+def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
+ Requires<[HasAVX, OptForSpeed]>;
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (fextend FR32:$src))],
IIC_SSE_CVT_Scalar_RR>, XS,
- Requires<[HasSSE2]>;
+ Requires<[UseSSE2]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (extloadf32 addr:$src))],
IIC_SSE_CVT_Scalar_RM>, XS,
- Requires<[HasSSE2, OptForSize]>;
+ Requires<[UseSSE2, OptForSize]>;
// extload f32 -> f64. This matches load+fextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
@@ -1740,9 +1769,9 @@
// Since these loads aren't folded into the fextend, we have to match it
// explicitly here.
def : Pat<(fextend (loadf32 addr:$src)),
- (CVTSS2SDrm addr:$src)>, Requires<[HasSSE2]>;
+ (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
def : Pat<(extloadf32 addr:$src),
- (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[HasSSE2, OptForSpeed]>;
+ (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
@@ -1762,13 +1791,13 @@
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XS, Requires<[HasSSE2]>;
+ IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>;
def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XS, Requires<[HasSSE2]>;
+ IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>;
}
// Convert packed single/double fp to doubleword
@@ -1904,7 +1933,7 @@
(VCVTTPS2DQYrm addr:$src)>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
(CVTDQ2PSrr VR128:$src)>;
def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
@@ -1978,10 +2007,10 @@
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
IIC_SSE_CVT_PD_RR>, TB, VEX;
-let neverHasSideEffects = 1, mayLoad = 1 in
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RM>, TB, VEX;
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, TB, VEX;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
@@ -1994,15 +2023,15 @@
IIC_SSE_CVT_PD_RM>, TB, VEX;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
IIC_SSE_CVT_PD_RR>, TB;
-let neverHasSideEffects = 1, mayLoad = 1 in
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
- "cvtps2pd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RM>, TB;
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, TB;
}
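
Giving the memory form of (v)cvtps2pd a real pattern, (v2f64 (extloadv2f32 addr:$src)), lets the 64-bit float-pair load fold into the conversion instead of requiring a separate load. The shape of source code this targets (illustrative C++ only):

// With the pattern above, the two float loads plus the widening should
// become a single cvtps2pd with a memory operand.
static void widen2(double *d, const float *s) {
  d[0] = s[0];
  d[1] = s[1];
}
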
// Convert Packed DW Integers to Packed Double FP
@@ -2105,11 +2134,11 @@
(VCVTPS2PDrr VR128:$src)>;
def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
(VCVTPS2PDYrr VR128:$src)>;
- def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))),
+ def : Pat<(v4f64 (extloadv4f32 addr:$src)),
(VCVTPS2PDYrm addr:$src)>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
// Match fextend for 128 conversions
def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
(CVTPS2PDrr VR128:$src)>;
@@ -2336,14 +2365,14 @@
(VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
}
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
(CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
(CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
(CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
@@ -2420,7 +2449,7 @@
(VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
}
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
def : Pat<(v4i32 (X86Shufp VR128:$src1,
(bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
(SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
@@ -2428,7 +2457,7 @@
(SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
// Generic SHUFPD patterns
def : Pat<(v2i64 (X86Shufp VR128:$src1,
(memopv2i64 addr:$src2), (i8 imm:$imm))),
@@ -2500,7 +2529,27 @@
SSEPackedDouble>, TB, OpSize;
} // Constraints = "$src1 = $dst"
-let Predicates = [HasAVX], AddedComplexity = 1 in {
+let Predicates = [HasAVX1Only] in {
+ def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+ (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
+ (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+ (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
+ (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))),
+ (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
+ (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))),
+ (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
+ (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+}
+
+let Predicates = [HasAVX] in {
// FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
// problem is during lowering, where it's not possible to recognize the load
// fold cause it has two uses through a bitcast. One use disappears at isel
@@ -2509,7 +2558,7 @@
(VUNPCKLPDrr VR128:$src, VR128:$src)>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
// FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
// problem is during lowering, where it's not possible to recognize the load
// fold cause it has two uses through a bitcast. One use disappears at isel
@@ -2578,16 +2627,16 @@
def : Pat<(i32 (X86fgetsign FR32:$src)),
(MOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>,
- Requires<[HasSSE1]>;
+ Requires<[UseSSE1]>;
def : Pat<(i64 (X86fgetsign FR32:$src)),
(MOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>,
- Requires<[HasSSE1]>;
+ Requires<[UseSSE1]>;
def : Pat<(i32 (X86fgetsign FR64:$src)),
(MOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>,
- Requires<[HasSSE2]>;
+ Requires<[UseSSE2]>;
def : Pat<(i64 (X86fgetsign FR64:$src)),
(MOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>,
- Requires<[HasSSE2]>;
+ Requires<[UseSSE2]>;
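
X86fgetsign reads a scalar float's sign bit by running movmskps/movmskpd over the XMM register holding it, hence the COPY_TO_REGCLASS to VR128 above. The same trick written with intrinsics (illustration only):

#include <xmmintrin.h>

// movmskps packs the four lane sign bits into an integer; bit 0 is the
// sign of the scalar in the low lane.
static int sign_lo(__m128 x) { return _mm_movemask_ps(x) & 1; }
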
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
@@ -2683,14 +2732,12 @@
}
// Alias bitwise logical operations using SSE logical ops on packed FP values.
-let mayLoad = 0 in {
- defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
- SSE_BIT_ITINS_P>;
- defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for,
- SSE_BIT_ITINS_P>;
- defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
- SSE_BIT_ITINS_P>;
-}
+defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
+ SSE_BIT_ITINS_P>;
+defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for,
+ SSE_BIT_ITINS_P>;
+defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
+ SSE_BIT_ITINS_P>;
let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef,
@@ -2794,27 +2841,23 @@
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
SizeItins itins,
bit Is2Addr = 1> {
- let mayLoad = 0 in {
defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
v4f32, f128mem, memopv4f32, SSEPackedSingle, itins.s, Is2Addr>,
TB;
defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
v2f64, f128mem, memopv2f64, SSEPackedDouble, itins.d, Is2Addr>,
TB, OpSize;
- }
}
multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr,
SDNode OpNode,
SizeItins itins> {
- let mayLoad = 0 in {
- defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256,
+ defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256,
v8f32, f256mem, memopv8f32, SSEPackedSingle, itins.s, 0>,
TB;
- defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256,
+ defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256,
v4f64, f256mem, memopv4f64, SSEPackedDouble, itins.d, 0>,
TB, OpSize;
- }
}
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
@@ -2924,7 +2967,7 @@
}
}
-let isCommutable = 1, isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1 in {
defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>,
VEX_4V, VEX_LIG;
defm VMAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P, 0>,
@@ -2978,7 +3021,7 @@
def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
[(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
- Requires<[HasSSE1, OptForSize]>;
+ Requires<[UseSSE1, OptForSize]>;
def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (F32Int VR128:$src))], itins.rr>;
@@ -2992,7 +3035,7 @@
def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
!strconcat(OpcodeStr,
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
- let mayLoad = 1 in
+ let mayLoad = 1 in {
def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1,f32mem:$src2),
!strconcat(OpcodeStr,
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
@@ -3000,6 +3043,7 @@
(ins VR128:$src1, ssmem:$src2),
!strconcat(OpcodeStr,
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+ }
}
/// sse1_fp_unop_p - SSE1 unops in packed form.
@@ -3062,7 +3106,7 @@
def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
!strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
[(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
- Requires<[HasSSE2, OptForSize]>;
+ Requires<[UseSSE2, OptForSize]>;
def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (F64Int VR128:$src))], itins.rr>;
@@ -3072,20 +3116,20 @@
}
/// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form.
+let hasSideEffects = 0 in
multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
- let neverHasSideEffects = 1 in {
def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
!strconcat(OpcodeStr,
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
- let mayLoad = 1 in
+ let mayLoad = 1 in {
def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1,f64mem:$src2),
!strconcat(OpcodeStr,
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
- }
def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, sdmem:$src2),
!strconcat(OpcodeStr,
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+ }
}
/// sse2_fp_unop_p - SSE2 unops in vector forms.
@@ -3176,7 +3220,6 @@
SSE_RCPP>, VEX;
}
-let AddedComplexity = 1 in {
def : Pat<(f32 (fsqrt FR32:$src)),
(VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
def : Pat<(f32 (fsqrt (load addr:$src))),
@@ -3199,9 +3242,8 @@
def : Pat<(f32 (X86frcp (load addr:$src))),
(VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
Requires<[HasAVX, OptForSize]>;
-}
-let Predicates = [HasAVX], AddedComplexity = 1 in {
+let Predicates = [HasAVX] in {
def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
(COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
(COPY_TO_REGCLASS VR128:$src, FR32)),
@@ -3322,7 +3364,7 @@
IIC_SSE_MOVNT>;
def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
- (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+ (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
// There is no AVX form for instructions below this point
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
@@ -3482,7 +3524,7 @@
def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>;
+ [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
// For Disassembler
let isCodeGenOnly = 1 in {
@@ -3492,7 +3534,7 @@
def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>;
+ [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
}
let canFoldAsLoad = 1, mayLoad = 1 in {
@@ -3504,7 +3546,7 @@
"movdqu\t{$src, $dst|$dst, $src}",
[/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
IIC_SSE_MOVU_P_RM>,
- XS, Requires<[HasSSE2]>;
+ XS, Requires<[UseSSE2]>;
}
let mayStore = 1 in {
@@ -3516,7 +3558,7 @@
"movdqu\t{$src, $dst|$dst, $src}",
[/*(store (v2i64 VR128:$src), addr:$dst)*/],
IIC_SSE_MOVU_P_MR>,
- XS, Requires<[HasSSE2]>;
+ XS, Requires<[UseSSE2]>;
}
// Intrinsic forms of MOVDQU load and store
@@ -3530,7 +3572,7 @@
"movdqu\t{$src, $dst|$dst, $src}",
[(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)],
IIC_SSE_MOVU_P_MR>,
- XS, Requires<[HasSSE2]>;
+ XS, Requires<[UseSSE2]>;
} // ExeDomain = SSEPackedInt
@@ -4028,7 +4070,7 @@
(VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
(PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
@@ -4210,7 +4252,7 @@
defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>, XD, VEX;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
let AddedComplexity = 5 in
defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, X86PShufd>, TB, OpSize;
@@ -4325,28 +4367,6 @@
}
} // ExeDomain = SSEPackedInt
-// Patterns for using AVX1 instructions with integer vectors
-// Here to give AVX2 priority
-let Predicates = [HasAVX] in {
- def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
- (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
- (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
- (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
- (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
-
- def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))),
- (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
- (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))),
- (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
- (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
-}
-
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Extract and Insert
//===---------------------------------------------------------------------===//
@@ -4395,7 +4415,7 @@
}
let Constraints = "$src1 = $dst" in
- defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[HasSSE2]>;
+ defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[UseSSE2]>;
} // ExeDomain = SSEPackedInt
@@ -4556,7 +4576,7 @@
// Move Packed Doubleword Int first element to Doubleword Int
//
def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
- "mov{d|q}\t{$src, $dst|$dst, $src}",
+ "vmov{d|q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
(iPTR 0)))],
IIC_SSE_MOVD_ToGP>,
@@ -4672,14 +4692,14 @@
}
// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector GR32:$src)),(i32 0)))),
+ (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(i32 0)))),
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
}
-let Predicates = [HasSSE2], AddedComplexity = 20 in {
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
(MOVZDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
@@ -4719,7 +4739,7 @@
[(set VR128:$dst,
(v2i64 (scalar_to_vector (loadi64 addr:$src))))],
IIC_SSE_MOVDQ>, XS,
- Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix
+ Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
@@ -4762,7 +4782,7 @@
(v2i64 (X86vzmovl (v2i64 (scalar_to_vector
(loadi64 addr:$src))))))],
IIC_SSE_MOVDQ>,
- XS, Requires<[HasSSE2]>;
+ XS, Requires<[UseSSE2]>;
let Predicates = [HasAVX], AddedComplexity = 20 in {
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
@@ -4773,7 +4793,7 @@
(VMOVZQI2PQIrm addr:$src)>;
}
-let Predicates = [HasSSE2], AddedComplexity = 20 in {
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
(MOVZQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
@@ -4803,7 +4823,7 @@
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
IIC_SSE_MOVQ_RR>,
- XS, Requires<[HasSSE2]>;
+ XS, Requires<[UseSSE2]>;
let AddedComplexity = 20 in
def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
@@ -4818,7 +4838,7 @@
[(set VR128:$dst, (v2i64 (X86vzmovl
(loadv2i64 addr:$src))))],
IIC_SSE_MOVDQ>,
- XS, Requires<[HasSSE2]>;
+ XS, Requires<[UseSSE2]>;
}
let AddedComplexity = 20 in {
@@ -4828,7 +4848,7 @@
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
(VMOVZPQILo2PQIrr VR128:$src)>;
}
- let Predicates = [HasSSE2] in {
+ let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
(MOVZPQILo2PQIrm addr:$src)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
@@ -4908,7 +4928,7 @@
(VMOVSLDUPYrm addr:$src)>;
}
-let Predicates = [HasSSE3] in {
+let Predicates = [UseSSE3] in {
def : Pat<(v4i32 (X86Movshdup VR128:$src)),
(MOVSHDUPrr VR128:$src)>;
def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
@@ -4977,7 +4997,7 @@
(VMOVDDUPYrr VR256:$src)>;
}
-let Predicates = [HasSSE3] in {
+let Predicates = [UseSSE3] in {
def : Pat<(X86Movddup (memopv2f64 addr:$src)),
(MOVDDUPrm addr:$src)>;
def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
@@ -5041,7 +5061,7 @@
f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V;
}
}
-let Constraints = "$src1 = $dst", Predicates = [HasSSE3] in {
+let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
let ExeDomain = SSEPackedSingle in
defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
f128mem, SSE_ALU_F32P>, TB, XD;
@@ -5424,7 +5444,7 @@
defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V;
let Predicates = [HasAVX2] in
defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V;
-let Constraints = "$src1 = $dst", Predicates = [HasSSSE3] in
+let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
defm PALIGN : ssse3_palign<"palignr">;
let Predicates = [HasAVX2] in {
@@ -5449,7 +5469,7 @@
(VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}
-let Predicates = [HasSSSE3] in {
+let Predicates = [UseSSSE3] in {
def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
@@ -5583,7 +5603,7 @@
(VPMOVZXDQrm addr:$src)>;
}
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
// Common patterns involving scalar load.
def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
(PMOVSXBWrm addr:$src)>;
@@ -5633,7 +5653,7 @@
def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
}
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
}
@@ -5704,7 +5724,7 @@
(VPMOVZXWQrm addr:$src)>;
}
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
// Common patterns involving scalar load
def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
(PMOVSXBDrm addr:$src)>;
@@ -5772,7 +5792,7 @@
(VPMOVZXBQrm addr:$src)>;
}
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
// Common patterns involving scalar load
def : Pat<(int_x86_sse41_pmovsxbq
(bitconvert (v4i32 (X86vzmovl
@@ -5918,7 +5938,7 @@
imm:$src2))),
addr:$dst),
(EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
- Requires<[HasSSE41]>;
+ Requires<[UseSSE41]>;
//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
@@ -6190,6 +6210,15 @@
(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
def : Pat<(f64 (ftrunc FR64:$src)),
(VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+
+ def : Pat<(v4f32 (ffloor VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0x1))>;
+ def : Pat<(v2f64 (ffloor VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0x1))>;
+ def : Pat<(v8f32 (ffloor VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0x1))>;
+ def : Pat<(v4f64 (ffloor VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0x1))>;
}
defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
@@ -6199,26 +6228,33 @@
defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round",
int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
-def : Pat<(ffloor FR32:$src),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
-def : Pat<(f64 (ffloor FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
-def : Pat<(f32 (fnearbyint FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
-def : Pat<(f64 (fnearbyint FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
-def : Pat<(f32 (fceil FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
-def : Pat<(f64 (fceil FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
-def : Pat<(f32 (frint FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
-def : Pat<(f64 (frint FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
-def : Pat<(f32 (ftrunc FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
-def : Pat<(f64 (ftrunc FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+let Predicates = [UseSSE41] in {
+ def : Pat<(ffloor FR32:$src),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
+ def : Pat<(f64 (ffloor FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
+ def : Pat<(f32 (fnearbyint FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+ def : Pat<(f64 (fnearbyint FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+ def : Pat<(f32 (fceil FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
+ def : Pat<(f64 (fceil FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
+ def : Pat<(f32 (frint FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+ def : Pat<(f64 (frint FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+ def : Pat<(f32 (ftrunc FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
+ def : Pat<(f64 (ftrunc FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+
+ def : Pat<(v4f32 (ffloor VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0x1))>;
+ def : Pat<(v2f64 (ffloor VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0x1))>;
+}
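
The round immediates used above encode the SSE4.1 rounding control: 0x1 rounds toward -inf (ffloor), 0x2 toward +inf (fceil), 0x3 toward zero (ftrunc), 0x4 uses the current MXCSR mode (frint), and 0xC is the current mode with precision exceptions suppressed (fnearbyint). The standard intrinsics wrap the same immediates, e.g.:

#include <smmintrin.h>

// _MM_FROUND_FLOOR expands to the 0x1 immediate the ffloor patterns use.
static __m128 floor4(__m128 v) { return _mm_round_ps(v, _MM_FROUND_FLOOR); }
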
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
@@ -6356,7 +6392,7 @@
(bitconvert (memopv2i64 addr:$src2))))]>, OpSize;
}
-/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
+/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator
multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
Intrinsic IntId256> {
let isCommutable = 1 in
@@ -6705,7 +6741,7 @@
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
(PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
(v16i8 VR128:$src2))),
(PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
@@ -6802,9 +6838,8 @@
}
let Defs = [EFLAGS], usesCustomInserter = 1 in {
- let AddedComplexity = 1 in
- defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
- defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[HasSSE42]>;
+ defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
+ defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>;
}
let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1, Predicates = [HasAVX] in {
@@ -6840,9 +6875,8 @@
}
let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
- let AddedComplexity = 1 in
- defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
- defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[HasSSE42]>;
+ defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
+ defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>;
}
let Predicates = [HasAVX],
@@ -7237,40 +7271,59 @@
let Predicates = [HasAVX] in {
def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
- (i32 imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
- (i32 imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
- (i32 imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
- (i32 imm)),
- (VINSERTF128rr VR256:$src1, VR128:$src2,
- (INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
- (i32 imm)),
+def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2),
+ (iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
- (i32 imm)),
+def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2),
+ (iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
- (i32 imm)),
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
+ (iPTR imm)),
+ (VINSERTF128rr VR256:$src1, VR128:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+ (bc_v4i32 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+ (bc_v16i8 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+ (bc_v8i16 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
}
@@ -7290,56 +7343,61 @@
[]>, VEX;
}
-// Extract and store.
-let Predicates = [HasAVX] in {
- def : Pat<(alignedstore (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
- def : Pat<(alignedstore (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
- def : Pat<(alignedstore (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
-
- def : Pat<(int_x86_sse_storeu_ps addr:$dst, (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2)),
- (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
- def : Pat<(int_x86_sse2_storeu_pd addr:$dst, (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2)),
- (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
- def : Pat<(int_x86_sse2_storeu_dq addr:$dst, (bc_v16i8 (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2))),
- (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
-}
-
// AVX1 patterns
let Predicates = [HasAVX] in {
-def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2),
- (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2),
- (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2),
- (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v4f32 (VEXTRACTF128rr
(v8f32 VR256:$src1),
(EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v2f64 (VEXTRACTF128rr
(v4f64 VR256:$src1),
(EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+
+def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v2i64 (VEXTRACTF128rr
- (v4i64 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+ (v4i64 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v4i32 (VEXTRACTF128rr
- (v8i32 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+ (v8i32 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v8i16 (VEXTRACTF128rr
- (v16i16 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+ (v16i16 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v16i8 (VEXTRACTF128rr
- (v32i8 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+ (v32i8 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
}
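
Complementing the low-half stores earlier, extracting a high 128-bit half and storing it should now fold into one vextractf128 with a memory destination via the alignedstore patterns above. With intrinsics (expected codegen, hedged):

#include <immintrin.h>

// One vextractf128 $1, %ymm0, (%rdi) rather than extract-then-store.
static void store_high(float *dst, __m256 v) {
  _mm_store_ps(dst, _mm256_extractf128_ps(v, 1)); // dst must be 16-byte aligned
}
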
//===----------------------------------------------------------------------===//
@@ -7456,29 +7514,29 @@
}
let Predicates = [HasAVX] in {
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
+ (memopv4f64 addr:$src2), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v8f32 (X86VPerm2x128 VR256:$src1,
- (memopv8f32 addr:$src2), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
(bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
(memopv4i64 addr:$src2), (i8 imm:$imm))),
(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
- (memopv4f64 addr:$src2), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
(bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
@@ -7665,19 +7723,22 @@
}
// AVX1 broadcast patterns
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
(VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
(VBROADCASTSDYrm addr:$src)>;
+def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VBROADCASTSSrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
(VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
(VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
(VBROADCASTSSrm addr:$src)>;
-def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
- (VBROADCASTSSrm addr:$src)>;
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
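
AVX1 has no integer broadcast instruction (vpbroadcastd is AVX2), so the HasAVX1Only block routes integer splats-from-memory through vbroadcastss; only the bit pattern matters. The equivalent hand-written workaround (illustration):

#include <immintrin.h>

// Integer splat on AVX1 via the float broadcast.
static __m128i splat4(const int *p) {
  return _mm_castps_si128(_mm_broadcast_ss((const float *)p));
}
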
@@ -7757,7 +7818,6 @@
//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
//
-let AddedComplexity = 1 in {
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
@@ -7768,9 +7828,8 @@
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2),
(i8 imm:$src3)))]>, VEX_4V;
-}
-let Predicates = [HasAVX2], AddedComplexity = 1 in {
+let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
@@ -7805,23 +7864,43 @@
[]>, VEX_4V;
}
-let Predicates = [HasAVX2], AddedComplexity = 1 in {
+let Predicates = [HasAVX2] in {
def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+ (bc_v4i32 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+ (bc_v16i8 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+ (bc_v8i16 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
}
//===----------------------------------------------------------------------===//
@@ -7838,23 +7917,40 @@
(ins i128mem:$dst, VR256:$src1, i8imm:$src2),
"vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, VEX;
-let Predicates = [HasAVX2], AddedComplexity = 1 in {
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+let Predicates = [HasAVX2] in {
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v2i64 (VEXTRACTI128rr
(v4i64 VR256:$src1),
(EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v4i32 (VEXTRACTI128rr
(v8i32 VR256:$src1),
(EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v8i16 (VEXTRACTI128rr
(v16i16 VR256:$src1),
(EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v16i8 (VEXTRACTI128rr
(v32i8 VR256:$src1),
(EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index 7ac4cec..764aa5d 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -532,7 +532,7 @@
#endif
}
-template<typename T> void addUnaligned(void *Pos, T Delta) {
+template<typename T> static void addUnaligned(void *Pos, T Delta) {
T Value;
std::memcpy(reinterpret_cast<char*>(&Value), reinterpret_cast<char*>(Pos),
sizeof(T));
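
Marking the file-local addUnaligned template static gives it internal linkage, which is all this hunk changes. The memcpy read-modify-write it begins is the portable way to patch a possibly misaligned word in JITed code; the full idiom looks roughly like this (a self-contained sketch, since the hunk truncates the body):

#include <cstring>

template <typename T> static void addUnaligned(void *Pos, T Delta) {
  T Value;
  std::memcpy(&Value, Pos, sizeof(T)); // well-defined at any alignment
  Value += Delta;
  std::memcpy(Pos, &Value, sizeof(T));
}
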
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 9c0ce4e..1c2ef25 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -377,12 +377,6 @@
case X86::SETB_C64r: LowerUnaryToTwoAddr(OutMI, X86::SBB64rr); break;
case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break;
case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break;
- case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break;
- case X86::AVX_SET0PSY: LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break;
- case X86::AVX_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break;
- case X86::AVX_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break;
- case X86::AVX2_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDYrr);break;
- case X86::AVX2_SET0: LowerUnaryToTwoAddr(OutMI, X86::VPXORYrr); break;
case X86::MOV16r0:
LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 877b8f6..3b4cfc4 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -522,7 +522,7 @@
void
X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS) const{
+ int SPAdj, RegScavenger *RS) const {
assert(SPAdj == 0 && "Unexpected");
unsigned i = 0;
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 9087852..0d7b664 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -346,6 +346,7 @@
, HasVectorUAMem(false)
, HasCmpxchg16b(false)
, UseLeaForSP(false)
+ , HasSlowDivide(false)
, PostRAScheduler(false)
, stackAlignment(4)
// FIXME: this is a known good value for Yonah. How about others?
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 6841c5b..dde7e24 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -136,6 +136,10 @@
/// the stack pointer. This is an optimization for Intel Atom processors.
bool UseLeaForSP;
+ /// HasSlowDivide - True if smaller divides are significantly faster than
+ /// full divides and should be used when possible.
+ bool HasSlowDivide;
+
/// PostRAScheduler - True if using post-register-allocation scheduler.
bool PostRAScheduler;
@@ -198,6 +202,7 @@
bool hasSSE42() const { return X86SSELevel >= SSE42; }
bool hasAVX() const { return X86SSELevel >= AVX; }
bool hasAVX2() const { return X86SSELevel >= AVX2; }
+ bool hasNoAVX() const { return X86SSELevel < AVX; }
bool hasSSE4A() const { return HasSSE4A; }
bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
@@ -205,7 +210,8 @@
bool hasAES() const { return HasAES; }
bool hasPCLMUL() const { return HasPCLMUL; }
bool hasFMA() const { return HasFMA; }
- bool hasFMA4() const { return HasFMA4; }
+ // FIXME: Favor FMA when both are enabled. Is this the right thing to do?
+ bool hasFMA4() const { return HasFMA4 && !HasFMA; }
bool hasXOP() const { return HasXOP; }
bool hasMOVBE() const { return HasMOVBE; }
bool hasRDRAND() const { return HasRDRAND; }
@@ -219,6 +225,7 @@
bool hasVectorUAMem() const { return HasVectorUAMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
+ bool hasSlowDivide() const { return HasSlowDivide; }
bool isAtom() const { return X86ProcFamily == IntelAtom; }
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 80b75dc..449eed3 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -42,7 +42,6 @@
private:
const TargetInstrInfo *TII; // Machine instruction info.
- MachineBasicBlock *MBB; // Current basic block
// Any YMM register live-in to this function?
bool FnHasLiveInYmm;
@@ -84,7 +83,7 @@
// 2) All states must be clean for the result to be clean
// 3) If none above and one unknown, the result state is also unknown
//
- unsigned computeState(unsigned PrevState, unsigned CurState) {
+ static unsigned computeState(unsigned PrevState, unsigned CurState) {
if (PrevState == ST_INIT)
return CurState;
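
The merge rules spelled out in the comment above (dirty wins; clean requires all clean; otherwise unknown) amount to a small lattice join. A standalone sketch, with the ST_* values assumed rather than copied from the pass:

// Assumed state constants; the pass's actual enum values may differ.
enum { ST_CLEAN = 0, ST_DIRTY = 1, ST_INIT = 2, ST_UNKNOWN = 3 };

// Join two predecessor exit states: ST_INIT is the identity, any dirty
// input poisons the result, and only clean+clean stays clean.
static unsigned joinStates(unsigned PrevState, unsigned CurState) {
  if (PrevState == ST_INIT)
    return CurState;
  if (PrevState == ST_DIRTY || CurState == ST_DIRTY)
    return ST_DIRTY;
  if (PrevState == ST_CLEAN && CurState == ST_CLEAN)
    return ST_CLEAN;
  return ST_UNKNOWN;
}
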
@@ -122,7 +121,7 @@
}
static bool hasYmmReg(MachineInstr *MI) {
- for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg())
continue;
@@ -189,7 +188,6 @@
MachineBasicBlock &BB) {
bool Changed = false;
unsigned BBNum = BB.getNumber();
- MBB = &BB;
// Don't process already solved BBs
if (BBSolved[BBNum])
@@ -207,7 +205,7 @@
// The entry MBB for the function may set the initial state to dirty if
// the function receives any YMM incoming arguments
- if (MBB == MF.begin()) {
+ if (&BB == MF.begin()) {
EntryState = ST_CLEAN;
if (FnHasLiveInYmm)
EntryState = ST_DIRTY;
@@ -253,7 +251,7 @@
// When unknown, only compute the information within the block to have
// it available in the exit if possible, but don't change the block.
if (EntryState != ST_UNKNOWN) {
- BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER));
+ BuildMI(BB, I, dl, TII->get(X86::VZEROUPPER));
++NumVZU;
}
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index ae646a2..3e7666b 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -33,7 +33,7 @@
SDNPVariadic]>;
def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTBrind,
- [SDNPHasChain, SDNPOptInGlue]>;
+ [SDNPHasChain, SDNPOptInGlue, SDNPMayLoad]>;
def SDT_XCoreBR_JT : SDTypeProfile<0, 2,
[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
@@ -58,7 +58,7 @@
def SDT_XCoreStwsp : SDTypeProfile<0, 2, [SDTCisInt<1>]>;
def XCoreStwsp : SDNode<"XCoreISD::STWSP", SDT_XCoreStwsp,
- [SDNPHasChain]>;
+ [SDNPHasChain, SDNPMayStore]>;
// These are target-independent nodes, but have target-specific formats.
def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index cdd0a08..be5855a 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -176,7 +176,7 @@
#ifndef NDEBUG
DEBUG(errs() << "\nFunction : "
- << MF.getFunction()->getName() << "\n");
+ << MF.getName() << "\n");
DEBUG(errs() << "<--------->\n");
DEBUG(MI.print(errs()));
DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n");
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 6d950d2..b888e95 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -346,7 +346,7 @@
/// Given a value that is stored to a global but never read, determine whether
/// it's safe to remove the store and the chain of computation that feeds the
/// store.
-static bool IsSafeComputationToRemove(Value *V) {
+static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) {
do {
if (isa<Constant>(V))
return true;
@@ -355,7 +355,7 @@
if (isa<LoadInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V) ||
isa<GlobalValue>(V))
return false;
- if (isAllocationFn(V))
+ if (isAllocationFn(V, TLI))
return true;
Instruction *I = cast<Instruction>(V);
@@ -376,7 +376,8 @@
/// of the global and clean up any that obviously don't assign the global a
/// value that isn't dynamically allocated.
///
-static bool CleanupPointerRootUsers(GlobalVariable *GV) {
+static bool CleanupPointerRootUsers(GlobalVariable *GV,
+ const TargetLibraryInfo *TLI) {
// A brief explanation of leak checkers. The goal is to find bugs where
// pointers are forgotten, causing an accumulating growth in memory
// usage over time. The common strategy for leak checkers is to whitelist the
@@ -432,18 +433,18 @@
C->destroyConstant();
// This could have invalidated UI, start over from scratch.
Dead.clear();
- CleanupPointerRootUsers(GV);
+ CleanupPointerRootUsers(GV, TLI);
return true;
}
}
}
for (int i = 0, e = Dead.size(); i != e; ++i) {
- if (IsSafeComputationToRemove(Dead[i].first)) {
+ if (IsSafeComputationToRemove(Dead[i].first, TLI)) {
Dead[i].second->eraseFromParent();
Instruction *I = Dead[i].first;
do {
- if (isAllocationFn(I))
+ if (isAllocationFn(I, TLI))
break;
Instruction *J = dyn_cast<Instruction>(I->getOperand(0));
if (!J)
@@ -975,7 +976,7 @@
// nor is the global.
if (AllNonStoreUsesGone) {
if (isLeakCheckerRoot(GV)) {
- Changed |= CleanupPointerRootUsers(GV);
+ Changed |= CleanupPointerRootUsers(GV, TLI);
} else {
Changed = true;
CleanupConstantGlobalUsers(GV, 0, TD, TLI);
@@ -1465,9 +1466,10 @@
/// PerformHeapAllocSRoA - CI is an allocation of an array of structures. Break
/// it up into multiple allocations of arrays of the fields.
static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
- Value *NElems, TargetData *TD) {
+ Value *NElems, TargetData *TD,
+ const TargetLibraryInfo *TLI) {
DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI << '\n');
- Type *MAT = getMallocAllocatedType(CI);
+ Type *MAT = getMallocAllocatedType(CI, TLI);
StructType *STy = cast<StructType>(MAT);
// There is guaranteed to be at least one use of the malloc (storing
@@ -1688,7 +1690,7 @@
// This eliminates dynamic allocation, avoids an indirection accessing the
// data, and exposes the resultant global to further GlobalOpt.
// We cannot optimize the malloc if we cannot determine malloc array size.
- Value *NElems = getMallocArraySize(CI, TD, true);
+ Value *NElems = getMallocArraySize(CI, TD, TLI, true);
if (!NElems)
return false;
@@ -1725,7 +1727,7 @@
// If this is a fixed size array, transform the Malloc to be an alloc of
// structs. malloc [100 x struct],1 -> malloc struct, 100
- if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI))) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) {
Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes();
Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
@@ -1742,7 +1744,8 @@
CI = cast<CallInst>(Malloc);
}
- GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, true), TD);
+ GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, TLI, true),
+ TD, TLI);
return true;
}
@@ -1771,8 +1774,8 @@
// Optimize away any trapping uses of the loaded value.
if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, TD, TLI))
return true;
- } else if (CallInst *CI = extractMallocCall(StoredOnceVal)) {
- Type *MallocType = getMallocAllocatedType(CI);
+ } else if (CallInst *CI = extractMallocCall(StoredOnceVal, TLI)) {
+ Type *MallocType = getMallocAllocatedType(CI, TLI);
if (MallocType &&
TryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, Ordering, GVI,
TD, TLI))
@@ -1964,7 +1967,7 @@
bool Changed;
if (isLeakCheckerRoot(GV)) {
// Delete any constant stores to the global.
- Changed = CleanupPointerRootUsers(GV);
+ Changed = CleanupPointerRootUsers(GV, TLI);
} else {
// Delete any stores we can find to the global. We may not be able to
// make it completely dead though.
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 712888a..69a22fb 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -20,6 +20,7 @@
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/IPO/InlinerPass.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -339,6 +340,7 @@
bool Inliner::runOnSCC(CallGraphSCC &SCC) {
CallGraph &CG = getAnalysis<CallGraph>();
const TargetData *TD = getAnalysisIfAvailable<TargetData>();
+ const TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
SmallPtrSet<Function*, 8> SCCFunctions;
DEBUG(dbgs() << "Inliner visiting SCC:");
@@ -417,7 +419,7 @@
// just delete the call instead of trying to inline it, regardless of
// size. This happens because IPSCCP propagates the result out of the
// call and then we're left with the dead call.
- if (isInstructionTriviallyDead(CS.getInstruction())) {
+ if (isInstructionTriviallyDead(CS.getInstruction(), TLI)) {
DEBUG(dbgs() << " -> Deleting dead call: "
<< *CS.getInstruction() << "\n");
// Update the call graph by deleting the edge from Callee to Caller.
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index cbe1ca4..b12fc01 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -168,7 +168,7 @@
/// the heavy lifting.
///
Instruction *InstCombiner::visitCallInst(CallInst &CI) {
- if (isFreeCall(&CI))
+ if (isFreeCall(&CI, TLI))
return visitFree(CI);
// If the caller function is nounwind, mark the call as nounwind, even if the
@@ -243,7 +243,7 @@
default: break;
case Intrinsic::objectsize: {
uint64_t Size;
- if (getObjectSize(II->getArgOperand(0), Size, TD))
+ if (getObjectSize(II->getArgOperand(0), Size, TD, TLI))
return ReplaceInstUsesWith(CI, ConstantInt::get(CI.getType(), Size));
return 0;
}
@@ -877,7 +877,7 @@
// visitCallSite - Improvements for call and invoke instructions.
//
Instruction *InstCombiner::visitCallSite(CallSite CS) {
- if (isAllocLikeFn(CS.getInstruction()))
+ if (isAllocLikeFn(CS.getInstruction(), TLI))
return visitAllocSite(*CS.getInstruction());
bool Changed = false;
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 35a0bbb..2a7182f 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -462,6 +462,16 @@
}
}
+ // (x lshr C1) udiv C2 --> x udiv (C2 << C1)
+ if (ConstantInt *C2 = dyn_cast<ConstantInt>(Op1)) {
+ Value *X;
+ ConstantInt *C1;
+ if (match(Op0, m_LShr(m_Value(X), m_ConstantInt(C1)))) {
+ APInt NC = C2->getValue().shl(C1->getLimitedValue(C1->getBitWidth()-1));
+ return BinaryOperator::CreateUDiv(X, Builder->getInt(NC));
+ }
+ }
+
// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
{ const APInt *CI; Value *N;
if (match(Op1, m_Shl(m_Power2(CI), m_Value(N))) ||
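
The new fold rests on the unsigned identity floor(floor(x / 2^C1) / C2) == floor(x / (C2 * 2^C1)), which holds whenever C2 << C1 does not wrap (the getLimitedValue cap above bounds the shift amount). A hypothetical standalone check of the identity, not part of the patch:

#include <cassert>
#include <cstdint>

int main() {
  // (x >> C1) / C2 == x / (C2 << C1) for unsigned x while C2 << C1 fits.
  const uint32_t C1 = 3, C2 = 5; // so C2 << C1 == 40
  for (uint32_t x = 0; x < 1000000; ++x)
    assert((x >> C1) / C2 == x / (C2 << C1));
  return 0;
}
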
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 68ecd51..ff758c4 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1068,7 +1068,7 @@
// If the bitcast is of an allocation, and the allocation will be
// converted to match the type of the cast, don't touch this.
if (isa<AllocaInst>(BCI->getOperand(0)) ||
- isAllocationFn(BCI->getOperand(0))) {
+ isAllocationFn(BCI->getOperand(0), TLI)) {
// See if the bitcast simplifies, if so, don't nuke this GEP yet.
if (Instruction *I = visitBitCast(*BCI)) {
if (I != BCI) {
@@ -1107,7 +1107,8 @@
static bool
-isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users) {
+isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users,
+ const TargetLibraryInfo *TLI) {
SmallVector<Instruction*, 4> Worklist;
Worklist.push_back(AI);
@@ -1163,7 +1164,7 @@
}
}
- if (isFreeCall(I)) {
+ if (isFreeCall(I, TLI)) {
Users.push_back(I);
continue;
}
@@ -1188,7 +1189,7 @@
// to null and free calls, delete the calls and replace the comparisons with
// true or false as appropriate.
SmallVector<WeakVH, 64> Users;
- if (isAllocSiteRemovable(&MI, Users)) {
+ if (isAllocSiteRemovable(&MI, Users, TLI)) {
for (unsigned i = 0, e = Users.size(); i != e; ++i) {
Instruction *I = cast_or_null<Instruction>(&*Users[i]);
if (!I) continue;
@@ -1872,7 +1873,7 @@
Instruction *Inst = BBI++;
// DCE instruction if trivially dead.
- if (isInstructionTriviallyDead(Inst)) {
+ if (isInstructionTriviallyDead(Inst, TLI)) {
++NumDeadInst;
DEBUG(errs() << "IC: DCE: " << *Inst << '\n');
Inst->eraseFromParent();
@@ -2002,7 +2003,7 @@
if (I == 0) continue; // skip null values.
// Check to see if we can DCE the instruction.
- if (isInstructionTriviallyDead(I)) {
+ if (isInstructionTriviallyDead(I, TLI)) {
DEBUG(errs() << "IC: DCE: " << *I << '\n');
EraseInstFromFunction(*I);
++NumDeadInst;
@@ -2102,7 +2103,7 @@
// If the instruction was modified, it's possible that it is now dead.
// if so, remove it.
- if (isInstructionTriviallyDead(I)) {
+ if (isInstructionTriviallyDead(I, TLI)) {
EraseInstFromFunction(*I);
} else {
Worklist.Add(I);
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 06f4d2f..0775cf4 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -15,7 +15,7 @@
#define DEBUG_TYPE "asan"
-#include "FunctionBlackList.h"
+#include "BlackList.h"
#include "llvm/Function.h"
#include "llvm/IRBuilder.h"
#include "llvm/InlineAsm.h"
@@ -217,7 +217,7 @@
Function *AsanCtorFunction;
Function *AsanInitFunction;
Instruction *CtorInsertBefore;
- OwningPtr<FunctionBlackList> BL;
+ OwningPtr<BlackList> BL;
// This array is indexed by AccessIsWrite and log2(AccessSize).
Function *AsanErrorCallback[2][kNumberOfAccessSizes];
InlineAsm *EmptyAsm;
@@ -544,6 +544,7 @@
Type *Ty = cast<PointerType>(G->getType())->getElementType();
DEBUG(dbgs() << "GLOBAL: " << *G);
+ if (BL->isIn(*G)) return false;
if (!Ty->isSized()) return false;
if (!G->hasInitializer()) return false;
// Touch only those globals that will not be defined in other modules.
@@ -643,6 +644,8 @@
Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
// Determine whether this global should be poisoned in initialization.
bool GlobalHasDynamicInitializer = HasDynamicInitializer(G);
+ // Don't check initialization order if this global is blacklisted.
+ GlobalHasDynamicInitializer &= !BL->isInInit(*G);
StructType *NewTy = StructType::get(Ty, RightRedZoneTy, NULL);
Constant *NewInitializer = ConstantStruct::get(
@@ -736,7 +739,7 @@
TD = getAnalysisIfAvailable<TargetData>();
if (!TD)
return false;
- BL.reset(new FunctionBlackList(ClBlackListFile));
+ BL.reset(new BlackList(ClBlackListFile));
C = &(M.getContext());
LongSize = TD->getPointerSizeInBits();
@@ -774,7 +777,7 @@
/*hasSideEffects=*/true);
llvm::Triple targetTriple(M.getTargetTriple());
- bool isAndroid = targetTriple.getEnvironment() == llvm::Triple::ANDROIDEABI;
+ bool isAndroid = targetTriple.getEnvironment() == llvm::Triple::Android;
MappingOffset = isAndroid ? kDefaultShadowOffsetAndroid :
(LongSize == 32 ? kDefaultShadowOffset32 : kDefaultShadowOffset64);
diff --git a/lib/Transforms/Instrumentation/Android.mk b/lib/Transforms/Instrumentation/Android.mk
index cc7a9da..a6512bf 100644
--- a/lib/Transforms/Instrumentation/Android.mk
+++ b/lib/Transforms/Instrumentation/Android.mk
@@ -2,9 +2,9 @@
instrumentation_SRC_FILES := \
AddressSanitizer.cpp \
+ BlackList.cpp \
BoundsChecking.cpp \
EdgeProfiling.cpp \
- FunctionBlackList.cpp \
GCOVProfiling.cpp \
Instrumentation.cpp \
OptimalEdgeProfiling.cpp \
diff --git a/lib/Transforms/Instrumentation/BlackList.cpp b/lib/Transforms/Instrumentation/BlackList.cpp
new file mode 100644
index 0000000..2cb1199
--- /dev/null
+++ b/lib/Transforms/Instrumentation/BlackList.cpp
@@ -0,0 +1,102 @@
+//===-- BlackList.cpp - blacklist for sanitizers --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a utility class for instrumentation passes (like AddressSanitizer
+// or ThreadSanitizer) to avoid instrumenting some functions or global
+// variables based on a user-supplied blacklist.
+//
+//===----------------------------------------------------------------------===//
+
+#include <utility>
+#include <string>
+
+#include "BlackList.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Module.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+
+namespace llvm {
+
+BlackList::BlackList(const StringRef Path) {
+ // Validate and open blacklist file.
+ if (!Path.size()) return;
+ OwningPtr<MemoryBuffer> File;
+ if (error_code EC = MemoryBuffer::getFile(Path, File)) {
+ report_fatal_error("Can't open blacklist file: " + Path + ": " +
+ EC.message());
+ }
+
+ // Iterate through each line in the blacklist file.
+ SmallVector<StringRef, 16> Lines;
+ SplitString(File.take()->getBuffer(), Lines, "\n\r");
+ StringMap<std::string> Regexps;
+ for (SmallVector<StringRef, 16>::iterator I = Lines.begin(), E = Lines.end();
+ I != E; ++I) {
+ // Get our prefix and unparsed regexp.
+ std::pair<StringRef, StringRef> SplitLine = I->split(":");
+ StringRef Prefix = SplitLine.first;
+ std::string Regexp = SplitLine.second;
+
+ // Replace * with .*
+ for (size_t pos = 0; (pos = Regexp.find("*", pos)) != std::string::npos;
+ pos += strlen(".*")) {
+ Regexp.replace(pos, strlen("*"), ".*");
+ }
+
+ // Check that the regexp is valid.
+ Regex CheckRE(Regexp);
+ std::string Error;
+ if (!CheckRE.isValid(Error)) {
+ report_fatal_error("malformed blacklist regex: " + SplitLine.second +
+ ": " + Error);
+ }
+
+ // Add this regexp into the proper group by its prefix.
+ if (Regexps[Prefix].size())
+ Regexps[Prefix] += "|";
+ Regexps[Prefix] += Regexp;
+ }
+
+ // Iterate through each of the prefixes, and create Regexs for them.
+ for (StringMap<std::string>::iterator I = Regexps.begin(), E = Regexps.end();
+ I != E; ++I) {
+ Entries[I->getKey()] = new Regex(I->getValue());
+ }
+}
+
+bool BlackList::isIn(const Function &F) {
+ return isIn(*F.getParent()) || inSection("fun", F.getName());
+}
+
+bool BlackList::isIn(const GlobalVariable &G) {
+ return isIn(*G.getParent()) || inSection("global", G.getName());
+}
+
+bool BlackList::isIn(const Module &M) {
+ return inSection("src", M.getModuleIdentifier());
+}
+
+bool BlackList::isInInit(const GlobalVariable &G) {
+ return isIn(*G.getParent()) || inSection("global-init", G.getName());
+}
+
+bool BlackList::inSection(const StringRef Section,
+ const StringRef Query) {
+ Regex *FunctionRegex = Entries[Section];
+ return FunctionRegex ? FunctionRegex->match(Query) : false;
+}
+
+} // namespace llvm
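
A sketch of how an instrumentation pass consumes the new class; the file name and its contents are illustrative only (for example a line such as fun:*_ZN4base6subtle* or src:file_with_tricky_code.cc):

#include "BlackList.h"            // the header added below
#include "llvm/ADT/OwningPtr.h"
#include "llvm/Function.h"

// Hypothetical pass-side helper: skip anything the user blacklisted,
// whether by function name ("fun:") or by source file ("src:").
static bool shouldInstrument(const llvm::Function &F) {
  static llvm::OwningPtr<llvm::BlackList> BL(
      new llvm::BlackList("asan_blacklist.txt")); // illustrative path
  return !BL->isIn(F);
}
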
diff --git a/lib/Transforms/Instrumentation/BlackList.h b/lib/Transforms/Instrumentation/BlackList.h
new file mode 100644
index 0000000..73977fc
--- /dev/null
+++ b/lib/Transforms/Instrumentation/BlackList.h
@@ -0,0 +1,55 @@
+//===-- BlackList.h - blacklist for sanitizers ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//===----------------------------------------------------------------------===//
+//
+// This is a utility class for instrumentation passes (like AddressSanitizer
+// or ThreadSanitizer) to avoid instrumenting some functions or global
+// variables based on a user-supplied blacklist.
+//
+// The blacklist disables instrumentation of various functions and global
+// variables. Each line contains a prefix, followed by a wildcard expression.
+// ---
+// fun:*_ZN4base6subtle*
+// global:*global_with_bad_access_or_initialization*
+// global-init:*global_with_initialization_issues*
+// src:file_with_tricky_code.cc
+// ---
+// Note that the wildcard is in fact an llvm::Regex, but * is automatically
+// replaced with .*
+// This is similar to the "ignore" feature of ThreadSanitizer.
+// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "llvm/ADT/StringMap.h"
+
+namespace llvm {
+class Function;
+class GlobalVariable;
+class Module;
+class Regex;
+class StringRef;
+
+class BlackList {
+ public:
+ BlackList(const StringRef Path);
+  // Returns whether either this function or its source file is blacklisted.
+ bool isIn(const Function &F);
+  // Returns whether either this global or its source file is blacklisted.
+ bool isIn(const GlobalVariable &G);
+ // Returns whether this module is blacklisted by filename.
+ bool isIn(const Module &M);
+ // Returns whether a global should be excluded from initialization checking.
+ bool isInInit(const GlobalVariable &G);
+ private:
+ StringMap<Regex*> Entries;
+
+ bool inSection(const StringRef Section, const StringRef Query);
+};
+
+} // namespace llvm
diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp
index 09e0f14..6429081 100644
--- a/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -24,6 +24,7 @@
#include "llvm/Support/TargetFolder.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Instrumentation.h"
using namespace llvm;
@@ -48,10 +49,12 @@
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetData>();
+ AU.addRequired<TargetLibraryInfo>();
}
private:
const TargetData *TD;
+ const TargetLibraryInfo *TLI;
ObjectSizeOffsetEvaluator *ObjSizeEval;
BuilderTy *Builder;
Instruction *Inst;
@@ -166,11 +169,12 @@
bool BoundsChecking::runOnFunction(Function &F) {
TD = &getAnalysis<TargetData>();
+ TLI = &getAnalysis<TargetLibraryInfo>();
TrapBB = 0;
BuilderTy TheBuilder(F.getContext(), TargetFolder(TD));
Builder = &TheBuilder;
- ObjectSizeOffsetEvaluator TheObjSizeEval(TD, F.getContext());
+ ObjectSizeOffsetEvaluator TheObjSizeEval(TD, TLI, F.getContext());
ObjSizeEval = &TheObjSizeEval;
// check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index 00de882..058f68c 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -1,8 +1,8 @@
add_llvm_library(LLVMInstrumentation
AddressSanitizer.cpp
+ BlackList.cpp
BoundsChecking.cpp
EdgeProfiling.cpp
- FunctionBlackList.cpp
GCOVProfiling.cpp
Instrumentation.cpp
OptimalEdgeProfiling.cpp
diff --git a/lib/Transforms/Instrumentation/FunctionBlackList.cpp b/lib/Transforms/Instrumentation/FunctionBlackList.cpp
deleted file mode 100644
index 188ea4d..0000000
--- a/lib/Transforms/Instrumentation/FunctionBlackList.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-//===-- FunctionBlackList.cpp - blacklist of functions --------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This is a utility class for instrumentation passes (like AddressSanitizer
-// or ThreadSanitizer) to avoid instrumenting some functions based on
-// user-supplied blacklist.
-//
-//===----------------------------------------------------------------------===//
-
-#include "FunctionBlackList.h"
-#include "llvm/ADT/OwningPtr.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Function.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Regex.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
-
-namespace llvm {
-
-FunctionBlackList::FunctionBlackList(const std::string &Path) {
- Functions = NULL;
- const char *kFunPrefix = "fun:";
- if (!Path.size()) return;
- std::string Fun;
-
- OwningPtr<MemoryBuffer> File;
- if (error_code EC = MemoryBuffer::getFile(Path.c_str(), File)) {
- report_fatal_error("Can't open blacklist file " + Path + ": " +
- EC.message());
- }
- MemoryBuffer *Buff = File.take();
- const char *Data = Buff->getBufferStart();
- size_t DataLen = Buff->getBufferSize();
- SmallVector<StringRef, 16> Lines;
- SplitString(StringRef(Data, DataLen), Lines, "\n\r");
- for (size_t i = 0, numLines = Lines.size(); i < numLines; i++) {
- if (Lines[i].startswith(kFunPrefix)) {
- std::string ThisFunc = Lines[i].substr(strlen(kFunPrefix));
- std::string ThisFuncRE;
- // add ThisFunc replacing * with .*
- for (size_t j = 0, n = ThisFunc.size(); j < n; j++) {
- if (ThisFunc[j] == '*')
- ThisFuncRE += '.';
- ThisFuncRE += ThisFunc[j];
- }
- // Check that the regexp is valid.
- Regex CheckRE(ThisFuncRE);
- std::string Error;
- if (!CheckRE.isValid(Error))
- report_fatal_error("malformed blacklist regex: " + ThisFunc +
- ": " + Error);
- // Append to the final regexp.
- if (Fun.size())
- Fun += "|";
- Fun += ThisFuncRE;
- }
- }
- if (Fun.size()) {
- Functions = new Regex(Fun);
- }
-}
-
-bool FunctionBlackList::isIn(const Function &F) {
- if (Functions) {
- bool Res = Functions->match(F.getName());
- return Res;
- }
- return false;
-}
-
-} // namespace llvm
diff --git a/lib/Transforms/Instrumentation/FunctionBlackList.h b/lib/Transforms/Instrumentation/FunctionBlackList.h
deleted file mode 100644
index c1239b9..0000000
--- a/lib/Transforms/Instrumentation/FunctionBlackList.h
+++ /dev/null
@@ -1,37 +0,0 @@
-//===-- FunctionBlackList.cpp - blacklist of functions ----------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//===----------------------------------------------------------------------===//
-//
-// This is a utility class for instrumentation passes (like AddressSanitizer
-// or ThreadSanitizer) to avoid instrumenting some functions based on
-// user-supplied blacklist.
-//
-//===----------------------------------------------------------------------===//
-//
-
-#include <string>
-
-namespace llvm {
-class Function;
-class Regex;
-
-// Blacklisted functions are not instrumented.
-// The blacklist file contains one or more lines like this:
-// ---
-// fun:FunctionWildCard
-// ---
-// This is similar to the "ignore" feature of ThreadSanitizer.
-// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores
-class FunctionBlackList {
- public:
- FunctionBlackList(const std::string &Path);
- bool isIn(const Function &F);
- private:
- Regex *Functions;
-};
-
-} // namespace llvm
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 264a6a6..9fcde316 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -88,11 +88,10 @@
// Add the function to write out all our counters to the global destructor
// list.
- void insertCounterWriteout(SmallVector<std::pair<GlobalVariable *,
- MDNode *>, 8> &);
+ void insertCounterWriteout(ArrayRef<std::pair<GlobalVariable*, MDNode*> >);
void insertIndirectCounterIncrement();
- std::string mangleName(DICompileUnit CU, std::string NewStem);
+ std::string mangleName(DICompileUnit CU, const char *NewStem);
bool EmitNotes;
bool EmitData;
@@ -329,7 +328,7 @@
};
}
-std::string GCOVProfiler::mangleName(DICompileUnit CU, std::string NewStem) {
+std::string GCOVProfiler::mangleName(DICompileUnit CU, const char *NewStem) {
if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) {
for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) {
MDNode *N = GCov->getOperand(i);
@@ -630,7 +629,7 @@
}
void GCOVProfiler::insertCounterWriteout(
- SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> &CountersBySP) {
+ ArrayRef<std::pair<GlobalVariable *, MDNode *> > CountersBySP) {
FunctionType *WriteoutFTy =
FunctionType::get(Type::getVoidTy(*Ctx), false);
Function *WriteoutF = Function::Create(WriteoutFTy,
@@ -652,7 +651,7 @@
std::string FilenameGcda = mangleName(compile_unit, "gcda");
Builder.CreateCall(StartFile,
Builder.CreateGlobalStringPtr(FilenameGcda));
- for (SmallVector<std::pair<GlobalVariable *, MDNode *>, 8>::iterator
+ for (ArrayRef<std::pair<GlobalVariable *, MDNode *> >::iterator
I = CountersBySP.begin(), E = CountersBySP.end();
I != E; ++I) {
DISubprogram SP(I->second);
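
Switching the parameter from SmallVector& to ArrayRef is a pure interface relaxation: callers keep their SmallVector while the callee stops depending on the container's inline size. A minimal sketch of the pattern:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"

// ArrayRef accepts any contiguous sequence (SmallVector, std::vector,
// C array) by implicit conversion, read-only and without copying.
static int sum(llvm::ArrayRef<int> Xs) {
  int S = 0;
  for (unsigned i = 0, e = Xs.size(); i != e; ++i)
    S += Xs[i];
  return S;
}
// llvm::SmallVector<int, 8> V;  ...  sum(V);  // converts implicitly
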
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index dc0fa71..17b7775 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -21,7 +21,7 @@
#define DEBUG_TYPE "tsan"
-#include "FunctionBlackList.h"
+#include "BlackList.h"
#include "llvm/Function.h"
#include "llvm/IRBuilder.h"
#include "llvm/Intrinsics.h"
@@ -50,7 +50,7 @@
STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
-STATISTIC(NumOmittedReadsBeforeWrite,
+STATISTIC(NumOmittedReadsBeforeWrite,
"Number of reads ignored due to following writes");
STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size");
STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes");
@@ -77,7 +77,7 @@
int getMemoryAccessFuncIndex(Value *Addr);
TargetData *TD;
- OwningPtr<FunctionBlackList> BL;
+ OwningPtr<BlackList> BL;
IntegerType *OrdTy;
// Callbacks to run-time library are computed in doInitialization.
Function *TsanFuncEntry;
@@ -121,7 +121,7 @@
TD = getAnalysisIfAvailable<TargetData>();
if (!TD)
return false;
- BL.reset(new FunctionBlackList(ClBlackListFile));
+ BL.reset(new BlackList(ClBlackListFile));
// Always insert a call to __tsan_init into the module's CTORs.
IRBuilder<> IRB(M.getContext());
@@ -186,7 +186,7 @@
NumOmittedReadsFromConstantGlobals++;
return true;
}
- } else if(LoadInst *L = dyn_cast<LoadInst>(Addr)) {
+ } else if (LoadInst *L = dyn_cast<LoadInst>(Addr)) {
if (isVtableAccess(L)) {
// Reads from a vtable pointer can not race with any writes.
NumOmittedReadsFromVtable++;
@@ -344,7 +344,7 @@
case NotAtomic: assert(false);
case Unordered: // Fall-through.
case Monotonic: v = 1 << 0; break;
- // case Consume: v = 1 << 1; break; // Not specified yet.
+ // case Consume: v = 1 << 1; break; // Not specified yet.
case Acquire: v = 1 << 2; break;
case Release: v = 1 << 3; break;
case AcquireRelease: v = 1 << 4; break;
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index a8deda8..5912107 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -43,6 +43,7 @@
#include "llvm/Transforms/Utils/AddrModeMatcher.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -148,7 +149,19 @@
PFI = getAnalysisIfAvailable<ProfileInfo>();
OptSize = F.hasFnAttr(Attribute::OptimizeForSize);
- // First pass, eliminate blocks that contain only PHI nodes and an
+  // This optimization identifies DIV instructions that can be
+  // profitably bypassed and carried out with a shorter, faster divide.
+ if (TLI && TLI->isSlowDivBypassed()) {
+ const DenseMap<Type *, Type *> &BypassTypeMap = TLI->getBypassSlowDivTypes();
+
+ for (Function::iterator I = F.begin(); I != F.end(); I++) {
+ EverMadeChange |= bypassSlowDivision(F,
+ I,
+ BypassTypeMap);
+ }
+ }
+
+ // Eliminate blocks that contain only PHI nodes and an
// unconditional branch.
EverMadeChange |= EliminateMostlyEmptyBlocks(F);
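
Conceptually, bypassSlowDivision gives each eligible divide a cheap narrow path behind a runtime test; for a 64-bit udiv bypassed by a 32-bit divide, the inserted control flow behaves like this sketch (names are illustrative, not the utility's actual output):

#include <cstdint>

// If both operands fit in 32 bits, use the fast narrow divider; otherwise
// fall back to the full-width divide. This mirrors the IR the pass emits.
static uint64_t bypassedUDiv(uint64_t a, uint64_t b) {
  if (((a | b) >> 32) == 0)
    return uint64_t(uint32_t(a) / uint32_t(b)); // fast path
  return a / b;                                 // slow full-width path
}
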
@@ -988,7 +1001,7 @@
WeakVH IterHandle(CurInstIterator);
BasicBlock *BB = CurInstIterator->getParent();
- RecursivelyDeleteTriviallyDeadInstructions(Repl);
+ RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);
if (IterHandle != CurInstIterator) {
// If the iterator instruction was recursively deleted, start over at the
@@ -1174,16 +1187,31 @@
}
+/// If we have a SelectInst that will likely profit from branch prediction,
+/// turn it into a branch.
bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) {
- // If we have a SelectInst that will likely profit from branch prediction,
- // turn it into a branch.
- if (DisableSelectToBranch || OptSize || !TLI ||
- !TLI->isPredictableSelectExpensive())
+ bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
+
+  // Can we convert the 'select' to CF?
+ if (DisableSelectToBranch || OptSize || !TLI || VectorCond)
return false;
- if (!SI->getCondition()->getType()->isIntegerTy(1) ||
- !isFormingBranchFromSelectProfitable(SI))
- return false;
+ TargetLowering::SelectSupportKind SelectKind;
+ if (VectorCond)
+ SelectKind = TargetLowering::VectorMaskSelect;
+ else if (SI->getType()->isVectorTy())
+ SelectKind = TargetLowering::ScalarCondVectorVal;
+ else
+ SelectKind = TargetLowering::ScalarValSelect;
+
+  // Do we have efficient codegen support for this kind of select?
+ if (TLI->isSelectSupported(SelectKind)) {
+ // We have efficient codegen support for the select instruction.
+ // Check if it is profitable to keep this 'select'.
+ if (!TLI->isPredictableSelectExpensive() ||
+ !isFormingBranchFromSelectProfitable(SI))
+ return false;
+ }
ModifiedDT = true;
diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp
index 8dbcc23..086f0a1 100644
--- a/lib/Transforms/Scalar/DCE.cpp
+++ b/lib/Transforms/Scalar/DCE.cpp
@@ -22,6 +22,7 @@
#include "llvm/Instruction.h"
#include "llvm/Pass.h"
#include "llvm/Support/InstIterator.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;
@@ -38,10 +39,11 @@
initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry());
}
virtual bool runOnBasicBlock(BasicBlock &BB) {
+ TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
bool Changed = false;
for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
Instruction *Inst = DI++;
- if (isInstructionTriviallyDead(Inst)) {
+ if (isInstructionTriviallyDead(Inst, TLI)) {
Inst->eraseFromParent();
Changed = true;
++DIEEliminated;
@@ -87,6 +89,8 @@
INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false)
bool DCE::runOnFunction(Function &F) {
+ TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+
// Start out with all of the instructions in the worklist...
std::vector<Instruction*> WorkList;
for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i)
@@ -101,7 +105,7 @@
Instruction *I = WorkList.back();
WorkList.pop_back();
- if (isInstructionTriviallyDead(I)) { // If the instruction is dead.
+ if (isInstructionTriviallyDead(I, TLI)) { // If the instruction is dead.
// Loop over all of the values that the instruction uses, if there are
// instructions being used, add them to the worklist, because they might
// go dead after this one is removed.
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 8b1283f..1ff4329 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -106,6 +106,7 @@
///
static void DeleteDeadInstruction(Instruction *I,
MemoryDependenceAnalysis &MD,
+ const TargetLibraryInfo *TLI,
SmallSetVector<Value*, 16> *ValueSet = 0) {
SmallVector<Instruction*, 32> NowDeadInsts;
@@ -130,7 +131,7 @@
if (!Op->use_empty()) continue;
if (Instruction *OpI = dyn_cast<Instruction>(Op))
- if (isInstructionTriviallyDead(OpI))
+ if (isInstructionTriviallyDead(OpI, TLI))
NowDeadInsts.push_back(OpI);
}
@@ -276,7 +277,7 @@
static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) {
uint64_t Size;
- if (getObjectSize(V, Size, AA.getTargetData()))
+ if (getObjectSize(V, Size, AA.getTargetData(), AA.getTargetLibraryInfo()))
return Size;
return AliasAnalysis::UnknownSize;
}
@@ -454,7 +455,7 @@
Instruction *Inst = BBI++;
// Handle 'free' calls specially.
- if (CallInst *F = isFreeCall(Inst)) {
+ if (CallInst *F = isFreeCall(Inst, AA->getTargetLibraryInfo())) {
MadeChange |= HandleFree(F);
continue;
}
@@ -483,7 +484,7 @@
// in case we need it.
WeakVH NextInst(BBI);
- DeleteDeadInstruction(SI, *MD);
+ DeleteDeadInstruction(SI, *MD, AA->getTargetLibraryInfo());
if (NextInst == 0) // Next instruction deleted.
BBI = BB.begin();
@@ -530,7 +531,7 @@
<< *DepWrite << "\n KILLER: " << *Inst << '\n');
// Delete the store and now-dead instructions that feed it.
- DeleteDeadInstruction(DepWrite, *MD);
+ DeleteDeadInstruction(DepWrite, *MD, AA->getTargetLibraryInfo());
++NumFastStores;
MadeChange = true;
@@ -640,7 +641,7 @@
Instruction *Next = llvm::next(BasicBlock::iterator(Dependency));
// DCE instructions only used to calculate that store
- DeleteDeadInstruction(Dependency, *MD);
+ DeleteDeadInstruction(Dependency, *MD, AA->getTargetLibraryInfo());
++NumFastStores;
MadeChange = true;
@@ -680,7 +681,8 @@
// Okay, so these are dead heap objects, but if the pointer never escapes
// then it's leaked by this function anyways.
- else if (isAllocLikeFn(I) && !PointerMayBeCaptured(I, true, true))
+ else if (isAllocLikeFn(I, AA->getTargetLibraryInfo()) &&
+ !PointerMayBeCaptured(I, true, true))
DeadStackObjects.insert(I);
}
@@ -724,7 +726,8 @@
dbgs() << '\n');
// DCE instructions only used to calculate that store.
- DeleteDeadInstruction(Dead, *MD, &DeadStackObjects);
+ DeleteDeadInstruction(Dead, *MD, AA->getTargetLibraryInfo(),
+ &DeadStackObjects);
++NumFastStores;
MadeChange = true;
continue;
@@ -732,9 +735,10 @@
}
// Remove any dead non-memory-mutating instructions.
- if (isInstructionTriviallyDead(BBI)) {
+ if (isInstructionTriviallyDead(BBI, AA->getTargetLibraryInfo())) {
Instruction *Inst = BBI++;
- DeleteDeadInstruction(Inst, *MD, &DeadStackObjects);
+ DeleteDeadInstruction(Inst, *MD, AA->getTargetLibraryInfo(),
+ &DeadStackObjects);
++NumFastOther;
MadeChange = true;
continue;
@@ -750,7 +754,7 @@
if (CallSite CS = cast<Value>(BBI)) {
// Remove allocation function calls from the list of dead stack objects;
// there can't be any references before the definition.
- if (isAllocLikeFn(BBI))
+ if (isAllocLikeFn(BBI, AA->getTargetLibraryInfo()))
DeadStackObjects.remove(BBI);
// If this call does not access memory, it can't be loading any of our
@@ -771,15 +775,15 @@
LiveAllocas.push_back(*I);
}
+ // If all of the allocas were clobbered by the call then we're not going
+ // to find anything else to process.
+ if (DeadStackObjects.size() == LiveAllocas.size())
+ break;
+
for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(),
E = LiveAllocas.end(); I != E; ++I)
DeadStackObjects.remove(*I);
- // If all of the allocas were clobbered by the call then we're not going
- // to find anything else to process.
- if (DeadStackObjects.empty())
- break;
-
continue;
}
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 97595495..2627113 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -374,7 +374,7 @@
Instruction *Inst = I++;
// Dead instructions should just be removed.
- if (isInstructionTriviallyDead(Inst)) {
+ if (isInstructionTriviallyDead(Inst, TLI)) {
DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
Inst->eraseFromParent();
Changed = true;
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 4822fd0..16ae6ad 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -271,16 +271,16 @@
valueNumbering.insert(std::make_pair(V, num));
}
-uint32_t ValueTable::lookup_or_add_call(CallInst* C) {
+uint32_t ValueTable::lookup_or_add_call(CallInst *C) {
if (AA->doesNotAccessMemory(C)) {
Expression exp = create_expression(C);
- uint32_t& e = expressionNumbering[exp];
+ uint32_t &e = expressionNumbering[exp];
if (!e) e = nextValueNumber++;
valueNumbering[C] = e;
return e;
} else if (AA->onlyReadsMemory(C)) {
Expression exp = create_expression(C);
- uint32_t& e = expressionNumbering[exp];
+ uint32_t &e = expressionNumbering[exp];
if (!e) {
e = nextValueNumber++;
valueNumbering[C] = e;
@@ -413,7 +413,7 @@
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
- case Instruction::Or :
+ case Instruction::Or:
case Instruction::Xor:
case Instruction::ICmp:
case Instruction::FCmp:
@@ -632,6 +632,7 @@
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false)
+#ifndef NDEBUG
void GVN::dump(DenseMap<uint32_t, Value*>& d) {
errs() << "{\n";
for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
@@ -641,6 +642,7 @@
}
errs() << "}\n";
}
+#endif
/// IsValueFullyAvailableInBlock - Return true if we can prove that the value
/// we're analyzing is fully available in the specified block. As we go, keep
@@ -1436,7 +1438,7 @@
Instruction *DepInst = DepInfo.getInst();
// Loading the allocation -> undef.
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst) ||
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
// Loading immediately after lifetime begin -> undef.
isLifetimeStart(DepInst)) {
ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
@@ -1951,7 +1953,7 @@
// If this load really doesn't depend on anything, then we must be loading an
// undef value. This can happen when loading for a fresh allocation with no
// intervening stores, for example.
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst)) {
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) {
L->replaceAllUsesWith(UndefValue::get(L->getType()));
markInstructionForDeletion(L);
++NumGVNLoad;
@@ -2231,12 +2233,20 @@
Value *SwitchCond = SI->getCondition();
BasicBlock *Parent = SI->getParent();
bool Changed = false;
+
+ // Remember how many outgoing edges there are to every successor.
+ SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+ for (unsigned i = 0, n = SI->getNumSuccessors(); i != n; ++i)
+ ++SwitchEdges[SI->getSuccessor(i)];
+
for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
i != e; ++i) {
BasicBlock *Dst = i.getCaseSuccessor();
- BasicBlockEdge E(Parent, Dst);
- if (E.isSingleEdge())
+ // If there is only a single edge, propagate the case value into it.
+ if (SwitchEdges.lookup(Dst) == 1) {
+ BasicBlockEdge E(Parent, Dst);
Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E);
+ }
}
return Changed;
}
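
Counting edges first matters because a switch may route several case values to one successor; the equality SwitchCond == CaseValue only holds in blocks reached by exactly one such edge. The counting step in isolation, with plain containers and toy int block ids standing in for the SmallDenseMap and BasicBlock*:

#include <map>
#include <vector>

// Count how many switch edges target each successor; propagation is safe
// only where the count is exactly 1.
static std::map<int, unsigned>
countSwitchEdges(const std::vector<int> &CaseSuccessors) {
  std::map<int, unsigned> Edges;
  for (unsigned i = 0, e = CaseSuccessors.size(); i != e; ++i)
    ++Edges[CaseSuccessors[i]];
  return Edges;
}
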
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 37f8bdf..c933a17 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -44,6 +44,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -68,6 +69,7 @@
ScalarEvolution *SE;
DominatorTree *DT;
TargetData *TD;
+ TargetLibraryInfo *TLI;
SmallVector<WeakVH, 16> DeadInsts;
bool Changed;
@@ -414,11 +416,11 @@
// new comparison.
NewCompare->takeName(Compare);
Compare->replaceAllUsesWith(NewCompare);
- RecursivelyDeleteTriviallyDeadInstructions(Compare);
+ RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI);
// Delete the old floating point increment.
Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
- RecursivelyDeleteTriviallyDeadInstructions(Incr);
+ RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI);
// If the FP induction variable still has uses, this is because something else
// in the loop uses its value. In order to canonicalize the induction
@@ -431,7 +433,7 @@
Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
PN->getParent()->getFirstInsertionPt());
PN->replaceAllUsesWith(Conv);
- RecursivelyDeleteTriviallyDeadInstructions(PN);
+ RecursivelyDeleteTriviallyDeadInstructions(PN, TLI);
}
Changed = true;
}
@@ -550,14 +552,14 @@
PN->setIncomingValue(i, ExitVal);
// If this instruction is dead now, delete it.
- RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
if (NumPreds == 1) {
// Completely replace a single-pred PHI. This is safe, because the
// NewVal won't be variant in the loop, so we don't need an LCSSA phi
// node anymore.
PN->replaceAllUsesWith(ExitVal);
- RecursivelyDeleteTriviallyDeadInstructions(PN);
+ RecursivelyDeleteTriviallyDeadInstructions(PN, TLI);
}
}
if (NumPreds != 1) {
@@ -1697,6 +1699,7 @@
SE = &getAnalysis<ScalarEvolution>();
DT = &getAnalysis<DominatorTree>();
TD = getAnalysisIfAvailable<TargetData>();
+ TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
DeadInsts.clear();
Changed = false;
@@ -1763,7 +1766,7 @@
while (!DeadInsts.empty())
if (Instruction *Inst =
dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
- RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
// The Rewriter may not be used from this point on.
@@ -1772,7 +1775,7 @@
SinkUnusedInvariants(L);
// Clean up dead instructions.
- Changed |= DeleteDeadPHIs(L->getHeader());
+ Changed |= DeleteDeadPHIs(L->getHeader(), TLI);
// Check a post-condition.
assert(L->isLCSSAForm(*DT) &&
"Indvars did not leave the loop in lcssa form!");
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index dd42c59..20844c6 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1455,7 +1455,7 @@
// At this point, the IR is fully up to date and consistent. Do a quick scan
// over the new instructions and zap any that are constants or dead. This
// frequently happens because of phi translation.
- SimplifyInstructionsInBlock(NewBB, TD);
+ SimplifyInstructionsInBlock(NewBB, TD, TLI);
// Threaded an edge!
++NumThreads;
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 0192e92..99bedce6 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -108,6 +108,9 @@
BasicBlock *Preheader; // The preheader block of the current loop...
Loop *CurLoop; // The current loop we are working on...
AliasSetTracker *CurAST; // AliasSet information for the current loop...
+ bool MayThrow; // The current loop contains an instruction which
+ // may throw, thus preventing code motion of
+ // instructions with side effects.
DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap;
/// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
@@ -240,6 +243,15 @@
CurAST->add(*BB); // Incorporate the specified basic block
}
+ MayThrow = false;
+ // TODO: We've already searched for instructions which may throw in subloops.
+ // We may want to reuse this information.
+ for (Loop::block_iterator BB = L->block_begin(), BBE = L->block_end();
+       (BB != BBE) && !MayThrow; ++BB)
+ for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
+ (I != E) && !MayThrow; ++I)
+ MayThrow |= I->mayThrow();
+
// We want to visit all of the instructions in this loop... that are not parts
// of our subloops (they have already had their invariants hoisted out of
// their loop, into this loop, so there is no need to process the BODIES of
@@ -307,7 +319,7 @@
// If the instruction is dead, we would try to sink it because it isn't used
// in the loop, instead, just delete it.
- if (isInstructionTriviallyDead(&I)) {
+ if (isInstructionTriviallyDead(&I, TLI)) {
DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
++II;
CurAST->deleteValue(&I);
@@ -418,17 +430,22 @@
if (!FoundMod) return true;
}
- // FIXME: This should use mod/ref information to see if we can hoist or sink
- // the call.
+ // FIXME: This should use mod/ref information to see if we can hoist or
+ // sink the call.
return false;
}
- // Otherwise these instructions are hoistable/sinkable
- return isa<BinaryOperator>(I) || isa<CastInst>(I) ||
- isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
- isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
- isa<ShuffleVectorInst>(I);
+ // Only these instructions are hoistable/sinkable.
+ bool HoistableKind = (isa<BinaryOperator>(I) || isa<CastInst>(I) ||
+ isa<SelectInst>(I) || isa<GetElementPtrInst>(I) ||
+ isa<CmpInst>(I) || isa<InsertElementInst>(I) ||
+ isa<ExtractElementInst>(I) ||
+ isa<ShuffleVectorInst>(I));
+ if (!HoistableKind)
+ return false;
+
+ return isSafeToExecuteUnconditionally(I);
}
/// isNotUsedInLoop - Return true if the only users of this instruction are
@@ -604,6 +621,12 @@
}
bool LICM::isGuaranteedToExecute(Instruction &Inst) {
+
+ // Somewhere in this loop there is an instruction which may throw and make us
+ // exit the loop.
+ if (MayThrow)
+ return false;
+
// Otherwise we have to check to make sure that the instruction dominates all
// of the exit blocks. If it doesn't, then there is a path out of the loop
// which does not execute this instruction, so we can't hoist it.
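
A concrete illustration of why MayThrow forces the conservative answer; this is hypothetical source code, not from the patch:

// If mayThrowingCall() throws, control leaves the loop before the division
// runs, so LICM must not treat 100 / x as guaranteed to execute (hoisting
// it would introduce a trap for x == 0 on traces that used to be safe).
static int sumLoop(int n, int x, void (*mayThrowingCall)()) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    mayThrowingCall();  // potentially-throwing instruction in the loop
    sum += 100 / x;     // dominated by a may-throw point: not hoistable
  }
  return sum;
}
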
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index ac1082c..a72e288 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -132,7 +132,8 @@
/// and zero out all the operands of this instruction. If any of them become
/// dead, delete them and the computation tree that feeds them.
///
-static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) {
+static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE,
+ const TargetLibraryInfo *TLI) {
SmallVector<Instruction*, 32> NowDeadInsts;
NowDeadInsts.push_back(I);
@@ -153,7 +154,7 @@
if (!Op->use_empty()) continue;
if (Instruction *OpI = dyn_cast<Instruction>(Op))
- if (isInstructionTriviallyDead(OpI))
+ if (isInstructionTriviallyDead(OpI, TLI))
NowDeadInsts.push_back(OpI);
}
@@ -164,10 +165,11 @@
/// deleteIfDeadInstruction - If the specified value is a dead instruction,
/// delete it and any recursively used instructions.
-static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE) {
+static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
+ const TargetLibraryInfo *TLI) {
if (Instruction *I = dyn_cast<Instruction>(V))
- if (isInstructionTriviallyDead(I))
- deleteDeadInstruction(I, SE);
+ if (isInstructionTriviallyDead(I, TLI))
+ deleteDeadInstruction(I, SE, TLI);
}
bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
@@ -490,7 +492,7 @@
StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){
Expander.clear();
// If we generated new code for the base pointer, clean up.
- deleteIfDeadInstruction(BasePtr, *SE);
+ deleteIfDeadInstruction(BasePtr, *SE, TLI);
return false;
}
@@ -538,7 +540,7 @@
// Okay, the memset has been formed. Zap the original store and anything that
// feeds into it.
- deleteDeadInstruction(TheStore, *SE);
+ deleteDeadInstruction(TheStore, *SE, TLI);
++NumMemSet;
return true;
}
@@ -579,7 +581,7 @@
getAnalysis<AliasAnalysis>(), SI)) {
Expander.clear();
// If we generated new code for the base pointer, clean up.
- deleteIfDeadInstruction(StoreBasePtr, *SE);
+ deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
return false;
}
@@ -594,8 +596,8 @@
StoreSize, getAnalysis<AliasAnalysis>(), SI)) {
Expander.clear();
// If we generated new code for the base pointer, clean up.
- deleteIfDeadInstruction(LoadBasePtr, *SE);
- deleteIfDeadInstruction(StoreBasePtr, *SE);
+ deleteIfDeadInstruction(LoadBasePtr, *SE, TLI);
+ deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
return false;
}
@@ -628,7 +630,7 @@
// Okay, the memset has been formed. Zap the original store and anything that
// feeds into it.
- deleteDeadInstruction(SI, *SE);
+ deleteDeadInstruction(SI, *SE, TLI);
++NumMemCpy;
return true;
}
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 982400c..f5daa7b 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -120,7 +120,7 @@
++NumSimplified;
}
}
- LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I);
+ LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
if (IsSubloopHeader && !isa<PHINode>(I))
break;
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 7eeb152..abe07aa 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -24,6 +24,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;
@@ -256,6 +257,7 @@
return false;
BasicBlock *OrigHeader = L->getHeader();
+ BasicBlock *OrigLatch = L->getLoopLatch();
BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
if (BI == 0 || BI->isUnconditional())
@@ -267,13 +269,9 @@
if (!L->isLoopExiting(OrigHeader))
return false;
- // Updating PHInodes in loops with multiple exits adds complexity.
- // Keep it simple, and restrict loop rotation to loops with one exit only.
- // In future, lift this restriction and support for multiple exits if
- // required.
- SmallVector<BasicBlock*, 8> ExitBlocks;
- L->getExitBlocks(ExitBlocks);
- if (ExitBlocks.size() > 1)
+ // If the loop latch already contains a branch that leaves the loop, then
+ // the loop is already rotated.
+ if (OrigLatch == 0 || L->isLoopExiting(OrigLatch))
return false;
// Check size of original header and reject loop if it is very big.
@@ -286,11 +284,10 @@
// Now, this loop is suitable for rotation.
BasicBlock *OrigPreheader = L->getLoopPreheader();
- BasicBlock *OrigLatch = L->getLoopLatch();
// If the loop could not be converted to canonical form, it must have an
// indirectbr in it, just give up.
- if (OrigPreheader == 0 || OrigLatch == 0)
+ if (OrigPreheader == 0)
return false;
// Anything ScalarEvolution may know about this loop or the PHI nodes
@@ -298,6 +295,8 @@
if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>())
SE->forgetLoop(L);
+ DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+
// Find new Loop header. NewHeader is a Header's one and only successor
// that is inside loop. Header's other successor is outside the
// loop. Otherwise loop is not suitable for rotation.
@@ -408,10 +407,19 @@
// Update DominatorTree to reflect the CFG change we just made. Then split
// edges as necessary to preserve LoopSimplify form.
if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) {
- // Since OrigPreheader now has the conditional branch to Exit block, it is
- // the dominator of Exit.
- DT->changeImmediateDominator(Exit, OrigPreheader);
- DT->changeImmediateDominator(NewHeader, OrigPreheader);
+ // Everything that was dominated by the old loop header is now dominated
+ // by the original loop preheader. Conceptually the header was merged
+ // into the preheader, even though we reuse the actual block as a new
+ // loop latch.
+ DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+ SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
+ OrigHeaderNode->end());
+ DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
+ for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
+ DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
+
+ assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
+ assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
// Update OrigHeader to be dominated by the new header block.
DT->changeImmediateDominator(OrigHeader, OrigLatch);
@@ -440,6 +448,35 @@
// Update OrigHeader to be dominated by the new header block.
DT->changeImmediateDominator(NewHeader, OrigPreheader);
DT->changeImmediateDominator(OrigHeader, OrigLatch);
+
+ // Brute force incremental dominator tree update. Call
+ // findNearestCommonDominator on all CFG predecessors of each child of the
+ // original header.
+ DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+ SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
+ OrigHeaderNode->end());
+ bool Changed;
+ do {
+ Changed = false;
+ for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) {
+ DomTreeNode *Node = HeaderChildren[I];
+ BasicBlock *BB = Node->getBlock();
+
+ pred_iterator PI = pred_begin(BB);
+ BasicBlock *NearestDom = *PI;
+ for (pred_iterator PE = pred_end(BB); PI != PE; ++PI)
+ NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
+
+ // Remember if this changes the DomTree.
+ if (Node->getIDom()->getBlock() != NearestDom) {
+ DT->changeImmediateDominator(BB, NearestDom);
+ Changed = true;
+ }
+ }
+
+ // If the dominator changed, this may affect other predecessors;
+ // continue until we reach a fixpoint.
+ } while (Changed);
}
}
@@ -452,6 +489,8 @@
// emitted code isn't too gross in this common case.
MergeBlockIntoPredecessor(OrigHeader, this);
+ DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+
++NumRotated;
return true;
}
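The brute-force update relies on one fact: a block's immediate dominator is the nearest common dominator of all of its CFG predecessors. One step of the fixpoint, restated as a standalone helper (a sketch under that assumption, not part of the patch):

    #include "llvm/Analysis/Dominators.h"
    #include "llvm/Support/CFG.h"
    #include <cassert>
    using namespace llvm;

    // Recompute BB's idom as the nearest common dominator of all of its
    // predecessors; returns true if the tree changed. Mirrors the loop body.
    static bool updateIDomFromPreds(BasicBlock *BB, DominatorTree *DT) {
      pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
      assert(PI != PE && "an unreachable block has no idom to update");
      BasicBlock *NearestDom = *PI;
      for (; PI != PE; ++PI)
        NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
      if (DT->getNode(BB)->getIDom()->getBlock() == NearestDom)
        return false;
      DT->changeImmediateDominator(BB, NearestDom);
      return true;
    }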
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 0ae7a51..d7495da 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -121,9 +121,11 @@
OS << "[NumUses=" << UsedByIndices.count() << ']';
}
+#ifndef NDEBUG
void RegSortData::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
@@ -414,9 +416,11 @@
}
}
+#ifndef NDEBUG
void Formula::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// isAddRecSExtable - Return true if the given addrec can be sign-extended
/// without changing its value.
@@ -974,9 +978,11 @@
OS << ", plus " << SetupCost << " setup cost";
}
+#ifndef NDEBUG
void Cost::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
@@ -1060,9 +1066,11 @@
OS << ", Offset=" << Offset;
}
+#ifndef NDEBUG
void LSRFixup::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
@@ -1252,9 +1260,11 @@
OS << ", widest fixup type: " << *WidestFixupType;
}
+#ifndef NDEBUG
void LSRUse::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// isLegalUse - Test whether the use described by AM is "legal", meaning it can
/// be completely folded into the user instruction at isel time. This includes
@@ -3436,9 +3446,11 @@
<< " , add offset " << Imm;
}
+#ifndef NDEBUG
void WorkItem::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// GenerateCrossUseConstantOffsets - Look for registers which are a constant
/// distance apart and try to form reuse opportunities between them.
@@ -4731,9 +4743,11 @@
print_uses(OS);
}
+#ifndef NDEBUG
void LSRInstance::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
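Every dump() body above receives the same guard. The pattern, stated once on a stand-in type: print() takes an explicit stream and stays in all builds, while the debugger convenience dump() is compiled only when assertions are enabled, keeping release binaries smaller.

    #include "llvm/Support/raw_ostream.h"

    // Widget is a stand-in; the real classes are RegSortData, Formula, etc.
    struct Widget {
      void print(llvm::raw_ostream &OS) const { OS << "widget"; } // all builds
    #ifndef NDEBUG
      // Debugger convenience; only exists in +Asserts builds.
      void dump() const { print(llvm::errs()); llvm::errs() << '\n'; }
    #endif
    };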
diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp
index 3222f20..dce8e8b 100644
--- a/lib/Transforms/Scalar/ObjCARC.cpp
+++ b/lib/Transforms/Scalar/ObjCARC.cpp
@@ -1236,16 +1236,19 @@
// An ObjC-Identified object can't alias a load if it is never locally stored.
if (AIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(B))
+ return isStoredObjCPointer(A);
if (BIsIdentified) {
- // If both pointers have provenance, they can be directly compared.
- if (A != B)
- return false;
- } else {
- if (isa<LoadInst>(B))
- return isStoredObjCPointer(A);
+ // Check for an obvious escape.
+ if (isa<LoadInst>(A))
+ return isStoredObjCPointer(B);
+ // Both pointers are identified and escapes aren't an evident problem.
+ return false;
}
- } else {
- if (BIsIdentified && isa<LoadInst>(A))
+ } else if (BIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(A))
return isStoredObjCPointer(B);
}
@@ -1381,9 +1384,6 @@
/// PtrState - This class summarizes several per-pointer runtime properties
/// which are propagated through the flow graph.
class PtrState {
- /// NestCount - The known minimum level of retain+release nesting.
- unsigned NestCount;
-
/// KnownPositiveRefCount - True if the reference count is known to
/// be incremented.
bool KnownPositiveRefCount;
@@ -1401,7 +1401,7 @@
/// TODO: Encapsulate this better.
RRInfo RRI;
- PtrState() : NestCount(0), KnownPositiveRefCount(false), Partial(false),
+ PtrState() : KnownPositiveRefCount(false), Partial(false),
Seq(S_None) {}
void SetKnownPositiveRefCount() {
@@ -1416,18 +1416,6 @@
return KnownPositiveRefCount;
}
- void IncrementNestCount() {
- if (NestCount != UINT_MAX) ++NestCount;
- }
-
- void DecrementNestCount() {
- if (NestCount != 0) --NestCount;
- }
-
- bool IsKnownNested() const {
- return NestCount > 0;
- }
-
void SetSeq(Sequence NewSeq) {
Seq = NewSeq;
}
@@ -1454,7 +1442,6 @@
PtrState::Merge(const PtrState &Other, bool TopDown) {
Seq = MergeSeqs(Seq, Other.Seq, TopDown);
KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount;
- NestCount = std::min(NestCount, Other.NestCount);
// We can't merge a plain objc_retain with an objc_retainBlock.
if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock)
@@ -1868,6 +1855,26 @@
return AutoreleaseCallee;
}
+/// IsPotentialUse - Test whether the given value is possibly a
+/// reference-counted pointer, including tests that use AliasAnalysis.
+static bool IsPotentialUse(const Value *Op, AliasAnalysis &AA) {
+ // First make the rudimentary check.
+ if (!IsPotentialUse(Op))
+ return false;
+
+ // Objects in constant memory are not reference-counted.
+ if (AA.pointsToConstantMemory(Op))
+ return false;
+
+ // Pointers in constant memory do not point to reference-counted objects.
+ if (const LoadInst *LI = dyn_cast<LoadInst>(Op))
+ if (AA.pointsToConstantMemory(LI->getPointerOperand()))
+ return false;
+
+ // Otherwise assume the worst.
+ return true;
+}
+
/// CanAlterRefCount - Test whether the given instruction can result in a
/// reference count modification (positive or negative) for the pointer's
/// object.
@@ -1894,7 +1901,7 @@
for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
I != E; ++I) {
const Value *Op = *I;
- if (IsPotentialUse(Op) && PA.related(Ptr, Op))
+ if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op))
return true;
}
return false;
@@ -1919,14 +1926,14 @@
// Comparing a pointer with null, or any other constant, isn't really a use,
// because we don't care what the pointer points to, or about the values
// of any other dynamic reference-counted pointers.
- if (!IsPotentialUse(ICI->getOperand(1)))
+ if (!IsPotentialUse(ICI->getOperand(1), *PA.getAA()))
return false;
} else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) {
// For calls, just check the arguments (and not the callee operand).
for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(),
OE = CS.arg_end(); OI != OE; ++OI) {
const Value *Op = *OI;
- if (IsPotentialUse(Op) && PA.related(Ptr, Op))
+ if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op))
return true;
}
return false;
@@ -1936,14 +1943,14 @@
const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand());
// If we can't tell what the underlying object was, assume there is a
// dependence.
- return IsPotentialUse(Op) && PA.related(Op, Ptr);
+ return IsPotentialUse(Op, *PA.getAA()) && PA.related(Op, Ptr);
}
// Check each operand for a match.
for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end();
OI != OE; ++OI) {
const Value *Op = *OI;
- if (IsPotentialUse(Op) && PA.related(Ptr, Op))
+ if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op))
return true;
}
return false;
@@ -2612,11 +2619,11 @@
MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release);
S.RRI.ReleaseMetadata = ReleaseMetadata;
- S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented();
+ S.RRI.KnownSafe = S.IsKnownIncremented();
S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
S.RRI.Calls.insert(Inst);
- S.IncrementNestCount();
+ S.SetKnownPositiveRefCount();
break;
}
case IC_RetainBlock:
@@ -2631,7 +2638,6 @@
PtrState &S = MyStates.getPtrBottomUpState(Arg);
S.SetKnownPositiveRefCount();
- S.DecrementNestCount();
switch (S.GetSeq()) {
case S_Stop:
@@ -2747,8 +2753,9 @@
// Merge the states from each successor to compute the initial state
// for the current block.
- for (BBState::edge_iterator SI(MyStates.succ_begin()),
- SE(MyStates.succ_end()); SI != SE; ++SI) {
+ BBState::edge_iterator SI(MyStates.succ_begin()),
+ SE(MyStates.succ_end());
+ if (SI != SE) {
const BasicBlock *Succ = *SI;
DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
assert(I != BBStates.end());
@@ -2760,7 +2767,6 @@
assert(I != BBStates.end());
MyStates.MergeSucc(I->second);
}
- break;
}
// Visit all the instructions, bottom-up.
@@ -2823,12 +2829,11 @@
S.ResetSequenceProgress(S_Retain);
S.RRI.IsRetainBlock = Class == IC_RetainBlock;
- // Don't check S.IsKnownIncremented() here because it's not sufficient.
- S.RRI.KnownSafe = S.IsKnownNested();
+ S.RRI.KnownSafe = S.IsKnownIncremented();
S.RRI.Calls.insert(Inst);
}
- S.IncrementNestCount();
+ S.SetKnownPositiveRefCount();
// A retain can be a potential use; proceed to the generic checking
// code below.
@@ -2838,7 +2843,7 @@
Arg = GetObjCArg(Inst);
PtrState &S = MyStates.getPtrTopDownState(Arg);
- S.DecrementNestCount();
+ S.ClearRefCount();
switch (S.GetSeq()) {
case S_Retain:
@@ -2935,8 +2940,9 @@
// Merge the states from each predecessor to compute the initial state
// for the current block.
- for (BBState::edge_iterator PI(MyStates.pred_begin()),
- PE(MyStates.pred_end()); PI != PE; ++PI) {
+ BBState::edge_iterator PI(MyStates.pred_begin()),
+ PE(MyStates.pred_end());
+ if (PI != PE) {
const BasicBlock *Pred = *PI;
DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
assert(I != BBStates.end());
@@ -2948,7 +2954,6 @@
assert(I != BBStates.end());
MyStates.MergePred(I->second);
}
- break;
}
// Visit all the instructions, top-down.
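Both visit loops are reshaped the same way: the old for loop always broke out after its first iteration, so it was effectively an if that seeded the block state from the first edge and merged in the rest. A generic restatement of that shape (the element type, Acc and merge() are stand-ins):

    // Seed an accumulator from the first element, then fold in the rest.
    // Returns false when the range is empty; Acc then keeps its bottom value.
    template <typename Iter, typename T>
    static bool seedThenMerge(Iter I, Iter E, T &Acc) {
      if (I == E)
        return false;
      Acc = *I;               // seed from the first element
      for (++I; I != E; ++I)
        Acc.merge(*I);        // fold each remaining element into the summary
      return true;
    }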
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index d13e4ab..6d27db1 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -59,9 +59,9 @@
return new CFGSimplifyPass();
}
-/// ChangeToUnreachable - Insert an unreachable instruction before the specified
+/// changeToUnreachable - Insert an unreachable instruction before the specified
/// instruction, making it and the rest of the code in the block dead.
-static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) {
+static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) {
BasicBlock *BB = I->getParent();
// Loop over all of the successors, removing BB's entry from any PHI
// nodes.
@@ -87,8 +87,8 @@
}
}
-/// ChangeToCall - Convert the specified invoke into a normal call.
-static void ChangeToCall(InvokeInst *II) {
+/// changeToCall - Convert the specified invoke into a normal call.
+static void changeToCall(InvokeInst *II) {
SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II);
NewCall->takeName(II);
@@ -105,7 +105,7 @@
II->eraseFromParent();
}
-static bool MarkAliveBlocks(BasicBlock *BB,
+static bool markAliveBlocks(BasicBlock *BB,
SmallPtrSet<BasicBlock*, 128> &Reachable) {
SmallVector<BasicBlock*, 128> Worklist;
@@ -129,7 +129,7 @@
++BBI;
if (!isa<UnreachableInst>(BBI)) {
// Don't insert a call to llvm.trap right before the unreachable.
- ChangeToUnreachable(BBI, false);
+ changeToUnreachable(BBI, false);
Changed = true;
}
break;
@@ -148,7 +148,7 @@
if (isa<UndefValue>(Ptr) ||
(isa<ConstantPointerNull>(Ptr) &&
SI->getPointerAddressSpace() == 0)) {
- ChangeToUnreachable(SI, true);
+ changeToUnreachable(SI, true);
Changed = true;
break;
}
@@ -159,7 +159,7 @@
if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
Value *Callee = II->getCalledValue();
if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
- ChangeToUnreachable(II, true);
+ changeToUnreachable(II, true);
Changed = true;
} else if (II->doesNotThrow()) {
if (II->use_empty() && II->onlyReadsMemory()) {
@@ -168,7 +168,7 @@
II->getUnwindDest()->removePredecessor(II->getParent());
II->eraseFromParent();
} else
- ChangeToCall(II);
+ changeToCall(II);
Changed = true;
}
}
@@ -180,12 +180,12 @@
return Changed;
}
-/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even
+/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even
/// if they are in a dead cycle. Return true if a change was made, false
/// otherwise.
-static bool RemoveUnreachableBlocksFromFn(Function &F) {
+static bool removeUnreachableBlocksFromFn(Function &F) {
SmallPtrSet<BasicBlock*, 128> Reachable;
- bool Changed = MarkAliveBlocks(F.begin(), Reachable);
+ bool Changed = markAliveBlocks(F.begin(), Reachable);
// If there are unreachable blocks in the CFG...
if (Reachable.size() == F.size())
@@ -215,9 +215,9 @@
return true;
}
-/// MergeEmptyReturnBlocks - If we have more than one empty (other than phi
+/// mergeEmptyReturnBlocks - If we have more than one empty (other than phi
/// node) return blocks, merge them together to promote recursive block merging.
-static bool MergeEmptyReturnBlocks(Function &F) {
+static bool mergeEmptyReturnBlocks(Function &F) {
bool Changed = false;
BasicBlock *RetBlock = 0;
@@ -291,9 +291,9 @@
return Changed;
}
-/// IterativeSimplifyCFG - Call SimplifyCFG on all the blocks in the function,
+/// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function,
/// iterating until no more changes are made.
-static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) {
+static bool iterativelySimplifyCFG(Function &F, const TargetData *TD) {
bool Changed = false;
bool LocalChange = true;
while (LocalChange) {
@@ -317,24 +317,24 @@
//
bool CFGSimplifyPass::runOnFunction(Function &F) {
const TargetData *TD = getAnalysisIfAvailable<TargetData>();
- bool EverChanged = RemoveUnreachableBlocksFromFn(F);
- EverChanged |= MergeEmptyReturnBlocks(F);
- EverChanged |= IterativeSimplifyCFG(F, TD);
+ bool EverChanged = removeUnreachableBlocksFromFn(F);
+ EverChanged |= mergeEmptyReturnBlocks(F);
+ EverChanged |= iterativelySimplifyCFG(F, TD);
// If neither pass changed anything, we're done.
if (!EverChanged) return false;
- // IterativeSimplifyCFG can (rarely) make some loops dead. If this happens,
- // RemoveUnreachableBlocksFromFn is needed to nuke them, which means we should
+ // iterativelySimplifyCFG can (rarely) make some loops dead. If this happens,
+ // removeUnreachableBlocksFromFn is needed to nuke them, which means we should
// iterate between the two optimizations. We structure the code like this to
- // avoid reruning IterativeSimplifyCFG if the second pass of
- // RemoveUnreachableBlocksFromFn doesn't do anything.
- if (!RemoveUnreachableBlocksFromFn(F))
+ // avoid rerunning iterativelySimplifyCFG if the second pass of
+ // removeUnreachableBlocksFromFn doesn't do anything.
+ if (!removeUnreachableBlocksFromFn(F))
return true;
do {
- EverChanged = IterativeSimplifyCFG(F, TD);
- EverChanged |= RemoveUnreachableBlocksFromFn(F);
+ EverChanged = iterativelySimplifyCFG(F, TD);
+ EverChanged |= removeUnreachableBlocksFromFn(F);
} while (EverChanged);
return true;
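markAliveBlocks is a standard worklist reachability sweep with the per-block rewrites layered on top. Its traversal skeleton on its own (a sketch; in this era SmallPtrSet::insert returns false for an already-visited element):

    #include "llvm/Function.h"
    #include "llvm/Support/CFG.h"
    #include "llvm/ADT/SmallPtrSet.h"
    #include "llvm/ADT/SmallVector.h"
    using namespace llvm;

    static void sweepReachable(Function &F,
                               SmallPtrSet<BasicBlock*, 128> &Reachable) {
      SmallVector<BasicBlock*, 128> Worklist;
      Worklist.push_back(&F.getEntryBlock());
      while (!Worklist.empty()) {
        BasicBlock *BB = Worklist.pop_back_val();
        if (!Reachable.insert(BB))
          continue;                 // already visited
        // ... per-block rewrites (trap insertion, invoke->call) go here ...
        for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE;
             ++SI)
          Worklist.push_back(*SI);
      }
    }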
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
index 3904419..65311fe 100644
--- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -28,6 +28,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetData.h"
@@ -38,6 +39,10 @@
STATISTIC(NumSimplified, "Number of library calls simplified");
STATISTIC(NumAnnotated, "Number of attributes added to library functions");
+static cl::opt<bool> UnsafeFPShrink("enable-double-float-shrink", cl::Hidden,
+ cl::init(false),
+ cl::desc("Enable unsafe double to float "
+ "shrinking for math lib calls"));
//===----------------------------------------------------------------------===//
// Optimizer Base Class
//===----------------------------------------------------------------------===//
@@ -893,16 +898,56 @@
//===----------------------------------------------------------------------===//
//===---------------------------------------===//
-// 'cos*' Optimizations
+// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
+struct UnaryDoubleFPOpt : public LibCallOptimization {
+ bool CheckRetType;
+ UnaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {}
+ virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() ||
+ !FT->getParamType(0)->isDoubleTy())
+ return 0;
+
+ if (CheckRetType) {
+ // Check whether all uses of functions like 'sin' are converted to float.
+ for (Value::use_iterator UseI = CI->use_begin(); UseI != CI->use_end();
+ ++UseI) {
+ FPTruncInst *Cast = dyn_cast<FPTruncInst>(*UseI);
+ if (Cast == 0 || !Cast->getType()->isFloatTy())
+ return 0;
+ }
+ }
+
+ // If this is something like 'floor((double)floatval)', convert to floorf.
+ FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
+ if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy())
+ return 0;
+
+ // floor((double)floatval) -> (double)floorf(floatval)
+ Value *V = Cast->getOperand(0);
+ V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
+ return B.CreateFPExt(V, B.getDoubleTy());
+ }
+};
+
+//===---------------------------------------===//
+// 'cos*' Optimizations
struct CosOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ Value *Ret = NULL;
+ if (UnsafeFPShrink && Callee->getName() == "cos" &&
+ TLI->has(LibFunc::cosf)) {
+ UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+ Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B);
+ }
+
FunctionType *FT = Callee->getFunctionType();
// Just make sure this has 1 argument of FP type, which matches the
// result type.
if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
!FT->getParamType(0)->isFloatingPointTy())
- return 0;
+ return Ret;
// cos(-x) -> cos(x)
Value *Op1 = CI->getArgOperand(0);
@@ -910,7 +955,7 @@
BinaryOperator *BinExpr = cast<BinaryOperator>(Op1);
return B.CreateCall(Callee, BinExpr->getOperand(1), "cos");
}
- return 0;
+ return Ret;
}
};
@@ -919,13 +964,20 @@
struct PowOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ Value *Ret = NULL;
+ if (UnsafeFPShrink && Callee->getName() == "pow" &&
+ TLI->has(LibFunc::powf)) {
+ UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+ Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B);
+ }
+
FunctionType *FT = Callee->getFunctionType();
// Just make sure this has 2 arguments of the same FP type, which match the
// result type.
if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
FT->getParamType(0) != FT->getParamType(1) ||
!FT->getParamType(0)->isFloatingPointTy())
- return 0;
+ return Ret;
Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
@@ -936,7 +988,7 @@
}
ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);
- if (Op2C == 0) return 0;
+ if (Op2C == 0) return Ret;
if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0
return ConstantFP::get(CI->getType(), 1.0);
@@ -974,12 +1026,19 @@
struct Exp2Opt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ Value *Ret = NULL;
+ if (UnsafeFPShrink && Callee->getName() == "exp2" &&
+ TLI->has(LibFunc::exp2f)) {
+ UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+ Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B);
+ }
+
FunctionType *FT = Callee->getFunctionType();
// Just make sure this has 1 argument of FP type, which matches the
// result type.
if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
!FT->getParamType(0)->isFloatingPointTy())
- return 0;
+ return Ret;
Value *Op = CI->getArgOperand(0);
// Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32
@@ -1016,29 +1075,7 @@
return CI;
}
- return 0;
- }
-};
-
-//===---------------------------------------===//
-// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
-
-struct UnaryDoubleFPOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() ||
- !FT->getParamType(0)->isDoubleTy())
- return 0;
-
- // If this is something like 'floor((double)floatval)', convert to floorf.
- FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
- if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy())
- return 0;
-
- // floor((double)floatval) -> (double)floorf(floatval)
- Value *V = Cast->getOperand(0);
- V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
- return B.CreateFPExt(V, B.getDoubleTy());
+ return Ret;
}
};
@@ -1534,7 +1571,8 @@
StrToOpt StrTo; StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr;
MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet;
// Math Library Optimizations
- CosOpt Cos; PowOpt Pow; Exp2Opt Exp2; UnaryDoubleFPOpt UnaryDoubleFP;
+ CosOpt Cos; PowOpt Pow; Exp2Opt Exp2;
+ UnaryDoubleFPOpt UnaryDoubleFP, UnsafeUnaryDoubleFP;
// Integer Optimizations
FFSOpt FFS; AbsOpt Abs; IsDigitOpt IsDigit; IsAsciiOpt IsAscii;
ToAsciiOpt ToAscii;
@@ -1547,10 +1585,13 @@
public:
static char ID; // Pass identification
SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true),
- StpCpy(false), StpCpyChk(true) {
+ StpCpy(false), StpCpyChk(true),
+ UnaryDoubleFP(false), UnsafeUnaryDoubleFP(true) {
initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
}
void AddOpt(LibFunc::Func F, LibCallOptimization* Opt);
+ void AddOpt(LibFunc::Func F1, LibFunc::Func F2, LibCallOptimization* Opt);
+
void InitOptimizations();
bool runOnFunction(Function &F);
@@ -1586,6 +1627,12 @@
Optimizations[TLI->getName(F)] = Opt;
}
+void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2,
+ LibCallOptimization* Opt) {
+ if (TLI->has(F1) && TLI->has(F2))
+ Optimizations[TLI->getName(F1)] = Opt;
+}
+
/// Optimizations - Populate the Optimizations map with all the optimizations
/// we know.
void SimplifyLibCalls::InitOptimizations() {
@@ -1641,20 +1688,37 @@
Optimizations["llvm.exp2.f64"] = &Exp2;
Optimizations["llvm.exp2.f32"] = &Exp2;
- if (TLI->has(LibFunc::fabs) && TLI->has(LibFunc::fabsf))
- Optimizations["fabs"] = &UnaryDoubleFP;
- if (TLI->has(LibFunc::floor) && TLI->has(LibFunc::floorf))
- Optimizations["floor"] = &UnaryDoubleFP;
- if (TLI->has(LibFunc::ceil) && TLI->has(LibFunc::ceilf))
- Optimizations["ceil"] = &UnaryDoubleFP;
- if (TLI->has(LibFunc::round) && TLI->has(LibFunc::roundf))
- Optimizations["round"] = &UnaryDoubleFP;
- if (TLI->has(LibFunc::rint) && TLI->has(LibFunc::rintf))
- Optimizations["rint"] = &UnaryDoubleFP;
- if (TLI->has(LibFunc::nearbyint) && TLI->has(LibFunc::nearbyintf))
- Optimizations["nearbyint"] = &UnaryDoubleFP;
- if (TLI->has(LibFunc::trunc) && TLI->has(LibFunc::truncf))
- Optimizations["trunc"] = &UnaryDoubleFP;
+ AddOpt(LibFunc::ceil, LibFunc::ceilf, &UnaryDoubleFP);
+ AddOpt(LibFunc::fabs, LibFunc::fabsf, &UnaryDoubleFP);
+ AddOpt(LibFunc::floor, LibFunc::floorf, &UnaryDoubleFP);
+ AddOpt(LibFunc::rint, LibFunc::rintf, &UnaryDoubleFP);
+ AddOpt(LibFunc::round, LibFunc::roundf, &UnaryDoubleFP);
+ AddOpt(LibFunc::nearbyint, LibFunc::nearbyintf, &UnaryDoubleFP);
+ AddOpt(LibFunc::trunc, LibFunc::truncf, &UnaryDoubleFP);
+
+ if (UnsafeFPShrink) {
+ AddOpt(LibFunc::acos, LibFunc::acosf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::acosh, LibFunc::acoshf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::asin, LibFunc::asinf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::asinh, LibFunc::asinhf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::atan, LibFunc::atanf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::atanh, LibFunc::atanhf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::cbrt, LibFunc::cbrtf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::cosh, LibFunc::coshf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::exp, LibFunc::expf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::exp10, LibFunc::exp10f, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::expm1, LibFunc::expm1f, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::log, LibFunc::logf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::log10, LibFunc::log10f, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::log1p, LibFunc::log1pf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::log2, LibFunc::log2f, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::logb, LibFunc::logbf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::sin, LibFunc::sinf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::sinh, LibFunc::sinhf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::sqrt, LibFunc::sqrtf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::tan, LibFunc::tanf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::tanh, LibFunc::tanhf, &UnsafeUnaryDoubleFP);
+ }
// Integer Optimizations
Optimizations["ffs"] = &FFS;
diff --git a/lib/Transforms/Utils/AddrModeMatcher.cpp b/lib/Transforms/Utils/AddrModeMatcher.cpp
index d831452..1e6586b 100644
--- a/lib/Transforms/Utils/AddrModeMatcher.cpp
+++ b/lib/Transforms/Utils/AddrModeMatcher.cpp
@@ -55,10 +55,12 @@
OS << ']';
}
+#ifndef NDEBUG
void ExtAddrMode::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode.
diff --git a/lib/Transforms/Utils/Android.mk b/lib/Transforms/Utils/Android.mk
index 5a908d0..c3f0789 100644
--- a/lib/Transforms/Utils/Android.mk
+++ b/lib/Transforms/Utils/Android.mk
@@ -5,6 +5,7 @@
BasicBlockUtils.cpp \
BreakCriticalEdges.cpp \
BuildLibCalls.cpp \
+ BypassSlowDivision.cpp \
CloneFunction.cpp \
CloneModule.cpp \
CmpInstAnalysis.cpp \
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 2679b93..75a7817 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -94,7 +94,7 @@
/// is dead. Also recursively delete any operands that become dead as
/// a result. This includes tracing the def-use list from the PHI to see if
/// it is ultimately unused or if it reaches an unused cycle.
-bool llvm::DeleteDeadPHIs(BasicBlock *BB) {
+bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
// Recursively deleting a PHI may cause multiple PHIs to be deleted
// or RAUW'd undef, so use an array of WeakVH for the PHIs to delete.
SmallVector<WeakVH, 8> PHIs;
@@ -105,7 +105,7 @@
bool Changed = false;
for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i].operator Value*()))
- Changed |= RecursivelyDeleteDeadPHINode(PN);
+ Changed |= RecursivelyDeleteDeadPHINode(PN, TLI);
return Changed;
}
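The WeakVH indirection matters because deleting one PHI can delete or RAUW-to-undef other PHIs in the same block; a raw Instruction* snapshot would dangle, while a WeakVH nulls itself. The guard restated as a self-contained sketch:

    #include "llvm/BasicBlock.h"
    #include "llvm/Instructions.h"
    #include "llvm/Support/ValueHandle.h"
    #include "llvm/Target/TargetLibraryInfo.h"
    #include "llvm/Transforms/Utils/Local.h"
    #include "llvm/ADT/SmallVector.h"
    using namespace llvm;

    static bool deleteDeadPHIsSketch(BasicBlock *BB,
                                     const TargetLibraryInfo *TLI) {
      // Snapshot through weak handles first, then process; handles whose
      // PHI died in the meantime read back as null and are skipped.
      SmallVector<WeakVH, 8> PHIs;
      for (BasicBlock::iterator I = BB->begin();
           PHINode *PN = dyn_cast<PHINode>(I); ++I)
        PHIs.push_back(PN);
      bool Changed = false;
      for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
        if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i].operator Value*()))
          Changed |= RecursivelyDeleteDeadPHINode(PN, TLI);
      return Changed;
    }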
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
new file mode 100644
index 0000000..30d60be
--- /dev/null
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -0,0 +1,253 @@
+//===-- BypassSlowDivision.cpp - Bypass slow division ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an optimization for div and rem on architectures that
+// execute short instructions significantly faster than longer instructions.
+// For example, on Intel Atom, 32-bit divides are slow enough that at run time
+// it is profitable to check the values of the operands and, if both are
+// positive and less than 256, use an unsigned 8-bit divide.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "bypass-slow-division"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+
+using namespace llvm;
+
+namespace {
+ struct DivOpInfo {
+ bool SignedOp;
+ Value *Dividend;
+ Value *Divisor;
+
+ DivOpInfo(bool InSignedOp, Value *InDividend, Value *InDivisor)
+ : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
+ };
+
+ struct DivPhiNodes {
+ PHINode *Quotient;
+ PHINode *Remainder;
+
+ DivPhiNodes(PHINode *InQuotient, PHINode *InRemainder)
+ : Quotient(InQuotient), Remainder(InRemainder) {}
+ };
+}
+
+namespace llvm {
+ template<>
+ struct DenseMapInfo<DivOpInfo> {
+ static bool isEqual(const DivOpInfo &Val1, const DivOpInfo &Val2) {
+ return Val1.SignedOp == Val2.SignedOp &&
+ Val1.Dividend == Val2.Dividend &&
+ Val1.Divisor == Val2.Divisor;
+ }
+
+ static DivOpInfo getEmptyKey() {
+ return DivOpInfo(false, 0, 0);
+ }
+
+ static DivOpInfo getTombstoneKey() {
+ return DivOpInfo(true, 0, 0);
+ }
+
+ static unsigned getHashValue(const DivOpInfo &Val) {
+ return (unsigned)(reinterpret_cast<uintptr_t>(Val.Dividend) ^
+ reinterpret_cast<uintptr_t>(Val.Divisor)) ^
+ (unsigned)Val.SignedOp;
+ }
+ };
+
+ typedef DenseMap<DivOpInfo, DivPhiNodes> DivCacheTy;
+}
+
+// insertFastDiv - Substitutes the div/rem instruction with code that checks the
+// value of the operands and uses a shorter, faster div/rem instruction when
+// possible and the longer, slower div/rem instruction otherwise.
+static bool insertFastDiv(Function &F,
+ Function::iterator &I,
+ BasicBlock::iterator &J,
+ IntegerType *BypassType,
+ bool UseDivOp,
+ bool UseSignedOp,
+ DivCacheTy &PerBBDivCache) {
+ // Get instruction operands
+ Instruction *Instr = J;
+ Value *Dividend = Instr->getOperand(0);
+ Value *Divisor = Instr->getOperand(1);
+
+ if (isa<ConstantInt>(Divisor) ||
+ (isa<ConstantInt>(Dividend) && isa<ConstantInt>(Divisor))) {
+ // Operations with constant operands should have been
+ // folded and replaced at compile time.
+ return false;
+ }
+
+ // Basic Block is split before divide
+ BasicBlock *MainBB = I;
+ BasicBlock *SuccessorBB = I->splitBasicBlock(J);
+ ++I; // Advance iterator I to SuccessorBB.
+
+ // Add new basic block for slow divide operation
+ BasicBlock *SlowBB = BasicBlock::Create(F.getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ SlowBB->moveBefore(SuccessorBB);
+ IRBuilder<> SlowBuilder(SlowBB, SlowBB->begin());
+ Value *SlowQuotientV;
+ Value *SlowRemainderV;
+ if (UseSignedOp) {
+ SlowQuotientV = SlowBuilder.CreateSDiv(Dividend, Divisor);
+ SlowRemainderV = SlowBuilder.CreateSRem(Dividend, Divisor);
+ } else {
+ SlowQuotientV = SlowBuilder.CreateUDiv(Dividend, Divisor);
+ SlowRemainderV = SlowBuilder.CreateURem(Dividend, Divisor);
+ }
+ SlowBuilder.CreateBr(SuccessorBB);
+
+ // Add new basic block for fast divide operation
+ BasicBlock *FastBB = BasicBlock::Create(F.getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ FastBB->moveBefore(SlowBB);
+ IRBuilder<> FastBuilder(FastBB, FastBB->begin());
+ Value *ShortDivisorV = FastBuilder.CreateCast(Instruction::Trunc, Divisor,
+ BypassType);
+ Value *ShortDividendV = FastBuilder.CreateCast(Instruction::Trunc, Dividend,
+ BypassType);
+
+ // udiv/urem because the optimization only handles non-negative operands
+ Value *ShortQuotientV = FastBuilder.CreateUDiv(ShortDividendV,
+ ShortDivisorV);
+ Value *ShortRemainderV = FastBuilder.CreateURem(ShortDividendV,
+ ShortDivisorV);
+ Value *FastQuotientV = FastBuilder.CreateCast(Instruction::ZExt,
+ ShortQuotientV,
+ Dividend->getType());
+ Value *FastRemainderV = FastBuilder.CreateCast(Instruction::ZExt,
+ ShortRemainderV,
+ Dividend->getType());
+ FastBuilder.CreateBr(SuccessorBB);
+
+ // Phi nodes for result of div and rem
+ IRBuilder<> SuccessorBuilder(SuccessorBB, SuccessorBB->begin());
+ PHINode *QuoPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2);
+ QuoPhi->addIncoming(SlowQuotientV, SlowBB);
+ QuoPhi->addIncoming(FastQuotientV, FastBB);
+ PHINode *RemPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2);
+ RemPhi->addIncoming(SlowRemainderV, SlowBB);
+ RemPhi->addIncoming(FastRemainderV, FastBB);
+
+ // Replace Instr with appropriate phi node
+ if (UseDivOp)
+ Instr->replaceAllUsesWith(QuoPhi);
+ else
+ Instr->replaceAllUsesWith(RemPhi);
+ Instr->eraseFromParent();
+
+ // Combine operands into a single value with OR for value testing below
+ MainBB->getInstList().back().eraseFromParent();
+ IRBuilder<> MainBuilder(MainBB, MainBB->end());
+ Value *OrV = MainBuilder.CreateOr(Dividend, Divisor);
+
+ // BitMask is inverted to check if the operands are
+ // larger than the bypass type
+ uint64_t BitMask = ~BypassType->getBitMask();
+ Value *AndV = MainBuilder.CreateAnd(OrV, BitMask);
+
+ // Compare operand values and branch
+ Value *ZeroV = MainBuilder.getInt32(0);
+ Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV);
+ MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB);
+
+ // Point iterator J at the first instruction of SuccessorBB.
+ J = I->begin();
+
+ // Cache phi nodes to be used later in place of other instances
+ // of div or rem with the same sign, dividend, and divisor
+ DivOpInfo Key(UseSignedOp, Dividend, Divisor);
+ DivPhiNodes Value(QuoPhi, RemPhi);
+ PerBBDivCache.insert(std::pair<DivOpInfo, DivPhiNodes>(Key, Value));
+ return true;
+}
+
+// reuseOrInsertFastDiv - Reuses a previously computed quotient or remainder if
+// the operands and operation are identical. Otherwise calls insertFastDiv to
+// perform the optimization and caches the resulting quotient and remainder.
+static bool reuseOrInsertFastDiv(Function &F,
+ Function::iterator &I,
+ BasicBlock::iterator &J,
+ IntegerType *BypassType,
+ bool UseDivOp,
+ bool UseSignedOp,
+ DivCacheTy &PerBBDivCache) {
+ // Get instruction operands
+ Instruction *Instr = J;
+ DivOpInfo Key(UseSignedOp, Instr->getOperand(0), Instr->getOperand(1));
+ DivCacheTy::iterator CacheI = PerBBDivCache.find(Key);
+
+ if (CacheI == PerBBDivCache.end()) {
+ // If previous instance does not exist, insert fast div
+ return insertFastDiv(F, I, J, BypassType, UseDivOp, UseSignedOp,
+ PerBBDivCache);
+ }
+
+ // Replace operation value with previously generated phi node
+ DivPhiNodes &Value = CacheI->second;
+ if (UseDivOp) {
+ // Replace all uses of div instruction with quotient phi node
+ J->replaceAllUsesWith(Value.Quotient);
+ } else {
+ // Replace all uses of rem instruction with remainder phi node
+ J->replaceAllUsesWith(Value.Remainder);
+ }
+
+ // Advance to next operation
+ ++J;
+
+ // Remove redundant operation
+ Instr->eraseFromParent();
+ return true;
+}
+
+// bypassSlowDivision - This optimization identifies DIV instructions that can
+// be profitably bypassed and carried out with a shorter, faster divide.
+bool llvm::bypassSlowDivision(Function &F,
+ Function::iterator &I,
+ const DenseMap<Type *, Type *> &BypassTypeMap) {
+ DivCacheTy DivCache;
+
+ bool MadeChange = false;
+ for (BasicBlock::iterator J = I->begin(); J != I->end(); J++) {
+
+ // Get instruction details
+ unsigned Opcode = J->getOpcode();
+ bool UseDivOp = Opcode == Instruction::SDiv || Opcode == Instruction::UDiv;
+ bool UseRemOp = Opcode == Instruction::SRem || Opcode == Instruction::URem;
+ bool UseSignedOp = Opcode == Instruction::SDiv ||
+ Opcode == Instruction::SRem;
+
+ // Only optimize div or rem ops
+ if (!UseDivOp && !UseRemOp)
+ continue;
+
+ // Continue if div/rem type is not bypassed
+ DenseMap<Type *, Type *>::const_iterator BT =
+ BypassTypeMap.find(J->getType());
+ if (BT == BypassTypeMap.end())
+ continue;
+
+ IntegerType *BypassType = cast<IntegerType>(BT->second);
+ MadeChange |= reuseOrInsertFastDiv(F, I, J, BypassType, UseDivOp,
+ UseSignedOp, DivCache);
+ }
+
+ return MadeChange;
+}
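What the emitted CFG computes, written as source-level control flow for the 32-bit-through-8-bit case (the function name is illustrative; the mask and casts follow from BypassType):

    #include <stdint.h>

    static uint32_t dividePossiblyBypassed(uint32_t a, uint32_t b) {
      // One combined test: (a | b) has no bits above the bypass width
      // exactly when both operands fit in 8 bits, in which case the short
      // divide (cheap on e.g. Intel Atom) gives the same answer.
      if (((a | b) & ~(uint32_t)0xFF) == 0)
        return (uint32_t)((uint8_t)a / (uint8_t)b);  // fast path
      return a / b;                                  // full-width slow path
    }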
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index 4ff31ca..215a16f 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -3,6 +3,7 @@
BasicBlockUtils.cpp
BreakCriticalEdges.cpp
BuildLibCalls.cpp
+ BypassSlowDivision.cpp
CloneFunction.cpp
CloneModule.cpp
CmpInstAnalysis.cpp
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index bed7d72..0601433 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -52,7 +52,8 @@
/// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch
/// conditions and indirectbr addresses this might make dead if
/// DeleteDeadConditions is true.
-bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) {
+bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
+ const TargetLibraryInfo *TLI) {
TerminatorInst *T = BB->getTerminator();
IRBuilder<> Builder(T);
@@ -96,7 +97,7 @@
Value *Cond = BI->getCondition();
BI->eraseFromParent();
if (DeleteDeadConditions)
- RecursivelyDeleteTriviallyDeadInstructions(Cond);
+ RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
return true;
}
return false;
@@ -161,7 +162,7 @@
Value *Cond = SI->getCondition();
SI->eraseFromParent();
if (DeleteDeadConditions)
- RecursivelyDeleteTriviallyDeadInstructions(Cond);
+ RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
return true;
}
@@ -205,7 +206,7 @@
Value *Address = IBI->getAddress();
IBI->eraseFromParent();
if (DeleteDeadConditions)
- RecursivelyDeleteTriviallyDeadInstructions(Address);
+ RecursivelyDeleteTriviallyDeadInstructions(Address, TLI);
// If we didn't find our destination in the IBI successor list, then we
// have undefined behavior. Replace the unconditional branch with an
@@ -230,7 +231,8 @@
/// isInstructionTriviallyDead - Return true if the result produced by the
/// instruction is not used, and the instruction has no side effects.
///
-bool llvm::isInstructionTriviallyDead(Instruction *I) {
+bool llvm::isInstructionTriviallyDead(Instruction *I,
+ const TargetLibraryInfo *TLI) {
if (!I->use_empty() || isa<TerminatorInst>(I)) return false;
// We don't want the landingpad instruction removed by anything this general.
@@ -265,9 +267,9 @@
return isa<UndefValue>(II->getArgOperand(1));
}
- if (isAllocLikeFn(I)) return true;
+ if (isAllocLikeFn(I, TLI)) return true;
- if (CallInst *CI = isFreeCall(I))
+ if (CallInst *CI = isFreeCall(I, TLI))
if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0)))
return C->isNullValue() || isa<UndefValue>(C);
@@ -278,9 +280,11 @@
/// trivially dead instruction, delete it. If that makes any of its operands
/// trivially dead, delete them too, recursively. Return true if any
/// instructions were deleted.
-bool llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V) {
+bool
+llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V,
+ const TargetLibraryInfo *TLI) {
Instruction *I = dyn_cast<Instruction>(V);
- if (!I || !I->use_empty() || !isInstructionTriviallyDead(I))
+ if (!I || !I->use_empty() || !isInstructionTriviallyDead(I, TLI))
return false;
SmallVector<Instruction*, 16> DeadInsts;
@@ -301,7 +305,7 @@
// operand, and if it is 'trivially' dead, delete it in a future loop
// iteration.
if (Instruction *OpI = dyn_cast<Instruction>(OpV))
- if (isInstructionTriviallyDead(OpI))
+ if (isInstructionTriviallyDead(OpI, TLI))
DeadInsts.push_back(OpI);
}
@@ -334,19 +338,20 @@
/// either forms a cycle or is terminated by a trivially dead instruction,
/// delete it. If that makes any of its operands trivially dead, delete them
/// too, recursively. Return true if a change was made.
-bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN) {
+bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN,
+ const TargetLibraryInfo *TLI) {
SmallPtrSet<Instruction*, 4> Visited;
for (Instruction *I = PN; areAllUsesEqual(I) && !I->mayHaveSideEffects();
I = cast<Instruction>(*I->use_begin())) {
if (I->use_empty())
- return RecursivelyDeleteTriviallyDeadInstructions(I);
+ return RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
// If we find an instruction more than once, we're on a cycle that
// won't prove fruitful.
if (!Visited.insert(I)) {
// Break the cycle and delete the instruction and its operands.
I->replaceAllUsesWith(UndefValue::get(I->getType()));
- (void)RecursivelyDeleteTriviallyDeadInstructions(I);
+ (void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
return true;
}
}
@@ -358,7 +363,8 @@
///
/// This returns true if it changed the code, note that it can delete
/// instructions in other blocks as well in this block.
-bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD) {
+bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD,
+ const TargetLibraryInfo *TLI) {
bool MadeChange = false;
#ifndef NDEBUG
@@ -381,7 +387,7 @@
continue;
}
- MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
if (BIHandle != BI)
BI = BB->begin();
}
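The common thread in these Local.cpp changes: whether a call is dead hinges on library-call recognition (isAllocLikeFn, isFreeCall), and that recognition needs a TargetLibraryInfo, so each entry point now threads one through. A sketch of how a pass of this vintage would obtain and forward it (MyCleanup is illustrative, not from the patch):

    #include "llvm/Function.h"
    #include "llvm/Pass.h"
    #include "llvm/Target/TargetLibraryInfo.h"
    #include "llvm/Transforms/Utils/Local.h"
    using namespace llvm;

    struct MyCleanup : public FunctionPass {
      static char ID;
      MyCleanup() : FunctionPass(ID) {}
      virtual void getAnalysisUsage(AnalysisUsage &AU) const {
        AU.addRequired<TargetLibraryInfo>();   // an ImmutablePass in this era
      }
      virtual bool runOnFunction(Function &F) {
        const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
        bool Changed = false;
        // With TLI supplied, an unused call to a malloc-like library
        // function now counts as trivially dead and is swept as well.
        for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
          Changed |= SimplifyInstructionsInBlock(BB, /*TD=*/0, TLI);
        return Changed;
      }
    };
    char MyCleanup::ID = 0;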
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 518df7c..32d7fa1 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -22,6 +22,7 @@
#include "llvm/LLVMContext.h"
#include "llvm/MDBuilder.h"
#include "llvm/Metadata.h"
+#include "llvm/Module.h"
#include "llvm/Operator.h"
#include "llvm/Type.h"
#include "llvm/ADT/DenseMap.h"
@@ -54,6 +55,7 @@
cl::desc("Duplicate return instructions into unconditional branches"));
STATISTIC(NumSpeculations, "Number of speculative executed instructions");
+STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables");
namespace {
/// ValueEqualityComparisonCase - Represents a case of a switch.
@@ -101,14 +103,14 @@
///
static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) {
if (SI1 == SI2) return false; // Can't merge with self!
-
+
// It is not safe to merge these two switch instructions if they have a common
// successor, and if that successor has a PHI node, and if *that* PHI node has
// conflicting incoming values from the two switch blocks.
BasicBlock *SI1BB = SI1->getParent();
BasicBlock *SI2BB = SI2->getParent();
SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
-
+
for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I)
if (SI1Succs.count(*I))
for (BasicBlock::iterator BBI = (*I)->begin();
@@ -118,7 +120,7 @@
PN->getIncomingValueForBlock(SI2BB))
return false;
}
-
+
return true;
}
@@ -135,7 +137,7 @@
assert(SI1->isUnconditional() && SI2->isConditional());
// We fold the unconditional branch if we can easily update all PHI nodes in
- // common successors:
+ // common successors:
// 1> We have a constant incoming value for the conditional branch;
// 2> We have "Cond" as the incoming value for the unconditional branch;
// 3> SI2->getCondition() and Cond have same operands.
@@ -170,7 +172,7 @@
static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
BasicBlock *ExistPred) {
if (!isa<PHINode>(Succ->begin())) return; // Quick exit if nothing to do
-
+
PHINode *PN;
for (BasicBlock::iterator I = Succ->begin();
(PN = dyn_cast<PHINode>(I)); ++I)
@@ -222,7 +224,7 @@
// doesn't dominate BB.
if (Pred2->getSinglePredecessor() == 0)
return 0;
-
+
// If we found a conditional branch predecessor, make sure that it branches
// to BB and Pred2Br. If it doesn't, this isn't an "if statement".
if (Pred1Br->getSuccessor(0) == BB &&
@@ -252,7 +254,7 @@
// Otherwise, if this is a conditional branch, then we can use it!
BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator());
if (BI == 0) return 0;
-
+
assert(BI->isConditional() && "Two successors but not conditional?");
if (BI->getSuccessor(0) == Pred1) {
IfTrue = Pred1;
@@ -345,7 +347,7 @@
// If we aren't allowing aggressive promotion anymore, then don't consider
// instructions in the 'if region'.
if (AggressiveInsts == 0) return false;
-
+
// If we have seen this instruction before, don't count it again.
if (AggressiveInsts->count(I)) return true;
@@ -411,7 +413,7 @@
const TargetData *TD, bool isEQ, unsigned &UsedICmps) {
Instruction *I = dyn_cast<Instruction>(V);
if (I == 0) return 0;
-
+
// If this is an icmp against a constant, handle this as one of the cases.
if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) {
if (ConstantInt *C = GetConstantInt(I->getOperand(1), TD)) {
@@ -420,21 +422,21 @@
Vals.push_back(C);
return I->getOperand(0);
}
-
+
// If we have "x ult 3" comparison, for example, then we can add 0,1,2 to
// the set.
ConstantRange Span =
ConstantRange::makeICmpRegion(ICI->getPredicate(), C->getValue());
-
+
// If this is an and/!= check then we want to optimize "x ugt 2" into
// x != 0 && x != 1.
if (!isEQ)
Span = Span.inverse();
-
+
// If there are a ton of values, we don't want to make a ginormous switch.
if (Span.getSetSize().ugt(8) || Span.isEmptySet())
return 0;
-
+
for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
Vals.push_back(ConstantInt::get(V->getContext(), Tmp));
UsedICmps++;
@@ -442,11 +444,11 @@
}
return 0;
}
-
+
// Otherwise, we can only handle an | or &, depending on isEQ.
if (I->getOpcode() != (isEQ ? Instruction::Or : Instruction::And))
return 0;
-
+
unsigned NumValsBeforeLHS = Vals.size();
unsigned UsedICmpsBeforeLHS = UsedICmps;
if (Value *LHS = GatherConstantCompares(I->getOperand(0), Vals, Extra, TD,
@@ -467,12 +469,12 @@
Extra = I->getOperand(1);
return LHS;
}
-
+
Vals.resize(NumValsBeforeLHS);
UsedICmps = UsedICmpsBeforeLHS;
return 0;
}
-
+
// If the LHS can't be folded in, but Extra is available and RHS can, try to
// use LHS as Extra.
if (Extra == 0 || Extra == I->getOperand(0)) {
@@ -484,7 +486,7 @@
assert(Vals.size() == NumValsBeforeLHS);
Extra = OldExtra;
}
-
+
return 0;
}
@@ -615,6 +617,9 @@
assert(ThisVal && "This isn't a value comparison!!");
if (ThisVal != PredVal) return false; // Different predicates.
+ // TODO: Preserve branch weight metadata, similarly to how
+ // FoldValueComparisonIntoPredecessors preserves it.
+
// Find out information about when control will move from Pred to TI's block.
std::vector<ValueEqualityComparisonCase> PredCases;
BasicBlock *PredDef = GetValueEqualityComparisonCases(Pred->getTerminator(),
@@ -634,7 +639,7 @@
// can simplify TI.
if (!ValuesOverlap(PredCases, ThisCases))
return false;
-
+
if (isa<BranchInst>(TI)) {
// Okay, one of the successors of this condbr is dead. Convert it to a
// uncond br.
@@ -652,7 +657,7 @@
EraseTerminatorInstAndDCECond(TI);
return true;
}
-
+
SwitchInst *SI = cast<SwitchInst>(TI);
// Okay, TI has cases that are statically dead, prune them away.
SmallPtrSet<Constant*, 16> DeadCases;
@@ -673,7 +678,7 @@
DEBUG(dbgs() << "Leaving: " << *TI << "\n");
return true;
}
-
+
// Otherwise, TI's block must correspond to some matched value. Find out
// which value (or set of values) this is.
ConstantInt *TIV = 0;
@@ -729,8 +734,8 @@
}
static int ConstantIntSortPredicate(const void *P1, const void *P2) {
- const ConstantInt *LHS = *(const ConstantInt**)P1;
- const ConstantInt *RHS = *(const ConstantInt**)P2;
+ const ConstantInt *LHS = *(const ConstantInt*const*)P1;
+ const ConstantInt *RHS = *(const ConstantInt*const*)P2;
if (LHS->getValue().ult(RHS->getValue()))
return 1;
if (LHS->getValue() == RHS->getValue())
@@ -738,6 +743,67 @@
return -1;
}
+static inline bool HasBranchWeights(const Instruction* I) {
+ MDNode* ProfMD = I->getMetadata(LLVMContext::MD_prof);
+ if (ProfMD && ProfMD->getOperand(0))
+ if (MDString* MDS = dyn_cast<MDString>(ProfMD->getOperand(0)))
+ return MDS->getString().equals("branch_weights");
+
+ return false;
+}
+
+/// Tries to get a branch weight for the given instruction; returns null if it
+/// can't. Pos starts at 0.
+static ConstantInt* GetWeight(Instruction* I, int Pos) {
+ MDNode* ProfMD = I->getMetadata(LLVMContext::MD_prof);
+ if (ProfMD && ProfMD->getOperand(0)) {
+ if (MDString* MDS = dyn_cast<MDString>(ProfMD->getOperand(0))) {
+ if (MDS->getString().equals("branch_weights")) {
+ assert(ProfMD->getNumOperands() >= 3);
+ return dyn_cast<ConstantInt>(ProfMD->getOperand(1 + Pos));
+ }
+ }
+ }
+
+ return 0;
+}
+
+/// Scale the given weights based on the successor TI's metadata: every weight
+/// except the default is multiplied by the sum of the successor's weights.
+static void ScaleWeights(Instruction* STI, MutableArrayRef<uint64_t> Weights) {
+ // Sum the successor's weights
+ assert(HasBranchWeights(STI));
+ unsigned Scale = 0;
+ MDNode* ProfMD = STI->getMetadata(LLVMContext::MD_prof);
+ for (unsigned i = 1; i < ProfMD->getNumOperands(); ++i) {
+ ConstantInt* CI = dyn_cast<ConstantInt>(ProfMD->getOperand(i));
+ assert(CI);
+ Scale += CI->getValue().getZExtValue();
+ }
+
+ // Skip default, as it's replaced during the folding
+ for (unsigned i = 1; i < Weights.size(); ++i) {
+ Weights[i] *= Scale;
+ }
+}
+
+/// Checks whether any of the weights are too big for a uint32_t and, if so,
+/// halves all the weights.
+static void FitWeights(MutableArrayRef<uint64_t> Weights) {
+ bool Halve = false;
+ for (unsigned i = 0; i < Weights.size(); ++i)
+ if (Weights[i] > UINT_MAX) {
+ Halve = true;
+ break;
+ }
+
+ if (!Halve)
+ return;
+
+ for (unsigned i = 0; i < Weights.size(); ++i)
+ Weights[i] /= 2;
+}
+
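A worked example of the bookkeeping these helpers support, with made-up weights. Say the predecessor switch has {default: 8, caseA: 24} and the folded block's terminator has {default: 2, caseB: 6}, so Scale = 2 + 6 = 8:

    #include <stdint.h>

    // ScaleWeights multiplies every kept predecessor weight except the
    // default by Scale; the merged default becomes PredDefault * SuccDefault,
    // and each case pulled up from the successor gets PredDefault times its
    // own weight.
    uint64_t Merged[] = {
      8 * 2,   // new default: 16
      24 * 8,  // caseA:      192
      8 * 6,   // caseB:       48
    };
    // Ratios match the path probabilities, 24 : 2 : 6, scaled by 8.
    // FitWeights then halves everything if any entry exceeds UINT_MAX.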
/// FoldValueComparisonIntoPredecessors - The specified terminator is a value
/// equality comparison instruction (either a switch or a branch on "X == c").
/// See if any of the predecessors of the terminator block are value comparisons
@@ -770,6 +836,55 @@
// build.
SmallVector<BasicBlock*, 8> NewSuccessors;
+ // Update the branch weight metadata along the way
+ SmallVector<uint64_t, 8> Weights;
+ uint64_t PredDefaultWeight = 0;
+ bool PredHasWeights = HasBranchWeights(PTI);
+ bool SuccHasWeights = HasBranchWeights(TI);
+
+ if (PredHasWeights) {
+ MDNode* MD = PTI->getMetadata(LLVMContext::MD_prof);
+ assert(MD);
+ for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) {
+ ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(i));
+ assert(CI);
+ Weights.push_back(CI->getValue().getZExtValue());
+ }
+
+ // If the predecessor is a conditional eq, then swap the default weight
+ // to be the first entry.
+ if (BranchInst* BI = dyn_cast<BranchInst>(PTI)) {
+ assert(Weights.size() == 2);
+ ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
+
+ if (ICI->getPredicate() == ICmpInst::ICMP_EQ) {
+ std::swap(Weights.front(), Weights.back());
+ }
+ }
+
+ PredDefaultWeight = Weights.front();
+ } else if (SuccHasWeights) {
+ // If there are no predecessor weights but there are successor weights,
+ // populate Weights with 1, which will later be scaled by the sum of the
+ // successor's weights.
+ Weights.assign(1 + PredCases.size(), 1);
+ PredDefaultWeight = 1;
+ }
+
+ uint64_t SuccDefaultWeight = 0;
+ if (SuccHasWeights) {
+ int Index = 0;
+ if (BranchInst* BI = dyn_cast<BranchInst>(TI)) {
+ ICmpInst* ICI = dyn_cast<ICmpInst>(BI->getCondition());
+ assert(ICI);
+
+ if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+ Index = 1;
+ }
+
+ SuccDefaultWeight = GetWeight(TI, Index)->getValue().getZExtValue();
+ }
+
if (PredDefault == BB) {
// If this is the default destination from PTI, only the edges in TI
// that don't occur in PTI, or that branch to BB will be activated.
@@ -780,6 +895,12 @@
else {
// The default destination is BB, we don't need explicit targets.
std::swap(PredCases[i], PredCases.back());
+
+ if (PredHasWeights) {
+ std::swap(Weights[i+1], Weights.back());
+ Weights.pop_back();
+ }
+
PredCases.pop_back();
--i; --e;
}
@@ -790,14 +911,34 @@
PredDefault = BBDefault;
NewSuccessors.push_back(BBDefault);
}
+
+ if (SuccHasWeights) {
+ ScaleWeights(TI, Weights);
+ Weights.front() *= SuccDefaultWeight;
+ } else if (PredHasWeights) {
+ Weights.front() /= (1 + BBCases.size());
+ }
+
for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
if (!PTIHandled.count(BBCases[i].Value) &&
BBCases[i].Dest != BBDefault) {
PredCases.push_back(BBCases[i]);
NewSuccessors.push_back(BBCases[i].Dest);
+ if (SuccHasWeights) {
+ Weights.push_back(PredDefaultWeight *
+ GetWeight(TI, i)->getValue().getZExtValue());
+ } else if (PredHasWeights) {
+ // Split the old default's weight amongst the children
+ Weights.push_back(PredDefaultWeight / (1 + BBCases.size()));
+ }
}
} else {
+ // FIXME: preserve branch weight metadata, similarly to the 'then'
+ // above. For now, drop it.
+ PredHasWeights = false;
+ SuccHasWeights = false;
+
// If this is not the default destination from PTI, only the edges
// in TI that occur in PTI with a destination of BB will be
// activated.
@@ -822,7 +963,7 @@
// If there are any constants vectored to BB that TI doesn't handle,
// they must go to the default destination of TI.
- for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I =
+ for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I =
PTIHandled.begin(),
E = PTIHandled.end(); I != E; ++I) {
PredCases.push_back(ValueEqualityComparisonCase(*I, BBDefault));
@@ -851,6 +992,17 @@
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
NewSI->addCase(PredCases[i].Value, PredCases[i].Dest);
+ if (PredHasWeights || SuccHasWeights) {
+ // Halve the weights if any of them cannot fit in an uint32_t
+ FitWeights(Weights);
+
+ SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
+
+ NewSI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BB->getContext()).
+ createBranchWeights(MDWeights));
+ }
+
EraseTerminatorInstAndDCECond(PTI);
// Okay, last check. If BB is still a successor of PSI, then we must
@@ -984,11 +1136,11 @@
Value *BB1V = PN->getIncomingValueForBlock(BB1);
Value *BB2V = PN->getIncomingValueForBlock(BB2);
if (BB1V == BB2V) continue;
-
+
// These values do not agree. Insert a select instruction before NT
// that determines the right value.
SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
- if (SI == 0)
+ if (SI == 0)
SI = cast<SelectInst>
(Builder.CreateSelect(BI->getCondition(), BB1V, BB2V,
BB1V->getName()+"."+BB2V->getName()));
@@ -1056,7 +1208,7 @@
// Do not hoist the instruction if any of its operands are defined but not
// used in this BB. The transformation will prevent the operand from
// being sunk into the use block.
- for (User::op_iterator i = HInst->op_begin(), e = HInst->op_end();
+ for (User::op_iterator i = HInst->op_begin(), e = HInst->op_end();
i != e; ++i) {
Instruction *OpI = dyn_cast<Instruction>(*i);
if (OpI && OpI->getParent() == BIParent &&
@@ -1112,7 +1264,7 @@
// as well.
if (PHIs.empty())
return false;
-
+
// If we get here, we can hoist the instruction and if-convert.
DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *BB1 << "\n";);
@@ -1162,13 +1314,13 @@
static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
BranchInst *BI = cast<BranchInst>(BB->getTerminator());
unsigned Size = 0;
-
+
for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
if (isa<DbgInfoIntrinsic>(BBI))
continue;
if (Size > 10) return false; // Don't clone large BB's.
++Size;
-
+
// We can only support instructions that do not define values that are
// live outside of the current basic block.
for (Value::use_iterator UI = BBI->use_begin(), E = BBI->use_end();
@@ -1176,7 +1328,7 @@
Instruction *U = cast<Instruction>(*UI);
if (U->getParent() != BB || isa<PHINode>(U)) return false;
}
-
+
// Looks ok, continue checking.
}
@@ -1194,31 +1346,31 @@
// outside of the block.
if (!PN || PN->getParent() != BB || !PN->hasOneUse())
return false;
-
+
// Degenerate case of a single entry PHI.
if (PN->getNumIncomingValues() == 1) {
FoldSingleEntryPHINodes(PN->getParent());
- return true;
+ return true;
}
// Now we know that this block has multiple preds and two succs.
if (!BlockIsSimpleEnoughToThreadThrough(BB)) return false;
-
+
// Okay, this is a simple enough basic block. See if any phi values are
// constants.
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
ConstantInt *CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i));
if (CB == 0 || !CB->getType()->isIntegerTy(1)) continue;
-
+
// Okay, we now know that all edges from PredBB should be revectored to
// branch to RealDest.
BasicBlock *PredBB = PN->getIncomingBlock(i);
BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue());
-
+
if (RealDest == BB) continue; // Skip self loops.
// Skip if the predecessor's terminator is an indirect branch.
if (isa<IndirectBrInst>(PredBB->getTerminator())) continue;
-
+
// The dest block might have PHI nodes, other predecessors and other
// difficult cases. Instead of being smart about this, just insert a new
// block that jumps to the destination block, effectively splitting
@@ -1227,7 +1379,7 @@
RealDest->getName()+".critedge",
RealDest->getParent(), RealDest);
BranchInst::Create(RealDest, EdgeBB);
-
+
// Update PHI nodes.
AddPredecessorToBlock(RealDest, EdgeBB, BB);
@@ -1244,7 +1396,7 @@
// Clone the instruction.
Instruction *N = BBI->clone();
if (BBI->hasName()) N->setName(BBI->getName()+".c");
-
+
// Update operands due to translation.
for (User::op_iterator i = N->op_begin(), e = N->op_end();
i != e; ++i) {
@@ -1252,7 +1404,7 @@
if (PI != TranslateMap.end())
*i = PI->second;
}
-
+
// Check for trivial simplification.
if (Value *V = SimplifyInstruction(N, TD)) {
TranslateMap[BBI] = V;
@@ -1297,7 +1449,7 @@
// Don't bother if the branch will be constant folded trivially.
isa<ConstantInt>(IfCond))
return false;
-
+
// Okay, we found that we can merge this two-entry phi node into a select.
// Doing so would require us to fold *all* two entry phi nodes in this block.
// At some point this becomes non-profitable (particularly if the target
@@ -1307,14 +1459,14 @@
for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++NumPhis, ++I)
if (NumPhis > 2)
return false;
-
+
// Loop over the PHI's seeing if we can promote them all to select
// instructions. While we are at it, keep track of the instructions
// that need to be moved to the dominating block.
SmallPtrSet<Instruction*, 4> AggressiveInsts;
unsigned MaxCostVal0 = PHINodeFoldingThreshold,
MaxCostVal1 = PHINodeFoldingThreshold;
-
+
for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
PHINode *PN = cast<PHINode>(II++);
if (Value *V = SimplifyInstruction(PN, TD)) {
@@ -1322,19 +1474,19 @@
PN->eraseFromParent();
continue;
}
-
+
if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts,
MaxCostVal0) ||
!DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts,
MaxCostVal1))
return false;
}
-
+
// If we folded the first phi, PN dangles at this point. Refresh it. If
// we ran out of PHIs then we simplified them all.
PN = dyn_cast<PHINode>(BB->begin());
if (PN == 0) return true;
-
+
// Don't fold i1 branches on PHIs which contain binary operators. These can
// often be turned into switches and other things.
if (PN->getType()->isIntegerTy(1) &&
@@ -1342,7 +1494,7 @@
isa<BinaryOperator>(PN->getIncomingValue(1)) ||
isa<BinaryOperator>(IfCond)))
return false;
-
+
  // If all PHI nodes are promotable, check to make sure that all
// instructions in the predecessor blocks can be promoted as well. If
// not, we won't be able to get rid of the control flow, so it's not
@@ -1362,7 +1514,7 @@
return false;
}
}
-
+
if (cast<BranchInst>(IfBlock2->getTerminator())->isConditional()) {
IfBlock2 = 0;
} else {
@@ -1375,15 +1527,15 @@
return false;
}
}
-
+
DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond << " T: "
<< IfTrue->getName() << " F: " << IfFalse->getName() << "\n");
-
+
// If we can still promote the PHI nodes after this gauntlet of tests,
// do all of the PHI's now.
Instruction *InsertPt = DomBlock->getTerminator();
IRBuilder<true, NoFolder> Builder(InsertPt);
-
+
// Move all 'aggressive' instructions, which are defined in the
// conditional parts of the if's up to the dominating block.
if (IfBlock1)
@@ -1394,19 +1546,19 @@
DomBlock->getInstList().splice(InsertPt,
IfBlock2->getInstList(), IfBlock2->begin(),
IfBlock2->getTerminator());
-
+
while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
// Change the PHI node into a select instruction.
Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);
-
- SelectInst *NV =
+
+ SelectInst *NV =
cast<SelectInst>(Builder.CreateSelect(IfCond, TrueVal, FalseVal, ""));
PN->replaceAllUsesWith(NV);
NV->takeName(PN);
PN->eraseFromParent();
}
-
+
// At this point, IfBlock1 and IfBlock2 are both empty, so our if statement
// has been flattened. Change DomBlock to jump directly to our new block to
// avoid other simplifycfg's kicking in on the diamond.
@@ -1420,14 +1572,14 @@
/// SimplifyCondBranchToTwoReturns - If we found a conditional branch that goes
/// to two returning blocks, try to merge them together into one return,
/// introducing a select if the return values disagree.
-static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
+static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
IRBuilder<> &Builder) {
assert(BI->isConditional() && "Must be a conditional branch");
BasicBlock *TrueSucc = BI->getSuccessor(0);
BasicBlock *FalseSucc = BI->getSuccessor(1);
ReturnInst *TrueRet = cast<ReturnInst>(TrueSucc->getTerminator());
ReturnInst *FalseRet = cast<ReturnInst>(FalseSucc->getTerminator());
-
+
// Check to ensure both blocks are empty (just a return) or optionally empty
// with PHI nodes. If there are other instructions, merging would cause extra
// computation on one path or the other.
@@ -1447,12 +1599,12 @@
EraseTerminatorInstAndDCECond(BI);
return true;
}
-
+
// Otherwise, figure out what the true and false return values are
// so we can insert a new select instruction.
Value *TrueValue = TrueRet->getReturnValue();
Value *FalseValue = FalseRet->getReturnValue();
-
+
// Unwrap any PHI nodes in the return blocks.
if (PHINode *TVPN = dyn_cast_or_null<PHINode>(TrueValue))
if (TVPN->getParent() == TrueSucc)
@@ -1460,7 +1612,7 @@
if (PHINode *FVPN = dyn_cast_or_null<PHINode>(FalseValue))
if (FVPN->getParent() == FalseSucc)
FalseValue = FVPN->getIncomingValueForBlock(BI->getParent());
-
+
// In order for this transformation to be safe, we must be able to
// unconditionally execute both operands to the return. This is
// normally the case, but we could have a potentially-trapping
@@ -1472,12 +1624,12 @@
if (ConstantExpr *FCV = dyn_cast_or_null<ConstantExpr>(FalseValue))
if (FCV->canTrap())
return false;
-
+
// Okay, we collected all the mapped values and checked them for sanity, and
// defined to really do this transformation. First, update the CFG.
TrueSucc->removePredecessor(BI->getParent());
FalseSucc->removePredecessor(BI->getParent());
-
+
// Insert select instructions where needed.
Value *BrCond = BI->getCondition();
if (TrueValue) {
@@ -1491,15 +1643,15 @@
}
}
- Value *RI = !TrueValue ?
+ Value *RI = !TrueValue ?
Builder.CreateRetVoid() : Builder.CreateRet(TrueValue);
(void) RI;
-
+
DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
<< "\n " << *BI << "NewRet = " << *RI
<< "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: "<< *FalseSucc);
-
+
EraseTerminatorInstAndDCECond(BI);
return true;
@@ -1600,7 +1752,7 @@
if (Cond == 0)
return false;
}
-
+
if (Cond == 0 || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
Cond->getParent() != BB || !Cond->hasOneUse())
return false;
@@ -1623,7 +1775,7 @@
isSafeToSpeculativelyExecute(FrontIt)) {
BonusInst = &*FrontIt;
++FrontIt;
-
+
// Ignore dbg intrinsics.
while (isa<DbgInfoIntrinsic>(FrontIt)) ++FrontIt;
}
@@ -1631,13 +1783,13 @@
// Only a single bonus inst is allowed.
if (&*FrontIt != Cond)
return false;
-
+
// Make sure the instruction after the condition is the cond branch.
BasicBlock::iterator CondIt = Cond; ++CondIt;
  // Ignore dbg intrinsics.
while (isa<DbgInfoIntrinsic>(CondIt)) ++CondIt;
-
+
if (&*CondIt != BI)
return false;
@@ -1649,7 +1801,7 @@
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(1)))
if (CE->canTrap())
return false;
-
+
// Finally, don't infinitely unroll conditional loops.
BasicBlock *TrueDest = BI->getSuccessor(0);
BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : 0;
@@ -1659,22 +1811,22 @@
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
BasicBlock *PredBlock = *PI;
BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator());
-
+
// Check that we have two conditional branches. If there is a PHI node in
// the common successor, verify that the same value flows in from both
// blocks.
SmallVector<PHINode*, 4> PHIs;
if (PBI == 0 || PBI->isUnconditional() ||
- (BI->isConditional() &&
+ (BI->isConditional() &&
!SafeToMergeTerminators(BI, PBI)) ||
(!BI->isConditional() &&
!isProfitableToFoldUnconditional(BI, PBI, Cond, PHIs)))
continue;
-
+
// Determine if the two branches share a common destination.
Instruction::BinaryOps Opc;
bool InvertPredCond = false;
-
+
if (BI->isConditional()) {
if (PBI->getSuccessor(0) == TrueDest)
Opc = Instruction::Or;
@@ -1693,7 +1845,7 @@
// Ensure that any values used in the bonus instruction are also used
// by the terminator of the predecessor. This means that those values
- // must already have been resolved, so we won't be inhibiting the
+ // must already have been resolved, so we won't be inhibiting the
// out-of-order core by speculating them earlier.
if (BonusInst) {
// Collect the values used by the bonus inst
@@ -1707,47 +1859,47 @@
SmallVector<std::pair<Value*, unsigned>, 4> Worklist;
Worklist.push_back(std::make_pair(PBI->getOperand(0), 0));
-
+
// Walk up to four levels back up the use-def chain of the predecessor's
// terminator to see if all those values were used. The choice of four
// levels is arbitrary, to provide a compile-time-cost bound.
while (!Worklist.empty()) {
std::pair<Value*, unsigned> Pair = Worklist.back();
Worklist.pop_back();
-
+
if (Pair.second >= 4) continue;
UsedValues.erase(Pair.first);
if (UsedValues.empty()) break;
-
+
if (Instruction *I = dyn_cast<Instruction>(Pair.first)) {
for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end();
OI != OE; ++OI)
Worklist.push_back(std::make_pair(OI->get(), Pair.second+1));
- }
+ }
}
-
+
if (!UsedValues.empty()) return false;
}
DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB);
- IRBuilder<> Builder(PBI);
+ IRBuilder<> Builder(PBI);
// If we need to invert the condition in the pred block to match, do so now.
if (InvertPredCond) {
Value *NewCond = PBI->getCondition();
-
+
if (NewCond->hasOneUse() && isa<CmpInst>(NewCond)) {
CmpInst *CI = cast<CmpInst>(NewCond);
CI->setPredicate(CI->getInversePredicate());
} else {
- NewCond = Builder.CreateNot(NewCond,
+ NewCond = Builder.CreateNot(NewCond,
PBI->getCondition()->getName()+".not");
}
-
+
PBI->setCondition(NewCond);
PBI->swapSuccessors();
}
-
+
// If we have a bonus inst, clone it into the predecessor block.
Instruction *NewBonus = 0;
if (BonusInst) {
@@ -1756,7 +1908,7 @@
NewBonus->takeName(BonusInst);
BonusInst->setName(BonusInst->getName()+".old");
}
-
+
// Clone Cond into the predecessor basic block, and or/and the
// two conditions together.
Instruction *New = Cond->clone();
@@ -1764,9 +1916,9 @@
PredBlock->getInstList().insert(PBI, New);
New->takeName(Cond);
Cond->setName(New->getName()+".old");
-
+
if (BI->isConditional()) {
- Instruction *NewCond =
+ Instruction *NewCond =
cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(),
New, "or.cond"));
PBI->setCondition(NewCond);
@@ -1806,7 +1958,7 @@
// Create (PBI_Cond and BI_Value) or (!PBI_Cond and PBI_C)
// PBI_C is true: (PBI_Cond and BI_Value) or (!PBI_Cond)
// is false: PBI_Cond and BI_Value
- MergedCond =
+ MergedCond =
cast<Instruction>(Builder.CreateBinOp(Instruction::And,
PBI->getCondition(), New,
"and.cond"));
@@ -1814,7 +1966,7 @@
Instruction *NotCond =
cast<Instruction>(Builder.CreateNot(PBI->getCondition(),
"not.cond"));
- MergedCond =
+ MergedCond =
cast<Instruction>(Builder.CreateBinOp(Instruction::Or,
NotCond, MergedCond,
"or.cond"));
@@ -1921,7 +2073,7 @@
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
if (isa<DbgInfoIntrinsic>(*I))
I->clone()->insertBefore(PBI);
-
+
return true;
}
return false;
@@ -1936,7 +2088,7 @@
BasicBlock *BB = BI->getParent();
// If this block ends with a branch instruction, and if there is a
- // predecessor that ends on a branch of the same condition, make
+ // predecessor that ends on a branch of the same condition, make
// this conditional branch redundant.
if (PBI->getCondition() == BI->getCondition() &&
PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
@@ -1945,11 +2097,11 @@
if (BB->getSinglePredecessor()) {
// Turn this into a branch on constant.
bool CondIsTrue = PBI->getSuccessor(0) == BB;
- BI->setCondition(ConstantInt::get(Type::getInt1Ty(BB->getContext()),
+ BI->setCondition(ConstantInt::get(Type::getInt1Ty(BB->getContext()),
CondIsTrue));
return true; // Nuke the branch on constant.
}
-
+
// Otherwise, if there are multiple predecessors, insert a PHI that merges
// in the constant and simplify the block result. Subsequent passes of
// simplifycfg will thread the block.
@@ -1969,18 +2121,18 @@
PBI->getCondition() == BI->getCondition() &&
PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
bool CondIsTrue = PBI->getSuccessor(0) == BB;
- NewPN->addIncoming(ConstantInt::get(Type::getInt1Ty(BB->getContext()),
+ NewPN->addIncoming(ConstantInt::get(Type::getInt1Ty(BB->getContext()),
CondIsTrue), P);
} else {
NewPN->addIncoming(BI->getCondition(), P);
}
}
-
+
BI->setCondition(NewPN);
return true;
}
}
-
+
// If this is a conditional branch in an empty block, and if any
// predecessors is a conditional branch to one of our destinations,
// fold the conditions into logical ops and one cond br.
@@ -1991,11 +2143,11 @@
if (&*BBI != BI)
return false;
-
+
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BI->getCondition()))
if (CE->canTrap())
return false;
-
+
int PBIOp, BIOp;
if (PBI->getSuccessor(0) == BI->getSuccessor(0))
PBIOp = BIOp = 0;
@@ -2007,31 +2159,31 @@
PBIOp = BIOp = 1;
else
return false;
-
+
// Check to make sure that the other destination of this branch
// isn't BB itself. If so, this is an infinite loop that will
// keep getting unwound.
if (PBI->getSuccessor(PBIOp) == BB)
return false;
-
- // Do not perform this transformation if it would require
+
+ // Do not perform this transformation if it would require
// insertion of a large number of select instructions. For targets
// without predication/cmovs, this is a big pessimization.
BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
-
+
unsigned NumPhis = 0;
for (BasicBlock::iterator II = CommonDest->begin();
isa<PHINode>(II); ++II, ++NumPhis)
if (NumPhis > 2) // Disable this xform.
return false;
-
+
// Finally, if everything is ok, fold the branches to logical ops.
BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1);
-
+
DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
<< "AND: " << *BI->getParent());
-
-
+
+
// If OtherDest *is* BB, then BB is a basic block with a single conditional
// branch in it, where one edge (OtherDest) goes back to itself but the other
// exits. We don't *know* that the program avoids the infinite loop
@@ -2046,13 +2198,13 @@
"infloop", BB->getParent());
BranchInst::Create(InfLoopBlock, InfLoopBlock);
OtherDest = InfLoopBlock;
- }
-
+ }
+
DEBUG(dbgs() << *PBI->getParent()->getParent());
// BI may have other predecessors. Because of this, we leave
// it alone, but modify PBI.
-
+
// Make sure we get to CommonDest on True&True directions.
Value *PBICond = PBI->getCondition();
IRBuilder<true, NoFolder> Builder(PBI);
@@ -2065,16 +2217,16 @@
// Merge the conditions.
Value *Cond = Builder.CreateOr(PBICond, BICond, "brmerge");
-
+
// Modify PBI to branch on the new condition to the new dests.
PBI->setCondition(Cond);
PBI->setSuccessor(0, CommonDest);
PBI->setSuccessor(1, OtherDest);
-
+
// OtherDest may have phi nodes. If so, add an entry from PBI's
// block that are identical to the entries for BI's block.
AddPredecessorToBlock(OtherDest, PBI->getParent(), BB);
-
+
// We know that the CommonDest already had an edge from PBI to
// it. If it has PHIs though, the PHIs may have different
// entries for BB and PBI's BB. If so, insert a select to make
@@ -2092,10 +2244,10 @@
PN->setIncomingValue(PBBIdx, NV);
}
}
-
+
DEBUG(dbgs() << "INTO: " << *PBI->getParent());
DEBUG(dbgs() << *PBI->getParent()->getParent());
-
+
// This basic block is probably dead. We know it has at least
// one fewer predecessor.
return true;
@@ -2214,7 +2366,7 @@
/// br label %end
/// end:
/// ... = phi i1 [ true, %entry ], [ %tmp, %DEFAULT ], [ true, %entry ]
-///
+///
/// We prefer to split the edge to 'end' so that there is a true/false entry to
/// the PHI, merging the third icmp into the switch.
static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
@@ -2228,17 +2380,17 @@
Value *V = ICI->getOperand(0);
ConstantInt *Cst = cast<ConstantInt>(ICI->getOperand(1));
-
+
// The pattern we're looking for is where our only predecessor is a switch on
// 'V' and this block is the default case for the switch. In this case we can
// fold the compared value into the switch to simplify things.
BasicBlock *Pred = BB->getSinglePredecessor();
if (Pred == 0 || !isa<SwitchInst>(Pred->getTerminator())) return false;
-
+
SwitchInst *SI = cast<SwitchInst>(Pred->getTerminator());
if (SI->getCondition() != V)
return false;
-
+
// If BB is reachable on a non-default case, then we simply know the value of
// V in this block. Substitute it and constant fold the icmp instruction
// away.
@@ -2246,7 +2398,7 @@
ConstantInt *VVal = SI->findCaseDest(BB);
assert(VVal && "Should have a unique destination value");
ICI->setOperand(0, VVal);
-
+
if (Value *V = SimplifyInstruction(ICI, TD)) {
ICI->replaceAllUsesWith(V);
ICI->eraseFromParent();
@@ -2254,7 +2406,7 @@
// BB is now empty, so it is likely to simplify away.
return SimplifyCFG(BB) | true;
}
-
+
// Ok, the block is reachable from the default dest. If the constant we're
// comparing exists in one of the other edges, then we can constant fold ICI
// and zap it.
@@ -2264,13 +2416,13 @@
V = ConstantInt::getFalse(BB->getContext());
else
V = ConstantInt::getTrue(BB->getContext());
-
+
ICI->replaceAllUsesWith(V);
ICI->eraseFromParent();
// BB is now empty, so it is likely to simplify away.
return SimplifyCFG(BB) | true;
}
-
+
// The use of the icmp has to be in the 'end' block, by the only PHI node in
// the block.
BasicBlock *SuccBlock = BB->getTerminator()->getSuccessor(0);
@@ -2297,7 +2449,7 @@
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "switch.edge",
BB->getParent(), BB);
SI->addCase(Cst, NewBB);
-
+
// NewBB branches to the phi block, add the uncond branch and the phi entry.
Builder.SetInsertPoint(NewBB);
Builder.SetCurrentDebugLocation(SI->getDebugLoc());
@@ -2313,8 +2465,8 @@
IRBuilder<> &Builder) {
Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
if (Cond == 0) return false;
-
-
+
+
// Change br (X == 0 | X == 1), T, F into a switch instruction.
// If this is a bunch of seteq's or'd together, or if it's a bunch of
// 'setne's and'ed together, collect them.
@@ -2323,7 +2475,7 @@
bool TrueWhenEqual = true;
Value *ExtraCase = 0;
unsigned UsedICmps = 0;
-
+
if (Cond->getOpcode() == Instruction::Or) {
CompVal = GatherConstantCompares(Cond, Values, ExtraCase, TD, true,
UsedICmps);
@@ -2332,7 +2484,7 @@
UsedICmps);
TrueWhenEqual = false;
}
-
+
// If we didn't have a multiply compared value, fail.
if (CompVal == 0) return false;
@@ -2344,21 +2496,24 @@
// instruction can't handle, remove them now.
array_pod_sort(Values.begin(), Values.end(), ConstantIntSortPredicate);
Values.erase(std::unique(Values.begin(), Values.end()), Values.end());
-
+
  // If ExtraCase was used, we require at least two switch values to do the
  // transformation. A switch with one value is just a conditional branch.
if (ExtraCase && Values.size() < 2) return false;
-
+
+ // TODO: Preserve branch weight metadata, similarly to how
+ // FoldValueComparisonIntoPredecessors preserves it.
+
// Figure out which block is which destination.
BasicBlock *DefaultBB = BI->getSuccessor(1);
BasicBlock *EdgeBB = BI->getSuccessor(0);
if (!TrueWhenEqual) std::swap(DefaultBB, EdgeBB);
-
+
BasicBlock *BB = BI->getParent();
-
+
DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size()
<< " cases into SWITCH. BB is:\n" << *BB);
-
+
// If there are any extra values that couldn't be folded into the switch
// then we evaluate them with an explicit branch first. Split the block
// right before the condbr to handle it.
@@ -2372,13 +2527,13 @@
Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB);
else
Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
-
+
OldTI->eraseFromParent();
-
+
// If there are PHI nodes in EdgeBB, then we need to add a new entry to them
// for the edge we just added.
AddPredecessorToBlock(EdgeBB, BB, NewBB);
-
+
DEBUG(dbgs() << " ** 'icmp' chain unhandled condition: " << *ExtraCase
<< "\nEXTRABB = " << *BB);
BB = NewBB;
@@ -2392,14 +2547,14 @@
TD->getIntPtrType(CompVal->getContext()),
"magicptr");
}
-
+
// Create the new switch instruction now.
SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
// Add all of the 'cases' to the switch instruction.
for (unsigned i = 0, e = Values.size(); i != e; ++i)
New->addCase(Values[i], EdgeBB);
-
+
// We added edges from PI to the EdgeBB. As such, if there were any
// PHI nodes in EdgeBB, they need entries to be added corresponding to
// the number of edges added.
@@ -2410,10 +2565,10 @@
for (unsigned i = 0, e = Values.size()-1; i != e; ++i)
PN->addIncoming(InVal, BB);
}
-
+
// Erase the old branch instruction.
EraseTerminatorInstAndDCECond(BI);
-
+
DEBUG(dbgs() << " ** 'icmp' chain result is:\n" << *BB << '\n');
return true;
}
@@ -2467,7 +2622,7 @@
bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
BasicBlock *BB = RI->getParent();
if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false;
-
+
// Find predecessors that end with branches.
SmallVector<BasicBlock*, 8> UncondBranchPreds;
SmallVector<BranchInst*, 8> CondBranchPreds;
@@ -2481,7 +2636,7 @@
CondBranchPreds.push_back(BI);
}
}
-
+
// If we found some, do the transformation!
if (!UncondBranchPreds.empty() && DupRet) {
while (!UncondBranchPreds.empty()) {
@@ -2490,21 +2645,21 @@
<< "INTO UNCOND BRANCH PRED: " << *Pred);
(void)FoldReturnIntoUncondBranch(RI, BB, Pred);
}
-
+
// If we eliminated all predecessors of the block, delete the block now.
if (pred_begin(BB) == pred_end(BB))
// We know there are no successors, so just nuke the block.
BB->eraseFromParent();
-
+
return true;
}
-
+
// Check out all of the conditional branches going to this return
// instruction. If any of them just select between returns, change the
// branch itself into a select/return pair.
while (!CondBranchPreds.empty()) {
BranchInst *BI = CondBranchPreds.pop_back_val();
-
+
// Check to see if the non-BB successor is also a return block.
if (isa<ReturnInst>(BI->getSuccessor(0)->getTerminator()) &&
isa<ReturnInst>(BI->getSuccessor(1)->getTerminator()) &&
@@ -2516,9 +2671,9 @@
bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
BasicBlock *BB = UI->getParent();
-
+
bool Changed = false;
-
+
// If there are any instructions immediately before the unreachable that can
// be removed, do so.
while (UI != BB->begin()) {
@@ -2558,11 +2713,11 @@
BBI->eraseFromParent();
Changed = true;
}
-
+
// If the unreachable instruction is the first in the block, take a gander
// at all of the predecessors of this instruction, and simplify them.
if (&BB->front() != UI) return Changed;
-
+
SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB));
for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
TerminatorInst *TI = Preds[i]->getTerminator();
@@ -2615,7 +2770,7 @@
BasicBlock *MaxBlock = 0;
for (std::map<BasicBlock*, std::pair<unsigned, unsigned> >::iterator
I = Popularity.begin(), E = Popularity.end(); I != E; ++I) {
- if (I->second.first > MaxPop ||
+ if (I->second.first > MaxPop ||
(I->second.first == MaxPop && MaxIndex > I->second.second)) {
MaxPop = I->second.first;
MaxIndex = I->second.second;
@@ -2627,13 +2782,13 @@
// edges to it.
SI->setDefaultDest(MaxBlock);
Changed = true;
-
+
// If MaxBlock has phinodes in it, remove MaxPop-1 entries from
// it.
if (isa<PHINode>(MaxBlock->begin()))
for (unsigned i = 0; i != MaxPop-1; ++i)
MaxBlock->removePredecessor(SI->getParent());
-
+
for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
i != e; ++i)
if (i.getCaseSuccessor() == MaxBlock) {
@@ -2648,7 +2803,7 @@
// place to note that the call does not throw though.
BranchInst *BI = Builder.CreateBr(II->getNormalDest());
II->removeFromParent(); // Take out of symbol table
-
+
// Insert the call now...
SmallVector<Value*, 8> Args(II->op_begin(), II->op_end()-3);
Builder.SetInsertPoint(BI);
@@ -2663,7 +2818,7 @@
}
}
}
-
+
// If this block is now dead, remove it.
if (pred_begin(BB) == pred_end(BB) &&
BB != &BB->getParent()->getEntryBlock()) {
@@ -2823,6 +2978,285 @@
return Changed;
}
+/// ValidLookupTableConstant - Return true if the backend will be able to handle
+/// initializing an array of constants like C.
+static bool ValidLookupTableConstant(Constant *C) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+ return CE->isGEPWithNoNotionalOverIndexing();
+
+ return isa<ConstantFP>(C) ||
+ isa<ConstantInt>(C) ||
+ isa<ConstantPointerNull>(C) ||
+ isa<GlobalValue>(C) ||
+ isa<UndefValue>(C);
+}
+
+/// GetCaseResults - Try to determine the resulting constant values in phi
+/// nodes at the common destination basic block for one of the case
+/// destinations of a switch instruction.
+static bool GetCaseResults(SwitchInst *SI,
+ BasicBlock *CaseDest,
+ BasicBlock **CommonDest,
+ SmallVector<std::pair<PHINode*,Constant*>, 4> &Res) {
+ // The block from which we enter the common destination.
+ BasicBlock *Pred = SI->getParent();
+
+ // If CaseDest is empty, continue to its successor.
+ if (CaseDest->getFirstNonPHIOrDbg() == CaseDest->getTerminator() &&
+ !isa<PHINode>(CaseDest->begin())) {
+
+ TerminatorInst *Terminator = CaseDest->getTerminator();
+ if (Terminator->getNumSuccessors() != 1)
+ return false;
+
+ Pred = CaseDest;
+ CaseDest = Terminator->getSuccessor(0);
+ }
+
+ // If we did not have a CommonDest before, use the current one.
+ if (!*CommonDest)
+ *CommonDest = CaseDest;
+ // If the destination isn't the common one, abort.
+ if (CaseDest != *CommonDest)
+ return false;
+
+ // Get the values for this case from phi nodes in the destination block.
+ BasicBlock::iterator I = (*CommonDest)->begin();
+ while (PHINode *PHI = dyn_cast<PHINode>(I++)) {
+ int Idx = PHI->getBasicBlockIndex(Pred);
+ if (Idx == -1)
+ continue;
+
+ Constant *ConstVal = dyn_cast<Constant>(PHI->getIncomingValue(Idx));
+ if (!ConstVal)
+ return false;
+
+ // Be conservative about which kinds of constants we support.
+ if (!ValidLookupTableConstant(ConstVal))
+ return false;
+
+ Res.push_back(std::make_pair(PHI, ConstVal));
+ }
+
+ return true;
+}
+
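A hedged sketch of the shape GetCaseResults accepts, with hypothetical block and value names: an empty case destination is looked through to its single successor, and every phi in the common destination must take a plain constant along that edge.

  sw.bb:                                ; empty case block, looked through
    br label %common
  common:
    %res = phi i32 [ 55, %sw.bb ], [ 0, %entry ]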
+/// BuildLookupTable - Build a lookup table with the contents of Results, using
+/// DefaultResult to fill the holes in the table. If the table ends up
+/// containing the same result in each element, set *SingleResult to that value
+/// and return NULL.
+static GlobalVariable *BuildLookupTable(Module &M,
+ uint64_t TableSize,
+ ConstantInt *Offset,
+ const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Results,
+ Constant *DefaultResult,
+ Constant **SingleResult) {
+ assert(Results.size() && "Need values to build lookup table");
+ assert(TableSize >= Results.size() && "Table needs to hold all values");
+
+ // If all values in the table are equal, this is that value.
+ Constant *SameResult = Results.begin()->second;
+
+ // Build up the table contents.
+ std::vector<Constant*> TableContents(TableSize);
+ for (size_t I = 0, E = Results.size(); I != E; ++I) {
+ ConstantInt *CaseVal = Results[I].first;
+ Constant *CaseRes = Results[I].second;
+
+ uint64_t Idx = (CaseVal->getValue() - Offset->getValue()).getLimitedValue();
+ TableContents[Idx] = CaseRes;
+
+ if (CaseRes != SameResult)
+ SameResult = NULL;
+ }
+
+ // Fill in any holes in the table with the default result.
+ if (Results.size() < TableSize) {
+ for (unsigned i = 0; i < TableSize; ++i) {
+ if (!TableContents[i])
+ TableContents[i] = DefaultResult;
+ }
+
+ if (DefaultResult != SameResult)
+ SameResult = NULL;
+ }
+
+ // Same result was used in the entire table; just return that.
+ if (SameResult) {
+ *SingleResult = SameResult;
+ return NULL;
+ }
+
+ ArrayType *ArrayTy = ArrayType::get(DefaultResult->getType(), TableSize);
+ Constant *Initializer = ConstantArray::get(ArrayTy, TableContents);
+
+ GlobalVariable *GV = new GlobalVariable(M, ArrayTy, /*constant=*/ true,
+ GlobalVariable::PrivateLinkage,
+ Initializer,
+ "switch.table");
+ GV->setUnnamedAddr(true);
+ return GV;
+}
+
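For a four-slot i32 table with one hole filled from the default result, the emitted global would look roughly like the sketch below (values hypothetical); when every slot turns out equal, no global is created and *SingleResult hands back the shared constant instead.

  @switch.table = private unnamed_addr constant [4 x i32] [i32 55, i32 123, i32 0, i32 -1]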
+/// SwitchToLookupTable - If the switch is only used to initialize one or more
+/// phi nodes in a common successor block with different constant values,
+/// replace the switch with lookup tables.
+static bool SwitchToLookupTable(SwitchInst *SI,
+ IRBuilder<> &Builder) {
+ assert(SI->getNumCases() > 1 && "Degenerate switch?");
+ // FIXME: Handle unreachable cases.
+
+ // FIXME: If the switch is too sparse for a lookup table, perhaps we could
+ // split off a dense part and build a lookup table for that.
+
+ // FIXME: If the results are all integers and the lookup table would fit in a
+ // target-legal register, we should store them as a bitmap and use shift/mask
+ // to look up the result.
+
+ // FIXME: This creates arrays of GEPs to constant strings, which means each
+ // GEP needs a runtime relocation in PIC code. We should just build one big
+  // string and look up indices into that.
+
+  // Ignore the switch if the number of cases is too small.
+ // This is similar to the check when building jump tables in
+ // SelectionDAGBuilder::handleJTSwitchCase.
+ // FIXME: Determine the best cut-off.
+ if (SI->getNumCases() < 4)
+ return false;
+
+ // Figure out the corresponding result for each case value and phi node in the
+  // common destination, as well as the min and max case values.
+ assert(SI->case_begin() != SI->case_end());
+ SwitchInst::CaseIt CI = SI->case_begin();
+ ConstantInt *MinCaseVal = CI.getCaseValue();
+ ConstantInt *MaxCaseVal = CI.getCaseValue();
+
+ BasicBlock *CommonDest = NULL;
+ typedef SmallVector<std::pair<ConstantInt*, Constant*>, 4> ResultListTy;
+ SmallDenseMap<PHINode*, ResultListTy> ResultLists;
+ SmallDenseMap<PHINode*, Constant*> DefaultResults;
+ SmallDenseMap<PHINode*, Type*> ResultTypes;
+ SmallVector<PHINode*, 4> PHIs;
+
+ for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) {
+ ConstantInt *CaseVal = CI.getCaseValue();
+ if (CaseVal->getValue().slt(MinCaseVal->getValue()))
+ MinCaseVal = CaseVal;
+ if (CaseVal->getValue().sgt(MaxCaseVal->getValue()))
+ MaxCaseVal = CaseVal;
+
+ // Resulting value at phi nodes for this case value.
+ typedef SmallVector<std::pair<PHINode*, Constant*>, 4> ResultsTy;
+ ResultsTy Results;
+ if (!GetCaseResults(SI, CI.getCaseSuccessor(), &CommonDest, Results))
+ return false;
+
+ // Append the result from this case to the list for each phi.
+    for (ResultsTy::iterator I = Results.begin(), E = Results.end(); I != E; ++I) {
+ if (!ResultLists.count(I->first))
+ PHIs.push_back(I->first);
+ ResultLists[I->first].push_back(std::make_pair(CaseVal, I->second));
+ }
+ }
+
+ // Get the resulting values for the default case.
+ SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList;
+ if (!GetCaseResults(SI, SI->getDefaultDest(), &CommonDest, DefaultResultsList))
+ return false;
+ for (size_t I = 0, E = DefaultResultsList.size(); I != E; ++I) {
+ PHINode *PHI = DefaultResultsList[I].first;
+ Constant *Result = DefaultResultsList[I].second;
+ DefaultResults[PHI] = Result;
+ ResultTypes[PHI] = Result->getType();
+ }
+
+ APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue();
+  // The table density should be at least 40%. This is the same criterion as for
+ // jump tables, see SelectionDAGBuilder::handleJTSwitchCase.
+ // FIXME: Find the best cut-off.
+  // Be careful to avoid overflow in the density computation.
+ if (RangeSpread.zextOrSelf(64).ugt(UINT64_MAX / 4 - 1))
+ return false;
+ uint64_t TableSize = RangeSpread.getLimitedValue() + 1;
+ if (SI->getNumCases() * 10 < TableSize * 4)
+ return false;
+
+ // Build the lookup tables.
+ SmallDenseMap<PHINode*, GlobalVariable*> LookupTables;
+ SmallDenseMap<PHINode*, Constant*> SingleResults;
+
+ Module &Mod = *CommonDest->getParent()->getParent();
+ for (SmallVector<PHINode*, 4>::iterator I = PHIs.begin(), E = PHIs.end();
+ I != E; ++I) {
+ PHINode *PHI = *I;
+
+ Constant *SingleResult = NULL;
+ LookupTables[PHI] = BuildLookupTable(Mod, TableSize, MinCaseVal,
+ ResultLists[PHI], DefaultResults[PHI],
+ &SingleResult);
+ SingleResults[PHI] = SingleResult;
+ }
+
+ // Create the BB that does the lookups.
+ BasicBlock *LookupBB = BasicBlock::Create(Mod.getContext(),
+ "switch.lookup",
+ CommonDest->getParent(),
+ CommonDest);
+
+ // Check whether the condition value is within the case range, and branch to
+ // the new BB.
+ Builder.SetInsertPoint(SI);
+ Value *TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal,
+ "switch.tableidx");
+ Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get(
+ MinCaseVal->getType(), TableSize));
+ Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+
+ // Populate the BB that does the lookups.
+ Builder.SetInsertPoint(LookupBB);
+ bool ReturnedEarly = false;
+ for (SmallVector<PHINode*, 4>::iterator I = PHIs.begin(), E = PHIs.end();
+ I != E; ++I) {
+ PHINode *PHI = *I;
+ // There was a single result for this phi; just use that.
+ if (Constant *SingleResult = SingleResults[PHI]) {
+ PHI->addIncoming(SingleResult, LookupBB);
+ continue;
+ }
+
+ Value *GEPIndices[] = { Builder.getInt32(0), TableIndex };
+ Value *GEP = Builder.CreateInBoundsGEP(LookupTables[PHI], GEPIndices,
+ "switch.gep");
+ Value *Result = Builder.CreateLoad(GEP, "switch.load");
+
+ // If the result is only going to be used to return from the function,
+ // we want to do that right here.
+ if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->use_begin())) {
+ if (CommonDest->getFirstNonPHIOrDbg() == CommonDest->getTerminator()) {
+ Builder.CreateRet(Result);
+ ReturnedEarly = true;
+ }
+ }
+
+ if (!ReturnedEarly)
+ PHI->addIncoming(Result, LookupBB);
+ }
+
+ if (!ReturnedEarly)
+ Builder.CreateBr(CommonDest);
+
+ // Remove the switch.
+ for (unsigned i = 0; i < SI->getNumSuccessors(); ++i) {
+ BasicBlock *Succ = SI->getSuccessor(i);
+ if (Succ == SI->getDefaultDest()) continue;
+ Succ->removePredecessor(SI->getParent());
+ }
+ SI->eraseFromParent();
+
+ ++NumLookupTables;
+ return true;
+}
+
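Taken together, the transformation turns a dense constant-producing switch into a range check plus a table load. A minimal before/after sketch in the era's IR syntax (hypothetical values; the real output also handles holes, multiple phis, and the early-return case):

  ; before: each case block just feeds a constant into a phi in %common
  switch i32 %x, label %default [ i32 0, label %a
                                  i32 1, label %b
                                  i32 2, label %c
                                  i32 3, label %d ]

  ; after
  %switch.tableidx = sub i32 %x, 0
  %0 = icmp ult i32 %switch.tableidx, 4
  br i1 %0, label %switch.lookup, label %default
  switch.lookup:
    %switch.gep = getelementptr inbounds [4 x i32]* @switch.table, i32 0, i32 %switch.tableidx
    %switch.load = load i32* %switch.gep
    br label %common

The 40% density guard rejects sparse inputs: 4 cases spanning the range 0..20 give a TableSize of 21, and 4 * 10 < 21 * 4, so no table is built there.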
bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
// If this switch is too complex to want to look at, ignore it.
if (!isValueEqualityComparison(SI))
@@ -2862,13 +3296,16 @@
if (ForwardSwitchConditionToPHI(SI))
return SimplifyCFG(BB) | true;
+ if (SwitchToLookupTable(SI, Builder))
+ return SimplifyCFG(BB) | true;
+
return false;
}
bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
BasicBlock *BB = IBI->getParent();
bool Changed = false;
-
+
// Eliminate redundant destinations.
SmallPtrSet<Value *, 8> Succs;
for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
@@ -2879,7 +3316,7 @@
--i; --e;
Changed = true;
}
- }
+ }
if (IBI->getNumDestinations() == 0) {
// If the indirectbr has no successors, change it to unreachable.
@@ -2887,14 +3324,14 @@
EraseTerminatorInstAndDCECond(IBI);
return true;
}
-
+
if (IBI->getNumDestinations() == 1) {
// If the indirectbr has one successor, change it to a direct branch.
BranchInst::Create(IBI->getDestination(0), IBI);
EraseTerminatorInstAndDCECond(IBI);
return true;
}
-
+
if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
if (SimplifyIndirectBrOnSelect(IBI, SI))
return SimplifyCFG(BB) | true;
@@ -2904,13 +3341,13 @@
bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
BasicBlock *BB = BI->getParent();
-
+
// If the Terminator is the only non-phi instruction, simplify the block.
BasicBlock::iterator I = BB->getFirstNonPHIOrDbgOrLifetime();
if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
TryToSimplifyUncondBranchFromEmptyBlock(BB))
return true;
-
+
// If the only instruction in the block is a seteq/setne comparison
// against a constant, try to simplify the block.
if (ICmpInst *ICI = dyn_cast<ICmpInst>(I))
@@ -2921,7 +3358,7 @@
TryToSimplifyUncondBranchWithICmpInIt(ICI, TD, Builder))
return true;
}
-
+
// If this basic block is ONLY a compare and a branch, and if a predecessor
// branches to us and our successor, fold the comparison into the
// predecessor and use logical operations to update the incoming value
@@ -2934,7 +3371,7 @@
bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
BasicBlock *BB = BI->getParent();
-
+
// Conditional branch
if (isValueEqualityComparison(BI)) {
// If we only have one predecessor, and if it is a branch on this value,
@@ -2943,7 +3380,7 @@
if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
return SimplifyCFG(BB) | true;
-
+
// This block must be empty, except for the setcond inst, if it exists.
// Ignore dbg intrinsics.
BasicBlock::iterator I = BB->begin();
@@ -2962,17 +3399,17 @@
return SimplifyCFG(BB) | true;
}
}
-
+
// Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction.
if (SimplifyBranchOnICmpChain(BI, TD, Builder))
return true;
-
+
// If this basic block is ONLY a compare and a branch, and if a predecessor
// branches to us and one of our successors, fold the comparison into the
// predecessor and use logical operations to pick the right destination.
if (FoldBranchToCommonDest(BI))
return SimplifyCFG(BB) | true;
-
+
// We have a conditional branch to two blocks that are only reachable
// from BI. We know that the condbr dominates the two blocks, so see if
// there is any identical code in the "then" and "else" blocks. If so, we
@@ -2999,14 +3436,14 @@
if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1)))
return SimplifyCFG(BB) | true;
}
-
+
// If this is a branch on a phi node in the current block, thread control
// through this block if any PHI node entries are constants.
if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
if (PN->getParent() == BI->getParent())
if (FoldCondBranchOnPHI(BI, TD))
return SimplifyCFG(BB) | true;
-
+
// Scan predecessor blocks for conditional branches.
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
@@ -3114,7 +3551,7 @@
//
if (MergeBlockIntoPredecessor(BB))
return true;
-
+
IRBuilder<> Builder(BB);
// If there is a trivial two-entry PHI node in this basic block, and we can
diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp
index 81eb9e0..528e6a1 100644
--- a/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/lib/Transforms/Utils/SimplifyInstructions.cpp
@@ -72,7 +72,7 @@
++NumSimplified;
Changed = true;
}
- Changed |= RecursivelyDeleteTriviallyDeadInstructions(I);
+ Changed |= RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
}
// Place the list of instructions to simplify on the next loop iteration
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index 62d23cb..c09dcd2 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -601,7 +601,7 @@
// It is important to cleanup here so that future iterations of this
// function have less work to do.
- (void) SimplifyInstructionsInBlock(&BB, TD);
+ (void) SimplifyInstructionsInBlock(&BB, TD, AA->getTargetLibraryInfo());
return true;
}
diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp
index c09c69b..f3f24ae 100644
--- a/lib/VMCore/AsmWriter.cpp
+++ b/lib/VMCore/AsmWriter.cpp
@@ -1029,6 +1029,9 @@
Out << "sideeffect ";
if (IA->isAlignStack())
Out << "alignstack ";
+ // We don't emit the AD_ATT dialect as it's the assumed default.
+ if (IA->getDialect() == InlineAsm::AD_Intel)
+ Out << "inteldialect ";
Out << '"';
PrintEscapedString(IA->getAsmString(), Out);
Out << "\", \"";
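With the dialect threaded through, Intel-style inline asm round-trips through the IR printer, while the AT&T dialect stays implicit as the default. A hedged example of the printed form (asm string hypothetical):

  call void asm sideeffect inteldialect "mov eax, 1", ""()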
diff --git a/lib/VMCore/Attributes.cpp b/lib/VMCore/Attributes.cpp
index c8219eb..d466ac6 100644
--- a/lib/VMCore/Attributes.cpp
+++ b/lib/VMCore/Attributes.cpp
@@ -88,9 +88,6 @@
Result += utostr(Attribute::getAlignmentFromAttrs(Attrs));
Result += " ";
}
- if (Attrs & Attribute::IANSDialect)
- Result += "ia_nsdialect ";
-
// Trim the trailing space.
assert(!Result.empty() && "Unknown attribute!");
Result.erase(Result.end()-1);
diff --git a/lib/VMCore/CMakeLists.txt b/lib/VMCore/CMakeLists.txt
index 6a20be6..c17e794 100644
--- a/lib/VMCore/CMakeLists.txt
+++ b/lib/VMCore/CMakeLists.txt
@@ -42,7 +42,7 @@
# Workaround: It takes over 20 minutes to compile with msvc10.
# FIXME: Suppressing optimizations to core libraries would not be a good thing.
-if( MSVC_VERSION EQUAL 1600 )
+if( MSVC_VERSION LESS 1700 )
set_property(
SOURCE Function.cpp
PROPERTY COMPILE_FLAGS "/Og-"
diff --git a/lib/VMCore/ConstantsContext.h b/lib/VMCore/ConstantsContext.h
index 8903a8f..0f81b3e 100644
--- a/lib/VMCore/ConstantsContext.h
+++ b/lib/VMCore/ConstantsContext.h
@@ -352,18 +352,21 @@
struct InlineAsmKeyType {
InlineAsmKeyType(StringRef AsmString,
StringRef Constraints, bool hasSideEffects,
- bool isAlignStack)
+ bool isAlignStack, InlineAsm::AsmDialect asmDialect)
: asm_string(AsmString), constraints(Constraints),
- has_side_effects(hasSideEffects), is_align_stack(isAlignStack) {}
+ has_side_effects(hasSideEffects), is_align_stack(isAlignStack),
+ asm_dialect(asmDialect) {}
std::string asm_string;
std::string constraints;
bool has_side_effects;
bool is_align_stack;
+ InlineAsm::AsmDialect asm_dialect;
bool operator==(const InlineAsmKeyType& that) const {
return this->asm_string == that.asm_string &&
this->constraints == that.constraints &&
this->has_side_effects == that.has_side_effects &&
- this->is_align_stack == that.is_align_stack;
+ this->is_align_stack == that.is_align_stack &&
+ this->asm_dialect == that.asm_dialect;
}
bool operator<(const InlineAsmKeyType& that) const {
if (this->asm_string != that.asm_string)
@@ -374,6 +377,8 @@
return this->has_side_effects < that.has_side_effects;
if (this->is_align_stack != that.is_align_stack)
return this->is_align_stack < that.is_align_stack;
+ if (this->asm_dialect != that.asm_dialect)
+ return this->asm_dialect < that.asm_dialect;
return false;
}
@@ -490,7 +495,8 @@
struct ConstantCreator<InlineAsm, PointerType, InlineAsmKeyType> {
static InlineAsm *create(PointerType *Ty, const InlineAsmKeyType &Key) {
return new InlineAsm(Ty, Key.asm_string, Key.constraints,
- Key.has_side_effects, Key.is_align_stack);
+ Key.has_side_effects, Key.is_align_stack,
+ Key.asm_dialect);
}
};
@@ -499,7 +505,8 @@
typedef InlineAsmKeyType ValType;
static ValType getValType(InlineAsm *Asm) {
return InlineAsmKeyType(Asm->getAsmString(), Asm->getConstraintString(),
- Asm->hasSideEffects(), Asm->isAlignStack());
+ Asm->hasSideEffects(), Asm->isAlignStack(),
+ Asm->getDialect());
}
};
diff --git a/lib/VMCore/GCOV.cpp b/lib/VMCore/GCOV.cpp
index 003a5d4..5bc1ac9 100644
--- a/lib/VMCore/GCOV.cpp
+++ b/lib/VMCore/GCOV.cpp
@@ -28,19 +28,19 @@
}
/// isGCDAFile - Return true if Format identifies a .gcda file.
-static bool isGCDAFile(GCOVFormat Format) {
- return Format == GCDA_402 || Format == GCDA_404;
+static bool isGCDAFile(GCOV::GCOVFormat Format) {
+ return Format == GCOV::GCDA_402 || Format == GCOV::GCDA_404;
}
/// isGCNOFile - Return true if Format identifies a .gcno file.
-static bool isGCNOFile(GCOVFormat Format) {
- return Format == GCNO_402 || Format == GCNO_404;
+static bool isGCNOFile(GCOV::GCOVFormat Format) {
+ return Format == GCOV::GCNO_402 || Format == GCOV::GCNO_404;
}
/// read - Read GCOV buffer.
bool GCOVFile::read(GCOVBuffer &Buffer) {
- GCOVFormat Format = Buffer.readGCOVFormat();
- if (Format == InvalidGCOV)
+ GCOV::GCOVFormat Format = Buffer.readGCOVFormat();
+ if (Format == GCOV::InvalidGCOV)
return false;
unsigned i = 0;
@@ -87,21 +87,21 @@
/// read - Read a function from the buffer. Return false if the buffer cursor
/// does not point to a function tag.
-bool GCOVFunction::read(GCOVBuffer &Buff, GCOVFormat Format) {
+bool GCOVFunction::read(GCOVBuffer &Buff, GCOV::GCOVFormat Format) {
if (!Buff.readFunctionTag())
return false;
Buff.readInt(); // Function header length
Ident = Buff.readInt();
Buff.readInt(); // Checksum #1
- if (Format != GCNO_402)
+ if (Format != GCOV::GCNO_402)
Buff.readInt(); // Checksum #2
Name = Buff.readString();
- if (Format == GCNO_402 || Format == GCNO_404)
+ if (Format == GCOV::GCNO_402 || Format == GCOV::GCNO_404)
Filename = Buff.readString();
- if (Format == GCDA_402 || Format == GCDA_404) {
+ if (Format == GCOV::GCDA_402 || Format == GCOV::GCDA_404) {
Buff.readArcTag();
uint32_t Count = Buff.readInt() / 2;
for (unsigned i = 0, e = Count; i != e; ++i) {
diff --git a/lib/VMCore/InlineAsm.cpp b/lib/VMCore/InlineAsm.cpp
index 736e370..2e636aa 100644
--- a/lib/VMCore/InlineAsm.cpp
+++ b/lib/VMCore/InlineAsm.cpp
@@ -27,19 +27,20 @@
InlineAsm *InlineAsm::get(FunctionType *Ty, StringRef AsmString,
StringRef Constraints, bool hasSideEffects,
- bool isAlignStack) {
- InlineAsmKeyType Key(AsmString, Constraints, hasSideEffects, isAlignStack);
+ bool isAlignStack, AsmDialect asmDialect) {
+ InlineAsmKeyType Key(AsmString, Constraints, hasSideEffects, isAlignStack,
+ asmDialect);
LLVMContextImpl *pImpl = Ty->getContext().pImpl;
return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(Ty), Key);
}
InlineAsm::InlineAsm(PointerType *Ty, const std::string &asmString,
const std::string &constraints, bool hasSideEffects,
- bool isAlignStack)
+ bool isAlignStack, AsmDialect asmDialect)
: Value(Ty, Value::InlineAsmVal),
- AsmString(asmString),
- Constraints(constraints), HasSideEffects(hasSideEffects),
- IsAlignStack(isAlignStack) {
+ AsmString(asmString), Constraints(constraints),
+ HasSideEffects(hasSideEffects), IsAlignStack(isAlignStack),
+ Dialect(asmDialect) {
// Do various checks on the constraint string and type.
assert(Verify(getFunctionType(), constraints) &&
diff --git a/lib/VMCore/PassManager.cpp b/lib/VMCore/PassManager.cpp
index 4530c04..53f1149 100644
--- a/lib/VMCore/PassManager.cpp
+++ b/lib/VMCore/PassManager.cpp
@@ -1189,7 +1189,7 @@
assert(PassDebugging >= Details);
if (Set.empty())
return;
- dbgs() << (void*)P << std::string(getDepth()*2+3, ' ') << Msg << " Analyses:";
+ dbgs() << (const void*)P << std::string(getDepth()*2+3, ' ') << Msg << " Analyses:";
for (unsigned i = 0; i != Set.size(); ++i) {
if (i) dbgs() << ',';
const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(Set[i]);
diff --git a/projects/CMakeLists.txt b/projects/CMakeLists.txt
index dac6373..c4567a3 100644
--- a/projects/CMakeLists.txt
+++ b/projects/CMakeLists.txt
@@ -10,3 +10,11 @@
endif()
endif()
endforeach(entry)
+
+# Also add in the compiler-rt tree if present and we have a sufficiently
+# recent version of CMake.
+if(${CMAKE_VERSION} VERSION_GREATER 2.8.7 AND
+ IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/compiler-rt AND
+ EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/compiler-rt/CMakeLists.txt)
+ add_subdirectory(compiler-rt)
+endif()
diff --git a/runtime/libprofile/CMakeLists.txt b/runtime/libprofile/CMakeLists.txt
index 414ad00..8609715 100644
--- a/runtime/libprofile/CMakeLists.txt
+++ b/runtime/libprofile/CMakeLists.txt
@@ -13,7 +13,8 @@
PROPERTIES
OUTPUT_NAME "profile_rt" )
-add_llvm_loadable_module( profile_rt-shared ${SOURCES} )
+set(BUILD_SHARED_LIBS ON)
+add_llvm_library( profile_rt-shared ${SOURCES} )
set_target_properties( profile_rt-shared
PROPERTIES
OUTPUT_NAME "profile_rt" )
diff --git a/runtime/libprofile/Profiling.h b/runtime/libprofile/Profiling.h
index c6b9a4d..acc6399 100644
--- a/runtime/libprofile/Profiling.h
+++ b/runtime/libprofile/Profiling.h
@@ -15,7 +15,7 @@
#ifndef PROFILING_H
#define PROFILING_H
-#include "llvm/Analysis/ProfileInfoTypes.h" /* for enum ProfilingType */
+#include "llvm/Analysis/ProfileDataTypes.h" /* for enum ProfilingType */
/* save_arguments - Save argc and argv as passed into the program for the file
* we output.
diff --git a/test/Analysis/BasicAA/noalias-geps.ll b/test/Analysis/BasicAA/noalias-geps.ll
new file mode 100644
index 0000000..a93d778
--- /dev/null
+++ b/test/Analysis/BasicAA/noalias-geps.ll
@@ -0,0 +1,54 @@
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+; Check that geps with equal base offsets of noalias base pointers stay noalias.
+define i32 @test(i32* %p, i16 %i) {
+ %pi = getelementptr i32* %p, i32 0
+ %pi.next = getelementptr i32* %p, i32 1
+ %b = icmp eq i16 %i, 0
+ br i1 %b, label %bb1, label %bb2
+
+bb1:
+ %f = getelementptr i32* %pi, i32 1
+ %g = getelementptr i32* %pi.next, i32 1
+ br label %bb3
+bb2:
+ %f2 = getelementptr i32* %pi, i32 1
+ %g2 = getelementptr i32* %pi.next, i32 1
+ br label %bb3
+
+bb3:
+ %ptr_phi = phi i32* [ %f, %bb1 ], [ %f2, %bb2 ]
+ %ptr_phi2 = phi i32* [ %g, %bb1 ], [ %g2, %bb2 ]
+; CHECK: NoAlias: i32* %f1, i32* %g1
+ %f1 = getelementptr i32* %ptr_phi , i32 1
+ %g1 = getelementptr i32* %ptr_phi2 , i32 1
+
+  ret i32 0
+}
+
+; Check that geps with equal indices of noalias base pointers stay noalias.
+define i32 @test2([2 x i32]* %p, i32 %i) {
+ %pi = getelementptr [2 x i32]* %p, i32 0
+ %pi.next = getelementptr [2 x i32]* %p, i32 1
+ %b = icmp eq i32 %i, 0
+ br i1 %b, label %bb1, label %bb2
+
+bb1:
+ %f = getelementptr [2 x i32]* %pi, i32 1
+ %g = getelementptr [2 x i32]* %pi.next, i32 1
+ br label %bb3
+bb2:
+ %f2 = getelementptr [2 x i32]* %pi, i32 1
+ %g2 = getelementptr [2 x i32]* %pi.next, i32 1
+ br label %bb3
+bb3:
+ %ptr_phi = phi [2 x i32]* [ %f, %bb1 ], [ %f2, %bb2 ]
+ %ptr_phi2 = phi [2 x i32]* [ %g, %bb1 ], [ %g2, %bb2 ]
+; CHECK: NoAlias: i32* %f1, i32* %g1
+ %f1 = getelementptr [2 x i32]* %ptr_phi , i32 1, i32 %i
+ %g1 = getelementptr [2 x i32]* %ptr_phi2 , i32 1, i32 %i
+
+  ret i32 0
+}
diff --git a/test/Analysis/BasicAA/phi-speculation.ll b/test/Analysis/BasicAA/phi-speculation.ll
new file mode 100644
index 0000000..21c6592
--- /dev/null
+++ b/test/Analysis/BasicAA/phi-speculation.ll
@@ -0,0 +1,33 @@
+target datalayout =
+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+; ptr_phi and ptr2_phi do not alias.
+; CHECK: NoAlias: i32* %ptr2_phi, i32* %ptr_phi
+
+define i32 @test_noalias(i32* %ptr2, i32 %count, i32* %coeff) {
+entry:
+ %ptr = getelementptr inbounds i32* %ptr2, i64 1
+ br label %while.body
+
+while.body:
+ %num = phi i32 [ %count, %entry ], [ %dec, %while.body ]
+ %ptr_phi = phi i32* [ %ptr, %entry ], [ %ptr_inc, %while.body ]
+ %ptr2_phi = phi i32* [ %ptr2, %entry ], [ %ptr2_inc, %while.body ]
+ %result.09 = phi i32 [ 0 , %entry ], [ %add, %while.body ]
+ %dec = add nsw i32 %num, -1
+ %0 = load i32* %ptr_phi, align 4
+ store i32 %0, i32* %ptr2_phi, align 4
+ %1 = load i32* %coeff, align 4
+ %2 = load i32* %ptr_phi, align 4
+ %mul = mul nsw i32 %1, %2
+ %add = add nsw i32 %mul, %result.09
+ %tobool = icmp eq i32 %dec, 0
+ %ptr_inc = getelementptr inbounds i32* %ptr_phi, i64 1
+ %ptr2_inc = getelementptr inbounds i32* %ptr2_phi, i64 1
+ br i1 %tobool, label %the_exit, label %while.body
+
+the_exit:
+ ret i32 %add
+}
diff --git a/test/Analysis/BranchProbabilityInfo/basic.ll b/test/Analysis/BranchProbabilityInfo/basic.ll
index 74d06a1..08adfa8 100644
--- a/test/Analysis/BranchProbabilityInfo/basic.ll
+++ b/test/Analysis/BranchProbabilityInfo/basic.ll
@@ -88,3 +88,30 @@
}
!1 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 64, i32 4, i32 4}
+
+define i32 @test4(i32 %x) nounwind uwtable readnone ssp {
+; CHECK: Printing analysis {{.*}} for function 'test4'
+entry:
+ %conv = sext i32 %x to i64
+ switch i64 %conv, label %return [
+ i64 0, label %sw.bb
+ i64 1, label %sw.bb
+ i64 2, label %sw.bb
+ i64 5, label %sw.bb1
+ ], !prof !2
+; CHECK: edge entry -> return probability is 7 / 85
+; CHECK: edge entry -> sw.bb probability is 14 / 85
+; CHECK: edge entry -> sw.bb1 probability is 64 / 85
+
+sw.bb:
+ br label %return
+
+sw.bb1:
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ 5, %sw.bb1 ], [ 1, %sw.bb ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+!2 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 4, i32 64}
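The expected probabilities fall out of !2 by simple arithmetic: the total weight is 7 + 6 + 4 + 4 + 64 = 85; the default edge to return keeps 7/85, the three case edges funnelled into sw.bb sum to 6 + 4 + 4 = 14/85, and sw.bb1 takes 64/85, matching the CHECK lines above.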
diff --git a/test/Analysis/Profiling/load-branch-weights-ifs.ll b/test/Analysis/Profiling/load-branch-weights-ifs.ll
new file mode 100644
index 0000000..ddbaf96
--- /dev/null
+++ b/test/Analysis/Profiling/load-branch-weights-ifs.ll
@@ -0,0 +1,122 @@
+; RUN: opt -insert-edge-profiling -o %t1 < %s
+; RUN: rm -f %t1.prof_data
+; RUN: lli -load %llvmshlibdir/libprofile_rt%shlibext %t1 \
+; RUN: -llvmprof-output %t1.prof_data
+; RUN: opt -profile-file %t1.prof_data -profile-metadata-loader -S -o - < %s \
+; RUN: | FileCheck %s
+; RUN: rm -f %t1.prof_data
+
+; FIXME: profile_rt.dll could be built on win32.
+; REQUIRES: loadable_module
+
+;; func_mod - Branch taken 6 times in 7.
+define i32 @func_mod(i32 %N) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %N.addr = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ %0 = load i32* %N.addr, align 4
+ %rem = srem i32 %0, 7
+ %tobool = icmp ne i32 %rem, 0
+ br i1 %tobool, label %if.then, label %if.else
+; CHECK: br i1 %tobool, label %if.then, label %if.else, !prof !0
+
+if.then:
+ store i32 1, i32* %retval
+ br label %return
+
+if.else:
+ store i32 0, i32* %retval
+ br label %return
+
+return:
+ %1 = load i32* %retval
+ ret i32 %1
+}
+
+;; func_const_true - conditional branch with 100% taken probability.
+define i32 @func_const_true(i32 %N) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %N.addr = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ %0 = load i32* %N.addr, align 4
+ %cmp = icmp eq i32 %0, 1
+ br i1 %cmp, label %if.then, label %if.end
+; CHECK: br i1 %cmp, label %if.then, label %if.end, !prof !1
+
+if.then:
+ store i32 1, i32* %retval
+ br label %return
+
+if.end:
+ store i32 0, i32* %retval
+ br label %return
+
+return:
+ %1 = load i32* %retval
+ ret i32 %1
+}
+
+;; func_const_false - conditional branch with 100% not-taken probability.
+define i32 @func_const_false(i32 %N) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %N.addr = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ %0 = load i32* %N.addr, align 4
+ %cmp = icmp eq i32 %0, 1
+ br i1 %cmp, label %if.then, label %if.end
+; CHECK: br i1 %cmp, label %if.then, label %if.end, !prof !2
+
+if.then:
+ store i32 1, i32* %retval
+ br label %return
+
+if.end:
+ store i32 0, i32* %retval
+ br label %return
+
+return:
+ %1 = load i32* %retval
+ ret i32 %1
+}
+
+define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %argc.addr = alloca i32, align 4
+ %argv.addr = alloca i8**, align 8
+ %loop = alloca i32, align 4
+ store i32 0, i32* %retval
+ store i32 0, i32* %loop, align 4
+ br label %for.cond
+
+for.cond:
+ %0 = load i32* %loop, align 4
+ %cmp = icmp slt i32 %0, 7000
+ br i1 %cmp, label %for.body, label %for.end
+; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !3
+
+for.body:
+ %1 = load i32* %loop, align 4
+ %call = call i32 @func_mod(i32 %1)
+ br label %for.inc
+
+for.inc:
+ %2 = load i32* %loop, align 4
+ %inc = add nsw i32 %2, 1
+ store i32 %inc, i32* %loop, align 4
+ br label %for.cond
+
+for.end:
+ %call1 = call i32 @func_const_true(i32 1)
+ %call2 = call i32 @func_const_false(i32 0)
+ ret i32 0
+}
+
+; CHECK: !0 = metadata !{metadata !"branch_weights", i32 6000, i32 1000}
+; CHECK: !1 = metadata !{metadata !"branch_weights", i32 1, i32 0}
+; CHECK: !2 = metadata !{metadata !"branch_weights", i32 0, i32 1}
+; CHECK: !3 = metadata !{metadata !"branch_weights", i32 7000, i32 1}
+; CHECK-NOT: !4
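+; Where these weights come from: main's loop runs 7000 times, and %N % 7 is
+; non-zero for 6000 of them (!0); the loop back-edge is taken 7000 times
+; against a single exit (!3); func_const_true and func_const_false each run
+; once, giving the degenerate !1 and !2.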
diff --git a/test/Analysis/Profiling/load-branch-weights-loops.ll b/test/Analysis/Profiling/load-branch-weights-loops.ll
new file mode 100644
index 0000000..476f377b
--- /dev/null
+++ b/test/Analysis/Profiling/load-branch-weights-loops.ll
@@ -0,0 +1,188 @@
+; RUN: opt -insert-edge-profiling -o %t1 < %s
+; RUN: rm -f %t1.prof_data
+; RUN: lli -load %llvmshlibdir/libprofile_rt%shlibext %t1 \
+; RUN: -llvmprof-output %t1.prof_data
+; RUN: opt -profile-file %t1.prof_data -profile-metadata-loader -S -o - < %s \
+; RUN: | FileCheck %s
+; RUN: rm -f %t1.prof_data
+
+; FIXME: profile_rt.dll could be built on win32.
+; REQUIRES: loadable_module
+
+;; func_for - Test branch probabilities for a vanilla for loop.
+define i32 @func_for(i32 %N) nounwind uwtable {
+entry:
+ %N.addr = alloca i32, align 4
+ %ret = alloca i32, align 4
+ %loop = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ store i32 0, i32* %ret, align 4
+ store i32 0, i32* %loop, align 4
+ br label %for.cond
+
+for.cond:
+ %0 = load i32* %loop, align 4
+ %1 = load i32* %N.addr, align 4
+ %cmp = icmp slt i32 %0, %1
+ br i1 %cmp, label %for.body, label %for.end
+; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !0
+
+for.body:
+ %2 = load i32* %N.addr, align 4
+ %3 = load i32* %ret, align 4
+ %add = add nsw i32 %3, %2
+ store i32 %add, i32* %ret, align 4
+ br label %for.inc
+
+for.inc:
+ %4 = load i32* %loop, align 4
+ %inc = add nsw i32 %4, 1
+ store i32 %inc, i32* %loop, align 4
+ br label %for.cond
+
+for.end:
+ %5 = load i32* %ret, align 4
+ ret i32 %5
+}
+
+;; func_for_odd - Test branch probabilities for a for loop with a continue and
+;; a break.
+define i32 @func_for_odd(i32 %N) nounwind uwtable {
+entry:
+ %N.addr = alloca i32, align 4
+ %ret = alloca i32, align 4
+ %loop = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ store i32 0, i32* %ret, align 4
+ store i32 0, i32* %loop, align 4
+ br label %for.cond
+
+for.cond:
+ %0 = load i32* %loop, align 4
+ %1 = load i32* %N.addr, align 4
+ %cmp = icmp slt i32 %0, %1
+ br i1 %cmp, label %for.body, label %for.end
+; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !1
+
+for.body:
+ %2 = load i32* %loop, align 4
+ %rem = srem i32 %2, 10
+ %tobool = icmp ne i32 %rem, 0
+ br i1 %tobool, label %if.then, label %if.end
+; CHECK: br i1 %tobool, label %if.then, label %if.end, !prof !2
+
+if.then:
+ br label %for.inc
+
+if.end:
+ %3 = load i32* %loop, align 4
+ %cmp1 = icmp eq i32 %3, 500
+ br i1 %cmp1, label %if.then2, label %if.end3
+; CHECK: br i1 %cmp1, label %if.then2, label %if.end3, !prof !3
+
+if.then2:
+ br label %for.end
+
+if.end3:
+ %4 = load i32* %N.addr, align 4
+ %5 = load i32* %ret, align 4
+ %add = add nsw i32 %5, %4
+ store i32 %add, i32* %ret, align 4
+ br label %for.inc
+
+for.inc:
+ %6 = load i32* %loop, align 4
+ %inc = add nsw i32 %6, 1
+ store i32 %inc, i32* %loop, align 4
+ br label %for.cond
+
+for.end:
+ %7 = load i32* %ret, align 4
+ ret i32 %7
+}
+
+;; func_while - Test branch probability in a vanilla while loop.
+define i32 @func_while(i32 %N) nounwind uwtable {
+entry:
+ %N.addr = alloca i32, align 4
+ %ret = alloca i32, align 4
+ %loop = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ store i32 0, i32* %ret, align 4
+ store i32 0, i32* %loop, align 4
+ br label %while.cond
+
+while.cond:
+ %0 = load i32* %loop, align 4
+ %1 = load i32* %N.addr, align 4
+ %cmp = icmp slt i32 %0, %1
+ br i1 %cmp, label %while.body, label %while.end
+; CHECK: br i1 %cmp, label %while.body, label %while.end, !prof !0
+
+while.body:
+ %2 = load i32* %N.addr, align 4
+ %3 = load i32* %ret, align 4
+ %add = add nsw i32 %3, %2
+ store i32 %add, i32* %ret, align 4
+ %4 = load i32* %loop, align 4
+ %inc = add nsw i32 %4, 1
+ store i32 %inc, i32* %loop, align 4
+ br label %while.cond
+
+while.end:
+ %5 = load i32* %ret, align 4
+ ret i32 %5
+}
+
+;; func_do_while - Test branch probability in a vanilla do-while loop.
+define i32 @func_do_while(i32 %N) nounwind uwtable {
+entry:
+ %N.addr = alloca i32, align 4
+ %ret = alloca i32, align 4
+ %loop = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ store i32 0, i32* %ret, align 4
+ store i32 0, i32* %loop, align 4
+ br label %do.body
+
+do.body:
+ %0 = load i32* %N.addr, align 4
+ %1 = load i32* %ret, align 4
+ %add = add nsw i32 %1, %0
+ store i32 %add, i32* %ret, align 4
+ %2 = load i32* %loop, align 4
+ %inc = add nsw i32 %2, 1
+ store i32 %inc, i32* %loop, align 4
+ br label %do.cond
+
+do.cond:
+ %3 = load i32* %loop, align 4
+ %4 = load i32* %N.addr, align 4
+ %cmp = icmp slt i32 %3, %4
+ br i1 %cmp, label %do.body, label %do.end
+; CHECK: br i1 %cmp, label %do.body, label %do.end, !prof !4
+
+do.end:
+ %5 = load i32* %ret, align 4
+ ret i32 %5
+}
+
+define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %argc.addr = alloca i32, align 4
+ %argv.addr = alloca i8**, align 8
+ store i32 0, i32* %retval
+ %call = call i32 @func_for(i32 1000)
+ %call1 = call i32 @func_for_odd(i32 1000)
+ %call2 = call i32 @func_while(i32 1000)
+ %call3 = call i32 @func_do_while(i32 1000)
+ ret i32 0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 1000, i32 1}
+!1 = metadata !{metadata !"branch_weights", i32 501, i32 0}
+!2 = metadata !{metadata !"branch_weights", i32 450, i32 51}
+!3 = metadata !{metadata !"branch_weights", i32 1, i32 50}
+!4 = metadata !{metadata !"branch_weights", i32 999, i32 1}
+; CHECK-NOT: !5
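+; Where the func_for_odd weights come from: the break fires at %loop == 500,
+; so the loop condition is true for %loop = 0..500 (501 times) and never false
+; (!1); 51 of those 501 values are multiples of 10 (!2); and the == 500 test
+; succeeds once out of the 51 times it runs (!3).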
diff --git a/test/Analysis/Profiling/load-branch-weights-switches.ll b/test/Analysis/Profiling/load-branch-weights-switches.ll
new file mode 100644
index 0000000..be11f04
--- /dev/null
+++ b/test/Analysis/Profiling/load-branch-weights-switches.ll
@@ -0,0 +1,165 @@
+; RUN: opt -insert-edge-profiling -o %t1 < %s
+; RUN: rm -f %t1.prof_data
+; RUN: lli -load %llvmshlibdir/libprofile_rt%shlibext %t1 \
+; RUN: -llvmprof-output %t1.prof_data
+; RUN: opt -profile-file %t1.prof_data -profile-metadata-loader -S -o - < %s \
+; RUN: | FileCheck %s
+; RUN: rm -f %t1.prof_data
+
+; FIXME: profile_rt.dll could be built on win32.
+; REQUIRES: loadable_module
+
+;; func_switch - Test branch probabilities for a switch instruction with an
+;; even chance of taking each case (or no case).
+define i32 @func_switch(i32 %N) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %N.addr = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ %0 = load i32* %N.addr, align 4
+ %rem = srem i32 %0, 4
+ switch i32 %rem, label %sw.epilog [
+ i32 0, label %sw.bb
+ i32 1, label %sw.bb1
+ i32 2, label %sw.bb2
+ ]
+; CHECK: ], !prof !0
+
+sw.bb:
+ store i32 5, i32* %retval
+ br label %return
+
+sw.bb1:
+ store i32 6, i32* %retval
+ br label %return
+
+sw.bb2:
+ store i32 7, i32* %retval
+ br label %return
+
+sw.epilog:
+ store i32 8, i32* %retval
+ br label %return
+
+return:
+ %1 = load i32* %retval
+ ret i32 %1
+}
+
+;; func_switch_switch - Test branch probabilities in a switch instruction that
+;; leads to further switch instructions. The first-tier switch occludes some
+;; possibilities in the second-tier switches, leading to some branches having a
+;; 0 probability.
+define i32 @func_switch_switch(i32 %N) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %N.addr = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ %0 = load i32* %N.addr, align 4
+ %rem = srem i32 %0, 2
+ switch i32 %rem, label %sw.default11 [
+ i32 0, label %sw.bb
+ i32 1, label %sw.bb5
+ ]
+; CHECK: ], !prof !1
+
+sw.bb:
+ %1 = load i32* %N.addr, align 4
+ %rem1 = srem i32 %1, 4
+ switch i32 %rem1, label %sw.default [
+ i32 0, label %sw.bb2
+ i32 1, label %sw.bb3
+ i32 2, label %sw.bb4
+ ]
+; CHECK: ], !prof !2
+
+sw.bb2:
+ store i32 5, i32* %retval
+ br label %return
+
+sw.bb3:
+ store i32 6, i32* %retval
+ br label %return
+
+sw.bb4:
+ store i32 7, i32* %retval
+ br label %return
+
+sw.default:
+ store i32 8, i32* %retval
+ br label %return
+
+sw.bb5:
+ %2 = load i32* %N.addr, align 4
+ %rem6 = srem i32 %2, 4
+ switch i32 %rem6, label %sw.default10 [
+ i32 0, label %sw.bb7
+ i32 1, label %sw.bb8
+ i32 2, label %sw.bb9
+ ]
+; CHECK: ], !prof !3
+
+sw.bb7:
+ store i32 9, i32* %retval
+ br label %return
+
+sw.bb8:
+ store i32 10, i32* %retval
+ br label %return
+
+sw.bb9:
+ store i32 11, i32* %retval
+ br label %return
+
+sw.default10:
+ store i32 12, i32* %retval
+ br label %return
+
+sw.default11:
+ store i32 13, i32* %retval
+ br label %return
+
+return:
+ %3 = load i32* %retval
+ ret i32 %3
+}
+
+define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %argc.addr = alloca i32, align 4
+ %argv.addr = alloca i8**, align 8
+ %loop = alloca i32, align 4
+ store i32 0, i32* %retval
+ store i32 0, i32* %loop, align 4
+ br label %for.cond
+
+for.cond:
+ %0 = load i32* %loop, align 4
+ %cmp = icmp slt i32 %0, 4000
+ br i1 %cmp, label %for.body, label %for.end
+; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !4
+
+for.body:
+ %1 = load i32* %loop, align 4
+ %call = call i32 @func_switch(i32 %1)
+ %2 = load i32* %loop, align 4
+ %call1 = call i32 @func_switch_switch(i32 %2)
+ br label %for.inc
+
+for.inc:
+ %3 = load i32* %loop, align 4
+ %inc = add nsw i32 %3, 1
+ store i32 %inc, i32* %loop, align 4
+ br label %for.cond
+
+for.end:
+ ret i32 0
+}
+
+; CHECK: !0 = metadata !{metadata !"branch_weights", i32 1000, i32 1000, i32 1000, i32 1000}
+; CHECK: !1 = metadata !{metadata !"branch_weights", i32 0, i32 2000, i32 2000}
+; CHECK: !2 = metadata !{metadata !"branch_weights", i32 0, i32 1000, i32 0, i32 1000}
+; CHECK: !3 = metadata !{metadata !"branch_weights", i32 1000, i32 0, i32 1000, i32 0}
+; CHECK: !4 = metadata !{metadata !"branch_weights", i32 4000, i32 1}
+; CHECK-NOT: !5
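+; Where the weights come from: %N % 4 spreads the 4000 calls evenly over the
+; four func_switch targets (!0); %N % 2 never takes the first-tier default
+; (!1); and each second-tier switch only ever sees one parity, so two of its
+; four targets get weight 0 (!2 and !3).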
diff --git a/test/Bindings/Ocaml/vmcore.ml b/test/Bindings/Ocaml/vmcore.ml
index b8eb6d3..61be4b7 100644
--- a/test/Bindings/Ocaml/vmcore.ml
+++ b/test/Bindings/Ocaml/vmcore.ml
@@ -113,14 +113,14 @@
ignore (define_global "const_int_string" c m);
insist (i32_type = type_of c);
- (* RUN: grep 'const_string.*"cruel\00world"' < %t.ll
+ (* RUN: grep 'const_string.*"cruel\\00world"' < %t.ll
*)
group "string";
let c = const_string context "cruel\000world" in
ignore (define_global "const_string" c m);
insist ((array_type i8_type 11) = type_of c);
- (* RUN: grep 'const_stringz.*"hi\00again\00"' < %t.ll
+ (* RUN: grep 'const_stringz.*"hi\\00again\\00"' < %t.ll
*)
group "stringz";
let c = const_stringz context "hi\000again" in
@@ -187,7 +187,7 @@
ignore (define_global "const_all_ones" c m);
group "pointer null"; begin
- (* RUN: grep "const_pointer_null = global i64* null" < %t.ll
+ (* RUN: grep "const_pointer_null = global i64\* null" < %t.ll
*)
let c = const_pointer_null (pointer_type i64_type) in
ignore (define_global "const_pointer_null" c m);
@@ -542,7 +542,7 @@
(*===-- Aliases -----------------------------------------------------------===*)
let test_aliases () =
- (* RUN: grep "@alias = alias i32* @aliasee" < %t.ll
+ (* RUN: grep "@alias = alias i32\* @aliasee" < %t.ll
*)
let v = declare_global i32_type "aliasee" m in
ignore (add_alias m (pointer_type i32_type) v "alias")
@@ -554,7 +554,7 @@
let ty = function_type i32_type [| i32_type; i64_type |] in
let ty2 = function_type i8_type [| i8_type; i64_type |] in
- (* RUN: grep "declare i32 @Fn1\(i32, i64\)" < %t.ll
+ (* RUN: grep 'declare i32 @Fn1(i32, i64)' < %t.ll
*)
begin group "declare";
insist (None = lookup_function "Fn1" m);
@@ -935,7 +935,7 @@
group "malloc/free"; begin
(* RUN: grep "call.*@malloc(i32 ptrtoint" < %t.ll
- * RUN: grep "call.*@free(i8*" < %t.ll
+ * RUN: grep "call.*@free(i8\*" < %t.ll
* RUN: grep "call.*@malloc(i32 %" < %t.ll
*)
let bb1 = append_block context "MallocBlock1" fn in
@@ -947,7 +947,7 @@
end;
group "indirectbr"; begin
- (* RUN: grep "indirectbr i8* blockaddress(@X7, %IBRBlock2), [label %IBRBlock2, label %IBRBlock3]" < %t.ll
+ (* RUN: grep "indirectbr i8\* blockaddress(@X7, %IBRBlock2), \[label %IBRBlock2, label %IBRBlock3\]" < %t.ll
*)
let bb1 = append_block context "IBRBlock1" fn in
@@ -1054,10 +1054,10 @@
(* RUN: grep "%build_alloca = alloca i32" < %t.ll
* RUN: grep "%build_array_alloca = alloca i32, i32 %P2" < %t.ll
- * RUN: grep "%build_load = load i32* %build_array_alloca" < %t.ll
- * RUN: grep "store i32 %P2, i32* %build_alloca" < %t.ll
- * RUN: grep "%build_gep = getelementptr i32* %build_array_alloca, i32 %P2" < %t.ll
- * RUN: grep "%build_in_bounds_gep = getelementptr inbounds i32* %build_array_alloca, i32 %P2" < %t.ll
+ * RUN: grep "%build_load = load i32\* %build_array_alloca" < %t.ll
+ * RUN: grep "store i32 %P2, i32\* %build_alloca" < %t.ll
+ * RUN: grep "%build_gep = getelementptr i32\* %build_array_alloca, i32 %P2" < %t.ll
+ * RUN: grep "%build_in_bounds_gep = getelementptr inbounds i32\* %build_array_alloca, i32 %P2" < %t.ll
* RUN: grep "%build_struct_gep = getelementptr inbounds.*%build_alloca2, i32 0, i32 1" < %t.ll
*)
let alloca = build_alloca i32_type "build_alloca" b in
@@ -1106,14 +1106,14 @@
* RUN: grep "%build_fptrunc2 = fptrunc double %build_sitofp to float" < %t.ll
* RUN: grep "%build_fpext = fpext float %build_fptrunc to double" < %t.ll
* RUN: grep "%build_fpext2 = fpext float %build_fptrunc to double" < %t.ll
- * RUN: grep "%build_inttoptr = inttoptr i32 %P1 to i8*" < %t.ll
- * RUN: grep "%build_ptrtoint = ptrtoint i8* %build_inttoptr to i64" < %t.ll
- * RUN: grep "%build_ptrtoint2 = ptrtoint i8* %build_inttoptr to i64" < %t.ll
+ * RUN: grep "%build_inttoptr = inttoptr i32 %P1 to i8\*" < %t.ll
+ * RUN: grep "%build_ptrtoint = ptrtoint i8\* %build_inttoptr to i64" < %t.ll
+ * RUN: grep "%build_ptrtoint2 = ptrtoint i8\* %build_inttoptr to i64" < %t.ll
* RUN: grep "%build_bitcast = bitcast i64 %build_ptrtoint to double" < %t.ll
* RUN: grep "%build_bitcast2 = bitcast i64 %build_ptrtoint to double" < %t.ll
* RUN: grep "%build_bitcast3 = bitcast i64 %build_ptrtoint to double" < %t.ll
* RUN: grep "%build_bitcast4 = bitcast i64 %build_ptrtoint to double" < %t.ll
- * RUN: grep "%build_pointercast = bitcast i8* %build_inttoptr to i16*" < %t.ll
+ * RUN: grep "%build_pointercast = bitcast i8\* %build_inttoptr to i16*" < %t.ll
*)
let inst28 = build_trunc p1 i8_type "build_trunc" atentry in
let inst29 = build_zext inst28 i32_type "build_zext" atentry in
@@ -1148,7 +1148,7 @@
* RUN: grep "%build_fcmp_false = fcmp false float %F1, %F2" < %t.ll
* RUN: grep "%build_fcmp_true = fcmp true float %F2, %F1" < %t.ll
* RUN: grep "%build_is_null.*= icmp eq.*%X0,.*null" < %t.ll
- * RUN: grep "%build_is_not_null = icmp ne i8* %X1, null" < %t.ll
+ * RUN: grep "%build_is_not_null = icmp ne i8\* %X1, null" < %t.ll
* RUN: grep "%build_ptrdiff" < %t.ll
*)
ignore (build_icmp Icmp.Ne p1 p2 "build_icmp_ne" atentry);
@@ -1167,7 +1167,7 @@
group "miscellaneous"; begin
(* RUN: grep "%build_call = tail call cc63 i32 @.*(i32 signext %P2, i32 %P1)" < %t.ll
* RUN: grep "%build_select = select i1 %build_icmp, i32 %P1, i32 %P2" < %t.ll
- * RUN: grep "%build_va_arg = va_arg i8** null, i32" < %t.ll
+ * RUN: grep "%build_va_arg = va_arg i8\*\* null, i32" < %t.ll
* RUN: grep "%build_extractelement = extractelement <4 x i32> %Vec1, i32 %P2" < %t.ll
* RUN: grep "%build_insertelement = insertelement <4 x i32> %Vec1, i32 %P1, i32 %P2" < %t.ll
* RUN: grep "%build_shufflevector = shufflevector <4 x i32> %Vec1, <4 x i32> %Vec2, <4 x i32> <i32 1, i32 1, i32 0, i32 0>" < %t.ll
@@ -1240,8 +1240,8 @@
end;
group "dbg"; begin
- (* RUN: grep "%dbg = add i32 %P1, %P2, !dbg !1" < %t.ll
- * RUN: grep "!1 = metadata !{i32 2, i32 3, metadata !2, metadata !2}" < %t.ll
+ (* RUN: grep '%dbg = add i32 %P1, %P2, !dbg !1' < %t.ll
+ * RUN: grep '!1 = metadata !{i32 2, i32 3, metadata !2, metadata !2}' < %t.ll
*)
insist ((current_debug_location atentry) = None);
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 991cc9d..b9b2237 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -18,6 +18,7 @@
llvm-dis llvm-extract llvm-dwarfdump
llvm-link llvm-mc llvm-nm llvm-objdump llvm-readobj
macho-dump opt
+ profile_rt-shared
FileCheck count not
yaml2obj
)
diff --git a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
index f80b44f..1769ee5 100644
--- a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
+++ b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
@@ -300,3 +300,34 @@
declare <4 x float> @llvm.sin.v4f32(<4 x float>) nounwind readonly
+define void @test_floor(<4 x float>* %X) nounwind {
+
+; CHECK: test_floor:
+
+; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
+; CHECK: movt [[reg0]], :upper16:{{.*}}
+; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+
+; CHECK: {{v?mov(.32)?}} r0,
+; CHECK: bl {{.*}}floorf
+
+; CHECK: {{v?mov(.32)?}} r0,
+; CHECK: bl {{.*}}floorf
+
+; CHECK: {{v?mov(.32)?}} r0,
+; CHECK: bl {{.*}}floorf
+
+; CHECK: {{v?mov(.32)?}} r0,
+; CHECK: bl {{.*}}floorf
+
+; CHECK: vstmia {{.*}}
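+; llvm.floor.v4f32 is scalarized into four floorf libcalls, as the four
+; "bl floorf" checks above verify.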
+
+L.entry:
+ %0 = load <4 x float>* @A, align 16
+ %1 = call <4 x float> @llvm.floor.v4f32(<4 x float> %0)
+ store <4 x float> %1, <4 x float>* %X, align 16
+ ret void
+}
+
+declare <4 x float> @llvm.floor.v4f32(<4 x float>) nounwind readonly
+
diff --git a/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll b/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
new file mode 100644
index 0000000..ec7f72d
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
@@ -0,0 +1,129 @@
+; RUN: llc < %s -mcpu=cortex-a8 -march=thumb
+; Test that this doesn't crash.
+; <rdar://problem/12183003>
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios5.1.0"
+
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly
+
+declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind
+
+define void @findEdges(i8*) nounwind ssp {
+ %2 = icmp sgt i32 undef, 0
+ br i1 %2, label %5, label %3
+
+; <label>:3 ; preds = %5, %1
+ %4 = phi i8* [ %0, %1 ], [ %19, %5 ]
+ ret void
+
+; <label>:5 ; preds = %5, %1
+ %6 = phi i8* [ %19, %5 ], [ %0, %1 ]
+ %7 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* null, i32 1)
+ %8 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %7, 0
+ %9 = getelementptr inbounds i8* null, i32 3
+ %10 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %9, i32 1)
+ %11 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %10, 2
+ %12 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %6, i32 1)
+ %13 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 0
+ %14 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 1
+ %15 = getelementptr inbounds i8* %6, i32 3
+ %16 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %15, i32 1)
+ %17 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 1
+ %18 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 2
+ %19 = getelementptr inbounds i8* %6, i32 48
+ %20 = bitcast <16 x i8> %13 to <2 x i64>
+ %21 = bitcast <16 x i8> %8 to <2 x i64>
+ %22 = bitcast <16 x i8> %14 to <2 x i64>
+ %23 = shufflevector <2 x i64> %22, <2 x i64> undef, <1 x i32> zeroinitializer
+ %24 = bitcast <1 x i64> %23 to <8 x i8>
+ %25 = zext <8 x i8> %24 to <8 x i16>
+ %26 = sub <8 x i16> zeroinitializer, %25
+ %27 = bitcast <16 x i8> %17 to <2 x i64>
+ %28 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %26) nounwind
+ %29 = mul <8 x i16> %28, %28
+ %30 = add <8 x i16> zeroinitializer, %29
+ %31 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> undef, <8 x i16> %30) nounwind
+ %32 = bitcast <16 x i8> %11 to <2 x i64>
+ %33 = shufflevector <2 x i64> %32, <2 x i64> undef, <1 x i32> zeroinitializer
+ %34 = bitcast <1 x i64> %33 to <8 x i8>
+ %35 = zext <8 x i8> %34 to <8 x i16>
+ %36 = sub <8 x i16> %35, zeroinitializer
+ %37 = bitcast <16 x i8> %18 to <2 x i64>
+ %38 = shufflevector <2 x i64> %37, <2 x i64> undef, <1 x i32> zeroinitializer
+ %39 = bitcast <1 x i64> %38 to <8 x i8>
+ %40 = zext <8 x i8> %39 to <8 x i16>
+ %41 = sub <8 x i16> zeroinitializer, %40
+ %42 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %36) nounwind
+ %43 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %41) nounwind
+ %44 = mul <8 x i16> %42, %42
+ %45 = mul <8 x i16> %43, %43
+ %46 = add <8 x i16> %45, %44
+ %47 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %31, <8 x i16> %46) nounwind
+ %48 = bitcast <8 x i16> %47 to <2 x i64>
+ %49 = shufflevector <2 x i64> %48, <2 x i64> undef, <1 x i32> zeroinitializer
+ %50 = bitcast <1 x i64> %49 to <4 x i16>
+ %51 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %50, <4 x i16> undef) nounwind
+ %52 = tail call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> %51, <4 x i32> <i32 -6, i32 -6, i32 -6, i32 -6>)
+ %53 = bitcast <4 x i16> %52 to <1 x i64>
+ %54 = shufflevector <1 x i64> %53, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %55 = bitcast <2 x i64> %54 to <8 x i16>
+ %56 = tail call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %55, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>)
+ %57 = shufflevector <2 x i64> %20, <2 x i64> undef, <1 x i32> <i32 1>
+ %58 = bitcast <1 x i64> %57 to <8 x i8>
+ %59 = zext <8 x i8> %58 to <8 x i16>
+ %60 = sub <8 x i16> zeroinitializer, %59
+ %61 = shufflevector <2 x i64> %21, <2 x i64> undef, <1 x i32> <i32 1>
+ %62 = bitcast <1 x i64> %61 to <8 x i8>
+ %63 = zext <8 x i8> %62 to <8 x i16>
+ %64 = sub <8 x i16> %63, zeroinitializer
+ %65 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %60) nounwind
+ %66 = mul <8 x i16> %65, %65
+ %67 = add <8 x i16> zeroinitializer, %66
+ %68 = shufflevector <2 x i64> %27, <2 x i64> undef, <1 x i32> <i32 1>
+ %69 = bitcast <1 x i64> %68 to <8 x i8>
+ %70 = zext <8 x i8> %69 to <8 x i16>
+ %71 = sub <8 x i16> zeroinitializer, %70
+ %72 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> undef) nounwind
+ %73 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %71) nounwind
+ %74 = mul <8 x i16> %72, %72
+ %75 = mul <8 x i16> %73, %73
+ %76 = add <8 x i16> %75, %74
+ %77 = shufflevector <2 x i64> %32, <2 x i64> undef, <1 x i32> <i32 1>
+ %78 = bitcast <1 x i64> %77 to <8 x i8>
+ %79 = zext <8 x i8> %78 to <8 x i16>
+ %80 = sub <8 x i16> %79, zeroinitializer
+ %81 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %80) nounwind
+ %82 = mul <8 x i16> %81, %81
+ %83 = add <8 x i16> zeroinitializer, %82
+ %84 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %76, <8 x i16> %83) nounwind
+ %85 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %67, <8 x i16> %84) nounwind
+ %86 = bitcast <8 x i16> %85 to <2 x i64>
+ %87 = shufflevector <2 x i64> %86, <2 x i64> undef, <1 x i32> <i32 1>
+ %88 = bitcast <1 x i64> %87 to <4 x i16>
+ %89 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %88, <4 x i16> undef) nounwind
+ %90 = tail call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> %89, <4 x i32> <i32 -6, i32 -6, i32 -6, i32 -6>)
+ %91 = bitcast <4 x i16> %90 to <1 x i64>
+ %92 = shufflevector <1 x i64> undef, <1 x i64> %91, <2 x i32> <i32 0, i32 1>
+ %93 = bitcast <2 x i64> %92 to <8 x i16>
+ %94 = tail call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %93, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>)
+ %95 = bitcast <8 x i8> %56 to <1 x i64>
+ %96 = bitcast <8 x i8> %94 to <1 x i64>
+ %97 = shufflevector <1 x i64> %95, <1 x i64> %96, <2 x i32> <i32 0, i32 1>
+ %98 = bitcast <2 x i64> %97 to <16 x i8>
+ tail call void @llvm.arm.neon.vst1.v16i8(i8* null, <16 x i8> %98, i32 1)
+ %99 = icmp slt i32 undef, undef
+ br i1 %99, label %5, label %3
+}
+
+declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
+declare <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone
diff --git a/test/CodeGen/ARM/2012-08-30-select.ll b/test/CodeGen/ARM/2012-08-30-select.ll
new file mode 100644
index 0000000..8471be5
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-30-select.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s
+; rdar://12201387
+
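+; The whole-vector select is expected to become a predicated move (it ne
+; followed by vmovne.i32), as checked below.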
+;CHECK: select_s_v_v
+;CHECK: it ne
+;CHECK-NEXT: vmovne.i32
+;CHECK: bx
+define <16 x i8> @select_s_v_v(i32 %avail, i8* %bar) {
+entry:
+ %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %bar, i32 1)
+ %and = and i32 %avail, 1
+ %tobool = icmp eq i32 %and, 0
+ %vld1. = select i1 %tobool, <16 x i8> %vld1, <16 x i8> zeroinitializer
+ ret <16 x i8> %vld1.
+}
+
+declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* , i32 )
+
diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll
index 8967730..6e6b363 100644
--- a/test/CodeGen/ARM/atomic-op.ll
+++ b/test/CodeGen/ARM/atomic-op.ll
@@ -159,3 +159,13 @@
store i8 %3, i8* %old
ret void
}
+
+; CHECK: func4
+; This function should not need to use callee-saved registers.
+; rdar://problem/12203728
+; CHECK-NOT: r4
+define i32 @func4(i32* %p) nounwind optsize ssp {
+entry:
+ %0 = atomicrmw add i32* %p, i32 1 monotonic
+ ret i32 %0
+}
diff --git a/test/CodeGen/ARM/crash-shufflevector.ll b/test/CodeGen/ARM/crash-shufflevector.ll
index ece42346..bdc0e0e 100644
--- a/test/CodeGen/ARM/crash-shufflevector.ll
+++ b/test/CodeGen/ARM/crash-shufflevector.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=armv7--
+; RUN: llc < %s -mtriple=armv7
declare void @g(<16 x i8>)
define void @f(<4 x i8> %param1, <4 x i8> %param2) {
@@ -7,4 +7,4 @@
%z = shufflevector <16 x i8> %y1, <16 x i8> %y2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
call void @g(<16 x i8> %z)
ret void
-}
+}
\ No newline at end of file
diff --git a/test/CodeGen/ARM/domain-conv-vmovs.ll b/test/CodeGen/ARM/domain-conv-vmovs.ll
new file mode 100644
index 0000000..e19185b
--- /dev/null
+++ b/test/CodeGen/ARM/domain-conv-vmovs.ll
@@ -0,0 +1,84 @@
+; RUN: llc -verify-machineinstrs -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a9 -mattr=+neon,+neonfp -float-abi=hard < %s | FileCheck %s
+
+define <2 x float> @test_vmovs_via_vext_lane0to0(float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane0to0:
+ %vec = insertelement <2 x float> %in, float %arg, i32 0
+ %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d1, d0, #1
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+ ret <2 x float> %res
+}
+
+define <2 x float> @test_vmovs_via_vext_lane0to1(float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane0to1:
+ %vec = insertelement <2 x float> %in, float %arg, i32 1
+ %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vext.32 d1, d1, d0, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+ ret <2 x float> %res
+}
+
+define <2 x float> @test_vmovs_via_vext_lane1to0(float, float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane1to0:
+ %vec = insertelement <2 x float> %in, float %arg, i32 0
+ %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vext.32 d1, d0, d1, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+ ret <2 x float> %res
+}
+
+define <2 x float> @test_vmovs_via_vext_lane1to1(float, float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane1to1:
+ %vec = insertelement <2 x float> %in, float %arg, i32 1
+ %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d0, d1, #1
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+ ret <2 x float> %res
+}
+
+
+define float @test_vmovs_via_vdup(float, float %ret, float %lhs, float %rhs) {
+; CHECK: test_vmovs_via_vdup:
+
+  ; Do an operation (which will end up in the NEON domain because of +neonfp)
+  ; to convince the execution-domain pass that NEON is a good thing to use.
+ %res = fadd float %ret, %ret
+ ; It makes sense for LLVM to do the addition in d0 here, because it's going
+ ; to be returned. This means it will want a "vmov s0, s1":
+; CHECK: vdup.32 d0, d0[1]
+
+ ret float %res
+}
+
+declare float @llvm.sqrt.f32(float)
+
+declare void @bar()
+
+; This is primarily a compile test.
+define float @test_ineligible(float, float %in) {
+; CHECK: test_ineligible:
+
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %val = fadd float %sqrt, %sqrt
+
+ ; This call forces a move from a callee-saved register to the return-reg. That
+  ; move is not eligible for conversion to a d-register instruction because the
+  ; use-def chains would be messed up. Primarily a compile test (we used to hit
+  ; an internal fault).
+ call void @bar()
+; CHECK: bl bar
+; CHECK: vmov.f32 {{s[0-9]+}}, {{s[0-9]+}}
+ ret float %val
+}
\ No newline at end of file
diff --git a/test/CodeGen/ARM/fast-isel-pic.ll b/test/CodeGen/ARM/fast-isel-pic.ll
new file mode 100644
index 0000000..392a845
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-pic.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=arm-apple-ios | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARMv7
+
+@g = global i32 0, align 4
+
+define i32 @LoadGV() {
+entry:
+; THUMB: LoadGV
+; THUMB: movw [[reg0:r[0-9]+]],
+; THUMB: movt [[reg0]],
+; THUMB: add [[reg0]], pc
+; ARM: LoadGV
+; ARM: ldr [[reg1:r[0-9]+]],
+; ARM: add [[reg1]], pc, [[reg1]]
+; ARMv7: LoadGV
+; ARMv7: movw [[reg2:r[0-9]+]],
+; ARMv7: movt [[reg2]],
+; ARMv7: add [[reg2]], pc, [[reg2]]
+ %tmp = load i32* @g
+ ret i32 %tmp
+}
+
+@i = external global i32
+
+define i32 @LoadIndirectSymbol() {
+entry:
+; THUMB: LoadIndirectSymbol
+; THUMB: movw r[[reg3:[0-9]+]],
+; THUMB: movt r[[reg3]],
+; THUMB: add r[[reg3]], pc
+; THUMB: ldr r[[reg3]], [r[[reg3]]]
+; ARM: LoadIndirectSymbol
+; ARM: ldr [[reg4:r[0-9]+]],
+; ARM: ldr [[reg4]], [pc, [[reg4]]]
+; ARMv7: LoadIndirectSymbol
+; ARMv7: movw r[[reg5:[0-9]+]],
+; ARMv7: movt r[[reg5]],
+; ARMv7: add r[[reg5]], pc, r[[reg5]]
+; ARMv7: ldr r[[reg5]], [r[[reg5]]]
+ %tmp = load i32* @i
+ ret i32 %tmp
+}
diff --git a/test/CodeGen/ARM/fp-fast.ll b/test/CodeGen/ARM/fp-fast.ll
new file mode 100644
index 0000000..ec57187
--- /dev/null
+++ b/test/CodeGen/ARM/fp-fast.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=arm -mcpu=cortex-a9 -mattr=+vfp4 -enable-unsafe-fp-math < %s | FileCheck %s
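+; With -enable-unsafe-fp-math, llvm.fma.f32 calls may be reassociated: folded
+; to a plain multiply, add or subtract when a constant operand allows it
+; (tests 1 and 3-6), or fused with a neighbouring fmul (test 2).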
+
+; CHECK: test1
+define float @test1(float %x) {
+; CHECK-NOT: vfma
+; CHECK: vmul.f32
+; CHECK-NOT: vfma
+ %t1 = fmul float %x, 3.0
+ %t2 = call float @llvm.fma.f32(float %x, float 2.0, float %t1)
+ ret float %t2
+}
+
+; CHECK: test2
+define float @test2(float %x, float %y) {
+; CHECK-NOT: vmul
+; CHECK: vfma.f32
+; CHECK-NOT: vmul
+ %t1 = fmul float %x, 3.0
+ %t2 = call float @llvm.fma.f32(float %t1, float 2.0, float %y)
+ ret float %t2
+}
+
+; CHECK: test3
+define float @test3(float %x, float %y) {
+; CHECK-NOT: vfma
+; CHECK: vadd.f32
+; CHECK-NOT: vfma
+ %t2 = call float @llvm.fma.f32(float %x, float 1.0, float %y)
+ ret float %t2
+}
+
+; CHECK: test4
+define float @test4(float %x, float %y) {
+; CHECK-NOT: vfma
+; CHECK: vsub.f32
+; CHECK-NOT: vfma
+ %t2 = call float @llvm.fma.f32(float %x, float -1.0, float %y)
+ ret float %t2
+}
+
+; CHECK: test5
+define float @test5(float %x) {
+; CHECK-NOT: vfma
+; CHECK: vmul.f32
+; CHECK-NOT: vfma
+ %t2 = call float @llvm.fma.f32(float %x, float 2.0, float %x)
+ ret float %t2
+}
+
+; CHECK: test6
+define float @test6(float %x) {
+; CHECK-NOT: vfma
+; CHECK: vmul.f32
+; CHECK-NOT: vfma
+ %t1 = fsub float -0.0, %x
+ %t2 = call float @llvm.fma.f32(float %x, float 5.0, float %t1)
+ ret float %t2
+}
+
+declare float @llvm.fma.f32(float, float, float)
diff --git a/test/CodeGen/ARM/integer_insertelement.ll b/test/CodeGen/ARM/integer_insertelement.ll
new file mode 100644
index 0000000..4f2d7e3
--- /dev/null
+++ b/test/CodeGen/ARM/integer_insertelement.ll
@@ -0,0 +1,35 @@
+; RUN: llc %s -o - -march=arm -mattr=+neon | FileCheck %s
+
+; This test checks that when inserting one (integer) element into a vector,
+; the vector is not spuriously copied. "vorr dX, dY, dY" is the instruction
+; that moves one DPR to another, so we check that none is emitted.
+
+; CHECK: @f
+; CHECK-NOT: vorr d
+; CHECK: vmov s
+; CHECK-NOT: vorr d
+; CHECK: mov pc, lr
+define <4 x i32> @f(<4 x i32> %in) {
+ %1 = insertelement <4 x i32> %in, i32 255, i32 3
+ ret <4 x i32> %1
+}
+
+; CHECK: @g
+; CHECK-NOT: vorr d
+; CHECK: vmov.16 d
+; CHECK-NOT: vorr d
+; CHECK: mov pc, lr
+define <8 x i16> @g(<8 x i16> %in) {
+ %1 = insertelement <8 x i16> %in, i16 255, i32 7
+ ret <8 x i16> %1
+}
+
+; CHECK: @h
+; CHECK-NOT: vorr d
+; CHECK: vmov.8 d
+; CHECK-NOT: vorr d
+; CHECK: mov pc, lr
+define <16 x i8> @h(<16 x i8> %in) {
+ %1 = insertelement <16 x i8> %in, i8 255, i32 15
+ ret <16 x i8> %1
+}
diff --git a/test/CodeGen/ARM/longMAC.ll b/test/CodeGen/ARM/longMAC.ll
new file mode 100644
index 0000000..e4a00e9
--- /dev/null
+++ b/test/CodeGen/ARM/longMAC.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -march=arm | FileCheck %s
+; Check that signed and unsigned multiply-accumulate-long instructions are
+; generated.
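+; UMLAL/SMLAL accumulate a full 32x32->64-bit product into a 64-bit
+; register pair (RdLo, RdHi).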
+
+define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) {
+;CHECK: MACLongTest1:
+;CHECK: umlal
+ %conv = zext i32 %a to i64
+ %conv1 = zext i32 %b to i64
+ %mul = mul i64 %conv1, %conv
+ %add = add i64 %mul, %c
+ ret i64 %add
+}
+
+define i64 @MACLongTest2(i32 %a, i32 %b, i64 %c) {
+;CHECK: MACLongTest2:
+;CHECK: smlal
+ %conv = sext i32 %a to i64
+ %conv1 = sext i32 %b to i64
+ %mul = mul nsw i64 %conv1, %conv
+ %add = add nsw i64 %mul, %c
+ ret i64 %add
+}
+
+define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest3:
+;CHECK: umlal
+ %conv = zext i32 %b to i64
+ %conv1 = zext i32 %a to i64
+ %mul = mul i64 %conv, %conv1
+ %conv2 = zext i32 %c to i64
+ %add = add i64 %mul, %conv2
+ ret i64 %add
+}
+
+define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest4:
+;CHECK: smlal
+ %conv = sext i32 %b to i64
+ %conv1 = sext i32 %a to i64
+ %mul = mul nsw i64 %conv, %conv1
+ %conv2 = sext i32 %c to i64
+ %add = add nsw i64 %mul, %conv2
+ ret i64 %add
+}
diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll
index 5575566..62708ed5 100644
--- a/test/CodeGen/ARM/select.ll
+++ b/test/CodeGen/ARM/select.ll
@@ -80,7 +80,7 @@
; CHECK-NEON: adr [[R2:r[0-9]+]], LCPI7_0
; CHECK-NEON-NEXT: cmp r0, [[R3]]
; CHECK-NEON-NEXT: it eq
-; CHECK-NEON-NEXT: addeq.w {{r.*}}, [[R2]]
+; CHECK-NEON-NEXT: addeq{{.*}} [[R2]], #4
; CHECK-NEON-NEXT: ldr
; CHECK-NEON: bx
diff --git a/test/CodeGen/ARM/select_xform.ll b/test/CodeGen/ARM/select_xform.ll
index cfc0e70..7507808 100644
--- a/test/CodeGen/ARM/select_xform.ll
+++ b/test/CodeGen/ARM/select_xform.ll
@@ -9,7 +9,7 @@
; T2: t1:
; T2: mvn r0, #-2147483648
-; T2: addle.w r1, r1
+; T2: addle r1, r0
; T2: mov r0, r1
%tmp1 = icmp sgt i32 %c, 10
%tmp2 = select i1 %tmp1, i32 0, i32 2147483647
@@ -23,7 +23,7 @@
; ARM: mov r0, r1
; T2: t2:
-; T2: suble.w r1, r1, #10
+; T2: suble r1, #10
; T2: mov r0, r1
%tmp1 = icmp sgt i32 %c, 10
%tmp2 = select i1 %tmp1, i32 0, i32 10
@@ -37,7 +37,7 @@
; ARM: mov r0, r3
; T2: t3:
-; T2: andge.w r3, r3, r2
+; T2: andge r3, r2
; T2: mov r0, r3
%cond = icmp slt i32 %a, %b
%z = select i1 %cond, i32 -1, i32 %x
@@ -51,7 +51,7 @@
; ARM: mov r0, r3
; T2: t4:
-; T2: orrge.w r3, r3, r2
+; T2: orrge r3, r2
; T2: mov r0, r3
%cond = icmp slt i32 %a, %b
%z = select i1 %cond, i32 0, i32 %x
@@ -81,7 +81,7 @@
; T2: t6:
; T2-NOT: movge
-; T2: eorlt.w r3, r3, r2
+; T2: eorlt r3, r2
%cond = icmp slt i32 %a, %b
%tmp1 = select i1 %cond, i32 %c, i32 0
%tmp2 = xor i32 %tmp1, %d
@@ -200,7 +200,7 @@
; T2: t13
; T2: cmp r1, #10
-; T2: addgt.w r0, r0, #1
+; T2: addgt r0, #1
%cmp = icmp sgt i32 %a, 10
%conv = zext i1 %cmp to i32
%add = add i32 %conv, %c
@@ -216,7 +216,7 @@
; T2: t14
; T2: cmp r1, #10
-; T2: subgt.w r0, r0, #1
+; T2: subgt r0, #1
%cmp = icmp sgt i32 %a, 10
%conv = sext i1 %cmp to i32
%add = add i32 %conv, %c
diff --git a/test/CodeGen/ARM/sub-cmp-peephole.ll b/test/CodeGen/ARM/sub-cmp-peephole.ll
index 6fcbdee..2961b94 100644
--- a/test/CodeGen/ARM/sub-cmp-peephole.ll
+++ b/test/CodeGen/ARM/sub-cmp-peephole.ll
@@ -63,3 +63,24 @@
if.else:
ret i32 %sub
}
+
+; If the sub/rsb instruction is predicated, we can't use the flags.
+; <rdar://problem/12263428>
+; Test case from MultiSource/Benchmarks/Ptrdist/bc/number.s
+; CHECK: bc_raise
+; CHECK: rsbeq
+; CHECK: cmp
+define i32 @bc_raise() nounwind ssp {
+entry:
+ %val.2.i = select i1 undef, i32 0, i32 undef
+ %sub.i = sub nsw i32 0, %val.2.i
+ %retval.0.i = select i1 undef, i32 %val.2.i, i32 %sub.i
+ %cmp1 = icmp eq i32 %retval.0.i, 0
+ br i1 %cmp1, label %land.lhs.true, label %if.end11
+
+land.lhs.true: ; preds = %num2long.exit
+ ret i32 17
+
+if.end11: ; preds = %num2long.exit
+ ret i32 23
+}
diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll
index 05332e4..a8c224b 100644
--- a/test/CodeGen/ARM/vdup.ll
+++ b/test/CodeGen/ARM/vdup.ll
@@ -261,3 +261,37 @@
store <8 x i8> %2, <8 x i8>* %ptr, align 8
ret void
}
+
+define <4 x i32> @tdupi(i32 %x, i32 %y) {
+;CHECK: tdupi
+;CHECK: vdup.32
+ %1 = insertelement <4 x i32> undef, i32 %x, i32 0
+ %2 = insertelement <4 x i32> %1, i32 %x, i32 1
+ %3 = insertelement <4 x i32> %2, i32 %x, i32 2
+ %4 = insertelement <4 x i32> %3, i32 %y, i32 3
+ ret <4 x i32> %4
+}
+
+define <4 x float> @tdupf(float %x, float %y) {
+;CHECK: tdupf
+;CHECK: vdup.32
+ %1 = insertelement <4 x float> undef, float %x, i32 0
+ %2 = insertelement <4 x float> %1, float %x, i32 1
+ %3 = insertelement <4 x float> %2, float %x, i32 2
+ %4 = insertelement <4 x float> %3, float %y, i32 3
+ ret <4 x float> %4
+}
+
+; This test checks that when splatting an element from one vector into
+; another, the value isn't moved out to GPRs first.
+define <4 x i32> @tduplane(<4 x i32> %invec) {
+;CHECK: tduplane
+;CHECK-NOT: vmov {{.*}}, d16[1]
+;CHECK: vdup.32 {{.*}}, d16[1]
+ %in = extractelement <4 x i32> %invec, i32 1
+ %1 = insertelement <4 x i32> undef, i32 %in, i32 0
+ %2 = insertelement <4 x i32> %1, i32 %in, i32 1
+ %3 = insertelement <4 x i32> %2, i32 %in, i32 2
+ %4 = insertelement <4 x i32> %3, i32 255, i32 3
+ ret <4 x i32> %4
+}
diff --git a/test/CodeGen/ARM/vector-extend-narrow.ll b/test/CodeGen/ARM/vector-extend-narrow.ll
index 8fd3db2..22af797 100644
--- a/test/CodeGen/ARM/vector-extend-narrow.ll
+++ b/test/CodeGen/ARM/vector-extend-narrow.ll
@@ -62,3 +62,14 @@
%2 = sdiv <4 x i8> zeroinitializer, %1
ret <4 x i8> %2
}
+; CHECK: j:
+define <4 x i32> @j(<4 x i8>* %in) nounwind {
+ ; CHECK: vld1
+ ; CHECK: vmovl.u8
+ ; CHECK: vmovl.u16
+ ; CHECK-NOT: vand
+ %1 = load <4 x i8>* %in, align 4
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
diff --git a/test/CodeGen/ARM/vget_lane.ll b/test/CodeGen/ARM/vget_lane.ll
index 1fc885d..2ed65c9 100644
--- a/test/CodeGen/ARM/vget_lane.ll
+++ b/test/CodeGen/ARM/vget_lane.ll
@@ -200,7 +200,7 @@
define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind {
;CHECK: vsetQ_lane32:
-;CHECK: vmov.32
+;CHECK: vmov s
%tmp1 = load <4 x i32>* %A
%tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1
ret <4 x i32> %tmp2
diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll
new file mode 100644
index 0000000..802ee2c
--- /dev/null
+++ b/test/CodeGen/Generic/MachineBranchProb.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -print-machineinstrs=expand-isel-pseudos -o /dev/null 2>&1 | FileCheck %s
+
+; Make sure we have the correct weight attached to each successor.
+define i32 @test2(i32 %x) nounwind uwtable readnone ssp {
+; CHECK: Machine code for function test2:
+entry:
+ %conv = sext i32 %x to i64
+ switch i64 %conv, label %return [
+ i64 0, label %sw.bb
+ i64 1, label %sw.bb
+ i64 4, label %sw.bb
+ i64 5, label %sw.bb1
+ ], !prof !0
+; CHECK: BB#0: derived from LLVM BB %entry
+; CHECK: Successors according to CFG: BB#2(64) BB#4(14)
+; CHECK: BB#4: derived from LLVM BB %entry
+; CHECK: Successors according to CFG: BB#1(10) BB#5(4)
+; CHECK: BB#5: derived from LLVM BB %entry
+; CHECK: Successors according to CFG: BB#1(4) BB#3(7)
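+; The expanded compare tree redistributes the !0 weights: 64 for case 5
+; (sw.bb1), 6+4+4 = 14 across the edges reaching sw.bb, and 7 for the
+; default return.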
+
+sw.bb:
+ br label %return
+
+sw.bb1:
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ 5, %sw.bb1 ], [ 1, %sw.bb ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 4, i32 64}
diff --git a/test/CodeGen/Hexagon/args.ll b/test/CodeGen/Hexagon/args.ll
index e9ac8b6..8a6efb6 100644
--- a/test/CodeGen/Hexagon/args.ll
+++ b/test/CodeGen/Hexagon/args.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-hexagon-misched < %s | FileCheck %s
; CHECK: r[[T0:[0-9]+]] = #7
; CHECK: memw(r29 + #0) = r[[T0]]
+; CHECK: r5 = #6
; CHECK: r0 = #1
; CHECK: r1 = #2
; CHECK: r2 = #3
; CHECK: r3 = #4
; CHECK: r4 = #5
-; CHECK: r5 = #6
define void @foo() nounwind {
diff --git a/test/CodeGen/Hexagon/newvaluestore.ll b/test/CodeGen/Hexagon/newvaluestore.ll
index ab69b22..186e393 100644
--- a/test/CodeGen/Hexagon/newvaluestore.ll
+++ b/test/CodeGen/Hexagon/newvaluestore.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-hexagon-misched < %s | FileCheck %s
; Check that we generate new value store packet in V4
@i = global i32 0, align 4
diff --git a/test/CodeGen/Hexagon/remove_lsr.ll b/test/CodeGen/Hexagon/remove_lsr.ll
new file mode 100644
index 0000000..79b5f4a
--- /dev/null
+++ b/test/CodeGen/Hexagon/remove_lsr.ll
@@ -0,0 +1,80 @@
+; Test the fix for PR13709.
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; CHECK: foo
+; CHECK-NOT: lsr(r{{[0-9]+}}:{{[0-9]+}}, #32)
+; CHECK-NOT: lsr(r{{[0-9]+}}:{{[0-9]+}}, #32)
+
+; Convert the sequence
+; r17:16 = lsr(r11:10, #32)
+; .. = r16
+; into
+; r17:16 = lsr(r11:10, #32)
+; .. = r11
+; This makes the lsr instruction dead, and it is subsequently removed by a
+; dead-code elimination pass.
+
+%union.vect64 = type { i64 }
+%union.vect32 = type { i32 }
+
+define void @foo(%union.vect64* nocapture %sss_extracted_bit_rx_data_ptr,
+ %union.vect32* nocapture %s_even, %union.vect32* nocapture %s_odd,
+ i8* nocapture %scr_s_even_code_ptr, i8* nocapture %scr_s_odd_code_ptr)
+ nounwind {
+entry:
+ %scevgep = getelementptr %union.vect64* %sss_extracted_bit_rx_data_ptr, i32 1
+ %scevgep28 = getelementptr %union.vect32* %s_odd, i32 1
+ %scevgep32 = getelementptr %union.vect32* %s_even, i32 1
+ %scevgep36 = getelementptr i8* %scr_s_odd_code_ptr, i32 1
+ %scevgep39 = getelementptr i8* %scr_s_even_code_ptr, i32 1
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %lsr.iv42 = phi i32 [ %lsr.iv.next, %for.body ], [ 2, %entry ]
+ %lsr.iv40 = phi i8* [ %scevgep41, %for.body ], [ %scevgep39, %entry ]
+ %lsr.iv37 = phi i8* [ %scevgep38, %for.body ], [ %scevgep36, %entry ]
+ %lsr.iv33 = phi %union.vect32* [ %scevgep34, %for.body ], [ %scevgep32, %entry ]
+ %lsr.iv29 = phi %union.vect32* [ %scevgep30, %for.body ], [ %scevgep28, %entry ]
+ %lsr.iv = phi %union.vect64* [ %scevgep26, %for.body ], [ %scevgep, %entry ]
+ %predicate_1.023 = phi i8 [ undef, %entry ], [ %10, %for.body ]
+ %predicate.022 = phi i8 [ undef, %entry ], [ %9, %for.body ]
+ %val.021 = phi i64 [ undef, %entry ], [ %srcval, %for.body ]
+ %lsr.iv3335 = bitcast %union.vect32* %lsr.iv33 to i32*
+ %lsr.iv2931 = bitcast %union.vect32* %lsr.iv29 to i32*
+ %lsr.iv27 = bitcast %union.vect64* %lsr.iv to i64*
+ %0 = tail call i64 @llvm.hexagon.A2.vsubhs(i64 0, i64 %val.021)
+ %conv3 = sext i8 %predicate.022 to i32
+ %1 = trunc i64 %val.021 to i32
+ %2 = trunc i64 %0 to i32
+ %3 = tail call i32 @llvm.hexagon.C2.mux(i32 %conv3, i32 %1, i32 %2)
+ store i32 %3, i32* %lsr.iv3335, align 4, !tbaa !0
+ %conv8 = sext i8 %predicate_1.023 to i32
+ %4 = lshr i64 %val.021, 32
+ %5 = trunc i64 %4 to i32
+ %6 = lshr i64 %0, 32
+ %7 = trunc i64 %6 to i32
+ %8 = tail call i32 @llvm.hexagon.C2.mux(i32 %conv8, i32 %5, i32 %7)
+ store i32 %8, i32* %lsr.iv2931, align 4, !tbaa !0
+ %srcval = load i64* %lsr.iv27, align 8
+ %9 = load i8* %lsr.iv40, align 1, !tbaa !1
+ %10 = load i8* %lsr.iv37, align 1, !tbaa !1
+ %lftr.wideiv = trunc i32 %lsr.iv42 to i8
+ %exitcond = icmp eq i8 %lftr.wideiv, 32
+ %scevgep26 = getelementptr %union.vect64* %lsr.iv, i32 1
+ %scevgep30 = getelementptr %union.vect32* %lsr.iv29, i32 1
+ %scevgep34 = getelementptr %union.vect32* %lsr.iv33, i32 1
+ %scevgep38 = getelementptr i8* %lsr.iv37, i32 1
+ %scevgep41 = getelementptr i8* %lsr.iv40, i32 1
+ %lsr.iv.next = add i32 %lsr.iv42, 1
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+declare i64 @llvm.hexagon.A2.vsubhs(i64, i64) nounwind readnone
+
+declare i32 @llvm.hexagon.C2.mux(i32, i32, i32) nounwind readnone
+
+!0 = metadata !{metadata !"long", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/Hexagon/static.ll b/test/CodeGen/Hexagon/static.ll
index 2e4ab63..683a4c2 100644
--- a/test/CodeGen/Hexagon/static.ll
+++ b/test/CodeGen/Hexagon/static.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched -disable-hexagon-misched < %s | FileCheck %s
@num = external global i32
@acc = external global i32
diff --git a/test/CodeGen/Mips/brdelayslot.ll b/test/CodeGen/Mips/brdelayslot.ll
index b266ce6..1b2fbc8 100644
--- a/test/CodeGen/Mips/brdelayslot.ll
+++ b/test/CodeGen/Mips/brdelayslot.ll
@@ -1,12 +1,18 @@
-; RUN: llc -march=mipsel -enable-mips-delay-filler < %s | FileCheck %s
+; RUN: llc -march=mipsel -O0 < %s | FileCheck %s -check-prefix=None
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=Default
define void @foo1() nounwind {
entry:
-; CHECK: jalr
-; CHECK-NOT: nop
-; CHECK: jr
-; CHECK-NOT: nop
-; CHECK: .end
+; Default: jalr
+; Default-NOT: nop
+; Default: jr
+; Default-NOT: nop
+; Default: .end
+; None: jalr
+; None: nop
+; None: jr
+; None: nop
+; None: .end
tail call void @foo2(i32 3) nounwind
ret void
diff --git a/test/CodeGen/Mips/init-array.ll b/test/CodeGen/Mips/init-array.ll
new file mode 100644
index 0000000..f96ce26
--- /dev/null
+++ b/test/CodeGen/Mips/init-array.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple mipsel-unknown-linux -use-init-array < %s | FileCheck %s
+
+target triple = "mipsel-unknown-linux"
+
+@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @test }]
+; CHECK: .section
+; CHECK: .init_array
+; CHECK-NOT: .ctors
+; CHECK: .4byte test
+
+define internal void @test() section ".text.startup" {
+entry:
+ ret void
+}
diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
index 0227b88..873b9f14 100644
--- a/test/CodeGen/Mips/longbranch.ll
+++ b/test/CodeGen/Mips/longbranch.ll
@@ -6,9 +6,15 @@
define void @foo1(i32 %s) nounwind {
entry:
; O32: bal
+; O32: lui $at, 0
+; O32: addiu $at, $at, {{[0-9]+}}
+; N64: lui $at, 0
+; N64: daddiu $at, $at, 0
+; N64: dsll $at, $at, 16
+; N64: daddiu $at, $at, 0
; N64: bal
-; N64: highest
-; N64: higher
+; N64: dsll $at, $at, 16
+; N64: daddiu $at, $at, {{[0-9]+}}
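+; The long-branch expansion materializes the branch offset in $at around the
+; bal, using the wider lui/dsll/daddiu sequence under N64.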
%tobool = icmp eq i32 %s, 0
br i1 %tobool, label %if.end, label %if.then
diff --git a/test/CodeGen/Mips/small-section-reserve-gp.ll b/test/CodeGen/Mips/small-section-reserve-gp.ll
new file mode 100644
index 0000000..03503fb
--- /dev/null
+++ b/test/CodeGen/Mips/small-section-reserve-gp.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=mipsel-sde-elf -march=mipsel -relocation-model=static < %s \
+; RUN: | FileCheck %s
+
+@i = internal unnamed_addr global i32 0, align 4
+
+define i32 @geti() nounwind readonly {
+entry:
+; CHECK: lw ${{[0-9]+}}, %gp_rel(i)($gp)
+ %0 = load i32* @i, align 4
+ ret i32 %0
+}
+
diff --git a/test/CodeGen/Mips/tls-alias.ll b/test/CodeGen/Mips/tls-alias.ll
index d681091..ce98cc8 100644
--- a/test/CodeGen/Mips/tls-alias.ll
+++ b/test/CodeGen/Mips/tls-alias.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -disable-mips-delay-filler < %s | FileCheck %s
@foo = thread_local global i32 42
@bar = hidden alias i32* @foo
diff --git a/test/CodeGen/Mips/tls.ll b/test/CodeGen/Mips/tls.ll
index a7ddb96..72d30dc 100644
--- a/test/CodeGen/Mips/tls.ll
+++ b/test/CodeGen/Mips/tls.ll
@@ -1,8 +1,10 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=PIC
-; RUN: llc -march=mipsel -relocation-model=static < %s \
-; RUN: | FileCheck %s -check-prefix=STATIC
-; RUN: llc -march=mipsel -relocation-model=static < %s \
-; RUN: -mips-fix-global-base-reg=false | FileCheck %s -check-prefix=STATICGP
+; RUN: llc -march=mipsel -disable-mips-delay-filler < %s | \
+; RUN: FileCheck %s -check-prefix=PIC
+; RUN: llc -march=mipsel -relocation-model=static -disable-mips-delay-filler < \
+; RUN: %s | FileCheck %s -check-prefix=STATIC
+; RUN: llc -march=mipsel -relocation-model=static -disable-mips-delay-filler \
+; RUN: -mips-fix-global-base-reg=false < %s | \
+; RUN: FileCheck %s -check-prefix=STATICGP
@t1 = thread_local global i32 0, align 4
diff --git a/test/CodeGen/Mips/uitofp.ll b/test/CodeGen/Mips/uitofp.ll
new file mode 100644
index 0000000..aff70c2
--- /dev/null
+++ b/test/CodeGen/Mips/uitofp.ll
@@ -0,0 +1,12 @@
+; RUN: llc -march=mips -mattr=+single-float < %s
+
+define void @f0() nounwind {
+entry:
+ %b = alloca i32, align 4
+ %a = alloca float, align 4
+ store volatile i32 1, i32* %b, align 4
+ %0 = load volatile i32* %b, align 4
+ %conv = uitofp i32 %0 to float
+ store float %conv, float* %a, align 4
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
index 0003a17..b95ac68 100644
--- a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
+++ b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
@@ -9,9 +9,8 @@
define void @foo() nounwind ssp {
entry:
-; Better: mtctr r12
-; CHECK: mr r12, [[REG:r[0-9]+]]
-; CHECK: mtctr [[REG]]
+; CHECK: mtctr r12
+; CHECK: bctrl
%0 = load void (...)** @p, align 4 ; <void (...)*> [#uses=1]
call void (...)* %0() nounwind
br label %return
diff --git a/test/CodeGen/PowerPC/big-endian-formal-args.ll b/test/CodeGen/PowerPC/big-endian-formal-args.ll
index 9a456b6..638059a 100644
--- a/test/CodeGen/PowerPC/big-endian-formal-args.ll
+++ b/test/CodeGen/PowerPC/big-endian-formal-args.ll
@@ -2,10 +2,10 @@
declare void @bar(i64 %x, i64 %y)
-; CHECK: li {{[53]}}, 0
+; CHECK: li 3, 0
; CHECK: li 4, 2
+; CHECK: li 5, 0
; CHECK: li 6, 3
-; CHECK: mr {{[53]}}, {{[53]}}
define void @foo() {
call void @bar(i64 2, i64 3)
diff --git a/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll b/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll
new file mode 100644
index 0000000..afa1ea8
--- /dev/null
+++ b/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
+target triple = "powerpc-unknown-linux"
+
+@.str = private unnamed_addr constant [3 x i8] c"%i\00", align 1
+
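+; The PPC SVR4 ABI uses CR bit 6 to tell a varargs callee whether FP arguments
+; are present; crxor 6, 6, 6 clears it before each printf call, without any
+; extra condition-register moves (hence the test name).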
+define void @test(i32 %count) nounwind {
+entry:
+; CHECK: crxor 6, 6, 6
+ %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind
+ %cmp2 = icmp sgt i32 %count, 0
+ br i1 %cmp2, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+; CHECK: crxor 6, 6, 6
+ %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind
+ %inc = add nsw i32 %i.03, 1
+ %exitcond = icmp eq i32 %inc, %count
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
diff --git a/test/CodeGen/PowerPC/fsl-e500mc.ll b/test/CodeGen/PowerPC/fsl-e500mc.ll
new file mode 100644
index 0000000..09b7e41
--- /dev/null
+++ b/test/CodeGen/PowerPC/fsl-e500mc.ll
@@ -0,0 +1,22 @@
+;
+; Test support for Freescale e500mc and its higher memcpy inlining thresholds.
+;
+; RUN: llc -mcpu=e500mc < %s 2>&1 | FileCheck %s
+; CHECK-NOT: not a recognized processor for this target
+
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
+target triple = "powerpc-fsl-linux"
+
+%struct.teststruct = type { [12 x i32], i32 }
+
+define void @copy(%struct.teststruct* noalias nocapture sret %agg.result, %struct.teststruct* nocapture %in) nounwind {
+entry:
+; CHECK: @copy
+; CHECK-NOT: bl memcpy
+ %0 = bitcast %struct.teststruct* %agg.result to i8*
+ %1 = bitcast %struct.teststruct* %in to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 52, i32 4, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/CodeGen/PowerPC/fsl-e5500.ll b/test/CodeGen/PowerPC/fsl-e5500.ll
new file mode 100644
index 0000000..d47d8c8
--- /dev/null
+++ b/test/CodeGen/PowerPC/fsl-e5500.ll
@@ -0,0 +1,22 @@
+;
+; Test support for Freescale e5500 and its higher memcpy inlining thresholds.
+;
+; RUN: llc -mcpu=e5500 < %s 2>&1 | FileCheck %s
+; CHECK-NOT: not a recognized processor for this target
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-fsl-linux"
+
+%struct.teststruct = type { [24 x i32], i32 }
+
+define void @copy(%struct.teststruct* noalias nocapture sret %agg.result, %struct.teststruct* nocapture %in) nounwind {
+entry:
+; CHECK: @copy
+; CHECK-NOT: bl memcpy
+ %0 = bitcast %struct.teststruct* %agg.result to i8*
+ %1 = bitcast %struct.teststruct* %in to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 100, i32 4, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/PowerPC/inlineasm-copy.ll b/test/CodeGen/PowerPC/inlineasm-copy.ll
index e1ff82d..59c3388 100644
--- a/test/CodeGen/PowerPC/inlineasm-copy.ll
+++ b/test/CodeGen/PowerPC/inlineasm-copy.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -march=ppc32 | not grep mr
+; RUN: llc < %s -march=ppc32 -verify-machineinstrs | FileCheck %s
+; CHECK-NOT: mr
define i32 @test(i32 %Y, i32 %X) {
entry:
%tmp = tail call i32 asm "foo $0", "=r"( ) ; <i32> [#uses=1]
@@ -12,3 +13,9 @@
ret i32 %tmp1
}
+; CHECK: test3
+define i32 @test3(i32 %Y, i32 %X) {
+entry:
+ %tmp1 = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "foo $0, $1", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19"( i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y ) ; <i32> [#uses=1]
+ ret i32 1
+}
diff --git a/test/CodeGen/PowerPC/ppc64-toc.ll b/test/CodeGen/PowerPC/ppc64-toc.ll
new file mode 100644
index 0000000..f1326ba
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-toc.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@double_array = global [32 x double] zeroinitializer, align 8
+@number64 = global i64 10, align 8
+@internal_static_var.x = internal unnamed_addr global i64 0, align 8
+
+define i64 @access_int64(i64 %a) nounwind readonly {
+entry:
+; CHECK: access_int64:
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: .quad .L.access_int64
+; CHECK-NEXT: .quad .TOC.@tocbase
+; CHECK-NEXT: .text
+ %0 = load i64* @number64, align 8
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+ %cmp = icmp eq i64 %0, %a
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+define i64 @internal_static_var(i64 %a) nounwind {
+entry:
+; CHECK: internal_static_var:
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+ %0 = load i64* @internal_static_var.x, align 8
+ %cmp = icmp eq i64 %0, %a
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+define i32 @access_double(double %a) nounwind readnone {
+entry:
+; CHECK: access_double:
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+ %cmp = fcmp oeq double %a, 2.000000e+00
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+
+define i32 @access_double_array(double %a, i32 %i) nounwind readonly {
+entry:
+; CHECK: access_double_array:
+ %idxprom = sext i32 %i to i64
+ %arrayidx = getelementptr inbounds [32 x double]* @double_array, i64 0, i64 %idxprom
+ %0 = load double* %arrayidx, align 8
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+ %cmp = fcmp oeq double %0, %a
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+; Check the creation of 4 .tc entries:
+; * int64_t global 'number64'
+; * double constant 2.0
+; * double array 'double_array'
+; * static int64_t 'x' accessed within '@internal_static_var'
+; CHECK: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
+; CHECK-NEXT: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
+; CHECK-NEXT: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
+; CHECK-NEXT: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
diff --git a/test/CodeGen/PowerPC/ppc64-zext.ll b/test/CodeGen/PowerPC/ppc64-zext.ll
new file mode 100644
index 0000000..eb55445
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-zext.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux"
+
+define i64 @fun(i32 %arg32) nounwind {
+entry:
+; CHECK: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 32
+ %o = zext i32 %arg32 to i64
+ ret i64 %o
+}
+
diff --git a/test/CodeGen/PowerPC/pr13641.ll b/test/CodeGen/PowerPC/pr13641.ll
new file mode 100644
index 0000000..c4d3f3a
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr13641.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @foo() nounwind {
+ ret void
+}
+
+; CHECK: blr
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .quad 0
diff --git a/test/CodeGen/PowerPC/remat-imm.ll b/test/CodeGen/PowerPC/remat-imm.ll
new file mode 100644
index 0000000..520921f
--- /dev/null
+++ b/test/CodeGen/PowerPC/remat-imm.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s | FileCheck %s
+; ModuleID = 'test.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
+target triple = "powerpc-unknown-linux"
+
+@.str = private unnamed_addr constant [6 x i8] c"%d,%d\00", align 1
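+; Both varargs are the immediate 128; r4 should be filled by rematerializing li 4, 128 rather than by an mr copy.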
+
+define i32 @main() nounwind {
+entry:
+; CHECK: li 4, 128
+; CHECK-NOT: mr 4, {{.*}}
+ %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), i32 128, i32 128) nounwind
+ ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
diff --git a/test/CodeGen/Thumb/thumb_jump24_fixup.ll b/test/CodeGen/Thumb/thumb_jump24_fixup.ll
new file mode 100644
index 0000000..e6a6b25
--- /dev/null
+++ b/test/CodeGen/Thumb/thumb_jump24_fixup.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple thumbv7-none-linux-gnueabi -mcpu=cortex-a8 -march=thumb -mattr=thumb2 -filetype=obj -o - < %s | llvm-objdump -r - | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n32-S64"
+target triple = "thumbv7-none-linux-gnueabi"
+
+define i32 @test_fixup_t2_uncondbranch() {
+b0:
+ invoke void @__cxa_throw(i8* null, i8* null, i8* null) noreturn
+ to label %unreachable unwind label %lpad
+
+; CHECK: {{[0-9]+}} R_ARM_THM_JUMP24 __cxa_throw
+
+lpad:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) cleanup
+ ret i32 0
+
+unreachable:
+ unreachable
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @__cxa_throw(i8*, i8*, i8*)
diff --git a/test/CodeGen/Thumb2/longMACt.ll b/test/CodeGen/Thumb2/longMACt.ll
new file mode 100644
index 0000000..beefd60
--- /dev/null
+++ b/test/CodeGen/Thumb2/longMACt.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; Check that signed and unsigned multiply-accumulate-long (smlal/umlal) instructions are generated.
+
+define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) {
+;CHECK: MACLongTest1:
+;CHECK: umlal
+ %conv = zext i32 %a to i64
+ %conv1 = zext i32 %b to i64
+ %mul = mul i64 %conv1, %conv
+ %add = add i64 %mul, %c
+ ret i64 %add
+}
+
+define i64 @MACLongTest2(i32 %a, i32 %b, i64 %c) {
+;CHECK: MACLongTest2:
+;CHECK: smlal
+ %conv = sext i32 %a to i64
+ %conv1 = sext i32 %b to i64
+ %mul = mul nsw i64 %conv1, %conv
+ %add = add nsw i64 %mul, %c
+ ret i64 %add
+}
+
+define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest3:
+;CHECK: umlal
+ %conv = zext i32 %b to i64
+ %conv1 = zext i32 %a to i64
+ %mul = mul i64 %conv, %conv1
+ %conv2 = zext i32 %c to i64
+ %add = add i64 %mul, %conv2
+ ret i64 %add
+}
+
+define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest4:
+;CHECK: smlal
+ %conv = sext i32 %b to i64
+ %conv1 = sext i32 %a to i64
+ %mul = mul nsw i64 %conv, %conv1
+ %conv2 = sext i32 %c to i64
+ %add = add nsw i64 %mul, %conv2
+ ret i64 %add
+}
diff --git a/test/CodeGen/Thumb2/thumb2-select_xform.ll b/test/CodeGen/Thumb2/thumb2-select_xform.ll
index ead198f..ed4d26d 100644
--- a/test/CodeGen/Thumb2/thumb2-select_xform.ll
+++ b/test/CodeGen/Thumb2/thumb2-select_xform.ll
@@ -5,7 +5,7 @@
; CHECK: mvn r0, #-2147483648
; CHECK: cmp r2, #10
; CHECK: it le
-; CHECK: addle.w r1, r1, r0
+; CHECK: addle r1, r0
; CHECK: mov r0, r1
%tmp1 = icmp sgt i32 %c, 10
%tmp2 = select i1 %tmp1, i32 0, i32 2147483647
@@ -30,7 +30,7 @@
; CHECK: t3
; CHECK: cmp r2, #10
; CHECK: it le
-; CHECK: suble.w r1, r1, #10
+; CHECK: suble r1, #10
; CHECK: mov r0, r1
%tmp1 = icmp sgt i32 %c, 10
%tmp2 = select i1 %tmp1, i32 0, i32 10
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 9a66b67..0465952 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -5,7 +5,7 @@
; It's hard to test for the ISEL condition because CodeGen optimizes
; away the bugpointed code. Just ensure the basics are still there.
;CHECK: func:
-;CHECK: vpxor
+;CHECK: vxorps
;CHECK: vinsertf128
;CHECK: vpshufd
;CHECK: vpshufd
diff --git a/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
new file mode 100644
index 0000000..6ebbb2e
--- /dev/null
+++ b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -enable-unsafe-fp-math
+; <rdar://problem/12180135>
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+define i32 @foo(float %mean) nounwind readnone ssp align 2 {
+entry:
+ %cmp = fcmp olt float %mean, -3.000000e+00
+ %f.0 = select i1 %cmp, float -3.000000e+00, float %mean
+ %cmp2 = fcmp ult float %f.0, 3.000000e+00
+ %f.1 = select i1 %cmp2, float %f.0, float 0x4007EB8520000000
+ %add = fadd float %f.1, 3.000000e+00
+ %div = fdiv float %add, 2.343750e-02
+ %0 = fpext float %div to double
+ %conv = select i1 undef, double 2.550000e+02, double %0
+ %add8 = fadd double %conv, 5.000000e-01
+ %conv9 = fptosi double %add8 to i32
+ %.conv9 = select i1 undef, i32 255, i32 %conv9
+ ret i32 %.conv9
+}
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
new file mode 100644
index 0000000..0f36ce2
--- /dev/null
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -0,0 +1,362 @@
+; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s | FileCheck %s --check-prefix=YESCOLOR
+; RUN: llc -mcpu=corei7 -no-stack-coloring=true < %s | FileCheck %s --check-prefix=NOCOLOR
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;YESCOLOR: subq $136, %rsp
+;NOCOLOR: subq $264, %rsp
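+; The 17- and 16-pointer arrays (136 + 128 = 264 bytes) have disjoint lifetimes, so coloring should fold them into a single 136-byte slot.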
+
+
+define i32 @myCall_w2(i32 %in) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+}
+
+
+;YESCOLOR: subq $272, %rsp
+;NOCOLOR: subq $272, %rsp
+
+define i32 @myCall2_no_merge(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ ret i32 %t7
+bb3:
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ ret i32 0
+}
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+
+define i32 @myCall2_w2(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+;YESCOLOR: subq $208, %rsp
+;NOCOLOR: subq $400, %rsp
+
+
+
+
+define i32 @myCall_w4(i32 %in) {
+entry:
+ %a1 = alloca [14 x i8*], align 8
+ %a2 = alloca [13 x i8*], align 8
+ %a3 = alloca [12 x i8*], align 8
+ %a4 = alloca [11 x i8*], align 8
+ %b1 = bitcast [14 x i8*]* %a1 to i8*
+ %b2 = bitcast [13 x i8*]* %a2 to i8*
+ %b3 = bitcast [12 x i8*]* %a3 to i8*
+ %b4 = bitcast [11 x i8*]* %a4 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b4)
+ call void @llvm.lifetime.start(i64 -1, i8* %b1)
+ %t1 = call i32 @foo(i32 %in, i8* %b1)
+ %t2 = call i32 @foo(i32 %in, i8* %b1)
+ call void @llvm.lifetime.end(i64 -1, i8* %b1)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t9 = call i32 @foo(i32 %in, i8* %b2)
+ %t8 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ call void @llvm.lifetime.start(i64 -1, i8* %b3)
+ %t3 = call i32 @foo(i32 %in, i8* %b3)
+ %t4 = call i32 @foo(i32 %in, i8* %b3)
+ call void @llvm.lifetime.end(i64 -1, i8* %b3)
+ %t11 = call i32 @foo(i32 %in, i8* %b4)
+ call void @llvm.lifetime.end(i64 -1, i8* %b4)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+}
+
+;YESCOLOR: subq $112, %rsp
+;NOCOLOR: subq $400, %rsp
+
+define i32 @myCall2_w4(i32 %in) {
+entry:
+ %a1 = alloca [14 x i8*], align 8
+ %a2 = alloca [13 x i8*], align 8
+ %a3 = alloca [12 x i8*], align 8
+ %a4 = alloca [11 x i8*], align 8
+ %b1 = bitcast [14 x i8*]* %a1 to i8*
+ %b2 = bitcast [13 x i8*]* %a2 to i8*
+ %b3 = bitcast [12 x i8*]* %a3 to i8*
+ %b4 = bitcast [11 x i8*]* %a4 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b1)
+ %t1 = call i32 @foo(i32 %in, i8* %b1)
+ %t2 = call i32 @foo(i32 %in, i8* %b1)
+ call void @llvm.lifetime.end(i64 -1, i8* %b1)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t9 = call i32 @foo(i32 %in, i8* %b2)
+ %t8 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ call void @llvm.lifetime.start(i64 -1, i8* %b3)
+ %t3 = call i32 @foo(i32 %in, i8* %b3)
+ %t4 = call i32 @foo(i32 %in, i8* %b3)
+ call void @llvm.lifetime.end(i64 -1, i8* %b3)
+ br i1 undef, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b4)
+ %t11 = call i32 @foo(i32 %in, i8* %b4)
+ call void @llvm.lifetime.end(i64 -1, i8* %b4)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+
+
+define i32 @myCall2_noend(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+define i32 @myCall2_noend2(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+define i32 @myCall2_nostart(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+; Adopted from the test in Transforms/Inline/array_merge.ll.
+;YESCOLOR: subq $816, %rsp
+;NOCOLOR: subq $1616, %rsp
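+; Two disjoint-lifetime pairs of 100 x i32 arrays (4 x 400 bytes): coloring should let each pair share slots, roughly halving the frame.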
+define void @array_merge() nounwind ssp {
+entry:
+ %A.i1 = alloca [100 x i32], align 4
+ %B.i2 = alloca [100 x i32], align 4
+ %A.i = alloca [100 x i32], align 4
+ %B.i = alloca [100 x i32], align 4
+ %0 = bitcast [100 x i32]* %A.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+ %1 = bitcast [100 x i32]* %B.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+ call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+ %2 = bitcast [100 x i32]* %A.i1 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+ %3 = bitcast [100 x i32]* %B.i2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
+ call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+ ret void
+}
+
+;YESCOLOR: subq $272, %rsp
+;NOCOLOR: subq $272, %rsp
+define i32 @func_phi_lifetime(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb0, label %bb1
+
+bb0:
+ %I1 = bitcast [17 x i8*]* %a to i8*
+ br label %bb2
+
+bb1:
+ %I2 = bitcast [16 x i8*]* %a2 to i8*
+ br label %bb2
+
+bb2:
+ %split = phi i8* [ %I1, %bb0 ], [ %I2, %bb1 ]
+ call void @llvm.lifetime.start(i64 -1, i8* %split)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ call void @llvm.lifetime.end(i64 -1, i8* %split)
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+
+;YESCOLOR: multi_region_bb
+;NOCOLOR: multi_region_bb
+define void @multi_region_bb() nounwind ssp {
+entry:
+ %A.i1 = alloca [100 x i32], align 4
+ %B.i2 = alloca [100 x i32], align 4
+ %A.i = alloca [100 x i32], align 4
+ %B.i = alloca [100 x i32], align 4
+ %0 = bitcast [100 x i32]* %A.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind ; <---- start #1
+ %1 = bitcast [100 x i32]* %B.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+ call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+ %2 = bitcast [100 x i32]* %A.i1 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+ %3 = bitcast [100 x i32]* %B.i2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind ; <---- start #2
+ call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+ ret void
+}
+
+
+;YESCOLOR: subq $272, %rsp
+;NOCOLOR: subq $272, %rsp
+
+define i32 @myCall_end_before_begin(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+declare void @bar([100 x i32]* , [100 x i32]*) nounwind
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+declare i32 @foo(i32, i8*)
+
diff --git a/test/CodeGen/X86/atom-bypass-slow-division.ll b/test/CodeGen/X86/atom-bypass-slow-division.ll
new file mode 100644
index 0000000..e7c9605
--- /dev/null
+++ b/test/CodeGen/X86/atom-bypass-slow-division.ll
@@ -0,0 +1,112 @@
+; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
+
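+; Atom bypasses slow 32-bit divides: both operands are or'ed together and tested against -256, and if they fit in 8 bits a cheap divb replaces idivl.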
+define i32 @test_get_quotient(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+ %result = sdiv i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @test_get_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+ %result = srem i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient_and_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+; CHECK-NOT: idivl
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, %b
+ %resultrem = srem i32 %a, %b
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
+; CHECK: test_use_div_and_idiv
+; CHECK: idivl
+; CHECK: divb
+; CHECK: divl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+ %resultidiv = sdiv i32 %a, %b
+ %resultdiv = udiv i32 %a, %b
+ %result = add i32 %resultidiv, %resultdiv
+ ret i32 %result
+}
+
+define i32 @test_use_div_imm_imm() nounwind {
+; CHECK: test_use_div_imm_imm
+; CHECK: movl $64
+ %resultdiv = sdiv i32 256, 4
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_div_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_div_reg_imm
+; CHECK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, 33
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_rem_reg_imm
+; CHECK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultrem = srem i32 %a, 33
+ ret i32 %resultrem
+}
+
+define i32 @test_use_divrem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_divrem_reg_imm
+; CHECK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, 33
+ %resultrem = srem i32 %a, 33
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @test_use_div_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_div_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+ %resultdiv = sdiv i32 4, %a
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_rem_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+ %resultrem = srem i32 4, %a
+ ret i32 %resultrem
+}
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 8ad0fa8..95854c7 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -109,8 +109,8 @@
; rdar://10566486
; CHECK: fneg
; CHECK: vxorps
-define <16 x float> @fneg(<16 x float> addrspace(1)* nocapture %out) nounwind {
- %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+define <16 x float> @fneg(<16 x float> %a) nounwind {
+ %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
ret <16 x float> %1
}
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 9b41709..ec11654 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -229,9 +229,8 @@
}
; CHECK: test18
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovshdup
+; CHECK: vblendps
; CHECK: ret
define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
%S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -239,9 +238,8 @@
}
; CHECK: test19
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovsldup
+; CHECK: vblendps
; CHECK: ret
define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
%S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
diff --git a/test/CodeGen/X86/avx-vextractf128.ll b/test/CodeGen/X86/avx-vextractf128.ll
index fe0f6ca..ff56a45 100644
--- a/test/CodeGen/X86/avx-vextractf128.ll
+++ b/test/CodeGen/X86/avx-vextractf128.ll
@@ -19,12 +19,12 @@
}
; CHECK: @t0
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t0(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
entry:
- %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+ %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 1)
%1 = bitcast float* %addr to <4 x float>*
store <4 x float> %0, <4 x float>* %1, align 16
ret void
@@ -32,27 +32,13 @@
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
-; CHECK: @t1
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t1(float* %addr, <8 x float> %a) nounwind uwtable ssp {
-entry:
- %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
- %1 = bitcast float* %addr to i8*
- tail call void @llvm.x86.sse.storeu.ps(i8* %1, <4 x float> %0)
- ret void
-}
-
-declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
-
; CHECK: @t2
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t2(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
entry:
- %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+ %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 1)
%1 = bitcast double* %addr to <2 x double>*
store <2 x double> %0, <2 x double>* %1, align 16
ret void
@@ -60,28 +46,14 @@
declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
-; CHECK: @t3
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t3(double* %addr, <4 x double> %a) nounwind uwtable ssp {
-entry:
- %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
- %1 = bitcast double* %addr to i8*
- tail call void @llvm.x86.sse2.storeu.pd(i8* %1, <2 x double> %0)
- ret void
-}
-
-declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
-
; CHECK: @t4
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t4(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
entry:
%0 = bitcast <4 x i64> %a to <8 x i32>
- %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+ %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 1)
%2 = bitcast <4 x i32> %1 to <2 x i64>
store <2 x i64> %2, <2 x i64>* %addr, align 16
ret void
@@ -90,17 +62,43 @@
declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
; CHECK: @t5
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovdqu %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t5(<2 x i64>* %addr, <4 x i64> %a) nounwind uwtable ssp {
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t5(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
entry:
- %0 = bitcast <4 x i64> %a to <8 x i32>
- %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
- %2 = bitcast <2 x i64>* %addr to i8*
- %3 = bitcast <4 x i32> %1 to <16 x i8>
- tail call void @llvm.x86.sse2.storeu.dq(i8* %2, <16 x i8> %3)
+ %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+ %1 = bitcast float* %addr to <4 x float>*
+ store <4 x float> %0, <4 x float>* %1, align 16
ret void
}
-declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+; CHECK: @t6
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t6(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+entry:
+ %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+ %1 = bitcast double* %addr to <2 x double>*
+ store <2 x double> %0, <2 x double>* %1, align 16
+ ret void
+}
+
+; CHECK: @t7
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t7(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+entry:
+ %0 = bitcast <4 x i64> %a to <8 x i32>
+ %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+ %2 = bitcast <4 x i32> %1 to <2 x i64>
+ store <2 x i64> %2, <2 x i64>* %addr, align 16
+ ret void
+}
+
+; CHECK: @t8
+; CHECK: vmovups %xmm0, (%rdi)
+define void @t8(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+entry:
+ %0 = bitcast <4 x i64> %a to <8 x i32>
+ %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+ %2 = bitcast <4 x i32> %1 to <2 x i64>
+ store <2 x i64> %2, <2 x i64>* %addr, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
index c5899fa..267a806 100644
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ b/test/CodeGen/X86/avx2-shuffle.ll
@@ -26,3 +26,37 @@
%shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle.i
}
+
+; CHECK: vpshufb_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind {
+ %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 18, i32 19, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
+ i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+ ret <32 x i8>%S
+}
+
+; CHECK: vpshufb1_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb1_test(<32 x i8> %a) nounwind {
+ %S = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
+ i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+ ret <32 x i8>%S
+}
+
+
+; CHECK: vpshufb2_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb2_test(<32 x i8> %a) nounwind {
+ %S = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
+ i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+ ret <32 x i8>%S
+}
diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll
index 0cb9fd9..09eb5d1 100644
--- a/test/CodeGen/X86/bool-simplify.ll
+++ b/test/CodeGen/X86/bool-simplify.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx,+rdrand | FileCheck %s
define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
%t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
@@ -39,4 +39,20 @@
; CHECK: ret
}
+define i32 @rnd(i32 %arg) nounwind uwtable {
+ %1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
+ %2 = extractvalue { i32, i32 } %1, 0
+ %3 = extractvalue { i32, i32 } %1, 1
+ %4 = icmp eq i32 %3, 0
+ %5 = select i1 %4, i32 0, i32 %arg
+ %6 = add i32 %5, %2
+ ret i32 %6
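+; The second rdrand result is the success flag; selecting on it should fold into a single cmov off the flags rdrand sets.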
+; CHECK: rnd
+; CHECK: rdrand
+; CHECK: cmov
+; CHECK-NOT: cmov
+; CHECK: ret
+}
+
declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
+declare { i32, i32 } @llvm.x86.rdrand.32() nounwind
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index d8f4663..85a70aa 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
@@ -197,6 +198,11 @@
; CHECK: cvtsi2sdq {{.*}} %xmm0
; CHECK: movb $1, %al
; CHECK: callq _test16callee
+
+; AVX: movabsq $1
+; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0
+; AVX: movb $1, %al
+; AVX: callq _test16callee
call void (...)* @test16callee(double 1.000000e+00)
ret void
}
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index b0c1d0a..bd3514c 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -1,11 +1,13 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
; CHECK: test_f32
; CHECK-FMA-INST: vfmadd213ss
-; CHECK-FMA-CALL: _fmaf
+; CHECK-FMA-CALL: fmaf
define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp {
entry:
@@ -15,7 +17,7 @@
; CHECK: test_f64
; CHECK-FMA-INST: vfmadd213sd
-; CHECK-FMA-CALL: _fma
+; CHECK-FMA-CALL: fma
define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp {
entry:
@@ -24,7 +26,7 @@
}
; CHECK: test_f80
-; CHECK: _fmal
+; CHECK: fmal
define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) nounwind readnone ssp {
entry:
diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll
index 90529e0..e3910a6 100755
--- a/test/CodeGen/X86/fma3-intrinsics.ll
+++ b/test/CodeGen/X86/fma3-intrinsics.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 -mattr=avx2,+fma | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
+; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK: fmadd213ss %xmm
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
index fd414b3..2fe1ecd 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
; VFMADD
define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 5d97a87..6d98d59 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -1,8 +1,13 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver1 -fp-contract=fast | FileCheck %s --check-prefix=CHECK_FMA4
; CHECK: test_x86_fmadd_ps
-; CHECK: vfmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps
+; CHECK_FMA4: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%res = fadd <4 x float> %x, %a2
@@ -10,8 +15,11 @@
}
; CHECK: test_x86_fmsub_ps
-; CHECK: fmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: fmsub213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps
+; CHECK_FMA4: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%res = fsub <4 x float> %x, %a2
@@ -19,8 +27,11 @@
}
; CHECK: test_x86_fnmadd_ps
-; CHECK: fnmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: fnmadd213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps
+; CHECK_FMA4: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%res = fsub <4 x float> %a2, %x
@@ -28,8 +39,11 @@
}
; CHECK: test_x86_fnmsub_ps
-; CHECK: fnmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: fnmsub213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmsub_ps
+; CHECK_FMA4: fnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
@@ -38,8 +52,11 @@
}
; CHECK: test_x86_fmadd_ps_y
-; CHECK: vfmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps_y
+; CHECK_FMA4: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
%res = fadd <8 x float> %x, %a2
@@ -47,8 +64,11 @@
}
; CHECK: test_x86_fmsub_ps_y
-; CHECK: vfmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps_y
+; CHECK_FMA4: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
%res = fsub <8 x float> %x, %a2
@@ -56,8 +76,11 @@
}
; CHECK: test_x86_fnmadd_ps_y
-; CHECK: vfnmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps_y
+; CHECK_FMA4: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
%res = fsub <8 x float> %a2, %x
@@ -65,7 +88,7 @@
}
; CHECK: test_x86_fnmsub_ps_y
-; CHECK: vfnmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
@@ -75,8 +98,11 @@
}
; CHECK: test_x86_fmadd_pd_y
-; CHECK: vfmadd213pd %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_pd_y
+; CHECK_FMA4: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
%x = fmul <4 x double> %a0, %a1
%res = fadd <4 x double> %x, %a2
@@ -84,8 +110,11 @@
}
; CHECK: test_x86_fmsub_pd_y
-; CHECK: vfmsub213pd %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213pd %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd_y
+; CHECK_FMA4: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
%x = fmul <4 x double> %a0, %a1
%res = fsub <4 x double> %x, %a2
@@ -93,8 +122,11 @@
}
; CHECK: test_x86_fmsub_pd
-; CHECK: vfmsub213pd %xmm2, %xmm0, %xmm1
+; CHECK: vfmsub213pd %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd
+; CHECK_FMA4: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
%x = fmul <2 x double> %a0, %a1
%res = fsub <2 x double> %x, %a2
@@ -102,8 +134,11 @@
}
; CHECK: test_x86_fnmadd_ss
-; CHECK: vfnmadd213ss %xmm2, %xmm0, %xmm1
+; CHECK: vfnmadd213ss %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ss
+; CHECK_FMA4: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
%x = fmul float %a0, %a1
%res = fsub float %a2, %x
@@ -111,8 +146,11 @@
}
; CHECK: test_x86_fnmadd_sd
-; CHECK: vfnmadd213sd %xmm2, %xmm0, %xmm1
+; CHECK: vfnmadd213sd %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_sd
+; CHECK_FMA4: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
%x = fmul double %a0, %a1
%res = fsub double %a2, %x
@@ -120,8 +158,11 @@
}
; CHECK: test_x86_fmsub_sd
-; CHECK: vfmsub213sd %xmm2, %xmm0, %xmm1
+; CHECK: vfmsub213sd %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_sd
+; CHECK_FMA4: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
%x = fmul double %a0, %a1
%res = fsub double %x, %a2
@@ -129,11 +170,43 @@
}
; CHECK: test_x86_fnmsub_ss
-; CHECK: vfnmsub213ss %xmm2, %xmm0, %xmm1
+; CHECK: vfnmsub213ss %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmsub_ss
+; CHECK_FMA4: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
%x = fsub float -0.000000e+00, %a0
%y = fmul float %x, %a1
%res = fsub float %y, %a2
ret float %res
}
+
+; CHECK: test_x86_fmadd_ps_load
+; CHECK: vmovaps (%rdi), %xmm2
+; CHECK: vfmadd213ps %xmm1, %xmm0, %xmm2
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps_load
+; CHECK_FMA4: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK_FMA4: ret
+define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = load <4 x float>* %a0
+ %y = fmul <4 x float> %x, %a1
+ %res = fadd <4 x float> %y, %a2
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps_load
+; CHECK: vmovaps (%rdi), %xmm2
+; CHECK: fmsub213ps %xmm1, %xmm0, %xmm2
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps_load
+; CHECK_FMA4: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK_FMA4: ret
+define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = load <4 x float>* %a0
+ %y = fmul <4 x float> %x, %a1
+ %res = fsub <4 x float> %y, %a2
+ ret <4 x float> %res
+}
+
diff --git a/test/CodeGen/X86/fp-fast.ll b/test/CodeGen/X86/fp-fast.ll
new file mode 100644
index 0000000..091f0de
--- /dev/null
+++ b/test/CodeGen/X86/fp-fast.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=x86-64 -mattr=-fma4 -mtriple=x86_64-apple-darwin -enable-unsafe-fp-math < %s | FileCheck %s
+
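+; With unsafe FP math, repeated fadds of the same value should reassociate into a single multiply (or cancel to zero in test3).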
+; CHECK: test1
+define float @test1(float %a) {
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: addss
+; CHECK: ret
+ %t1 = fadd float %a, %a
+ %r = fadd float %t1, %t1
+ ret float %r
+}
+
+; CHECK: test2
+define float @test2(float %a) {
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: addss
+; CHECK: ret
+ %t1 = fmul float 4.0, %a
+ %t2 = fadd float %a, %a
+ %r = fadd float %t1, %t2
+ ret float %r
+}
+
+; CHECK: test3
+define float @test3(float %a) {
+; CHECK-NOT: addss
+; CHECK: xorps
+; CHECK-NOT: addss
+; CHECK: ret
+ %t1 = fmul float 2.0, %a
+ %t2 = fadd float %a, %a
+ %r = fsub float %t1, %t2
+ ret float %r
+}
+
diff --git a/test/CodeGen/X86/inline-asm-tied.ll b/test/CodeGen/X86/inline-asm-tied.ll
index 91576fb..597236e 100644
--- a/test/CodeGen/X86/inline-asm-tied.ll
+++ b/test/CodeGen/X86/inline-asm-tied.ll
@@ -19,3 +19,12 @@
%1 = load i64* %retval ; <i64> [#uses=1]
ret i64 %1
}
+
+; The tied operands are not necessarily in the same order as the defs.
+; PR13742
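+; Constraints "=r,=r,1,0" tie input %x to output 1 and input %y to output 0.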
+define i64 @swapped(i64 %x, i64 %y) nounwind {
+entry:
+ %x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
+ %x1 = extractvalue { i64, i64 } %x0, 0
+ ret i64 %x1
+}
diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll
new file mode 100644
index 0000000..014132b
--- /dev/null
+++ b/test/CodeGen/X86/ms-inline-asm.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=x86 | FileCheck %s
+
+define i32 @t1() nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect inteldialect "mov eax, $1\0Amov $0, eax", "=r,r,~{eax},~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
+ ret i32 %0
+; CHECK: t1
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: mov eax, ecx
+; CHECK: mov ecx, eax
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index 984d7e5..51320dd 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,14 +1,10 @@
; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
-; XFAIL: *
; rdar://5571034
; This requires physreg joining, %vreg13 is live everywhere:
; 304L %CL<def> = COPY %vreg13:sub_8bit; GR32_ABCD:%vreg13
; 320L %vreg15<def> = COPY %vreg19; GR32:%vreg15 GR32_NOSP:%vreg19
; 336L %vreg15<def> = SAR32rCL %vreg15, %EFLAGS<imp-def,dead>, %CL<imp-use,kill>; GR32:%vreg15
-;
-; This test is XFAIL until the register allocator understands trivial physreg
-; interference. <rdar://9802098>
define void @foo(i32* nocapture %quadrant, i32* nocapture %ptr, i32 %bbSize, i32 %bbStart, i32 %shifts) nounwind ssp {
; CHECK: foo:
diff --git a/test/CodeGen/X86/pr12312.ll b/test/CodeGen/X86/pr12312.ll
new file mode 100644
index 0000000..84102f1
--- /dev/null
+++ b/test/CodeGen/X86/pr12312.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse41,-avx < %s | FileCheck %s --check-prefix SSE41
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix AVX
+
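+; An i128 compare of a bitcast vector against zero should select (v)ptest instead of scalarizing the compare.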
+define i32 @veccond(<4 x i32> %input) {
+entry:
+ %0 = bitcast <4 x i32> %input to i128
+ %1 = icmp ne i128 %0, 0
+ br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block: ; preds = %entry
+ ret i32 0
+endif-block:                                      ; preds = %entry
+ ret i32 1
+; SSE41: veccond
+; SSE41: ptest
+; SSE41: ret
+; AVX: veccond
+; AVX: vptest
+; AVX: ret
+}
+
+define i32 @vectest(<4 x i32> %input) {
+entry:
+ %0 = bitcast <4 x i32> %input to i128
+ %1 = icmp ne i128 %0, 0
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+; SSE41: vectest
+; SSE41: ptest
+; SSE41: ret
+; AVX: vectest
+; AVX: vptest
+; AVX: ret
+}
+
+define i32 @vecsel(<4 x i32> %input, i32 %a, i32 %b) {
+entry:
+ %0 = bitcast <4 x i32> %input to i128
+ %1 = icmp ne i128 %0, 0
+ %2 = select i1 %1, i32 %a, i32 %b
+ ret i32 %2
+; SSE41: vecsel
+; SSE41: ptest
+; SSE41: ret
+; AVX: vecsel
+; AVX: vptest
+; AVX: ret
+}
diff --git a/test/CodeGen/X86/pr12359.ll b/test/CodeGen/X86/pr12359.ll
new file mode 100644
index 0000000..024b163
--- /dev/null
+++ b/test/CodeGen/X86/pr12359.ll
@@ -0,0 +1,10 @@
+; RUN: llc -asm-verbose -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s
+define <16 x i8> @shuf(<16 x i8> %inval1) {
+entry:
+ %0 = shufflevector <16 x i8> %inval1, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4, i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4>
+ ret <16 x i8> %0
+; CHECK: shuf
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/tls-pic.ll b/test/CodeGen/X86/tls-pic.ll
index 51c3d23..b823f0a 100644
--- a/test/CodeGen/X86/tls-pic.ll
+++ b/test/CodeGen/X86/tls-pic.ll
@@ -76,12 +76,12 @@
; X32: f5:
; X32: leal {{[jk]}}@TLSLDM(%ebx)
-; X32-NEXT: calll ___tls_get_addr@PLT
-; X32-NEXT: movl {{[jk]}}@DTPOFF(%eax)
-; X32-NEXT: addl {{[jk]}}@DTPOFF(%eax)
+; X32: calll ___tls_get_addr@PLT
+; X32: movl {{[jk]}}@DTPOFF(%e
+; X32: addl {{[jk]}}@DTPOFF(%e
; X64: f5:
; X64: leaq {{[jk]}}@TLSLD(%rip), %rdi
-; X64-NEXT: callq __tls_get_addr@PLT
-; X64-NEXT: movl {{[jk]}}@DTPOFF(%rax)
-; X64-NEXT: addl {{[jk]}}@DTPOFF(%rax)
+; X64: callq __tls_get_addr@PLT
+; X64: movl {{[jk]}}@DTPOFF(%r
+; X64: addl {{[jk]}}@DTPOFF(%r
diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll
new file mode 100644
index 0000000..82517cb
--- /dev/null
+++ b/test/CodeGen/X86/vec_fabs.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+
+
+define <2 x double> @fabs_v2f64(<2 x double> %p)
+{
+ ; CHECK: fabs_v2f64
+ ; CHECK: vandps
+ %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
+ ret <2 x double> %t
+}
+declare <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
+
+define <4 x float> @fabs_v4f32(<4 x float> %p)
+{
+ ; CHECK: fabs_v4f32
+ ; CHECK: vandps
+ %t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
+ ret <4 x float> %t
+}
+declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
+
+define <4 x double> @fabs_v4f64(<4 x double> %p)
+{
+ ; CHECK: fabs_v4f64
+ ; CHECK: vandps
+ %t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
+ ret <4 x double> %t
+}
+declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
+
+define <8 x float> @fabs_v8f32(<8 x float> %p)
+{
+ ; CHECK: fabs_v8f32
+ ; CHECK: vandps
+ %t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
+ ret <8 x float> %t
+}
+declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
diff --git a/test/CodeGen/X86/vec_floor.ll b/test/CodeGen/X86/vec_floor.ll
new file mode 100644
index 0000000..5e0160b
--- /dev/null
+++ b/test/CodeGen/X86/vec_floor.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+
+
+define <2 x double> @floor_v2f64(<2 x double> %p)
+{
+ ; CHECK: floor_v2f64
+ ; CHECK: vroundpd
+ %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
+ ret <2 x double> %t
+}
+declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)
+
+define <4 x float> @floor_v4f32(<4 x float> %p)
+{
+ ; CHECK: floor_v4f32
+ ; CHECK: vroundps
+ %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
+ ret <4 x float> %t
+}
+declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)
+
+define <4 x double> @floor_v4f64(<4 x double> %p)
+{
+ ; CHECK: floor_v4f64
+ ; CHECK: vroundpd
+ %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
+ ret <4 x double> %t
+}
+declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)
+
+define <8 x float> @floor_v8f32(<8 x float> %p)
+{
+ ; CHECK: floor_v8f32
+ ; CHECK: vroundps
+ %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
+ ret <8 x float> %t
+}
+declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index 05b263e..dc0464f 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -1,14 +1,38 @@
; RUN: llc < %s -march=x86 -mattr=+sse41,-avx | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck --check-prefix=AVX %s
; PR11674
define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) {
entry:
-; TODO: We should be able to generate cvtps2pd for the load.
-; For now, just check that we generate something sane.
-; CHECK: cvtss2sd
-; CHECK: cvtss2sd
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
%0 = load <2 x float>* %in, align 8
%1 = fpext <2 x float> %0 to <2 x double>
store <2 x double> %1, <2 x double>* %out, align 1
ret void
}
+
+define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
+entry:
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
+ %0 = load <4 x float>* %in
+ %1 = fpext <4 x float> %0 to <4 x double>
+ store <4 x double> %1, <4 x double>* %out, align 1
+ ret void
+}
+
+define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
+entry:
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 16(%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 24(%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
+; AVX: vcvtps2pd 16(%{{.+}}), %ymm{{[0-9]+}}
+ %0 = load <8 x float>* %in
+ %1 = fpext <8 x float> %0 to <8 x double>
+ store <8 x double> %1, <8 x double>* %out, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll
index 086af6b..8dfc2ea 100644
--- a/test/CodeGen/X86/vec_shuffle-26.ll
+++ b/test/CodeGen/X86/vec_shuffle-26.ll
@@ -1,6 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
-; RUN: grep unpcklps %t | count 1
-; RUN: grep unpckhps %t | count 3
+; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=atom -mattr=+sse41 | FileCheck -check-prefix=ATOM %s
; Transpose example using the more generic vector shuffle. Return float8
; instead of float16
@@ -14,6 +13,17 @@
define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
entry:
+; CHECK: transpose2
+; CHECK: unpckhps
+; CHECK: unpckhps
+; CHECK: unpcklps
+; CHECK: unpckhps
+; Different instruction order for Atom.
+; ATOM: transpose2
+; ATOM: unpckhps
+; ATOM: unpckhps
+; ATOM: unpckhps
+; ATOM: unpcklps
%unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2]
%unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2]
%unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2]
@@ -27,3 +37,32 @@
; %r3 = shufflevector <8 x float> %r1, <8 x float> %r2, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >;
ret <8 x float> %r2
}
+
+define <2 x i64> @lo_hi_shift(float* nocapture %x, float* nocapture %y) nounwind {
+entry:
+; movhps should happen before extractps to ensure it gets the correct value.
+; CHECK: lo_hi_shift
+; CHECK: movhps ([[BASEREG:%[a-z]+]]),
+; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: lo_hi_shift
+; ATOM: movhps ([[BASEREG:%[a-z]+]]),
+; ATOM: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+ %v.i = bitcast float* %y to <4 x float>*
+ %0 = load <4 x float>* %v.i, align 1
+ %1 = bitcast float* %x to <1 x i64>*
+ %.val = load <1 x i64>* %1, align 1
+ %2 = bitcast <1 x i64> %.val to <2 x float>
+ %shuffle.i = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle1.i = shufflevector <4 x float> %0, <4 x float> %shuffle.i, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %cast.i = bitcast <4 x float> %0 to <2 x i64>
+ %extract.i = extractelement <2 x i64> %cast.i, i32 1
+ %3 = bitcast float* %x to i64*
+ store i64 %extract.i, i64* %3, align 4
+ %4 = bitcast <4 x float> %0 to <16 x i8>
+ %5 = bitcast <4 x float> %shuffle1.i to <16 x i8>
+ %palignr = shufflevector <16 x i8> %5, <16 x i8> %4, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %6 = bitcast <16 x i8> %palignr to <2 x i64>
+ ret <2 x i64> %6
+}
diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll
index 9705d14..dfaa3d6 100644
--- a/test/CodeGen/X86/widen_load-1.ll
+++ b/test/CodeGen/X86/widen_load-1.ll
@@ -1,12 +1,17 @@
-; RUN: llc %s -o - -march=x86-64 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: llc %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE
+; RUN: llc %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX
; PR4891
; PR5626
; This load should be before the call, not after.
-; CHECK: movaps compl+128(%rip), %xmm0
-; CHECK: movaps %xmm0, (%rsp)
-; CHECK: callq killcommon
+; SSE: movaps compl+128(%rip), %xmm0
+; SSE: movaps %xmm0, (%rsp)
+; SSE: callq killcommon
+
+; AVX: vmovapd compl+128(%rip), %xmm0
+; AVX: vmovapd %xmm0, (%rsp)
+; AVX: callq killcommon
@compl = linkonce global [20 x i64] zeroinitializer, align 64 ; <[20 x i64]*> [#uses=1]
diff --git a/test/DebugInfo/2010-04-13-PubType.ll b/test/DebugInfo/2010-04-13-PubType.ll
index db7bb0a..559f032 100644
--- a/test/DebugInfo/2010-04-13-PubType.ll
+++ b/test/DebugInfo/2010-04-13-PubType.ll
@@ -1,6 +1,6 @@
-; RUN: llc -O0 -asm-verbose < %s > %t
-; RUN: grep "External Name" %t | grep -v X
-; RUN: grep "External Name" %t | grep Y | count 1
+; RUN: llc -O0 -asm-verbose -mtriple=x86_64-macosx < %s | FileCheck %s
+; CHECK-NOT: .asciz "X" ## External Name
+; CHECK: .asciz "Y" ## External Name
; Test to check type with no definition is listed in pubtypes section.
%struct.X = type opaque
%struct.Y = type { i32 }
diff --git a/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64
new file mode 100755
index 0000000..9a1d538
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64
Binary files differ
diff --git a/test/DebugInfo/X86/concrete_out_of_line.ll b/test/DebugInfo/X86/concrete_out_of_line.ll
index a227071..58fb055 100644
--- a/test/DebugInfo/X86/concrete_out_of_line.ll
+++ b/test/DebugInfo/X86/concrete_out_of_line.ll
@@ -7,16 +7,15 @@
; first check that we have a TAG_subprogram at a given offset and it has
; AT_inline.
-; CHECK: 0x00000134: DW_TAG_subprogram [18]
-; CHECK-NEXT: DW_AT_MIPS_linkage_name
+; CHECK: 0x0000011e: DW_TAG_subprogram [18]
; CHECK-NEXT: DW_AT_specification
; CHECK-NEXT: DW_AT_inline
; and then that a TAG_subprogram refers to it with AT_abstract_origin.
-; CHECK: 0x00000184: DW_TAG_subprogram [20]
-; CHECK-NEXT: DW_AT_abstract_origin [DW_FORM_ref4] (cu + 0x0134 => {0x00000134})
+; CHECK: 0x0000015f: DW_TAG_subprogram [20]
+; CHECK-NEXT: DW_AT_abstract_origin [DW_FORM_ref4] (cu + 0x011e => {0x0000011e})
define i32 @_ZN17nsAutoRefCnt7ReleaseEv() {
entry:
diff --git a/test/DebugInfo/X86/stringpool.ll b/test/DebugInfo/X86/stringpool.ll
index 2cd1001..caf12c2 100644
--- a/test/DebugInfo/X86/stringpool.ll
+++ b/test/DebugInfo/X86/stringpool.ll
@@ -16,8 +16,8 @@
; Verify that we refer to 'yyyy' with a relocation.
; LINUX: .long .Lstring3 # DW_AT_name
-; LINUX-NEXT: .long 39 # DW_AT_type
-; LINUX-NEXT: .byte 1 # DW_AT_external
+; LINUX-NEXT: .long 38 # DW_AT_type
+; LINUX-NEXT: # DW_AT_external
; LINUX-NEXT: .byte 1 # DW_AT_decl_file
; LINUX-NEXT: .byte 1 # DW_AT_decl_line
; LINUX-NEXT: .byte 9 # DW_AT_location
diff --git a/test/DebugInfo/dwarfdump-inlining.test b/test/DebugInfo/dwarfdump-inlining.test
new file mode 100644
index 0000000..d3a7e12
--- /dev/null
+++ b/test/DebugInfo/dwarfdump-inlining.test
@@ -0,0 +1,28 @@
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x613 \
+RUN: --inlining --functions | FileCheck %s -check-prefix DEEP_STACK
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x6de \
+RUN: --inlining | FileCheck %s -check-prefix SHORTER_STACK
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x685 \
+RUN: --inlining | FileCheck %s -check-prefix SHORT_STACK
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x640 \
+RUN: --functions | FileCheck %s -check-prefix INL_FUNC_NAME
+
+DEEP_STACK: inlined_h
+DEEP_STACK-NEXT: header.h:2:21
+DEEP_STACK-NEXT: inlined_g
+DEEP_STACK-NEXT: header.h:7
+DEEP_STACK-NEXT: inlined_f
+DEEP_STACK-NEXT: main.cc:3
+DEEP_STACK-NEXT: main
+DEEP_STACK-NEXT: main.cc:8
+
+SHORTER_STACK: header.h:7:20
+SHORTER_STACK-NEXT: main.cc:3
+SHORTER_STACK-NEXT: main.cc:8
+
+SHORT_STACK: main.cc:3:20
+SHORT_STACK-NEXT: main.cc:8
+
+INL_FUNC_NAME: inlined_g
+INL_FUNC_NAME-NEXT: header.h:7:20
+
diff --git a/test/DebugInfo/dwarfdump-test.test b/test/DebugInfo/dwarfdump-test.test
index de23dcd..973c344 100644
--- a/test/DebugInfo/dwarfdump-test.test
+++ b/test/DebugInfo/dwarfdump-test.test
@@ -17,6 +17,8 @@
RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test4.elf-x86-64 \
RUN: --address=0x55c --functions \
RUN: | FileCheck %s -check-prefix MANY_SEQ_IN_LINE_TABLE
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test4.elf-x86-64 \
+RUN: | FileCheck %s -check-prefix DEBUG_RANGES
MAIN: main
MAIN-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:16:10
@@ -44,3 +46,11 @@
MANY_SEQ_IN_LINE_TABLE: _Z1cv
MANY_SEQ_IN_LINE_TABLE-NEXT: /tmp/dbginfo/sequences{{[/\\]}}c.cc:2:0
+
+DEBUG_RANGES: .debug_ranges contents:
+DEBUG_RANGES-NEXT: 00000000 000000000000055c 0000000000000567
+DEBUG_RANGES-NEXT: 00000000 0000000000000567 000000000000056d
+DEBUG_RANGES-NEXT: 00000000 <End of list>
+DEBUG_RANGES-NEXT: 00000030 0000000000000570 000000000000057b
+DEBUG_RANGES-NEXT: 00000030 0000000000000567 000000000000056d
+DEBUG_RANGES-NEXT: 00000030 <End of list>
diff --git a/test/DebugInfo/linkage-name.ll b/test/DebugInfo/linkage-name.ll
new file mode 100644
index 0000000..b984923
--- /dev/null
+++ b/test/DebugInfo/linkage-name.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=x86_64-macosx -darwin-gdb-compat=Disable %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; CHECK: DW_TAG_subprogram [9] *
+; CHECK-NOT: DW_AT_MIPS_linkage_name
+; CHECK: DW_AT_specification
+
+%class.A = type { i8 }
+
+@a = global %class.A zeroinitializer, align 1
+
+define i32 @_ZN1A1aEi(%class.A* %this, i32 %b) nounwind uwtable ssp align 2 {
+entry:
+ %this.addr = alloca %class.A*, align 8
+ %b.addr = alloca i32, align 4
+ store %class.A* %this, %class.A** %this.addr, align 8
+ call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !21), !dbg !23
+ store i32 %b, i32* %b.addr, align 4
+ call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !24), !dbg !25
+ %this1 = load %class.A** %this.addr
+ %0 = load i32* %b.addr, align 4, !dbg !26
+ ret i32 %0, !dbg !26
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo", metadata !"clang version 3.1 (trunk 152691) (llvm/trunk 152692)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !18} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786478, i32 0, null, metadata !"a", metadata !"a", metadata !"_ZN1A1aEi", metadata !6, i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (%class.A*, i32)* @_ZN1A1aEi, null, metadata !13, metadata !16} ; [ DW_TAG_subprogram ]
+!6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!8 = metadata !{metadata !9, metadata !10, metadata !9}
+!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!10 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{i32 786434, null, metadata !"A", metadata !6, i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !12, i32 0, null, null} ; [ DW_TAG_class_type ]
+!12 = metadata !{metadata !13}
+!13 = metadata !{i32 786478, i32 0, metadata !11, metadata !"a", metadata !"a", metadata !"_ZN1A1aEi", metadata !6, i32 2, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 257, i1 false, null, null, i32 0, metadata !14} ; [ DW_TAG_subprogram ]
+!14 = metadata !{metadata !15}
+!15 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!16 = metadata !{metadata !17}
+!17 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!18 = metadata !{metadata !19}
+!19 = metadata !{metadata !20}
+!20 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 9, metadata !11, i32 0, i32 1, %class.A* @a} ; [ DW_TAG_variable ]
+!21 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777221, metadata !22, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
+!22 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{i32 5, i32 8, metadata !5, null}
+!24 = metadata !{i32 786689, metadata !5, metadata !"b", metadata !6, i32 33554437, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!25 = metadata !{i32 5, i32 14, metadata !5, null}
+!26 = metadata !{i32 6, i32 4, metadata !27, null}
+!27 = metadata !{i32 786443, metadata !5, i32 5, i32 17, metadata !6, i32 0} ; [ DW_TAG_lexical_block ]
diff --git a/test/ExecutionEngine/MCJIT/pr13727.ll b/test/ExecutionEngine/MCJIT/pr13727.ll
new file mode 100644
index 0000000..5fa68f9
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/pr13727.ll
@@ -0,0 +1,88 @@
+; RUN: %lli -use-mcjit -O0 -disable-lazy-compilation=false %s
+
+; The intention of this test is to verify that symbols mapped to COMMON in ELF
+; work as expected.
+;
+; Compiled from this C code:
+;
+; int zero_int;
+; double zero_double;
+; int zero_arr[10];
+;
+; int main()
+; {
+; zero_arr[zero_int + 5] = 40;
+;
+; if (zero_double < 1.1)
+; zero_arr[zero_int + 2] = 70;
+;
+; for (int i = 1; i < 10; ++i) {
+; zero_arr[i] = zero_arr[i - 1] + zero_arr[i];
+; }
+; return zero_arr[9] == 110 ? 0 : -1;
+; }
+
+@zero_int = common global i32 0, align 4
+@zero_arr = common global [10 x i32] zeroinitializer, align 16
+@zero_double = common global double 0.000000e+00, align 8
+
+define i32 @main() nounwind {
+entry:
+ %retval = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 0, i32* %retval
+ %0 = load i32* @zero_int, align 4
+ %add = add nsw i32 %0, 5
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom
+ store i32 40, i32* %arrayidx, align 4
+ %1 = load double* @zero_double, align 8
+ %cmp = fcmp olt double %1, 1.100000e+00
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %2 = load i32* @zero_int, align 4
+ %add1 = add nsw i32 %2, 2
+ %idxprom2 = sext i32 %add1 to i64
+ %arrayidx3 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom2
+ store i32 70, i32* %arrayidx3, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ store i32 1, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %if.end
+ %3 = load i32* %i, align 4
+ %cmp4 = icmp slt i32 %3, 10
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %4 = load i32* %i, align 4
+ %sub = sub nsw i32 %4, 1
+ %idxprom5 = sext i32 %sub to i64
+ %arrayidx6 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom5
+ %5 = load i32* %arrayidx6, align 4
+ %6 = load i32* %i, align 4
+ %idxprom7 = sext i32 %6 to i64
+ %arrayidx8 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom7
+ %7 = load i32* %arrayidx8, align 4
+ %add9 = add nsw i32 %5, %7
+ %8 = load i32* %i, align 4
+ %idxprom10 = sext i32 %8 to i64
+ %arrayidx11 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom10
+ store i32 %add9, i32* %arrayidx11, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %9 = load i32* %i, align 4
+ %inc = add nsw i32 %9, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %10 = load i32* getelementptr inbounds ([10 x i32]* @zero_arr, i32 0, i64 9), align 4
+ %cmp12 = icmp eq i32 %10, 110
+ %cond = select i1 %cmp12, i32 0, i32 -1
+ ret i32 %cond
+}
diff --git a/test/MC/AsmParser/directive_lcomm.s b/test/MC/AsmParser/directive_lcomm.s
index 0a0add5..37a350c 100644
--- a/test/MC/AsmParser/directive_lcomm.s
+++ b/test/MC/AsmParser/directive_lcomm.s
@@ -1,9 +1,14 @@
# RUN: llvm-mc -triple i386-apple-darwin10 %s | FileCheck %s
+# RUN: llvm-mc -triple i386-pc-mingw32 %s | FileCheck %s
+# RUN: not llvm-mc -triple i386-linux-gnu %s 2>&1 | FileCheck %s -check-prefix=ERROR
# CHECK: TEST0:
-# CHECK: .zerofill __DATA,__bss,a,7,4
-# CHECK: .zerofill __DATA,__bss,b,8
-# CHECK: .zerofill __DATA,__bss,c,0
+# CHECK: .lcomm a,7,4
+# CHECK: .lcomm b,8
+# CHECK: .lcomm c,0
+
+# ELF targets reject an alignment argument on .lcomm.
+# ERROR: alignment not supported on this target
TEST0:
.lcomm a, 8-1, 4
.lcomm b,8
diff --git a/test/MC/AsmParser/labels.s b/test/MC/AsmParser/labels.s
index 5609175..6a9870b 100644
--- a/test/MC/AsmParser/labels.s
+++ b/test/MC/AsmParser/labels.s
@@ -41,7 +41,7 @@
// CHECK: .comm "a 6",1
.comm "a 6", 1
-// CHECK: .zerofill __DATA,__bss,"a 7",1,0
+// CHECK: .lcomm "a 7",1
.lcomm "a 7", 1
// FIXME: We don't bother to support .lsym.
diff --git a/test/MC/COFF/comm.ll b/test/MC/COFF/comm.ll
new file mode 100644
index 0000000..74da557
--- /dev/null
+++ b/test/MC/COFF/comm.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple i386-pc-mingw32 < %s | FileCheck %s
+
+@a = internal global i8 0, align 1
+@b = internal global double 0.000000e+00, align 8
+@c = common global i8 0, align 1
+@d = common global double 0.000000e+00, align 8
+
+; .lcomm uses byte alignment
+; CHECK: .lcomm _a,1
+; CHECK: .lcomm _b,8,8
+; .comm uses log2 alignment
+; CHECK: .comm _c,1,0
+; CHECK: .comm _d,8,3
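+; e.g. for the 8-byte-aligned double: .lcomm emits the raw byte count (8),
+; while .comm emits log2(8) = 3.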
diff --git a/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt b/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt
index 5ba7d61..00b8526 100644
--- a/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt
@@ -1,5 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-unknown-unknwon -mcpu=cortex-a8 2>&1 | grep "invalid instruction encoding"
-# XFAIL: *
+# RUN: llvm-mc --disassemble %s -triple=armv7-unknown-unknown -mcpu=cortex-a8 2>&1 | FileCheck %s
# Opcode=737 Name=VLD1DUPq8_UPD Format=ARM_FORMAT_NLdSt(30)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
@@ -9,3 +8,4 @@
#
# 'a' == 1 and data_size == 8 is invalid
0x3d 0x3c 0xa0 0xf4
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-VLD1LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VLD1LNd32_UPD-thumb.txt
new file mode 100644
index 0000000..9bb0995
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VLD1LNd32_UPD-thumb.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s
+
+0xa0 0xf9 0x10 0x08
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-VLD4DUPd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VLD4DUPd32_UPD-thumb.txt
new file mode 100644
index 0000000..84c98bf
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VLD4DUPd32_UPD-thumb.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s
+
+0xa0 0xf9 0xc0 0x0f
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-VLD4LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VLD4LNd32_UPD-thumb.txt
new file mode 100644
index 0000000..9024b09
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VLD4LNd32_UPD-thumb.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s
+
+0xa0 0xf9 0x30 0x0b
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-VST1LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VST1LNd32_UPD-thumb.txt
new file mode 100644
index 0000000..9462812
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VST1LNd32_UPD-thumb.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s
+
+0x80 0xf9 0x10 0x08
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-VST4LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VST4LNd32_UPD-thumb.txt
new file mode 100644
index 0000000..f6e71bc
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VST4LNd32_UPD-thumb.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s
+
+0x80 0xf9 0x30 0x0b
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt b/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt
new file mode 100644
index 0000000..e53739e
--- /dev/null
+++ b/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt
@@ -0,0 +1,77 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s | FileCheck %s
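+# Each group of bytes below must disassemble to the instruction shown and
+# re-encode to exactly the same bytes (checked via -show-encoding).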
+
+0xa0 0xf9 0x00 0x00
+0xa0 0xf9 0x20 0x00
+0xa0 0xf9 0x40 0x00
+0xa0 0xf9 0x60 0x00
+0xa0 0xf9 0x80 0x00
+0xa0 0xf9 0xa0 0x00
+0xa0 0xf9 0xc0 0x00
+0xa0 0xf9 0xe0 0x00
+
+# CHECK: vld1.8 {d0[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x00,0x00]
+# CHECK: vld1.8 {d0[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x20,0x00]
+# CHECK: vld1.8 {d0[2]}, [r0], r0 @ encoding: [0xa0,0xf9,0x40,0x00]
+# CHECK: vld1.8 {d0[3]}, [r0], r0 @ encoding: [0xa0,0xf9,0x60,0x00]
+# CHECK: vld1.8 {d0[4]}, [r0], r0 @ encoding: [0xa0,0xf9,0x80,0x00]
+# CHECK: vld1.8 {d0[5]}, [r0], r0 @ encoding: [0xa0,0xf9,0xa0,0x00]
+# CHECK: vld1.8 {d0[6]}, [r0], r0 @ encoding: [0xa0,0xf9,0xc0,0x00]
+# CHECK: vld1.8 {d0[7]}, [r0], r0 @ encoding: [0xa0,0xf9,0xe0,0x00]
+
+0xa0 0xf9 0x00 0x04
+0xa0 0xf9 0x10 0x04
+0xa0 0xf9 0x40 0x04
+0xa0 0xf9 0x50 0x04
+0xa0 0xf9 0x80 0x04
+0xa0 0xf9 0x90 0x04
+0xa0 0xf9 0xc0 0x04
+0xa0 0xf9 0xd0 0x04
+
+# CHECK: vld1.16 {d0[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x00,0x04]
+# CHECK: vld1.16 {d0[0]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0x10,0x04]
+# CHECK: vld1.16 {d0[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x40,0x04]
+# CHECK: vld1.16 {d0[1]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0x50,0x04]
+# CHECK: vld1.16 {d0[2]}, [r0], r0 @ encoding: [0xa0,0xf9,0x80,0x04]
+# CHECK: vld1.16 {d0[2]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0x90,0x04]
+# CHECK: vld1.16 {d0[3]}, [r0], r0 @ encoding: [0xa0,0xf9,0xc0,0x04]
+# CHECK: vld1.16 {d0[3]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0xd0,0x04]
+
+0xa0 0xf9 0x00 0x08
+0xa0 0xf9 0x30 0x08
+0xa0 0xf9 0x80 0x08
+0xa0 0xf9 0xb0 0x08
+
+# CHECK: vld1.32 {d0[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x00,0x08]
+# CHECK: vld1.32 {d0[0]}, [r0, :32], r0 @ encoding: [0xa0,0xf9,0x30,0x08]
+# CHECK: vld1.32 {d0[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x80,0x08]
+# CHECK: vld1.32 {d0[1]}, [r0, :32], r0 @ encoding: [0xa0,0xf9,0xb0,0x08]
+
+0xa0 0xf9 0x1f 0x04
+0xa0 0xf9 0x8f 0x00
+
+# CHECK: vld1.16 {d0[0]}, [r0, :16] @ encoding: [0xa0,0xf9,0x1f,0x04]
+# CHECK: vld1.8 {d0[4]}, [r0] @ encoding: [0xa0,0xf9,0x8f,0x00]
+
+0xa0 0xf9 0x1d 0x04
+0xa0 0xf9 0x8d 0x00
+
+# CHECK: vld1.16 {d0[0]}, [r0, :16]! @ encoding: [0xa0,0xf9,0x1d,0x04]
+# CHECK: vld1.8 {d0[4]}, [r0]! @ encoding: [0xa0,0xf9,0x8d,0x00]
+
+0xa5 0xf9 0x10 0x04
+0xa5 0xf9 0x1a 0x04
+0xae 0xf9 0x1a 0x04
+0xa5 0xf9 0x1a 0x94
+
+# CHECK: vld1.16 {d0[0]}, [r5, :16], r0 @ encoding: [0xa5,0xf9,0x10,0x04]
+# CHECK: vld1.16 {d0[0]}, [r5, :16], r10 @ encoding: [0xa5,0xf9,0x1a,0x04]
+# CHECK: vld1.16 {d0[0]}, [lr, :16], r10 @ encoding: [0xae,0xf9,0x1a,0x04]
+# CHECK: vld1.16 {d9[0]}, [r5, :16], r10 @ encoding: [0xa5,0xf9,0x1a,0x94]
+
+0xa0 0xf9 0x20 0x0b
+0xa0 0xf9 0x20 0x07
+0xa0 0xf9 0x20 0x03
+
+# CHECK: vld4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0, :128], r0 @ encoding: [0xa0,0xf9,0x20,0x0b]
+# CHECK: vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x20,0x07]
+# CHECK: vld4.8 {d0[1], d1[1], d2[1], d3[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x20,0x03]
diff --git a/test/MC/Disassembler/ARM/neont-VST-reencoding.txt b/test/MC/Disassembler/ARM/neont-VST-reencoding.txt
new file mode 100644
index 0000000..eb3722c
--- /dev/null
+++ b/test/MC/Disassembler/ARM/neont-VST-reencoding.txt
@@ -0,0 +1,77 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s | FileCheck %s
+
+0x80 0xf9 0x00 0x00
+0x81 0xf9 0x21 0x10
+0x81 0xf9 0x42 0x10
+0x81 0xf9 0x61 0x20
+0x82 0xf9 0x82 0x20
+0x82 0xf9 0xa1 0x10
+0x82 0xf9 0xc2 0x20
+0x83 0xf9 0xe3 0x30
+
+# CHECK: vst1.8 {d0[0]}, [r0], r0 @ encoding: [0x80,0xf9,0x00,0x00]
+# CHECK: vst1.8 {d1[1]}, [r1], r1 @ encoding: [0x81,0xf9,0x21,0x10]
+# CHECK: vst1.8 {d1[2]}, [r1], r2 @ encoding: [0x81,0xf9,0x42,0x10]
+# CHECK: vst1.8 {d2[3]}, [r1], r1 @ encoding: [0x81,0xf9,0x61,0x20]
+# CHECK: vst1.8 {d2[4]}, [r2], r2 @ encoding: [0x82,0xf9,0x82,0x20]
+# CHECK: vst1.8 {d1[5]}, [r2], r1 @ encoding: [0x82,0xf9,0xa1,0x10]
+# CHECK: vst1.8 {d2[6]}, [r2], r2 @ encoding: [0x82,0xf9,0xc2,0x20]
+# CHECK: vst1.8 {d3[7]}, [r3], r3 @ encoding: [0x83,0xf9,0xe3,0x30]
+
+0x80 0xf9 0x00 0x04
+0xc3 0xf9 0x13 0x04
+0xc4 0xf9 0x43 0x04
+0xc5 0xf9 0x55 0x04
+0xc6 0xf9 0x85 0x04
+0xc7 0xf9 0x95 0x74
+0xc8 0xf9 0xc7 0x84
+0xc9 0xf9 0xd9 0x94
+
+# CHECK: vst1.16 {d0[0]}, [r0], r0 @ encoding: [0x80,0xf9,0x00,0x04]
+# CHECK: vst1.16 {d16[0]}, [r3, :16], r3 @ encoding: [0xc3,0xf9,0x13,0x04]
+# CHECK: vst1.16 {d16[1]}, [r4], r3 @ encoding: [0xc4,0xf9,0x43,0x04]
+# CHECK: vst1.16 {d16[1]}, [r5, :16], r5 @ encoding: [0xc5,0xf9,0x55,0x04]
+# CHECK: vst1.16 {d16[2]}, [r6], r5 @ encoding: [0xc6,0xf9,0x85,0x04]
+# CHECK: vst1.16 {d23[2]}, [r7, :16], r5 @ encoding: [0xc7,0xf9,0x95,0x74]
+# CHECK: vst1.16 {d24[3]}, [r8], r7 @ encoding: [0xc8,0xf9,0xc7,0x84]
+# CHECK: vst1.16 {d25[3]}, [r9, :16], r9 @ encoding: [0xc9,0xf9,0xd9,0x94]
+
+0x8a 0xf9 0x01 0xa8
+0xcb 0xf9 0x32 0x18
+0x8c 0xf9 0x83 0xb8
+0xcd 0xf9 0xb4 0x28
+
+# CHECK: vst1.32 {d10[0]}, [r10], r1 @ encoding: [0x8a,0xf9,0x01,0xa8]
+# CHECK: vst1.32 {d17[0]}, [r11, :32], r2 @ encoding: [0xcb,0xf9,0x32,0x18]
+# CHECK: vst1.32 {d11[1]}, [r12], r3 @ encoding: [0x8c,0xf9,0x83,0xb8]
+# CHECK: vst1.32 {d18[1]}, [sp, :32], r4 @ encoding: [0xcd,0xf9,0xb4,0x28]
+
+0x81 0xf9 0x1f 0x44
+0x82 0xf9 0x8f 0x30
+
+# CHECK: vst1.16 {d4[0]}, [r1, :16] @ encoding: [0x81,0xf9,0x1f,0x44]
+# CHECK: vst1.8 {d3[4]}, [r2] @ encoding: [0x82,0xf9,0x8f,0x30]
+
+0x83 0xf9 0x1d 0x24
+0x84 0xf9 0x8d 0x10
+
+# CHECK: vst1.16 {d2[0]}, [r3, :16]! @ encoding: [0x83,0xf9,0x1d,0x24]
+# CHECK: vst1.8 {d1[4]}, [r4]! @ encoding: [0x84,0xf9,0x8d,0x10]
+
+0x85 0xf9 0x10 0x04
+0x85 0xf9 0x1a 0x74
+0x8e 0xf9 0x1a 0x84
+0x85 0xf9 0x1a 0x94
+
+# CHECK: vst1.16 {d0[0]}, [r5, :16], r0 @ encoding: [0x85,0xf9,0x10,0x04]
+# CHECK: vst1.16 {d7[0]}, [r5, :16], r10 @ encoding: [0x85,0xf9,0x1a,0x74]
+# CHECK: vst1.16 {d8[0]}, [lr, :16], r10 @ encoding: [0x8e,0xf9,0x1a,0x84]
+# CHECK: vst1.16 {d9[0]}, [r5, :16], r10 @ encoding: [0x85,0xf9,0x1a,0x94]
+
+0x81 0xf9 0x24 0x0b
+0x82 0xf9 0x25 0x07
+0x83 0xf9 0x26 0x03
+
+# CHECK: vst4.32 {d0[0], d1[0], d2[0], d3[0]}, [r1, :128], r4 @ encoding: [0x81,0xf9,0x24,0x0b]
+# CHECK: vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r2], r5 @ encoding: [0x82,0xf9,0x25,0x07]
+# CHECK: vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r3], r6 @ encoding: [0x83,0xf9,0x26,0x03]
diff --git a/test/MC/ELF/cfi-reg.s b/test/MC/ELF/cfi-reg.s
new file mode 100644
index 0000000..fd68d6d
--- /dev/null
+++ b/test/MC/ELF/cfi-reg.s
@@ -0,0 +1,18 @@
+// RUN: llvm-mc -triple x86_64-pc-linux-gnu %s -o - | FileCheck %s
+// PR13754
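+// .cfi_offset should accept a raw DWARF register number (6 is %rbp on
+// x86-64), a %-prefixed name, or a bare name; all three print canonically.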
+
+f:
+ .cfi_startproc
+ nop
+ .cfi_offset 6, -16
+ nop
+ .cfi_offset %rsi, -16
+ nop
+ .cfi_offset rbx, -16
+ nop
+ .cfi_endproc
+
+// CHECK: f:
+// CHECK: .cfi_offset %rbp, -16
+// CHECK: .cfi_offset %rsi, -16
+// CHECK: .cfi_offset %rbx, -16
diff --git a/test/MC/ELF/lcomm.s b/test/MC/ELF/lcomm.s
new file mode 100644
index 0000000..ae8d0ba
--- /dev/null
+++ b/test/MC/ELF/lcomm.s
@@ -0,0 +1,21 @@
+// RUN: llvm-mc -triple i386-pc-linux-gnu %s -filetype=obj -o - | elf-dump | FileCheck %s
+
+.lcomm A, 5
+.lcomm B, 32 << 20
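+// 32 << 20 = 33554432 = 0x02000000 bytes (32 MiB), matching B's st_size below.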
+
+// CHECK: (('st_name', 0x00000001) # 'A'
+// CHECK: ('st_value', 0x00000000)
+// CHECK: ('st_size', 0x00000005)
+// CHECK: ('st_bind', 0x0)
+// CHECK: ('st_type', 0x1)
+// CHECK: ('st_other', 0x00)
+// CHECK: ('st_shndx', 0x0003)
+// CHECK: ),
+// CHECK: (('st_name', 0x00000003) # 'B'
+// CHECK: ('st_value', 0x00000005)
+// CHECK: ('st_size', 0x02000000)
+// CHECK: ('st_bind', 0x0)
+// CHECK: ('st_type', 0x1)
+// CHECK: ('st_other', 0x00)
+// CHECK: ('st_shndx', 0x0003)
+// CHECK: ),
diff --git a/test/MC/Mips/do_switch.ll b/test/MC/Mips/do_switch.ll
new file mode 100644
index 0000000..7eda1b4
--- /dev/null
+++ b/test/MC/Mips/do_switch.ll
@@ -0,0 +1,39 @@
+; This test case will cause an internal EK_GPRel64BlockAddress to be
+; produced. This was not handled for direct object emission and caused an
+; assertion. This is a variation on test case test/CodeGen/Mips/do_switch.ll.
+
+; RUN: llc < %s -filetype=obj -march=mips -relocation-model=static
+
+; RUN: llc < %s -filetype=obj -march=mips -relocation-model=pic
+
+; RUN: llc < %s -filetype=obj -march=mips64 -relocation-model=pic -mcpu=mips64 -mattr=n64
+
+define i32 @main() nounwind readnone {
+entry:
+ %x = alloca i32, align 4 ; <i32*> [#uses=2]
+ store volatile i32 2, i32* %x, align 4
+ %0 = load volatile i32* %x, align 4 ; <i32> [#uses=1]
+
+ switch i32 %0, label %bb4 [
+ i32 0, label %bb5
+ i32 1, label %bb1
+ i32 2, label %bb2
+ i32 3, label %bb3
+ ]
+
+bb1: ; preds = %entry
+ ret i32 2
+
+bb2: ; preds = %entry
+ ret i32 0
+
+bb3: ; preds = %entry
+ ret i32 3
+
+bb4: ; preds = %entry
+ ret i32 4
+
+bb5: ; preds = %entry
+ ret i32 1
+}
+
diff --git a/test/MC/Mips/elf-N64.ll b/test/MC/Mips/elf-N64.ll
index 23ec53a..ae6de78 100644
--- a/test/MC/Mips/elf-N64.ll
+++ b/test/MC/Mips/elf-N64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=obj -march=mips64el -mcpu=mips64 %s -o - | elf-dump --dump-section-data | FileCheck %s
+; RUN: llc -filetype=obj -march=mips64el -mcpu=mips64 -disable-mips-delay-filler %s -o - | elf-dump --dump-section-data | FileCheck %s
; Check for N64 relocation production.
;
diff --git a/test/MC/Mips/higher_highest.ll b/test/MC/Mips/higher_highest.ll
index 81a89e3..0c66522 100644
--- a/test/MC/Mips/higher_highest.ll
+++ b/test/MC/Mips/higher_highest.ll
@@ -1,5 +1,8 @@
-; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 -force-mips-long-branch -filetype=obj < %s -o - | elf-dump --dump-section-data | FileCheck %s
-
+; DISABLE: llc -march=mips64el -mcpu=mips64 -mattr=n64 -force-mips-long-branch -filetype=obj < %s -o - | elf-dump --dump-section-data | FileCheck %s
+; RUN: false
+; XFAIL: *
+; Disabled because currently we don't have a way to generate these relocations.
+;
; Check that the R_MIPS_HIGHER and R_MIPS_HIGHEST relocations were created.
; CHECK: ('r_type', 0x1d)
diff --git a/test/MC/Mips/mips-alu-instructions.s b/test/MC/Mips/mips-alu-instructions.s
new file mode 100644
index 0000000..2997782
--- /dev/null
+++ b/test/MC/Mips/mips-alu-instructions.s
@@ -0,0 +1,100 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for arithmetic and logical instructions.
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+#------------------------------------------------------------------------------
+# Logical instructions
+#------------------------------------------------------------------------------
+# CHECK: and $9, $6, $7 # encoding: [0x24,0x48,0xc7,0x00]
+# CHECK: andi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x30]
+# CHECK: andi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x30]
+# CHECK: clo $6, $7 # encoding: [0x21,0x30,0xe6,0x70]
+# CHECK: clz $6, $7 # encoding: [0x20,0x30,0xe6,0x70]
+# CHECK: ins $19, $9, 6, 7 # encoding: [0x84,0x61,0x33,0x7d]
+# CHECK: nor $9, $6, $7 # encoding: [0x27,0x48,0xc7,0x00]
+# CHECK: or $3, $3, $5 # encoding: [0x25,0x18,0x65,0x00]
+# CHECK: ori $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x34]
+# CHECK: rotr $9, $6, 7 # encoding: [0xc2,0x49,0x26,0x00]
+# CHECK: rotrv $9, $6, $7 # encoding: [0x46,0x48,0xe6,0x00]
+# CHECK: sll $4, $3, 7 # encoding: [0xc0,0x21,0x03,0x00]
+# CHECK: sllv $2, $3, $5 # encoding: [0x04,0x10,0xa3,0x00]
+# CHECK: slt $3, $3, $5 # encoding: [0x2a,0x18,0x65,0x00]
+# CHECK: slti $3, $3, 103 # encoding: [0x67,0x00,0x63,0x28]
+# CHECK: slti $3, $3, 103 # encoding: [0x67,0x00,0x63,0x28]
+# CHECK: sltiu $3, $3, 103 # encoding: [0x67,0x00,0x63,0x2c]
+# CHECK: sltu $3, $3, $5 # encoding: [0x2b,0x18,0x65,0x00]
+# CHECK: sra $4, $3, 7 # encoding: [0xc3,0x21,0x03,0x00]
+# CHECK: srav $2, $3, $5 # encoding: [0x07,0x10,0xa3,0x00]
+# CHECK: srl $4, $3, 7 # encoding: [0xc2,0x21,0x03,0x00]
+# CHECK: srlv $2, $3, $5 # encoding: [0x06,0x10,0xa3,0x00]
+# CHECK: xor $3, $3, $5 # encoding: [0x26,0x18,0x65,0x00]
+# CHECK: xori $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x38]
+# CHECK: xori $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x38]
+# CHECK: wsbh $6, $7 # encoding: [0xa0,0x30,0x07,0x7c]
+# CHECK: nor $7, $8, $zero # encoding: [0x27,0x38,0x00,0x01]
+ and $9, $6, $7
+ and $9, $6, 17767
+ andi $9, $6, 17767
+ clo $6, $7
+ clz $6, $7
+ ins $19, $9, 6,7
+ nor $9, $6, $7
+ or $3, $3, $5
+ ori $9, $6, 17767
+ rotr $9, $6, 7
+ rotrv $9, $6, $7
+ sll $4, $3, 7
+ sllv $2, $3, $5
+ slt $3, $3, $5
+ slt $3, $3, 103
+ slti $3, $3, 103
+ sltiu $3, $3, 103
+ sltu $3, $3, $5
+ sra $4, $3, 7
+ srav $2, $3, $5
+ srl $4, $3, 7
+ srlv $2, $3, $5
+ xor $3, $3, $5
+ xor $9, $6, 17767
+ xori $9, $6, 17767
+ wsbh $6, $7
+ not $7 ,$8
+
+#------------------------------------------------------------------------------
+# Arithmetic instructions
+#------------------------------------------------------------------------------
+
+# CHECK: add $9, $6, $7 # encoding: [0x20,0x48,0xc7,0x00]
+# CHECK: addi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x20]
+# CHECK: addiu $9, $6, -15001 # encoding: [0x67,0xc5,0xc9,0x24]
+# CHECK: addi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x20]
+# CHECK: addiu $9, $6, -15001 # encoding: [0x67,0xc5,0xc9,0x24]
+# CHECK: addu $9, $6, $7 # encoding: [0x21,0x48,0xc7,0x00]
+# CHECK: madd $6, $7 # encoding: [0x00,0x00,0xc7,0x70]
+# CHECK: maddu $6, $7 # encoding: [0x01,0x00,0xc7,0x70]
+# CHECK: msub $6, $7 # encoding: [0x04,0x00,0xc7,0x70]
+# CHECK: msubu $6, $7 # encoding: [0x05,0x00,0xc7,0x70]
+# CHECK: mult $3, $5 # encoding: [0x18,0x00,0x65,0x00]
+# CHECK: multu $3, $5 # encoding: [0x19,0x00,0x65,0x00]
+# CHECK: sub $9, $6, $7 # encoding: [0x22,0x48,0xc7,0x00]
+# CHECK: subu $4, $3, $5 # encoding: [0x23,0x20,0x65,0x00]
+# CHECK: sub $6, $zero, $7 # encoding: [0x22,0x30,0x07,0x00]
+# CHECK: subu $6, $zero, $7 # encoding: [0x23,0x30,0x07,0x00]
+# CHECK: add $7, $8, $zero # encoding: [0x20,0x38,0x00,0x01]
+ add $9,$6,$7
+ add $9,$6,17767
+ addu $9,$6,-15001
+ addi $9,$6,17767
+ addiu $9,$6,-15001
+ addu $9,$6,$7
+ madd $6,$7
+ maddu $6,$7
+ msub $6,$7
+ msubu $6,$7
+ mult $3,$5
+ multu $3,$5
+ sub $9,$6,$7
+ subu $4,$3,$5
+ neg $6,$7
+ negu $6,$7
+ move $7,$8
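+
+# not, neg, negu and move are assembler aliases; they expand to nor, sub,
+# subu and add involving $zero, as the CHECK lines above show.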
diff --git a/test/MC/Mips/mips-fpu-instructions.s b/test/MC/Mips/mips-fpu-instructions.s
new file mode 100644
index 0000000..ce8024d
--- /dev/null
+++ b/test/MC/Mips/mips-fpu-instructions.s
@@ -0,0 +1,162 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for FPU instructions.
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+#------------------------------------------------------------------------------
+# FP arithmetic instructions
+#------------------------------------------------------------------------------
+
+# CHECK: abs.d $f12, $f14 # encoding: [0x05,0x73,0x20,0x46]
+# CHECK: abs.s $f6, $f7 # encoding: [0x85,0x39,0x00,0x46]
+# CHECK: add.d $f8, $f12, $f14 # encoding: [0x00,0x62,0x2e,0x46]
+# CHECK: add.s $f9, $f6, $f7 # encoding: [0x40,0x32,0x07,0x46]
+# CHECK: floor.w.d $f12, $f14 # encoding: [0x0f,0x73,0x20,0x46]
+# CHECK: floor.w.s $f6, $f7 # encoding: [0x8f,0x39,0x00,0x46]
+# CHECK: ceil.w.d $f12, $f14 # encoding: [0x0e,0x73,0x20,0x46]
+# CHECK: ceil.w.s $f6, $f7 # encoding: [0x8e,0x39,0x00,0x46]
+# CHECK: mul.d $f8, $f12, $f14 # encoding: [0x02,0x62,0x2e,0x46]
+# CHECK: mul.s $f9, $f6, $f7 # encoding: [0x42,0x32,0x07,0x46]
+# CHECK: neg.d $f12, $f14 # encoding: [0x07,0x73,0x20,0x46]
+# CHECK: neg.s $f6, $f7 # encoding: [0x87,0x39,0x00,0x46]
+# CHECK: round.w.d $f12, $f14 # encoding: [0x0c,0x73,0x20,0x46]
+# CHECK: round.w.s $f6, $f7 # encoding: [0x8c,0x39,0x00,0x46]
+# CHECK: sqrt.d $f12, $f14 # encoding: [0x04,0x73,0x20,0x46]
+# CHECK: sqrt.s $f6, $f7 # encoding: [0x84,0x39,0x00,0x46]
+# CHECK: sub.d $f8, $f12, $f14 # encoding: [0x01,0x62,0x2e,0x46]
+# CHECK: sub.s $f9, $f6, $f7 # encoding: [0x41,0x32,0x07,0x46]
+# CHECK: trunc.w.d $f12, $f14 # encoding: [0x0d,0x73,0x20,0x46]
+# CHECK: trunc.w.s $f6, $f7 # encoding: [0x8d,0x39,0x00,0x46]
+
+ abs.d $f12,$f14
+ abs.s $f6,$f7
+ add.d $f8,$f12,$f14
+ add.s $f9,$f6,$f7
+ floor.w.d $f12,$f14
+ floor.w.s $f6,$f7
+ ceil.w.d $f12,$f14
+ ceil.w.s $f6,$f7
+ mul.d $f8,$f12,$f14
+ mul.s $f9,$f6, $f7
+ neg.d $f12,$f14
+ neg.s $f6,$f7
+ round.w.d $f12,$f14
+ round.w.s $f6,$f7
+ sqrt.d $f12,$f14
+ sqrt.s $f6,$f7
+ sub.d $f8,$f12,$f14
+ sub.s $f9,$f6,$f7
+ trunc.w.d $f12,$f14
+ trunc.w.s $f6,$f7
+
+#------------------------------------------------------------------------------
+# FP compare instructions
+#------------------------------------------------------------------------------
+
+# CHECK: c.eq.d $f12, $f14 # encoding: [0x32,0x60,0x2e,0x46]
+# CHECK: c.eq.s $f6, $f7 # encoding: [0x32,0x30,0x07,0x46]
+# CHECK: c.f.d $f12, $f14 # encoding: [0x30,0x60,0x2e,0x46]
+# CHECK: c.f.s $f6, $f7 # encoding: [0x30,0x30,0x07,0x46]
+# CHECK: c.le.d $f12, $f14 # encoding: [0x3e,0x60,0x2e,0x46]
+# CHECK: c.le.s $f6, $f7 # encoding: [0x3e,0x30,0x07,0x46]
+# CHECK: c.lt.d $f12, $f14 # encoding: [0x3c,0x60,0x2e,0x46]
+# CHECK: c.lt.s $f6, $f7 # encoding: [0x3c,0x30,0x07,0x46]
+# CHECK: c.nge.d $f12, $f14 # encoding: [0x3d,0x60,0x2e,0x46]
+# CHECK: c.nge.s $f6, $f7 # encoding: [0x3d,0x30,0x07,0x46]
+# CHECK: c.ngl.d $f12, $f14 # encoding: [0x3b,0x60,0x2e,0x46]
+# CHECK: c.ngl.s $f6, $f7 # encoding: [0x3b,0x30,0x07,0x46]
+# CHECK: c.ngle.d $f12, $f14 # encoding: [0x39,0x60,0x2e,0x46]
+# CHECK: c.ngle.s $f6, $f7 # encoding: [0x39,0x30,0x07,0x46]
+# CHECK: c.ngt.d $f12, $f14 # encoding: [0x3f,0x60,0x2e,0x46]
+# CHECK: c.ngt.s $f6, $f7 # encoding: [0x3f,0x30,0x07,0x46]
+# CHECK: c.ole.d $f12, $f14 # encoding: [0x36,0x60,0x2e,0x46]
+# CHECK: c.ole.s $f6, $f7 # encoding: [0x36,0x30,0x07,0x46]
+# CHECK: c.olt.d $f12, $f14 # encoding: [0x34,0x60,0x2e,0x46]
+# CHECK: c.olt.s $f6, $f7 # encoding: [0x34,0x30,0x07,0x46]
+# CHECK: c.seq.d $f12, $f14 # encoding: [0x3a,0x60,0x2e,0x46]
+# CHECK: c.seq.s $f6, $f7 # encoding: [0x3a,0x30,0x07,0x46]
+# CHECK: c.sf.d $f12, $f14 # encoding: [0x38,0x60,0x2e,0x46]
+# CHECK: c.sf.s $f6, $f7 # encoding: [0x38,0x30,0x07,0x46]
+# CHECK: c.ueq.d $f12, $f14 # encoding: [0x33,0x60,0x2e,0x46]
+# CHECK: c.ueq.s $f28, $f18 # encoding: [0x33,0xe0,0x12,0x46]
+# CHECK: c.ule.d $f12, $f14 # encoding: [0x37,0x60,0x2e,0x46]
+# CHECK: c.ule.s $f6, $f7 # encoding: [0x37,0x30,0x07,0x46]
+# CHECK: c.ult.d $f12, $f14 # encoding: [0x35,0x60,0x2e,0x46]
+# CHECK: c.ult.s $f6, $f7 # encoding: [0x35,0x30,0x07,0x46]
+# CHECK: c.un.d $f12, $f14 # encoding: [0x31,0x60,0x2e,0x46]
+# CHECK: c.un.s $f6, $f7 # encoding: [0x31,0x30,0x07,0x46]
+
+ c.eq.d $f12,$f14
+ c.eq.s $f6,$f7
+ c.f.d $f12,$f14
+ c.f.s $f6,$f7
+ c.le.d $f12,$f14
+ c.le.s $f6,$f7
+ c.lt.d $f12,$f14
+ c.lt.s $f6,$f7
+ c.nge.d $f12,$f14
+ c.nge.s $f6,$f7
+ c.ngl.d $f12,$f14
+ c.ngl.s $f6,$f7
+ c.ngle.d $f12,$f14
+ c.ngle.s $f6,$f7
+ c.ngt.d $f12,$f14
+ c.ngt.s $f6,$f7
+ c.ole.d $f12,$f14
+ c.ole.s $f6,$f7
+ c.olt.d $f12,$f14
+ c.olt.s $f6,$f7
+ c.seq.d $f12,$f14
+ c.seq.s $f6,$f7
+ c.sf.d $f12,$f14
+ c.sf.s $f6,$f7
+ c.ueq.d $f12,$f14
+ c.ueq.s $f28,$f18
+ c.ule.d $f12,$f14
+ c.ule.s $f6,$f7
+ c.ult.d $f12,$f14
+ c.ult.s $f6,$f7
+ c.un.d $f12,$f14
+ c.un.s $f6,$f7
+
+#------------------------------------------------------------------------------
+# FP convert instructions
+#------------------------------------------------------------------------------
+# CHECK: cvt.d.s $f6, $f7 # encoding: [0xa1,0x39,0x00,0x46]
+# CHECK: cvt.d.w $f12, $f14 # encoding: [0x21,0x73,0x80,0x46]
+# CHECK: cvt.s.d $f12, $f14 # encoding: [0x20,0x73,0x20,0x46]
+# CHECK: cvt.s.w $f6, $f7 # encoding: [0xa0,0x39,0x80,0x46]
+# CHECK: cvt.w.d $f12, $f14 # encoding: [0x24,0x73,0x20,0x46]
+# CHECK: cvt.w.s $f6, $f7 # encoding: [0xa4,0x39,0x00,0x46]
+
+ cvt.d.s $f6,$f7
+ cvt.d.w $f12,$f14
+ cvt.s.d $f12,$f14
+ cvt.s.w $f6,$f7
+ cvt.w.d $f12,$f14
+ cvt.w.s $f6,$f7
+
+#------------------------------------------------------------------------------
+# FP move instructions
+#------------------------------------------------------------------------------
+
+# CHECK: cfc1 $6, $fcc0 # encoding: [0x00,0x00,0x46,0x44]
+# CHECK: mfc1 $6, $f7 # encoding: [0x00,0x38,0x06,0x44]
+# CHECK: mfhi $5 # encoding: [0x10,0x28,0x00,0x00]
+# CHECK: mflo $5 # encoding: [0x12,0x28,0x00,0x00]
+# CHECK: mov.d $f6, $f8 # encoding: [0x86,0x41,0x20,0x46]
+# CHECK: mov.s $f6, $f7 # encoding: [0x86,0x39,0x00,0x46]
+# CHECK: mtc1 $6, $f7 # encoding: [0x00,0x38,0x86,0x44]
+# CHECK: mthi $7 # encoding: [0x11,0x00,0xe0,0x00]
+# CHECK: mtlo $7 # encoding: [0x13,0x00,0xe0,0x00]
+# CHECK: swc1 $f9, 9158($7) # encoding: [0xc6,0x23,0xe9,0xe4]
+
+ cfc1 $a2,$0
+ mfc1 $a2,$f7
+ mfhi $a1
+ mflo $a1
+ mov.d $f6,$f8
+ mov.s $f6,$f7
+ mtc1 $a2,$f7
+ mthi $a3
+ mtlo $a3
+ swc1 $f9,9158($a3)
diff --git a/test/MC/Mips/mips-jump-instructions.s b/test/MC/Mips/mips-jump-instructions.s
new file mode 100644
index 0000000..998be41
--- /dev/null
+++ b/test/MC/Mips/mips-jump-instructions.s
@@ -0,0 +1,72 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for jumps and branches.
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+#------------------------------------------------------------------------------
+# Branch instructions
+#------------------------------------------------------------------------------
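+# Each branch below is followed by an explicit nop to fill the MIPS branch
+# delay slot.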
+# CHECK: b 1332 # encoding: [0x34,0x05,0x00,0x10]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: bc1f 1332 # encoding: [0x34,0x05,0x00,0x45]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: bc1t 1332 # encoding: [0x34,0x05,0x01,0x45]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: beq $9, $6, 1332 # encoding: [0x34,0x05,0x26,0x11]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: bgez $6, 1332 # encoding: [0x34,0x05,0xc1,0x04]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: bgezal $6, 1332 # encoding: [0x34,0x05,0xd1,0x04]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: bgtz $6, 1332 # encoding: [0x34,0x05,0xc0,0x1c]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: blez $6, 1332 # encoding: [0x34,0x05,0xc0,0x18]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: bne $9, $6, 1332 # encoding: [0x34,0x05,0x26,0x15]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: bal 1332 # encoding: [0x34,0x05,0x00,0x04]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+ b 1332
+ nop
+ bc1f 1332
+ nop
+ bc1t 1332
+ nop
+ beq $9,$6,1332
+ nop
+ bgez $6,1332
+ nop
+ bgezal $6,1332
+ nop
+ bgtz $6,1332
+ nop
+ blez $6,1332
+ nop
+ bne $9,$6,1332
+ nop
+ bal 1332
+ nop
+
+end_of_code:
+#------------------------------------------------------------------------------
+# Jump instructions
+#------------------------------------------------------------------------------
+# CHECK: j 1328 # encoding: [0x30,0x05,0x00,0x08]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: jal 1328 # encoding: [0x30,0x05,0x00,0x0c]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: jalr $6 # encoding: [0x09,0xf8,0xc0,0x00]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: jr $7 # encoding: [0x08,0x00,0xe0,0x00]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: jr $7 # encoding: [0x08,0x00,0xe0,0x00]
+
+
+ j 1328
+ nop
+ jal 1328
+ nop
+ jalr $6
+ nop
+ jr $7
+ nop
+ j $7
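+
+# Note: j with a register operand is accepted and assembled as jr, matching
+# the final CHECK line above.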
diff --git a/test/MC/Mips/mips-memory-instructions.s b/test/MC/Mips/mips-memory-instructions.s
new file mode 100644
index 0000000..b5f1267
--- /dev/null
+++ b/test/MC/Mips/mips-memory-instructions.s
@@ -0,0 +1,45 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for loads and stores.
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+#------------------------------------------------------------------------------
+# Memory store instructions
+#------------------------------------------------------------------------------
+# CHECK: sb $4, 16($5) # encoding: [0x10,0x00,0xa4,0xa0]
+# CHECK: sc $4, 16($5) # encoding: [0x10,0x00,0xa4,0xe0]
+# CHECK: sh $4, 16($5) # encoding: [0x10,0x00,0xa4,0xa4]
+# CHECK: sw $4, 16($5) # encoding: [0x10,0x00,0xa4,0xac]
+# CHECK: sw $7, 0($5) # encoding: [0x00,0x00,0xa7,0xac]
+# CHECK: swc1 $f2, 16($5) # encoding: [0x10,0x00,0xa2,0xe4]
+# CHECK: swl $4, 16($5) # encoding: [0x10,0x00,0xa4,0xa8]
+ sb $4, 16($5)
+ sc $4, 16($5)
+ sh $4, 16($5)
+ sw $4, 16($5)
+ sw $7, ($5)
+ swc1 $f2, 16($5)
+ swl $4, 16($5)
+
+#------------------------------------------------------------------------------
+# Memory load instructions
+#------------------------------------------------------------------------------
+
+# CHECK: lb $4, 4($5) # encoding: [0x04,0x00,0xa4,0x80]
+# CHECK: lw $4, 4($5) # encoding: [0x04,0x00,0xa4,0x8c]
+# CHECK: lbu $4, 4($5) # encoding: [0x04,0x00,0xa4,0x90]
+# CHECK: lh $4, 4($5) # encoding: [0x04,0x00,0xa4,0x84]
+# CHECK: lhu $4, 4($5) # encoding: [0x04,0x00,0xa4,0x94]
+# CHECK: ll $4, 4($5) # encoding: [0x04,0x00,0xa4,0xc0]
+# CHECK: lw $4, 4($5) # encoding: [0x04,0x00,0xa4,0x8c]
+# CHECK: lw $7, 0($7) # encoding: [0x00,0x00,0xe7,0x8c]
+# CHECK: lw $2, 16($sp) # encoding: [0x10,0x00,0xa2,0x8f]
+
+ lb $4, 4($5)
+ lw $4, 4($5)
+ lbu $4, 4($5)
+ lh $4, 4($5)
+ lhu $4, 4($5)
+ ll $4, 4($5)
+ lw $4, 4($5)
+ lw $7, ($7)
+ lw $2, 16($sp)
diff --git a/test/MC/Mips/mips-relocations.s b/test/MC/Mips/mips-relocations.s
new file mode 100644
index 0000000..ff71c75
--- /dev/null
+++ b/test/MC/Mips/mips-relocations.s
@@ -0,0 +1,41 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for relocations.
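+# The 'A' bytes in each encoding below are fixup placeholders; the line that
+# follows names the relocation kind that will fill them in.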
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+# CHECK: lui $2, %hi(_gp_disp) # encoding: [A,A,0x02,0x3c]
+# CHECK: # fixup A - offset: 0, value: _gp_disp@ABS_HI, kind: fixup_Mips_HI16
+# CHECK: addiu $2, $2, %lo(_gp_disp) # encoding: [A,A,0x42,0x24]
+# CHECK: # fixup A - offset: 0, value: _gp_disp@ABS_LO, kind: fixup_Mips_LO16
+# CHECK: lw $25, %call16(strchr)($gp) # encoding: [A,A,0x99,0x8f]
+# CHECK: # fixup A - offset: 0, value: strchr@GOT_CALL, kind: fixup_Mips_CALL16
+# CHECK: lw $3, %got(loop_1)($2) # encoding: [A,A,0x43,0x8c]
+# CHECK: # fixup A - offset: 0, value: loop_1@GOT, kind: fixup_Mips_GOT_Local
+# CHECK: lui $2, %dtprel_hi(_gp_disp) # encoding: [A,A,0x02,0x3c]
+# CHECK: # fixup A - offset: 0, value: _gp_disp@DTPREL_HI, kind: fixup_Mips_DTPREL_HI
+# CHECK: addiu $2, $2, %dtprel_hi(_gp_disp) # encoding: [A,A,0x42,0x24]
+# CHECK: # fixup A - offset: 0, value: _gp_disp@DTPREL_HI, kind: fixup_Mips_DTPREL_HI
+# CHECK: lw $3, %got(loop_1)($2) # encoding: [A,A,0x43,0x8c]
+# CHECK: # fixup A - offset: 0, value: loop_1@GOT, kind: fixup_Mips_GOT_Local
+# CHECK: lw $4, %got_disp(loop_2)($3) # encoding: [A,A,0x64,0x8c]
+# CHECK: # fixup A - offset: 0, value: loop_2@GOT_DISP, kind: fixup_Mips_GOT_DISP
+# CHECK: lw $5, %got_page(loop_3)($4) # encoding: [A,A,0x85,0x8c]
+# CHECK: # fixup A - offset: 0, value: loop_3@GOT_PAGE, kind: fixup_Mips_GOT_PAGE
+# CHECK: lw $6, %got_ofst(loop_4)($5) # encoding: [A,A,0xa6,0x8c]
+# CHECK: # fixup A - offset: 0, value: loop_4@GOT_OFST, kind: fixup_Mips_GOT_OFST
+# CHECK: lui $2, %tprel_hi(_gp_disp) # encoding: [A,A,0x02,0x3c]
+# CHECK: # fixup A - offset: 0, value: _gp_disp@TPREL_HI, kind: fixup_Mips_TPREL_HI
+# CHECK: addiu $2, $2, %tprel_lo(_gp_disp) # encoding: [A,A,0x42,0x24]
+# CHECK: # fixup A - offset: 0, value: _gp_disp@TPREL_LO, kind: fixup_Mips_TPREL_LO
+
+ lui $2, %hi(_gp_disp)
+ addiu $2, $2, %lo(_gp_disp)
+ lw $25, %call16(strchr)($gp)
+ lw $3, %got(loop_1)($2)
+ lui $2, %dtprel_hi(_gp_disp)
+ addiu $2, $2, %dtprel_hi(_gp_disp)
+ lw $3, %got(loop_1)($2)
+ lw $4, %got_disp(loop_2)($3)
+ lw $5, %got_page(loop_3)($4)
+ lw $6, %got_ofst(loop_4)($5)
+ lui $2, %tprel_hi(_gp_disp)
+ addiu $2, $2, %tprel_lo(_gp_disp)
diff --git a/test/MC/Mips/mips64extins.ll b/test/MC/Mips/mips64extins.ll
new file mode 100644
index 0000000..ebe8f86
--- /dev/null
+++ b/test/MC/Mips/mips64extins.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -mattr=n64 %s -o - \
+; RUN: | llvm-objdump -disassemble -triple mips64el -mattr +mips64r2 - \
+; RUN: | FileCheck %s
+
+define i64 @dext(i64 %i) nounwind readnone {
+entry:
+; CHECK: dext ${{[0-9]+}}, ${{[0-9]+}}, 5, 10
+ %shr = lshr i64 %i, 5
+ %and = and i64 %shr, 1023
+ ret i64 %and
+}
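+; 1023 is 2^10 - 1, so (%i lshr 5) masked with it keeps 10 bits starting at
+; bit 5, which is exactly dext with pos 5 and size 10.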
+
+define i64 @dextu(i64 %i) nounwind readnone {
+entry:
+; CHECK: dextu ${{[0-9]+}}, ${{[0-9]+}}, 2, 6
+ %shr = lshr i64 %i, 34
+ %and = and i64 %shr, 63
+ ret i64 %and
+}
+
+define i64 @dextm(i64 %i) nounwind readnone {
+entry:
+; CHECK: dextm ${{[0-9]+}}, ${{[0-9]+}}, 5, 2
+ %shr = lshr i64 %i, 5
+ %and = and i64 %shr, 17179869183
+ ret i64 %and
+}
+
+define i64 @dins(i64 %i, i64 %j) nounwind readnone {
+entry:
+; CHECK: dins ${{[0-9]+}}, ${{[0-9]+}}, 8, 10
+ %shl2 = shl i64 %j, 8
+ %and = and i64 %shl2, 261888
+ %and3 = and i64 %i, -261889
+ %or = or i64 %and3, %and
+ ret i64 %or
+}
+
+define i64 @dinsm(i64 %i, i64 %j) nounwind readnone {
+entry:
+; CHECK: dinsm ${{[0-9]+}}, ${{[0-9]+}}, 10, 1
+ %shl4 = shl i64 %j, 10
+ %and = and i64 %shl4, 8796093021184
+ %and5 = and i64 %i, -8796093021185
+ %or = or i64 %and5, %and
+ ret i64 %or
+}
+
+define i64 @dinsu(i64 %i, i64 %j) nounwind readnone {
+entry:
+; CHECK: dinsu ${{[0-9]+}}, ${{[0-9]+}}, 8, 13
+ %shl4 = shl i64 %j, 40
+ %and = and i64 %shl4, 9006099743113216
+ %and5 = and i64 %i, -9006099743113217
+ %or = or i64 %and5, %and
+ ret i64 %or
+}
diff --git a/test/MC/Mips/mips64shift.ll b/test/MC/Mips/mips64shift.ll
index 7817b96..99cac7b 100644
--- a/test/MC/Mips/mips64shift.ll
+++ b/test/MC/Mips/mips64shift.ll
@@ -1,5 +1,8 @@
-; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 %s -o - | llvm-objdump -disassemble -triple mips64el - | FileCheck %s
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -disable-mips-delay-filler %s -o - \
+; RUN: | llvm-objdump -disassemble -triple mips64el - | FileCheck %s
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 %s -o - \
+; RUN: | llvm-objdump -disassemble -triple mips64el - | FileCheck %s
define i64 @f3(i64 %a0) nounwind readnone {
entry:
diff --git a/test/MC/Mips/mips_directives.s b/test/MC/Mips/mips_directives.s
new file mode 100644
index 0000000..f9d8460
--- /dev/null
+++ b/test/MC/Mips/mips_directives.s
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -triple mips-unknown-unknown %s
+
+$BB0_2:
+ .frame $sp,0,$ra
+ .mask 0x00000000,0
+ .fmask 0x00000000,0
+ .set noreorder
+ .set nomacro
+$JTI0_0:
+ .gpword ($BB0_2)
diff --git a/test/MC/Mips/multi-64bit-func.ll b/test/MC/Mips/multi-64bit-func.ll
index 6e0d784..83577aa 100644
--- a/test/MC/Mips/multi-64bit-func.ll
+++ b/test/MC/Mips/multi-64bit-func.ll
@@ -1,8 +1,8 @@
; There is no real check here. The test passes as long as
; it does not hit an assertion.
-; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 < %s
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -disable-mips-delay-filler < %s
; Run it again without extra nop in delay slot
-; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -enable-mips-delay-filler < %s
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 < %s
define i32 @bosco1(i32 %x) nounwind readnone {
entry:
diff --git a/test/MC/X86/intel-syntax-2.s b/test/MC/X86/intel-syntax-2.s
index ca4afc3..c377e1a 100644
--- a/test/MC/X86/intel-syntax-2.s
+++ b/test/MC/X86/intel-syntax-2.s
@@ -1,7 +1,9 @@
-// RUN: llvm-mc -triple x86_64-unknown-unknown %s | FileCheck %s
+// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=att %s | FileCheck %s
.intel_syntax
_test:
// CHECK: movl $257, -4(%rsp)
mov DWORD PTR [RSP - 4], 257
-
+ .att_syntax
+// CHECK: movl $257, -4(%rsp)
+ movl $257, -4(%rsp)
\ No newline at end of file
diff --git a/test/MC/X86/x86-64.s b/test/MC/X86/x86-64.s
index 6a2d5bb..03cb62e 100644
--- a/test/MC/X86/x86-64.s
+++ b/test/MC/X86/x86-64.s
@@ -1164,6 +1164,10 @@
// CHECK: encoding: [0x66,0x48,0x0f,0x6e,0xc7]
movd %rdi,%xmm0
+// CHECK: movd %xmm0, %rax
+// CHECK: encoding: [0x66,0x48,0x0f,0x7e,0xc0]
+ movd %xmm0, %rax
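+// (The 0x48 REX.W prefix selects the 64-bit xmm-to-GPR form.)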
+
// CHECK: movntil %eax, (%rdi)
// CHECK: encoding: [0x0f,0xc3,0x07]
// CHECK: movntil
diff --git a/test/Makefile b/test/Makefile
index 9ddfabf..ae7a674 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -131,7 +131,7 @@
@$(ECHOPATH) s=@SHLIBDIR@=$(SharedLibDir)=g >> lit.tmp
@$(ECHOPATH) s=@SHLIBEXT@=$(SHLIBEXT)=g >> lit.tmp
@$(ECHOPATH) s=@PYTHON_EXECUTABLE@=python=g >> lit.tmp
- @$(ECHOPATH) s,@OCAMLOPT@,$(OCAMLOPT) -cc \\\\\"$(CXX_FOR_OCAMLOPT)\\\\\" -I $(LibDir)/ocaml,g >> lit.tmp
+ @$(ECHOPATH) s=@OCAMLOPT@=$(OCAMLOPT) -cc $(subst *,'\\\"',*$(subst =,"\\=",$(CXX_FOR_OCAMLOPT))*) -I $(LibDir)/ocaml=g >> lit.tmp
@$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> lit.tmp
@$(ECHOPATH) s=@ENABLE_ASSERTIONS@=$(ENABLE_ASSERTIONS)=g >> lit.tmp
@$(ECHOPATH) s=@TARGETS_TO_BUILD@=$(TARGETS_TO_BUILD)=g >> lit.tmp
diff --git a/test/Object/Inputs/dext-test.elf-mips64r2 b/test/Object/Inputs/dext-test.elf-mips64r2
new file mode 100644
index 0000000..59dbaef
--- /dev/null
+++ b/test/Object/Inputs/dext-test.elf-mips64r2
Binary files differ
diff --git a/test/Object/Inputs/relocations.elf-x86-64 b/test/Object/Inputs/relocations.elf-x86-64
new file mode 100644
index 0000000..6e340c7
--- /dev/null
+++ b/test/Object/Inputs/relocations.elf-x86-64
Binary files differ
diff --git a/test/Object/Mips/feature.test b/test/Object/Mips/feature.test
new file mode 100644
index 0000000..e8da609
--- /dev/null
+++ b/test/Object/Mips/feature.test
@@ -0,0 +1,11 @@
+RUN: llvm-objdump -disassemble -triple mips64el -mattr +mips64r2 %p/../Inputs/dext-test.elf-mips64r2 \
+RUN: | FileCheck %s
+
+CHECK: Disassembly of section .text:
+CHECK: .text:
+CHECK: 0: 08 00 e0 03 jr $ra
+CHECK: 4: 43 49 82 7c dext $2, $4, 5, 10
+CHECK: 8: 08 00 e0 03 jr $ra
+CHECK: c: 83 28 82 7c dext $2, $4, 2, 6
+CHECK: 10: 08 00 e0 03 jr $ra
+CHECK: 14: 43 09 82 7c dext $2, $4, 5, 2
diff --git a/test/Object/Mips/lit.local.cfg b/test/Object/Mips/lit.local.cfg
new file mode 100644
index 0000000..1499317
--- /dev/null
+++ b/test/Object/Mips/lit.local.cfg
@@ -0,0 +1,5 @@
+config.suffixes = ['.test']
+
+targets = set(config.root.targets_to_build.split())
+if 'Mips' not in targets:
+ config.unsupported = True
diff --git a/test/Object/objdump-relocations.test b/test/Object/objdump-relocations.test
index a394a23..6d35a26 100644
--- a/test/Object/objdump-relocations.test
+++ b/test/Object/objdump-relocations.test
@@ -9,6 +9,9 @@
RUN: llvm-objdump -r %p/Inputs/trivial-object-test.elf-hexagon \
RUN: | FileCheck %s -check-prefix ELF-hexagon
+RUN: llvm-objdump -r %p/Inputs/relocations.elf-x86-64 \
+RUN: | FileCheck %s -check-prefix ELF-complex-x86-64
+
COFF-i386: .text
COFF-i386: IMAGE_REL_I386_DIR32 L_.str
COFF-i386: IMAGE_REL_I386_REL32 _puts
@@ -36,3 +39,13 @@
ELF-hexagon: R_HEX_LO16 puts
ELF-hexagon: R_HEX_B15_PCREL testf
ELF-hexagon: R_HEX_B22_PCREL puts
+
+ELF-complex-x86-64: .text
+ELF-complex-x86-64-NEXT: R_X86_64_8 .data-4
+ELF-complex-x86-64-NEXT: R_X86_64_16 .data-4
+ELF-complex-x86-64-NEXT: R_X86_64_32 .data-4
+ELF-complex-x86-64-NEXT: R_X86_64_32S .data-4
+ELF-complex-x86-64-NEXT: R_X86_64_64 .data-4
+ELF-complex-x86-64-NEXT: R_X86_64_PC32 .data-4-P
+ELF-complex-x86-64-NEXT: R_X86_64_32 .data+0
+ELF-complex-x86-64-NEXT: R_X86_64_32 .data+4
diff --git a/test/TableGen/if.td b/test/TableGen/if.td
index 18de368..1d8d623 100644
--- a/test/TableGen/if.td
+++ b/test/TableGen/if.td
@@ -3,15 +3,59 @@
// Support for an `!if' operator as part of a `let' statement.
// CHECK: class C
-// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, ?, ?, ?, !if({ C:x{2} }, 0, 1), !if({ C:x{2} }, 1, 1), !if({ C:x{2} }, 0, 0), !if({ C:x{1} }, C:y{3}, 0), !if({ C:x{1} }, C:y{2}, 1), !if({ C:x{0} }, C:y{3}, C:z), !if({ C:x{0} }, C:y{2}, C:y{2}), !if({ C:x{0} }, C:y{1}, C:y{1}), !if({ C:x{0} }, C:y{0}, C:y{0}) };
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, !if({ C:y{3} }, 1, !if({ C:y{2} }, { C:x{0} }, !if({ C:y{1} }, { C:x{1} }, !if({ C:y{0} }, { C:x{2} }, ?)))){0}, !if({ C:x{2} }, { C:y{3}, C:y{2} }, !if({ C:x{1} }, { C:y{2}, C:y{1} }, !if({ C:x{0} }, { C:y{1}, C:y{0} }, ?))){1}, !if({ C:x{2} }, { C:y{3}, C:y{2} }, !if({ C:x{1} }, { C:y{2}, C:y{1} }, !if({ C:x{0} }, { C:y{1}, C:y{0} }, ?))){0}, !if({ C:x{2} }, 2, 6){2}, !if({ C:x{2} }, 2, 6){1}, !if({ C:x{2} }, 2, 6){0}, !if({ C:x{1} }, { C:y{3}, C:y{2} }, { 0, 1 }){1}, !if({ C:x{1} }, { C:y{3}, C:y{2} }, { 0, 1 }){0}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){3}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){2}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){1}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){0} };
class C<bits<3> x, bits<4> y, bit z> {
bits<16> n;
+ let n{11} = !if(y{3}, 1,
+ !if(y{2}, x{0},
+ !if(y{1}, x{1},
+ !if(y{0}, x{2}, ?))));
+ let n{10-9}= !if(x{2}, y{3-2},
+ !if(x{1}, y{2-1},
+ !if(x{0}, y{1-0}, ?)));
let n{8-6} = !if(x{2}, 0b010, 0b110);
let n{5-4} = !if(x{1}, y{3-2}, {0, 1});
let n{3-0} = !if(x{0}, y{3-0}, {z, y{2}, y{1}, y{0}});
}
+def C1 : C<{1, 0, 1}, {0, 1, 0, 1}, 0>;
+def C2 : C<{0, 1, 0}, {1, 0, 1, 0}, 1>;
+def C3 : C<{0, 0, 0}, {1, 0, 1, 0}, 0>;
+def C4 : C<{0, 0, 0}, {0, 0, 0, 0}, 0>;
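+
+// For example, C1 has x = {1, 0, 1}: x{2} is 1, so
+// n{8-6} = !if(x{2}, 0b010, 0b110) evaluates to 0b010, visible as bits 8-6 of
+// C1's value below.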
+
+// CHECK: def C1
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1 };
+// CHECK: def C2
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0 };
+// CHECK: def C3
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, 1, ?, ?, 1, 1, 0, 0, 1, 0, 0, 1, 0 };
+// CHECK: def C4
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, ?, ?, ?, 1, 1, 0, 0, 1, 0, 0, 0, 0 };
+
+class S<int s> {
+ bits<2> val = !if(!eq(s, 8), {0, 0},
+ !if(!eq(s, 16), 0b01,
+ !if(!eq(s, 32), 2,
+ !if(!eq(s, 64), {1, 1}, ?))));
+}
+
+def D8 : S<8>;
+def D16 : S<16>;
+def D32 : S<32>;
+def D64 : S<64>;
+def D128: S<128>;
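+// 128 matches none of the !eq cases, so D128's val falls through to the final
+// ? and stays uninitialized: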
+// CHECK: def D128
+// CHECK-NEXT: bits<2> val = { ?, ? };
+// CHECK: def D16
+// CHECK-NEXT: bits<2> val = { 0, 1 };
+// CHECK: def D32
+// CHECK-NEXT: bits<2> val = { 1, 0 };
+// CHECK: def D64
+// CHECK-NEXT: bits<2> val = { 1, 1 };
+// CHECK: def D8
+// CHECK-NEXT: bits<2> val = { 0, 0 };
+
// CHECK: def One
// CHECK-NEXT: list<int> first = [1, 2, 3];
// CHECK-NEXT: list<int> rest = [1, 2, 3];
diff --git a/test/TableGen/list-element-bitref.td b/test/TableGen/list-element-bitref.td
new file mode 100644
index 0000000..5f3e3da
--- /dev/null
+++ b/test/TableGen/list-element-bitref.td
@@ -0,0 +1,15 @@
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+class C<list<bits<8>> L> {
+ bits<2> V0 = L[0]{1-0};
+ bits<2> V1 = L[1]{3-2};
+ string V2 = !if(L[0]{0}, "Odd", "Even");
+}
+
+def c0 : C<[0b0101, 0b1010]>;
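+
+// L[0] = 0b0101, so L[0]{1-0} = { 0, 1 }; L[1] = 0b1010, so L[1]{3-2} = { 1, 0 };
+// and L[0]{0} = 1 selects "Odd".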
+
+// CHECK: def c0
+// CHECK-NEXT: bits<2> V0 = { 0, 1 };
+// CHECK-NEXT: bits<2> V1 = { 1, 0 };
+// CHECK-NEXT: string V2 = "Odd";
diff --git a/test/TableGen/pr8330.td b/test/TableGen/pr8330.td
new file mode 100644
index 0000000..7779b63
--- /dev/null
+++ b/test/TableGen/pr8330.td
@@ -0,0 +1,29 @@
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+class Or4<bits<8> Val> {
+ bits<8> V = {Val{7}, Val{6}, Val{5}, Val{4}, Val{3}, 1, Val{1}, Val{0} };
+}
+
+class Whatev<bits<8> x>;
+
+class Whatever<bits<8> x> {
+ bits<8> W = {x{0}, x{1}, x{2}, x{3}, x{4}, x{5}, x{6}, x{7} };
+}
+
+multiclass X<bits<8> BaseOpc> {
+ def bar : Whatev<Or4<BaseOpc>.V >;
+}
+
+multiclass Y<bits<8> BaseOpc> {
+ def foo : Whatever<Or4<BaseOpc>.V >;
+}
+
+defm a : X<4>;
+
+// CHECK: def abar
+
+defm b : Y<8>;
+
+// CHECK: def bfoo
+// CHECK-NEXT: bits<8> W = { 0, 0, 1, 1, 0, 0, 0, 0 };
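+
+// (Or4<8>.V = 0b00001100, since bit 2 is forced to 1; Whatever then reverses
+// the bit order, giving the W value checked above.)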
diff --git a/test/Transforms/DeadStoreElimination/simple.ll b/test/Transforms/DeadStoreElimination/simple.ll
index 7a8cdd5..e0eb90a 100644
--- a/test/Transforms/DeadStoreElimination/simple.ll
+++ b/test/Transforms/DeadStoreElimination/simple.ll
@@ -310,3 +310,17 @@
store i32 %c, i32* %4, align 4
ret void
}
+
+; Check another case like PR13547 where strdup is not like malloc.
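+; strdup reads the string it copies, so the store of 0 through %p.4 is
+; observable by the call and must not be treated as dead.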
+; CHECK: @test25
+; CHECK: load i8
+; CHECK: store i8 0
+; CHECK: store i8 %tmp
+define i8* @test25(i8* %p) nounwind {
+ %p.4 = getelementptr i8* %p, i64 4
+ %tmp = load i8* %p.4, align 1
+ store i8 0, i8* %p.4, align 1
+ %q = call i8* @strdup(i8* %p) nounwind optsize
+ store i8 %tmp, i8* %p.4, align 1
+ ret i8* %q
+}
diff --git a/test/Transforms/GVN/malloc-load-removal.ll b/test/Transforms/GVN/malloc-load-removal.ll
new file mode 100644
index 0000000..66b6929
--- /dev/null
+++ b/test/Transforms/GVN/malloc-load-removal.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -basicaa -gvn < %s | FileCheck %s
+; RUN: opt -S -basicaa -gvn -disable-simplify-libcalls < %s | FileCheck %s -check-prefix=CHECK_NO_LIBCALLS
+; PR13694
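+; The load is fed directly by a fresh malloc. With libcall semantics enabled,
+; GVN may treat the newly allocated memory as uninitialized and fold away the
+; load and the compare; with -disable-simplify-libcalls they must remain.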
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+declare i8* @malloc(i64) nounwind
+
+define noalias i8* @test() nounwind uwtable ssp {
+entry:
+ %call = tail call i8* @malloc(i64 100) nounwind
+ %0 = load i8* %call, align 1
+ %tobool = icmp eq i8 %0, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ store i8 0, i8* %call, align 1
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret i8* %call
+
+; CHECK: @test
+; CHECK-NOT: load
+; CHECK-NOT: icmp
+
+; CHECK_NO_LIBCALLS: @test
+; CHECK_NO_LIBCALLS: load
+; CHECK_NO_LIBCALLS: icmp
+}
diff --git a/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll b/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll
new file mode 100644
index 0000000..4efaf8c
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll
@@ -0,0 +1,57 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; rdar://12182093
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
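+; (x >>u 2) /u 100 can be merged into x /u 400: for unsigned division,
+; floor(floor(x/4)/100) == floor(x/400). An ashr gives no such guarantee,
+; so the @udiv400_no case below must not be folded.
+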
+; CHECK: @udiv400
+; CHECK: udiv i32 %x, 400
+; CHECK: ret
+define i32 @udiv400(i32 %x) {
+entry:
+ %div = lshr i32 %x, 2
+ %div1 = udiv i32 %div, 100
+ ret i32 %div1
+}
+
+
+; CHECK: @udiv400_no
+; CHECK: ashr
+; CHECK: div
+; CHECK: ret
+define i32 @udiv400_no(i32 %x) {
+entry:
+ %div = ashr i32 %x, 2
+ %div1 = udiv i32 %div, 100
+ ret i32 %div1
+}
+
+; CHECK: @sdiv400_yes
+; CHECK: udiv i32 %x, 400
+; CHECK: ret
+define i32 @sdiv400_yes(i32 %x) {
+entry:
+ %div = lshr i32 %x, 2
+ ; The sign bits of both operands are zero (i.e. we can prove they are
+ ; unsigned inputs), so this sdiv is first turned into a udiv and then
+ ; optimized just like the udiv cases above.
+ %div1 = sdiv i32 %div, 100
+ ret i32 %div1
+}
+
+
+; CHECK: @udiv_i80
+; CHECK: udiv i80 %x, 400
+; CHECK: ret
+define i80 @udiv_i80(i80 %x) {
+ %div = lshr i80 %x, 2
+ %div1 = udiv i80 %div, 100
+ ret i80 %div1
+}
+
+define i32 @no_crash_notconst_udiv(i32 %x, i32 %notconst) {
+ %div = lshr i32 %x, %notconst
+ %div1 = udiv i32 %div, 100
+ ret i32 %div1
+}
diff --git a/test/Transforms/InstCombine/fold-vector-select.ll b/test/Transforms/InstCombine/fold-vector-select.ll
index 3f22522..2cb970b 100644
--- a/test/Transforms/InstCombine/fold-vector-select.ll
+++ b/test/Transforms/InstCombine/fold-vector-select.ll
@@ -1,13 +1,148 @@
; RUN: opt < %s -instcombine -S | not grep select
-define void @foo(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C, <4 x i32> *%D) {
- %r = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> zeroinitializer
- %g = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 6, i32 9, i32 1>
- %b = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 7, i32 1, i32 4, i32 9>
- %a = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 8, i32 5>
- store <4 x i32> %r, <4 x i32>* %A
- store <4 x i32> %g, <4 x i32>* %B
- store <4 x i32> %b, <4 x i32>* %C
- store <4 x i32> %a, <4 x i32>* %D
+define void @foo(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C, <4 x i32> *%D,
+ <4 x i32> *%E, <4 x i32> *%F, <4 x i32> *%G, <4 x i32> *%H,
+ <4 x i32> *%I, <4 x i32> *%J, <4 x i32> *%K, <4 x i32> *%L,
+ <4 x i32> *%M, <4 x i32> *%N, <4 x i32> *%O, <4 x i32> *%P,
+ <4 x i32> *%Q, <4 x i32> *%R, <4 x i32> *%S, <4 x i32> *%T,
+ <4 x i32> *%U, <4 x i32> *%V, <4 x i32> *%W, <4 x i32> *%X,
+ <4 x i32> *%Y, <4 x i32> *%Z, <4 x i32> *%BA, <4 x i32> *%BB,
+ <4 x i32> *%BC, <4 x i32> *%BD, <4 x i32> *%BE, <4 x i32> *%BF,
+ <4 x i32> *%BG, <4 x i32> *%BH, <4 x i32> *%BI, <4 x i32> *%BJ,
+ <4 x i32> *%BK, <4 x i32> *%BL, <4 x i32> *%BM, <4 x i32> *%BN,
+ <4 x i32> *%BO, <4 x i32> *%BP, <4 x i32> *%BQ, <4 x i32> *%BR,
+ <4 x i32> *%BS, <4 x i32> *%BT, <4 x i32> *%BU, <4 x i32> *%BV,
+ <4 x i32> *%BW, <4 x i32> *%BX, <4 x i32> *%BY, <4 x i32> *%BZ,
+ <4 x i32> *%CA, <4 x i32> *%CB, <4 x i32> *%CC, <4 x i32> *%CD,
+ <4 x i32> *%CE, <4 x i32> *%CF, <4 x i32> *%CG, <4 x i32> *%CH,
+ <4 x i32> *%CI, <4 x i32> *%CJ, <4 x i32> *%CK, <4 x i32> *%CL) {
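+ ; Every select below has a constant <4 x i1> condition, so instcombine
+ ; folds each one to a constant vector; the RUN line ensures none survive.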
+ %a = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 9, i32 87, i32 57, i32 8>
+ %b = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 44, i32 99, i32 49, i32 29>
+ %c = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 15, i32 18, i32 53, i32 84>
+ %d = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 29, i32 82, i32 45, i32 16>
+ %e = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 11, i32 15, i32 32, i32 99>
+ %f = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 19, i32 86, i32 29, i32 33>
+ %g = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 44, i32 10, i32 26, i32 45>
+ %h = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 88, i32 70, i32 90, i32 48>
+ %i = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 30, i32 53, i32 42, i32 12>
+ %j = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 46, i32 24, i32 93, i32 26>
+ %k = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 33, i32 99, i32 15, i32 57>
+ %l = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 51, i32 60, i32 60, i32 50>
+ %m = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 50, i32 12, i32 7, i32 45>
+ %n = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 15, i32 65, i32 36, i32 36>
+ %o = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 54, i32 0, i32 17, i32 78>
+ %p = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 56, i32 13, i32 64, i32 48>
+ %q = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> <i32 52, i32 69, i32 88, i32 11>, <4 x i32> zeroinitializer
+ %r = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> <i32 5, i32 87, i32 68, i32 14>, <4 x i32> zeroinitializer
+ %s = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> <i32 47, i32 17, i32 66, i32 63>, <4 x i32> zeroinitializer
+ %t = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> <i32 64, i32 25, i32 73, i32 81>, <4 x i32> zeroinitializer
+ %u = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> <i32 51, i32 41, i32 61, i32 63>, <4 x i32> zeroinitializer
+ %v = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> <i32 39, i32 59, i32 17, i32 0>, <4 x i32> zeroinitializer
+ %w = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> <i32 91, i32 99, i32 97, i32 29>, <4 x i32> zeroinitializer
+ %x = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> <i32 89, i32 45, i32 89, i32 10>, <4 x i32> zeroinitializer
+ %y = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> <i32 25, i32 70, i32 21, i32 27>, <4 x i32> zeroinitializer
+ %z = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> <i32 40, i32 12, i32 27, i32 88>, <4 x i32> zeroinitializer
+ %ba = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> <i32 36, i32 35, i32 90, i32 23>, <4 x i32> zeroinitializer
+ %bb = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> <i32 83, i32 3, i32 64, i32 82>, <4 x i32> zeroinitializer
+ %bc = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> <i32 15, i32 72, i32 2, i32 54>, <4 x i32> zeroinitializer
+ %bd = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> <i32 32, i32 47, i32 100, i32 84>, <4 x i32> zeroinitializer
+ %be = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> <i32 92, i32 57, i32 82, i32 1>, <4 x i32> zeroinitializer
+ %bf = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> <i32 42, i32 14, i32 22, i32 89>, <4 x i32> zeroinitializer
+ %bg = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> <i32 33, i32 10, i32 67, i32 66>, <4 x i32> <i32 42, i32 91, i32 47, i32 40>
+ %bh = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> <i32 8, i32 13, i32 48, i32 0>, <4 x i32> <i32 84, i32 66, i32 87, i32 84>
+ %bi = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> <i32 85, i32 96, i32 1, i32 94>, <4 x i32> <i32 54, i32 57, i32 7, i32 92>
+ %bj = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> <i32 55, i32 21, i32 92, i32 68>, <4 x i32> <i32 51, i32 61, i32 62, i32 39>
+ %bk = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> <i32 42, i32 18, i32 77, i32 74>, <4 x i32> <i32 82, i32 33, i32 30, i32 7>
+ %bl = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> <i32 80, i32 92, i32 61, i32 84>, <4 x i32> <i32 43, i32 89, i32 92, i32 6>
+ %bm = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> <i32 49, i32 14, i32 62, i32 62>, <4 x i32> <i32 35, i32 33, i32 92, i32 59>
+ %bn = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> <i32 3, i32 97, i32 49, i32 18>, <4 x i32> <i32 56, i32 64, i32 19, i32 75>
+ %bo = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> <i32 91, i32 57, i32 0, i32 1>, <4 x i32> <i32 43, i32 63, i32 64, i32 11>
+ %bp = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> <i32 41, i32 65, i32 18, i32 11>, <4 x i32> <i32 86, i32 26, i32 31, i32 3>
+ %bq = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> <i32 31, i32 46, i32 32, i32 68>, <4 x i32> <i32 100, i32 59, i32 62, i32 6>
+ %br = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> <i32 76, i32 67, i32 87, i32 7>, <4 x i32> <i32 63, i32 48, i32 97, i32 24>
+ %bs = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> <i32 83, i32 89, i32 19, i32 4>, <4 x i32> <i32 21, i32 2, i32 40, i32 21>
+ %bt = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> <i32 45, i32 76, i32 81, i32 100>, <4 x i32> <i32 65, i32 26, i32 100, i32 46>
+ %bu = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> <i32 16, i32 75, i32 31, i32 17>, <4 x i32> <i32 37, i32 66, i32 86, i32 65>
+ %bv = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> <i32 13, i32 25, i32 43, i32 59>, <4 x i32> <i32 82, i32 78, i32 60, i32 52>
+ %bw = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %bx = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %by = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %bz = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %ca = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cb = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cc = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cd = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %ce = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cf = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cg = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %ch = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %ci = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cj = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %ck = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cl = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ store <4 x i32> %a, <4 x i32>* %A
+ store <4 x i32> %b, <4 x i32>* %B
+ store <4 x i32> %c, <4 x i32>* %C
+ store <4 x i32> %d, <4 x i32>* %D
+ store <4 x i32> %e, <4 x i32>* %E
+ store <4 x i32> %f, <4 x i32>* %F
+ store <4 x i32> %g, <4 x i32>* %G
+ store <4 x i32> %h, <4 x i32>* %H
+ store <4 x i32> %i, <4 x i32>* %I
+ store <4 x i32> %j, <4 x i32>* %J
+ store <4 x i32> %k, <4 x i32>* %K
+ store <4 x i32> %l, <4 x i32>* %L
+ store <4 x i32> %m, <4 x i32>* %M
+ store <4 x i32> %n, <4 x i32>* %N
+ store <4 x i32> %o, <4 x i32>* %O
+ store <4 x i32> %p, <4 x i32>* %P
+ store <4 x i32> %q, <4 x i32>* %Q
+ store <4 x i32> %r, <4 x i32>* %R
+ store <4 x i32> %s, <4 x i32>* %S
+ store <4 x i32> %t, <4 x i32>* %T
+ store <4 x i32> %u, <4 x i32>* %U
+ store <4 x i32> %v, <4 x i32>* %V
+ store <4 x i32> %w, <4 x i32>* %W
+ store <4 x i32> %x, <4 x i32>* %X
+ store <4 x i32> %y, <4 x i32>* %Y
+ store <4 x i32> %z, <4 x i32>* %Z
+ store <4 x i32> %ba, <4 x i32>* %BA
+ store <4 x i32> %bb, <4 x i32>* %BB
+ store <4 x i32> %bc, <4 x i32>* %BC
+ store <4 x i32> %bd, <4 x i32>* %BD
+ store <4 x i32> %be, <4 x i32>* %BE
+ store <4 x i32> %bf, <4 x i32>* %BF
+ store <4 x i32> %bg, <4 x i32>* %BG
+ store <4 x i32> %bh, <4 x i32>* %BH
+ store <4 x i32> %bi, <4 x i32>* %BI
+ store <4 x i32> %bj, <4 x i32>* %BJ
+ store <4 x i32> %bk, <4 x i32>* %BK
+ store <4 x i32> %bl, <4 x i32>* %BL
+ store <4 x i32> %bm, <4 x i32>* %BM
+ store <4 x i32> %bn, <4 x i32>* %BN
+ store <4 x i32> %bo, <4 x i32>* %BO
+ store <4 x i32> %bp, <4 x i32>* %BP
+ store <4 x i32> %bq, <4 x i32>* %BQ
+ store <4 x i32> %br, <4 x i32>* %BR
+ store <4 x i32> %bs, <4 x i32>* %BS
+ store <4 x i32> %bt, <4 x i32>* %BT
+ store <4 x i32> %bu, <4 x i32>* %BU
+ store <4 x i32> %bv, <4 x i32>* %BV
+ store <4 x i32> %bw, <4 x i32>* %BW
+ store <4 x i32> %bx, <4 x i32>* %BX
+ store <4 x i32> %by, <4 x i32>* %BY
+ store <4 x i32> %bz, <4 x i32>* %BZ
+ store <4 x i32> %ca, <4 x i32>* %CA
+ store <4 x i32> %cb, <4 x i32>* %CB
+ store <4 x i32> %cc, <4 x i32>* %CC
+ store <4 x i32> %cd, <4 x i32>* %CD
+ store <4 x i32> %ce, <4 x i32>* %CE
+ store <4 x i32> %cf, <4 x i32>* %CF
+ store <4 x i32> %cg, <4 x i32>* %CG
+ store <4 x i32> %ch, <4 x i32>* %CH
+ store <4 x i32> %ci, <4 x i32>* %CI
+ store <4 x i32> %cj, <4 x i32>* %CJ
+ store <4 x i32> %ck, <4 x i32>* %CK
+ store <4 x i32> %cl, <4 x i32>* %CL
ret void
}
diff --git a/test/Transforms/InstCombine/udiv-simplify-bug-1.ll b/test/Transforms/InstCombine/udiv-simplify-bug-1.ll
index d95e8f8..74f2fdd 100644
--- a/test/Transforms/InstCombine/udiv-simplify-bug-1.ll
+++ b/test/Transforms/InstCombine/udiv-simplify-bug-1.ll
@@ -6,9 +6,9 @@
; The udiv instructions shouldn't be optimized away, and the
; sext instructions should be optimized to zext.
-define i64 @bar(i32 %x) nounwind {
+define i64 @bar(i32 %x, i32 %g) nounwind {
%y = lshr i32 %x, 30
- %r = udiv i32 %y, 3
+ %r = udiv i32 %y, %g
%z = sext i32 %r to i64
ret i64 %z
}
diff --git a/test/Transforms/JumpThreading/select.ll b/test/Transforms/JumpThreading/select.ll
index 8a81857..9676efe 100644
--- a/test/Transforms/JumpThreading/select.ll
+++ b/test/Transforms/JumpThreading/select.ll
@@ -121,3 +121,39 @@
call void @quux()
br label %L0
}
+
+; Make sure the edge value of %0 from entry to L2 includes 0 and L3 is
+; reachable.
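+; Since both the "i32 0" case and the default branch to %L2, the merged edge
+; value from entry still includes 0, so the icmp in %L2 cannot be folded and
+; %L3 must stay reachable.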
+; CHECK: test_switch_default
+; CHECK: entry:
+; CHECK: load
+; CHECK: switch
+; CHECK: [[THREADED:[A-Za-z.0-9]+]]:
+; CHECK: store
+; CHECK: br
+; CHECK: L2:
+; CHECK: icmp
+define void @test_switch_default(i32* nocapture %status) nounwind {
+entry:
+ %0 = load i32* %status, align 4
+ switch i32 %0, label %L2 [
+ i32 5061, label %L1
+ i32 0, label %L2
+ ]
+
+L1:
+ store i32 10025, i32* %status, align 4
+ br label %L2
+
+L2:
+ %1 = load i32* %status, align 4
+ %cmp57.i = icmp eq i32 %1, 0
+ br i1 %cmp57.i, label %L3, label %L4
+
+L3:
+ store i32 10000, i32* %status, align 4
+ br label %L4
+
+L4:
+ ret void
+}
diff --git a/test/Transforms/LICM/hoisting.ll b/test/Transforms/LICM/hoisting.ll
index 6f28d53..98f9334 100644
--- a/test/Transforms/LICM/hoisting.ll
+++ b/test/Transforms/LICM/hoisting.ll
@@ -29,7 +29,7 @@
}
-declare void @foo2(i32)
+declare void @foo2(i32) nounwind
;; It is ok and desirable to hoist this potentially trapping instruction.
@@ -64,3 +64,29 @@
%C = sub i32 %A, %B ; <i32> [#uses=1]
ret i32 %C
}
+
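+; The sdiv can trap (e.g. when %y is 0) and @foo_may_call_exit may never
+; return, so hoisting the division out of the loop could introduce a trap the
+; original program avoids; the checks ensure it stays below the call.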
+; CHECK: @test4
+; CHECK: call
+; CHECK: sdiv
+; CHECK: ret
+define i32 @test4(i32 %x, i32 %y) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %n.01 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ call void @foo_may_call_exit(i32 0)
+ %div = sdiv i32 %x, %y
+ %add = add nsw i32 %n.01, %div
+ %inc = add nsw i32 %i.02, 1
+ %cmp = icmp slt i32 %inc, 10000
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %n.0.lcssa = phi i32 [ %add, %for.body ]
+ ret i32 %n.0.lcssa
+}
+
+declare void @foo_may_call_exit(i32)
+
diff --git a/test/Transforms/LoopRotate/multiple-exits.ll b/test/Transforms/LoopRotate/multiple-exits.ll
new file mode 100644
index 0000000..675d71f
--- /dev/null
+++ b/test/Transforms/LoopRotate/multiple-exits.ll
@@ -0,0 +1,236 @@
+; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; PR7447
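+; Rotating a loop with multiple exits must keep LoopInfo and the dominator
+; tree consistent; the RUN line verifies both after rotation.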
+define i32 @test1([100 x i32]* nocapture %a) nounwind readonly {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.cond1, %entry
+ %sum.0 = phi i32 [ 0, %entry ], [ %sum.1, %for.cond1 ]
+ %i.0 = phi i1 [ true, %entry ], [ false, %for.cond1 ]
+ br i1 %i.0, label %for.cond1, label %return
+
+for.cond1: ; preds = %for.cond, %land.rhs
+ %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.0, %for.cond ]
+ %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond ]
+ %cmp2 = icmp ult i32 %i.1, 100
+ br i1 %cmp2, label %land.rhs, label %for.cond
+
+land.rhs: ; preds = %for.cond1
+ %conv = zext i32 %i.1 to i64
+ %arrayidx = getelementptr inbounds [100 x i32]* %a, i64 0, i64 %conv
+ %0 = load i32* %arrayidx, align 4
+ %add = add i32 %0, %sum.1
+ %cmp4 = icmp ugt i32 %add, 1000
+ %inc = add i32 %i.1, 1
+ br i1 %cmp4, label %return, label %for.cond1
+
+return: ; preds = %for.cond, %land.rhs
+ %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ]
+ ret i32 %retval.0
+
+; CHECK: @test1
+; CHECK: for.cond1.preheader:
+; CHECK: %sum.04 = phi i32 [ 0, %entry ], [ %sum.1.lcssa, %for.cond.loopexit ]
+; CHECK: br label %for.cond1
+
+; CHECK: for.cond1:
+; CHECK: %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.04, %for.cond1.preheader ]
+; CHECK: %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond1.preheader ]
+; CHECK: %cmp2 = icmp ult i32 %i.1, 100
+; CHECK: br i1 %cmp2, label %land.rhs, label %for.cond.loopexit
+}
+
+define void @test2(i32 %x) nounwind {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %if.end, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
+ %cmp = icmp eq i32 %i.0, %x
+ br i1 %cmp, label %return.loopexit, label %for.body
+
+for.body: ; preds = %for.cond
+ %call = tail call i32 @foo(i32 %i.0) nounwind
+ %tobool = icmp eq i32 %call, 0
+ br i1 %tobool, label %if.end, label %a
+
+if.end: ; preds = %for.body
+ %call1 = tail call i32 @foo(i32 42) nounwind
+ %inc = add i32 %i.0, 1
+ br label %for.cond
+
+a: ; preds = %for.body
+ %call2 = tail call i32 @bar(i32 1) nounwind
+ br label %return
+
+return.loopexit: ; preds = %for.cond
+ br label %return
+
+return: ; preds = %return.loopexit, %a
+ ret void
+
+; CHECK: @test2
+; CHECK: if.end:
+; CHECK: %inc = add i32 %i.02, 1
+; CHECK: %cmp = icmp eq i32 %inc, %x
+; CHECK: br i1 %cmp, label %for.cond.return.loopexit_crit_edge, label %for.body
+}
+
+declare i32 @foo(i32)
+
+declare i32 @bar(i32)
+
+@_ZTIi = external constant i8*
+
+; Verify dominators.
+define void @test3(i32 %x) {
+entry:
+ %cmp2 = icmp eq i32 0, %x
+ br i1 %cmp2, label %try.cont.loopexit, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.inc
+ %i.03 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+ invoke void @_Z3fooi(i32 %i.03)
+ to label %for.inc unwind label %lpad
+
+for.inc: ; preds = %for.body
+ %inc = add i32 %i.03, 1
+ %cmp = icmp eq i32 %inc, %x
+ br i1 %cmp, label %for.cond.try.cont.loopexit_crit_edge, label %for.body
+
+lpad: ; preds = %for.body
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* bitcast (i8** @_ZTIi to i8*)
+ %1 = extractvalue { i8*, i32 } %0, 0
+ %2 = extractvalue { i8*, i32 } %0, 1
+ %3 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+ %matches = icmp eq i32 %2, %3
+ br i1 %matches, label %catch, label %eh.resume
+
+catch: ; preds = %lpad
+ %4 = tail call i8* @__cxa_begin_catch(i8* %1) nounwind
+ br i1 true, label %invoke.cont2.loopexit, label %for.body.i.lr.ph
+
+for.body.i.lr.ph: ; preds = %catch
+ br label %for.body.i
+
+for.body.i: ; preds = %for.body.i.lr.ph, %for.inc.i
+ %i.0.i1 = phi i32 [ 0, %for.body.i.lr.ph ], [ %inc.i, %for.inc.i ]
+ invoke void @_Z3fooi(i32 %i.0.i1)
+ to label %for.inc.i unwind label %lpad.i
+
+for.inc.i: ; preds = %for.body.i
+ %inc.i = add i32 %i.0.i1, 1
+ %cmp.i = icmp eq i32 %inc.i, 0
+ br i1 %cmp.i, label %for.cond.i.invoke.cont2.loopexit_crit_edge, label %for.body.i
+
+lpad.i: ; preds = %for.body.i
+ %5 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* bitcast (i8** @_ZTIi to i8*)
+ %6 = extractvalue { i8*, i32 } %5, 0
+ %7 = extractvalue { i8*, i32 } %5, 1
+ %matches.i = icmp eq i32 %7, %3
+ br i1 %matches.i, label %catch.i, label %lpad1.body
+
+catch.i: ; preds = %lpad.i
+ %8 = tail call i8* @__cxa_begin_catch(i8* %6) nounwind
+ invoke void @test3(i32 0)
+ to label %invoke.cont2.i unwind label %lpad1.i
+
+invoke.cont2.i: ; preds = %catch.i
+ tail call void @__cxa_end_catch() nounwind
+ br label %invoke.cont2
+
+lpad1.i: ; preds = %catch.i
+ %9 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ %10 = extractvalue { i8*, i32 } %9, 0
+ %11 = extractvalue { i8*, i32 } %9, 1
+ tail call void @__cxa_end_catch() nounwind
+ br label %lpad1.body
+
+for.cond.i.invoke.cont2.loopexit_crit_edge: ; preds = %for.inc.i
+ br label %invoke.cont2.loopexit
+
+invoke.cont2.loopexit: ; preds = %for.cond.i.invoke.cont2.loopexit_crit_edge, %catch
+ br label %invoke.cont2
+
+invoke.cont2: ; preds = %invoke.cont2.loopexit, %invoke.cont2.i
+ tail call void @__cxa_end_catch() nounwind
+ br label %try.cont
+
+for.cond.try.cont.loopexit_crit_edge: ; preds = %for.inc
+ br label %try.cont.loopexit
+
+try.cont.loopexit: ; preds = %for.cond.try.cont.loopexit_crit_edge, %entry
+ br label %try.cont
+
+try.cont: ; preds = %try.cont.loopexit, %invoke.cont2
+ ret void
+
+lpad1.body: ; preds = %lpad1.i, %lpad.i
+ %exn.slot.0.i = phi i8* [ %10, %lpad1.i ], [ %6, %lpad.i ]
+ %ehselector.slot.0.i = phi i32 [ %11, %lpad1.i ], [ %7, %lpad.i ]
+ tail call void @__cxa_end_catch() nounwind
+ br label %eh.resume
+
+eh.resume: ; preds = %lpad1.body, %lpad
+ %exn.slot.0 = phi i8* [ %exn.slot.0.i, %lpad1.body ], [ %1, %lpad ]
+ %ehselector.slot.0 = phi i32 [ %ehselector.slot.0.i, %lpad1.body ], [ %2, %lpad ]
+ %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn.slot.0, 0
+ %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %ehselector.slot.0, 1
+ resume { i8*, i32 } %lpad.val5
+}
+
+declare void @_Z3fooi(i32)
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+define void @test4() nounwind uwtable {
+entry:
+ br label %"7"
+
+"3": ; preds = %"7"
+ br i1 undef, label %"31", label %"4"
+
+"4": ; preds = %"3"
+ %. = select i1 undef, float 0x3F50624DE0000000, float undef
+ %0 = add i32 %1, 1
+ br label %"7"
+
+"7": ; preds = %"4", %entry
+ %1 = phi i32 [ %0, %"4" ], [ 0, %entry ]
+ %2 = icmp slt i32 %1, 100
+ br i1 %2, label %"3", label %"8"
+
+"8": ; preds = %"7"
+ br i1 undef, label %"9", label %"31"
+
+"9": ; preds = %"8"
+ br label %"33"
+
+"27": ; preds = %"31"
+ unreachable
+
+"31": ; preds = %"8", %"3"
+ br i1 undef, label %"27", label %"32"
+
+"32": ; preds = %"31"
+ br label %"33"
+
+"33": ; preds = %"32", %"9"
+ ret void
+}
diff --git a/test/Transforms/ObjCARC/basic.ll b/test/Transforms/ObjCARC/basic.ll
index 0a7ba5d..7b64b1b 100644
--- a/test/Transforms/ObjCARC/basic.ll
+++ b/test/Transforms/ObjCARC/basic.ll
@@ -1,4 +1,4 @@
-; RUN: opt -objc-arc -S < %s | FileCheck %s
+; RUN: opt -basicaa -objc-arc -S < %s | FileCheck %s
target datalayout = "e-p:64:64:64"
@@ -1498,7 +1498,7 @@
}
; Do delete retain+release with intervening stores of the
-; address value;
+; address value.
; CHECK: define void @test50(
; CHECK-NOT: @objc_
diff --git a/test/Transforms/ObjCARC/nested.ll b/test/Transforms/ObjCARC/nested.ll
index a618a21..32be03e 100644
--- a/test/Transforms/ObjCARC/nested.ll
+++ b/test/Transforms/ObjCARC/nested.ll
@@ -16,6 +16,10 @@
declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
declare void @use(i8*)
declare void @objc_release(i8*)
+declare i8* @def()
+declare void @__crasher_block_invoke(i8* nocapture)
+declare i8* @objc_retainBlock(i8*)
+declare void @__crasher_block_invoke1(i8* nocapture)
!0 = metadata !{}
@@ -279,11 +283,13 @@
ret void
}
-; Delete a nested retain+release pair.
+; TODO: Delete a nested retain+release pair.
+; The optimizer currently can't do this, because it isn't sophisticated
+; enough in reasoning about nesting.
; CHECK: define void @test6(
; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: @objc_retain
; CHECK: }
define void @test6() nounwind {
entry:
@@ -345,11 +351,13 @@
ret void
}
-; Delete a nested retain+release pair.
+; TODO: Delete a nested retain+release pair.
+; The optimizer currently can't do this, because it isn't sophisticated
+; enough in reasoning about nesting.
; CHECK: define void @test7(
; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: @objc_retain
; CHECK: }
define void @test7() nounwind {
entry:
@@ -553,12 +561,12 @@
ret void
}
-; Like test9, but without a split backedge. This we can optimize.
+; Like test9, but without a split backedge. TODO: optimize this.
; CHECK: define void @test9b(
; CHECK: call i8* @objc_retain
; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: @objc_retain
; CHECK: }
define void @test9b() nounwind {
entry:
@@ -687,12 +695,12 @@
ret void
}
-; Like test10, but without a split backedge. This we can optimize.
+; Like test10, but without a split backedge. TODO: optimize this.
; CHECK: define void @test10b(
; CHECK: call i8* @objc_retain
; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: @objc_retain
; CHECK: }
define void @test10b() nounwind {
entry:
@@ -751,3 +759,64 @@
call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
ret void
}
+
+; Pointers to strong pointers can obscure provenance relationships. Be conservative
+; in the face of escaping pointers. rdar://12150909.
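+; Here each block stores a retained copy of %call into a captured slot, so
+; the retains and the final release on %call must be preserved.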
+
+%struct.__block_d = type { i64, i64 }
+
+@_NSConcreteStackBlock = external global i8*
+@__block_d_tmp = external hidden constant { i64, i64, i8*, i8*, i8*, i8* }
+@__block_d_tmp5 = external hidden constant { i64, i64, i8*, i8*, i8*, i8* }
+
+; CHECK: define void @test11(
+; CHECK: tail call i8* @objc_retain(i8* %call) nounwind
+; CHECK: tail call i8* @objc_retain(i8* %call) nounwind
+; CHECK: call void @objc_release(i8* %call) nounwind, !clang.imprecise_release !0
+; CHECK: }
+define void @test11() {
+entry:
+ %block = alloca <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>, align 8
+ %block9 = alloca <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>, align 8
+ %call = call i8* @def(), !clang.arc.no_objc_arc_exceptions !0
+ %foo = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 5
+ %block.isa = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 0
+ store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa, align 8
+ %block.flags = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 1
+ store i32 1107296256, i32* %block.flags, align 8
+ %block.reserved = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 2
+ store i32 0, i32* %block.reserved, align 4
+ %block.invoke = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 3
+ store i8* bitcast (void (i8*)* @__crasher_block_invoke to i8*), i8** %block.invoke, align 8
+ %block.d = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 4
+ store %struct.__block_d* bitcast ({ i64, i64, i8*, i8*, i8*, i8* }* @__block_d_tmp to %struct.__block_d*), %struct.__block_d** %block.d, align 8
+ %foo2 = tail call i8* @objc_retain(i8* %call) nounwind
+ store i8* %foo2, i8** %foo, align 8
+ %foo4 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block to i8*
+ %foo5 = call i8* @objc_retainBlock(i8* %foo4) nounwind
+ call void @use(i8* %foo5), !clang.arc.no_objc_arc_exceptions !0
+ call void @objc_release(i8* %foo5) nounwind
+ %strongdestroy = load i8** %foo, align 8
+ call void @objc_release(i8* %strongdestroy) nounwind, !clang.imprecise_release !0
+ %foo10 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 5
+ %block.isa11 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 0
+ store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa11, align 8
+ %block.flags12 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 1
+ store i32 1107296256, i32* %block.flags12, align 8
+ %block.reserved13 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 2
+ store i32 0, i32* %block.reserved13, align 4
+ %block.invoke14 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 3
+ store i8* bitcast (void (i8*)* @__crasher_block_invoke1 to i8*), i8** %block.invoke14, align 8
+ %block.d15 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 4
+ store %struct.__block_d* bitcast ({ i64, i64, i8*, i8*, i8*, i8* }* @__block_d_tmp5 to %struct.__block_d*), %struct.__block_d** %block.d15, align 8
+ %foo18 = call i8* @objc_retain(i8* %call) nounwind
+ store i8* %call, i8** %foo10, align 8
+ %foo20 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9 to i8*
+ %foo21 = call i8* @objc_retainBlock(i8* %foo20) nounwind
+ call void @use(i8* %foo21), !clang.arc.no_objc_arc_exceptions !0
+ call void @objc_release(i8* %foo21) nounwind
+ %strongdestroy25 = load i8** %foo10, align 8
+ call void @objc_release(i8* %strongdestroy25) nounwind, !clang.imprecise_release !0
+ call void @objc_release(i8* %call) nounwind, !clang.imprecise_release !0
+ ret void
+}
diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll b/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll
new file mode 100644
index 0000000..53d5448
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll
@@ -0,0 +1,37 @@
+; RUN: opt -simplifycfg -S -o - < %s | FileCheck %s
+
+; This test case was written to trigger an incorrect assert statement in
+; -simplifycfg. We don't actually want to check the output, just that
+; -simplifycfg runs successfully, so we only check that the function still
+; exists and that it still calls foo().
+;
+; NOTE: There are some obviously dead blocks and missing branch weight
+; metadata. Both of these features were key to triggering the assert.
+; Additionally, the not-taken weight of the branch with a weight had to
+; be 0 to trigger the assert.
+
+declare void @foo() nounwind uwtable
+
+define void @func(i32 %A) nounwind uwtable {
+; CHECK: define void @func
+entry:
+ %cmp11 = icmp eq i32 %A, 1
+ br i1 %cmp11, label %if.then, label %if.else, !prof !0
+
+if.then:
+ call void @foo()
+; CHECK: call void @foo()
+ br label %if.else
+
+if.else:
+ %cmp17 = icmp eq i32 %A, 2
+ br i1 %cmp17, label %if.then2, label %if.end
+
+if.then2:
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 1, i32 0}
diff --git a/test/Transforms/SimplifyCFG/switch_create.ll b/test/Transforms/SimplifyCFG/switch_create.ll
index 546cc75..b28e4a4 100644
--- a/test/Transforms/SimplifyCFG/switch_create.ll
+++ b/test/Transforms/SimplifyCFG/switch_create.ll
@@ -141,8 +141,9 @@
ret i1 %UnifiedRetVal
; CHECK: @test6
-; CHECK: %tmp.2.i.off = add i32 %tmp.2.i, -14
-; CHECK: %switch = icmp ult i32 %tmp.2.i.off, 6
+; CHECK: %switch.tableidx = sub i32 %tmp.2.i, 14
+; CHECK: %0 = icmp ult i32 %switch.tableidx, 6
+; CHECK: select i1 %0, i1 true, i1 false
}
define void @test7(i8 zeroext %c, i32 %x) nounwind ssp noredzone {
diff --git a/test/Transforms/SimplifyCFG/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/switch_to_lookup_table.ll
new file mode 100644
index 0000000..414da93
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/switch_to_lookup_table.ll
@@ -0,0 +1,140 @@
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; The table for @f
+; CHECK: @switch.table = private unnamed_addr constant [7 x i32] [i32 55, i32 123, i32 0, i32 -1, i32 27, i32 62, i32 1]
+
+; The int table for @h
+; CHECK: @switch.table1 = private unnamed_addr constant [4 x i8] c"*\09X\05"
+
+; The float table for @h
+; CHECK: @switch.table2 = private unnamed_addr constant [4 x float] [float 0x40091EB860000000, float 0x3FF3BE76C0000000, float 0x4012449BA0000000, float 0x4001AE1480000000]
+
+; The table for @foostring
+; CHECK: @switch.table3 = private unnamed_addr constant [4 x i8*] [i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str2, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str3, i64 0, i64 0)]
+
+; A simple int-to-int selection switch.
+; It is dense enough to be replaced by table lookup.
+; The result is returned directly by a ret from an otherwise empty bb,
+; so we return early, directly from the lookup bb.
+
+define i32 @f(i32 %c) nounwind uwtable readnone {
+entry:
+ switch i32 %c, label %sw.default [
+ i32 42, label %return
+ i32 43, label %sw.bb1
+ i32 44, label %sw.bb2
+ i32 45, label %sw.bb3
+ i32 46, label %sw.bb4
+ i32 47, label %sw.bb5
+ i32 48, label %sw.bb6
+ ]
+
+sw.bb1: br label %return
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.bb4: br label %return
+sw.bb5: br label %return
+sw.bb6: br label %return
+sw.default: br label %return
+return:
+ %retval.0 = phi i32 [ 15, %sw.default ], [ 1, %sw.bb6 ], [ 62, %sw.bb5 ], [ 27, %sw.bb4 ], [ -1, %sw.bb3 ], [ 0, %sw.bb2 ], [ 123, %sw.bb1 ], [ 55, %entry ]
+ ret i32 %retval.0
+
+; CHECK: @f
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i32 %c, 42
+; CHECK-NEXT: %0 = icmp ult i32 %switch.tableidx, 7
+; CHECK-NEXT: br i1 %0, label %switch.lookup, label %return
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.gep = getelementptr inbounds [7 x i32]* @switch.table, i32 0, i32 %switch.tableidx
+; CHECK-NEXT: %switch.load = load i32* %switch.gep
+; CHECK-NEXT: ret i32 %switch.load
+; CHECK: return:
+; CHECK-NEXT: ret i32 15
+}
+
+; A switch used to initialize two variables, an i8 and a float.
+
+declare void @dummy(i8 signext, float)
+define void @h(i32 %x) {
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %sw.epilog
+ i32 1, label %sw.bb1
+ i32 2, label %sw.bb2
+ i32 3, label %sw.bb3
+ ]
+
+sw.bb1: br label %sw.epilog
+sw.bb2: br label %sw.epilog
+sw.bb3: br label %sw.epilog
+sw.default: br label %sw.epilog
+
+sw.epilog:
+ %a.0 = phi i8 [ 7, %sw.default ], [ 5, %sw.bb3 ], [ 88, %sw.bb2 ], [ 9, %sw.bb1 ], [ 42, %entry ]
+ %b.0 = phi float [ 0x4023FAE140000000, %sw.default ], [ 0x4001AE1480000000, %sw.bb3 ], [ 0x4012449BA0000000, %sw.bb2 ], [ 0x3FF3BE76C0000000, %sw.bb1 ], [ 0x40091EB860000000, %entry ]
+ call void @dummy(i8 signext %a.0, float %b.0)
+ ret void
+
+; CHECK: @h
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i32 %x, 0
+; CHECK-NEXT: %0 = icmp ult i32 %switch.tableidx, 4
+; CHECK-NEXT: br i1 %0, label %switch.lookup, label %sw.epilog
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.gep = getelementptr inbounds [4 x i8]* @switch.table1, i32 0, i32 %switch.tableidx
+; CHECK-NEXT: %switch.load = load i8* %switch.gep
+; CHECK-NEXT: %switch.gep1 = getelementptr inbounds [4 x float]* @switch.table2, i32 0, i32 %switch.tableidx
+; CHECK-NEXT: %switch.load2 = load float* %switch.gep1
+; CHECK-NEXT: br label %sw.epilog
+; CHECK: sw.epilog:
+; CHECK-NEXT: %a.0 = phi i8 [ %switch.load, %switch.lookup ], [ 7, %entry ]
+; CHECK-NEXT: %b.0 = phi float [ %switch.load2, %switch.lookup ], [ 0x4023FAE140000000, %entry ]
+; CHECK-NEXT: call void @dummy(i8 signext %a.0, float %b.0)
+; CHECK-NEXT: ret void
+}
+
+
+; Switch used to return a string.
+
+@.str = private unnamed_addr constant [4 x i8] c"foo\00", align 1
+@.str1 = private unnamed_addr constant [4 x i8] c"bar\00", align 1
+@.str2 = private unnamed_addr constant [4 x i8] c"baz\00", align 1
+@.str3 = private unnamed_addr constant [4 x i8] c"qux\00", align 1
+@.str4 = private unnamed_addr constant [6 x i8] c"error\00", align 1
+
+define i8* @foostring(i32 %x) {
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %return
+ i32 1, label %sw.bb1
+ i32 2, label %sw.bb2
+ i32 3, label %sw.bb3
+ ]
+
+sw.bb1: br label %return
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.default: br label %return
+
+return:
+ %retval.0 = phi i8* [ getelementptr inbounds ([6 x i8]* @.str4, i64 0, i64 0), %sw.default ],
+ [ getelementptr inbounds ([4 x i8]* @.str3, i64 0, i64 0), %sw.bb3 ],
+ [ getelementptr inbounds ([4 x i8]* @.str2, i64 0, i64 0), %sw.bb2 ],
+ [ getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0), %sw.bb1 ],
+ [ getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), %entry ]
+ ret i8* %retval.0
+
+; CHECK: @foostring
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i32 %x, 0
+; CHECK-NEXT: %0 = icmp ult i32 %switch.tableidx, 4
+; CHECK-NEXT: br i1 %0, label %switch.lookup, label %return
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.gep = getelementptr inbounds [4 x i8*]* @switch.table3, i32 0, i32 %switch.tableidx
+; CHECK-NEXT: %switch.load = load i8** %switch.gep
+; CHECK-NEXT: ret i8* %switch.load
+}
diff --git a/test/Transforms/SimplifyLibCalls/double-float-shrink.ll b/test/Transforms/SimplifyLibCalls/double-float-shrink.ll
new file mode 100644
index 0000000..b4ab8b4
--- /dev/null
+++ b/test/Transforms/SimplifyLibCalls/double-float-shrink.ll
@@ -0,0 +1,333 @@
+; RUN: opt < %s -simplify-libcalls -enable-double-float-shrink -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
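+
+; When a float is extended to double, passed to a unary libm function, and
+; the result is truncated straight back to float, the call can be shrunk to
+; the float variant (acos -> acosf, etc.); without the final truncation the
+; double call must remain.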
+
+define float @acos_test(float %f) nounwind readnone {
+; CHECK: acos_test
+ %conv = fpext float %f to double
+ %call = call double @acos(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @acosf(float %f)
+}
+
+define double @acos_test2(float %f) nounwind readnone {
+; CHECK: acos_test2
+ %conv = fpext float %f to double
+ %call = call double @acos(double %conv)
+ ret double %call
+; CHECK: call double @acos(double %conv)
+}
+
+define float @acosh_test(float %f) nounwind readnone {
+; CHECK: acosh_test
+ %conv = fpext float %f to double
+ %call = call double @acosh(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @acoshf(float %f)
+}
+
+define double @acosh_test2(float %f) nounwind readnone {
+; CHECK: acosh_test2
+ %conv = fpext float %f to double
+ %call = call double @acosh(double %conv)
+ ret double %call
+; CHECK: call double @acosh(double %conv)
+}
+
+define float @asin_test(float %f) nounwind readnone {
+; CHECK: asin_test
+ %conv = fpext float %f to double
+ %call = call double @asin(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @asinf(float %f)
+}
+
+define double @asin_test2(float %f) nounwind readnone {
+; CHECK: asin_test2
+ %conv = fpext float %f to double
+ %call = call double @asin(double %conv)
+ ret double %call
+; CHECK: call double @asin(double %conv)
+}
+
+define float @asinh_test(float %f) nounwind readnone {
+; CHECK: asinh_test
+ %conv = fpext float %f to double
+ %call = call double @asinh(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @asinhf(float %f)
+}
+
+define double @asinh_test2(float %f) nounwind readnone {
+; CHECK: asinh_test2
+ %conv = fpext float %f to double
+ %call = call double @asinh(double %conv)
+ ret double %call
+; CHECK: call double @asinh(double %conv)
+}
+
+define float @atan_test(float %f) nounwind readnone {
+; CHECK: atan_test
+ %conv = fpext float %f to double
+ %call = call double @atan(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @atanf(float %f)
+}
+
+define double @atan_test2(float %f) nounwind readnone {
+; CHECK: atan_test2
+ %conv = fpext float %f to double
+ %call = call double @atan(double %conv)
+ ret double %call
+; CHECK: call double @atan(double %conv)
+}
+define float @atanh_test(float %f) nounwind readnone {
+; CHECK: atanh_test
+ %conv = fpext float %f to double
+ %call = call double @atanh(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @atanhf(float %f)
+}
+
+define double @atanh_test2(float %f) nounwind readnone {
+; CHECK: atanh_test2
+ %conv = fpext float %f to double
+ %call = call double @atanh(double %conv)
+ ret double %call
+; CHECK: call double @atanh(double %conv)
+}
+define float @cbrt_test(float %f) nounwind readnone {
+; CHECK: cbrt_test
+ %conv = fpext float %f to double
+ %call = call double @cbrt(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @cbrtf(float %f)
+}
+
+define double @cbrt_test2(float %f) nounwind readnone {
+; CHECK: cbrt_test2
+ %conv = fpext float %f to double
+ %call = call double @cbrt(double %conv)
+ ret double %call
+; CHECK: call double @cbrt(double %conv)
+}
+define float @exp_test(float %f) nounwind readnone {
+; CHECK: exp_test
+ %conv = fpext float %f to double
+ %call = call double @exp(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @expf(float %f)
+}
+
+define double @exp_test2(float %f) nounwind readnone {
+; CHECK: exp_test2
+ %conv = fpext float %f to double
+ %call = call double @exp(double %conv)
+ ret double %call
+; CHECK: call double @exp(double %conv)
+}
+define float @expm1_test(float %f) nounwind readnone {
+; CHECK: expm1_test
+ %conv = fpext float %f to double
+ %call = call double @expm1(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @expm1f(float %f)
+}
+
+define double @expm1_test2(float %f) nounwind readnone {
+; CHECK: expm1_test2
+ %conv = fpext float %f to double
+ %call = call double @expm1(double %conv)
+ ret double %call
+; CHECK: call double @expm1(double %conv)
+}
+define float @exp10_test(float %f) nounwind readnone {
+; CHECK: exp10_test
+ %conv = fpext float %f to double
+ %call = call double @exp10(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @exp10f(float %f)
+}
+
+define double @exp10_test2(float %f) nounwind readnone {
+; CHECK: exp10_test2
+ %conv = fpext float %f to double
+ %call = call double @exp10(double %conv)
+ ret double %call
+; CHECK: call double @exp10(double %conv)
+}
+define float @log_test(float %f) nounwind readnone {
+; CHECK: log_test
+ %conv = fpext float %f to double
+ %call = call double @log(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @logf(float %f)
+}
+
+define double @log_test2(float %f) nounwind readnone {
+; CHECK: log_test2
+ %conv = fpext float %f to double
+ %call = call double @log(double %conv)
+ ret double %call
+; CHECK: call double @log(double %conv)
+}
+define float @log10_test(float %f) nounwind readnone {
+; CHECK: log10_test
+ %conv = fpext float %f to double
+ %call = call double @log10(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @log10f(float %f)
+}
+
+define double @log10_test2(float %f) nounwind readnone {
+; CHECK: log10_test2
+ %conv = fpext float %f to double
+ %call = call double @log10(double %conv)
+ ret double %call
+; CHECK: call double @log10(double %conv)
+}
+define float @log1p_test(float %f) nounwind readnone {
+; CHECK: log1p_test
+ %conv = fpext float %f to double
+ %call = call double @log1p(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @log1pf(float %f)
+}
+
+define double @log1p_test2(float %f) nounwind readnone {
+; CHECK: log1p_test2
+ %conv = fpext float %f to double
+ %call = call double @log1p(double %conv)
+ ret double %call
+; CHECK: call double @log1p(double %conv)
+}
+define float @log2_test(float %f) nounwind readnone {
+; CHECK: log2_test
+ %conv = fpext float %f to double
+ %call = call double @log2(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @log2f(float %f)
+}
+
+define double @log2_test2(float %f) nounwind readnone {
+; CHECK: log2_test2
+ %conv = fpext float %f to double
+ %call = call double @log2(double %conv)
+ ret double %call
+; CHECK: call double @log2(double %conv)
+}
+define float @logb_test(float %f) nounwind readnone {
+; CHECK: logb_test
+ %conv = fpext float %f to double
+ %call = call double @logb(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @logbf(float %f)
+}
+
+define double @logb_test2(float %f) nounwind readnone {
+; CHECK: logb_test2
+ %conv = fpext float %f to double
+ %call = call double @logb(double %conv)
+ ret double %call
+; CHECK: call double @logb(double %conv)
+}
+define float @sin_test(float %f) nounwind readnone {
+; CHECK: sin_test
+ %conv = fpext float %f to double
+ %call = call double @sin(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @sinf(float %f)
+}
+
+define double @sin_test2(float %f) nounwind readnone {
+; CHECK: sin_test2
+ %conv = fpext float %f to double
+ %call = call double @sin(double %conv)
+ ret double %call
+; CHECK: call double @sin(double %conv)
+}
+define float @sqrt_test(float %f) nounwind readnone {
+; CHECK: sqrt_test
+ %conv = fpext float %f to double
+ %call = call double @sqrt(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @sqrtf(float %f)
+}
+
+define double @sqrt_test2(float %f) nounwind readnone {
+; CHECK: sqrt_test2
+ %conv = fpext float %f to double
+ %call = call double @sqrt(double %conv)
+ ret double %call
+; CHECK: call double @sqrt(double %conv)
+}
+define float @tan_test(float %f) nounwind readnone {
+; CHECK: tan_test
+ %conv = fpext float %f to double
+ %call = call double @tan(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @tanf(float %f)
+}
+
+define double @tan_test2(float %f) nounwind readnone {
+; CHECK: tan_test2
+ %conv = fpext float %f to double
+ %call = call double @tan(double %conv)
+ ret double %call
+; CHECK: call double @tan(double %conv)
+}
+define float @tanh_test(float %f) nounwind readnone {
+; CHECK: tanh_test
+ %conv = fpext float %f to double
+ %call = call double @tanh(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @tanhf(float %f)
+}
+
+define double @tanh_test2(float %f) nounwind readnone {
+; CHECK: tanh_test2
+ %conv = fpext float %f to double
+ %call = call double @tanh(double %conv)
+ ret double %call
+; CHECK: call double @tanh(double %conv)
+}
+
+declare double @tanh(double) nounwind readnone
+declare double @tan(double) nounwind readnone
+declare double @sqrt(double) nounwind readnone
+declare double @sin(double) nounwind readnone
+declare double @log2(double) nounwind readnone
+declare double @log1p(double) nounwind readnone
+declare double @log10(double) nounwind readnone
+declare double @log(double) nounwind readnone
+declare double @logb(double) nounwind readnone
+declare double @exp10(double) nounwind readnone
+declare double @expm1(double) nounwind readnone
+declare double @exp(double) nounwind readnone
+declare double @cbrt(double) nounwind readnone
+declare double @atanh(double) nounwind readnone
+declare double @atan(double) nounwind readnone
+declare double @acos(double) nounwind readnone
+declare double @acosh(double) nounwind readnone
+declare double @asin(double) nounwind readnone
+declare double @asinh(double) nounwind readnone
diff --git a/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll b/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll
new file mode 100644
index 0000000..aecb887
--- /dev/null
+++ b/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll
@@ -0,0 +1,179 @@
+; RUN: opt -S -simplify-libcalls -instcombine %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
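+
+; An fcmp between an fpext'ed float and the double result of a unary libm
+; call on another fpext'ed float can be performed in float: the call is
+; shrunk to its float variant and both extensions disappear.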
+
+define i32 @test1(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %x to double
+ %2 = call double @ceil(double %1) nounwind readnone
+ %3 = fpext float %y to double
+ %4 = fcmp oeq double %2, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test1
+; CHECK-NEXT: %ceilf = call float @ceilf(float %x)
+; CHECK-NEXT: fcmp oeq float %ceilf, %y
+}
+
+define i32 @test2(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %x to double
+ %2 = call double @fabs(double %1) nounwind readnone
+ %3 = fpext float %y to double
+ %4 = fcmp oeq double %2, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test2
+; CHECK-NEXT: %fabsf = call float @fabsf(float %x)
+; CHECK-NEXT: fcmp oeq float %fabsf, %y
+}
+
+define i32 @test3(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %x to double
+ %2 = call double @floor(double %1) nounwind readnone
+ %3 = fpext float %y to double
+ %4 = fcmp oeq double %2, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test3
+; CHECK-NEXT: %floorf = call float @floorf(float %x)
+; CHECK-NEXT: fcmp oeq float %floorf, %y
+}
+
+define i32 @test4(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %x to double
+ %2 = call double @nearbyint(double %1) nounwind
+ %3 = fpext float %y to double
+ %4 = fcmp oeq double %2, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test4
+; CHECK-NEXT: %nearbyintf = call float @nearbyintf(float %x)
+; CHECK-NEXT: fcmp oeq float %nearbyintf, %y
+}
+
+define i32 @test5(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %x to double
+ %2 = call double @rint(double %1) nounwind
+ %3 = fpext float %y to double
+ %4 = fcmp oeq double %2, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test5
+; CHECK-NEXT: %rintf = call float @rintf(float %x)
+; CHECK-NEXT: fcmp oeq float %rintf, %y
+}
+
+define i32 @test6(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %x to double
+ %2 = call double @round(double %1) nounwind readnone
+ %3 = fpext float %y to double
+ %4 = fcmp oeq double %2, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test6
+; CHECK-NEXT: %roundf = call float @roundf(float %x)
+; CHECK-NEXT: fcmp oeq float %roundf, %y
+}
+
+define i32 @test7(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %x to double
+ %2 = call double @trunc(double %1) nounwind
+ %3 = fpext float %y to double
+ %4 = fcmp oeq double %2, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test7
+; CHECK-NEXT: %truncf = call float @truncf(float %x)
+; CHECK-NEXT: fcmp oeq float %truncf, %y
+}
+
+define i32 @test8(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %y to double
+ %2 = fpext float %x to double
+ %3 = call double @ceil(double %2) nounwind readnone
+ %4 = fcmp oeq double %1, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test8
+; CHECK-NEXT: %ceilf = call float @ceilf(float %x)
+; CHECK-NEXT: fcmp oeq float %ceilf, %y
+}
+
+define i32 @test9(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %y to double
+ %2 = fpext float %x to double
+ %3 = call double @fabs(double %2) nounwind readnone
+ %4 = fcmp oeq double %1, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test9
+; CHECK-NEXT: %fabsf = call float @fabsf(float %x)
+; CHECK-NEXT: fcmp oeq float %fabsf, %y
+}
+
+define i32 @test10(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %y to double
+ %2 = fpext float %x to double
+ %3 = call double @floor(double %2) nounwind readnone
+ %4 = fcmp oeq double %1, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test10
+; CHECK-NEXT: %floorf = call float @floorf(float %x)
+; CHECK-NEXT: fcmp oeq float %floorf, %y
+}
+
+define i32 @test11(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %y to double
+ %2 = fpext float %x to double
+ %3 = call double @nearbyint(double %2) nounwind
+ %4 = fcmp oeq double %1, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test11
+; CHECK-NEXT: %nearbyintf = call float @nearbyintf(float %x)
+; CHECK-NEXT: fcmp oeq float %nearbyintf, %y
+}
+
+define i32 @test12(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %y to double
+ %2 = fpext float %x to double
+ %3 = call double @rint(double %2) nounwind
+ %4 = fcmp oeq double %1, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test12
+; CHECK-NEXT: %rintf = call float @rintf(float %x)
+; CHECK-NEXT: fcmp oeq float %rintf, %y
+}
+
+define i32 @test13(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %y to double
+ %2 = fpext float %x to double
+ %3 = call double @round(double %2) nounwind readnone
+ %4 = fcmp oeq double %1, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test13
+; CHECK-NEXT: %roundf = call float @roundf(float %x)
+; CHECK-NEXT: fcmp oeq float %roundf, %y
+}
+
+define i32 @test14(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %y to double
+ %2 = fpext float %x to double
+ %3 = call double @trunc(double %2) nounwind
+ %4 = fcmp oeq double %1, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test14
+; CHECK-NEXT: %truncf = call float @truncf(float %x)
+; CHECK-NEXT: fcmp oeq float %truncf, %y
+}
+
+declare double @fabs(double) nounwind readnone
+declare double @ceil(double) nounwind readnone
+declare double @floor(double) nounwind readnone
+declare double @nearbyint(double) nounwind readnone
+declare double @rint(double) nounwind readnone
+declare double @round(double) nounwind readnone
+declare double @trunc(double) nounwind readnone
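The tests above all exercise the same shrink: a double-precision libm rounding/abs call whose argument is an fpext from float, compared against another fpext, can be rewritten to the float variant with no change in value, because fpext is exact and the result of these functions on a float-valued input is exactly representable as float. A minimal C++ sketch of the before/after shapes (function names here are illustrative, not from the patch):

#include <cmath>

// Before: everything widened to double.
bool cmpBefore(float x, float y) {
  return std::trunc((double)x) == (double)y;
}

// After: the float variant computes the same value, so the comparison
// can be done entirely in float.
bool cmpAfter(float x, float y) {
  return truncf(x) == y;
}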
diff --git a/tools/gold/Makefile b/tools/gold/Makefile
index 02f66d7..496e31c 100644
--- a/tools/gold/Makefile
+++ b/tools/gold/Makefile
@@ -24,6 +24,8 @@
# Because off_t is used in the public API, the largefile parts are required for
# ABI compatibility.
CXXFLAGS += -I$(BINUTILS_INCDIR) -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64
-CXXFLAGS += -L$(SharedLibDir)/$(SharedPrefix) -lLTO
+LDFLAGS += -L$(SharedLibDir)/$(SharedPrefix)
include $(LEVEL)/Makefile.common
+
+LIBS += -lLTO
diff --git a/tools/lli/CMakeLists.txt b/tools/lli/CMakeLists.txt
index a5d2e61..68cb921 100644
--- a/tools/lli/CMakeLists.txt
+++ b/tools/lli/CMakeLists.txt
@@ -19,4 +19,6 @@
add_llvm_tool(lli
lli.cpp
+ RecordingMemoryManager.cpp
+ RemoteTarget.cpp
)
diff --git a/tools/lli/RecordingMemoryManager.cpp b/tools/lli/RecordingMemoryManager.cpp
new file mode 100644
index 0000000..9e1cff5
--- /dev/null
+++ b/tools/lli/RecordingMemoryManager.cpp
@@ -0,0 +1,87 @@
+//===- RecordingMemoryManager.cpp - Recording memory manager --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This memory manager allocates local storage and keeps a record of each
+// allocation. Iterators are provided for all data and code allocations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RecordingMemoryManager.h"
+using namespace llvm;
+
+uint8_t *RecordingMemoryManager::
+allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID) {
+ // The recording memory manager is just a local copy of the remote target.
+ // The alignment requirement is just stored here for later use. Regular
+ // heap storage is sufficient here.
+ void *Addr = malloc(Size);
+ assert(Addr && "malloc() failure!");
+ sys::MemoryBlock Block(Addr, Size);
+ AllocatedCodeMem.push_back(Allocation(Block, Alignment));
+ return (uint8_t*)Addr;
+}
+
+uint8_t *RecordingMemoryManager::
+allocateDataSection(uintptr_t Size, unsigned Alignment, unsigned SectionID) {
+ // The recording memory manager is just a local copy of the remote target.
+ // The alignment requirement is just stored here for later use. Regular
+ // heap storage is sufficient here.
+ void *Addr = malloc(Size);
+ assert(Addr && "malloc() failure!");
+ sys::MemoryBlock Block(Addr, Size);
+ AllocatedDataMem.push_back(Allocation(Block, Alignment));
+ return (uint8_t*)Addr;
+}
+void RecordingMemoryManager::setMemoryWritable() { llvm_unreachable("Unexpected!"); }
+void RecordingMemoryManager::setMemoryExecutable() { llvm_unreachable("Unexpected!"); }
+void RecordingMemoryManager::setPoisonMemory(bool poison) { llvm_unreachable("Unexpected!"); }
+void RecordingMemoryManager::AllocateGOT() { llvm_unreachable("Unexpected!"); }
+uint8_t *RecordingMemoryManager::getGOTBase() const {
+ llvm_unreachable("Unexpected!");
+ return 0;
+}
+uint8_t *RecordingMemoryManager::startFunctionBody(const Function *F, uintptr_t &ActualSize){
+ llvm_unreachable("Unexpected!");
+ return 0;
+}
+uint8_t *RecordingMemoryManager::allocateStub(const GlobalValue* F, unsigned StubSize,
+ unsigned Alignment) {
+ llvm_unreachable("Unexpected!");
+ return 0;
+}
+void RecordingMemoryManager::endFunctionBody(const Function *F, uint8_t *FunctionStart,
+ uint8_t *FunctionEnd) {
+ llvm_unreachable("Unexpected!");
+}
+uint8_t *RecordingMemoryManager::allocateSpace(intptr_t Size, unsigned Alignment) {
+ llvm_unreachable("Unexpected!");
+ return 0;
+}
+uint8_t *RecordingMemoryManager::allocateGlobal(uintptr_t Size, unsigned Alignment) {
+ llvm_unreachable("Unexpected!");
+ return 0;
+}
+void RecordingMemoryManager::deallocateFunctionBody(void *Body) {
+ llvm_unreachable("Unexpected!");
+}
+uint8_t* RecordingMemoryManager::startExceptionTable(const Function* F, uintptr_t &ActualSize) {
+ llvm_unreachable("Unexpected!");
+ return 0;
+}
+void RecordingMemoryManager::endExceptionTable(const Function *F, uint8_t *TableStart,
+ uint8_t *TableEnd, uint8_t* FrameRegister) {
+ llvm_unreachable("Unexpected!");
+}
+void RecordingMemoryManager::deallocateExceptionTable(void *ET) {
+ llvm_unreachable("Unexpected!");
+}
+void *RecordingMemoryManager::getPointerToNamedFunction(const std::string &Name,
+ bool AbortOnFailure) {
+ return NULL;
+}
diff --git a/tools/lli/RecordingMemoryManager.h b/tools/lli/RecordingMemoryManager.h
new file mode 100644
index 0000000..1590235
--- /dev/null
+++ b/tools/lli/RecordingMemoryManager.h
@@ -0,0 +1,78 @@
+//===- RecordingMemoryManager.h - LLI MCJIT recording memory manager ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This memory manager allocates local storage and keeps a record of each
+// allocation. Iterators are provided for all data and code allocations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef RECORDINGMEMORYMANAGER_H
+#define RECORDINGMEMORYMANAGER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Memory.h"
+#include <utility>
+
+namespace llvm {
+
+class RecordingMemoryManager : public JITMemoryManager {
+public:
+ typedef std::pair<sys::MemoryBlock, unsigned> Allocation;
+
+private:
+ SmallVector<Allocation, 16> AllocatedDataMem;
+ SmallVector<Allocation, 16> AllocatedCodeMem;
+
+public:
+ RecordingMemoryManager() {}
+ virtual ~RecordingMemoryManager() {}
+
+ typedef SmallVectorImpl<Allocation>::const_iterator const_data_iterator;
+ typedef SmallVectorImpl<Allocation>::const_iterator const_code_iterator;
+
+ const_data_iterator data_begin() const { return AllocatedDataMem.begin(); }
+ const_data_iterator data_end() const { return AllocatedDataMem.end(); }
+ const_code_iterator code_begin() const { return AllocatedCodeMem.begin(); }
+ const_code_iterator code_end() const { return AllocatedCodeMem.end(); }
+
+ uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
+ unsigned SectionID);
+
+ uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+ unsigned SectionID);
+
+ void *getPointerToNamedFunction(const std::string &Name,
+ bool AbortOnFailure = true);
+ // The following obsolete JITMemoryManager calls are stubbed out for
+ // this model.
+ void setMemoryWritable();
+ void setMemoryExecutable();
+ void setPoisonMemory(bool poison);
+ void AllocateGOT();
+ uint8_t *getGOTBase() const;
+ uint8_t *startFunctionBody(const Function *F, uintptr_t &ActualSize);
+ uint8_t *allocateStub(const GlobalValue* F, unsigned StubSize,
+ unsigned Alignment);
+ void endFunctionBody(const Function *F, uint8_t *FunctionStart,
+ uint8_t *FunctionEnd);
+ uint8_t *allocateSpace(intptr_t Size, unsigned Alignment);
+ uint8_t *allocateGlobal(uintptr_t Size, unsigned Alignment);
+ void deallocateFunctionBody(void *Body);
+ uint8_t* startExceptionTable(const Function* F, uintptr_t &ActualSize);
+ void endExceptionTable(const Function *F, uint8_t *TableStart,
+ uint8_t *TableEnd, uint8_t* FrameRegister);
+ void deallocateExceptionTable(void *ET);
+
+};
+
+} // end namespace llvm
+
+#endif
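A minimal sketch of how a client can consume the recorded allocations via the iterators declared above (totalCodeSize is an illustrative helper, not part of the patch):

#include "RecordingMemoryManager.h"

// Sum the space the JIT requested for code, honoring each allocation's
// recorded alignment (Allocation = pair<sys::MemoryBlock, unsigned>).
uint64_t totalCodeSize(const llvm::RecordingMemoryManager &MM) {
  uint64_t Offset = 0;
  for (llvm::RecordingMemoryManager::const_code_iterator I = MM.code_begin(),
       E = MM.code_end(); I != E; ++I) {
    unsigned Align = I->second;                  // recorded alignment
    Offset = (Offset + Align - 1) / Align * Align;
    Offset += I->first.size();                   // block size in bytes
  }
  return Offset;
}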
diff --git a/tools/lli/RemoteTarget.cpp b/tools/lli/RemoteTarget.cpp
new file mode 100644
index 0000000..918f157
--- /dev/null
+++ b/tools/lli/RemoteTarget.cpp
@@ -0,0 +1,61 @@
+//===- RemoteTarget.cpp - LLVM Remote process JIT execution --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the RemoteTarget class which executes JITed code in a
+// separate address range from where it was built.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RemoteTarget.h"
+#include <llvm/ADT/StringRef.h>
+#include <llvm/Support/DataTypes.h>
+#include <llvm/Support/Memory.h>
+#include <stdlib.h>
+#include <string>
+using namespace llvm;
+
+bool RemoteTarget::allocateSpace(size_t Size, unsigned Alignment,
+ uint64_t &Address) {
+ sys::MemoryBlock *Prev = Allocations.size() ? &Allocations.back() : NULL;
+ sys::MemoryBlock Mem = sys::Memory::AllocateRWX(Size, Prev, &ErrorMsg);
+ if (Mem.base() == NULL)
+ return true;
+ if ((uintptr_t)Mem.base() % Alignment) {
+ ErrorMsg = "unable to allocate sufficiently aligned memory";
+ return true;
+ }
+ Address = reinterpret_cast<uint64_t>(Mem.base());
+ return false;
+}
+
+bool RemoteTarget::loadData(uint64_t Address, const void *Data, size_t Size) {
+ memcpy((void*)Address, Data, Size);
+ sys::MemoryBlock Mem((void*)Address, Size);
+ sys::Memory::setExecutable(Mem, &ErrorMsg);
+ return false;
+}
+
+bool RemoteTarget::loadCode(uint64_t Address, const void *Data, size_t Size) {
+ memcpy((void*)Address, Data, Size);
+ return false;
+}
+
+bool RemoteTarget::executeCode(uint64_t Address, int &RetVal) {
+ int (*fn)(void) = (int(*)(void))Address;
+ RetVal = fn();
+ return false;
+}
+
+void RemoteTarget::create() {
+}
+
+void RemoteTarget::stop() {
+ for (unsigned i = 0, e = Allocations.size(); i != e; ++i)
+ sys::Memory::ReleaseRWX(Allocations[i]);
+}
diff --git a/tools/lli/RemoteTarget.h b/tools/lli/RemoteTarget.h
new file mode 100644
index 0000000..c584526
--- /dev/null
+++ b/tools/lli/RemoteTarget.h
@@ -0,0 +1,101 @@
+//===- RemoteTarget.h - LLVM Remote process JIT execution ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of the RemoteTarget class which executes JITed code in a
+// separate address range from where it was built.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef REMOTEPROCESS_H
+#define REMOTEPROCESS_H
+
+#include <llvm/ADT/StringRef.h>
+#include <llvm/ADT/SmallVector.h>
+#include <llvm/Support/DataTypes.h>
+#include <llvm/Support/Memory.h>
+#include <stdlib.h>
+#include <string>
+
+namespace llvm {
+
+class RemoteTarget {
+ std::string ErrorMsg;
+ bool IsRunning;
+
+ SmallVector<sys::MemoryBlock, 16> Allocations;
+
+public:
+ StringRef getErrorMsg() const { return ErrorMsg; }
+
+ /// Allocate space in the remote target address space.
+ ///
+ /// @param Size Amount of space, in bytes, to allocate.
+ /// @param Alignment Required minimum alignment for allocated space.
+ /// @param[out] Address Remote address of the allocated memory.
+ ///
+ /// @returns False on success. On failure, ErrorMsg is updated with
+ /// descriptive text of the encountered error.
+ bool allocateSpace(size_t Size, unsigned Alignment, uint64_t &Address);
+
+ /// Load data into the target address space.
+ ///
+ /// @param Address Destination address in the target process.
+ /// @param Data Source address in the host process.
+ /// @param Size Number of bytes to copy.
+ ///
+ /// @returns False on success. On failure, ErrorMsg is updated with
+ /// descriptive text of the encountered error.
+ bool loadData(uint64_t Address, const void *Data, size_t Size);
+
+ /// Load code into the target address space and prepare it for execution.
+ ///
+ /// @param Address Destination address in the target process.
+ /// @param Data Source address in the host process.
+ /// @param Size Number of bytes to copy.
+ ///
+ /// @returns False on success. On failure, ErrorMsg is updated with
+ /// descriptive text of the encountered error.
+ bool loadCode(uint64_t Address, const void *Data, size_t Size);
+
+ /// Execute code in the target process. The called function is required
+ /// to have the signature "int (*)(void)".
+ ///
+ /// @param Address Address of the loaded function in the target
+ /// process.
+ /// @param[out] RetVal The integer return value of the called function.
+ ///
+ /// @returns False on success. On failure, ErrorMsg is updated with
+ /// descriptive text of the encountered error.
+ bool executeCode(uint64_t Address, int &RetVal);
+
+ /// Minimum alignment for memory permissions. Used to separate code and
+ /// data regions to make sure data doesn't get marked as code or vice
+ /// versa.
+ ///
+ /// @returns The minimum page alignment, 4096 bytes (4k) by default.
+ unsigned getPageAlignment() { return 4096; }
+
+ /// Start the remote process.
+ void create();
+
+ /// Terminate the remote process.
+ void stop();
+
+ RemoteTarget() : ErrorMsg(""), IsRunning(false) {}
+ ~RemoteTarget() { if (IsRunning) stop(); }
+
+private:
+ // Main processing function for the remote target process. Command messages
+ // are received on file descriptor CmdFD and responses come back on OutFD.
+ static void doRemoteTargeting(int CmdFD, int OutFD);
+};
+
+} // end namespace llvm
+
+#endif
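Taken together, the API above is meant to be driven in a fixed sequence; a condensed sketch (runRemotely, LocalCode, and CodeSize are placeholders, and every call reports failure by returning true):

#include "RemoteTarget.h"
#include "llvm/Support/ErrorHandling.h"

void runRemotely(const void *LocalCode, size_t CodeSize) {
  llvm::RemoteTarget Target;
  Target.create();
  uint64_t Addr;
  if (Target.allocateSpace(CodeSize, Target.getPageAlignment(), Addr) ||
      Target.loadCode(Addr, LocalCode, CodeSize))
    llvm::report_fatal_error(Target.getErrorMsg());
  int RetVal;
  if (Target.executeCode(Addr, RetVal))
    llvm::report_fatal_error(Target.getErrorMsg());
  Target.stop();
}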
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index b6c9299..4004b6c 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -13,6 +13,9 @@
//
//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "lli"
+#include "RecordingMemoryManager.h"
+#include "RemoteTarget.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Type.h"
@@ -32,9 +35,11 @@
#include "llvm/Support/PluginLoader.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/Memory.h"
#include <cerrno>
@@ -73,6 +78,13 @@
"use-mcjit", cl::desc("Enable use of the MC-based JIT (if available)"),
cl::init(false));
+ // The MCJIT supports building for a target address space separate from
+ // the JIT compilation process. Use a forked process and a copying
+ // memory manager with IPC to execute using this functionality.
+ cl::opt<bool> RemoteMCJIT("remote-mcjit",
+ cl::desc("Execute MCJIT'ed code in a separate process."),
+ cl::init(false));
+
// Determine optimization level.
cl::opt<char>
OptLevel("O",
@@ -372,6 +384,79 @@
free(AllocatedDataMem[i].base());
}
+
+void layoutRemoteTargetMemory(RemoteTarget *T, RecordingMemoryManager *JMM) {
+ // Lay out our sections in order, with all the code sections first, then
+ // all the data sections.
+ uint64_t CurOffset = 0;
+ unsigned MaxAlign = T->getPageAlignment();
+ SmallVector<std::pair<const void*, uint64_t>, 16> Offsets;
+ SmallVector<unsigned, 16> Sizes;
+ for (RecordingMemoryManager::const_code_iterator I = JMM->code_begin(),
+ E = JMM->code_end();
+ I != E; ++I) {
+ DEBUG(dbgs() << "code region: size " << I->first.size()
+ << ", alignment " << I->second << "\n");
+ // Align the current offset up to whatever is needed for the next
+ // section.
+ unsigned Align = I->second;
+ CurOffset = (CurOffset + Align - 1) / Align * Align;
+ // Save off the address of the new section and allocate its space.
+ Offsets.push_back(std::pair<const void*,uint64_t>(I->first.base(), CurOffset));
+ Sizes.push_back(I->first.size());
+ CurOffset += I->first.size();
+ }
+ // Adjust to keep code and data aligned on separate pages.
+ CurOffset = (CurOffset + MaxAlign - 1) / MaxAlign * MaxAlign;
+ unsigned FirstDataIndex = Offsets.size();
+ for (RecordingMemoryManager::const_data_iterator I = JMM->data_begin(),
+ E = JMM->data_end();
+ I != E; ++I) {
+ DEBUG(dbgs() << "data region: size " << I->first.size()
+ << ", alignment " << I->second << "\n");
+ // Align the current offset up to whatever is needed for the next
+ // section.
+ unsigned Align = I->second;
+ CurOffset = (CurOffset + Align - 1) / Align * Align;
+ // Save off the address of the new section and allocate its space.
+ Offsets.push_back(std::pair<const void*,uint64_t>(I->first.base(), CurOffset));
+ Sizes.push_back(I->first.size());
+ CurOffset += I->first.size();
+ }
+
+ // Allocate space in the remote target.
+ uint64_t RemoteAddr;
+ if (T->allocateSpace(CurOffset, MaxAlign, RemoteAddr))
+ report_fatal_error(T->getErrorMsg());
+ // Map the section addresses so relocations will get updated in the local
+ // copies of the sections.
+ for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
+ uint64_t Addr = RemoteAddr + Offsets[i].second;
+ EE->mapSectionAddress(const_cast<void*>(Offsets[i].first), Addr);
+
+ DEBUG(dbgs() << " Mapping local: " << Offsets[i].first
+ << " to remote: " << format("%#018x", Addr) << "\n");
+
+ }
+ // Now load it all to the target.
+ for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
+ uint64_t Addr = RemoteAddr + Offsets[i].second;
+
+ if (i < FirstDataIndex) {
+ T->loadCode(Addr, Offsets[i].first, Sizes[i]);
+
+ DEBUG(dbgs() << " loading code: " << Offsets[i].first
+ << " to remote: " << format("%#018x", Addr) << "\n");
+ } else {
+ T->loadData(Addr, Offsets[i].first, Sizes[i]);
+
+ DEBUG(dbgs() << " loading data: " << Offsets[i].first
+ << " to remote: " << format("%#018x", Addr) << "\n");
+ }
+
+ }
+}
+
//===----------------------------------------------------------------------===//
// main Driver function
//
@@ -428,12 +513,19 @@
Mod->setTargetTriple(Triple::normalize(TargetTriple));
// Enable MCJIT if desired.
- LLIMCJITMemoryManager *JMM = 0;
+ JITMemoryManager *JMM = 0;
if (UseMCJIT && !ForceInterpreter) {
builder.setUseMCJIT(true);
- JMM = new LLIMCJITMemoryManager();
+ if (RemoteMCJIT)
+ JMM = new RecordingMemoryManager();
+ else
+ JMM = new LLIMCJITMemoryManager();
builder.setJITMemoryManager(JMM);
} else {
+ if (RemoteMCJIT) {
+ errs() << "error: Remote process execution requires -use-mcjit\n";
+ exit(1);
+ }
builder.setJITMemoryManager(ForceInterpreter ? 0 :
JITMemoryManager::CreateDefaultMemManager());
}
@@ -451,11 +543,14 @@
}
builder.setOptLevel(OLvl);
- TargetOptions Options;
- Options.JITExceptionHandling = EnableJITExceptionHandling;
- Options.JITEmitDebugInfo = EmitJitDebugInfo;
- Options.JITEmitDebugInfoToDisk = EmitJitDebugInfoToDisk;
- builder.setTargetOptions(Options);
+ // Remote target execution doesn't handle EH or debug registration.
+ if (!RemoteMCJIT) {
+ TargetOptions Options;
+ Options.JITExceptionHandling = EnableJITExceptionHandling;
+ Options.JITEmitDebugInfo = EmitJitDebugInfo;
+ Options.JITEmitDebugInfoToDisk = EmitJitDebugInfoToDisk;
+ builder.setTargetOptions(Options);
+ }
EE = builder.create();
if (!EE) {
@@ -466,10 +561,6 @@
exit(1);
}
- // Clear instruction cache before code will be executed.
- if (JMM)
- JMM->invalidateInstructionCache();
-
// The following functions have no effect if their respective profiling
// support wasn't enabled in the build configuration.
EE->RegisterJITEventListener(
@@ -477,6 +568,10 @@
EE->RegisterJITEventListener(
JITEventListener::createIntelJITEventListener());
+ if (!NoLazyCompilation && RemoteMCJIT) {
+ errs() << "warning: remote mcjit does not support lazy compilation\n";
+ NoLazyCompilation = true;
+ }
EE->DisableLazyCompilation(NoLazyCompilation);
// If the user specifically requested an argv[0] to pass into the program,
@@ -513,8 +608,13 @@
// Reset errno to zero on entry to main.
errno = 0;
+ // Remote target MCJIT doesn't (yet) support static constructors. No reason
+ // it couldn't. This is a limitation of the LLI implementation, not the
+ // MCJIT itself. FIXME.
+ //
// Run static constructors.
- EE->runStaticConstructorsDestructors(false);
+ if (!RemoteMCJIT)
+ EE->runStaticConstructorsDestructors(false);
if (NoLazyCompilation) {
for (Module::iterator I = Mod->begin(), E = Mod->end(); I != E; ++I) {
@@ -524,24 +624,66 @@
}
}
- // Run main.
- int Result = EE->runFunctionAsMain(EntryFn, InputArgv, envp);
+ int Result;
+ if (RemoteMCJIT) {
+ RecordingMemoryManager *MM = static_cast<RecordingMemoryManager*>(JMM);
+ // Everything is prepared now, so lay out our program for the target
+ // address space, assign the section addresses to resolve any relocations,
+ // and send it to the target.
+ RemoteTarget Target;
+ Target.create();
- // Run static destructors.
- EE->runStaticConstructorsDestructors(true);
+ // Ask for a pointer to the entry function. This triggers the actual
+ // compilation.
+ (void)EE->getPointerToFunction(EntryFn);
- // If the program didn't call exit explicitly, we should call it now.
- // This ensures that any atexit handlers get called correctly.
- if (Function *ExitF = dyn_cast<Function>(Exit)) {
- std::vector<GenericValue> Args;
- GenericValue ResultGV;
- ResultGV.IntVal = APInt(32, Result);
- Args.push_back(ResultGV);
- EE->runFunction(ExitF, Args);
- errs() << "ERROR: exit(" << Result << ") returned!\n";
- abort();
+ // Enough has been compiled to execute the entry function now, so
+ // layout the target memory.
+ layoutRemoteTargetMemory(&Target, MM);
+
+ // Since we're executing in a (at least simulated) remote address space,
+ // we can't use the ExecutionEngine::runFunctionAsMain(). We have to
+ // grab the function address directly here and tell the remote target
+ // to execute the function.
+ // FIXME: argv and envp handling.
+ uint64_t Entry = (uint64_t)EE->getPointerToFunction(EntryFn);
+
+ DEBUG(dbgs() << "Executing '" << EntryFn->getName() << "' at "
+ << format("%#18x", Entry) << "\n");
+
+ if (Target.executeCode(Entry, Result))
+ errs() << "ERROR: " << Target.getErrorMsg() << "\n";
+
+ Target.stop();
} else {
- errs() << "ERROR: exit defined with wrong prototype!\n";
- abort();
+ // Clear instruction cache before code will be executed.
+ if (JMM)
+ static_cast<LLIMCJITMemoryManager*>(JMM)->invalidateInstructionCache();
+
+ // Run main.
+ Result = EE->runFunctionAsMain(EntryFn, InputArgv, envp);
}
+
+ // Like static constructors, the remote target MCJIT support doesn't handle
+ // this yet. It could. FIXME.
+ if (!RemoteMCJIT) {
+ // Run static destructors.
+ EE->runStaticConstructorsDestructors(true);
+
+ // If the program didn't call exit explicitly, we should call it now.
+ // This ensures that any atexit handlers get called correctly.
+ if (Function *ExitF = dyn_cast<Function>(Exit)) {
+ std::vector<GenericValue> Args;
+ GenericValue ResultGV;
+ ResultGV.IntVal = APInt(32, Result);
+ Args.push_back(ResultGV);
+ EE->runFunction(ExitF, Args);
+ errs() << "ERROR: exit(" << Result << ") returned!\n";
+ abort();
+ } else {
+ errs() << "ERROR: exit defined with wrong prototype!\n";
+ abort();
+ }
+ }
+ return Result;
}
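layoutRemoteTargetMemory() above leans on one integer idiom twice: rounding an offset up to the next multiple of an alignment. Standalone, with a couple of worked values:

// Round Offset up to the next multiple of Align (Align > 0).
uint64_t alignUp(uint64_t Offset, unsigned Align) {
  return (Offset + Align - 1) / Align * Align;
}
// alignUp(13, 8) == 16 and alignUp(16, 8) == 16; with Align = 4096 (the
// page alignment) this is what pushes the first data section onto its own
// page after the code sections.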
diff --git a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
index d630087..8109ca4 100644
--- a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
+++ b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
@@ -40,7 +40,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/system_error.h"
-#include <cstdio>
+
#include <map>
#include <algorithm>
using namespace llvm;
@@ -463,11 +463,11 @@
}
static void PrintSize(double Bits) {
- fprintf(stderr, "%.2f/%.2fB/%luW", Bits, Bits/8,(unsigned long)(Bits/32));
+ outs() << format("%.2f/%.2fB/%luW", Bits, Bits/8,(unsigned long)(Bits/32));
}
static void PrintSize(uint64_t Bits) {
- fprintf(stderr, "%lub/%.2fB/%luW", (unsigned long)Bits,
- (double)Bits/8, (unsigned long)(Bits/32));
+ outs() << format("%lub/%.2fB/%luW", (unsigned long)Bits,
+ (double)Bits/8, (unsigned long)(Bits/32));
}
@@ -483,7 +483,7 @@
if (MemBuf->getBufferSize() & 3)
return Error("Bitcode stream should be a multiple of 4 bytes in length");
- const unsigned char *BufPtr = (unsigned char *)MemBuf->getBufferStart();
+ const unsigned char *BufPtr = (const unsigned char *)MemBuf->getBufferStart();
const unsigned char *EndBufPtr = BufPtr+MemBuf->getBufferSize();
// If we have a wrapper header, parse it and ignore the non-bc file contents.
@@ -556,7 +556,7 @@
PrintSize(Stats.NumBits);
outs() << "\n";
double pct = (Stats.NumBits * 100.0) / BufferSizeBits;
- errs() << " Percent of file: " << format("%2.4f%%", pct) << "\n";
+ outs() << " Percent of file: " << format("%2.4f%%", pct) << "\n";
if (Stats.NumInstances > 1) {
outs() << " Average Size: ";
PrintSize(Stats.NumBits/(double)Stats.NumInstances);
@@ -588,24 +588,26 @@
std::reverse(FreqPairs.begin(), FreqPairs.end());
outs() << "\tRecord Histogram:\n";
- fprintf(stderr, "\t\t Count # Bits %% Abv Record Kind\n");
+ outs() << "\t\t Count # Bits %% Abv Record Kind\n";
for (unsigned i = 0, e = FreqPairs.size(); i != e; ++i) {
const PerRecordStats &RecStats = Stats.CodeFreq[FreqPairs[i].second];
- fprintf(stderr, "\t\t%7d %9lu ", RecStats.NumInstances,
- (unsigned long)RecStats.TotalBits);
+ outs() << format("\t\t%7d %9lu",
+ RecStats.NumInstances,
+ (unsigned long)RecStats.TotalBits);
if (RecStats.NumAbbrev)
- fprintf(stderr, "%7.2f ",
- (double)RecStats.NumAbbrev/RecStats.NumInstances*100);
+ outs() <<
+ format("%7.2f ",
+ (double)RecStats.NumAbbrev/RecStats.NumInstances*100);
else
- fprintf(stderr, " ");
+ outs() << " ";
if (const char *CodeName =
GetCodeName(FreqPairs[i].second, I->first, StreamFile))
- fprintf(stderr, "%s\n", CodeName);
+ outs() << CodeName << "\n";
else
- fprintf(stderr, "UnknownCode%d\n", FreqPairs[i].second);
+ outs() << "UnknownCode" << FreqPairs[i].second << "\n";
}
outs() << "\n";
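The conversions above all follow one pattern: llvm::format() wraps a printf-style format string plus its arguments into an object that renders when streamed, so the old stderr fprintf calls become outs() insertions. A self-contained sketch:

#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"

void printSize(uint64_t Bits) {
  // format() defers printf-style rendering until streamed into outs().
  llvm::outs() << llvm::format("%lub/%.2fB/%luW", (unsigned long)Bits,
                               (double)Bits / 8, (unsigned long)(Bits / 32))
               << "\n";
}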
diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index ec0b4ae..38c3a1e 100644
--- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -44,6 +44,18 @@
cl::desc("Print function names as well as line information "
"for a given address"));
+static cl::opt<bool>
+PrintInlining("inlining", cl::init(false),
+ cl::desc("Print all inlined frames for a given address"));
+
+static void PrintDILineInfo(DILineInfo dli) {
+ if (PrintFunctions)
+ outs() << (dli.getFunctionName() ? dli.getFunctionName() : "<unknown>")
+ << "\n";
+ outs() << (dli.getFileName() ? dli.getFileName() : "<unknown>") << ':'
+ << dli.getLine() << ':' << dli.getColumn() << '\n';
+}
+
static void DumpInput(const StringRef &Filename) {
OwningPtr<MemoryBuffer> Buff;
@@ -59,6 +71,7 @@
StringRef DebugLineSection;
StringRef DebugArangesSection;
StringRef DebugStringSection;
+ StringRef DebugRangesSection;
error_code ec;
for (section_iterator i = Obj->begin_sections(),
@@ -82,6 +95,8 @@
DebugArangesSection = data;
else if (name == "debug_str")
DebugStringSection = data;
+ else if (name == "debug_ranges")
+ DebugRangesSection = data;
}
OwningPtr<DIContext> dictx(DIContext::getDWARFContext(/*FIXME*/true,
@@ -89,7 +104,8 @@
DebugAbbrevSection,
DebugArangesSection,
DebugLineSection,
- DebugStringSection));
+ DebugStringSection,
+ DebugRangesSection));
if (Address == -1ULL) {
outs() << Filename
<< ":\tfile format " << Obj->getFileFormatName() << "\n\n";
@@ -97,16 +113,27 @@
dictx->dump(outs());
} else {
// Print line info for the specified address.
- int spec_flags = DILineInfoSpecifier::FileLineInfo |
- DILineInfoSpecifier::AbsoluteFilePath;
+ int SpecFlags = DILineInfoSpecifier::FileLineInfo |
+ DILineInfoSpecifier::AbsoluteFilePath;
if (PrintFunctions)
- spec_flags |= DILineInfoSpecifier::FunctionName;
- DILineInfo dli = dictx->getLineInfoForAddress(Address, spec_flags);
- if (PrintFunctions)
- outs() << (dli.getFunctionName() ? dli.getFunctionName() : "<unknown>")
- << "\n";
- outs() << (dli.getFileName() ? dli.getFileName() : "<unknown>") << ':'
- << dli.getLine() << ':' << dli.getColumn() << '\n';
+ SpecFlags |= DILineInfoSpecifier::FunctionName;
+ if (PrintInlining) {
+ DIInliningInfo InliningInfo = dictx->getInliningInfoForAddress(
+ Address, SpecFlags);
+ uint32_t n = InliningInfo.getNumberOfFrames();
+ if (n == 0) {
+ // Print one empty debug line info in any case.
+ PrintDILineInfo(DILineInfo());
+ } else {
+ for (uint32_t i = 0; i < n; i++) {
+ DILineInfo dli = InliningInfo.getFrame(i);
+ PrintDILineInfo(dli);
+ }
+ }
+ } else {
+ DILineInfo dli = dictx->getLineInfoForAddress(Address, SpecFlags);
+ PrintDILineInfo(dli);
+ }
}
}
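The new -inlining path reduces to one loop over the inlining frames reported for the address; condensed below (printAllFrames is illustrative, and PrintDILineInfo is the helper added above):

void printAllFrames(llvm::DIContext &Ctx, uint64_t Address, int SpecFlags) {
  llvm::DIInliningInfo Info =
      Ctx.getInliningInfoForAddress(Address, SpecFlags);
  uint32_t N = Info.getNumberOfFrames();
  if (N == 0)
    return PrintDILineInfo(llvm::DILineInfo()); // still print one empty record
  for (uint32_t i = 0; i < N; i++)
    PrintDILineInfo(Info.getFrame(i));
}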
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index b431c76..13ea4e3 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -94,6 +94,12 @@
SectionHeadersShorter("h", cl::desc("Alias for --section-headers"),
cl::aliasopt(SectionHeaders));
+static cl::list<std::string>
+MAttrs("mattr",
+ cl::CommaSeparated,
+ cl::desc("Target specific attributes"),
+ cl::value_desc("a1,+a2,-a3,..."));
+
static StringRef ToolName;
static bool error(error_code ec) {
@@ -169,6 +175,15 @@
if (!TheTarget)
return;
+ // Package up features to be passed to target/subtarget
+ std::string FeaturesStr;
+ if (MAttrs.size()) {
+ SubtargetFeatures Features;
+ for (unsigned i = 0; i != MAttrs.size(); ++i)
+ Features.AddFeature(MAttrs[i]);
+ FeaturesStr = Features.getString();
+ }
+
error_code ec;
for (section_iterator i = Obj->begin_sections(),
e = Obj->end_sections();
@@ -233,7 +248,7 @@
}
OwningPtr<const MCSubtargetInfo> STI(
- TheTarget->createMCSubtargetInfo(TripleName, "", ""));
+ TheTarget->createMCSubtargetInfo(TripleName, "", FeaturesStr));
if (!STI) {
errs() << "error: no subtarget info for target " << TripleName << "\n";
diff --git a/tools/lto/LTOCodeGenerator.cpp b/tools/lto/LTOCodeGenerator.cpp
index b80bc34..f8b07b2 100644
--- a/tools/lto/LTOCodeGenerator.cpp
+++ b/tools/lto/LTOCodeGenerator.cpp
@@ -163,13 +163,16 @@
// generate object file
bool genResult = false;
tool_output_file objFile(uniqueObjPath.c_str(), errMsg);
- if (!errMsg.empty())
+ if (!errMsg.empty()) {
+ uniqueObjPath.eraseFromDisk();
return true;
+ }
genResult = this->generateObjectFile(objFile.os(), errMsg);
objFile.os().close();
if (objFile.os().has_error()) {
objFile.os().clear_error();
+ uniqueObjPath.eraseFromDisk();
return true;
}
@@ -196,6 +199,7 @@
OwningPtr<MemoryBuffer> BuffPtr;
if (error_code ec = MemoryBuffer::getFile(name, BuffPtr, -1, false)) {
errMsg = ec.message();
+ sys::Path(_nativeObjectPath).eraseFromDisk();
return NULL;
}
_nativeObjectFile = BuffPtr.take();
diff --git a/tools/lto/LTOModule.cpp b/tools/lto/LTOModule.cpp
index d588f6a..b98bfab 100644
--- a/tools/lto/LTOModule.cpp
+++ b/tools/lto/LTOModule.cpp
@@ -163,7 +163,7 @@
/// isBitcodeFile - Returns 'true' if the file (or memory contents) is LLVM
/// bitcode.
bool LTOModule::isBitcodeFile(const void *mem, size_t length) {
- return llvm::sys::IdentifyFileType((char*)mem, length)
+ return llvm::sys::IdentifyFileType((const char*)mem, length)
== llvm::sys::Bitcode_FileType;
}
@@ -307,7 +307,7 @@
/// makeBuffer - Create a MemoryBuffer from a memory range.
MemoryBuffer *LTOModule::makeBuffer(const void *mem, size_t length) {
- const char *startPtr = (char*)mem;
+ const char *startPtr = (const char*)mem;
return MemoryBuffer::getMemBuffer(StringRef(startPtr, length), "", false);
}
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index 4ada7d1..7ecd25c 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -513,10 +513,6 @@
return 1;
}
- // Allocate a full target machine description only if necessary.
- // FIXME: The choice of target should be controllable on the command line.
- std::auto_ptr<TargetMachine> target;
-
SMDiagnostic Err;
// Load the input module...
diff --git a/unittests/ADT/CMakeLists.txt b/unittests/ADT/CMakeLists.txt
index d272b09..cb9a909 100644
--- a/unittests/ADT/CMakeLists.txt
+++ b/unittests/ADT/CMakeLists.txt
@@ -2,7 +2,7 @@
Support
)
-add_llvm_unittest(ADTTests
+set(ADTSources
APFloatTest.cpp
APIntTest.cpp
BitVectorTest.cpp
@@ -31,3 +31,16 @@
TwineTest.cpp
VariadicFunctionTest.cpp
)
+
+# These sources cannot be compiled with MSVC9 due to a compiler bug.
+if(MSVC AND MSVC_VERSION LESS 1600)
+ set(LLVM_OPTIONAL_SOURCES
+ DenseMapTest.cpp
+ SmallVectorTest.cpp
+ )
+ list(REMOVE_ITEM ADTSources ${LLVM_OPTIONAL_SOURCES})
+endif()
+
+add_llvm_unittest(ADTTests
+ ${ADTSources}
+ )
diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp
index abcec8f..78eb641 100644
--- a/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/utils/TableGen/AsmMatcherEmitter.cpp
@@ -416,7 +416,7 @@
SmallVector<SubtargetFeatureInfo*, 4> RequiredFeatures;
/// ConversionFnKind - The enum value which is passed to the generated
- /// ConvertToMCInst to convert parsed operands into an MCInst for this
+ /// convertToMCInst to convert parsed operands into an MCInst for this
/// function.
std::string ConversionFnKind;
@@ -488,6 +488,15 @@
return false;
}
+ // Give matches that require more features higher precedence. This is useful
+ // because we cannot define AssemblerPredicates with the negation of
+ // processor features. For example, ARM v6 "nop" may be either a HINT or
+ // MOV. With v6, we want to match HINT. The assembler has no way to
+ // predicate MOV under "NoV6", but HINT will always match first because it
+ // requires V6 while MOV does not.
+ if (RequiredFeatures.size() != RHS.RequiredFeatures.size())
+ return RequiredFeatures.size() > RHS.RequiredFeatures.size();
+
return false;
}
@@ -666,7 +675,7 @@
}
static std::pair<StringRef, StringRef>
-parseTwoOperandConstraint(StringRef S, SMLoc Loc) {
+parseTwoOperandConstraint(StringRef S, ArrayRef<SMLoc> Loc) {
// Split via the '='.
std::pair<StringRef, StringRef> Ops = S.split('=');
if (Ops.second == "")
@@ -1638,35 +1647,91 @@
}
}
+static unsigned getConverterOperandID(const std::string &Name,
+ SetVector<std::string> &Table,
+ bool &IsNew) {
+ IsNew = Table.insert(Name);
+
+ unsigned ID = IsNew ? Table.size() - 1 :
+ std::find(Table.begin(), Table.end(), Name) - Table.begin();
+
+ assert(ID < Table.size());
+
+ return ID;
+}
+
+
static void emitConvertToMCInst(CodeGenTarget &Target, StringRef ClassName,
std::vector<MatchableInfo*> &Infos,
raw_ostream &OS) {
- // Write the convert function to a separate stream, so we can drop it after
- // the enum.
- std::string ConvertFnBody;
- raw_string_ostream CvtOS(ConvertFnBody);
-
- // Function we have already generated.
- std::set<std::string> GeneratedFns;
-
- // Start the unified conversion function.
- CvtOS << "bool " << Target.getName() << ClassName << "::\n";
- CvtOS << "ConvertToMCInst(unsigned Kind, MCInst &Inst, "
- << "unsigned Opcode,\n"
- << " const SmallVectorImpl<MCParsedAsmOperand*"
- << "> &Operands) {\n";
- CvtOS << " Inst.setOpcode(Opcode);\n";
- CvtOS << " switch (Kind) {\n";
- CvtOS << " default:\n";
-
- // Start the enum, which we will generate inline.
-
- OS << "// Unified function for converting operands to MCInst instances.\n\n";
- OS << "enum ConversionKind {\n";
+ SetVector<std::string> OperandConversionKinds;
+ SetVector<std::string> InstructionConversionKinds;
+ std::vector<std::vector<uint8_t> > ConversionTable;
+ size_t MaxRowLength = 2; // minimum is custom converter plus terminator.
// TargetOperandClass - This is the target's operand class, like X86Operand.
std::string TargetOperandClass = Target.getName() + "Operand";
+ // Write the convert function to a separate stream, so we can drop it after
+ // the enum. We'll build up the conversion handlers for the individual
+ // operand types opportunistically as we encounter them.
+ std::string ConvertFnBody;
+ raw_string_ostream CvtOS(ConvertFnBody);
+ // Start the unified conversion function.
+ CvtOS << "void " << Target.getName() << ClassName << "::\n"
+ << "convertToMCInst(unsigned Kind, MCInst &Inst, "
+ << "unsigned Opcode,\n"
+ << " const SmallVectorImpl<MCParsedAsmOperand*"
+ << "> &Operands) {\n"
+ << " assert(Kind < CVT_NUM_SIGNATURES && \"Invalid signature!\");\n"
+ << " uint8_t *Converter = ConversionTable[Kind];\n"
+ << " Inst.setOpcode(Opcode);\n"
+ << " for (uint8_t *p = Converter; *p; p+= 2) {\n"
+ << " switch (*p) {\n"
+ << " default: llvm_unreachable(\"invalid conversion entry!\");\n"
+ << " case CVT_Reg:\n"
+ << " static_cast<" << TargetOperandClass
+ << "*>(Operands[*(p + 1)])->addRegOperands(Inst, 1);\n"
+ << " break;\n"
+ << " case CVT_Tied:\n"
+ << " Inst.addOperand(Inst.getOperand(*(p + 1)));\n"
+ << " break;\n";
+
+ std::string OperandFnBody;
+ raw_string_ostream OpOS(OperandFnBody);
+ // Start the operand number lookup function.
+ OpOS << "unsigned " << Target.getName() << ClassName << "::\n"
+ << "getMCInstOperandNumImpl(unsigned Kind, MCInst &Inst,\n"
+ << " const SmallVectorImpl<MCParsedAsmOperand*> "
+ << "&Operands,\n unsigned OperandNum, unsigned "
+ << "&NumMCOperands) {\n"
+ << " assert(Kind < CVT_NUM_SIGNATURES && \"Invalid signature!\");\n"
+ << " NumMCOperands = 0;\n"
+ << " unsigned MCOperandNum = 0;\n"
+ << " uint8_t *Converter = ConversionTable[Kind];\n"
+ << " for (uint8_t *p = Converter; *p; p+= 2) {\n"
+ << " if (*(p + 1) > OperandNum) continue;\n"
+ << " switch (*p) {\n"
+ << " default: llvm_unreachable(\"invalid conversion entry!\");\n"
+ << " case CVT_Reg:\n"
+ << " if (*(p + 1) == OperandNum) {\n"
+ << " NumMCOperands = 1;\n"
+ << " break;\n"
+ << " }\n"
+ << " ++MCOperandNum;\n"
+ << " break;\n"
+ << " case CVT_Tied:\n"
+ << " // FIXME: Tied operand calculation not supported.\n"
+ << " assert (0 && \"getMCInstOperandNumImpl() doesn't support tied operands, yet!\");\n"
+ << " break;\n";
+
+ // Pre-populate the operand conversion kinds with the standard always
+ // available entries.
+ OperandConversionKinds.insert("CVT_Done");
+ OperandConversionKinds.insert("CVT_Reg");
+ OperandConversionKinds.insert("CVT_Tied");
+ enum { CVT_Done, CVT_Reg, CVT_Tied };
+
for (std::vector<MatchableInfo*>::const_iterator it = Infos.begin(),
ie = Infos.end(); it != ie; ++it) {
MatchableInfo &II = **it;
@@ -1679,24 +1744,35 @@
II.ConversionFnKind = Signature;
// Check if we have already generated this signature.
- if (!GeneratedFns.insert(Signature).second)
+ if (!InstructionConversionKinds.insert(Signature))
continue;
- // If not, emit it now. Add to the enum list.
- OS << " " << Signature << ",\n";
+ // Remember this converter for the kind enum.
+ unsigned KindID = OperandConversionKinds.size();
+ OperandConversionKinds.insert("CVT_" + AsmMatchConverter);
- CvtOS << " case " << Signature << ":\n";
- CvtOS << " return " << AsmMatchConverter
- << "(Inst, Opcode, Operands);\n";
+ // Add the converter row for this instruction.
+ ConversionTable.push_back(std::vector<uint8_t>());
+ ConversionTable.back().push_back(KindID);
+ ConversionTable.back().push_back(CVT_Done);
+
+ // Add the handler to the conversion driver function.
+ CvtOS << " case CVT_" << AsmMatchConverter << ":\n"
+ << " " << AsmMatchConverter << "(Inst, Operands);\n"
+ << " break;\n";
+
+ // FIXME: Handle the operand number lookup for custom match functions.
continue;
}
// Build the conversion function signature.
std::string Signature = "Convert";
- std::string CaseBody;
- raw_string_ostream CaseOS(CaseBody);
+
+ std::vector<uint8_t> ConversionRow;
// Compute the convert enum and the case body.
+ MaxRowLength = std::max(MaxRowLength, II.ResOperands.size()*2 + 1);
+
for (unsigned i = 0, e = II.ResOperands.size(); i != e; ++i) {
const MatchableInfo::ResOperand &OpInfo = II.ResOperands[i];
@@ -1709,74 +1785,186 @@
// Registers are always converted the same, don't duplicate the
// conversion function based on them.
Signature += "__";
- if (Op.Class->isRegisterClass())
- Signature += "Reg";
- else
- Signature += Op.Class->ClassName;
+ std::string Class;
+ Class = Op.Class->isRegisterClass() ? "Reg" : Op.Class->ClassName;
+ Signature += Class;
Signature += utostr(OpInfo.MINumOperands);
Signature += "_" + itostr(OpInfo.AsmOperandNum);
- CaseOS << " ((" << TargetOperandClass << "*)Operands["
- << (OpInfo.AsmOperandNum+1) << "])->" << Op.Class->RenderMethod
- << "(Inst, " << OpInfo.MINumOperands << ");\n";
+ // Add the conversion kind, if necessary, and get the associated ID
+ // (the index of its entry in the vector).
+ std::string Name = "CVT_" + (Op.Class->isRegisterClass() ? "Reg" :
+ Op.Class->RenderMethod);
+
+ bool IsNewConverter = false;
+ unsigned ID = getConverterOperandID(Name, OperandConversionKinds,
+ IsNewConverter);
+
+ // Add the operand entry to the instruction kind conversion row.
+ ConversionRow.push_back(ID);
+ ConversionRow.push_back(OpInfo.AsmOperandNum + 1);
+
+ if (!IsNewConverter)
+ break;
+
+ // This is a new operand kind. Add a handler for it to the
+ // converter driver.
+ CvtOS << " case " << Name << ":\n"
+ << " static_cast<" << TargetOperandClass
+ << "*>(Operands[*(p + 1)])->"
+ << Op.Class->RenderMethod << "(Inst, " << OpInfo.MINumOperands
+ << ");\n"
+ << " break;\n";
+
+ // Add a handler for the operand number lookup.
+ OpOS << " case " << Name << ":\n"
+ << " if (*(p + 1) == OperandNum) {\n"
+ << " NumMCOperands = " << OpInfo.MINumOperands << ";\n"
+ << " break;\n"
+ << " }\n"
+ << " MCOperandNum += " << OpInfo.MINumOperands << ";\n"
+ << " break;\n";
break;
}
-
case MatchableInfo::ResOperand::TiedOperand: {
// If this operand is tied to a previous one, just copy the MCInst
// operand from the earlier one. We can only tie single MCOperand values.
//assert(OpInfo.MINumOperands == 1 && "Not a singular MCOperand");
unsigned TiedOp = OpInfo.TiedOperandNum;
assert(i > TiedOp && "Tied operand precedes its target!");
- CaseOS << " Inst.addOperand(Inst.getOperand(" << TiedOp << "));\n";
Signature += "__Tie" + utostr(TiedOp);
+ ConversionRow.push_back(CVT_Tied);
+ ConversionRow.push_back(TiedOp);
+ // FIXME: Handle the operand number lookup for tied operands.
break;
}
case MatchableInfo::ResOperand::ImmOperand: {
int64_t Val = OpInfo.ImmVal;
- CaseOS << " Inst.addOperand(MCOperand::CreateImm(" << Val << "));\n";
- Signature += "__imm" + itostr(Val);
+ std::string Ty = "imm_" + itostr(Val);
+ Signature += "__" + Ty;
+
+ std::string Name = "CVT_" + Ty;
+ bool IsNewConverter = false;
+ unsigned ID = getConverterOperandID(Name, OperandConversionKinds,
+ IsNewConverter);
+ // Add the operand entry to the instruction kind conversion row.
+ ConversionRow.push_back(ID);
+ ConversionRow.push_back(0);
+
+ if (!IsNewConverter)
+ break;
+
+ CvtOS << " case " << Name << ":\n"
+ << " Inst.addOperand(MCOperand::CreateImm(" << Val << "));\n"
+ << " break;\n";
+
+ OpOS << " case " << Name << ":\n"
+ << " if (*(p + 1) == OperandNum) {\n"
+ << " NumMCOperands = 1;\n"
+ << " break;\n"
+ << " }\n"
+ << " ++MCOperandNum;\n"
+ << " break;\n";
break;
}
case MatchableInfo::ResOperand::RegOperand: {
+ std::string Reg, Name;
if (OpInfo.Register == 0) {
- CaseOS << " Inst.addOperand(MCOperand::CreateReg(0));\n";
- Signature += "__reg0";
+ Name = "reg0";
+ Reg = "0";
} else {
- std::string N = getQualifiedName(OpInfo.Register);
- CaseOS << " Inst.addOperand(MCOperand::CreateReg(" << N << "));\n";
- Signature += "__reg" + OpInfo.Register->getName();
+ Reg = getQualifiedName(OpInfo.Register);
+ Name = "reg" + OpInfo.Register->getName();
}
+ Signature += "__" + Name;
+ Name = "CVT_" + Name;
+ bool IsNewConverter = false;
+ unsigned ID = getConverterOperandID(Name, OperandConversionKinds,
+ IsNewConverter);
+ // Add the operand entry to the instruction kind conversion row.
+ ConversionRow.push_back(ID);
+ ConversionRow.push_back(0);
+
+ if (!IsNewConverter)
+ break;
+ CvtOS << " case " << Name << ":\n"
+ << " Inst.addOperand(MCOperand::CreateReg(" << Reg << "));\n"
+ << " break;\n";
+
+ OpOS << " case " << Name << ":\n"
+ << " if (*(p + 1) == OperandNum) {\n"
+ << " NumMCOperands = 1;\n"
+ << " break;\n"
+ << " }\n"
+ << " ++MCOperandNum;\n"
+ << " break;\n";
}
}
}
+ // If there were no operands, note that in the signature.
+ if (Signature == "Convert")
+ Signature += "_NoOperands";
+
II.ConversionFnKind = Signature;
- // Check if we have already generated this signature.
- if (!GeneratedFns.insert(Signature).second)
+ // Save the signature. If we already have it, don't add a new row
+ // to the table.
+ if (!InstructionConversionKinds.insert(Signature))
continue;
- // If not, emit it now. Add to the enum list.
- OS << " " << Signature << ",\n";
-
- CvtOS << " case " << Signature << ":\n";
- CvtOS << CaseOS.str();
- CvtOS << " return true;\n";
+ // Add the row to the table.
+ ConversionTable.push_back(ConversionRow);
}
- // Finish the convert function.
+ // Finish up the converter driver function.
+ CvtOS << " }\n }\n}\n\n";
- CvtOS << " }\n";
- CvtOS << " return false;\n";
- CvtOS << "}\n\n";
+ // Finish up the operand number lookup function.
+ OpOS << " }\n }\n return MCOperandNum;\n}\n\n";
- // Finish the enum, and drop the convert function after it.
+ OS << "namespace {\n";
- OS << " NumConversionVariants\n";
+ // Output the operand conversion kind enum.
+ OS << "enum OperatorConversionKind {\n";
+ for (unsigned i = 0, e = OperandConversionKinds.size(); i != e; ++i)
+ OS << " " << OperandConversionKinds[i] << ",\n";
+ OS << " CVT_NUM_CONVERTERS\n";
OS << "};\n\n";
+ // Output the instruction conversion kind enum.
+ OS << "enum InstructionConversionKind {\n";
+ for (SetVector<std::string>::const_iterator
+ i = InstructionConversionKinds.begin(),
+ e = InstructionConversionKinds.end(); i != e; ++i)
+ OS << " " << *i << ",\n";
+ OS << " CVT_NUM_SIGNATURES\n";
+ OS << "};\n\n";
+
+
+ OS << "} // end anonymous namespace\n\n";
+
+ // Output the conversion table.
+ OS << "static uint8_t ConversionTable[CVT_NUM_SIGNATURES]["
+ << MaxRowLength << "] = {\n";
+
+ for (unsigned Row = 0, ERow = ConversionTable.size(); Row != ERow; ++Row) {
+ assert(ConversionTable[Row].size() % 2 == 0 && "bad conversion row!");
+ OS << " // " << InstructionConversionKinds[Row] << "\n";
+ OS << " { ";
+ for (unsigned i = 0, e = ConversionTable[Row].size(); i != e; i += 2)
+ OS << OperandConversionKinds[ConversionTable[Row][i]] << ", "
+ << (unsigned)(ConversionTable[Row][i + 1]) << ", ";
+ OS << "CVT_Done },\n";
+ }
+
+ OS << "};\n\n";
+
+ // Spit out the conversion driver function.
OS << CvtOS.str();
+
+ // Spit out the operand number lookup function.
+ OS << OpOS.str();
}
/// emitMatchClassEnumeration - Emit the enumeration for match class kinds.
@@ -2407,14 +2595,19 @@
OS << " // This should be included into the middle of the declaration of\n";
OS << " // your subclasses implementation of MCTargetAsmParser.\n";
OS << " unsigned ComputeAvailableFeatures(uint64_t FeatureBits) const;\n";
- OS << " bool ConvertToMCInst(unsigned Kind, MCInst &Inst, "
+ OS << " void convertToMCInst(unsigned Kind, MCInst &Inst, "
<< "unsigned Opcode,\n"
- << " const SmallVectorImpl<MCParsedAsmOperand*> "
+ << " const SmallVectorImpl<MCParsedAsmOperand*> "
<< "&Operands);\n";
+ OS << " unsigned getMCInstOperandNumImpl(unsigned Kind, MCInst &Inst,\n "
+ << " const "
+ << "SmallVectorImpl<MCParsedAsmOperand*> &Operands,\n "
+ << " unsigned OperandNum, unsigned &NumMCOperands);\n";
OS << " bool MnemonicIsValid(StringRef Mnemonic);\n";
- OS << " unsigned MatchInstructionImpl(\n";
- OS << " const SmallVectorImpl<MCParsedAsmOperand*> &Operands,\n";
- OS << " MCInst &Inst, unsigned &ErrorInfo, unsigned VariantID = 0);\n";
+ OS << " unsigned MatchInstructionImpl(\n"
+ << " const SmallVectorImpl<MCParsedAsmOperand*> &Operands,\n"
+ << " unsigned &Kind, MCInst &Inst, "
+ << "unsigned &ErrorInfo,\n unsigned VariantID = 0);\n";
if (Info.OperandMatchInfo.size()) {
OS << "\n enum OperandMatchResultTy {\n";
@@ -2594,8 +2787,14 @@
<< Target.getName() << ClassName << "::\n"
<< "MatchInstructionImpl(const SmallVectorImpl<MCParsedAsmOperand*>"
<< " &Operands,\n";
- OS << " MCInst &Inst, unsigned &ErrorInfo, ";
- OS << "unsigned VariantID) {\n";
+ OS << " unsigned &Kind, MCInst &Inst, unsigned ";
+ OS << "&ErrorInfo,\n unsigned VariantID) {\n";
+
+ OS << " // Eliminate obvious mismatches.\n";
+ OS << " if (Operands.size() > " << (MaxNumOperands+1) << ") {\n";
+ OS << " ErrorInfo = " << (MaxNumOperands+1) << ";\n";
+ OS << " return Match_InvalidOperand;\n";
+ OS << " }\n\n";
// Emit code to get the available features.
OS << " // Get the current feature set.\n";
@@ -2613,12 +2812,6 @@
}
// Emit code to compute the class list for this operand vector.
- OS << " // Eliminate obvious mismatches.\n";
- OS << " if (Operands.size() > " << (MaxNumOperands+1) << ") {\n";
- OS << " ErrorInfo = " << (MaxNumOperands+1) << ";\n";
- OS << " return Match_InvalidOperand;\n";
- OS << " }\n\n";
-
OS << " // Some state to try to produce better error messages.\n";
OS << " bool HadMatchOtherThanFeatures = false;\n";
OS << " bool HadMatchOtherThanPredicate = false;\n";
@@ -2683,17 +2876,15 @@
OS << " HadMatchOtherThanFeatures = true;\n";
OS << " unsigned NewMissingFeatures = it->RequiredFeatures & "
"~AvailableFeatures;\n";
- OS << " if (CountPopulation_32(NewMissingFeatures) <= "
- "CountPopulation_32(MissingFeatures))\n";
+ OS << " if (CountPopulation_32(NewMissingFeatures) <=\n"
+ " CountPopulation_32(MissingFeatures))\n";
OS << " MissingFeatures = NewMissingFeatures;\n";
OS << " continue;\n";
OS << " }\n";
OS << "\n";
OS << " // We have selected a definite instruction, convert the parsed\n"
<< " // operands into the appropriate MCInst.\n";
- OS << " if (!ConvertToMCInst(it->ConvertFn, Inst,\n"
- << " it->Opcode, Operands))\n";
- OS << " return Match_ConversionFail;\n";
+ OS << " convertToMCInst(it->ConvertFn, Inst, it->Opcode, Operands);\n";
OS << "\n";
// Verify the instruction with the target-specific match predicate function.
@@ -2714,6 +2905,7 @@
if (!InsnCleanupFn.empty())
OS << " " << InsnCleanupFn << "(Inst);\n";
+ OS << " Kind = it->ConvertFn;\n";
OS << " return Match_Success;\n";
OS << " }\n\n";
diff --git a/utils/TableGen/CodeEmitterGen.cpp b/utils/TableGen/CodeEmitterGen.cpp
index eaa6789..db16bda 100644
--- a/utils/TableGen/CodeEmitterGen.cpp
+++ b/utils/TableGen/CodeEmitterGen.cpp
@@ -92,7 +92,7 @@
int CodeEmitterGen::getVariableBit(const std::string &VarName,
BitsInit *BI, int bit) {
if (VarBitInit *VBI = dynamic_cast<VarBitInit*>(BI->getBit(bit))) {
- if (VarInit *VI = dynamic_cast<VarInit*>(VBI->getVariable()))
+ if (VarInit *VI = dynamic_cast<VarInit*>(VBI->getBitVar()))
if (VI->getName() == VarName)
return VBI->getBitNum();
} else if (VarInit *VI = dynamic_cast<VarInit*>(BI->getBit(bit))) {
diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index 34f8a34..8713a56 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -1410,19 +1410,13 @@
// Make sure that the value is representable for this type.
if (Size >= 32) return MadeChange;
- int Val = (II->getValue() << (32-Size)) >> (32-Size);
- if (Val == II->getValue()) return MadeChange;
-
- // If sign-extended doesn't fit, does it fit as unsigned?
- unsigned ValueMask;
- unsigned UnsignedVal;
- ValueMask = unsigned(~uint32_t(0UL) >> (32-Size));
- UnsignedVal = unsigned(II->getValue());
-
- if ((ValueMask & UnsignedVal) == UnsignedVal)
+ // Check that the value doesn't use more bits than we have. It must either
+ // be a sign- or zero-extended equivalent of the original.
+ int64_t SignBitAndAbove = II->getValue() >> (Size - 1);
+ if (SignBitAndAbove == -1 || SignBitAndAbove == 0 || SignBitAndAbove == 1)
return MadeChange;
- TP.error("Integer value '" + itostr(II->getValue())+
+ TP.error("Integer value '" + itostr(II->getValue()) +
"' is out of range for type '" + getEnumName(getType(0)) + "'!");
return MadeChange;
}
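The replacement check above is a compact bit trick: arithmetically shifting the value right by Size-1 leaves only the sign bit and everything above it, which must be all zeros (non-negative, fits signed), all ones (negative, fits sign-extended), or exactly 1 (fits only as a Size-bit unsigned value). Standalone, with worked values:

// Accept V iff it fits in Size bits as a signed or unsigned quantity.
bool fitsInBits(int64_t V, unsigned Size) {
  int64_t SignBitAndAbove = V >> (Size - 1); // arithmetic shift
  return SignBitAndAbove == -1 || SignBitAndAbove == 0 ||
         SignBitAndAbove == 1;
}
// For Size == 8: fitsInBits(-128, 8) and fitsInBits(255, 8) hold;
// fitsInBits(256, 8) and fitsInBits(-129, 8) do not.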
@@ -1581,8 +1575,7 @@
// If the instruction expects a predicate or optional def operand, we
// codegen this by setting the operand to it's default value if it has a
// non-empty DefaultOps field.
- if ((OperandNode->isSubClassOf("PredicateOperand") ||
- OperandNode->isSubClassOf("OptionalDefOperand")) &&
+ if (OperandNode->isSubClassOf("OperandWithDefaultOps") &&
!CDP.getDefaultOperand(OperandNode).DefaultOps.empty())
continue;
@@ -2033,6 +2026,9 @@
// stores, and side effects in many cases by examining an
// instruction's pattern.
InferInstructionFlags();
+
+ // Verify that instruction flags match the patterns.
+ VerifyInstructionFlags();
}
CodeGenDAGPatterns::~CodeGenDAGPatterns() {
@@ -2176,53 +2172,46 @@
}
void CodeGenDAGPatterns::ParseDefaultOperands() {
- std::vector<Record*> DefaultOps[2];
- DefaultOps[0] = Records.getAllDerivedDefinitions("PredicateOperand");
- DefaultOps[1] = Records.getAllDerivedDefinitions("OptionalDefOperand");
+ std::vector<Record*> DefaultOps;
+ DefaultOps = Records.getAllDerivedDefinitions("OperandWithDefaultOps");
// Find some SDNode.
assert(!SDNodes.empty() && "No SDNodes parsed?");
Init *SomeSDNode = DefInit::get(SDNodes.begin()->first);
- for (unsigned iter = 0; iter != 2; ++iter) {
- for (unsigned i = 0, e = DefaultOps[iter].size(); i != e; ++i) {
- DagInit *DefaultInfo = DefaultOps[iter][i]->getValueAsDag("DefaultOps");
+ for (unsigned i = 0, e = DefaultOps.size(); i != e; ++i) {
+ DagInit *DefaultInfo = DefaultOps[i]->getValueAsDag("DefaultOps");
- // Clone the DefaultInfo dag node, changing the operator from 'ops' to
- // SomeSDnode so that we can parse this.
- std::vector<std::pair<Init*, std::string> > Ops;
- for (unsigned op = 0, e = DefaultInfo->getNumArgs(); op != e; ++op)
- Ops.push_back(std::make_pair(DefaultInfo->getArg(op),
- DefaultInfo->getArgName(op)));
- DagInit *DI = DagInit::get(SomeSDNode, "", Ops);
+ // Clone the DefaultInfo dag node, changing the operator from 'ops' to
+ // SomeSDnode so that we can parse this.
+ std::vector<std::pair<Init*, std::string> > Ops;
+ for (unsigned op = 0, e = DefaultInfo->getNumArgs(); op != e; ++op)
+ Ops.push_back(std::make_pair(DefaultInfo->getArg(op),
+ DefaultInfo->getArgName(op)));
+ DagInit *DI = DagInit::get(SomeSDNode, "", Ops);
- // Create a TreePattern to parse this.
- TreePattern P(DefaultOps[iter][i], DI, false, *this);
- assert(P.getNumTrees() == 1 && "This ctor can only produce one tree!");
+ // Create a TreePattern to parse this.
+ TreePattern P(DefaultOps[i], DI, false, *this);
+ assert(P.getNumTrees() == 1 && "This ctor can only produce one tree!");
- // Copy the operands over into a DAGDefaultOperand.
- DAGDefaultOperand DefaultOpInfo;
+ // Copy the operands over into a DAGDefaultOperand.
+ DAGDefaultOperand DefaultOpInfo;
- TreePatternNode *T = P.getTree(0);
- for (unsigned op = 0, e = T->getNumChildren(); op != e; ++op) {
- TreePatternNode *TPN = T->getChild(op);
- while (TPN->ApplyTypeConstraints(P, false))
- /* Resolve all types */;
+ TreePatternNode *T = P.getTree(0);
+ for (unsigned op = 0, e = T->getNumChildren(); op != e; ++op) {
+ TreePatternNode *TPN = T->getChild(op);
+ while (TPN->ApplyTypeConstraints(P, false))
+ /* Resolve all types */;
- if (TPN->ContainsUnresolvedType()) {
- if (iter == 0)
- throw "Value #" + utostr(i) + " of PredicateOperand '" +
- DefaultOps[iter][i]->getName() +"' doesn't have a concrete type!";
- else
- throw "Value #" + utostr(i) + " of OptionalDefOperand '" +
- DefaultOps[iter][i]->getName() +"' doesn't have a concrete type!";
- }
- DefaultOpInfo.DefaultOps.push_back(TPN);
+ if (TPN->ContainsUnresolvedType()) {
+ throw "Value #" + utostr(i) + " of OperandWithDefaultOps '" +
+ DefaultOps[i]->getName() +"' doesn't have a concrete type!";
}
-
- // Insert it into the DefaultOperands map so we can find it later.
- DefaultOperands[DefaultOps[iter][i]] = DefaultOpInfo;
+ DefaultOpInfo.DefaultOps.push_back(TPN);
}
+
+ // Insert it into the DefaultOperands map so we can find it later.
+ DefaultOperands[DefaultOps[i]] = DefaultOpInfo;
}
}
@@ -2367,36 +2356,29 @@
class InstAnalyzer {
const CodeGenDAGPatterns &CDP;
- bool &mayStore;
- bool &mayLoad;
- bool &IsBitcast;
- bool &HasSideEffects;
- bool &IsVariadic;
public:
- InstAnalyzer(const CodeGenDAGPatterns &cdp,
- bool &maystore, bool &mayload, bool &isbc, bool &hse, bool &isv)
- : CDP(cdp), mayStore(maystore), mayLoad(mayload), IsBitcast(isbc),
- HasSideEffects(hse), IsVariadic(isv) {
+ bool hasSideEffects;
+ bool mayStore;
+ bool mayLoad;
+ bool isBitcast;
+ bool isVariadic;
+
+ InstAnalyzer(const CodeGenDAGPatterns &cdp)
+ : CDP(cdp), hasSideEffects(false), mayStore(false), mayLoad(false),
+ isBitcast(false), isVariadic(false) {}
+
+ void Analyze(const TreePattern *Pat) {
+ // Assume only the first tree is the pattern. The others are clobber nodes.
+ AnalyzeNode(Pat->getTree(0));
}
- /// Analyze - Analyze the specified instruction, returning true if the
- /// instruction had a pattern.
- bool Analyze(Record *InstRecord) {
- const TreePattern *Pattern = CDP.getInstruction(InstRecord).getPattern();
- if (Pattern == 0) {
- HasSideEffects = 1;
- return false; // No pattern.
- }
-
- // FIXME: Assume only the first tree is the pattern. The others are clobber
- // nodes.
- AnalyzeNode(Pattern->getTree(0));
- return true;
+ void Analyze(const PatternToMatch *Pat) {
+ AnalyzeNode(Pat->getSrcPattern());
}
private:
bool IsNodeBitcast(const TreePatternNode *N) const {
- if (HasSideEffects || mayLoad || mayStore || IsVariadic)
+ if (hasSideEffects || mayLoad || mayStore || isVariadic)
return false;
if (N->getNumChildren() != 2)
@@ -2418,6 +2400,7 @@
return OpInfo.getEnumName() == "ISD::BITCAST";
}
+public:
void AnalyzeNode(const TreePatternNode *N) {
if (N->isLeaf()) {
if (DefInit *DI = dynamic_cast<DefInit*>(N->getLeafValue())) {
@@ -2427,7 +2410,7 @@
const ComplexPattern &CP = CDP.getComplexPattern(LeafRec);
if (CP.hasProperty(SDNPMayStore)) mayStore = true;
if (CP.hasProperty(SDNPMayLoad)) mayLoad = true;
- if (CP.hasProperty(SDNPSideEffect)) HasSideEffects = true;
+ if (CP.hasProperty(SDNPSideEffect)) hasSideEffects = true;
}
}
return;
@@ -2439,7 +2422,7 @@
// Ignore set nodes, which are not SDNodes.
if (N->getOperator()->getName() == "set") {
- IsBitcast = IsNodeBitcast(N);
+ isBitcast = IsNodeBitcast(N);
return;
}
@@ -2449,8 +2432,8 @@
// Notice properties of the node.
if (OpInfo.hasProperty(SDNPMayStore)) mayStore = true;
if (OpInfo.hasProperty(SDNPMayLoad)) mayLoad = true;
- if (OpInfo.hasProperty(SDNPSideEffect)) HasSideEffects = true;
- if (OpInfo.hasProperty(SDNPVariadic)) IsVariadic = true;
+ if (OpInfo.hasProperty(SDNPSideEffect)) hasSideEffects = true;
+ if (OpInfo.hasProperty(SDNPVariadic)) isVariadic = true;
if (const CodeGenIntrinsic *IntInfo = N->getIntrinsicInfo(CDP)) {
// If this is an intrinsic, analyze it.
@@ -2462,62 +2445,64 @@
if (IntInfo->ModRef >= CodeGenIntrinsic::ReadWriteMem)
// WriteMem intrinsics can have other strange effects.
- HasSideEffects = true;
+ hasSideEffects = true;
}
}
};
-static void InferFromPattern(const CodeGenInstruction &Inst,
- bool &MayStore, bool &MayLoad,
- bool &IsBitcast,
- bool &HasSideEffects, bool &IsVariadic,
- const CodeGenDAGPatterns &CDP) {
- MayStore = MayLoad = IsBitcast = HasSideEffects = IsVariadic = false;
+static bool InferFromPattern(CodeGenInstruction &InstInfo,
+ const InstAnalyzer &PatInfo,
+ Record *PatDef) {
+ bool Error = false;
- bool HadPattern =
- InstAnalyzer(CDP, MayStore, MayLoad, IsBitcast, HasSideEffects, IsVariadic)
- .Analyze(Inst.TheDef);
+ // Remember where InstInfo got its flags.
+ if (InstInfo.hasUndefFlags())
+ InstInfo.InferredFrom = PatDef;
- // InstAnalyzer only correctly analyzes mayStore/mayLoad so far.
- if (Inst.mayStore) { // If the .td file explicitly sets mayStore, use it.
- // If we decided that this is a store from the pattern, then the .td file
- // entry is redundant.
- if (MayStore)
- PrintWarning(Inst.TheDef->getLoc(),
- "mayStore flag explicitly set on "
- "instruction, but flag already inferred from pattern.");
- MayStore = true;
+ // Check explicitly set flags for consistency.
+ if (InstInfo.hasSideEffects != PatInfo.hasSideEffects &&
+ !InstInfo.hasSideEffects_Unset) {
+ // Allow explicitly setting hasSideEffects = 1 on instructions, even when
+ // the pattern has no side effects. That could be useful for div/rem
+ // instructions that may trap.
+ if (!InstInfo.hasSideEffects) {
+ Error = true;
+ PrintError(PatDef->getLoc(), "Pattern doesn't match hasSideEffects = " +
+ Twine(InstInfo.hasSideEffects));
+ }
}
- if (Inst.mayLoad) { // If the .td file explicitly sets mayLoad, use it.
- // If we decided that this is a load from the pattern, then the .td file
- // entry is redundant.
- if (MayLoad)
- PrintWarning(Inst.TheDef->getLoc(),
- "mayLoad flag explicitly set on "
- "instruction, but flag already inferred from pattern.");
- MayLoad = true;
+ if (InstInfo.mayStore != PatInfo.mayStore && !InstInfo.mayStore_Unset) {
+ Error = true;
+ PrintError(PatDef->getLoc(), "Pattern doesn't match mayStore = " +
+ Twine(InstInfo.mayStore));
}
- if (Inst.neverHasSideEffects) {
- if (HadPattern)
- PrintWarning(Inst.TheDef->getLoc(),
- "neverHasSideEffects flag explicitly set on "
- "instruction, but flag already inferred from pattern.");
- HasSideEffects = false;
+ if (InstInfo.mayLoad != PatInfo.mayLoad && !InstInfo.mayLoad_Unset) {
+ // Allow explicitly setting mayLoad = 1, even when the pattern has no loads.
+ // Some targets translate immediates to loads.
+ if (!InstInfo.mayLoad) {
+ Error = true;
+ PrintError(PatDef->getLoc(), "Pattern doesn't match mayLoad = " +
+ Twine(InstInfo.mayLoad));
+ }
}
- if (Inst.hasSideEffects) {
- if (HasSideEffects)
- PrintWarning(Inst.TheDef->getLoc(),
- "hasSideEffects flag explicitly set on "
- "instruction, but flag already inferred from pattern.");
- HasSideEffects = true;
- }
+ // Transfer inferred flags.
+ InstInfo.hasSideEffects |= PatInfo.hasSideEffects;
+ InstInfo.mayStore |= PatInfo.mayStore;
+ InstInfo.mayLoad |= PatInfo.mayLoad;
- if (Inst.Operands.isVariadic)
- IsVariadic = true; // Can warn if we want.
+ // These flags are silently added without any verification.
+ InstInfo.isBitcast |= PatInfo.isBitcast;
+
+ // Don't infer isVariadic. This flag means something different on SDNodes and
+ // instructions. For example, a CALL SDNode is variadic because it has the
+ // call arguments as operands, but a CALL instruction is not variadic - it
+ // has argument registers as implicit, not explicit uses.
+
+ return Error;
}
/// hasNullFragReference - Return true if the DAG has any reference to the
@@ -2551,6 +2536,17 @@
return false;
}
+/// Get all the instructions in a tree.
+static void
+getInstructionsInTree(TreePatternNode *Tree, SmallVectorImpl<Record*> &Instrs) {
+ if (Tree->isLeaf())
+ return;
+ if (Tree->getOperator()->isSubClassOf("Instruction"))
+ Instrs.push_back(Tree->getOperator());
+ for (unsigned i = 0, e = Tree->getNumChildren(); i != e; ++i)
+ getInstructionsInTree(Tree->getChild(i), Instrs);
+}
+
/// ParseInstructions - Parse all of the instructions, inlining and resolving
/// any fragments involved. This populates the Instructions list with fully
/// resolved instructions.
@@ -2683,11 +2679,9 @@
I->error("Operand #" + utostr(i) + " in operands list has no name!");
if (!InstInputsCheck.count(OpName)) {
- // If this is an predicate operand or optional def operand with an
- // DefaultOps set filled in, we can ignore this. When we codegen it,
- // we will do so as always executed.
- if (Op.Rec->isSubClassOf("PredicateOperand") ||
- Op.Rec->isSubClassOf("OptionalDefOperand")) {
+ // If this is an operand with a DefaultOps set filled in, we can ignore
+ // this. When we codegen it, we will do so as always executed.
+ if (Op.Rec->isSubClassOf("OperandWithDefaultOps")) {
// Does it have a non-empty DefaultOps field? If so, ignore this
// operand.
if (!getDefaultOperand(Op.Rec).DefaultOps.empty())
@@ -2852,25 +2846,156 @@
void CodeGenDAGPatterns::InferInstructionFlags() {
const std::vector<const CodeGenInstruction*> &Instructions =
Target.getInstructionsByEnumValue();
+
+ // First try to infer flags from the primary instruction pattern, if any.
+ SmallVector<CodeGenInstruction*, 8> Revisit;
+ unsigned Errors = 0;
for (unsigned i = 0, e = Instructions.size(); i != e; ++i) {
CodeGenInstruction &InstInfo =
const_cast<CodeGenInstruction &>(*Instructions[i]);
- // Determine properties of the instruction from its pattern.
- bool MayStore, MayLoad, IsBitcast, HasSideEffects, IsVariadic;
- InferFromPattern(InstInfo, MayStore, MayLoad, IsBitcast,
- HasSideEffects, IsVariadic, *this);
- InstInfo.mayStore = MayStore;
- InstInfo.mayLoad = MayLoad;
- InstInfo.isBitcast = IsBitcast;
- InstInfo.hasSideEffects = HasSideEffects;
- InstInfo.Operands.isVariadic = IsVariadic;
- // Sanity checks.
- if (InstInfo.isReMaterializable && InstInfo.hasSideEffects)
- throw TGError(InstInfo.TheDef->getLoc(), "The instruction " +
- InstInfo.TheDef->getName() +
- " is rematerializable AND has unmodeled side effects?");
+ // Treat neverHasSideEffects = 1 as the equivalent of hasSideEffects = 0.
+ // This flag is obsolete and will be removed.
+ if (InstInfo.neverHasSideEffects) {
+ assert(!InstInfo.hasSideEffects);
+ InstInfo.hasSideEffects_Unset = false;
+ }
+
+ // Get the primary instruction pattern.
+ const TreePattern *Pattern = getInstruction(InstInfo.TheDef).getPattern();
+ if (!Pattern) {
+ if (InstInfo.hasUndefFlags())
+ Revisit.push_back(&InstInfo);
+ continue;
+ }
+ InstAnalyzer PatInfo(*this);
+ PatInfo.Analyze(Pattern);
+ Errors += InferFromPattern(InstInfo, PatInfo, InstInfo.TheDef);
}
+
+ // Second, look for single-instruction patterns defined outside the
+ // instruction.
+ for (ptm_iterator I = ptm_begin(), E = ptm_end(); I != E; ++I) {
+ const PatternToMatch &PTM = *I;
+
+ // We can only infer from single-instruction patterns, otherwise we won't
+ // know which instruction should get the flags.
+ SmallVector<Record*, 8> PatInstrs;
+ getInstructionsInTree(PTM.getDstPattern(), PatInstrs);
+ if (PatInstrs.size() != 1)
+ continue;
+
+ // Get the single instruction.
+ CodeGenInstruction &InstInfo = Target.getInstruction(PatInstrs.front());
+
+ // Only infer properties from the first pattern. We'll verify the others.
+ if (InstInfo.InferredFrom)
+ continue;
+
+ InstAnalyzer PatInfo(*this);
+ PatInfo.Analyze(&PTM);
+ Errors += InferFromPattern(InstInfo, PatInfo, PTM.getSrcRecord());
+ }
+
+ if (Errors)
+ throw "pattern conflicts";
+
+ // Revisit instructions with undefined flags and no pattern.
+ if (Target.guessInstructionProperties()) {
+ for (unsigned i = 0, e = Revisit.size(); i != e; ++i) {
+ CodeGenInstruction &InstInfo = *Revisit[i];
+ if (InstInfo.InferredFrom)
+ continue;
+ // The mayLoad and mayStore flags default to false.
+ // Conservatively assume hasSideEffects if it wasn't explicit.
+ if (InstInfo.hasSideEffects_Unset)
+ InstInfo.hasSideEffects = true;
+ }
+ return;
+ }
+
+ // Complain about any flags that are still undefined.
+ for (unsigned i = 0, e = Revisit.size(); i != e; ++i) {
+ CodeGenInstruction &InstInfo = *Revisit[i];
+ if (InstInfo.InferredFrom)
+ continue;
+ if (InstInfo.hasSideEffects_Unset)
+ PrintError(InstInfo.TheDef->getLoc(),
+ "Can't infer hasSideEffects from patterns");
+ if (InstInfo.mayStore_Unset)
+ PrintError(InstInfo.TheDef->getLoc(),
+ "Can't infer mayStore from patterns");
+ if (InstInfo.mayLoad_Unset)
+ PrintError(InstInfo.TheDef->getLoc(),
+ "Can't infer mayLoad from patterns");
+ }
+}
+
+
+/// Verify instruction flags against pattern node properties.
+void CodeGenDAGPatterns::VerifyInstructionFlags() {
+ unsigned Errors = 0;
+ for (ptm_iterator I = ptm_begin(), E = ptm_end(); I != E; ++I) {
+ const PatternToMatch &PTM = *I;
+ SmallVector<Record*, 8> Instrs;
+ getInstructionsInTree(PTM.getDstPattern(), Instrs);
+ if (Instrs.empty())
+ continue;
+
+ // Count the number of instructions with each flag set.
+ unsigned NumSideEffects = 0;
+ unsigned NumStores = 0;
+ unsigned NumLoads = 0;
+ for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
+ const CodeGenInstruction &InstInfo = Target.getInstruction(Instrs[i]);
+ NumSideEffects += InstInfo.hasSideEffects;
+ NumStores += InstInfo.mayStore;
+ NumLoads += InstInfo.mayLoad;
+ }
+
+ // Analyze the source pattern.
+ InstAnalyzer PatInfo(*this);
+ PatInfo.Analyze(&PTM);
+
+ // Collect error messages.
+ SmallVector<std::string, 4> Msgs;
+
+ // Check for missing flags in the output.
+ // Permit extra flags for now at least.
+ if (PatInfo.hasSideEffects && !NumSideEffects)
+ Msgs.push_back("pattern has side effects, but hasSideEffects isn't set");
+
+ // Don't verify store flags on instructions with side effects. At least for
+ // intrinsics, side effects imply mayStore.
+ if (!PatInfo.hasSideEffects && PatInfo.mayStore && !NumStores)
+ Msgs.push_back("pattern may store, but mayStore isn't set");
+
+ // Similarly, mayStore implies mayLoad on intrinsics.
+ if (!PatInfo.mayStore && PatInfo.mayLoad && !NumLoads)
+ Msgs.push_back("pattern may load, but mayLoad isn't set");
+
+ // Print error messages.
+ if (Msgs.empty())
+ continue;
+ ++Errors;
+
+ for (unsigned i = 0, e = Msgs.size(); i != e; ++i)
+ PrintError(PTM.getSrcRecord()->getLoc(), Twine(Msgs[i]) + " on the " +
+ (Instrs.size() == 1 ?
+ "instruction" : "output instructions"));
+ // Provide the location of the relevant instruction definitions.
+ for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
+ if (Instrs[i] != PTM.getSrcRecord())
+ PrintError(Instrs[i]->getLoc(), "defined here");
+ const CodeGenInstruction &InstInfo = Target.getInstruction(Instrs[i]);
+ if (InstInfo.InferredFrom &&
+ InstInfo.InferredFrom != InstInfo.TheDef &&
+ InstInfo.InferredFrom != PTM.getSrcRecord())
+ PrintError(InstInfo.InferredFrom->getLoc(), "inferred from pattern");
+ }
+ }
+ if (Errors)
+ throw "Errors in DAG patterns";
}
/// Given a pattern result with an unresolved type, see if we can find one
@@ -3330,4 +3455,3 @@
DEBUG(errs() << "\n");
}
}
-
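As an aside for readers following this rework: the rules InferFromPattern enforces above reduce to one reconciliation step per tri-state flag. Below is a minimal self-contained C++ sketch, not from the patch (TriStateFlag, mergeFlag, and the driver are invented names; allowExplicitOne models the hasSideEffects/mayLoad exception and would be false for mayStore).

    // Minimal sketch of the per-flag rule in InferFromPattern: an explicit
    // .td value that contradicts the pattern is an error, except that an
    // explicit 1 is tolerated for hasSideEffects and mayLoad; inferred
    // flags are then OR-ed in.
    #include <cstdio>

    struct TriStateFlag {
      bool value; // Value from the .td file (false if unset).
      bool unset; // True if the .td file left the flag as '?'.
    };

    // Returns true on a conflict between an explicit flag and the pattern.
    static bool mergeFlag(TriStateFlag &F, bool fromPattern,
                          bool allowExplicitOne) {
      if (!F.unset && F.value != fromPattern &&
          !(allowExplicitOne && F.value))
        return true;
      F.value |= fromPattern; // Transfer the inferred flag.
      return false;
    }

    int main() {
      TriStateFlag mayStore = {false, true}; // '?' in the .td file.
      if (mergeFlag(mayStore, /*fromPattern=*/true, /*allowExplicitOne=*/false))
        std::puts("pattern conflicts");
      std::printf("mayStore = %d\n", mayStore.value); // prints mayStore = 1
    }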
diff --git a/utils/TableGen/CodeGenDAGPatterns.h b/utils/TableGen/CodeGenDAGPatterns.h
index 5a2d40a..25a0e4b 100644
--- a/utils/TableGen/CodeGenDAGPatterns.h
+++ b/utils/TableGen/CodeGenDAGPatterns.h
@@ -582,8 +582,8 @@
void ComputeNamedNodes(TreePatternNode *N);
};
-/// DAGDefaultOperand - One of these is created for each PredicateOperand
-/// or OptionalDefOperand that has a set ExecuteAlways / DefaultOps field.
+/// DAGDefaultOperand - One of these is created for each OperandWithDefaultOps
+/// that has a set ExecuteAlways / DefaultOps field.
struct DAGDefaultOperand {
std::vector<TreePatternNode*> DefaultOps;
};
@@ -797,6 +797,7 @@
void ParsePatterns();
void InferInstructionFlags();
void GenerateVariants();
+ void VerifyInstructionFlags();
void AddPatternToMatch(const TreePattern *Pattern, const PatternToMatch &PTM);
void FindPatternInputsAndOutputs(TreePattern *I, TreePatternNode *Pat,
diff --git a/utils/TableGen/CodeGenInstruction.cpp b/utils/TableGen/CodeGenInstruction.cpp
index 12e153a..38e2b83 100644
--- a/utils/TableGen/CodeGenInstruction.cpp
+++ b/utils/TableGen/CodeGenInstruction.cpp
@@ -287,7 +287,8 @@
// CodeGenInstruction Implementation
//===----------------------------------------------------------------------===//
-CodeGenInstruction::CodeGenInstruction(Record *R) : TheDef(R), Operands(R) {
+CodeGenInstruction::CodeGenInstruction(Record *R)
+ : TheDef(R), Operands(R), InferredFrom(0) {
Namespace = R->getValueAsString("Namespace");
AsmString = R->getValueAsString("AsmString");
@@ -301,8 +302,6 @@
isBarrier = R->getValueAsBit("isBarrier");
isCall = R->getValueAsBit("isCall");
canFoldAsLoad = R->getValueAsBit("canFoldAsLoad");
- mayLoad = R->getValueAsBit("mayLoad");
- mayStore = R->getValueAsBit("mayStore");
isPredicable = Operands.isPredicable || R->getValueAsBit("isPredicable");
isConvertibleToThreeAddress = R->getValueAsBit("isConvertibleToThreeAddress");
isCommutable = R->getValueAsBit("isCommutable");
@@ -313,8 +312,13 @@
hasPostISelHook = R->getValueAsBit("hasPostISelHook");
hasCtrlDep = R->getValueAsBit("hasCtrlDep");
isNotDuplicable = R->getValueAsBit("isNotDuplicable");
- hasSideEffects = R->getValueAsBit("hasSideEffects");
+
+ mayLoad = R->getValueAsBitOrUnset("mayLoad", mayLoad_Unset);
+ mayStore = R->getValueAsBitOrUnset("mayStore", mayStore_Unset);
+ hasSideEffects = R->getValueAsBitOrUnset("hasSideEffects",
+ hasSideEffects_Unset);
neverHasSideEffects = R->getValueAsBit("neverHasSideEffects");
+
isAsCheapAsAMove = R->getValueAsBit("isAsCheapAsAMove");
hasExtraSrcRegAllocReq = R->getValueAsBit("hasExtraSrcRegAllocReq");
hasExtraDefRegAllocReq = R->getValueAsBit("hasExtraDefRegAllocReq");
@@ -409,7 +413,7 @@
/// successful match, with ResOp set to the result operand to be used.
bool CodeGenInstAlias::tryAliasOpMatch(DagInit *Result, unsigned AliasOpNo,
Record *InstOpRec, bool hasSubOps,
- SMLoc Loc, CodeGenTarget &T,
+ ArrayRef<SMLoc> Loc, CodeGenTarget &T,
ResultOperand &ResOp) {
Init *Arg = Result->getArg(AliasOpNo);
DefInit *ADI = dynamic_cast<DefInit*>(Arg);
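A note on the API used in the constructor hunk above: Record::getValueAsBitOrUnset is what makes the three flags tri-state. A hedged fragment, assuming LLVM's TableGen headers (readMayLoad is an invented helper, not part of the patch):

    #include "llvm/TableGen/Record.h"

    // getValueAsBitOrUnset returns the bit's value and sets WasUnset when
    // the .td file left the field as '?'; the returned value then defaults
    // to false and the flag becomes a candidate for pattern inference.
    static bool readMayLoad(llvm::Record *R, bool &WasUnset) {
      return R->getValueAsBitOrUnset("mayLoad", WasUnset);
    }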
diff --git a/utils/TableGen/CodeGenInstruction.h b/utils/TableGen/CodeGenInstruction.h
index 95b572d..f601a83 100644
--- a/utils/TableGen/CodeGenInstruction.h
+++ b/utils/TableGen/CodeGenInstruction.h
@@ -226,7 +226,10 @@
bool isBarrier;
bool isCall;
bool canFoldAsLoad;
- bool mayLoad, mayStore;
+ bool mayLoad;
+ bool mayLoad_Unset;
+ bool mayStore;
+ bool mayStore_Unset;
bool isPredicable;
bool isConvertibleToThreeAddress;
bool isCommutable;
@@ -238,6 +241,7 @@
bool hasCtrlDep;
bool isNotDuplicable;
bool hasSideEffects;
+ bool hasSideEffects_Unset;
bool neverHasSideEffects;
bool isAsCheapAsAMove;
bool hasExtraSrcRegAllocReq;
@@ -245,6 +249,14 @@
bool isCodeGenOnly;
bool isPseudo;
+ /// Are there any undefined flags?
+ bool hasUndefFlags() const {
+ return mayLoad_Unset || mayStore_Unset || hasSideEffects_Unset;
+ }
+
+ // The record used to infer instruction flags, or NULL if no flag values
+ // have been inferred.
+ Record *InferredFrom;
CodeGenInstruction(Record *R);
@@ -319,7 +331,7 @@
CodeGenInstAlias(Record *R, CodeGenTarget &T);
bool tryAliasOpMatch(DagInit *Result, unsigned AliasOpNo,
- Record *InstOpRec, bool hasSubOps, SMLoc Loc,
+ Record *InstOpRec, bool hasSubOps, ArrayRef<SMLoc> Loc,
CodeGenTarget &T, ResultOperand &ResOp);
};
}
diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp
index 011f4b7..b2e9e38 100644
--- a/utils/TableGen/CodeGenRegisters.cpp
+++ b/utils/TableGen/CodeGenRegisters.cpp
@@ -298,7 +298,7 @@
for (SubRegMap::const_iterator SI = SubRegs.begin(), SE = SubRegs.end();
SI != SE; ++SI) {
if (SI->second == this) {
- SMLoc Loc;
+ ArrayRef<SMLoc> Loc;
if (TheDef)
Loc = TheDef->getLoc();
throw TGError(Loc, "Register " + getName() +
@@ -310,7 +310,7 @@
if (Ins->second == SI->first)
continue;
// Trouble: Two different names for SI->second.
- SMLoc Loc;
+ ArrayRef<SMLoc> Loc;
if (TheDef)
Loc = TheDef->getLoc();
throw TGError(Loc, "Sub-register can't have two names: " +
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index 1dd2efc..fa31c9f 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -300,6 +300,8 @@
"REG_SEQUENCE",
"COPY",
"BUNDLE",
+ "LIFETIME_START",
+ "LIFETIME_END",
0
};
const DenseMap<const Record*, CodeGenInstruction*> &Insts = getInstructions();
@@ -334,6 +336,15 @@
return getInstructionSet()->getValueAsBit("isLittleEndianEncoding");
}
+/// guessInstructionProperties - Return true if it's OK to guess instruction
+/// properties instead of raising an error.
+///
+/// This is configurable as a temporary migration aid. It will eventually be
+/// permanently false.
+bool CodeGenTarget::guessInstructionProperties() const {
+ return getInstructionSet()->getValueAsBit("guessInstructionProperties");
+}
+
//===----------------------------------------------------------------------===//
// ComplexPattern implementation
//
diff --git a/utils/TableGen/CodeGenTarget.h b/utils/TableGen/CodeGenTarget.h
index 2f8cee4..672b140 100644
--- a/utils/TableGen/CodeGenTarget.h
+++ b/utils/TableGen/CodeGenTarget.h
@@ -177,6 +177,10 @@
///
bool isLittleEndianEncoding() const;
+ /// guessInstructionProperties - should we just guess unset instruction
+ /// properties?
+ bool guessInstructionProperties() const;
+
private:
void ComputeInstrsByEnum() const;
};
diff --git a/utils/TableGen/DAGISelMatcherGen.cpp b/utils/TableGen/DAGISelMatcherGen.cpp
index aed222c..b291269 100644
--- a/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/utils/TableGen/DAGISelMatcherGen.cpp
@@ -727,8 +727,7 @@
// Determine what to emit for this operand.
Record *OperandNode = II.Operands[InstOpNo].Rec;
- if ((OperandNode->isSubClassOf("PredicateOperand") ||
- OperandNode->isSubClassOf("OptionalDefOperand")) &&
+ if (OperandNode->isSubClassOf("OperandWithDefaultOps") &&
!CGP.getDefaultOperand(OperandNode).DefaultOps.empty()) {
// This is a predicate or optional def operand; emit the
// 'default ops' operands.
diff --git a/utils/TableGen/DFAPacketizerEmitter.cpp b/utils/TableGen/DFAPacketizerEmitter.cpp
index 8bfecea..0ad25a5 100644
--- a/utils/TableGen/DFAPacketizerEmitter.cpp
+++ b/utils/TableGen/DFAPacketizerEmitter.cpp
@@ -17,6 +17,7 @@
#include "CodeGenTarget.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
#include <list>
@@ -74,6 +75,8 @@
// Another way of thinking about this transition is we are mapping a NDFA with
// two states [0x01] and [0x10] into a DFA with a single state [0x01, 0x10].
//
+// A State instance also contains a collection of transitions from that state:
+// a map from inputs to new states.
//
namespace {
class State {
@@ -82,10 +85,16 @@
int stateNum;
bool isInitial;
std::set<unsigned> stateInfo;
+ typedef std::map<unsigned, State *> TransitionMap;
+ TransitionMap Transitions;
State();
State(const State &S);
+ bool operator<(const State &s) const {
+ return stateNum < s.stateNum;
+ }
+
//
// canAddInsnClass - Returns true if an instruction of type InsnClass is a
// valid transition from this state, i.e., can an instruction of type InsnClass
@@ -100,38 +109,18 @@
// which are possible from this state (PossibleStates).
//
void AddInsnClass(unsigned InsnClass, std::set<unsigned> &PossibleStates);
+ //
+ // addTransition - Add a transition from this state given the input InsnClass
+ //
+ void addTransition(unsigned InsnClass, State *To);
+ //
+ // hasTransition - Returns true if there is a transition from this state
+ // given the input InsnClass
+ //
+ bool hasTransition(unsigned InsnClass);
};
} // End anonymous namespace.
-
-namespace {
-struct Transition {
- public:
- static int currentTransitionNum;
- int transitionNum;
- State *from;
- unsigned input;
- State *to;
-
- Transition(State *from_, unsigned input_, State *to_);
-};
-} // End anonymous namespace.
-
-
-//
-// Comparators to keep set of states sorted.
-//
-namespace {
-struct ltState {
- bool operator()(const State *s1, const State *s2) const;
-};
-
-struct ltTransition {
- bool operator()(const Transition *s1, const Transition *s2) const;
-};
-} // End anonymous namespace.
-
-
//
// class DFA: deterministic finite automaton for processor resource tracking.
//
@@ -139,36 +128,19 @@
class DFA {
public:
DFA();
+ ~DFA();
// Set of states. Need to keep this sorted to emit the transition table.
- std::set<State*, ltState> states;
+ typedef std::set<State *, less_ptr<State> > StateSet;
+ StateSet states;
- // Map from a state to the list of transitions with that state as source.
- std::map<State*, std::set<Transition*, ltTransition>, ltState>
- stateTransitions;
State *currentState;
- // Highest valued Input seen.
- unsigned LargestInput;
-
//
// Modify the DFA.
//
void initialize();
void addState(State *);
- void addTransition(Transition *);
-
- //
- // getTransition - Return the state when a transition is made from
- // State From with Input I. If a transition is not found, return NULL.
- //
- State *getTransition(State *, unsigned);
-
- //
- // isValidTransition: Predicate that checks if there is a valid transition
- // from state From on input InsnClass.
- //
- bool isValidTransition(State *From, unsigned InsnClass);
//
// writeTable: Print out a table representing the DFA.
@@ -179,7 +151,7 @@
//
-// Constructors for State, Transition, and DFA
+// Constructors and destructors for State and DFA
//
State::State() :
stateNum(currentStateNum++), isInitial(false) {}
@@ -189,22 +161,27 @@
stateNum(currentStateNum++), isInitial(S.isInitial),
stateInfo(S.stateInfo) {}
+DFA::DFA(): currentState(NULL) {}
-Transition::Transition(State *from_, unsigned input_, State *to_) :
- transitionNum(currentTransitionNum++), from(from_), input(input_),
- to(to_) {}
-
-
-DFA::DFA() :
- LargestInput(0) {}
-
-
-bool ltState::operator()(const State *s1, const State *s2) const {
- return (s1->stateNum < s2->stateNum);
+DFA::~DFA() {
+ DeleteContainerPointers(states);
}
-bool ltTransition::operator()(const Transition *s1, const Transition *s2) const {
- return (s1->input < s2->input);
+//
+// addTransition - Add a transition from this state given the input InsnClass
+//
+void State::addTransition(unsigned InsnClass, State *To) {
+ assert(!Transitions.count(InsnClass) &&
+ "Cannot have multiple transitions for the same input");
+ Transitions[InsnClass] = To;
+}
+
+//
+// hasTransition - Returns true if there is a transition from this state
+// given the input InsnClass
+//
+bool State::hasTransition(unsigned InsnClass) {
+ return Transitions.count(InsnClass) > 0;
}
//
@@ -272,6 +249,7 @@
void DFA::initialize() {
+ assert(currentState && "Missing current state");
currentState->isInitial = true;
}
@@ -282,47 +260,7 @@
}
-void DFA::addTransition(Transition *T) {
- // Update LargestInput.
- if (T->input > LargestInput)
- LargestInput = T->input;
-
- // Add the new transition.
- bool Added = stateTransitions[T->from].insert(T).second;
- assert(Added && "Cannot have multiple states for the same input");
- (void)Added;
-}
-
-
-//
-// getTransition - Return the state when a transition is made from
-// State From with Input I. If a transition is not found, return NULL.
-//
-State *DFA::getTransition(State *From, unsigned I) {
- // Do we have a transition from state From?
- if (!stateTransitions.count(From))
- return NULL;
-
- // Do we have a transition from state From with Input I?
- Transition TVal(NULL, I, NULL);
- // Do not count this temporal instance
- Transition::currentTransitionNum--;
- std::set<Transition*, ltTransition>::iterator T =
- stateTransitions[From].find(&TVal);
- if (T != stateTransitions[From].end())
- return (*T)->to;
-
- return NULL;
-}
-
-
-bool DFA::isValidTransition(State *From, unsigned InsnClass) {
- return (getTransition(From, InsnClass) != NULL);
-}
-
-
int State::currentStateNum = 0;
-int Transition::currentTransitionNum = 0;
DFAPacketizerEmitter::DFAPacketizerEmitter(RecordKeeper &R):
TargetName(CodeGenTarget(R).getName()),
@@ -341,7 +279,7 @@
//
//
void DFA::writeTableAndAPI(raw_ostream &OS, const std::string &TargetName) {
- std::set<State*, ltState>::iterator SI = states.begin();
+ DFA::StateSet::iterator SI = states.begin();
// This table provides a map to the beginning of the transitions for State s
// in DFAStateInputTable.
std::vector<int> StateEntry(states.size());
@@ -353,18 +291,16 @@
// to construct the StateEntry table.
int ValidTransitions = 0;
for (unsigned i = 0; i < states.size(); ++i, ++SI) {
+ assert (((*SI)->stateNum == (int) i) && "Mismatch in state numbers");
StateEntry[i] = ValidTransitions;
- for (unsigned j = 0; j <= LargestInput; ++j) {
- assert (((*SI)->stateNum == (int) i) && "Mismatch in state numbers");
- State *To = getTransition(*SI, j);
- if (To == NULL)
- continue;
-
- OS << "{" << j << ", "
- << To->stateNum
+ for (State::TransitionMap::iterator
+ II = (*SI)->Transitions.begin(), IE = (*SI)->Transitions.end();
+ II != IE; ++II) {
+ OS << "{" << II->first << ", "
+ << II->second->stateNum
<< "}, ";
- ++ValidTransitions;
}
+ ValidTransitions += (*SI)->Transitions.size();
// If there are no valid transitions from this state, we need a sentinel
// transition.
@@ -539,7 +475,7 @@
// If we haven't already created a transition for this input
// and the state can accommodate this InsnClass, create a transition.
//
- if (!D.getTransition(current, InsnClass) &&
+ if (!current->hasTransition(InsnClass) &&
current->canAddInsnClass(InsnClass)) {
State *NewState = NULL;
current->AddInsnClass(InsnClass, NewStateResources);
@@ -559,10 +495,8 @@
Visited[NewStateResources] = NewState;
WorkList.push_back(NewState);
}
-
- Transition *NewTransition = new Transition(current, InsnClass,
- NewState);
- D.addTransition(NewTransition);
+
+ current->addTransition(InsnClass, NewState);
}
}
}
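To summarize the two data-structure changes in this emitter: states now live in a set ordered through less_ptr, and each state owns its transitions, replacing the global Transition set and the getTransition lookup. A self-contained sketch follows (the driver is invented; less_ptr is reproduced inline in place of llvm/ADT/STLExtras.h so the sketch stands alone):

    #include <cassert>
    #include <cstdio>
    #include <map>
    #include <set>

    template <typename T> struct less_ptr {
      bool operator()(const T *LHS, const T *RHS) const { return *LHS < *RHS; }
    };

    struct State {
      int stateNum;
      typedef std::map<unsigned, State *> TransitionMap;
      TransitionMap Transitions; // input insn class -> successor state

      explicit State(int N) : stateNum(N) {}
      bool operator<(const State &S) const { return stateNum < S.stateNum; }

      void addTransition(unsigned InsnClass, State *To) {
        assert(!Transitions.count(InsnClass) &&
               "Cannot have multiple transitions for the same input");
        Transitions[InsnClass] = To;
      }
      bool hasTransition(unsigned InsnClass) const {
        return Transitions.count(InsnClass) > 0;
      }
    };

    int main() {
      State S0(0), S1(1);
      if (!S0.hasTransition(0x01)) // same guard the emitter now uses
        S0.addTransition(0x01, &S1);

      typedef std::set<State *, less_ptr<State> > StateSet;
      StateSet States;
      States.insert(&S1);
      States.insert(&S0);
      // Emission walks states in stateNum order and each state's
      // transitions in input order, with no global transition table.
      for (StateSet::iterator SI = States.begin(), SE = States.end();
           SI != SE; ++SI)
        for (State::TransitionMap::iterator II = (*SI)->Transitions.begin(),
             IE = (*SI)->Transitions.end(); II != IE; ++II)
          std::printf("state %d --%u--> state %d\n",
                      (*SI)->stateNum, II->first, II->second->stateNum);
    }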
diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp
index e89c393..aa6d796 100644
--- a/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/utils/TableGen/FixedLenDecoderEmitter.cpp
@@ -1783,7 +1783,7 @@
VarInit *Var = 0;
VarBitInit *BI = dynamic_cast<VarBitInit*>(Bits.getBit(bi));
if (BI)
- Var = dynamic_cast<VarInit*>(BI->getVariable());
+ Var = dynamic_cast<VarInit*>(BI->getBitVar());
else
Var = dynamic_cast<VarInit*>(Bits.getBit(bi));
diff --git a/utils/TableGen/PseudoLoweringEmitter.cpp b/utils/TableGen/PseudoLoweringEmitter.cpp
index 8d9d419..8e28180 100644
--- a/utils/TableGen/PseudoLoweringEmitter.cpp
+++ b/utils/TableGen/PseudoLoweringEmitter.cpp
@@ -156,7 +156,7 @@
// If there are more operands that weren't in the DAG, they have to
// be operands that have default values, or we have an error. Currently,
- // PredicateOperand and OptionalDefOperand both have default values.
+ // operands that are a subclass of OperandWithDefaultOps have default values.
// Validate that each result pattern argument has a matching (by name)
diff --git a/utils/TableGen/SequenceToOffsetTable.h b/utils/TableGen/SequenceToOffsetTable.h
index d8ab2ee..60202b5 100644
--- a/utils/TableGen/SequenceToOffsetTable.h
+++ b/utils/TableGen/SequenceToOffsetTable.h
@@ -29,8 +29,8 @@
/// Compute the layout of a table that contains all the sequences, possibly by
/// reusing entries.
///
-/// @param SeqT The sequence container. (vector or string).
-/// @param Less A stable comparator for SeqT elements.
+/// @tparam SeqT The sequence container. (vector or string).
+/// @tparam Less A stable comparator for SeqT elements.
template<typename SeqT, typename Less = std::less<typename SeqT::value_type> >
class SequenceToOffsetTable {
typedef typename SeqT::value_type ElemT;
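Since the corrected @tparam lines are the bulk of this class's documentation, a hedged usage sketch may help; it assumes this header, and example() is an invented driver. Entry reuse here means suffix sharing: a sequence that is a suffix of another is stored inside the longer entry's tail.

    #include "SequenceToOffsetTable.h"
    #include <string>

    // Hedged usage sketch, assuming the class behaves as its comment
    // describes: layout() assigns offsets, reusing storage for suffixes.
    void example() {
      llvm::SequenceToOffsetTable<std::string> Table;
      Table.add("foobar");
      Table.add("bar");                      // a suffix of "foobar"
      Table.layout();                        // freeze: no add() after this
      unsigned FooOff = Table.get("foobar"); // offset of "foobar\0"
      unsigned BarOff = Table.get("bar");    // == FooOff + 3
      (void)FooOff;
      (void)BarOff;
    }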
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index 3472343..5dfd716 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -626,7 +626,7 @@
// Emit as { "cpu", procinit },
OS << " { "
<< "\"" << Name << "\", "
- << "(void *)&" << ProcModelName;
+ << "(const void *)&" << ProcModelName;
OS << " }";
diff --git a/utils/TableGen/X86RecognizableInstr.cpp b/utils/TableGen/X86RecognizableInstr.cpp
index 7ac2336..4b12279 100644
--- a/utils/TableGen/X86RecognizableInstr.cpp
+++ b/utils/TableGen/X86RecognizableInstr.cpp
@@ -1145,6 +1145,8 @@
// register IDs in 8-bit immediates nowadays.
ENCODING("VR256", ENCODING_IB)
ENCODING("VR128", ENCODING_IB)
+ ENCODING("FR32", ENCODING_IB)
+ ENCODING("FR64", ENCODING_IB)
errs() << "Unhandled immediate encoding " << s << "\n";
llvm_unreachable("Unhandled immediate encoding");
}
diff --git a/utils/TableGen/X86RecognizableInstr.h b/utils/TableGen/X86RecognizableInstr.h
index 542e510..e0bb2e2 100644
--- a/utils/TableGen/X86RecognizableInstr.h
+++ b/utils/TableGen/X86RecognizableInstr.h
@@ -143,7 +143,7 @@
/// @param hasREX_WPrefix - Indicates whether the instruction has a REX.W
/// prefix. If it does, 32-bit register operands stay
/// 32-bit regardless of the operand size.
- /// @param hasOpSizePrefix- Indicates whether the instruction has an OpSize
+ /// @param hasOpSizePrefix Indicates whether the instruction has an OpSize
/// prefix. If it does not, then 16-bit register
/// operands stay 16-bit.
/// @return - The operand's type.
diff --git a/utils/lit/lit/Util.py b/utils/lit/lit/Util.py
index 226e453..f294809 100644
--- a/utils/lit/lit/Util.py
+++ b/utils/lit/lit/Util.py
@@ -56,7 +56,7 @@
paths = os.environ.get('PATH','')
# Check for absolute match first.
- if os.path.exists(command):
+ if os.path.isfile(command):
return command
# Would be nice if Python had a lib function for this.
diff --git a/utils/llvm-lit/llvm-lit.in b/utils/llvm-lit/llvm-lit.in
index 879d18bd..768dc51 100755
--- a/utils/llvm-lit/llvm-lit.in
+++ b/utils/llvm-lit/llvm-lit.in
@@ -18,10 +18,15 @@
'llvm_site_config' : os.path.join(llvm_obj_root, 'test', 'lit.site.cfg')
}
-clang_site_config = os.path.join(llvm_obj_root, 'tools', 'clang', 'test',
- 'lit.site.cfg')
-if os.path.exists(clang_site_config):
- builtin_parameters['clang_site_config'] = clang_site_config
+clang_obj_root = os.path.join(llvm_obj_root, 'tools', 'clang')
+
+if os.path.exists(clang_obj_root):
+ builtin_parameters['clang_site_config'] = \
+ os.path.join(clang_obj_root, 'test', 'lit.site.cfg')
+ clang_tools_extra_obj_root = os.path.join(clang_obj_root, 'tools', 'extra')
+ if os.path.exists(clang_tools_extra_obj_root):
+ builtin_parameters['clang_tools_extra_site_config'] = \
+ os.path.join(clang_tools_extra_obj_root, 'test', 'lit.site.cfg')
if __name__=='__main__':
import lit
diff --git a/utils/llvm.grm b/utils/llvm.grm
index ad2799f..322036b 100644
--- a/utils/llvm.grm
+++ b/utils/llvm.grm
@@ -175,7 +175,6 @@
| returns_twice
| nonlazybind
| address_safety
- | ia_nsdialect
;
OptFuncAttrs ::= + _ | OptFuncAttrs FuncAttr ;
diff --git a/utils/unittest/googletest/gtest-port.cc b/utils/unittest/googletest/gtest-port.cc
index 07e5bb3..3c32ff1 100644
--- a/utils/unittest/googletest/gtest-port.cc
+++ b/utils/unittest/googletest/gtest-port.cc
@@ -505,6 +505,10 @@
GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
<< temp_file_path;
filename_ = temp_file_path;
+#elif GTEST_OS_LINUX_ANDROID
+ char name_template[] = "/sdcard/captured_stderr.XXXXXX";
+ const int captured_fd = mkstemp(name_template);
+ filename_ = name_template;
# else
// There's no guarantee that a test has write access to the
// current directory, so we create the temporary file in the /tmp
diff --git a/utils/unittest/googletest/include/gtest/internal/gtest-port.h b/utils/unittest/googletest/include/gtest/internal/gtest-port.h
index 8ef5d7d..58f6caf 100644
--- a/utils/unittest/googletest/include/gtest/internal/gtest-port.h
+++ b/utils/unittest/googletest/include/gtest/internal/gtest-port.h
@@ -230,7 +230,7 @@
# define GTEST_OS_MAC 1
#elif defined __linux__
# define GTEST_OS_LINUX 1
-# ifdef ANDROID
+# if defined(ANDROID) || defined(__ANDROID__)
# define GTEST_OS_LINUX_ANDROID 1
# endif // ANDROID
#elif defined __MVS__