Fixed : decompression issue on 32-bit CPUs without unaligned memory access
diff --git a/examples/Makefile b/examples/Makefile
index 4474f59..df24ea9 100644
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -32,7 +32,7 @@
 
 CC     := $(CC)
 CFLAGS ?= -O3
-CFLAGS += -std=c99 -Wall -Wextra -Wundef -Wshadow -Wstrict-prototypes -Wno-missing-braces   # Wno-missing-braces required due to GCC <4.8.3 bug
+CFLAGS += -std=c99 -Wall -Wextra -Wundef -Wshadow -Wcast-align -Wstrict-prototypes -Wno-missing-braces   # Wno-missing-braces required due to GCC <4.8.3 bug
 FLAGS   = -I.. $(CPPFLAGS) $(CFLAGS) $(LDFLAGS)
 
 TESTFILE= Makefile
diff --git a/lz4.c b/lz4.c
index 2a6f038..f2a8120 100644
--- a/lz4.c
+++ b/lz4.c
@@ -44,10 +44,26 @@
 
 /*
  * CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS :
- * You can force the code to use unaligned memory access, should you know your CPU can handle it efficiently.
- * If it effectively results in better speed (up to 50% improvement can be expected)
+ * By default, the source code expects the compiler to correctly optimize
+ * 4-byte and 8-byte reads on architectures able to handle them efficiently.
+ * This is not always the case. In some circumstances (ARM notably),
+ * the compiler will issue cautious code even when the target is able to correctly handle unaligned memory accesses.
+ *
+ * You can force the compiler to use unaligned memory access by uncommenting the line below.
+ * One of the below scenarios will happen :
+ * 1 - Your target CPU correctly handles unaligned access, but the compiler did not take advantage of it (good case).
+ *     You will witness large performance improvements (+50% and up).
+ *     Keep the line uncommented and send a word to upstream (https://groups.google.com/forum/#!forum/lz4c)
+ *     The goal is to automatically detect such situations by adding your target CPU to an exception list.
+ * 2 - Your target CPU correctly handles unaligned access, and the compiler already optimized for it.
+ *     No change will be experienced.
+ * 3 - Your target CPU handles unaligned access inefficiently.
+ *     You will experience a performance loss. Comment the line out again.
+ * 4 - Your target CPU does not handle unaligned access.
+ *     The program will crash.
+ * If it actually results in better speed (case 1),
  * please report your configuration to upstream (https://groups.google.com/forum/#!forum/lz4c)
- * so that an automatic detection macro can be added to mainline.
+ * so that an automatic detection macro can be added for future versions of the library.
  */
 /* #define CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS 1 */
 
@@ -58,7 +74,7 @@
 /*
  * Automated efficient unaligned memory access detection
  * Based on known hardware architectures
- * This list will be updated thanks to Open Source community feedbacks
+ * This list will be updated thanks to user feedback
  */
 #if defined(CPU_HAS_EFFICIENT_UNALIGNED_MEMORY_ACCESS) \
     || defined(__ARM_FEATURE_UNALIGNED) \
@@ -71,7 +87,10 @@
 #  define LZ4_UNALIGNED_ACCESS 0
 #endif
 
-/* Define this parameter if your target system or compiler does not support hardware bit count */
+/*
+ * LZ4_FORCE_SW_BITCOUNT
+ * Define this parameter if your target system or compiler does not support hardware bit count
+ */
 #if defined(_MSC_VER) && defined(_WIN32_WCE)   /* Visual Studio for Windows CE does not support Hardware bit count */
 #  define LZ4_FORCE_SW_BITCOUNT
 #endif
@@ -88,7 +107,7 @@
 
 #ifdef _MSC_VER    /* Visual Studio */
 #  define FORCE_INLINE static __forceinline
-#  include <intrin.h>                    /* For Visual 2005 */
+#  include <intrin.h>
 #  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
 #else
 #  ifdef __GNUC__
@@ -961,7 +980,6 @@
         }
         LZ4_wildCopy(op, ip, cpy);
         ip += length; op = cpy;
-        //LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy;
 
         /* get offset */
         match = cpy - LZ4_readLE16(ip); ip+=2;
@@ -1018,7 +1036,7 @@
 
         /* copy repeated sequence */
         cpy = op + length;
-        if (unlikely((op-match)<(int)STEPSIZE))
+        if (unlikely((op-match)<8))
         {
             const size_t dec64 = dec64table[op-match];
             op[0] = match[0];
@@ -1036,7 +1054,7 @@
             if (op < oend-8)
             {
                 LZ4_wildCopy(op, match, oend-8);
-                match += oend-8 - op;
+                match += (oend-8) - op;
                 op = oend-8;
             }
             while (op<cpy) *op++ = *match++;
diff --git a/lz4.h b/lz4.h
index 8b03995..22bbcb5 100644
--- a/lz4.h
+++ b/lz4.h
@@ -169,7 +169,7 @@
 
 
 /***********************************************
-   Experimental Streaming Compression Functions
+   Streaming Compression Functions
 ***********************************************/
 
 #define LZ4_STREAMSIZE_U64 ((1 << (LZ4_MEMORY_USAGE-3)) + 4)
@@ -231,20 +231,17 @@
 
 
 /************************************************
-  Experimental Streaming Decompression Functions
+   Streaming Decompression Functions
 ************************************************/
 
 #define LZ4_STREAMDECODESIZE_U64  4
 #define LZ4_STREAMDECODESIZE     (LZ4_STREAMDECODESIZE_U64 * sizeof(unsigned long long))
+typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t;
 /*
  * LZ4_streamDecode_t
  * information structure to track an LZ4 stream.
- * important : init this structure content using LZ4_setStreamDecode or memset() before first use !
- */
-typedef struct { unsigned long long table[LZ4_STREAMDECODESIZE_U64]; } LZ4_streamDecode_t;
-
-/*
- * If you prefer dynamic allocation methods,
+ * Initialize this structure's content using LZ4_setStreamDecode or memset() before first use!
+ * If you prefer dynamic allocation methods:
  * LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure
  * LZ4_freeStreamDecode releases its memory.
  */
@@ -254,9 +251,7 @@
 /*
  * LZ4_setStreamDecode
  * Use this function to instruct where to find the dictionary.
- * This function can be used to specify a static dictionary,
- * or to instruct where to find some previously decoded data saved into a different memory space.
- * Setting a size of 0 is allowed (same effect as no dictionary, same effect as reset).
+ * Setting a size of 0 is allowed (same effect as reset).
  * Return : 1 if OK, 0 if error
  */
 int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize);
@@ -277,7 +272,7 @@
 *_usingDict() :
     These decoding functions work the same as
     a combination of LZ4_setDictDecode() followed by LZ4_decompress_x_continue()
-    They don't use nor update an LZ4_streamDecode_t structure.
+    They are stand-alone: they neither use nor update an LZ4_streamDecode_t structure.
 */
 int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxDecompressedSize, const char* dictStart, int dictSize);
 int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize);
@@ -294,18 +289,10 @@
 - LZ4_uncompress is the same as LZ4_decompress_fast
 - LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe
 These function prototypes are now disabled; uncomment them if you really need them.
-It is highly recommended to stop using these functions and migrated to newer ones */
+It is highly recommended to stop using these functions and migrate to newer ones */
 /* int LZ4_uncompress (const char* source, char* dest, int outputSize); */
 /* int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); */
 
-/*
- * If you prefer dynamic allocation methods,
- * LZ4_createStreamDecode()
- * provides a pointer (void*) towards an initialized LZ4_streamDecode_t structure.
- * LZ4_free just frees it.
- */
-/* void* LZ4_createStreamDecode(void); */
-/*int   LZ4_free (void* LZ4_stream);    yes, it's the same one as for compression */
 
 /* Obsolete streaming functions; use new streaming interface whenever possible */
 void* LZ4_create (const char* inputBuffer);
diff --git a/programs/lz4cli.c b/programs/lz4cli.c
index 9a6e5bc..6e52ec6 100644
--- a/programs/lz4cli.c
+++ b/programs/lz4cli.c
@@ -48,10 +48,6 @@
 #  pragma warning(disable : 4127)      // disable: C4127: conditional expression is constant
 #endif
 
-#ifdef __clang__
-#  pragma clang diagnostic ignored "-Wunused-const-variable"   // const variable one is really used !
-#endif
-
 #define _FILE_OFFSET_BITS 64   // Large file support on 32-bits unix
 #define _POSIX_SOURCE 1        // for fileno() within <stdio.h> on unix
 
@@ -128,15 +124,6 @@
 
 
 //**************************************
-// Architecture Macros
-//**************************************
-static const int one = 1;
-#define CPU_LITTLE_ENDIAN   (*(char*)(&one))
-#define CPU_BIG_ENDIAN      (!CPU_LITTLE_ENDIAN)
-#define LITTLE_ENDIAN_32(i) (CPU_LITTLE_ENDIAN?(i):swap32(i))
-
-
-//**************************************
 // Macros
 //**************************************
 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
@@ -462,7 +449,7 @@
     }
 
     DISPLAYLEVEL(3, WELCOME_MESSAGE);
-    DISPLAYLEVEL(4, "Blocks size : %i KB\n", blockSize>>10);
+    if (!decode) DISPLAYLEVEL(4, "Blocks size : %i KB\n", blockSize>>10);
 
     // No input filename ==> use stdin
     if(!input_filename) { input_filename=stdinmark; }