add optimized SHA1 algorithm

This optimized implementation of the SHA1 algorithm is about 28%
faster than the old one (on sapphire hardware) but assumes
little-endianness.  Add it, but continue using the old implementation
on big-endian hardware.
diff --git a/include/mincrypt/sha.h b/include/mincrypt/sha.h
index c523460..2bcc522 100644
--- a/include/mincrypt/sha.h
+++ b/include/mincrypt/sha.h
@@ -13,14 +13,14 @@
 **       be used to endorse or promote products derived from this software
 **       without specific prior written permission.
 **
-** THIS SOFTWARE IS PROVIDED BY Google Inc. ``AS IS'' AND ANY EXPRESS OR 
+** THIS SOFTWARE IS PROVIDED BY Google Inc. ``AS IS'' AND ANY EXPRESS OR
 ** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-** MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 
+** MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 ** EVENT SHALL Google Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 ** SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-** PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
-** OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
-** WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+** PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+** OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+** WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 ** OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 ** ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
@@ -29,6 +29,7 @@
 #define _EMBEDDED_SHA_H_
 
 #include <inttypes.h>
+#include <endian.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -36,16 +37,23 @@
 
 typedef struct SHA_CTX {
     uint64_t count;
-    uint8_t buf[64];
     uint32_t state[5];
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+    union {
+        uint8_t b[64];
+        uint32_t w[16];
+    } buf;
+#else
+    uint8_t buf[64];
+#endif
 } SHA_CTX;
 
-void SHA_init(SHA_CTX *ctx);
-void SHA_update(SHA_CTX *ctx, const void* data, int len);
-const uint8_t* SHA_final(SHA_CTX *ctx);
+void SHA_init(SHA_CTX* ctx);
+void SHA_update(SHA_CTX* ctx, const void* data, int len);
+const uint8_t* SHA_final(SHA_CTX* ctx);
 
 /* Convenience method. Returns digest parameter value. */
-const uint8_t* SHA(const void *data, int len, uint8_t *digest);
+const uint8_t* SHA(const void* data, int len, uint8_t* digest);
 
 #define SHA_DIGEST_SIZE 20
 
diff --git a/sha.c b/sha.c
index d6120da..33d1cb3 100644
--- a/sha.c
+++ b/sha.c
@@ -13,20 +13,181 @@
 **       be used to endorse or promote products derived from this software
 **       without specific prior written permission.
 **
-** THIS SOFTWARE IS PROVIDED BY Google Inc. ``AS IS'' AND ANY EXPRESS OR 
+** THIS SOFTWARE IS PROVIDED BY Google Inc. ``AS IS'' AND ANY EXPRESS OR
 ** IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-** MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 
+** MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
 ** EVENT SHALL Google Inc. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 ** SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-** PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
-** OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 
-** WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 
+** PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+** OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+** WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 ** OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 ** ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
+#include <byteswap.h>
+#include <endian.h>
+#include <memory.h>
+
 #include "mincrypt/sha.h"
 
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+
+// This version is about 28% faster than the generic version below,
+// but assumes little-endianness.
+
+static inline uint32_t ror27(uint32_t val) {
+    return (val >> 27) | (val << 5);
+}
+static inline uint32_t ror2(uint32_t val) {
+    return (val >> 2) | (val << 30);
+}
+static inline uint32_t ror31(uint32_t val) {
+    return (val >> 31) | (val << 1);
+}
+
+static void SHA1_Transform(SHA_CTX* ctx) {
+    uint32_t W[80];
+    register uint32_t A, B, C, D, E;
+    int t;
+
+    A = ctx->state[0];
+    B = ctx->state[1];
+    C = ctx->state[2];
+    D = ctx->state[3];
+    E = ctx->state[4];
+
+#define SHA_F1(A,B,C,D,E,t)                     \
+    E += ror27(A) +                             \
+        (W[t] = bswap_32(ctx->buf.w[t])) +      \
+        (D^(B&(C^D))) + 0x5A827999;             \
+    B = ror2(B);
+
+    for (t = 0; t < 15; t += 5) {
+        SHA_F1(A,B,C,D,E,t + 0);
+        SHA_F1(E,A,B,C,D,t + 1);
+        SHA_F1(D,E,A,B,C,t + 2);
+        SHA_F1(C,D,E,A,B,t + 3);
+        SHA_F1(B,C,D,E,A,t + 4);
+    }
+    SHA_F1(A,B,C,D,E,t + 0);  // 16th one, t == 15
+
+#undef SHA_F1
+
+#define SHA_F1(A,B,C,D,E,t)                                     \
+    E += ror27(A) +                                             \
+        (W[t] = ror31(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16])) +   \
+        (D^(B&(C^D))) + 0x5A827999;                             \
+    B = ror2(B);
+
+    SHA_F1(E,A,B,C,D,t + 1);
+    SHA_F1(D,E,A,B,C,t + 2);
+    SHA_F1(C,D,E,A,B,t + 3);
+    SHA_F1(B,C,D,E,A,t + 4);
+
+#undef SHA_F1
+
+#define SHA_F2(A,B,C,D,E,t)                                     \
+    E += ror27(A) +                                             \
+        (W[t] = ror31(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16])) +   \
+        (B^C^D) + 0x6ED9EBA1;                                   \
+    B = ror2(B);
+
+    for (t = 20; t < 40; t += 5) {
+        SHA_F2(A,B,C,D,E,t + 0);
+        SHA_F2(E,A,B,C,D,t + 1);
+        SHA_F2(D,E,A,B,C,t + 2);
+        SHA_F2(C,D,E,A,B,t + 3);
+        SHA_F2(B,C,D,E,A,t + 4);
+    }
+
+#undef SHA_F2
+
+#define SHA_F3(A,B,C,D,E,t)                                     \
+    E += ror27(A) +                                             \
+        (W[t] = ror31(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16])) +   \
+        ((B&C)|(D&(B|C))) + 0x8F1BBCDC;                         \
+    B = ror2(B);
+
+    for (; t < 60; t += 5) {
+        SHA_F3(A,B,C,D,E,t + 0);
+        SHA_F3(E,A,B,C,D,t + 1);
+        SHA_F3(D,E,A,B,C,t + 2);
+        SHA_F3(C,D,E,A,B,t + 3);
+        SHA_F3(B,C,D,E,A,t + 4);
+    }
+
+#undef SHA_F3
+
+#define SHA_F4(A,B,C,D,E,t)                                     \
+    E += ror27(A) +                                             \
+        (W[t] = ror31(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16])) +   \
+        (B^C^D) + 0xCA62C1D6;                                   \
+    B = ror2(B);
+
+    for (; t < 80; t += 5) {
+        SHA_F4(A,B,C,D,E,t + 0);
+        SHA_F4(E,A,B,C,D,t + 1);
+        SHA_F4(D,E,A,B,C,t + 2);
+        SHA_F4(C,D,E,A,B,t + 3);
+        SHA_F4(B,C,D,E,A,t + 4);
+    }
+
+#undef SHA_F4
+
+    ctx->state[0] += A;
+    ctx->state[1] += B;
+    ctx->state[2] += C;
+    ctx->state[3] += D;
+    ctx->state[4] += E;
+}
+
+void SHA_update(SHA_CTX* ctx, const void* data, int len) {
+    int i = ctx->count % sizeof(ctx->buf);
+    const uint8_t* p = (const uint8_t*)data;
+
+    ctx->count += len;
+
+    while (len > sizeof(ctx->buf) - i) {
+        memcpy(&ctx->buf.b[i], p, sizeof(ctx->buf) - i);
+        len -= sizeof(ctx->buf) - i;
+        p += sizeof(ctx->buf) - i;
+        SHA1_Transform(ctx);
+        i = 0;
+    }
+
+    while (len--) {
+        ctx->buf.b[i++] = *p++;
+        if (i == sizeof(ctx->buf)) {
+            SHA1_Transform(ctx);
+            i = 0;
+        }
+    }
+}
+
+
+const uint8_t* SHA_final(SHA_CTX* ctx) {
+    uint64_t cnt = ctx->count * 8;
+    int i;
+
+    SHA_update(ctx, (uint8_t*)"\x80", 1);
+    while ((ctx->count % sizeof(ctx->buf)) != (sizeof(ctx->buf) - 8)) {
+        SHA_update(ctx, (uint8_t*)"\0", 1);
+    }
+    for (i = 0; i < 8; ++i) {
+        uint8_t tmp = cnt >> ((7 - i) * 8);
+        SHA_update(ctx, &tmp, 1);
+    }
+
+    for (i = 0; i < 5; i++) {
+        ctx->buf.w[i] = bswap_32(ctx->state[i]);
+    }
+
+    return ctx->buf.b;
+}
+
+#else   // __BYTE_ORDER == BIG_ENDIAN
+
 #define rol(bits, value) (((value) << (bits)) | ((value) >> (32 - (bits))))
 
 static void SHA1_transform(SHA_CTX *ctx) {
@@ -79,15 +240,6 @@
     ctx->state[4] += E;
 }
 
-void SHA_init(SHA_CTX *ctx) {
-    ctx->state[0] = 0x67452301;
-    ctx->state[1] = 0xEFCDAB89;
-    ctx->state[2] = 0x98BADCFE;
-    ctx->state[3] = 0x10325476;
-    ctx->state[4] = 0xC3D2E1F0;
-    ctx->count = 0;
-}
-
 void SHA_update(SHA_CTX *ctx, const void *data, int len) {
     int i = ctx->count % sizeof(ctx->buf);
     const uint8_t* p = (const uint8_t*)data;
@@ -127,6 +279,17 @@
     return ctx->buf;
 }
 
+#endif // endianness
+
+void SHA_init(SHA_CTX* ctx) {
+    ctx->state[0] = 0x67452301;
+    ctx->state[1] = 0xEFCDAB89;
+    ctx->state[2] = 0x98BADCFE;
+    ctx->state[3] = 0x10325476;
+    ctx->state[4] = 0xC3D2E1F0;
+    ctx->count = 0;
+}
+
 /* Convenience function */
 const uint8_t* SHA(const void *data, int len, uint8_t *digest) {
     const uint8_t *p;