[C++] Use LCP merge sort for $(sort)

and use stable_sort on Mac.

On Linux:
LCPMS: 0.627s, sort: 3.37s, stable_sort: 1.79s, qsort: 1.95s

On Mac:
LCPMS: 1.583s, sort: 1.33s, stable_sort: 1.19s, qsort: 1.80s
diff --git a/Makefile.ckati b/Makefile.ckati
index cf5bab8..4e41d51 100644
--- a/Makefile.ckati
+++ b/Makefile.ckati
@@ -35,6 +35,7 @@
 	flags.cc \
 	func.cc \
 	io.cc \
+	lcp_msort.cc \
 	log.cc \
 	main.cc \
 	mutex.cc \
diff --git a/func.cc b/func.cc
index 49115d4..5c5cf30 100644
--- a/func.cc
+++ b/func.cc
@@ -30,6 +30,7 @@
 #include "eval.h"
 #include "fileutil.h"
 #include "find.h"
+#include "lcp_msort.h"
 #include "log.h"
 #include "parser.h"
 #include "stats.h"
@@ -182,10 +183,15 @@
 }
 
 void SortFunc(const vector<Value*>& args, Evaluator* ev, string* s) {
-  const string&& list = args[0]->Eval(ev);
+  string list;
+  args[0]->Eval(ev, &list);
+  COLLECT_STATS("func sort time");
+  // TODO(hamaji): Probably we could make LCP merge sort faster than
+  // stable_sort.
+#if __APPLE___
   vector<StringPiece> toks;
   WordScanner(list).Split(&toks);
-  sort(toks.begin(), toks.end());
+  stable_sort(toks.begin(), toks.end());
   WordWriter ww(s);
   StringPiece prev;
   for (StringPiece tok : toks) {
@@ -194,6 +200,9 @@
       prev = tok;
     }
   }
+#else
+  StringSortByLcpMsort(&list, s);
+#endif
 }
 
 static int GetNumericValueForFunc(const string& buf) {
diff --git a/lcp_msort.cc b/lcp_msort.cc
new file mode 100644
index 0000000..c062ea5
--- /dev/null
+++ b/lcp_msort.cc
@@ -0,0 +1,210 @@
+// Copyright 2016 Google Inc. All rights reserved
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// http://ci.nii.ac.jp/naid/110006613087
+
+#include "lcp_msort.h"
+
+#include <stdlib.h>
+
+#include "strutil.h"
+
+#define DO_SORT_AND_UNIQ_AT_ONCE
+
+namespace {
+
+struct AnnotatedString {
+  const unsigned char* s;
+  int l;
+};
+
+inline int LcpCompare(const unsigned char* s1,
+                      const unsigned char* s2,
+                      int b,
+                      int& i) {
+  for (i = b; s1[i] || s2[i]; i++) {
+    unsigned char c1 = s1[i];
+    unsigned char c2 = s2[i];
+    if (c1 < c2) {
+      return -1;
+    }
+    else if (c1 > c2) {
+      return 1;
+    }
+  }
+  return 0;
+}
+
+#ifdef DO_SORT_AND_UNIQ_AT_ONCE
+
+int StringMergeSortAndUniq(const vector<const unsigned char*>& data,
+                           int begin,
+                           int end,
+                           AnnotatedString* work,
+                           AnnotatedString* out) {
+  if (begin + 1 == end) {
+    out[begin] = AnnotatedString{ data[begin], 0 };
+    return 1;
+  }
+
+  int mid = (begin + end) / 2;
+  int s1_len = StringMergeSortAndUniq(data, begin, mid, work, out);
+  int s2_len = StringMergeSortAndUniq(data, mid, end, work, out);
+
+  memcpy(work, out + begin, s1_len * sizeof(AnnotatedString));
+
+  AnnotatedString* s1 = work;
+  AnnotatedString* s2 = out + mid;
+  int i = 0, j = 0, k = 0;
+  AnnotatedString* d = out + begin;
+  for (; i < s1_len && j < s2_len; k++) {
+    if (s1[i].l > s2[j].l) {
+      d[k] = s1[i];
+      i++;
+    } else if (s1[i].l < s2[j].l) {
+      d[k] = s2[j];
+      j++;
+    } else {
+      int m;
+      int r = LcpCompare(s1[i].s, s2[j].s, s1[i].l, m);
+      if (r < 0) {
+        d[k] = s1[i];
+        i++;
+        s2[j].l = m;
+      } else if (r > 0) {
+        d[k] = s2[j];
+        j++;
+        s1[i].l = m;
+      } else {
+        // Discard a unique string.
+        d[k] = s1[i];
+        i++;
+        j++;
+      }
+    }
+  }
+
+  if (i < s1_len) {
+    memcpy(&d[k], &s1[i], (s1_len - i) * sizeof(AnnotatedString));
+    k += s1_len - i;
+  }
+  else if (j < s2_len) {
+    memcpy(&d[k], &s2[j], (s2_len - j) * sizeof(AnnotatedString));
+    k += s2_len - j;
+  }
+  return k;
+}
+
+#else
+
+void StringMergeSort(const vector<const unsigned char*>& data,
+                     int begin,
+                     int end,
+                     AnnotatedString* work,
+                     AnnotatedString* out) {
+  if (begin + 1 == end) {
+    out[begin] = AnnotatedString{ data[begin], 0 };
+    return;
+  }
+
+  int mid = (begin + end) / 2;
+  StringMergeSort(data, begin, mid, work, out);
+  StringMergeSort(data, mid, end, work, out);
+
+  memcpy(work, out + begin, (mid - begin) * sizeof(AnnotatedString));
+
+  AnnotatedString* s1 = work;
+  int s1_len = mid - begin;
+  AnnotatedString* s2 = out + mid;
+  int s2_len = end - mid;
+  int i = 0, j = 0, k = 0;
+  AnnotatedString* d = out + begin;
+  for (; i < s1_len && j < s2_len; k++) {
+    if (s1[i].l > s2[j].l) {
+      d[k] = s1[i];
+      i++;
+    } else if (s1[i].l < s2[j].l) {
+      d[k] = s2[j];
+      j++;
+    } else {
+      int m;
+      int r = LcpCompare(s1[i].s, s2[j].s, s1[i].l, m);
+      if (r < 0) {
+        d[k] = s1[i];
+        i++;
+        s2[j].l = m;
+      } else {
+        d[k] = s2[j];
+        j++;
+        s1[i].l = m;
+      }
+    }
+  }
+
+  if (i < s1_len)
+    memcpy(&d[k], &s1[i], (s1_len - i) * sizeof(AnnotatedString));
+  else if (j < s2_len)
+    memcpy(&d[k], &s2[j], (s2_len - j) * sizeof(AnnotatedString));
+}
+
+#endif
+
+}  // namespace
+
+void StringSortByLcpMsort(string* buf, string* out) {
+  vector<const unsigned char*> toks;
+  for (StringPiece tok : WordScanner(*buf)) {
+    const_cast<char*>(tok.data())[tok.size()] = 0;
+    toks.push_back(reinterpret_cast<const unsigned char*>(tok.data()));
+  }
+
+  if (toks.empty())
+    return;
+  if (toks.size() == 1) {
+    *out += reinterpret_cast<const char*>(toks[0]);
+    return;
+  }
+
+  AnnotatedString* as = static_cast<AnnotatedString*>(
+      malloc(toks.size() * sizeof(AnnotatedString)));
+  AnnotatedString* work = static_cast<AnnotatedString*>(
+      malloc((toks.size() / 2 + 1) * sizeof(AnnotatedString)));
+
+#ifdef DO_SORT_AND_UNIQ_AT_ONCE
+  int len = StringMergeSortAndUniq(toks, 0, toks.size(), work, as);
+#else
+  int len = static_cast<int>(toks.size());
+  StringMergeSort(toks, 0, toks.size(), work, as);
+#endif
+
+  WordWriter ww(out);
+#ifdef DO_SORT_AND_UNIQ_AT_ONCE
+  for (int i = 0; i < len; i++) {
+    ww.MaybeAddWhitespace();
+    *out += reinterpret_cast<const char*>(as[i].s);
+  }
+#else
+  StringPiece prev;
+  for (int i = 0; i < len; i++) {
+    StringPiece tok = reinterpret_cast<const char*>(as[i].s);
+    if (prev != tok) {
+      ww.Write(tok);
+      prev = tok;
+    }
+  }
+#endif
+
+  free(as);
+  free(work);
+}
diff --git a/lcp_msort.h b/lcp_msort.h
new file mode 100644
index 0000000..a74b095
--- /dev/null
+++ b/lcp_msort.h
@@ -0,0 +1,25 @@
+// Copyright 2016 Google Inc. All rights reserved
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef LCP_MSORT_H_
+#define LCP_MSORT_H_
+
+#include <string>
+
+using namespace std;
+
+// Split |buf|, sort and unique tokens, and append results to |out|.
+void StringSortByLcpMsort(string* buf, string* out);
+
+#endif  // LCP_MSORT_H_
diff --git a/testcase/sort.mk b/testcase/sort.mk
index c521289..03d0375 100644
--- a/testcase/sort.mk
+++ b/testcase/sort.mk
@@ -1,5 +1,11 @@
+sp := $(subst S, ,S)
+
 test:
 	echo $(sort foo bar lose)
 	echo $(sort foo bar aaaa)
 	echo $(sort foo bar lose lose foo bar bar)
+	echo $(sort baz bar)
+	echo $(sort single)
+	echo $(sort $(sp)foo$(sp))
 	echo $(sort )
+	echo $(sort device/sample/products/AndroidProducts.mk device/moto/shamu/AndroidProducts.mk device/asus/fugu/AndroidProducts.mk device/asus/deb/AndroidProducts.mk device/asus/flo/AndroidProducts.mk device/generic/arm64/AndroidProducts.mk device/generic/qemu/AndroidProducts.mk device/generic/mini-emulator-x86_64/AndroidProducts.mk device/generic/x86/AndroidProducts.mk device/generic/mips/AndroidProducts.mk device/generic/mini-emulator-x86/AndroidProducts.mk device/generic/mini-emulator-mips/AndroidProducts.mk device/generic/mini-emulator-arm64/AndroidProducts.mk device/generic/mini-emulator-armv7-a-neon/AndroidProducts.mk device/generic/x86_64/AndroidProducts.mk device/generic/armv7-a-neon/AndroidProducts.mk device/htc/flounder/AndroidProducts.mk device/lge/bullhead/AndroidProducts.mk device/lge/hammerhead/AndroidProducts.mk device/huawei/angler/AndroidProducts.mk)