Merge remote-tracking branch 'aosp/upstream'

There are three feature changes

https://github.com/danw/kati/commit/0b544c58579cf2214d
https://github.com/google/kati/commit/348a960f31d42c63
https://github.com/google/kati/commit/422179d6ddbf649a

Other changes are minor performance optimizations. ~1.5x faster
ninja generation overall.

Bug: 27225198
Change-Id: I686cf4357a8c25c14ecbdbc75cedcd6db61eec7b
diff --git a/Android.bp b/Android.bp
index 432505e..fea1241 100644
--- a/Android.bp
+++ b/Android.bp
@@ -15,6 +15,7 @@
 cc_library_host_static {
     name: "libckati",
     srcs: [
+        "affinity.cc",
         "command.cc",
         "condvar.cc",
         "dep.cc",
diff --git a/Makefile.ckati b/Makefile.ckati
index cf5bab8..38e1feb 100644
--- a/Makefile.ckati
+++ b/Makefile.ckati
@@ -22,6 +22,7 @@
 KATI_BIN_PATH ?= .
 
 KATI_CXX_SRCS := \
+	affinity.cc \
 	command.cc \
 	condvar.cc \
 	dep.cc \
diff --git a/affinity.cc b/affinity.cc
new file mode 100644
index 0000000..0743f94
--- /dev/null
+++ b/affinity.cc
@@ -0,0 +1,50 @@
+// Copyright 2016 Google Inc. All rights reserved
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "affinity.h"
+
+#include "flags.h"
+#include "log.h"
+
+#ifdef __linux__
+
+#include <sched.h>
+
+void SetAffinityForSingleThread() {
+  cpu_set_t cs;
+  CPU_ZERO(&cs);
+  int n = g_flags.num_cpus / 2;
+  CPU_SET(n, &cs);
+  if (n > 1)
+    CPU_SET(n + 1, &cs);
+  if (sched_setaffinity(0, sizeof(cs), &cs) < 0)
+    WARN("sched_setaffinity: %s", strerror(errno));
+}
+
+void SetAffinityForMultiThread() {
+  cpu_set_t cs;
+  CPU_ZERO(&cs);
+  for (int i = 0; i < g_flags.num_cpus; i++) {
+    CPU_SET(i, &cs);
+  }
+  if (sched_setaffinity(0, sizeof(cs), &cs) < 0)
+    WARN("sched_setaffinity: %s", strerror(errno));
+}
+
+#else
+
+void SetAffinityForSingleThread() {}
+void SetAffinityForMultiThread() {}
+
+#endif
diff --git a/affinity.h b/affinity.h
new file mode 100644
index 0000000..e6f6adc
--- /dev/null
+++ b/affinity.h
@@ -0,0 +1,21 @@
+// Copyright 2016 Google Inc. All rights reserved
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef AFFINITY_H_
+#define AFFINITY_H_
+
+void SetAffinityForSingleThread();
+void SetAffinityForMultiThread();
+
+#endif
diff --git a/condvar.cc b/condvar.cc
index d7d4e39..f8b488b 100644
--- a/condvar.cc
+++ b/condvar.cc
@@ -26,8 +26,8 @@
     PERROR("pthread_cond_destroy");
 }
 
-void condition_variable::wait(const unique_lock<mutex>& mu) {
-  if (pthread_cond_wait(&cond_, &mu.mutex()->mu_) != 0)
+void condition_variable::wait(const UniqueLock<Mutex>& mu) {
+  if (pthread_cond_wait(&cond_, &mu.Mutex()->mu_) != 0)
     PERROR("pthread_cond_wait");
 }
 
diff --git a/condvar.h b/condvar.h
index 2f9ab3a..b75b9a4 100644
--- a/condvar.h
+++ b/condvar.h
@@ -24,7 +24,7 @@
   condition_variable();
   ~condition_variable();
 
-  void wait(const unique_lock<mutex>& mu);
+  void wait(const UniqueLock<Mutex>& mu);
   void notify_one();
   void notify_all();
 
diff --git a/dep.cc b/dep.cc
index 45f9584..434d9ad 100644
--- a/dep.cc
+++ b/dep.cc
@@ -27,8 +27,10 @@
 #include "fileutil.h"
 #include "log.h"
 #include "rule.h"
+#include "stats.h"
 #include "strutil.h"
 #include "symtab.h"
+#include "timeutil.h"
 #include "var.h"
 
 namespace {
@@ -116,13 +118,14 @@
 class DepBuilder {
  public:
   DepBuilder(Evaluator* ev,
-             const vector<shared_ptr<Rule>>& rules,
+             const vector<const Rule*>& rules,
              const unordered_map<Symbol, Vars*>& rule_vars)
       : ev_(ev),
         rule_vars_(rule_vars),
         implicit_rules_(new RuleTrie()),
         first_rule_(NULL),
         depfile_var_name_(Intern(".KATI_DEPFILE")) {
+    ScopedTimeReporter tr("make dep (populate)");
     PopulateRules(rules);
     LOG_STAT("%zu variables", ev->mutable_vars()->size());
     LOG_STAT("%zu explicit rules", rules_.size());
@@ -226,8 +229,8 @@
     return ::Exists(target.str());
   }
 
-  void PopulateRules(const vector<shared_ptr<Rule>>& rules) {
-    for (shared_ptr<Rule> rule : rules) {
+  void PopulateRules(const vector<const Rule*>& rules) {
+    for (const Rule* rule : rules) {
       if (rule->outputs.empty()) {
         PopulateImplicitRule(rule);
       } else {
@@ -239,7 +242,7 @@
     }
   }
 
-  bool PopulateSuffixRule(shared_ptr<Rule> rule, Symbol output) {
+  bool PopulateSuffixRule(const Rule* rule, Symbol output) {
     if (output.empty() || output.str()[0] != '.')
       return false;
 
@@ -291,6 +294,7 @@
                               const Rule& rule,
                               Symbol output,
                               bool is_suffix_rule) {
+    COLLECT_STATS("make dep (merge rule)");
     if (old_rule.is_double_colon != rule.is_double_colon) {
       ERROR("%s:%d: *** target file `%s' has both : and :: entries.",
             LOCF(rule.loc), output.str().c_str());
@@ -337,7 +341,7 @@
     return r;
   }
 
-  void PopulateExplicitRule(shared_ptr<Rule> orig_rule) {
+  void PopulateExplicitRule(const Rule* orig_rule) {
     for (Symbol output : orig_rule->outputs) {
       const bool is_suffix_rule = PopulateSuffixRule(orig_rule, output);
 
@@ -345,7 +349,7 @@
       rule->outputs.clear();
       rule->outputs.push_back(output);
 
-      auto p = rules_.insert(make_pair(output, rule));
+      auto p = rules_.emplace(output, rule);
       if (p.second) {
         if (!first_rule_ && output.get(0) != '.') {
           rule->is_default_target = true;
@@ -358,9 +362,24 @@
     }
   }
 
-  void PopulateImplicitRule(shared_ptr<Rule> rule) {
+  static bool IsIgnorableImplicitRule(const Rule* rule) {
+    // As kati doesn't have RCS/SCCS related default rules, we can
+    // safely ignore suppression for them.
+    if (rule->inputs.size() != 1)
+      return false;
+    if (!rule->order_only_inputs.empty())
+      return false;
+    if (!rule->cmds.empty())
+      return false;
+    const string& i = rule->inputs[0].str();
+    return (i == "RCS/%,v" || i == "RCS/%" || i == "%,v" ||
+            i == "s.%" || i == "SCCS/s.%");
+  }
+
+  void PopulateImplicitRule(const Rule* rule) {
     for (Symbol output_pattern : rule->output_patterns) {
-      implicit_rules_->Add(output_pattern.str(), rule.get());
+      if (output_pattern.str() != "%" || !IsIgnorableImplicitRule(rule))
+        implicit_rules_->Add(output_pattern.str(), rule);
     }
   }
 
@@ -437,6 +456,7 @@
 
   bool PickRule(Symbol output, DepNode* n,
                 shared_ptr<Rule>* out_rule, Vars** out_var) {
+    COLLECT_STATS("make dep (pick rule)");
     shared_ptr<Rule> rule = LookupRule(output);
     Vars* vars = LookupRuleVars(output);
     *out_rule = rule;
@@ -539,6 +559,7 @@
 
     vector<unique_ptr<ScopedVar>> sv;
     if (vars) {
+      COLLECT_STATS("make dep (create scope)");
       for (const auto& p : *vars) {
         Symbol name = p.first;
         RuleVar* var = reinterpret_cast<RuleVar*>(p.second);
@@ -619,11 +640,12 @@
 };
 
 void MakeDep(Evaluator* ev,
-             const vector<shared_ptr<Rule>>& rules,
+             const vector<const Rule*>& rules,
              const unordered_map<Symbol, Vars*>& rule_vars,
              const vector<Symbol>& targets,
              vector<DepNode*>* nodes) {
   DepBuilder db(ev, rules, rule_vars);
+  ScopedTimeReporter tr("make dep (build)");
   db.Build(targets, nodes);
 }
 
diff --git a/dep.h b/dep.h
index bb70eb3..05d10d0 100644
--- a/dep.h
+++ b/dep.h
@@ -15,7 +15,6 @@
 #ifndef DEP_H_
 #define DEP_H_
 
-#include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -53,7 +52,7 @@
 void QuitDepNodePool();
 
 void MakeDep(Evaluator* ev,
-             const vector<shared_ptr<Rule>>& rules,
+             const vector<const Rule*>& rules,
              const unordered_map<Symbol, Vars*>& rule_vars,
              const vector<Symbol>& targets,
              vector<DepNode*>* nodes);
diff --git a/eval.cc b/eval.cc
index ae41567..be4bb34 100644
--- a/eval.cc
+++ b/eval.cc
@@ -30,15 +30,8 @@
 #include "symtab.h"
 #include "var.h"
 
-EvalResult::~EvalResult() {
-  for (auto p : rule_vars)
-    delete p.second;
-  delete vars;
-}
-
 Evaluator::Evaluator(const Vars* vars)
-    : in_vars_(vars),
-      vars_(new Vars()),
+    : vars_(new Vars(*vars)),
       last_rule_(NULL),
       current_scope_(NULL),
       avoid_io_(false),
@@ -61,9 +54,12 @@
   Var* rhs = NULL;
   bool needs_assign = true;
   switch (op) {
-    case AssignOp::COLON_EQ:
-      rhs = new SimpleVar(rhs_v->Eval(this), origin);
+    case AssignOp::COLON_EQ: {
+      SimpleVar* sv = new SimpleVar(origin);
+      rhs_v->Eval(this, sv->mutable_value());
+      rhs = sv;
       break;
+    }
     case AssignOp::EQ:
       rhs = new RecursiveVar(rhs_v, origin, orig_rhs);
       break;
@@ -100,7 +96,7 @@
 void Evaluator::EvalAssign(const AssignStmt* stmt) {
   loc_ = stmt->loc();
   last_rule_ = NULL;
-  Symbol lhs = Intern(stmt->lhs->Eval(this));
+  Symbol lhs = stmt->GetLhsSymbol(this);
   if (lhs.empty())
     Error("*** empty variable name.");
   Var* rhs = EvalRHS(lhs, stmt->rhs, stmt->orig_rhs, stmt->op,
@@ -131,11 +127,12 @@
     }
 
     LOG("Rule: %s", rule->DebugString().c_str());
-    rules_.push_back(shared_ptr<Rule>(rule));
+    rules_.push_back(rule);
     last_rule_ = rule;
     return;
   }
 
+  Symbol lhs = Intern(rule_var.lhs);
   for (Symbol output : rule_var.outputs) {
     auto p = rule_vars_.emplace(output, nullptr);
     if (p.second) {
@@ -159,7 +156,6 @@
     }
 
     current_scope_ = p.first->second;
-    Symbol lhs = Intern(rule_var.lhs);
     Var* rhs_var = EvalRHS(lhs, rhs, StringPiece("*TODO*"), rule_var.op);
     if (rhs_var)
       current_scope_->Assign(lhs, new RuleVar(rhs_var, rule_var.op));
@@ -291,9 +287,6 @@
   Var* v = vars_->Lookup(name);
   if (v->IsDefined())
     return v;
-  v = in_vars_->Lookup(name);
-  if (v->IsDefined())
-    return v;
   used_undefined_vars_.insert(name);
   return v;
 }
diff --git a/eval.h b/eval.h
index 6bc21b9..cb03d5d 100644
--- a/eval.h
+++ b/eval.h
@@ -15,7 +15,6 @@
 #ifndef EVAL_H_
 #define EVAL_H_
 
-#include <memory>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
@@ -32,15 +31,6 @@
 class Var;
 class Vars;
 
-struct EvalResult {
-  ~EvalResult();
-  vector<shared_ptr<Rule>> rules;
-  Vars* vars;
-  unordered_map<StringPiece, Vars*> rule_vars;
-  // TODO: read_mks
-  unordered_map<StringPiece, bool> exports;
-};
-
 class Evaluator {
  public:
   Evaluator(const Vars* vars);
@@ -62,7 +52,7 @@
   const Loc& loc() const { return loc_; }
   void set_loc(const Loc& loc) { loc_ = loc; }
 
-  const vector<shared_ptr<Rule>>& rules() const { return rules_; }
+  const vector<const Rule*>& rules() const { return rules_; }
   const unordered_map<Symbol, Vars*>& rule_vars() const {
     return rule_vars_;
   }
@@ -107,10 +97,9 @@
 
   Var* LookupVarGlobal(Symbol name);
 
-  const Vars* in_vars_;
   Vars* vars_;
   unordered_map<Symbol, Vars*> rule_vars_;
-  vector<shared_ptr<Rule>> rules_;
+  vector<const Rule*> rules_;
   unordered_map<Symbol, bool> exports_;
 
   Rule* last_rule_;
diff --git a/expr.cc b/expr.cc
index a8f306c..a629352 100644
--- a/expr.cc
+++ b/expr.cc
@@ -62,7 +62,8 @@
     s->append(s_.begin(), s_.end());
   }
 
-  virtual bool IsLiteral() const { return true; }
+  virtual bool IsLiteral() const override { return true; }
+  virtual StringPiece GetLiteralValueUnsafe() const { return s_; }
 
   virtual string DebugString_() const override {
     return s_.as_string();
@@ -109,7 +110,7 @@
     return r;
   }
 
-  virtual Value* Compact() {
+  virtual Value* Compact() override {
     if (vals_.size() != 1) {
       return this;
     }
diff --git a/expr.h b/expr.h
index e479ed8..fdd6d36 100644
--- a/expr.h
+++ b/expr.h
@@ -42,6 +42,8 @@
   virtual Value* Compact() { return this; }
 
   virtual bool IsLiteral() const { return false; }
+  // Only safe after IsLiteral() returns true.
+  virtual StringPiece GetLiteralValueUnsafe() const { return ""; }
 
   string DebugString() const;
 
diff --git a/file_cache.cc b/file_cache.cc
index 6a05550..afdb3bd 100644
--- a/file_cache.cc
+++ b/file_cache.cc
@@ -44,7 +44,7 @@
 
   virtual Makefile* ReadMakefile(const string& filename) override {
     Makefile* result = NULL;
-    auto p = cache_.insert(make_pair(filename, result));
+    auto p = cache_.emplace(filename, result);
     if (p.second) {
       p.first->second = result = new Makefile(filename);
     } else {
diff --git a/fileutil.cc b/fileutil.cc
index 42e81a2..a2a0d09 100644
--- a/fileutil.cc
+++ b/fileutil.cc
@@ -32,6 +32,7 @@
 #include <unordered_map>
 
 #include "log.h"
+#include "strutil.h"
 
 bool Exists(StringPiece filename) {
   CHECK(filename.size() < PATH_MAX);
@@ -62,6 +63,13 @@
 int RunCommand(const string& shell, const string& cmd,
                RedirectStderr redirect_stderr,
                string* s) {
+  string cmd_escaped = cmd;
+  EscapeShell(&cmd_escaped);
+  string cmd_with_shell = shell + " -c \"" + cmd_escaped + "\"";
+  const char* argv[] = {
+    "/bin/sh", "-c", cmd_with_shell.c_str(), NULL
+  };
+
   int pipefd[2];
   if (pipe(pipefd) != 0)
     PERROR("pipe failed");
@@ -106,9 +114,6 @@
       PERROR("dup2 failed");
     close(pipefd[1]);
 
-    const char* argv[] = {
-      shell.c_str(), "-c", cmd.c_str(), NULL
-    };
     execvp(argv[0], const_cast<char**>(argv));
     PLOG("execvp for %s failed", argv[0]);
     kill(getppid(), SIGTERM);
diff --git a/find.cc b/find.cc
index 4c7cd8f..b783acb 100644
--- a/find.cc
+++ b/find.cc
@@ -154,7 +154,7 @@
   virtual bool RunFind(const FindCommand& fc, int d,
                        string* path,
                        unordered_map<const DirentNode*, string>*,
-                       string* out) const {
+                       string* out) const override {
     PrintIfNecessary(fc, *path, type_, d, out);
     return true;
   }
@@ -205,7 +205,7 @@
     }
   }
 
-  virtual const DirentNode* FindDir(StringPiece d) const {
+  virtual const DirentNode* FindDir(StringPiece d) const override {
     if (d.empty() || d == ".")
       return this;
     size_t index = d.find('/');
@@ -224,7 +224,7 @@
   virtual bool RunFind(const FindCommand& fc, int d,
                        string* path,
                        unordered_map<const DirentNode*, string>* cur_read_dirs,
-                       string* out) const {
+                       string* out) const override {
     ScopedReadDirTracker srdt(this, *path, cur_read_dirs);
     if (!srdt.ok()) {
       fprintf(stderr, "FindEmulator: find: File system loop detected; `%s' is "
@@ -308,7 +308,7 @@
       : DirentNode(name), to_(NULL), errno_(0) {
   }
 
-  virtual const DirentNode* FindDir(StringPiece d) const {
+  virtual const DirentNode* FindDir(StringPiece d) const override {
     if (errno_ == 0 && to_)
       return to_->FindDir(d);
     return NULL;
@@ -317,7 +317,7 @@
   virtual bool RunFind(const FindCommand& fc, int d,
                        string* path,
                        unordered_map<const DirentNode*, string>* cur_read_dirs,
-                       string* out) const {
+                       string* out) const override {
     unsigned char type = DT_LNK;
     if (fc.follows_symlinks && errno_ != ENOENT) {
       if (errno_) {
diff --git a/flags.cc b/flags.cc
index 06772f0..73c7105 100644
--- a/flags.cc
+++ b/flags.cc
@@ -49,7 +49,7 @@
 
 void Flags::Parse(int argc, char** argv) {
   subkati_args.push_back(argv[0]);
-  num_jobs = sysconf(_SC_NPROCESSORS_ONLN);
+  num_jobs = num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
   const char* num_jobs_str;
 
   for (int i = 1; i < argc; i++) {
diff --git a/flags.h b/flags.h
index d2fe923..275a3ee 100644
--- a/flags.h
+++ b/flags.h
@@ -44,6 +44,7 @@
   const char* makefile;
   const char* ninja_dir;
   const char* ninja_suffix;
+  int num_cpus;
   int num_jobs;
   int remote_num_jobs;
   vector<const char*> subkati_args;
diff --git a/func.cc b/func.cc
index 39af0e9..2ea8afa 100644
--- a/func.cc
+++ b/func.cc
@@ -182,10 +182,14 @@
 }
 
 void SortFunc(const vector<Value*>& args, Evaluator* ev, string* s) {
-  const string&& list = args[0]->Eval(ev);
+  string list;
+  args[0]->Eval(ev, &list);
+  COLLECT_STATS("func sort time");
+  // TODO(hamaji): Probably we could use a faster string-specific sort
+  // algorithm.
   vector<StringPiece> toks;
   WordScanner(list).Split(&toks);
-  sort(toks.begin(), toks.end());
+  stable_sort(toks.begin(), toks.end());
   WordWriter ww(s);
   StringPiece prev;
   for (StringPiece tok : toks) {
@@ -574,8 +578,9 @@
 }
 
 void CallFunc(const vector<Value*>& args, Evaluator* ev, string* s) {
-  static const string tmpvar_names[] = {
-    "0", "1",  "2",  "3",  "4",  "5",  "6",  "7",  "8",  "9"
+  static const Symbol tmpvar_names[] = {
+    Intern("0"), Intern("1"),  Intern("2"), Intern("3"), Intern("4"),
+    Intern("5"), Intern("6"),  Intern("7"), Intern("8"), Intern("9")
   };
 
   const string&& func_name = args[0]->Eval(ev);
@@ -593,25 +598,25 @@
   vector<unique_ptr<ScopedVar>> sv;
   for (size_t i = 1; ; i++) {
     string s;
-    StringPiece tmpvar_name;
+    Symbol tmpvar_name_sym(Symbol::IsUninitialized{});
     if (i < sizeof(tmpvar_names)/sizeof(tmpvar_names[0])) {
-      tmpvar_name = tmpvar_names[i];
+      tmpvar_name_sym = tmpvar_names[i];
     } else {
       s = StringPrintf("%d", i);
-      tmpvar_name = s;
+      tmpvar_name_sym = Intern(s);
     }
     if (i < args.size()) {
       sv.emplace_back(new ScopedVar(ev->mutable_vars(),
-                                    Intern(tmpvar_name), av[i-1].get()));
+                                    tmpvar_name_sym, av[i-1].get()));
     } else {
       // We need to blank further automatic vars
-      Var *v = ev->LookupVar(Intern(tmpvar_name));
+      Var *v = ev->LookupVar(tmpvar_name_sym);
       if (!v->IsDefined()) break;
       if (v->Origin() != VarOrigin::AUTOMATIC) break;
 
       av.emplace_back(new SimpleVar("", VarOrigin::AUTOMATIC));
       sv.emplace_back(new ScopedVar(ev->mutable_vars(),
-                                    Intern(tmpvar_name), av[i-1].get()));
+                                    tmpvar_name_sym, av[i-1].get()));
     }
   }
 
diff --git a/main.cc b/main.cc
index a64a37e..c732dda 100644
--- a/main.cc
+++ b/main.cc
@@ -21,6 +21,7 @@
 #include <time.h>
 #include <unistd.h>
 
+#include "affinity.h"
 #include "dep.h"
 #include "eval.h"
 #include "exec.h"
@@ -123,7 +124,7 @@
   if (g_flags.generate_ninja && (g_flags.regen || g_flags.dump_kati_stamp)) {
     ScopedTimeReporter tr("regen check time");
     if (!NeedsRegen(start_time, orig_args)) {
-      printf("No need to regenerate ninja file\n");
+      fprintf(stderr, "No need to regenerate ninja file\n");
       return 0;
     }
     if (g_flags.dump_kati_stamp) {
@@ -133,9 +134,14 @@
     ClearGlobCache();
   }
 
+  SetAffinityForSingleThread();
+
   MakefileCacheManager* cache_mgr = NewMakefileCacheManager();
 
   Vars* vars = new Vars();
+  vars->Assign(Intern("MAKEFILE_LIST"),
+               new SimpleVar(StringPrintf(" %s", g_flags.makefile),
+                             VarOrigin::FILE));
   for (char** p = environ; *p; p++) {
     SetVar(*p, VarOrigin::ENVIRONMENT, vars);
   }
@@ -154,10 +160,6 @@
     SetVar(l, VarOrigin::COMMAND_LINE, ev->mutable_vars());
   }
 
-  vars->Assign(Intern("MAKEFILE_LIST"),
-               new SimpleVar(StringPrintf(" %s", g_flags.makefile),
-                             VarOrigin::FILE));
-
   {
     ScopedTimeReporter tr("eval time");
     Makefile* mk = cache_mgr->ReadMakefile(g_flags.makefile);
@@ -208,6 +210,8 @@
   for (Stmt* stmt : bootstrap_asts)
     delete stmt;
   delete ev;
+  // Each Var will be deleted by |ev|.
+  vars->clear();
   delete vars;
   delete cache_mgr;
 
diff --git a/mutex.cc b/mutex.cc
index ae4bd3b..986366b 100644
--- a/mutex.cc
+++ b/mutex.cc
@@ -16,22 +16,22 @@
 
 #include "log.h"
 
-mutex::mutex() {
+Mutex::Mutex() {
   if (pthread_mutex_init(&mu_, NULL) != 0)
     PERROR("pthread_mutex_init");
 }
 
-mutex::~mutex() {
+Mutex::~Mutex() {
   if (pthread_mutex_destroy(&mu_) != 0)
     PERROR("pthread_mutex_destroy");
 }
 
-void mutex::lock() {
+void Mutex::lock() {
   if (pthread_mutex_lock(&mu_) != 0)
     PERROR("pthread_mutex_lock");
 }
 
-void mutex::unlock() {
+void Mutex::unlock() {
   if (pthread_mutex_unlock(&mu_) != 0)
     PERROR("pthread_mutex_unlock");
 }
diff --git a/mutex.h b/mutex.h
index d6f1f5a..e730294 100644
--- a/mutex.h
+++ b/mutex.h
@@ -17,10 +17,10 @@
 
 #include <pthread.h>
 
-class mutex {
+class Mutex {
  public:
-  explicit mutex();
-  ~mutex();
+  explicit Mutex();
+  ~Mutex();
 
   void lock();
   void unlock();
@@ -31,17 +31,17 @@
   friend class condition_variable;
 };
 
-template<class T> class unique_lock {
+template<class T> class UniqueLock {
  public:
-  explicit unique_lock(T& mu)
+  explicit UniqueLock(T& mu)
       : mu_(mu) {
     mu_.lock();
   }
-  ~unique_lock() {
+  ~UniqueLock() {
     mu_.unlock();
   }
 
-  T* mutex() const { return &mu_; }
+  T* Mutex() const { return &mu_; }
 
  private:
   T& mu_;
diff --git a/ninja.cc b/ninja.cc
index f14610f..eeefce0 100644
--- a/ninja.cc
+++ b/ninja.cc
@@ -22,6 +22,7 @@
 #include <unistd.h>
 
 #include <map>
+#include <sstream>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -40,6 +41,8 @@
 #include "string_piece.h"
 #include "stringprintf.h"
 #include "strutil.h"
+#include "thread_pool.h"
+#include "timeutil.h"
 #include "var.h"
 #include "version.h"
 
@@ -165,6 +168,12 @@
   return true;
 }
 
+struct NinjaNode {
+  const DepNode* node;
+  vector<Command*> commands;
+  int rule_id;
+};
+
 class NinjaGenerator {
  public:
   NinjaGenerator(Evaluator* ev, double start_time)
@@ -175,7 +184,7 @@
         start_time_(start_time),
         default_target_(NULL) {
     ev_->set_avoid_io(true);
-    shell_ = ev->EvalVar(kShellSym);
+    shell_ = EscapeNinja(ev->EvalVar(kShellSym));
     if (g_flags.goma_dir)
       gomacc_ = StringPrintf("%s/gomacc ", g_flags.goma_dir);
 
@@ -184,12 +193,15 @@
 
   ~NinjaGenerator() {
     ev_->set_avoid_io(false);
+    for (NinjaNode* nn : nodes_)
+      delete nn;
   }
 
   void Generate(const vector<DepNode*>& nodes,
                 const string& orig_args) {
     unlink(GetNinjaStampFilename().c_str());
-    GenerateNinja(nodes, orig_args);
+    PopulateNinjaNodes(nodes);
+    GenerateNinja(orig_args);
     GenerateShell();
     GenerateStamp(orig_args);
   }
@@ -206,8 +218,40 @@
   }
 
  private:
-  string GenRuleName() {
-    return StringPrintf("rule%d", rule_id_++);
+  void PopulateNinjaNodes(const vector<DepNode*>& nodes) {
+    ScopedTimeReporter tr("ninja gen (eval)");
+    for (DepNode* node : nodes) {
+      PopulateNinjaNode(node);
+    }
+  }
+
+  void PopulateNinjaNode(DepNode* node) {
+    auto p = done_.insert(node->output);
+    if (!p.second)
+      return;
+
+    // A hack to exclude out phony target in Android. If this exists,
+    // "ninja -t clean" tries to remove this directory and fails.
+    if (g_flags.detect_android_echo && node->output.str() == "out")
+      return;
+
+    // This node is a leaf node
+    if (!node->has_rule && !node->is_phony) {
+      return;
+    }
+
+    NinjaNode* nn = new NinjaNode;
+    nn->node = node;
+    ce_.Eval(node, &nn->commands);
+    nn->rule_id = nn->commands.empty() ? -1 : rule_id_++;
+    nodes_.push_back(nn);
+
+    for (DepNode* d : node->deps) {
+      PopulateNinjaNode(d);
+    }
+    for (DepNode* d : node->order_onlys) {
+      PopulateNinjaNode(d);
+    }
   }
 
   StringPiece TranslateCommand(const char* in, string* cmd_buf) {
@@ -284,6 +328,22 @@
                        cmd_buf->size() - orig_size);
   }
 
+  bool IsOutputMkdir(const char *name, StringPiece cmd) {
+    if (!HasPrefix(cmd, "mkdir -p ")) {
+      return false;
+    }
+    cmd = cmd.substr(9, cmd.size());
+    if (cmd.get(cmd.size() - 1) == '/') {
+      cmd = cmd.substr(0, cmd.size() - 1);
+    }
+
+    StringPiece dir = Dirname(name);
+    if (cmd == dir) {
+      return true;
+    }
+    return false;
+  }
+
   bool GetDescriptionFromCommand(StringPiece cmd, string *out) {
     if (!HasPrefix(cmd, "echo ")) {
       return false;
@@ -333,7 +393,8 @@
     return true;
   }
 
-  bool GenShellScript(const vector<Command*>& commands,
+  bool GenShellScript(const char *name,
+                      const vector<Command*>& commands,
                       string* cmd_buf,
                       string* description) {
     // TODO: This is a dirty hack to set local_pool even without
@@ -347,7 +408,10 @@
     bool got_descritpion = false;
     bool use_gomacc = false;
     bool should_ignore_error = false;
+    auto command_count = commands.size();
     for (const Command* c : commands) {
+      size_t cmd_begin = cmd_buf->size();
+
       if (!cmd_buf->empty()) {
         if (should_ignore_error) {
           *cmd_buf += " ; ";
@@ -361,7 +425,7 @@
       while (isspace(*in))
         in++;
 
-      bool needs_subshell = commands.size() > 1;
+      bool needs_subshell = command_count > 1;
       if (*in == '(') {
         needs_subshell = false;
       }
@@ -374,11 +438,15 @@
       if (g_flags.detect_android_echo && !got_descritpion && !c->echo &&
           GetDescriptionFromCommand(translated, description)) {
         got_descritpion = true;
-        cmd_buf->resize(cmd_start);
+        translated.clear();
+      } else if (IsOutputMkdir(name, translated) && !c->echo &&
+                 cmd_begin == 0) {
         translated.clear();
       }
       if (translated.empty()) {
-        *cmd_buf += "true";
+        cmd_buf->resize(cmd_begin);
+        command_count -= 1;
+        continue;
       } else if (g_flags.goma_dir) {
         size_t pos = GetGomaccPosForAndroidCompileCommand(translated);
         if (pos != string::npos) {
@@ -401,7 +469,7 @@
             g_flags.goma_dir) && !use_gomacc;
   }
 
-  bool GetDepfile(DepNode* node, string* cmd_buf, string* depfile) {
+  bool GetDepfile(const DepNode* node, string* cmd_buf, string* depfile) {
     if (node->depfile_var) {
       node->depfile_var->Eval(ev_, depfile);
       return true;
@@ -413,75 +481,55 @@
     return result;
   }
 
-  void EmitDepfile(DepNode* node, string* cmd_buf) {
+  void EmitDepfile(NinjaNode* nn, string* cmd_buf, ostringstream* o) {
+    const DepNode* node = nn->node;
     string depfile;
     if (!GetDepfile(node, cmd_buf, &depfile))
       return;
-    fprintf(fp_, " depfile = %s\n", depfile.c_str());
-    fprintf(fp_, " deps = gcc\n");
+    *o << " depfile = " << depfile << "\n";
+    *o << " deps = gcc\n";
   }
 
-  void EmitNode(DepNode* node) {
-    auto p = done_.insert(node->output);
-    if (!p.second)
-      return;
-
-    // A hack to exclude out phony target in Android. If this exists,
-    // "ninja -t clean" tries to remove this directory and fails.
-    if (g_flags.detect_android_echo && node->output.str() == "out")
-      return;
-
-    // This node is a leaf node
-    if (!node->has_rule && !node->is_phony) {
-      return;
-    }
-
-    vector<Command*> commands;
-    ce_.Eval(node, &commands);
+  void EmitNode(NinjaNode* nn, ostringstream* o) {
+    const DepNode* node = nn->node;
+    const vector<Command*>& commands = nn->commands;
 
     string rule_name = "phony";
     bool use_local_pool = false;
     if (!commands.empty()) {
-      rule_name = GenRuleName();
-      fprintf(fp_, "rule %s\n", rule_name.c_str());
+      rule_name = StringPrintf("rule%d", nn->rule_id);
+      *o << "rule " << rule_name << "\n";
 
       string description = "build $out";
       string cmd_buf;
-      use_local_pool |= GenShellScript(commands, &cmd_buf, &description);
-      fprintf(fp_, " description = %s\n", description.c_str());
-      EmitDepfile(node, &cmd_buf);
+      use_local_pool |= GenShellScript(node->output.c_str(), commands,
+                                       &cmd_buf, &description);
+      *o << " description = " << description << "\n";
+      EmitDepfile(nn, &cmd_buf, o);
 
       // It seems Linux is OK with ~130kB and Mac's limit is ~250kB.
       // TODO: Find this number automatically.
       if (cmd_buf.size() > 100 * 1000) {
-        fprintf(fp_, " rspfile = $out.rsp\n");
-        fprintf(fp_, " rspfile_content = %s\n", cmd_buf.c_str());
-        fprintf(fp_, " command = %s $out.rsp\n", shell_.c_str());
+        *o << " rspfile = $out.rsp\n";
+        *o << " rspfile_content = " << cmd_buf << "\n";
+        *o << " command = " << shell_ << " $out.rsp\n";
       } else {
         EscapeShell(&cmd_buf);
-        fprintf(fp_, " command = %s -c \"%s\"\n",
-                shell_.c_str(), cmd_buf.c_str());
+        *o << " command = " << shell_ << " -c \"" << cmd_buf << "\"\n";
       }
       if (node->is_restat) {
-        fprintf(fp_, " restat = 1\n");
+        *o << " restat = 1\n";
       }
     }
 
-    EmitBuild(node, rule_name, use_local_pool);
-
-    for (DepNode* d : node->deps) {
-      EmitNode(d);
-    }
-    for (DepNode* d : node->order_onlys) {
-      EmitNode(d);
-    }
+    EmitBuild(nn, rule_name, use_local_pool, o);
   }
 
-  string EscapeBuildTarget(Symbol s) const {
-    if (s.str().find_first_of("$: ") == string::npos)
-      return s.str();
+  string EscapeNinja(const string& s) const {
+    if (s.find_first_of("$: ") == string::npos)
+      return s;
     string r;
-    for (char c : s.str()) {
+    for (char c : s) {
       switch (c) {
         case '$':
         case ':':
@@ -495,58 +543,33 @@
     return r;
   }
 
-  void EscapeShell(string* s) const {
-    if (s->find_first_of("$`\\\"") == string::npos)
-      return;
-    string r;
-    bool last_dollar = false;
-    for (char c : *s) {
-      switch (c) {
-        case '$':
-          if (last_dollar) {
-            r += c;
-            last_dollar = false;
-          } else {
-            r += '\\';
-            r += c;
-            last_dollar = true;
-          }
-          break;
-        case '`':
-        case '"':
-        case '\\':
-          r += '\\';
-          // fall through.
-        default:
-          r += c;
-          last_dollar = false;
-      }
-    }
-    s->swap(r);
+  string EscapeBuildTarget(Symbol s) const {
+    return EscapeNinja(s.str());
   }
 
-  void EmitBuild(DepNode* node, const string& rule_name, bool use_local_pool) {
+  void EmitBuild(NinjaNode* nn, const string& rule_name,
+                 bool use_local_pool, ostringstream* o) {
+    const DepNode* node = nn->node;
     string target = EscapeBuildTarget(node->output);
-    fprintf(fp_, "build %s: %s",
-            target.c_str(),
-            rule_name.c_str());
+    *o << "build " << target << ": " << rule_name;
     vector<Symbol> order_onlys;
     if (node->is_phony) {
-      fprintf(fp_, " _kati_always_build_");
+      *o << " _kati_always_build_";
     }
     for (DepNode* d : node->deps) {
-      fprintf(fp_, " %s", EscapeBuildTarget(d->output).c_str());
+      *o << " " << EscapeBuildTarget(d->output).c_str();
     }
     if (!node->order_onlys.empty()) {
-      fprintf(fp_, " ||");
+      *o << " ||";
       for (DepNode* d : node->order_onlys) {
-        fprintf(fp_, " %s", EscapeBuildTarget(d->output).c_str());
+        *o << " " << EscapeBuildTarget(d->output).c_str();
       }
     }
-    fprintf(fp_, "\n");
+    *o << "\n";
     if (use_local_pool)
-      fprintf(fp_, " pool = local_pool\n");
+      *o << " pool = local_pool\n";
     if (node->is_default_target) {
+      UniqueLock<Mutex> lock(mu_);
       default_target_ = node;
     }
   }
@@ -573,8 +596,8 @@
     return GetFilename("env%s.sh");
   }
 
-  void GenerateNinja(const vector<DepNode*>& nodes,
-                     const string& orig_args) {
+  void GenerateNinja(const string& orig_args) {
+    ScopedTimeReporter tr("ninja gen (emit)");
     fp_ = fopen(GetNinjaFilename().c_str(), "wb");
     if (fp_ == NULL)
       PERROR("fopen(build.ninja) failed");
@@ -601,8 +624,24 @@
 
     EmitRegenRules(orig_args);
 
-    for (DepNode* node : nodes) {
-      EmitNode(node);
+    unique_ptr<ThreadPool> tp(NewThreadPool(g_flags.num_jobs));
+    CHECK(g_flags.num_jobs);
+    int num_nodes_per_task = nodes_.size() / (g_flags.num_jobs * 10) + 1;
+    int num_tasks = nodes_.size() / num_nodes_per_task + 1;
+    vector<ostringstream> bufs(num_tasks);
+    for (int i = 0; i < num_tasks; i++) {
+      tp->Submit([this, i, num_nodes_per_task, &bufs]() {
+          int l = min(num_nodes_per_task * (i + 1),
+                      static_cast<int>(nodes_.size()));
+          for (int j = num_nodes_per_task * i; j < l; j++) {
+            EmitNode(nodes_[j], &bufs[i]);
+          }
+        });
+    }
+    tp->Wait();
+
+    for (const ostringstream& buf : bufs) {
+      fprintf(fp_, "%s", buf.str().c_str());
     }
 
     unordered_set<Symbol> used_env_vars(Vars::used_env_vars());
@@ -765,8 +804,11 @@
   string shell_;
   map<string, string> used_envs_;
   string kati_binary_;
-  double start_time_;
-  DepNode* default_target_;
+  const double start_time_;
+  vector<NinjaNode*> nodes_;
+
+  Mutex mu_;
+  const DepNode* default_target_;
 };
 
 string GetNinjaFilename() {
diff --git a/regen.cc b/regen.cc
index a40ebf3..a448612 100644
--- a/regen.cc
+++ b/regen.cc
@@ -364,7 +364,7 @@
         // TODO: Make glob cache thread safe and create a task for each glob.
         for (GlobResult* gr : globs_) {
           if (CheckGlobResult(gr, &err)) {
-            unique_lock<mutex> lock(mu_);
+            UniqueLock<Mutex> lock(mu_);
             if (!needs_regen_) {
               needs_regen_ = true;
               msg_ = err;
@@ -378,7 +378,7 @@
       tp->Submit([this, sr]() {
           string err;
           if (CheckShellResult(sr, &err)) {
-            unique_lock<mutex> lock(mu_);
+            UniqueLock<Mutex> lock(mu_);
             if (!needs_regen_) {
               needs_regen_ = true;
               msg_ = err;
@@ -398,7 +398,7 @@
   double gen_time_;
   vector<GlobResult*> globs_;
   vector<ShellResult*> commands_;
-  mutex mu_;
+  Mutex mu_;
   bool needs_regen_;
   string msg_;
 };
diff --git a/run_integration_test.rb b/run_integration_test.rb
deleted file mode 100755
index 3a5b7d0..0000000
--- a/run_integration_test.rb
+++ /dev/null
@@ -1,182 +0,0 @@
-#!/usr/bin/env ruby
-#
-# Copyright 2015 Google Inc. All rights reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-require 'fileutils'
-
-FileUtils.mkdir_p('repo')
-Dir.chdir('repo')
-
-def check_command(cmd)
-  puts cmd
-  if !system(cmd)
-    puts "#{cmd} failed"
-    exit 1
-  end
-end
-
-class TestCase
-  attr_reader :name
-
-  def initialize(name, checkout, prepare, clean, target)
-    @name = name
-    @checkout = checkout
-    @prepare = prepare
-    @clean = clean
-    @target = target
-  end
-
-  def normalize_log(log, out)
-    # TODO: Fix.
-    if @name == 'android'
-      log = log.gsub(/[ \t]+/, ' ')
-      log = log.split("\n").sort.join("\n").sub(/ Stop\.$/, '')
-      # This is a completely sane warning from kati for Android.
-      log.sub!(%r(build/core/product_config.mk:152: warning: Unmatched parens: .*\n), '')
-      # Not sure why the order can be inconsistent, but this would be OK.
-      # TODO: Inevestigate.
-      log.gsub!(/(\.mk\.PRODUCT_COPY_FILES := )(.*)/){$1 + $2.split.sort * ' '}
-    end
-    File.open(out, 'w') do |of|
-      of.print log
-    end
-    log
-  end
-
-  def run
-    @checkout.call(self)
-
-    Dir.chdir(@name) do
-      @prepare.call(self)
-
-      [['make', 'make'], ['kati', '../../kati']].each do |n, cmd|
-        @clean.call(self)
-        print "Running #{n} for #{@name}..."
-        STDOUT.flush
-        started = Time.now
-        system("#{cmd} #{@target} > #{n}.log 2>&1")
-        elapsed = Time.now - started
-        puts " %.2f secs" % elapsed
-      end
-
-      make_log = File.read('make.log')
-      kati_log = File.read('kati.log')
-      kati_log.gsub!(/^\*kati\*.*\n/, '')
-
-      make_log = normalize_log(make_log, 'make-normalized.log')
-      kati_log = normalize_log(kati_log, 'kati-normalized.log')
-      if make_log == kati_log
-        puts "#{@name}: OK"
-        return true
-      else
-        puts "#{@name}: FAIL"
-        return false
-      end
-    end
-  end
-end
-
-class GitTestCase < TestCase
-  def initialize(name, repo, rev, prepare, clean, target)
-    checkout = Proc.new{|tc|
-      if !File.exist?(@name)
-        check_command("git clone #{repo}")
-      end
-      Dir.chdir(@name) {
-        check_command("git checkout #{rev}")
-      }
-    }
-
-    super(name, checkout, prepare, clean, target)
-  end
-end
-
-class AndroidTestCase < TestCase
-  def initialize
-    name = 'android'
-    checkout = Proc.new{|tc|
-      FileUtils.mkdir_p(@name)
-      md5 = `md5sum android.tgz`
-      need_update = true
-      if File.exist?("#{@name}/STAMP")
-        stamp = File.read("#{@name}/STAMP")
-        if md5 == stamp
-          need_update = false
-        end
-      end
-
-      if need_update
-        check_command("tar -xzf android.tgz")
-        File.open("#{@name}/STAMP.tmp", 'w') do |ofile|
-          ofile.print(md5)
-        end
-        File.rename("#{@name}/STAMP.tmp", "#{@name}/STAMP")
-      end
-    }
-
-    super(name, checkout, DO_NOTHING, DO_NOTHING, 'dump-products')
-  end
-end
-
-DO_NOTHING = Proc.new{|tc|}
-MAKE_CLEAN = Proc.new{|tc|
-  check_command("make clean > /dev/null")
-}
-CONFIGURE = Proc.new{|tc|
-  check_command("./configure > /dev/null")
-}
-
-TESTS = [
-    GitTestCase.new('maloader',
-                    'https://github.com/shinh/maloader.git',
-                    '5d125933bc6c141bed05c309c2dc0e14ada6f5c7',
-                    DO_NOTHING,
-                    MAKE_CLEAN,
-                    ''),
-    GitTestCase.new('glog',
-                    'https://github.com/google/glog',
-                    '1b0b08c8dda1659027677966b03a3ff3c488e549',
-                    CONFIGURE,
-                    MAKE_CLEAN,
-                    ''),
-   AndroidTestCase.new(),
-]
-
-fails = []
-TESTS.each do |tc|
-  if !ARGV.empty?
-    if !ARGV.include?(tc.name)
-      next
-    end
-  end
-
-  if !tc.run
-    fails << tc.name
-  end
-end
-
-puts
-
-if fails.empty?
-  puts "PASS!"
-else
-  puts "=== Failures ==="
-  fails.each do |n|
-    puts n
-  end
-  puts
-
-  puts "FAIL!"
-end
diff --git a/runtest.rb b/runtest.rb
index 7123113..bf42bfe 100755
--- a/runtest.rb
+++ b/runtest.rb
@@ -341,7 +341,6 @@
     output = normalize_kati_log(output)
     if is_ninja_test
       output = normalize_ninja_log(output, sh)
-      output.gsub!(/No need to regenerate ninja file\n/, '')
     end
     File.open('out.make', 'w'){|ofile|ofile.print(expected)}
     File.open('out.kati', 'w'){|ofile|ofile.print(output)}
diff --git a/stats.cc b/stats.cc
index 9d4a160..60b3285 100644
--- a/stats.cc
+++ b/stats.cc
@@ -27,43 +27,37 @@
 
 namespace {
 
-mutex g_mu;
+Mutex g_mu;
 vector<Stats*>* g_stats;
-#ifdef __linux__
-thread_local double g_start_time;
-#define REF(x) x
-#else
 DEFINE_THREAD_LOCAL(double, g_start_time);
-#define REF(x) x.Ref()
-#endif
 
 }  // namespace
 
 Stats::Stats(const char* name)
     : name_(name), elapsed_(0), cnt_(0) {
-  unique_lock<mutex> lock(g_mu);
+  UniqueLock<Mutex> lock(g_mu);
   if (g_stats == NULL)
     g_stats = new vector<Stats*>;
   g_stats->push_back(this);
 }
 
 string Stats::String() const {
-  unique_lock<mutex> lock(mu_);
+  UniqueLock<Mutex> lock(mu_);
   return StringPrintf("%s: %f / %d", name_, elapsed_, cnt_);
 }
 
 void Stats::Start() {
-  CHECK(!REF(g_start_time));
-  REF(g_start_time) = GetTime();
-  unique_lock<mutex> lock(mu_);
+  CHECK(!TLS_REF(g_start_time));
+  TLS_REF(g_start_time) = GetTime();
+  UniqueLock<Mutex> lock(mu_);
   cnt_++;
 }
 
 double Stats::End() {
-  CHECK(REF(g_start_time));
-  double e = GetTime() - REF(g_start_time);
-  REF(g_start_time) = 0;
-  unique_lock<mutex> lock(mu_);
+  CHECK(TLS_REF(g_start_time));
+  double e = GetTime() - TLS_REF(g_start_time);
+  TLS_REF(g_start_time) = 0;
+  UniqueLock<Mutex> lock(mu_);
   elapsed_ += e;
   return e;
 }
diff --git a/stats.h b/stats.h
index 6244b54..190fa57 100644
--- a/stats.h
+++ b/stats.h
@@ -36,7 +36,7 @@
   const char* name_;
   double elapsed_;
   int cnt_;
-  mutable mutex mu_;
+  mutable Mutex mu_;
 };
 
 class ScopedStatsRecorder {
diff --git a/stmt.cc b/stmt.cc
index 1abab32..155e0cd 100644
--- a/stmt.cc
+++ b/stmt.cc
@@ -55,6 +55,19 @@
                       opstr, dirstr, LOCF(loc()));
 }
 
+Symbol AssignStmt::GetLhsSymbol(Evaluator* ev) const {
+  if (!lhs->IsLiteral()) {
+    string buf;
+    lhs->Eval(ev, &buf);
+    return Intern(buf);
+  }
+
+  if (!lhs_sym_cache_.IsValid()) {
+    lhs_sym_cache_ = Intern(lhs->GetLiteralValueUnsafe());
+  }
+  return lhs_sym_cache_;
+}
+
 string CommandStmt::DebugString() const {
   return StringPrintf("CommandStmt(%s, loc=%s:%d)",
                       expr->DebugString().c_str(), LOCF(loc()));
diff --git a/stmt.h b/stmt.h
index 4d4c5eb..3b6feeb 100644
--- a/stmt.h
+++ b/stmt.h
@@ -20,6 +20,7 @@
 
 #include "loc.h"
 #include "string_piece.h"
+#include "symtab.h"
 
 using namespace std;
 
@@ -85,11 +86,19 @@
   AssignOp op;
   AssignDirective directive;
 
+  AssignStmt()
+      : lhs_sym_cache_(Symbol::IsUninitialized{}) {
+  }
   virtual ~AssignStmt();
 
   virtual void Eval(Evaluator* ev) const;
 
   virtual string DebugString() const;
+
+  Symbol GetLhsSymbol(Evaluator* ev) const;
+
+ private:
+  mutable Symbol lhs_sym_cache_;
 };
 
 struct CommandStmt : public Stmt {
diff --git a/string_piece.cc b/string_piece.cc
index 78de4ed..d287616 100644
--- a/string_piece.cc
+++ b/string_piece.cc
@@ -21,6 +21,7 @@
 
 #include <ctype.h>
 #include <limits.h>
+#include <stdint.h>
 
 #include <algorithm>
 #include <ostream>
@@ -32,8 +33,15 @@
 bool operator==(const StringPiece& x, const StringPiece& y) {
   if (x.size() != y.size())
     return false;
-
-  return StringPiece::wordmemcmp(x.data(), y.data(), x.size()) == 0;
+  size_t len = x.size();
+  if (len >= sizeof(uint64_t)) {
+    len -= sizeof(uint64_t);
+    uint64_t xt = *reinterpret_cast<const uint64_t*>(x.data() + len);
+    uint64_t yt = *reinterpret_cast<const uint64_t*>(y.data() + len);
+    if (xt != yt)
+      return false;
+  }
+  return StringPiece::wordmemcmp(x.data(), y.data(), len) == 0;
 }
 
 void StringPiece::CopyToString(std::string* target) const {
diff --git a/string_piece_test.cc b/string_piece_test.cc
index 7b253a0..0434cbe 100644
--- a/string_piece_test.cc
+++ b/string_piece_test.cc
@@ -30,4 +30,8 @@
   assert(sps.size() == 2);
   assert(sps.count(StringPiece("foo")) == 1);
   assert(sps.count(StringPiece("bar")) == 1);
+
+  assert(StringPiece("hogefugahige") == StringPiece("hogefugahige"));
+  assert(StringPiece("hogefugahoge") != StringPiece("hogefugahige"));
+  assert(StringPiece("hogefugahige") != StringPiece("higefugahige"));
 }
diff --git a/strutil.cc b/strutil.cc
index 0e5e766..80a4b5b 100644
--- a/strutil.cc
+++ b/strutil.cc
@@ -55,11 +55,11 @@
 
 WordScanner::Iterator& WordScanner::Iterator::operator++() {
   int len = static_cast<int>(in->size());
-  for (s = i; s < len; s++) {
+  for (s = i + 1; s < len; s++) {
     if (!isSpace((*in)[s]))
       break;
   }
-  if (s == len) {
+  if (s >= len) {
     in = NULL;
     s = 0;
     i = 0;
@@ -92,7 +92,7 @@
   Iterator iter;
   iter.in = &in_;
   iter.s = 0;
-  iter.i = 0;
+  iter.i = -1;
   ++iter;
   return iter;
 }
@@ -423,6 +423,30 @@
 }
 
 size_t FindEndOfLine(StringPiece s, size_t e, size_t* lf_cnt) {
+#ifdef __SSE4_2__
+  static const char ranges[] = "\n\n\\\\";
+  while (e < s.size()) {
+    e += SkipUntilSSE42(s.data() + e, s.size() - e, ranges, 4);
+    char c = s[e];
+    if (c == '\\') {
+      if (s[e+1] == '\n') {
+        e += 2;
+        ++*lf_cnt;
+      } else if (s[e+1] == '\r' && s[e+2] == '\n') {
+        e += 3;
+        ++*lf_cnt;
+      } else if (s[e+1] == '\\') {
+        e += 2;
+      } else {
+        e++;
+      }
+    } else if (c == '\n') {
+      ++*lf_cnt;
+      return e;
+    }
+  }
+  return e;
+#else
   bool prev_backslash = false;
   for (; e < s.size(); e++) {
     char c = s[e];
@@ -439,6 +463,7 @@
     }
   }
   return e;
+#endif
 }
 
 StringPiece TrimLeadingCurdir(StringPiece s) {
@@ -496,3 +521,33 @@
   }
   return buf;
 }
+
+void EscapeShell(string* s) {
+  if (s->find_first_of("$`\\\"") == string::npos)
+    return;
+  string r;
+  bool last_dollar = false;
+  for (char c : *s) {
+    switch (c) {
+      case '$':
+        if (last_dollar) {
+          r += c;
+          last_dollar = false;
+        } else {
+          r += '\\';
+          r += c;
+          last_dollar = true;
+        }
+        break;
+      case '`':
+      case '"':
+      case '\\':
+        r += '\\';
+        // fall through.
+      default:
+        r += c;
+        last_dollar = false;
+    }
+  }
+  s->swap(r);
+}
diff --git a/strutil.h b/strutil.h
index 5938efb..12e46a5 100644
--- a/strutil.h
+++ b/strutil.h
@@ -142,4 +142,6 @@
 
 string EchoEscape(const string str);
 
+void EscapeShell(string* s);
+
 #endif  // STRUTIL_H_
diff --git a/symtab.cc b/symtab.cc
index d8d7e93..af0b620 100644
--- a/symtab.cc
+++ b/symtab.cc
@@ -14,8 +14,13 @@
 
 // +build ignore
 
+//#define ENABLE_TID_CHECK
+
 #include "symtab.h"
 
+#ifdef ENABLE_TID_CHECK
+#include <pthread.h>
+#endif
 #include <string.h>
 
 #include <unordered_map>
@@ -35,6 +40,10 @@
 class Symtab {
  public:
   Symtab() {
+#ifdef ENABLE_TID_CHECK
+    tid_ = pthread_self();
+#endif
+
     CHECK(g_symbols == NULL);
     g_symbols = &symbols_;
 
@@ -72,6 +81,11 @@
   }
 
   Symbol Intern(StringPiece s) {
+#ifdef ENABLE_TID_CHECK
+    if (tid_ != pthread_self())
+      abort();
+#endif
+
     if (s.size() <= 1) {
       return Symbol(s.empty() ? 0 : (unsigned char)s[0]);
     }
@@ -81,6 +95,9 @@
  private:
   unordered_map<StringPiece, Symbol> symtab_;
   vector<string*> symbols_;
+#ifdef ENABLE_TID_CHECK
+  pthread_t tid_;
+#endif
 };
 
 static Symtab* g_symtab;
diff --git a/testcase/ninja_mkdir.sh b/testcase/ninja_mkdir.sh
new file mode 100644
index 0000000..e54298d
--- /dev/null
+++ b/testcase/ninja_mkdir.sh
@@ -0,0 +1,42 @@
+#!/bin/sh
+#
+# Copyright 2016 Google Inc. All rights reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http:#www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+log=/tmp/log
+mk="$@"
+
+cat <<EOF > Makefile
+test: a/b
+
+a/b:
+	@mkdir -p \$(dir \$@)
+	touch \$@
+EOF
+
+${mk} 2> ${log}
+if [ -e ninja.sh ]; then
+  ./ninja.sh
+fi
+if [[ ! -d a ]]; then
+  echo "Created 'a'"
+fi
+if [ -e ninja.sh ]; then
+  if grep -q "mkdir -p" build.ninja; then
+    echo "Should not include 'mkdir -p' in build.ninja"
+    echo "Ninja will automatically create this directory"
+  fi
+fi
diff --git a/testcase/shell_var_with_args.mk b/testcase/shell_var_with_args.mk
new file mode 100644
index 0000000..ca7b54a
--- /dev/null
+++ b/testcase/shell_var_with_args.mk
@@ -0,0 +1,9 @@
+# TODO(go): Fix
+
+export FOO=-x
+
+SHELL := PS4="cmd: " /bin/bash $${FOO}
+$(info $(shell echo foo))
+
+test:
+	@echo baz
diff --git a/testcase/sort.mk b/testcase/sort.mk
index c521289..bf67a14 100644
--- a/testcase/sort.mk
+++ b/testcase/sort.mk
@@ -1,5 +1,12 @@
+sp := $(subst S, ,S)
+
 test:
 	echo $(sort foo bar lose)
 	echo $(sort foo bar aaaa)
 	echo $(sort foo bar lose lose foo bar bar)
+	echo $(sort baz bar)
+	echo $(sort single)
+	echo $(sort $(sp)foo$(sp))
 	echo $(sort )
+	echo $(sort device/sample/products/AndroidProducts.mk device/moto/shamu/AndroidProducts.mk device/asus/fugu/AndroidProducts.mk device/asus/deb/AndroidProducts.mk device/asus/flo/AndroidProducts.mk device/generic/arm64/AndroidProducts.mk device/generic/qemu/AndroidProducts.mk device/generic/mini-emulator-x86_64/AndroidProducts.mk device/generic/x86/AndroidProducts.mk device/generic/mips/AndroidProducts.mk device/generic/mini-emulator-x86/AndroidProducts.mk device/generic/mini-emulator-mips/AndroidProducts.mk device/generic/mini-emulator-arm64/AndroidProducts.mk device/generic/mini-emulator-armv7-a-neon/AndroidProducts.mk device/generic/x86_64/AndroidProducts.mk device/generic/armv7-a-neon/AndroidProducts.mk device/htc/flounder/AndroidProducts.mk device/lge/bullhead/AndroidProducts.mk device/lge/hammerhead/AndroidProducts.mk device/huawei/angler/AndroidProducts.mk)
+	echo $(sort cpplint-art-phony libart libartd libgabi++ libopenjdkjvm libopenjdkjvmd libart)
diff --git a/thread_local.h b/thread_local.h
index 3bbf663..28b146a 100644
--- a/thread_local.h
+++ b/thread_local.h
@@ -39,6 +39,13 @@
 
 #include "log.h"
 
+#ifdef __linux__
+
+#define DEFINE_THREAD_LOCAL(Type, name) thread_local Type name
+#define TLS_REF(x) x
+
+#else
+
 // Thread local storage implementation which uses pthread.
 // Note that DEFINE_THREAD_LOCAL creates a global variable just like
 // thread local storage based on __thread keyword. So we should not use
@@ -88,4 +95,8 @@
   }                                                     \
   ThreadLocal<Type, &name##_key, &name##_once> name;
 
+#define TLS_REF(x) x.Ref()
+
+#endif
+
 #endif  // THREAD_LOCAL_H_
diff --git a/thread_pool.cc b/thread_pool.cc
index ba7ee00..c504ef5 100644
--- a/thread_pool.cc
+++ b/thread_pool.cc
@@ -17,6 +17,7 @@
 #include <stack>
 #include <vector>
 
+#include "affinity.h"
 #include "condvar.h"
 #include "mutex.h"
 #include "thread.h"
@@ -25,6 +26,7 @@
  public:
   explicit ThreadPoolImpl(int num_threads)
       : is_waiting_(false) {
+    SetAffinityForMultiThread();
     threads_.reserve(num_threads);
     for (int i = 0; i < num_threads; i++) {
       threads_.push_back(thread([this]() { Loop(); }));
@@ -35,14 +37,14 @@
   }
 
   virtual void Submit(function<void(void)> task) override {
-    unique_lock<mutex> lock(mu_);
+    UniqueLock<Mutex> lock(mu_);
     tasks_.push(task);
     cond_.notify_one();
   }
 
   virtual void Wait() override {
     {
-      unique_lock<mutex> lock(mu_);
+      UniqueLock<Mutex> lock(mu_);
       is_waiting_ = true;
       cond_.notify_all();
     }
@@ -50,6 +52,8 @@
     for (thread& th : threads_) {
       th.join();
     }
+
+    SetAffinityForSingleThread();
   }
 
  private:
@@ -57,7 +61,7 @@
     while (true) {
       function<void(void)> task;
       {
-        unique_lock<mutex> lock(mu_);
+        UniqueLock<Mutex> lock(mu_);
         if (tasks_.empty()) {
           if (is_waiting_)
             return;
@@ -75,7 +79,7 @@
   }
 
   vector<thread> threads_;
-  mutex mu_;
+  Mutex mu_;
   condition_variable cond_;
   stack<function<void(void)>> tasks_;
   bool is_waiting_;
diff --git a/var.cc b/var.cc
index d21dc4f..a5cde3e 100644
--- a/var.cc
+++ b/var.cc
@@ -47,6 +47,10 @@
   CHECK(false);
 }
 
+SimpleVar::SimpleVar(VarOrigin origin)
+    : origin_(origin) {
+}
+
 SimpleVar::SimpleVar(const string& v, VarOrigin origin)
     : v_(v), origin_(origin) {
 }
@@ -121,7 +125,7 @@
 }
 
 void Vars::Assign(Symbol name, Var* v) {
-  auto p = insert(make_pair(name, v));
+  auto p = emplace(name, v);
   if (!p.second) {
     Var* orig = p.first->second;
     if (orig->Origin() == VarOrigin::OVERRIDE ||
@@ -141,7 +145,7 @@
 
 ScopedVar::ScopedVar(Vars* vars, Symbol name, Var* var)
     : vars_(vars), orig_(NULL) {
-  auto p = vars->insert(make_pair(name, var));
+  auto p = vars->emplace(name, var);
   iter_ = p.first;
   if (!p.second) {
     orig_ = iter_->second;
diff --git a/var.h b/var.h
index 3bae262..5ba5fcb 100644
--- a/var.h
+++ b/var.h
@@ -62,23 +62,26 @@
 
 class SimpleVar : public Var {
  public:
+  explicit SimpleVar(VarOrigin origin);
   SimpleVar(const string& v, VarOrigin origin);
 
-  virtual const char* Flavor() const {
+  virtual const char* Flavor() const override {
     return "simple";
   }
-  virtual VarOrigin Origin() const {
+  virtual VarOrigin Origin() const override {
     return origin_;
   }
 
   virtual void Eval(Evaluator* ev, string* s) const override;
 
-  virtual void AppendVar(Evaluator* ev, Value* v);
+  virtual void AppendVar(Evaluator* ev, Value* v) override;
 
   virtual StringPiece String() const override;
 
   virtual string DebugString() const override;
 
+  string* mutable_value() { return &v_; }
+
  private:
   string v_;
   VarOrigin origin_;
@@ -88,16 +91,16 @@
  public:
   RecursiveVar(Value* v, VarOrigin origin, StringPiece orig);
 
-  virtual const char* Flavor() const {
+  virtual const char* Flavor() const override {
     return "recursive";
   }
-  virtual VarOrigin Origin() const {
+  virtual VarOrigin Origin() const override {
     return origin_;
   }
 
   virtual void Eval(Evaluator* ev, string* s) const override;
 
-  virtual void AppendVar(Evaluator* ev, Value* v);
+  virtual void AppendVar(Evaluator* ev, Value* v) override;
 
   virtual StringPiece String() const override;
 
@@ -113,13 +116,13 @@
  public:
   UndefinedVar();
 
-  virtual const char* Flavor() const {
+  virtual const char* Flavor() const override {
     return "undefined";
   }
-  virtual VarOrigin Origin() const {
+  virtual VarOrigin Origin() const override {
     return VarOrigin::UNDEFINED;
   }
-  virtual bool IsDefined() const { return false; }
+  virtual bool IsDefined() const override { return false; }
 
   virtual void Eval(Evaluator* ev, string* s) const override;
 
@@ -138,19 +141,19 @@
     delete v_;
   }
 
-  virtual const char* Flavor() const {
+  virtual const char* Flavor() const override {
     return v_->Flavor();
   }
-  virtual VarOrigin Origin() const {
+  virtual VarOrigin Origin() const override {
     return v_->Origin();
   }
-  virtual bool IsDefined() const {
+  virtual bool IsDefined() const override {
     return v_->IsDefined();
   }
-  virtual void Eval(Evaluator* ev, string* s) const {
+  virtual void Eval(Evaluator* ev, string* s) const override {
     v_->Eval(ev, s);
   }
-  virtual void AppendVar(Evaluator* ev, Value* v) {
+  virtual void AppendVar(Evaluator* ev, Value* v) override {
     v_->AppendVar(ev, v);
   }
   virtual StringPiece String() const override {