Update to stressapptest 1.0.7 from upstream
https://github.com/stressapptest/stressapptest

Change-Id: I6307bcfad2e67392b4e0308680c708546e9a15a3
Signed-off-by: Nick Sanders <nsanders@google.com>
diff --git a/Android.mk b/Android.mk
index e026a36..43127e9 100644
--- a/Android.mk
+++ b/Android.mk
@@ -16,11 +16,17 @@
 	src/queue.cc \
 	src/sat.cc \
 	src/sat_factory.cc \
-	src/worker.cc \
+	src/worker.cc
 
 LOCAL_MODULE:= stressapptest
 LOCAL_MODULE_TAGS := optional
+
 LOCAL_CFLAGS := -DHAVE_CONFIG_H -DANDROID -DNDEBUG -UDEBUG -DCHECKOPTS
+
+LOCAL_C_INCLUDES := \
+	bionic  \
+	libc++
+
 LOCAL_CPP_EXTENSION := .cc
 LOCAL_CXX_STL := libc++
 
diff --git a/Makefile.am b/Makefile.am
index c476e5f..5b1998f 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,2 +1,3 @@
 SUBDIRS = src
-dist_doc_DATA = COPYING stressapptest.1
\ No newline at end of file
+dist_man_MANS = stressapptest.1
+
diff --git a/Makefile.in b/Makefile.in
index 718866a..e0386c7 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -14,7 +14,6 @@
 # PARTICULAR PURPOSE.
 
 @SET_MAKE@
-
 VPATH = @srcdir@
 pkgdatadir = $(datadir)/@PACKAGE@
 pkgincludedir = $(includedir)/@PACKAGE@
@@ -34,9 +33,8 @@
 POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
-target_triplet = @target@
 subdir = .
-DIST_COMMON = $(am__configure_deps) $(dist_doc_DATA) \
+DIST_COMMON = $(am__configure_deps) $(dist_man_MANS) \
 	$(srcdir)/Makefile.am $(srcdir)/Makefile.in \
 	$(top_srcdir)/configure COPYING config.guess config.sub \
 	depcomp install-sh missing
@@ -80,8 +78,10 @@
 am__base_list = \
   sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \
   sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g'
-am__installdirs = "$(DESTDIR)$(docdir)"
-DATA = $(dist_doc_DATA)
+man1dir = $(mandir)/man1
+am__installdirs = "$(DESTDIR)$(man1dir)"
+NROFF = nroff
+MANS = $(dist_man_MANS)
 RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
   distclean-recursive maintainer-clean-recursive
 AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \
@@ -220,16 +220,12 @@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
 sysconfdir = @sysconfdir@
-target = @target@
 target_alias = @target_alias@
-target_cpu = @target_cpu@
-target_os = @target_os@
-target_vendor = @target_vendor@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
 SUBDIRS = src
-dist_doc_DATA = COPYING stressapptest.1
+dist_man_MANS = stressapptest.1
 all: all-recursive
 
 .SUFFIXES:
@@ -267,26 +263,44 @@
 $(ACLOCAL_M4):  $(am__aclocal_m4_deps)
 	$(am__cd) $(srcdir) && $(ACLOCAL) $(ACLOCAL_AMFLAGS)
 $(am__aclocal_m4_deps):
-install-dist_docDATA: $(dist_doc_DATA)
+install-man1: $(dist_man_MANS)
 	@$(NORMAL_INSTALL)
-	test -z "$(docdir)" || $(MKDIR_P) "$(DESTDIR)$(docdir)"
-	@list='$(dist_doc_DATA)'; test -n "$(docdir)" || list=; \
-	for p in $$list; do \
-	  if test -f "$$p"; then d=; else d="$(srcdir)/"; fi; \
-	  echo "$$d$$p"; \
-	done | $(am__base_list) | \
+	test -z "$(man1dir)" || $(MKDIR_P) "$(DESTDIR)$(man1dir)"
+	@list=''; test -n "$(man1dir)" || exit 0; \
+	{ for i in $$list; do echo "$$i"; done; \
+	l2='$(dist_man_MANS)'; for i in $$l2; do echo "$$i"; done | \
+	  sed -n '/\.1[a-z]*$$/p'; \
+	} | while read p; do \
+	  if test -f $$p; then d=; else d="$(srcdir)/"; fi; \
+	  echo "$$d$$p"; echo "$$p"; \
+	done | \
+	sed -e 'n;s,.*/,,;p;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \
+	      -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,' | \
+	sed 'N;N;s,\n, ,g' | { \
+	list=; while read file base inst; do \
+	  if test "$$base" = "$$inst"; then list="$$list $$file"; else \
+	    echo " $(INSTALL_DATA) '$$file' '$(DESTDIR)$(man1dir)/$$inst'"; \
+	    $(INSTALL_DATA) "$$file" "$(DESTDIR)$(man1dir)/$$inst" || exit $$?; \
+	  fi; \
+	done; \
+	for i in $$list; do echo "$$i"; done | $(am__base_list) | \
 	while read files; do \
-	  echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(docdir)'"; \
-	  $(INSTALL_DATA) $$files "$(DESTDIR)$(docdir)" || exit $$?; \
-	done
+	  test -z "$$files" || { \
+	    echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(man1dir)'"; \
+	    $(INSTALL_DATA) $$files "$(DESTDIR)$(man1dir)" || exit $$?; }; \
+	done; }
 
-uninstall-dist_docDATA:
+uninstall-man1:
 	@$(NORMAL_UNINSTALL)
-	@list='$(dist_doc_DATA)'; test -n "$(docdir)" || list=; \
-	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
-	test -n "$$files" || exit 0; \
-	echo " ( cd '$(DESTDIR)$(docdir)' && rm -f" $$files ")"; \
-	cd "$(DESTDIR)$(docdir)" && rm -f $$files
+	@list=''; test -n "$(man1dir)" || exit 0; \
+	files=`{ for i in $$list; do echo "$$i"; done; \
+	l2='$(dist_man_MANS)'; for i in $$l2; do echo "$$i"; done | \
+	  sed -n '/\.1[a-z]*$$/p'; \
+	} | sed -e 's,.*/,,;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \
+	      -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,'`; \
+	test -z "$$files" || { \
+	  echo " ( cd '$(DESTDIR)$(man1dir)' && rm -f" $$files ")"; \
+	  cd "$(DESTDIR)$(man1dir)" && rm -f $$files; }
 
 # This directory's subdirectories are mostly independent; you can cd
 # into them and run `make' without going through this Makefile.
@@ -424,6 +438,19 @@
 	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
 
 distdir: $(DISTFILES)
+	@list='$(MANS)'; if test -n "$$list"; then \
+	  list=`for p in $$list; do \
+	    if test -f $$p; then d=; else d="$(srcdir)/"; fi; \
+	    if test -f "$$d$$p"; then echo "$$d$$p"; else :; fi; done`; \
+	  if test -n "$$list" && \
+	    grep 'ab help2man is required to generate this page' $$list >/dev/null; then \
+	    echo "error: found man pages containing the \`missing help2man' replacement text:" >&2; \
+	    grep -l 'ab help2man is required to generate this page' $$list | sed 's/^/         /' >&2; \
+	    echo "       to fix them, install help2man, remove and regenerate the man pages;" >&2; \
+	    echo "       typically \`make maintainer-clean' will remove them" >&2; \
+	    exit 1; \
+	  else :; fi; \
+	else :; fi
 	$(am__remove_distdir)
 	test -d "$(distdir)" || mkdir "$(distdir)"
 	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
@@ -600,10 +627,10 @@
 	       exit 1; } >&2
 check-am: all-am
 check: check-recursive
-all-am: Makefile $(DATA)
+all-am: Makefile $(MANS)
 installdirs: installdirs-recursive
 installdirs-am:
-	for dir in "$(DESTDIR)$(docdir)"; do \
+	for dir in "$(DESTDIR)$(man1dir)"; do \
 	  test -z "$$dir" || $(MKDIR_P) "$$dir"; \
 	done
 install: install-recursive
@@ -652,7 +679,7 @@
 
 info-am:
 
-install-data-am: install-dist_docDATA
+install-data-am: install-man
 
 install-dvi: install-dvi-recursive
 
@@ -668,7 +695,7 @@
 
 install-info-am:
 
-install-man:
+install-man: install-man1
 
 install-pdf: install-pdf-recursive
 
@@ -698,7 +725,9 @@
 
 ps-am:
 
-uninstall-am: uninstall-dist_docDATA
+uninstall-am: uninstall-man
+
+uninstall-man: uninstall-man1
 
 .MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) ctags-recursive \
 	install-am install-strip tags-recursive
@@ -710,14 +739,14 @@
 	distclean distclean-generic distclean-tags distcleancheck \
 	distdir distuninstallcheck dvi dvi-am html html-am info \
 	info-am install install-am install-data install-data-am \
-	install-dist_docDATA install-dvi install-dvi-am install-exec \
-	install-exec-am install-html install-html-am install-info \
-	install-info-am install-man install-pdf install-pdf-am \
-	install-ps install-ps-am install-strip installcheck \
-	installcheck-am installdirs installdirs-am maintainer-clean \
+	install-dvi install-dvi-am install-exec install-exec-am \
+	install-html install-html-am install-info install-info-am \
+	install-man install-man1 install-pdf install-pdf-am install-ps \
+	install-ps-am install-strip installcheck installcheck-am \
+	installdirs installdirs-am maintainer-clean \
 	maintainer-clean-generic mostlyclean mostlyclean-generic pdf \
 	pdf-am ps ps-am tags tags-recursive uninstall uninstall-am \
-	uninstall-dist_docDATA
+	uninstall-man uninstall-man1
 
 
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
diff --git a/configure b/configure
index 3f27d49..97d2c38 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.65 for stressapptest 1.0.4_autoconf.
+# Generated by GNU Autoconf 2.65 for stressapptest 1.0.7_autoconf.
 #
 # Report bugs to <opensource@google.com>.
 #
@@ -552,8 +552,8 @@
 # Identity of this package.
 PACKAGE_NAME='stressapptest'
 PACKAGE_TARNAME='stressapptest'
-PACKAGE_VERSION='1.0.4_autoconf'
-PACKAGE_STRING='stressapptest 1.0.4_autoconf'
+PACKAGE_VERSION='1.0.7_autoconf'
+PACKAGE_STRING='stressapptest 1.0.7_autoconf'
 PACKAGE_BUGREPORT='opensource@google.com'
 PACKAGE_URL=''
 
@@ -646,10 +646,6 @@
 INSTALL_DATA
 INSTALL_SCRIPT
 INSTALL_PROGRAM
-target_os
-target_vendor
-target_cpu
-target
 host_os
 host_vendor
 host_cpu
@@ -701,6 +697,7 @@
 enable_option_checking
 with_static
 enable_dependency_tracking
+enable_default_optimizations
 '
       ac_precious_vars='build_alias
 host_alias
@@ -1255,7 +1252,7 @@
   # Omit some internal or obsolete options to make the list less imposing.
   # This message is too long to be a string in the A/UX 3.1 sh.
   cat <<_ACEOF
-\`configure' configures stressapptest 1.0.4_autoconf to adapt to many kinds of systems.
+\`configure' configures stressapptest 1.0.7_autoconf to adapt to many kinds of systems.
 
 Usage: $0 [OPTION]... [VAR=VALUE]...
 
@@ -1320,13 +1317,12 @@
 System types:
   --build=BUILD     configure for building on BUILD [guessed]
   --host=HOST       cross-compile to build programs to run on HOST [BUILD]
-  --target=TARGET   configure for building compilers for TARGET [HOST]
 _ACEOF
 fi
 
 if test -n "$ac_init_help"; then
   case $ac_init_help in
-     short | recursive ) echo "Configuration of stressapptest 1.0.4_autoconf:";;
+     short | recursive ) echo "Configuration of stressapptest 1.0.7_autoconf:";;
    esac
   cat <<\_ACEOF
 
@@ -1336,6 +1332,8 @@
   --enable-FEATURE[=ARG]  include FEATURE [ARG=yes]
   --disable-dependency-tracking  speeds up one-time build
   --enable-dependency-tracking   do not reject slow dependency extractors
+  --disable-default-optimizations
+                          Disable default optimization flag overrides
 
 Optional Packages:
   --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
@@ -1420,7 +1418,7 @@
 test -n "$ac_init_help" && exit $ac_status
 if $ac_init_version; then
   cat <<\_ACEOF
-stressapptest configure 1.0.4_autoconf
+stressapptest configure 1.0.7_autoconf
 generated by GNU Autoconf 2.65
 
 Copyright (C) 2009 Free Software Foundation, Inc.
@@ -1976,7 +1974,7 @@
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
-It was created by stressapptest $as_me 1.0.4_autoconf, which was
+It was created by stressapptest $as_me 1.0.7_autoconf, which was
 generated by GNU Autoconf 2.65.  Invocation command line was
 
   $ $0 $@
@@ -2331,13 +2329,13 @@
 fi
 
 
-if test "$with_static" == "yes"
+if test "$with_static" = "yes"
 then
-	{ $as_echo "$as_me:${as_lineno-$LINENO}: Compiling with staticaly linked libraries." >&5
+  { $as_echo "$as_me:${as_lineno-$LINENO}: Compiling with staticaly linked libraries." >&5
 $as_echo "$as_me: Compiling with staticaly linked libraries." >&6;}
-	LIBS="$LIBS -static"
+  LIBS="$LIBS -static"
 else
-	{ $as_echo "$as_me:${as_lineno-$LINENO}: Compiling with dynamically linked libraries." >&5
+  { $as_echo "$as_me:${as_lineno-$LINENO}: Compiling with dynamically linked libraries." >&5
 $as_echo "$as_me: Compiling with dynamically linked libraries." >&6;}
 fi
 
@@ -2435,105 +2433,74 @@
 case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac
 
 
-
 # Checking for target cpu and setting custom configuration
 # for the different platforms
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking target system type" >&5
-$as_echo_n "checking target system type... " >&6; }
-if test "${ac_cv_target+set}" = set; then :
-  $as_echo_n "(cached) " >&6
-else
-  if test "x$target_alias" = x; then
-  ac_cv_target=$ac_cv_host
-else
-  ac_cv_target=`$SHELL "$ac_aux_dir/config.sub" $target_alias` ||
-    as_fn_error "$SHELL $ac_aux_dir/config.sub $target_alias failed" "$LINENO" 5
-fi
+case "$host_cpu" in #(
+  *x86_64*) :
 
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_target" >&5
-$as_echo "$ac_cv_target" >&6; }
-case $ac_cv_target in
-*-*-*) ;;
-*) as_fn_error "invalid value of canonical target" "$LINENO" 5;;
-esac
-target=$ac_cv_target
-ac_save_IFS=$IFS; IFS='-'
-set x $ac_cv_target
-shift
-target_cpu=$1
-target_vendor=$2
-shift; shift
-# Remember, the first character of IFS is used to create $*,
-# except with old shells:
-target_os=$*
-IFS=$ac_save_IFS
-case $target_os in *\ *) target_os=`echo "$target_os" | sed 's/ /-/g'`;; esac
-
-
-# The aliases save the names the user supplied, while $host etc.
-# will get canonicalized.
-test -n "$target_alias" &&
-  test "$program_prefix$program_suffix$program_transform_name" = \
-    NONENONEs,x,x, &&
-  program_prefix=${target_alias}-
-case x"$target_cpu" in
-  "xx86_64")
 
 $as_echo "#define STRESSAPPTEST_CPU_X86_64 /**/" >>confdefs.h
 
-    ;;
-  "xi686")
+     ;; #(
+  *i686*) :
+
 
 $as_echo "#define STRESSAPPTEST_CPU_I686 /**/" >>confdefs.h
 
-    ;;
-  "xpowerpc")
+     ;; #(
+  *powerpc*) :
+
 
 $as_echo "#define STRESSAPPTEST_CPU_PPC /**/" >>confdefs.h
 
-    ;;
-  "xarmv7a")
+     ;; #(
+  *armv7a*) :
+
 
 $as_echo "#define STRESSAPPTEST_CPU_ARMV7A /**/" >>confdefs.h
 
-    ;;
-  *)
-    as_fn_error "$target_cpu is not supported! Try x86_64, i686, powerpc, or armv7a" "$LINENO" 5
-    ;;
+     ;; #(
+  *) :
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: Unsupported CPU: $host_cpu! Try x86_64, i686, powerpc, or armv7a" >&5
+$as_echo "$as_me: WARNING: Unsupported CPU: $host_cpu! Try x86_64, i686, powerpc, or armv7a" >&2;}
+ ;;
 esac
 
-_os=`uname`
 ## The following allows like systems to share settings. This is not meant to
 ## imply that these OS are the same thing. From OpenOffice dmake configure.in
-case "$_os" in
-  "Linux")
+case "$host_os" in #(
+  *linux*) :
+
     OS_VERSION=linux
 
 $as_echo "#define STRESSAPPTEST_OS_LINUX /**/" >>confdefs.h
 
-    ;;
-  "Darwin")
+     ;; #(
+  *darwin*) :
+
     OS_VERSION=macosx
 
 $as_echo "#define STRESSAPPTEST_OS_DARWIN /**/" >>confdefs.h
 
-    ;;
-  "FreeBSD")
+     ;; #(
+  *freebsd*) :
+
     OS_VERSION=bsd
 
 $as_echo "#define STRESSAPPTEST_OS_BSD /**/" >>confdefs.h
 
-    ;;
-  "NetBSD")
+     ;; #(
+  *netbsd*) :
+
     OS_VERSION=bsd
 
 $as_echo "#define STRESSAPPTEST_OS_BSD /**/" >>confdefs.h
 
-    ;;
-  *)
-    as_fn_error "$_os operating system is not suitable to build dmake!" "$LINENO" 5
-    ;;
+     ;; #(
+  *) :
+    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unsupported system: $host_os" >&5
+$as_echo "$as_me: WARNING: unsupported system: $host_os" >&2;}
+ ;;
 esac
 
 am__api_version='1.11'
@@ -2974,7 +2941,7 @@
 
 # Define the identity of the package.
  PACKAGE='stressapptest'
- VERSION='1.0.4_autoconf'
+ VERSION='1.0.7_autoconf'
 
 
 cat >>confdefs.h <<_ACEOF
@@ -4412,10 +4379,19 @@
 _ACEOF
 
 
-#Default cxxflags
-CXXFLAGS="$CXXFLAGS -DCHECKOPTS"
-CXXFLAGS="$CXXFLAGS -Wreturn-type -Wunused -Wuninitialized -Wall -Wno-psabi"
-CXXFLAGS="$CXXFLAGS -O3 -funroll-all-loops  -funroll-loops -DNDEBUG"
+# Check whether --enable-default-optimizations was given.
+if test "${enable_default_optimizations+set}" = set; then :
+  enableval=$enable_default_optimizations;
+fi
+
+if test x"$enable_default_optimizations" != xno; then :
+
+    #Default cxxflags
+    CXXFLAGS="$CXXFLAGS -DCHECKOPTS"
+    CXXFLAGS="$CXXFLAGS -Wreturn-type -Wunused -Wuninitialized -Wall"
+    CXXFLAGS="$CXXFLAGS -O3 -funroll-all-loops  -funroll-loops -DNDEBUG"
+
+fi
 
 # Checks for header files.
 
@@ -5064,6 +5040,13 @@
 
 fi
 
+ac_fn_c_check_type "$LINENO" "pthread_barrier_t" "ac_cv_type_pthread_barrier_t" "$ac_includes_default"
+if test "x$ac_cv_type_pthread_barrier_t" = x""yes; then :
+
+$as_echo "#define HAVE_PTHREAD_BARRIERS 1" >>confdefs.h
+
+fi
+
 for ac_header in libaio.h
 do :
   ac_fn_c_check_header_mongrel "$LINENO" "libaio.h" "ac_cv_header_libaio_h" "$ac_includes_default"
@@ -5201,6 +5184,7 @@
 fi
 
 
+
 # Checks for typedefs, structures, and compiler characteristics.
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for stdbool.h that conforms to C99" >&5
 $as_echo_n "checking for stdbool.h that conforms to C99... " >&6; }
@@ -6455,7 +6439,7 @@
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
-This file was extended by stressapptest $as_me 1.0.4_autoconf, which was
+This file was extended by stressapptest $as_me 1.0.7_autoconf, which was
 generated by GNU Autoconf 2.65.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
@@ -6521,7 +6505,7 @@
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
-stressapptest config.status 1.0.4_autoconf
+stressapptest config.status 1.0.7_autoconf
 configured by $0, generated by GNU Autoconf 2.65,
   with options \\"\$ac_cs_config\\"
 
diff --git a/configure.ac b/configure.ac
index e1e44fa..74e8687 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,71 +1,64 @@
 AC_PREREQ(2.61)
-AC_INIT([stressapptest], [1.0.4_autoconf], [opensource@google.com])
+AC_INIT([stressapptest], [1.0.7_autoconf], [opensource@google.com])
 
 AC_ARG_WITH(static, [  --with-static            enable static linking])
 
-if test "$with_static" == "yes"
+if test "$with_static" = "yes"
 then
-	AC_MSG_NOTICE([Compiling with staticaly linked libraries.])
-	LIBS="$LIBS -static"
+  AC_MSG_NOTICE([Compiling with staticaly linked libraries.])
+  LIBS="$LIBS -static"
 else
-	AC_MSG_NOTICE([Compiling with dynamically linked libraries.])
+  AC_MSG_NOTICE([Compiling with dynamically linked libraries.])
 fi
 
 AC_CANONICAL_HOST
-AC_CANONICAL_BUILD
 # Checking for target cpu and setting custom configuration
 # for the different platforms
-AC_CANONICAL_TARGET
-case x"$target_cpu" in
-  "xx86_64")
+AS_CASE(["$host_cpu"],
+  [*x86_64*], [
     AC_DEFINE([STRESSAPPTEST_CPU_X86_64],[],
               [Defined if the target CPU is x86_64])
-    ;;
-  "xi686")
+    ], 
+  [*i686*], [
     AC_DEFINE([STRESSAPPTEST_CPU_I686],[],
               [Defined if the target CPU is i686])
-    ;;
-  "xpowerpc")
+    ], 
+  [*powerpc*], [
     AC_DEFINE([STRESSAPPTEST_CPU_PPC],[],
               [Defined if the target CPU is PowerPC])
-    ;;
-  "xarmv7a")
+    ], 
+  [*armv7a*], [
     AC_DEFINE([STRESSAPPTEST_CPU_ARMV7A],[],
               [Defined if the target CPU is armv7a])
-    ;;
-  *)
-    AC_MSG_ERROR([$target_cpu is not supported! Try x86_64, i686, powerpc, or armv7a])
-    ;;
-esac
+    ], 
+  [AC_MSG_WARN([Unsupported CPU: $host_cpu! Try x86_64, i686, powerpc, or armv7a])]
+)
 
-_os=`uname`
 ## The following allows like systems to share settings. This is not meant to
 ## imply that these OS are the same thing. From OpenOffice dmake configure.in
-case "$_os" in
-  "Linux")
+AS_CASE(["$host_os"],
+  [*linux*], [
     OS_VERSION=linux
     AC_DEFINE([STRESSAPPTEST_OS_LINUX],[],
               [Defined if the target OS is Linux])
-    ;;
-  "Darwin")
+    ],
+  [*darwin*], [
     OS_VERSION=macosx
     AC_DEFINE([STRESSAPPTEST_OS_DARWIN],[],
               [Defined if the target OS is OSX])
-    ;;
-  "FreeBSD")
+    ],
+  [*freebsd*], [
     OS_VERSION=bsd
     AC_DEFINE([STRESSAPPTEST_OS_BSD],[],
               [Defined if the target OS is BSD based])
-    ;;
-  "NetBSD")
+    ],
+  [*netbsd*], [
     OS_VERSION=bsd
     AC_DEFINE([STRESSAPPTEST_OS_BSD],[],
               [Defined if the target OS is BSD based])
-    ;;
-  *)
-    AC_MSG_ERROR([$_os operating system is not suitable to build dmake!])
-    ;;
-esac
+    ],
+  [AC_MSG_WARN([unsupported system: $host_os])]
+)
 
 AM_INIT_AUTOMAKE([-Wall -Werror foreign])
 AC_CONFIG_SRCDIR([src/])
@@ -95,10 +88,14 @@
                    "$username @ $hostname on $timestamp",
                    [Timestamp when ./configure was executed])
 
-#Default cxxflags
-CXXFLAGS="$CXXFLAGS -DCHECKOPTS"
-CXXFLAGS="$CXXFLAGS -Wreturn-type -Wunused -Wuninitialized -Wall -Wno-psabi"
-CXXFLAGS="$CXXFLAGS -O3 -funroll-all-loops  -funroll-loops -DNDEBUG"
+AC_ARG_ENABLE([default-optimizations],
+    [AS_HELP_STRING([--disable-default-optimizations], [Disable default optimization flag overrides])])
+AS_IF([test x"$enable_default_optimizations" != xno], [
+    #Default cxxflags
+    CXXFLAGS="$CXXFLAGS -DCHECKOPTS"
+    CXXFLAGS="$CXXFLAGS -Wreturn-type -Wunused -Wuninitialized -Wall"
+    CXXFLAGS="$CXXFLAGS -O3 -funroll-all-loops  -funroll-loops -DNDEBUG"
+])
 
 # Checks for header files.
 AC_HEADER_DIRENT
@@ -107,11 +104,13 @@
 AC_CHECK_HEADERS([arpa/inet.h fcntl.h netdb.h stdint.h stdlib.h string.h sys/ioctl.h sys/socket.h sys/time.h unistd.h], [], [AC_MSG_FAILURE([Missing some header files.])])
 AC_CHECK_HEADERS([pthread.h])
 AC_SEARCH_LIBS([pthread_create], [pthread])
+AC_CHECK_TYPE([pthread_barrier_t], AC_DEFINE(HAVE_PTHREAD_BARRIERS, [1], [Define to 1 if the system has `pthread_barrier'.]))
 AC_CHECK_HEADERS([libaio.h])
 AC_SEARCH_LIBS([io_setup], [aio])
 AC_CHECK_HEADERS([sys/shm.h])
 AC_SEARCH_LIBS([shm_open], [rt])
 
+
 # Checks for typedefs, structures, and compiler characteristics.
 AC_HEADER_STDBOOL
 AC_C_CONST
diff --git a/src/Makefile.am b/src/Makefile.am
index e044974..16f539d 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,4 +1,5 @@
 bin_PROGRAMS = stressapptest
+noinst_PROGRAMS = findmask
 
 AM_DEFAULT_SOURCE_EXT=.cc
 
@@ -27,5 +28,7 @@
 HFILES += disk_blocks.h
 HFILES += adler32memcpy.h
 HFILES += logger.h
+HFILES += clock.h
 
 stressapptest_SOURCES = $(MAINFILES) $(CFILES) $(HFILES)
+findmask_SOURCES = findmask.c findmask.inc
diff --git a/src/Makefile.in b/src/Makefile.in
index f62d1ac..ff320f3 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -34,8 +34,8 @@
 POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
-target_triplet = @target@
 bin_PROGRAMS = stressapptest$(EXEEXT)
+noinst_PROGRAMS = findmask$(EXEEXT)
 subdir = src
 DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
 	$(srcdir)/stressapptest_config.h.in
@@ -48,7 +48,10 @@
 CONFIG_CLEAN_FILES =
 CONFIG_CLEAN_VPATH_FILES =
 am__installdirs = "$(DESTDIR)$(bindir)"
-PROGRAMS = $(bin_PROGRAMS)
+PROGRAMS = $(bin_PROGRAMS) $(noinst_PROGRAMS)
+am_findmask_OBJECTS = findmask.$(OBJEXT)
+findmask_OBJECTS = $(am_findmask_OBJECTS)
+findmask_LDADD = $(LDADD)
 am__objects_1 = main.$(OBJEXT)
 am__objects_2 = os.$(OBJEXT) os_factory.$(OBJEXT) pattern.$(OBJEXT) \
 	queue.$(OBJEXT) sat.$(OBJEXT) sat_factory.$(OBJEXT) \
@@ -63,17 +66,17 @@
 depcomp = $(SHELL) $(top_srcdir)/depcomp
 am__depfiles_maybe = depfiles
 am__mv = mv -f
+COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
+	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+CCLD = $(CC)
+LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
 CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
 	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS)
 CXXLD = $(CXX)
 CXXLINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
 	-o $@
-COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
-	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
-CCLD = $(CC)
-LINK = $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) $(LDFLAGS) -o $@
-SOURCES = $(stressapptest_SOURCES)
-DIST_SOURCES = $(stressapptest_SOURCES)
+SOURCES = $(findmask_SOURCES) $(stressapptest_SOURCES)
+DIST_SOURCES = $(findmask_SOURCES) $(stressapptest_SOURCES)
 ETAGS = etags
 CTAGS = ctags
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
@@ -171,11 +174,7 @@
 sharedstatedir = @sharedstatedir@
 srcdir = @srcdir@
 sysconfdir = @sysconfdir@
-target = @target@
 target_alias = @target_alias@
-target_cpu = @target_cpu@
-target_os = @target_os@
-target_vendor = @target_vendor@
 top_build_prefix = @top_build_prefix@
 top_builddir = @top_builddir@
 top_srcdir = @top_srcdir@
@@ -186,13 +185,14 @@
 	adler32memcpy.cc logger.cc
 HFILES = os.h pattern.h queue.h sat.h worker.h sattypes.h \
 	finelock_queue.h error_diag.h disk_blocks.h adler32memcpy.h \
-	logger.h
+	logger.h clock.h
 stressapptest_SOURCES = $(MAINFILES) $(CFILES) $(HFILES)
+findmask_SOURCES = findmask.c findmask.inc
 all: stressapptest_config.h
 	$(MAKE) $(AM_MAKEFLAGS) all-am
 
 .SUFFIXES:
-.SUFFIXES: .cc .o .obj
+.SUFFIXES: .c .cc .o .obj
 $(srcdir)/Makefile.in:  $(srcdir)/Makefile.am  $(am__configure_deps)
 	@for dep in $?; do \
 	  case '$(am__configure_deps)' in \
@@ -277,6 +277,12 @@
 
 clean-binPROGRAMS:
 	-test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS)
+
+clean-noinstPROGRAMS:
+	-test -z "$(noinst_PROGRAMS)" || rm -f $(noinst_PROGRAMS)
+findmask$(EXEEXT): $(findmask_OBJECTS) $(findmask_DEPENDENCIES) 
+	@rm -f findmask$(EXEEXT)
+	$(LINK) $(findmask_OBJECTS) $(findmask_LDADD) $(LIBS)
 stressapptest$(EXEEXT): $(stressapptest_OBJECTS) $(stressapptest_DEPENDENCIES) 
 	@rm -f stressapptest$(EXEEXT)
 	$(CXXLINK) $(stressapptest_OBJECTS) $(stressapptest_LDADD) $(LIBS)
@@ -290,6 +296,7 @@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/adler32memcpy.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/disk_blocks.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/error_diag.Po@am__quote@
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/findmask.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/finelock_queue.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/logger.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/main.Po@am__quote@
@@ -301,6 +308,20 @@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sat_factory.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/worker.Po@am__quote@
 
+.c.o:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+
+.c.obj:
+@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+
 .cc.o:
 @am__fastdepCXX_TRUE@	$(CXXCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
 @am__fastdepCXX_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@@ -431,7 +452,8 @@
 	@echo "it deletes files that may require special tools to rebuild."
 clean: clean-am
 
-clean-am: clean-binPROGRAMS clean-generic mostlyclean-am
+clean-am: clean-binPROGRAMS clean-generic clean-noinstPROGRAMS \
+	mostlyclean-am
 
 distclean: distclean-am
 	-rm -rf ./$(DEPDIR)
@@ -501,17 +523,18 @@
 .MAKE: all install-am install-strip
 
 .PHONY: CTAGS GTAGS all all-am check check-am clean clean-binPROGRAMS \
-	clean-generic ctags distclean distclean-compile \
-	distclean-generic distclean-hdr distclean-tags distdir dvi \
-	dvi-am html html-am info info-am install install-am \
-	install-binPROGRAMS install-data install-data-am install-dvi \
-	install-dvi-am install-exec install-exec-am install-html \
-	install-html-am install-info install-info-am install-man \
-	install-pdf install-pdf-am install-ps install-ps-am \
-	install-strip installcheck installcheck-am installdirs \
-	maintainer-clean maintainer-clean-generic mostlyclean \
-	mostlyclean-compile mostlyclean-generic pdf pdf-am ps ps-am \
-	tags uninstall uninstall-am uninstall-binPROGRAMS
+	clean-generic clean-noinstPROGRAMS ctags distclean \
+	distclean-compile distclean-generic distclean-hdr \
+	distclean-tags distdir dvi dvi-am html html-am info info-am \
+	install install-am install-binPROGRAMS install-data \
+	install-data-am install-dvi install-dvi-am install-exec \
+	install-exec-am install-html install-html-am install-info \
+	install-info-am install-man install-pdf install-pdf-am \
+	install-ps install-ps-am install-strip installcheck \
+	installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-compile \
+	mostlyclean-generic pdf pdf-am ps ps-am tags uninstall \
+	uninstall-am uninstall-binPROGRAMS
 
 
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
diff --git a/src/adler32memcpy.cc b/src/adler32memcpy.cc
index 69324f7..47c6262 100644
--- a/src/adler32memcpy.cc
+++ b/src/adler32memcpy.cc
@@ -70,7 +70,7 @@
 // Returns string representation of the Adler checksum.
 string AdlerChecksum::ToHexString() const {
   char buffer[128];
-  snprintf(buffer, sizeof(buffer), "%llx%llx%llx%llx", a1_, a2_, b1_, b2_);
+  snprintf(buffer, sizeof(buffer), "%016llx %016llx %016llx %016llx", a1_, a2_, b1_, b2_);
   return string(buffer);
 }
 
@@ -399,7 +399,124 @@
   // that there is no problem with memory this just mean that data was copied
   // from src to dst and checksum was calculated successfully).
   return true;
+#elif defined(STRESSAPPTEST_CPU_ARMV7A) && defined(__ARM_NEON__)
+  // Elements 0 to 3 are used for holding checksum terms a1, a2,
+  // b1, b2 respectively. These elements are filled by asm code.
+  // Checksum is seeded with the null checksum.
+  volatile uint64 checksum_arr[] __attribute__ ((aligned(16))) =
+      {1, 1, 0, 0};
+
+  if ((size_in_bytes >> 19) > 0) {
+    // Size is too large. Must be less than 2^19 bytes = 512 KB.
+    return false;
+  }
+
+  // Since we are moving 64 bytes at a time number of iterations = total size/64
+  uint32 blocks = size_in_bytes / 64;
+
+  uint64 *dst = dstmem64;
+  uint64 *src = srcmem64;
+
+  #define src_r "r3"
+  #define dst_r "r4"
+  #define blocks_r "r5"
+  #define crc_r "r6"
+
+  asm volatile (
+      "mov "src_r", %[src];	 	\n"
+      "mov "dst_r", %[dst]; 		\n"
+      "mov "crc_r", %[crc]; 		\n"
+      "mov "blocks_r", %[blocks]; 	\n"
+
+      // Loop over block count.
+      "cmp "blocks_r", #0; 	\n"   // Compare counter to zero.
+      "ble END;			\n"
+
+
+      // Preload upcoming cacheline.
+      "pld ["src_r", #0x0];	\n"
+      "pld ["src_r", #0x20];	\n"
+
+      // Init checksum
+      "vldm "crc_r", {q0};		\n"
+      "vmov.i32 q1, #0;			\n"
+
+      // Start of the loop which copies 64 bytes from source to dst each time.
+      "TOP:			\n"
+
+      // Load 64 bytes from srcmem into four 16-byte qX registers (q8-q11).
+      // We are using 2 words out of 4 words in each qX register,
+      // word index 0 and word index 2. We'll swizzle them in a bit.
+      // Copy it.
+      "vldm "src_r"!, {q8, q9, q10, q11};	\n"
+      "vstm "dst_r"!, {q8, q9, q10, q11};	\n"
+
+      // Arrange it.
+      "vmov.i64 q12, #0;	\n"
+      "vmov.i64 q13, #0;	\n"
+      "vmov.i64 q14, #0;	\n"
+      "vmov.i64 q15, #0;	\n"
+      // This exchanges words 1,3 in the filled registers with
+      // words 0,2 in the empty registers.
+      "vtrn.32 q8, q12;		\n"
+      "vtrn.32 q9, q13;		\n"
+      "vtrn.32 q10, q14;	\n"
+      "vtrn.32 q11, q15;	\n"
+
+      // Sum into q0, then into q1.
+      // Repeat this for q8 - q15.
+      // Overflow can occur only if there are more
+      // than 2^16 additions => more than 2^17 words => more than 2^19 bytes so
+      // if size_in_bytes > 2^19 then overflow occurs.
+      "vadd.i64 q0, q0, q8;	\n"
+      "vadd.i64 q1, q1, q0;	\n"
+      "vadd.i64 q0, q0, q12;	\n"
+      "vadd.i64 q1, q1, q0;	\n"
+      "vadd.i64 q0, q0, q9;	\n"
+      "vadd.i64 q1, q1, q0;	\n"
+      "vadd.i64 q0, q0, q13;	\n"
+      "vadd.i64 q1, q1, q0;	\n"
+
+      "vadd.i64 q0, q0, q10;	\n"
+      "vadd.i64 q1, q1, q0;	\n"
+      "vadd.i64 q0, q0, q14;	\n"
+      "vadd.i64 q1, q1, q0;	\n"
+      "vadd.i64 q0, q0, q11;	\n"
+      "vadd.i64 q1, q1, q0;	\n"
+      "vadd.i64 q0, q0, q15;	\n"
+      "vadd.i64 q1, q1, q0;	\n"
+
+      // Decrement counter and loop.
+      "sub "blocks_r", "blocks_r", #1;	\n"
+      "cmp "blocks_r", #0;	\n"   // Compare counter to zero.
+      "bgt TOP;	\n"
+
+
+      "END:\n"
+      // Report checksum values A and B (both right now are two concatenated
+      // 64 bit numbers and have to be converted to 64 bit numbers)
+      // seems like Adler128 (since size of each part is 4 byte rather than
+      // 1 byte).
+      "vstm "crc_r", {q0, q1};	\n"
+
+      // Output registers.
+      :
+      // Input registers.
+      : [src] "r"(src), [dst] "r"(dst), [blocks] "r"(blocks) , [crc] "r"(checksum_arr)
+      : "memory", "cc", "r3", "r4", "r5", "r6", "q0", "q1", "q8","q9","q10", "q11", "q12","q13","q14","q15"
+  );  // asm.
+
+  if (checksum != NULL) {
+    checksum->Set(checksum_arr[0], checksum_arr[1],
+                  checksum_arr[2], checksum_arr[3]);
+  }
+
+  // Everything went fine, so return true (this does not mean
+  // that there is no problem with memory; it just means that data was copied
+  // from src to dst and the checksum was calculated successfully).
+  return true;
 #else
+  #warning "No vector copy defined for this architecture."
   // Fall back to C implementation for anything else.
   return AdlerMemcpyWarmC(dstmem64, srcmem64, size_in_bytes, checksum);
 #endif
diff --git a/src/clock.h b/src/clock.h
new file mode 100644
index 0000000..4204188
--- /dev/null
+++ b/src/clock.h
@@ -0,0 +1,29 @@
+// Copyright 2010 Google Inc. All Rights Reserved.
+// Author: cferris
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//      http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef STRESSAPPTEST_CLOCK_H_  // NOLINT
+#define STRESSAPPTEST_CLOCK_H_
+
+#include <time.h>
+
+// This class implements a clock that can be overridden for unit tests.
+class Clock {
+ public:
+  virtual ~Clock() {}
+
+  virtual time_t Now() { return time(NULL); }
+};
+
+#endif  // STRESSAPPTEST_CLOCK_H_ NOLINT
diff --git a/src/disk_blocks.cc b/src/disk_blocks.cc
index c7860b0..60018f9 100644
--- a/src/disk_blocks.cc
+++ b/src/disk_blocks.cc
@@ -14,38 +14,51 @@
 
 // Thread-safe container of disk blocks
 
-#include <utility>
-
 // This file must work with autoconf on its public version,
 // so these includes are correct.
 #include "disk_blocks.h"
 
-DiskBlockTable::DiskBlockTable() {
-  nelems_ = 0;
+#include <utility>
+
+// BlockData
+BlockData::BlockData() : address_(0), size_(0),
+                         references_(0), initialized_(false),
+                         pattern_(NULL) {
+  pthread_mutex_init(&data_mutex_, NULL);
+}
+
+BlockData::~BlockData() {
+  pthread_mutex_destroy(&data_mutex_);
+}
+
+void BlockData::set_initialized() {
+  pthread_mutex_lock(&data_mutex_);
+  initialized_ = true;
+  pthread_mutex_unlock(&data_mutex_);
+}
+
+bool BlockData::initialized() const {
+  pthread_mutex_lock(&data_mutex_);
+  bool initialized = initialized_;
+  pthread_mutex_unlock(&data_mutex_);
+  return initialized;
+}
+
+// DiskBlockTable
+DiskBlockTable::DiskBlockTable() : sector_size_(0), write_block_size_(0),
+                                   device_name_(""), device_sectors_(0),
+                                   segment_size_(0), size_(0) {
   pthread_mutex_init(&data_mutex_, NULL);
   pthread_mutex_init(&parameter_mutex_, NULL);
   pthread_cond_init(&data_condition_, NULL);
 }
 
 DiskBlockTable::~DiskBlockTable() {
-  CleanTable();
   pthread_mutex_destroy(&data_mutex_);
   pthread_mutex_destroy(&parameter_mutex_);
   pthread_cond_destroy(&data_condition_);
 }
 
-void DiskBlockTable::CleanTable() {
-  pthread_mutex_lock(&data_mutex_);
-  for (map<int64, StorageData*>::iterator it =
-           addr_to_block_.begin(); it != addr_to_block_.end(); ++it) {
-    delete it->second;
-  }
-  addr_to_block_.erase(addr_to_block_.begin(), addr_to_block_.end());
-  nelems_ = 0;
-  pthread_cond_broadcast(&data_condition_);
-  pthread_mutex_unlock(&data_mutex_);
-}
-
 // 64-bit non-negative random number generator.  Stolen from
 // depot/google3/base/tracecontext_unittest.cc.
 int64 DiskBlockTable::Random64() {
@@ -58,28 +71,27 @@
     return -x;
 }
 
-int64 DiskBlockTable::NumElems() {
-  unsigned int nelems;
+uint64 DiskBlockTable::Size() {
   pthread_mutex_lock(&data_mutex_);
-  nelems = nelems_;
+  uint64 size = size_;
   pthread_mutex_unlock(&data_mutex_);
-  return nelems;
+  return size;
 }
 
 void DiskBlockTable::InsertOnStructure(BlockData *block) {
-  int64 address = block->GetAddress();
+  int64 address = block->address();
   StorageData *sd = new StorageData();
   sd->block = block;
-  sd->pos = nelems_;
+  sd->pos = size_;
   // Creating new block ...
   pthread_mutex_lock(&data_mutex_);
-  if (pos_to_addr_.size() <= nelems_) {
+  if (pos_to_addr_.size() <= size_) {
     pos_to_addr_.insert(pos_to_addr_.end(), address);
   } else {
-    pos_to_addr_[nelems_] = address;
+    pos_to_addr_[size_] = address;
   }
-  addr_to_block_.insert(std::make_pair(address, sd));
-  nelems_++;
+  addr_to_block_[address] = sd;
+  size_++;
   pthread_cond_broadcast(&data_condition_);
   pthread_mutex_unlock(&data_mutex_);
 }
@@ -87,26 +99,28 @@
 int DiskBlockTable::RemoveBlock(BlockData *block) {
   // For write threads, check the reference counter and remove
   // it from the structure.
-  int64 address = block->GetAddress();
+  int64 address = block->address();
   AddrToBlockMap::iterator it = addr_to_block_.find(address);
   int ret = 1;
   if (it != addr_to_block_.end()) {
     int curr_pos = it->second->pos;
-    int last_pos = nelems_ - 1;
+    int last_pos = size_ - 1;
     AddrToBlockMap::iterator last_it = addr_to_block_.find(
         pos_to_addr_[last_pos]);
-    sat_assert(nelems_ > 0);
+    sat_assert(size_ > 0);
     sat_assert(last_it != addr_to_block_.end());
-    // Everything is fine, updating ...
+    // Everything is fine, removing block from table.
     pthread_mutex_lock(&data_mutex_);
     pos_to_addr_[curr_pos] = pos_to_addr_[last_pos];
     last_it->second->pos = curr_pos;
     delete it->second;
     addr_to_block_.erase(it);
-    nelems_--;
+    size_--;
     block->DecreaseReferenceCounter();
     if (block->GetReferenceCounter() == 0)
       delete block;
+    else if (block->GetReferenceCounter() < 0)
+      ret = 0;
     pthread_cond_broadcast(&data_condition_);
     pthread_mutex_unlock(&data_mutex_);
   } else {
@@ -116,18 +130,16 @@
 }
 
 int DiskBlockTable::ReleaseBlock(BlockData *block) {
-  // If is a random thread, just check the reference counter.
+  // If caller is a random thread, just check the reference counter.
   int ret = 1;
   pthread_mutex_lock(&data_mutex_);
   int references = block->GetReferenceCounter();
-  if (references > 0) {
-    if (references == 1)
-      delete block;
-    else
-      block->DecreaseReferenceCounter();
-  } else {
+  if (references == 1)
+    delete block;
+  else if (references > 0)
+    block->DecreaseReferenceCounter();
+  else
     ret = 0;
-  }
   pthread_mutex_unlock(&data_mutex_);
   return ret;
 }
@@ -135,13 +147,13 @@
 BlockData *DiskBlockTable::GetRandomBlock() {
   struct timespec ts;
   struct timeval tp;
-  int result = 0;
   gettimeofday(&tp, NULL);
   ts.tv_sec  = tp.tv_sec;
   ts.tv_nsec = tp.tv_usec * 1000;
   ts.tv_sec += 2;  // Wait for 2 seconds.
+  int result = 0;
   pthread_mutex_lock(&data_mutex_);
-  while (!nelems_ && result != ETIMEDOUT) {
+  while (!size_ && result != ETIMEDOUT) {
     result = pthread_cond_timedwait(&data_condition_, &data_mutex_, &ts);
   }
   if (result == ETIMEDOUT) {
@@ -149,13 +161,13 @@
     return NULL;
   } else {
     int64 random_number = Random64();
-    int64 random_pos = random_number % nelems_;
+    int64 random_pos = random_number % size_;
     int64 address = pos_to_addr_[random_pos];
     AddrToBlockMap::const_iterator it = addr_to_block_.find(address);
     sat_assert(it != addr_to_block_.end());
     BlockData *b = it->second->block;
     // A block is returned only if its content is written on disk.
-    if (b->BlockIsInitialized()) {
+    if (b->initialized()) {
       b->IncreaseReferenceCounter();
     } else {
       b = NULL;
@@ -165,45 +177,38 @@
   }
 }
 
-void DiskBlockTable::SetParameters(
-    int sector_size, int write_block_size, int64 device_sectors,
-    int64 segment_size, string device_name) {
+void DiskBlockTable::SetParameters(int sector_size,
+                                   int write_block_size,
+                                   int64 device_sectors,
+                                   int64 segment_size,
+                                   const string& device_name) {
+  sat_assert(size_ == 0);
   pthread_mutex_lock(&parameter_mutex_);
   sector_size_ = sector_size;
   write_block_size_ = write_block_size;
   device_sectors_ = device_sectors;
   segment_size_ = segment_size;
   device_name_ = device_name;
-  CleanTable();
   pthread_mutex_unlock(&parameter_mutex_);
 }
 
 BlockData *DiskBlockTable::GetUnusedBlock(int64 segment) {
   int64 sector = 0;
   BlockData *block = new BlockData();
-
   bool good_sequence = false;
-  int num_sectors;
-
   if (block == NULL) {
     logprintf(0, "Process Error: Unable to allocate memory "
               "for sector data for disk %s.\n", device_name_.c_str());
     return NULL;
   }
-
   pthread_mutex_lock(&parameter_mutex_);
-
   sat_assert(device_sectors_ != 0);
-
   // Align the first sector with the beginning of a write block
-  num_sectors = write_block_size_ / sector_size_;
-
+  int num_sectors = write_block_size_ / sector_size_;
   for (int i = 0; i < kBlockRetry && !good_sequence; i++) {
     good_sequence = true;
-
     // Use the entire disk or a small segment of the disk to allocate the first
     // sector in the block from.
-
     if (segment_size_ == -1) {
       sector = (Random64() & 0x7FFFFFFFFFFFFFFFLL) % (
           device_sectors_ / num_sectors);
@@ -213,7 +218,6 @@
           segment_size_ / num_sectors);
       sector *= num_sectors;
       sector += segment * segment_size_;
-
       // Make sure the block is within the segment.
       if (sector + num_sectors > (segment + 1) * segment_size_) {
         good_sequence = false;
@@ -229,7 +233,6 @@
     // now aligned to the write_block_size, it is not necessary
     // to check each sector, just the first block (a sector
     // overlap will never occur).
-
     pthread_mutex_lock(&data_mutex_);
     if (addr_to_block_.find(sector) != addr_to_block_.end()) {
       good_sequence = false;
@@ -238,7 +241,8 @@
   }
 
   if (good_sequence) {
-    block->SetParameters(sector, write_block_size_);
+    block->set_address(sector);
+    block->set_size(write_block_size_);
     block->IncreaseReferenceCounter();
     InsertOnStructure(block);
   } else {
@@ -248,66 +252,5 @@
     block = NULL;
   }
   pthread_mutex_unlock(&parameter_mutex_);
-
   return block;
 }
-
-// BlockData
-
-BlockData::BlockData() {
-  addr_ = 0;
-  size_ = 0;
-  references_ = 0;
-  initialized_ = false;
-  pthread_mutex_init(&data_mutex_, NULL);
-}
-
-BlockData::~BlockData() {
-  pthread_mutex_destroy(&data_mutex_);
-}
-
-void BlockData::SetParameters(int64 address, int64 size) {
-  addr_ = address;
-  size_ = size;
-}
-
-void BlockData::IncreaseReferenceCounter() {
-  references_++;
-}
-
-void BlockData::DecreaseReferenceCounter() {
-  references_--;
-}
-
-int BlockData::GetReferenceCounter() {
-  return references_;
-}
-
-void BlockData::SetBlockAsInitialized() {
-  pthread_mutex_lock(&data_mutex_);
-  initialized_ = true;
-  pthread_mutex_unlock(&data_mutex_);
-}
-
-bool BlockData::BlockIsInitialized() {
-  pthread_mutex_lock(&data_mutex_);
-  bool initialized = initialized_;
-  pthread_mutex_unlock(&data_mutex_);
-  return initialized;
-}
-
-int64 BlockData::GetAddress() {
-  return addr_;
-}
-
-int64 BlockData::GetSize() {
-  return size_;
-}
-
-Pattern *BlockData::GetPattern() {
-  return pattern_;
-}
-
-void BlockData::SetPattern(Pattern *p) {
-  pattern_ = p;
-}
diff --git a/src/disk_blocks.h b/src/disk_blocks.h
index cb634c9..638ee9f 100644
--- a/src/disk_blocks.h
+++ b/src/disk_blocks.h
@@ -25,87 +25,146 @@
 #include <map>
 #include <vector>
 #include <string>
-// This file must work with autoconf on its public version,
-// so these includes are correct.
-#include "pattern.h"
+
+#include "sattypes.h"
+
+class Pattern;
 
 // Data about a block written to disk so that it can be verified later.
+// Thread-unsafe: non-const methods must be called with a lock held,
+// except for the initialized accessor/mutator, which are thread-safe
+// (and in fact are the only methods supposed to be called by
+// anything other than the thread-safe DiskBlockTable).
 class BlockData {
  public:
   BlockData();
   ~BlockData();
-  void SetParameters(int64 address, int64 size);
-  void IncreaseReferenceCounter();
-  void DecreaseReferenceCounter();
-  int GetReferenceCounter();
-  void SetBlockAsInitialized();
-  bool BlockIsInitialized();
-  int64 GetAddress();
-  int64 GetSize();
-  void SetPattern(Pattern *p);
-  Pattern *GetPattern();
- protected:
-  int64 addr_;         // address of first sector in block
-  int64 size_;         // size of block
-  int references_;      // reference counter
-  bool initialized_;     // flag indicating the block was written on disk
+
+  // These are reference counters used to control how many
+  // threads currently have a copy of this particular block.
+  void IncreaseReferenceCounter() { references_++; }
+  void DecreaseReferenceCounter() { references_--; }
+  int GetReferenceCounter() const { return references_; }
+
+  // Controls whether the block was written on disk or not.
+  // Once written, it cannot be "un-written" without destroying
+  // this object.
+  void set_initialized();
+  bool initialized() const;
+
+  // Accessor methods for some data related to blocks.
+  void set_address(uint64 address) { address_ = address; }
+  uint64 address() const { return address_; }
+  void set_size(uint64 size) { size_ = size; }
+  uint64 size() const { return size_; }
+  void set_pattern(Pattern *p) { pattern_ = p; }
+  Pattern *pattern() { return pattern_; }
+ private:
+  uint64 address_;  // Address of first sector in block
+  uint64 size_;  // Size of block
+  int references_;  // Reference counter
+  bool initialized_;  // Flag indicating the block was written on disk
   Pattern *pattern_;
-  pthread_mutex_t data_mutex_;
+  mutable pthread_mutex_t data_mutex_;
   DISALLOW_COPY_AND_ASSIGN(BlockData);
 };
 
-// Disk Block table - store data from blocks to be write / read by
-// a DiskThread
+// A thread-safe table used to store block data and control access
+// to these blocks, letting several threads read and write blocks on
+// disk.
 class DiskBlockTable {
  public:
   DiskBlockTable();
   virtual ~DiskBlockTable();
 
-  // Get Number of elements stored on table
-  int64 NumElems();
-  // Clean all table data
-  void CleanTable();
-  // Get a random block from the list. Only returns if a element
-  // is available (consider that other thread must have added them.
-  BlockData *GetRandomBlock();
-  // Set all initial parameters. Assumes all existent data is
+  // Returns number of elements stored on table.
+  uint64 Size();
+
+  // Sets all initial parameters. Assumes all existent data is
   // invalid and, therefore, must be removed.
   void SetParameters(int sector_size, int write_block_size,
                      int64 device_sectors,
                      int64 segment_size,
-                     string device_name);
-  // Return a new block in a unused address.
+                     const string& device_name);
+
+  // During the regular execution, there will be 2 types of threads:
+  // - Write thread:  gets a large number of blocks using GetUnusedBlock,
+  //                  writes them on disk (if on destructive mode),
+  //                  reads block content ONCE from disk and then removes
+  //                  the block from queue with RemoveBlock. After a removal a
+  //                  block is not available for read threads, but it is
+  //                  only removed from memory if there is no reference for
+  //                  this block. Note that a write thread also counts as
+  //                  a reference.
+  // - Read threads:  get one block at a time (if available) with
+  //                  GetRandomBlock, reads its content from disk,
+  //                  checking whether it is correct or not, and releases
+  //                  (Using ReleaseBlock) the block to be erased by the
+  //                  write threads. Since several read threads are allowed
+  //                  to read the same block, a reference counter is used to
+  //                  control when the block can be REALLY erased from
+  //                  memory, and all memory management is made by a
+  //                  DiskBlockTable instance.
+
+  // Returns a new block in an unused address. Does not
+  // grant ownership of the pointer to the caller
+  // (use RemoveBlock to delete the block from memory instead).
   BlockData *GetUnusedBlock(int64 segment);
-  // Remove block from structure (called by write threads)
+
+  // Removes block from structure (called by write threads). Returns
+  // 1 if successful, 0 otherwise.
   int RemoveBlock(BlockData *block);
-  // Release block to be erased (called by random threads)
+
+  // Gets a random block from the list. Only returns if an element
+  // is available (a write thread has got this block, written it on disk,
+  // and set this block as initialized). Does not grant ownership of the
+  // pointer to the caller (use RemoveBlock to delete the block from
+  // memory instead).
+  BlockData *GetRandomBlock();
+
+  // Releases block to be erased (called by random threads). Returns
+  // 1 if successful, 0 otherwise.
   int ReleaseBlock(BlockData *block);
 
  protected:
-
-  void InsertOnStructure(BlockData *block);
-  //  Generate a random 64-bit integer (virtual so it could be
-  //  override by the tests)
-  virtual int64 Random64();
-
   struct StorageData {
     BlockData *block;
     int pos;
   };
-
-  static const int kBlockRetry = 100;       // Number of retries to allocate
-                                            // sectors.
-
   typedef map<int64, StorageData*> AddrToBlockMap;
   typedef vector<int64> PosToAddrVector;
+
+  // Inserts block in structure, used in tests and by other methods.
+  void InsertOnStructure(BlockData *block);
+
+  // Generates a random 64-bit integer.
+  // Virtual method so it can be overridden by the tests.
+  virtual int64 Random64();
+
+  // Accessor methods for testing.
+  const PosToAddrVector& pos_to_addr() const { return pos_to_addr_; }
+  const AddrToBlockMap& addr_to_block() const { return addr_to_block_; }
+
+  int sector_size() const { return sector_size_; }
+  int write_block_size() const { return write_block_size_; }
+  const string& device_name() const { return device_name_; }
+  int64 device_sectors() const { return device_sectors_; }
+  int64 segment_size() const { return segment_size_; }
+
+ private:
+  // Number of retries to allocate sectors.
+  static const int kBlockRetry = 100;
+  // Actual tables.
   PosToAddrVector pos_to_addr_;
   AddrToBlockMap addr_to_block_;
-  uint64 nelems_;
-  int sector_size_;          // Sector size, in bytes
-  int write_block_size_;     // Block size, in bytes
-  string device_name_;       // Device name
-  int64 device_sectors_;     // Number of sectors in device
-  int64 segment_size_;       // Segment size, in bytes
+
+  // Configuration parameters for block selection
+  int sector_size_;  // Sector size, in bytes
+  int write_block_size_;  // Block size, in bytes
+  string device_name_;  // Device name
+  int64 device_sectors_;  // Number of sectors in device
+  int64 segment_size_;  // Segment size in bytes
+  uint64 size_;  // Number of elements on table
   pthread_mutex_t data_mutex_;
   pthread_cond_t data_condition_;
   pthread_mutex_t parameter_mutex_;
diff --git a/src/findmask.c b/src/findmask.c
new file mode 100644
index 0000000..1b10988
--- /dev/null
+++ b/src/findmask.c
@@ -0,0 +1,140 @@
+/* Copyright 2013 Google Inc. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*
+ * This "tool" can be used to brute force the XOR bitmask that a memory
+ * controller uses to interleave addresses onto its two channels. To use it,
+ * you need to have a bunch of addresses that are known to go to only one
+ * of the memory channels... easiest way to get these is to run stressapptest on
+ * a machine while holding a soldering iron close to the chips of one channel.
+ * Generate about a thousand failures and extract their physical addresses
+ * from the output. Write them to findmask.inc in a way that forms a valid
+ * definition for the addrs array. Make and run on a big machine.
+ *
+ * The program iterates over all possible bitmasks within the first NUM_BITS,
+ * parallelizing execution over NUM_THREADS. Every integer is masked
+ * onto all supplied addresses, counting the number of times this results in
+ * an odd or even number of bits. If all but NOISE addresses fall on one side,
+ * it will print that mask to stdout. Note that the script will always "find"
+ * the mask 0x0, and may also report masks such as 0x100000000 depending on
+ * your test machine's memory size... you will need to use your own judgement to
+ * interpret the results.
+ *
+ * As the program might run for a long time, you can send SIGUSR1 to it to
+ * output the last mask that was processed and get a rough idea of the
+ * current progress.
+ */
+
+#include <inttypes.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define NOISE 20
+#define NUM_BITS 32
+#define NUM_THREADS 128  // keep this a power of two
+
+static uint64_t addrs[] = {
+#include "findmask.inc"
+};
+static uint64_t lastmask;
+
+__attribute__((optimize(3, "unroll-loops")))
+void* thread_func(void* arg) {
+  register uint64_t mask;
+  register uintptr_t num = (uintptr_t)arg;
+
+  for (mask = num; mask < (1ULL << (NUM_BITS + 1)); mask += NUM_THREADS) {
+    register const uint64_t* cur;
+    register int a = 0;
+    register int b = 0;
+
+    for (cur = addrs; (char*)cur < (char*)addrs + sizeof(addrs); cur++) {
+#ifdef __x86_64__
+      register uint64_t addr asm("rdx") = *cur & mask;
+      register uint32_t tmp asm("ebx");
+
+      // Behold: the dark bit counting magic!
+      asm (
+        // Fold high and low 32 bits onto each other
+        "MOVl %%edx, %%ebx\n\t"
+        "SHRq $32, %%rdx\n\t"
+        "XORl %%ebx, %%edx\n\t"
+        // Fold high and low 16 bits onto each other
+        "MOVl %%edx, %%ebx\n\t"
+        "SHRl $16, %%edx\n\t"
+        "XORw %%bx, %%dx\n\t"
+        // Fold high and low 8 bits onto each other
+        "XORb %%dh, %%dl\n\t"
+        // Invoke ancient 8086 parity flag (only counts lowest byte)
+        "SETnp %%bl\n\t"
+        "SETp %%dl\n\t"
+        // Stupid SET instruction can only affect the lowest byte...
+        "ANDl $1, %%ebx\n\t"
+        "ANDl $1, %%edx\n\t"
+        // Increment either 'a' or 'b' without needing another branch
+        "ADDl %%ebx, %2\n\t"
+        "ADDl %%edx, %1\n\t"
+        : "=b" (tmp), "+r"(a), "+r"(b), "+d"(addr) : : "cc");  // rdx is written
+
+#else  // generic processor
+      register uint64_t addr = *cur & mask;
+      register uint32_t low = (uint32_t)addr;
+      register uint32_t high = (uint32_t)(addr >> 32);
+
+      // Takes about twice as long as the version above... take that GCC!
+      __builtin_parity(low) ^ __builtin_parity(high) ? a++ : b++;
+#endif
+
+      // Early abort: probably still the most valuable optimization in here
+      if (a >= NOISE && b >= NOISE) break;
+    }
+
+    if (a < NOISE) b = a;
+    if (b < NOISE) {
+      printf("Found mask with just %d deviations: 0x%" PRIx64 "\n", b, mask);
+      fflush(stdout);
+    }
+
+    // I'm a little paranoid about performance: don't write to memory too often
+    if (!(mask & 0x7ff)) lastmask = mask;
+  }
+
+  return 0;
+}
+
+void signal_handler(int signum) {
+  printf("Received signal... currently evaluating mask 0x%" PRIx64 "!\n",
+         lastmask);
+  fflush(stdout);
+}
+
+int main(int argc, char** argv) {
+  uintptr_t i;
+  pthread_t threads[NUM_THREADS];
+
+  signal(SIGUSR1, signal_handler);
+
+  for (i = 0; i < NUM_THREADS; i++)
+    pthread_create(&threads[i], 0, thread_func, (void*)i);
+
+  for (i = 0; i < NUM_THREADS; i++)
+    pthread_join(threads[i], 0);
+
+  return 0;
+}
diff --git a/src/findmask.inc b/src/findmask.inc
new file mode 100644
index 0000000..e76f72f
--- /dev/null
+++ b/src/findmask.inc
@@ -0,0 +1,4 @@
+// This is the body of a uint64_t array definition. Fill in your own addresses.
+0x116bb312c, // example values (can be >32 bit)
+0x38d3c5ad,  // replace with your own
+0x77c1e96d   // don't forget: no comma after the last one
diff --git a/src/logger.cc b/src/logger.cc
index e4ecb03..f13e003 100644
--- a/src/logger.cc
+++ b/src/logger.cc
@@ -17,6 +17,7 @@
 #include <pthread.h>
 #include <stdarg.h>
 #include <stdio.h>
+#include <time.h>
 #include <unistd.h>
 
 #include <string>
@@ -37,10 +38,20 @@
     return;
   }
   char buffer[4096];
-  int length = vsnprintf(buffer, sizeof buffer, format, args);
-  if (static_cast<size_t>(length) >= sizeof buffer) {
-    length = sizeof buffer;
-    buffer[sizeof buffer - 1] = '\n';
+  size_t length = 0;
+  if (log_timestamps_) {
+    time_t raw_time;
+    time(&raw_time);
+    struct tm time_struct;
+    localtime_r(&raw_time, &time_struct);
+    length = strftime(buffer, sizeof(buffer), "%Y/%m/%d-%H:%M:%S(%Z) ",
+                      &time_struct);
+    LOGGER_ASSERT(length);  // Catch if the buffer is set too small.
+  }
+  length += vsnprintf(buffer + length, sizeof(buffer) - length, format, args);
+  if (length >= sizeof(buffer)) {
+    length = sizeof(buffer);
+    buffer[sizeof(buffer) - 1] = '\n';
   }
   QueueLogLine(new string(buffer, length));
 }
@@ -52,19 +63,30 @@
 }
 
 void Logger::StopThread() {
-  LOGGER_ASSERT(thread_running_);
+  // Allow this to be called before the thread has started.
+  if (!thread_running_) {
+    return;
+  }
   thread_running_ = false;
-  LOGGER_ASSERT(0 == pthread_mutex_lock(&queued_lines_mutex_));
+  int retval = pthread_mutex_lock(&queued_lines_mutex_);
+  LOGGER_ASSERT(0 == retval);
   bool need_cond_signal = queued_lines_.empty();
   queued_lines_.push_back(NULL);
-  LOGGER_ASSERT(0 == pthread_mutex_unlock(&queued_lines_mutex_));
+  retval = pthread_mutex_unlock(&queued_lines_mutex_);
+  LOGGER_ASSERT(0 == retval);
   if (need_cond_signal) {
-    LOGGER_ASSERT(0 == pthread_cond_signal(&queued_lines_cond_));
+    retval = pthread_cond_signal(&queued_lines_cond_);
+    LOGGER_ASSERT(0 == retval);
   }
-  LOGGER_ASSERT(0 == pthread_join(thread_, NULL));
+  retval = pthread_join(thread_, NULL);
+  LOGGER_ASSERT(0 == retval);
 }
 
-Logger::Logger() : verbosity_(20), log_fd_(-1), thread_running_(false) {
+Logger::Logger()
+    : verbosity_(20),
+      log_fd_(-1),
+      thread_running_(false),
+      log_timestamps_(true) {
   LOGGER_ASSERT(0 == pthread_mutex_init(&queued_lines_mutex_, NULL));
   LOGGER_ASSERT(0 == pthread_cond_init(&queued_lines_cond_, NULL));
   LOGGER_ASSERT(0 == pthread_cond_init(&full_queue_cond_, NULL));
@@ -94,19 +116,15 @@
   LOGGER_ASSERT(0 == pthread_mutex_unlock(&queued_lines_mutex_));
 }
 
-namespace {
-void WriteToFile(const string& line, int fd) {
-  LOGGER_ASSERT(write(fd, line.data(), line.size()) ==
-                static_cast<ssize_t>(line.size()));
-}
-}
-
 void Logger::WriteAndDeleteLogLine(string *line) {
   LOGGER_ASSERT(line != NULL);
+  ssize_t bytes_written;
   if (log_fd_ >= 0) {
-    WriteToFile(*line, log_fd_);
+    bytes_written = write(log_fd_, line->data(), line->size());
+    LOGGER_ASSERT(bytes_written == static_cast<ssize_t>(line->size()));
   }
-  WriteToFile(*line, 1);
+  bytes_written = write(STDOUT_FILENO, line->data(), line->size());
+  LOGGER_ASSERT(bytes_written == static_cast<ssize_t>(line->size()));
   delete line;
 }
 
diff --git a/src/logger.h b/src/logger.h
index 1d70107..21b3c6b 100644
--- a/src/logger.h
+++ b/src/logger.h
@@ -62,7 +62,7 @@
 
   // Lines with a priority numerically greater than this will not be logged.
   // May not be called while multiple threads are running.
-  void SetVerbosity(int verbosity) {
+  virtual void SetVerbosity(int verbosity) {
     verbosity_ = verbosity;
   }
 
@@ -72,17 +72,22 @@
   // Args:
   //   log_fd: The file descriptor to write to.  Will not be closed by this
   //           object.
-  void SetLogFd(int log_fd) {
+  virtual void SetLogFd(int log_fd) {
     LOGGER_ASSERT(log_fd >= 0);
     log_fd_ = log_fd;
   }
 
   // Set output to be written to stdout only.  This is the default mode.  May
   // not be called while multiple threads are running.
-  void SetStdoutOnly() {
+  virtual void SetStdoutOnly() {
     log_fd_ = -1;
   }
 
+  // Enable or disable logging of timestamps.
+  void SetTimestampLogging(bool log_ts_enabled) {
+    log_timestamps_ = log_ts_enabled;
+  }
+
   // Logs a line, with a vprintf(3)-like interface.  This will block on writing
   // the line to stdout/disk iff the dedicated logging thread is not running.
   // This will block on adding the line to the queue if doing so would exceed
@@ -104,11 +109,12 @@
   // before this returns.  Waits for the thread to finish before returning.
   void StopThread();
 
- private:
+ protected:
   Logger();
 
-  ~Logger();
+  virtual ~Logger();
 
+ private:
   // Args:
   //   line: Must be non-NULL.  This function takes ownership of it.
   void QueueLogLine(string *line);
@@ -127,6 +133,7 @@
   int verbosity_;
   int log_fd_;
   bool thread_running_;
+  bool log_timestamps_;
   vector<string*> queued_lines_;
   // This doubles as a mutex for log_fd_ when the logging thread is not running.
   pthread_mutex_t queued_lines_mutex_;
diff --git a/src/os.cc b/src/os.cc
index 8032cfc..7c4e3d1 100644
--- a/src/os.cc
+++ b/src/os.cc
@@ -48,6 +48,7 @@
 // so these includes are correct.
 #include "sattypes.h"
 #include "error_diag.h"
+#include "clock.h"
 
 // OsLayer initialization.
 OsLayer::OsLayer() {
@@ -55,10 +56,12 @@
   testmemsize_ = 0;
   totalmemsize_ = 0;
   min_hugepages_bytes_ = 0;
+  reserve_mb_ = 0;
   normal_mem_ = true;
   use_hugepages_ = false;
   use_posix_shm_ = false;
   dynamic_mapped_shmem_ = false;
+  mmapped_allocation_ = false;
   shmid_ = 0;
 
   time_initialized_ = 0;
@@ -76,20 +79,28 @@
   address_mode_ = sizeof(pvoid) * 8;
 
   has_clflush_ = false;
-  has_sse2_ = false;
+  has_vector_ = false;
 
   use_flush_page_cache_ = false;
+
+  clock_ = NULL;
 }
 
 // OsLayer cleanup.
 OsLayer::~OsLayer() {
   if (error_diagnoser_)
     delete error_diagnoser_;
+  if (clock_)
+    delete clock_;
 }
 
 // OsLayer initialization.
 bool OsLayer::Initialize() {
-  time_initialized_ = time(NULL);
+  if (!clock_) {
+    clock_ = new Clock();
+  }
+
+  time_initialized_ = clock_->Now();
   // Detect asm support.
   GetFeatures();
 
@@ -129,8 +140,28 @@
 
 // Translates user virtual to physical address.
 uint64 OsLayer::VirtualToPhysical(void *vaddr) {
-  // Needs platform specific implementation.
-  return 0;
+  uint64 frame, shift;
+  off64_t off = ((uintptr_t)vaddr) / sysconf(_SC_PAGESIZE) * 8;
+  int fd = open(kPagemapPath, O_RDONLY);
+  // /proc/self/pagemap is available in kernel >= 2.6.25
+  if (fd < 0)
+    return 0;
+
+  if (lseek64(fd, off, SEEK_SET) != off || read(fd, &frame, 8) != 8) {
+    int err = errno;
+    string errtxt = ErrorString(err);
+    logprintf(0, "Process Error: failed to access %s with errno %d (%s)\n",
+              kPagemapPath, err, errtxt.c_str());
+    if (fd >= 0)
+      close(fd);
+    return 0;
+  }
+  close(fd);
+  if (!(frame & (1LL << 63)) || (frame & (1LL << 62)))
+    return 0;
+  shift = (frame >> 55) & 0x3f;
+  frame = (frame & 0x007fffffffffffffLL) << shift;
+  return frame | ((uintptr_t)vaddr & ((1LL << shift) - 1));
 }
 
 // Returns the HD device that contains this file.
@@ -149,21 +180,21 @@
 // Get HW core features from cpuid instruction.
 void OsLayer::GetFeatures() {
 #if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
-  // CPUID features documented at:
-  // http://www.sandpile.org/ia32/cpuid.htm
-  int ax, bx, cx, dx;
-  __asm__ __volatile__ (
-      "cpuid": "=a" (ax), "=b" (bx), "=c" (cx), "=d" (dx) : "a" (1));
-  has_clflush_ = (dx >> 19) & 1;
-  has_sse2_ = (dx >> 26) & 1;
+  unsigned int eax = 1, ebx, ecx, edx;
+  cpuid(&eax, &ebx, &ecx, &edx);
+  has_clflush_ = (edx >> 19) & 1;
+  has_vector_ = (edx >> 26) & 1;  // SSE2 caps bit.
 
   logprintf(9, "Log: has clflush: %s, has sse2: %s\n",
             has_clflush_ ? "true" : "false",
-            has_sse2_ ? "true" : "false");
+            has_vector_ ? "true" : "false");
 #elif defined(STRESSAPPTEST_CPU_PPC)
   // All PPC implementations have cache flush instructions.
   has_clflush_ = true;
 #elif defined(STRESSAPPTEST_CPU_ARMV7A)
+  // TODO(nsanders): add detect from /proc/cpuinfo or /proc/self/auxv.
+  // For now assume neon and don't run -W if you don't have it.
+  has_vector_ = true; // NEON.
 #warning "Unsupported CPU type ARMV7A: unable to determine feature set."
 #else
 #warning "Unsupported CPU type: unable to determine feature set."
@@ -215,8 +246,9 @@
 void OsLayer::Flush(void *vaddr) {
   // Use the generic flush. This function is just so we can override
   // this if we are so inclined.
-  if (has_clflush_)
-    FastFlush(vaddr);
+  if (has_clflush_) {
+    OsLayer::FastFlush(vaddr);
+  }
 }
 
 
@@ -224,7 +256,7 @@
 bool OsLayer::AdlerMemcpyWarm(uint64 *dstmem, uint64 *srcmem,
                               unsigned int size_in_bytes,
                               AdlerChecksum *checksum) {
-  if (has_sse2_) {
+  if (has_vector_) {
     return AdlerMemcpyAsm(dstmem, srcmem, size_in_bytes, checksum);
   } else {
     return AdlerMemcpyWarmC(dstmem, srcmem, size_in_bytes, checksum);
@@ -232,12 +264,31 @@
 }
 
 
-// Translate user virtual to physical address.
+// Translate physical address to memory module/chip name.
+// Assumes interleaving between two memory channels based on the XOR of
+// all address bits in the 'channel_hash_' mask, with repeated 'channel_width_'
+// blocks with bits distributed from each chip in that channel.
 int OsLayer::FindDimm(uint64 addr, char *buf, int len) {
-  char tmpbuf[256];
-  snprintf(tmpbuf, sizeof(tmpbuf), "DIMM Unknown");
-  snprintf(buf, len, "%s", tmpbuf);
-  return 0;
+  if (!channels_) {
+    snprintf(buf, len, "DIMM Unknown");
+    return -1;
+  }
+
+  // Find channel by XORing address bits in channel_hash mask.
+  uint32 low = static_cast<uint32>(addr & channel_hash_);
+  uint32 high = static_cast<uint32>((addr & channel_hash_) >> 32);
+  vector<string>& channel = (*channels_)[
+      __builtin_parity(high) ^ __builtin_parity(low)];
+
+  // Find dram chip by finding which byte within the channel
+  // by address mod channel width, then divide the channel
+  // evenly among the listed dram chips. Note, this will not work
+  // with x4 dram.
+  int chip = (addr % (channel_width_ / 8)) /
+             ((channel_width_ / 8) / channel.size());
+  string name = channel[chip];
+  snprintf(buf, len, "%s", name.c_str());
+  return 1;
 }
 
 
@@ -293,9 +344,17 @@
 
 // Report an error in an easily parseable way.
 bool OsLayer::ErrorReport(const char *part, const char *symptom, int count) {
-  time_t now = time(NULL);
+  time_t now = clock_->Now();
   int ttf = now - time_initialized_;
-  logprintf(0, "Report Error: %s : %s : %d : %ds\n", symptom, part, count, ttf);
+  if (strlen(symptom) && strlen(part)) {
+    logprintf(0, "Report Error: %s : %s : %d : %ds\n",
+              symptom, part, count, ttf);
+  } else {
+    // Log something so the error still shows up, but this won't break the
+    // parser.
+    logprintf(0, "Warning: Invalid Report Error: "
+              "%s : %s : %d : %ds\n", symptom, part, count, ttf);
+  }
   return true;
 }
 
@@ -359,12 +418,31 @@
   //
   // TODO(nsanders): is there a more correct way to determine target
   // memory size?
-  if (hugepagesize > 0 && min_hugepages_bytes_ > 0) {
-    minsize = min_hugepages_bytes_;
-  } else if (physsize < 2048LL * kMegabyte) {
-    minsize = ((pages * 85) / 100) * pagesize;
+  if (hugepagesize > 0) {
+    if (min_hugepages_bytes_ > 0) {
+      minsize = min_hugepages_bytes_;
+    } else {
+      minsize = hugepagesize;
+    }
   } else {
-    minsize = ((pages * 95) / 100) * pagesize - (192 * kMegabyte);
+    if (physsize < 2048LL * kMegabyte) {
+      minsize = ((pages * 85) / 100) * pagesize;
+    } else {
+      minsize = ((pages * 95) / 100) * pagesize - (192 * kMegabyte);
+    }
+    // Make sure that at least reserve_mb_ is left for the system.
+    if (reserve_mb_ > 0) {
+      int64 totalsize = pages * pagesize;
+      int64 reserve_kb = reserve_mb_ * kMegabyte;
+      if (reserve_kb > totalsize) {
+        logprintf(0, "Procedural Error: %lld is bigger than the total memory "
+                  "available %lld\n", reserve_kb, totalsize);
+      } else if (reserve_kb > totalsize - minsize) {
+        logprintf(5, "Warning: Overriding memory to use: original %lld, "
+                  "current %lld\n", minsize, totalsize - reserve_kb);
+        minsize = totalsize - reserve_kb;
+      }
+    }
   }
 
   // Use hugepage sizing if available.
@@ -435,7 +513,7 @@
                  "'sudo mount -o remount,size=100\% /dev/shm.'\n");
   } else if (hugepagesize >= length) {
     prefer_hugepages = true;
-    logprintf(3, "Log: Prefer using hugepace allocation.\n");
+    logprintf(3, "Log: Prefer using hugepage allocation.\n");
   } else {
     logprintf(3, "Log: Prefer plain malloc memory allocation.\n");
   }
@@ -458,7 +536,7 @@
         break;
       }
 
-      shmaddr = shmat(shmid, NULL, NULL);
+      shmaddr = shmat(shmid, NULL, 0);
       if (shmaddr == reinterpret_cast<void*>(-1)) {
         int err = errno;
         string errtxt = ErrorString(err);
@@ -515,7 +593,7 @@
         // Do a full mapping here otherwise.
         shmaddr = mmap64(NULL, length, PROT_READ | PROT_WRITE,
                          MAP_SHARED | MAP_NORESERVE | MAP_LOCKED | MAP_POPULATE,
-                         shm_object, NULL);
+                         shm_object, 0);
         if (shmaddr == reinterpret_cast<void*>(-1)) {
           int err = errno;
           string errtxt = ErrorString(err);
@@ -540,18 +618,32 @@
     } while (0);
     shm_unlink("/stressapptest");
   }
-#endif // HAVE_SYS_SHM_H
+#endif  // HAVE_SYS_SHM_H
 
   if (!use_hugepages_ && !use_posix_shm_) {
-    // Use memalign to ensure that blocks are aligned enough for disk direct IO.
-    buf = static_cast<char*>(memalign(4096, length));
-    if (buf) {
-      logprintf(0, "Log: Using memaligned allocation at %p.\n", buf);
-    } else {
-      logprintf(0, "Process Error: memalign returned 0\n");
-      if ((length >= 1499LL * kMegabyte) && (address_mode_ == 32)) {
-        logprintf(0, "Log: You are trying to allocate > 1.4G on a 32 "
-                     "bit process. Please setup shared memory.\n");
+    // If the page size is at least the 4096 bytes SAT expects, explicitly
+    // perform an mmap() allocation.
+    if (sysconf(_SC_PAGESIZE) >= 4096) {
+      void *map_buf = mmap(NULL, length, PROT_READ | PROT_WRITE,
+                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+      if (map_buf != MAP_FAILED) {
+        buf = map_buf;
+        mmapped_allocation_ = true;
+        logprintf(0, "Log: Using mmap() allocation at %p.\n", buf);
+      }
+    }
+    if (!mmapped_allocation_) {
+      // Use memalign to ensure that blocks are aligned enough for disk direct
+      // IO.
+      buf = static_cast<char*>(memalign(4096, length));
+      if (buf) {
+        logprintf(0, "Log: Using memaligned allocation at %p.\n", buf);
+      } else {
+        logprintf(0, "Process Error: memalign returned 0\n");
+        if ((length >= 1499LL * kMegabyte) && (address_mode_ == 32)) {
+          logprintf(0, "Log: You are trying to allocate > 1.4G on a 32 "
+                       "bit process. Please setup shared memory.\n");
+        }
       }
     }
   }
@@ -579,6 +671,8 @@
         munmap(testmem_, testmemsize_);
       }
       close(shmid_);
+    } else if (mmapped_allocation_) {
+      munmap(testmem_, testmemsize_);
     } else {
       free(testmem_);
     }
@@ -800,7 +894,9 @@
 bool OsLayer::CpuStressWorkload() {
   double float_arr[100];
   double sum = 0;
+#ifdef HAVE_RAND_R
   unsigned int seed = 12345;
+#endif
 
   // Initialize array with random numbers.
   for (int i = 0; i < 100; i++) {
@@ -809,8 +905,9 @@
     if (rand_r(&seed) % 2)
       float_arr[i] *= -1.0;
 #else
-    float_arr[i] = rand();
-    if (rand() % 2)
+    srand(time(NULL));
+    float_arr[i] = rand();  // NOLINT
+    if (rand() % 2)         // NOLINT
       float_arr[i] *= -1.0;
 #endif
   }
@@ -828,82 +925,3 @@
     logprintf(12, "Log: I'm Feeling Lucky!\n");
   return true;
 }
-
-PCIDevices OsLayer::GetPCIDevices() {
-  PCIDevices device_list;
-  DIR *dir;
-  struct dirent *buf = new struct dirent();
-  struct dirent *entry;
-  dir = opendir(kSysfsPath);
-  if (!dir)
-    logprintf(0, "Process Error: Cannot open %s", kSysfsPath);
-  while (readdir_r(dir, buf, &entry) == 0 && entry) {
-    PCIDevice *device;
-    unsigned int dev, func;
-    // ".", ".." or a special non-device perhaps.
-    if (entry->d_name[0] == '.')
-      continue;
-
-    device = new PCIDevice();
-    if (sscanf(entry->d_name, "%04x:%02hx:%02x.%d",
-               &device->domain, &device->bus, &dev, &func) < 4) {
-      logprintf(0, "Process Error: Couldn't parse %s", entry->d_name);
-      free(device);
-      continue;
-    }
-    device->dev = dev;
-    device->func = func;
-    device->vendor_id = PCIGetValue(entry->d_name, "vendor");
-    device->device_id = PCIGetValue(entry->d_name, "device");
-    PCIGetResources(entry->d_name, device);
-    device_list.insert(device_list.end(), device);
-  }
-  closedir(dir);
-  delete buf;
-  return device_list;
-}
-
-int OsLayer::PCIGetValue(string name, string object) {
-  int fd, len;
-  char filename[256];
-  char buf[256];
-  snprintf(filename, sizeof(filename), "%s/%s/%s", kSysfsPath,
-           name.c_str(), object.c_str());
-  fd = open(filename, O_RDONLY);
-  if (fd < 0)
-    return 0;
-  len = read(fd, buf, 256);
-  close(fd);
-  buf[len] = '\0';
-  return strtol(buf, NULL, 0);  // NOLINT
-}
-
-int OsLayer::PCIGetResources(string name, PCIDevice *device) {
-  char filename[256];
-  char buf[256];
-  FILE *file;
-  int64 start;
-  int64 end;
-  int64 size;
-  int i;
-  snprintf(filename, sizeof(filename), "%s/%s/%s", kSysfsPath,
-           name.c_str(), "resource");
-  file = fopen(filename, "r");
-  if (!file) {
-    logprintf(0, "Process Error: impossible to find resource file for %s",
-              filename);
-    return errno;
-  }
-  for (i = 0; i < 6; i++) {
-    if (!fgets(buf, 256, file))
-      break;
-    sscanf(buf, "%llx %llx", &start, &end);  // NOLINT
-    size = 0;
-    if (start)
-      size = end - start + 1;
-    device->base_addr[i] = start;
-    device->size[i] = size;
-  }
-  fclose(file);
-  return 0;
-}
diff --git a/src/os.h b/src/os.h
index b043b61..0812f1a 100644
--- a/src/os.h
+++ b/src/os.h
@@ -17,6 +17,9 @@
 #define STRESSAPPTEST_OS_H_
 
 #include <dirent.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
 #include <string>
 #include <list>
 #include <map>
@@ -26,8 +29,9 @@
 // so these includes are correct.
 #include "adler32memcpy.h"  // NOLINT
 #include "sattypes.h"       // NOLINT
+#include "clock.h"          // NOLINT
 
-const char kSysfsPath[] = "/sys/bus/pci/devices";
+const char kPagemapPath[] = "/proc/self/pagemap";
 
 struct PCIDevice {
   int32 domain;
@@ -44,6 +48,8 @@
 
 class ErrorDiag;
 
+class Clock;
+
 // This class implements OS/Platform specific funtions.
 class OsLayer {
  public:
@@ -56,6 +62,21 @@
     min_hugepages_bytes_ = min_bytes;
   }
 
+  // Set the minimum amount of memory that should not be allocated. This only
+  // has any effect if hugepages are not used.
+  // Must be set before Initialize().
+  void SetReserveSize(int64 reserve_mb) {
+    reserve_mb_ = reserve_mb;
+  }
+
+  // Set parameters needed to translate physical address to memory module.
+  void SetDramMappingParams(uintptr_t channel_hash, int channel_width,
+                            vector< vector<string> > *channels) {
+    channel_hash_ = channel_hash;
+    channel_width_ = channel_width;
+    channels_ = channels;
+  }
+
   // Initializes data strctures and open files.
   // Returns false on error.
   virtual bool Initialize();
@@ -68,13 +89,11 @@
   // Prints failed dimm. This implementation is optional for
   // subclasses to implement.
   // Takes a bus address and string, and prints the DIMM name
-  // into the string. Returns error status.
+  // into the string. Returns the DIMM number that corresponds to the
+  // address given, or -1 if unable to identify the DIMM number.
+  // Note that subclass implementations of FindDimm() MUST fill
+  // buf with at LEAST one non-whitespace character (provided len > 0).
   virtual int FindDimm(uint64 addr, char *buf, int len);
-  // Print dimm info, plus more available info.
-  virtual int FindDimmExtended(uint64 addr, char *buf, int len) {
-    return FindDimm(addr, buf, len);
-  }
-
 
   // Classifies addresses according to "regions"
   // This may mean different things on different platforms.
@@ -132,10 +151,94 @@
     // instruction. For example, software can use an MFENCE instruction to
     // insure that previous stores are included in the write-back.
     asm volatile("mfence");
-    asm volatile("clflush (%0)" :: "r" (vaddr));
+    asm volatile("clflush (%0)" : : "r" (vaddr));
+    asm volatile("mfence");
+#elif defined(STRESSAPPTEST_CPU_ARMV7A) && !defined(__aarch64__)
+    // ARMv7a cachelines are 8 words (32 bytes).
+    syscall(__ARM_NR_cacheflush, vaddr, reinterpret_cast<char*>(vaddr) + 32, 0);
+#else
+  #warning "Unsupported CPU type: Unable to force cache flushes."
+#endif
+  }
+
+  // Fast flush, for use in performance critical code.
+  // This is bound at compile time, and will not pick up
+  // any runtime machine configuration info.  Takes a NULL-terminated
+  // array of addresses to flush.
+  inline static void FastFlushList(void **vaddrs) {
+#ifdef STRESSAPPTEST_CPU_PPC
+    while (*vaddrs) {
+      asm volatile("dcbf 0,%0" : : "r" (*vaddrs++));
+    }
+    asm volatile("sync");
+#elif defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
+    // Put mfence before and after clflush to make sure:
+    // 1. The write before the clflush is committed to memory bus;
+    // 2. The read after the clflush is hitting the memory bus.
+    //
+    // From Intel manual:
+    // CLFLUSH is only ordered by the MFENCE instruction. It is not guaranteed
+    // to be ordered by any other fencing, serializing or other CLFLUSH
+    // instruction. For example, software can use an MFENCE instruction to
+    // insure that previous stores are included in the write-back.
+    asm volatile("mfence");
+    while (*vaddrs) {
+      asm volatile("clflush (%0)" : : "r" (*vaddrs++));
+    }
     asm volatile("mfence");
 #elif defined(STRESSAPPTEST_CPU_ARMV7A)
-  #warning "Unsupported CPU type ARMV7A: Unable to force cache flushes."
+    while (*vaddrs) {
+      FastFlush(*vaddrs++);
+    }
+#else
+    #warning "Unsupported CPU type: Unable to force cache flushes."
+#endif
+  }
+
+  // Fast flush hint, for use in performance critical code.
+  // This is bound at compile time, and will not pick up
+  // any runtime machine configuration info.  Note that this
+  // will not guarantee that a flush happens, but will at least
+  // hint that it should.  This is useful for speeding up
+  // parallel march algorithms.
+  inline static void FastFlushHint(void *vaddr) {
+#ifdef STRESSAPPTEST_CPU_PPC
+    asm volatile("dcbf 0,%0" : : "r" (vaddr));
+#elif defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
+    // From Intel manual:
+    // CLFLUSH is only ordered by the MFENCE instruction. It is not guaranteed
+    // to be ordered by any other fencing, serializing or other CLFLUSH
+    // instruction. For example, software can use an MFENCE instruction to
+    // insure that previous stores are included in the write-back.
+    asm volatile("clflush (%0)" : : "r" (vaddr));
+#elif defined(STRESSAPPTEST_CPU_ARMV7A)
+    FastFlush(vaddr);
+#else
+    #warning "Unsupported CPU type: Unable to force cache flushes."
+#endif
+  }
+
+  // Fast flush, for use in performance critical code.
+  // This is bound at compile time, and will not pick up
+  // any runtime machine configuration info.  Syncs any
+  // transactions for ordering FastFlushHints.
+  inline static void FastFlushSync() {
+#ifdef STRESSAPPTEST_CPU_PPC
+    asm volatile("sync");
+#elif defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
+    // Put mfence before and after clflush to make sure:
+    // 1. The write before the clflush is committed to memory bus;
+    // 2. The read after the clflush is hitting the memory bus.
+    //
+    // From Intel manual:
+    // CLFLUSH is only ordered by the MFENCE instruction. It is not guaranteed
+    // to be ordered by any other fencing, serializing or other CLFLUSH
+    // instruction. For example, software can use an MFENCE instruction to
+    // insure that previous stores are included in the write-back.
+    asm volatile("mfence");
+#elif defined(STRESSAPPTEST_CPU_ARMV7A)
+    // This is a NOP, FastFlushHint() always does a full flush, so there's
+    // nothing to do for FastFlushSync().
 #else
   #warning "Unsupported CPU type: Unable to force cache flushes."
 #endif
@@ -164,10 +267,10 @@
     __asm __volatile("rdtsc" : "=a" (data.l32.l), "=d"(data.l32.h));
     tsc = data.l64;
 #elif defined(STRESSAPPTEST_CPU_ARMV7A)
-  #warning "Unsupported CPU type ARMV7A: your build may not function correctly"
+    #warning "Unsupported CPU type ARMV7A: your timer may not function correctly"
     tsc = 0;
 #else
-  #warning "Unsupported CPU type: your build may not function correctly"
+    #warning "Unsupported CPU type: your timer may not function correctly"
     tsc = 0;
 #endif
     return (tsc);
@@ -230,9 +333,6 @@
   // Handle to platform-specific error diagnoser.
   ErrorDiag *error_diagnoser_;
 
-  // Detect all PCI Devices.
-  virtual PCIDevices GetPCIDevices();
-
   // Disambiguate between different "warm" memcopies.
   virtual bool AdlerMemcpyWarm(uint64 *dstmem, uint64 *srcmem,
                                unsigned int size_in_bytes,
@@ -249,17 +349,31 @@
   }
   ErrCallback get_err_log_callback() { return err_log_callback_; }
 
+  // Set a clock object that can be overridden for use with unit tests.
+  void SetClock(Clock *clock) {
+    if (clock_) {
+      delete clock_;
+    }
+    clock_ = clock;
+    time_initialized_ = clock_->Now();
+  }
+
  protected:
   void *testmem_;                // Location of test memory.
   uint64 testmemsize_;           // Size of test memory.
   int64 totalmemsize_;           // Size of available memory.
   int64 min_hugepages_bytes_;    // Minimum hugepages size.
+  int64 reserve_mb_;             // Minimum amount of memory to reserve in MB.
   bool  error_injection_;        // Do error injection?
   bool  normal_mem_;             // Memory DMA capable?
   bool  use_hugepages_;          // Use hugepage shmem?
   bool  use_posix_shm_;          // Use 4k page shmem?
   bool  dynamic_mapped_shmem_;   // Conserve virtual address space.
+  bool  mmapped_allocation_;     // Was memory allocated using mmap()?
   int   shmid_;                  // Handle to shmem
+  vector< vector<string> > *channels_;  // Memory module names per channel.
+  uint64 channel_hash_;          // Mask of address bits XORed for channel.
+  int channel_width_;            // Channel width in bits.
 
   int64 regionsize_;             // Size of memory "regions"
   int   regioncount_;            // Number of memory "regions"
@@ -267,7 +381,7 @@
   int   num_nodes_;              // Number of nodes in the system.
   int   num_cpus_per_node_;      // Number of cpus per node in the system.
   int   address_mode_;           // Are we running 32 or 64 bit?
-  bool  has_sse2_;               // Do we have sse2 instructions?
+  bool  has_vector_;             // Do we have sse2/neon instructions?
   bool  has_clflush_;            // Do we have clflush instructions?
   bool  use_flush_page_cache_;   // Do we need to flush the page cache?
 
@@ -279,9 +393,6 @@
 
   // Get file descriptor for dev msr.
   virtual int OpenMSR(uint32 core, uint32 address);
-  // Auxiliary methods for PCI device configuration
-  int PCIGetValue(string name, string object);
-  int PCIGetResources(string name, PCIDevice *device);
 
   // Look up how many hugepages there are.
   virtual int64 FindHugePages();
@@ -289,6 +400,9 @@
   // Link to find last transaction at an error location.
   ErrCallback err_log_callback_;
 
+  // Object to wrap the time function.
+  Clock *clock_;
+
  private:
   DISALLOW_COPY_AND_ASSIGN(OsLayer);
 };
diff --git a/src/sat.cc b/src/sat.cc
index ede951d..927ba54 100644
--- a/src/sat.cc
+++ b/src/sat.cc
@@ -125,6 +125,26 @@
   #error Build system regression - COPTS disregarded.
 #endif
 
+  // Check if the cpu frequency test is enabled and able to run.
+  if (cpu_freq_test_) {
+    if (!CpuFreqThread::CanRun()) {
+      logprintf(0, "Process Error: This platform does not support this "
+                "test.\n");
+      bad_status();
+      return false;
+    } else if (cpu_freq_threshold_ <= 0) {
+      logprintf(0, "Process Error: The cpu frequency test requires "
+                "--cpu_freq_threshold set to a value > 0\n");
+      bad_status();
+      return false;
+    } else if (cpu_freq_round_ < 0) {
+      logprintf(0, "Process Error: The --cpu_freq_round option must be greater"
+                " than or equal to zero. A value of zero means no rounding.\n");
+      bad_status();
+      return false;
+    }
+  }
+
   // Use all CPUs if nothing is specified.
   if (memory_threads_ == -1) {
     memory_threads_ = os_->num_cpus();
@@ -488,15 +508,9 @@
   for (int64 i = 0; i < pages_; i++) {
     struct page_entry pe;
     // Only get valid pages with uninitialized tags here.
-    char buf[256];
     if (GetValid(&pe, kInvalidTag)) {
       int64 paddr = os_->VirtualToPhysical(pe.addr);
       int32 region = os_->FindRegion(paddr);
-
-      os_->FindDimm(paddr, buf, sizeof(buf));
-      if (i < 256) {
-        logprintf(12, "Log: address: %#llx, %s\n", paddr, buf);
-      }
       region_[region]++;
       pe.paddr = paddr;
       pe.tag = 1 << region;
@@ -554,6 +568,7 @@
   // Initializes sync'd log file to ensure output is saved.
   if (!InitializeLogfile())
     return false;
+  Logger::GlobalLogger()->SetTimestampLogging(log_timestamps_);
   Logger::GlobalLogger()->StartThread();
 
   logprintf(5, "Log: Commandline - %s\n", cmdline_.c_str());
@@ -573,6 +588,17 @@
   if (min_hugepages_mbytes_ > 0)
     os_->SetMinimumHugepagesSize(min_hugepages_mbytes_ * kMegabyte);
 
+  if (reserve_mb_ > 0)
+    os_->SetReserveSize(reserve_mb_);
+
+  if (channels_.size() > 0) {
+    logprintf(6, "Log: Decoding memory: %dx%d bit channels,"
+        "%d modules per channel (x%d), decoding hash 0x%x\n",
+        channels_.size(), channel_width_, channels_[0].size(),
+        channel_width_/channels_[0].size(), channel_hash_);
+    os_->SetDramMappingParams(channel_hash_, channel_width_, &channels_);
+  }
+
   if (!os_->Initialize()) {
     logprintf(0, "Process Error: Failed to initialize OS layer\n");
     bad_status();
@@ -640,18 +666,23 @@
   pages_ = 0;
   size_mb_ = 0;
   size_ = size_mb_ * kMegabyte;
+  reserve_mb_ = 0;
   min_hugepages_mbytes_ = 0;
   freepages_ = 0;
   paddr_base_ = 0;
+  channel_hash_ = kCacheLineSize;
+  channel_width_ = 64;
 
   user_break_ = false;
   verbosity_ = 8;
   Logger::GlobalLogger()->SetVerbosity(verbosity_);
+  print_delay_ = 10;
   strict_ = 1;
   warm_ = 0;
   run_on_anything_ = 0;
   use_logfile_ = 0;
   logfile_ = 0;
+  log_timestamps_ = true;
   // Detect 32/64 bit binary.
   void *pvoid = 0;
   address_mode_ = sizeof(pvoid) * 8;
@@ -669,9 +700,15 @@
   // Cache coherency data initialization.
   cc_test_ = false;         // Flag to trigger cc threads.
   cc_cacheline_count_ = 2;  // Two datastructures of cache line size.
+  cc_cacheline_size_ = 0;   // Size of a cacheline (0 for auto-detect).
   cc_inc_count_ = 1000;     // Number of times to increment the shared variable.
   cc_cacheline_data_ = 0;   // Cache Line size datastructure.
 
+  // Cpu frequency data initialization.
+  cpu_freq_test_ = false;   // Flag to trigger cpu frequency thread.
+  cpu_freq_threshold_ = 0;  // Threshold, in MHz, at which a cpu fails.
+  cpu_freq_round_ = 10;     // Round the computed frequency to this value.
+
   sat_assert(0 == pthread_mutex_init(&worker_lock_, NULL));
   file_threads_ = 0;
   net_threads_ = 0;
@@ -765,6 +802,9 @@
     // Set number of megabyte to use.
     ARG_IVALUE("-M", size_mb_);
 
+    // Specify the number of megabytes to be reserved for system.
+    ARG_IVALUE("--reserve_memory", reserve_mb_);
+
     // Set minimum megabytes of hugepages to require.
     ARG_IVALUE("-H", min_hugepages_mbytes_);
 
@@ -786,8 +826,21 @@
     // Set number of cache line size datastructures
     ARG_IVALUE("--cc_line_count", cc_cacheline_count_);
 
+    // Override the detected or assumed cache line size.
+    ARG_IVALUE("--cc_line_size", cc_cacheline_size_);
+
     // Flag set when cache coherency tests need to be run
-    ARG_KVALUE("--cc_test", cc_test_, 1);
+    ARG_KVALUE("--cc_test", cc_test_, true);
+
+    // Set when the cpu_frequency test needs to be run
+    ARG_KVALUE("--cpu_freq_test", cpu_freq_test_, true);
+
+    // Set the threshold in MHz at which the cpu frequency test will fail.
+    ARG_IVALUE("--cpu_freq_threshold", cpu_freq_threshold_);
+
+    // Set the rounding value for the cpu frequency test. The default is to
+    // round to the nearest 10s value.
+    ARG_IVALUE("--cpu_freq_round", cpu_freq_round_);
 
     // Set number of CPU stress threads.
     ARG_IVALUE("-C", cpu_stress_threads_);
@@ -798,6 +851,12 @@
     // Verbosity level.
     ARG_IVALUE("-v", verbosity_);
 
+    // How often, in seconds, to print 'seconds remaining'.
+    ARG_IVALUE("--printsec", print_delay_);
+
+    // Turn off timestamps logging.
+    ARG_KVALUE("--no_timestamps", log_timestamps_, false);
+
     // Set maximum number of errors to collect. Stop running after this many.
     ARG_IVALUE("--max_errors", max_errorcount_);
 
@@ -918,6 +977,23 @@
       continue;
     }
 
+    ARG_IVALUE("--channel_hash", channel_hash_);
+    ARG_IVALUE("--channel_width", channel_width_);
+
+    if (!strcmp(argv[i], "--memory_channel")) {
+      i++;
+      if (i < argc) {
+        char *channel = argv[i];
+        channels_.push_back(vector<string>());
+        while (char* next = strchr(channel, ',')) {
+          channels_.back().push_back(string(channel, next - channel));
+          channel = next + 1;
+        }
+        channels_.back().push_back(string(channel));
+      }
+      continue;
+    }
+
     // Default:
     PrintVersion();
     PrintHelp();
@@ -963,6 +1039,47 @@
       disk_pages_ = 1;
   }
 
+  // Validate memory channel parameters if supplied
+  if (channels_.size()) {
+    if (channels_.size() == 1) {
+      channel_hash_ = 0;
+      logprintf(7, "Log: "
+          "Only one memory channel...deactivating interleave decoding.\n");
+    } else if (channels_.size() > 2) {
+      logprintf(6, "Process Error: "
+          "Triple-channel mode not yet supported... sorry.\n");
+      bad_status();
+      return false;
+    }
+    for (uint i = 0; i < channels_.size(); i++)
+      if (channels_[i].size() != channels_[0].size()) {
+        logprintf(6, "Process Error: "
+            "Channels 0 and %d have a different count of dram modules.\n", i);
+        bad_status();
+        return false;
+      }
+    if (channels_[0].size() & (channels_[0].size() - 1)) {
+      logprintf(6, "Process Error: "
+          "Amount of modules per memory channel is not a power of 2.\n");
+      bad_status();
+      return false;
+    }
+    if (channel_width_ < 16
+        || channel_width_ & (channel_width_ - 1)) {
+      logprintf(6, "Process Error: "
+          "Channel width %d is invalid.\n", channel_width_);
+      bad_status();
+      return false;
+    }
+    if (channel_width_ / channels_[0].size() < 8) {
+      logprintf(6, "Process Error: Chip width x%d must be x8 or greater.\n",
+          channel_width_ / channels_[0].size());
+      bad_status();
+      return false;
+    }
+  }
+
+
   // Print each argument.
   for (int i = 0; i < argc; i++) {
     if (i)
@@ -976,6 +1093,8 @@
 void Sat::PrintHelp() {
   printf("Usage: ./sat(32|64) [options]\n"
          " -M mbytes        megabytes of ram to test\n"
+         " --reserve_memory If not using hugepages, the amount of memory to "
+         "reserve for the system\n"
          " -H mbytes        minimum megabytes of hugepages to require\n"
          " -s seconds       number of seconds to run\n"
          " -m threads       number of memory copy threads to run\n"
@@ -987,8 +1106,10 @@
          " -f filename      add a disk thread with "
          "tempfile 'filename'\n"
          " -l logfile       log output to file 'logfile'\n"
+         " --no_timestamps  do not prefix timestamps to log messages\n"
          " --max_errors n   exit early after finding 'n' errors\n"
          " -v level         verbosity (0-20), default is 8\n"
+         " --printsec secs  How often to print 'seconds remaining'\n"
          " -W               Use more CPU-stressful memory copy\n"
          " -A               run in degraded mode on incompatible systems\n"
          " -p pagesize      size in bytes of memory chunks\n"
@@ -1024,13 +1145,26 @@
          "cacheline's member\n"
          " --cc_line_count  number of cache line sized datastructures "
          "to allocate for the cache coherency threads to operate\n"
+         " --cc_line_size   override the auto-detected cache line size\n"
+         " --cpu_freq_test  enable the cpu frequency test (requires the "
+         "--cpu_freq_threshold argument to be set)\n"
+         " --cpu_freq_threshold  fail the cpu frequency test if the frequency "
+         "goes below this value (specified in MHz)\n"
+         " --cpu_freq_round round the computed frequency to this value, if set"
+         " to zero, only round to the nearest MHz\n"
          " --paddr_base     allocate memory starting from this address\n"
          " --pause_delay    delay (in seconds) between power spikes\n"
          " --pause_duration duration (in seconds) of each pause\n"
-         " --local_numa : choose memory regions associated with "
+         " --local_numa     choose memory regions associated with "
          "each CPU to be tested by that CPU\n"
-         " --remote_numa : choose memory regions not associated with "
-         "each CPU to be tested by that CPU\n");
+         " --remote_numa    choose memory regions not associated with "
+         "each CPU to be tested by that CPU\n"
+         " --channel_hash   mask of address bits XORed to determine channel. "
+         "Mask 0x40 interleaves cachelines between channels\n"
+         " --channel_width bits     width in bits of each memory channel\n"
+         " --memory_channel u1,u2   defines a comma-separated list of names "
+         "for dram packages in a memory channel. Use multiple times to "
+         "define multiple channels.\n");
 }
 
 bool Sat::CheckGoogleSpecificArgs(int argc, char **argv, int *i) {
@@ -1275,32 +1409,45 @@
            sizeof(cc_cacheline_data) * cc_cacheline_count_);
 
     int num_cpus = CpuCount();
+    char *num;
+    // Calculate the number of cache lines needed just to give each core
+    // its own counter.
+    int line_size = cc_cacheline_size_;
+    if (line_size <= 0) {
+      line_size = CacheLineSize();
+      if (line_size < kCacheLineSize)
+        line_size = kCacheLineSize;
+      logprintf(12, "Log: Using %d as cache line size\n", line_size);
+    }
+    // The number of cache lines needed to hold an array of num_cpus.
+    // "num" must be the same type as cc_cacheline_data[X].num or the memory
+    // size calculations will fail.
+    int needed_lines = (sizeof(*num) * num_cpus + line_size - 1) / line_size;
     // Allocate all the nums once so that we get a single chunk
     // of contiguous memory.
-    int *num;
 #ifdef HAVE_POSIX_MEMALIGN
     int err_result = posix_memalign(
         reinterpret_cast<void**>(&num),
-        kCacheLineSize, sizeof(*num) * num_cpus * cc_cacheline_count_);
+        line_size, line_size * needed_lines * cc_cacheline_count_);
 #else
-    num = reinterpret_cast<int*>(memalign(kCacheLineSize,
-			sizeof(*num) * num_cpus * cc_cacheline_count_));
+    num = reinterpret_cast<char*>(memalign(
+        line_size, line_size * needed_lines * cc_cacheline_count_));
     int err_result = (num == 0);
 #endif
     sat_assert(err_result == 0);
 
     int cline;
     for (cline = 0; cline < cc_cacheline_count_; cline++) {
-      memset(num, 0, sizeof(num_cpus) * num_cpus);
+      memset(num, 0, sizeof(*num) * num_cpus);
       cc_cacheline_data_[cline].num = num;
-      num += num_cpus;
+      num += (line_size * needed_lines) / sizeof(*num);
     }
 
     int tnum;
     for (tnum = 0; tnum < num_cpus; tnum++) {
       CpuCacheCoherencyThread *thread =
           new CpuCacheCoherencyThread(cc_cacheline_data_, cc_cacheline_count_,
-                                      tnum, cc_inc_count_);
+                                      tnum, num_cpus, cc_inc_count_);
       thread->InitThread(total_threads_++, this, os_, patternlist_,
                          &continuous_status_);
       // Pin the thread to a particular core.
@@ -1311,6 +1458,22 @@
     }
     workers_map_.insert(make_pair(kCCType, cc_vector));
   }
+
+  if (cpu_freq_test_) {
+    // Create the frequency test thread.
+    logprintf(5, "Log: Running cpu frequency test: threshold set to %dMHz.\n",
+              cpu_freq_threshold_);
+    CpuFreqThread *thread = new CpuFreqThread(CpuCount(), cpu_freq_threshold_,
+                                              cpu_freq_round_);
+    // This thread should be paused when other threads are paused.
+    thread->InitThread(total_threads_++, this, os_, NULL,
+                       &power_spike_status_);
+
+    WorkerVector *cpu_freq_vector = new WorkerVector();
+    cpu_freq_vector->insert(cpu_freq_vector->end(), thread);
+    workers_map_.insert(make_pair(kCPUFreqType, cpu_freq_vector));
+  }
+
   ReleaseWorkerLock();
 }
 
@@ -1319,6 +1482,19 @@
   return sysconf(_SC_NPROCESSORS_CONF);
 }
 
+// Return the worst case (largest) cache line size of the various levels of
+// cache actually present in the machine.
+int Sat::CacheLineSize() {
+  int max_linesize = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+  int linesize = sysconf(_SC_LEVEL2_CACHE_LINESIZE);
+  if (linesize > max_linesize) max_linesize = linesize;
+  linesize = sysconf(_SC_LEVEL3_CACHE_LINESIZE);
+  if (linesize > max_linesize) max_linesize = linesize;
+  linesize = sysconf(_SC_LEVEL4_CACHE_LINESIZE);
+  if (linesize > max_linesize) max_linesize = linesize;
+  return max_linesize;
+}
+
 // Notify and reap worker threads.
 void Sat::JoinThreads() {
   logprintf(12, "Log: Joining worker threads\n");
@@ -1443,7 +1619,7 @@
        map_it != workers_map_.end(); ++map_it) {
     for (WorkerVector::const_iterator it = map_it->second->begin();
          it != map_it->second->end(); ++it) {
-      thread_runtime_sec = (*it)->GetRunDurationUSec()*1.0/1000000;
+      thread_runtime_sec = (*it)->GetRunDurationUSec()*1.0/1000000.;
       total_data += (*it)->GetMemoryCopiedData();
       total_data += (*it)->GetDeviceCopiedData();
       if (thread_runtime_sec > max_runtime_sec) {
@@ -1714,12 +1890,12 @@
   // All of these are in seconds.  You probably want them to be >=
   // kSleepFrequency and multiples of kSleepFrequency, but neither is necessary.
   static const time_t kInjectionFrequency = 10;
-  static const time_t kPrintFrequency = 10;
+  // print_delay_ sets how often the "Seconds remaining" update is printed.
 
   const time_t start = time(NULL);
   const time_t end = start + runtime_seconds_;
   time_t now = start;
-  time_t next_print = start + kPrintFrequency;
+  time_t next_print = start + print_delay_;
   time_t next_pause = start + pause_delay_;
   time_t next_resume = 0;
   time_t next_injection;
@@ -1755,7 +1931,7 @@
     if (now >= next_print) {
       // Print a count down message.
       logprintf(5, "Log: Seconds remaining: %d\n", seconds_remaining);
-      next_print = NextOccurance(kPrintFrequency, start, now);
+      next_print = NextOccurance(print_delay_, start, now);
     }
 
     if (next_injection && now >= next_injection) {
@@ -1901,3 +2077,9 @@
   Logger::GlobalLogger()->VLogF(priority, format, args);
   va_end(args);
 }
+
+// Stop the logging thread and verify any pending data is written to the log.
+void logstop() {
+  Logger::GlobalLogger()->StopThread();
+}
+
diff --git a/src/sat.h b/src/sat.h
index b48f519..5cc3bec 100644
--- a/src/sat.h
+++ b/src/sat.h
@@ -134,6 +134,8 @@
 
   // Return the number of cpus in the system.
   int CpuCount();
+  // Return the worst-case (largest) cache line size of the system.
+  int CacheLineSize();
 
   // Collect error counts from threads.
   int64 GetTotalErrorCount();
@@ -147,17 +149,23 @@
   int64 pages_;                       // Number of memory blocks.
   int64 size_;                        // Size of memory tested, in bytes.
   int64 size_mb_;                     // Size of memory tested, in MB.
+  int64 reserve_mb_;                  // Reserve at least this amount of memory
+                                      // for the system, in MB.
   int64 min_hugepages_mbytes_;        // Minimum hugepages size.
   int64 freepages_;                   // How many invalid pages we need.
   int disk_pages_;                    // Number of pages per temp file.
   uint64 paddr_base_;                 // Physical address base.
+  uint64 channel_hash_;               // Mask of address bits XORed for channel.
+  int channel_width_;                 // Channel width in bits.
+  vector< vector<string> > channels_;  // Memory module names per channel.
 
   // Control flags.
   volatile sig_atomic_t user_break_;  // User has signalled early exit.  Used as
                                       // a boolean.
   int verbosity_;                     // How much to print.
+  int print_delay_;                   // Chatty update frequency.
   int strict_;                        // Check results per transaction.
-  int warm_;                          // FPU warms CPU while coying.
+  int warm_;                          // FPU warms CPU while copying.
   int address_mode_;                  // 32 or 64 bit binary.
   bool stop_on_error_;                // Exit immendiately on any error.
   bool findfiles_;                    // Autodetect tempfile locations.
@@ -169,6 +177,7 @@
   int use_logfile_;                   // Log to a file.
   char logfilename_[255];             // Name of file to log to.
   int logfile_;                       // File handle to log to.
+  bool log_timestamps_;               // Whether to add timestamps to log lines.
 
   // Disk thread options.
   int read_block_size_;               // Size of block to read from disk.
@@ -199,9 +208,18 @@
   bool cc_test_;                      // Flag to decide whether to start the
                                       // cache coherency threads.
   int cc_cacheline_count_;            // Number of cache line size structures.
+  int cc_cacheline_size_;             // Size of a cache line.
   int cc_inc_count_;                  // Number of times to increment the shared
                                       // cache lines structure members.
 
+  // Cpu Frequency Options.
+  bool cpu_freq_test_;                // Flag to decide whether to start the
+                                      // cpu frequency thread.
+  int cpu_freq_threshold_;            // The MHz threshold which will cause
+                                      // the test to fail.
+  int cpu_freq_round_;                // Round the computed frequency to this
+                                      // value.
+
   // Thread control.
   int file_threads_;                  // Threads of file IO.
   int net_threads_;                   // Threads of network IO.
@@ -249,7 +267,8 @@
     kRandomDiskType = 7,
     kCPUType = 8,
     kErrorType = 9,
-    kCCType = 10
+    kCCType = 10,
+    kCPUFreqType = 11,
   };
 
   // Helper functions.
diff --git a/src/sattypes.h b/src/sattypes.h
index c9341d0..79bb47d 100644
--- a/src/sattypes.h
+++ b/src/sattypes.h
@@ -27,11 +27,11 @@
 
 #ifdef HAVE_CONFIG_H  // Built using autoconf
 #ifdef __ANDROID__
-#include "stressapptest_config_android.h"
+#include "stressapptest_config_android.h"  // NOLINT
 #else
-#include "stressapptest_config.h"
-using namespace __gnu_cxx;
-#endif
+#include "stressapptest_config.h"  // NOLINT
+using namespace __gnu_cxx;  // NOLINT
+#endif  // __ANDROID__
 using namespace std;
 
 typedef signed long long   int64;
@@ -57,10 +57,10 @@
 }
 
 static const bool kOpenSource = true;
-#else
+#else  // !HAVE_CONFIG_H
 static const bool kOpenSource = false;
-  #include "googlesattypes.h"
-#endif
+  #include "googlesattypes.h"  // NOLINT
+#endif  // HAVE_CONFIG_H
 // Workaround to allow 32/64 bit conversion
 // without running into strict aliasing problems.
 union datacast_t {
@@ -75,11 +75,15 @@
 // File sync'd print to console and log
 void logprintf(int priority, const char *format, ...);
 
+// Stop the log and dump any queued lines.
+void logstop();
+
 // We print to stderr ourselves first in case we're in such a bad state that the
 // logger can't work.
 #define sat_assert(x) \
 {\
   if (!(x)) {\
+    logstop();\
     fprintf(stderr, "Assertion failed at %s:%d\n", __FILE__, __LINE__);\
     logprintf(0, "Assertion failed at %s:%d\n", __FILE__, __LINE__);\
     exit(1);\
@@ -186,6 +190,48 @@
 #endif
 }
 
+// Execute the cpuid instruction and pass back the contents of the registers.
+// This only works on x86 based platforms.
+inline void cpuid(
+  unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) {
+  *ebx = 0;
+  *ecx = 0;
+  *edx = 0;
+  // CPUID features documented at:
+  // http://www.sandpile.org/ia32/cpuid.htm
+#if defined(STRESSAPPTEST_CPU_I686) || defined(STRESSAPPTEST_CPU_X86_64)
+#if defined(__PIC__) && defined(STRESSAPPTEST_CPU_I686)
+  // In PIC compilations using the i686 cpu type, ebx contains the address
+  // of the global offset table. The compiler can't properly handle constraints
+  // using the ebx register for this compile, so preserve the register
+  // ourselves.
+  asm(
+    "mov %%ebx, %%edi;"
+    "cpuid;"
+    "xchg %%edi, %%ebx;"
+    // Output registers.
+    : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx)
+    // Input registers.
+    : "a" (*eax)
+  );  // Asm
+#else
+  asm(
+    "cpuid;"
+    // Output registers.
+    : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
+    // Input registers.
+    : "a" (*eax)
+  );  // Asm
+#endif  // defined(__PIC__) && defined(STRESSAPPTEST_CPU_I686)
+#elif defined(STRESSAPPTEST_CPU_PPC)
+  return;
+#elif defined(STRESSAPPTEST_CPU_ARMV7A)
+  return;
+#else
+#warning "Unsupported CPU type."
+#endif
+}
+
 // Define handy constants here
 static const int kTicksPerSec = 100;
 static const int kMegabyte = (1024LL*1024LL);
diff --git a/src/stressapptest_config.h.in b/src/stressapptest_config.h.in
index 97f306e..5412df4 100644
--- a/src/stressapptest_config.h.in
+++ b/src/stressapptest_config.h.in
@@ -53,6 +53,9 @@
 /* Define to 1 if you have the `posix_memalign' function. */
 #undef HAVE_POSIX_MEMALIGN
 
+/* Define to 1 if the system has `pthread_barrier'. */
+#undef HAVE_PTHREAD_BARRIERS
+
 /* Define to 1 if you have the <pthread.h> header file. */
 #undef HAVE_PTHREAD_H
 
diff --git a/src/stressapptest_config_android.h b/src/stressapptest_config_android.h
index 3817bdf..14081e5 100644
--- a/src/stressapptest_config_android.h
+++ b/src/stressapptest_config_android.h
@@ -54,12 +54,12 @@
 /* Define to 1 if you have the `posix_memalign' function. */
 /* #undef HAVE_POSIX_MEMALIGN */
 
+/* Define to 1 if the system has `pthread_barrier'. */
+#undef HAVE_PTHREAD_BARRIERS
+
 /* Define to 1 if you have the <pthread.h> header file. */
 #define HAVE_PTHREAD_H 1
 
-/* Android, why do you define _POSIX_BARRIERS when you have no _POSIX_BARRIERS?! */
-#undef _POSIX_BARRIERS
-
 /* Define to 1 if you have the `rand_r' function. */
 /* #undef HAVE_RAND_R */
 
@@ -144,7 +144,7 @@
 #define PACKAGE_NAME "stressapptest"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "stressapptest 1.0.4_autoconf"
+#define PACKAGE_STRING "stressapptest 1.0.7_autoconf"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "stressapptest"
@@ -153,7 +153,7 @@
 #define PACKAGE_URL ""
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "1.0.4_autoconf"
+#define PACKAGE_VERSION "1.0.7_autoconf"
 
 /* Define as the return type of signal handlers (`int' or `void'). */
 #define RETSIGTYPE void
@@ -179,6 +179,9 @@
 /* Defined if the target CPU is i686 */
 /* #undef STRESSAPPTEST_CPU_I686 */
 
+/* Defined if the target CPU is mips */
+/* #undef STRESSAPPTEST_CPU_MIPS */
+
 /* Defined if the target CPU is PowerPC */
 /* #undef STRESSAPPTEST_CPU_PPC */
 
@@ -203,7 +206,7 @@
 #define TIME_WITH_SYS_TIME 1
 
 /* Version number of package */
-#define VERSION "1.0.4_autoconf"
+#define VERSION "1.0.7_autoconf"
 
 /* Define to empty if `const' does not conform to ANSI C. */
 /* #undef const */
diff --git a/src/worker.cc b/src/worker.cc
index 62b0ede..5b0fe59 100644
--- a/src/worker.cc
+++ b/src/worker.cc
@@ -78,21 +78,6 @@
 #endif
 
 namespace {
-  // Get HW core ID from cpuid instruction.
-  inline int apicid(void) {
-    int cpu;
-#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
-    __asm __volatile("cpuid" : "=b" (cpu) : "a" (1) : "cx", "dx");
-#elif defined(STRESSAPPTEST_CPU_ARMV7A)
-  #warning "Unsupported CPU type ARMV7A: unable to determine core ID."
-    cpu = 0;
-#else
-  #warning "Unsupported CPU type: unable to determine core ID."
-    cpu = 0;
-#endif
-    return (cpu >> 24);
-  }
-
   // Work around the sad fact that there are two (gnu, xsi) incompatible
   // versions of strerror_r floating around google. Awesome.
   bool sat_strerror(int err, char *buf, int len) {
@@ -114,7 +99,7 @@
   inline uint64 addr_to_tag(void *address) {
     return reinterpret_cast<uint64>(address);
   }
-}
+}  // namespace
 
 #if !defined(O_DIRECT)
 // Sometimes this isn't available.
@@ -144,7 +129,7 @@
 void WorkerStatus::Initialize() {
   sat_assert(0 == pthread_mutex_init(&num_workers_mutex_, NULL));
   sat_assert(0 == pthread_rwlock_init(&status_rwlock_, NULL));
-#ifdef _POSIX_BARRIERS
+#ifdef HAVE_PTHREAD_BARRIERS
   sat_assert(0 == pthread_barrier_init(&pause_barrier_, NULL,
                                        num_workers_ + 1));
 #endif
@@ -153,7 +138,7 @@
 void WorkerStatus::Destroy() {
   sat_assert(0 == pthread_mutex_destroy(&num_workers_mutex_));
   sat_assert(0 == pthread_rwlock_destroy(&status_rwlock_));
-#ifdef _POSIX_BARRIERS
+#ifdef HAVE_PTHREAD_BARRIERS
   sat_assert(0 == pthread_barrier_destroy(&pause_barrier_));
 #endif
 }
@@ -173,10 +158,13 @@
     WaitOnPauseBarrier();
 }
 
-bool WorkerStatus::ContinueRunning() {
+bool WorkerStatus::ContinueRunning(bool *paused) {
   // This loop is an optimization.  We use it to immediately re-check the status
   // after resuming from a pause, instead of returning and waiting for the next
   // call to this function.
+  if (paused) {
+    *paused = false;
+  }
   for (;;) {
     switch (GetStatus()) {
       case RUN:
@@ -187,6 +175,10 @@
         WaitOnPauseBarrier();
         // Wait for ResumeWorkers() to be called.
         WaitOnPauseBarrier();
+        // Indicate that a pause occurred.
+        if (paused) {
+          *paused = true;
+        }
         break;
       case STOP:
         return false;
@@ -220,7 +212,7 @@
   AcquireNumWorkersLock();
   // Decrement num_workers_ and reinitialize pause_barrier_, which we know isn't
   // in use because (status != PAUSE).
-#ifdef _POSIX_BARRIERS
+#ifdef HAVE_PTHREAD_BARRIERS
   sat_assert(0 == pthread_barrier_destroy(&pause_barrier_));
   sat_assert(0 == pthread_barrier_init(&pause_barrier_, NULL, num_workers_));
 #endif
@@ -315,8 +307,8 @@
     logprintf(11, "Log: Bind to %s failed.\n",
               cpuset_format(&cpu_mask_).c_str());
 
-  logprintf(11, "Log: Thread %d running on apic ID %d mask %s (%s).\n",
-            thread_num_, apicid(),
+  logprintf(11, "Log: Thread %d running on core ID %d mask %s (%s).\n",
+            thread_num_, sched_getcpu(),
             CurrentCpusFormat().c_str(),
             cpuset_format(&cpu_mask_).c_str());
 #if 0
@@ -580,7 +572,7 @@
                                 const char *message) {
   char dimm_string[256] = "";
 
-  int apic_id = apicid();
+  int core_id = sched_getcpu();
 
   // Determine if this is a write or read error.
   os_->Flush(error->vaddr);
@@ -615,7 +607,7 @@
               "%s: miscompare on CPU %d(0x%s) at %p(0x%llx:%s): "
               "read:0x%016llx, reread:0x%016llx expected:0x%016llx\n",
               message,
-              apic_id,
+              core_id,
               CurrentCpusFormat().c_str(),
               error->vaddr,
               error->paddr,
@@ -815,6 +807,9 @@
       if ((state == kGoodAgain) || (state == kBad)) {
         unsigned int blockerrors = badend - badstart + 1;
         errormessage = "Block Error";
+        // It's okay for the 1st entry to be corrected multiple times,
+        // it will simply be reported twice. Once here and once below
+        // when processing the error queue.
         ProcessError(&recorded[0], 0, errormessage.c_str());
         logprintf(0, "Block Error: (%p) pattern %s instead of %s, "
                   "%d bytes from offset 0x%x to 0x%x\n",
@@ -823,8 +818,6 @@
                   blockerrors * wordsize_,
                   offset + badstart * wordsize_,
                   offset + badend * wordsize_);
-        errorcount_ += blockerrors;
-        return blockerrors;
       }
     }
   }
@@ -840,7 +833,6 @@
 
   if (page_error) {
     // For each word in the data region.
-    int error_recount = 0;
     for (int i = 0; i < length / wordsize_; i++) {
       uint64 actual = memblock[i];
       uint64 expected;
@@ -859,21 +851,16 @@
 
       // If the value is incorrect, save an error record for later printing.
       if (actual != expected) {
-        if (error_recount < kErrorLimit) {
-          // We already reported these.
-          error_recount++;
-        } else {
-          // If we have overflowed the error queue, print the errors now.
-          struct ErrorRecord er;
-          er.actual = actual;
-          er.expected = expected;
-          er.vaddr = &memblock[i];
+        // If we have overflowed the error queue, print the errors now.
+        struct ErrorRecord er;
+        er.actual = actual;
+        er.expected = expected;
+        er.vaddr = &memblock[i];
 
-          // Do the error printout. This will take a long time and
-          // likely change the machine state.
-          ProcessError(&er, 12, errormessage.c_str());
-          overflowerrors++;
-        }
+        // Do the error printout. This will take a long time and
+        // likely change the machine state.
+        ProcessError(&er, 12, errormessage.c_str());
+        overflowerrors++;
       }
     }
   }
@@ -948,7 +935,7 @@
   char tag_dimm_string[256] = "";
   bool read_error = false;
 
-  int apic_id = apicid();
+  int core_id = sched_getcpu();
 
   // Determine if this is a write or read error.
   os_->Flush(error->vaddr);
@@ -982,7 +969,7 @@
               error->tagvaddr, error->tagpaddr,
               tag_dimm_string,
               read_error ? "read error" : "write error",
-              apic_id,
+              core_id,
               CurrentCpusFormat().c_str(),
               error->vaddr,
               error->paddr,
@@ -1100,12 +1087,18 @@
   AdlerChecksum ignored_checksum;
   os_->AdlerMemcpyWarm(dstmem64, srcmem64, size_in_bytes, &ignored_checksum);
 
-  // Force cache flush.
-  int length = size_in_bytes / sizeof(*dstmem64);
-  for (int i = 0; i < length; i += sizeof(*dstmem64)) {
-    os_->FastFlush(dstmem64 + i);
-    os_->FastFlush(srcmem64 + i);
+  // Force cache flush of both the source and destination addresses.
+  //  length - length of block to flush in cachelines.
+  //  mem_increment - number of dstmem/srcmem values per cacheline.
+  int length = size_in_bytes / kCacheLineSize;
+  int mem_increment = kCacheLineSize / sizeof(*dstmem64);
+  OsLayer::FastFlushSync();
+  for (int i = 0; i < length; ++i) {
+    OsLayer::FastFlushHint(dstmem64 + (i * mem_increment));
+    OsLayer::FastFlushHint(srcmem64 + (i * mem_increment));
   }
+  OsLayer::FastFlushSync();
+
   // Check results.
   AdlerAddrCrcC(srcmem64, size_in_bytes, checksum, pe);
   // Patch up address tags.
@@ -1236,11 +1229,11 @@
                                    blocksize,
                                    currentblock * blocksize, 0);
           if (errorcount == 0) {
-            int apic_id = apicid();
+            int core_id = sched_getcpu();
             logprintf(0, "Process Error: CPU %d(0x%s) CrcCopyPage "
                          "CRC mismatch %s != %s, "
                          "but no miscompares found on second pass.\n",
-                      apic_id, CurrentCpusFormat().c_str(),
+                      core_id, CurrentCpusFormat().c_str(),
                       crc.ToHexString().c_str(),
                       expectedcrc->ToHexString().c_str());
             struct ErrorRecord er;
@@ -1366,10 +1359,10 @@
                                    blocksize,
                                    currentblock * blocksize, 0);
       if (errorcount == 0) {
-        logprintf(0, "Log: CrcWarmCopyPage CRC mismatch %s != %s, "
+        logprintf(0, "Log: CrcWarmCopyPage CRC mismatch expected: %s != actual: %s, "
                      "but no miscompares found. Retrying with fresh data.\n",
-                  crc.ToHexString().c_str(),
-                  expectedcrc->ToHexString().c_str());
+                  expectedcrc->ToHexString().c_str(),
+                  crc.ToHexString().c_str() );
         if (!tag_mode_) {
           // Copy the data originally read from this region back again.
           // This data should have any corruption read originally while
@@ -1380,16 +1373,16 @@
                                    blocksize,
                                    currentblock * blocksize, 0);
           if (errorcount == 0) {
-            int apic_id = apicid();
+            int core_id = sched_getcpu();
             logprintf(0, "Process Error: CPU %d(0x%s) CrciWarmCopyPage "
                          "CRC mismatch %s != %s, "
                          "but no miscompares found on second pass.\n",
-                      apic_id, CurrentCpusFormat().c_str(),
+                      core_id, CurrentCpusFormat().c_str(),
                       crc.ToHexString().c_str(),
                       expectedcrc->ToHexString().c_str());
             struct ErrorRecord er;
             er.actual = sourcemem[0];
-            er.expected = 0x0;
+            er.expected = 0xbad;
             er.vaddr = sourcemem;
             ProcessError(&er, 0, "Hardware Error");
           }
@@ -1600,12 +1593,11 @@
 
 // Open the file for access.
 bool FileThread::OpenFile(int *pfile) {
-  bool no_O_DIRECT = false;
   int flags = O_RDWR | O_CREAT | O_SYNC;
   int fd = open(filename_.c_str(), flags | O_DIRECT, 0644);
   if (O_DIRECT != 0 && fd < 0 && errno == EINVAL) {
-    no_O_DIRECT = true;
-    fd = open(filename_.c_str(), flags, 0644); // Try without O_DIRECT
+    fd = open(filename_.c_str(), flags, 0644);  // Try without O_DIRECT
+    os_->ActivateFlushPageCache();  // Not using O_DIRECT fixed EINVAL
   }
   if (fd < 0) {
     logprintf(0, "Process Error: Failed to create file %s!!\n",
@@ -1613,8 +1605,6 @@
     pages_copied_ = 0;
     return false;
   }
-  if (no_O_DIRECT)
-    os_->ActivateFlushPageCache(); // Not using O_DIRECT fixed EINVAL
   *pfile = fd;
   return true;
 }
@@ -1685,7 +1675,7 @@
     if (!result)
       return false;
   }
-  return os_->FlushPageCache(); // If O_DIRECT worked, this will be a NOP.
+  return os_->FlushPageCache();  // If O_DIRECT worked, this will be a NOP.
 }
 
 // Copy data from file into memory block.
@@ -1964,7 +1954,7 @@
   // Load patterns into page records.
   page_recs_ = new struct PageRec[sat_->disk_pages()];
   for (int i = 0; i < sat_->disk_pages(); i++) {
-    page_recs_[i].pattern = new struct Pattern();
+    page_recs_[i].pattern = new class Pattern();
   }
 
   // Loop until done.
@@ -2465,13 +2455,22 @@
 CpuCacheCoherencyThread::CpuCacheCoherencyThread(cc_cacheline_data *data,
                                                  int cacheline_count,
                                                  int thread_num,
+                                                 int thread_count,
                                                  int inc_count) {
   cc_cacheline_data_ = data;
   cc_cacheline_count_ = cacheline_count;
   cc_thread_num_ = thread_num;
+  cc_thread_count_ = thread_count;
   cc_inc_count_ = inc_count;
 }
 
+// A very simple pseudorandom generator.  Since the random number is based
+// on only a few simple logic operations, it can be done quickly in registers
+// and the compiler can inline it.
+uint64 CpuCacheCoherencyThread::SimpleRandom(uint64 seed) {
+  return (seed >> 1) ^ (-(seed & 1) & kRandomPolynomial);
+}
+
 // Worked thread to test the cache coherency of the CPUs
 // Return false on fatal sw error.
 bool CpuCacheCoherencyThread::Work() {
@@ -2480,7 +2479,19 @@
   uint64 time_start, time_end;
   struct timeval tv;
 
+  // Use a slightly more robust random number for the initial
+  // value, so the random sequences from the simple generator will
+  // be more divergent.
+#ifdef HAVE_RAND_R
   unsigned int seed = static_cast<unsigned int>(gettid());
+  uint64 r = static_cast<uint64>(rand_r(&seed));
+  r |= static_cast<uint64>(rand_r(&seed)) << 32;
+#else
+  srand(time(NULL));
+  uint64 r = static_cast<uint64>(rand());  // NOLINT
+  r |= static_cast<uint64>(rand()) << 32;  // NOLINT
+#endif
+
   gettimeofday(&tv, NULL);  // Get the timestamp before increments.
   time_start = tv.tv_sec * 1000000ULL + tv.tv_usec;
 
@@ -2490,14 +2501,19 @@
       // Choose a datastructure in random and increment the appropriate
       // member in that according to the offset (which is the same as the
       // thread number.
-#ifdef HAVE_RAND_R
-      int r = rand_r(&seed);
-#else
-      int r = rand();
-#endif
-      r = cc_cacheline_count_ * (r / (RAND_MAX + 1.0));
+      r = SimpleRandom(r);
+      int cline_num = r % cc_cacheline_count_;
+      int offset;
+      // Reverse the order for odd numbered threads in odd numbered cache
+      // lines.  This is designed for massively multi-core systems where the
+      // number of cores exceeds the bytes in a cache line, so "distant" cores
+      // get a chance to exercise cache coherency between them.
+      if (cline_num & cc_thread_num_ & 1)
+        offset = (cc_thread_count_ & ~1) - cc_thread_num_;
+      else
+        offset = cc_thread_num_;
       // Increment the member of the randomely selected structure.
-      (cc_cacheline_data_[r].num[cc_thread_num_])++;
+      (cc_cacheline_data_[cline_num].num[offset])++;
     }
 
     total_inc += cc_inc_count_;
@@ -2506,14 +2522,26 @@
     // in all the cache line structures for this particular thread.
     int cc_global_num = 0;
     for (int cline_num = 0; cline_num < cc_cacheline_count_; cline_num++) {
-      cc_global_num += cc_cacheline_data_[cline_num].num[cc_thread_num_];
+      int offset;
+      // Perform the same offset calculation from above.
+      if (cline_num & cc_thread_num_ & 1)
+        offset = (cc_thread_count_ & ~1) - cc_thread_num_;
+      else
+        offset = cc_thread_num_;
+      cc_global_num += cc_cacheline_data_[cline_num].num[offset];
       // Reset the cachline member's value for the next run.
-      cc_cacheline_data_[cline_num].num[cc_thread_num_] = 0;
+      cc_cacheline_data_[cline_num].num[offset] = 0;
     }
     if (sat_->error_injection())
       cc_global_num = -1;
 
-    if (cc_global_num != cc_inc_count_) {
+    // Since the count is only stored in a byte, to squeeze more into a
+    // single cache line, only compare it as a byte.  In the event that there
+    // is something detected, the chance that it would be missed by a single
+    // thread is 1 in 256.  If it affects all cores, that makes the chance
+    // of it being missed terribly minute.  It seems unlikely any failure
+    // case would be off by more than a small number.
+    if ((cc_global_num & 0xff) != (cc_inc_count_ & 0xff)) {
       errorcount_++;
       logprintf(0, "Hardware Error: global(%d) and local(%d) do not match\n",
                 cc_global_num, cc_inc_count_);
@@ -2697,20 +2725,17 @@
 
 // Open a device, return false on failure.
 bool DiskThread::OpenDevice(int *pfile) {
-  bool no_O_DIRECT = false;
   int flags = O_RDWR | O_SYNC | O_LARGEFILE;
   int fd = open(device_name_.c_str(), flags | O_DIRECT, 0);
   if (O_DIRECT != 0 && fd < 0 && errno == EINVAL) {
-    no_O_DIRECT = true;
-    fd = open(device_name_.c_str(), flags, 0); // Try without O_DIRECT
+    fd = open(device_name_.c_str(), flags, 0);  // Try without O_DIRECT
+    os_->ActivateFlushPageCache();
   }
   if (fd < 0) {
     logprintf(0, "Process Error: Failed to open device %s (thread %d)!!\n",
               device_name_.c_str(), thread_num_);
     return false;
   }
-  if (no_O_DIRECT)
-    os_->ActivateFlushPageCache();
   *pfile = fd;
 
   return GetDiskSize(fd);
@@ -2866,11 +2891,11 @@
 
       // Block is either initialized by writing, or in nondestructive case,
       // initialized by being added into the datastructure for later reading.
-      block->SetBlockAsInitialized();
+      block->initialized();
 
       in_flight_sectors_.push(block);
     }
-    if (!os_->FlushPageCache()) // If O_DIRECT worked, this will be a NOP.
+    if (!os_->FlushPageCache())  // If O_DIRECT worked, this will be a NOP.
       return false;
 
     // Verify blocks on disk.
@@ -2979,8 +3004,9 @@
     errorcount_++;
     os_->ErrorReport(device_name_.c_str(), operations[op].error_str, 1);
 
-    if (event.res < 0) {
-      switch (event.res) {
+    int64 result = static_cast<int64>(event.res);
+    if (result < 0) {
+      switch (result) {
         case -EIO:
           logprintf(0, "Hardware Error: Low-level I/O error while doing %s to "
                        "sectors starting at %lld on disk %s (thread %d).\n",
@@ -3003,7 +3029,7 @@
   }
 
   return true;
-#else // !HAVE_LIBAIO_H
+#else  // !HAVE_LIBAIO_H
   return false;
 #endif
 }
@@ -3011,7 +3037,7 @@
 // Write a block to disk.
 // Return false if the block is not written.
 bool DiskThread::WriteBlockToDisk(int fd, BlockData *block) {
-  memset(block_buffer_, 0, block->GetSize());
+  memset(block_buffer_, 0, block->size());
 
   // Fill block buffer with a pattern
   struct page_entry pe;
@@ -3019,30 +3045,30 @@
     // Even though a valid page could not be obatined, it is not an error
     // since we can always fill in a pattern directly, albeit slower.
     unsigned int *memblock = static_cast<unsigned int *>(block_buffer_);
-    block->SetPattern(patternlist_->GetRandomPattern());
+    block->set_pattern(patternlist_->GetRandomPattern());
 
     logprintf(11, "Log: Warning, using pattern fill fallback in "
                   "DiskThread::WriteBlockToDisk on disk %s (thread %d).\n",
               device_name_.c_str(), thread_num_);
 
-    for (int i = 0; i < block->GetSize()/wordsize_; i++) {
-      memblock[i] = block->GetPattern()->pattern(i);
+    for (unsigned int i = 0; i < block->size()/wordsize_; i++) {
+      memblock[i] = block->pattern()->pattern(i);
     }
   } else {
-    memcpy(block_buffer_, pe.addr, block->GetSize());
-    block->SetPattern(pe.pattern);
+    memcpy(block_buffer_, pe.addr, block->size());
+    block->set_pattern(pe.pattern);
     sat_->PutValid(&pe);
   }
 
   logprintf(12, "Log: Writing %lld sectors starting at %lld on disk %s"
             " (thread %d).\n",
-            block->GetSize()/kSectorSize, block->GetAddress(),
+            block->size()/kSectorSize, block->address(),
             device_name_.c_str(), thread_num_);
 
   int64 start_time = GetTime();
 
-  if (!AsyncDiskIO(ASYNC_IO_WRITE, fd, block_buffer_, block->GetSize(),
-                   block->GetAddress() * kSectorSize, write_timeout_)) {
+  if (!AsyncDiskIO(ASYNC_IO_WRITE, fd, block_buffer_, block->size(),
+                   block->address() * kSectorSize, write_timeout_)) {
     return false;
   }
 
@@ -3063,11 +3089,11 @@
 // Return true if the block was read, also increment errorcount
 // if the block had data errors or performance problems.
 bool DiskThread::ValidateBlockOnDisk(int fd, BlockData *block) {
-  int64 blocks = block->GetSize() / read_block_size_;
+  int64 blocks = block->size() / read_block_size_;
   int64 bytes_read = 0;
   int64 current_blocks;
   int64 current_bytes;
-  uint64 address = block->GetAddress();
+  uint64 address = block->address();
 
   logprintf(20, "Log: Reading sectors starting at %lld on disk %s "
             "(thread %d).\n",
@@ -3119,7 +3145,7 @@
     // In non-destructive mode, don't compare the block to the pattern since
     // the block was never written to disk in the first place.
     if (!non_destructive_) {
-      if (CheckRegion(block_buffer_, block->GetPattern(), current_bytes,
+      if (CheckRegion(block_buffer_, block->pattern(), current_bytes,
                       0, bytes_read)) {
         os_->ErrorReport(device_name_.c_str(), "disk-pattern-error", 1);
         errorcount_ += 1;
@@ -3156,7 +3182,7 @@
   // when using direct IO.
 #ifdef HAVE_POSIX_MEMALIGN
   int memalign_result = posix_memalign(&block_buffer_, kBufferAlignment,
-                              sat_->page_length());
+                                       sat_->page_length());
 #else
   block_buffer_ = memalign(kBufferAlignment, sat_->page_length());
   int memalign_result = (block_buffer_ == 0);
@@ -3400,3 +3426,224 @@
             "pages checked\n", thread_num_, status_, pages_copied_);
   return result;
 }
+
+// The list of MSRs to read from each cpu, in MsrValues index order.
+const CpuFreqThread::CpuRegisterType CpuFreqThread::kCpuRegisters[] = {
+  { kMsrTscAddr, "TSC" },
+  { kMsrAperfAddr, "APERF" },
+  { kMsrMperfAddr, "MPERF" },
+};
+
+CpuFreqThread::CpuFreqThread(int num_cpus, int freq_threshold, int round)
+  : num_cpus_(num_cpus),
+    freq_threshold_(freq_threshold),
+    round_(round) {
+  sat_assert(round >= 0);  // A negative rounding step is not accepted.
+  if (round == 0) {
+    // If rounding is off, force rounding to the nearest MHz.
+    round_ = 1;
+    round_value_ = 0.5;
+  } else {
+    round_value_ = round/2.0;  // Half of the rounding step.
+  }
+}
+
+CpuFreqThread::~CpuFreqThread() {
+}
+
+// Compute the difference between the currently read MSR values and the
+// previously read values and store the results in delta. If any of the
+// values went backwards, or the TSC delta is too small, returns false.
+// Otherwise, stores the elapsed time in delta->tv and returns true.
+bool CpuFreqThread::ComputeDelta(CpuDataType *current, CpuDataType *previous,
+                                 CpuDataType *delta) {
+  // Loop through the msrs.
+  for (int msr = 0; msr < kMsrLast; msr++) {
+    if (previous->msrs[msr] > current->msrs[msr]) {
+      logprintf(0, "Log: Register %s went backwards 0x%llx to 0x%llx "
+                "skipping interval\n", kCpuRegisters[msr].name,
+                previous->msrs[msr], current->msrs[msr]);
+      return false;
+    } else {
+      delta->msrs[msr] = current->msrs[msr] - previous->msrs[msr];
+    }
+  }
+
+  // Check for TSC < 1 Mcycles over interval.
+  if (delta->msrs[kMsrTsc] < (1000 * 1000)) {
+    logprintf(0, "Log: Insanely slow TSC rate, TSC stops in idle?\n");
+    return false;
+  }
+  timersub(&current->tv, &previous->tv, &delta->tv);
+
+  return true;
+}
+
+// Compute the MSR deltas between current and previous and store the cpu
+// frequency in MHz (TSC rate scaled by APERF/MPERF) in *freq. Returns false
+// if ComputeDelta() fails. Otherwise, returns true.
+bool CpuFreqThread::ComputeFrequency(CpuDataType *current,
+                                     CpuDataType *previous, int *freq) {
+  CpuDataType delta;
+  if (!ComputeDelta(current, previous, &delta)) {
+    return false;
+  }
+
+  double interval = delta.tv.tv_sec + delta.tv.tv_usec / 1000000.0;
+  double frequency = 1.0 * delta.msrs[kMsrTsc] / 1000000
+                     * delta.msrs[kMsrAperf] / delta.msrs[kMsrMperf] / interval;
+
+  // Add round_value_, truncate, then snap down to a multiple of round_.
+  int computed = static_cast<int>(frequency + round_value_);
+  *freq = computed - (computed % round_);
+  return true;
+}
+
+// Task function: samples per-cpu MSRs and checks each cpu's frequency.
+bool CpuFreqThread::Work() {
+  cpu_set_t cpuset;
+  if (!AvailableCpus(&cpuset)) {
+    logprintf(0, "Process Error: Cannot get information about the cpus.\n");
+    return false;
+  }
+
+  // Start off indicating the test is passing.
+  status_ = true;
+
+  int curr = 0;
+  int prev = 1;
+  uint32 num_intervals = 0;
+  bool paused = false;
+  bool valid;
+  bool pass = true;
+
+  vector<CpuDataType> data[2];
+  data[0].resize(num_cpus_);
+  data[1].resize(num_cpus_);
+  while (IsReadyToRun(&paused)) {
+    if (paused) {
+      // Reset the intervals and restart logic after the pause.
+      num_intervals = 0;
+    }
+    if (num_intervals == 0) {
+      // If this is the first interval, then always wait a bit before
+      // starting to collect data.
+      sat_sleep(kStartupDelay);
+    }
+
+    // Get the per cpu counters.
+    valid = true;
+    for (int cpu = 0; cpu < num_cpus_; cpu++) {
+      if (CPU_ISSET(cpu, &cpuset)) {
+        if (!GetMsrs(cpu, &data[curr][cpu])) {
+          logprintf(0, "Failed to get msrs on cpu %d.\n", cpu);
+          valid = false;
+          break;
+        }
+      }
+    }
+    if (!valid) {
+      // Reset the number of collected intervals since something bad happened.
+      num_intervals = 0;
+      continue;
+    }
+
+    num_intervals++;
+
+    // Only compute a delta when we have at least two intervals worth of data.
+    if (num_intervals > 2) {  // NOTE(review): needs a third pass - confirm.
+      for (int cpu = 0; cpu < num_cpus_; cpu++) {
+        if (CPU_ISSET(cpu, &cpuset)) {
+          int freq;
+          if (!ComputeFrequency(&data[curr][cpu], &data[prev][cpu],
+                                &freq)) {
+            // Reset the number of collected intervals since an unknown
+            // error occurred.
+            logprintf(0, "Log: Cannot get frequency of cpu %d.\n", cpu);
+            num_intervals = 0;
+            break;
+          }
+          logprintf(15, "Cpu %d Freq %d\n", cpu, freq);
+          if (freq < freq_threshold_) {
+            errorcount_++;
+            pass = false;
+            logprintf(0, "Log: Cpu %d frequency is too low, frequency %d MHz "
+                      "threshold %d MHz.\n", cpu, freq, freq_threshold_);
+          }
+        }
+      }
+    }
+
+    sat_sleep(kIntervalPause);
+
+    // Swap the values in curr and prev (these values flip between 0 and 1).
+    curr ^= 1;
+    prev ^= 1;
+  }
+
+  return pass;  // False if any cpu was ever measured below the threshold.
+}
+
+
+// Read every MSR in kCpuRegisters for this cpu into data and timestamp it.
+// Returns false as soon as any read fails. Otherwise, returns true.
+bool CpuFreqThread::GetMsrs(int cpu, CpuDataType *data) {
+  for (int msr = 0; msr < kMsrLast; msr++) {
+    if (!os_->ReadMSR(cpu, kCpuRegisters[msr].msr, &data->msrs[msr])) {
+      return false;
+    }
+  }
+  // Save the time at which we acquired these values.
+  gettimeofday(&data->tv, NULL);
+
+  return true;
+}
+
+// Returns true if this test can run on the current machine. Otherwise,
+// returns false.
+bool CpuFreqThread::CanRun() {
+#if defined(STRESSAPPTEST_CPU_X86_64) || defined(STRESSAPPTEST_CPU_I686)
+  unsigned int eax, ebx, ecx, edx;
+
+  // CPUID.EAX=0x1: EDX.bit4 is TSC, EDX.bit5 is MSR (RDMSR/WRMSR); need both.
+  // This check is valid for both Intel and AMD.
+  eax = 1;
+  cpuid(&eax, &ebx, &ecx, &edx);
+  if ((edx & (3 << 4)) != (3 << 4)) {
+    logprintf(0, "Process Error: No TSC support.\n");
+    return false;
+  }
+
+  // Check the highest extended function level supported.
+  // This check is valid for both Intel and AMD.
+  eax = 0x80000000;
+  cpuid(&eax, &ebx, &ecx, &edx);
+  if (eax < 0x80000007) {
+    logprintf(0, "Process Error: No invariant TSC support.\n");
+    return false;
+  }
+
+  // Non-Stop TSC is advertised by CPUID.EAX=0x80000007: EDX.bit8
+  // This check is valid for both Intel and AMD.
+  eax = 0x80000007;
+  cpuid(&eax, &ebx, &ecx, &edx);
+  if ((edx & (1 << 8)) == 0) {
+    logprintf(0, "Process Error: No non-stop TSC support.\n");
+    return false;
+  }
+
+  // APERF/MPERF is advertised by CPUID.EAX=0x6: ECX.bit0
+  // This check is valid for both Intel and AMD.
+  eax = 0x6;
+  cpuid(&eax, &ebx, &ecx, &edx);
+  if ((ecx & 1) == 0) {
+    logprintf(0, "Process Error: No APERF MSR support.\n");
+    return false;
+  }
+  return true;
+#else
+  logprintf(0, "Process Error: "
+               "cpu_freq_test is only supported on X86 processors.\n");
+  return false;
+#endif
+}
diff --git a/src/worker.h b/src/worker.h
index 0ec4c1d..091d96b 100644
--- a/src/worker.h
+++ b/src/worker.h
@@ -44,7 +44,7 @@
 
 // Global Datastruture shared by the Cache Coherency Worker Threads.
 struct cc_cacheline_data {
-  int *num;
+  char *num;
 };
 
 // Typical usage:
@@ -127,10 +127,8 @@
   // ResumeWorkers() or StopWorkers() has been called.  Number of distinct
   // calling threads must match the worker count (see AddWorkers() and
   // RemoveSelf()).
-  bool ContinueRunning();
+  bool ContinueRunning(bool *paused);
 
-  // TODO(matthewb): Is this functionality really necessary?  Remove it if not.
-  //
   // This is a hack!  It's like ContinueRunning(), except it won't pause.  If
   // any worker threads use this exclusively in place of ContinueRunning() then
   // PauseWorkers() should never be used!
@@ -140,7 +138,7 @@
   enum Status { RUN, PAUSE, STOP };
 
   void WaitOnPauseBarrier() {
-#ifdef _POSIX_BARRIERS
+#ifdef HAVE_PTHREAD_BARRIERS
     int error = pthread_barrier_wait(&pause_barrier_);
     if (error != PTHREAD_BARRIER_SERIAL_THREAD)
       sat_assert(error == 0);
@@ -189,7 +187,7 @@
   pthread_rwlock_t status_rwlock_;
   Status status_;
 
-#ifdef _POSIX_BARRIERS
+#ifdef HAVE_PTHREAD_BARRIERS
   // Guaranteed to not be in use when (status_ != PAUSE).
   pthread_barrier_t pause_barrier_;
 #endif
@@ -242,7 +240,7 @@
   int64 ReadThreadTimer() {
     struct timeval end_time_;
     gettimeofday(&end_time_, NULL);
-    return (end_time_.tv_sec - start_time_.tv_sec)*1000000 +
+    return (end_time_.tv_sec - start_time_.tv_sec)*1000000ULL +
       (end_time_.tv_usec - start_time_.tv_usec);
   }
   // Stops per-WorkerThread timer and records thread run duration.
@@ -266,10 +264,10 @@
   // Calculate worker thread specific bandwidth.
   virtual float GetMemoryBandwidth()
     {return GetMemoryCopiedData() / (
-        runduration_usec_ * 1.0 / 1000000);}
+        runduration_usec_ * 1.0 / 1000000.);}
   virtual float GetDeviceBandwidth()
     {return GetDeviceCopiedData() / (
-        runduration_usec_ * 1.0 / 1000000);}
+        runduration_usec_ * 1.0 / 1000000.);}
 
   void set_cpu_mask(cpu_set_t *mask) {
     memcpy(&cpu_mask_, mask, sizeof(*mask));
@@ -304,9 +302,10 @@
   //   do {
   //     // work.
   //   } while (IsReadyToRun());
-  virtual bool IsReadyToRun() { return worker_status_->ContinueRunning(); }
-  // TODO(matthewb): Is this function really necessary? Remove it if not.
-  //
+  virtual bool IsReadyToRun(bool *paused = NULL) {
+    return worker_status_->ContinueRunning(paused);
+  }
+
   // Like IsReadyToRun(), except it won't pause.
   virtual bool IsReadyToRunNoPause() {
     return worker_status_->ContinueRunningNoPause();
@@ -422,7 +421,7 @@
   // Record of where these pages were sourced from, and what
   // potentially broken components they passed through.
   struct PageRec {
-     struct Pattern *pattern;  // This is the data it should contain.
+     class Pattern *pattern;  // This is the data it should contain.
      void *src;  // This is the memory location the data was sourced from.
      void *dst;  // This is where it ended up.
   };
@@ -641,16 +640,27 @@
   CpuCacheCoherencyThread(cc_cacheline_data *cc_data,
                           int cc_cacheline_count_,
                           int cc_thread_num_,
+                          int cc_thread_count_,
                           int cc_inc_count_);
   virtual bool Work();
 
  protected:
+  // Used by the simple random number generator as a shift feedback;
+  // this polynomial (x^64 + x^63 + x^61 + x^60 + 1) will produce a
+  // pseudorandom cycle of period 2^64-1.
+  static const uint64 kRandomPolynomial = 0xD800000000000000ULL;
+  // A very simple pseudorandom generator that can be inlined and use
+  // registers, to keep the CC test loop tight and focused.
+  static uint64 SimpleRandom(uint64 seed);
+
   cc_cacheline_data *cc_cacheline_data_;  // Datstructure for each cacheline.
   int cc_local_num_;        // Local counter for each thread.
   int cc_cacheline_count_;  // Number of cache lines to operate on.
   int cc_thread_num_;       // The integer id of the thread which is
                             // used as an index into the integer array
                             // of the cacheline datastructure.
+  int cc_thread_count_;     // Total number of threads being run, for
+                            // calculations mixing up cache line access.
   int cc_inc_count_;        // Number of times to increment the counter.
 
  private:
@@ -809,4 +819,80 @@
   DISALLOW_COPY_AND_ASSIGN(MemoryRegionThread);
 };
 
+// Worker thread to check that the frequency of every cpu does not go below a
+// certain threshold.
+class CpuFreqThread : public WorkerThread {
+ public:
+  CpuFreqThread(int num_cpus, int freq_threshold, int round);
+  ~CpuFreqThread();
+
+  // This is the task function that the thread executes.
+  virtual bool Work();
+
+  // Returns true if this test can run on the current machine. Otherwise,
+  // returns false.
+  static bool CanRun();
+
+ private:
+  static const int kIntervalPause = 10;   // The number of seconds to pause
+                                          // between acquiring the MSR data.
+  static const int kStartupDelay = 5;     // The number of seconds to wait
+                                          // before acquiring MSR data.
+  static const int kMsrTscAddr = 0x10;    // The address of the TSC MSR.
+  static const int kMsrAperfAddr = 0xE8;  // The address of the APERF MSR.
+  static const int kMsrMperfAddr = 0xE7;  // The address of the MPERF MSR.
+
+  // The index values into the CpuDataType.msrs[] array.
+  enum MsrValues {
+    kMsrTsc = 0,           // MSR index 0 = TSC.
+    kMsrAperf = 1,         // MSR index 1 = APERF.
+    kMsrMperf = 2,         // MSR index 2 = MPERF.
+    kMsrLast,              // Last MSR index.
+  };
+
+  typedef struct {
+    uint32 msr;         // The address of the MSR.
+    const char *name;   // A human readable string for the MSR.
+  } CpuRegisterType;
+
+  typedef struct {
+    uint64 msrs[kMsrLast];  // The values of the MSRs.
+    struct timeval tv;      // The time at which the MSRs were read.
+  } CpuDataType;
+
+  // The set of MSR addresses and register names.
+  static const CpuRegisterType kCpuRegisters[kMsrLast];
+
+  // Compute the change in values of the MSRs between current and previous,
+  // set the frequency in MHz of the cpu. If there is an error computing
+  // the delta, return false. Otherwise, return true.
+  bool ComputeFrequency(CpuDataType *current, CpuDataType *previous,
+                        int *frequency);
+
+  // Get the MSR values for this particular cpu and save them in data. If
+  // any error is encountered, returns false. Otherwise, returns true.
+  bool GetMsrs(int cpu, CpuDataType *data);
+
+  // Compute the difference between the currently read MSR values and the
+  // previously read values and store the results in delta. If any of the
+  // values went backwards, or the TSC value is too small, returns false.
+  // Otherwise, returns true.
+  bool ComputeDelta(CpuDataType *current, CpuDataType *previous,
+                    CpuDataType *delta);
+
+  // The total number of cpus on the system.
+  int num_cpus_;
+
+  // The minimum frequency that each cpu must operate at (in MHz).
+  int freq_threshold_;
+
+  // The value to round the computed frequency to.
+  int round_;
+
+  // Precomputed value to add to the frequency to do the rounding.
+  double round_value_;
+
+  DISALLOW_COPY_AND_ASSIGN(CpuFreqThread);
+};
+
 #endif  // STRESSAPPTEST_WORKER_H_
diff --git a/stressapptest.1 b/stressapptest.1
index 695f9ee..2c91478 100644
--- a/stressapptest.1
+++ b/stressapptest.1
@@ -86,10 +86,15 @@
 
 .TP
 .B \-\-cc_line_count <number>
-Mumber of cache line sized datastructures to allocate for the cache coherency
+Number of cache line sized datastructures to allocate for the cache coherency
 threads to operate.
 
 .TP
+.B \-\-cc_line_size <number>
+Size of cache line to use as the basis for cache coherency test data
+structures.
+
+.TP
 .B \-\-cc_test
 Do the cache coherency testing.