Update to fio-2.20. am: eda3a60699 am: 5198dc19ac
am: 21dbb04785

Change-Id: I4ee03d7f35a27d5a405ee49ad8f45085599914f2
diff --git a/Android.bp b/Android.bp
new file mode 100644
index 0000000..fa4069c
--- /dev/null
+++ b/Android.bp
@@ -0,0 +1,123 @@
+cc_defaults {
+    name: "fio_defaults",
+    cflags: [
+        "-DFIO_VERSION=\"fio-2.20\"",
+        "-DCONFIG_3ARG_AFFINITY",
+        "-DCONFIG_CLOCK_GETTIME",
+        "-DCONFIG_CLOCK_MONOTONIC",
+        "-DCONFIG_FDATASYNC",
+        "-DCONFIG_GETOPT_LONG_ONLY",
+        "-DCONFIG_GETTIMEOFDAY",
+        "-DCONFIG_IPV6",
+        "-DCONFIG_LINUX_FALLOCATE",
+        "-DCONFIG_LINUX_SPLICE",
+        "-DCONFIG_LITTLE_ENDIAN",
+        "-DCONFIG_RLIMIT_MEMLOCK",
+        "-DCONFIG_RUSAGE_THREAD",
+        "-DCONFIG_SCHED_IDLE",
+        "-DCONFIG_SETVBUF",
+        "-DCONFIG_SFAA",
+        "-DCONFIG_SOCKLEN_T",
+        "-DCONFIG_STRCASESTR",
+        "-DCONFIG_STRLCAT",
+        "-DCONFIG_STRSEP",
+        "-DCONFIG_TCP_NODELAY",
+        "-DCONFIG_TLS_THREAD",
+        "-DCONFIG_ZLIB",
+        "-DFIO_HAVE_CGROUPS",
+        "-DFIO_INC_DEBUG",
+        "-DFIO_INTERNAL",
+        "-D_FILE_OFFSET_BITS=64",
+        "-O3",
+        "-Wall",
+        "-Wdeclaration-after-statement",
+        "-Wwrite-strings",
+        "-ffast-math",
+        "-fno-omit-frame-pointer",
+        "-std=gnu99",
+        "-Wno-macro-redefined",
+        "-Wno-missing-field-initializers",
+        "-Wno-pointer-arith",
+        "-Wno-sign-compare",
+        "-Wno-unused-parameter",
+        "-Wno-unused-variable",
+        "-include sys/sysmacros.h",
+    ],
+
+    arch: {
+      arm: { cflags: ["-DBITS_PER_LONG=32", "-DCONFIG_32BIT"] },
+      arm64: { cflags: ["-DBITS_PER_LONG=64", "-DCONFIG_64BIT"] },
+      mips: { cflags: ["-DBITS_PER_LONG=32", "-DCONFIG_32BIT"] },
+      mips64: { cflags: ["-DBITS_PER_LONG=64", "-DCONFIG_64BIT"] },
+      x86: { cflags: ["-DBITS_PER_LONG=32", "-DCONFIG_32BIT"] },
+      x86_64: { cflags: ["-DBITS_PER_LONG=64", "-DCONFIG_64BIT"] },
+    },
+
+    // b/31559947, work around clang-tidy bug, disable clang-analyzer checks.
+    tidy_checks: ["-clang-analyzer*"],
+}
+
+cc_binary {
+    name: "fio",
+    defaults: ["fio_defaults"],
+    shared_libs: ["libdl"],
+    static_libs: [
+        "libcutils",
+        "libz",
+    ],
+    srcs: [
+        "backend.c",
+        "blktrace.c",
+        "cconv.c",
+        "cgroup.c",
+        "client.c",
+        "debug.c",
+        "diskutil.c",
+        "eta.c",
+        "fifo.c",
+        "filehash.c",
+        "filelock.c",
+        "filesetup.c",
+        "fio.c",
+        "flow.c",
+        "gettime.c",
+        "gettime-thread.c",
+        "helpers.c",
+        "helper_thread.c",
+        "idletime.c",
+        "init.c",
+        "ioengines.c",
+        "iolog.c",
+        "io_u.c",
+        "io_u_queue.c",
+        "json.c",
+        "libfio.c",
+        "log.c",
+        "memory.c",
+        "mutex.c",
+        "optgroup.c",
+        "options.c",
+        "parse.c",
+        "profile.c",
+        "rate-submit.c",
+        "server.c",
+        "smalloc.c",
+        "stat.c",
+        "steadystate.c",
+        "td_error.c",
+        "tickmarks.c",
+        "time.c",
+        "trim.c",
+        "verify.c",
+        "workqueue.c",
+        "crc/*.c",
+        "engines/cpu.c",
+        "engines/mmap.c",
+        "engines/null.c",
+        "engines/net.c",
+        "engines/sg.c",
+        "engines/splice.c",
+        "engines/sync.c",
+        "lib/*.c",
+        "oslib/linux-dev-lookup.c",
+    ],
+}
diff --git a/Android.mk b/Android.mk
deleted file mode 100644
index 87eb249..0000000
--- a/Android.mk
+++ /dev/null
@@ -1,114 +0,0 @@
-# Copyright (C) 2014 The Android Open Source Project
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-LOCAL_PATH:= $(call my-dir)
-
-include $(CLEAR_VARS)
-
-# b/31559947, work around clang-tidy bug, disable clang-analyzer checks.
-LOCAL_TIDY_CHECKS := -clang-analyzer*
-
-LOCAL_CFLAGS_32 += -DBITS_PER_LONG=32 -DCONFIG_64BIT
-LOCAL_CFLAGS_64 += -DBITS_PER_LONG=64 -DCONFIG_32BIT
-
-main_src_files := gettime.c fio.c ioengines.c init.c stat.c log.c time.c \
-                  filesetup.c eta.c verify.c memory.c io_u.c parse.c mutex.c options.c \
-                  smalloc.c filehash.c helpers.c profile.c debug.c backend.c \
-                  cconv.c client.c filelock.c flow.c gettime-thread.c idletime.c io_u_queue.c \
-                  iolog.c json.c libfio.c memalign.c profiles/act.c profiles/tiobench.c server.c \
-                  td_error.c diskutil.c blktrace.c trim.c fifo.c cgroup.c
-
-lib_src_files := lib/rbtree.c lib/flist_sort.c lib/getrusage.c lib/hweight.c lib/ieee754.c lib/lfsr.c \
-                 lib/num2str.c lib/prio_tree.c lib/rand.c lib/zipf.c lib/inet_aton.c lib/axmap.c \
-                 lib/bloom.c lib/linux-dev-lookup.c lib/tp.c
-
-crc_src_files := crc/crc7.c crc/crc16.c crc/crc32.c crc/crc64.c crc/crc32c.c crc/crc32c-intel.c \
-                 crc/sha1.c crc/sha256.c crc/sha512.c crc/md5.c crc/test.c crc/xxhash.c \
-                 crc/fnv.c crc/murmur3.c
-
-engines_src_files := engines/cpu.c engines/mmap.c engines/null.c engines/net.c \
-                     engines/sg.c engines/sync.c
-
-engines_src_files_64 := engines/splice.c
-
-LOCAL_SRC_FILES := $(main_src_files) \
-                   $(lib_src_files) \
-                   $(crc_src_files) \
-                   $(engines_src_files) \
-
-LOCAL_SRC_FILES_64 += $(engines_src_files_64)
-
-LOCAL_MODULE := fio
-LOCAL_MODULE_PATH := $(TARGET_OUT_OPTIONAL_EXECUTABLES)
-LOCAL_MODULE_TAGS := debug
-
-LOCAL_SHARED_LIBRARIES := libdl
-LOCAL_STATIC_LIBRARIES := libcutils libz
-
-LOCAL_CFLAGS += -DFIO_VERSION="\"fio-2.2.6\"" \
-                -DCONFIG_3ARG_AFFINITY \
-                -DCONFIG_CLOCK_GETTIME \
-                -DCONFIG_CLOCK_MONOTONIC \
-                -DCONFIG_FDATASYNC \
-                -DCONFIG_GETOPT_LONG_ONLY \
-                -DCONFIG_GETTIMEOFDAY \
-                -DCONFIG_IPV6 \
-                -DCONFIG_LINUX_FALLOCATE \
-                -DCONFIG_LITTLE_ENDIAN \
-                -DCONFIG_RLIMIT_MEMLOCK \
-                -DCONFIG_RUSAGE_THREAD \
-                -DCONFIG_SCHED_IDLE \
-                -DCONFIG_SETVBUF \
-                -DCONFIG_SFAA \
-                -DCONFIG_SOCKLEN_T \
-                -DCONFIG_STRCASESTR \
-                -DCONFIG_STRSEP \
-                -DCONFIG_TCP_NODELAY \
-                -DCONFIG_TLS_THREAD \
-                -DCONFIG_ZLIB \
-                -DFIO_HAVE_CGROUPS \
-                -DFIO_INC_DEBUG \
-                -DFIO_INTERNAL \
-                -DNO_GETMNTENT_R \
-                -DNO_INET_NETWORK \
-                -D_FILE_OFFSET_BITS=64 \
-                -D_FORTIFY_SOURCE=2 \
-                -D_GNU_SOURCE \
-                -D_LARGEFILE_SOURCE \
-                -D__ANDROID__ \
-                -O3 \
-                -Wall \
-                -Wdeclaration-after-statement \
-                -Wwrite-strings \
-                -ffast-math \
-                -fno-omit-frame-pointer \
-                -g \
-                -std=gnu99 \
-                -Wno-pointer-arith \
-                -Wno-sign-compare \
-                -Wno-unused-parameter \
-                -Wno-unused-variable \
-
-# Workaround until upstream copes with a glibc/bionic where <sys/types.h>
-# doesn't get you <sys/sysmacros.h> for free.
-LOCAL_CFLAGS += -include sys/sysmacros.h
-
-LOCAL_LDFLAGS += \
-                -rdynamic \
-
-LOCAL_CFLAGS_64 += \
-                -DCONFIG_LINUX_SPLICE \
-
-include $(BUILD_EXECUTABLE)
diff --git a/FIO-VERSION-GEN b/FIO-VERSION-GEN
index 9ae7b7d..a9ddb31 100755
--- a/FIO-VERSION-GEN
+++ b/FIO-VERSION-GEN
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 GVF=FIO-VERSION-FILE
-DEF_VER=fio-2.2.6
+DEF_VER=fio-2.20
 
 LF='
 '
@@ -15,7 +15,7 @@
 	VN=`git describe --match "fio-[0-9]*" --abbrev=4 HEAD 2>/dev/null` &&
 	case "$VN" in
 	*$LF*) (exit 1) ;;
-	v[0-9]*)
+	fio-[0-9]*)
 		git update-index -q --refresh
 		test -z "`git diff-index --name-only HEAD --`" ||
 		VN="$VN-dirty" ;;
@@ -38,5 +38,3 @@
 	echo >&2 "FIO_VERSION = $VN"
 	echo "FIO_VERSION = $VN" >$GVF
 }
-
-
diff --git a/HOWTO b/HOWTO
index 0f7909d..d9e881a 100644
--- a/HOWTO
+++ b/HOWTO
@@ -1,2079 +1,3521 @@
-Table of contents
------------------
+How fio works
+-------------
 
-1. Overview
-2. How fio works
-3. Running fio
-4. Job file format
-5. Detailed list of parameters
-6. Normal output
-7. Terse output
-8. Trace file format
-9. CPU idleness profiling
+The first step in getting fio to simulate a desired I/O workload is writing a
+job file describing that specific setup. A job file may contain any number of
+threads and/or files -- the typical contents of the job file are a *global*
+section defining shared parameters, and one or more job sections describing the
+jobs involved. When run, fio parses this file and sets everything up as
+described. If we break down a job from top to bottom, it contains the following
+basic parameters:
 
-1.0 Overview and history
-------------------------
-fio was originally written to save me the hassle of writing special test
-case programs when I wanted to test a specific workload, either for
-performance reasons or to find/reproduce a bug. The process of writing
-such a test app can be tiresome, especially if you have to do it often.
-Hence I needed a tool that would be able to simulate a given io workload
-without resorting to writing a tailored test case again and again.
+`I/O type`_
 
-A test work load is difficult to define, though. There can be any number
-of processes or threads involved, and they can each be using their own
-way of generating io. You could have someone dirtying large amounts of
-memory in an memory mapped file, or maybe several threads issuing
-reads using asynchronous io. fio needed to be flexible enough to
-simulate both of these cases, and many more.
+		Defines the I/O pattern issued to the file(s).  We may only be reading
+		sequentially from these files, or we may be writing randomly. Or even
+		mixing reads and writes, sequentially or randomly.
+		Should we be doing buffered I/O, or direct/raw I/O?
 
-2.0 How fio works
------------------
-The first step in getting fio to simulate a desired io workload, is
-writing a job file describing that specific setup. A job file may contain
-any number of threads and/or files - the typical contents of the job file
-is a global section defining shared parameters, and one or more job
-sections describing the jobs involved. When run, fio parses this file
-and sets everything up as described. If we break down a job from top to
-bottom, it contains the following basic parameters:
+`Block size`_
 
-	IO type		Defines the io pattern issued to the file(s).
-			We may only be reading sequentially from this
-			file(s), or we may be writing randomly. Or even
-			mixing reads and writes, sequentially or randomly.
+		In how large chunks are we issuing I/O? This may be a single value,
+		or it may describe a range of block sizes.
 
-	Block size	In how large chunks are we issuing io? This may be
-			a single value, or it may describe a range of
-			block sizes.
+`I/O size`_
 
-	IO size		How much data are we going to be reading/writing.
+		How much data are we going to be reading/writing.
 
-	IO engine	How do we issue io? We could be memory mapping the
-			file, we could be using regular read/write, we
-			could be using splice, async io, syslet, or even
-			SG (SCSI generic sg).
+`I/O engine`_
 
-	IO depth	If the io engine is async, how large a queuing
-			depth do we want to maintain?
+		How do we issue I/O? We could be memory mapping the file, we could be
+		using regular read/write, we could be using splice, async I/O, or even
+		SG (SCSI generic sg).
 
-	IO type		Should we be doing buffered io, or direct/raw io?
+`I/O depth`_
 
-	Num files	How many files are we spreading the workload over.
-
-	Num threads	How many threads or processes should we spread
-			this workload over.
-
-The above are the basic parameters defined for a workload, in addition
-there's a multitude of parameters that modify other aspects of how this
-job behaves.
+		If the I/O engine is async, how large a queuing depth do we want to
+		maintain?
 
 
-3.0 Running fio
+`Target file/device`_
+
+		How many files are we spreading the workload over.
+
+`Threads, processes and job synchronization`_
+
+		How many threads or processes should we spread this workload over.
+
+The above are the basic parameters defined for a workload; in addition, there's a
+multitude of parameters that modify other aspects of how this job behaves.
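+
+A minimal sketch tying these basic parameters together might look like the
+following job file (the file name and sizes here are purely illustrative, and
+the example assumes the libaio engine is available):
+
+.. code-block:: ini
+
+    ; -- start job file --
+    [basic-example]
+    ; placeholder target file, and how much I/O to do to it
+    filename=/tmp/fio.test
+    size=64m
+    ; I/O type, block size, buffered vs. direct
+    rw=randread
+    bs=4k
+    direct=1
+    ; I/O engine and depth
+    ioengine=libaio
+    iodepth=8
+    ; how many processes to spread the workload over
+    numjobs=2
+    ; -- end job file --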
+
+
+Command line options
+--------------------
+
+.. option:: --debug=type
+
+    Enable verbose tracing of various fio actions.  May be ``all`` for all types
+    or individual types separated by a comma (e.g. ``--debug=file,mem`` will
+    enable file and memory debugging).  Currently, additional logging is
+    available for:
+
+    *process*
+			Dump info related to processes.
+    *file*
+			Dump info related to file actions.
+    *io*
+			Dump info related to I/O queuing.
+    *mem*
+			Dump info related to memory allocations.
+    *blktrace*
+			Dump info related to blktrace setup.
+    *verify*
+			Dump info related to I/O verification.
+    *all*
+			Enable all debug options.
+    *random*
+			Dump info related to random offset generation.
+    *parse*
+			Dump info related to option matching and parsing.
+    *diskutil*
+			Dump info related to disk utilization updates.
+    *job:x*
+			Dump info only related to job number x.
+    *mutex*
+			Dump info only related to mutex up/down ops.
+    *profile*
+			Dump info related to profile extensions.
+    *time*
+			Dump info related to internal time keeping.
+    *net*
+			Dump info related to networking connections.
+    *rate*
+			Dump info related to I/O rate switching.
+    *compress*
+			Dump info related to log compress/decompress.
+    *?* or *help*
+			Show available debug options.
+
+.. option:: --parse-only
+
+    Parse options only, don't start any I/O.
+
+.. option:: --output=filename
+
+	Write output to file `filename`.
+
+.. option:: --bandwidth-log
+
+	Generate aggregate bandwidth logs.
+
+.. option:: --minimal
+
+	Print statistics in a terse, semicolon-delimited format.
+
+.. option:: --append-terse
+
+    Print statistics in selected mode AND terse, semicolon-delimited format.
+    **deprecated**, use :option:`--output-format` instead to select multiple
+    formats.
+
+.. option:: --output-format=type
+
+	Set the reporting format to `normal`, `terse`, `json`, or `json+`.  Multiple
+	formats can be selected, separated by a comma.  `terse` is a CSV based
+	format.  `json+` is like `json`, except it adds a full dump of the latency
+	buckets.
+
+.. option:: --terse-version=type
+
+	Set terse version output format (default 3, or 2 or 4).
+
+.. option:: --version
+
+	Print version info and exit.
+
+.. option:: --help
+
+	Print this page.
+
+.. option:: --cpuclock-test
+
+	Perform test and validation of internal CPU clock.
+
+.. option:: --crctest=test
+
+    Test the speed of the builtin checksumming functions. If no argument is
+    given, all of them are tested. Or a comma separated list can be passed, in
+    which case the given ones are tested.
+
+.. option:: --cmdhelp=command
+
+	Print help information for `command`. May be ``all`` for all commands.
+
+.. option:: --enghelp=[ioengine[,command]]
+
+    List all commands defined by :option:`ioengine`, or print help for `command`
+    defined by :option:`ioengine`.  If no :option:`ioengine` is given, list all
+    available ioengines.
+
+.. option:: --showcmd=jobfile
+
+	Turn a job file into command line options.
+
+.. option:: --readonly
+
+    Turn on safety read-only checks, preventing writes.  The ``--readonly``
+    option is an extra safety guard to prevent users from accidentally starting
+    a write workload when that is not desired.  Fio will only write if
+    `rw=write/randwrite/rw/randrw` is given.  This extra safety net can be used
+    as an extra precaution as ``--readonly`` will also enable a write check in
+    the I/O engine core to prevent writes due to unknown user space bug(s).
+
+.. option:: --eta=when
+
+	When real-time ETA estimate should be printed.  May be `always`, `never` or
+	`auto`.
+
+.. option:: --eta-newline=time
+
+	Force a new line for every `time` period passed.
+
+.. option:: --status-interval=time
+
+	Force full status dump every `time` period passed.
+
+.. option:: --section=name
+
+    Only run specified section in job file.  Multiple sections can be specified.
+    The ``--section`` option allows one to combine related jobs into one file.
+    E.g. one job file could define light, moderate, and heavy sections. Tell
+    fio to run only the "heavy" section by giving ``--section=heavy``
+    command line option.  One can also specify the "write" operations in one
+    section and "verify" operation in another section.  The ``--section`` option
+    only applies to job sections.  The reserved *global* section is always
+    parsed and used.
+
+.. option:: --alloc-size=kb
+
+    Set the internal smalloc pool to this size in kb (def 1024).  The
+    ``--alloc-size`` switch allows one to use a larger pool size for smalloc.
+    If running large jobs with randommap enabled, fio can run out of memory.
+    Smalloc is an internal allocator for shared structures from a fixed size
+    memory pool. The pool size defaults to 16M and can grow to 8 pools.
+
+    NOTE: While running, :file:`.fio_smalloc.*` backing store files are visible
+    in :file:`/tmp`.
+
+.. option:: --warnings-fatal
+
+    All fio parser warnings are fatal, causing fio to exit with an
+    error.
+
+.. option:: --max-jobs=nr
+
+	Maximum number of threads/processes to support.
+
+.. option:: --server=args
+
+    Start a backend server, with `args` specifying what to listen to.
+    See `Client/Server`_ section.
+
+.. option:: --daemonize=pidfile
+
+    Background a fio server, writing the pid to the given `pidfile` file.
+
+.. option:: --client=hostname
+
+    Instead of running the jobs locally, send and run them on the given host or
+    set of hosts.  See `Client/Server`_ section.
+
+.. option:: --remote-config=file
+
+	Tell fio server to load this local file.
+
+.. option:: --idle-prof=option
+
+	Report CPU idleness on a system-wide basis (``--idle-prof=system``) or a
+	per-CPU basis (``--idle-prof=percpu``), or run unit work calibration only
+	(``--idle-prof=calibrate``).
+
+.. option:: --inflate-log=log
+
+	Inflate and output compressed log.
+
+.. option:: --trigger-file=file
+
+	Execute trigger cmd when file exists.
+
+.. option:: --trigger-timeout=t
+
+	Execute trigger at this time.
+
+.. option:: --trigger=cmd
+
+	Set this command as local trigger.
+
+.. option:: --trigger-remote=cmd
+
+	Set this command as remote trigger.
+
+.. option:: --aux-path=path
+
+	Use this path for fio state generated files.
+
+Any parameters following the options will be assumed to be job files, unless
+they match a job file parameter. Multiple job files can be listed and each job
+file will be regarded as a separate group. Fio will :option:`stonewall`
+execution between each group.
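+
+As an illustration, the options above combine freely with job files.  The
+following invocation is only a sketch (:file:`jobfile.fio` and
+:file:`run1.json` are placeholder names)::
+
+	$ fio --output-format=json --output=run1.json --eta=never jobfile.fio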
+
+
+Job file format
 ---------------
-See the README file for command line parameters, there are only a few
-of them.
 
-Running fio is normally the easiest part - you just give it the job file
-(or job files) as parameters:
+As previously described, fio accepts one or more job files describing what it is
+supposed to do. The job file format is the classic ini file, where the names
+enclosed in [] brackets define the job name. You are free to use any ASCII name
+you want, except *global* which has special meaning.  Following the job name is
+a sequence of zero or more parameters, one per line, that define the behavior of
+the job. If the first character in a line is a ';' or a '#', the entire line is
+discarded as a comment.
 
-$ fio job_file
+A *global* section sets defaults for the jobs described in that file. A job may
+override a *global* section parameter, and a job file may even have several
+*global* sections if so desired. A job is only affected by a *global* section
+residing above it.
 
-and it will start doing what the job_file tells it to do. You can give
-more than one job file on the command line, fio will serialize the running
-of those files. Internally that is the same as using the 'stonewall'
-parameter described in the parameter section.
+The :option:`--cmdhelp` option also lists all options. If used with an `option`
+argument, :option:`--cmdhelp` will detail the given `option`.
 
-If the job file contains only one job, you may as well just give the
-parameters on the command line. The command line parameters are identical
-to the job parameters, with a few extra that control global parameters
-(see README). For example, for the job file parameter iodepth=2, the
-mirror command line option would be --iodepth 2 or --iodepth=2. You can
-also use the command line for giving more than one job entry. For each
---name option that fio sees, it will start a new job with that name.
-Command line entries following a --name entry will apply to that job,
-until there are no more entries or a new --name entry is seen. This is
-similar to the job file options, where each option applies to the current
-job until a new [] job entry is seen.
-
-fio does not need to run as root, except if the files or devices specified
-in the job section requires that. Some other options may also be restricted,
-such as memory locking, io scheduler switching, and decreasing the nice value.
-
-
-4.0 Job file format
--------------------
-As previously described, fio accepts one or more job files describing
-what it is supposed to do. The job file format is the classic ini file,
-where the names enclosed in [] brackets define the job name. You are free
-to use any ascii name you want, except 'global' which has special meaning.
-A global section sets defaults for the jobs described in that file. A job
-may override a global section parameter, and a job file may even have
-several global sections if so desired. A job is only affected by a global
-section residing above it. If the first character in a line is a ';' or a
-'#', the entire line is discarded as a comment.
+See the `examples/` directory for inspiration on how to write job files.  Note
+that the copyright and license requirements currently apply to `examples/` files.
 
 So let's look at a really simple job file that defines two processes, each
-randomly reading from a 128MB file.
+randomly reading from a 128MiB file:
 
-; -- start job file --
-[global]
-rw=randread
-size=128m
+.. code-block:: ini
 
-[job1]
+    ; -- start job file --
+    [global]
+    rw=randread
+    size=128m
 
-[job2]
+    [job1]
 
-; -- end job file --
+    [job2]
 
-As you can see, the job file sections themselves are empty as all the
-described parameters are shared. As no filename= option is given, fio
-makes up a filename for each of the jobs as it sees fit. On the command
-line, this job would look as follows:
+    ; -- end job file --
+
+As you can see, the job file sections themselves are empty as all the described
+parameters are shared. As no :option:`filename` option is given, fio makes up a
+`filename` for each of the jobs as it sees fit. On the command line, this job
+would look as follows::
 
 $ fio --name=global --rw=randread --size=128m --name=job1 --name=job2
 
 
-Let's look at an example that has a number of processes writing randomly
-to files.
+Let's look at an example that has a number of processes writing randomly to
+files:
 
-; -- start job file --
-[random-writers]
-ioengine=libaio
-iodepth=4
-rw=randwrite
-bs=32k
-direct=0
-size=64m
-numjobs=4
+.. code-block:: ini
 
-; -- end job file --
+    ; -- start job file --
+    [random-writers]
+    ioengine=libaio
+    iodepth=4
+    rw=randwrite
+    bs=32k
+    direct=0
+    size=64m
+    numjobs=4
+    ; -- end job file --
 
-Here we have no global section, as we only have one job defined anyway.
-We want to use async io here, with a depth of 4 for each file. We also
-increased the buffer size used to 32KB and define numjobs to 4 to
-fork 4 identical jobs. The result is 4 processes each randomly writing
-to their own 64MB file. Instead of using the above job file, you could
-have given the parameters on the command line. For this case, you would
-specify:
+Here we have no *global* section, as we only have one job defined anyway.  We
+want to use async I/O here, with a depth of 4 for each file. We also increase
+the buffer size used to 32KiB and set numjobs to 4 to fork 4 identical
+jobs. The result is 4 processes each randomly writing to their own 64MiB
+file. Instead of using the above job file, you could have given the parameters
+on the command line. For this case, you would specify::
 
 $ fio --name=random-writers --ioengine=libaio --iodepth=4 --rw=randwrite --bs=32k --direct=0 --size=64m --numjobs=4
 
 When fio is utilized as a basis of any reasonably large test suite, it might be
 desirable to share a set of standardized settings across multiple job files.
 Instead of copy/pasting such settings, any section may pull in an external
-.fio file with 'include filename' directive, as in the following example:
+:file:`filename.fio` file with *include filename* directive, as in the following
+example::
 
-; -- start job file including.fio --
-[global]
-filename=/tmp/test
-filesize=1m
-include glob-include.fio
+    ; -- start job file including.fio --
+    [global]
+    filename=/tmp/test
+    filesize=1m
+    include glob-include.fio
 
-[test]
-rw=randread
-bs=4k
-time_based=1
-runtime=10
-include test-include.fio
-; -- end job file including.fio --
+    [test]
+    rw=randread
+    bs=4k
+    time_based=1
+    runtime=10
+    include test-include.fio
+    ; -- end job file including.fio --
 
-; -- start job file glob-include.fio --
-thread=1
-group_reporting=1
-; -- end job file glob-include.fio --
+.. code-block:: ini
 
-; -- start job file test-include.fio --
-ioengine=libaio
-iodepth=4
-; -- end job file test-include.fio --
+    ; -- start job file glob-include.fio --
+    thread=1
+    group_reporting=1
+    ; -- end job file glob-include.fio --
 
-Settings pulled into a section apply to that section only (except global
-section). Include directives may be nested in that any included file may
-contain further include directive(s). Include files may not contain []
-sections.
+.. code-block:: ini
+
+    ; -- start job file test-include.fio --
+    ioengine=libaio
+    iodepth=4
+    ; -- end job file test-include.fio --
+
+Settings pulled into a section apply to that section only (except *global*
+section). Include directives may be nested in that any included file may contain
+further include directive(s). Include files may not contain [] sections.
 
 
-4.1 Environment variables
--------------------------
+Environment variables
+~~~~~~~~~~~~~~~~~~~~~
 
-fio also supports environment variable expansion in job files. Any
-substring of the form "${VARNAME}" as part of an option value (in other
-words, on the right of the `='), will be expanded to the value of the
-environment variable called VARNAME.  If no such environment variable
-is defined, or VARNAME is the empty string, the empty string will be
-substituted.
+Fio also supports environment variable expansion in job files. Any sub-string of
+the form ``${VARNAME}`` as part of an option value (in other words, on the right
+of the '='), will be expanded to the value of the environment variable called
+`VARNAME`.  If no such environment variable is defined, or `VARNAME` is the
+empty string, the empty string will be substituted.
 
-As an example, let's look at a sample fio invocation and job file:
+As an example, let's look at a sample fio invocation and job file::
 
 $ SIZE=64m NUMJOBS=4 fio jobfile.fio
 
-; -- start job file --
-[random-writers]
-rw=randwrite
-size=${SIZE}
-numjobs=${NUMJOBS}
-; -- end job file --
+.. code-block:: ini
+
+    ; -- start job file --
+    [random-writers]
+    rw=randwrite
+    size=${SIZE}
+    numjobs=${NUMJOBS}
+    ; -- end job file --
 
 This will expand to the following equivalent job file at runtime:
 
-; -- start job file --
-[random-writers]
-rw=randwrite
-size=64m
-numjobs=4
-; -- end job file --
+.. code-block:: ini
 
-fio ships with a few example job files, you can also look there for
-inspiration.
+    ; -- start job file --
+    [random-writers]
+    rw=randwrite
+    size=64m
+    numjobs=4
+    ; -- end job file --
 
-4.2 Reserved keywords
----------------------
+Fio ships with a few example job files; you can also look there for inspiration.
+
+Reserved keywords
+~~~~~~~~~~~~~~~~~
 
 Additionally, fio has a set of reserved keywords that will be replaced
 internally with the appropriate value. Those keywords are:
 
-$pagesize	The architecture page size of the running system
-$mb_memory	Megabytes of total memory in the system
-$ncpus		Number of online available CPUs
+**$pagesize**
+
+	The architecture page size of the running system.
+
+**$mb_memory**
+
+	Megabytes of total memory in the system.
+
+**$ncpus**
+
+	Number of online available CPUs.
 
 These can be used on the command line or in the job file, and will be
-automatically substituted with the current system values when the job
-is run. Simple math is also supported on these keywords, so you can
-perform actions like:
+automatically substituted with the current system values when the job is
+run. Simple math is also supported on these keywords, so you can perform actions
+like::
 
-size=8*$mb_memory
+        size=8*$mb_memory
 
-and get that properly expanded to 8 times the size of memory in the
-machine.
+and get that properly expanded to 8 times the size of memory in the machine.
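+
+As a further sketch (values purely illustrative), the keywords can be combined
+with other options to scale a job to the machine it runs on:
+
+.. code-block:: ini
+
+    ; -- start job file --
+    [scaled]
+    rw=randread
+    numjobs=$ncpus
+    size=4*$mb_memory
+    ; -- end job file --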
 
 
-5.0 Detailed list of parameters
--------------------------------
+Job file parameters
+-------------------
 
-This section describes in details each parameter associated with a job.
-Some parameters take an option of a given type, such as an integer or
-a string. Anywhere a numeric value is required, an arithmetic expression
-may be used, provided it is surrounded by parentheses. Supported operators
-are:
+This section describes in detail each parameter associated with a job.  Some
+parameters take an option of a given type, such as an integer or a
+string. Anywhere a numeric value is required, an arithmetic expression may be
+used, provided it is surrounded by parentheses. Supported operators are:
 
-	addition (+)
-	subtraction (-)
-	multiplication (*)
-	division (/)
-	modulus (%)
-	exponentiation (^)
+	- addition (+)
+	- subtraction (-)
+	- multiplication (*)
+	- division (/)
+	- modulus (%)
+	- exponentiation (^)
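+
+For example (an illustrative job-file line), with the default :option:`kb_base`
+the following two settings are equivalent::
+
+	size=(16*1024*1024)
+	size=16m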
 
 For time values in expressions, units are microseconds by default. This is
 different than for time values not in expressions (not enclosed in
 parentheses). The following types are used:
 
-str	String. This is a sequence of alpha characters.
-time	Integer with possible time suffix. In seconds unless otherwise
-	specified, use eg 10m for 10 minutes. Accepts s/m/h for seconds,
-	minutes, and hours, and accepts 'ms' (or 'msec') for milliseconds,
-	and 'us' (or 'usec') for microseconds.
-int	SI integer. A whole number value, which may contain a suffix
-	describing the base of the number. Accepted suffixes are k/m/g/t/p,
-	meaning kilo, mega, giga, tera, and peta. The suffix is not case
-	sensitive, and you may also include trailing 'b' (eg 'kb' is the same
-	as 'k'). So if you want to specify 4096, you could either write
-	out '4096' or just give 4k. The suffixes signify base 2 values, so
-	1024 is 1k and 1024k is 1m and so on, unless the suffix is explicitly
-	set to a base 10 value using 'kib', 'mib', 'gib', etc. If that is the
-	case, then 1000 is used as the multiplier. This can be handy for
-	disks, since manufacturers generally use base 10 values when listing
-	the capacity of a drive. If the option accepts an upper and lower
-	range, use a colon ':' or minus '-' to separate such values.  May also
-	include a prefix to indicate numbers base. If 0x is used, the number
-	is assumed to be hexadecimal.  See irange.
-bool	Boolean. Usually parsed as an integer, however only defined for
+
+Parameter types
+~~~~~~~~~~~~~~~
+
+**str**
+    String. This is a sequence of alpha characters.
+
+**time**
+	Integer with possible time suffix. In seconds unless otherwise
+	specified, use e.g. 10m for 10 minutes. Accepts s/m/h for seconds, minutes,
+	and hours, and accepts 'ms' (or 'msec') for milliseconds, and 'us' (or
+	'usec') for microseconds.
+
+.. _int:
+
+**int**
+	Integer. A whole number value, which may contain an integer prefix
+	and an integer suffix:
+
+        [*integer prefix*] **number** [*integer suffix*]
+
+	The optional *integer prefix* specifies the number's base. The default
+	is decimal. *0x* specifies hexadecimal.
+
+	The optional *integer suffix* specifies the number's units, and includes an
+	optional unit prefix and an optional unit.  For quantities of data, the
+	default unit is bytes. For quantities of time, the default unit is seconds.
+
+	With :option:`kb_base` =1000, fio follows international standards for unit
+	prefixes.  To specify power-of-10 decimal values defined in the
+	International System of Units (SI):
+
+		* *k* -- means kilo (K) or 1000
+		* *M* -- means mega (M) or 1000**2
+		* *G* -- means giga (G) or 1000**3
+		* *T* -- means tera (T) or 1000**4
+		* *P* -- means peta (P) or 1000**5
+
+	To specify power-of-2 binary values defined in IEC 80000-13:
+
+		* *Ki* -- means kibi (Ki) or 1024
+		* *Mi* -- means mebi (Mi) or 1024**2
+		* *Gi* -- means gibi (Gi) or 1024**3
+		* *Ti* -- means tebi (Ti) or 1024**4
+		* *Pi* -- means pebi (Pi) or 1024**5
+
+	With :option:`kb_base` =1024 (the default), the unit prefixes are opposite
+	from those specified in the SI and IEC 80000-13 standards to provide
+	compatibility with old scripts.  For example, 4k means 4096.
+
+	For quantities of data, an optional unit of 'B' may be included
+	(e.g.,  'kB' is the same as 'k').
+
+	The *integer suffix* is not case sensitive (e.g., m/mi mean mebi/mega,
+	not milli). 'b' and 'B' both mean byte, not bit.
+
+	Examples with :option:`kb_base` =1000:
+
+		* *4 KiB*: 4096, 4096b, 4096B, 4ki, 4kib, 4kiB, 4Ki, 4KiB
+		* *1 MiB*: 1048576, 1mi, 1024ki
+		* *1 MB*: 1000000, 1m, 1000k
+		* *1 TiB*: 1099511627776, 1ti, 1024gi, 1048576mi
+		* *1 TB*: 1000000000000, 1t, 1000g, 1000000m
+
+	Examples with :option:`kb_base` =1024 (default):
+
+		* *4 KiB*: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB
+		* *1 MiB*: 1048576, 1m, 1024k
+		* *1 MB*: 1000000, 1mi, 1000ki
+		* *1 TiB*: 1099511627776, 1t, 1024g, 1048576m
+		* *1 TB*: 1000000000000, 1ti, 1000gi, 1000000mi
+
+	To specify times (units are not case sensitive):
+
+		* *D* -- means days
+		* *H* -- means hours
+		* *M* -- means minutes
+		* *s* -- or *sec* means seconds (default)
+		* *ms* -- or *msec* means milliseconds
+		* *us* -- or *usec* means microseconds
+
+	If the option accepts an upper and lower range, use a colon ':' or
+	minus '-' to separate such values. See :ref:`irange <irange>`.
+	If the lower value specified happens to be larger than the upper value,
+	the two values are swapped.
+
+.. _bool:
+
+**bool**
+	Boolean. Usually parsed as an integer, however only defined for
 	true and false (1 and 0).
-irange	Integer range with suffix. Allows value range to be given, such
-	as 1024-4096. A colon may also be used as the separator, eg
-	1k:4k. If the option allows two sets of ranges, they can be
-	specified with a ',' or '/' delimiter: 1k-4k/8k-32k. Also see
-	int.
-float_list	A list of floating numbers, separated by a ':' character.
 
-With the above in mind, here follows the complete list of fio job
-parameters.
+.. _irange:
 
-name=str	ASCII name of the job. This may be used to override the
-		name printed by fio for this job. Otherwise the job
-		name is used. On the command line this parameter has the
-		special purpose of also signaling the start of a new
-		job.
+**irange**
+	Integer range with suffix. Allows value range to be given, such as
+	1024-4096. A colon may also be used as the separator, e.g. 1k:4k. If the
+	option allows two sets of ranges, they can be specified with a ',' or '/'
+	delimiter: 1k-4k/8k-32k. Also see :ref:`int <int>`.
 
-description=str	Text description of the job. Doesn't do anything except
-		dump this text description when this job is run. It's
-		not parsed.
+**float_list**
+	A list of floating point numbers, separated by a ':' character.
 
-directory=str	Prefix filenames with this directory. Used to place files
-		in a different location than "./". See the 'filename' option
-		for escaping certain characters.
 
-filename=str	Fio normally makes up a filename based on the job name,
-		thread number, and file number. If you want to share
-		files between threads in a job or several jobs, specify
-		a filename for each of them to override the default. If
-		the ioengine used is 'net', the filename is the host, port,
-		and protocol to use in the format of =host,port,protocol.
-		See ioengine=net for more. If the ioengine is file based, you
-		can specify a number of files by separating the names with a
-		':' colon. So if you wanted a job to open /dev/sda and /dev/sdb
-		as the two working files, you would use
-		filename=/dev/sda:/dev/sdb. On Windows, disk devices are
-		accessed as \\.\PhysicalDrive0 for the first device,
-		\\.\PhysicalDrive1 for the second etc. Note: Windows and
-		FreeBSD prevent write access to areas of the disk containing
-		in-use data (e.g. filesystems).
-		If the wanted filename does need to include a colon, then
-		escape that with a '\' character. For instance, if the filename
-		is "/dev/dsk/foo@3,0:c", then you would use
-		filename="/dev/dsk/foo@3,0\:c". '-' is a reserved name, meaning
-		stdin or stdout. Which of the two depends on the read/write
-		direction set.
+Units
+~~~~~
 
-filename_format=str
-		If sharing multiple files between jobs, it is usually necessary
-		to  have fio generate the exact names that you want. By default,
-		fio will name a file based on the default file format
-		specification of jobname.jobnumber.filenumber. With this
-		option, that can be customized. Fio will recognize and replace
-		the following keywords in this string:
+.. option:: kb_base=int
 
-		$jobname
-			The name of the worker thread or process.
+	Select the interpretation of unit prefixes in input parameters.
 
-		$jobnum
-			The incremental number of the worker thread or
-			process.
+		**1000**
+			Inputs comply with IEC 80000-13 and the International
+			System of Units (SI). Use:
 
-		$filenum
-			The incremental number of the file for that worker
-			thread or process.
+				- power-of-2 values with IEC prefixes (e.g., KiB)
+				- power-of-10 values with SI prefixes (e.g., kB)
 
-		To have dependent jobs share a set of files, this option can
-		be set to have fio generate filenames that are shared between
-		the two. For instance, if testfiles.$filenum is specified,
-		file number 4 for any job will be named testfiles.4. The
-		default of $jobname.$jobnum.$filenum will be used if
-		no other format specifier is given.
+		**1024**
+			Compatibility mode (default).  To avoid breaking old scripts:
 
-opendir=str	Tell fio to recursively add any file it can find in this
-		directory and down the file system tree.
+				- power-of-2 values with SI prefixes
+				- power-of-10 values with IEC prefixes
 
-lockfile=str	Fio defaults to not locking any files before it does
-		IO to them. If a file or file descriptor is shared, fio
-		can serialize IO to that file to make the end result
-		consistent. This is usual for emulating real workloads that
-		share files. The lock modes are:
+	See :option:`bs` for more details on input parameters.
 
-			none		No locking. The default.
-			exclusive	Only one thread/process may do IO,
-					excluding all others.
-			readwrite	Read-write locking on the file. Many
-					readers may access the file at the
-					same time, but writes get exclusive
-					access.
+	Outputs always use correct prefixes.  Most outputs include both
+	side-by-side, like::
 
-readwrite=str
-rw=str		Type of io pattern. Accepted values are:
+		bw=2383.3kB/s (2327.4KiB/s)
 
-			read		Sequential reads
-			write		Sequential writes
-			randwrite	Random writes
-			randread	Random reads
-			rw,readwrite	Sequential mixed reads and writes
-			randrw		Random mixed reads and writes
+	If only one value is reported, then kb_base selects the one to use:
 
-		For the mixed io types, the default is to split them 50/50.
-		For certain types of io the result may still be skewed a bit,
-		since the speed may be different. It is possible to specify
-		a number of IO's to do before getting a new offset, this is
-		done by appending a ':<nr>' to the end of the string given.
-		For a random read, it would look like 'rw=randread:8' for
-		passing in an offset modifier with a value of 8. If the
-		suffix is used with a sequential IO pattern, then the value
-		specified will be added to the generated offset for each IO.
-		For instance, using rw=write:4k will skip 4k for every
-		write. It turns sequential IO into sequential IO with holes.
-		See the 'rw_sequencer' option.
+		**1000** -- SI prefixes
 
-rw_sequencer=str If an offset modifier is given by appending a number to
-		the rw=<str> line, then this option controls how that
-		number modifies the IO offset being generated. Accepted
-		values are:
+		**1024** -- IEC prefixes
 
-			sequential	Generate sequential offset
-			identical	Generate the same offset
+.. option:: unit_base=int
 
-		'sequential' is only useful for random IO, where fio would
-		normally generate a new random offset for every IO. If you
-		append eg 8 to randread, you would get a new random offset for
-		every 8 IO's. The result would be a seek for only every 8
-		IO's, instead of for every IO. Use rw=randread:8 to specify
-		that. As sequential IO is already sequential, setting
-		'sequential' for that would not result in any differences.
-		'identical' behaves in a similar fashion, except it sends
-		the same offset 8 number of times before generating a new
-		offset.
+	Base unit for reporting.  Allowed values are:
 
-kb_base=int	The base unit for a kilobyte. The defacto base is 2^10, 1024.
-		Storage manufacturers like to use 10^3 or 1000 as a base
-		ten unit instead, for obvious reasons. Allow values are
-		1024 or 1000, with 1024 being the default.
+	**0**
+		Use auto-detection (default).
+	**8**
+		Byte based.
+	**1**
+		Bit based.
 
-unified_rw_reporting=bool	Fio normally reports statistics on a per
-		data direction basis, meaning that read, write, and trim are
-		accounted and reported separately. If this option is set,
-		the fio will sum the results and report them as "mixed"
-		instead.
 
-randrepeat=bool	For random IO workloads, seed the generator in a predictable
-		way so that results are repeatable across repetitions.
+With the above in mind, here follows the complete list of fio job parameters.
 
-randseed=int	Seed the random number generators based on this seed value, to
-		be able to control what sequence of output is being generated.
-		If not set, the random sequence depends on the randrepeat
-		setting.
 
-fallocate=str	Whether pre-allocation is performed when laying down files.
-		Accepted values are:
+Job description
+~~~~~~~~~~~~~~~
 
-			none		Do not pre-allocate space
-			posix		Pre-allocate via posix_fallocate()
-			keep		Pre-allocate via fallocate() with
-					FALLOC_FL_KEEP_SIZE set
-			0		Backward-compatible alias for 'none'
-			1		Backward-compatible alias for 'posix'
+.. option:: name=str
 
-		May not be available on all supported platforms. 'keep' is only
-		available on Linux.If using ZFS on Solaris this must be set to
-		'none' because ZFS doesn't support it. Default: 'posix'.
+	ASCII name of the job. This may be used to override the name printed by fio
+	for this job. Otherwise the job name is used. On the command line this
+	parameter has the special purpose of also signaling the start of a new job.
 
-fadvise_hint=bool By default, fio will use fadvise() to advise the kernel
-		on what IO patterns it is likely to issue. Sometimes you
-		want to test specific IO patterns without telling the
-		kernel about it, in which case you can disable this option.
-		If set, fio will use POSIX_FADV_SEQUENTIAL for sequential
-		IO and POSIX_FADV_RANDOM for random IO.
+.. option:: description=str
 
-size=int	The total size of file io for this job. Fio will run until
-		this many bytes has been transferred, unless runtime is
-		limited by other options (such as 'runtime', for instance,
-		or increased/decreased by 'io_size'). Unless specific nrfiles
-		and filesize options are given, fio will divide this size
-		between the available files specified by the job. If not set,
-		fio will use the full size of the given files or devices.
-		If the files do not exist, size must be given. It is also
-		possible to give size as a percentage between 1 and 100. If
-		size=20% is given, fio will use 20% of the full size of the
-		given files or devices.
+	Text description of the job. Doesn't do anything except dump this text
+	description when this job is run. It's not parsed.
 
-io_size=int
-io_limit=int	Normally fio operates within the region set by 'size', which
-		means that the 'size' option sets both the region and size of
-		IO to be performed. Sometimes that is not what you want. With
-		this option, it is possible to define just the amount of IO
-		that fio should do. For instance, if 'size' is set to 20G and
-		'io_size' is set to 5G, fio will perform IO within the first
-		20G but exit when 5G have been done. The opposite is also
-		possible - if 'size' is set to 20G, and 'io_size' is set to
-		40G, then fio will do 40G of IO within the 0..20G region.
+.. option:: loops=int
 
-filesize=int	Individual file sizes. May be a range, in which case fio
-		will select sizes for files at random within the given range
-		and limited to 'size' in total (if that is given). If not
-		given, each created file is the same size.
+	Run the specified number of iterations of this job. Used to repeat the same
+	workload a given number of times. Defaults to 1.
 
-file_append=bool	Perform IO after the end of the file. Normally fio will
-		operate within the size of a file. If this option is set, then
-		fio will append to the file instead. This has identical
-		behavior to setting offset to the size of a file. This option
-		is ignored on non-regular files.
+.. option:: numjobs=int
 
-fill_device=bool
-fill_fs=bool	Sets size to something really large and waits for ENOSPC (no
-		space left on device) as the terminating condition. Only makes
-		sense with sequential write. For a read workload, the mount
-		point will be filled first then IO started on the result. This
-		option doesn't make sense if operating on a raw device node,
-		since the size of that is already known by the file system.
-		Additionally, writing beyond end-of-device will not return
-		ENOSPC there.
+	Create the specified number of clones of this job. Each clone of the job
+	is spawned as an independent thread or process. May be used to set up a
+	larger number of threads/processes doing the same thing. Each thread is
+	reported separately; to see statistics for all clones as a whole, use
+	:option:`group_reporting` in conjunction with :option:`new_group`.
+	See :option:`--max-jobs`.
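+
+	As a sketch (sizes illustrative), the following job file spawns four
+	identical clones and, with :option:`group_reporting` set, reports them as a
+	single group:
+
+	.. code-block:: ini
+
+	    [clones]
+	    rw=randread
+	    size=32m
+	    numjobs=4
+	    group_reporting
+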
 
-blocksize=int
-bs=int		The block size used for the io units. Defaults to 4k. Values
-		can be given for both read and writes. If a single int is
-		given, it will apply to both. If a second int is specified
-		after a comma, it will apply to writes only. In other words,
-		the format is either bs=read_and_write or bs=read,write,trim.
-		bs=4k,8k will thus use 4k blocks for reads, 8k blocks for
-		writes, and 8k for trims. You can terminate the list with
-		a trailing comma. bs=4k,8k, would use the default value for
-		trims.. If you only wish to set the write size, you
-		can do so by passing an empty read size - bs=,8k will set
-		8k for writes and leave the read default value.
 
-blockalign=int
-ba=int		At what boundary to align random IO offsets. Defaults to
-		the same as 'blocksize' the minimum blocksize given.
-		Minimum alignment is typically 512b for using direct IO,
-		though it usually depends on the hardware block size. This
-		option is mutually exclusive with using a random map for
-		files, so it will turn off that option.
+Time related parameters
+~~~~~~~~~~~~~~~~~~~~~~~
 
-blocksize_range=irange
-bsrange=irange	Instead of giving a single block size, specify a range
-		and fio will mix the issued io block sizes. The issued
-		io unit will always be a multiple of the minimum value
-		given (also see bs_unaligned). Applies to both reads and
-		writes, however a second range can be given after a comma.
-		See bs=.
+.. option:: runtime=time
 
-bssplit=str	Sometimes you want even finer grained control of the
-		block sizes issued, not just an even split between them.
-		This option allows you to weight various block sizes,
-		so that you are able to define a specific amount of
-		block sizes issued. The format for this option is:
+	Tell fio to terminate processing after the specified period of time.  It
+	can be quite hard to determine for how long a specified job will run, so
+	this parameter is handy to cap the total runtime to a given time.  When
+	the unit is omitted, the value is given in seconds.
 
-			bssplit=blocksize/percentage:blocksize/percentage
+.. option:: time_based
 
-		for as many block sizes as needed. So if you want to define
-		a workload that has 50% 64k blocks, 10% 4k blocks, and
-		40% 32k blocks, you would write:
+	If set, fio will run for the duration of the :option:`runtime` specified
+	even if the file(s) are completely read or written. It will simply loop over
+	the same workload as many times as the :option:`runtime` allows.
 
-			bssplit=4k/10:64k/50:32k/40
+.. option:: startdelay=irange(time)
 
-		Ordering does not matter. If the percentage is left blank,
-		fio will fill in the remaining values evenly. So a bssplit
-		option like this one:
+	Delay start of job for the specified number of seconds. Supports all time
+	suffixes to allow specification of hours, minutes, seconds and milliseconds
+	-- seconds are the default if a unit is omitted.  Can be given as a range
+	which causes each thread to choose randomly out of the range.
 
-			bssplit=4k/50:1k/:32k/
+.. option:: ramp_time=time
 
-		would have 50% 4k ios, and 25% 1k and 32k ios. The percentages
-		always add up to 100, if bssplit is given a range that adds
-		up to more, it will error out.
+	If set, fio will run the specified workload for this amount of time before
+	logging any performance numbers. Useful for letting performance settle
+	before logging results, thus minimizing the runtime required for stable
+	results. Note that the ``ramp_time`` is considered lead in time for a job,
+	thus it will increase the total runtime if a special timeout or
+	:option:`runtime` is specified.  When the unit is omitted, the value is
+	given in seconds.
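+
+	As a sketch combining the options above (values illustrative), the job below
+	warms up for 10 seconds and then loops over a 128MiB file, logging results
+	for exactly 2 minutes:
+
+	.. code-block:: ini
+
+	    [timed-read]
+	    rw=read
+	    size=128m
+	    time_based
+	    runtime=2m
+	    ramp_time=10s
+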
 
-		bssplit also supports giving separate splits to reads and
-		writes. The format is identical to what bs= accepts. You
-		have to separate the read and write parts with a comma. So
-		if you want a workload that has 50% 2k reads and 50% 4k reads,
-		while having 90% 4k writes and 10% 8k writes, you would
-		specify:
+.. option:: clocksource=str
 
-		bssplit=2k/50:4k/50,4k/90:8k/10
+	Use the given clocksource as the base of timing. The supported options are:
 
-blocksize_unaligned
-bs_unaligned	If this option is given, any byte size value within bsrange
-		may be used as a block range. This typically wont work with
-		direct IO, as that normally requires sector alignment.
+		**gettimeofday**
+			:manpage:`gettimeofday(2)`
 
-bs_is_seq_rand	If this option is set, fio will use the normal read,write
-		blocksize settings as sequential,random instead. Any random
-		read or write will use the WRITE blocksize settings, and any
-		sequential read or write will use the READ blocksize setting.
+		**clock_gettime**
+			:manpage:`clock_gettime(2)`
 
-zero_buffers	If this option is given, fio will init the IO buffers to
-		all zeroes. The default is to fill them with random data.
-		The resulting IO buffers will not be completely zeroed,
-		unless scramble_buffers is also turned off.
+		**cpu**
+			Internal CPU clock source
 
-refill_buffers	If this option is given, fio will refill the IO buffers
-		on every submit. The default is to only fill it at init
-		time and reuse that data. Only makes sense if zero_buffers
-		isn't specified, naturally. If data verification is enabled,
-		refill_buffers is also automatically enabled.
+	cpu is the preferred clocksource if it is reliable, as it is very fast (and
+	fio is heavy on time calls). Fio will automatically use this clocksource if
+	it's supported and considered reliable on the system it is running on,
+	unless another clocksource is specifically set. For x86/x86-64 CPUs, this
+	means the CPU must support an invariant TSC.
 
-scramble_buffers=bool	If refill_buffers is too costly and the target is
-		using data deduplication, then setting this option will
-		slightly modify the IO buffer contents to defeat normal
-		de-dupe attempts. This is not enough to defeat more clever
-		block compression attempts, but it will stop naive dedupe of
-		blocks. Default: true.
+.. option:: gtod_reduce=bool
 
-buffer_compress_percentage=int	If this is set, then fio will attempt to
-		provide IO buffer content (on WRITEs) that compress to
-		the specified level. Fio does this by providing a mix of
-		random data and a fixed pattern. The fixed pattern is either
-		zeroes, or the pattern specified by buffer_pattern. If the
-		pattern option is used, it might skew the compression ratio
-		slightly. Note that this is per block size unit, for file/disk
-		wide compression level that matches this setting, you'll also
-		want to set refill_buffers.
+	Enable all of the :manpage:`gettimeofday(2)` reducing options
+	(:option:`disable_clat`, :option:`disable_slat`, :option:`disable_bw_measurement`) plus
+	reduce precision of the timeout somewhat to really shrink the
+	:manpage:`gettimeofday(2)` call count. With this option enabled, we only do
+	about 0.4% of the :manpage:`gettimeofday(2)` calls we would have done if all
+	time keeping was enabled.
 
-buffer_compress_chunk=int	See buffer_compress_percentage. This
-		setting allows fio to manage how big the ranges of random
-		data and zeroed data is. Without this set, fio will
-		provide buffer_compress_percentage of blocksize random
-		data, followed by the remaining zeroed. With this set
-		to some chunk size smaller than the block size, fio can
-		alternate random and zeroed data throughout the IO
-		buffer.
+.. option:: gtod_cpu=int
 
-buffer_pattern=str	If set, fio will fill the io buffers with this
-		pattern. If not set, the contents of io buffers is defined by
-		the other options related to buffer contents. The setting can
-		be any pattern of bytes, and can be prefixed with 0x for hex
-		values. It may also be a string, where the string must then
-		be wrapped with "".
+	Sometimes it's cheaper to dedicate a single thread of execution to just
+	getting the current time. Fio (and databases, for instance) are very
+	intensive on :manpage:`gettimeofday(2)` calls. With this option, you can set
+	one CPU aside for doing nothing but logging current time to a shared memory
+	location. Then the other threads/processes that run I/O workloads need only
+	copy that segment, instead of entering the kernel with a
+	:manpage:`gettimeofday(2)` call. The CPU set aside for doing these time
+	calls will be excluded from other uses. Fio will manually clear it from the
+	CPU mask of other jobs.
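+
+	As a sketch (the CPU number is purely illustrative), a job file could
+	reserve CPU 0 for timekeeping like this:
+
+	.. code-block:: ini
+
+	    [global]
+	    gtod_cpu=0
+
+	    [timed-job]
+	    rw=randread
+	    size=64m
+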
 
-dedupe_percentage=int	If set, fio will generate this percentage of
-		identical buffers when writing. These buffers will be
-		naturally dedupable. The contents of the buffers depend on
-		what other buffer compression settings have been set. It's
-		possible to have the individual buffers either fully
-		compressible, or not at all. This option only controls the
-		distribution of unique buffers.
 
-nrfiles=int	Number of files to use for this job. Defaults to 1.
+Target file/device
+~~~~~~~~~~~~~~~~~~
 
-openfiles=int	Number of files to keep open at the same time. Defaults to
-		the same as nrfiles, can be set smaller to limit the number
-		simultaneous opens.
+.. option:: directory=str
 
-file_service_type=str  Defines how fio decides which file from a job to
-		service next. The following types are defined:
+	Prefix filenames with this directory. Used to place files in a different
+	location than :file:`./`.  You can specify a number of directories by
+	separating the names with a ':' character. These directories will be
+	distributed equally among the job clones created with :option:`numjobs`, as
+	long as those clones use generated filenames. If specific `filename(s)` are
+	set, fio will use the first listed directory only; this matches the
+	`filename` semantics, where each clone generates its own file unless a
+	`filename` is given, in which case all clones share the same file.
 
-			random	Just choose a file at random.
+	See the :option:`filename` option for escaping certain characters.
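+
+	As a sketch, a job fragment such as the following (paths are illustrative)
+	would spread the generated files of the four clones across two directories::
+
+		directory=/mnt/test1:/mnt/test2
+		numjobs=4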
 
-			roundrobin  Round robin over open files. This
-				is the default.
+.. option:: filename=str
 
-			sequential  Finish one file before moving on to
-				the next. Multiple files can still be
-				open depending on 'openfiles'.
+	Fio normally makes up a `filename` based on the job name, thread number, and
+	file number. If you want to share files between threads in a job or several
+	jobs with fixed file paths, specify a `filename` for each of them to override
+	the default. If the ioengine is file based, you can specify a number of files
+	by separating the names with a ':' character. So if you wanted a job to open
+	:file:`/dev/sda` and :file:`/dev/sdb` as the two working files, you would use
+	``filename=/dev/sda:/dev/sdb``. This also means that whenever this option is
+	specified, :option:`nrfiles` is ignored. The size of regular files specified
+	by this option will be :option:`size` divided by the number of files unless
+	an explicit size is specified by :option:`filesize`.
 
-		The string can have a number appended, indicating how
-		often to switch to a new file. So if option random:4 is
-		given, fio will switch to a new random file after 4 ios
-		have been issued.
+	On Windows, disk devices are accessed as :file:`\\\\.\\PhysicalDrive0` for
+	the first device, :file:`\\\\.\\PhysicalDrive1` for the second etc.
+	Note: Windows and FreeBSD prevent write access to areas
+	of the disk containing in-use data (e.g. filesystems).  If the wanted
+	`filename` does need to include a colon, then escape that with a ``\``
+	character. For instance, if the `filename` is :file:`/dev/dsk/foo@3,0:c`,
+	then you would use ``filename="/dev/dsk/foo@3,0\:c"``.  The
+	:file:`-` is a reserved name, meaning stdin or stdout.  Which of the two
+	depends on the read/write direction set.
 
-ioengine=str	Defines how the job issues io to the file. The following
-		types are defined:
+.. option:: filename_format=str
 
-			sync	Basic read(2) or write(2) io. lseek(2) is
-				used to position the io location.
+	If sharing multiple files between jobs, it is usually necessary to have fio
+	generate the exact names that you want. By default, fio will name a file
+	based on the default file format specification of
+	:file:`jobname.jobnumber.filenumber`. With this option, that can be
+	customized. Fio will recognize and replace the following keywords in this
+	string:
 
-			psync 	Basic pread(2) or pwrite(2) io.
+		**$jobname**
+				The name of the worker thread or process.
+		**$jobnum**
+				The incremental number of the worker thread or process.
+		**$filenum**
+				The incremental number of the file for that worker thread or
+				process.
 
-			vsync	Basic readv(2) or writev(2) IO.
+	To have dependent jobs share a set of files, this option can be set to have
+	fio generate filenames that are shared between the two. For instance, if
+	:file:`testfiles.$filenum` is specified, file number 4 for any job will be
+	named :file:`testfiles.4`. The default of :file:`$jobname.$jobnum.$filenum`
+	will be used if no other format specifier is given.
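+
+	For example, the job fragment below (the base name is arbitrary) makes every
+	job use the shared set of files :file:`testfiles.0`, :file:`testfiles.1`,
+	and so on::
+
+		filename_format=testfiles.$filenum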
 
-			psyncv	Basic preadv(2) or pwritev(2) IO.
+.. option:: unique_filename=bool
 
-			libaio	Linux native asynchronous io. Note that Linux
-				may only support queued behaviour with
-				non-buffered IO (set direct=1 or buffered=0).
-				This engine defines engine specific options.
+	To avoid collisions between networked clients, fio defaults to prefixing any
+	generated filenames (with a directory specified) with the source of the
+	client connecting. To disable this behavior, set this option to 0.
 
-			posixaio glibc posix asynchronous io.
+.. option:: opendir=str
 
-			solarisaio Solaris native asynchronous io.
+	Recursively open any files below directory `str`.
 
-			windowsaio Windows native asynchronous io.
+.. option:: lockfile=str
 
-			mmap	File is memory mapped and data copied
-				to/from using memcpy(3).
+	Fio defaults to not locking any files before it does I/O to them. If a file
+	or file descriptor is shared, fio can serialize I/O to that file to make the
+	end result consistent. This is useful for emulating real workloads that share
+	files. The lock modes are:
 
-			splice	splice(2) is used to transfer the data and
-				vmsplice(2) to transfer data from user
-				space to the kernel.
+		**none**
+			No locking. The default.
+		**exclusive**
+			Only one thread or process may do I/O at a time, excluding all
+			others.
+		**readwrite**
+			Read-write locking on the file. Many readers may
+			access the file at the same time, but writes get exclusive access.
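+
+	For example, two jobs could share and serialize access to one (illustrative)
+	file by giving both of them::
+
+		filename=/tmp/shared.dat
+		lockfile=exclusive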
 
-			syslet-rw Use the syslet system calls to make
-				regular read/write async.
+.. option:: nrfiles=int
 
-			sg	SCSI generic sg v3 io. May either be
-				synchronous using the SG_IO ioctl, or if
-				the target is an sg character device
-				we use read(2) and write(2) for asynchronous
-				io.
+	Number of files to use for this job. Defaults to 1. The size of files
+	will be :option:`size` divided by this unless explicit size is specified by
+	:option:`filesize`. Files are created for each thread separately, and each
+	file will have a file number within its name by default, as explained in
+	:option:`filename` section.
 
-			null	Doesn't transfer any data, just pretends
-				to. This is mainly used to exercise fio
-				itself and for debugging/testing purposes.
 
-			net	Transfer over the network to given host:port.
-				Depending on the protocol used, the hostname,
-				port, listen and filename options are used to
-				specify what sort of connection to make, while
-				the protocol option determines which protocol
-				will be used.
-				This engine defines engine specific options.
+.. option:: openfiles=int
 
-			netsplice Like net, but uses splice/vmsplice to
-				map data and send/receive.
-				This engine defines engine specific options.
+	Number of files to keep open at the same time. Defaults to the same as
+	:option:`nrfiles`; it can be set smaller to limit the number of simultaneous
+	opens.
 
-			cpuio	Doesn't transfer any data, but burns CPU
-				cycles according to the cpuload= and
-				cpucycle= options. Setting cpuload=85
-				will cause that job to do nothing but burn
-				85% of the CPU. In case of SMP machines,
-				use numjobs=<no_of_cpu> to get desired CPU
-				usage, as the cpuload only loads a single
-				CPU at the desired rate.
+.. option:: file_service_type=str
 
-			guasi	The GUASI IO engine is the Generic Userspace
-				Asyncronous Syscall Interface approach
-				to async IO. See
+	Defines how fio decides which file from a job to service next. The following
+	types are defined:
 
-				http://www.xmailserver.org/guasi-lib.html
+		**random**
+			Choose a file at random.
 
-				for more info on GUASI.
+		**roundrobin**
+			Round robin over opened files. This is the default.
 
-			rdma    The RDMA I/O engine  supports  both  RDMA
-				memory semantics (RDMA_WRITE/RDMA_READ) and
-				channel semantics (Send/Recv) for the
-				InfiniBand, RoCE and iWARP protocols.
+		**sequential**
+			Finish one file before moving on to the next. Multiple files can
+			still be open depending on :option:`openfiles`.
 
-			falloc	IO engine that does regular fallocate to
-				simulate data transfer as fio ioengine.
-				DDIR_READ  does fallocate(,mode = keep_size,)
-				DDIR_WRITE does fallocate(,mode = 0)
-				DDIR_TRIM  does fallocate(,mode = punch_hole)
+		**zipf**
+			Use a *Zipf* distribution to decide what file to access.
 
-			e4defrag IO engine that does regular EXT4_IOC_MOVE_EXT
-				ioctls to simulate defragment activity in
-				request to DDIR_WRITE event
+		**pareto**
+			Use a *Pareto* distribution to decide what file to access.
 
-			rbd	IO engine supporting direct access to Ceph
-				Rados Block Devices (RBD) via librbd without
-				the need to use the kernel rbd driver. This
-				ioengine defines engine specific options.
+		**gauss**
+			Use a *Gaussian* (normal) distribution to decide what file to
+			access.
 
-			gfapi	Using Glusterfs libgfapi sync interface to
-				direct access to Glusterfs volumes without
-				options.
+	For *random*, *roundrobin*, and *sequential*, a postfix can be appended to
+	tell fio how many I/Os to issue before switching to a new file. For example,
+	specifying ``file_service_type=random:8`` would cause fio to issue
+	8 I/Os before selecting a new file at random. For the non-uniform
+	distributions, a floating point postfix can be given to influence how the
+	distribution is skewed. See :option:`random_distribution` for a description
+	of how that would work.
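+
+	As a sketch, a line such as::
+
+		file_service_type=zipf:1.2
+
+	would skew file selection using a Zipf distribution with a theta of 1.2
+	(the value here is only an example).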
 
-			gfapi_async Using Glusterfs libgfapi async interface
-				to direct access to Glusterfs volumes without
-				having to go through FUSE. This ioengine
-				defines engine specific options.
+.. option:: ioscheduler=str
 
-			libhdfs	Read and write through Hadoop (HDFS).
-				The 'filename' option is used to specify host,
-				port of the hdfs name-node to connect. This
-				engine interprets offsets a little
-				differently. In HDFS, files once created
-				cannot be modified. So random writes are not
-				possible. To imitate this, libhdfs engine
-				expects bunch of small files to be created
-				over HDFS, and engine will randomly pick a
-				file out of those files based on the offset
-				generated by fio backend. (see the example
-				job file to create such files, use rw=write
-				option). Please note, you might want to set
-				necessary environment variables to work with
-				hdfs/libhdfs properly.
+	Attempt to switch the device hosting the file to the specified I/O scheduler
+	before running.
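+
+	For example, assuming the ``noop`` scheduler is available for the device
+	hosting the file::
+
+		ioscheduler=noop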
 
-			external Prefix to specify loading an external
-				IO engine object file. Append the engine
-				filename, eg ioengine=external:/tmp/foo.o
-				to load ioengine foo.o in /tmp.
+.. option:: create_serialize=bool
 
-iodepth=int	This defines how many io units to keep in flight against
-		the file. The default is 1 for each file defined in this
-		job, can be overridden with a larger value for higher
-		concurrency. Note that increasing iodepth beyond 1 will not
-		affect synchronous ioengines (except for small degress when
-		verify_async is in use). Even async engines may impose OS
-		restrictions causing the desired depth not to be achieved.
-		This may happen on Linux when using libaio and not setting
-		direct=1, since buffered IO is not async on that OS. Keep an
-		eye on the IO depth distribution in the fio output to verify
-		that the achieved depth is as expected. Default: 1.
+	If true, serialize the file creation for the jobs.  This may be handy to
+	avoid interleaving of data files, which may greatly depend on the filesystem
+	used and even the number of processors in the system.
 
-iodepth_batch_submit=int
-iodepth_batch=int This defines how many pieces of IO to submit at once.
-		It defaults to 1 which means that we submit each IO
-		as soon as it is available, but can be raised to submit
-		bigger batches of IO at the time.
+.. option:: create_fsync=bool
 
-iodepth_batch_complete=int This defines how many pieces of IO to retrieve
-		at once. It defaults to 1 which means that we'll ask
-		for a minimum of 1 IO in the retrieval process from
-		the kernel. The IO retrieval will go on until we
-		hit the limit set by iodepth_low. If this variable is
-		set to 0, then fio will always check for completed
-		events before queuing more IO. This helps reduce
-		IO latency, at the cost of more retrieval system calls.
+	fsync the data file after creation. This is the default.
 
-iodepth_low=int	The low water mark indicating when to start filling
-		the queue again. Defaults to the same as iodepth, meaning
-		that fio will attempt to keep the queue full at all times.
-		If iodepth is set to eg 16 and iodepth_low is set to 4, then
-		after fio has filled the queue of 16 requests, it will let
-		the depth drain down to 4 before starting to fill it again.
+.. option:: create_on_open=bool
 
-direct=bool	If value is true, use non-buffered io. This is usually
-		O_DIRECT. Note that ZFS on Solaris doesn't support direct io.
-		On Windows the synchronous ioengines don't support direct io.
+	Don't pre-set up the files for I/O; instead, create and open them when it's
+	time to do I/O to that file.
 
-atomic=bool	If value is true, attempt to use atomic direct IO. Atomic
-		writes are guaranteed to be stable once acknowledged by
-		the operating system. Only Linux supports O_ATOMIC right
-		now.
+.. option:: create_only=bool
 
-buffered=bool	If value is true, use buffered io. This is the opposite
-		of the 'direct' option. Defaults to true.
+	If true, fio will only run the setup phase of the job.  If files need to be
+	laid out or updated on disk, only that will be done. The actual job contents
+	are not executed.
 
-offset=int	Start io at the given offset in the file. The data before
-		the given offset will not be touched. This effectively
-		caps the file size at real_size - offset.
+.. option:: allow_file_create=bool
 
-offset_increment=int	If this is provided, then the real offset becomes
-		offset + offset_increment * thread_number, where the thread
-		number is a counter that starts at 0 and is incremented for
-		each sub-job (i.e. when numjobs option is specified). This
-		option is useful if there are several jobs which are intended
-		to operate on a file in parallel disjoint segments, with
-		even spacing between the starting points.
+	If true, fio is permitted to create files as part of its workload. This is
+	the default behavior. If this option is false, then fio will error out if
+	the files it needs to use don't already exist. Default: true.
 
-number_ios=int	Fio will normally perform IOs until it has exhausted the size
-		of the region set by size=, or if it exhaust the allocated
-		time (or hits an error condition). With this setting, the
-		range/size can be set independently of the number of IOs to
-		perform. When fio reaches this number, it will exit normally
-		and report status. Note that this does not extend the amount
-		of IO that will be done, it will only stop fio if this
-		condition is met before other end-of-job criteria.
+.. option:: allow_mounted_write=bool
 
-fsync=int	If writing to a file, issue a sync of the dirty data
-		for every number of blocks given. For example, if you give
-		32 as a parameter, fio will sync the file for every 32
-		writes issued. If fio is using non-buffered io, we may
-		not sync the file. The exception is the sg io engine, which
-		synchronizes the disk cache anyway.
+	If this isn't set, fio will abort jobs that are destructive (e.g. that write)
+	to what appears to be a mounted device or partition. This should help catch
+	tests that are inadvertently destructive, written without realizing that
+	they will destroy data on the mounted file system. Note that some platforms
+	don't allow writing against a mounted device regardless of this option.
+	Default: false.
 
-fdatasync=int	Like fsync= but uses fdatasync() to only sync data and not
-		metadata blocks.
-		In FreeBSD and Windows there is no fdatasync(), this falls back to
-		using fsync()
+.. option:: pre_read=bool
 
-sync_file_range=str:val	Use sync_file_range() for every 'val' number of
-		write operations. Fio will track range of writes that
-		have happened since the last sync_file_range() call. 'str'
-		can currently be one or more of:
+	If this is given, files will be pre-read into memory before starting the
+	given I/O operation. This will also clear the :option:`invalidate` flag,
+	since it is pointless to pre-read and then drop the cache. This will only
+	work for I/O engines that are seek-able, since they allow you to read the
+	same data multiple times. Thus it will not work on e.g. network or splice I/O.
 
-		wait_before	SYNC_FILE_RANGE_WAIT_BEFORE
-		write		SYNC_FILE_RANGE_WRITE
-		wait_after	SYNC_FILE_RANGE_WAIT_AFTER
+.. option:: unlink=bool
 
-		So if you do sync_file_range=wait_before,write:8, fio would
-		use SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE for
-		every 8 writes. Also see the sync_file_range(2) man page.
-		This option is Linux specific.
+	Unlink the job files when done. Not the default, as repeated runs of that
+	job would then waste time recreating the file set again and again.
 
-overwrite=bool	If true, writes to a file will always overwrite existing
-		data. If the file doesn't already exist, it will be
-		created before the write phase begins. If the file exists
-		and is large enough for the specified write phase, nothing
-		will be done.
+.. option:: unlink_each_loop=bool
 
-end_fsync=bool	If true, fsync file contents when a write stage has completed.
+	Unlink job files after each iteration or loop.
 
-fsync_on_close=bool	If true, fio will fsync() a dirty file on close.
-		This differs from end_fsync in that it will happen on every
-		file close, not just at the end of the job.
+.. option:: zonesize=int
 
-rwmixread=int	How large a percentage of the mix should be reads.
+	Divide a file into zones of the specified size. See :option:`zoneskip`.
 
-rwmixwrite=int	How large a percentage of the mix should be writes. If both
-		rwmixread and rwmixwrite is given and the values do not add
-		up to 100%, the latter of the two will be used to override
-		the first. This may interfere with a given rate setting,
-		if fio is asked to limit reads or writes to a certain rate.
-		If that is the case, then the distribution may be skewed.
+.. option:: zonerange=int
 
-random_distribution=str:float	By default, fio will use a completely uniform
-		random distribution when asked to perform random IO. Sometimes
-		it is useful to skew the distribution in specific ways,
-		ensuring that some parts of the data is more hot than others.
-		fio includes the following distribution models:
+	Give size of an I/O zone.  See :option:`zoneskip`.
+
+.. option:: zoneskip=int
+
+	Skip the specified number of bytes when :option:`zonesize` data has been
+	read. The two zone options can be used to only do I/O on zones of a file.
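+
+	As a sketch (sizes are illustrative), the fragment below does I/O to the
+	first 256MiB of every 1GiB stride of the file, skipping the remaining
+	768MiB of each stride::
+
+		zonesize=256m
+		zoneskip=768m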
+
+
+I/O type
+~~~~~~~~
+
+.. option:: direct=bool
+
+	If value is true, use non-buffered I/O. This is usually O_DIRECT. Note that
+	ZFS on Solaris doesn't support direct I/O.  On Windows the synchronous
+	ioengines don't support direct I/O.  Default: false.
+
+.. option:: atomic=bool
+
+	If value is true, attempt to use atomic direct I/O. Atomic writes are
+	guaranteed to be stable once acknowledged by the operating system. Only
+	Linux supports O_ATOMIC right now.
+
+.. option:: buffered=bool
+
+	If value is true, use buffered I/O. This is the opposite of the
+	:option:`direct` option. Defaults to true.
+
+.. option:: readwrite=str, rw=str
+
+	Type of I/O pattern. Accepted values are:
+
+		**read**
+				Sequential reads.
+		**write**
+				Sequential writes.
+		**trim**
+				Sequential trims (Linux block devices only).
+		**randwrite**
+				Random writes.
+		**randread**
+				Random reads.
+		**randtrim**
+				Random trims (Linux block devices only).
+		**rw,readwrite**
+				Sequential mixed reads and writes.
+		**randrw**
+				Random mixed reads and writes.
+		**trimwrite**
+				Sequential trim+write sequences. Blocks will be trimmed first,
+				then the same blocks will be written to.
+
+	Fio defaults to read if the option is not specified.  For the mixed I/O
+	types, the default is to split them 50/50.  For certain types of I/O the
+	result may still be skewed a bit, since the speed may be different. It is
+	possible to specify a number of I/O's to do before getting a new offset;
+	this is done by appending a ``:<nr>`` to the end of the string given.  For a
+	random read, it would look like ``rw=randread:8`` for passing in an offset
+	modifier with a value of 8. If the suffix is used with a sequential I/O
+	pattern, then the value specified will be added to the generated offset for
+	each I/O.  For instance, using ``rw=write:4k`` will skip 4k for every
+	write. It turns sequential I/O into sequential I/O with holes.  See the
+	:option:`rw_sequencer` option.
+
+.. option:: rw_sequencer=str
+
+	If an offset modifier is given by appending a number to the ``rw=<str>``
+	line, then this option controls how that number modifies the I/O offset
+	being generated. Accepted values are:
+
+		**sequential**
+			Generate sequential offset.
+		**identical**
+			Generate the same offset.
+
+	``sequential`` is only useful for random I/O, where fio would normally
+	generate a new random offset for every I/O. If you append e.g. 8 to randread,
+	you would get a new random offset for every 8 I/O's. The result would be a
+	seek for only every 8 I/O's, instead of for every I/O. Use ``rw=randread:8``
+	to specify that. As sequential I/O is already sequential, setting
+	``sequential`` for that would not result in any differences.  ``identical``
+	behaves in a similar fashion, except it sends the same offset 8 times before
+	generating a new offset.
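+
+	For example, a minimal sketch combining the two options::
+
+		rw=randread:8
+		rw_sequencer=sequential
+
+	picks a random offset, issues 8 sequential reads from that point, then
+	seeks to a new random offset.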
+
+.. option:: unified_rw_reporting=bool
+
+	Fio normally reports statistics on a per data direction basis, meaning that
+	reads, writes, and trims are accounted and reported separately. If this
+	option is set, fio sums the results and reports them as "mixed" instead.
+
+.. option:: randrepeat=bool
+
+	Seed the random number generator used for random I/O patterns in a
+	predictable way so the pattern is repeatable across runs. Default: true.
+
+.. option:: allrandrepeat=bool
+
+	Seed all random number generators in a predictable way so results are
+	repeatable across runs.  Default: false.
+
+.. option:: randseed=int
+
+	Seed the random number generators based on this seed value, to be able to
+	control what sequence of output is being generated.  If not set, the random
+	sequence depends on the :option:`randrepeat` setting.
+
+.. option:: fallocate=str
+
+	Whether pre-allocation is performed when laying down files.
+	Accepted values are:
+
+		**none**
+			Do not pre-allocate space.
+
+		**posix**
+			Pre-allocate via :manpage:`posix_fallocate(3)`.
+
+		**keep**
+			Pre-allocate via :manpage:`fallocate(2)` with
+			FALLOC_FL_KEEP_SIZE set.
+
+		**0**
+			Backward-compatible alias for **none**.
+
+		**1**
+			Backward-compatible alias for **posix**.
+
+	May not be available on all supported platforms. **keep** is only available
+	on Linux. If using ZFS on Solaris this must be set to **none** because ZFS
+	doesn't support it. Default: **posix**.
+
+.. option:: fadvise_hint=str
+
+	Use :manpage:`posix_fadvise(2)` to advise the kernel on what I/O patterns
+	are likely to be issued.  Accepted values are:
+
+		**0**
+			Backwards-compatible hint for "no hint".
+
+		**1**
+			Backwards-compatible hint for "advise with fio workload type". This
+			uses **FADV_RANDOM** for a random workload, and **FADV_SEQUENTIAL**
+			for a sequential workload.
+
+		**sequential**
+			Advise using **FADV_SEQUENTIAL**.
+
+		**random**
+			Advise using **FADV_RANDOM**.
+
+.. option:: fadvise_stream=int
+
+	Use :manpage:`posix_fadvise(2)` to advise the kernel what stream ID the
+	writes issued belong to. Only supported on Linux. Note, this option may
+	change going forward.
+
+.. option:: offset=int
+
+	Start I/O at the given offset in the file. The data before the given offset
+	will not be touched. This effectively caps the file size at `real_size -
+	offset`. Can be combined with :option:`size` to constrain the start and
+	end range that I/O will be done within.
+
+.. option:: offset_increment=int
+
+	If this is provided, then the real offset becomes `offset + offset_increment
+	* thread_number`, where the thread number is a counter that starts at 0 and
+	is incremented for each sub-job (i.e. when :option:`numjobs` option is
+	specified). This option is useful if there are several jobs which are
+	intended to operate on a file in parallel disjoint segments, with even
+	spacing between the starting points.
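+
+	For example (values are illustrative), with::
+
+		offset=0
+		offset_increment=25g
+		numjobs=4
+
+	the four sub-jobs would start their I/O at offsets 0, 25g, 50g, and 75g
+	respectively.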
+
+.. option:: number_ios=int
+
+	Fio will normally perform I/Os until it has exhausted the size of the region
+	set by :option:`size`, or if it exhausts the allocated time (or hits an error
+	condition). With this setting, the range/size can be set independently of
+	the number of I/Os to perform. When fio reaches this number, it will exit
+	normally and report status. Note that this does not extend the amount of I/O
+	that will be done, it will only stop fio if this condition is met before
+	other end-of-job criteria.
+
+.. option:: fsync=int
+
+	If writing to a file, issue a sync of the dirty data for every number of
+	blocks given. For example, if you give 32 as a parameter, fio will sync the
+	file for every 32 writes issued. If fio is using non-buffered I/O, we may
+	not sync the file. The exception is the sg I/O engine, which synchronizes
+	the disk cache anyway. Defaults to 0, which means fio does not periodically
+	sync the file.
+
+.. option:: fdatasync=int
+
+	Like :option:`fsync` but uses :manpage:`fdatasync(2)` to only sync data and
+	not metadata blocks.  In Windows, FreeBSD, and DragonFlyBSD there is no
+	:manpage:`fdatasync(2)`, this falls back to using :manpage:`fsync(2)`.
+	Defaults to 0, which means fio does not periodically sync data.
+
+.. option:: write_barrier=int
+
+	Make every `N-th` write a barrier write.
+
+.. option:: sync_file_range=str:val
+
+	Use :manpage:`sync_file_range(2)` for every `val` number of write
+	operations. Fio will track the range of writes that have happened since the
+	last :manpage:`sync_file_range(2)` call. `str` can currently be one or more
+	of:
+
+		**wait_before**
+			SYNC_FILE_RANGE_WAIT_BEFORE
+		**write**
+			SYNC_FILE_RANGE_WRITE
+		**wait_after**
+			SYNC_FILE_RANGE_WAIT_AFTER
+
+	So if you do ``sync_file_range=wait_before,write:8``, fio would use
+	``SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE`` for every 8
+	writes. Also see the :manpage:`sync_file_range(2)` man page.  This option is
+	Linux specific.
+
+.. option:: overwrite=bool
+
+	If true, writes to a file will always overwrite existing data. If the file
+	doesn't already exist, it will be created before the write phase begins. If
+	the file exists and is large enough for the specified write phase, nothing
+	will be done.
+
+.. option:: end_fsync=bool
+
+	If true, fsync file contents when a write stage has completed.
+
+.. option:: fsync_on_close=bool
+
+	If true, fio will :manpage:`fsync(2)` a dirty file on close.  This differs
+	from :option:`end_fsync` in that it will happen on every file close, not
+	just at the end of the job.
+
+.. option:: rwmixread=int
+
+	Percentage of a mixed workload that should be reads. Default: 50.
+
+.. option:: rwmixwrite=int
+
+	Percentage of a mixed workload that should be writes. If both
+	:option:`rwmixread` and :option:`rwmixwrite` is given and the values do not
+	add up to 100%, the latter of the two will be used to override the
+	first. This may interfere with a given rate setting, if fio is asked to
+	limit reads or writes to a certain rate.  If that is the case, then the
+	distribution may be skewed. Default: 50.
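+
+	For example, a minimal mixed-workload sketch::
+
+		rw=randrw
+		rwmixread=70
+
+	would aim for roughly 70% random reads and 30% random writes.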
+
+.. option:: random_distribution=str:float[,str:float][,str:float]
+
+	By default, fio will use a completely uniform random distribution when asked
+	to perform random I/O. Sometimes it is useful to skew the distribution in
+	specific ways, ensuring that some parts of the data are hotter than others.
+	fio includes the following distribution models:
+
+		**random**
+				Uniform random distribution
+
+		**zipf**
+				Zipf distribution
+
+		**pareto**
+				Pareto distribution
+
+		**gauss**
+				Normal (Gaussian) distribution
 
-		random		Uniform random distribution
-		zipf		Zipf distribution
-		pareto		Pareto distribution
+		**zoned**
+				Zoned random distribution
 
-		When using a zipf or pareto distribution, an input value
-		is also needed to define the access pattern. For zipf, this
-		is the zipf theta. For pareto, it's the pareto power. Fio
-		includes a test program, genzipf, that can be used visualize
-		what the given input values will yield in terms of hit rates.
-		If you wanted to use zipf with a theta of 1.2, you would use
-		random_distribution=zipf:1.2 as the option. If a non-uniform
-		model is used, fio will disable use of the random map.
+	When using a **zipf** or **pareto** distribution, an input value is also
+	needed to define the access pattern. For **zipf**, this is the `zipf
+	theta`. For **pareto**, it's the `Pareto power`. Fio includes a test
+	program, :command:`genzipf`, that can be used to visualize what the given input
+	values will yield in terms of hit rates.  If you wanted to use **zipf** with
+	a `theta` of 1.2, you would use ``random_distribution=zipf:1.2`` as the
+	option. If a non-uniform model is used, fio will disable use of the random
+	map. For the **gauss** distribution, a normal deviation is supplied as a
+	value between 0 and 100.
 
-percentage_random=int	For a random workload, set how big a percentage should
-		be random. This defaults to 100%, in which case the workload
-		is fully random. It can be set from anywhere from 0 to 100.
-		Setting it to 0 would make the workload fully sequential. Any
-		setting in between will result in a random mix of sequential
-		and random IO, at the given percentages. It is possible to
-		set different values for reads, writes, and trim. To do so,
-		simply use a comma separated list. See blocksize.
-	
-norandommap	Normally fio will cover every block of the file when doing
-		random IO. If this option is given, fio will just get a
-		new random offset without looking at past io history. This
-		means that some blocks may not be read or written, and that
-		some blocks may be read/written more than once. If this option
-		is used with verify= and multiple blocksizes (via bsrange=),
-		only intact blocks are verified, i.e., partially-overwritten
-		blocks are ignored.
+	For a **zoned** distribution, fio supports specifying the percentage of I/O
+	accesses that should fall within a given range of the file or device. For
+	example, given the criteria:
 
-softrandommap=bool See norandommap. If fio runs with the random block map
-		enabled and it fails to allocate the map, if this option is
-		set it will continue without a random block map. As coverage
-		will not be as complete as with random maps, this option is
-		disabled by default.
+	* 60% of accesses should be to the first 10%
+	* 30% of accesses should be to the next 20%
+	* 8% of accesses should be to the next 30%
+	* 2% of accesses should be to the next 40%
 
-random_generator=str	Fio supports the following engines for generating
-		IO offsets for random IO:
+	we can define that through zoning of the random accesses. For the above
+	example, the user would do::
 
-		tausworthe	Strong 2^88 cycle random number generator
-		lfsr		Linear feedback shift register generator
+		random_distribution=zoned:60/10:30/20:8/30:2/40
 
-		Tausworthe is a strong random number generator, but it
-		requires tracking on the side if we want to ensure that
-		blocks are only read or written once. LFSR guarantees
-		that we never generate the same offset twice, and it's
-		also less computationally expensive. It's not a true
-		random generator, however, though for IO purposes it's
-		typically good enough. LFSR only works with single
-		block sizes, not with workloads that use multiple block
-		sizes. If used with such a workload, fio may read or write
-		some blocks multiple times.
+	similarly to how :option:`bssplit` works for setting ranges and percentages
+	of block sizes. Like :option:`bssplit`, it's possible to specify separate
+	zones for reads, writes, and trims. If just one set is given, it'll apply to
+	all of them.
 
-nice=int	Run the job with the given nice value. See man nice(2).
+.. option:: percentage_random=int[,int][,int]
 
-prio=int	Set the io priority value of this job. Linux limits us to
-		a positive value between 0 and 7, with 0 being the highest.
-		See man ionice(1).
+	For a random workload, set how big a percentage should be random. This
+	defaults to 100%, in which case the workload is fully random. It can be set
+	anywhere from 0 to 100.  Setting it to 0 would make the workload fully
+	sequential. Any setting in between will result in a random mix of sequential
+	and random I/O, at the given percentages.  Comma-separated values may be
+	specified for reads, writes, and trims as described in :option:`blocksize`.
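+
+	For example (a sketch)::
+
+		rw=randrw
+		percentage_random=75,50
+
+	would make the reads 75% random, while the writes (and trims, since the
+	last value is not followed by a comma) would be a 50/50 mix of sequential
+	and random I/O.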
 
-prioclass=int	Set the io priority class. See man ionice(1).
+.. option:: norandommap
 
-thinktime=int	Stall the job x microseconds after an io has completed before
-		issuing the next. May be used to simulate processing being
-		done by an application. See thinktime_blocks and
-		thinktime_spin.
+	Normally fio will cover every block of the file when doing random I/O. If
+	this option is given, fio will just get a new random offset without looking
+	at past I/O history. This means that some blocks may not be read or written,
+	and that some blocks may be read/written more than once. If this option is
+	used with :option:`verify` and multiple blocksizes (via :option:`bsrange`),
+	only intact blocks are verified, i.e., partially-overwritten blocks are
+	ignored.
 
-thinktime_spin=int
-		Only valid if thinktime is set - pretend to spend CPU time
-		doing something with the data received, before falling back
-		to sleeping for the rest of the period specified by
-		thinktime.
+.. option:: softrandommap=bool
 
-thinktime_blocks=int
-		Only valid if thinktime is set - control how many blocks
-		to issue, before waiting 'thinktime' usecs. If not set,
-		defaults to 1 which will make fio wait 'thinktime' usecs
-		after every block. This effectively makes any queue depth
-		setting redundant, since no more than 1 IO will be queued
-		before we have to complete it and do our thinktime. In
-		other words, this setting effectively caps the queue depth
-		if the latter is larger.
+	See :option:`norandommap`. If fio runs with the random block map enabled and
+	it fails to allocate the map, setting this option allows it to continue
+	without a random block map. As coverage will not be as complete as with
+	random maps, this option is disabled by default.
 
-rate=int	Cap the bandwidth used by this job. The number is in bytes/sec,
-		the normal suffix rules apply. You can use rate=500k to limit
-		reads and writes to 500k each, or you can specify read and
-		writes separately. Using rate=1m,500k would limit reads to
-		1MB/sec and writes to 500KB/sec. Capping only reads or
-		writes can be done with rate=,500k or rate=500k,. The former
-		will only limit writes (to 500KB/sec), the latter will only
-		limit reads.
+.. option:: random_generator=str
 
-ratemin=int	Tell fio to do whatever it can to maintain at least this
-		bandwidth. Failing to meet this requirement, will cause
-		the job to exit. The same format as rate is used for
-		read vs write separation.
+	Fio supports the following engines for generating
+	I/O offsets for random I/O:
 
-rate_iops=int	Cap the bandwidth to this number of IOPS. Basically the same
-		as rate, just specified independently of bandwidth. If the
-		job is given a block size range instead of a fixed value,
-		the smallest block size is used as the metric. The same format
-		as rate is used for read vs write separation.
+		**tausworthe**
+			Strong 2^88 cycle random number generator
+		**lfsr**
+			Linear feedback shift register generator
+		**tausworthe64**
+			Strong 64-bit 2^258 cycle random number generator
 
-rate_iops_min=int If fio doesn't meet this rate of IO, it will cause
-		the job to exit. The same format as rate is used for read vs
-		write separation.
+	**tausworthe** is a strong random number generator, but it requires tracking
+	on the side if we want to ensure that blocks are only read or written
+	once. **LFSR** guarantees that we never generate the same offset twice, and
+	it's also less computationally expensive. It's not a true random generator,
+	however, though for I/O purposes it's typically good enough. **LFSR** only
+	works with single block sizes, not with workloads that use multiple block
+	sizes. If used with such a workload, fio may read or write some blocks
+	multiple times. The default value is **tausworthe**, unless the required
+	space exceeds 2^32 blocks. If it does, then **tausworthe64** is
+	selected automatically.
 
-latency_target=int	If set, fio will attempt to find the max performance
-		point that the given workload will run at while maintaining a
-		latency below this target. The values is given in microseconds.
-		See latency_window and latency_percentile
 
-latency_window=int	Used with latency_target to specify the sample window
-		that the job is run at varying queue depths to test the
-		performance. The value is given in microseconds.
+Block size
+~~~~~~~~~~
 
-latency_percentile=float	The percentage of IOs that must fall within the
-		criteria specified by latency_target and latency_window. If not
-		set, this defaults to 100.0, meaning that all IOs must be equal
-		or below to the value set by latency_target.
+.. option:: blocksize=int[,int][,int], bs=int[,int][,int]
 
-max_latency=int	If set, fio will exit the job if it exceeds this maximum
-		latency. It will exit with an ETIME error.
+	The block size in bytes used for I/O units. Default: 4096.  A single value
+	applies to reads, writes, and trims.  Comma-separated values may be
+	specified for reads, writes, and trims.  A value not terminated in a comma
+	applies to subsequent types.
 
-ratecycle=int	Average bandwidth for 'rate' and 'ratemin' over this number
-		of milliseconds.
+	Examples:
 
-cpumask=int	Set the CPU affinity of this job. The parameter given is a
-		bitmask of allowed CPU's the job may run on. So if you want
-		the allowed CPUs to be 1 and 5, you would pass the decimal
-		value of (1 << 1 | 1 << 5), or 34. See man
-		sched_setaffinity(2). This may not work on all supported
-		operating systems or kernel versions. This option doesn't
-		work well for a higher CPU count than what you can store in
-		an integer mask, so it can only control cpus 1-32. For
-		boxes with larger CPU counts, use cpus_allowed.
+		**bs=256k**
+			means 256k for reads, writes and trims.
 
-cpus_allowed=str Controls the same options as cpumask, but it allows a text
-		setting of the permitted CPUs instead. So to use CPUs 1 and
-		5, you would specify cpus_allowed=1,5. This options also
-		allows a range of CPUs. Say you wanted a binding to CPUs
-		1, 5, and 8-15, you would set cpus_allowed=1,5,8-15.
+		**bs=8k,32k**
+			means 8k for reads, 32k for writes and trims.
 
-cpus_allowed_policy=str Set the policy of how fio distributes the CPUs
-		specified by cpus_allowed or cpumask. Two policies are
-		supported:
+		**bs=8k,32k,**
+			means 8k for reads, 32k for writes, and default for trims.
 
-		shared	All jobs will share the CPU set specified.
-		split	Each job will get a unique CPU from the CPU set.
+		**bs=,8k**
+			means default for reads, 8k for writes and trims.
 
-		'shared' is the default behaviour, if the option isn't
-		specified. If split is specified, then fio will will assign
-		one cpu per job. If not enough CPUs are given for the jobs
-		listed, then fio will roundrobin the CPUs in the set.
+		**bs=,8k,**
+			means default for reads, 8k for writes, and default for trims.
 
-numa_cpu_nodes=str Set this job running on spcified NUMA nodes' CPUs. The
-		arguments allow comma delimited list of cpu numbers,
-		A-B ranges, or 'all'. Note, to enable numa options support,
-		fio must be built on a system with libnuma-dev(el) installed.
+.. option:: blocksize_range=irange[,irange][,irange], bsrange=irange[,irange][,irange]
 
-numa_mem_policy=str Set this job's memory policy and corresponding NUMA
-		nodes. Format of the argements:
-			<mode>[:<nodelist>]
-		`mode' is one of the following memory policy:
-			default, prefer, bind, interleave, local
-		For `default' and `local' memory policy, no node is
-		needed to be specified.
-		For `prefer', only one node is allowed.
-		For `bind' and `interleave', it allow comma delimited
-		list of numbers, A-B ranges, or 'all'.
+	A range of block sizes in bytes for I/O units.  The issued I/O unit will
+	always be a multiple of the minimum size, unless
+	:option:`blocksize_unaligned` is set.
 
-startdelay=time	Start this job the specified number of seconds after fio
-		has started. Only useful if the job file contains several
-		jobs, and you want to delay starting some jobs to a certain
-		time.
+	Comma-separated ranges may be specified for reads, writes, and trims as
+	described in :option:`blocksize`.
 
-runtime=time	Tell fio to terminate processing after the specified number
-		of seconds. It can be quite hard to determine for how long
-		a specified job will run, so this parameter is handy to
-		cap the total runtime to a given time.
+	Example: ``bsrange=1k-4k,2k-8k``.
 
-time_based	If set, fio will run for the duration of the runtime
-		specified even if the file(s) are completely read or
-		written. It will simply loop over the same workload
-		as many times as the runtime allows.
+.. option:: bssplit=str[,str][,str]
 
-ramp_time=time	If set, fio will run the specified workload for this amount
-		of time before logging any performance numbers. Useful for
-		letting performance settle before logging results, thus
-		minimizing the runtime required for stable results. Note
-		that the ramp_time is considered lead in time for a job,
-		thus it will increase the total runtime if a special timeout
-		or runtime is specified.
+	Sometimes you want even finer grained control of the block sizes issued, not
+	just an even split between them.  This option allows you to weight various
+	block sizes, so that you are able to define a specific distribution of block
+	sizes to issue. The format for this option is::
 
-invalidate=bool	Invalidate the buffer/page cache parts for this file prior
-		to starting io. Defaults to true.
+		bssplit=blocksize/percentage:blocksize/percentage
 
-sync=bool	Use sync io for buffered writes. For the majority of the
-		io engines, this means using O_SYNC.
+	for as many block sizes as needed. So if you want to define a workload that
+	has 50% 64k blocks, 10% 4k blocks, and 40% 32k blocks, you would write::
 
-iomem=str
-mem=str		Fio can use various types of memory as the io unit buffer.
-		The allowed values are:
+		bssplit=4k/10:64k/50:32k/40
 
-			malloc	Use memory from malloc(3) as the buffers.
+	Ordering does not matter. If the percentage is left blank, fio will fill in
+	the remaining values evenly. So a bssplit option like this one::
 
-			shm	Use shared memory as the buffers. Allocated
-				through shmget(2).
+		bssplit=4k/50:1k/:32k/
 
-			shmhuge	Same as shm, but use huge pages as backing.
+	would have 50% 4k I/Os, and 25% each of 1k and 32k I/Os. The percentages
+	must always add up to 100; if bssplit is given a set that adds up to more,
+	it will error out.
 
-			mmap	Use mmap to allocate buffers. May either be
-				anonymous memory, or can be file backed if
-				a filename is given after the option. The
-				format is mem=mmap:/path/to/file.
+	Comma-separated values may be specified for reads, writes, and trims as
+	described in :option:`blocksize`.
 
-			mmaphuge Use a memory mapped huge file as the buffer
-				backing. Append filename after mmaphuge, ala
-				mem=mmaphuge:/hugetlbfs/file
+	If you want a workload that has 50% 2k reads and 50% 4k reads, while having
+	90% 4k writes and 10% 8k writes, you would specify::
 
-		The area allocated is a function of the maximum allowed
-		bs size for the job, multiplied by the io depth given. Note
-		that for shmhuge and mmaphuge to work, the system must have
-		free huge pages allocated. This can normally be checked
-		and set by reading/writing /proc/sys/vm/nr_hugepages on a
-		Linux system. Fio assumes a huge page is 4MB in size. So
-		to calculate the number of huge pages you need for a given
-		job file, add up the io depth of all jobs (normally one unless
-		iodepth= is used) and multiply by the maximum bs set. Then
-		divide that number by the huge page size. You can see the
-		size of the huge pages in /proc/meminfo. If no huge pages
-		are allocated by having a non-zero number in nr_hugepages,
-		using mmaphuge or shmhuge will fail. Also see hugepage-size.
+		bssplit=2k/50:4k/50,4k/90,8k/10
 
-		mmaphuge also needs to have hugetlbfs mounted and the file
-		location should point there. So if it's mounted in /huge,
-		you would use mem=mmaphuge:/huge/somefile.
+.. option:: blocksize_unaligned, bs_unaligned
 
-iomem_align=int	This indiciates the memory alignment of the IO memory buffers.
-		Note that the given alignment is applied to the first IO unit
-		buffer, if using iodepth the alignment of the following buffers
-		are given by the bs used. In other words, if using a bs that is
-		a multiple of the page sized in the system, all buffers will
-		be aligned to this value. If using a bs that is not page
-		aligned, the alignment of subsequent IO memory buffers is the
-		sum of the iomem_align and bs used.
+	If set, fio will issue I/O units with any size within
+	:option:`blocksize_range`, not just multiples of the minimum size.  This
+	typically won't work with direct I/O, as that normally requires sector
+	alignment.
 
-hugepage-size=int
-		Defines the size of a huge page. Must at least be equal
-		to the system setting, see /proc/meminfo. Defaults to 4MB.
-		Should probably always be a multiple of megabytes, so using
-		hugepage-size=Xm is the preferred way to set this to avoid
-		setting a non-pow-2 bad value.
+.. option:: bs_is_seq_rand
 
-exitall		When one job finishes, terminate the rest. The default is
-		to wait for each job to finish, sometimes that is not the
-		desired action.
+	If this option is set, fio will use the normal read,write blocksize settings
+	as sequential,random blocksize settings instead. Any random read or write
+	will use the WRITE blocksize settings, and any sequential read or write will
+	use the READ blocksize settings.
 
-bwavgtime=int	Average the calculated bandwidth over the given time. Value
-		is specified in milliseconds.
+.. option:: blockalign=int[,int][,int], ba=int[,int][,int]
 
-iopsavgtime=int	Average the calculated IOPS over the given time. Value
-		is specified in milliseconds.
+	Boundary to which fio will align random I/O units.  Default:
+	:option:`blocksize`.  Minimum alignment is typically 512b for using direct
+	I/O, though it usually depends on the hardware block size. This option is
+	mutually exclusive with using a random map for files, so it will turn off
+	that option.  Comma-separated values may be specified for reads, writes, and
+	trims as described in :option:`blocksize`.
 
-create_serialize=bool	If true, serialize the file creating for the jobs.
-			This may be handy to avoid interleaving of data
-			files, which may greatly depend on the filesystem
-			used and even the number of processors in the system.
 
-create_fsync=bool	fsync the data file after creation. This is the
-			default.
+Buffers and memory
+~~~~~~~~~~~~~~~~~~
 
-create_on_open=bool	Don't pre-setup the files for IO, just create open()
-			when it's time to do IO to that file.
+.. option:: zero_buffers
 
-create_only=bool	If true, fio will only run the setup phase of the job.
-			If files need to be laid out or updated on disk, only
-			that will be done. The actual job contents are not
-			executed.
+	Initialize buffers with all zeros. Default: fill buffers with random data.
 
-pre_read=bool	If this is given, files will be pre-read into memory before
-		starting the given IO operation. This will also clear
-		the 'invalidate' flag, since it is pointless to pre-read
-		and then drop the cache. This will only work for IO engines
-		that are seekable, since they allow you to read the same data
-		multiple times. Thus it will not work on eg network or splice
-		IO.
+.. option:: refill_buffers
 
-unlink=bool	Unlink the job files when done. Not the default, as repeated
-		runs of that job would then waste time recreating the file
-		set again and again.
+	If this option is given, fio will refill the I/O buffers on every
+	submit. The default is to only fill it at init time and reuse that
+	data. Only makes sense if :option:`zero_buffers` isn't specified, naturally.
+	If data verification is enabled, `refill_buffers` is also automatically
+	enabled.
 
-loops=int	Run the specified number of iterations of this job. Used
-		to repeat the same workload a given number of times. Defaults
-		to 1.
+.. option:: scramble_buffers=bool
 
-verify_only	Do not perform specified workload---only verify data still
-		matches previous invocation of this workload. This option
-		allows one to check data multiple times at a later date
-		without overwriting it. This option makes sense only for
-		workloads that write data, and does not support workloads
-		with the time_based option set.
+	If :option:`refill_buffers` is too costly and the target is using data
+	deduplication, then setting this option will slightly modify the I/O buffer
+	contents to defeat normal de-dupe attempts. This is not enough to defeat
+	more clever block compression attempts, but it will stop naive dedupe of
+	blocks. Default: true.
 
-do_verify=bool	Run the verify phase after a write phase. Only makes sense if
-		verify is set. Defaults to 1.
+.. option:: buffer_compress_percentage=int
 
-verify=str	If writing to a file, fio can verify the file contents
-		after each iteration of the job. The allowed values are:
+	If this is set, then fio will attempt to provide I/O buffer content (on
+	WRITEs) that compress to the specified level. Fio does this by providing a
+	mix of random data and a fixed pattern. The fixed pattern is either zeroes,
+	or the pattern specified by :option:`buffer_pattern`. If the pattern option
+	is used, it might skew the compression ratio slightly. Note that this is per
+	block size unit; for a file/disk-wide compression level that matches this
+	setting, you'll also want to set :option:`refill_buffers`.
 
-			md5	Use an md5 sum of the data area and store
-				it in the header of each block.
+.. option:: buffer_compress_chunk=int
 
-			crc64	Use an experimental crc64 sum of the data
-				area and store it in the header of each
-				block.
+	See :option:`buffer_compress_percentage`. This setting allows fio to manage
+	how big the ranges of random data and zeroed data are. Without this set, fio
+	will provide :option:`buffer_compress_percentage` of blocksize random data,
+	followed by the remaining portion zeroed. With this set to some chunk size
+	smaller than the block size, fio can alternate random and zeroed data
+	throughout the I/O buffer.
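+
+	For example, a hypothetical fragment targeting roughly 50% compressible
+	write buffers with 4k granularity::
+
+		buffer_compress_percentage=50
+		buffer_compress_chunk=4k
+		refill_buffers
+
+	would alternate 4k-sized ranges of random data and zeroed data throughout
+	each I/O buffer, instead of one contiguous random region followed by zeroes.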
 
-			crc32c	Use a crc32c sum of the data area and store
-				it in the header of each block.
+.. option:: buffer_pattern=str
 
-			crc32c-intel Use hardware assisted crc32c calcuation
-				provided on SSE4.2 enabled processors. Falls
-				back to regular software crc32c, if not
-				supported by the system.
+	If set, fio will fill the I/O buffers with this pattern. If not set, the
+	contents of I/O buffers are defined by the other options related to buffer
+	contents. The setting can be any pattern of bytes, and can be prefixed with
+	0x for hex values. It may also be a string, where the string must then be
+	wrapped with ``""``, e.g.::
 
-			crc32	Use a crc32 sum of the data area and store
-				it in the header of each block.
+		buffer_pattern="abcd"
 
-			crc16	Use a crc16 sum of the data area and store
-				it in the header of each block.
+	or::
 
-			crc7	Use a crc7 sum of the data area and store
-				it in the header of each block.
+		buffer_pattern=-12
 
-			xxhash	Use xxhash as the checksum function. Generally
-				the fastest software checksum that fio
-				supports.
+	or::
 
-			sha512	Use sha512 as the checksum function.
+		buffer_pattern=0xdeadface
 
-			sha256	Use sha256 as the checksum function.
+	Also you can combine everything together in any order::
 
-			sha1	Use optimized sha1 as the checksum function.
+		buffer_pattern=0xdeadface"abcd"-12
 
-			meta	Write extra information about each io
-				(timestamp, block number etc.). The block
-				number is verified. The io sequence number is
-				verified for workloads that write data.
-				See also verify_pattern.
+.. option:: dedupe_percentage=int
 
-			null	Only pretend to verify. Useful for testing
-				internals with ioengine=null, not for much
-				else.
+	If set, fio will generate this percentage of identical buffers when
+	writing. These buffers will be naturally dedupable. The contents of the
+	buffers depend on what other buffer compression settings have been set. It's
+	possible to have the individual buffers either fully compressible, or not at
+	all. This option only controls the distribution of unique buffers.
 
-		This option can be used for repeated burn-in tests of a
-		system to make sure that the written data is also
-		correctly read back. If the data direction given is
-		a read or random read, fio will assume that it should
-		verify a previously written file. If the data direction
-		includes any form of write, the verify will be of the
-		newly written data.
+.. option:: invalidate=bool
 
-verifysort=bool	If set, fio will sort written verify blocks when it deems
-		it faster to read them back in a sorted manner. This is
-		often the case when overwriting an existing file, since
-		the blocks are already laid out in the file system. You
-		can ignore this option unless doing huge amounts of really
-		fast IO where the red-black tree sorting CPU time becomes
-		significant.
+	Invalidate the buffer/page cache parts for this file prior to starting
+	I/O if the platform and file type support it. Defaults to true.
+	This will be ignored if :option:`pre_read` is also specified for the
+	same job.
 
-verify_offset=int	Swap the verification header with data somewhere else
-			in the block before writing. Its swapped back before
-			verifying.
+.. option:: sync=bool
 
-verify_interval=int	Write the verification header at a finer granularity
-			than the blocksize. It will be written for chunks the
-			size of header_interval. blocksize should divide this
-			evenly.
+	Use synchronous I/O for buffered writes. For the majority of I/O engines,
+	this means using O_SYNC. Default: false.
 
-verify_pattern=str	If set, fio will fill the io buffers with this
-		pattern. Fio defaults to filling with totally random
-		bytes, but sometimes it's interesting to fill with a known
-		pattern for io verification purposes. Depending on the
-		width of the pattern, fio will fill 1/2/3/4 bytes of the
-		buffer at the time(it can be either a decimal or a hex number).
-		The verify_pattern if larger than a 32-bit quantity has to
-		be a hex number that starts with either "0x" or "0X". Use
-		with verify=meta.
+.. option:: iomem=str, mem=str
 
-verify_fatal=bool	Normally fio will keep checking the entire contents
-		before quitting on a block verification failure. If this
-		option is set, fio will exit the job on the first observed
-		failure.
+	Fio can use various types of memory as the I/O unit buffer.  The allowed
+	values are:
 
-verify_dump=bool	If set, dump the contents of both the original data
-		block and the data block we read off disk to files. This
-		allows later analysis to inspect just what kind of data
-		corruption occurred. Off by default.
+		**malloc**
+			Use memory from :manpage:`malloc(3)` as the buffers.  Default memory
+			type.
 
-verify_async=int	Fio will normally verify IO inline from the submitting
-		thread. This option takes an integer describing how many
-		async offload threads to create for IO verification instead,
-		causing fio to offload the duty of verifying IO contents
-		to one or more separate threads. If using this offload
-		option, even sync IO engines can benefit from using an
-		iodepth setting higher than 1, as it allows them to have
-		IO in flight while verifies are running.
+		**shm**
+			Use shared memory as the buffers. Allocated through
+			:manpage:`shmget(2)`.
 
-verify_async_cpus=str	Tell fio to set the given CPU affinity on the
-		async IO verification threads. See cpus_allowed for the
-		format used.
+		**shmhuge**
+			Same as shm, but use huge pages as backing.
 
-verify_backlog=int	Fio will normally verify the written contents of a
-		job that utilizes verify once that job has completed. In
-		other words, everything is written then everything is read
-		back and verified. You may want to verify continually
-		instead for a variety of reasons. Fio stores the meta data
-		associated with an IO block in memory, so for large
-		verify workloads, quite a bit of memory would be used up
-		holding this meta data. If this option is enabled, fio
-		will write only N blocks before verifying these blocks.
+		**mmap**
+			Use mmap to allocate buffers. May either be anonymous memory, or can
+			be file backed if a filename is given after the option. The format
+			is `mem=mmap:/path/to/file`.
 
-verify_backlog_batch=int	Control how many blocks fio will verify
-		if verify_backlog is set. If not set, will default to
-		the value of verify_backlog (meaning the entire queue
-		is read back and verified).  If verify_backlog_batch is
-		less than verify_backlog then not all blocks will be verified,
-		if verify_backlog_batch is larger than verify_backlog, some
-		blocks will be verified more than once.
+		**mmaphuge**
+			Use a memory mapped huge file as the buffer backing. Append filename
+			after mmaphuge, e.g. `mem=mmaphuge:/hugetlbfs/file`.
 
-verify_state_save=bool	When a job exits during the write phase of a verify
-		workload, save its current state. This allows fio to replay
-		up until that point, if the verify state is loaded for the
-		verify read phase. The format of the filename is, roughly,
-		<type>-<jobname>-<jobindex>-verify.state. <type> is "local"
-		for a local run, "sock" for a client/server socket connection,
-		and "ip" (192.168.0.1, for instance) for a networked
-		client/server connection.
+		**mmapshared**
+			Same as mmap, but use a MAP_SHARED mapping.
 
-verify_state_load=bool	If a verify termination trigger was used, fio stores
-		the current write state of each thread. This can be used at
-		verification time so that fio knows how far it should verify.
-		Without this information, fio will run a full verification
-		pass, according to the settings in the job file used.
+		**cudamalloc**
+			Use GPU memory as the buffers for GPUDirect RDMA benchmark.
 
-stonewall
-wait_for_previous Wait for preceding jobs in the job file to exit, before
-		starting this one. Can be used to insert serialization
-		points in the job file. A stone wall also implies starting
-		a new reporting group.
+	The area allocated is a function of the maximum allowed bs size for the job,
+	multiplied by the I/O depth given. Note that for **shmhuge** and
+	**mmaphuge** to work, the system must have free huge pages allocated. This
+	can normally be checked and set by reading/writing
+	:file:`/proc/sys/vm/nr_hugepages` on a Linux system. Fio assumes a huge page
+	is 4MiB in size. So to calculate the number of huge pages you need for a
+	given job file, add up the I/O depth of all jobs (normally one unless
+	:option:`iodepth` is used) and multiply by the maximum bs set. Then divide
+	that number by the huge page size. You can see the size of the huge pages in
+	:file:`/proc/meminfo`. If no huge pages are allocated by having a non-zero
+	number in `nr_hugepages`, using **mmaphuge** or **shmhuge** will fail. Also
+	see :option:`hugepage-size`.
 
-new_group	Start a new reporting group. See: group_reporting.
+	**mmaphuge** also needs to have hugetlbfs mounted and the file location
+	should point there. So if it's mounted in :file:`/huge`, you would use
+	`mem=mmaphuge:/huge/somefile`.
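+
+	As a minimal sketch (assuming hugetlbfs is mounted at :file:`/huge` as
+	above; the job name and values are illustrative), a job using 16 buffers of
+	1MiB each needs 16MiB of I/O memory, i.e. four 4MiB huge pages::
+
+		[hugepage-job]
+		ioengine=libaio
+		direct=1
+		mem=mmaphuge:/huge/somefile
+		bs=1M
+		iodepth=16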
 
-numjobs=int	Create the specified number of clones of this job. May be
-		used to setup a larger number of threads/processes doing
-		the same thing. Each thread is reported separately; to see
-		statistics for all clones as a whole, use group_reporting in
-		conjunction with new_group.
+.. option:: iomem_align=int
 
-group_reporting	It may sometimes be interesting to display statistics for
-		groups of jobs as a whole instead of for each individual job.
-		This is especially true if 'numjobs' is used; looking at
-		individual thread/process output quickly becomes unwieldy.
-		To see the final report per-group instead of per-job, use
-		'group_reporting'. Jobs in a file will be part of the same
-		reporting group, unless if separated by a stonewall, or by
-		using 'new_group'.
+	This indicates the memory alignment of the I/O memory buffers.  Note that
+	the given alignment is applied to the first I/O unit buffer; if using
+	:option:`iodepth`, the alignment of the following buffers is given by the
+	:option:`bs` used. In other words, if using a :option:`bs` that is a
+	multiple of the page size in the system, all buffers will be aligned to
+	this value. If using a :option:`bs` that is not page aligned, the alignment
+	of subsequent I/O memory buffers is the sum of the :option:`iomem_align` and
+	:option:`bs` used.
 
-thread		fio defaults to forking jobs, however if this option is
-		given, fio will use pthread_create(3) to create threads
-		instead.
+.. option:: hugepage-size=int
 
-zonesize=int	Divide a file into zones of the specified size. See zoneskip.
+	Defines the size of a huge page. Must at least be equal to the system
+	setting, see :file:`/proc/meminfo`. Defaults to 4MiB.  Should probably
+	always be a multiple of megabytes, so using ``hugepage-size=Xm`` is the
+	preferred way to set this, to avoid setting a bad (non power-of-two) value.
 
-zoneskip=int	Skip the specified number of bytes when zonesize data has
-		been read. The two zone options can be used to only do
-		io on zones of a file.
+.. option:: lockmem=int
 
-write_iolog=str	Write the issued io patterns to the specified file. See
-		read_iolog.  Specify a separate file for each job, otherwise
-		the iologs will be interspersed and the file may be corrupt.
+	Pin the specified amount of memory with :manpage:`mlock(2)`. Can be used to
+	simulate a smaller amount of memory.  The amount specified is per worker.
 
-read_iolog=str	Open an iolog with the specified file name and replay the
-		io patterns it contains. This can be used to store a
-		workload and replay it sometime later. The iolog given
-		may also be a blktrace binary file, which allows fio
-		to replay a workload captured by blktrace. See blktrace
-		for how to capture such logging data. For blktrace replay,
-		the file needs to be turned into a blkparse binary data
-		file first (blkparse <device> -o /dev/null -d file_for_fio.bin).
 
-replay_no_stall=int When replaying I/O with read_iolog the default behavior
-		is to attempt to respect the time stamps within the log and
-		replay them with the appropriate delay between IOPS.  By
-		setting this variable fio will not respect the timestamps and
-		attempt to replay them as fast as possible while still
-		respecting ordering.  The result is the same I/O pattern to a
-		given device, but different timings.
+I/O size
+~~~~~~~~
 
-replay_redirect=str While replaying I/O patterns using read_iolog the
-		default behavior is to replay the IOPS onto the major/minor
-		device that each IOP was recorded from.  This is sometimes
-		undesirable because on a different machine those major/minor
-		numbers can map to a different device.  Changing hardware on
-		the same system can also result in a different major/minor
-		mapping.  Replay_redirect causes all IOPS to be replayed onto
-		the single specified device regardless of the device it was
-		recorded from. i.e. replay_redirect=/dev/sdc would cause all
-		IO in the blktrace to be replayed onto /dev/sdc.  This means
-		multiple devices will be replayed onto a single, if the trace
-		contains multiple devices.  If you want multiple devices to be
-		replayed concurrently to multiple redirected devices you must
-		blkparse your trace into separate traces and replay them with
-		independent fio invocations.  Unfortuantely this also breaks
-		the strict time ordering between multiple device accesses.
+.. option:: size=int
 
-write_bw_log=str If given, write a bandwidth log of the jobs in this job
-		file. Can be used to store data of the bandwidth of the
-		jobs in their lifetime. The included fio_generate_plots
-		script uses gnuplot to turn these text files into nice
-		graphs. See write_lat_log for behaviour of given
-		filename. For this option, the suffix is _bw.x.log, where
-		x is the index of the job (1..N, where N is the number of
-		jobs).
+	The total size of file I/O for each thread of this job. Fio will run until
+	this many bytes have been transferred, unless limited by other options (such
+	as :option:`runtime`), or unless the amount of I/O is increased or decreased
+	by :option:`io_size`.
+	Fio will divide this size between the available files determined by options
+	such as :option:`nrfiles` and :option:`filename`, unless :option:`filesize` is
+	specified by the job. If the result of division happens to be 0, the size is
+	set to the physical size of the given files or devices if they exist.
+	If this option is not specified, fio will use the full size of the given
+	files or devices.  If the files do not exist, size must be given. It is also
+	possible to give size as a percentage between 1 and 100. If ``size=20%`` is
+	given, fio will use 20% of the full size of the given files or devices.
+	Can be combined with :option:`offset` to constrain the start and end range
+	that I/O will be done within.
 
-write_lat_log=str Same as write_bw_log, except that this option stores io
-		submission, completion, and total latencies instead. If no
-		filename is given with this option, the default filename of
-		"jobname_type.log" is used. Even if the filename is given,
-		fio will still append the type of log. So if one specifies
+.. option:: io_size=int, io_limit=int
 
-		write_lat_log=foo
+	Normally fio operates within the region set by :option:`size`, which means
+	that the :option:`size` option sets both the region and size of I/O to be
+	performed. Sometimes that is not what you want. With this option, it is
+	possible to define just the amount of I/O that fio should do. For instance,
+	if :option:`size` is set to 20GiB and :option:`io_size` is set to 5GiB, fio
+	will perform I/O within the first 20GiB but exit when 5GiB have been
+	done. The opposite is also possible -- if :option:`size` is set to 20GiB,
+	and :option:`io_size` is set to 40GiB, then fio will do 40GiB of I/O within
+	the 0..20GiB region.
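+
+	As a sketch of the example above (the filename is a placeholder)::
+
+		[limited-io]
+		filename=/data/fio.test
+		size=20g
+		io_size=5g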
 
-		The actual log names will be foo_slat.x.log, foo_clat.x.log,
-		and foo_lat.x.log, where x is the index of the job (1..N,
-		where N is the number of jobs). This helps fio_generate_plot
-		fine the logs automatically.
+.. option:: filesize=int
 
-write_iops_log=str Same as write_bw_log, but writes IOPS. If no filename is
-		given with this option, the default filename of
-		"jobname_type.x.log" is used,where x is the index of the job
-		(1..N, where N is the number of jobs). Even if the filename
-		is given, fio will still append the type of log.
+	Individual file sizes. May be a range, in which case fio will select sizes
+	for files at random within the given range and limited to :option:`size` in
+	total (if that is given). If not given, each created file is the same size.
+	This option overrides :option:`size` in terms of file size, which means
+	this value is used as a fixed size or possible range of each file.
 
-log_avg_msec=int By default, fio will log an entry in the iops, latency,
-		or bw log for every IO that completes. When writing to the
-		disk log, that can quickly grow to a very large size. Setting
-		this option makes fio average the each log entry over the
-		specified period of time, reducing the resolution of the log.
-		Defaults to 0.
+.. option:: file_append=bool
 
-log_offset=int	If this is set, the iolog options will include the byte
-		offset for the IO entry as well as the other data values.
+	Perform I/O after the end of the file. Normally fio will operate within the
+	size of a file. If this option is set, then fio will append to the file
+	instead. This has identical behavior to setting :option:`offset` to the size
+	of a file.  This option is ignored on non-regular files.
 
-log_compression=int	If this is set, fio will compress the IO logs as
-		it goes, to keep the memory footprint lower. When a log
-		reaches the specified size, that chunk is removed and
-		compressed in the background. Given that IO logs are
-		fairly highly compressible, this yields a nice memory
-		savings for longer runs. The downside is that the
-		compression will consume some background CPU cycles, so
-		it may impact the run. This, however, is also true if
-		the logging ends up consuming most of the system memory.
-		So pick your poison. The IO logs are saved normally at the
-		end of a run, by decompressing the chunks and storing them
-		in the specified log file. This feature depends on the
-		availability of zlib.
+.. option:: fill_device=bool, fill_fs=bool
 
-log_store_compressed=bool	If set, and log_compression is also set,
-		fio will store the log files in a compressed format. They
-		can be decompressed with fio, using the --inflate-log
-		command line parameter. The files will be stored with a
-		.fz suffix.
+	Sets size to something really large and waits for ENOSPC (no space left on
+	device) as the terminating condition. Only makes sense with sequential
+	write. For a read workload, the mount point will be filled first then I/O
+	started on the result. This option doesn't make sense if operating on a raw
+	device node, since the size of that is already known by the file system.
+	Additionally, writing beyond end-of-device will not return ENOSPC there.
 
-lockmem=int	Pin down the specified amount of memory with mlock(2). Can
-		potentially be used instead of removing memory or booting
-		with less memory to simulate a smaller amount of memory.
-		The amount specified is per worker.
 
-exec_prerun=str	Before running this job, issue the command specified
-		through system(3). Output is redirected in a file called
-		jobname.prerun.txt.
+I/O engine
+~~~~~~~~~~
 
-exec_postrun=str After the job completes, issue the command specified
-		 though system(3). Output is redirected in a file called
-		 jobname.postrun.txt.
+.. option:: ioengine=str
 
-ioscheduler=str	Attempt to switch the device hosting the file to the specified
-		io scheduler before running.
+	Defines how the job issues I/O to the file. The following types are defined:
 
-disk_util=bool	Generate disk utilization statistics, if the platform
-		supports it. Defaults to on.
+		**sync**
+			Basic :manpage:`read(2)` or :manpage:`write(2)`
+			I/O. :manpage:`lseek(2)` is used to position the I/O location.
+			See :option:`fsync` and :option:`fdatasync` for syncing write I/Os.
 
-disable_lat=bool Disable measurements of total latency numbers. Useful
-		only for cutting back the number of calls to gettimeofday,
-		as that does impact performance at really high IOPS rates.
-		Note that to really get rid of a large amount of these
-		calls, this option must be used with disable_slat and
-		disable_bw as well.
+		**psync**
+			Basic :manpage:`pread(2)` or :manpage:`pwrite(2)` I/O.  Default on
+			all supported operating systems except for Windows.
 
-disable_clat=bool Disable measurements of completion latency numbers. See
-		disable_lat.
+		**vsync**
+			Basic :manpage:`readv(2)` or :manpage:`writev(2)` I/O.  Will emulate
+			queuing by coalescing adjacent I/Os into a single submission.
 
-disable_slat=bool Disable measurements of submission latency numbers. See
-		disable_slat.
+		**pvsync**
+			Basic :manpage:`preadv(2)` or :manpage:`pwritev(2)` I/O.
 
-disable_bw=bool	Disable measurements of throughput/bandwidth numbers. See
-		disable_lat.
+		**pvsync2**
+			Basic :manpage:`preadv2(2)` or :manpage:`pwritev2(2)` I/O.
 
-clat_percentiles=bool Enable the reporting of percentiles of
-		 completion latencies.
+		**libaio**
+			Linux native asynchronous I/O. Note that Linux may only support
+			queued behaviour with non-buffered I/O (set ``direct=1`` or
+			``buffered=0``).
+			This engine defines engine specific options.
 
-percentile_list=float_list Overwrite the default list of percentiles
-		for completion latencies. Each number is a floating
-		number in the range (0,100], and the maximum length of
-		the list is 20. Use ':' to separate the numbers, and
-		list the numbers in ascending order. For example,
-		--percentile_list=99.5:99.9 will cause fio to report
-		the values of completion latency below which 99.5% and
-		99.9% of the observed latencies fell, respectively.
+		**posixaio**
+			POSIX asynchronous I/O using :manpage:`aio_read(3)` and
+			:manpage:`aio_write(3)`.
 
-clocksource=str	Use the given clocksource as the base of timing. The
-		supported options are:
+		**solarisaio**
+			Solaris native asynchronous I/O.
 
-			gettimeofday	gettimeofday(2)
+		**windowsaio**
+			Windows native asynchronous I/O.  Default on Windows.
 
-			clock_gettime	clock_gettime(2)
+		**mmap**
+			File is memory mapped with :manpage:`mmap(2)` and data copied
+			to/from using :manpage:`memcpy(3)`.
 
-			cpu		Internal CPU clock source
+		**splice**
+			:manpage:`splice(2)` is used to transfer the data and
+			:manpage:`vmsplice(2)` to transfer data from user space to the
+			kernel.
 
-		cpu is the preferred clocksource if it is reliable, as it
-		is very fast (and fio is heavy on time calls). Fio will
-		automatically use this clocksource if it's supported and
-		considered reliable on the system it is running on, unless
-		another clocksource is specifically set. For x86/x86-64 CPUs,
-		this means supporting TSC Invariant.
+		**sg**
+			SCSI generic sg v3 I/O. May either be synchronous using the SG_IO
+			ioctl, or if the target is an sg character device we use
+			:manpage:`read(2)` and :manpage:`write(2)` for asynchronous
+			I/O. Requires filename option to specify either block or character
+			devices.
 
-gtod_reduce=bool Enable all of the gettimeofday() reducing options
-		(disable_clat, disable_slat, disable_bw) plus reduce
-		precision of the timeout somewhat to really shrink
-		the gettimeofday() call count. With this option enabled,
-		we only do about 0.4% of the gtod() calls we would have
-		done if all time keeping was enabled.
+		**null**
+			Doesn't transfer any data, just pretends to.  This is mainly used to
+			exercise fio itself and for debugging/testing purposes.
 
-gtod_cpu=int	Sometimes it's cheaper to dedicate a single thread of
-		execution to just getting the current time. Fio (and
-		databases, for instance) are very intensive on gettimeofday()
-		calls. With this option, you can set one CPU aside for
-		doing nothing but logging current time to a shared memory
-		location. Then the other threads/processes that run IO
-		workloads need only copy that segment, instead of entering
-		the kernel with a gettimeofday() call. The CPU set aside
-		for doing these time calls will be excluded from other
-		uses. Fio will manually clear it from the CPU mask of other
-		jobs.
+		**net**
+			Transfer over the network to given ``host:port``.  Depending on the
+			:option:`protocol` used, the :option:`hostname`, :option:`port`,
+			:option:`listen` and :option:`filename` options are used to specify
+			what sort of connection to make, while the :option:`protocol` option
+			determines which protocol will be used.  This engine defines engine
+			specific options.
 
-continue_on_error=str	Normally fio will exit the job on the first observed
-		failure. If this option is set, fio will continue the job when
-		there is a 'non-fatal error' (EIO or EILSEQ) until the runtime
-		is exceeded or the I/O size specified is completed. If this
-		option is used, there are two more stats that are appended,
-		the total error count and the first error. The error field
-		given in the stats is the first error that was hit during the
-		run.
+		**netsplice**
+			Like **net**, but uses :manpage:`splice(2)` and
+			:manpage:`vmsplice(2)` to map data and send/receive.
+			This engine defines engine specific options.
 
-		The allowed values are:
+		**cpuio**
+			Doesn't transfer any data, but burns CPU cycles according to the
+			:option:`cpuload` and :option:`cpuchunks` options. Setting
+			:option:`cpuload` =85 will cause that job to do nothing but burn 85%
+			of the CPU. In case of SMP machines, use :option:`numjobs`
+			=<no_of_cpu> to get desired CPU usage, as the cpuload only loads a
+			single CPU at the desired rate. A job never finishes unless there is
+			at least one non-cpuio job.
 
-			none	Exit on any IO or verify errors.
+		**guasi**
+			The GUASI I/O engine is the Generic Userspace Asynchronous Syscall
+			Interface approach to async I/O. See
 
-			read	Continue on read errors, exit on all others.
+			http://www.xmailserver.org/guasi-lib.html
 
-			write	Continue on write errors, exit on all others.
+			for more info on GUASI.
 
-			io	Continue on any IO error, exit on all others.
+		**rdma**
+			The RDMA I/O engine supports both RDMA memory semantics
+			(RDMA_WRITE/RDMA_READ) and channel semantics (Send/Recv) for the
+			InfiniBand, RoCE and iWARP protocols.
 
-			verify	Continue on verify errors, exit on all others.
+		**falloc**
+			I/O engine that does regular fallocate calls to simulate data
+			transfer.
 
-			all	Continue on all errors.
+			DDIR_READ
+				does fallocate(,mode = FALLOC_FL_KEEP_SIZE,).
 
-			0		Backward-compatible alias for 'none'.
+			DDIR_WRITE
+				does fallocate(,mode = 0).
 
-			1		Backward-compatible alias for 'all'.
+			DDIR_TRIM
+				does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE).
 
-ignore_error=str Sometimes you want to ignore some errors during test
-		 in that case you can specify error list for each error type.
-		 ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST
-		 errors for given error type is separated with ':'. Error
-		 may be symbol ('ENOSPC', 'ENOMEM') or integer.
-		 Example:
-			ignore_error=EAGAIN,ENOSPC:122
-		 This option will ignore EAGAIN from READ, and ENOSPC and
-		 122(EDQUOT) from WRITE.
+		**ftruncate**
+			I/O engine that sends :manpage:`ftruncate(2)` operations in response
+			to write (DDIR_WRITE) events. Each ftruncate issued sets the file's
+			size to the current block offset. Block size is ignored.
 
-error_dump=bool If set dump every error even if it is non fatal, true
-		by default. If disabled only fatal error will be dumped
+		**e4defrag**
+			I/O engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate
+			defragment activity in response to DDIR_WRITE events.
 
-cgroup=str	Add job to this control group. If it doesn't exist, it will
-		be created. The system must have a mounted cgroup blkio
-		mount point for this to work. If your system doesn't have it
-		mounted, you can do so with:
+		**rbd**
+			I/O engine supporting direct access to Ceph Rados Block Devices
+			(RBD) via librbd without the need to use the kernel rbd driver. This
+			ioengine defines engine specific options.
 
-		# mount -t cgroup -o blkio none /cgroup
+		**gfapi**
+			Using the Glusterfs libgfapi sync interface to directly access
+			Glusterfs volumes without having to go through FUSE.  This ioengine
+			defines engine specific options.
 
-cgroup_weight=int	Set the weight of the cgroup to this value. See
-		the documentation that comes with the kernel, allowed values
-		are in the range of 100..1000.
+		**gfapi_async**
+			Using the Glusterfs libgfapi async interface to directly access
+			Glusterfs volumes without having to go through FUSE. This ioengine
+			defines engine specific options.
 
-cgroup_nodelete=bool Normally fio will delete the cgroups it has created after
-		the job completion. To override this behavior and to leave
-		cgroups around after the job completion, set cgroup_nodelete=1.
-		This can be useful if one wants to inspect various cgroup
-		files after job completion. Default: false
+		**libhdfs**
+			Read and write through Hadoop (HDFS).  The :file:`filename` option
+			is used to specify the host,port of the HDFS name-node to connect
+			to.  This engine interprets offsets a little differently.  In HDFS,
+			files once created cannot be modified, so random writes are not
+			possible. To imitate this, the libhdfs engine expects a bunch of
+			small files to be created over HDFS and will randomly pick a file
+			from those files based on the offset generated by the fio backend
+			(see the example job file on how to create such files; use the
+			``rw=write`` option). Please note that you might want to set the
+			necessary environment variables to work with hdfs/libhdfs properly.
+			Each job uses its own connection to HDFS.
 
-uid=int		Instead of running as the invoking user, set the user ID to
-		this value before the thread/process does any work.
+		**mtd**
+			Read, write and erase an MTD character device (e.g.,
+			:file:`/dev/mtd0`). Discards are treated as erases. Depending on the
+			underlying device type, the I/O may have to go in a certain pattern,
+			e.g., on NAND, writing sequentially to erase blocks and discarding
+			before overwriting. The writetrim mode works well for this
+			constraint.
 
-gid=int		Set group ID, see uid.
+		**pmemblk**
+			Read and write using filesystem DAX to a file on a filesystem
+			mounted with DAX on a persistent memory device through the NVML
+			libpmemblk library.
 
-flow_id=int	The ID of the flow. If not specified, it defaults to being a
-		global flow. See flow.
+		**dev-dax**
+			Read and write using device DAX to a persistent memory device (e.g.,
+			/dev/dax0.0) through the NVML libpmem library.
 
-flow=int	Weight in token-based flow control. If this value is used, then
-		there is a 'flow counter' which is used to regulate the
-		proportion of activity between two or more jobs. fio attempts
-		to keep this flow counter near zero. The 'flow' parameter
-		stands for how much should be added or subtracted to the flow
-		counter on each iteration of the main I/O loop. That is, if
-		one job has flow=8 and another job has flow=-1, then there
-		will be a roughly 1:8 ratio in how much one runs vs the other.
+		**external**
+			Prefix to specify loading an external I/O engine object file. Append
+			the engine filename, e.g. ``ioengine=external:/tmp/foo.o`` to load
+			ioengine :file:`foo.o` in :file:`/tmp`.
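+
+	A minimal sketch selecting one of the engines above; per the **libaio**
+	note, non-buffered I/O is requested with ``direct=1`` (values are
+	illustrative)::
+
+		[async-job]
+		ioengine=libaio
+		direct=1
+		iodepth=16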
 
-flow_watermark=int	The maximum value that the absolute value of the flow
-		counter is allowed to reach before the job must wait for a
-		lower value of the counter.
 
-flow_sleep=int	The period of time, in microseconds, to wait after the flow
-		watermark has been exceeded before retrying operations
+I/O engine specific parameters
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 In addition, there are some parameters which are only valid when a specific
 ioengine is in use. These are used identically to normal parameters, with the
-caveat that when used on the command line, they must come after the ioengine
-that defines them is selected.
+caveat that when used on the command line, they must come after the
+:option:`ioengine` that defines them is selected.
 
-[libaio] userspace_reap Normally, with the libaio engine in use, fio will use
-		the io_getevents system call to reap newly returned events.
-		With this flag turned on, the AIO ring will be read directly
-		from user-space to reap events. The reaping mode is only
-		enabled when polling for a minimum of 0 events (eg when
-		iodepth_batch_complete=0).
+.. option:: userspace_reap : [libaio]
 
-[cpu] cpuload=int Attempt to use the specified percentage of CPU cycles.
+	Normally, with the libaio engine in use, fio will use the
+	:manpage:`io_getevents(2)` system call to reap newly returned events.  With
+	this flag turned on, the AIO ring will be read directly from user-space to
+	reap events. The reaping mode is only enabled when polling for a minimum of
+	0 events (e.g. when :option:`iodepth_batch_complete` `=0`).
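+
+	A small sketch enabling user-space reaping together with the polling mode
+	it requires (values are illustrative)::
+
+		[reap-job]
+		ioengine=libaio
+		direct=1
+		iodepth=16
+		iodepth_batch_complete=0
+		userspace_reap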
 
-[cpu] cpuchunks=int Split the load into cycles of the given time. In
-		microseconds.
+.. option:: hipri : [pvsync2]
 
-[cpu] exit_on_io_done=bool Detect when IO threads are done, then exit.
+	Set RWF_HIPRI on I/O, indicating to the kernel that it's of higher priority
+	than normal.
 
-[netsplice] hostname=str
-[net] hostname=str The host name or IP address to use for TCP or UDP based IO.
-		If the job is a TCP listener or UDP reader, the hostname is not
-		used and must be omitted unless it is a valid UDP multicast
-		address.
+.. option:: cpuload=int : [cpuio]
 
-[netsplice] port=int
-[net] port=int	The TCP or UDP port to bind to or connect to. If this is used
-with numjobs to spawn multiple instances of the same job type, then this will
-be the starting port number since fio will use a range of ports.
+	Attempt to use the specified percentage of CPU cycles. This is a mandatory
+	option when using the cpuio I/O engine.
 
-[netsplice] interface=str
-[net] interface=str  The IP address of the network interface used to send or
-		receive UDP multicast
+.. option:: cpuchunks=int : [cpuio]
 
-[netsplice] ttl=int
-[net] ttl=int	Time-to-live value for outgoing UDP multicast packets.
-		Default: 1
+	Split the load into cycles of the given time. In microseconds.
 
-[netsplice] nodelay=bool
-[net] nodelay=bool	Set TCP_NODELAY on TCP connections.
+.. option:: exit_on_io_done=bool : [cpuio]
 
-[netsplice] protocol=str
-[netsplice] proto=str
-[net] protocol=str
-[net] proto=str	The network protocol to use. Accepted values are:
+	Detect when I/O threads are done, then exit.
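+
+	A hypothetical CPU-burner job combining the three options above (values are
+	illustrative only)::
+
+		[burn]
+		ioengine=cpuio
+		cpuload=85
+		cpuchunks=50000
+		exit_on_io_done=1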
 
-			tcp	Transmission control protocol
-			tcpv6	Transmission control protocol V6
-			udp	User datagram protocol
-			udpv6	User datagram protocol V6
-			unix	UNIX domain socket
+.. option:: hostname=str : [netsplice] [net]
 
-		When the protocol is TCP or UDP, the port must also be given,
-		as well as the hostname if the job is a TCP listener or UDP
-		reader. For unix sockets, the normal filename option should be
-		used and the port is invalid.
+	The host name or IP address to use for TCP or UDP based I/O.  If the job is
+	a TCP listener or UDP reader, the host name is not used and must be omitted
+	unless it is a valid UDP multicast address.
 
-[net] listen	For TCP network connections, tell fio to listen for incoming
-		connections rather than initiating an outgoing connection. The
-		hostname must be omitted if this option is used.
+.. option:: namenode=str : [libhdfs]
 
-[net] pingpong	Normaly a network writer will just continue writing data, and
-		a network reader will just consume packages. If pingpong=1
-		is set, a writer will send its normal payload to the reader,
-		then wait for the reader to send the same payload back. This
-		allows fio to measure network latencies. The submission
-		and completion latencies then measure local time spent
-		sending or receiving, and the completion latency measures
-		how long it took for the other end to receive and send back.
-		For UDP multicast traffic pingpong=1 should only be set for a
-		single reader when multiple readers are listening to the same
-		address.
+	The host name or IP address of an HDFS cluster namenode to contact.
 
-[net] window_size	Set the desired socket buffer size for the connection.
+.. option:: port=int
 
-[net] mss	Set the TCP maximum segment size (TCP_MAXSEG).
+   [netsplice], [net]
 
-[e4defrag] donorname=str
-	        File will be used as a block donor(swap extents between files)
-[e4defrag] inplace=int
-		Configure donor file blocks allocation strategy
-		0(default): Preallocate donor's file on init
-		1 	  : allocate space immidietly inside defragment event,
-			    and free right after event
+		The TCP or UDP port to bind to or connect to. If this is used with
+		:option:`numjobs` to spawn multiple instances of the same job type, then
+		this will be the starting port number since fio will use a range of
+		ports.
+
+   [libhdfs]
+
+		The listening port of the HDFS cluster namenode.
+
+.. option:: interface=str : [netsplice] [net]
+
+	The IP address of the network interface used to send or receive UDP
+	multicast.
+
+.. option:: ttl=int : [netsplice] [net]
+
+	Time-to-live value for outgoing UDP multicast packets. Default: 1.
+
+.. option:: nodelay=bool : [netsplice] [net]
+
+	Set TCP_NODELAY on TCP connections.
+
+.. option:: protocol=str : [netsplice] [net]
+
+.. option:: proto=str : [netsplice] [net]
+
+	The network protocol to use. Accepted values are:
+
+	**tcp**
+		Transmission control protocol.
+	**tcpv6**
+		Transmission control protocol V6.
+	**udp**
+		User datagram protocol.
+	**udpv6**
+		User datagram protocol V6.
+	**unix**
+		UNIX domain socket.
+
+	When the protocol is TCP or UDP, the port must also be given, as well as the
+	hostname if the job is a TCP listener or UDP reader. For unix sockets, the
+	normal filename option should be used and the port is invalid.
+
+.. option:: listen : [net]
+
+	For TCP network connections, tell fio to listen for incoming connections
+	rather than initiating an outgoing connection. The :option:`hostname` must
+	be omitted if this option is used.
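+
+	As a rough sketch (address and port are placeholders), the two ends of a
+	TCP transfer would typically be run as separate fio invocations, one per
+	host, along these lines::
+
+		[receiver]
+		ioengine=net
+		protocol=tcp
+		port=8765
+		listen
+		rw=read
+
+		[sender]
+		ioengine=net
+		protocol=tcp
+		hostname=192.168.0.1
+		port=8765
+		rw=write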
+
+.. option:: pingpong : [net]
+
+	Normally a network writer will just continue writing data, and a network
+	reader will just consume packets. If ``pingpong=1`` is set, a writer will
+	send its normal payload to the reader, then wait for the reader to send the
+	same payload back. This allows fio to measure network latencies. The
+	submission and completion latencies then measure local time spent sending or
+	receiving, and the completion latency measures how long it took for the
+	other end to receive and send back.  For UDP multicast traffic
+	``pingpong=1`` should only be set for a single reader when multiple readers
+	are listening to the same address.
+
+.. option:: window_size : [net]
+
+	Set the desired socket buffer size for the connection.
+
+.. option:: mss : [net]
+
+	Set the TCP maximum segment size (TCP_MAXSEG).
+
+.. option:: donorname=str : [e4defrag]
+
+	File will be used as a block donor (swap extents between files).
+
+.. option:: inplace=int : [e4defrag]
+
+	Configure donor file blocks allocation strategy:
+
+	**0**
+		Default. Preallocate donor's file on init.
+	**1**
+		Allocate space immediately inside defragment event, and free right
+		after the event.
+
+.. option:: clustername=str : [rbd]
+
+	Specifies the name of the Ceph cluster.
+
+.. option:: rbdname=str : [rbd]
+
+	Specifies the name of the RBD.
+
+.. option:: pool=str : [rbd]
+
+	Specifies the name of the Ceph pool containing RBD.
+
+.. option:: clientname=str : [rbd]
+
+	Specifies the username (without the 'client.' prefix) used to access the
+	Ceph cluster. If the *clustername* is specified, the *clientname* shall be
+	the full *type.id* string. If no *type.* prefix is given, fio will add
+	'client.' by default.
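+
+	A hypothetical RBD job combining the options above (pool, image and client
+	names are placeholders)::
+
+		[rbd-test]
+		ioengine=rbd
+		clientname=admin
+		pool=rbd
+		rbdname=fio_test_image
+		rw=randwrite
+		bs=4k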
+
+.. option:: skip_bad=bool : [mtd]
+
+	Skip operations against known bad blocks.
+
+.. option:: hdfsdirectory : [libhdfs]
+
+	libhdfs will create chunks in this HDFS directory.
+
+.. option:: chunk_size : [libhdfs]
+
+	The size of the chunk to use for each file.
 
 
+I/O depth
+~~~~~~~~~
 
-6.0 Interpreting the output
----------------------------
+.. option:: iodepth=int
 
-fio spits out a lot of output. While running, fio will display the
-status of the jobs created. An example of that would be:
+	Number of I/O units to keep in flight against the file.  Note that
+	increasing *iodepth* beyond 1 will not affect synchronous ioengines (except
+	for small degrees when :option:`verify_async` is in use).  Even async
+	engines may impose OS restrictions causing the desired depth not to be
+	achieved.  This may happen on Linux when using libaio and not setting
+	:option:`direct` =1, since buffered I/O is not async on that OS.  Keep an
+	eye on the I/O depth distribution in the fio output to verify that the
+	achieved depth is as expected. Default: 1.
 
-Threads: 1: [_r] [24.8% done] [ 13509/  8334 kb/s] [eta 00h:01m:31s]
+.. option:: iodepth_batch_submit=int, iodepth_batch=int
 
-The characters inside the square brackets denote the current status of
-each thread. The possible values (in typical life cycle order) are:
+	This defines how many pieces of I/O to submit at once.  It defaults to 1
+	which means that we submit each I/O as soon as it is available, but can be
+	raised to submit bigger batches of I/O at a time. If it is set to 0 the
+	:option:`iodepth` value will be used.
 
-Idle	Run
-----    ---
-P		Thread setup, but not started.
-C		Thread created.
-I		Thread initialized, waiting or generating necessary data.
-	p	Thread running pre-reading file(s).
-	R	Running, doing sequential reads.
-	r	Running, doing random reads.
-	W	Running, doing sequential writes.
-	w	Running, doing random writes.
-	M	Running, doing mixed sequential reads/writes.
-	m	Running, doing mixed random reads/writes.
-	F	Running, currently waiting for fsync()
-	f	Running, finishing up (writing IO logs, etc)
-	V	Running, doing verification of written data.
-E		Thread exited, not reaped by main thread yet.
-_		Thread reaped, or
-X		Thread reaped, exited with an error.
-K		Thread reaped, exited due to signal.
+.. option:: iodepth_batch_complete_min=int, iodepth_batch_complete=int
 
-Fio will condense the thread string as not to take up more space on the
-command line as is needed. For instance, if you have 10 readers and 10
-writers running, the output would look like this:
+	This defines how many pieces of I/O to retrieve at once. It defaults to 1
+	which means that we'll ask for a minimum of 1 I/O in the retrieval process
+	from the kernel. The I/O retrieval will go on until we hit the limit set by
+	:option:`iodepth_low`. If this variable is set to 0, then fio will always
+	check for completed events before queuing more I/O. This helps reduce I/O
+	latency, at the cost of more retrieval system calls.
 
-Jobs: 20 (f=20): [R(10),W(10)] [4.0% done] [2103MB/0KB/0KB /s] [538K/0/0 iops] [eta 57m:36s]
+.. option:: iodepth_batch_complete_max=int
 
-Fio will still maintain the ordering, though. So the above means that jobs
-1..10 are readers, and 11..20 are writers.
+	This defines the maximum number of pieces of I/O to retrieve at once. This
+	variable should be used along with :option:`iodepth_batch_complete_min` to
+	specify the range of minimum and maximum amounts of I/O which should be
+	retrieved. By default it is equal to the :option:`iodepth_batch_complete_min`
+	value.
 
-The other values are fairly self explanatory - number of threads
-currently running and doing io, rate of io since last check (read speed
-listed first, then write speed), and the estimated completion percentage
-and time for the running group. It's impossible to estimate runtime of
-the following groups (if any). Note that the string is displayed in order,
-so it's possible to tell which of the jobs are currently doing what. The
-first character is the first job defined in the job file, and so forth.
+	Example #1::
 
-When fio is done (or interrupted by ctrl-c), it will show the data for
-each thread, group of threads, and disks in that order. For each data
-direction, the output looks like:
+		iodepth_batch_complete_min=1
+		iodepth_batch_complete_max=<iodepth>
 
-Client1 (g=0): err= 0:
-  write: io=    32MB, bw=   666KB/s, iops=89 , runt= 50320msec
-    slat (msec): min=    0, max=  136, avg= 0.03, stdev= 1.92
-    clat (msec): min=    0, max=  631, avg=48.50, stdev=86.82
-    bw (KB/s) : min=    0, max= 1196, per=51.00%, avg=664.02, stdev=681.68
-  cpu        : usr=1.49%, sys=0.25%, ctx=7969, majf=0, minf=17
-  IO depths    : 1=0.1%, 2=0.3%, 4=0.5%, 8=99.0%, 16=0.0%, 32=0.0%, >32=0.0%
-     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
-     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
-     issued r/w: total=0/32768, short=0/0
-     lat (msec): 2=1.6%, 4=0.0%, 10=3.2%, 20=12.8%, 50=38.4%, 100=24.8%,
-     lat (msec): 250=15.2%, 500=0.0%, 750=0.0%, 1000=0.0%, >=2048=0.0%
+	which means that we will retrieve at least 1 I/O and up to the whole
+	submitted queue depth. If no I/O has been completed yet, we will wait.
+
+	Example #2::
+
+		iodepth_batch_complete_min=0
+		iodepth_batch_complete_max=<iodepth>
+
+	which means that we can retrieve up to the whole submitted queue depth, but
+	if no I/O has been completed yet, we will NOT wait and will immediately exit
+	the system call. In this example we are simply polling.
+
+.. option:: iodepth_low=int
+
+	The low water mark indicating when to start filling the queue
+	again. Defaults to the same as :option:`iodepth`, meaning that fio will
+	attempt to keep the queue full at all times.  If :option:`iodepth` is set to
+	e.g. 16 and *iodepth_low* is set to 4, then after fio has filled the queue of
+	16 requests, it will let the depth drain down to 4 before starting to fill
+	it again.
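+
+	The 16/4 example above corresponds to::
+
+		[drain]
+		ioengine=libaio
+		direct=1
+		iodepth=16
+		iodepth_low=4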
+
+.. option:: io_submit_mode=str
+
+	This option controls how fio submits the I/O to the I/O engine. The default
+	is `inline`, which means that the fio job threads submit and reap I/O
+	directly. If set to `offload`, the job threads will offload I/O submission
+	to a dedicated pool of I/O threads. This requires some coordination and thus
+	has a bit of extra overhead, especially for lower queue depth I/O where it
+	can increase latencies. The benefit is that fio can manage submission rates
+	independently of the device completion rates. This avoids skewed latency
+	reporting if I/O gets backed up on the device side (the coordinated omission
+	problem).
+
+
+I/O rate
+~~~~~~~~
+
+.. option:: thinktime=time
+
+	Stall the job for the specified period of time after an I/O has completed before issuing the
+	next. May be used to simulate processing being done by an application.
+	When the unit is omitted, the value is given in microseconds.  See
+	:option:`thinktime_blocks` and :option:`thinktime_spin`.
+
+.. option:: thinktime_spin=time
+
+	Only valid if :option:`thinktime` is set - pretend to spend CPU time doing
+	something with the data received, before falling back to sleeping for the
+	rest of the period specified by :option:`thinktime`.  When the unit is
+	omitted, the value is given in microseconds.
+
+.. option:: thinktime_blocks=int
+
+	Only valid if :option:`thinktime` is set - control how many blocks to issue,
+	before waiting `thinktime` usecs. If not set, defaults to 1 which will make
+	fio wait `thinktime` usecs after every block. This effectively makes any
+	queue depth setting redundant, since no more than 1 I/O will be queued
+	before we have to complete it and do our thinktime. In other words, this
+	setting effectively caps the queue depth if the latter is larger.
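+
+	A small sketch (values are illustrative; without units they are in
+	microseconds): issue 32 blocks, then spin for 100 usecs and sleep out the
+	rest of a 1000 usec thinktime::
+
+		thinktime=1000
+		thinktime_spin=100
+		thinktime_blocks=32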
+
+.. option:: rate=int[,int][,int]
+
+	Cap the bandwidth used by this job. The number is in bytes/sec, the normal
+	suffix rules apply.  Comma-separated values may be specified for reads,
+	writes, and trims as described in :option:`blocksize`.
+
+.. option:: rate_min=int[,int][,int]
+
+	Tell fio to do whatever it can to maintain at least this bandwidth. Failing
+	to meet this requirement will cause the job to exit.  Comma-separated values
+	may be specified for reads, writes, and trims as described in
+	:option:`blocksize`.
+
+.. option:: rate_iops=int[,int][,int]
+
+	Cap the bandwidth to this number of IOPS. Basically the same as
+	:option:`rate`, just specified independently of bandwidth. If the job is
+	given a block size range instead of a fixed value, the smallest block size
+	is used as the metric.  Comma-separated values may be specified for reads,
+	writes, and trims as described in :option:`blocksize`.
+
+.. option:: rate_iops_min=int[,int][,int]
+
+	If fio doesn't meet this rate of I/O, it will cause the job to exit.
+	Comma-separated values may be specified for reads, writes, and trims as
+	described in :option:`blocksize`.
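+
+	For example, to cap reads at 10MiB/sec and writes at 5MiB/sec using the
+	comma-separated form described above (values are illustrative)::
+
+		rate=10m,5m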
+
+.. option:: rate_process=str
+
+	This option controls how fio manages rated I/O submissions. The default is
+	`linear`, which submits I/O in a linear fashion with fixed delays between
+	I/Os that get adjusted based on I/O completion rates. If this is set to
+	`poisson`, fio will submit I/O based on a more real world random request
+	flow, known as the Poisson process
+	(https://en.wikipedia.org/wiki/Poisson_point_process). The lambda will be
+	10^6 / IOPS for the given workload.
+
+
+I/O latency
+~~~~~~~~~~~
+
+.. option:: latency_target=time
+
+	If set, fio will attempt to find the max performance point that the given
+	workload will run at while maintaining a latency below this target.  When
+	the unit is omitted, the value is given in microseconds.  See
+	:option:`latency_window` and :option:`latency_percentile`.
+
+.. option:: latency_window=time
+
+	Used with :option:`latency_target` to specify the sample window that the job
+	is run at varying queue depths to test the performance.  When the unit is
+	omitted, the value is given in microseconds.
+
+.. option:: latency_percentile=float
+
+	The percentage of I/Os that must fall within the criteria specified by
+	:option:`latency_target` and :option:`latency_window`. If not set, this
+	defaults to 100.0, meaning that all I/Os must be equal to or below the
+	value set by :option:`latency_target`.
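+
+	A sketch combining the three options above (values are illustrative)::
+
+		latency_target=10ms
+		latency_window=5s
+		latency_percentile=99.9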
+
+.. option:: max_latency=time
+
+	If set, fio will exit the job with an ETIMEDOUT error if it exceeds this
+	maximum latency. When the unit is omitted, the value is given in
+	microseconds.
+
+.. option:: rate_cycle=int
+
+	Average bandwidth for :option:`rate` and :option:`rate_min` over this number
+	of milliseconds.
+
+
+I/O replay
+~~~~~~~~~~
+
+.. option:: write_iolog=str
+
+	Write the issued I/O patterns to the specified file. See
+	:option:`read_iolog`.  Specify a separate file for each job, otherwise the
+	iologs will be interspersed and the file may be corrupt.
+
+.. option:: read_iolog=str
+
+	Open an iolog with the specified file name and replay the I/O patterns it
+	contains. This can be used to store a workload and replay it sometime
+	later. The iolog given may also be a blktrace binary file, which allows fio
+	to replay a workload captured by :command:`blktrace`. See
+	:manpage:`blktrace(8)` for how to capture such logging data. For blktrace
+	replay, the file needs to be turned into a blkparse binary data file first
+	(``blkparse <device> -o /dev/null -d file_for_fio.bin``).
+
+.. option:: replay_no_stall=int
+
+	When replaying I/O with :option:`read_iolog` the default behavior is to
+	attempt to respect the time stamps within the log and replay them with the
+	appropriate delay between IOPS. By setting this variable fio will not
+	respect the timestamps and attempt to replay them as fast as possible while
+	still respecting ordering. The result is the same I/O pattern to a given
+	device, but different timings.
+
+.. option:: replay_redirect=str
+
+	While replaying I/O patterns using :option:`read_iolog` the default behavior
+	is to replay the IOPS onto the major/minor device that each IOP was recorded
+	from.  This is sometimes undesirable because on a different machine those
+	major/minor numbers can map to a different device.  Changing hardware on the
+	same system can also result in a different major/minor mapping.
+	``replay_redirect`` causes all IOPS to be replayed onto the single specified
+	device regardless of the device it was recorded
+	from. i.e. :option:`replay_redirect` = :file:`/dev/sdc` would cause all I/O
+	in the blktrace or iolog to be replayed onto :file:`/dev/sdc`.  This means
+	multiple devices will be replayed onto a single device, if the trace
+	contains multiple devices. If you want multiple devices to be replayed
+	concurrently to multiple redirected devices you must blkparse your trace
+	into separate traces and replay them with independent fio invocations.
+	Unfortunately this also breaks the strict time ordering between multiple
+	device accesses.
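+
+	A sketch combining replay with redirection (the trace file and target
+	device are placeholders)::
+
+		[replay]
+		read_iolog=trace.bin
+		replay_redirect=/dev/sdc
+		replay_no_stall=1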
+
+.. option:: replay_align=int
+
+	Force alignment of I/O offsets and lengths in a trace to this power of 2
+	value.
+
+.. option:: replay_scale=int
+
+	Scale sector offsets down by this factor when replaying traces.
+
+
+Threads, processes and job synchronization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. option:: thread
+
+	Fio defaults to forking jobs, however if this option is given, fio will use
+	POSIX Threads function :manpage:`pthread_create(3)` to create threads instead
+	of forking processes.
+
+.. option:: wait_for=str
+
+	Specifies the name of the already defined job to wait for. Only a single
+	waitee name may be specified. If set, the job won't be started until all
+	workers of the waitee job are done.
+
+	``wait_for`` operates on the job name basis, so there are a few
+	limitations. First, the waitee must be defined prior to the waiter job
+	(meaning no forward references). Second, if a job is being referenced as a
+	waitee, it must have a unique name (no duplicate waitees).
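+
+	A minimal sketch with a writer job and a second job that waits for it (job
+	names and sizes are illustrative)::
+
+		[writer]
+		rw=write
+		size=1g
+
+		[reader]
+		wait_for=writer
+		rw=read
+		size=1g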
+
+.. option:: nice=int
+
+	Run the job with the given nice value. See man :manpage:`nice(2)`.
+
+	On Windows, values less than -15 set the process class to "High"; -1 through
+	-15 set "Above Normal"; 1 through 15 "Below Normal"; and above 15 "Idle"
+	priority class.
+
+.. option:: prio=int
+
+	Set the I/O priority value of this job. Linux limits us to a positive value
+	between 0 and 7, with 0 being the highest.  See man
+	:manpage:`ionice(1)`. Refer to an appropriate manpage for other operating
+	systems since meaning of priority may differ.
+
+.. option:: prioclass=int
+
+	Set the I/O priority class. See man :manpage:`ionice(1)`.
+
+.. option:: cpumask=int
+
+	Set the CPU affinity of this job. The parameter given is a bitmask of
+	allowed CPUs the job may run on. So if you want the allowed CPUs to be 1
+	and 5, you would pass the decimal value of (1 << 1 | 1 << 5), or 34. See man
+	:manpage:`sched_setaffinity(2)`. This may not work on all supported
+	operating systems or kernel versions. This option doesn't work well for a
+	higher CPU count than what you can store in an integer mask, so it can only
+	control cpus 1-32. For boxes with larger CPU counts, use
+	:option:`cpus_allowed`.
+
+.. option:: cpus_allowed=str
+
+	Controls the same options as :option:`cpumask`, but it allows a text setting
+	of the permitted CPUs instead. So to use CPUs 1 and 5, you would specify
+	``cpus_allowed=1,5``. This option also allows a range of CPUs. Say you
+	wanted a binding to CPUs 1, 5, and 8-15, you would set
+	``cpus_allowed=1,5,8-15``.
+
+.. option:: cpus_allowed_policy=str
+
+	Set the policy of how fio distributes the CPUs specified by
+	:option:`cpus_allowed` or cpumask. Two policies are supported:
+
+		**shared**
+			All jobs will share the CPU set specified.
+		**split**
+			Each job will get a unique CPU from the CPU set.
+
+	**shared** is the default behaviour if the option isn't specified. If
+	**split** is specified, then fio will assign one CPU per job. If not
+	enough CPUs are given for the jobs listed, then fio will round-robin the
+	CPUs in the set.
+
+.. option:: numa_cpu_nodes=str
+
+	Set this job running on the specified NUMA nodes' CPUs. The arguments allow
+	a comma-delimited list of cpu numbers, A-B ranges, or `all`. Note that to
+	enable NUMA options support, fio must be built on a system with
+	libnuma-dev(el) installed.
+
+.. option:: numa_mem_policy=str
+
+	Set this job's memory policy and corresponding NUMA nodes. Format of the
+	arguments::
+
+		<mode>[:<nodelist>]
+
+	``mode`` is one of the following memory policies: ``default``, ``prefer``,
+	``bind``, ``interleave`` or ``local``. For the ``default`` and ``local``
+	memory policies, no node needs to be specified.  For ``prefer``, only one
+	node is allowed.  For ``bind`` and ``interleave``, a comma-delimited list of
+	numbers, A-B ranges, or `all` is allowed.
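+
+	For example, to bind memory allocations to NUMA nodes 0 and 1 (assuming the
+	system has at least two nodes)::
+
+		numa_mem_policy=bind:0-1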
+
+.. option:: cgroup=str
+
+	Add job to this control group. If it doesn't exist, it will be created. The
+	system must have a mounted cgroup blkio mount point for this to work. If
+	your system doesn't have it mounted, you can do so with::
+
+		# mount -t cgroup -o blkio none /cgroup
+
+.. option:: cgroup_weight=int
+
+	Set the weight of the cgroup to this value. See the documentation that comes
+	with the kernel, allowed values are in the range of 100..1000.
+
+.. option:: cgroup_nodelete=bool
+
+	Normally fio will delete the cgroups it has created after the job
+	completion. To override this behavior and to leave cgroups around after the
+	job completion, set ``cgroup_nodelete=1``.  This can be useful if one wants
+	to inspect various cgroup files after job completion. Default: false.
+
+.. option:: flow_id=int
+
+	The ID of the flow. If not specified, it defaults to being a global
+	flow. See :option:`flow`.
+
+.. option:: flow=int
+
+	Weight in token-based flow control. If this value is used, then there is a
+	'flow counter' which is used to regulate the proportion of activity between
+	two or more jobs. Fio attempts to keep this flow counter near zero. The
+	``flow`` parameter stands for how much should be added or subtracted to the
+	flow counter on each iteration of the main I/O loop. That is, if one job has
+	``flow=8`` and another job has ``flow=-1``, then there will be a roughly 1:8
+	ratio in how much one runs vs the other.
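+
+	A sketch of the 1:8 example above, using two jobs that share the default
+	global flow::
+
+		[job-a]
+		flow=8
+
+		[job-b]
+		flow=-1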
+
+.. option:: flow_watermark=int
+
+	The maximum value that the absolute value of the flow counter is allowed to
+	reach before the job must wait for a lower value of the counter.
+
+.. option:: flow_sleep=int
+
+	The period of time, in microseconds, to wait after the flow watermark has
+	been exceeded before retrying operations.
+
+.. option:: stonewall, wait_for_previous
+
+	Wait for preceding jobs in the job file to exit, before starting this
+	one. Can be used to insert serialization points in the job file. A stone
+	wall also implies starting a new reporting group, see
+	:option:`group_reporting`.
+
+.. option:: exitall
+
+	When one job finishes, terminate the rest. The default is to wait for each
+	job to finish; sometimes that is not the desired action.
+
+.. option:: exec_prerun=str
+
+	Before running this job, issue the command specified through
+	:manpage:`system(3)`. Output is redirected in a file called
+	:file:`jobname.prerun.txt`.
+
+.. option:: exec_postrun=str
+
+	After the job completes, issue the command specified through
+	:manpage:`system(3)`. Output is redirected in a file called
+	:file:`jobname.postrun.txt`.
+
+.. option:: uid=int
+
+	Instead of running as the invoking user, set the user ID to this value
+	before the thread/process does any work.
+
+.. option:: gid=int
+
+	Set group ID, see :option:`uid`.
+
+
+Verification
+~~~~~~~~~~~~
+
+.. option:: verify_only
+
+	Do not perform specified workload, only verify data still matches previous
+	invocation of this workload. This option allows one to check data multiple
+	times at a later date without overwriting it. This option makes sense only
+	for workloads that write data, and does not support workloads with the
+	:option:`time_based` option set.
+
+.. option:: do_verify=bool
+
+	Run the verify phase after a write phase. Only valid if :option:`verify` is
+	set. Default: true.
+
+.. option:: verify=str
+
+	If writing to a file, fio can verify the file contents after each iteration
+	of the job. Each verification method also implies verification of a special
+	header, which is written to the beginning of each block. This header also
+	includes meta information, like the offset of the block, the block number and
+	a timestamp of when the block was written.  :option:`verify` can be combined
+	with the :option:`verify_pattern` option.  The allowed values are:
+
+		**md5**
+			Use an md5 sum of the data area and store it in the header of
+			each block.
+
+		**crc64**
+			Use an experimental crc64 sum of the data area and store it in the
+			header of each block.
+
+		**crc32c**
+			Use a crc32c sum of the data area and store it in the header of each
+			block.
+
+		**crc32c-intel**
+			Use hardware assisted crc32c calculation provided on SSE4.2 enabled
+			processors. Falls back to regular software crc32c, if not supported
+			by the system.
+
+		**crc32**
+			Use a crc32 sum of the data area and store it in the header of each
+			block.
+
+		**crc16**
+			Use a crc16 sum of the data area and store it in the header of each
+			block.
+
+		**crc7**
+			Use a crc7 sum of the data area and store it in the header of each
+			block.
+
+		**xxhash**
+			Use xxhash as the checksum function. Generally the fastest software
+			checksum that fio supports.
+
+		**sha512**
+			Use sha512 as the checksum function.
+
+		**sha256**
+			Use sha256 as the checksum function.
+
+		**sha1**
+			Use optimized sha1 as the checksum function.
+
+		**sha3-224**
+			Use optimized sha3-224 as the checksum function.
+
+		**sha3-256**
+			Use optimized sha3-256 as the checksum function.
+
+		**sha3-384**
+			Use optimized sha3-384 as the checksum function.
+
+		**sha3-512**
+			Use optimized sha3-512 as the checksum function.
+
+		**meta**
+			This option is deprecated, since meta information is now included
+			in the generic verification header and meta verification happens by
+			default. For detailed information see the description of the
+			:option:`verify` setting. This option is kept for compatibility
+			with old configurations. Do not use it.
+
+		**pattern**
+			Verify a strict pattern. Normally fio includes a header with some
+			basic information and checksumming, but if this option is set, only
+			the specific pattern set with :option:`verify_pattern` is verified.
+
+		**null**
+			Only pretend to verify. Useful for testing internals with
+			:option:`ioengine` `=null`, not for much else.
+
+	This option can be used for repeated burn-in tests of a system to make sure
+	that the written data is also correctly read back. If the data direction
+	given is a read or random read, fio will assume that it should verify a
+	previously written file. If the data direction includes any form of write,
+	the verify will be of the newly written data.
+
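+	For example, a minimal write-and-verify job might look like the sketch
+	below (the file name and sizes are just placeholders)::
+
+		[write-and-verify]
+		rw=write
+		bs=4k
+		size=64m
+		filename=/tmp/fio-verify.tmp
+		verify=crc32c
+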
+.. option:: verifysort=bool
+
+	If true, fio will sort written verify blocks when it deems it faster to read
+	them back in a sorted manner. This is often the case when overwriting an
+	existing file, since the blocks are already laid out in the file system. You
+	can ignore this option unless doing huge amounts of really fast I/O where
+	the red-black tree sorting CPU time becomes significant. Default: true.
+
+.. option:: verifysort_nr=int
+
+   Pre-load and sort verify blocks for a read workload.
+
+.. option:: verify_offset=int
+
+	Swap the verification header with data somewhere else in the block before
+	writing. It is swapped back before verifying.
+
+.. option:: verify_interval=int
+
+	Write the verification header at a finer granularity than the
+	:option:`blocksize`. It will be written for chunks the size of
+	``verify_interval``. :option:`blocksize` should divide this evenly.
+
+.. option:: verify_pattern=str
+
+	If set, fio will fill the I/O buffers with this pattern. Fio defaults to
+	filling with totally random bytes, but sometimes it's interesting to fill
+	with a known pattern for I/O verification purposes. Depending on the width
+	of the pattern, fio will fill 1/2/3/4 bytes of the buffer at a time (it can
+	be either a decimal or a hex number).  If larger than a 32-bit quantity,
+	``verify_pattern`` has to be a hex number that starts with either "0x" or
+	"0X". Use with :option:`verify`. Also, ``verify_pattern`` supports the %o
+	format, which means that for each block the offset will be written and then
+	verified back, e.g.::
+
+		verify_pattern=%o
+
+	Or use a combination of everything::
+
+		verify_pattern=0xff%o"abcd"-12
+
+.. option:: verify_fatal=bool
+
+	Normally fio will keep checking the entire contents before quitting on a
+	block verification failure. If this option is set, fio will exit the job on
+	the first observed failure. Default: false.
+
+.. option:: verify_dump=bool
+
+	If set, dump the contents of both the original data block and the data block
+	we read off disk to files. This allows later analysis to inspect just what
+	kind of data corruption occurred. Off by default.
+
+.. option:: verify_async=int
+
+	Fio will normally verify I/O inline from the submitting thread. This option
+	takes an integer describing how many async offload threads to create for I/O
+	verification instead, causing fio to offload the duty of verifying I/O
+	contents to one or more separate threads. If using this offload option, even
+	sync I/O engines can benefit from using an :option:`iodepth` setting higher
+	than 1, as it allows them to have I/O in flight while verifies are running.
+
+.. option:: verify_async_cpus=str
+
+	Tell fio to set the given CPU affinity on the async I/O verification
+	threads. See :option:`cpus_allowed` for the format used.
+
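+	As a sketch (the CPU list is just an example), offloading verification to
+	two dedicated threads pinned to CPUs 0 and 1 could look like::
+
+		verify=crc32c
+		verify_async=2
+		verify_async_cpus=0,1
+		iodepth=8
+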
+.. option:: verify_backlog=int
+
+	Fio will normally verify the written contents of a job that utilizes verify
+	once that job has completed. In other words, everything is written then
+	everything is read back and verified. You may want to verify continually
+	instead for a variety of reasons. Fio stores the meta data associated with
+	an I/O block in memory, so for large verify workloads, quite a bit of memory
+	would be used up holding this meta data. If this option is enabled, fio will
+	write only N blocks before verifying these blocks.
+
+.. option:: verify_backlog_batch=int
+
+	Control how many blocks fio will verify if :option:`verify_backlog` is
+	set. If not set, it will default to the value of :option:`verify_backlog`
+	(meaning the entire queue is read back and verified).  If
+	``verify_backlog_batch`` is less than :option:`verify_backlog` then not all
+	blocks will be verified; if ``verify_backlog_batch`` is larger than
+	:option:`verify_backlog`, some blocks will be verified more than once.
+
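+	For example, a hedged sketch that pauses to verify after every 1024 written
+	blocks (with the batch size left equal to the backlog, so everything gets
+	verified)::
+
+		verify=md5
+		verify_backlog=1024
+		verify_backlog_batch=1024
+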
+.. option:: verify_state_save=bool
+
+	When a job exits during the write phase of a verify workload, save its
+	current state. This allows fio to replay up until that point, if the verify
+	state is loaded for the verify read phase. The format of the filename is,
+	roughly::
+
+		<type>-<jobname>-<jobindex>-verify.state
+
+	<type> is "local" for a local run, "sock" for a client/server socket
+	connection, and "ip" (192.168.0.1, for instance) for a networked
+	client/server connection.
+
+.. option:: verify_state_load=bool
+
+	If a verify termination trigger was used, fio stores the current write state
+	of each thread. This can be used at verification time so that fio knows how
+	far it should verify.  Without this information, fio will run a full
+	verification pass, according to the settings in the job file used.
+
+.. option:: trim_percentage=int
+
+	Number of verify blocks to discard/trim.
+
+.. option:: trim_verify_zero=bool
+
+	Verify that trim/discarded blocks are returned as zeroes.
+
+.. option:: trim_backlog=int
+
+	Trim after this number of blocks are written.
+
+.. option:: trim_backlog_batch=int
+
+	Trim this number of I/O blocks.
+
+.. option:: experimental_verify=bool
+
+	Enable experimental verification.
+
+
+Steady state
+~~~~~~~~~~~~
+
+.. option:: steadystate=str:float, ss=str:float
+
+	Define the criterion and limit for assessing steady state performance. The
+	first parameter designates the criterion whereas the second parameter sets
+	the threshold. When the criterion falls below the threshold for the
+	specified duration, the job will stop. For example, `iops_slope:0.1%` will
+	direct fio to terminate the job when the least squares regression slope
+	falls below 0.1% of the mean IOPS. If :option:`group_reporting` is enabled
+	this will apply to all jobs in the group. Below is the list of available
+	steady state assessment criteria. All assessments are carried out using only
+	data from the rolling collection window. Threshold limits can be expressed
+	as a fixed value or as a percentage of the mean in the collection window.
+
+		**iops**
+			Collect IOPS data. Stop the job if all individual IOPS measurements
+			are within the specified limit of the mean IOPS (e.g., ``iops:2``
+			means that all individual IOPS values must be within 2 of the mean,
+			whereas ``iops:0.2%`` means that all individual IOPS values must be
+			within 0.2% of the mean IOPS to terminate the job).
+
+		**iops_slope**
+			Collect IOPS data and calculate the least squares regression
+			slope. Stop the job if the slope falls below the specified limit.
+
+		**bw**
+			Collect bandwidth data. Stop the job if all individual bandwidth
+			measurements are within the specified limit of the mean bandwidth.
+
+		**bw_slope**
+			Collect bandwidth data and calculate the least squares regression
+			slope. Stop the job if the slope falls below the specified limit.
+
+.. option:: steadystate_duration=time, ss_dur=time
+
+	A rolling window of this duration will be used to judge whether steady state
+	has been reached. Data will be collected once per second. The default is 0
+	which disables steady state detection.  When the unit is omitted, the
+	value is given in seconds.
+
+.. option:: steadystate_ramp_time=time, ss_ramp=time
+
+	Allow the job to run for the specified duration before beginning data
+	collection for checking the steady state job termination criterion. The
+	default is 0.  When the unit is omitted, the value is given in seconds.
+
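+Putting these options together, the following sketch (other required job
+options, such as the target file, are omitted) stops a random write job once the
+least squares IOPS slope over a 30 second rolling window falls below 0.1% of the
+mean IOPS, after a 10 second ramp::
+
+	[ss-test]
+	rw=randwrite
+	steadystate=iops_slope:0.1%
+	steadystate_duration=30
+	steadystate_ramp_time=10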
+
+Measurements and reporting
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. option:: per_job_logs=bool
+
+	If set, this generates bw/clat/iops log with per file private filenames. If
+	not set, jobs with identical names will share the log filename. Default:
+	true.
+
+.. option:: group_reporting
+
+	It may sometimes be interesting to display statistics for groups of jobs as
+	a whole instead of for each individual job.  This is especially true if
+	:option:`numjobs` is used; looking at individual thread/process output
+	quickly becomes unwieldy.  To see the final report per-group instead of
+	per-job, use :option:`group_reporting`. Jobs in a file will be part of the
+	same reporting group, unless separated by a :option:`stonewall` or by using
+	:option:`new_group`.
+
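+	For instance, a sketch that aggregates four identical workers into a single
+	report::
+
+		[workers]
+		rw=randread
+		size=32m
+		numjobs=4
+		group_reporting
+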
+.. option:: new_group
+
+	Start a new reporting group. See: :option:`group_reporting`.  If not given,
+	all jobs in a file will be part of the same reporting group, unless
+	separated by a :option:`stonewall`.
+
+.. option:: stats
+
+	By default, fio collects and shows final output results for all jobs
+	that run. If this option is set to 0, then fio will not include this
+	job in the final stat output.
+
+.. option:: write_bw_log=str
+
+	If given, write a bandwidth log for this job. Can be used to store data of
+	the bandwidth of the jobs in their lifetime. The included
+	:command:`fio_generate_plots` script uses :command:`gnuplot` to turn these
+	text files into nice graphs. See :option:`write_lat_log` for behaviour of
+	given filename. For this option, the postfix is :file:`_bw.x.log`, where `x`
+	is the index of the job (`1..N`, where `N` is the number of jobs). If
+	:option:`per_job_logs` is false, then the filename will not include the job
+	index.  See `Log File Formats`_.
+
+.. option:: write_lat_log=str
+
+	Same as :option:`write_bw_log`, except that this option stores I/O
+	submission, completion, and total latencies instead. If no filename is given
+	with this option, the default filename of :file:`jobname_type.log` is
+	used. Even if the filename is given, fio will still append the type of
+	log. So if one specifies::
+
+		write_lat_log=foo
+
+	The actual log names will be :file:`foo_slat.x.log`, :file:`foo_clat.x.log`,
+	and :file:`foo_lat.x.log`, where `x` is the index of the job (1..N, where N
+	is the number of jobs). This helps :command:`fio_generate_plots` find the
+	logs automatically. If :option:`per_job_logs` is false, then the filename
+	will not include the job index.  See `Log File Formats`_.
+
+.. option:: write_hist_log=str
+
+	Same as :option:`write_lat_log`, but writes I/O completion latency
+	histograms. If no filename is given with this option, the default filename
+	of :file:`jobname_clat_hist.x.log` is used, where `x` is the index of the
+	job (1..N, where `N` is the number of jobs). Even if the filename is given,
+	fio will still append the type of log.  If :option:`per_job_logs` is false,
+	then the filename will not include the job index. See `Log File Formats`_.
+
+.. option:: write_iops_log=str
+
+	Same as :option:`write_bw_log`, but writes IOPS. If no filename is given
+	with this option, the default filename of :file:`jobname_type.x.log` is
+	used, where `x` is the index of the job (1..N, where `N` is the number of
+	jobs). Even if the filename is given, fio will still append the type of
+	log. If :option:`per_job_logs` is false, then the filename will not include
+	the job index. See `Log File Formats`_.
+
+.. option:: log_avg_msec=int
+
+	By default, fio will log an entry in the iops, latency, or bw log for every
+	I/O that completes. When writing to the disk log, that can quickly grow to a
+	very large size. Setting this option makes fio average each log entry over
+	the specified period of time, reducing the resolution of the log.  See
+	:option:`log_max_value` as well. Defaults to 0, logging all entries.
+
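+	For example, a sketch that logs bandwidth averaged over one second windows
+	instead of per I/O (the log name is a placeholder)::
+
+		write_bw_log=mylog
+		log_avg_msec=1000
+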
+.. option:: log_hist_msec=int
+
+	Same as :option:`log_avg_msec`, but logs entries for completion latency
+	histograms. Computing latency percentiles from averages of intervals using
+	:option:`log_avg_msec` is inaccurate. Setting this option makes fio log
+	histogram entries over the specified period of time, reducing log sizes for
+	high IOPS devices while retaining percentile accuracy.  See
+	:option:`log_hist_coarseness` as well. Defaults to 0, meaning histogram
+	logging is disabled.
+
+.. option:: log_hist_coarseness=int
+
+	Integer ranging from 0 to 6, defining the coarseness of the resolution of
+	the histogram logs enabled with :option:`log_hist_msec`. For each increment
+	in coarseness, fio outputs half as many bins. Defaults to 0, for which
+	histogram logs contain 1216 latency bins. See `Log File Formats`_.
+
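+	A sketch enabling windowed completion latency histograms with halved bin
+	resolution (the log name is a placeholder)::
+
+		write_hist_log=mylog
+		log_hist_msec=1000
+		log_hist_coarseness=1
+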
+.. option:: log_max_value=bool
+
+	If :option:`log_avg_msec` is set, fio logs the average over that window. If
+	you instead want to log the maximum value, set this option to 1. Defaults to
+	0, meaning that averaged values are logged.
+
+.. option:: log_offset=int
+
+	If this is set, the iolog options will include the byte offset for the I/O
+	entry as well as the other data values.
+
+.. option:: log_compression=int
+
+	If this is set, fio will compress the I/O logs as it goes, to keep the
+	memory footprint lower. When a log reaches the specified size, that chunk is
+	removed and compressed in the background. Given that I/O logs are fairly
+	highly compressible, this yields a nice memory savings for longer runs. The
+	downside is that the compression will consume some background CPU cycles, so
+	it may impact the run. This, however, is also true if the logging ends up
+	consuming most of the system memory.  So pick your poison. The I/O logs are
+	saved normally at the end of a run, by decompressing the chunks and storing
+	them in the specified log file. This feature depends on the availability of
+	zlib.
+
+.. option:: log_compression_cpus=str
+
+	Define the set of CPUs that are allowed to handle online log compression for
+	the I/O jobs. This can provide better isolation between performance
+	sensitive jobs, and background compression work.
+
+.. option:: log_store_compressed=bool
+
+	If set, fio will store the log files in a compressed format. They can be
+	decompressed with fio, using the :option:`--inflate-log` command line
+	parameter. The files will be stored with a :file:`.fz` suffix.
+
+.. option:: log_unix_epoch=bool
+
+	If set, fio will log Unix timestamps to the log files produced by enabling
+	write_type_log for each log type, instead of the default zero-based
+	timestamps.
+
+.. option:: block_error_percentiles=bool
+
+	If set, record errors in trim block-sized units from writes and trims and
+	output a histogram of how many trims it took to get to errors, and what kind
+	of error was encountered.
+
+.. option:: bwavgtime=int
+
+	Average the calculated bandwidth over the given time. Value is specified in
+	milliseconds. If the job also does bandwidth logging through
+	:option:`write_bw_log`, then the minimum of this option and
+	:option:`log_avg_msec` will be used.  Default: 500ms.
+
+.. option:: iopsavgtime=int
+
+	Average the calculated IOPS over the given time. Value is specified in
+	milliseconds. If the job also does IOPS logging through
+	:option:`write_iops_log`, then the minimum of this option and
+	:option:`log_avg_msec` will be used.  Default: 500ms.
+
+.. option:: disk_util=bool
+
+	Generate disk utilization statistics, if the platform supports it.
+	Default: true.
+
+.. option:: disable_lat=bool
+
+	Disable measurements of total latency numbers. Useful only for cutting back
+	the number of calls to :manpage:`gettimeofday(2)`, as that does impact
+	performance at really high IOPS rates.  Note that to really get rid of a
+	large amount of these calls, this option must be used with
+	:option:`disable_slat` and :option:`disable_bw_measurement` as well.
+
+.. option:: disable_clat=bool
+
+	Disable measurements of completion latency numbers. See
+	:option:`disable_lat`.
+
+.. option:: disable_slat=bool
+
+	Disable measurements of submission latency numbers. See
+	:option:`disable_lat`.
+
+.. option:: disable_bw_measurement=bool, disable_bw=bool
+
+	Disable measurements of throughput/bandwidth numbers. See
+	:option:`disable_lat`.
+
+.. option:: clat_percentiles=bool
+
+	Enable the reporting of percentiles of completion latencies.
+
+.. option:: percentile_list=float_list
+
+	Overwrite the default list of percentiles for completion latencies and the
+	block error histogram.  Each number is a floating number in the range
+	(0,100], and the maximum length of the list is 20. Use ``:`` to separate the
+	numbers, and list the numbers in ascending order. For example,
+	``--percentile_list=99.5:99.9`` will cause fio to report the values of
+	completion latency below which 99.5% and 99.9% of the observed latencies
+	fell, respectively.
+
+
+Error handling
+~~~~~~~~~~~~~~
+
+.. option:: exitall_on_error
+
+	When one job finishes in error, terminate the rest. The default is to wait
+	for each job to finish.
+
+.. option:: continue_on_error=str
+
+	Normally fio will exit the job on the first observed failure. If this option
+	is set, fio will continue the job when there is a 'non-fatal error' (EIO or
+	EILSEQ) until the runtime is exceeded or the I/O size specified is
+	completed. If this option is used, there are two more stats that are
+	appended, the total error count and the first error. The error field given
+	in the stats is the first error that was hit during the run.
+
+	The allowed values are:
+
+		**none**
+			Exit on any I/O or verify errors.
+
+		**read**
+			Continue on read errors, exit on all others.
+
+		**write**
+			Continue on write errors, exit on all others.
+
+		**io**
+			Continue on any I/O error, exit on all others.
+
+		**verify**
+			Continue on verify errors, exit on all others.
+
+		**all**
+			Continue on all errors.
+
+		**0**
+			Backward-compatible alias for 'none'.
+
+		**1**
+			Backward-compatible alias for 'all'.
+
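+	For example, a sketch that keeps a read job running across I/O errors on a
+	(hypothetical) flaky device::
+
+		[flaky-device-read]
+		filename=/dev/sdx
+		rw=randread
+		continue_on_error=io
+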
+.. option:: ignore_error=str
+
+	Sometimes you want to ignore some errors during a test; in that case you can
+	specify an error list for each error type:
+	``ignore_error=READ_ERR_LIST,WRITE_ERR_LIST,VERIFY_ERR_LIST``. Errors for a
+	given error type are separated with ':'. An error may be a symbol ('ENOSPC',
+	'ENOMEM') or an integer.  Example::
+
+		ignore_error=EAGAIN,ENOSPC:122
+
+	This option will ignore EAGAIN from READ, and ENOSPC and 122 (EDQUOT) from
+	WRITE.
+
+.. option:: error_dump=bool
+
+	If set, dump every error even if it is non-fatal; true by default. If
+	disabled, only fatal errors will be dumped.
+
+Running predefined workloads
+----------------------------
+
+Fio includes predefined profiles that mimic the I/O workloads generated by
+other tools.
+
+.. option:: profile=str
+
+	The predefined workload to run.  Current profiles are:
+
+		**tiobench**
+			Threaded I/O bench (tiotest/tiobench) like workload.
+
+		**act**
+			Aerospike Certification Tool (ACT) like workload.
+
+To view a profile's additional options use :option:`--cmdhelp` after specifying
+the profile.  For example::
+
+	$ fio --profile=act --cmdhelp
+
+Act profile options
+~~~~~~~~~~~~~~~~~~~
+
+.. option:: device-names=str
+	:noindex:
+
+	Devices to use.
+
+.. option:: load=int
+	:noindex:
+
+	ACT load multiplier.  Default: 1.
+
+.. option:: test-duration=time
+	:noindex:
+
+	How long the entire test takes to run.  Default: 24h.
+
+.. option:: threads-per-queue=int
+	:noindex:
+
+	Number of read IO threads per device.  Default: 8.
+
+.. option:: read-req-num-512-blocks=int
+	:noindex:
+
+	Number of 512B blocks to read at a time.  Default: 3.
+
+.. option:: large-block-op-kbytes=int
+	:noindex:
+
+	Size of large block ops in KiB (writes).  Default: 131072.
+
+.. option:: prep
+	:noindex:
+
+	Set to run ACT prep phase.
+
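+For example, a sketch of a run against a single (hypothetical) device with
+twice the base load::
+
+	$ fio --profile=act --device-names=/dev/nvme0n1 --load=2
+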
+Tiobench profile options
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. option:: size=str
+	:noindex:
+
+	Size in MiB.
+
+.. option:: block=int
+	:noindex:
+
+	Block size in bytes.  Default: 4096.
+
+.. option:: numruns=int
+	:noindex:
+
+	Number of runs.
+
+.. option:: dir=str
+	:noindex:
+
+	Test directory.
+
+.. option:: threads=int
+	:noindex:
+
+	Number of threads.
+
+Interpreting the output
+-----------------------
+
+Fio spits out a lot of output. While running, fio will display the status of the
+jobs created. An example of that would be::
+
+    Jobs: 1 (f=1): [_(1),M(1)][24.8%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 01m:31s]
+
+The characters inside the square brackets denote the current status of each
+thread. The possible values (in typical life cycle order) are:
+
++------+-----+-----------------------------------------------------------+
+| Idle | Run |                                                           |
++======+=====+===========================================================+
+| P    |     | Thread setup, but not started.                            |
++------+-----+-----------------------------------------------------------+
+| C    |     | Thread created.                                           |
++------+-----+-----------------------------------------------------------+
+| I    |     | Thread initialized, waiting or generating necessary data. |
++------+-----+-----------------------------------------------------------+
+|      |  p  | Thread running pre-reading file(s).                       |
++------+-----+-----------------------------------------------------------+
+|      |  R  | Running, doing sequential reads.                          |
++------+-----+-----------------------------------------------------------+
+|      |  r  | Running, doing random reads.                              |
++------+-----+-----------------------------------------------------------+
+|      |  W  | Running, doing sequential writes.                         |
++------+-----+-----------------------------------------------------------+
+|      |  w  | Running, doing random writes.                             |
++------+-----+-----------------------------------------------------------+
+|      |  M  | Running, doing mixed sequential reads/writes.             |
++------+-----+-----------------------------------------------------------+
+|      |  m  | Running, doing mixed random reads/writes.                 |
++------+-----+-----------------------------------------------------------+
+|      |  F  | Running, currently waiting for :manpage:`fsync(2)`        |
++------+-----+-----------------------------------------------------------+
+|      |  V  | Running, doing verification of written data.              |
++------+-----+-----------------------------------------------------------+
+| E    |     | Thread exited, not reaped by main thread yet.             |
++------+-----+-----------------------------------------------------------+
+| _    |     | Thread reaped, or                                         |
++------+-----+-----------------------------------------------------------+
+| X    |     | Thread reaped, exited with an error.                      |
++------+-----+-----------------------------------------------------------+
+| K    |     | Thread reaped, exited due to signal.                      |
++------+-----+-----------------------------------------------------------+
+
+Fio will condense the thread string so as not to take up more space on the
+command line than is needed. For instance, if you have 10 readers and 10
+writers running, the output would look like this::
+
+    Jobs: 20 (f=20): [R(10),W(10)][4.0%][r=20.5MiB/s,w=23.5MiB/s][r=82,w=94 IOPS][eta 57m:36s]
+
+Fio will still maintain the ordering, though. So the above means that jobs 1..10
+are readers, and 11..20 are writers.
+
+The other values are fairly self explanatory -- number of threads currently
+running and doing I/O, the number of currently open files (f=), the rate of I/O
+since last check (read speed listed first, then write speed and optionally trim
+speed), and the estimated completion percentage and time for the current
+running group. It's impossible to estimate runtime of the following groups (if
+any). Note that the string is displayed in order, so it's possible to tell which
+of the jobs are currently doing what. The first character is the first job
+defined in the job file, and so forth.
+
+When fio is done (or interrupted by :kbd:`ctrl-c`), it will show the data for
+each thread, group of threads, and disks in that order. For each data direction,
+the output looks like::
+
+    Client1 (g=0): err= 0:
+      write: io=    32MiB, bw=   666KiB/s, iops=89 , runt= 50320msec
+        slat (msec): min=    0, max=  136, avg= 0.03, stdev= 1.92
+        clat (msec): min=    0, max=  631, avg=48.50, stdev=86.82
+        bw (KiB/s) : min=    0, max= 1196, per=51.00%, avg=664.02, stdev=681.68
+      cpu        : usr=1.49%, sys=0.25%, ctx=7969, majf=0, minf=17
+      IO depths    : 1=0.1%, 2=0.3%, 4=0.5%, 8=99.0%, 16=0.0%, 32=0.0%, >32=0.0%
+         submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
+         complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
+         issued r/w: total=0/32768, short=0/0
+         lat (msec): 2=1.6%, 4=0.0%, 10=3.2%, 20=12.8%, 50=38.4%, 100=24.8%,
+         lat (msec): 250=15.2%, 500=0.0%, 750=0.0%, 1000=0.0%, >=2048=0.0%
 
 The client number is printed, along with the group id and error of that
-thread. Below is the io statistics, here for writes. In the order listed,
-they denote:
+thread. Below are the I/O statistics, here for writes. In the order listed, they
+denote:
 
-io=		Number of megabytes io performed
-bw=		Average bandwidth rate
-iops=           Average IOs performed per second
-runt=		The runtime of that thread
-	slat=	Submission latency (avg being the average, stdev being the
-		standard deviation). This is the time it took to submit
-		the io. For sync io, the slat is really the completion
-		latency, since queue/complete is one operation there. This
-		value can be in milliseconds or microseconds, fio will choose
-		the most appropriate base and print that. In the example
-		above, milliseconds is the best scale. Note: in --minimal mode
+**io**
+		Number of megabytes of I/O performed.
+
+**bw**
+		Average bandwidth rate.
+
+**iops**
+		Average I/Os performed per second.
+
+**runt**
+		The runtime of that thread.
+
+**slat**
+		Submission latency (avg being the average, stdev being the standard
+		deviation). This is the time it took to submit the I/O. For sync I/O,
+		the slat is really the completion latency, since queue/complete is one
+		operation there. This value can be in milliseconds or microseconds, fio
+		will choose the most appropriate base and print that. In the example
+		above, milliseconds is the best scale. Note: in :option:`--minimal` mode
 		latencies are always expressed in microseconds.
-	clat=	Completion latency. Same names as slat, this denotes the
-		time from submission to completion of the io pieces. For
-		sync io, clat will usually be equal (or very close) to 0,
-		as the time from submit to complete is basically just
-		CPU time (io has already been done, see slat explanation).
-	bw=	Bandwidth. Same names as the xlat stats, but also includes
-		an approximate percentage of total aggregate bandwidth
-		this thread received in this group. This last value is
-		only really useful if the threads in this group are on the
-		same disk, since they are then competing for disk access.
-cpu=		CPU usage. User and system time, along with the number
-		of context switches this thread went through, usage of
-		system and user time, and finally the number of major
-		and minor page faults.
-IO depths=	The distribution of io depths over the job life time. The
-		numbers are divided into powers of 2, so for example the
-		16= entries includes depths up to that value but higher
-		than the previous entry. In other words, it covers the
-		range from 16 to 31.
-IO submit=	How many pieces of IO were submitting in a single submit
-		call. Each entry denotes that amount and below, until
-		the previous entry - eg, 8=100% mean that we submitted
-		anywhere in between 5-8 ios per submit call.
-IO complete=	Like the above submit number, but for completions instead.
-IO issued=	The number of read/write requests issued, and how many
-		of them were short.
-IO latencies=	The distribution of IO completion latencies. This is the
-		time from when IO leaves fio and when it gets completed.
-		The numbers follow the same pattern as the IO depths,
-		meaning that 2=1.6% means that 1.6% of the IO completed
-		within 2 msecs, 20=12.8% means that 12.8% of the IO
-		took more than 10 msecs, but less than (or equal to) 20 msecs.
+
+**clat**
+		Completion latency. Same names as slat, this denotes the time from
+		submission to completion of the I/O pieces. For sync I/O, clat will
+		usually be equal (or very close) to 0, as the time from submit to
+		complete is basically just CPU time (I/O has already been done, see slat
+		explanation).
+
+**bw**
+		Bandwidth. Same names as the xlat stats, but also includes an
+		approximate percentage of total aggregate bandwidth this thread received
+		in this group. This last value is only really useful if the threads in
+		this group are on the same disk, since they are then competing for disk
+		access.
+
+**cpu**
+		CPU usage. User and system time, along with the number of context
+		switches this thread went through, usage of system and user time, and
+		finally the number of major and minor page faults. The CPU utilization
+		numbers are averages for the jobs in that reporting group, while the
+		context and fault counters are summed.
+
+**IO depths**
+		The distribution of I/O depths over the job life time. The numbers are
+		divided into powers of 2, so for example the 16= entries includes depths
+		up to that value but higher than the previous entry. In other words, it
+		covers the range from 16 to 31.
+
+**IO submit**
+		How many pieces of I/O were submitted in a single submit call. Each
+		entry denotes that amount and below, until the previous entry -- e.g.,
+		8=100% means that we submitted anywhere in between 5-8 I/Os per submit
+		call.
+
+**IO complete**
+		Like the above submit number, but for completions instead.
+
+**IO issued**
+		The number of read/write requests issued, and how many of them were
+		short.
+
+**IO latencies**
+		The distribution of I/O completion latencies. This is the time from when
+		I/O leaves fio and when it gets completed.  The numbers follow the same
+		pattern as the I/O depths, meaning that 2=1.6% means that 1.6% of the
+		I/O completed within 2 msecs, 20=12.8% means that 12.8% of the I/O took
+		more than 10 msecs, but less than (or equal to) 20 msecs.
 
 After each client has been listed, the group statistics are printed. They
-will look like this:
+will look like this::
 
-Run status group 0 (all jobs):
-   READ: io=64MB, aggrb=22178, minb=11355, maxb=11814, mint=2840msec, maxt=2955msec
-  WRITE: io=64MB, aggrb=1302, minb=666, maxb=669, mint=50093msec, maxt=50320msec
+    Run status group 0 (all jobs):
+       READ: io=64MB, aggrb=22178, minb=11355, maxb=11814, mint=2840msec, maxt=2955msec
+      WRITE: io=64MB, aggrb=1302, minb=666, maxb=669, mint=50093msec, maxt=50320msec
 
 For each data direction, it prints:
 
-io=		Number of megabytes io performed.
-aggrb=		Aggregate bandwidth of threads in this group.
-minb=		The minimum average bandwidth a thread saw.
-maxb=		The maximum average bandwidth a thread saw.
-mint=		The smallest runtime of the threads in that group.
-maxt=		The longest runtime of the threads in that group.
+**io**
+		Number of megabytes of I/O performed.
+**aggrb**
+		Aggregate bandwidth of threads in this group.
+**minb**
+		The minimum average bandwidth a thread saw.
+**maxb**
+		The maximum average bandwidth a thread saw.
+**mint**
+		The smallest runtime of the threads in that group.
+**maxt**
+		The longest runtime of the threads in that group.
 
-And finally, the disk statistics are printed. They will look like this:
+And finally, the disk statistics are printed. They will look like this::
 
-Disk stats (read/write):
-  sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00%
+  Disk stats (read/write):
+    sda: ios=16398/16511, merge=30/162, ticks=6853/819634, in_queue=826487, util=100.00%
 
 Each value is printed for both reads and writes, with reads first. The
 numbers denote:
 
-ios=		Number of ios performed by all groups.
-merge=		Number of merges io the io scheduler.
-ticks=		Number of ticks we kept the disk busy.
-io_queue=	Total time spent in the disk queue.
-util=		The disk utilization. A value of 100% means we kept the disk
+**ios**
+		Number of I/Os performed by all groups.
+**merge**
+		Number of merges in the I/O scheduler.
+**ticks**
+		Number of ticks we kept the disk busy.
+**in_queue**
+		Total time spent in the disk queue.
+**util**
+		The disk utilization. A value of 100% means we kept the disk
 		busy constantly, 50% would be a disk idling half of the time.
 
-It is also possible to get fio to dump the current output while it is
-running, without terminating the job. To do that, send fio the USR1 signal.
-You can also get regularly timed dumps by using the --status-interval
-parameter, or by creating a file in /tmp named fio-dump-status. If fio
-sees this file, it will unlink it and dump the current output status.
+It is also possible to get fio to dump the current output while it is running,
+without terminating the job. To do that, send fio the **USR1** signal.  You can
+also get regularly timed dumps by using the :option:`--status-interval`
+parameter, or by creating a file in :file:`/tmp` named
+:file:`fio-dump-status`. If fio sees this file, it will unlink it and dump the
+current output status.
 
 
-7.0 Terse output
-----------------
+Terse output
+------------
 
-For scripted usage where you typically want to generate tables or graphs
-of the results, fio can output the results in a semicolon separated format.
-The format is one long line of values, such as:
+For scripted usage where you typically want to generate tables or graphs of the
+results, fio can output the results in a semicolon separated format.  The format
+is one long line of values, such as::
 
-2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00%
-A description of this job goes here.
+    2;card0;0;0;7139336;121836;60004;1;10109;27.932460;116.933948;220;126861;3495.446807;1085.368601;226;126864;3523.635629;1089.012448;24063;99944;50.275485%;59818.274627;5540.657370;7155060;122104;60004;1;8338;29.086342;117.839068;388;128077;5032.488518;1234.785715;391;128085;5061.839412;1236.909129;23436;100928;50.287926%;59964.832030;5644.844189;14.595833%;19.394167%;123706;0;7313;0.1%;0.1%;0.1%;0.1%;0.1%;0.1%;100.0%;0.00%;0.00%;0.00%;0.00%;0.00%;0.00%;0.01%;0.02%;0.05%;0.16%;6.04%;40.40%;52.68%;0.64%;0.01%;0.00%;0.01%;0.00%;0.00%;0.00%;0.00%;0.00%
+    A description of this job goes here.
 
 The job description (if provided) follows on a second line.
 
-To enable terse output, use the --minimal command line option. The first
-value is the version of the terse output format. If the output has to
-be changed for some reason, this number will be incremented by 1 to
-signify that change.
+To enable terse output, use the :option:`--minimal` command line option. The
+first value is the version of the terse output format. If the output has to be
+changed for some reason, this number will be incremented by 1 to signify that
+change.
 
 Split up, the format is as follows:
 
-	terse version, fio version, jobname, groupid, error
-	READ status:
-		Total IO (KB), bandwidth (KB/sec), IOPS, runtime (msec)
-		Submission latency: min, max, mean, deviation (usec)
-		Completion latency: min, max, mean, deviation (usec)
-		Completion latency percentiles: 20 fields (see below)
-		Total latency: min, max, mean, deviation (usec)
-		Bw (KB/s): min, max, aggregate percentage of total, mean, deviation
-	WRITE status:
-		Total IO (KB), bandwidth (KB/sec), IOPS, runtime (msec)
-		Submission latency: min, max, mean, deviation (usec)
-		Completion latency: min, max, mean, deviation (usec)
-		Completion latency percentiles: 20 fields (see below)
-		Total latency: min, max, mean, deviation (usec)
-		Bw (KB/s): min, max, aggregate percentage of total, mean, deviation
-	CPU usage: user, system, context switches, major faults, minor faults
-	IO depths: <=1, 2, 4, 8, 16, 32, >=64
-	IO latencies microseconds: <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000
-	IO latencies milliseconds: <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000
-	Disk utilization: Disk name, Read ios, write ios,
-			  Read merges, write merges,
-			  Read ticks, write ticks,
-			  Time spent in queue, disk utilization percentage
-	Additional Info (dependent on continue_on_error, default off): total # errors, first error code
+    ::
 
-	Additional Info (dependent on description being set): Text description
+        terse version, fio version, jobname, groupid, error
 
-Completion latency percentiles can be a grouping of up to 20 sets, so
-for the terse output fio writes all of them. Each field will look like this:
+    READ status::
+
+        Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec)
+        Submission latency: min, max, mean, stdev (usec)
+        Completion latency: min, max, mean, stdev (usec)
+        Completion latency percentiles: 20 fields (see below)
+        Total latency: min, max, mean, stdev (usec)
+        Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev
+
+    WRITE status::
+
+        Total IO (KiB), bandwidth (KiB/sec), IOPS, runtime (msec)
+        Submission latency: min, max, mean, stdev (usec)
+        Completion latency: min, max, mean, stdev (usec)
+        Completion latency percentiles: 20 fields (see below)
+        Total latency: min, max, mean, stdev (usec)
+        Bw (KiB/s): min, max, aggregate percentage of total, mean, stdev
+
+    CPU usage::
+
+        user, system, context switches, major faults, minor faults
+
+    I/O depths::
+
+        <=1, 2, 4, 8, 16, 32, >=64
+
+    I/O latencies microseconds::
+
+        <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000
+
+    I/O latencies milliseconds::
+
+        <=2, 4, 10, 20, 50, 100, 250, 500, 750, 1000, 2000, >=2000
+
+    Disk utilization::
+
+        Disk name, Read ios, write ios,
+        Read merges, write merges,
+        Read ticks, write ticks,
+        Time spent in queue, disk utilization percentage
+
+    Additional Info (dependent on continue_on_error, default off)::
+
+        total # errors, first error code
+
+    Additional Info (dependent on description being set)::
+
+        Text description
+
+Completion latency percentiles can be a grouping of up to 20 sets, so for the
+terse output fio writes all of them. Each field will look like this::
 
 	1.00%=6112
 
-which is the Xth percentile, and the usec latency associated with it.
+which is the Xth percentile, and the `usec` latency associated with it.
 
-For disk utilization, all disks used by fio are shown. So for each disk
-there will be a disk utilization section.
+For disk utilization, all disks used by fio are shown. So for each disk there
+will be a disk utilization section.
 
 
-8.0 Trace file format
----------------------
-There are two trace file format that you can encounter. The older (v1) format
-is unsupported since version 1.20-rc3 (March 2008). It will still be described
+Trace file format
+-----------------
+
+There are two trace file formats that you can encounter. The older (v1) format
+is unsupported since version 1.20-rc3 (March 2008). It will still be described
+below in case you get an old trace and want to understand it.
 
 In any case the trace is a simple text file with a single action per line.
 
 
-8.1 Trace file format v1
-------------------------
-Each line represents a single io action in the following format:
+Trace file format v1
+~~~~~~~~~~~~~~~~~~~~
 
-rw, offset, length
+Each line represents a single I/O action in the following format::
 
-where rw=0/1 for read/write, and the offset and length entries being in bytes.
+	rw, offset, length
 
-This format is not supported in Fio versions => 1.20-rc3.
+where `rw=0/1` for read/write, and the offset and length entries being in bytes.
+
+This format is not supported in fio versions >= 1.20-rc3.
 
 
-8.2 Trace file format v2
-------------------------
-The second version of the trace file format was added in Fio version 1.17.
-It allows to access more then one file per trace and has a bigger set of
-possible file actions.
+Trace file format v2
+~~~~~~~~~~~~~~~~~~~~
 
-The first line of the trace file has to be:
+The second version of the trace file format was added in fio version 1.17.  It
+allows access to more than one file per trace and has a bigger set of possible
+file actions.
 
-fio version 2 iolog
+The first line of the trace file has to be::
+
+    fio version 2 iolog
 
 Following this can be lines in two different formats, which are described below.
 
-The file management format:
+The file management format::
 
-filename action
+    filename action
 
 The filename is given as an absolute path. The action can be one of these:
 
-add          Add the given filename to the trace
-open         Open the file with the given filename. The filename has to have
-             been added with the add action before.
-close        Close the file with the given filename. The file has to have been
-             opened before.
+**add**
+		Add the given filename to the trace.
+**open**
+		Open the file with the given filename. The filename has to have
+		been added with the **add** action before.
+**close**
+		Close the file with the given filename. The file has to have been
+		opened before.
 
 
-The file io action format:
+The file I/O action format::
 
-filename action offset length
+    filename action offset length
 
-The filename is given as an absolute path, and has to have been added and opened
-before it can be used with this format. The offset and length are given in
-bytes. The action can be one of these:
+The `filename` is given as an absolute path, and has to have been added and
+opened before it can be used with this format. The `offset` and `length` are
+given in bytes. The `action` can be one of these:
 
-wait       Wait for 'offset' microseconds. Everything below 100 is discarded.
-read       Read 'length' bytes beginning from 'offset'
-write      Write 'length' bytes beginning from 'offset'
-sync       fsync() the file
-datasync   fdatasync() the file
-trim       trim the given file from the given 'offset' for 'length' bytes
+**wait**
+	   Wait for `offset` microseconds. Everything below 100 is discarded.
+	   The time is relative to the previous `wait` statement.
+**read**
+	   Read `length` bytes beginning from `offset`.
+**write**
+	   Write `length` bytes beginning from `offset`.
+**sync**
+	   :manpage:`fsync(2)` the file.
+**datasync**
+	   :manpage:`fdatasync(2)` the file.
+**trim**
+	   Trim the given file from the given `offset` for `length` bytes.
+
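+Putting the two line formats together, a minimal v2 trace (with a hypothetical
+file name) might look like::
+
+	fio version 2 iolog
+	/tmp/testfile add
+	/tmp/testfile open
+	/tmp/testfile write 0 4096
+	/tmp/testfile read 0 4096
+	/tmp/testfile close
+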
+CPU idleness profiling
+----------------------
+
+In some cases, we want to understand CPU overhead in a test. For example, we
+test patches for the specific goodness of whether they reduce CPU usage.
+Fio implements a balloon approach to create a thread per CPU that runs at idle
+priority, meaning that it only runs when nobody else needs the CPU.
+By measuring the amount of work completed by the thread, idleness of each CPU
+can be derived accordingly.
+
+A unit of work is defined as touching a full page of unsigned characters. The
+mean and standard deviation of the time to complete a unit of work are reported
+in the "unit work" section. Options can be chosen to report detailed percpu
+idleness or overall system idleness by aggregating percpu stats.
 
 
-9.0 CPU idleness profiling
---------------------------
-In some cases, we want to understand CPU overhead in a test. For example,
-we test patches for the specific goodness of whether they reduce CPU usage.
-fio implements a balloon approach to create a thread per CPU that runs at
-idle priority, meaning that it only runs when nobody else needs the cpu.
-By measuring the amount of work completed by the thread, idleness of each
-CPU can be derived accordingly.
+Verification and triggers
+-------------------------
 
-An unit work is defined as touching a full page of unsigned characters. Mean
-and standard deviation of time to complete an unit work is reported in "unit
-work" section. Options can be chosen to report detailed percpu idleness or
-overall system idleness by aggregating percpu stats.
+Fio is usually run in one of two ways, when data verification is done. The first
+is a normal write job of some sort with verify enabled. When the write phase has
+completed, fio switches to reads and verifies everything it wrote. The second
+model is running just the write phase, and then later on running the same job
+(but with reads instead of writes) to repeat the same I/O patterns and verify
+the contents. Both of these methods depend on the write phase being completed,
+as fio otherwise has no idea how much data was written.
 
-
-10.0 Verification and triggers
-------------------------------
-Fio is usually run in one of two ways, when data verification is done. The
-first is a normal write job of some sort with verify enabled. When the
-write phase has completed, fio switches to reads and verifies everything
-it wrote. The second model is running just the write phase, and then later
-on running the same job (but with reads instead of writes) to repeat the
-same IO patterns and verify the contents. Both of these methods depend
-on the write phase being completed, as fio otherwise has no idea how much
-data was written.
-
-With verification triggers, fio supports dumping the current write state
-to local files. Then a subsequent read verify workload can load this state
-and know exactly where to stop. This is useful for testing cases where
-power is cut to a server in a managed fashion, for instance.
+With verification triggers, fio supports dumping the current write state to
+local files. Then a subsequent read verify workload can load this state and know
+exactly where to stop. This is useful for testing cases where power is cut to a
+server in a managed fashion, for instance.
 
 A verification trigger consists of two things:
 
-1) Storing the write state of each job
-2) Executing a trigger command
+1) Storing the write state of each job.
+2) Executing a trigger command.
 
-The write state is relatively small, on the order of hundreds of bytes
-to single kilobytes. It contains information on the number of completions
-done, the last X completions, etc.
+The write state is relatively small, on the order of hundreds of bytes to single
+kilobytes. It contains information on the number of completions done, the last X
+completions, etc.
 
-A trigger is invoked either through creation ('touch') of a specified
-file in the system, or through a timeout setting. If fio is run with
---trigger-file=/tmp/trigger-file, then it will continually check for
-the existence of /tmp/trigger-file. When it sees this file, it will
-fire off the trigger (thus saving state, and executing the trigger
+A trigger is invoked either through creation ('touch') of a specified file in
+the system, or through a timeout setting. If fio is run with
+:option:`--trigger-file` = :file:`/tmp/trigger-file`, then it will continually
+check for the existence of :file:`/tmp/trigger-file`. When it sees this file, it
+will fire off the trigger (thus saving state, and executing the trigger
 command).
 
-For client/server runs, there's both a local and remote trigger. If
-fio is running as a server backend, it will send the job states back
-to the client for safe storage, then execute the remote trigger, if
-specified. If a local trigger is specified, the server will still send
-back the write state, but the client will then execute the trigger.
+For client/server runs, there's both a local and remote trigger. If fio is
+running as a server backend, it will send the job states back to the client for
+safe storage, then execute the remote trigger, if specified. If a local trigger
+is specified, the server will still send back the write state, but the client
+will then execute the trigger.
 
-10.1 Verification trigger example
----------------------------------
-Lets say we want to run a powercut test on the remote machine 'server'.
-Our write workload is in write-test.fio. We want to cut power to 'server'
-at some point during the run, and we'll run this test from the safety
-or our local machine, 'localbox'. On the server, we'll start the fio
-backend normally:
+Verification trigger example
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-server# fio --server
+Let's say we want to run a powercut test on the remote machine 'server'.  Our
+write workload is in :file:`write-test.fio`. We want to cut power to 'server' at
+some point during the run, and we'll run this test from the safety of our local
+machine, 'localbox'. On the server, we'll start the fio backend normally::
 
-and on the client, we'll fire off the workload:
+	server# fio --server
 
-localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger-remote="bash -c \"echo b > /proc/sysrq-triger\""
+and on the client, we'll fire off the workload::
 
-We set /tmp/my-trigger as the trigger file, and we tell fio to execute
+	localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger-remote="bash -c \"echo b > /proc/sysrq-trigger\""
 
-echo b > /proc/sysrq-trigger
+We set :file:`/tmp/my-trigger` as the trigger file, and we tell fio to execute::
 
-on the server once it has received the trigger and sent us the write
-state. This will work, but it's not _really_ cutting power to the server,
-it's merely abruptly rebooting it. If we have a remote way of cutting
-power to the server through IPMI or similar, we could do that through
-a local trigger command instead. Lets assume we have a script that does
-IPMI reboot of a given hostname, ipmi-reboot. On localbox, we could
-then have run fio with a local trigger instead:
+	echo b > /proc/sysrq-trigger
 
-localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger="ipmi-reboot server"
+on the server once it has received the trigger and sent us the write state. This
+will work, but it's not **really** cutting power to the server, it's merely
+abruptly rebooting it. If we have a remote way of cutting power to the server
+through IPMI or similar, we could do that through a local trigger command
+instead. Let's assume we have a script that does IPMI reboot of a given
+hostname, ipmi-reboot. On localbox, we could then have run fio with a local
+trigger instead::
 
-For this case, fio would wait for the server to send us the write state,
-then execute 'ipmi-reboot server' when that happened.
+	localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger="ipmi-reboot server"
 
-10.1 Loading verify state
--------------------------
-To load store write state, read verification job file must contain
-the verify_state_load option. If that is set, fio will load the previously
+For this case, fio would wait for the server to send us the write state, then
+execute ``ipmi-reboot server`` when that happened.
+
+Loading verify state
+~~~~~~~~~~~~~~~~~~~~
+
+To load stored write state, the read verification job file must contain the
+:option:`verify_state_load` option. If that is set, fio will load the previously
 stored state. For a local fio run this is done by loading the files directly,
-and on a client/server run, the server backend will ask the client to send
-the files over and load them from there.
+and on a client/server run, the server backend will ask the client to send the
+files over and load them from there.
+
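+As a sketch, the two phases might be expressed as separate job files run at
+different times (job names and the target file are placeholders)::
+
+	; write phase, run first (possibly interrupted by a trigger)
+	[write-phase]
+	filename=/data/fio-state-test
+	rw=write
+	verify=crc32c
+	verify_state_save=1
+
+	; verify phase, run later with the saved state available
+	[verify-phase]
+	filename=/data/fio-state-test
+	rw=read
+	verify=crc32c
+	verify_state_load=1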
+
+Log File Formats
+----------------
+
+Fio supports a variety of log file formats, for logging latencies, bandwidth,
+and IOPS. The logs share a common format, which looks like this:
+
+    *time* (`msec`), *value*, *data direction*, *offset*
+
+Time for the log entry is always in milliseconds. The *value* logged depends
+on the type of log, it will be one of the following:
+
+    **Latency log**
+		Value is latency in usecs
+    **Bandwidth log**
+		Value is in KiB/sec
+    **IOPS log**
+		Value is IOPS
+
+*Data direction* is one of the following:
+
+	**0**
+		I/O is a READ
+	**1**
+		I/O is a WRITE
+	**2**
+		I/O is a TRIM
+
+The *offset* is the offset, in bytes, from the start of the file, for that
+particular I/O. The logging of the offset can be toggled with
+:option:`log_offset`.
+
+If windowed logging is enabled through :option:`log_avg_msec`, then fio doesn't
+log individual I/Os. Instead it logs the average values over the specified
+period of time. Since 'data direction' and 'offset' are per-I/O values, they
+aren't applicable if windowed logging is enabled. If windowed logging is enabled
+and :option:`log_max_value` is set, then fio logs maximum values in that window
+instead of averages.
+
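+For example, a single entry in a completion latency log of the form described
+above might read (the values are purely illustrative)::
+
+	16, 512, 0, 4096
+
+i.e., an I/O logged 16 msec into the run, with a 512 usec latency, in the read
+direction (0), at byte offset 4096.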
+
+Client/server
+-------------
+
+Normally fio is invoked as a stand-alone application on the machine where the
+I/O workload should be generated. However, the frontend and backend of fio can
+be run separately, i.e., the fio server can generate an I/O workload on the
+"Device Under Test" while being controlled from another machine.
+
+Start the server on the machine which has access to the storage DUT::
+
+	fio --server=args
+
+where args defines what fio listens to. The arguments are of the form
+``type,hostname`` or ``IP,port``. *type* is either ``ip`` (or ip4) for TCP/IP
+v4, ``ip6`` for TCP/IP v6, or ``sock`` for a local unix domain socket.
+*hostname* is either a hostname or IP address, and *port* is the port to listen
+to (only valid for TCP/IP, not a local socket). Some examples:
+
+1) ``fio --server``
+
+   Start a fio server, listening on all interfaces on the default port (8765).
+
+2) ``fio --server=ip:hostname,4444``
+
+   Start a fio server, listening on IP belonging to hostname and on port 4444.
+
+3) ``fio --server=ip6:::1,4444``
+
+   Start a fio server, listening on IPv6 localhost ::1 and on port 4444.
+
+4) ``fio --server=,4444``
+
+   Start a fio server, listening on all interfaces on port 4444.
+
+5) ``fio --server=1.2.3.4``
+
+   Start a fio server, listening on IP 1.2.3.4 on the default port.
+
+6) ``fio --server=sock:/tmp/fio.sock``
+
+   Start a fio server, listening on the local socket /tmp/fio.sock.
+
+Once a server is running, a "client" can connect to the fio server with::
+
+	fio <local-args> --client=<server> <remote-args> <job file(s)>
+
+where `local-args` are arguments for the client where it is running, `server`
+is the connect string, and `remote-args` and `job file(s)` are sent to the
+server. The `server` string follows the same format as it does on the server
+side, to allow IP/hostname/socket and port strings.
+
+Fio can connect to multiple servers this way::
+
+    fio --client=<server1> <job file(s)> --client=<server2> <job file(s)>
+
+If the job file is located on the fio server, then you can tell the server to
+load a local file as well. This is done by using :option:`--remote-config` ::
+
+   fio --client=server --remote-config /path/to/file.fio
+
+Then fio will open this local (to the server) job file instead of being passed
+one from the client.
+
+If you have many servers (example: 100 VMs/containers), you can input a pathname
+of a file containing host IPs/names as the parameter value for the
+:option:`--client` option.  For example, here is a :file:`host.list`
+file containing 2 hostnames::
+
+	host1.your.dns.domain
+	host2.your.dns.domain
+
+The fio command would then be::
+
+    fio --client=host.list <job file(s)>
+
+In this mode, you cannot input server-specific parameters or job files -- all
+servers receive the same job file.
+
+To allow ``fio --client`` runs from multiple hosts to use a shared filesystem,
+``fio --client`` now prepends the IP address of the server to the filename.
+For example, if fio is using directory :file:`/mnt/nfs/fio` and is
+writing filename :file:`fileio.tmp`, with a :option:`--client` `hostfile`
+containing two hostnames ``h1`` and ``h2`` with IP addresses 192.168.10.120 and
+192.168.10.121, then fio will create two files::
+
+	/mnt/nfs/fio/192.168.10.120.fileio.tmp
+	/mnt/nfs/fio/192.168.10.121.fileio.tmp
diff --git a/Makefile b/Makefile
index 52e515b..1f0f5d0 100644
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1,9 @@
+ifeq ($(SRCDIR),)
+SRCDIR := .
+endif
+
+VPATH := $(SRCDIR)
+
 ifneq ($(wildcard config-host.mak),)
 all:
 include config-host.mak
@@ -14,33 +20,38 @@
 include config-host.mak
 endif
 
-DEBUGFLAGS = -D_FORTIFY_SOURCE=2 -DFIO_INC_DEBUG
+DEBUGFLAGS = -DFIO_INC_DEBUG
 CPPFLAGS= -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -DFIO_INTERNAL $(DEBUGFLAGS)
-OPTFLAGS= -O3 -g -ffast-math
-CFLAGS	= -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS)
+OPTFLAGS= -g -ffast-math
+CFLAGS	= -std=gnu99 -Wwrite-strings -Wall -Wdeclaration-after-statement $(OPTFLAGS) $(EXTFLAGS) $(BUILD_CFLAGS) -I. -I$(SRCDIR)
 LIBS	+= -lm $(EXTLIBS)
 PROGS	= fio
-SCRIPTS = tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio
+SCRIPTS = $(addprefix $(SRCDIR)/,tools/fio_generate_plots tools/plot/fio2gnuplot tools/genfio tools/fiologparser.py tools/hist/fiologparser_hist.py)
+
+ifndef CONFIG_FIO_NO_OPT
+  CFLAGS += -O3 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2
+endif
 
 ifdef CONFIG_GFIO
   PROGS += gfio
 endif
 
-SOURCE := gettime.c ioengines.c init.c stat.c log.c time.c filesetup.c \
+SOURCE :=	$(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/crc/*.c)) \
+		$(patsubst $(SRCDIR)/%,%,$(wildcard $(SRCDIR)/lib/*.c)) \
+		gettime.c ioengines.c init.c stat.c log.c time.c filesetup.c \
 		eta.c verify.c memory.c io_u.c parse.c mutex.c options.c \
-		lib/rbtree.c smalloc.c filehash.c profile.c debug.c lib/rand.c \
-		lib/num2str.c lib/ieee754.c $(wildcard crc/*.c) engines/cpu.c \
+		smalloc.c filehash.c profile.c debug.c engines/cpu.c \
 		engines/mmap.c engines/sync.c engines/null.c engines/net.c \
-		memalign.c server.c client.c iolog.c backend.c libfio.c flow.c \
-		cconv.c lib/prio_tree.c json.c lib/zipf.c lib/axmap.c \
-		lib/lfsr.c gettime-thread.c helpers.c lib/flist_sort.c \
-		lib/hweight.c lib/getrusage.c idletime.c td_error.c \
+		engines/ftruncate.c \
+		server.c client.c iolog.c backend.c libfio.c flow.c cconv.c \
+		gettime-thread.c helpers.c json.c idletime.c td_error.c \
 		profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
-		lib/tp.c lib/bloom.c
+		workqueue.c rate-submit.c optgroup.c helper_thread.c \
+		steadystate.c
 
 ifdef CONFIG_LIBHDFS
   HDFSFLAGS= -I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE)
-  HDFSLIB= -Wl,-rpath $(JAVA_HOME)/jre/lib/amd64/server -L$(JAVA_HOME)/jre/lib/amd64/server -ljvm $(FIO_LIBHDFS_LIB)/libhdfs.a
+  HDFSLIB= -Wl,-rpath $(JAVA_HOME)/jre/lib/$(FIO_HDFS_CPU)/server -L$(JAVA_HOME)/jre/lib/$(FIO_HDFS_CPU)/server -ljvm $(FIO_LIBHDFS_LIB)/libhdfs.a
   CFLAGS += $(HDFSFLAGS)
   SOURCE += engines/libhdfs.c
 endif
@@ -88,16 +99,19 @@
   SOURCE += engines/rbd.c
 endif
 ifndef CONFIG_STRSEP
-  SOURCE += lib/strsep.c
+  SOURCE += oslib/strsep.c
 endif
 ifndef CONFIG_STRCASESTR
-  SOURCE += lib/strcasestr.c
+  SOURCE += oslib/strcasestr.c
+endif
+ifndef CONFIG_STRLCAT
+  SOURCE += oslib/strlcat.c
 endif
 ifndef CONFIG_GETOPT_LONG_ONLY
-  SOURCE += lib/getopt_long.c
+  SOURCE += oslib/getopt_long.c
 endif
 ifndef CONFIG_INET_ATON
-  SOURCE += lib/inet_aton.c
+  SOURCE += oslib/inet_aton.c
 endif
 ifdef CONFIG_GFAPI
   SOURCE += engines/glusterfs.c
@@ -107,16 +121,27 @@
     CFLAGS += "-DGFAPI_USE_FADVISE"
   endif
 endif
+ifdef CONFIG_MTD
+  SOURCE += engines/mtd.c
+  SOURCE += oslib/libmtd.c
+  SOURCE += oslib/libmtd_legacy.c
+endif
+ifdef CONFIG_PMEMBLK
+  SOURCE += engines/pmemblk.c
+endif
+ifdef CONFIG_LINUX_DEVDAX
+  SOURCE += engines/dev-dax.c
+endif
 
 ifeq ($(CONFIG_TARGET_OS), Linux)
   SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c engines/sg.c \
-		engines/binject.c lib/linux-dev-lookup.c
+		engines/binject.c oslib/linux-dev-lookup.c
   LIBS += -lpthread -ldl
   LDFLAGS += -rdynamic
 endif
 ifeq ($(CONFIG_TARGET_OS), Android)
   SOURCE += diskutil.c fifo.c blktrace.c trim.c profiles/tiobench.c \
-		lib/linux-dev-lookup.c
+		oslib/linux-dev-lookup.c
   LIBS += -ldl
   LDFLAGS += -rdynamic
 endif
@@ -125,6 +150,7 @@
   CPPFLAGS += -D__EXTENSIONS__
 endif
 ifeq ($(CONFIG_TARGET_OS), FreeBSD)
+  SOURCE += trim.c
   LIBS	 += -lpthread -lrt
   LDFLAGS += -rdynamic
 endif
@@ -137,6 +163,7 @@
   LDFLAGS += -rdynamic
 endif
 ifeq ($(CONFIG_TARGET_OS), DragonFly)
+  SOURCE += trim.c
   LIBS	 += -lpthread -lrt
   LDFLAGS += -rdynamic
 endif
@@ -153,13 +180,12 @@
   LIBS	 += -lpthread -ldl
 endif
 ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS)))
-  SOURCE := $(filter-out engines/mmap.c,$(SOURCE))
   SOURCE += os/windows/posix.c
   LIBS	 += -lpthread -lpsapi -lws2_32
   CFLAGS += -DPSAPI_VERSION=1 -Ios/windows/posix/include -Wno-format -static
 endif
 
-OBJS = $(SOURCE:.c=.o)
+OBJS := $(SOURCE:.c=.o)
 
 FIO_OBJS = $(OBJS) fio.o
 
@@ -174,7 +200,7 @@
 -include $(OBJS:.o=.d)
 
 T_SMALLOC_OBJS = t/stest.o
-T_SMALLOC_OBJS += gettime.o mutex.o smalloc.o t/log.o t/debug.o
+T_SMALLOC_OBJS += gettime.o mutex.o smalloc.o t/log.o t/debug.o t/arch.o
 T_SMALLOC_PROGS = t/stest
 
 T_IEEE_OBJS = t/ieee754.o
@@ -182,7 +208,8 @@
 T_IEEE_PROGS = t/ieee754
 
 T_ZIPF_OBS = t/genzipf.o
-T_ZIPF_OBJS += t/log.o lib/ieee754.o lib/rand.o lib/zipf.o t/genzipf.o
+T_ZIPF_OBJS += t/log.o lib/ieee754.o lib/rand.o lib/pattern.o lib/zipf.o \
+		lib/strntol.o lib/gauss.o t/genzipf.o oslib/strcasestr.o
 T_ZIPF_PROGS = t/fio-genzipf
 
 T_AXMAP_OBJS = t/axmap.o
@@ -190,36 +217,62 @@
 T_AXMAP_PROGS = t/axmap
 
 T_LFSR_TEST_OBJS = t/lfsr-test.o
-T_LFSR_TEST_OBJS += lib/lfsr.o gettime.o t/log.o t/debug.o
+T_LFSR_TEST_OBJS += lib/lfsr.o gettime.o t/log.o t/debug.o t/arch.o
 T_LFSR_TEST_PROGS = t/lfsr-test
 
+T_GEN_RAND_OBJS = t/gen-rand.o
+T_GEN_RAND_OBJS += t/log.o t/debug.o lib/rand.o lib/pattern.o lib/strntol.o \
+			oslib/strcasestr.o
+T_GEN_RAND_PROGS = t/gen-rand
+
 ifeq ($(CONFIG_TARGET_OS), Linux)
 T_BTRACE_FIO_OBJS = t/btrace2fio.o
-T_BTRACE_FIO_OBJS += fifo.o lib/flist_sort.o t/log.o lib/linux-dev-lookup.o
+T_BTRACE_FIO_OBJS += fifo.o lib/flist_sort.o t/log.o oslib/linux-dev-lookup.o
 T_BTRACE_FIO_PROGS = t/fio-btrace2fio
 endif
 
 T_DEDUPE_OBJS = t/dedupe.o
 T_DEDUPE_OBJS += lib/rbtree.o t/log.o mutex.o smalloc.o gettime.o crc/md5.o \
-		memalign.o lib/bloom.o t/debug.o crc/xxhash.o crc/murmur3.o \
-		crc/crc32c.o crc/crc32c-intel.o crc/fnv.o
+		lib/memalign.o lib/bloom.o t/debug.o crc/xxhash.o t/arch.o \
+		crc/murmur3.o crc/crc32c.o crc/crc32c-intel.o crc/crc32c-arm64.o crc/fnv.o
 T_DEDUPE_PROGS = t/fio-dedupe
 
+T_VS_OBJS = t/verify-state.o t/log.o crc/crc32c.o crc/crc32c-intel.o crc/crc32c-arm64.o t/debug.o
+T_VS_PROGS = t/fio-verify-state
+
+T_PIPE_ASYNC_OBJS = t/read-to-pipe-async.o
+T_PIPE_ASYNC_PROGS = t/read-to-pipe-async
+
+T_MEMLOCK_OBJS = t/memlock.o
+T_MEMLOCK_PROGS = t/memlock
+
 T_OBJS = $(T_SMALLOC_OBJS)
 T_OBJS += $(T_IEEE_OBJS)
 T_OBJS += $(T_ZIPF_OBJS)
 T_OBJS += $(T_AXMAP_OBJS)
 T_OBJS += $(T_LFSR_TEST_OBJS)
+T_OBJS += $(T_GEN_RAND_OBJS)
 T_OBJS += $(T_BTRACE_FIO_OBJS)
 T_OBJS += $(T_DEDUPE_OBJS)
+T_OBJS += $(T_VS_OBJS)
+T_OBJS += $(T_PIPE_ASYNC_OBJS)
+T_OBJS += $(T_MEMLOCK_OBJS)
+
+ifneq (,$(findstring CYGWIN,$(CONFIG_TARGET_OS)))
+    T_DEDUPE_OBJS += os/windows/posix.o lib/hweight.o
+    T_SMALLOC_OBJS += os/windows/posix.o lib/hweight.o
+    T_LFSR_TEST_OBJS += os/windows/posix.o lib/hweight.o
+endif
 
 T_TEST_PROGS = $(T_SMALLOC_PROGS)
 T_TEST_PROGS += $(T_IEEE_PROGS)
 T_PROGS += $(T_ZIPF_PROGS)
 T_TEST_PROGS += $(T_AXMAP_PROGS)
 T_TEST_PROGS += $(T_LFSR_TEST_PROGS)
+T_TEST_PROGS += $(T_GEN_RAND_PROGS)
 T_PROGS += $(T_BTRACE_FIO_PROGS)
 T_PROGS += $(T_DEDUPE_PROGS)
+T_PROGS += $(T_VS_PROGS)
 
 PROGS += $(T_PROGS)
 
@@ -238,7 +291,7 @@
 else
 	INSTALL = install
 endif
-prefix = /usr/local
+prefix = $(INSTALL_PREFIX)
 bindir = $(prefix)/bin
 
 ifeq ($(CONFIG_TARGET_OS), Darwin)
@@ -251,27 +304,32 @@
 
 all: $(PROGS) $(T_TEST_PROGS) $(SCRIPTS) FORCE
 
-.PHONY: all install clean
+.PHONY: all install clean test
 .PHONY: FORCE cscope
 
 FIO-VERSION-FILE: FORCE
-	@$(SHELL) ./FIO-VERSION-GEN
+	@$(SHELL) $(SRCDIR)/FIO-VERSION-GEN
 -include FIO-VERSION-FILE
 
 override CFLAGS += -DFIO_VERSION='"$(FIO_VERSION)"'
 
 %.o : %.c
+	@mkdir -p $(dir $@)
 	$(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $<
-	@$(CC) -MM $(CFLAGS) $(CPPFLAGS) $*.c > $*.d
+	@$(CC) -MM $(CFLAGS) $(CPPFLAGS) $(SRCDIR)/$*.c > $*.d
 	@mv -f $*.d $*.d.tmp
 	@sed -e 's|.*:|$*.o:|' < $*.d.tmp > $*.d
-	@sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | fmt -1 | \
+	@sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | fmt -w 1 | \
 		sed -e 's/^ *//' -e 's/$$/:/' >> $*.d
 	@rm -f $*.d.tmp
 
 ifdef CONFIG_ARITHMETIC
 lex.yy.c: exp/expression-parser.l
-	$(QUIET_LEX)$(LEX) exp/expression-parser.l
+ifdef CONFIG_LEX_USE_O
+	$(QUIET_LEX)$(LEX) -o $@ $<
+else
+	$(QUIET_LEX)$(LEX) $<
+endif
 
 lex.yy.o: lex.yy.c y.tab.h
 	$(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $<
@@ -280,7 +338,7 @@
 	$(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $<
 
 y.tab.c: exp/expression-parser.y
-	$(QUIET_YACC)$(YACC) -l -d -b y exp/expression-parser.y
+	$(QUIET_YACC)$(YACC) -o $@ -l -d -b y $<
 
 y.tab.h: y.tab.c
 
@@ -294,35 +352,48 @@
 parse.o: lex.yy.o y.tab.o
 endif
 
-init.o: FIO-VERSION-FILE init.c
-	$(QUIET_CC)$(CC) -o init.o $(CFLAGS) $(CPPFLAGS) -c init.c
+init.o: init.c FIO-VERSION-FILE
+	@mkdir -p $(dir $@)
+	$(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $<
+	@$(CC) -MM $(CFLAGS) $(CPPFLAGS) $(SRCDIR)/$*.c > $*.d
+	@mv -f $*.d $*.d.tmp
+	@sed -e 's|.*:|$*.o:|' < $*.d.tmp > $*.d
+	@sed -e 's/.*://' -e 's/\\$$//' < $*.d.tmp | fmt -w 1 | \
+		sed -e 's/^ *//' -e 's/$$/:/' >> $*.d
+	@rm -f $*.d.tmp
 
 gcompat.o: gcompat.c gcompat.h
-	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c gcompat.c
+	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $<
 
 goptions.o: goptions.c goptions.h
-	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c goptions.c
+	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $<
 
 ghelpers.o: ghelpers.c ghelpers.h
-	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c ghelpers.c
+	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $<
 
 gerror.o: gerror.c gerror.h
-	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c gerror.c
+	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $<
 
 gclient.o: gclient.c gclient.h
-	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c gclient.c
+	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $<
 
 gfio.o: gfio.c ghelpers.c
-	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c gfio.c
+	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $<
 
 graph.o: graph.c graph.h
-	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c graph.c
+	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $<
 
 cairo_text_helpers.o: cairo_text_helpers.c cairo_text_helpers.h
-	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c cairo_text_helpers.c
+	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $<
 
 printing.o: printing.c printing.h
-	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c printing.c
+	$(QUIET_CC)$(CC) $(CFLAGS) $(GTK_CFLAGS) $(CPPFLAGS) -c $<
+
+t/read-to-pipe-async: $(T_PIPE_ASYNC_OBJS)
+	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_PIPE_ASYNC_OBJS) $(LIBS)
+
+t/memlock: $(T_MEMLOCK_OBJS)
+	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_MEMLOCK_OBJS) $(LIBS)
 
 t/stest: $(T_SMALLOC_OBJS)
 	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_SMALLOC_OBJS) $(LIBS)
@@ -345,6 +416,9 @@
 t/lfsr-test: $(T_LFSR_TEST_OBJS)
 	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_LFSR_TEST_OBJS) $(LIBS)
 
+t/gen-rand: $(T_GEN_RAND_OBJS)
+	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_GEN_RAND_OBJS) $(LIBS)
+
 ifeq ($(CONFIG_TARGET_OS), Linux)
 t/fio-btrace2fio: $(T_BTRACE_FIO_OBJS)
 	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_BTRACE_FIO_OBJS) $(LIBS)
@@ -353,11 +427,15 @@
 t/fio-dedupe: $(T_DEDUPE_OBJS)
 	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_DEDUPE_OBJS) $(LIBS)
 
+t/fio-verify-state: $(T_VS_OBJS)
+	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_VS_OBJS) $(LIBS)
+
 clean: FORCE
-	@rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio FIO-VERSION-FILE *.d lib/*.d crc/*.d engines/*.d profiles/*.d t/*.d config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h
+	@rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio FIO-VERSION-FILE *.d lib/*.d oslib/*.d crc/*.d engines/*.d profiles/*.d t/*.d config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h
+	@rm -rf  doc/output
 
 distclean: clean FORCE
-	@rm -f cscope.out fio.pdf fio_generate_plots.pdf fio2gnuplot.pdf
+	@rm -f cscope.out fio.pdf fio_generate_plots.pdf fio2gnuplot.pdf fiologparser_hist.pdf
 
 cscope:
 	@cscope -b -R
@@ -369,13 +447,18 @@
 	@man -t ./fio.1 | ps2pdf - fio.pdf
 	@man -t tools/fio_generate_plots.1 | ps2pdf - fio_generate_plots.pdf
 	@man -t tools/plot/fio2gnuplot.1 | ps2pdf - fio2gnuplot.pdf
+	@man -t tools/hist/fiologparser_hist.py.1 | ps2pdf - fiologparser_hist.pdf
+
+test: fio
+	./fio --minimal --thread --ioengine=null --runtime=1s --name=nulltest --rw=randrw --iodepth=2 --norandommap --random_generator=tausworthe64 --size=16T --name=verifynulltest --rw=write --verify=crc32c --verify_state_save=0 --size=100M
 
 install: $(PROGS) $(SCRIPTS) tools/plot/fio2gnuplot.1 FORCE
 	$(INSTALL) -m 755 -d $(DESTDIR)$(bindir)
 	$(INSTALL) $(PROGS) $(SCRIPTS) $(DESTDIR)$(bindir)
 	$(INSTALL) -m 755 -d $(DESTDIR)$(mandir)/man1
-	$(INSTALL) -m 644 fio.1 $(DESTDIR)$(mandir)/man1
-	$(INSTALL) -m 644 tools/fio_generate_plots.1 $(DESTDIR)$(mandir)/man1
-	$(INSTALL) -m 644 tools/plot/fio2gnuplot.1 $(DESTDIR)$(mandir)/man1
+	$(INSTALL) -m 644 $(SRCDIR)/fio.1 $(DESTDIR)$(mandir)/man1
+	$(INSTALL) -m 644 $(SRCDIR)/tools/fio_generate_plots.1 $(DESTDIR)$(mandir)/man1
+	$(INSTALL) -m 644 $(SRCDIR)/tools/plot/fio2gnuplot.1 $(DESTDIR)$(mandir)/man1
+	$(INSTALL) -m 644 $(SRCDIR)/tools/hist/fiologparser_hist.py.1 $(DESTDIR)$(mandir)/man1
 	$(INSTALL) -m 755 -d $(DESTDIR)$(sharedir)
-	$(INSTALL) -m 644 tools/plot/*gpm $(DESTDIR)$(sharedir)/
+	$(INSTALL) -m 644 $(SRCDIR)/tools/plot/*gpm $(DESTDIR)$(sharedir)/
diff --git a/README b/README
index 18d1c4f..951550b 100644
--- a/README
+++ b/README
@@ -1,18 +1,31 @@
-fio
----
+Overview and history
+--------------------
 
-fio is a tool that will spawn a number of threads or processes doing a
-particular type of io action as specified by the user. fio takes a
-number of global parameters, each inherited by the thread unless
-otherwise parameters given to them overriding that setting is given.
-The typical use of fio is to write a job file matching the io load
-one wants to simulate.
+Fio was originally written to save me the hassle of writing special test case
+programs when I wanted to test a specific workload, either for performance
+reasons or to find/reproduce a bug. The process of writing such a test app can
+be tiresome, especially if you have to do it often.  Hence I needed a tool that
+would be able to simulate a given I/O workload without resorting to writing a
+tailored test case again and again.
+
+A test work load is difficult to define, though. There can be any number of
+processes or threads involved, and they can each be using their own way of
+generating I/O. You could have someone dirtying large amounts of memory in a
+memory mapped file, or maybe several threads issuing reads using asynchronous
+I/O. fio needed to be flexible enough to simulate both of these cases, and many
+more.
+
+Fio spawns a number of threads or processes doing a particular type of I/O
+action as specified by the user. fio takes a number of global parameters, each
+inherited by the thread unless other parameters are given that override the
+setting.  The typical use of fio is to write a job file matching
+the I/O load one wants to simulate.
 
 
 Source
 ------
 
-fio resides in a git repo, the canonical place is:
+Fio resides in a git repo; the canonical place is:
 
 	git://git.kernel.dk/fio.git
 
@@ -21,62 +34,36 @@
 
 	http://git.kernel.dk/fio.git
 
-Snapshots are frequently generated and include the git meta data as well.
+Snapshots are frequently generated and the :file:`fio-git-*.tar.gz` tarballs
+include the git meta data as well. Other tarballs are archives of official fio
+releases.
 Snapshots can be downloaded from:
 
 	http://brick.kernel.dk/snaps/
 
-There are also two official mirrors. Both of these are synced within
-an hour of commits landing at git.kernel.dk. So if the main repo is
-down for some reason, either one of those is safe to use:
+There are also two official mirrors. Both of these are automatically synced with
+the main repository when changes are pushed. If the main repo is down for some
+reason, either one of these is safe to use as a backup:
 
 	git://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
+
 	https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
 
 or
 
+	git://github.com/axboe/fio.git
+
 	https://github.com/axboe/fio.git
 
 
-Binary packages
----------------
-
-Debian:
-Starting with Debian "Squeeze", fio packages are part of the official
-Debian repository. http://packages.debian.org/search?keywords=fio
-
-Ubuntu:
-Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part
-of the Ubuntu "universe" repository.
-http://packages.ubuntu.com/search?keywords=fio
-
-Red Hat, CentOS & Co:
-Dag Wieërs has RPMs for Red Hat related distros, find them here:
-http://dag.wieers.com/rpm/packages/fio/
-
-Mandriva:
-Mandriva has integrated fio into their package repository, so installing
-on that distro should be as easy as typing 'urpmi fio'.
-
-Solaris:
-Packages for Solaris are available from OpenCSW. Install their pkgutil
-tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via
-'pkgutil -i fio'.
-
-Windows:
-Bruce Cran <bruce@cran.org.uk> has fio packages for Windows at
-http://www.bluestop.org/fio/ .
-
-
 Mailing list
 ------------
 
 The fio project mailing list is meant for anything related to fio including
 general discussion, bug reporting, questions, and development.
 
-An automated mail detailing recent commits is automatically sent to the
-list at most daily. The list address is fio@vger.kernel.org, subscribe
-by sending an email to majordomo@vger.kernel.org with
+An automated mail detailing recent commits is automatically sent to the list at
+most daily. The list address is fio@vger.kernel.org, subscribe by sending an
+email to majordomo@vger.kernel.org with
 
 	subscribe fio
 
@@ -89,262 +76,152 @@
 	http://maillist.kernel.dk/fio-devel/
 
 
+Author
+------
+
+Fio was written by Jens Axboe <axboe@kernel.dk> to enable flexible testing of
+the Linux I/O subsystem and schedulers. He got tired of writing specific test
+applications to simulate a given workload, and found that the existing I/O
+benchmark/test tools out there weren't flexible enough to do what he wanted.
+
+Jens Axboe <axboe@kernel.dk> 20060905
+
+
+Binary packages
+---------------
+
+Debian:
+	Starting with Debian "Squeeze", fio packages are part of the official
+	Debian repository. http://packages.debian.org/search?keywords=fio .
+
+Ubuntu:
+	Starting with Ubuntu 10.04 LTS (aka "Lucid Lynx"), fio packages are part
+	of the Ubuntu "universe" repository.
+	http://packages.ubuntu.com/search?keywords=fio .
+
+Red Hat, Fedora, CentOS & Co:
+	Starting with Fedora 9/Extra Packages for Enterprise Linux 4, fio
+	packages are part of the Fedora/EPEL repositories.
+	https://admin.fedoraproject.org/pkgdb/package/rpms/fio/ .
+
+Mandriva:
+	Mandriva has integrated fio into their package repository, so installing
+	on that distro should be as easy as typing ``urpmi fio``.
+
+Solaris:
+	Packages for Solaris are available from OpenCSW. Install their pkgutil
+	tool (http://www.opencsw.org/get-it/pkgutil/) and then install fio via
+	``pkgutil -i fio``.
+
+Windows:
+	Rebecca Cran <rebecca+fio@bluestop.org> has fio packages for Windows at
+	http://www.bluestop.org/fio/ .
+
+BSDs:
+	Packages for BSDs may be available from their binary package repositories.
+	Look for a package "fio" using their binary package managers.
+
+
 Building
 --------
 
-Just type 'configure', 'make' and 'make install'.
+Just type::
 
-Note that GNU make is required. On BSD it's available from devel/gmake;
-on Solaris it's in the SUNWgmake package. On platforms where GNU make
-isn't the default, type 'gmake' instead of 'make'.
+ $ ./configure
+ $ make
+ $ make install
 
-Configure will print the enabled options. Note that on Linux based
-platforms, the libaio development packages must be installed to use
-the libaio engine. Depending on distro, it is usually called
-libaio-devel or libaio-dev.
+Note that GNU make is required. On BSDs it's available from devel/gmake in the
+ports directory; on Solaris it's in the SUNWgmake package.  On platforms where
+GNU make isn't the default, type ``gmake`` instead of ``make``.
+
+Configure will print the enabled options. Note that on Linux based platforms,
+the libaio development packages must be installed to use the libaio
+engine. Depending on distro, it is usually called libaio-devel or libaio-dev.
 
 For gfio, gtk 2.18 (or newer), associated glib threads, and cairo are required
-to be installed.  gfio isn't built automatically and can be enabled
-with a --enable-gfio option to configure.
+to be installed.  gfio isn't built automatically and can be enabled with a
+``--enable-gfio`` option to configure.
 
-To build FIO with a cross-compiler:
+To build fio with a cross-compiler::
+
  $ make clean
  $ make CROSS_COMPILE=/path/to/toolchain/prefix
+
 Configure will attempt to determine the target platform automatically.
 
-It's possible to build fio for ESX as well, use the --esx switch to
+It's possible to build fio for ESX as well, use the ``--esx`` switch to
 configure.
 
 
 Windows
--------
+~~~~~~~
 
-On Windows, Cygwin (http://www.cygwin.com/) is required in order to
-build fio. To create an MSI installer package install WiX 3.8 from
-http://wixtoolset.org and run dobuild.cmd from the
-os/windows directory.
+On Windows, Cygwin (http://www.cygwin.com/) is required in order to build
+fio. To create an MSI installer package install WiX 3.8 from
+http://wixtoolset.org and run :file:`dobuild.cmd` from the :file:`os/windows`
+directory.
 
 How to compile fio on 64-bit Windows:
 
- 1. Install Cygwin (http://www.cygwin.com/). Install 'make' and all
-    packages starting with 'mingw64-i686' and 'mingw64-x86_64'.
+ 1. Install Cygwin (http://www.cygwin.com/). Install **make** and all
+    packages starting with **mingw64-i686** and **mingw64-x86_64**.
  2. Open the Cygwin Terminal.
  3. Go to the fio directory (source files).
- 4. Run 'make clean && make -j'.
+ 4. Run ``make clean && make -j``.
 
-To build fio on 32-bit Windows, run './configure --build-32bit-win' before 'make'.
+To build fio on 32-bit Windows, run ``./configure --build-32bit-win`` before
+``make``.
 
-It's recommended that once built or installed, fio be run in a Command Prompt
-or other 'native' console such as console2, since there are known to be display
-and signal issues when running it under a Cygwin shell
-(see http://code.google.com/p/mintty/issues/detail?id=56 for details).
+It's recommended that once built or installed, fio be run in a Command Prompt or
+other 'native' console such as console2, since there are known to be display and
+signal issues when running it under a Cygwin shell (see
+http://code.google.com/p/mintty/issues/detail?id=56 for details).
 
 
-Command line
-------------
+Documentation
+~~~~~~~~~~~~~
 
-$ fio
-	--debug			Enable some debugging options (see below)
-	--parse-only		Parse options only, don't start any IO
-	--output		Write output to file
-	--runtime		Runtime in seconds
-	--bandwidth-log		Generate per-job bandwidth logs
-	--minimal		Minimal (terse) output
-	--output-format=type	Output format (terse,json,normal)
-	--terse-version=type	Terse version output format (default 3, or 2 or 4).
-	--version		Print version info and exit
-	--help			Print this page
-	--cpuclock-test		Perform test/validation of CPU clock
-	--crctest[=test]	Test speed of checksum functions
-	--cmdhelp=cmd		Print command help, "all" for all of them
-	--enghelp=engine	Print ioengine help, or list available ioengines
-	--enghelp=engine,cmd	Print help for an ioengine cmd
-	--showcmd		Turn a job file into command line options
-	--readonly		Turn on safety read-only checks, preventing
-				writes
-	--eta=when		When ETA estimate should be printed
-				May be "always", "never" or "auto"
-	--eta-newline=time	Force a new line for every 'time' period passed
-	--status-interval=t	Force full status dump every 't' period passed
-	--section=name		Only run specified section in job file.
-				Multiple sections can be specified.
-	--alloc-size=kb		Set smalloc pool to this size in kb (def 1024)
-	--warnings-fatal	Fio parser warnings are fatal
-	--max-jobs		Maximum number of threads/processes to support
-	--server=args		Start backend server. See Client/Server section.
-	--client=host		Connect to specified backend.
-	--remote-config=file	Tell fio server to load this local file
-	--idle-prof=option	Report cpu idleness on a system or percpu basis
-				(option=system,percpu) or run unit work
-				calibration only (option=calibrate).
-	--inflate-log=log	Inflate and output compressed log
+Fio uses Sphinx_ to generate documentation from the reStructuredText_ files.
+To build HTML formatted documentation run ``make -C doc html`` and direct your
+browser to :file:`./doc/output/html/index.html`.  To build the manual page run
+``make -C doc man`` and then ``man doc/output/man/fio.1``.  To see what other
+output formats are supported run ``make -C doc help``.
 
-
-Any parameters following the options will be assumed to be job files,
-unless they match a job file parameter. Multiple job files can be listed 
-and each job file will be regarded as a separate group. fio will stonewall
-execution between each group.
-
-The --readonly option is an extra safety guard to prevent users from
-accidentally starting a write workload when that is not desired.  Fio
-will only write if rw=write/randwrite/rw/randrw is given.  This extra
-safety net can be used as an extra precaution as --readonly will also
-enable a write check in the io engine core to prevent writes due to
-unknown user space bug(s).
-
-The --debug option triggers additional logging by fio.
-Currently, additional logging is available for:
-
-	process		Dump info related to processes
-	file		Dump info related to file actions
-	io		Dump info related to IO queuing
-	mem		Dump info related to memory allocations
-	blktrace	Dump info related to blktrace setup
-	verify		Dump info related to IO verification
-	all		Enable all debug options
-	random		Dump info related to random offset generation
-	parse		Dump info related to option matching and parsing
-	diskutil	Dump info related to disk utilization updates
-	job:x		Dump info only related to job number x
-	mutex		Dump info only related to mutex up/down ops
-	profile		Dump info related to profile extensions
-	time		Dump info related to internal time keeping
-	net		Dump info related to networking connections
-	rate		Dump info related to IO rate switching
-	compress	Dump info related to log compress/decompress
-	? or help	Show available debug options.
-
-One can specify multiple debug options: e.g. --debug=file,mem will enable
-file and memory debugging.
-
-The --section option allows one to combine related jobs into one file.
-E.g. one job file could define light, moderate, and heavy sections. Tell fio to
-run only the "heavy" section by giving --section=heavy command line option.
-One can also specify the "write" operations in one section and "verify"
-operation in another section.  The --section option only applies to job
-sections.  The reserved 'global' section is always parsed and used.
-
-The --alloc-size switch allows one to use a larger pool size for smalloc.
-If running large jobs with randommap enabled, fio can run out of memory.
-Smalloc is an internal allocator for shared structures from a fixed size
-memory pool. The pool size defaults to 1024k and can grow to 128 pools.
-
-NOTE: While running .fio_smalloc.* backing store files are visible in /tmp.
-
-
-Job file
---------
-
-See the HOWTO file for a complete description of job file syntax and
-parameters.  The --cmdhelp option also lists all options. If used with
-an option argument, --cmdhelp will detail the given option.  The job file
-format is in the ini style format, as that is easy for the user to review
-and modify.
-
-This README contains the terse version. Job files can describe big and
-complex setups that are not possible with the command line.  Job files
-are a good practice even for simple jobs since the file provides an
-easily accessed record of the workload and can include comments.
-
-See the examples/ directory for inspiration on how to write job files.  Note
-the copyright and license requirements currently apply to examples/ files.
-
-
-Client/server
-------------
-
-Normally fio is invoked as a stand-alone application on the machine
-where the IO workload should be generated. However, the frontend and
-backend of fio can be run separately. Ie the fio server can generate
-an IO workload on the "Device Under Test" while being controlled from
-another machine.
-
-Start the server on the machine which has access to the storage DUT:
-
-fio --server=args
-
-where args defines what fio listens to. The arguments are of the form
-'type,hostname or IP,port'. 'type' is either 'ip' (or ip4) for TCP/IP v4,
-'ip6' for TCP/IP v6, or 'sock' for a local unix domain socket.
-'hostname' is either a hostname or IP address, and 'port' is the port to
-listen to (only valid for TCP/IP, not a local socket). Some examples:
-
-1) fio --server
-
-   Start a fio server, listening on all interfaces on the default port (8765).
-
-2) fio --server=ip:hostname,4444
-
-   Start a fio server, listening on IP belonging to hostname and on port 4444.
-
-3) fio --server=ip6:::1,4444
-
-   Start a fio server, listening on IPv6 localhost ::1 and on port 4444.
-
-4) fio --server=,4444
-
-   Start a fio server, listening on all interfaces on port 4444.
-
-5) fio --server=1.2.3.4
-
-   Start a fio server, listening on IP 1.2.3.4 on the default port.
-
-6) fio --server=sock:/tmp/fio.sock
-
-   Start a fio server, listening on the local socket /tmp/fio.sock.
-
-Once a server is running, a "client" can connect to the fio server with:
-
-fio --local-args --client=<server> --remote-args <job file(s)>
-
-where --local-args are arguments for the client where it is
-running, 'server' is the connect string, and --remote-args and <job file(s)>
-are sent to the server. The 'server' string follows the same format as it
-does on the server side, to allow IP/hostname/socket and port strings.
-
-Fio can connect to multiple servers this way:
-
-fio --client=<server1> <job file(s)> --client=<server2> <job file(s)>
-
-If the job file is located on the fio server, then you can tell the server
-to load a local file as well. This is done by using --remote-config:
-
-fio --client=server --remote-config /path/to/file.fio
-
-Then the fio server will open this local (to the server) job file instead
-of being passed one from the client.
+.. _reStructuredText: http://www.sphinx-doc.org/rest.html
+.. _Sphinx: http://www.sphinx-doc.org
 
 
 Platforms
 ---------
 
 Fio works on (at least) Linux, Solaris, AIX, HP-UX, OSX, NetBSD, OpenBSD,
-Windows and FreeBSD.  Some features and/or options may only be available on
-some of the platforms, typically because those features only apply to that
-platform (like the solarisaio engine, or the splice engine on Linux).
+Windows, FreeBSD, and DragonFly. Some features and/or options may only be
+available on some of the platforms, typically because those features only apply
+to that platform (like the solarisaio engine, or the splice engine on Linux).
 
 Some features are not available on FreeBSD/Solaris even if they could be
-implemented, I'd be happy to take patches for that. An example of that is
-disk utility statistics and (I think) huge page support, support for that
-does exist in FreeBSD/Solaris.
+implemented; I'd be happy to take patches for that. An example of that is disk
+utility statistics and (I think) huge page support, support for that does exist
+in FreeBSD/Solaris.
 
-Fio uses pthread mutexes for signalling and locking and FreeBSD does not
-support process shared pthread mutexes. As a result, only threads are
-supported on FreeBSD. This could be fixed with sysv ipc locking or
-other locking alternatives.
+Fio uses pthread mutexes for signalling and locking and some platforms do not
+support process shared pthread mutexes. As a result, on such platforms only
+threads are supported. This could be fixed with sysv ipc locking or other
+locking alternatives.
 
-Other *BSD platforms are untested, but fio should work there almost out
-of the box. Since I don't do test runs or even compiles on those platforms,
-your mileage may vary. Sending me patches for other platforms is greatly
+Other \*BSD platforms are untested, but fio should work there almost out of the
+box. Since I don't do test runs or even compiles on those platforms, your
+mileage may vary. Sending me patches for other platforms is greatly
 appreciated. There's a lot of value in having the same test/benchmark tool
 available on all platforms.
 
-Note that POSIX aio is not enabled by default on AIX. Messages like these:
+Note that POSIX aio is not enabled by default on AIX. Messages like these::
 
     Symbol resolution failed for /usr/lib/libc.a(posix_aio.o) because:
         Symbol _posix_kaio_rdwr (number 2) is not exported from dependent module /unix.
 
-indicate one needs to enable POSIX aio. Run the following commands as root:
+indicate one needs to enable POSIX aio. Run the following commands as root::
 
     # lsdev -C -l posix_aio0
         posix_aio0 Defined  Posix Asynchronous I/O
@@ -352,20 +229,41 @@
     # lsdev -C -l posix_aio0
         posix_aio0 Available  Posix Asynchronous I/O
 
-POSIX aio should work now. To make the change permanent:
+POSIX aio should work now. To make the change permanent::
 
     # chdev -l posix_aio0 -P -a autoconfig='available'
         posix_aio0 changed
 
 
-Author
-------
+Running fio
+-----------
 
-Fio was written by Jens Axboe <axboe@kernel.dk> to enable flexible testing
-of the Linux IO subsystem and schedulers. He got tired of writing
-specific test applications to simulate a given workload, and found that
-the existing io benchmark/test tools out there weren't flexible enough
-to do what he wanted.
+Running fio is normally the easiest part - you just give it the job file
+(or job files) as parameters::
 
-Jens Axboe <axboe@kernel.dk> 20060905
+	$ fio [options] [jobfile] ...
 
+and it will start doing what the *jobfile* tells it to do. You can give more
+than one job file on the command line; fio will serialize the running of those
+files. Internally that is the same as using the :option:`stonewall` parameter
+described in the parameter section.
+
+If the job file contains only one job, you may as well just give the parameters
+on the command line. The command line parameters are identical to the job
+parameters, with a few extra that control global parameters.  For example, for
+the job file parameter :option:`iodepth=2 <iodepth>`, the mirror command line
+option would be :option:`--iodepth 2 <iodepth>` or :option:`--iodepth=2
+<iodepth>`. You can also use the command line for giving more than one job
+entry. For each :option:`--name <name>` option that fio sees, it will start a
+new job with that name.  Command line entries following a
+:option:`--name <name>` entry will apply to that job, until there are no more
+entries or a new :option:`--name <name>` entry is seen. This is similar to the
+job file options, where each option applies to the current job until a new []
+job entry is seen.
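+
+As a sketch (the job names and parameters are only illustrative), the following
+starts two jobs directly from the command line, with ``ioengine`` and ``size``
+given as global parameters before the first :option:`--name <name>`::
+
+	$ fio --ioengine=null --size=32m --name=readers --rw=randread \
+		--name=writers --rw=randwrite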
+
+fio does not need to run as root, except if the files or devices specified in
+the job section require that. Some other options may also be restricted, such
+as memory locking, I/O scheduler switching, and decreasing the nice value.
+
+If *jobfile* is specified as ``-``, the job file will be read from standard
+input.
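+
+For example, a quick sketch of piping a job definition to fio on standard input
+(the parameters are only illustrative)::
+
+	$ printf '[stdin-job]\nioengine=null\nrw=read\nsize=16m\n' | fio -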
diff --git a/README.version b/README.version
index dd9b1fe..7c841d5 100644
--- a/README.version
+++ b/README.version
@@ -1,2 +1,2 @@
-URL: https://github.com/axboe/fio/archive/fio-2.1.9.tar.gz
-Version: 2.1.9
+URL: http://brick.kernel.dk/snaps/fio-2.20.tar.gz
+Version: 2.20
diff --git a/REPORTING-BUGS b/REPORTING-BUGS
index c6150d1..d8876ae 100644
--- a/REPORTING-BUGS
+++ b/REPORTING-BUGS
@@ -2,8 +2,10 @@
 ---------------
 
 If you notice anything that seems like a fio bug, please do send email
-to the list (fio@vger.kernel.org, see README) about it. You'll need
-to report at least:
+to the list (fio@vger.kernel.org, see README) about it. If you are not
+running the newest release of fio, upgrading first is recommended.
+
+When reporting a bug, you'll need to include:
 
 1) A description of what you think the bug is
 2) Environment (Linux distro version, kernel version). This is mostly
@@ -12,4 +14,8 @@
 4) How to reproduce. Please include a full list of the parameters
    passed to fio and the job file used (if any).
 
+A bug report can never have too much information. Any time information
+is left out and has to be asked for, it'll add to the turn-around time
+of getting to the bottom of it and committing a fix.
+
 That's it!
diff --git a/STEADYSTATE-TODO b/STEADYSTATE-TODO
new file mode 100644
index 0000000..e4b146e
--- /dev/null
+++ b/STEADYSTATE-TODO
@@ -0,0 +1,14 @@
+Known issues/TODO (for steady-state)
+
+- Allow user to specify the frequency of measurements
+
+- Better documentation for output
+
+- Report read, write, trim IOPS/BW separately
+
+- Semantics for the ring buffer ss->head are confusing. ss->head points
+  to the beginning of the buffer up through the point where the buffer
+  is filled for the first time. Afterwards, when a new element is added,
+  ss->head is advanced to point to the second element in the buffer. If
+  steady state is attained upon adding a new element, ss->head is not
+  advanced so it actually does point to the head of the buffer.
diff --git a/appveyor.yml b/appveyor.yml
new file mode 100644
index 0000000..7543393
--- /dev/null
+++ b/appveyor.yml
@@ -0,0 +1,27 @@
+clone_depth: 50
+environment:
+  MAKEFLAGS: -j 2
+  matrix:
+    - platform: x86_64
+      BUILD_ARCH: x64
+      CYG_ROOT: C:\cygwin64
+      CONFIGURE_OPTIONS:
+    - platform: x86
+      BUILD_ARCH: x86
+      CYG_ROOT: C:\cygwin
+      CONFIGURE_OPTIONS: --build-32bit-win
+
+build_script:
+  - SET PATH=%CYG_ROOT%\bin;%PATH%
+  - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && ./configure ${CONFIGURE_OPTIONS} && make.exe"'
+
+after_build:
+  - cd os\windows && dobuild.cmd %BUILD_ARCH%
+
+test_script:
+  - SET PATH=%CYG_ROOT%\bin;%PATH%
+  - 'bash.exe -lc "cd \"${APPVEYOR_BUILD_FOLDER}\" && file.exe fio.exe && make.exe test"'
+
+artifacts:
+  - path: os\windows\*.msi
+    name: msi
diff --git a/arch/arch-aarch64.h b/arch/arch-aarch64.h
index a6cfaf2..0912a86 100644
--- a/arch/arch-aarch64.h
+++ b/arch/arch-aarch64.h
@@ -8,11 +8,6 @@
 
 #define FIO_ARCH	(arch_aarch64)
 
-#ifndef __NR_ioprio_set
-#define __NR_ioprio_set		30
-#define __NR_ioprio_get		31
-#endif
-
 #define nop		do { __asm__ __volatile__ ("yield"); } while (0)
 #define read_barrier()	do { __sync_synchronize(); } while (0)
 #define write_barrier()	do { __sync_synchronize(); } while (0)
@@ -32,4 +27,8 @@
 
 #define ARCH_HAVE_FFZ
 
+#ifdef ARCH_HAVE_CRC_CRYPTO
+#define ARCH_HAVE_ARM64_CRC_CRYPTO
+#endif
+
 #endif
diff --git a/arch/arch-alpha.h b/arch/arch-alpha.h
index c0f784f..9318e15 100644
--- a/arch/arch-alpha.h
+++ b/arch/arch-alpha.h
@@ -3,21 +3,6 @@
 
 #define FIO_ARCH	(arch_alpha)
 
-#ifndef __NR_ioprio_set
-#define __NR_ioprio_set		442
-#define __NR_ioprio_get		443
-#endif
-
-#ifndef __NR_fadvise64
-#define __NR_fadvise64		413
-#endif
-
-#ifndef __NR_sys_splice
-#define __NR_sys_splice		468
-#define __NR_sys_tee		470
-#define __NR_sys_vmsplice	471
-#endif
-
 #define nop			do { } while (0)
 #define read_barrier()		__asm__ __volatile__("mb": : :"memory")
 #define write_barrier()		__asm__ __volatile__("wmb": : :"memory")
diff --git a/arch/arch-arm.h b/arch/arch-arm.h
index bab886e..31671fd 100644
--- a/arch/arch-arm.h
+++ b/arch/arch-arm.h
@@ -3,21 +3,6 @@
 
 #define FIO_ARCH	(arch_arm)
 
-#ifndef __NR_ioprio_set
-#define __NR_ioprio_set		314
-#define __NR_ioprio_get		315
-#endif
-
-#ifndef __NR_fadvise64
-#define __NR_fadvise64		270
-#endif
-
-#ifndef __NR_sys_splice
-#define __NR_sys_splice		340
-#define __NR_sys_tee		342
-#define __NR_sys_vmsplice	343
-#endif
-
 #if defined (__ARM_ARCH_4__) || defined (__ARM_ARCH_4T__) \
 	|| defined (__ARM_ARCH_5__) || defined (__ARM_ARCH_5T__) || defined (__ARM_ARCH_5E__)\
 	|| defined (__ARM_ARCH_5TE__) || defined (__ARM_ARCH_5TEJ__) \
@@ -25,7 +10,7 @@
 #define nop             __asm__ __volatile__("mov\tr0,r0\t@ nop\n\t")
 #define read_barrier()	__asm__ __volatile__ ("" : : : "memory")
 #define write_barrier()	__asm__ __volatile__ ("" : : : "memory")
-#elif defined(__ARM_ARCH_7A__)
+#elif defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_8A__)
 #define	nop		__asm__ __volatile__ ("nop")
 #define read_barrier()	__sync_synchronize()
 #define write_barrier()	__sync_synchronize()
diff --git a/arch/arch-hppa.h b/arch/arch-hppa.h
index c1c079e..eb4fc33 100644
--- a/arch/arch-hppa.h
+++ b/arch/arch-hppa.h
@@ -3,21 +3,6 @@
 
 #define FIO_ARCH	(arch_hppa)
 
-#ifndef __NR_ioprio_set
-#define __NR_ioprio_set		267
-#define __NR_ioprio_get		268
-#endif
-
-#ifndef __NR_fadvise64
-#define __NR_fadvise64		236
-#endif
-
-#ifndef __NR_sys_splice
-#define __NR_sys_splice		291
-#define __NR_sys_tee		293
-#define __NR_sys_vmsplice	294
-#endif
-
 #define nop	do { } while (0)
 
 #define read_barrier()	__asm__ __volatile__ ("" : : : "memory")
diff --git a/arch/arch-ia64.h b/arch/arch-ia64.h
index 8e8dd7f..53c049f 100644
--- a/arch/arch-ia64.h
+++ b/arch/arch-ia64.h
@@ -3,21 +3,6 @@
 
 #define FIO_ARCH	(arch_ia64)
 
-#ifndef __NR_ioprio_set
-#define __NR_ioprio_set		1274
-#define __NR_ioprio_get		1275
-#endif
-
-#ifndef __NR_fadvise64
-#define __NR_fadvise64		1234
-#endif
-
-#ifndef __NR_sys_splice
-#define __NR_sys_splice		1297
-#define __NR_sys_tee		1301
-#define __NR_sys_vmsplice	1302
-#endif
-
 #define nop		asm volatile ("hint @pause" ::: "memory");
 #define read_barrier()	asm volatile ("mf" ::: "memory")
 #define write_barrier()	asm volatile ("mf" ::: "memory")
diff --git a/arch/arch-mips.h b/arch/arch-mips.h
index 0b781d1..6f157fb 100644
--- a/arch/arch-mips.h
+++ b/arch/arch-mips.h
@@ -3,21 +3,6 @@
 
 #define FIO_ARCH	(arch_mips)
 
-#ifndef __NR_ioprio_set
-#define __NR_ioprio_set		314
-#define __NR_ioprio_get		315
-#endif
-
-#ifndef __NR_fadvise64
-#define __NR_fadvise64		215
-#endif
-
-#ifndef __NR_sys_splice
-#define __NR_sys_splice		263
-#define __NR_sys_tee		265
-#define __NR_sys_vmsplice	266
-#endif
-
 #define read_barrier()		__asm__ __volatile__("": : :"memory")
 #define write_barrier()		__asm__ __volatile__("": : :"memory")
 #define nop			__asm__ __volatile__("": : :"memory")
diff --git a/arch/arch-ppc.h b/arch/arch-ppc.h
index d4a080c..4a8aa97 100644
--- a/arch/arch-ppc.h
+++ b/arch/arch-ppc.h
@@ -8,21 +8,6 @@
 
 #define FIO_ARCH	(arch_ppc)
 
-#ifndef __NR_ioprio_set
-#define __NR_ioprio_set		273
-#define __NR_ioprio_get		274
-#endif
-
-#ifndef __NR_fadvise64
-#define __NR_fadvise64		233
-#endif
-
-#ifndef __NR_sys_splice
-#define __NR_sys_splice		283
-#define __NR_sys_tee		284
-#define __NR_sys_vmsplice	285
-#endif
-
 #define nop	do { } while (0)
 
 #ifdef __powerpc64__
@@ -33,18 +18,24 @@
 
 #define write_barrier()	__asm__ __volatile__ ("sync" : : : "memory")
 
+#ifdef __powerpc64__
+#define PPC_CNTLZL "cntlzd"
+#else
+#define PPC_CNTLZL "cntlzw"
+#endif
+
 static inline int __ilog2(unsigned long bitmask)
 {
 	int lz;
 
-	asm ("cntlzw %0,%1" : "=r" (lz) : "r" (bitmask));
-	return 31 - lz;
+	asm (PPC_CNTLZL " %0,%1" : "=r" (lz) : "r" (bitmask));
+	return BITS_PER_LONG - 1 - lz;
 }
 
 static inline int arch_ffz(unsigned long bitmask)
 {
 	if ((bitmask = ~bitmask) == 0)
-		return 32;
+		return BITS_PER_LONG;
 	return  __ilog2(bitmask & -bitmask);
 }
 
@@ -61,6 +52,21 @@
 #define SPRN_ATBL  0x20E /* Alternate Time Base Lower */
 #define SPRN_ATBU  0x20F /* Alternate Time Base Upper */
 
+#ifdef __powerpc64__
+static inline unsigned long long get_cpu_clock(void)
+{
+	unsigned long long rval;
+
+	asm volatile(
+		"90:	mfspr %0, %1;\n"
+		"	cmpwi %0,0;\n"
+		"	beq-  90b;\n"
+	: "=r" (rval)
+	: "i" (SPRN_TBRL));
+
+	return rval;
+}
+#else
 static inline unsigned long long get_cpu_clock(void)
 {
 	unsigned int tbl, tbu0, tbu1;
@@ -81,6 +87,7 @@
 	ret = (((unsigned long long)tbu0) << 32) | tbl;
 	return ret;
 }
+#endif
 
 #if 0
 static void atb_child(void)
@@ -130,4 +137,12 @@
  * #define ARCH_HAVE_CPU_CLOCK
  */
 
+/*
+ * Let's have it defined for ppc64
+ */
+
+#ifdef __powerpc64__
+#define ARCH_HAVE_CPU_CLOCK
+#endif
+
 #endif
diff --git a/arch/arch-s390.h b/arch/arch-s390.h
index cc7a1d1..2e84bf8 100644
--- a/arch/arch-s390.h
+++ b/arch/arch-s390.h
@@ -3,21 +3,6 @@
 
 #define FIO_ARCH	(arch_s390)
 
-#ifndef __NR_ioprio_set
-#define __NR_ioprio_set		282
-#define __NR_ioprio_get		283
-#endif
-
-#ifndef __NR_fadvise64
-#define __NR_fadvise64		253
-#endif
-
-#ifndef __NR_sys_splice
-#define __NR_sys_splice		306
-#define __NR_sys_tee		308
-#define __NR_sys_vmsplice	309
-#endif
-
 #define nop		asm volatile("nop" : : : "memory")
 #define read_barrier()	asm volatile("bcr 15,0" : : : "memory")
 #define write_barrier()	asm volatile("bcr 15,0" : : : "memory")
diff --git a/arch/arch-sh.h b/arch/arch-sh.h
index 9acbbbe..58ff226 100644
--- a/arch/arch-sh.h
+++ b/arch/arch-sh.h
@@ -5,21 +5,6 @@
 
 #define FIO_ARCH	(arch_sh)
 
-#ifndef __NR_ioprio_set
-#define __NR_ioprio_set	288
-#define __NR_ioprio_get	289
-#endif
-
-#ifndef __NR_fadvise64
-#define __NR_fadvise64	250
-#endif
-
-#ifndef __NR_sys_splice
-#define __NR_sys_splice		313
-#define __NR_sys_tee		315
-#define __NR_sys_vmsplice	316
-#endif
-
 #define nop             __asm__ __volatile__ ("nop": : :"memory")
 
 #define mb()								\
diff --git a/arch/arch-sparc.h b/arch/arch-sparc.h
index fe47b80..f82a1f2 100644
--- a/arch/arch-sparc.h
+++ b/arch/arch-sparc.h
@@ -3,21 +3,6 @@
 
 #define FIO_ARCH	(arch_sparc)
 
-#ifndef __NR_ioprio_set
-#define __NR_ioprio_set		196
-#define __NR_ioprio_get		218
-#endif
-
-#ifndef __NR_fadvise64
-#define __NR_fadvise64		209
-#endif
-
-#ifndef __NR_sys_splice
-#define __NR_sys_splice		232
-#define __NR_sys_tee		280
-#define __NR_sys_vmsplice	25
-#endif
-
 #define nop	do { } while (0)
 
 #define read_barrier()	__asm__ __volatile__ ("" : : : "memory")
diff --git a/arch/arch-sparc64.h b/arch/arch-sparc64.h
index e793ae5..80c697b 100644
--- a/arch/arch-sparc64.h
+++ b/arch/arch-sparc64.h
@@ -3,21 +3,6 @@
 
 #define FIO_ARCH	(arch_sparc64)
 
-#ifndef __NR_ioprio_set
-#define __NR_ioprio_set		196
-#define __NR_ioprio_get		218
-#endif
-
-#ifndef __NR_fadvise64
-#define __NR_fadvise64		209
-#endif
-
-#ifndef __NR_sys_splice
-#define __NR_sys_splice		232
-#define __NR_sys_tee		280
-#define __NR_sys_vmsplice	25
-#endif
-
 #define nop	do { } while (0)
 
 #define membar_safe(type) \
diff --git a/arch/arch-x86-common.h b/arch/arch-x86-common.h
index 31aa79f..cbf66b8 100644
--- a/arch/arch-x86-common.h
+++ b/arch/arch-x86-common.h
@@ -15,8 +15,9 @@
 #define ARCH_HAVE_INIT
 
 extern int tsc_reliable;
+extern int arch_random;
 
-static inline int arch_init_intel(unsigned int level)
+static inline void arch_init_intel(unsigned int level)
 {
 	unsigned int eax, ebx, ecx = 0, edx;
 
@@ -26,47 +27,51 @@
 	eax = 1;
 	do_cpuid(&eax, &ebx, &ecx, &edx);
 	if (!(edx & (1U << 4)))
-		return 0;
+		return;
 
 	/*
 	 * Check for constant rate and synced (across cores) TSC
 	 */
 	eax = 0x80000007;
 	do_cpuid(&eax, &ebx, &ecx, &edx);
-	return edx & (1U << 8);
+	tsc_reliable = (edx & (1U << 8)) != 0;
+
+	/*
+	 * Check for FDRAND
+	 */
+	eax = 0x1;
+	do_cpuid(&eax, &ebx, &ecx, &edx);
+	arch_random = (ecx & (1U << 30)) != 0;
 }
 
-static inline int arch_init_amd(unsigned int level)
+static inline void arch_init_amd(unsigned int level)
 {
 	unsigned int eax, ebx, ecx, edx;
 
 	cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
 	if (eax < 0x80000007)
-		return 0;
+		return;
 
 	cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
-	if (edx & (1 << 8))
-		return 1;
-
-	return 0;
+	tsc_reliable = (edx & (1U << 8)) != 0;
 }
 
-static inline int arch_init(char *envp[])
+static inline void arch_init(char *envp[])
 {
 	unsigned int level;
 	char str[13];
 
+	arch_random = tsc_reliable = 0;
+
 	cpuid(0, &level, (unsigned int *) &str[0],
 			 (unsigned int *) &str[8],
 			 (unsigned int *) &str[4]);
 
 	str[12] = '\0';
 	if (!strcmp(str, "GenuineIntel"))
-		tsc_reliable = arch_init_intel(level);
+		arch_init_intel(level);
 	else if (!strcmp(str, "AuthenticAMD"))
-		tsc_reliable = arch_init_amd(level);
-
-	return 0;
+		arch_init_amd(level);
 }
 
 #endif
diff --git a/arch/arch-x86.h b/arch/arch-x86.h
index 385a912..457b44c 100644
--- a/arch/arch-x86.h
+++ b/arch/arch-x86.h
@@ -12,22 +12,7 @@
 
 #include "arch-x86-common.h"
 
-#define FIO_ARCH	(arch_i386)
-
-#ifndef __NR_ioprio_set
-#define __NR_ioprio_set		289
-#define __NR_ioprio_get		290
-#endif
-
-#ifndef __NR_fadvise64
-#define __NR_fadvise64		250
-#endif
-
-#ifndef __NR_sys_splice
-#define __NR_sys_splice		313
-#define __NR_sys_tee		315
-#define __NR_sys_vmsplice	316
-#endif
+#define FIO_ARCH	(arch_x86)
 
 #define	FIO_HUGE_PAGE		4194304
 
diff --git a/arch/arch-x86_64.h b/arch/arch-x86_64.h
index 61ac75e..e686d10 100644
--- a/arch/arch-x86_64.h
+++ b/arch/arch-x86_64.h
@@ -1,5 +1,5 @@
-#ifndef ARCH_X86_64_h
-#define ARCH_X86_64_h
+#ifndef ARCH_X86_64_H
+#define ARCH_X86_64_H
 
 static inline void do_cpuid(unsigned int *eax, unsigned int *ebx,
 			    unsigned int *ecx, unsigned int *edx)
@@ -14,28 +14,6 @@
 
 #define FIO_ARCH	(arch_x86_64)
 
-#ifndef __NR_ioprio_set
-#define __NR_ioprio_set		251
-#define __NR_ioprio_get		252
-#endif
-
-#ifndef __NR_fadvise64
-#define __NR_fadvise64		221
-#endif
-
-#ifndef __NR_sys_splice
-#define __NR_sys_splice		275
-#define __NR_sys_tee		276
-#define __NR_sys_vmsplice	278
-#endif
-
-#ifndef __NR_shmget
-#define __NR_shmget		 29
-#define __NR_shmat		 30
-#define __NR_shmctl		 31
-#define __NR_shmdt		 67
-#endif
-
 #define	FIO_HUGE_PAGE		2097152
 
 #define nop		__asm__ __volatile__("rep;nop": : :"memory")
@@ -60,4 +38,34 @@
 #define ARCH_HAVE_SSE4_2
 #define ARCH_HAVE_CPU_CLOCK
 
+#define RDRAND_LONG	".byte 0x48,0x0f,0xc7,0xf0"
+#define RDSEED_LONG	".byte 0x48,0x0f,0xc7,0xf8"
+#define RDRAND_RETRY	100
+
+static inline int arch_rand_long(unsigned long *val)
+{
+	int ok;
+
+	asm volatile("1: " RDRAND_LONG "\n\t"
+		     "jc 2f\n\t"
+		     "decl %0\n\t"
+		     "jnz 1b\n\t"
+		     "2:"
+		     : "=r" (ok), "=a" (*val)
+		     : "0" (RDRAND_RETRY));
+
+	return ok;
+}
+
+static inline int arch_rand_seed(unsigned long *seed)
+{
+	unsigned char ok;
+
+	asm volatile(RDSEED_LONG "\n\t"
+			"setc %0"
+			: "=qm" (ok), "=a" (*seed));
+
+	return 0;
+}
+
 #endif
diff --git a/arch/arch.h b/arch/arch.h
index 5671b9a..00d247c 100644
--- a/arch/arch.h
+++ b/arch/arch.h
@@ -3,7 +3,7 @@
 
 enum {
 	arch_x86_64 = 1,
-	arch_i386,
+	arch_x86,
 	arch_ppc,
 	arch_ia64,
 	arch_s390,
@@ -63,11 +63,7 @@
 #include "arch-generic.h"
 #endif
 
-#ifdef ARCH_HAVE_FFZ
-#define ffz(bitmask)	arch_ffz(bitmask)
-#else
 #include "../lib/ffz.h"
-#endif
 
 #ifndef ARCH_HAVE_INIT
 static inline int arch_init(char *envp[])
diff --git a/backend.c b/backend.c
index fdb7413..9a684ed 100644
--- a/backend.c
+++ b/backend.c
@@ -35,6 +35,7 @@
 #include <sys/wait.h>
 #include <sys/ipc.h>
 #include <sys/mman.h>
+#include <math.h>
 
 #include "fio.h"
 #ifndef FIO_NO_HAVE_SHM_H
@@ -48,17 +49,15 @@
 #include "cgroup.h"
 #include "profile.h"
 #include "lib/rand.h"
-#include "memalign.h"
+#include "lib/memalign.h"
 #include "server.h"
 #include "lib/getrusage.h"
 #include "idletime.h"
 #include "err.h"
-#include "lib/tp.h"
-
-static pthread_t helper_thread;
-static pthread_mutex_t helper_lock;
-pthread_cond_t helper_cond;
-int helper_do_stat = 0;
+#include "workqueue.h"
+#include "lib/mountcheck.h"
+#include "rate-submit.h"
+#include "helper_thread.h"
 
 static struct fio_mutex *startup_mutex;
 static struct flist_head *cgroup_list;
@@ -76,10 +75,6 @@
 int shm_id = 0;
 int temp_stall_ts;
 unsigned long done_secs = 0;
-volatile int helper_exit = 0;
-
-#define PAGE_ALIGN(buf)	\
-	(char *) (((uintptr_t) (buf) + page_mask) & ~page_mask)
 
 #define JOB_START_TIMEOUT	(5 * 1000)
 
@@ -98,7 +93,7 @@
 	}
 }
 
-static void sig_show_status(int sig)
+void sig_show_status(int sig)
 {
 	show_running_run_stats();
 }
@@ -141,8 +136,8 @@
 /*
  * Check if we are above the minimum rate given.
  */
-static int __check_min_rate(struct thread_data *td, struct timeval *now,
-			    enum fio_ddir ddir)
+static bool __check_min_rate(struct thread_data *td, struct timeval *now,
+			     enum fio_ddir ddir)
 {
 	unsigned long long bytes = 0;
 	unsigned long iops = 0;
@@ -155,13 +150,13 @@
 	assert(ddir_rw(ddir));
 
 	if (!td->o.ratemin[ddir] && !td->o.rate_iops_min[ddir])
-		return 0;
+		return false;
 
 	/*
 	 * allow a 2 second settle period in the beginning
 	 */
 	if (mtime_since(&td->start, now) < 2000)
-		return 0;
+		return false;
 
 	iops += td->this_io_blocks[ddir];
 	bytes += td->this_io_bytes[ddir];
@@ -175,16 +170,16 @@
 	if (td->rate_bytes[ddir] || td->rate_blocks[ddir]) {
 		spent = mtime_since(&td->lastrate[ddir], now);
 		if (spent < td->o.ratecycle)
-			return 0;
+			return false;
 
-		if (td->o.rate[ddir]) {
+		if (td->o.rate[ddir] || td->o.ratemin[ddir]) {
 			/*
 			 * check bandwidth specified rate
 			 */
 			if (bytes < td->rate_bytes[ddir]) {
-				log_err("%s: min rate %u not met\n", td->o.name,
-								ratemin);
-				return 1;
+				log_err("%s: rate_min=%uB/s not met, only transferred %lluB\n",
+					td->o.name, ratemin, bytes);
+				return true;
 			} else {
 				if (spent)
 					rate = ((bytes - td->rate_bytes[ddir]) * 1000) / spent;
@@ -193,10 +188,9 @@
 
 				if (rate < ratemin ||
 				    bytes < td->rate_bytes[ddir]) {
-					log_err("%s: min rate %u not met, got"
-						" %luKB/sec\n", td->o.name,
-							ratemin, rate);
-					return 1;
+					log_err("%s: rate_min=%uB/s not met, got %luB/s\n",
+						td->o.name, ratemin, rate);
+					return true;
 				}
 			}
 		} else {
@@ -204,9 +198,9 @@
 			 * checks iops specified rate
 			 */
 			if (iops < rate_iops) {
-				log_err("%s: min iops rate %u not met\n",
-						td->o.name, rate_iops);
-				return 1;
+				log_err("%s: rate_iops_min=%u not met, only performed %lu IOs\n",
+						td->o.name, rate_iops, iops);
+				return true;
 			} else {
 				if (spent)
 					rate = ((iops - td->rate_blocks[ddir]) * 1000) / spent;
@@ -215,9 +209,9 @@
 
 				if (rate < rate_iops_min ||
 				    iops < td->rate_blocks[ddir]) {
-					log_err("%s: min iops rate %u not met,"
-						" got %lu\n", td->o.name,
-							rate_iops_min, rate);
+					log_err("%s: rate_iops_min=%u not met, got %lu IOPS\n",
+						td->o.name, rate_iops_min, rate);
+					return true;
 				}
 			}
 		}
@@ -226,19 +220,18 @@
 	td->rate_bytes[ddir] = bytes;
 	td->rate_blocks[ddir] = iops;
 	memcpy(&td->lastrate[ddir], now, sizeof(*now));
-	return 0;
+	return false;
 }
 
-static int check_min_rate(struct thread_data *td, struct timeval *now,
-			  uint64_t *bytes_done)
+static bool check_min_rate(struct thread_data *td, struct timeval *now)
 {
-	int ret = 0;
+	bool ret = false;
 
-	if (bytes_done[DDIR_READ])
+	if (td->bytes_done[DDIR_READ])
 		ret |= __check_min_rate(td, now, DDIR_READ);
-	if (bytes_done[DDIR_WRITE])
+	if (td->bytes_done[DDIR_WRITE])
 		ret |= __check_min_rate(td, now, DDIR_WRITE);
-	if (bytes_done[DDIR_TRIM])
+	if (td->bytes_done[DDIR_TRIM])
 		ret |= __check_min_rate(td, now, DDIR_TRIM);
 
 	return ret;
@@ -255,7 +248,7 @@
 	/*
 	 * get immediately available events, if any
 	 */
-	r = io_u_queued_complete(td, 0, NULL);
+	r = io_u_queued_complete(td, 0);
 	if (r < 0)
 		return;
 
@@ -276,27 +269,27 @@
 	}
 
 	if (td->cur_depth)
-		r = io_u_queued_complete(td, td->cur_depth, NULL);
+		r = io_u_queued_complete(td, td->cur_depth);
 }
 
 /*
  * Helper to handle the final sync of a file. Works just like the normal
  * io path, just does everything sync.
  */
-static int fio_io_sync(struct thread_data *td, struct fio_file *f)
+static bool fio_io_sync(struct thread_data *td, struct fio_file *f)
 {
 	struct io_u *io_u = __get_io_u(td);
 	int ret;
 
 	if (!io_u)
-		return 1;
+		return true;
 
 	io_u->ddir = DDIR_SYNC;
 	io_u->file = f;
 
 	if (td_io_prep(td, io_u)) {
 		put_io_u(td, io_u);
-		return 1;
+		return true;
 	}
 
 requeue:
@@ -304,25 +297,27 @@
 	if (ret < 0) {
 		td_verror(td, io_u->error, "td_io_queue");
 		put_io_u(td, io_u);
-		return 1;
+		return true;
 	} else if (ret == FIO_Q_QUEUED) {
-		if (io_u_queued_complete(td, 1, NULL) < 0)
-			return 1;
+		if (td_io_commit(td))
+			return true;
+		if (io_u_queued_complete(td, 1) < 0)
+			return true;
 	} else if (ret == FIO_Q_COMPLETED) {
 		if (io_u->error) {
 			td_verror(td, io_u->error, "td_io_queue");
-			return 1;
+			return true;
 		}
 
-		if (io_u_sync_complete(td, io_u, NULL) < 0)
-			return 1;
+		if (io_u_sync_complete(td, io_u) < 0)
+			return true;
 	} else if (ret == FIO_Q_BUSY) {
 		if (td_io_commit(td))
-			return 1;
+			return true;
 		goto requeue;
 	}
 
-	return 0;
+	return false;
 }
 
 static int fio_file_fsync(struct thread_data *td, struct fio_file *f)
@@ -351,20 +346,37 @@
 		__update_tv_cache(td);
 }
 
-static inline int runtime_exceeded(struct thread_data *td, struct timeval *t)
+static inline bool runtime_exceeded(struct thread_data *td, struct timeval *t)
 {
 	if (in_ramp_time(td))
-		return 0;
+		return false;
 	if (!td->o.timeout)
-		return 0;
+		return false;
 	if (utime_since(&td->epoch, t) >= td->o.timeout)
-		return 1;
+		return true;
 
-	return 0;
+	return false;
 }
 
-static int break_on_this_error(struct thread_data *td, enum fio_ddir ddir,
-			       int *retptr)
+/*
+ * We need to update the runtime consistently in ms, but keep a running
+ * tally of the current elapsed time in microseconds for sub millisecond
+ * updates.
+ */
+static inline void update_runtime(struct thread_data *td,
+				  unsigned long long *elapsed_us,
+				  const enum fio_ddir ddir)
+{
+	if (ddir == DDIR_WRITE && td_write(td) && td->o.verify_only)
+		return;
+
+	td->ts.runtime[ddir] -= (elapsed_us[ddir] + 999) / 1000;
+	elapsed_us[ddir] += utime_since_now(&td->start);
+	td->ts.runtime[ddir] += (elapsed_us[ddir] + 999) / 1000;
+}
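
Since update_runtime() is new here, a tiny standalone sketch of its rounding scheme may help: the exact tally stays in microseconds, and the published runtime is that tally rounded up to milliseconds, so repeated updates never double-count. The helper name and parameters below are illustrative; now_us stands in for utime_since_now(&td->start).

static void update_runtime_sketch(unsigned long long *runtime_ms,
				  unsigned long long *elapsed_us,
				  unsigned long long now_us)
{
	*runtime_ms -= (*elapsed_us + 999) / 1000;	/* retract old rounded value */
	*elapsed_us += now_us;				/* extend the exact usec tally */
	*runtime_ms += (*elapsed_us + 999) / 1000;	/* publish new rounded value */
}
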
+
+static bool break_on_this_error(struct thread_data *td, enum fio_ddir ddir,
+				int *retptr)
 {
 	int ret = *retptr;
 
@@ -377,7 +389,7 @@
 
 		eb = td_error_type(ddir, err);
 		if (!(td->o.continue_on_error & (1 << eb)))
-			return 1;
+			return true;
 
 		if (td_non_fatal_error(td, eb, err)) {
 		        /*
@@ -387,7 +399,7 @@
 			update_error_count(td, err);
 			td_clear_error(td);
 			*retptr = 0;
-			return 0;
+			return false;
 		} else if (td->o.fill_device && err == ENOSPC) {
 			/*
 			 * We expect to hit this error if
@@ -395,18 +407,18 @@
 			 */
 			td_clear_error(td);
 			fio_mark_td_terminate(td);
-			return 1;
+			return true;
 		} else {
 			/*
 			 * Stop the I/O in case of a fatal
 			 * error.
 			 */
 			update_error_count(td, err);
-			return 1;
+			return true;
 		}
 	}
 
-	return 0;
+	return false;
 }
 
 static void check_update_rusage(struct thread_data *td)
@@ -418,18 +430,20 @@
 	}
 }
 
-static int wait_for_completions(struct thread_data *td, struct timeval *time,
-				uint64_t *bytes_done)
+static int wait_for_completions(struct thread_data *td, struct timeval *time)
 {
 	const int full = queue_full(td);
 	int min_evts = 0;
 	int ret;
 
+	if (td->flags & TD_F_REGROW_LOGS)
+		return io_u_quiesce(td);
+
 	/*
 	 * if the queue is full, we MUST reap at least 1 event
 	 */
-	min_evts = min(td->o.iodepth_batch_complete, td->cur_depth);
-	if (full && !min_evts)
+	min_evts = min(td->o.iodepth_batch_complete_min, td->cur_depth);
+	if ((full && !min_evts) || !td->o.iodepth_batch_complete_min)
 		min_evts = 1;
 
 	if (time && (__should_check_rate(td, DDIR_READ) ||
@@ -438,7 +452,7 @@
 		fio_gettime(time, NULL);
 
 	do {
-		ret = io_u_queued_complete(td, min_evts, bytes_done);
+		ret = io_u_queued_complete(td, min_evts);
 		if (ret < 0)
 			break;
 	} while (full && (td->cur_depth > td->o.iodepth_low));
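
In other words, the reap count is the smaller of iodepth_batch_complete_min and the current depth, bumped to one when the queue is full or when batch completion is disabled entirely (polling mode). A standalone restatement of that policy, with illustrative names:

static unsigned int min_events_to_reap(unsigned int batch_complete_min,
				       unsigned int cur_depth, int queue_full)
{
	unsigned int min_evts = batch_complete_min < cur_depth ?
					batch_complete_min : cur_depth;

	if ((queue_full && !min_evts) || !batch_complete_min)
		min_evts = 1;

	return min_evts;
}
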
@@ -446,13 +460,138 @@
 	return ret;
 }
 
 
+int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret,
+		   enum fio_ddir ddir, uint64_t *bytes_issued, int from_verify,
+		   struct timeval *comp_time)
+{
+	int ret2;
+
+	switch (*ret) {
+	case FIO_Q_COMPLETED:
+		if (io_u->error) {
+			*ret = -io_u->error;
+			clear_io_u(td, io_u);
+		} else if (io_u->resid) {
+			int bytes = io_u->xfer_buflen - io_u->resid;
+			struct fio_file *f = io_u->file;
+
+			if (bytes_issued)
+				*bytes_issued += bytes;
+
+			if (!from_verify)
+				trim_io_piece(td, io_u);
+
+			/*
+			 * zero read, fail
+			 */
+			if (!bytes) {
+				if (!from_verify)
+					unlog_io_piece(td, io_u);
+				td_verror(td, EIO, "full resid");
+				put_io_u(td, io_u);
+				break;
+			}
+
+			io_u->xfer_buflen = io_u->resid;
+			io_u->xfer_buf += bytes;
+			io_u->offset += bytes;
+
+			if (ddir_rw(io_u->ddir))
+				td->ts.short_io_u[io_u->ddir]++;
+
+			f = io_u->file;
+			if (io_u->offset == f->real_file_size)
+				goto sync_done;
+
+			requeue_io_u(td, &io_u);
+		} else {
+sync_done:
+			if (comp_time && (__should_check_rate(td, DDIR_READ) ||
+			    __should_check_rate(td, DDIR_WRITE) ||
+			    __should_check_rate(td, DDIR_TRIM)))
+				fio_gettime(comp_time, NULL);
+
+			*ret = io_u_sync_complete(td, io_u);
+			if (*ret < 0)
+				break;
+		}
+
+		if (td->flags & TD_F_REGROW_LOGS)
+			regrow_logs(td);
+
+		/*
+		 * when doing I/O (not when verifying),
+		 * check for any errors that are to be ignored
+		 */
+		if (!from_verify)
+			break;
+
+		return 0;
+	case FIO_Q_QUEUED:
+		/*
+		 * if the engine doesn't have a commit hook,
+		 * the io_u is really queued. if it does have such
+		 * a hook, it has to call io_u_queued() itself.
+		 */
+		if (td->io_ops->commit == NULL)
+			io_u_queued(td, io_u);
+		if (bytes_issued)
+			*bytes_issued += io_u->xfer_buflen;
+		break;
+	case FIO_Q_BUSY:
+		if (!from_verify)
+			unlog_io_piece(td, io_u);
+		requeue_io_u(td, &io_u);
+		ret2 = td_io_commit(td);
+		if (ret2 < 0)
+			*ret = ret2;
+		break;
+	default:
+		assert(*ret < 0);
+		td_verror(td, -(*ret), "td_io_queue");
+		break;
+	}
+
+	if (break_on_this_error(td, ddir, ret))
+		return 1;
+
+	return 0;
+}
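
The FIO_Q_COMPLETED branch above handles short transfers: when the engine reports a residual, the io_u is shrunk to the remainder and requeued unless it ran into end-of-file, and a zero-byte transfer is treated as an I/O error. A self-contained sketch of just that bookkeeping; the struct and names below are illustrative, not fio's io_u layout.

enum xfer_next { XFER_ERROR, XFER_DONE, XFER_REQUEUE };

struct short_xfer {
	unsigned long xfer_buflen;	/* bytes requested this round */
	unsigned long resid;		/* bytes the engine left untransferred */
	unsigned long long offset;	/* current file offset */
	char *xfer_buf;			/* current buffer position */
};

static enum xfer_next advance_short_xfer(struct short_xfer *u,
					 unsigned long long real_file_size)
{
	unsigned long done = u->xfer_buflen - u->resid;

	if (!done)
		return XFER_ERROR;		/* zero transfer: fio raises EIO */

	u->xfer_buflen = u->resid;		/* ask only for the remainder */
	u->xfer_buf += done;
	u->offset += done;

	if (u->offset == real_file_size)
		return XFER_DONE;		/* reached EOF: complete as-is */

	return XFER_REQUEUE;			/* resubmit for what is left */
}
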
+
+static inline bool io_in_polling(struct thread_data *td)
+{
+	return !td->o.iodepth_batch_complete_min &&
+		   !td->o.iodepth_batch_complete_max;
+}
+/*
+ * Unlinks files from thread data fio_file structure
+ */
+static int unlink_all_files(struct thread_data *td)
+{
+	struct fio_file *f;
+	unsigned int i;
+	int ret = 0;
+
+	for_each_file(td, f, i) {
+		if (f->filetype != FIO_TYPE_FILE)
+			continue;
+		ret = td_io_unlink_file(td, f);
+		if (ret)
+			break;
+	}
+
+	if (ret)
+		td_verror(td, ret, "unlink_all_files");
+
+	return ret;
+}
+
 /*
  * The main verify engine. Runs over the writes we previously submitted,
  * reads the blocks back in, and checks the crc/md5 of the data.
  */
 static void do_verify(struct thread_data *td, uint64_t verify_bytes)
 {
-	uint64_t bytes_done[DDIR_RWDIR_CNT] = { 0, 0, 0 };
 	struct fio_file *f;
 	struct io_u *io_u;
 	int ret, min_events;
@@ -478,12 +617,21 @@
 	if (td->error)
 		return;
 
+	/*
+	 * verify_state needs to be reset before verification
+	 * proceeds so that expected random seeds match actual
+	 * random seeds in headers. The main loop will reset
+	 * all random number generators if randrepeat is set.
+	 */
+	if (!td->o.rand_repeatable)
+		td_fill_verify_state_seed(td);
+
 	td_set_runstate(td, TD_VERIFYING);
 
 	io_u = NULL;
 	while (!td->terminate) {
 		enum fio_ddir ddir;
-		int ret2, full;
+		int full;
 
 		update_tv_cache(td);
 		check_update_rusage(td);
@@ -514,11 +662,11 @@
 				break;
 			}
 		} else {
-			if (ddir_rw_sum(bytes_done) + td->o.rw_min_bs > verify_bytes)
+			if (ddir_rw_sum(td->bytes_done) + td->o.rw_min_bs > verify_bytes)
 				break;
 
 			while ((io_u = get_io_u(td)) != NULL) {
-				if (IS_ERR(io_u)) {
+				if (IS_ERR_OR_NULL(io_u)) {
 					io_u = NULL;
 					ret = FIO_Q_BUSY;
 					goto reap;
@@ -539,7 +687,7 @@
 					continue;
 				} else if (io_u->ddir == DDIR_TRIM) {
 					io_u->ddir = DDIR_READ;
-					io_u->flags |= IO_U_F_TRIMMED;
+					io_u_set(td, io_u, IO_U_F_TRIMMED);
 					break;
 				} else if (io_u->ddir == DDIR_WRITE) {
 					io_u->ddir = DDIR_READ;
@@ -569,57 +717,8 @@
 			fio_gettime(&io_u->start_time, NULL);
 
 		ret = td_io_queue(td, io_u);
-		switch (ret) {
-		case FIO_Q_COMPLETED:
-			if (io_u->error) {
-				ret = -io_u->error;
-				clear_io_u(td, io_u);
-			} else if (io_u->resid) {
-				int bytes = io_u->xfer_buflen - io_u->resid;
 
-				/*
-				 * zero read, fail
-				 */
-				if (!bytes) {
-					td_verror(td, EIO, "full resid");
-					put_io_u(td, io_u);
-					break;
-				}
-
-				io_u->xfer_buflen = io_u->resid;
-				io_u->xfer_buf += bytes;
-				io_u->offset += bytes;
-
-				if (ddir_rw(io_u->ddir))
-					td->ts.short_io_u[io_u->ddir]++;
-
-				f = io_u->file;
-				if (io_u->offset == f->real_file_size)
-					goto sync_done;
-
-				requeue_io_u(td, &io_u);
-			} else {
-sync_done:
-				ret = io_u_sync_complete(td, io_u, bytes_done);
-				if (ret < 0)
-					break;
-			}
-			continue;
-		case FIO_Q_QUEUED:
-			break;
-		case FIO_Q_BUSY:
-			requeue_io_u(td, &io_u);
-			ret2 = td_io_commit(td);
-			if (ret2 < 0)
-				ret = ret2;
-			break;
-		default:
-			assert(ret < 0);
-			td_verror(td, -ret, "td_io_queue");
-			break;
-		}
-
-		if (break_on_this_error(td, ddir, &ret))
+		if (io_queue_event(td, io_u, &ret, ddir, NULL, 1, NULL))
 			break;
 
 		/*
@@ -629,8 +728,8 @@
 		 */
 reap:
 		full = queue_full(td) || (ret == FIO_Q_BUSY && td->cur_depth);
-		if (full || !td->o.iodepth_batch_complete)
-			ret = wait_for_completions(td, NULL, bytes_done);
+		if (full || io_in_polling(td))
+			ret = wait_for_completions(td, NULL);
 
 		if (ret < 0)
 			break;
@@ -642,7 +741,7 @@
 		min_events = td->cur_depth;
 
 		if (min_events)
-			ret = io_u_queued_complete(td, min_events, NULL);
+			ret = io_u_queued_complete(td, min_events);
 	} else
 		cleanup_pending_aio(td);
 
@@ -651,12 +750,12 @@
 	dprint(FD_VERIFY, "exiting loop\n");
 }
 
-static unsigned int exceeds_number_ios(struct thread_data *td)
+static bool exceeds_number_ios(struct thread_data *td)
 {
 	unsigned long long number_ios;
 
 	if (!td->o.number_ios)
-		return 0;
+		return false;
 
 	number_ios = ddir_rw_sum(td->io_blocks);
 	number_ios += td->io_u_queued + td->io_u_in_flight;
@@ -664,21 +763,21 @@
 	return number_ios >= (td->o.number_ios * td->loops);
 }
 
-static int io_issue_bytes_exceeded(struct thread_data *td)
+static bool io_bytes_exceeded(struct thread_data *td, uint64_t *this_bytes)
 {
 	unsigned long long bytes, limit;
 
 	if (td_rw(td))
-		bytes = td->io_issue_bytes[DDIR_READ] + td->io_issue_bytes[DDIR_WRITE];
+		bytes = this_bytes[DDIR_READ] + this_bytes[DDIR_WRITE];
 	else if (td_write(td))
-		bytes = td->io_issue_bytes[DDIR_WRITE];
+		bytes = this_bytes[DDIR_WRITE];
 	else if (td_read(td))
-		bytes = td->io_issue_bytes[DDIR_READ];
+		bytes = this_bytes[DDIR_READ];
 	else
-		bytes = td->io_issue_bytes[DDIR_TRIM];
+		bytes = this_bytes[DDIR_TRIM];
 
-	if (td->o.io_limit)
-		limit = td->o.io_limit;
+	if (td->o.io_size)
+		limit = td->o.io_size;
 	else
 		limit = td->o.size;
 
@@ -686,26 +785,47 @@
 	return bytes >= limit || exceeds_number_ios(td);
 }
 
-static int io_complete_bytes_exceeded(struct thread_data *td)
+static bool io_issue_bytes_exceeded(struct thread_data *td)
 {
-	unsigned long long bytes, limit;
+	return io_bytes_exceeded(td, td->io_issue_bytes);
+}
 
-	if (td_rw(td))
-		bytes = td->this_io_bytes[DDIR_READ] + td->this_io_bytes[DDIR_WRITE];
-	else if (td_write(td))
-		bytes = td->this_io_bytes[DDIR_WRITE];
-	else if (td_read(td))
-		bytes = td->this_io_bytes[DDIR_READ];
-	else
-		bytes = td->this_io_bytes[DDIR_TRIM];
+static bool io_complete_bytes_exceeded(struct thread_data *td)
+{
+	return io_bytes_exceeded(td, td->this_io_bytes);
+}
 
-	if (td->o.io_limit)
-		limit = td->o.io_limit;
-	else
-		limit = td->o.size;
+/*
+ * used to calculate the next io time for rate control
+ *
+ */
+static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir)
+{
+	uint64_t secs, remainder, bps, bytes, iops;
 
-	limit *= td->loops;
-	return bytes >= limit || exceeds_number_ios(td);
+	assert(!(td->flags & TD_F_CHILD));
+	bytes = td->rate_io_issue_bytes[ddir];
+	bps = td->rate_bps[ddir];
+
+	if (td->o.rate_process == RATE_PROCESS_POISSON) {
+		uint64_t val;
+		iops = bps / td->o.bs[ddir];
+		val = (int64_t) (1000000 / iops) *
+				-logf(__rand_0_1(&td->poisson_state[ddir]));
+		if (val) {
+			dprint(FD_RATE, "poisson rate iops=%llu, ddir=%d\n",
+					(unsigned long long) 1000000 / val,
+					ddir);
+		}
+		td->last_usec[ddir] += val;
+		return td->last_usec[ddir];
+	} else if (bps) {
+		secs = bytes / bps;
+		remainder = bytes % bps;
+		return remainder * 1000000 / bps + secs * 1000000;
+	}
+
+	return 0;
 }
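
For the Poisson branch above, the next issue time is the previous one plus an exponentially distributed gap whose mean is 1/iops seconds. A sketch of that draw, assuming the caller supplies a uniform (0, 1] sample (fio obtains one via __rand_0_1()); the helper name is illustrative.

#include <math.h>
#include <stdint.h>

/* 'u' must be uniform in (0, 1]; -log(u) is then exponential with mean 1. */
static uint64_t next_issue_usec(uint64_t last_usec, uint64_t iops, double u)
{
	uint64_t gap = (uint64_t)((1000000.0 / (double) iops) * -log(u));

	return last_usec + gap;
}
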
 
 /*
@@ -714,13 +834,15 @@
  *
  * Returns number of bytes written and trimmed.
  */
-static uint64_t do_io(struct thread_data *td)
+static void do_io(struct thread_data *td, uint64_t *bytes_done)
 {
-	uint64_t bytes_done[DDIR_RWDIR_CNT] = { 0, 0, 0 };
 	unsigned int i;
 	int ret = 0;
 	uint64_t total_bytes, bytes_issued = 0;
 
+	for (i = 0; i < DDIR_RWDIR_CNT; i++)
+		bytes_done[i] = td->bytes_done[i];
+
 	if (in_ramp_time(td))
 		td_set_runstate(td, TD_RAMP);
 	else
@@ -730,11 +852,11 @@
 
 	total_bytes = td->o.size;
 	/*
-	* Allow random overwrite workloads to write up to io_limit
+	* Allow random overwrite workloads to write up to io_size
 	* before starting verification phase as 'size' doesn't apply.
 	*/
 	if (td_write(td) && td_random(td) && td->o.norandommap)
-		total_bytes = max(total_bytes, (uint64_t) td->o.io_limit);
+		total_bytes = max(total_bytes, (uint64_t) td->o.io_size);
 	/*
 	 * If verify_backlog is enabled, we'll run the verify in this
 	 * handler as well. For that case, we may need up to twice the
@@ -744,12 +866,17 @@
 	   (td_write(td) && td->o.verify_backlog))
 		total_bytes += td->o.size;
 
+	/* In trimwrite mode, each byte is trimmed and then written, so
+	 * allow total_bytes to be twice as big */
+	if (td_trimwrite(td))
+		total_bytes += td->total_io_size;
+
 	while ((td->o.read_iolog_file && !flist_empty(&td->io_log_list)) ||
 		(!flist_empty(&td->trim_list)) || !io_issue_bytes_exceeded(td) ||
 		td->o.time_based) {
 		struct timeval comp_time;
 		struct io_u *io_u;
-		int ret2, full;
+		int full;
 		enum fio_ddir ddir;
 
 		check_update_rusage(td);
@@ -770,7 +897,14 @@
 		if (flow_threshold_exceeded(td))
 			continue;
 
-		if (bytes_issued >= total_bytes)
+		/*
+		 * Break if we exceeded the bytes. The exception is time
+		 * based runs, but we still need to break out of the loop
+		 * for those to run verification, if enabled.
+		 */
+		if (bytes_issued >= total_bytes &&
+		    (!td->o.time_based ||
+		     (td->o.time_based && td->o.verify != VERIFY_NONE)))
 			break;
 
 		io_u = get_io_u(td);
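
The new break condition above reads more easily when factored out: once total_bytes has been issued, the loop stops unless the run is purely time based with no verification phase pending. A restatement with illustrative names:

static int should_stop_issuing(unsigned long long bytes_issued,
			       unsigned long long total_bytes,
			       int time_based, int verify_enabled)
{
	if (bytes_issued < total_bytes)
		return 0;

	/* time-based runs keep going, unless a verify phase still has to start */
	return !time_based || verify_enabled;
}
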
@@ -829,98 +963,54 @@
 		    !td->o.experimental_verify)
 			log_io_piece(td, io_u);
 
-		ret = td_io_queue(td, io_u);
-		switch (ret) {
-		case FIO_Q_COMPLETED:
-			if (io_u->error) {
-				ret = -io_u->error;
-				unlog_io_piece(td, io_u);
-				clear_io_u(td, io_u);
-			} else if (io_u->resid) {
-				int bytes = io_u->xfer_buflen - io_u->resid;
-				struct fio_file *f = io_u->file;
+		if (td->o.io_submit_mode == IO_MODE_OFFLOAD) {
+			const unsigned long blen = io_u->xfer_buflen;
+			const enum fio_ddir ddir = acct_ddir(io_u);
 
-				bytes_issued += bytes;
+			if (td->error)
+				break;
 
-				trim_io_piece(td, io_u);
+			workqueue_enqueue(&td->io_wq, &io_u->work);
+			ret = FIO_Q_QUEUED;
 
-				/*
-				 * zero read, fail
-				 */
-				if (!bytes) {
-					unlog_io_piece(td, io_u);
-					td_verror(td, EIO, "full resid");
-					put_io_u(td, io_u);
-					break;
-				}
-
-				io_u->xfer_buflen = io_u->resid;
-				io_u->xfer_buf += bytes;
-				io_u->offset += bytes;
-
-				if (ddir_rw(io_u->ddir))
-					td->ts.short_io_u[io_u->ddir]++;
-
-				if (io_u->offset == f->real_file_size)
-					goto sync_done;
-
-				requeue_io_u(td, &io_u);
-			} else {
-sync_done:
-				if (__should_check_rate(td, DDIR_READ) ||
-				    __should_check_rate(td, DDIR_WRITE) ||
-				    __should_check_rate(td, DDIR_TRIM))
-					fio_gettime(&comp_time, NULL);
-
-				ret = io_u_sync_complete(td, io_u, bytes_done);
-				if (ret < 0)
-					break;
-				bytes_issued += io_u->xfer_buflen;
+			if (ddir_rw(ddir)) {
+				td->io_issues[ddir]++;
+				td->io_issue_bytes[ddir] += blen;
+				td->rate_io_issue_bytes[ddir] += blen;
 			}
-			break;
-		case FIO_Q_QUEUED:
+
+			if (should_check_rate(td))
+				td->rate_next_io_time[ddir] = usec_for_io(td, ddir);
+
+		} else {
+			ret = td_io_queue(td, io_u);
+
+			if (should_check_rate(td))
+				td->rate_next_io_time[ddir] = usec_for_io(td, ddir);
+
+			if (io_queue_event(td, io_u, &ret, ddir, &bytes_issued, 0, &comp_time))
+				break;
+
 			/*
-			 * if the engine doesn't have a commit hook,
-			 * the io_u is really queued. if it does have such
-			 * a hook, it has to call io_u_queued() itself.
+			 * See if we need to complete some commands. Note that
+			 * we can get BUSY even without IO queued, if the
+			 * system is resource starved.
 			 */
-			if (td->io_ops->commit == NULL)
-				io_u_queued(td, io_u);
-			bytes_issued += io_u->xfer_buflen;
-			break;
-		case FIO_Q_BUSY:
-			unlog_io_piece(td, io_u);
-			requeue_io_u(td, &io_u);
-			ret2 = td_io_commit(td);
-			if (ret2 < 0)
-				ret = ret2;
-			break;
-		default:
-			assert(ret < 0);
-			put_io_u(td, io_u);
-			break;
-		}
-
-		if (break_on_this_error(td, ddir, &ret))
-			break;
-
-		/*
-		 * See if we need to complete some commands. Note that we
-		 * can get BUSY even without IO queued, if the system is
-		 * resource starved.
-		 */
 reap:
-		full = queue_full(td) || (ret == FIO_Q_BUSY && td->cur_depth);
-		if (full || !td->o.iodepth_batch_complete)
-			ret = wait_for_completions(td, &comp_time, bytes_done);
+			full = queue_full(td) ||
+				(ret == FIO_Q_BUSY && td->cur_depth);
+			if (full || io_in_polling(td))
+				ret = wait_for_completions(td, &comp_time);
+		}
 		if (ret < 0)
 			break;
-		if (!ddir_rw_sum(bytes_done) && !(td->io_ops->flags & FIO_NOIO))
+		if (!ddir_rw_sum(td->bytes_done) &&
+		    !td_ioengine_flagged(td, FIO_NOIO))
 			continue;
 
-		if (!in_ramp_time(td) && should_check_rate(td, bytes_done)) {
-			if (check_min_rate(td, &comp_time, bytes_done)) {
-				if (exitall_on_terminate)
+		if (!in_ramp_time(td) && should_check_rate(td)) {
+			if (check_min_rate(td, &comp_time)) {
+				if (exitall_on_terminate || td->o.exitall_error)
 					fio_terminate_threads(td->groupid);
 				td_verror(td, EIO, "check_min_rate");
 				break;
@@ -960,9 +1050,14 @@
 	if (!td->error) {
 		struct fio_file *f;
 
-		i = td->cur_depth;
+		if (td->o.io_submit_mode == IO_MODE_OFFLOAD) {
+			workqueue_flush(&td->io_wq);
+			i = 0;
+		} else
+			i = td->cur_depth;
+
 		if (i) {
-			ret = io_u_queued_complete(td, i, bytes_done);
+			ret = io_u_queued_complete(td, i);
 			if (td->o.fill_device && td->error == ENOSPC)
 				td->error = 0;
 		}
@@ -987,7 +1082,43 @@
 	if (!ddir_rw_sum(td->this_io_bytes))
 		td->done = 1;
 
-	return bytes_done[DDIR_WRITE] + bytes_done[DDIR_TRIM];
+	for (i = 0; i < DDIR_RWDIR_CNT; i++)
+		bytes_done[i] = td->bytes_done[i] - bytes_done[i];
+}
+
+static void free_file_completion_logging(struct thread_data *td)
+{
+	struct fio_file *f;
+	unsigned int i;
+
+	for_each_file(td, f, i) {
+		if (!f->last_write_comp)
+			break;
+		sfree(f->last_write_comp);
+	}
+}
+
+static int init_file_completion_logging(struct thread_data *td,
+					unsigned int depth)
+{
+	struct fio_file *f;
+	unsigned int i;
+
+	if (td->o.verify == VERIFY_NONE || !td->o.verify_state_save)
+		return 0;
+
+	for_each_file(td, f, i) {
+		f->last_write_comp = scalloc(depth, sizeof(uint64_t));
+		if (!f->last_write_comp)
+			goto cleanup;
+	}
+
+	return 0;
+
+cleanup:
+	free_file_completion_logging(td);
+	log_err("fio: failed to alloc write comp data\n");
+	return 1;
 }
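
init_file_completion_logging() above follows the usual allocate-everything-or-roll-back pattern: stop at the first failed scalloc() and free whatever was already set up. The same idea in a generic, self-contained form (names are illustrative and plain calloc() stands in for fio's smalloc pool):

#include <stdlib.h>

static int alloc_all(void **slots, int nr, size_t each)
{
	int i;

	for (i = 0; i < nr; i++) {
		slots[i] = calloc(1, each);
		if (!slots[i])
			goto cleanup;
	}

	return 0;

cleanup:
	while (i--)
		free(slots[i]);
	return 1;
}
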
 
 static void cleanup_io_u(struct thread_data *td)
@@ -1008,8 +1139,7 @@
 	io_u_qexit(&td->io_u_freelist);
 	io_u_qexit(&td->io_u_all);
 
-	if (td->last_write_comp)
-		sfree(td->last_write_comp);
+	free_file_completion_logging(td);
 }
 
 static int init_io_u(struct thread_data *td)
@@ -1026,7 +1156,7 @@
 	td->orig_buffer_size = (unsigned long long) max_bs
 					* (unsigned long long) max_units;
 
-	if ((td->io_ops->flags & FIO_NOIO) || !(td_read(td) || td_write(td)))
+	if (td_ioengine_flagged(td, FIO_NOIO) || !(td_read(td) || td_write(td)))
 		data_xfer = 0;
 
 	err = 0;
@@ -1046,7 +1176,7 @@
 	 * lucky and the allocator gives us an aligned address.
 	 */
 	if (td->o.odirect || td->o.mem_align || td->o.oatomic ||
-	    (td->io_ops->flags & FIO_RAWIO))
+	    td_ioengine_flagged(td, FIO_RAWIO))
 		td->orig_buffer_size += page_mask + td->o.mem_align;
 
 	if (td->o.mem_type == MEM_SHMHUGE || td->o.mem_type == MEM_MMAPHUGE) {
@@ -1065,8 +1195,8 @@
 		return 1;
 
 	if (td->o.odirect || td->o.mem_align || td->o.oatomic ||
-	    (td->io_ops->flags & FIO_RAWIO))
-		p = PAGE_ALIGN(td->orig_buffer) + td->o.mem_align;
+	    td_ioengine_flagged(td, FIO_RAWIO))
+		p = PTR_ALIGN(td->orig_buffer, page_mask) + td->o.mem_align;
 	else
 		p = td->orig_buffer;
 
@@ -1126,27 +1256,28 @@
 		p += max_bs;
 	}
 
-	if (td->o.verify != VERIFY_NONE) {
-		td->last_write_comp = scalloc(max_units, sizeof(uint64_t));
-		if (!td->last_write_comp) {
-			log_err("fio: failed to alloc write comp data\n");
-			return 1;
-		}
-	}
+	if (init_file_completion_logging(td, max_units))
+		return 1;
 
 	return 0;
 }
 
+/*
+ * This function is Linux specific.
+ * FIO_HAVE_IOSCHED_SWITCH enabled currently means it's Linux.
+ */
 static int switch_ioscheduler(struct thread_data *td)
 {
+#ifdef FIO_HAVE_IOSCHED_SWITCH
 	char tmp[256], tmp2[128];
 	FILE *f;
 	int ret;
 
-	if (td->io_ops->flags & FIO_DISKLESSIO)
+	if (td_ioengine_flagged(td, FIO_DISKLESSIO))
 		return 0;
 
-	sprintf(tmp, "%s/queue/scheduler", td->sysfs_root);
+	assert(td->files && td->files[0]);
+	sprintf(tmp, "%s/queue/scheduler", td->files[0]->du->sysfs_root);
 
 	f = fopen(tmp, "r+");
 	if (!f) {
@@ -1174,14 +1305,26 @@
 	/*
 	 * Read back and check that the selected scheduler is now the default.
 	 */
+	memset(tmp, 0, sizeof(tmp));
 	ret = fread(tmp, sizeof(tmp), 1, f);
 	if (ferror(f) || ret < 0) {
 		td_verror(td, errno, "fread");
 		fclose(f);
 		return 1;
 	}
-	tmp[sizeof(tmp) - 1] = '\0';
+	/*
+	 * either a list of io schedulers or "none\n" is expected.
+	 */
+	tmp[strlen(tmp) - 1] = '\0';
 
+	/*
+	 * Write to "none" entry doesn't fail, so check the result here.
+	 */
+	if (!strcmp(tmp, "none")) {
+		log_err("fio: io scheduler is not tunable\n");
+		fclose(f);
+		return 0;
+	}
 
 	sprintf(tmp2, "[%s]", td->o.ioscheduler);
 	if (!strstr(tmp, tmp2)) {
@@ -1193,25 +1336,28 @@
 
 	fclose(f);
 	return 0;
+#else
+	return 0;
+#endif
 }
 
-static int keep_running(struct thread_data *td)
+static bool keep_running(struct thread_data *td)
 {
 	unsigned long long limit;
 
 	if (td->done)
-		return 0;
+		return false;
 	if (td->o.time_based)
-		return 1;
+		return true;
 	if (td->o.loops) {
 		td->o.loops--;
-		return 1;
+		return true;
 	}
 	if (exceeds_number_ios(td))
-		return 0;
+		return false;
 
-	if (td->o.io_limit)
-		limit = td->o.io_limit;
+	if (td->o.io_size)
+		limit = td->o.io_size;
 	else
 		limit = td->o.size;
 
@@ -1219,25 +1365,26 @@
 		uint64_t diff;
 
 		/*
-		 * If the difference is less than the minimum IO size, we
+		 * If the difference is less than the maximum IO size, we
 		 * are done.
 		 */
 		diff = limit - ddir_rw_sum(td->io_bytes);
 		if (diff < td_max_bs(td))
-			return 0;
+			return false;
 
-		if (fio_files_done(td))
-			return 0;
+		if (fio_files_done(td) && !td->o.io_size)
+			return false;
 
-		return 1;
+		return true;
 	}
 
-	return 0;
+	return false;
 }
 
 static int exec_string(struct thread_options *o, const char *string, const char *mode)
 {
-	int ret, newlen = strlen(string) + strlen(o->name) + strlen(mode) + 9 + 1;
+	size_t newlen = strlen(string) + strlen(o->name) + strlen(mode) + 9 + 1;
+	int ret;
 	char *str;
 
 	str = malloc(newlen);
@@ -1257,8 +1404,6 @@
  */
 static uint64_t do_dry_run(struct thread_data *td)
 {
-	uint64_t bytes_done[DDIR_RWDIR_CNT] = { 0, 0, 0 };
-
 	td_set_runstate(td, TD_RUNNING);
 
 	while ((td->o.read_iolog_file && !flist_empty(&td->io_log_list)) ||
@@ -1270,10 +1415,10 @@
 			break;
 
 		io_u = get_io_u(td);
-		if (!io_u)
+		if (IS_ERR_OR_NULL(io_u))
 			break;
 
-		io_u->flags |= IO_U_F_FLIGHT;
+		io_u_set(td, io_u, IO_U_F_FLIGHT);
 		io_u->error = 0;
 		io_u->resid = 0;
 		if (ddir_rw(acct_ddir(io_u)))
@@ -1289,26 +1434,37 @@
 		    !td->o.experimental_verify)
 			log_io_piece(td, io_u);
 
-		ret = io_u_sync_complete(td, io_u, bytes_done);
+		ret = io_u_sync_complete(td, io_u);
 		(void) ret;
 	}
 
-	return bytes_done[DDIR_WRITE] + bytes_done[DDIR_TRIM];
+	return td->bytes_done[DDIR_WRITE] + td->bytes_done[DDIR_TRIM];
 }
 
+struct fork_data {
+	struct thread_data *td;
+	struct sk_out *sk_out;
+};
+
 /*
  * Entry point for the thread based jobs. The process based jobs end up
  * here as well, after a little setup.
  */
 static void *thread_main(void *data)
 {
-	unsigned long long elapsed;
-	struct thread_data *td = data;
+	struct fork_data *fd = data;
+	unsigned long long elapsed_us[DDIR_RWDIR_CNT] = { 0, };
+	struct thread_data *td = fd->td;
 	struct thread_options *o = &td->o;
-	pthread_condattr_t attr;
+	struct sk_out *sk_out = fd->sk_out;
+	uint64_t bytes_done[DDIR_RWDIR_CNT];
+	int deadlock_loop_cnt;
 	int clear_state;
 	int ret;
 
+	sk_out_assign(sk_out);
+	free(fd);
+
 	if (!o->use_thread) {
 		setsid();
 		td->pid = getpid();
@@ -1327,12 +1483,18 @@
 	INIT_FLIST_HEAD(&td->verify_list);
 	INIT_FLIST_HEAD(&td->trim_list);
 	INIT_FLIST_HEAD(&td->next_rand_list);
-	pthread_mutex_init(&td->io_u_lock, NULL);
 	td->io_hist_tree = RB_ROOT;
 
-	pthread_condattr_init(&attr);
-	pthread_cond_init(&td->verify_cond, &attr);
-	pthread_cond_init(&td->free_cond, &attr);
+	ret = mutex_cond_init_pshared(&td->io_u_lock, &td->free_cond);
+	if (ret) {
+		td_verror(td, ret, "mutex_cond_init_pshared");
+		goto err;
+	}
+	ret = cond_init_pshared(&td->verify_cond);
+	if (ret) {
+		td_verror(td, ret, "mutex_cond_pshared");
+		goto err;
+	}
 
 	td_set_runstate(td, TD_INITIALIZED);
 	dprint(FD_MUTEX, "up startup_mutex\n");
@@ -1355,6 +1517,14 @@
 	}
 
 	/*
+	 * Do this early, we don't want the compress threads to be limited
+	 * to the same CPUs as the IO workers. So do this before we set
+	 * any potential CPU affinity
+	 */
+	if (iolog_compress_init(td, sk_out))
+		goto err;
+
+	/*
 	 * If we have a gettimeofday() thread, make sure we exclude that
 	 * thread from this job
 	 */
@@ -1488,57 +1658,95 @@
 			goto err;
 	}
 
-	if (td->flags & TD_F_COMPRESS_LOG)
-		tp_init(&td->tp_data);
-
 	fio_verify_init(td);
 
-	fio_gettime(&td->epoch, NULL);
+	if (rate_submit_init(td, sk_out))
+		goto err;
+
+	set_epoch_time(td, o->log_unix_epoch);
 	fio_getrusage(&td->ru_start);
+	memcpy(&td->bw_sample_time, &td->epoch, sizeof(td->epoch));
+	memcpy(&td->iops_sample_time, &td->epoch, sizeof(td->epoch));
+	memcpy(&td->ss.prev_time, &td->epoch, sizeof(td->epoch));
+
+	if (o->ratemin[DDIR_READ] || o->ratemin[DDIR_WRITE] ||
+			o->ratemin[DDIR_TRIM]) {
+	        memcpy(&td->lastrate[DDIR_READ], &td->bw_sample_time,
+					sizeof(td->bw_sample_time));
+	        memcpy(&td->lastrate[DDIR_WRITE], &td->bw_sample_time,
+					sizeof(td->bw_sample_time));
+	        memcpy(&td->lastrate[DDIR_TRIM], &td->bw_sample_time,
+					sizeof(td->bw_sample_time));
+	}
+
+	memset(bytes_done, 0, sizeof(bytes_done));
 	clear_state = 0;
+
 	while (keep_running(td)) {
 		uint64_t verify_bytes;
 
 		fio_gettime(&td->start, NULL);
-		memcpy(&td->bw_sample_time, &td->start, sizeof(td->start));
-		memcpy(&td->iops_sample_time, &td->start, sizeof(td->start));
 		memcpy(&td->tv_cache, &td->start, sizeof(td->start));
 
-		if (o->ratemin[DDIR_READ] || o->ratemin[DDIR_WRITE] ||
-				o->ratemin[DDIR_TRIM]) {
-		        memcpy(&td->lastrate[DDIR_READ], &td->bw_sample_time,
-						sizeof(td->bw_sample_time));
-		        memcpy(&td->lastrate[DDIR_WRITE], &td->bw_sample_time,
-						sizeof(td->bw_sample_time));
-		        memcpy(&td->lastrate[DDIR_TRIM], &td->bw_sample_time,
-						sizeof(td->bw_sample_time));
-		}
+		if (clear_state) {
+			clear_io_state(td, 0);
 
-		if (clear_state)
-			clear_io_state(td);
+			if (o->unlink_each_loop && unlink_all_files(td))
+				break;
+		}
 
 		prune_io_piece_log(td);
 
-		if (td->o.verify_only && (td_write(td) || td_rw(td)))
+		if (td->o.verify_only && td_write(td))
 			verify_bytes = do_dry_run(td);
-		else
-			verify_bytes = do_io(td);
+		else {
+			do_io(td, bytes_done);
+
+			if (!ddir_rw_sum(bytes_done)) {
+				fio_mark_td_terminate(td);
+				verify_bytes = 0;
+			} else {
+				verify_bytes = bytes_done[DDIR_WRITE] +
+						bytes_done[DDIR_TRIM];
+			}
+		}
+
+		/*
+		 * If we took too long to shut down, the main thread could
+		 * already consider us reaped/exited. If that happens, break
+		 * out and clean up.
+		 */
+		if (td->runstate >= TD_EXITED)
+			break;
 
 		clear_state = 1;
 
-		fio_mutex_down(stat_mutex);
-		if (td_read(td) && td->io_bytes[DDIR_READ]) {
-			elapsed = mtime_since_now(&td->start);
-			td->ts.runtime[DDIR_READ] += elapsed;
-		}
-		if (td_write(td) && td->io_bytes[DDIR_WRITE]) {
-			elapsed = mtime_since_now(&td->start);
-			td->ts.runtime[DDIR_WRITE] += elapsed;
-		}
-		if (td_trim(td) && td->io_bytes[DDIR_TRIM]) {
-			elapsed = mtime_since_now(&td->start);
-			td->ts.runtime[DDIR_TRIM] += elapsed;
-		}
+		/*
+		 * Make sure we've successfully updated the rusage stats
+		 * before waiting on the stat mutex. Otherwise we could have
+		 * the stat thread holding stat mutex and waiting for
+		 * the rusage_sem, which would never get upped because
+		 * this thread is waiting for the stat mutex.
+		 */
+		deadlock_loop_cnt = 0;
+		do {
+			check_update_rusage(td);
+			if (!fio_mutex_down_trylock(stat_mutex))
+				break;
+			usleep(1000);
+			if (deadlock_loop_cnt++ > 5000) {
+				log_err("fio seems to be stuck grabbing stat_mutex, forcibly exiting\n");
+				td->error = EDEADLK;
+				goto err;
+			}
+		} while (1);
+
+		if (td_read(td) && td->io_bytes[DDIR_READ])
+			update_runtime(td, elapsed_us, DDIR_READ);
+		if (td_write(td) && td->io_bytes[DDIR_WRITE])
+			update_runtime(td, elapsed_us, DDIR_WRITE);
+		if (td_trim(td) && td->io_bytes[DDIR_TRIM])
+			update_runtime(td, elapsed_us, DDIR_TRIM);
 		fio_gettime(&td->start, NULL);
 		fio_mutex_up(stat_mutex);
 
@@ -1547,17 +1755,22 @@
 
 		if (!o->do_verify ||
 		    o->verify == VERIFY_NONE ||
-		    (td->io_ops->flags & FIO_UNIDIR))
+		    td_ioengine_flagged(td, FIO_UNIDIR))
 			continue;
 
-		clear_io_state(td);
+		clear_io_state(td, 0);
 
 		fio_gettime(&td->start, NULL);
 
 		do_verify(td, verify_bytes);
 
+		/*
+		 * See comment further up for why this is done here.
+		 */
+		check_update_rusage(td);
+
 		fio_mutex_down(stat_mutex);
-		td->ts.runtime[DDIR_READ] += mtime_since_now(&td->start);
+		update_runtime(td, elapsed_us, DDIR_READ);
 		fio_gettime(&td->start, NULL);
 		fio_mutex_up(stat_mutex);
 
@@ -1565,6 +1778,20 @@
 			break;
 	}
 
+	/*
+	 * If td ended up with no I/O when it should have had,
+	 * then something went wrong unless FIO_NOIO or FIO_DISKLESSIO.
+	 * (Are we not missing other flags that can be ignored ?)
+	 */
+	if ((td->o.size || td->o.io_size) && !ddir_rw_sum(bytes_done) &&
+	    !(td_ioengine_flagged(td, FIO_NOIO) ||
+	      td_ioengine_flagged(td, FIO_DISKLESSIO)))
+		log_err("%s: No I/O performed by %s, "
+			 "perhaps try --debug=io option for details?\n",
+			 td->o.name, td->io_ops->name);
+
+	td_set_runstate(td, TD_FINISHING);
+
 	update_rusage_stat(td);
 	td->ts.total_run_time = mtime_since_now(&td->epoch);
 	td->ts.io_bytes[DDIR_READ] = td->io_bytes[DDIR_READ];
@@ -1572,28 +1799,20 @@
 	td->ts.io_bytes[DDIR_TRIM] = td->io_bytes[DDIR_TRIM];
 
 	if (td->o.verify_state_save && !(td->flags & TD_F_VSTATE_SAVED) &&
-	    (td->o.verify != VERIFY_NONE && td_write(td))) {
-		struct all_io_list *state;
-		size_t sz;
-
-		state = get_all_io_list(td->thread_number, &sz);
-		if (state) {
-			__verify_save_state(state, "local");
-			free(state);
-		}
-	}
+	    (td->o.verify != VERIFY_NONE && td_write(td)))
+		verify_save_state(td->thread_number);
 
 	fio_unpin_memory(td);
 
-	fio_writeout_logs(td);
+	td_writeout_logs(td, true);
 
-	if (td->flags & TD_F_COMPRESS_LOG)
-		tp_exit(&td->tp_data);
+	iolog_compress_exit(td);
+	rate_submit_exit(td);
 
 	if (o->exec_postrun)
 		exec_string(o, o->exec_postrun, (const char *)"postrun");
 
-	if (exitall_on_terminate)
+	if (exitall_on_terminate || (o->exitall_error && td->error))
 		fio_terminate_threads(td->groupid);
 
 err:
@@ -1610,6 +1829,15 @@
 	cgroup_shutdown(td, &cgroup_mnt);
 	verify_free_state(td);
 
+	if (td->zone_state_index) {
+		int i;
+
+		for (i = 0; i < DDIR_RWDIR_CNT; i++)
+			free(td->zone_state_index[i]);
+		free(td->zone_state_index);
+		td->zone_state_index = NULL;
+	}
+
 	if (fio_option_is_set(o, cpumask)) {
 		ret = fio_cpuset_exit(&o->cpumask);
 		if (ret)
@@ -1622,9 +1850,6 @@
 	if (o->write_iolog_file)
 		write_iolog_close(td);
 
-	fio_mutex_remove(td->mutex);
-	td->mutex = NULL;
-
 	td_set_runstate(td, TD_EXITED);
 
 	/*
@@ -1633,52 +1858,15 @@
 	 */
 	check_update_rusage(td);
 
+	sk_out_drop();
 	return (void *) (uintptr_t) td->error;
 }
 
-
-/*
- * We cannot pass the td data into a forked process, so attach the td and
- * pass it to the thread worker.
- */
-static int fork_main(int shmid, int offset)
-{
-	struct thread_data *td;
-	void *data, *ret;
-
-#if !defined(__hpux) && !defined(CONFIG_NO_SHM)
-	data = shmat(shmid, NULL, 0);
-	if (data == (void *) -1) {
-		int __err = errno;
-
-		perror("shmat");
-		return __err;
-	}
-#else
-	/*
-	 * HP-UX inherits shm mappings?
-	 */
-	data = threads;
-#endif
-
-	td = data + offset * sizeof(struct thread_data);
-	ret = thread_main(td);
-	shmdt(data);
-	return (int) (uintptr_t) ret;
-}
-
-static void dump_td_info(struct thread_data *td)
-{
-	log_err("fio: job '%s' hasn't exited in %lu seconds, it appears to "
-		"be stuck. Doing forceful exit of this job.\n", td->o.name,
-			(unsigned long) time_since_now(&td->terminate_time));
-}
-
 /*
  * Run over the job map and reap the threads that have exited, if any.
  */
-static void reap_threads(unsigned int *nr_running, unsigned int *t_rate,
-			 unsigned int *m_rate)
+static void reap_threads(unsigned int *nr_running, uint64_t *t_rate,
+			 uint64_t *m_rate)
 {
 	struct thread_data *td;
 	unsigned int cputhreads, realthreads, pending;
@@ -1756,8 +1944,13 @@
 		 * move on.
 		 */
 		if (td->terminate &&
+		    td->runstate < TD_FSYNCING &&
 		    time_since_now(&td->terminate_time) >= FIO_REAP_TIMEOUT) {
-			dump_td_info(td);
+			log_err("fio: job '%s' (state=%d) hasn't exited in "
+				"%lu seconds, it appears to be stuck. Doing "
+				"forceful exit of this job.\n",
+				td->o.name, td->runstate,
+				(unsigned long) time_since_now(&td->terminate_time));
 			td_set_runstate(td, TD_REAPED);
 			goto reaped;
 		}
@@ -1785,29 +1978,29 @@
 		fio_terminate_threads(TERMINATE_ALL);
 }
 
-static int __check_trigger_file(void)
+static bool __check_trigger_file(void)
 {
 	struct stat sb;
 
 	if (!trigger_file)
-		return 0;
+		return false;
 
 	if (stat(trigger_file, &sb))
-		return 0;
+		return false;
 
 	if (unlink(trigger_file) < 0)
 		log_err("fio: failed to unlink %s: %s\n", trigger_file,
 							strerror(errno));
 
-	return 1;
+	return true;
 }
 
-static int trigger_timedout(void)
+static bool trigger_timedout(void)
 {
 	if (trigger_timeout)
 		return time_since_genesis() >= trigger_timeout;
 
-	return 0;
+	return false;
 }
 
 void exec_trigger(const char *cmd)
@@ -1828,7 +2021,7 @@
 		if (nr_clients)
 			fio_clients_send_trigger(trigger_remote_cmd);
 		else {
-			verify_save_state();
+			verify_save_state(IO_LIST_ALL);
 			fio_terminate_threads(TERMINATE_ALL);
 			exec_trigger(trigger_cmd);
 		}
@@ -1848,7 +2041,7 @@
 		ret = fio_server_get_verify_state(td->o.name,
 					td->thread_number - 1, &data);
 		if (!ret)
-			verify_convert_assign_state(td, data);
+			verify_assign_state(td, data);
 	} else
 		ret = verify_load_state(td, "local");
 
@@ -1862,13 +2055,69 @@
 	usleep(usecs);
 }
 
+static bool check_mount_writes(struct thread_data *td)
+{
+	struct fio_file *f;
+	unsigned int i;
+
+	if (!td_write(td) || td->o.allow_mounted_write)
+		return false;
+
+	/*
+	 * If FIO_HAVE_CHARDEV_SIZE is defined, it's likely that chrdevs
+	 * are mkfs'd and mounted.
+	 */
+	for_each_file(td, f, i) {
+#ifdef FIO_HAVE_CHARDEV_SIZE
+		if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR)
+#else
+		if (f->filetype != FIO_TYPE_BLOCK)
+#endif
+			continue;
+		if (device_is_mounted(f->file_name))
+			goto mounted;
+	}
+
+	return false;
+mounted:
+	log_err("fio: %s appears mounted, and 'allow_mounted_write' isn't set. Aborting.\n", f->file_name);
+	return true;
+}
+
+static bool waitee_running(struct thread_data *me)
+{
+	const char *waitee = me->o.wait_for;
+	const char *self = me->o.name;
+	struct thread_data *td;
+	int i;
+
+	if (!waitee)
+		return false;
+
+	for_each_td(td, i) {
+		if (!strcmp(td->o.name, self) || strcmp(td->o.name, waitee))
+			continue;
+
+		if (td->runstate < TD_EXITED) {
+			dprint(FD_PROCESS, "%s fenced by %s(%s)\n",
+					self, td->o.name,
+					runstate_to_name(td->runstate));
+			return true;
+		}
+	}
+
+	dprint(FD_PROCESS, "%s: %s completed, can run\n", self, waitee);
+	return false;
+}
+
 /*
  * Main function for kicking off and reaping jobs, as needed.
  */
-static void run_threads(void)
+static void run_threads(struct sk_out *sk_out)
 {
 	struct thread_data *td;
-	unsigned int i, todo, nr_running, m_rate, t_rate, nr_started;
+	unsigned int i, todo, nr_running, nr_started;
+	uint64_t m_rate, t_rate;
 	uint64_t spent;
 
 	if (fio_gtod_offload && fio_start_gtod_thread())
@@ -1880,13 +2129,15 @@
 
 	nr_thread = nr_process = 0;
 	for_each_td(td, i) {
+		if (check_mount_writes(td))
+			return;
 		if (td->o.use_thread)
 			nr_thread++;
 		else
 			nr_process++;
 	}
 
-	if (output_format == FIO_OUTPUT_NORMAL) {
+	if (output_format & FIO_OUTPUT_NORMAL) {
 		log_info("Starting ");
 		if (nr_thread)
 			log_info("%d thread%s", nr_thread,
@@ -1953,6 +2204,7 @@
 		struct thread_data *map[REAL_MAX_JOBS];
 		struct timeval this_start;
 		int this_jobs = 0, left;
+		struct fork_data *fd;
 
 		/*
 		 * create threads (TD_NOT_CREATED -> TD_CREATED)
@@ -1983,6 +2235,12 @@
 				break;
 			}
 
+			if (waitee_running(td)) {
+				dprint(FD_PROCESS, "%s: waiting for %s\n",
+						td->o.name, td->o.wait_for);
+				continue;
+			}
+
 			init_disk_util(td);
 
 			td->rusage_sem = fio_mutex_init(FIO_MUTEX_LOCKED);
@@ -1996,15 +2254,20 @@
 			map[this_jobs++] = td;
 			nr_started++;
 
+			fd = calloc(1, sizeof(*fd));
+			fd->td = td;
+			fd->sk_out = sk_out;
+
 			if (td->o.use_thread) {
 				int ret;
 
 				dprint(FD_PROCESS, "will pthread_create\n");
 				ret = pthread_create(&td->thread, NULL,
-							thread_main, td);
+							thread_main, fd);
 				if (ret) {
 					log_err("pthread_create: %s\n",
 							strerror(ret));
+					free(fd);
 					nr_started--;
 					break;
 				}
@@ -2017,14 +2280,15 @@
 				dprint(FD_PROCESS, "will fork\n");
 				pid = fork();
 				if (!pid) {
-					int ret = fork_main(shm_id, i);
+					int ret;
 
+					ret = (int)(uintptr_t)thread_main(fd);
 					_exit(ret);
 				} else if (i == fio_debug_jobno)
 					*fio_debug_jobp = pid;
 			}
 			dprint(FD_MUTEX, "wait on startup_mutex\n");
-			if (fio_mutex_down_timeout(startup_mutex, 10)) {
+			if (fio_mutex_down_timeout(startup_mutex, 10000)) {
 				log_err("fio: job startup hung? exiting.\n");
 				fio_terminate_threads(TERMINATE_ALL);
 				fio_abort = 1;
@@ -2109,81 +2373,13 @@
 	update_io_ticks();
 }
 
-static void wait_for_helper_thread_exit(void)
-{
-	void *ret;
-
-	helper_exit = 1;
-	pthread_cond_signal(&helper_cond);
-	pthread_join(helper_thread, &ret);
-}
-
 static void free_disk_util(void)
 {
 	disk_util_prune_entries();
-
-	pthread_cond_destroy(&helper_cond);
+	helper_thread_destroy();
 }
 
-static void *helper_thread_main(void *data)
-{
-	int ret = 0;
-
-	fio_mutex_up(startup_mutex);
-
-	while (!ret) {
-		uint64_t sec = DISK_UTIL_MSEC / 1000;
-		uint64_t nsec = (DISK_UTIL_MSEC % 1000) * 1000000;
-		struct timespec ts;
-		struct timeval tv;
-
-		gettimeofday(&tv, NULL);
-		ts.tv_sec = tv.tv_sec + sec;
-		ts.tv_nsec = (tv.tv_usec * 1000) + nsec;
-
-		if (ts.tv_nsec >= 1000000000ULL) {
-			ts.tv_nsec -= 1000000000ULL;
-			ts.tv_sec++;
-		}
-
-		pthread_cond_timedwait(&helper_cond, &helper_lock, &ts);
-
-		ret = update_io_ticks();
-
-		if (helper_do_stat) {
-			helper_do_stat = 0;
-			__show_running_run_stats();
-		}
-
-		if (!is_backend)
-			print_thread_status();
-	}
-
-	return NULL;
-}
-
-static int create_helper_thread(void)
-{
-	int ret;
-
-	setup_disk_util();
-
-	pthread_cond_init(&helper_cond, NULL);
-	pthread_mutex_init(&helper_lock, NULL);
-
-	ret = pthread_create(&helper_thread, NULL, helper_thread_main, NULL);
-	if (ret) {
-		log_err("Can't create helper thread: %s\n", strerror(ret));
-		return 1;
-	}
-
-	dprint(FD_MUTEX, "wait on startup_mutex\n");
-	fio_mutex_down(startup_mutex);
-	dprint(FD_MUTEX, "done waiting on startup_mutex\n");
-	return 0;
-}
-
-int fio_backend(void)
+int fio_backend(struct sk_out *sk_out)
 {
 	struct thread_data *td;
 	int i;
@@ -2213,14 +2409,14 @@
 
 	set_genesis_time();
 	stat_init();
-	create_helper_thread();
+	helper_thread_create(startup_mutex, sk_out);
 
 	cgroup_list = smalloc(sizeof(*cgroup_list));
 	INIT_FLIST_HEAD(cgroup_list);
 
-	run_threads();
+	run_threads(sk_out);
 
-	wait_for_helper_thread_exit();
+	helper_thread_exit();
 
 	if (!fio_abort) {
 		__show_run_stats();
@@ -2228,18 +2424,26 @@
 			for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 				struct io_log *log = agg_io_log[i];
 
-				flush_log(log);
+				flush_log(log, false);
 				free_log(log);
 			}
 		}
 	}
 
 	for_each_td(td, i) {
+		if (td->ss.dur) {
+			if (td->ss.iops_data != NULL) {
+				free(td->ss.iops_data);
+				free(td->ss.bw_data);
+			}
+		}
 		fio_options_free(td);
 		if (td->rusage_sem) {
 			fio_mutex_remove(td->rusage_sem);
 			td->rusage_sem = NULL;
 		}
+		fio_mutex_remove(td->mutex);
+		td->mutex = NULL;
 	}
 
 	free_disk_util();
diff --git a/blktrace.c b/blktrace.c
index 9afc5be..a3474cb 100644
--- a/blktrace.c
+++ b/blktrace.c
@@ -4,12 +4,14 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
 #include <dirent.h>
 
 #include "flist.h"
 #include "fio.h"
 #include "blktrace_api.h"
-#include "lib/linux-dev-lookup.h"
+#include "oslib/linux-dev-lookup.h"
 
 #define TRACE_FIFO_SIZE	8192
 
@@ -127,17 +129,37 @@
 	flist_add_tail(&ipo->list, &td->io_log_list);
 }
 
-static int trace_add_file(struct thread_data *td, __u32 device)
+static int get_dev_blocksize(const char *dev, unsigned int *bs)
 {
-	static unsigned int last_maj, last_min, last_fileno;
+	int fd;
+
+	fd = open(dev, O_RDONLY);
+	if (fd < 0)
+		return 1;
+
+	if (ioctl(fd, BLKSSZGET, bs) < 0) {
+		close(fd);
+		return 1;
+	}
+
+	close(fd);
+	return 0;
+}
+
+static int trace_add_file(struct thread_data *td, __u32 device,
+			  unsigned int *bs)
+{
+	static unsigned int last_maj, last_min, last_fileno, last_bs;
 	unsigned int maj = FMAJOR(device);
 	unsigned int min = FMINOR(device);
 	struct fio_file *f;
-	char dev[256];
 	unsigned int i;
+	char dev[256];
 
-	if (last_maj == maj && last_min == min)
+	if (last_maj == maj && last_min == min) {
+		*bs = last_bs;
 		return last_fileno;
+	}
 
 	last_maj = maj;
 	last_min = min;
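
For context, BLKSSZGET (from <linux/fs.h>) reports the device's logical block size, and the probe above falls back to 512 bytes when it cannot be queried. A minimal standalone usage sketch; the helper name and the device path are just examples.

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <unistd.h>

static unsigned int probe_block_size(const char *dev)
{
	unsigned int bs = 512;		/* conservative default */
	int fd = open(dev, O_RDONLY);

	if (fd >= 0) {
		if (ioctl(fd, BLKSSZGET, &bs) < 0)
			bs = 512;
		close(fd);
	}

	return bs;
}

/* e.g. probe_block_size("/dev/sda") typically returns 512 or 4096 */
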
@@ -145,14 +167,17 @@
 	/*
 	 * check for this file in our list
 	 */
-	for_each_file(td, f, i)
+	for_each_file(td, f, i) {
 		if (f->major == maj && f->minor == min) {
 			last_fileno = f->fileno;
-			return last_fileno;
+			last_bs = f->bs;
+			goto out;
 		}
+	}
 
 	strcpy(dev, "/dev");
 	if (blktrace_lookup_device(td->o.replay_redirect, dev, maj, min)) {
+		unsigned int this_bs;
 		int fileno;
 
 		if (td->o.replay_redirect)
@@ -164,31 +189,48 @@
 
 		dprint(FD_BLKTRACE, "add devices %s\n", dev);
 		fileno = add_file_exclusive(td, dev);
+
+		if (get_dev_blocksize(dev, &this_bs))
+			this_bs = 512;
+
 		td->o.open_files++;
 		td->files[fileno]->major = maj;
 		td->files[fileno]->minor = min;
+		td->files[fileno]->bs = this_bs;
 		trace_add_open_close_event(td, fileno, FIO_LOG_OPEN_FILE);
+
 		last_fileno = fileno;
+		last_bs = this_bs;
 	}
 
+out:
+	*bs = last_bs;
 	return last_fileno;
 }
 
+static void t_bytes_align(struct thread_options *o, struct blk_io_trace *t)
+{
+	if (!o->replay_align)
+		return;
+
+	t->bytes = (t->bytes + o->replay_align - 1) & ~(o->replay_align - 1);
+}
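
t_bytes_align() above is the standard power-of-two round-up. Spelled out on its own, assuming the alignment value is a power of two (as a replay_align setting is expected to be):

/* Round 'bytes' up to the next multiple of 'align' (align: power of two). */
static unsigned int round_up_pow2(unsigned int bytes, unsigned int align)
{
	return (bytes + align - 1) & ~(align - 1);
}

/* e.g. round_up_pow2(1000, 512) == 1024, round_up_pow2(4096, 4096) == 4096 */
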
+
 /*
  * Store blk_io_trace data in an ipo for later retrieval.
  */
 static void store_ipo(struct thread_data *td, unsigned long long offset,
 		      unsigned int bytes, int rw, unsigned long long ttime,
-		      int fileno)
+		      int fileno, unsigned int bs)
 {
 	struct io_piece *ipo = malloc(sizeof(*ipo));
 
 	init_ipo(ipo);
 
-	/*
-	 * the 512 is wrong here, it should be the hardware sector size...
-	 */
-	ipo->offset = offset * 512;
+	ipo->offset = offset * bs;
+	if (td->o.replay_scale)
+		ipo->offset = ipo->offset / td->o.replay_scale;
+	ipo_bytes_align(td->o.replay_align, ipo);
 	ipo->len = bytes;
 	ipo->delay = ttime / 1000;
 	if (rw)
@@ -225,27 +267,28 @@
 static void handle_trace_discard(struct thread_data *td,
 				 struct blk_io_trace *t,
 				 unsigned long long ttime,
-				 unsigned long *ios, unsigned int *bs)
+				 unsigned long *ios, unsigned int *rw_bs)
 {
 	struct io_piece *ipo = malloc(sizeof(*ipo));
+	unsigned int bs;
 	int fileno;
 
 	init_ipo(ipo);
-	fileno = trace_add_file(td, t->device);
+	fileno = trace_add_file(td, t->device, &bs);
 
 	ios[DDIR_TRIM]++;
-	if (t->bytes > bs[DDIR_TRIM])
-		bs[DDIR_TRIM] = t->bytes;
+	if (t->bytes > rw_bs[DDIR_TRIM])
+		rw_bs[DDIR_TRIM] = t->bytes;
 
 	td->o.size += t->bytes;
 
 	memset(ipo, 0, sizeof(*ipo));
 	INIT_FLIST_HEAD(&ipo->list);
 
-	/*
-	 * the 512 is wrong here, it should be the hardware sector size...
-	 */
-	ipo->offset = t->sector * 512;
+	ipo->offset = t->sector * bs;
+	if (td->o.replay_scale)
+		ipo->offset = ipo->offset / td->o.replay_scale;
+	ipo_bytes_align(td->o.replay_align, ipo);
 	ipo->len = t->bytes;
 	ipo->delay = ttime / 1000;
 	ipo->ddir = DDIR_TRIM;
@@ -259,21 +302,22 @@
 
 static void handle_trace_fs(struct thread_data *td, struct blk_io_trace *t,
 			    unsigned long long ttime, unsigned long *ios,
-			    unsigned int *bs)
+			    unsigned int *rw_bs)
 {
+	unsigned int bs;
 	int rw;
 	int fileno;
 
-	fileno = trace_add_file(td, t->device);
+	fileno = trace_add_file(td, t->device, &bs);
 
 	rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
 
-	if (t->bytes > bs[rw])
-		bs[rw] = t->bytes;
+	if (t->bytes > rw_bs[rw])
+		rw_bs[rw] = t->bytes;
 
 	ios[rw]++;
 	td->o.size += t->bytes;
-	store_ipo(td, t->sector, t->bytes, rw, ttime, fileno);
+	store_ipo(td, t->sector, t->bytes, rw, ttime, fileno, bs);
 }
 
 /*
@@ -284,7 +328,7 @@
 			 unsigned long *ios, unsigned int *bs)
 {
 	static unsigned long long last_ttime;
-	unsigned long long delay;
+	unsigned long long delay = 0;
 
 	if ((t->action & 0xffff) != __BLK_TA_QUEUE)
 		return;
@@ -299,6 +343,8 @@
 		}
 	}
 
+	t_bytes_align(&td->o, t);
+
 	if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY))
 		handle_trace_notify(t);
 	else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
@@ -327,6 +373,47 @@
 	return (t->action & BLK_TC_ACT(BLK_TC_WRITE | BLK_TC_DISCARD)) != 0;
 }
 
+static enum fio_ddir t_get_ddir(struct blk_io_trace *t)
+{
+	if (t->action & BLK_TC_ACT(BLK_TC_READ))
+		return DDIR_READ;
+	else if (t->action & BLK_TC_ACT(BLK_TC_WRITE))
+		return DDIR_WRITE;
+	else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
+		return DDIR_TRIM;
+
+	return DDIR_INVAL;
+}
+
+static void depth_inc(struct blk_io_trace *t, int *depth)
+{
+	enum fio_ddir ddir;
+
+	ddir = t_get_ddir(t);
+	if (ddir != DDIR_INVAL)
+		depth[ddir]++;
+}
+
+static void depth_dec(struct blk_io_trace *t, int *depth)
+{
+	enum fio_ddir ddir;
+
+	ddir = t_get_ddir(t);
+	if (ddir != DDIR_INVAL)
+		depth[ddir]--;
+}
+
+static void depth_end(struct blk_io_trace *t, int *this_depth, int *depth)
+{
+	enum fio_ddir ddir = DDIR_INVAL;
+
+	ddir = t_get_ddir(t);
+	if (ddir != DDIR_INVAL) {
+		depth[ddir] = max(depth[ddir], this_depth[ddir]);
+		this_depth[ddir] = 0;
+	}
+}
+
 /*
  * Load a blktrace file by reading all the blk_io_trace entries, and storing
  * them as io_pieces like the fio text version would do.
@@ -339,7 +426,7 @@
 	struct fifo *fifo;
 	int fd, i, old_state;
 	struct fio_file *f;
-	int this_depth, depth;
+	int this_depth[DDIR_RWDIR_CNT], depth[DDIR_RWDIR_CNT], max_depth;
 
 	fd = open(filename, O_RDONLY);
 	if (fd < 0) {
@@ -353,10 +440,14 @@
 
 	td->o.size = 0;
 
-	ios[0] = ios[1] = 0;
-	rw_bs[0] = rw_bs[1] = 0;
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		ios[i] = 0;
+		rw_bs[i] = 0;
+		this_depth[i] = 0;
+		depth[i] = 0;
+	}
+
 	skipped_writes = 0;
-	this_depth = depth = 0;
 	do {
 		int ret = trace_fifo_get(td, fifo, fd, &t, sizeof(t));
 
@@ -392,11 +483,12 @@
 		}
 		if ((t.action & BLK_TC_ACT(BLK_TC_NOTIFY)) == 0) {
 			if ((t.action & 0xffff) == __BLK_TA_QUEUE)
-				this_depth++;
-			else if ((t.action & 0xffff) == __BLK_TA_COMPLETE) {
-				depth = max(depth, this_depth);
-				this_depth = 0;
-			}
+				depth_inc(&t, this_depth);
+			else if (((t.action & 0xffff) == __BLK_TA_BACKMERGE) ||
+				((t.action & 0xffff) == __BLK_TA_FRONTMERGE))
+				depth_dec(&t, this_depth);
+			else if ((t.action & 0xffff) == __BLK_TA_COMPLETE)
+				depth_end(&t, this_depth, depth);
 
 			if (t_is_write(&t) && read_only) {
 				skipped_writes++;
@@ -426,8 +518,14 @@
 	 * For stacked devices, we don't always get a COMPLETE event so
 	 * the depth grows to insane values. Limit it to something sane(r).
 	 */
-	if (!depth || depth > 1024)
-		depth = 1024;
+	max_depth = 0;
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		if (depth[i] > 1024)
+			depth[i] = 1024;
+		else if (!depth[i] && ios[i])
+			depth[i] = 1;
+		max_depth = max(depth[i], max_depth);
+	}
 
 	if (skipped_writes)
 		log_err("fio: %s skips replay of %lu writes due to read-only\n",
@@ -451,16 +549,17 @@
 
 	/*
 	 * We need to do direct/raw ios to the device, to avoid getting
-	 * read-ahead in our way.
+	 * read-ahead in our way. But only do so if the minimum block size
+	 * is a multiple of 4k, otherwise we don't know if it's safe to do so.
 	 */
-	td->o.odirect = 1;
+	if (!fio_option_is_set(&td->o, odirect) && !(td_min_bs(td) & 4095))
+		td->o.odirect = 1;
 
 	/*
-	 * we don't know if this option was set or not. it defaults to 1,
-	 * so we'll just guess that we should override it if it's still 1
+	 * If depth wasn't manually set, use probed depth
 	 */
-	if (td->o.iodepth == 1)
-		td->o.iodepth = td->o.iodepth_low = depth;
+	if (!fio_option_is_set(&td->o, iodepth))
+		td->o.iodepth = td->o.iodepth_low = max_depth;
 
 	return 0;
 err:
diff --git a/cconv.c b/cconv.c
index 0fca764..3295824 100644
--- a/cconv.c
+++ b/cconv.c
@@ -23,8 +23,11 @@
 
 static void free_thread_options_to_cpu(struct thread_options *o)
 {
+	int i;
+
 	free(o->description);
 	free(o->name);
+	free(o->wait_for);
 	free(o->directory);
 	free(o->filename);
 	free(o->filename_format);
@@ -36,12 +39,18 @@
 	free(o->bw_log_file);
 	free(o->lat_log_file);
 	free(o->iops_log_file);
+	free(o->hist_log_file);
 	free(o->replay_redirect);
 	free(o->exec_prerun);
 	free(o->exec_postrun);
 	free(o->ioscheduler);
 	free(o->profile);
 	free(o->cgroup);
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		free(o->bssplit[i]);
+		free(o->zone_split[i]);
+	}
 }
 
 void convert_thread_options_to_cpu(struct thread_options *o,
@@ -54,6 +63,7 @@
 
 	string_to_cpu(&o->description, top->description);
 	string_to_cpu(&o->name, top->name);
+	string_to_cpu(&o->wait_for, top->wait_for);
 	string_to_cpu(&o->directory, top->directory);
 	string_to_cpu(&o->filename, top->filename);
 	string_to_cpu(&o->filename_format, top->filename_format);
@@ -65,6 +75,7 @@
 	string_to_cpu(&o->bw_log_file, top->bw_log_file);
 	string_to_cpu(&o->lat_log_file, top->lat_log_file);
 	string_to_cpu(&o->iops_log_file, top->iops_log_file);
+	string_to_cpu(&o->hist_log_file, top->hist_log_file);
 	string_to_cpu(&o->replay_redirect, top->replay_redirect);
 	string_to_cpu(&o->exec_prerun, top->exec_prerun);
 	string_to_cpu(&o->exec_postrun, top->exec_postrun);
@@ -72,18 +83,21 @@
 	string_to_cpu(&o->profile, top->profile);
 	string_to_cpu(&o->cgroup, top->cgroup);
 
+	o->allow_create = le32_to_cpu(top->allow_create);
+	o->allow_mounted_write = le32_to_cpu(top->allow_mounted_write);
 	o->td_ddir = le32_to_cpu(top->td_ddir);
 	o->rw_seq = le32_to_cpu(top->rw_seq);
 	o->kb_base = le32_to_cpu(top->kb_base);
-	o->unit_base = le32_to_cpu(top->kb_base);
+	o->unit_base = le32_to_cpu(top->unit_base);
 	o->ddir_seq_nr = le32_to_cpu(top->ddir_seq_nr);
 	o->ddir_seq_add = le64_to_cpu(top->ddir_seq_add);
 	o->iodepth = le32_to_cpu(top->iodepth);
 	o->iodepth_low = le32_to_cpu(top->iodepth_low);
 	o->iodepth_batch = le32_to_cpu(top->iodepth_batch);
-	o->iodepth_batch_complete = le32_to_cpu(top->iodepth_batch_complete);
+	o->iodepth_batch_complete_min = le32_to_cpu(top->iodepth_batch_complete_min);
+	o->iodepth_batch_complete_max = le32_to_cpu(top->iodepth_batch_complete_max);
 	o->size = le64_to_cpu(top->size);
-	o->io_limit = le64_to_cpu(top->io_limit);
+	o->io_size = le64_to_cpu(top->io_size);
 	o->size_percent = le32_to_cpu(top->size_percent);
 	o->fill_device = le32_to_cpu(top->fill_device);
 	o->file_append = le32_to_cpu(top->file_append);
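
Two details worth noting in the hunk above: unit_base now converts from top->unit_base rather than top->kb_base (a copy-paste fix), and the per-direction rate/ratemin fields are now 64 bits wide on the wire, so both directions of the conversion must use the 64-bit byte-order helpers. A minimal round-trip sketch; glibc's htole64()/le64toh() stand in for fio's own conversion macros, and the function names are illustrative.

#include <endian.h>
#include <stdint.h>

static uint64_t rate_to_wire(uint64_t host)
{
	return htole64(host);		/* pack for the network/on-disk format */
}

static uint64_t rate_from_wire(uint64_t wire)
{
	return le64toh(wire);		/* unpack back to host byte order */
}
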
@@ -106,9 +120,19 @@
 			}
 		}
 
+		o->zone_split_nr[i] = le32_to_cpu(top->zone_split_nr[i]);
+
+		if (o->zone_split_nr[i]) {
+			o->zone_split[i] = malloc(o->zone_split_nr[i] * sizeof(struct zone_split));
+			for (j = 0; j < o->zone_split_nr[i]; j++) {
+				o->zone_split[i][j].access_perc = top->zone_split[i][j].access_perc;
+				o->zone_split[i][j].size_perc = top->zone_split[i][j].size_perc;
+			}
+		}
+
 		o->rwmix[i] = le32_to_cpu(top->rwmix[i]);
-		o->rate[i] = le32_to_cpu(top->rate[i]);
-		o->ratemin[i] = le32_to_cpu(top->ratemin[i]);
+		o->rate[i] = le64_to_cpu(top->rate[i]);
+		o->ratemin[i] = le64_to_cpu(top->ratemin[i]);
 		o->rate_iops[i] = le32_to_cpu(top->rate_iops[i]);
 		o->rate_iops_min[i] = le32_to_cpu(top->rate_iops_min[i]);
 
@@ -116,6 +140,8 @@
 	}
 
 	o->ratecycle = le32_to_cpu(top->ratecycle);
+	o->io_submit_mode = le32_to_cpu(top->io_submit_mode);
+	o->unique_filename = le32_to_cpu(top->unique_filename);
 	o->nr_files = le32_to_cpu(top->nr_files);
 	o->open_files = le32_to_cpu(top->open_files);
 	o->file_lock_mode = le32_to_cpu(top->file_lock_mode);
@@ -148,23 +174,30 @@
 	o->verify_batch = le32_to_cpu(top->verify_batch);
 	o->use_thread = le32_to_cpu(top->use_thread);
 	o->unlink = le32_to_cpu(top->unlink);
+	o->unlink_each_loop = le32_to_cpu(top->unlink_each_loop);
 	o->do_disk_util = le32_to_cpu(top->do_disk_util);
 	o->override_sync = le32_to_cpu(top->override_sync);
 	o->rand_repeatable = le32_to_cpu(top->rand_repeatable);
 	o->allrand_repeatable = le32_to_cpu(top->allrand_repeatable);
 	o->rand_seed = le64_to_cpu(top->rand_seed);
 	o->log_avg_msec = le32_to_cpu(top->log_avg_msec);
+	o->log_hist_msec = le32_to_cpu(top->log_hist_msec);
+	o->log_hist_coarseness = le32_to_cpu(top->log_hist_coarseness);
+	o->log_max = le32_to_cpu(top->log_max);
 	o->log_offset = le32_to_cpu(top->log_offset);
 	o->log_gz = le32_to_cpu(top->log_gz);
 	o->log_gz_store = le32_to_cpu(top->log_gz_store);
+	o->log_unix_epoch = le32_to_cpu(top->log_unix_epoch);
 	o->norandommap = le32_to_cpu(top->norandommap);
 	o->softrandommap = le32_to_cpu(top->softrandommap);
 	o->bs_unaligned = le32_to_cpu(top->bs_unaligned);
 	o->fsync_on_close = le32_to_cpu(top->fsync_on_close);
 	o->bs_is_seq_rand = le32_to_cpu(top->bs_is_seq_rand);
 	o->random_distribution = le32_to_cpu(top->random_distribution);
+	o->exitall_error = le32_to_cpu(top->exitall_error);
 	o->zipf_theta.u.f = fio_uint64_to_double(le64_to_cpu(top->zipf_theta.u.i));
 	o->pareto_h.u.f = fio_uint64_to_double(le64_to_cpu(top->pareto_h.u.i));
+	o->gauss_dev.u.f = fio_uint64_to_double(le64_to_cpu(top->gauss_dev.u.i));
 	o->random_generator = le32_to_cpu(top->random_generator);
 	o->hugepage_size = le32_to_cpu(top->hugepage_size);
 	o->rw_min_bs = le32_to_cpu(top->rw_min_bs);
@@ -180,6 +213,10 @@
 	o->start_delay_high = le64_to_cpu(top->start_delay_high);
 	o->timeout = le64_to_cpu(top->timeout);
 	o->ramp_time = le64_to_cpu(top->ramp_time);
+	o->ss_dur = le64_to_cpu(top->ss_dur);
+	o->ss_ramp_time = le64_to_cpu(top->ss_ramp_time);
+	o->ss_state = le32_to_cpu(top->ss_state);
+	o->ss_limit.u.f = fio_uint64_to_double(le64_to_cpu(top->ss_limit.u.i));
 	o->zone_range = le64_to_cpu(top->zone_range);
 	o->zone_size = le64_to_cpu(top->zone_size);
 	o->zone_skip = le64_to_cpu(top->zone_skip);
@@ -198,6 +235,7 @@
 	o->new_group = le32_to_cpu(top->new_group);
 	o->numjobs = le32_to_cpu(top->numjobs);
 	o->cpus_allowed_policy = le32_to_cpu(top->cpus_allowed_policy);
+	o->gpu_dev_id = le32_to_cpu(top->gpu_dev_id);
 	o->iolog = le32_to_cpu(top->iolog);
 	o->rwmixcycle = le32_to_cpu(top->rwmixcycle);
 	o->nice = le32_to_cpu(top->nice);
@@ -205,6 +243,7 @@
 	o->ioprio_class = le32_to_cpu(top->ioprio_class);
 	o->file_service_type = le32_to_cpu(top->file_service_type);
 	o->group_reporting = le32_to_cpu(top->group_reporting);
+	o->stats = le32_to_cpu(top->stats);
 	o->fadvise_hint = le32_to_cpu(top->fadvise_hint);
 	o->fallocate_mode = le32_to_cpu(top->fallocate_mode);
 	o->zero_buffers = le32_to_cpu(top->zero_buffers);
@@ -242,14 +281,25 @@
 	o->compress_percentage = le32_to_cpu(top->compress_percentage);
 	o->compress_chunk = le32_to_cpu(top->compress_chunk);
 	o->dedupe_percentage = le32_to_cpu(top->dedupe_percentage);
+	o->skip_bad = le32_to_cpu(top->skip_bad);
+	o->block_error_hist = le32_to_cpu(top->block_error_hist);
+	o->replay_align = le32_to_cpu(top->replay_align);
+	o->replay_scale = le32_to_cpu(top->replay_scale);
+	o->per_job_logs = le32_to_cpu(top->per_job_logs);
+	o->write_bw_log = le32_to_cpu(top->write_bw_log);
+	o->write_lat_log = le32_to_cpu(top->write_lat_log);
+	o->write_iops_log = le32_to_cpu(top->write_iops_log);
+	o->write_hist_log = le32_to_cpu(top->write_hist_log);
 
 	o->trim_backlog = le64_to_cpu(top->trim_backlog);
+	o->rate_process = le32_to_cpu(top->rate_process);
 
 	for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++)
 		o->percentile_list[i].u.f = fio_uint64_to_double(le64_to_cpu(top->percentile_list[i].u.i));
 #if 0
 	uint8_t cpumask[FIO_TOP_STR_MAX];
 	uint8_t verify_cpumask[FIO_TOP_STR_MAX];
+	uint8_t log_gz_cpumask[FIO_TOP_STR_MAX];
 #endif
 }
 
@@ -263,6 +313,7 @@
 
 	string_to_net(top->description, o->description);
 	string_to_net(top->name, o->name);
+	string_to_net(top->wait_for, o->wait_for);
 	string_to_net(top->directory, o->directory);
 	string_to_net(top->filename, o->filename);
 	string_to_net(top->filename_format, o->filename_format);
@@ -274,6 +325,7 @@
 	string_to_net(top->bw_log_file, o->bw_log_file);
 	string_to_net(top->lat_log_file, o->lat_log_file);
 	string_to_net(top->iops_log_file, o->iops_log_file);
+	string_to_net(top->hist_log_file, o->hist_log_file);
 	string_to_net(top->replay_redirect, o->replay_redirect);
 	string_to_net(top->exec_prerun, o->exec_prerun);
 	string_to_net(top->exec_postrun, o->exec_postrun);
@@ -281,20 +333,25 @@
 	string_to_net(top->profile, o->profile);
 	string_to_net(top->cgroup, o->cgroup);
 
+	top->allow_create = cpu_to_le32(o->allow_create);
+	top->allow_mounted_write = cpu_to_le32(o->allow_mounted_write);
 	top->td_ddir = cpu_to_le32(o->td_ddir);
 	top->rw_seq = cpu_to_le32(o->rw_seq);
 	top->kb_base = cpu_to_le32(o->kb_base);
-	top->unit_base = cpu_to_le32(o->kb_base);
+	top->unit_base = cpu_to_le32(o->unit_base);
 	top->ddir_seq_nr = cpu_to_le32(o->ddir_seq_nr);
 	top->iodepth = cpu_to_le32(o->iodepth);
 	top->iodepth_low = cpu_to_le32(o->iodepth_low);
 	top->iodepth_batch = cpu_to_le32(o->iodepth_batch);
-	top->iodepth_batch_complete = cpu_to_le32(o->iodepth_batch_complete);
+	top->iodepth_batch_complete_min = cpu_to_le32(o->iodepth_batch_complete_min);
+	top->iodepth_batch_complete_max = cpu_to_le32(o->iodepth_batch_complete_max);
 	top->size_percent = cpu_to_le32(o->size_percent);
 	top->fill_device = cpu_to_le32(o->fill_device);
 	top->file_append = cpu_to_le32(o->file_append);
 	top->ratecycle = cpu_to_le32(o->ratecycle);
+	top->io_submit_mode = cpu_to_le32(o->io_submit_mode);
 	top->nr_files = cpu_to_le32(o->nr_files);
+	top->unique_filename = cpu_to_le32(o->unique_filename);
 	top->open_files = cpu_to_le32(o->open_files);
 	top->file_lock_mode = cpu_to_le32(o->file_lock_mode);
 	top->odirect = cpu_to_le32(o->odirect);
@@ -322,23 +379,28 @@
 	top->verify_batch = cpu_to_le32(o->verify_batch);
 	top->use_thread = cpu_to_le32(o->use_thread);
 	top->unlink = cpu_to_le32(o->unlink);
+	top->unlink_each_loop = cpu_to_le32(o->unlink_each_loop);
 	top->do_disk_util = cpu_to_le32(o->do_disk_util);
 	top->override_sync = cpu_to_le32(o->override_sync);
 	top->rand_repeatable = cpu_to_le32(o->rand_repeatable);
 	top->allrand_repeatable = cpu_to_le32(o->allrand_repeatable);
 	top->rand_seed = __cpu_to_le64(o->rand_seed);
 	top->log_avg_msec = cpu_to_le32(o->log_avg_msec);
+	top->log_max = cpu_to_le32(o->log_max);
 	top->log_offset = cpu_to_le32(o->log_offset);
 	top->log_gz = cpu_to_le32(o->log_gz);
 	top->log_gz_store = cpu_to_le32(o->log_gz_store);
+	top->log_unix_epoch = cpu_to_le32(o->log_unix_epoch);
 	top->norandommap = cpu_to_le32(o->norandommap);
 	top->softrandommap = cpu_to_le32(o->softrandommap);
 	top->bs_unaligned = cpu_to_le32(o->bs_unaligned);
 	top->fsync_on_close = cpu_to_le32(o->fsync_on_close);
 	top->bs_is_seq_rand = cpu_to_le32(o->bs_is_seq_rand);
 	top->random_distribution = cpu_to_le32(o->random_distribution);
+	top->exitall_error = cpu_to_le32(o->exitall_error);
 	top->zipf_theta.u.i = __cpu_to_le64(fio_double_to_uint64(o->zipf_theta.u.f));
 	top->pareto_h.u.i = __cpu_to_le64(fio_double_to_uint64(o->pareto_h.u.f));
+	top->gauss_dev.u.i = __cpu_to_le64(fio_double_to_uint64(o->gauss_dev.u.f));
 	top->random_generator = cpu_to_le32(o->random_generator);
 	top->hugepage_size = cpu_to_le32(o->hugepage_size);
 	top->rw_min_bs = cpu_to_le32(o->rw_min_bs);
@@ -359,6 +421,7 @@
 	top->new_group = cpu_to_le32(o->new_group);
 	top->numjobs = cpu_to_le32(o->numjobs);
 	top->cpus_allowed_policy = cpu_to_le32(o->cpus_allowed_policy);
+	top->gpu_dev_id = cpu_to_le32(o->gpu_dev_id);
 	top->iolog = cpu_to_le32(o->iolog);
 	top->rwmixcycle = cpu_to_le32(o->rwmixcycle);
 	top->nice = cpu_to_le32(o->nice);
@@ -366,6 +429,7 @@
 	top->ioprio_class = cpu_to_le32(o->ioprio_class);
 	top->file_service_type = cpu_to_le32(o->file_service_type);
 	top->group_reporting = cpu_to_le32(o->group_reporting);
+	top->stats = cpu_to_le32(o->stats);
 	top->fadvise_hint = cpu_to_le32(o->fadvise_hint);
 	top->fallocate_mode = cpu_to_le32(o->fallocate_mode);
 	top->zero_buffers = cpu_to_le32(o->zero_buffers);
@@ -403,6 +467,15 @@
 	top->compress_percentage = cpu_to_le32(o->compress_percentage);
 	top->compress_chunk = cpu_to_le32(o->compress_chunk);
 	top->dedupe_percentage = cpu_to_le32(o->dedupe_percentage);
+	top->block_error_hist = cpu_to_le32(o->block_error_hist);
+	top->skip_bad = cpu_to_le32(o->skip_bad);
+	top->replay_align = cpu_to_le32(o->replay_align);
+	top->replay_scale = cpu_to_le32(o->replay_scale);
+	top->per_job_logs = cpu_to_le32(o->per_job_logs);
+	top->write_bw_log = cpu_to_le32(o->write_bw_log);
+	top->write_lat_log = cpu_to_le32(o->write_lat_log);
+	top->write_iops_log = cpu_to_le32(o->write_iops_log);
+	top->write_hist_log = cpu_to_le32(o->write_hist_log);
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		top->bs[i] = cpu_to_le32(o->bs[i]);
@@ -424,9 +497,24 @@
 			}
 		}
 
+		top->zone_split_nr[i] = cpu_to_le32(o->zone_split_nr[i]);
+
+		if (o->zone_split_nr[i]) {
+			unsigned int zone_split_nr = o->zone_split_nr[i];
+
+			if (zone_split_nr > ZONESPLIT_MAX) {
+				log_err("fio: ZONESPLIT_MAX is too small\n");
+				zone_split_nr = ZONESPLIT_MAX;
+			}
+			for (j = 0; j < zone_split_nr; j++) {
+				top->zone_split[i][j].access_perc = o->zone_split[i][j].access_perc;
+				top->zone_split[i][j].size_perc = o->zone_split[i][j].size_perc;
+			}
+		}
+
 		top->rwmix[i] = cpu_to_le32(o->rwmix[i]);
-		top->rate[i] = cpu_to_le32(o->rate[i]);
-		top->ratemin[i] = cpu_to_le32(o->ratemin[i]);
+		top->rate[i] = cpu_to_le64(o->rate[i]);
+		top->ratemin[i] = cpu_to_le64(o->ratemin[i]);
 		top->rate_iops[i] = cpu_to_le32(o->rate_iops[i]);
 		top->rate_iops_min[i] = cpu_to_le32(o->rate_iops_min[i]);
 
@@ -437,12 +525,16 @@
 	memcpy(top->buffer_pattern, o->buffer_pattern, MAX_PATTERN_SIZE);
 
 	top->size = __cpu_to_le64(o->size);
-	top->io_limit = __cpu_to_le64(o->io_limit);
+	top->io_size = __cpu_to_le64(o->io_size);
 	top->verify_backlog = __cpu_to_le64(o->verify_backlog);
 	top->start_delay = __cpu_to_le64(o->start_delay);
 	top->start_delay_high = __cpu_to_le64(o->start_delay_high);
 	top->timeout = __cpu_to_le64(o->timeout);
 	top->ramp_time = __cpu_to_le64(o->ramp_time);
+	top->ss_dur = __cpu_to_le64(o->ss_dur);
+	top->ss_ramp_time = __cpu_to_le64(o->ss_ramp_time);
+	top->ss_state = cpu_to_le32(o->ss_state);
+	top->ss_limit.u.i = __cpu_to_le64(fio_double_to_uint64(o->ss_limit.u.f));
 	top->zone_range = __cpu_to_le64(o->zone_range);
 	top->zone_size = __cpu_to_le64(o->zone_size);
 	top->zone_skip = __cpu_to_le64(o->zone_skip);
@@ -454,12 +546,14 @@
 	top->trim_backlog = __cpu_to_le64(o->trim_backlog);
 	top->offset_increment = __cpu_to_le64(o->offset_increment);
 	top->number_ios = __cpu_to_le64(o->number_ios);
+	top->rate_process = cpu_to_le32(o->rate_process);
 
 	for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++)
 		top->percentile_list[i].u.i = __cpu_to_le64(fio_double_to_uint64(o->percentile_list[i].u.f));
 #if 0
 	uint8_t cpumask[FIO_TOP_STR_MAX];
 	uint8_t verify_cpumask[FIO_TOP_STR_MAX];
+	uint8_t log_gz_cpumask[FIO_TOP_STR_MAX];
 #endif
 
 }
diff --git a/cgroup.c b/cgroup.c
index 34b61de..a297e2a 100644
--- a/cgroup.c
+++ b/cgroup.c
@@ -102,9 +102,9 @@
 	char *str = malloc(64);
 
 	if (td->o.cgroup)
-		sprintf(str, "%s%s%s", mnt, FIO_OS_PATH_SEPARATOR, td->o.cgroup);
+		sprintf(str, "%s/%s", mnt, td->o.cgroup);
 	else
-		sprintf(str, "%s%s%s", mnt, FIO_OS_PATH_SEPARATOR, td->o.name);
+		sprintf(str, "%s/%s", mnt, td->o.name);
 
 	return str;
 }
@@ -116,7 +116,7 @@
 	char tmp[256];
 	FILE *f;
 
-	sprintf(tmp, "%s%s%s", path, FIO_OS_PATH_SEPARATOR, filename);
+	sprintf(tmp, "%s/%s", path, filename);
 	f = fopen(tmp, "w");
 	if (!f) {
 		td_verror(td, errno, onerr);
diff --git a/client.c b/client.c
index 760ec85..80096bf 100644
--- a/client.c
+++ b/client.c
@@ -33,6 +33,8 @@
 static void handle_stop(struct fio_client *client, struct fio_net_cmd *cmd);
 static void handle_start(struct fio_client *client, struct fio_net_cmd *cmd);
 
+static void convert_text(struct fio_net_cmd *cmd);
+
 struct client_ops fio_client_ops = {
 	.text		= handle_text,
 	.disk_util	= handle_du,
@@ -59,6 +61,7 @@
 
 static int sum_stat_nr;
 static struct json_object *root = NULL;
+static struct json_object *job_opt_object = NULL;
 static struct json_array *clients_array = NULL;
 static struct json_array *du_array = NULL;
 
@@ -69,6 +72,8 @@
 #define FIO_CLIENT_HASH_MASK	(FIO_CLIENT_HASH_SZ - 1)
 static struct flist_head client_hash[FIO_CLIENT_HASH_SZ];
 
+static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *, bool *);
+
 static void fio_client_add_hash(struct fio_client *client)
 {
 	int bucket = hash_long(client->fd, FIO_CLIENT_HASH_BITS);
@@ -117,10 +122,23 @@
 
 static void fio_client_json_init(void)
 {
-	if (output_format != FIO_OUTPUT_JSON)
+	char time_buf[32];
+	time_t time_p;
+
+	if (!(output_format & FIO_OUTPUT_JSON))
 		return;
+
+	time(&time_p);
+	os_ctime_r((const time_t *) &time_p, time_buf, sizeof(time_buf));
+	time_buf[strlen(time_buf) - 1] = '\0';
+
 	root = json_create_object();
 	json_object_add_value_string(root, "fio version", fio_version_string);
+	json_object_add_value_int(root, "timestamp", time_p);
+	json_object_add_value_string(root, "time", time_buf);
+
+	job_opt_object = json_create_object();
+	json_object_add_value_object(root, "global options", job_opt_object);
 	clients_array = json_create_array();
 	json_object_add_value_array(root, "client_stats", clients_array);
 	du_array = json_create_array();
@@ -129,9 +147,11 @@
 
 static void fio_client_json_fini(void)
 {
-	if (output_format != FIO_OUTPUT_JSON)
+	if (!(output_format & FIO_OUTPUT_JSON))
 		return;
-	json_print_object(root);
+
+	log_info("\n");
+	json_print_object(root, NULL);
 	log_info("\n");
 	json_free_object(root);
 	root = NULL;
@@ -174,6 +194,8 @@
 	}
 	if (client->files)
 		free(client->files);
+	if (client->opt_lists)
+		free(client->opt_lists);
 
 	if (!client->did_stat)
 		sum_stat_clients--;
@@ -184,12 +206,43 @@
 	free(client);
 }
 
+static int fio_client_dec_jobs_eta(struct client_eta *eta, client_eta_op eta_fn)
+{
+	if (!--eta->pending) {
+		eta_fn(&eta->eta);
+		free(eta);
+		return 0;
+	}
+
+	return 1;
+}
+
+static void fio_drain_client_text(struct fio_client *client)
+{
+	do {
+		struct fio_net_cmd *cmd;
+
+		cmd = fio_net_recv_cmd(client->fd, false);
+		if (!cmd)
+			break;
+
+		if (cmd->opcode == FIO_NET_CMD_TEXT) {
+			convert_text(cmd);
+			client->ops->text(client, cmd);
+		}
+
+		free(cmd);
+	} while (1);
+}
+
 static void remove_client(struct fio_client *client)
 {
 	assert(client->refs);
 
 	dprint(FD_NET, "client: removed <%s>\n", client->hostname);
 
+	fio_drain_client_text(client);
+
 	if (!flist_empty(&client->list))
 		flist_del_init(&client->list);
 
@@ -294,7 +347,7 @@
 	return NULL;
 }
 
-int fio_client_add_ini_file(void *cookie, const char *ini_file, int remote)
+int fio_client_add_ini_file(void *cookie, const char *ini_file, bool remote)
 {
 	struct fio_client *client = cookie;
 	struct client_file *cf;
@@ -504,7 +557,7 @@
 	return fio_net_send_quit(client->fd);
 }
 
-void fio_clients_terminate(void)
+static void fio_clients_terminate(void)
 {
 	struct flist_head *entry;
 	struct fio_client *client;
@@ -523,11 +576,6 @@
 	fio_clients_terminate();
 }
 
-static void sig_show_status(int sig)
-{
-	show_running_run_stats();
-}
-
 static void client_signal_handler(void)
 {
 	struct sigaction act;
@@ -741,7 +789,7 @@
 }
 
 int fio_client_send_ini(struct fio_client *client, const char *filename,
-			int remote)
+			bool remote)
 {
 	int ret;
 
@@ -768,6 +816,8 @@
 	struct flist_head *entry, *tmp;
 
 	flist_for_each_safe(entry, tmp, &client_list) {
+		bool failed = false;
+
 		client = flist_entry(entry, struct fio_client, list);
 
 		if (client->nr_files) {
@@ -779,12 +829,13 @@
 				cf = &client->files[i];
 
 				if (fio_client_send_cf(client, cf)) {
+					failed = true;
 					remove_client(client);
 					break;
 				}
 			}
 		}
-		if (client->sent_job)
+		if (client->sent_job || failed)
 			continue;
 		if (!filename || fio_client_send_ini(client, filename, 0))
 			remove_client(client);
@@ -857,10 +908,10 @@
 		dst->io_u_complete[i]	= le32_to_cpu(src->io_u_complete[i]);
 	}
 
-	for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) {
+	for (i = 0; i < FIO_IO_U_LAT_U_NR; i++)
 		dst->io_u_lat_u[i]	= le32_to_cpu(src->io_u_lat_u[i]);
+	for (i = 0; i < FIO_IO_U_LAT_M_NR; i++)
 		dst->io_u_lat_m[i]	= le32_to_cpu(src->io_u_lat_m[i]);
-	}
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++)
 		for (j = 0; j < FIO_IO_U_PLAT_NR; j++)
@@ -891,6 +942,25 @@
 	dst->latency_target	= le64_to_cpu(src->latency_target);
 	dst->latency_window	= le64_to_cpu(src->latency_window);
 	dst->latency_percentile.u.f = fio_uint64_to_double(le64_to_cpu(src->latency_percentile.u.i));
+
+	dst->nr_block_infos	= le64_to_cpu(src->nr_block_infos);
+	for (i = 0; i < dst->nr_block_infos; i++)
+		dst->block_infos[i] = le32_to_cpu(src->block_infos[i]);
+
+	dst->ss_dur		= le64_to_cpu(src->ss_dur);
+	dst->ss_state		= le32_to_cpu(src->ss_state);
+	dst->ss_head		= le32_to_cpu(src->ss_head);
+	dst->ss_limit.u.f 	= fio_uint64_to_double(le64_to_cpu(src->ss_limit.u.i));
+	dst->ss_slope.u.f 	= fio_uint64_to_double(le64_to_cpu(src->ss_slope.u.i));
+	dst->ss_deviation.u.f 	= fio_uint64_to_double(le64_to_cpu(src->ss_deviation.u.i));
+	dst->ss_criterion.u.f 	= fio_uint64_to_double(le64_to_cpu(src->ss_criterion.u.i));
+
+	if (dst->ss_state & __FIO_SS_DATA) {
+		for (i = 0; i < dst->ss_dur; i++ ) {
+			dst->ss_iops_data[i] = le64_to_cpu(src->ss_iops_data[i]);
+			dst->ss_bw_data[i] = le64_to_cpu(src->ss_bw_data[i]);
+		}
+	}
 }
 
 static void convert_gs(struct group_run_stats *dst, struct group_run_stats *src)
@@ -902,7 +972,7 @@
 		dst->min_run[i]		= le64_to_cpu(src->min_run[i]);
 		dst->max_bw[i]		= le64_to_cpu(src->max_bw[i]);
 		dst->min_bw[i]		= le64_to_cpu(src->min_bw[i]);
-		dst->io_kb[i]		= le64_to_cpu(src->io_kb[i]);
+		dst->iobytes[i]		= le64_to_cpu(src->iobytes[i]);
 		dst->agg[i]		= le64_to_cpu(src->agg[i]);
 	}
 
@@ -924,9 +994,13 @@
 static void handle_ts(struct fio_client *client, struct fio_net_cmd *cmd)
 {
 	struct cmd_ts_pdu *p = (struct cmd_ts_pdu *) cmd->payload;
+	struct flist_head *opt_list = NULL;
 	struct json_object *tsobj;
 
-	tsobj = show_thread_status(&p->ts, &p->rs);
+	if (client->opt_lists && p->ts.thread_number <= client->jobs)
+		opt_list = &client->opt_lists[p->ts.thread_number - 1];
+
+	tsobj = show_thread_status(&p->ts, &p->rs, opt_list, NULL);
 	client->did_stat = 1;
 	if (tsobj) {
 		json_object_add_client_info(tsobj, client);
@@ -936,7 +1010,7 @@
 	if (sum_stat_clients <= 1)
 		return;
 
-	sum_thread_stats(&client_ts, &p->ts, sum_stat_nr);
+	sum_thread_stats(&client_ts, &p->ts, sum_stat_nr == 1);
 	sum_group_stats(&client_gs, &p->rs);
 
 	client_ts.members++;
@@ -946,7 +1020,7 @@
 
 	if (++sum_stat_nr == sum_stat_clients) {
 		strcpy(client_ts.name, "All clients");
-		tsobj = show_thread_status(&client_ts, &client_gs);
+		tsobj = show_thread_status(&client_ts, &client_gs, NULL, NULL);
 		if (tsobj) {
 			json_object_add_client_info(tsobj, client);
 			json_array_add_value_object(clients_array, tsobj);
@@ -958,7 +1032,41 @@
 {
 	struct group_run_stats *gs = (struct group_run_stats *) cmd->payload;
 
-	show_group_stats(gs);
+	if (output_format & FIO_OUTPUT_NORMAL)
+		show_group_stats(gs, NULL);
+}
+
+static void handle_job_opt(struct fio_client *client, struct fio_net_cmd *cmd)
+{
+	struct cmd_job_option *pdu = (struct cmd_job_option *) cmd->payload;
+	struct print_option *p;
+
+	if (!job_opt_object)
+		return;
+
+	pdu->global = le16_to_cpu(pdu->global);
+	pdu->truncated = le16_to_cpu(pdu->truncated);
+	pdu->groupid = le32_to_cpu(pdu->groupid);
+
+	p = malloc(sizeof(*p));
+	p->name = strdup((char *) pdu->name);
+	if (pdu->value[0] != '\0')
+		p->value = strdup((char *) pdu->value);
+	else
+		p->value = NULL;
+
+	if (pdu->global) {
+		const char *pos = "";
+
+		if (p->value)
+			pos = p->value;
+
+		json_object_add_value_string(job_opt_object, p->name, pos);
+	} else if (client->opt_lists) {
+		struct flist_head *opt_list = &client->opt_lists[pdu->groupid];
+
+		flist_add_tail(&p->list, opt_list);
+	}
 }
 
 static void handle_text(struct fio_client *client, struct fio_net_cmd *cmd)
@@ -1019,13 +1127,16 @@
 		log_info("\nDisk stats (read/write):\n");
 	}
 
-	if (output_format == FIO_OUTPUT_JSON) {
+	if (output_format & FIO_OUTPUT_JSON) {
 		struct json_object *duobj;
 		json_array_add_disk_util(&du->dus, &du->agg, du_array);
 		duobj = json_array_last_value_object(du_array);
 		json_object_add_client_info(duobj, client);
-	} else
-		print_disk_util(&du->dus, &du->agg, output_format == FIO_OUTPUT_TERSE);
+	}
+	if (output_format & FIO_OUTPUT_TERSE)
+		print_disk_util(&du->dus, &du->agg, 1, NULL);
+	if (output_format & FIO_OUTPUT_NORMAL)
+		print_disk_util(&du->dus, &du->agg, 0, NULL);
 }
 
 static void convert_jobs_eta(struct jobs_eta *je)
@@ -1039,11 +1150,11 @@
 	je->files_open		= le32_to_cpu(je->files_open);
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-		je->m_rate[i]	= le32_to_cpu(je->m_rate[i]);
-		je->t_rate[i]	= le32_to_cpu(je->t_rate[i]);
+		je->m_rate[i]	= le64_to_cpu(je->m_rate[i]);
+		je->t_rate[i]	= le64_to_cpu(je->t_rate[i]);
 		je->m_iops[i]	= le32_to_cpu(je->m_iops[i]);
 		je->t_iops[i]	= le32_to_cpu(je->t_iops[i]);
-		je->rate[i]	= le32_to_cpu(je->rate[i]);
+		je->rate[i]	= le64_to_cpu(je->rate[i]);
 		je->iops[i]	= le32_to_cpu(je->iops[i]);
 	}
 
@@ -1087,15 +1198,7 @@
 	strcpy((char *) dst->run_str, (char *) je->run_str);
 }
 
-void fio_client_dec_jobs_eta(struct client_eta *eta, client_eta_op eta_fn)
-{
-	if (!--eta->pending) {
-		eta_fn(&eta->eta);
-		free(eta);
-	}
-}
-
-static void remove_reply_cmd(struct fio_client *client, struct fio_net_cmd *cmd)
+static bool remove_reply_cmd(struct fio_client *client, struct fio_net_cmd *cmd)
 {
 	struct fio_net_cmd_reply *reply = NULL;
 	struct flist_head *entry;
@@ -1111,12 +1214,13 @@
 
 	if (!reply) {
 		log_err("fio: client: unable to find matching tag (%llx)\n", (unsigned long long) cmd->tag);
-		return;
+		return false;
 	}
 
 	flist_del(&reply->list);
 	cmd->tag = reply->saved_tag;
 	free(reply);
+	return true;
 }
 
 int fio_client_wait_for_reply(struct fio_client *client, uint64_t tag)
@@ -1154,6 +1258,7 @@
 
 	client->eta_in_flight = NULL;
 	flist_del_init(&client->eta_list);
+	client->eta_timeouts = 0;
 
 	if (client->ops->jobs_eta)
 		client->ops->jobs_eta(client, je);
@@ -1162,6 +1267,111 @@
 	fio_client_dec_jobs_eta(eta, client->ops->eta);
 }
 
+static void client_flush_hist_samples(FILE *f, int hist_coarseness, void *samples,
+				      uint64_t sample_size)
+{
+	struct io_sample *s;
+	int log_offset;
+	uint64_t i, j, nr_samples;
+	struct io_u_plat_entry *entry;
+	unsigned int *io_u_plat;
+
+	int stride = 1 << hist_coarseness;
+
+	if (!sample_size)
+		return;
+
+	s = __get_sample(samples, 0, 0);
+	log_offset = (s->__ddir & LOG_OFFSET_SAMPLE_BIT) != 0;
+
+	nr_samples = sample_size / __log_entry_sz(log_offset);
+
+	for (i = 0; i < nr_samples; i++) {
+
+		s = (struct io_sample *)((char *)__get_sample(samples, log_offset, i) +
+			i * sizeof(struct io_u_plat_entry));
+
+		entry = s->data.plat_entry;
+		io_u_plat = entry->io_u_plat;
+
+		fprintf(f, "%lu, %u, %u, ", (unsigned long) s->time,
+						io_sample_ddir(s), s->bs);
+		for (j = 0; j < FIO_IO_U_PLAT_NR - stride; j += stride) {
+			fprintf(f, "%lu, ", hist_sum(j, stride, io_u_plat, NULL));
+		}
+		fprintf(f, "%lu\n", (unsigned long)
+			hist_sum(FIO_IO_U_PLAT_NR - stride, stride, io_u_plat, NULL));
+
+	}
+}
+
+static int fio_client_handle_iolog(struct fio_client *client,
+				   struct fio_net_cmd *cmd)
+{
+	struct cmd_iolog_pdu *pdu;
+	bool store_direct;
+	char *log_pathname;
+
+	pdu = convert_iolog(cmd, &store_direct);
+	if (!pdu) {
+		log_err("fio: failed converting IO log\n");
+		return 1;
+	}
+
+	/* allocate buffer big enough for next sprintf() call */
+	log_pathname = malloc(10 + strlen((char *)pdu->name) +
+			strlen(client->hostname));
+	if (!log_pathname) {
+		log_err("fio: memory allocation of unique pathname failed\n");
+		return -1;
+	}
+	/* generate a unique pathname for the log file using hostname */
+	sprintf(log_pathname, "%s.%s", pdu->name, client->hostname);
+
+	if (store_direct) {
+		ssize_t ret;
+		size_t sz;
+		int fd;
+
+		fd = open((const char *) log_pathname,
+				O_WRONLY | O_CREAT | O_TRUNC, 0644);
+		if (fd < 0) {
+			log_err("fio: open log %s: %s\n",
+				log_pathname, strerror(errno));
+			return 1;
+		}
+
+		sz = cmd->pdu_len - sizeof(*pdu);
+		ret = write(fd, pdu->samples, sz);
+		close(fd);
+
+		if (ret != sz) {
+			log_err("fio: short write on compressed log\n");
+			return 1;
+		}
+
+		return 0;
+	} else {
+		FILE *f;
+		f = fopen((const char *) log_pathname, "w");
+		if (!f) {
+			log_err("fio: fopen log %s : %s\n",
+				log_pathname, strerror(errno));
+			return 1;
+		}
+
+		if (pdu->log_type == IO_LOG_TYPE_HIST) {
+			client_flush_hist_samples(f, pdu->log_hist_coarseness, pdu->samples,
+					   pdu->nr_samples * sizeof(struct io_sample));
+		} else {
+			flush_samples(f, pdu->samples,
+					pdu->nr_samples * sizeof(struct io_sample));
+		}
+		fclose(f);
+		return 0;
+	}
+}
+
 static void handle_probe(struct fio_client *client, struct fio_net_cmd *cmd)
 {
 	struct cmd_probe_reply_pdu *probe = (struct cmd_probe_reply_pdu *) cmd->payload;
@@ -1195,6 +1405,17 @@
 	client->jobs = le32_to_cpu(pdu->jobs);
 	client->nr_stat = le32_to_cpu(pdu->stat_outputs);
 
+	if (client->jobs) {
+		int i;
+
+		if (client->opt_lists)
+			free(client->opt_lists);
+
+		client->opt_lists = malloc(client->jobs * sizeof(struct flist_head));
+		for (i = 0; i < client->jobs; i++)
+			INIT_FLIST_HEAD(&client->opt_lists[i]);
+	}
+
 	sum_stat_clients += client->nr_stat;
 }
 
@@ -1245,7 +1466,11 @@
 	 */
 	nr_samples = le64_to_cpu(pdu->nr_samples);
 
-	total = nr_samples * __log_entry_sz(le32_to_cpu(pdu->log_offset));
+	if (pdu->log_type == IO_LOG_TYPE_HIST)
+		total = nr_samples * (__log_entry_sz(le32_to_cpu(pdu->log_offset)) +
+					sizeof(struct io_u_plat_entry));
+	else
+		total = nr_samples * __log_entry_sz(le32_to_cpu(pdu->log_offset));
 	ret = malloc(total + sizeof(*pdu));
 	ret->nr_samples = nr_samples;
 
@@ -1291,18 +1516,23 @@
  * This has been compressed on the server side, since it can be big.
  * Uncompress here.
  */
-static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd)
+static struct cmd_iolog_pdu *convert_iolog(struct fio_net_cmd *cmd,
+					   bool *store_direct)
 {
 	struct cmd_iolog_pdu *pdu = (struct cmd_iolog_pdu *) cmd->payload;
 	struct cmd_iolog_pdu *ret;
 	uint64_t i;
+	int compressed;
 	void *samples;
 
+	*store_direct = false;
+
 	/*
 	 * Convert if compressed and we support it. If it's not
 	 * compressed, we need not do anything.
 	 */
-	if (le32_to_cpu(pdu->compressed)) {
+	compressed = le32_to_cpu(pdu->compressed);
+	if (compressed == XMIT_COMPRESSED) {
 #ifndef CONFIG_ZLIB
 		log_err("fio: server sent compressed data by mistake\n");
 		return NULL;
@@ -1312,6 +1542,9 @@
 			log_err("fio: failed decompressing log\n");
 			return NULL;
 		}
+	} else if (compressed == STORE_COMPRESSED) {
+		*store_direct = true;
+		ret = pdu;
 	} else
 		ret = pdu;
 
@@ -1320,14 +1553,21 @@
 	ret->log_type		= le32_to_cpu(ret->log_type);
 	ret->compressed		= le32_to_cpu(ret->compressed);
 	ret->log_offset		= le32_to_cpu(ret->log_offset);
+	ret->log_hist_coarseness = le32_to_cpu(ret->log_hist_coarseness);
+
+	if (*store_direct)
+		return ret;
 
 	samples = &ret->samples[0];
 	for (i = 0; i < ret->nr_samples; i++) {
 		struct io_sample *s;
 
 		s = __get_sample(samples, ret->log_offset, i);
+		if (ret->log_type == IO_LOG_TYPE_HIST)
+			s = (struct io_sample *)((void *)s + sizeof(struct io_u_plat_entry) * i);
+
 		s->time		= le64_to_cpu(s->time);
-		s->val		= le64_to_cpu(s->val);
+		s->data.val	= le64_to_cpu(s->data.val);
 		s->__ddir	= le32_to_cpu(s->__ddir);
 		s->bs		= le32_to_cpu(s->bs);
 
@@ -1336,6 +1576,12 @@
 
 			so->offset = le64_to_cpu(so->offset);
 		}
+
+		if (ret->log_type == IO_LOG_TYPE_HIST) {
+			s->data.plat_entry = (struct io_u_plat_entry *)(((void *)s) + sizeof(*s));
+			s->data.plat_entry->list.next = NULL;
+			s->data.plat_entry->list.prev = NULL;
+		}
 	}
 
 	return ret;
@@ -1348,8 +1594,8 @@
 	fio_net_send_cmd(fd, FIO_NET_CMD_SENDFILE, rep, size, &tag, NULL);
 }
 
-static int send_file(struct fio_client *client, struct cmd_sendfile *pdu,
-		     uint64_t tag)
+static int fio_send_file(struct fio_client *client, struct cmd_sendfile *pdu,
+			 uint64_t tag)
 {
 	struct cmd_sendfile_reply *rep;
 	struct stat sb;
@@ -1386,10 +1632,11 @@
 {
 	struct client_ops *ops = client->ops;
 	struct fio_net_cmd *cmd;
+	int size;
 
 	dprint(FD_NET, "client: handle %s\n", client->hostname);
 
-	cmd = fio_net_recv_cmd(client->fd);
+	cmd = fio_net_recv_cmd(client->fd, true);
 	if (!cmd)
 		return 0;
 
@@ -1418,6 +1665,15 @@
 	case FIO_NET_CMD_TS: {
 		struct cmd_ts_pdu *p = (struct cmd_ts_pdu *) cmd->payload;
 
+		dprint(FD_NET, "client: ts->ss_state = %u\n", (unsigned int) le32_to_cpu(p->ts.ss_state));
+		if (le32_to_cpu(p->ts.ss_state) & __FIO_SS_DATA) {
+			dprint(FD_NET, "client: received steadystate ring buffers\n");
+
+			size = le64_to_cpu(p->ts.ss_dur);
+			p->ts.ss_iops_data = (uint64_t *) ((struct cmd_ts_pdu *)cmd->payload + 1);
+			p->ts.ss_bw_data = p->ts.ss_iops_data + size;
+		}
+
 		convert_ts(&p->ts, &p->ts);
 		convert_gs(&p->rs, &p->rs);
 
@@ -1435,7 +1691,8 @@
 	case FIO_NET_CMD_ETA: {
 		struct jobs_eta *je = (struct jobs_eta *) cmd->payload;
 
-		remove_reply_cmd(client, cmd);
+		if (!remove_reply_cmd(client, cmd))
+			break;
 		convert_jobs_eta(je);
 		handle_eta(client, cmd);
 		break;
@@ -1477,12 +1734,7 @@
 		break;
 		}
 	case FIO_NET_CMD_IOLOG:
-		if (ops->iolog) {
-			struct cmd_iolog_pdu *pdu;
-
-			pdu = convert_iolog(cmd);
-			ops->iolog(client, pdu);
-		}
+		fio_client_handle_iolog(client, cmd);
 		break;
 	case FIO_NET_CMD_UPDATE_JOB:
 		ops->update_job(client, cmd);
@@ -1490,17 +1742,27 @@
 		break;
 	case FIO_NET_CMD_VTRIGGER: {
 		struct all_io_list *pdu = (struct all_io_list *) cmd->payload;
-		char buf[64];
+		char buf[128];
+		int off = 0;
 
-		__verify_save_state(pdu, server_name(client, buf, sizeof(buf)));
+		if (aux_path) {
+			strcpy(buf, aux_path);
+			off = strlen(buf);
+		}
+
+		__verify_save_state(pdu, server_name(client, &buf[off], sizeof(buf) - off));
 		exec_trigger(trigger_cmd);
 		break;
 		}
 	case FIO_NET_CMD_SENDFILE: {
 		struct cmd_sendfile *pdu = (struct cmd_sendfile *) cmd->payload;
-		send_file(client, pdu, cmd->tag);
+		fio_send_file(client, pdu, cmd->tag);
 		break;
 		}
+	case FIO_NET_CMD_JOB_OPT: {
+		handle_job_opt(client, cmd);
+		break;
+	}
 	default:
 		log_err("fio: unknown client op: %s\n", fio_server_op(cmd->opcode));
 		break;
@@ -1569,12 +1831,43 @@
 					(uintptr_t) eta, &client->cmd_list);
 	}
 
-	while (skipped--)
-		fio_client_dec_jobs_eta(eta, ops->eta);
+	while (skipped--) {
+		if (!fio_client_dec_jobs_eta(eta, ops->eta))
+			break;
+	}
 
 	dprint(FD_NET, "client: requested eta tag %p\n", eta);
 }
 
+/*
+ * A single SEND_ETA timeout isn't fatal. Attempt to recover.
+ */
+static int handle_cmd_timeout(struct fio_client *client,
+			      struct fio_net_cmd_reply *reply)
+{
+	uint16_t opcode = reply->opcode;	/* read before freeing reply */
+	flist_del(&reply->list);
+	free(reply);
+	if (opcode != FIO_NET_CMD_SEND_ETA)
+		return 1;
+
+	log_info("client <%s>: timeout on SEND_ETA\n", client->hostname);
+
+	flist_del_init(&client->eta_list);
+	if (client->eta_in_flight) {
+		fio_client_dec_jobs_eta(client->eta_in_flight, client->ops->eta);
+		client->eta_in_flight = NULL;
+	}
+
+	/*
+	 * If we fail 5 in a row, give up...
+	 */
+	if (client->eta_timeouts++ > 5)
+		return 1;
+
+	return 0;
+}
+
 static int client_check_cmd_timeout(struct fio_client *client,
 				    struct timeval *now)
 {
@@ -1588,10 +1881,11 @@
 		if (mtime_since(&reply->tv, now) < FIO_NET_CLIENT_TIMEOUT)
 			continue;
 
+		if (!handle_cmd_timeout(client, reply))
+			continue;
+
 		log_err("fio: client %s, timeout on cmd %s\n", client->hostname,
 						fio_server_op(reply->opcode));
-		flist_del(&reply->list);
-		free(reply);
 		ret = 1;
 	}
 
diff --git a/client.h b/client.h
index 8818de2..fc9c196 100644
--- a/client.h
+++ b/client.h
@@ -22,7 +22,7 @@
 
 struct client_file {
 	char *file;
-	int remote;
+	bool remote;
 };
 
 struct fio_client {
@@ -41,6 +41,8 @@
 
 	char *name;
 
+	struct flist_head *opt_lists;
+
 	int state;
 
 	int skip_newline;
@@ -60,6 +62,7 @@
 
 	struct flist_head eta_list;
 	struct client_eta *eta_in_flight;
+	unsigned int eta_timeouts;
 
 	struct flist_head cmd_list;
 
@@ -73,12 +76,10 @@
 	unsigned int nr_files;
 };
 
-struct cmd_iolog_pdu;
 typedef void (client_cmd_op)(struct fio_client *, struct fio_net_cmd *);
 typedef void (client_eta_op)(struct jobs_eta *je);
 typedef void (client_timed_out_op)(struct fio_client *);
 typedef void (client_jobs_eta_op)(struct fio_client *client, struct jobs_eta *je);
-typedef void (client_iolog_op)(struct fio_client *client, struct cmd_iolog_pdu *);
 
 struct client_ops {
 	client_cmd_op		*text;
@@ -95,7 +96,6 @@
 	client_cmd_op		*stop;
 	client_cmd_op		*start;
 	client_cmd_op		*job_start;
-	client_iolog_op		*iolog;
 	client_timed_out_op	*removed;
 
 	unsigned int eta_msec;
@@ -111,7 +111,6 @@
 };
 
 extern int fio_handle_client(struct fio_client *);
-extern void fio_client_dec_jobs_eta(struct client_eta *eta, client_eta_op fn);
 extern void fio_client_sum_jobs_eta(struct jobs_eta *dst, struct jobs_eta *je);
 
 enum {
@@ -125,14 +124,13 @@
 extern int fio_start_client(struct fio_client *);
 extern int fio_start_all_clients(void);
 extern int fio_clients_send_ini(const char *);
-extern int fio_client_send_ini(struct fio_client *, const char *, int);
+extern int fio_client_send_ini(struct fio_client *, const char *, bool);
 extern int fio_handle_clients(struct client_ops *);
 extern int fio_client_add(struct client_ops *, const char *, void **);
 extern struct fio_client *fio_client_add_explicit(struct client_ops *, const char *, int, int);
 extern void fio_client_add_cmd_option(void *, const char *);
-extern int fio_client_add_ini_file(void *, const char *, int);
+extern int fio_client_add_ini_file(void *, const char *, bool);
 extern int fio_client_terminate(struct fio_client *);
-extern void fio_clients_terminate(void);
 extern struct fio_client *fio_get_client(struct fio_client *);
 extern void fio_put_client(struct fio_client *);
 extern int fio_client_update_options(struct fio_client *, struct thread_options *, uint64_t *);
@@ -146,5 +144,9 @@
 	FIO_CLIENT_TYPE_GUI		= 2,
 };
 
+extern int sum_stat_clients;
+extern struct thread_stat client_ts;
+extern struct group_run_stats client_gs;
+
 #endif
 
diff --git a/compiler/compiler.h b/compiler/compiler.h
index 40e857c..20df21d 100644
--- a/compiler/compiler.h
+++ b/compiler/compiler.h
@@ -1,5 +1,6 @@
 #ifndef FIO_COMPILER_H
 #define FIO_COMPILER_H
+#include <assert.h>
 
 #if __GNUC__ >= 4
 #include "compiler-gcc4.h"
@@ -33,9 +34,16 @@
 	1; \
 })
 
+
+#if defined(CONFIG_STATIC_ASSERT)
+#define compiletime_assert(condition, msg) _Static_assert(condition, msg)
+
+#elif !defined(CONFIG_DISABLE_OPTIMIZATIONS)
+
 #ifndef __compiletime_error
 #define __compiletime_error(message)
 #endif
+
 #ifndef __compiletime_error_fallback
 #define __compiletime_error_fallback(condition)	do { } while (0)
 #endif
@@ -55,4 +63,10 @@
 #define compiletime_assert(condition, msg) \
 	_compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
 
+#else
+
+#define compiletime_assert(condition, msg)	do { } while (0)
+
+#endif
+
 #endif
diff --git a/configure b/configure
index 892335b..21bcaf4 100755
--- a/configure
+++ b/configure
@@ -135,11 +135,18 @@
 exit_val=0
 gfio_check="no"
 libhdfs="no"
+pmemblk="no"
+devdax="no"
+disable_lex=""
+disable_pmem="no"
+prefix=/usr/local
 
 # parse options
 for opt do
   optarg=`expr "x$opt" : 'x[^=]*=\(.*\)'`
   case "$opt" in
+  --prefix=*) prefix="$optarg"
+  ;;
   --cpu=*) cpu="$optarg"
   ;;
   #  esx is cross compiled and cannot be detect through simple uname calls
@@ -154,18 +161,31 @@
   ;;
   --build-static) build_static="yes"
   ;;
-  --enable-gfio)
-  gfio_check="yes"
+  --enable-gfio) gfio_check="yes"
   ;;
   --disable-numa) disable_numa="yes"
   ;;
+  --disable-rdma) disable_rdma="yes"
+  ;;
   --disable-rbd) disable_rbd="yes"
   ;;
+  --disable-rbd-blkin) disable_rbd_blkin="yes"
+  ;;
   --disable-gfapi) disable_gfapi="yes"
   ;;
   --enable-libhdfs) libhdfs="yes"
   ;;
-  --disable-shm) output_sym "CONFIG_NO_SHM"
+  --disable-lex) disable_lex="yes"
+  ;;
+  --enable-lex) disable_lex="no"
+  ;;
+  --disable-shm) no_shm="yes"
+  ;;
+  --disable-optimizations) disable_opt="yes"
+  ;;
+  --disable-pmem) disable_pmem="yes"
+  ;;
+  --enable-cuda) enable_cuda="yes"
   ;;
   --help)
     show_help="yes"
@@ -178,15 +198,24 @@
 done
 
 if test "$show_help" = "yes" ; then
-  echo "--cpu=                 Specify target CPU if auto-detect fails"
-  echo "--cc=                  Specify compiler to use"
-  echo "--extra-cflags=        Specify extra CFLAGS to pass to compiler"
-  echo "--build-32bit-win      Enable 32-bit build on Windows"
-  echo "--build-static         Build a static fio"
-  echo "--esx                  Configure build options for esx"
-  echo "--enable-gfio          Enable building of gtk gfio"
-  echo "--disable-numa         Disable libnuma even if found"
-  echo "--enable-libhdfs       Enable hdfs support"
+  echo "--prefix=               Use this directory as installation prefix"
+  echo "--cpu=                  Specify target CPU if auto-detect fails"
+  echo "--cc=                   Specify compiler to use"
+  echo "--extra-cflags=         Specify extra CFLAGS to pass to compiler"
+  echo "--build-32bit-win       Enable 32-bit build on Windows"
+  echo "--build-static          Build a static fio"
+  echo "--esx                   Configure build options for esx"
+  echo "--enable-gfio           Enable building of gtk gfio"
+  echo "--disable-numa          Disable libnuma even if found"
+  echo "--disable-rdma          Disable RDMA support even if found"
+  echo "--disable-gfapi         Disable gfapi"
+  echo "--enable-libhdfs        Enable hdfs support"
+  echo "--disable-lex           Disable use of lex/yacc for math"
+  echo "--disable-pmem          Disable pmem based engines even if found"
+  echo "--enable-lex            Enable use of lex/yacc for math"
+  echo "--disable-shm           Disable SHM support"
+  echo "--disable-optimizations Don't enable compiler optimizations"
+  echo "--enable-cuda           Enable GPUDirect RDMA support"
   exit $exit_val
 fi
 
@@ -214,11 +243,28 @@
 echo >> $config_host_mak
 echo "CONFIG_TARGET_OS=$targetos" >> $config_host_mak
 
+if test "$no_shm" = "yes" ; then
+  output_sym "CONFIG_NO_SHM"
+fi
+
+if test "$disable_opt" = "yes" ; then
+  output_sym "CONFIG_FIO_NO_OPT"
+fi
+
 # Some host OSes need non-standard checks for which CPU to use.
 # Note that these checks are broken for cross-compilation: if you're
 # cross-compiling to one of these OSes then you'll need to specify
 # the correct CPU with the --cpu option.
 case $targetos in
+AIX|OpenBSD)
+  # Unless explicitly enabled, turn off lex.
+  # OpenBSD will hit syntax error when enabled.
+  if test -z "$disable_lex" ; then
+    disable_lex="yes"
+  else
+    force_no_lex_o="yes"
+  fi
+  ;;
 Darwin)
   # on Leopard most of the system is 32-bit, so we have to ask the kernel if
   # we can run 64-bit userspace code.
@@ -228,6 +274,17 @@
   if test -z "$cpu" && test "$(sysctl -n hw.optional.x86_64)" = "1"; then
     cpu="x86_64"
   fi
+  # Error at compile time linking of weak/partial symbols if possible...
+cat > $TMPC <<EOF
+int main(void)
+{
+  return 0;
+}
+EOF
+  if compile_prog "" "-Wl,-no_weak_imports" "disable weak symbols"; then
+    echo "Disabling weak symbols"
+    LDFLAGS="$LDFLAGS -Wl,-no_weak_imports"
+  fi
   ;;
 SunOS)
   # `uname -m` returns i86pc even on an x86_64 box, so default based on isainfo
@@ -237,37 +294,49 @@
   LIBS="-lnsl -lsocket"
   ;;
 CYGWIN*)
-  echo "Forcing known good options on Windows"
+  # We still force some options, so keep this message here.
+  echo "Forcing some known good options on Windows"
   if test -z "$CC" ; then
     if test ! -z "$build_32bit_win" && test "$build_32bit_win" = "yes"; then
       CC="i686-w64-mingw32-gcc"
+      if test -e "../zlib/contrib/vstudio/vc14/x86/ZlibStatReleaseWithoutAsm/zlibstat.lib"; then
+        echo "Building with zlib support"
+        output_sym "CONFIG_ZLIB"
+        echo "LIBS=../zlib/contrib/vstudio/vc14/x86/ZlibStatReleaseWithoutAsm/zlibstat.lib" >> $config_host_mak
+      fi
     else
       CC="x86_64-w64-mingw32-gcc"
+      if test -e "../zlib/contrib/vstudio/vc14/x64/ZlibStatReleaseWithoutAsm/zlibstat.lib"; then
+        echo "Building with zlib support"
+        output_sym "CONFIG_ZLIB"
+        echo "LIBS=../zlib/contrib/vstudio/vc14/x64/ZlibStatReleaseWithoutAsm/zlibstat.lib" >> $config_host_mak
+      fi
     fi
   fi
-  output_sym "CONFIG_LITTLE_ENDIAN"
   if test ! -z "$build_32bit_win" && test "$build_32bit_win" = "yes"; then
     output_sym "CONFIG_32BIT"
   else
     output_sym "CONFIG_64BIT_LLP64"
   fi
-  output_sym "CONFIG_FADVISE"
-  output_sym "CONFIG_SOCKLEN_T"
-  output_sym "CONFIG_FADVISE"
-  output_sym "CONFIG_SFAA"
-  output_sym "CONFIG_RUSAGE_THREAD"
+  # We need this to be output_sym'd here because this is Windows specific.
+  # The regular configure path never sets this config.
   output_sym "CONFIG_WINDOWSAIO"
-  output_sym "CONFIG_FDATASYNC"
-  output_sym "CONFIG_CLOCK_MONOTONIC"
-  output_sym "CONFIG_GETTIMEOFDAY"
-  output_sym "CONFIG_CLOCK_GETTIME"
-  output_sym "CONFIG_SCHED_IDLE"
-  output_sym "CONFIG_TCP_NODELAY"
-  output_sym "CONFIG_TLS_THREAD"
-  output_sym "CONFIG_IPV6"
+  # We now take the regular configuration path without having exit 0 here.
+  # Flags below are still necessary mostly for MinGW.
+  socklen_t="yes"
+  sfaa="yes"
+  rusage_thread="yes"
+  fdatasync="yes"
+  clock_gettime="yes" # clock_monotonic probe has dependency on this
+  clock_monotonic="yes"
+  gettimeofday="yes"
+  sched_idle="yes"
+  tcp_nodelay="yes"
+  tls_thread="yes"
+  static_assert="yes"
+  ipv6="yes"
   echo "CC=$CC" >> $config_host_mak
-  echo "BUILD_CFLAGS=$CFLAGS -include config-host.h -D_GNU_SOURCE" >> $config_host_mak
-  exit 0
+  echo "BUILD_CFLAGS=$CFLAGS -I../zlib -include config-host.h -D_GNU_SOURCE" >> $config_host_mak
   ;;
 esac
 
@@ -302,6 +371,8 @@
   fi
 elif check_define __arm__ ; then
   cpu="arm"
+elif check_define __aarch64__ ; then
+  cpu="aarch64"
 elif check_define __hppa__ ; then
   cpu="hppa"
 else
@@ -314,7 +385,7 @@
     cpu="$cpu"
   ;;
   i386|i486|i586|i686|i86pc|BePC)
-    cpu="i386"
+    cpu="x86"
   ;;
   x86_64|amd64)
     cpu="x86_64"
@@ -322,6 +393,9 @@
   armv*b|armv*l|arm)
     cpu="arm"
   ;;
+  aarch64)
+    cpu="arm64"
+  ;;
   hppa|parisc|parisc64)
     cpu="hppa"
   ;;
@@ -351,7 +425,9 @@
 ##########################################
 # check cross compile
 
-cross_compile="no"
+if test "$cross_compile" != "yes" ; then
+  cross_compile="no"
+fi
 cat > $TMPC <<EOF
 int main(void)
 {
@@ -366,7 +442,9 @@
 
 ##########################################
 # check endianness
-bigendian="no"
+if test "$bigendian" != "yes" ; then
+  bigendian="no"
+fi
 if test "$cross_compile" = "no" ; then
   cat > $TMPC <<EOF
 #include <inttypes.h>
@@ -437,7 +515,9 @@
 
 ##########################################
 # zlib probe
-zlib="no"
+if test "$zlib" != "yes" ; then
+  zlib="no"
+fi
 cat > $TMPC <<EOF
 #include <zlib.h>
 int main(void)
@@ -456,8 +536,11 @@
 
 ##########################################
 # linux-aio probe
-libaio="no"
-cat > $TMPC <<EOF
+if test "$libaio" != "yes" ; then
+  libaio="no"
+fi
+if test "$esx" != "yes" ; then
+  cat > $TMPC <<EOF
 #include <libaio.h>
 #include <stddef.h>
 int main(void)
@@ -466,21 +549,26 @@
   return 0;
 }
 EOF
-if compile_prog "" "-laio" "libaio" ; then
-  libaio=yes
-  LIBS="-laio $LIBS"
-else
-  if test "$libaio" = "yes" ; then
-    feature_not_found "linux AIO" "libaio-dev or libaio-devel"
+  if compile_prog "" "-laio" "libaio" ; then
+    libaio=yes
+    LIBS="-laio $LIBS"
+  else
+    if test "$libaio" = "yes" ; then
+      feature_not_found "linux AIO" "libaio-dev or libaio-devel"
+    fi
+    libaio=no
   fi
-  libaio=no
 fi
 echo "Linux AIO support             $libaio"
 
 ##########################################
 # posix aio probe
-posix_aio="no"
-posix_aio_lrt="no"
+if test "$posix_aio" != "yes" ; then
+  posix_aio="no"
+fi
+if test "$posix_aio_lrt" != "yes" ; then
+  posix_aio_lrt="no"
+fi
 cat > $TMPC <<EOF
 #include <aio.h>
 int main(void)
@@ -502,7 +590,9 @@
 
 ##########################################
 # posix aio fsync probe
-posix_aio_fsync="no"
+if test "$posix_aio_fsync" != "yes" ; then
+  posix_aio_fsync="no"
+fi
 if test "$posix_aio" = "yes" ; then
   cat > $TMPC <<EOF
 #include <fcntl.h>
@@ -521,8 +611,40 @@
 echo "POSIX AIO fsync               $posix_aio_fsync"
 
 ##########################################
+# POSIX pshared attribute probe
+if test "$posix_pshared" != "yes" ; then
+  posix_pshared="no"
+fi
+cat > $TMPC <<EOF
+#include <unistd.h>
+int main(void)
+{
+#if defined(_POSIX_THREAD_PROCESS_SHARED) && ((_POSIX_THREAD_PROCESS_SHARED + 0) > 0)
+# if defined(__CYGWIN__)
+#  error "_POSIX_THREAD_PROCESS_SHARED is buggy on Cygwin"
+# elif defined(__APPLE__)
+#  include <AvailabilityMacros.h>
+#  include <TargetConditionals.h>
+#  if TARGET_OS_MAC && MAC_OS_X_VERSION_MIN_REQUIRED < 1070
+#   error "_POSIX_THREAD_PROCESS_SHARED is buggy/unsupported prior to OSX 10.7"
+#  endif
+# endif
+#else
+# error "_POSIX_THREAD_PROCESS_SHARED is unsupported"
+#endif
+  return 0;
+}
+EOF
+if compile_prog "" "$LIBS" "posix_pshared" ; then
+  posix_pshared=yes
+fi
+echo "POSIX pshared support         $posix_pshared"
+
+##########################################
 # solaris aio probe
-solaris_aio="no"
+if test "$solaris_aio" != "yes" ; then
+  solaris_aio="no"
+fi
 cat > $TMPC <<EOF
 #include <sys/types.h>
 #include <sys/asynch.h>
@@ -541,17 +663,20 @@
 echo "Solaris AIO support           $solaris_aio"
 
 ##########################################
-# __sync_fetch_and_and test
-sfaa="no"
+# __sync_fetch_and_add test
+if test "$sfaa" != "yes" ; then
+  sfaa="no"
+fi
 cat > $TMPC << EOF
-static int sfaa(int *ptr)
+#include <inttypes.h>
+static int sfaa(uint64_t *ptr)
 {
   return __sync_fetch_and_add(ptr, 0);
 }
 
 int main(int argc, char **argv)
 {
-  int val = 42;
+  uint64_t val = 42;
   sfaa(&val);
   return val;
 }
@@ -563,7 +688,9 @@
 
 ##########################################
 # libverbs probe
-libverbs="no"
+if test "$libverbs" != "yes" ; then
+  libverbs="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <infiniband/arch.h>
@@ -573,7 +700,7 @@
   return 0;
 }
 EOF
-if compile_prog "" "-libverbs" "libverbs" ; then
+if test "$disable_rdma" != "yes" && compile_prog "" "-libverbs" "libverbs" ; then
     libverbs="yes"
     LIBS="-libverbs $LIBS"
 fi
@@ -581,7 +708,9 @@
 
 ##########################################
 # rdmacm probe
-rdmacm="no"
+if test "$rdmacm" != "yes" ; then
+  rdmacm="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <rdma/rdma_cma.h>
@@ -591,7 +720,7 @@
   return 0;
 }
 EOF
-if compile_prog "" "-lrdmacm" "rdma"; then
+if test "$disable_rdma" != "yes" && compile_prog "" "-lrdmacm" "rdma"; then
     rdmacm="yes"
     LIBS="-lrdmacm $LIBS"
 fi
@@ -599,7 +728,9 @@
 
 ##########################################
 # Linux fallocate probe
-linux_fallocate="no"
+if test "$linux_fallocate" != "yes" ; then
+  linux_fallocate="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <fcntl.h>
@@ -617,7 +748,9 @@
 
 ##########################################
 # POSIX fadvise probe
-posix_fadvise="no"
+if test "$posix_fadvise" != "yes" ; then
+  posix_fadvise="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <fcntl.h>
@@ -634,7 +767,9 @@
 
 ##########################################
 # POSIX fallocate probe
-posix_fallocate="no"
+if test "$posix_fallocate" != "yes" ; then
+  posix_fallocate="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <fcntl.h>
@@ -651,8 +786,12 @@
 
 ##########################################
 # sched_set/getaffinity 2 or 3 argument test
-linux_2arg_affinity="no"
-linux_3arg_affinity="no"
+if test "$linux_2arg_affinity" != "yes" ; then
+  linux_2arg_affinity="no"
+fi
+if test "$linux_3arg_affinity" != "yes" ; then
+  linux_3arg_affinity="no"
+fi
 cat > $TMPC << EOF
 #include <sched.h>
 int main(int argc, char **argv)
@@ -681,7 +820,9 @@
 
 ##########################################
 # clock_gettime probe
-clock_gettime="no"
+if test "$clock_gettime" != "yes" ; then
+  clock_gettime="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <time.h>
@@ -700,7 +841,9 @@
 
 ##########################################
 # CLOCK_MONOTONIC probe
-clock_monotonic="no"
+if test "$clock_monotonic" != "yes" ; then
+  clock_monotonic="no"
+fi
 if test "$clock_gettime" = "yes" ; then
   cat > $TMPC << EOF
 #include <stdio.h>
@@ -717,8 +860,30 @@
 echo "CLOCK_MONOTONIC               $clock_monotonic"
 
 ##########################################
+# CLOCK_MONOTONIC_RAW probe
+if test "$clock_monotonic_raw" != "yes" ; then
+  clock_monotonic_raw="no"
+fi
+if test "$clock_gettime" = "yes" ; then
+  cat > $TMPC << EOF
+#include <stdio.h>
+#include <time.h>
+int main(int argc, char **argv)
+{
+  return clock_gettime(CLOCK_MONOTONIC_RAW, NULL);
+}
+EOF
+  if compile_prog "" "$LIBS" "clock monotonic"; then
+      clock_monotonic_raw="yes"
+  fi
+fi
+echo "CLOCK_MONOTONIC_RAW           $clock_monotonic_raw"
+
+##########################################
 # CLOCK_MONOTONIC_PRECISE probe
-clock_monotonic_precise="no"
+if test "$clock_monotonic_precise" != "yes" ; then
+  clock_monotonic_precise="no"
+fi
 if test "$clock_gettime" = "yes" ; then
   cat > $TMPC << EOF
 #include <stdio.h>
@@ -735,8 +900,30 @@
 echo "CLOCK_MONOTONIC_PRECISE       $clock_monotonic_precise"
 
 ##########################################
+# clockid_t probe
+if test "$clockid_t" != "yes" ; then
+  clockid_t="no"
+fi
+cat > $TMPC << EOF
+#include <time.h>
+#include <string.h>
+int main(int argc, char **argv)
+{
+  volatile clockid_t cid;
+  memset((void*)&cid, 0, sizeof(cid));
+  return 0;
+}
+EOF
+if compile_prog "" "$LIBS" "clockid_t"; then
+  clockid_t="yes"
+fi
+echo "clockid_t                     $clockid_t"
+
+##########################################
 # gettimeofday() probe
-gettimeofday="no"
+if test "$gettimeofday" != "yes" ; then
+  gettimeofday="no"
+fi
 cat > $TMPC << EOF
 #include <sys/time.h>
 #include <stdio.h>
@@ -753,7 +940,9 @@
 
 ##########################################
 # fdatasync() probe
-fdatasync="no"
+if test "$fdatasync" != "yes" ; then
+  fdatasync="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <unistd.h>
@@ -769,7 +958,9 @@
 
 ##########################################
 # sync_file_range() probe
-sync_file_range="no"
+if test "$sync_file_range" != "yes" ; then
+  sync_file_range="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <unistd.h>
@@ -789,7 +980,9 @@
 
 ##########################################
 # ext4 move extent probe
-ext4_me="no"
+if test "$ext4_me" != "yes" ; then
+  ext4_me="no"
+fi
 cat > $TMPC << EOF
 #include <fcntl.h>
 #include <sys/ioctl.h>
@@ -811,7 +1004,9 @@
 
 ##########################################
 # splice probe
-linux_splice="no"
+if test "$linux_splice" != "yes" ; then
+  linux_splice="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <fcntl.h>
@@ -827,7 +1022,9 @@
 
 ##########################################
 # GUASI probe
-guasi="no"
+if test "$guasi" != "yes" ; then
+  guasi="no"
+fi
 cat > $TMPC << EOF
 #include <guasi.h>
 #include <guasi_syscalls.h>
@@ -844,7 +1041,9 @@
 
 ##########################################
 # fusion-aw probe
-fusion_aw="no"
+if test "$fusion_aw" != "yes" ; then
+  fusion_aw="no"
+fi
 cat > $TMPC << EOF
 #include <nvm/nvm_primitives.h>
 int main(int argc, char **argv)
@@ -856,15 +1055,17 @@
   return nvm_atomic_write(handle, 0, 0, 0);
 }
 EOF
-if compile_prog "" "-L/usr/lib/fio -L/usr/lib/nvm -lnvm-primitives -lvsl -ldl" "fusion-aw"; then
-  LIBS="-L/usr/lib/fio -L/usr/lib/nvm -lnvm-primitives -lvsl -ldl $LIBS"
+if compile_prog "" "-L/usr/lib/fio -L/usr/lib/nvm -lnvm-primitives -ldl -lpthread" "fusion-aw"; then
+  LIBS="-L/usr/lib/fio -L/usr/lib/nvm -lnvm-primitives -ldl -lpthread $LIBS"
   fusion_aw="yes"
 fi
 echo "Fusion-io atomic engine       $fusion_aw"
 
 ##########################################
 # libnuma probe
-libnuma="no"
+if test "$libnuma" != "yes" ; then
+  libnuma="no"
+fi
 cat > $TMPC << EOF
 #include <numa.h>
 int main(int argc, char **argv)
@@ -879,7 +1080,7 @@
 echo "libnuma                       $libnuma"
 
 ##########################################
-# libnuma 2.x version API
+# libnuma 2.x version API, initialize with "no" only if $libnuma is set to "yes"
 if test "$libnuma" = "yes" ; then
 libnuma_v2="no"
 cat > $TMPC << EOF
@@ -898,12 +1099,15 @@
 
 ##########################################
 # strsep() probe
-strsep="no"
+if test "$strsep" != "yes" ; then
+  strsep="no"
+fi
 cat > $TMPC << EOF
 #include <string.h>
 int main(int argc, char **argv)
 {
-  strsep(NULL, NULL);
+  static char *string = "This is a string";
+  strsep(&string, "needle");
   return 0;
 }
 EOF
@@ -914,7 +1118,9 @@
 
 ##########################################
 # strcasestr() probe
-strcasestr="no"
+if test "$strcasestr" != "yes" ; then
+  strcasestr="no"
+fi
 cat > $TMPC << EOF
 #include <string.h>
 int main(int argc, char **argv)
@@ -928,8 +1134,31 @@
 echo "strcasestr                    $strcasestr"
 
 ##########################################
+# strlcat() probe
+if test "$strlcat" != "yes" ; then
+  strlcat="no"
+fi
+cat > $TMPC << EOF
+#include <string.h>
+int main(int argc, char **argv)
+{
+  static char dst[64];
+  static char *string = "This is a string";
+  memset(dst, 0, sizeof(dst));
+  strlcat(dst, string, sizeof(dst));
+  return 0;
+}
+EOF
+if compile_prog "" "" "strlcat"; then
+  strlcat="yes"
+fi
+echo "strlcat                       $strlcat"
+
+##########################################
 # getopt_long_only() probe
-getopt_long_only="no"
+if test "$getopt_long_only" != "yes" ; then
+  getopt_long_only="no"
+fi
 cat > $TMPC << EOF
 #include <unistd.h>
 #include <stdio.h>
@@ -947,7 +1176,9 @@
 
 ##########################################
 # inet_aton() probe
-inet_aton="no"
+if test "$inet_aton" != "yes" ; then
+  inet_aton="no"
+fi
 cat > $TMPC << EOF
 #include <sys/socket.h>
 #include <arpa/inet.h>
@@ -965,7 +1196,9 @@
 
 ##########################################
 # socklen_t probe
-socklen_t="no"
+if test "$socklen_t" != "yes" ; then
+  socklen_t="no"
+fi
 cat > $TMPC << EOF
 #include <sys/socket.h>
 int main(int argc, char **argv)
@@ -981,7 +1214,9 @@
 
 ##########################################
 # Whether or not __thread is supported for TLS
-tls_thread="no"
+if test "$tls_thread" != "yes" ; then
+  tls_thread="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 static __thread int ret;
@@ -997,7 +1232,9 @@
 
 ##########################################
 # Check if we have required gtk/glib support for gfio
-gfio="no"
+if test "$gfio" != "yes" ; then
+  gfio="no"
+fi
 if test "$gfio_check" = "yes" ; then
   cat > $TMPC << EOF
 #include <glib.h>
@@ -1008,7 +1245,7 @@
   gdk_threads_enter();
   gdk_threads_leave();
 
-  printf("%d", GTK_CHECK_VERSION(2, 18, 0));
+  return GTK_CHECK_VERSION(2, 18, 0) ? 0 : 1; /* 0 on success */
 }
 EOF
 GTK_CFLAGS=$(pkg-config --cflags gtk+-2.0 gthread-2.0)
@@ -1024,8 +1261,8 @@
   exit 1
 fi
 if compile_prog "$GTK_CFLAGS" "$GTK_LIBS" "gfio" ; then
-  r=$($TMPE)
-  if test "$r" != "0" ; then
+  $TMPE
+  if test "$?" = "0" ; then
     gfio="yes"
     GFIO_LIBS="$LIBS $GTK_LIBS"
     CFLAGS="$CFLAGS $GTK_CFLAGS"
@@ -1044,8 +1281,11 @@
   echo "gtk 2.18 or higher            $gfio"
 fi
 
+##########################################
 # Check whether we have getrusage(RUSAGE_THREAD)
-rusage_thread="no"
+if test "$rusage_thread" != "yes" ; then
+  rusage_thread="no"
+fi
 cat > $TMPC << EOF
 #include <sys/time.h>
 #include <sys/resource.h>
@@ -1063,7 +1303,9 @@
 
 ##########################################
 # Check whether we have SCHED_IDLE
-sched_idle="no"
+if test "$sched_idle" != "yes" ; then
+  sched_idle="no"
+fi
 cat > $TMPC << EOF
 #include <sched.h>
 int main(int argc, char **argv)
@@ -1079,7 +1321,9 @@
 
 ##########################################
 # Check whether we have TCP_NODELAY
-tcp_nodelay="no"
+if test "$tcp_nodelay" != "yes" ; then
+  tcp_nodelay="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <sys/types.h>
@@ -1097,7 +1341,9 @@
 
 ##########################################
 # Check whether we have SO_SNDBUF
-window_size="no"
+if test "$window_size" != "yes" ; then
+  window_size="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <sys/types.h>
@@ -1116,7 +1362,9 @@
 
 ##########################################
 # Check whether we have TCP_MAXSEG
-mss="no"
+if test "$mss" != "yes" ; then
+  mss="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <sys/types.h>
@@ -1136,7 +1384,9 @@
 
 ##########################################
 # Check whether we have RLIMIT_MEMLOCK
-rlimit_memlock="no"
+if test "$rlimit_memlock" != "yes" ; then
+  rlimit_memlock="no"
+fi
 cat > $TMPC << EOF
 #include <sys/time.h>
 #include <sys/resource.h>
@@ -1153,7 +1403,9 @@
 
 ##########################################
 # Check whether we have pwritev/preadv
-pwritev="no"
+if test "$pwritev" != "yes" ; then
+  pwritev="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 #include <sys/uio.h>
@@ -1168,8 +1420,28 @@
 echo "pwritev/preadv                $pwritev"
 
 ##########################################
+# Check whether we have pwritev2/preadv2
+if test "$pwritev2" != "yes" ; then
+  pwritev2="no"
+fi
+cat > $TMPC << EOF
+#include <stdio.h>
+#include <sys/uio.h>
+int main(int argc, char **argv)
+{
+  return pwritev2(0, NULL, 1, 0, 0) + preadv2(0, NULL, 1, 0, 0);
+}
+EOF
+if compile_prog "" "" "pwritev2"; then
+  pwritev2="yes"
+fi
+echo "pwritev2/preadv2              $pwritev2"
+
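Each of these probes follows the same pattern: write a minimal C program to $TMPC, try to build it with compile_prog, and record the result so that output_sym can later emit the matching CONFIG_* symbol. As a rough, hypothetical illustration of how a probed symbol such as CONFIG_PWRITEV2 is then consumed on the C side (the wrapper name and the flag-dropping fallback below are assumptions for illustration, not code from this change):

/*
 * Hypothetical consumer of the CONFIG_PWRITEV2 symbol emitted by this
 * configure script.  The helper and its fallback are illustrative only.
 */
#include <sys/uio.h>

static ssize_t vectored_pwrite(int fd, const struct iovec *iov, int cnt,
			       off_t offset, int flags)
{
#ifdef CONFIG_PWRITEV2
	/* pwritev2() was found by the probe above. */
	return pwritev2(fd, iov, cnt, offset, flags);
#else
	/* Older libc: fall back to pwritev() and ignore the flags. */
	return pwritev(fd, iov, cnt, offset);
#endif
}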
+##########################################
 # Check whether we have the required functions for ipv6
-ipv6="no"
+if test "$ipv6" != "yes" ; then
+  ipv6="no"
+fi
 cat > $TMPC << EOF
 #include <sys/types.h>
 #include <sys/socket.h>
@@ -1196,13 +1468,14 @@
 
 ##########################################
 # check for rbd
-rbd="no"
+if test "$rbd" != "yes" ; then
+  rbd="no"
+fi
 cat > $TMPC << EOF
 #include <rbd/librbd.h>
 
 int main(int argc, char **argv)
 {
-
   rados_t cluster;
   rados_ioctx_t io_ctx;
   const char pool[] = "rbd";
@@ -1221,8 +1494,38 @@
 echo "Rados Block Device engine     $rbd"
 
 ##########################################
+# check for rbd_poll
+if test "$rbd_poll" != "yes" ; then
+  rbd_poll="no"
+fi
+if test "$rbd" = "yes"; then
+cat > $TMPC << EOF
+#include <rbd/librbd.h>
+#include <sys/eventfd.h>
+
+int main(int argc, char **argv)
+{
+  rbd_image_t image;
+  rbd_completion_t comp;
+
+  int fd = eventfd(0, EFD_NONBLOCK);
+  rbd_set_image_notification(image, fd, EVENT_TYPE_EVENTFD);
+  rbd_poll_io_events(image, comp, 1);
+
+  return 0;
+}
+EOF
+if compile_prog "" "-lrbd -lrados" "rbd"; then
+  rbd_poll="yes"
+fi
+echo "rbd_poll                      $rbd_poll"
+fi
+
+##########################################
 # check for rbd_invalidate_cache()
-rbd_inval="no"
+if test "$rbd_inval" != "yes" ; then
+  rbd_inval="no"
+fi
 if test "$rbd" = "yes"; then
 cat > $TMPC << EOF
 #include <rbd/librbd.h>
@@ -1241,8 +1544,40 @@
 fi
 
 ##########################################
+# check for blkin
+if test "$rbd_blkin" != "yes" ; then
+  rbd_blkin="no"
+fi
+cat > $TMPC << EOF
+#include <rbd/librbd.h>
+#include <zipkin_c.h>
+
+int main(int argc, char **argv)
+{
+  int r;
+  struct blkin_trace_info t_info;
+  blkin_init_trace_info(&t_info);
+  rbd_completion_t completion;
+  rbd_image_t image;
+  uint64_t off;
+  size_t len;
+  const char *buf;
+  r = rbd_aio_write_traced(image, off, len, buf, completion, &t_info);
+  return 0;
+}
+EOF
+if test "$disable_rbd" != "yes" && test "$disable_rbd_blkin" != "yes" \
+ && compile_prog "" "-lrbd -lrados -lblkin" "rbd_blkin"; then
+  LIBS="-lblkin $LIBS"
+  rbd_blkin="yes"
+fi
+echo "rbd blkin tracing             $rbd_blkin"
+
+##########################################
 # Check whether we have setvbuf
-setvbuf="no"
+if test "$setvbuf" != "yes" ; then
+  setvbuf="no"
+fi
 cat > $TMPC << EOF
 #include <stdio.h>
 int main(int argc, char **argv)
@@ -1258,14 +1593,16 @@
 fi
 echo "setvbuf                       $setvbuf"
 
+##########################################
 # check for gfapi
-gfapi="no"
+if test "$gfapi" != "yes" ; then
+  gfapi="no"
+fi
 cat > $TMPC << EOF
 #include <glusterfs/api/glfs.h>
 
 int main(int argc, char **argv)
 {
-
   glfs_t *g = glfs_new("foo");
 
   return 0;
@@ -1278,7 +1615,7 @@
  echo "Gluster API engine            $gfapi"
 
 ##########################################
-# check for gfapi fadvise support
+# check for gfapi fadvise support; initialize gf_fadvise to "no" only when $gfapi is "yes"
 if test "$gfapi" = "yes" ; then
 gf_fadvise="no"
 cat > $TMPC << EOF
@@ -1300,7 +1637,9 @@
 
 ##########################################
 # check for gfapi trim support
-gf_trim="no"
+if test "$gf_trim" != "yes" ; then
+  gf_trim="no"
+fi
 if test "$gfapi" = "yes" ; then
 cat > $TMPC << EOF
 #include <glusterfs/api/glfs.h>
@@ -1318,7 +1657,9 @@
 
 ##########################################
 # Check if we support stckf on s390
-s390_z196_facilities="no"
+if test "$s390_z196_facilities" != "yes" ; then
+  s390_z196_facilities="no"
+fi
 cat > $TMPC << EOF
 #define STFLE_BITS_Z196 45 /* various z196 facilities ... */
 int main(int argc, char **argv)
@@ -1366,14 +1707,102 @@
   if test "$hdfs_conf_error" = "1" ; then
     exit 1
   fi
+  FIO_HDFS_CPU=$cpu
+  if test "$FIO_HDFS_CPU" = "x86_64" ; then
+    FIO_HDFS_CPU="amd64"
+  fi
 fi
 echo "HDFS engine                   $libhdfs"
 
+##########################################
+# Check whether we have MTD
+if test "$mtd" != "yes" ; then
+  mtd="no"
+fi
+cat > $TMPC << EOF
+#include <string.h>
+#include <mtd/mtd-user.h>
+#include <sys/ioctl.h>
+int main(int argc, char **argv)
+{
+  struct mtd_write_req ops;
+  struct mtd_info_user info;
+  memset(&ops, 0, sizeof(ops));
+  info.type = MTD_MLCNANDFLASH;
+  return ioctl(0, MEMGETINFO, &info);
+}
+EOF
+if compile_prog "" "" "mtd"; then
+  mtd="yes"
+fi
+echo "MTD                           $mtd"
+
+##########################################
+# Check whether we have libpmem
+if test "$libpmem" != "yes" ; then
+  libpmem="no"
+fi
+cat > $TMPC << EOF
+#include <libpmem.h>
+int main(int argc, char **argv)
+{
+  int rc;
+  rc = pmem_is_pmem(0, 0);
+  return 0;
+}
+EOF
+if compile_prog "" "-lpmem" "libpmem"; then
+  libpmem="yes"
+  LIBS="-lpmem $LIBS"
+fi
+echo "libpmem                       $libpmem"
+
+##########################################
+# Check whether we have libpmemblk
+# libpmem is a prerequisite
+if test "$libpmemblk" != "yes" ; then
+  libpmemblk="no"
+fi
+if test "$libpmem" = "yes"; then
+  cat > $TMPC << EOF
+#include <libpmemblk.h>
+int main(int argc, char **argv)
+{
+  PMEMblkpool *pbp;
+  pbp = pmemblk_open("", 0);
+  return 0;
+}
+EOF
+  if compile_prog "" "-lpmemblk" "libpmemblk"; then
+    libpmemblk="yes"
+    LIBS="-lpmemblk $LIBS"
+  fi
+fi
+echo "libpmemblk                    $libpmemblk"
+
+# Choose the ioengines
+if test "$libpmem" = "yes" && test "$disable_pmem" = "no"; then
+  devdax="yes"
+  if test "$libpmemblk" = "yes"; then
+    pmemblk="yes"
+  fi
+fi
+
+##########################################
+# Report whether pmemblk engine is enabled
+echo "NVML pmemblk engine           $pmemblk"
+
+##########################################
+# Report whether dev-dax engine is enabled
+echo "NVML dev-dax engine           $devdax"
+
+##########################################
 # Check if we have lex/yacc available
 yacc="no"
 yacc_is_bison="no"
 lex="no"
 arith="no"
+if test "$disable_lex" = "no" || test -z "$disable_lex" ; then
 if test "$targetos" != "SunOS" ; then
 LEX=$(which lex 2> /dev/null)
 if test -x "$LEX" ; then
@@ -1410,9 +1839,182 @@
 fi
 fi
 fi
+fi
+
+# Check if lex fails using -o
+if test "$arith" = "yes" ; then
+if test "$force_no_lex_o" = "yes" ; then
+  lex_use_o="no"
+else
+$LEX -o lex.yy.c exp/expression-parser.l 2> /dev/null
+if test "$?" = "0" ; then
+  lex_use_o="yes"
+else
+  lex_use_o="no"
+fi
+fi
+fi
 
 echo "lex/yacc for arithmetic       $arith"
 
+##########################################
+# Check whether we have setmntent/getmntent
+if test "$getmntent" != "yes" ; then
+  getmntent="no"
+fi
+cat > $TMPC << EOF
+#include <stdio.h>
+#include <mntent.h>
+int main(int argc, char **argv)
+{
+  FILE *mtab = setmntent(NULL, "r");
+  struct mntent *mnt = getmntent(mtab);
+  endmntent(mtab);
+  return 0;
+}
+EOF
+if compile_prog "" "" "getmntent"; then
+  getmntent="yes"
+fi
+echo "getmntent                     $getmntent"
+
+##########################################
+# Check whether we have getmntinfo
+# These are originally added for BSDs, but may also work
+# on other operating systems with getmntinfo(3).
+
+# getmntinfo(3) for FreeBSD/DragonFlyBSD/OpenBSD.
+# Note that NetBSD needs -Werror to catch warning as error.
+if test "$getmntinfo" != "yes" ; then
+  getmntinfo="no"
+fi
+cat > $TMPC << EOF
+#include <stdio.h>
+#include <sys/param.h>
+#include <sys/mount.h>
+int main(int argc, char **argv)
+{
+  struct statfs *st;
+  return getmntinfo(&st, MNT_NOWAIT);
+}
+EOF
+if compile_prog "-Werror" "" "getmntinfo"; then
+  getmntinfo="yes"
+fi
+echo "getmntinfo                    $getmntinfo"
+
+# getmntinfo(3) for NetBSD.
+if test "$getmntinfo_statvfs" != "yes" ; then
+  getmntinfo_statvfs="no"
+fi
+cat > $TMPC << EOF
+#include <stdio.h>
+#include <sys/statvfs.h>
+int main(int argc, char **argv)
+{
+  struct statvfs *st;
+  return getmntinfo(&st, MNT_NOWAIT);
+}
+EOF
+# Skip the test if the one with statfs arg is detected.
+if test "$getmntinfo" != "yes" && compile_prog "-Werror" "" "getmntinfo_statvfs"; then
+  getmntinfo_statvfs="yes"
+  echo "getmntinfo_statvfs            $getmntinfo_statvfs"
+fi
+
+##########################################
+# Check whether we have _Static_assert
+if test "$static_assert" != "yes" ; then
+  static_assert="no"
+fi
+cat > $TMPC << EOF
+#include <assert.h>
+#include <stdlib.h>
+#undef offsetof
+#ifdef __compiler_offsetof
+#define offsetof(TYPE,MEMBER) __compiler_offsetof(TYPE,MEMBER)
+#else
+#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+#define container_of(ptr, type, member) ({			\
+	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
+	(type *)( (char *)__mptr - offsetof(type,member) );})
+
+struct foo {
+  int a, b;
+};
+
+int main(int argc, char **argv)
+{
+  _Static_assert(offsetof(struct foo, a) == 0 , "Check");
+  return 0 ;
+}
+EOF
+if compile_prog "" "" "static_assert"; then
+    static_assert="yes"
+fi
+echo "Static Assert                 $static_assert"
+
+##########################################
+# Check whether we have bool / stdbool.h
+if test "$have_bool" != "yes" ; then
+  have_bool="no"
+fi
+cat > $TMPC << EOF
+#include <stdbool.h>
+int main(int argc, char **argv)
+{
+  bool var = true;
+  return var != false;
+}
+EOF
+if compile_prog "" "" "bool"; then
+  have_bool="yes"
+fi
+echo "bool                          $have_bool"
+
+##########################################
+# check march=armv8-a+crc+crypto
+if test "$march_armv8_a_crc_crypto" != "yes" ; then
+  march_armv8_a_crc_crypto="no"
+fi
+if test "$cpu" = "arm64" ; then
+  cat > $TMPC <<EOF
+#include <sys/auxv.h>
+#include <arm_acle.h>
+#include <arm_neon.h>
+
+int main(void)
+{
+  return 0;
+}
+EOF
+  if compile_prog "-march=armv8-a+crc+crypto" "" ""; then
+    march_armv8_a_crc_crypto="yes"
+    CFLAGS="$CFLAGS -march=armv8-a+crc+crypto -DARCH_HAVE_CRC_CRYPTO"
+  fi
+fi
+echo "march_armv8_a_crc_crypto      $march_armv8_a_crc_crypto"
+
+##########################################
+# cuda probe
+if test "$cuda" != "yes" ; then
+  cuda="no"
+fi
+cat > $TMPC << EOF
+#include <cuda.h>
+int main(int argc, char **argv)
+{
+  return cuInit(0);
+}
+EOF
+if test "$enable_cuda" = "yes" && compile_prog "" "-lcuda" "cuda"; then
+  cuda="yes"
+  LIBS="-lcuda $LIBS"
+fi
+echo "cuda                          $cuda"
+
 #############################################################################
 
 if test "$wordsize" = "64" ; then
@@ -1439,6 +2041,9 @@
 if test "$posix_aio_fsync" = "yes" ; then
   output_sym "CONFIG_POSIXAIO_FSYNC"
 fi
+if test "$posix_pshared" = "yes" ; then
+  output_sym "CONFIG_PSHARED"
+fi
 if test "$linux_fallocate" = "yes" ; then
   output_sym "CONFIG_LINUX_FALLOCATE"
 fi
@@ -1463,9 +2068,15 @@
 if test "$clock_monotonic" = "yes" ; then
   output_sym "CONFIG_CLOCK_MONOTONIC"
 fi
+if test "$clock_monotonic_raw" = "yes" ; then
+  output_sym "CONFIG_CLOCK_MONOTONIC_RAW"
+fi
 if test "$clock_monotonic_precise" = "yes" ; then
   output_sym "CONFIG_CLOCK_MONOTONIC_PRECISE"
 fi
+if test "$clockid_t" = "yes"; then
+  output_sym "CONFIG_CLOCKID_T"
+fi
 if test "$gettimeofday" = "yes" ; then
   output_sym "CONFIG_GETTIMEOFDAY"
 fi
@@ -1483,6 +2094,9 @@
 if test "$strcasestr" = "yes" ; then
   output_sym "CONFIG_STRCASESTR"
 fi
+if test "$strlcat" = "yes" ; then
+  output_sym "CONFIG_STRLCAT"
+fi
 if test "$getopt_long_only" = "yes" ; then
   output_sym "CONFIG_GETOPT_LONG_ONLY"
 fi
@@ -1517,7 +2131,7 @@
   output_sym "CONFIG_RUSAGE_THREAD"
 fi
 if test "$gfio" = "yes" ; then
-  echo "CONFIG_GFIO=y" >> $config_host_mak
+  output_sym "CONFIG_GFIO"
 fi
 if test "$esx" = "yes" ; then
   output_sym "CONFIG_ESX"
@@ -1541,15 +2155,24 @@
 if test "$pwritev" = "yes" ; then
   output_sym "CONFIG_PWRITEV"
 fi
+if test "$pwritev2" = "yes" ; then
+  output_sym "CONFIG_PWRITEV2"
+fi
 if test "$ipv6" = "yes" ; then
   output_sym "CONFIG_IPV6"
 fi
 if test "$rbd" = "yes" ; then
   output_sym "CONFIG_RBD"
 fi
+if test "$rbd_poll" = "yes" ; then
+  output_sym "CONFIG_RBD_POLL"
+fi
 if test "$rbd_inval" = "yes" ; then
   output_sym "CONFIG_RBD_INVAL"
 fi
+if test "$rbd_blkin" = "yes" ; then
+  output_sym "CONFIG_RBD_BLKIN"
+fi
 if test "$setvbuf" = "yes" ; then
   output_sym "CONFIG_SETVBUF"
 fi
@@ -1568,10 +2191,20 @@
 fi
 if test "$libhdfs" = "yes" ; then
   output_sym "CONFIG_LIBHDFS"
+  echo "FIO_HDFS_CPU=$FIO_HDFS_CPU" >> $config_host_mak
   echo "JAVA_HOME=$JAVA_HOME" >> $config_host_mak
   echo "FIO_LIBHDFS_INCLUDE=$FIO_LIBHDFS_INCLUDE" >> $config_host_mak
   echo "FIO_LIBHDFS_LIB=$FIO_LIBHDFS_LIB" >> $config_host_mak
  fi
+if test "$mtd" = "yes" ; then
+  output_sym "CONFIG_MTD"
+fi
+if test "$pmemblk" = "yes" ; then
+  output_sym "CONFIG_PMEMBLK"
+fi
+if test "$devdax" = "yes" ; then
+  output_sym "CONFIG_LINUX_DEVDAX"
+fi
 if test "$arith" = "yes" ; then
   output_sym "CONFIG_ARITHMETIC"
   if test "$yacc_is_bison" = "yes" ; then
@@ -1579,11 +2212,34 @@
   else
     echo "YACC=$YACC" >> $config_host_mak
   fi
+  if test "$lex_use_o" = "yes" ; then
+    echo "CONFIG_LEX_USE_O=y" >> $config_host_mak
+  fi
 fi
-
+if test "$getmntent" = "yes" ; then
+  output_sym "CONFIG_GETMNTENT"
+fi
+if test "$getmntinfo" = "yes" ; then
+  output_sym "CONFIG_GETMNTINFO"
+fi
+if test "$getmntinfo_statvfs" = "yes" ; then
+  output_sym "CONFIG_GETMNTINFO_STATVFS"
+fi
+if test "$static_assert" = "yes" ; then
+  output_sym "CONFIG_STATIC_ASSERT"
+fi
+if test "$have_bool" = "yes" ; then
+  output_sym "CONFIG_HAVE_BOOL"
+fi
+if test "$disable_opt" = "yes" ; then
+  output_sym "CONFIG_DISABLE_OPTIMIZATIONS"
+fi
 if test "$zlib" = "no" ; then
   echo "Consider installing zlib-dev (zlib-devel), some fio features depend on it."
 fi
+if test "$cuda" = "yes" ; then
+  output_sym "CONFIG_CUDA"
+fi
 
 echo "LIBS+=$LIBS" >> $config_host_mak
 echo "GFIO_LIBS+=$GFIO_LIBS" >> $config_host_mak
@@ -1591,3 +2247,11 @@
 echo "LDFLAGS+=$LDFLAGS" >> $config_host_mak
 echo "CC=$cc" >> $config_host_mak
 echo "BUILD_CFLAGS=$BUILD_CFLAGS $CFLAGS" >> $config_host_mak
+echo "INSTALL_PREFIX=$prefix" >> $config_host_mak
+
+if [ `dirname $0` != "." -a ! -e Makefile ]; then
+    cat > Makefile <<EOF
+SRCDIR:=`dirname $0`
+include \$(SRCDIR)/Makefile
+EOF
+fi
diff --git a/crc/crc32c-arm64.c b/crc/crc32c-arm64.c
new file mode 100644
index 0000000..c3f42c7
--- /dev/null
+++ b/crc/crc32c-arm64.c
@@ -0,0 +1,115 @@
+#include "crc32c.h"
+
+#define CRC32C3X8(ITR) \
+	crc1 = __crc32cd(crc1, *((const uint64_t *)data + 42*1 + (ITR)));\
+	crc2 = __crc32cd(crc2, *((const uint64_t *)data + 42*2 + (ITR)));\
+	crc0 = __crc32cd(crc0, *((const uint64_t *)data + 42*0 + (ITR)));
+
+#define CRC32C7X3X8(ITR) do {\
+	CRC32C3X8((ITR)*7+0) \
+	CRC32C3X8((ITR)*7+1) \
+	CRC32C3X8((ITR)*7+2) \
+	CRC32C3X8((ITR)*7+3) \
+	CRC32C3X8((ITR)*7+4) \
+	CRC32C3X8((ITR)*7+5) \
+	CRC32C3X8((ITR)*7+6) \
+	} while(0)
+
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32             (1 << 7)
+#endif /* HWCAP_CRC32 */
+
+int crc32c_arm64_available = 0;
+
+#ifdef ARCH_HAVE_ARM64_CRC_CRYPTO
+
+#include <sys/auxv.h>
+#include <arm_acle.h>
+#include <arm_neon.h>
+
+static int crc32c_probed;
+
+/*
+ * Function to calculate reflected crc with PMULL Instruction
+ * crc done "by 3" for fixed input block size of 1024 bytes
+ */
+uint32_t crc32c_arm64(unsigned char const *data, unsigned long length)
+{
+	signed long len = length;
+	uint32_t crc = ~0;
+	uint32_t crc0, crc1, crc2;
+
+	/* Load two consts: K1 and K2 */
+	const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
+	uint64_t t0, t1;
+
+	while ((len -= 1024) >= 0) {
+		/* Do first 8 bytes here for better pipelining */
+		crc0 = __crc32cd(crc, *(const uint64_t *)data);
+		crc1 = 0;
+		crc2 = 0;
+		data += sizeof(uint64_t);
+
+		/* Process block inline
+		   Process crc0 last to avoid dependency with above */
+		CRC32C7X3X8(0);
+		CRC32C7X3X8(1);
+		CRC32C7X3X8(2);
+		CRC32C7X3X8(3);
+		CRC32C7X3X8(4);
+		CRC32C7X3X8(5);
+
+		data += 42*3*sizeof(uint64_t);
+
+		/* Merge crc0 and crc1 into crc2
+		   crc1 multiply by K2
+		   crc0 multiply by K1 */
+
+		t1 = (uint64_t)vmull_p64(crc1, k2);
+		t0 = (uint64_t)vmull_p64(crc0, k1);
+		crc = __crc32cd(crc2, *(const uint64_t *)data);
+		crc1 = __crc32cd(0, t1);
+		crc ^= crc1;
+		crc0 = __crc32cd(0, t0);
+		crc ^= crc0;
+
+		data += sizeof(uint64_t);
+	}
+
+	if (!(len += 1024))
+		return crc;
+
+	while ((len -= sizeof(uint64_t)) >= 0) {
+		crc = __crc32cd(crc, *(const uint64_t *)data);
+		data += sizeof(uint64_t);
+	}
+
+	/* The following is more efficient than the straight loop */
+	if (len & sizeof(uint32_t)) {
+		crc = __crc32cw(crc, *(const uint32_t *)data);
+		data += sizeof(uint32_t);
+	}
+	if (len & sizeof(uint16_t)) {
+		crc = __crc32ch(crc, *(const uint16_t *)data);
+		data += sizeof(uint16_t);
+	}
+	if (len & sizeof(uint8_t)) {
+		crc = __crc32cb(crc, *(const uint8_t *)data);
+	}
+
+	return crc;
+}
+
+void crc32c_arm64_probe(void)
+{
+	unsigned long hwcap;
+
+	if (!crc32c_probed) {
+		hwcap = getauxval(AT_HWCAP);
+		if (hwcap & HWCAP_CRC32)
+			crc32c_arm64_available = 1;
+		crc32c_probed = 1;
+	}
+}
+
+#endif /* ARCH_HAVE_ARM64_CRC_CRYPTO */
diff --git a/crc/crc32c.h b/crc/crc32c.h
index 11bcf9c..5d66407 100644
--- a/crc/crc32c.h
+++ b/crc/crc32c.h
@@ -21,8 +21,19 @@
 #include "../arch/arch.h"
 
 extern uint32_t crc32c_sw(unsigned char const *, unsigned long);
+extern int crc32c_arm64_available;
 extern int crc32c_intel_available;
 
+#ifdef ARCH_HAVE_ARM64_CRC_CRYPTO
+extern uint32_t crc32c_arm64(unsigned char const *, unsigned long);
+extern void crc32c_arm64_probe(void);
+#else
+#define crc32c_arm64 crc32c_sw
+static inline void crc32c_arm64_probe(void)
+{
+}
+#endif
+
 #ifdef ARCH_HAVE_SSE4_2
 extern uint32_t crc32c_intel(unsigned char const *, unsigned long);
 extern void crc32c_intel_probe(void);
@@ -35,6 +46,9 @@
 
 static inline uint32_t fio_crc32c(unsigned char const *buf, unsigned long len)
 {
+	if (crc32c_arm64_available)
+		return crc32c_arm64(buf, len);
+
 	if (crc32c_intel_available)
 		return crc32c_intel(buf, len);
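
With both probes in place, fio_crc32c() transparently picks the fastest available implementation. A minimal caller sketch, using only the symbols declared in this header (the buffer and length are placeholders):

/*
 * Sketch of the probe-then-dispatch pattern exposed by crc/crc32c.h.
 * The probes are cheap and guarded against repeat runs, so calling them
 * once before the first checksum is enough; fio_crc32c() then routes to
 * crc32c_arm64(), crc32c_intel() or crc32c_sw() as appropriate.
 */
#include <stdint.h>
#include "crc/crc32c.h"

uint32_t checksum_block(const unsigned char *buf, unsigned long len)
{
	crc32c_arm64_probe();
	crc32c_intel_probe();

	return fio_crc32c(buf, len);
}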
 
diff --git a/crc/fnv.c b/crc/fnv.c
index 04c0560..4cd0650 100644
--- a/crc/fnv.c
+++ b/crc/fnv.c
@@ -2,14 +2,32 @@
 
 #define FNV_PRIME	0x100000001b3ULL
 
+/*
+ * 64-bit fnv, but don't require 64-bit multiples of data. Use bytes
+ * for the last unaligned chunk.
+ */
 uint64_t fnv(const void *buf, uint32_t len, uint64_t hval)
 {
 	const uint64_t *ptr = buf;
-	const uint64_t *end = (void *) buf + len;
 
-	while (ptr < end) {
+	while (len) {
 		hval *= FNV_PRIME;
-		hval ^= (uint64_t) *ptr++;
+		if (len >= sizeof(uint64_t)) {
+			hval ^= (uint64_t) *ptr++;
+			len -= sizeof(uint64_t);
+			continue;
+		} else {
+			const uint8_t *ptr8 = (const uint8_t *) ptr;
+			uint64_t val = 0;
+			int i;
+
+			for (i = 0; i < len; i++) {
+				val <<= 8;
+				val |= (uint8_t) *ptr8++;
+			}
+			hval ^= val;
+			break;
+		}
 	}
 
 	return hval;
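
The reworked loop consumes full 64-bit words and then folds any trailing 1-7 bytes into a final partial word, so callers no longer have to pad their buffers to a multiple of eight bytes. A small usage sketch (the offset-basis constant passed as hval is the conventional 64-bit FNV value and is an assumption here, not a value taken from this patch):

#include <stdint.h>
#include "crc/fnv.h"

uint64_t hash_record(const void *rec, uint32_t len)
{
	/* Standard FNV-1 64-bit offset basis; fio may seed differently. */
	const uint64_t offset_basis = 0xcbf29ce484222325ULL;

	/* len can now be any byte count, e.g. 13; the tail is folded in. */
	return fnv(rec, len, offset_basis);
}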
diff --git a/crc/sha3.c b/crc/sha3.c
new file mode 100644
index 0000000..2685dce
--- /dev/null
+++ b/crc/sha3.c
@@ -0,0 +1,173 @@
+/*
+ * Cryptographic API.
+ *
+ * SHA-3, as specified in
+ * http://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.202.pdf
+ *
+ * SHA-3 code by Jeff Garzik <jeff@garzik.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+#include <string.h>
+#include <inttypes.h>
+
+#include "../os/os.h"
+
+#include "sha3.h"
+
+#define KECCAK_ROUNDS 24
+
+#define ROTL64(x, y) (((x) << (y)) | ((x) >> (64 - (y))))
+
+static const uint64_t keccakf_rndc[24] = {
+	0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL,
+	0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL,
+	0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL,
+	0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL,
+	0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL,
+	0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL,
+	0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL,
+	0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
+};
+
+static const int keccakf_rotc[24] = {
+	1,  3,  6,  10, 15, 21, 28, 36, 45, 55, 2,  14,
+	27, 41, 56, 8,  25, 43, 62, 18, 39, 61, 20, 44
+};
+
+static const int keccakf_piln[24] = {
+	10, 7,  11, 17, 18, 3, 5,  16, 8,  21, 24, 4,
+	15, 23, 19, 13, 12, 2, 20, 14, 22, 9,  6,  1
+};
+
+/* update the state with given number of rounds */
+
+static void keccakf(uint64_t st[25])
+{
+	int i, j, round;
+	uint64_t t, bc[5];
+
+	for (round = 0; round < KECCAK_ROUNDS; round++) {
+
+		/* Theta */
+		for (i = 0; i < 5; i++)
+			bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15]
+				^ st[i + 20];
+
+		for (i = 0; i < 5; i++) {
+			t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
+			for (j = 0; j < 25; j += 5)
+				st[j + i] ^= t;
+		}
+
+		/* Rho Pi */
+		t = st[1];
+		for (i = 0; i < 24; i++) {
+			j = keccakf_piln[i];
+			bc[0] = st[j];
+			st[j] = ROTL64(t, keccakf_rotc[i]);
+			t = bc[0];
+		}
+
+		/* Chi */
+		for (j = 0; j < 25; j += 5) {
+			for (i = 0; i < 5; i++)
+				bc[i] = st[j + i];
+			for (i = 0; i < 5; i++)
+				st[j + i] ^= (~bc[(i + 1) % 5]) &
+					     bc[(i + 2) % 5];
+		}
+
+		/* Iota */
+		st[0] ^= keccakf_rndc[round];
+	}
+}
+
+static void fio_sha3_init(struct fio_sha3_ctx *sctx, unsigned int digest_sz)
+{
+	memset(sctx->st, 0, sizeof(sctx->st));
+	sctx->md_len = digest_sz;
+	sctx->rsiz = 200 - 2 * digest_sz;
+	sctx->rsizw = sctx->rsiz / 8;
+	sctx->partial = 0;
+	memset(sctx->buf, 0, sizeof(sctx->buf));
+}
+
+void fio_sha3_224_init(struct fio_sha3_ctx *sctx)
+{
+	fio_sha3_init(sctx, SHA3_224_DIGEST_SIZE);
+}
+
+void fio_sha3_256_init(struct fio_sha3_ctx *sctx)
+{
+	fio_sha3_init(sctx, SHA3_256_DIGEST_SIZE);
+}
+
+void fio_sha3_384_init(struct fio_sha3_ctx *sctx)
+{
+	fio_sha3_init(sctx, SHA3_384_DIGEST_SIZE);
+}
+
+void fio_sha3_512_init(struct fio_sha3_ctx *sctx)
+{
+	fio_sha3_init(sctx, SHA3_512_DIGEST_SIZE);
+}
+
+int fio_sha3_update(struct fio_sha3_ctx *sctx, const uint8_t *data,
+		    unsigned int len)
+{
+	unsigned int done;
+	const uint8_t *src;
+
+	done = 0;
+	src = data;
+
+	if ((sctx->partial + len) > (sctx->rsiz - 1)) {
+		if (sctx->partial) {
+			done = -sctx->partial;
+			memcpy(sctx->buf + sctx->partial, data,
+			       done + sctx->rsiz);
+			src = sctx->buf;
+		}
+
+		do {
+			unsigned int i;
+
+			for (i = 0; i < sctx->rsizw; i++)
+				sctx->st[i] ^= ((uint64_t *) src)[i];
+			keccakf(sctx->st);
+
+			done += sctx->rsiz;
+			src = data + done;
+		} while (done + (sctx->rsiz - 1) < len);
+
+		sctx->partial = 0;
+	}
+	memcpy(sctx->buf + sctx->partial, src, len - done);
+	sctx->partial += (len - done);
+
+	return 0;
+}
+
+void fio_sha3_final(struct fio_sha3_ctx *sctx)
+{
+	unsigned int i, inlen = sctx->partial;
+
+	sctx->buf[inlen++] = 0x06;
+	memset(sctx->buf + inlen, 0, sctx->rsiz - inlen);
+	sctx->buf[sctx->rsiz - 1] |= 0x80;
+
+	for (i = 0; i < sctx->rsizw; i++)
+		sctx->st[i] ^= ((uint64_t *) sctx->buf)[i];
+
+	keccakf(sctx->st);
+
+	for (i = 0; i < sctx->rsizw; i++)
+		sctx->st[i] = cpu_to_le64(sctx->st[i]);
+
+	memcpy(sctx->sha, sctx->st, sctx->md_len);
+}
diff --git a/crc/sha3.h b/crc/sha3.h
new file mode 100644
index 0000000..9f1970a
--- /dev/null
+++ b/crc/sha3.h
@@ -0,0 +1,42 @@
+/*
+ * Common values for SHA-3 algorithms
+ */
+#ifndef __CRYPTO_SHA3_H__
+#define __CRYPTO_SHA3_H__
+
+#include <inttypes.h>
+
+#define SHA3_224_DIGEST_SIZE	(224 / 8)
+#define SHA3_224_BLOCK_SIZE	(200 - 2 * SHA3_224_DIGEST_SIZE)
+
+#define SHA3_256_DIGEST_SIZE	(256 / 8)
+#define SHA3_256_BLOCK_SIZE	(200 - 2 * SHA3_256_DIGEST_SIZE)
+
+#define SHA3_384_DIGEST_SIZE	(384 / 8)
+#define SHA3_384_BLOCK_SIZE	(200 - 2 * SHA3_384_DIGEST_SIZE)
+
+#define SHA3_512_DIGEST_SIZE	(512 / 8)
+#define SHA3_512_BLOCK_SIZE	(200 - 2 * SHA3_512_DIGEST_SIZE)
+
+struct fio_sha3_ctx {
+	uint64_t	st[25];
+	unsigned int	md_len;
+	unsigned int	rsiz;
+	unsigned int	rsizw;
+
+	unsigned int	partial;
+	uint8_t		buf[SHA3_224_BLOCK_SIZE];
+
+	uint8_t		*sha;
+};
+
+void fio_sha3_224_init(struct fio_sha3_ctx *sctx);
+void fio_sha3_256_init(struct fio_sha3_ctx *sctx);
+void fio_sha3_384_init(struct fio_sha3_ctx *sctx);
+void fio_sha3_512_init(struct fio_sha3_ctx *sctx);
+
+int fio_sha3_update(struct fio_sha3_ctx *sctx, const uint8_t *data,
+		    unsigned int len);
+void fio_sha3_final(struct fio_sha3_ctx *sctx);
+
+#endif
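
The context carries its own output pointer, so producing a digest means pointing .sha at a caller-supplied buffer before init/update/final, exactly as the new benchmarks in crc/test.c do. A minimal sketch using this header:

#include <stdint.h>
#include "crc/sha3.h"

/* Writes the SHA3-256 digest of buf into out (SHA3_256_DIGEST_SIZE bytes). */
void sha3_256_digest(const void *buf, unsigned int len,
		     uint8_t out[SHA3_256_DIGEST_SIZE])
{
	struct fio_sha3_ctx ctx = { .sha = out };

	fio_sha3_256_init(&ctx);
	fio_sha3_update(&ctx, buf, len);
	fio_sha3_final(&ctx);	/* digest lands in out via ctx.sha */
}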
diff --git a/crc/test.c b/crc/test.c
index dbc5653..368229e 100644
--- a/crc/test.c
+++ b/crc/test.c
@@ -16,6 +16,7 @@
 #include "../crc/sha1.h"
 #include "../crc/sha256.h"
 #include "../crc/sha512.h"
+#include "../crc/sha3.h"
 #include "../crc/xxhash.h"
 #include "../crc/murmur3.h"
 #include "../crc/fnv.h"
@@ -47,6 +48,10 @@
 	T_MURMUR3	= 1U << 10,
 	T_JHASH		= 1U << 11,
 	T_FNV		= 1U << 12,
+	T_SHA3_224	= 1U << 13,
+	T_SHA3_256	= 1U << 14,
+	T_SHA3_384	= 1U << 15,
+	T_SHA3_512	= 1U << 16,
 };
 
 static void t_md5(struct test_type *t, void *buf, size_t size)
@@ -68,7 +73,7 @@
 	int i;
 
 	for (i = 0; i < NR_CHUNKS; i++)
-		fio_crc64(buf, size);
+		t->output += fio_crc64(buf, size);
 }
 
 static void t_crc32(struct test_type *t, void *buf, size_t size)
@@ -76,7 +81,7 @@
 	int i;
 
 	for (i = 0; i < NR_CHUNKS; i++)
-		fio_crc32(buf, size);
+		t->output += fio_crc32(buf, size);
 }
 
 static void t_crc32c(struct test_type *t, void *buf, size_t size)
@@ -84,7 +89,7 @@
 	int i;
 
 	for (i = 0; i < NR_CHUNKS; i++)
-		fio_crc32c(buf, size);
+		t->output += fio_crc32c(buf, size);
 }
 
 static void t_crc16(struct test_type *t, void *buf, size_t size)
@@ -92,7 +97,7 @@
 	int i;
 
 	for (i = 0; i < NR_CHUNKS; i++)
-		fio_crc16(buf, size);
+		t->output += fio_crc16(buf, size);
 }
 
 static void t_crc7(struct test_type *t, void *buf, size_t size)
@@ -100,7 +105,7 @@
 	int i;
 
 	for (i = 0; i < NR_CHUNKS; i++)
-		fio_crc7(buf, size);
+		t->output += fio_crc7(buf, size);
 }
 
 static void t_sha1(struct test_type *t, void *buf, size_t size)
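
The benchmark helpers now add each result into t->output instead of throwing it away, presumably so the compiler cannot discard the checksum calls as dead code. The same idea in isolation (the sink variable and the CHUNKS count below are illustrative placeholders, not fio's struct test_type):

#include <stddef.h>
#include <stdint.h>
#include "crc/crc32c.h"

#define CHUNKS 1000	/* placeholder iteration count */

/* Accumulating into a reachable sink keeps the calls from being elided. */
static uint64_t sink;

void bench_crc32c(void *buf, size_t size)
{
	int i;

	for (i = 0; i < CHUNKS; i++)
		sink += fio_crc32c(buf, size);
}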
@@ -143,12 +148,68 @@
 		fio_sha512_update(&ctx, buf, size);
 }
 
+static void t_sha3_224(struct test_type *t, void *buf, size_t size)
+{
+	uint8_t sha[SHA3_224_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = sha };
+	int i;
+
+	fio_sha3_224_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha3_update(&ctx, buf, size);
+		fio_sha3_final(&ctx);
+	}
+}
+
+static void t_sha3_256(struct test_type *t, void *buf, size_t size)
+{
+	uint8_t sha[SHA3_256_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = sha };
+	int i;
+
+	fio_sha3_256_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha3_update(&ctx, buf, size);
+		fio_sha3_final(&ctx);
+	}
+}
+
+static void t_sha3_384(struct test_type *t, void *buf, size_t size)
+{
+	uint8_t sha[SHA3_384_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = sha };
+	int i;
+
+	fio_sha3_384_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha3_update(&ctx, buf, size);
+		fio_sha3_final(&ctx);
+	}
+}
+
+static void t_sha3_512(struct test_type *t, void *buf, size_t size)
+{
+	uint8_t sha[SHA3_512_DIGEST_SIZE];
+	struct fio_sha3_ctx ctx = { .sha = sha };
+	int i;
+
+	fio_sha3_512_init(&ctx);
+
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha3_update(&ctx, buf, size);
+		fio_sha3_final(&ctx);
+	}
+}
+
 static void t_murmur3(struct test_type *t, void *buf, size_t size)
 {
 	int i;
 
 	for (i = 0; i < NR_CHUNKS; i++)
-		murmurhash3(buf, size, 0x8989);
+		t->output += murmurhash3(buf, size, 0x8989);
 }
 
 static void t_jhash(struct test_type *t, void *buf, size_t size)
@@ -247,6 +308,26 @@
 		.fn = t_fnv,
 	},
 	{
+		.name = "sha3-224",
+		.mask = T_SHA3_224,
+		.fn = t_sha3_224,
+	},
+	{
+		.name = "sha3-256",
+		.mask = T_SHA3_256,
+		.fn = t_sha3_256,
+	},
+	{
+		.name = "sha3-384",
+		.mask = T_SHA3_384,
+		.fn = t_sha3_384,
+	},
+	{
+		.name = "sha3-512",
+		.mask = T_SHA3_512,
+		.fn = t_sha3_512,
+	},
+	{
 		.name = NULL,
 	},
 };
@@ -291,6 +372,7 @@
 	int i, first = 1;
 	void *buf;
 
+	crc32c_arm64_probe();
 	crc32c_intel_probe();
 
 	if (!type)
@@ -306,7 +388,7 @@
 	}
 
 	buf = malloc(CHUNK);
-	init_rand_seed(&state, 0x8989);
+	init_rand_seed(&state, 0x8989, 0);
 	fill_random_buf(&state, buf, CHUNK);
 
 	for (i = 0; t[i].name; i++) {
@@ -338,9 +420,9 @@
 				sprintf(pre, "\t");
 			else
 				sprintf(pre, "\t\t");
-			printf("%s:%s%8.2f MB/sec\n", t[i].name, pre, mb_sec);
+			printf("%s:%s%8.2f MiB/sec\n", t[i].name, pre, mb_sec);
 		} else
-			printf("%s:inf MB/sec\n", t[i].name);
+			printf("%s:inf MiB/sec\n", t[i].name);
 		first = 0;
 	}
 
diff --git a/debug.h b/debug.h
index 923fa39..e3aa3f1 100644
--- a/debug.h
+++ b/debug.h
@@ -21,6 +21,8 @@
 	FD_NET,
 	FD_RATE,
 	FD_COMPRESS,
+	FD_STEADYSTATE,
+	FD_HELPERTHREAD,
 	FD_DEBUG_MAX,
 };
 
diff --git a/diskutil.c b/diskutil.c
index 52d87f6..dca3748 100644
--- a/diskutil.c
+++ b/diskutil.c
@@ -11,14 +11,13 @@
 #include "fio.h"
 #include "smalloc.h"
 #include "diskutil.h"
+#include "helper_thread.h"
 
 static int last_majdev, last_mindev;
 static struct disk_util *last_du;
 
 static struct fio_mutex *disk_util_mutex;
 
-FLIST_HEAD(disk_list);
-
 static struct disk_util *__init_per_file_disk_util(struct thread_data *td,
 		int majdev, int mindev, char *path);
 
@@ -36,6 +35,7 @@
 	}
 
 	fio_mutex_remove(du->lock);
+	free(du->sysfs_root);
 	sfree(du);
 }
 
@@ -121,7 +121,7 @@
 
 	fio_mutex_down(disk_util_mutex);
 
-	if (!helper_exit) {
+	if (!helper_should_exit()) {
 		flist_for_each(entry, &disk_list) {
 			du = flist_entry(entry, struct disk_util, list);
 			update_io_tick_disk(du);
@@ -178,6 +178,7 @@
 		/*
 		 * must be a file, open "." in that path
 		 */
+		tempname[PATH_MAX - 1] = '\0';
 		strncpy(tempname, file_name, PATH_MAX - 1);
 		p = dirname(tempname);
 		if (stat(p, &st)) {
@@ -238,7 +239,7 @@
 		    !strcmp(dirent->d_name, ".."))
 			continue;
 
-		sprintf(temppath, "%s%s%s", slavesdir, FIO_OS_PATH_SEPARATOR, dirent->d_name);
+		sprintf(temppath, "%s/%s", slavesdir, dirent->d_name);
 		/* Can we always assume that the slaves device entries
 		 * are links to the real directories for the slave
 		 * devices?
@@ -265,7 +266,7 @@
 		if (slavedu)
 			continue;
 
-		sprintf(temppath, "%s%s%s", slavesdir, FIO_OS_PATH_SEPARATOR, slavepath);
+		sprintf(temppath, "%s/%s", slavesdir, slavepath);
 		__init_per_file_disk_util(td, majdev, mindev, temppath);
 		slavedu = disk_util_exists(majdev, mindev);
 
@@ -290,10 +291,8 @@
 	dprint(FD_DISKUTIL, "add maj/min %d/%d: %s\n", majdev, mindev, path);
 
 	du = smalloc(sizeof(*du));
-	if (!du) {
-		log_err("fio: smalloc() pool exhausted\n");
+	if (!du)
 		return NULL;
-	}
 
 	memset(du, 0, sizeof(*du));
 	INIT_FLIST_HEAD(&du->list);
@@ -305,7 +304,7 @@
 		return NULL;
 	}
 	strncpy((char *) du->dus.name, basename(path), FIO_DU_NAME_SZ - 1);
-	du->sysfs_root = path;
+	du->sysfs_root = strdup(path);
 	du->major = majdev;
 	du->minor = mindev;
 	INIT_FLIST_HEAD(&du->slavelist);
@@ -369,7 +368,7 @@
 		if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, ".."))
 			continue;
 
-		sprintf(full_path, "%s%s%s", path, FIO_OS_PATH_SEPARATOR, dir->d_name);
+		sprintf(full_path, "%s/%s", path, dir->d_name);
 
 		if (!strcmp(dir->d_name, "dev")) {
 			if (!check_dev_match(majdev, mindev, full_path)) {
@@ -425,13 +424,11 @@
 			log_err("unknown sysfs layout\n");
 			return NULL;
 		}
+		tmp[PATH_MAX - 1] = '\0';
 		strncpy(tmp, p, PATH_MAX - 1);
 		sprintf(path, "%s", tmp);
 	}
 
-	if (td->o.ioscheduler && !td->sysfs_root)
-		td->sysfs_root = strdup(path);
-
 	return disk_util_add(td, majdev, mindev, path);
 }
 
@@ -450,12 +447,8 @@
 			mindev);
 
 	du = disk_util_exists(majdev, mindev);
-	if (du) {
-		if (td->o.ioscheduler && !td->sysfs_root)
-			td->sysfs_root = strdup(du->sysfs_root);
-
+	if (du)
 		return du;
-	}
 
 	/*
 	 * for an fs without a device, we will repeatedly stat through
@@ -488,20 +481,21 @@
 	unsigned int i;
 
 	if (!td->o.do_disk_util ||
-	    (td->io_ops->flags & (FIO_DISKLESSIO | FIO_NODISKUTIL)))
+	    td_ioengine_flagged(td, FIO_DISKLESSIO | FIO_NODISKUTIL))
 		return;
 
 	for_each_file(td, f, i)
 		f->du = __init_disk_util(td, f);
 }
 
-static void show_agg_stats(struct disk_util_agg *agg, int terse)
+static void show_agg_stats(struct disk_util_agg *agg, int terse,
+			   struct buf_output *out)
 {
 	if (!agg->slavecount)
 		return;
 
 	if (!terse) {
-		log_info(", aggrios=%llu/%llu, aggrmerge=%llu/%llu, "
+		log_buf(out, ", aggrios=%llu/%llu, aggrmerge=%llu/%llu, "
 			 "aggrticks=%llu/%llu, aggrin_queue=%llu, "
 			 "aggrutil=%3.2f%%",
 			(unsigned long long) agg->ios[0] / agg->slavecount,
@@ -513,7 +507,7 @@
 			(unsigned long long) agg->time_in_queue / agg->slavecount,
 			agg->max_util.u.f);
 	} else {
-		log_info(";slaves;%llu;%llu;%llu;%llu;%llu;%llu;%llu;%3.2f%%",
+		log_buf(out, ";slaves;%llu;%llu;%llu;%llu;%llu;%llu;%llu;%3.2f%%",
 			(unsigned long long) agg->ios[0] / agg->slavecount,
 			(unsigned long long) agg->ios[1] / agg->slavecount,
 			(unsigned long long) agg->merges[0] / agg->slavecount,
@@ -578,7 +572,7 @@
 }
 
 void print_disk_util(struct disk_util_stat *dus, struct disk_util_agg *agg,
-		     int terse)
+		     int terse, struct buf_output *out)
 {
 	double util = 0;
 
@@ -589,9 +583,9 @@
 
 	if (!terse) {
 		if (agg->slavecount)
-			log_info("  ");
+			log_buf(out, "  ");
 
-		log_info("  %s: ios=%llu/%llu, merge=%llu/%llu, "
+		log_buf(out, "  %s: ios=%llu/%llu, merge=%llu/%llu, "
 			 "ticks=%llu/%llu, in_queue=%llu, util=%3.2f%%",
 				dus->name,
 				(unsigned long long) dus->s.ios[0],
@@ -603,7 +597,7 @@
 				(unsigned long long) dus->s.time_in_queue,
 				util);
 	} else {
-		log_info(";%s;%llu;%llu;%llu;%llu;%llu;%llu;%llu;%3.2f%%",
+		log_buf(out, ";%s;%llu;%llu;%llu;%llu;%llu;%llu;%llu;%3.2f%%",
 				dus->name,
 				(unsigned long long) dus->s.ios[0],
 				(unsigned long long) dus->s.ios[1],
@@ -619,10 +613,10 @@
 	 * If the device has slaves, aggregate the stats for
 	 * those slave devices also.
 	 */
-	show_agg_stats(agg, terse);
+	show_agg_stats(agg, terse, out);
 
 	if (!terse)
-		log_info("\n");
+		log_buf(out, "\n");
 }
 
 void json_array_add_disk_util(struct disk_util_stat *dus,
@@ -689,10 +683,12 @@
 	}
 }
 
-void show_disk_util(int terse, struct json_object *parent)
+void show_disk_util(int terse, struct json_object *parent,
+		    struct buf_output *out)
 {
 	struct flist_head *entry;
 	struct disk_util *du;
+	bool do_json;
 
 	if (!disk_util_mutex)
 		return;
@@ -704,21 +700,24 @@
 		return;
 	}
 
-	if (output_format == FIO_OUTPUT_JSON)
-		assert(parent);
-
-	if (!terse && output_format != FIO_OUTPUT_JSON)
-		log_info("\nDisk stats (read/write):\n");
-
-	if (output_format == FIO_OUTPUT_JSON)
-		json_object_add_disk_utils(parent, &disk_list);
+	if ((output_format & FIO_OUTPUT_JSON) && parent)
+		do_json = true;
 	else
+		do_json = false;
+
+	if (!terse && !do_json)
+		log_buf(out, "\nDisk stats (read/write):\n");
+
+	if (do_json)
+		json_object_add_disk_utils(parent, &disk_list);
+	else if (output_format & ~(FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS)) {
 		flist_for_each(entry, &disk_list) {
 			du = flist_entry(entry, struct disk_util, list);
 
 			aggregate_slaves_stats(du);
-			print_disk_util(&du->dus, &du->agg, terse);
+			print_disk_util(&du->dus, &du->agg, terse, out);
 		}
+	}
 
 	fio_mutex_up(disk_util_mutex);
 }
diff --git a/diskutil.h b/diskutil.h
index c0ae0ed..f773066 100644
--- a/diskutil.h
+++ b/diskutil.h
@@ -3,7 +3,8 @@
 #include "json.h"
 #define FIO_DU_NAME_SZ		64
 
-extern volatile int helper_exit;
+#include "lib/output_buffer.h"
+#include "helper_thread.h"
 
 struct disk_util_stats {
 	uint64_t ios[2];
@@ -45,7 +46,6 @@
 	 */
 	struct flist_head slavelist;
 
-	char *name;
 	char *sysfs_root;
 	char path[PATH_MAX];
 	int major, minor;
@@ -105,8 +105,8 @@
  * disk util stuff
  */
 #ifdef FIO_HAVE_DISK_UTIL
-extern void print_disk_util(struct disk_util_stat *, struct disk_util_agg *, int terse);
-extern void show_disk_util(int terse, struct json_object *parent);
+extern void print_disk_util(struct disk_util_stat *, struct disk_util_agg *, int terse, struct buf_output *);
+extern void show_disk_util(int terse, struct json_object *parent, struct buf_output *);
 extern void json_array_add_disk_util(struct disk_util_stat *dus,
 		struct disk_util_agg *agg, struct json_array *parent);
 extern void init_disk_util(struct thread_data *);
@@ -114,11 +114,13 @@
 extern void setup_disk_util(void);
 extern void disk_util_prune_entries(void);
 #else
+/* keep this as a function to avoid a warning in handle_du() */
 static inline void print_disk_util(struct disk_util_stat *du,
-				   struct disk_util_agg *agg, int terse)
+				   struct disk_util_agg *agg, int terse,
+				   struct buf_output *out)
 {
 }
-#define show_disk_util(terse, parent)
+#define show_disk_util(terse, parent, out)
 #define disk_util_prune_entries()
 #define init_disk_util(td)
 #define setup_disk_util()
@@ -126,7 +128,7 @@
 
 static inline int update_io_ticks(void)
 {
-	return helper_exit;
+	return helper_should_exit();
 }
 #endif
 
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 0000000..3b979f9
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,225 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = output
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  applehelp  to make an Apple Help Book"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  epub3      to make an epub3"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  xml        to make Docutils-native XML files"
+	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+	@echo "  coverage   to run coverage check of the documentation (if enabled)"
+	@echo "  dummy      to check syntax errors of document sources"
+
+.PHONY: clean
+clean:
+	rm -rf $(BUILDDIR)/*
+
+.PHONY: html
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+.PHONY: dirhtml
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+.PHONY: singlehtml
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+.PHONY: pickle
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+.PHONY: json
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+.PHONY: htmlhelp
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+.PHONY: qthelp
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/fio.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/fio.qhc"
+
+.PHONY: applehelp
+applehelp:
+	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
+	@echo
+	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
+	@echo "N.B. You won't be able to view it unless you put it in" \
+	      "~/Library/Documentation/Help or install it in your application" \
+	      "bundle."
+
+.PHONY: devhelp
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/fio"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/fio"
+	@echo "# devhelp"
+
+.PHONY: epub
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+.PHONY: epub3
+epub3:
+	$(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3
+	@echo
+	@echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3."
+
+.PHONY: latex
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+.PHONY: latexpdf
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+.PHONY: latexpdfja
+latexpdfja:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through platex and dvipdfmx..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+.PHONY: text
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+.PHONY: man
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+.PHONY: texinfo
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+.PHONY: info
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	make -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+.PHONY: gettext
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+.PHONY: changes
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+.PHONY: linkcheck
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+.PHONY: doctest
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
+
+.PHONY: coverage
+coverage:
+	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
+	@echo "Testing of coverage in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/coverage/python.txt."
+
+.PHONY: xml
+xml:
+	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+	@echo
+	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+.PHONY: pseudoxml
+pseudoxml:
+	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+	@echo
+	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
+
+.PHONY: dummy
+dummy:
+	$(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy
+	@echo
+	@echo "Build finished. Dummy builder generates no files."
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 0000000..4102140
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,360 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+#
+# fio documentation build configuration file, created by
+# sphinx-quickstart on Mon Nov 14 13:56:30 2016.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = []
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffixes as a list of strings:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#
+# source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'fio'
+copyright = '2017, Jens Axboe <axboe@kernel.dk>'
+author = 'Jens Axboe <axboe@kernel.dk>'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+
+# The short X.Y version.
+# version = '1'
+# The full version, including alpha/beta/rc tags.
+# release = '1'
+
+def fio_version():
+
+	from os.path import exists, dirname, join
+	wsroot = dirname(dirname(__file__))
+	version_file = join(wsroot, "FIO-VERSION-FILE")
+	if not exists(version_file):
+		version_gen = join(wsroot, "FIO-VERSION-GEN")
+		from subprocess import call
+		rc = call(version_gen, shell=True, cwd=wsroot)
+		if rc:
+			print("Couldn't generate version file. rc=%r" % rc)
+			return "Unknown", "Unknown"
+
+	vsl = open(version_file).read().strip().split('-')
+	version = vsl[1]
+	release = '-'.join(vsl[1:])
+	return version, release
+
+version, release = fio_version()
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#
+# today = ''
+#
+# Else, today_fmt is used as the format for a strftime call.
+#
+# today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# These patterns also affect html_static_path and html_extra_path
+exclude_patterns = ['output', 'Thumbs.db', '.DS_Store', 'fio_examples.rst']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#
+# default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#
+# add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#
+# add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#
+# show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+# modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+# keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'alabaster'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+# html_theme_path = []
+
+# The name for this set of Sphinx documents.
+# "<project> v<release> documentation" by default.
+#
+# html_title = 'fio v1'
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#
+# html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#
+# html_logo = None
+
+# The name of an image file (relative to this directory) to use as a favicon of
+# the docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#
+# html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#
+# html_extra_path = []
+
+# If not None, a 'Last updated on:' timestamp is inserted at every page
+# bottom, using the given strftime format.
+# The empty string is equivalent to '%b %d, %Y'.
+#
+# html_last_updated_fmt = None
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#
+# html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#
+# html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#
+# html_additional_pages = {}
+
+# If false, no module index is generated.
+#
+# html_domain_indices = True
+
+# If false, no index is generated.
+#
+# html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#
+# html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#
+# html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#
+# html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#
+# html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#
+# html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+# html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+#   'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja'
+#   'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh'
+#
+# html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# 'ja' uses this config value.
+# A 'zh' user can customize the `jieba` dictionary path.
+#
+# html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#
+# html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'fiodoc'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+     # The paper size ('letterpaper' or 'a4paper').
+     #
+     # 'papersize': 'letterpaper',
+
+     # The font size ('10pt', '11pt' or '12pt').
+     #
+     # 'pointsize': '10pt',
+
+     # Additional stuff for the LaTeX preamble.
+     #
+     # 'preamble': '',
+
+     # Latex figure (float) alignment
+     #
+     # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'fio.tex', 'fio Documentation',
+     'a', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#
+# latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#
+# latex_use_parts = False
+
+# If true, show page references after internal links.
+#
+# latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#
+# latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#
+# latex_appendices = []
+
+# If false, will not define \strong, \code, \titleref, \crossref ... but only
+# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added
+# packages.
+#
+# latex_keep_old_macro_names = True
+
+# If false, no module index is generated.
+#
+# latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    ('fio_man', 'fio', 'flexible I/O tester',
+     [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#
+# man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'fio', 'fio Documentation',
+     author, 'fio', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#
+# texinfo_appendices = []
+
+# If false, no module index is generated.
+#
+# texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#
+# texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#
+# texinfo_no_detailmenu = False
diff --git a/doc/fio_doc.rst b/doc/fio_doc.rst
new file mode 100644
index 0000000..b5987b5
--- /dev/null
+++ b/doc/fio_doc.rst
@@ -0,0 +1,51 @@
+fio - Flexible I/O tester rev. |version|
+========================================
+
+
+.. include:: ../README
+
+
+.. include:: ../HOWTO
+
+
+
+Examples
+========
+
+.. include:: fio_examples.rst
+
+
+
+TODO
+====
+
+
+GFIO TODO
+---------
+
+.. include:: ../GFIO-TODO
+
+
+Server TODO
+-----------
+
+.. include:: ../SERVER-TODO
+
+
+Steady State TODO
+-----------------
+
+.. include:: ../STEADYSTATE-TODO
+
+
+
+Moral License
+=============
+
+.. include:: ../MORAL-LICENSE
+
+
+License
+=======
+
+.. literalinclude:: ../COPYING
diff --git a/doc/fio_examples.rst b/doc/fio_examples.rst
new file mode 100644
index 0000000..ae0ef6f
--- /dev/null
+++ b/doc/fio_examples.rst
@@ -0,0 +1,62 @@
+Some job file examples.
+
+
+Poisson request flow
+--------------------
+
+.. only:: builder_html
+
+:download:`Download poisson-rate-submission.fio <../examples/poisson-rate-submission.fio>`
+
+.. literalinclude:: ../examples/poisson-rate-submission.fio
+	:language: ini
+
+Latency profile
+---------------
+
+.. only:: builder_html
+
+:download:`Download latency-profile.fio <../examples/latency-profile.fio>`
+
+.. literalinclude:: ../examples/latency-profile.fio
+	:language: ini
+
+Read 4 files with aio at different depths
+-----------------------------------------
+
+.. only:: builder_html
+
+:download:`Download aio-read.fio <../examples/aio-read.fio>`
+
+.. literalinclude:: ../examples/aio-read.fio
+	:language: ini
+
+Read backwards in a file
+------------------------
+
+.. only:: builder_html
+
+:download:`Download backwards-read.fio <../examples/backwards-read.fio>`
+
+.. literalinclude:: ../examples/backwards-read.fio
+	:language: ini
+
+Basic verification
+------------------
+
+.. only:: builder_html
+
+:download:`Download basic-verify.fio <../examples/basic-verify.fio>`
+
+.. literalinclude:: ../examples/basic-verify.fio
+	:language: ini
+
+Fixed rate submission
+---------------------
+
+.. only:: builder_html
+
+:download:`Download fixed-rate-submission.fio <../examples/fixed-rate-submission.fio>`
+
+.. literalinclude:: ../examples/fixed-rate-submission.fio
+	:language: ini
diff --git a/doc/fio_man.rst b/doc/fio_man.rst
new file mode 100644
index 0000000..c6a6438
--- /dev/null
+++ b/doc/fio_man.rst
@@ -0,0 +1,12 @@
+:orphan:
+
+Fio Manpage
+===========
+
+(rev. |release|)
+
+
+.. include:: ../README
+
+
+.. include:: ../HOWTO
diff --git a/doc/index.rst b/doc/index.rst
new file mode 100644
index 0000000..b98d997
--- /dev/null
+++ b/doc/index.rst
@@ -0,0 +1,25 @@
+.. FIO documentation master file, created by
+   sphinx-quickstart on Thu Mar 20 16:24:25 2015.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to FIO's documentation!
+===============================
+
+**Version:** |release|
+
+Contents:
+
+.. toctree::
+   :maxdepth: 3
+   :numbered:
+
+	fio - Flexible I/O tester |version| <fio_doc>
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`search`
+
diff --git a/doc/make.bat b/doc/make.bat
new file mode 100644
index 0000000..71fa19c
--- /dev/null
+++ b/doc/make.bat
@@ -0,0 +1,281 @@
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+	set SPHINXBUILD=sphinx-build
+)
+set BUILDDIR=_build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
+set I18NSPHINXOPTS=%SPHINXOPTS% .
+if NOT "%PAPER%" == "" (
+	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+	:help
+	echo.Please use `make ^<target^>` where ^<target^> is one of
+	echo.  html       to make standalone HTML files
+	echo.  dirhtml    to make HTML files named index.html in directories
+	echo.  singlehtml to make a single large HTML file
+	echo.  pickle     to make pickle files
+	echo.  json       to make JSON files
+	echo.  htmlhelp   to make HTML files and a HTML help project
+	echo.  qthelp     to make HTML files and a qthelp project
+	echo.  devhelp    to make HTML files and a Devhelp project
+	echo.  epub       to make an epub
+	echo.  epub3      to make an epub3
+	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+	echo.  text       to make text files
+	echo.  man        to make manual pages
+	echo.  texinfo    to make Texinfo files
+	echo.  gettext    to make PO message catalogs
+	echo.  changes    to make an overview over all changed/added/deprecated items
+	echo.  xml        to make Docutils-native XML files
+	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
+	echo.  linkcheck  to check all external links for integrity
+	echo.  doctest    to run all doctests embedded in the documentation if enabled
+	echo.  coverage   to run coverage check of the documentation if enabled
+	echo.  dummy      to check syntax errors of document sources
+	goto end
+)
+
+if "%1" == "clean" (
+	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+	del /q /s %BUILDDIR%\*
+	goto end
+)
+
+
+REM Check if sphinx-build is available and fallback to Python version if any
+%SPHINXBUILD% 1>NUL 2>NUL
+if errorlevel 9009 goto sphinx_python
+goto sphinx_ok
+
+:sphinx_python
+
+set SPHINXBUILD=python -m sphinx.__init__
+%SPHINXBUILD% 2> nul
+if errorlevel 9009 (
+	echo.
+	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+	echo.installed, then set the SPHINXBUILD environment variable to point
+	echo.to the full path of the 'sphinx-build' executable. Alternatively you
+	echo.may add the Sphinx directory to PATH.
+	echo.
+	echo.If you don't have Sphinx installed, grab it from
+	echo.http://sphinx-doc.org/
+	exit /b 1
+)
+
+:sphinx_ok
+
+
+if "%1" == "html" (
+	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+	goto end
+)
+
+if "%1" == "dirhtml" (
+	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+	goto end
+)
+
+if "%1" == "singlehtml" (
+	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
+	goto end
+)
+
+if "%1" == "pickle" (
+	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can process the pickle files.
+	goto end
+)
+
+if "%1" == "json" (
+	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can process the JSON files.
+	goto end
+)
+
+if "%1" == "htmlhelp" (
+	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+	goto end
+)
+
+if "%1" == "qthelp" (
+	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\fio.qhcp
+	echo.To view the help file:
+	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\fio.ghc
+	goto end
+)
+
+if "%1" == "devhelp" (
+	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished.
+	goto end
+)
+
+if "%1" == "epub" (
+	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The epub file is in %BUILDDIR%/epub.
+	goto end
+)
+
+if "%1" == "epub3" (
+	%SPHINXBUILD% -b epub3 %ALLSPHINXOPTS% %BUILDDIR%/epub3
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The epub3 file is in %BUILDDIR%/epub3.
+	goto end
+)
+
+if "%1" == "latex" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+	goto end
+)
+
+if "%1" == "latexpdf" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	cd %BUILDDIR%/latex
+	make all-pdf
+	cd %~dp0
+	echo.
+	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+	goto end
+)
+
+if "%1" == "latexpdfja" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	cd %BUILDDIR%/latex
+	make all-pdf-ja
+	cd %~dp0
+	echo.
+	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
+	goto end
+)
+
+if "%1" == "text" (
+	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The text files are in %BUILDDIR%/text.
+	goto end
+)
+
+if "%1" == "man" (
+	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The manual pages are in %BUILDDIR%/man.
+	goto end
+)
+
+if "%1" == "texinfo" (
+	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
+	goto end
+)
+
+if "%1" == "gettext" (
+	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
+	goto end
+)
+
+if "%1" == "changes" (
+	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.The overview file is in %BUILDDIR%/changes.
+	goto end
+)
+
+if "%1" == "linkcheck" (
+	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+	goto end
+)
+
+if "%1" == "doctest" (
+	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+	goto end
+)
+
+if "%1" == "coverage" (
+	%SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Testing of coverage in the sources finished, look at the ^
+results in %BUILDDIR%/coverage/python.txt.
+	goto end
+)
+
+if "%1" == "xml" (
+	%SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The XML files are in %BUILDDIR%/xml.
+	goto end
+)
+
+if "%1" == "pseudoxml" (
+	%SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml.
+	goto end
+)
+
+if "%1" == "dummy" (
+	%SPHINXBUILD% -b dummy %ALLSPHINXOPTS% %BUILDDIR%/dummy
+	if errorlevel 1 exit /b 1
+	echo.
+	echo.Build finished. Dummy builder generates no files.
+	goto end
+)
+
+:end
diff --git a/engines/binject.c b/engines/binject.c
index f8e83cd..932534a 100644
--- a/engines/binject.c
+++ b/engines/binject.c
@@ -94,7 +94,7 @@
 				 unsigned int max,
 				 const struct timespec fio_unused *t)
 {
-	struct binject_data *bd = td->io_ops->data;
+	struct binject_data *bd = td->io_ops_data;
 	int left = max, ret, r = 0, ev_index = 0;
 	void *buf = bd->cmds;
 	unsigned int i, events;
@@ -185,7 +185,7 @@
 
 static int fio_binject_prep(struct thread_data *td, struct io_u *io_u)
 {
-	struct binject_data *bd = td->io_ops->data;
+	struct binject_data *bd = td->io_ops_data;
 	struct b_user_cmd *buc = &io_u->buc;
 	struct binject_file *bf = FILE_ENG_DATA(io_u->file);
 
@@ -234,7 +234,7 @@
 
 static struct io_u *fio_binject_event(struct thread_data *td, int event)
 {
-	struct binject_data *bd = td->io_ops->data;
+	struct binject_data *bd = td->io_ops_data;
 
 	return bd->events[event];
 }
@@ -351,7 +351,7 @@
 	if (ret)
 		return 1;
 
-	if (f->filetype != FIO_TYPE_BD) {
+	if (f->filetype != FIO_TYPE_BLOCK) {
 		log_err("fio: binject only works with block devices\n");
 		goto err_close;
 	}
@@ -376,7 +376,7 @@
 
 static void fio_binject_cleanup(struct thread_data *td)
 {
-	struct binject_data *bd = td->io_ops->data;
+	struct binject_data *bd = td->io_ops_data;
 
 	if (bd) {
 		free(bd->events);
@@ -406,7 +406,7 @@
 	bd->fd_flags = malloc(sizeof(int) * td->o.nr_files);
 	memset(bd->fd_flags, 0, sizeof(int) * td->o.nr_files);
 
-	td->io_ops->data = bd;
+	td->io_ops_data = bd;
 	return 0;
 }
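
The binject hunks above, like the other engine hunks later in this patch, move engine-private state from the shared io_ops descriptor (td->io_ops->data) onto the per-thread object itself (td->io_ops_data). A minimal standalone sketch of that ownership pattern follows; the types below are illustrative stand-ins, not fio's real thread_data or binject_data.

/*
 * Sketch of the private-data pattern the hunks switch to: engine state
 * hangs off a per-thread io_ops_data pointer. Names are stand-ins.
 */
#include <stdio.h>
#include <stdlib.h>

struct engine_data {		/* plays the role of binject_data */
	int fd_count;
};

struct thread_ctx {		/* plays the role of thread_data */
	void *io_ops_data;	/* per-thread engine private data */
};

static int engine_init(struct thread_ctx *td)
{
	struct engine_data *bd = calloc(1, sizeof(*bd));

	if (!bd)
		return 1;
	bd->fd_count = 4;
	td->io_ops_data = bd;	/* was td->io_ops->data before the patch */
	return 0;
}

static void engine_cleanup(struct thread_ctx *td)
{
	struct engine_data *bd = td->io_ops_data;

	if (bd) {
		free(bd);
		td->io_ops_data = NULL;
	}
}

int main(void)
{
	struct thread_ctx td = { .io_ops_data = NULL };

	if (engine_init(&td))
		return 1;
	printf("fd_count=%d\n", ((struct engine_data *)td.io_ops_data)->fd_count);
	engine_cleanup(&td);
	return 0;
}

The same substitution repeats in the cpu, e4defrag, glusterfs, guasi, libaio, libhdfs and net hunks below.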
 
diff --git a/engines/cpu.c b/engines/cpu.c
index 7e4d737..d0b4a89 100644
--- a/engines/cpu.c
+++ b/engines/cpu.c
@@ -6,6 +6,7 @@
  *
  */
 #include "../fio.h"
+#include "../optgroup.h"
 
 struct cpu_options {
 	void *pad;
@@ -21,7 +22,7 @@
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct cpu_options, cpuload),
 		.help	= "Use this percentage of CPU",
-		.category = FIO_OPT_C_GENERAL,
+		.category = FIO_OPT_C_ENGINE,
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
@@ -33,7 +34,7 @@
 		.def	= "50000",
 		.parent = "cpuload",
 		.hide	= 1,
-		.category = FIO_OPT_C_GENERAL,
+		.category = FIO_OPT_C_ENGINE,
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
@@ -43,7 +44,7 @@
 		.off1	= offsetof(struct cpu_options, exit_io_done),
 		.help	= "Exit when IO threads finish",
 		.def	= "0",
-		.category = FIO_OPT_C_GENERAL,
+		.category = FIO_OPT_C_ENGINE,
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
@@ -87,8 +88,8 @@
 
 	o->nr_files = o->open_files = 1;
 
-	log_info("%s: ioengine=cpu, cpuload=%u, cpucycle=%u\n", td->o.name,
-						co->cpuload, co->cpucycle);
+	log_info("%s: ioengine=%s, cpuload=%u, cpucycle=%u\n",
+		td->o.name, td->io_ops->name, co->cpuload, co->cpucycle);
 
 	return 0;
 }
diff --git a/engines/dev-dax.c b/engines/dev-dax.c
new file mode 100644
index 0000000..235a31e
--- /dev/null
+++ b/engines/dev-dax.c
@@ -0,0 +1,348 @@
+/*
+ * device DAX engine
+ *
+ * IO engine that reads/writes from files by doing memcpy to/from
+ * a memory mapped region of DAX enabled device.
+ *
+ * Copyright (C) 2016 Intel Corp
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/*
+ * device dax engine
+ * IO engine that access a DAX device directly for read and write data
+ *
+ * To use:
+ *   ioengine=dev-dax
+ *
+ *   Other relevant settings:
+ *     iodepth=1
+ *     direct=0	   REQUIRED
+ *     filename=/dev/daxN.N
+ *     bs=2m
+ *
+ *     direct should be left to 0. Using dev-dax implies that memory access
+ *     is direct. However, dev-dax does not support O_DIRECT flag by design
+ *     since it is not necessary.
+ *
+ *     bs should, at a minimum, adhere to the device dax alignment.
+ *
+ * libpmem.so
+ *   By default, the dev-dax engine will let the system find the libpmem.so
+ *   that it uses. You can use an alternative libpmem by setting the
+ *   FIO_PMEM_LIB environment variable to the full path to the desired
+ *   libpmem.so.
+ */
+
+#include <stdio.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <libgen.h>
+#include <libpmem.h>
+
+#include "../fio.h"
+#include "../verify.h"
+
+/*
+ * Limits us to 1GiB of mapped files in total to model after
+ * mmap engine behavior
+ */
+#define MMAP_TOTAL_SZ	(1 * 1024 * 1024 * 1024UL)
+
+struct fio_devdax_data {
+	void *devdax_ptr;
+	size_t devdax_sz;
+	off_t devdax_off;
+};
+
+static int fio_devdax_file(struct thread_data *td, struct fio_file *f,
+			   size_t length, off_t off)
+{
+	struct fio_devdax_data *fdd = FILE_ENG_DATA(f);
+	int flags = 0;
+
+	if (td_rw(td))
+		flags = PROT_READ | PROT_WRITE;
+	else if (td_write(td)) {
+		flags = PROT_WRITE;
+
+		if (td->o.verify != VERIFY_NONE)
+			flags |= PROT_READ;
+	} else
+		flags = PROT_READ;
+
+	fdd->devdax_ptr = mmap(NULL, length, flags, MAP_SHARED, f->fd, off);
+	if (fdd->devdax_ptr == MAP_FAILED) {
+		fdd->devdax_ptr = NULL;
+		td_verror(td, errno, "mmap");
+	}
+
+	if (td->error && fdd->devdax_ptr)
+		munmap(fdd->devdax_ptr, length);
+
+	return td->error;
+}
+
+/*
+ * Just mmap an appropriate portion, we cannot mmap the full extent
+ */
+static int fio_devdax_prep_limited(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	struct fio_devdax_data *fdd = FILE_ENG_DATA(f);
+
+	if (io_u->buflen > f->real_file_size) {
+		log_err("dev-dax: bs too big for dev-dax engine\n");
+		return EIO;
+	}
+
+	fdd->devdax_sz = min(MMAP_TOTAL_SZ, f->real_file_size);
+	if (fdd->devdax_sz > f->io_size)
+		fdd->devdax_sz = f->io_size;
+
+	fdd->devdax_off = io_u->offset;
+
+	return fio_devdax_file(td, f, fdd->devdax_sz, fdd->devdax_off);
+}
+
+/*
+ * Attempt to mmap the entire file
+ */
+static int fio_devdax_prep_full(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	struct fio_devdax_data *fdd = FILE_ENG_DATA(f);
+	int ret;
+
+	if (fio_file_partial_mmap(f))
+		return EINVAL;
+
+	if (io_u->offset != (size_t) io_u->offset ||
+	    f->io_size != (size_t) f->io_size) {
+		fio_file_set_partial_mmap(f);
+		return EINVAL;
+	}
+
+	fdd->devdax_sz = f->io_size;
+	fdd->devdax_off = 0;
+
+	ret = fio_devdax_file(td, f, fdd->devdax_sz, fdd->devdax_off);
+	if (ret)
+		fio_file_set_partial_mmap(f);
+
+	return ret;
+}
+
+static int fio_devdax_prep(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	struct fio_devdax_data *fdd = FILE_ENG_DATA(f);
+	int ret;
+
+	/*
+	 * It fits within existing mapping, use it
+	 */
+	if (io_u->offset >= fdd->devdax_off &&
+	    io_u->offset + io_u->buflen < fdd->devdax_off + fdd->devdax_sz)
+		goto done;
+
+	/*
+	 * unmap any existing mapping
+	 */
+	if (fdd->devdax_ptr) {
+		if (munmap(fdd->devdax_ptr, fdd->devdax_sz) < 0)
+			return errno;
+		fdd->devdax_ptr = NULL;
+	}
+
+	if (fio_devdax_prep_full(td, io_u)) {
+		td_clear_error(td);
+		ret = fio_devdax_prep_limited(td, io_u);
+		if (ret)
+			return ret;
+	}
+
+done:
+	io_u->mmap_data = fdd->devdax_ptr + io_u->offset - fdd->devdax_off -
+				f->file_offset;
+	return 0;
+}
+
+static int fio_devdax_queue(struct thread_data *td, struct io_u *io_u)
+{
+	fio_ro_check(td, io_u);
+	io_u->error = 0;
+
+	switch (io_u->ddir) {
+	case DDIR_READ:
+		memcpy(io_u->xfer_buf, io_u->mmap_data, io_u->xfer_buflen);
+		break;
+	case DDIR_WRITE:
+		pmem_memcpy_persist(io_u->mmap_data, io_u->xfer_buf,
+				    io_u->xfer_buflen);
+		break;
+	case DDIR_SYNC:
+	case DDIR_DATASYNC:
+	case DDIR_SYNC_FILE_RANGE:
+		break;
+	default:
+		io_u->error = EINVAL;
+		break;
+	}
+
+	return FIO_Q_COMPLETED;
+}
+
+static int fio_devdax_init(struct thread_data *td)
+{
+	struct thread_options *o = &td->o;
+
+	if ((o->rw_min_bs & page_mask) &&
+	    (o->fsync_blocks || o->fdatasync_blocks)) {
+		log_err("dev-dax: mmap options dictate a minimum block size of %llu bytes\n",
+			(unsigned long long) page_size);
+		return 1;
+	}
+
+	return 0;
+}
+
+static int fio_devdax_open_file(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_devdax_data *fdd;
+	int ret;
+
+	ret = generic_open_file(td, f);
+	if (ret)
+		return ret;
+
+	fdd = calloc(1, sizeof(*fdd));
+	if (!fdd) {
+		int fio_unused __ret;
+		__ret = generic_close_file(td, f);
+		return 1;
+	}
+
+	FILE_SET_ENG_DATA(f, fdd);
+
+	return 0;
+}
+
+static int fio_devdax_close_file(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_devdax_data *fdd = FILE_ENG_DATA(f);
+
+	FILE_SET_ENG_DATA(f, NULL);
+	free(fdd);
+	fio_file_clear_partial_mmap(f);
+
+	return generic_close_file(td, f);
+}
+
+static int
+fio_devdax_get_file_size(struct thread_data *td, struct fio_file *f)
+{
+	char spath[PATH_MAX];
+	char npath[PATH_MAX];
+	char *rpath;
+	FILE *sfile;
+	uint64_t size;
+	struct stat st;
+	int rc;
+
+	if (fio_file_size_known(f))
+		return 0;
+
+	if (f->filetype != FIO_TYPE_CHAR)
+		return -EINVAL;
+
+	rc = stat(f->file_name, &st);
+	if (rc < 0) {
+		log_err("%s: failed to stat file %s (%s)\n",
+			td->o.name, f->file_name, strerror(errno));
+		return -errno;
+	}
+
+	snprintf(spath, PATH_MAX, "/sys/dev/char/%d:%d/subsystem",
+		 major(st.st_rdev), minor(st.st_rdev));
+
+	rpath = realpath(spath, npath);
+	if (!rpath) {
+		log_err("%s: realpath on %s failed (%s)\n",
+			td->o.name, spath, strerror(errno));
+		return -errno;
+	}
+
+	/* check if DAX device */
+	if (strcmp("/sys/class/dax", rpath)) {
+		log_err("%s: %s not a DAX device!\n",
+			td->o.name, f->file_name);
+	}
+
+	snprintf(spath, PATH_MAX, "/sys/dev/char/%d:%d/size",
+		 major(st.st_rdev), minor(st.st_rdev));
+
+	sfile = fopen(spath, "r");
+	if (!sfile) {
+		log_err("%s: fopen on %s failed (%s)\n",
+			td->o.name, spath, strerror(errno));
+		return 1;
+	}
+
+	rc = fscanf(sfile, "%lu", &size);
+	if (rc < 0) {
+		log_err("%s: fscanf on %s failed (%s)\n",
+			td->o.name, spath, strerror(errno));
+		return 1;
+	}
+
+	f->real_file_size = size;
+
+	fclose(sfile);
+
+	if (f->file_offset > f->real_file_size) {
+		log_err("%s: offset extends end (%llu > %llu)\n", td->o.name,
+					(unsigned long long) f->file_offset,
+					(unsigned long long) f->real_file_size);
+		return 1;
+	}
+
+	fio_file_set_size_known(f);
+	return 0;
+}
+
+static struct ioengine_ops ioengine = {
+	.name		= "dev-dax",
+	.version	= FIO_IOOPS_VERSION,
+	.init		= fio_devdax_init,
+	.prep		= fio_devdax_prep,
+	.queue		= fio_devdax_queue,
+	.open_file	= fio_devdax_open_file,
+	.close_file	= fio_devdax_close_file,
+	.get_file_size	= fio_devdax_get_file_size,
+	.flags		= FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL,
+};
+
+static void fio_init fio_devdax_register(void)
+{
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_devdax_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
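
The new dev-dax engine above maps a window of the DAX character device and services reads with memcpy and writes with pmem_memcpy_persist; fio_devdax_prep() only remaps when an I/O falls outside the current window. Below is a small standalone sketch of the window arithmetic only, with an anonymous mapping standing in for /dev/daxN.N so no libpmem is needed; the values are made up for illustration.

/*
 * Sketch of the window arithmetic in fio_devdax_prep(): for a mapping that
 * starts devdax_off bytes into the file, the buffer for an I/O at io_offset
 * is devdax_ptr + io_offset - devdax_off - file_offset.
 */
#include <stdio.h>
#include <sys/mman.h>
#include <sys/types.h>

int main(void)
{
	size_t window_sz = 4UL << 20;		/* 4 MiB mapped window */
	off_t devdax_off = 2UL << 20;		/* window starts 2 MiB into the file */
	off_t file_offset = 0;			/* job-level file offset */
	off_t io_offset = (2UL << 20) + 8192;	/* current io_u->offset */
	void *devdax_ptr, *buf;

	/* anonymous mapping stands in for the mmap() of /dev/daxN.N */
	devdax_ptr = mmap(NULL, window_sz, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (devdax_ptr == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* same expression as the engine's io_u->mmap_data assignment */
	buf = (char *) devdax_ptr + io_offset - devdax_off - file_offset;

	printf("window %p, io buffer %p (offset into window: %ld)\n",
	       devdax_ptr, buf, (long) ((char *) buf - (char *) devdax_ptr));

	munmap(devdax_ptr, window_sz);
	return 0;
}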
diff --git a/engines/e4defrag.c b/engines/e4defrag.c
index d6113a9..4b44488 100644
--- a/engines/e4defrag.c
+++ b/engines/e4defrag.c
@@ -17,6 +17,7 @@
 #include <fcntl.h>
 
 #include "../fio.h"
+#include "../optgroup.h"
 
 #ifndef EXT4_IOC_MOVE_EXT
 #define EXT4_IOC_MOVE_EXT               _IOWR('f', 15, struct move_extent)
@@ -44,6 +45,7 @@
 static struct fio_option options[] = {
 	{
 		.name	= "donorname",
+		.lname	= "Donor Name",
 		.type	= FIO_OPT_STR_STORE,
 		.off1	= offsetof(struct e4defrag_options, donor_name),
 		.help	= "File used as a block donor",
@@ -52,6 +54,7 @@
 	},
 	{
 		.name	= "inplace",
+		.lname	= "In Place",
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct e4defrag_options, inplace),
 		.minval	= 0,
@@ -92,7 +95,7 @@
 	ed->donor_fd = open(donor_name, O_CREAT|O_WRONLY, 0644);
 	if (ed->donor_fd < 0) {
 		td_verror(td, errno, "io_queue_init");
-		log_err("Can't open donor file %s err:%d", donor_name, ed->donor_fd);
+		log_err("Can't open donor file %s err:%d\n", donor_name, ed->donor_fd);
 		free(ed);
 		return 1;
 	}
@@ -108,7 +111,7 @@
 		goto err;
 
 	ed->bsz = stub.st_blksize;
-	td->io_ops->data = ed;
+	td->io_ops_data = ed;
 	return 0;
 err:
 	td_verror(td, errno, "io_queue_init");
@@ -119,7 +122,7 @@
 
 static void fio_e4defrag_cleanup(struct thread_data *td)
 {
-	struct e4defrag_data *ed = td->io_ops->data;
+	struct e4defrag_data *ed = td->io_ops_data;
 	if (ed) {
 		if (ed->donor_fd >= 0)
 			close(ed->donor_fd);
@@ -135,7 +138,7 @@
 	unsigned long long len;
 	struct move_extent me;
 	struct fio_file *f = io_u->file;
-	struct e4defrag_data *ed = td->io_ops->data;
+	struct e4defrag_data *ed = td->io_ops_data;
 	struct e4defrag_options *o = td->eo;
 
 	fio_ro_check(td, io_u);
@@ -169,8 +172,13 @@
 		len = io_u->xfer_buflen;
 
 	if (len != io_u->xfer_buflen) {
-		io_u->resid = io_u->xfer_buflen - len;
-		io_u->error = 0;
+		if (len) {
+			io_u->resid = io_u->xfer_buflen - len;
+			io_u->error = 0;
+		} else {
+			/* access beyond i_size */
+			io_u->error = EINVAL;
+		}
 	}
 	if (ret)
 		io_u->error = errno;
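
The e4defrag hunk above tightens short-transfer handling: a partial EXT4_IOC_MOVE_EXT result is reported through io_u->resid, while a zero-byte result (an access past i_size) now surfaces as EINVAL. A minimal sketch of that accounting, using plain stand-in fields rather than fio's io_u:

/*
 * Sketch of the short-transfer accounting added above: partial results set
 * a residual count, zero-length results become EINVAL.
 */
#include <errno.h>
#include <stdio.h>

struct fake_io_u {
	unsigned long long xfer_buflen;
	unsigned long long resid;
	int error;
};

static void account_result(struct fake_io_u *io_u, unsigned long long len)
{
	if (len != io_u->xfer_buflen) {
		if (len) {
			io_u->resid = io_u->xfer_buflen - len;
			io_u->error = 0;
		} else {
			io_u->error = EINVAL;	/* access beyond i_size */
		}
	}
}

int main(void)
{
	struct fake_io_u a = { .xfer_buflen = 65536 };
	struct fake_io_u b = { .xfer_buflen = 65536 };

	account_result(&a, 32768);	/* partial: resid = 32768, no error */
	account_result(&b, 0);		/* nothing moved: EINVAL */
	printf("a: resid=%llu error=%d\n", a.resid, a.error);
	printf("b: resid=%llu error=%d\n", b.resid, b.error);
	return 0;
}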
diff --git a/engines/ftruncate.c b/engines/ftruncate.c
new file mode 100644
index 0000000..e86dbac
--- /dev/null
+++ b/engines/ftruncate.c
@@ -0,0 +1,56 @@
+/*
+ * ftruncate: ioengine for git://git.kernel.dk/fio.git
+ *
+ * IO engine that does regular truncates to simulate data transfer
+ * as fio ioengine.
+ * DDIR_WRITE does ftruncate
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <errno.h>
+#include <assert.h>
+#include <fcntl.h>
+
+#include "../fio.h"
+#include "../filehash.h"
+
+static int fio_ftruncate_queue(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	int ret;
+	fio_ro_check(td, io_u);
+
+	if (io_u->ddir != DDIR_WRITE) {
+		io_u->error = EINVAL;
+		return FIO_Q_COMPLETED;
+	}
+	ret = ftruncate(f->fd, io_u->offset);
+
+	if (ret)
+		io_u->error = errno;
+
+	return FIO_Q_COMPLETED;
+}
+
+static struct ioengine_ops ioengine = {
+	.name		= "ftruncate",
+	.version	= FIO_IOOPS_VERSION,
+	.queue		= fio_ftruncate_queue,
+	.open_file	= generic_open_file,
+	.close_file	= generic_close_file,
+	.get_file_size	= generic_get_file_size,
+	.flags		= FIO_SYNCIO | FIO_FAKEIO
+};
+
+static void fio_init fio_syncio_register(void)
+{
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_syncio_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
diff --git a/engines/glusterfs.c b/engines/glusterfs.c
index 507cd25..2abc283 100644
--- a/engines/glusterfs.c
+++ b/engines/glusterfs.c
@@ -6,6 +6,7 @@
  */
 
 #include "gfapi.h"
+#include "../optgroup.h"
 
 struct fio_option gfapi_options[] = {
 	{
@@ -40,7 +41,7 @@
 
 	dprint(FD_IO, "fio setup\n");
 
-	if (td->io_ops->data)
+	if (td->io_ops_data)
 		return 0;
 
 	g = malloc(sizeof(struct gf_data));
@@ -76,19 +77,19 @@
 		goto cleanup;
 	}
 	dprint(FD_FILE, "fio setup %p\n", g->fs);
-	td->io_ops->data = g;
+	td->io_ops_data = g;
 	return 0;
 cleanup:
 	if (g->fs)
 		glfs_fini(g->fs);
 	free(g);
-	td->io_ops->data = NULL;
+	td->io_ops_data = NULL;
 	return r;
 }
 
 void fio_gf_cleanup(struct thread_data *td)
 {
-	struct gf_data *g = td->io_ops->data;
+	struct gf_data *g = td->io_ops_data;
 
 	if (g) {
 		if (g->aio_events)
@@ -98,7 +99,7 @@
 		if (g->fs)
 			glfs_fini(g->fs);
 		free(g);
-		td->io_ops->data = NULL;
+		td->io_ops_data = NULL;
 	}
 }
 
@@ -106,7 +107,7 @@
 {
 	struct stat buf;
 	int ret;
-	struct gf_data *g = td->io_ops->data;
+	struct gf_data *g = td->io_ops_data;
 
 	dprint(FD_FILE, "get file size %s\n", f->file_name);
 
@@ -134,7 +135,7 @@
 
 	int flags = 0;
 	int ret = 0;
-	struct gf_data *g = td->io_ops->data;
+	struct gf_data *g = td->io_ops_data;
 	struct stat sb = { 0, };
 
 	if (td_write(td)) {
@@ -267,7 +268,7 @@
 int fio_gf_close_file(struct thread_data *td, struct fio_file *f)
 {
 	int ret = 0;
-	struct gf_data *g = td->io_ops->data;
+	struct gf_data *g = td->io_ops_data;
 
 	dprint(FD_FILE, "fd close %s\n", f->file_name);
 
@@ -283,7 +284,7 @@
 int fio_gf_unlink_file(struct thread_data *td, struct fio_file *f)
 {
 	int ret = 0;
-	struct gf_data *g = td->io_ops->data;
+	struct gf_data *g = td->io_ops_data;
 
 	dprint(FD_FILE, "fd unlink %s\n", f->file_name);
 
@@ -299,7 +300,7 @@
 		g->fd = NULL;
 		free(g);
 	}
-	td->io_ops->data = NULL;
+	td->io_ops_data = NULL;
 
 	return ret;
 }
diff --git a/engines/glusterfs_async.c b/engines/glusterfs_async.c
index 7c2c139..f46cb26 100644
--- a/engines/glusterfs_async.c
+++ b/engines/glusterfs_async.c
@@ -13,7 +13,7 @@
 
 static struct io_u *fio_gf_event(struct thread_data *td, int event)
 {
-	struct gf_data *gf_data = td->io_ops->data;
+	struct gf_data *gf_data = td->io_ops_data;
 
 	dprint(FD_IO, "%s\n", __FUNCTION__);
 	return gf_data->aio_events[event];
@@ -22,7 +22,7 @@
 static int fio_gf_getevents(struct thread_data *td, unsigned int min,
 			    unsigned int max, const struct timespec *t)
 {
-	struct gf_data *g = td->io_ops->data;
+	struct gf_data *g = td->io_ops_data;
 	unsigned int events = 0;
 	struct io_u *io_u;
 	int i;
@@ -99,7 +99,7 @@
 static int fio_gf_async_queue(struct thread_data fio_unused * td,
 			      struct io_u *io_u)
 {
-	struct gf_data *g = td->io_ops->data;
+	struct gf_data *g = td->io_ops_data;
 	int r;
 
 	dprint(FD_IO, "%s op %s\n", __FUNCTION__, io_ddir_name(io_u->ddir));
@@ -137,7 +137,7 @@
 	return FIO_Q_COMPLETED;
 }
 
-int fio_gf_async_setup(struct thread_data *td)
+static int fio_gf_async_setup(struct thread_data *td)
 {
 	struct gf_data *g;
 	int r;
@@ -150,7 +150,7 @@
 		return r;
 
 	td->o.use_thread = 1;
-	g = td->io_ops->data;
+	g = td->io_ops_data;
 	g->aio_events = calloc(td->o.iodepth, sizeof(struct io_u *));
 	if (!g->aio_events) {
 		r = -ENOMEM;
diff --git a/engines/glusterfs_sync.c b/engines/glusterfs_sync.c
index 6de4ee2..25d05b2 100644
--- a/engines/glusterfs_sync.c
+++ b/engines/glusterfs_sync.c
@@ -7,11 +7,11 @@
 
 #include "gfapi.h"
 
-#define LAST_POS(f)	((f)->engine_data)
+#define LAST_POS(f)	((f)->engine_pos)
 static int fio_gf_prep(struct thread_data *td, struct io_u *io_u)
 {
 	struct fio_file *f = io_u->file;
-	struct gf_data *g = td->io_ops->data;
+	struct gf_data *g = td->io_ops_data;
 
 	dprint(FD_FILE, "fio prep\n");
 
@@ -31,7 +31,7 @@
 
 static int fio_gf_queue(struct thread_data *td, struct io_u *io_u)
 {
-	struct gf_data *g = td->io_ops->data;
+	struct gf_data *g = td->io_ops_data;
 	int ret = 0;
 
 	dprint(FD_FILE, "fio queue len %lu\n", io_u->xfer_buflen);
diff --git a/engines/guasi.c b/engines/guasi.c
index c586f09..eb12c89 100644
--- a/engines/guasi.c
+++ b/engines/guasi.c
@@ -50,7 +50,7 @@
 
 static struct io_u *fio_guasi_event(struct thread_data *td, int event)
 {
-	struct guasi_data *ld = td->io_ops->data;
+	struct guasi_data *ld = td->io_ops_data;
 	struct io_u *io_u;
 	struct guasi_reqinfo rinf;
 
@@ -82,7 +82,7 @@
 static int fio_guasi_getevents(struct thread_data *td, unsigned int min,
 			       unsigned int max, const struct timespec *t)
 {
-	struct guasi_data *ld = td->io_ops->data;
+	struct guasi_data *ld = td->io_ops_data;
 	int n, r;
 	long timeo = -1;
 
@@ -115,7 +115,7 @@
 
 static int fio_guasi_queue(struct thread_data *td, struct io_u *io_u)
 {
-	struct guasi_data *ld = td->io_ops->data;
+	struct guasi_data *ld = td->io_ops_data;
 
 	fio_ro_check(td, io_u);
 
@@ -148,7 +148,7 @@
 
 static int fio_guasi_commit(struct thread_data *td)
 {
-	struct guasi_data *ld = td->io_ops->data;
+	struct guasi_data *ld = td->io_ops_data;
 	int i;
 	struct io_u *io_u;
 	struct fio_file *f;
@@ -198,7 +198,7 @@
 
 static void fio_guasi_cleanup(struct thread_data *td)
 {
-	struct guasi_data *ld = td->io_ops->data;
+	struct guasi_data *ld = td->io_ops_data;
 	int n;
 
 	GDBG_PRINT(("fio_guasi_cleanup(%p)\n", ld));
@@ -235,7 +235,7 @@
 	ld->queued_nr = 0;
 	ld->reqs_nr = 0;
 
-	td->io_ops->data = ld;
+	td->io_ops_data = ld;
 	GDBG_PRINT(("fio_guasi_init(): depth=%d -> %p\n", td->o.iodepth, ld));
 
 	return 0;
diff --git a/engines/libaio.c b/engines/libaio.c
index d4f4830..e15c519 100644
--- a/engines/libaio.c
+++ b/engines/libaio.c
@@ -12,6 +12,8 @@
 #include <libaio.h>
 
 #include "../fio.h"
+#include "../lib/pow2.h"
+#include "../optgroup.h"
 
 static int fio_libaio_commit(struct thread_data *td);
 
@@ -81,7 +83,7 @@
 
 static struct io_u *fio_libaio_event(struct thread_data *td, int event)
 {
-	struct libaio_data *ld = td->io_ops->data;
+	struct libaio_data *ld = td->io_ops_data;
 	struct io_event *ev;
 	struct io_u *io_u;
 
@@ -143,9 +145,9 @@
 static int fio_libaio_getevents(struct thread_data *td, unsigned int min,
 				unsigned int max, const struct timespec *t)
 {
-	struct libaio_data *ld = td->io_ops->data;
+	struct libaio_data *ld = td->io_ops_data;
 	struct libaio_options *o = td->eo;
-	unsigned actual_min = td->o.iodepth_batch_complete == 0 ? 0 : min;
+	unsigned actual_min = td->o.iodepth_batch_complete_min == 0 ? 0 : min;
 	struct timespec __lt, *lt = NULL;
 	int r, events = 0;
 
@@ -179,7 +181,7 @@
 
 static int fio_libaio_queue(struct thread_data *td, struct io_u *io_u)
 {
-	struct libaio_data *ld = td->io_ops->data;
+	struct libaio_data *ld = td->io_ops_data;
 
 	fio_ro_check(td, io_u);
 
@@ -236,7 +238,7 @@
 
 static int fio_libaio_commit(struct thread_data *td)
 {
-	struct libaio_data *ld = td->io_ops->data;
+	struct libaio_data *ld = td->io_ops_data;
 	struct iocb **iocbs;
 	struct io_u **io_us;
 	struct timeval tv;
@@ -306,17 +308,24 @@
 
 static int fio_libaio_cancel(struct thread_data *td, struct io_u *io_u)
 {
-	struct libaio_data *ld = td->io_ops->data;
+	struct libaio_data *ld = td->io_ops_data;
 
 	return io_cancel(ld->aio_ctx, &io_u->iocb, ld->aio_events);
 }
 
 static void fio_libaio_cleanup(struct thread_data *td)
 {
-	struct libaio_data *ld = td->io_ops->data;
+	struct libaio_data *ld = td->io_ops_data;
 
 	if (ld) {
-		io_destroy(ld->aio_ctx);
+		/*
+		 * Work-around to avoid huge RCU stalls at exit time. If we
+		 * don't do this here, then it'll be torn down by exit_aio().
+		 * But for that case we can parallellize the freeing, thus
+		 * But for that case we can parallelize the freeing, thus
+		 */
+		if (!(td->flags & TD_F_CHILD))
+			io_destroy(ld->aio_ctx);
 		free(ld->aio_events);
 		free(ld->iocbs);
 		free(ld->io_us);
@@ -354,7 +363,7 @@
 	ld->iocbs = calloc(ld->entries, sizeof(struct iocb *));
 	ld->io_us = calloc(ld->entries, sizeof(struct io_u *));
 
-	td->io_ops->data = ld;
+	td->io_ops_data = ld;
 	return 0;
 }
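
The getevents hunk above switches from iodepth_batch_complete to iodepth_batch_complete_min: a value of 0 keeps the old non-blocking reap (actual_min = 0), while any other value makes the engine wait for at least the requested minimum. A tiny standalone illustration of that selection, written as a hypothetical helper rather than fio's code:

/*
 * Sketch of the reap-minimum selection used by the libaio getevents hunk:
 * batch_complete_min == 0 means poll without waiting.
 */
#include <stdio.h>

static unsigned int reap_min(unsigned int batch_complete_min,
			     unsigned int min_requested)
{
	return batch_complete_min == 0 ? 0 : min_requested;
}

int main(void)
{
	printf("batch_complete_min=0 -> wait for %u events\n", reap_min(0, 4));
	printf("batch_complete_min=2 -> wait for %u events\n", reap_min(2, 4));
	return 0;
}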
 
diff --git a/engines/libhdfs.c b/engines/libhdfs.c
index 658cd6a..96a0871 100644
--- a/engines/libhdfs.c
+++ b/engines/libhdfs.c
@@ -9,69 +9,127 @@
  *
  * thus, random reads and writes can also be achieved with this logic.
  *
- * NOTE: please set environment variables FIO_HDFS_BS and FIO_HDFS_FCOUNT
- * to appropriate value to work this engine properly
- *
  */
 
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/uio.h>
-#include <errno.h>
-#include <assert.h>
+#include <math.h>
+#include <hdfs.h>
 
 #include "../fio.h"
+#include "../optgroup.h"
 
-#include "hdfs.h"
+#define CHUNCK_NAME_LENGTH_MAX 80
+#define CHUNCK_CREATION_BUFFER_SIZE 65536
 
 struct hdfsio_data {
-	char host[256];
-	int port;
 	hdfsFS fs;
 	hdfsFile fp;
-	unsigned long fsbs;
-	unsigned long fscount;
-	unsigned long curr_file_id;
-	unsigned int numjobs;
-	unsigned int fid_correction;
+	uint64_t curr_file_id;
 };
 
-static int fio_hdfsio_setup_fs_params(struct hdfsio_data *hd)
-{
-	/* make sure that hdfsConnect is invoked before executing this function */
-	hdfsSetWorkingDirectory(hd->fs, "/.perftest");
-	hd->fp = hdfsOpenFile(hd->fs, ".fcount", O_RDONLY, 0, 0, 0);
-	if (hd->fp) {
-		hdfsRead(hd->fs, hd->fp, &(hd->fscount), sizeof(hd->fscount));
-		hdfsCloseFile(hd->fs, hd->fp);
-	}
-	hd->fp = hdfsOpenFile(hd->fs, ".fbs", O_RDONLY, 0, 0, 0);
-	if (hd->fp) {
-		hdfsRead(hd->fs, hd->fp, &(hd->fsbs), sizeof(hd->fsbs));
-		hdfsCloseFile(hd->fs, hd->fp);
-	}
+struct hdfsio_options {
+	void *pad;			/* needed because offset can't be 0 for an option defined using offsetof */
+	char *host;
+	char *directory;
+	unsigned int port;
+	unsigned int chunck_size;
+	unsigned int single_instance;
+	unsigned int use_direct;
+};
 
-	return 0;
+static struct fio_option options[] = {
+	{
+		.name	= "namenode",
+		.lname	= "hfds namenode",
+		.type	= FIO_OPT_STR_STORE,
+		.off1   = offsetof(struct hdfsio_options, host),
+		.def    = "localhost",
+		.help	= "Namenode of the HDFS cluster",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_HDFS,
+	},
+	{
+		.name	= "hostname",
+		.lname	= "hfds namenode",
+		.type	= FIO_OPT_STR_STORE,
+		.off1   = offsetof(struct hdfsio_options, host),
+		.def    = "localhost",
+		.help	= "Namenode of the HDFS cluster",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_HDFS,
+	},
+	{
+		.name	= "port",
+		.lname	= "hdfs namenode port",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct hdfsio_options, port),
+		.def    = "9000",
+		.minval	= 1,
+		.maxval	= 65535,
+		.help	= "Port used by the HDFS cluster namenode",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_HDFS,
+	},
+	{
+		.name	= "hdfsdirectory",
+		.lname	= "hfds directory",
+		.type	= FIO_OPT_STR_STORE,
+		.off1   = offsetof(struct hdfsio_options, directory),
+		.def    = "/",
+		.help	= "The HDFS directory where fio will create chunks",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_HDFS,
+	},
+	{
+		.name	= "chunk_size",
+		.alias	= "chunck_size",
+		.lname	= "Chunk size",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct hdfsio_options, chunck_size),
+		.def    = "1048576",
+		.help	= "Size of an individual chunk",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_HDFS,
+	},
+	{
+		.name	= "single_instance",
+		.lname	= "Single Instance",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct hdfsio_options, single_instance),
+		.def    = "1",
+		.help	= "Use a single instance",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_HDFS,
+	},
+	{
+		.name	= "hdfs_use_direct",
+		.lname	= "HDFS Use Direct",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct hdfsio_options, use_direct),
+		.def    = "0",
+		.help	= "Use readDirect instead of hdfsRead",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_HDFS,
+	},
+	{
+		.name	= NULL,
+	},
+};
+
+
+static int get_chunck_name(char *dest, char *file_name, uint64_t chunk_id) {
+	return snprintf(dest, CHUNCK_NAME_LENGTH_MAX, "%s_%lu", file_name, chunk_id);
 }
 
 static int fio_hdfsio_prep(struct thread_data *td, struct io_u *io_u)
 {
-	struct hdfsio_data *hd;
-	hdfsFileInfo *fi;
+	struct hdfsio_options *options = td->eo;
+	struct hdfsio_data *hd = td->io_ops_data;
 	unsigned long f_id;
-	char fname[80];
-	int open_flags = 0;
-
-	hd = td->io_ops->data;
-
-	if (hd->curr_file_id == -1) {
-		/* see comment in fio_hdfsio_setup() function */
-		fio_hdfsio_setup_fs_params(hd);
-	}
+	char fname[CHUNCK_NAME_LENGTH_MAX];
+	int open_flags;
 
 	/* find out file id based on the offset generated by fio */
-	f_id = (io_u->offset / hd->fsbs) + hd->fid_correction;
+	f_id = floor(io_u->offset / options-> chunck_size);
 
 	if (f_id == hd->curr_file_id) {
 		/* file is already open */
@@ -79,46 +137,76 @@
 	}
 
 	if (hd->curr_file_id != -1) {
-		hdfsCloseFile(hd->fs, hd->fp);
+		if ( hdfsCloseFile(hd->fs, hd->fp) == -1) {
+			log_err("hdfs: unable to close file: %s\n", strerror(errno));
+			return errno;
+		}
+		hd->curr_file_id = -1;
 	}
 
-	if (io_u->ddir == DDIR_READ) {
+	if (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_SYNC) {
 		open_flags = O_RDONLY;
 	} else if (io_u->ddir == DDIR_WRITE) {
 		open_flags = O_WRONLY;
 	} else {
 		log_err("hdfs: Invalid I/O Operation\n");
+		return 0;
 	}
-
+	
+	get_chunck_name(fname, io_u->file->file_name, f_id);
+	hd->fp = hdfsOpenFile(hd->fs, fname, open_flags, 0, 0,
+			      options->chunck_size);
+	if(hd->fp == NULL) {
+		log_err("hdfs: unable to open file: %s: %s\n", fname, strerror(errno));
+		return errno;
+	}
 	hd->curr_file_id = f_id;
-	do {
-		sprintf(fname, ".f%lu", f_id);
-		fi = hdfsGetPathInfo(hd->fs, fname);
-		if (fi->mSize >= hd->fsbs || io_u->ddir == DDIR_WRITE) {
-			/* file has enough data to read OR file is opened in write mode */
-			hd->fp =
-			    hdfsOpenFile(hd->fs, fname, open_flags, 0, 0,
-					 hd->fsbs);
-			if (hd->fp) {
-				break;
-			}
-		}
-		/* file is empty, so try next file for reading */
-		f_id = (f_id + 1) % hd->fscount;
-	} while (1);
 
 	return 0;
 }
 
-static int fio_io_end(struct thread_data *td, struct io_u *io_u, int ret)
+static int fio_hdfsio_queue(struct thread_data *td, struct io_u *io_u)
 {
+	struct hdfsio_data *hd = td->io_ops_data;
+	struct hdfsio_options *options = td->eo;
+	int ret;
+	unsigned long offset;
+	
+	offset = io_u->offset % options->chunck_size;
+	
+	if( (io_u->ddir == DDIR_READ || io_u->ddir == DDIR_WRITE) && 
+	     hdfsTell(hd->fs, hd->fp) != offset && hdfsSeek(hd->fs, hd->fp, offset) != 0 ) {
+		log_err("hdfs: seek failed: %s, are you doing random writes smaller than the chunk size?\n", strerror(errno));
+		io_u->error = errno;
+		return FIO_Q_COMPLETED;
+	};
+
+	// do the IO
+	if (io_u->ddir == DDIR_READ) {
+		if (options->use_direct) {
+			ret = readDirect(hd->fs, hd->fp, io_u->xfer_buf, io_u->xfer_buflen);
+		} else {
+			ret = hdfsRead(hd->fs, hd->fp, io_u->xfer_buf, io_u->xfer_buflen);
+		}
+	} else if (io_u->ddir == DDIR_WRITE) {
+		ret = hdfsWrite(hd->fs, hd->fp, io_u->xfer_buf,
+				io_u->xfer_buflen);
+	} else if (io_u->ddir == DDIR_SYNC) {
+		ret = hdfsFlush(hd->fs, hd->fp);
+	} else {
+		log_err("hdfs: Invalid I/O Operation: %d\n", io_u->ddir);
+		ret = EINVAL;
+	}
+
+	// Check if the IO went fine, or is incomplete
 	if (ret != (int)io_u->xfer_buflen) {
 		if (ret >= 0) {
 			io_u->resid = io_u->xfer_buflen - ret;
 			io_u->error = 0;
 			return FIO_Q_COMPLETED;
-		} else
+		} else {
 			io_u->error = errno;
+		}
 	}
 
 	if (io_u->error)
@@ -127,107 +215,200 @@
 	return FIO_Q_COMPLETED;
 }
 
-static int fio_hdfsio_queue(struct thread_data *td, struct io_u *io_u)
-{
-	struct hdfsio_data *hd;
-	int ret = 0;
-
-	hd = td->io_ops->data;
-
-	if (io_u->ddir == DDIR_READ) {
-		ret =
-		    hdfsRead(hd->fs, hd->fp, io_u->xfer_buf, io_u->xfer_buflen);
-	} else if (io_u->ddir == DDIR_WRITE) {
-		ret =
-		    hdfsWrite(hd->fs, hd->fp, io_u->xfer_buf,
-			      io_u->xfer_buflen);
-	} else {
-		log_err("hdfs: Invalid I/O Operation\n");
-	}
-
-	return fio_io_end(td, io_u, ret);
-}
-
 int fio_hdfsio_open_file(struct thread_data *td, struct fio_file *f)
 {
-	struct hdfsio_data *hd;
-
-	hd = td->io_ops->data;
-	hd->fs = hdfsConnect(hd->host, hd->port);
-	hdfsSetWorkingDirectory(hd->fs, "/.perftest");
-	hd->fid_correction = (getpid() % hd->numjobs);
+	if (td->o.odirect) {
+		td->error = EINVAL;
+		return 0;
+	}
 
 	return 0;
 }
 
 int fio_hdfsio_close_file(struct thread_data *td, struct fio_file *f)
 {
-	struct hdfsio_data *hd;
+	struct hdfsio_data *hd = td->io_ops_data;
 
-	hd = td->io_ops->data;
-	hdfsDisconnect(hd->fs);
-
+	if (hd->curr_file_id != -1) {
+		if ( hdfsCloseFile(hd->fs, hd->fp) == -1) {
+			log_err("hdfs: unable to close file: %s\n", strerror(errno));
+			return errno;
+		}
+		hd->curr_file_id = -1;
+	}
 	return 0;
 }
 
+static int fio_hdfsio_init(struct thread_data *td)
+{
+	struct hdfsio_options *options = td->eo;
+	struct hdfsio_data *hd = td->io_ops_data;
+	struct fio_file *f;
+	uint64_t j,k;
+	int i, failure = 0;
+	uint8_t buffer[CHUNCK_CREATION_BUFFER_SIZE];
+	uint64_t bytes_left;	
+	char fname[CHUNCK_NAME_LENGTH_MAX];	
+	hdfsFile fp;
+	hdfsFileInfo *fi;
+	tOffset fi_size;
+
+	for_each_file(td, f, i) {
+		k = 0;
+		for(j=0; j < f->real_file_size; j += options->chunck_size) {
+			get_chunck_name(fname, f->file_name, k++);
+			fi = hdfsGetPathInfo(hd->fs, fname);
+			fi_size = fi ? fi->mSize : 0;
+			// file exists and is big enough, nothing to do
+			if( fi && fi_size >= options->chunck_size) {
+				continue;
+			}
+			fp = hdfsOpenFile(hd->fs, fname, O_WRONLY, 0, 0,
+					  options->chunck_size);
+			if(fp == NULL) {
+				failure = errno;
+				log_err("hdfs: unable to prepare file chunk %s: %s\n", fname, strerror(errno));
+				break;
+			}
+			bytes_left = options->chunck_size;
+			memset(buffer, 0, CHUNCK_CREATION_BUFFER_SIZE);
+			while( bytes_left > CHUNCK_CREATION_BUFFER_SIZE) {
+				if( hdfsWrite(hd->fs, fp, buffer, CHUNCK_CREATION_BUFFER_SIZE)
+				    != CHUNCK_CREATION_BUFFER_SIZE) {
+    					failure = errno;
+	    				log_err("hdfs: unable to prepare file chunk %s: %s\n", fname, strerror(errno));
+					break;
+				};
+				bytes_left -= CHUNCK_CREATION_BUFFER_SIZE;
+			}
+			if(bytes_left > 0) {
+				if( hdfsWrite(hd->fs, fp, buffer, bytes_left)
+				    != bytes_left) {
+					failure = errno;
+					break;
+				};
+			}
+			if( hdfsCloseFile(hd->fs, fp) != 0) {
+				failure = errno;
+				log_err("hdfs: unable to prepare file chunk %s: %s\n", fname, strerror(errno));
+				break;
+			}
+		}
+		if(failure) {
+			break;
+		}
+	}
+	
+	if( !failure ) {
+		fio_file_set_size_known(f);
+	}
+
+	return failure;
+}
+
 static int fio_hdfsio_setup(struct thread_data *td)
 {
 	struct hdfsio_data *hd;
 	struct fio_file *f;
-	static unsigned int numjobs = 1;	/* atleast one job has to be there! */
-	numjobs = (td->o.numjobs > numjobs) ? td->o.numjobs : numjobs;
+	int i;
+	uint64_t file_size, total_file_size;
 
-	if (!td->io_ops->data) {
-		hd = malloc(sizeof(*hd));;
-
+	if (!td->io_ops_data) {
+		hd = malloc(sizeof(*hd));
 		memset(hd, 0, sizeof(*hd));
-		td->io_ops->data = hd;
-
-		/* separate host and port from filename */
-		*(strchr(td->o.filename, ',')) = ' ';
-		sscanf(td->o.filename, "%s%d", hd->host, &(hd->port));
-
-		/* read fbs and fcount and based on that set f->real_file_size */
-		f = td->files[0];
-#if 0
-		/* IMHO, this should be done here instead of fio_hdfsio_prep()
-		 * but somehow calling it here doesn't seem to work,
-		 * some problem with libhdfs that needs to be debugged */
-		hd->fs = hdfsConnect(hd->host, hd->port);
-		fio_hdfsio_setup_fs_params(hd);
-		hdfsDisconnect(hd->fs);
-#else
-		/* so, as an alternate, using environment variables */
-		if (getenv("FIO_HDFS_FCOUNT") && getenv("FIO_HDFS_BS")) {
-			hd->fscount = atol(getenv("FIO_HDFS_FCOUNT"));
-			hd->fsbs = atol(getenv("FIO_HDFS_BS"));
-		} else {
-			log_err("FIO_HDFS_FCOUNT and/or FIO_HDFS_BS not set.\n");
-			return 1;
-		}
-#endif
-		f->real_file_size = hd->fscount * hd->fsbs;
-
-		td->o.nr_files = 1;
+		
 		hd->curr_file_id = -1;
-		hd->numjobs = numjobs;
-		fio_file_set_size_known(f);
+
+		td->io_ops_data = hd;
+	}
+	
+	total_file_size = 0;
+	file_size = 0;
+
+	for_each_file(td, f, i) {
+		if(!td->o.file_size_low) {
+			file_size = floor(td->o.size / td->o.nr_files);
+			total_file_size += file_size;
+		}
+		else if (td->o.file_size_low == td->o.file_size_high)
+			file_size = td->o.file_size_low;
+		else {
+			file_size = get_rand_file_size(td);
+		}
+		f->real_file_size = file_size;
+	}
+	/* If the size doesn't divide evenly by the chunk size,
+	 * make the last file bigger.
+	 * Used only if the file size was not explicitly given
+	 */
+	if (!td->o.file_size_low && total_file_size < td->o.size) {
+		f->real_file_size += (td->o.size - total_file_size);
 	}
 
 	return 0;
 }
 
+static int fio_hdfsio_io_u_init(struct thread_data *td, struct io_u *io_u)
+{
+	struct hdfsio_data *hd = td->io_ops_data;
+	struct hdfsio_options *options = td->eo;
+	int failure;
+	struct hdfsBuilder *bld;
+
+	if (options->host == NULL || options->port == 0) {
+		log_err("hdfs: server not defined\n");
+		return EINVAL;
+	}
+	
+	bld = hdfsNewBuilder();
+	if (!bld) {
+		failure = errno;
+		log_err("hdfs: unable to allocate connect builder\n");
+		return failure;
+	}
+	hdfsBuilderSetNameNode(bld, options->host);
+	hdfsBuilderSetNameNodePort(bld, options->port);
+	if(! options->single_instance) {
+		hdfsBuilderSetForceNewInstance(bld);
+	}
+	hd->fs = hdfsBuilderConnect(bld);
+	
+	/* hdfsSetWorkingDirectory succeeds on a non-existent directory */
+	if (hdfsExists(hd->fs, options->directory) < 0 || hdfsSetWorkingDirectory(hd->fs, options->directory) < 0) {
+		failure = errno;
+		log_err("hdfs: invalid working directory %s: %s\n", options->directory, strerror(errno));
+		return failure;
+	}
+	
+	return 0;
+}
+
+static void fio_hdfsio_io_u_free(struct thread_data *td, struct io_u *io_u)
+{
+	struct hdfsio_data *hd = td->io_ops_data;
+
+	if (hd->fs && hdfsDisconnect(hd->fs) < 0) {
+		log_err("hdfs: disconnect failed: %d\n", errno);
+	}
+}
+
 static struct ioengine_ops ioengine_hdfs = {
 	.name = "libhdfs",
 	.version = FIO_IOOPS_VERSION,
+	.flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NODISKUTIL,
 	.setup = fio_hdfsio_setup,
+	.init = fio_hdfsio_init,
 	.prep = fio_hdfsio_prep,
 	.queue = fio_hdfsio_queue,
 	.open_file = fio_hdfsio_open_file,
 	.close_file = fio_hdfsio_close_file,
-	.flags = FIO_SYNCIO,
+	.io_u_init = fio_hdfsio_io_u_init,
+	.io_u_free = fio_hdfsio_io_u_free,
+	.option_struct_size	= sizeof(struct hdfsio_options),
+	.options		= options,
 };
 
+
 static void fio_init fio_hdfsio_register(void)
 {
 	register_ioengine(&ioengine_hdfs);
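
The reworked libhdfs engine above stripes a job's flat offset space over fixed-size HDFS chunk files: the chunk id is the offset divided by chunck_size, the seek target inside the chunk is the remainder, and each chunk is named <file_name>_<id>. A standalone sketch of that mapping with illustrative values, making no libhdfs calls:

/*
 * Sketch of the offset-to-chunk mapping used by fio_hdfsio_prep() and
 * fio_hdfsio_queue(): chunk id, in-chunk offset, and the chunk file name.
 */
#include <stdio.h>
#include <stdint.h>

#define CHUNCK_NAME_LENGTH_MAX 80

static int get_chunck_name(char *dest, const char *file_name, uint64_t chunk_id)
{
	return snprintf(dest, CHUNCK_NAME_LENGTH_MAX, "%s_%lu", file_name,
			(unsigned long) chunk_id);
}

int main(void)
{
	uint64_t chunck_size = 1048576;			/* engine default */
	uint64_t io_offset = 5 * chunck_size + 4096;	/* sample io_u->offset */
	char fname[CHUNCK_NAME_LENGTH_MAX];

	uint64_t f_id = io_offset / chunck_size;	/* which chunk to open */
	uint64_t in_chunk = io_offset % chunck_size;	/* where to seek in it */

	get_chunck_name(fname, "testfile", f_id);
	printf("offset %llu -> chunk %s, in-chunk offset %llu\n",
	       (unsigned long long) io_offset, fname,
	       (unsigned long long) in_chunk);
	return 0;
}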
diff --git a/engines/mmap.c b/engines/mmap.c
index 69add78..bc038f4 100644
--- a/engines/mmap.c
+++ b/engines/mmap.c
@@ -15,12 +15,11 @@
 #include "../verify.h"
 
 /*
- * Limits us to 1GB of mapped files in total
+ * Limits us to 1GiB of mapped files in total
  */
 #define MMAP_TOTAL_SZ	(1 * 1024 * 1024 * 1024UL)
 
 static unsigned long mmap_map_size;
-static unsigned long mmap_map_mask;
 
 struct fio_mmap_data {
 	void *mmap_ptr;
@@ -68,11 +67,10 @@
 	}
 
 #ifdef FIO_MADV_FREE
-	if (f->filetype == FIO_TYPE_BD)
+	if (f->filetype == FIO_TYPE_BLOCK)
 		(void) posix_madvise(fmd->mmap_ptr, fmd->mmap_sz, FIO_MADV_FREE);
 #endif
 
-
 err:
 	if (td->error && fmd->mmap_ptr)
 		munmap(fmd->mmap_ptr, length);
@@ -208,26 +206,15 @@
 static int fio_mmapio_init(struct thread_data *td)
 {
 	struct thread_options *o = &td->o;
-	unsigned long shift, mask;
 
-	if ((td->o.rw_min_bs & page_mask) &&
+	if ((o->rw_min_bs & page_mask) &&
 	    (o->odirect || o->fsync_blocks || o->fdatasync_blocks)) {
 		log_err("fio: mmap options dictate a minimum block size of "
 			"%llu bytes\n", (unsigned long long) page_size);
 		return 1;
 	}
 
-	mmap_map_size = MMAP_TOTAL_SZ / td->o.nr_files;
-	mask = mmap_map_size;
-	shift = 0;
-	do {
-		mask >>= 1;
-		if (!mask)
-			break;
-		shift++;
-	} while (1);
-
-	mmap_map_mask = 1UL << shift;
+	mmap_map_size = MMAP_TOTAL_SZ / o->nr_files;
 	return 0;
 }
 
@@ -242,8 +229,8 @@
 
 	fmd = calloc(1, sizeof(*fmd));
 	if (!fmd) {
-		int fio_unused ret;
-		ret = generic_close_file(td, f);
+		int fio_unused __ret;
+		__ret = generic_close_file(td, f);
 		return 1;
 	}
 
diff --git a/engines/mtd.c b/engines/mtd.c
new file mode 100644
index 0000000..3c22a1b
--- /dev/null
+++ b/engines/mtd.c
@@ -0,0 +1,209 @@
+/*
+ * MTD engine
+ *
+ * IO engine that reads/writes from MTD character devices.
+ *
+ */
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <mtd/mtd-user.h>
+
+#include "../fio.h"
+#include "../verify.h"
+#include "../oslib/libmtd.h"
+
+static libmtd_t desc;
+
+struct fio_mtd_data {
+	struct mtd_dev_info info;
+};
+
+static int fio_mtd_maybe_mark_bad(struct thread_data *td,
+				  struct fio_mtd_data *fmd,
+				  struct io_u *io_u, int eb)
+{
+	int ret;
+	if (errno == EIO) {
+		ret = mtd_mark_bad(&fmd->info, io_u->file->fd, eb);
+		if (ret != 0) {
+			io_u->error = errno;
+			td_verror(td, errno, "mtd_mark_bad");
+			return -1;
+		}
+	}
+	return 0;
+}
+
+static int fio_mtd_is_bad(struct thread_data *td,
+			  struct fio_mtd_data *fmd,
+			  struct io_u *io_u, int eb)
+{
+	int ret = mtd_is_bad(&fmd->info, io_u->file->fd, eb);
+	if (ret == -1) {
+		io_u->error = errno;
+		td_verror(td, errno, "mtd_is_bad");
+	} else if (ret == 1)
+		io_u->error = EIO;	/* Silent failure--don't flood stderr */
+	return ret;
+}
+
+static int fio_mtd_queue(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	struct fio_mtd_data *fmd = FILE_ENG_DATA(f);
+	int local_offs = 0;
+	int ret;
+
+	fio_ro_check(td, io_u);
+
+	/*
+	 * Errors tend to pertain to particular erase blocks, so divide up
+	 * I/O to erase block size.
+	 * If an error is encountered, log it and keep going onto the next
+	 * block because the error probably just pertains to that block.
+	 * TODO(dehrenberg): Divide up reads and writes into page-sized
+	 * operations to get more fine-grained information about errors.
+	 */
+	while (local_offs < io_u->buflen) {
+		int eb = (io_u->offset + local_offs) / fmd->info.eb_size;
+		int eb_offs = (io_u->offset + local_offs) % fmd->info.eb_size;
+		/* The length is the smaller of the length remaining in the
+		 * buffer and the distance to the end of the erase block */
+		int len = min((int)io_u->buflen - local_offs,
+			      (int)fmd->info.eb_size - eb_offs);
+		char *buf = ((char *)io_u->buf) + local_offs;
+
+		if (td->o.skip_bad) {
+			ret = fio_mtd_is_bad(td, fmd, io_u, eb);
+			if (ret == -1)
+				break;
+			else if (ret == 1)
+				goto next;
+		}
+		if (io_u->ddir == DDIR_READ) {
+			ret = mtd_read(&fmd->info, f->fd, eb, eb_offs, buf, len);
+			if (ret != 0) {
+				io_u->error = errno;
+				td_verror(td, errno, "mtd_read");
+				if (fio_mtd_maybe_mark_bad(td, fmd, io_u, eb))
+					break;
+			}
+		} else if (io_u->ddir == DDIR_WRITE) {
+			ret = mtd_write(desc, &fmd->info, f->fd, eb,
+					    eb_offs, buf, len, NULL, 0, 0);
+			if (ret != 0) {
+				io_u->error = errno;
+				td_verror(td, errno, "mtd_write");
+				if (fio_mtd_maybe_mark_bad(td, fmd, io_u, eb))
+					break;
+			}
+		} else if (io_u->ddir == DDIR_TRIM) {
+			if (eb_offs != 0 || len != fmd->info.eb_size) {
+				io_u->error = EINVAL;
+				td_verror(td, EINVAL,
+					  "trim on MTD must be erase block-aligned");
+			}
+			ret = mtd_erase(desc, &fmd->info, f->fd, eb);
+			if (ret != 0) {
+				io_u->error = errno;
+				td_verror(td, errno, "mtd_erase");
+				if (fio_mtd_maybe_mark_bad(td, fmd, io_u, eb))
+					break;
+			}
+		} else {
+			io_u->error = ENOTSUP;
+			td_verror(td, io_u->error, "operation not supported on mtd");
+		}
+
+next:
+		local_offs += len;
+	}
+
+	return FIO_Q_COMPLETED;
+}
+
+static int fio_mtd_open_file(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_mtd_data *fmd;
+	int ret;
+
+	ret = generic_open_file(td, f);
+	if (ret)
+		return ret;
+
+	fmd = calloc(1, sizeof(*fmd));
+	if (!fmd)
+		goto err_close;
+
+	ret = mtd_get_dev_info(desc, f->file_name, &fmd->info);
+	if (ret != 0) {
+		td_verror(td, errno, "mtd_get_dev_info");
+		goto err_free;
+	}
+
+	FILE_SET_ENG_DATA(f, fmd);
+	return 0;
+
+err_free:
+	free(fmd);
+err_close:
+	{
+		int fio_unused __ret;
+		__ret = generic_close_file(td, f);
+		return 1;
+	}
+}
+
+static int fio_mtd_close_file(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_mtd_data *fmd = FILE_ENG_DATA(f);
+
+	FILE_SET_ENG_DATA(f, NULL);
+	free(fmd);
+
+	return generic_close_file(td, f);
+}
+
+static int fio_mtd_get_file_size(struct thread_data *td, struct fio_file *f)
+{
+	struct mtd_dev_info info;
+
+	int ret = mtd_get_dev_info(desc, f->file_name, &info);
+	if (ret != 0) {
+		td_verror(td, errno, "mtd_get_dev_info");
+		return errno;
+	}
+	f->real_file_size = info.size;
+
+	return 0;
+}
+
+static struct ioengine_ops ioengine = {
+	.name		= "mtd",
+	.version	= FIO_IOOPS_VERSION,
+	.queue		= fio_mtd_queue,
+	.open_file	= fio_mtd_open_file,
+	.close_file	= fio_mtd_close_file,
+	.get_file_size	= fio_mtd_get_file_size,
+	.flags		= FIO_SYNCIO | FIO_NOEXTEND,
+};
+
+static void fio_init fio_mtd_register(void)
+{
+	desc = libmtd_open();
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_mtd_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+	libmtd_close(desc);
+	desc = NULL;
+}
+
+
+
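
The new mtd engine above splits every I/O on erase-block boundaries so that a bad block only fails its own slice, as the comment in fio_mtd_queue() explains. A standalone sketch of that split, using a made-up 128 KiB erase block size and sample offsets:

/*
 * Sketch of the erase-block split in fio_mtd_queue(): each slice covers at
 * most the distance to the next erase-block boundary. Values are made up.
 */
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
	int eb_size = 128 * 1024;	/* hypothetical erase block size */
	int io_offset = 100 * 1024;	/* io_u->offset */
	int buflen = 200 * 1024;	/* io_u->buflen */
	int local_offs = 0;

	while (local_offs < buflen) {
		int eb = (io_offset + local_offs) / eb_size;
		int eb_offs = (io_offset + local_offs) % eb_size;
		int len = MIN(buflen - local_offs, eb_size - eb_offs);

		printf("erase block %d: offset %d, length %d\n", eb, eb_offs, len);
		local_offs += len;	/* advance past this slice */
	}
	return 0;
}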
diff --git a/engines/net.c b/engines/net.c
index cd19535..37d44fd 100644
--- a/engines/net.c
+++ b/engines/net.c
@@ -22,6 +22,7 @@
 
 #include "../fio.h"
 #include "../verify.h"
+#include "../optgroup.h"
 
 struct netio_data {
 	int listenfd;
@@ -134,6 +135,7 @@
 #ifdef CONFIG_TCP_NODELAY
 	{
 		.name	= "nodelay",
+		.lname	= "No Delay",
 		.type	= FIO_OPT_BOOL,
 		.off1	= offsetof(struct netio_options, nodelay),
 		.help	= "Use TCP_NODELAY on TCP connections",
@@ -152,6 +154,7 @@
 	},
 	{
 		.name	= "pingpong",
+		.lname	= "Ping Pong",
 		.type	= FIO_OPT_STR_SET,
 		.off1	= offsetof(struct netio_options, pingpong),
 		.help	= "Ping-pong IO requests",
@@ -373,7 +376,7 @@
  */
 static int splice_in(struct thread_data *td, struct io_u *io_u)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 
 	return splice_io_u(io_u->file->fd, nd->pipes[1], io_u->xfer_buflen);
 }
@@ -384,7 +387,7 @@
 static int splice_out(struct thread_data *td, struct io_u *io_u,
 		      unsigned int len)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 
 	return splice_io_u(nd->pipes[0], io_u->file->fd, len);
 }
@@ -422,7 +425,7 @@
 static int vmsplice_io_u_out(struct thread_data *td, struct io_u *io_u,
 			     unsigned int len)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 
 	return vmsplice_io_u(io_u, nd->pipes[0], len);
 }
@@ -432,7 +435,7 @@
  */
 static int vmsplice_io_u_in(struct thread_data *td, struct io_u *io_u)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 
 	return vmsplice_io_u(io_u, nd->pipes[1], io_u->xfer_buflen);
 }
@@ -523,7 +526,7 @@
 
 static int fio_netio_send(struct thread_data *td, struct io_u *io_u)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 	struct netio_options *o = td->eo;
 	int ret, flags = 0;
 
@@ -586,7 +589,7 @@
 
 static int fio_netio_recv(struct thread_data *td, struct io_u *io_u)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 	struct netio_options *o = td->eo;
 	int ret, flags = 0;
 
@@ -644,7 +647,7 @@
 static int __fio_netio_queue(struct thread_data *td, struct io_u *io_u,
 			     enum fio_ddir ddir)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 	struct netio_options *o = td->eo;
 	int ret;
 
@@ -710,7 +713,7 @@
 
 static int fio_netio_connect(struct thread_data *td, struct fio_file *f)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 	struct netio_options *o = td->eo;
 	int type, domain;
 
@@ -825,7 +828,7 @@
 
 static int fio_netio_accept(struct thread_data *td, struct fio_file *f)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 	struct netio_options *o = td->eo;
 	socklen_t socklen;
 	int state;
@@ -877,7 +880,7 @@
 
 static void fio_netio_send_close(struct thread_data *td, struct fio_file *f)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 	struct netio_options *o = td->eo;
 	struct udp_close_msg msg;
 	struct sockaddr *to;
@@ -912,7 +915,7 @@
 
 static int fio_netio_udp_recv_open(struct thread_data *td, struct fio_file *f)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 	struct netio_options *o = td->eo;
 	struct udp_close_msg msg;
 	struct sockaddr *to;
@@ -946,7 +949,7 @@
 
 static int fio_netio_send_open(struct thread_data *td, struct fio_file *f)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 	struct netio_options *o = td->eo;
 	struct udp_close_msg msg;
 	struct sockaddr *to;
@@ -1048,7 +1051,7 @@
 static int fio_netio_setup_connect_inet(struct thread_data *td,
 					const char *host, unsigned short port)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 	struct netio_options *o = td->eo;
 	struct addrinfo *res = NULL;
 	void *dst, *src;
@@ -1098,7 +1101,7 @@
 static int fio_netio_setup_connect_unix(struct thread_data *td,
 					const char *path)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 	struct sockaddr_un *soun = &nd->addr_un;
 
 	soun->sun_family = AF_UNIX;
@@ -1119,7 +1122,7 @@
 
 static int fio_netio_setup_listen_unix(struct thread_data *td, const char *path)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 	struct sockaddr_un *addr = &nd->addr_un;
 	mode_t mode;
 	int len, fd;
@@ -1152,7 +1155,7 @@
 
 static int fio_netio_setup_listen_inet(struct thread_data *td, short port)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 	struct netio_options *o = td->eo;
 	struct ip_mreq mr;
 	struct sockaddr_in sin;
@@ -1215,7 +1218,7 @@
 			return 1;
 		}
 		if (is_ipv6(o)) {
-			log_err("fio: IPv6 not supported for multicast network IO");
+			log_err("fio: IPv6 not supported for multicast network IO\n");
 			close(fd);
 			return 1;
 		}
@@ -1268,7 +1271,7 @@
 
 static int fio_netio_setup_listen(struct thread_data *td)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 	struct netio_options *o = td->eo;
 	int ret;
 
@@ -1343,7 +1346,7 @@
 
 static void fio_netio_cleanup(struct thread_data *td)
 {
-	struct netio_data *nd = td->io_ops->data;
+	struct netio_data *nd = td->io_ops_data;
 
 	if (nd) {
 		if (nd->listenfd != -1)
@@ -1367,13 +1370,13 @@
 		td->o.open_files++;
 	}
 
-	if (!td->io_ops->data) {
-		nd = malloc(sizeof(*nd));;
+	if (!td->io_ops_data) {
+		nd = malloc(sizeof(*nd));
 
 		memset(nd, 0, sizeof(*nd));
 		nd->listenfd = -1;
 		nd->pipes[0] = nd->pipes[1] = -1;
-		td->io_ops->data = nd;
+		td->io_ops_data = nd;
 	}
 
 	return 0;
@@ -1391,7 +1394,7 @@
 
 	fio_netio_setup(td);
 
-	nd = td->io_ops->data;
+	nd = td->io_ops_data;
 	if (nd) {
 		if (pipe(nd->pipes) < 0)
 			return 1;
diff --git a/engines/null.c b/engines/null.c
index 6000930..812cadf 100644
--- a/engines/null.c
+++ b/engines/null.c
@@ -25,7 +25,7 @@
 
 static struct io_u *fio_null_event(struct thread_data *td, int event)
 {
-	struct null_data *nd = (struct null_data *) td->io_ops->data;
+	struct null_data *nd = (struct null_data *) td->io_ops_data;
 
 	return nd->io_us[event];
 }
@@ -34,7 +34,7 @@
 			      unsigned int fio_unused max,
 			      const struct timespec fio_unused *t)
 {
-	struct null_data *nd = (struct null_data *) td->io_ops->data;
+	struct null_data *nd = (struct null_data *) td->io_ops_data;
 	int ret = 0;
 	
 	if (min_events) {
@@ -47,7 +47,7 @@
 
 static int fio_null_commit(struct thread_data *td)
 {
-	struct null_data *nd = (struct null_data *) td->io_ops->data;
+	struct null_data *nd = (struct null_data *) td->io_ops_data;
 
 	if (!nd->events) {
 #ifndef FIO_EXTERNAL_ENGINE
@@ -62,7 +62,7 @@
 
 static int fio_null_queue(struct thread_data *td, struct io_u *io_u)
 {
-	struct null_data *nd = (struct null_data *) td->io_ops->data;
+	struct null_data *nd = (struct null_data *) td->io_ops_data;
 
 	fio_ro_check(td, io_u);
 
@@ -83,11 +83,10 @@
 
 static void fio_null_cleanup(struct thread_data *td)
 {
-	struct null_data *nd = (struct null_data *) td->io_ops->data;
+	struct null_data *nd = (struct null_data *) td->io_ops_data;
 
 	if (nd) {
-		if (nd->io_us)
-			free(nd->io_us);
+		free(nd->io_us);
 		free(nd);
 	}
 }
@@ -104,7 +103,7 @@
 	} else
 		td->io_ops->flags |= FIO_SYNCIO;
 
-	td->io_ops->data = nd;
+	td->io_ops_data = nd;
 	return 0;
 }
 
@@ -136,23 +135,21 @@
 
 #ifdef FIO_EXTERNAL_ENGINE
 extern "C" {
+static struct ioengine_ops ioengine;
 void get_ioengine(struct ioengine_ops **ioengine_ptr)
 {
-	struct ioengine_ops *ioengine;
+	*ioengine_ptr = &ioengine;
 
-	*ioengine_ptr = (struct ioengine_ops *) malloc(sizeof(struct ioengine_ops));
-	ioengine = *ioengine_ptr;
-
-	strcpy(ioengine->name, "cpp_null");
-	ioengine->version        = FIO_IOOPS_VERSION;
-	ioengine->queue          = fio_null_queue;
-	ioengine->commit         = fio_null_commit;
-	ioengine->getevents      = fio_null_getevents;
-	ioengine->event          = fio_null_event;
-	ioengine->init           = fio_null_init;
-	ioengine->cleanup        = fio_null_cleanup;
-	ioengine->open_file      = fio_null_open;
-	ioengine->flags	         = FIO_DISKLESSIO | FIO_FAKEIO;
+	ioengine.name           = "cpp_null";
+	ioengine.version        = FIO_IOOPS_VERSION;
+	ioengine.queue          = fio_null_queue;
+	ioengine.commit         = fio_null_commit;
+	ioengine.getevents      = fio_null_getevents;
+	ioengine.event          = fio_null_event;
+	ioengine.init           = fio_null_init;
+	ioengine.cleanup        = fio_null_cleanup;
+	ioengine.open_file      = fio_null_open;
+	ioengine.flags          = FIO_DISKLESSIO | FIO_FAKEIO;
 }
 }
 #endif /* FIO_EXTERNAL_ENGINE */
diff --git a/engines/pmemblk.c b/engines/pmemblk.c
new file mode 100644
index 0000000..52af9ed
--- /dev/null
+++ b/engines/pmemblk.c
@@ -0,0 +1,445 @@
+/*
+ * pmemblk: IO engine that uses NVML libpmemblk to read and write data
+ *
+ * Copyright (C) 2016 Hewlett Packard Enterprise Development LP
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License,
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the Free
+ * Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA
+ */
+
+/*
+ * pmemblk engine
+ *
+ * IO engine that uses libpmemblk to read and write data
+ *
+ * To use:
+ *   ioengine=pmemblk
+ *
+ * Other relevant settings:
+ *   thread=1   REQUIRED
+ *   iodepth=1
+ *   direct=1
+ *   unlink=1
+ *   filename=/mnt/pmem0/fiotestfile,BSIZE,FSIZEMiB
+ *
+ *   thread must be set to 1 for pmemblk as multiple processes cannot
+ *     open the same block pool file.
+ *
+ *   iodepth should be set to 1 as pmemblk is always synchronous.
+ *   Use numjobs to scale up.
+ *
+ *   direct=1 is implied as pmemblk is always direct. A warning message
+ *   is printed if this is not specified.
+ *
+ *   unlink=1 removes the block pool file after testing, and is optional.
+ *
+ *   The pmem device must have a DAX-capable filesystem and be mounted
+ *   with DAX enabled.  filename must point to a file on that filesystem.
+ *
+ *   Example:
+ *     mkfs.xfs /dev/pmem0
+ *     mkdir /mnt/pmem0
+ *     mount -o dax /dev/pmem0 /mnt/pmem0
+ *
+ *   When specifying the filename, if the block pool file does not already
+ *   exist, the pmemblk engine creates the pool file, provided you specify
+ *   the block and file sizes.  BSIZE is the block size in bytes.
+ *   FSIZEMiB is the pool file size in MiB.
+ *
+ *   See examples/pmemblk.fio for more.
+ *
+ */
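+
+/*
+ * As an illustration only (the values below are assumptions, not taken
+ * from examples/pmemblk.fio), a minimal job file combining the settings
+ * above could look like:
+ *
+ *     [global]
+ *     ioengine=pmemblk
+ *     thread=1
+ *     iodepth=1
+ *     direct=1
+ *     unlink=1
+ *
+ *     [pmemblk-test]
+ *     rw=randwrite
+ *     bs=4k
+ *     filename=/mnt/pmem0/fiotestfile,4096,1024
+ */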
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <errno.h>
+#include <assert.h>
+#include <string.h>
+#include <libpmem.h>
+#include <libpmemblk.h>
+
+#include "../fio.h"
+
+/*
+ * libpmemblk
+ */
+typedef struct fio_pmemblk_file *fio_pmemblk_file_t;
+
+struct fio_pmemblk_file {
+	fio_pmemblk_file_t pmb_next;
+	char *pmb_filename;
+	uint64_t pmb_refcnt;
+	PMEMblkpool *pmb_pool;
+	size_t pmb_bsize;
+	size_t pmb_nblocks;
+};
+
+static fio_pmemblk_file_t Cache;
+
+static pthread_mutex_t CacheLock = PTHREAD_MUTEX_INITIALIZER;
+
+#define PMB_CREATE   (0x0001)	/* should create file */
+
+fio_pmemblk_file_t fio_pmemblk_cache_lookup(const char *filename)
+{
+	fio_pmemblk_file_t i;
+
+	for (i = Cache; i != NULL; i = i->pmb_next)
+		if (!strcmp(filename, i->pmb_filename))
+			return i;
+
+	return NULL;
+}
+
+static void fio_pmemblk_cache_insert(fio_pmemblk_file_t pmb)
+{
+	pmb->pmb_next = Cache;
+	Cache = pmb;
+}
+
+static void fio_pmemblk_cache_remove(fio_pmemblk_file_t pmb)
+{
+	fio_pmemblk_file_t i;
+
+	if (pmb == Cache) {
+		Cache = Cache->pmb_next;
+		pmb->pmb_next = NULL;
+		return;
+	}
+
+	for (i = Cache; i != NULL; i = i->pmb_next)
+		if (pmb == i->pmb_next) {
+			i->pmb_next = i->pmb_next->pmb_next;
+			pmb->pmb_next = NULL;
+			return;
+		}
+}
+
+/*
+ * to control block size and gross file size at the libpmemblk
+ * level, we allow the block size and file size to be appended
+ * to the file name:
+ *
+ *   path[,bsize,fsizemib]
+ *
+ * note that we do not use the fio option "filesize" to dictate
+ * the file size because we can only give libpmemblk the gross
+ * file size, which is different from the net or usable file
+ * size (which is probably what fio wants).
+ *
+ * the final path without the parameters is returned in ppath.
+ * the block size and file size are returned in pbsize and fsize.
+ *
+ * note that the user specifies the file size in MiB, but
+ * we return bytes from here.
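+ *
+ * for example (illustrative values), a pathspec of
+ *   "/mnt/pmem0/fiotestfile,4096,1024"
+ * yields ppath "/mnt/pmem0/fiotestfile", pbsize 4096 and
+ * pfsize 1073741824 (1024 MiB converted to bytes).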
+ */
+static void pmb_parse_path(const char *pathspec, char **ppath, uint64_t *pbsize,
+			   uint64_t *pfsize)
+{
+	char *path;
+	char *s;
+	uint64_t bsize;
+	uint64_t fsizemib;
+
+	path = strdup(pathspec);
+	if (!path) {
+		*ppath = NULL;
+		return;
+	}
+
+	/* extract sizes, if given */
+	s = strrchr(path, ',');
+	if (s && (fsizemib = strtoull(s + 1, NULL, 10))) {
+		*s = 0;
+		s = strrchr(path, ',');
+		if (s && (bsize = strtoull(s + 1, NULL, 10))) {
+			*s = 0;
+			*ppath = path;
+			*pbsize = bsize;
+			*pfsize = fsizemib << 20;
+			return;
+		}
+	}
+
+	/* size specs not found */
+	strcpy(path, pathspec);
+	*ppath = path;
+	*pbsize = 0;
+	*pfsize = 0;
+}
+
+static fio_pmemblk_file_t pmb_open(const char *pathspec, int flags)
+{
+	fio_pmemblk_file_t pmb;
+	char *path = NULL;
+	uint64_t bsize = 0;
+	uint64_t fsize = 0;
+
+	pmb_parse_path(pathspec, &path, &bsize, &fsize);
+	if (!path)
+		return NULL;
+
+	pthread_mutex_lock(&CacheLock);
+
+	pmb = fio_pmemblk_cache_lookup(path);
+	if (!pmb) {
+		pmb = malloc(sizeof(*pmb));
+		if (!pmb)
+			goto error;
+
+		/* try opening existing first, create it if needed */
+		pmb->pmb_pool = pmemblk_open(path, bsize);
+		if (!pmb->pmb_pool && (errno == ENOENT) &&
+		    (flags & PMB_CREATE) && (0 < fsize) && (0 < bsize)) {
+			pmb->pmb_pool =
+			    pmemblk_create(path, bsize, fsize, 0644);
+		}
+		if (!pmb->pmb_pool) {
+			log_err("pmemblk: unable to open pmemblk pool file %s (%s)\n",
+			     path, strerror(errno));
+			goto error;
+		}
+
+		pmb->pmb_filename = path;
+		pmb->pmb_next = NULL;
+		pmb->pmb_refcnt = 0;
+		pmb->pmb_bsize = pmemblk_bsize(pmb->pmb_pool);
+		pmb->pmb_nblocks = pmemblk_nblock(pmb->pmb_pool);
+
+		fio_pmemblk_cache_insert(pmb);
+	}
+
+	pmb->pmb_refcnt += 1;
+
+	pthread_mutex_unlock(&CacheLock);
+
+	return pmb;
+
+error:
+	if (pmb) {
+		if (pmb->pmb_pool)
+			pmemblk_close(pmb->pmb_pool);
+		pmb->pmb_pool = NULL;
+		pmb->pmb_filename = NULL;
+		free(pmb);
+	}
+	if (path)
+		free(path);
+
+	pthread_mutex_unlock(&CacheLock);
+	return NULL;
+}
+
+static void pmb_close(fio_pmemblk_file_t pmb, const bool keep)
+{
+	pthread_mutex_lock(&CacheLock);
+
+	pmb->pmb_refcnt--;
+
+	if (!keep && !pmb->pmb_refcnt) {
+		pmemblk_close(pmb->pmb_pool);
+		pmb->pmb_pool = NULL;
+		free(pmb->pmb_filename);
+		pmb->pmb_filename = NULL;
+		fio_pmemblk_cache_remove(pmb);
+		free(pmb);
+	}
+
+	pthread_mutex_unlock(&CacheLock);
+}
+
+static int pmb_get_flags(struct thread_data *td, uint64_t *pflags)
+{
+	static int thread_warned = 0;
+	static int odirect_warned = 0;
+
+	uint64_t flags = 0;
+
+	if (!td->o.use_thread) {
+		if (!thread_warned) {
+			thread_warned = 1;
+			log_err("pmemblk: must set thread=1 for pmemblk engine\n");
+		}
+		return 1;
+	}
+
+	if (!td->o.odirect && !odirect_warned) {
+		odirect_warned = 1;
+		log_info("pmemblk: direct == 0, but pmemblk is always direct\n");
+	}
+
+	if (td->o.allow_create)
+		flags |= PMB_CREATE;
+
+	(*pflags) = flags;
+	return 0;
+}
+
+static int fio_pmemblk_open_file(struct thread_data *td, struct fio_file *f)
+{
+	uint64_t flags = 0;
+	fio_pmemblk_file_t pmb;
+
+	if (pmb_get_flags(td, &flags))
+		return 1;
+
+	pmb = pmb_open(f->file_name, flags);
+	if (!pmb)
+		return 1;
+
+	FILE_SET_ENG_DATA(f, pmb);
+	return 0;
+}
+
+static int fio_pmemblk_close_file(struct thread_data fio_unused *td,
+				  struct fio_file *f)
+{
+	fio_pmemblk_file_t pmb = FILE_ENG_DATA(f);
+
+	if (pmb)
+		pmb_close(pmb, false);
+
+	FILE_SET_ENG_DATA(f, NULL);
+	return 0;
+}
+
+static int fio_pmemblk_get_file_size(struct thread_data *td, struct fio_file *f)
+{
+	uint64_t flags = 0;
+	fio_pmemblk_file_t pmb = FILE_ENG_DATA(f);
+
+	if (fio_file_size_known(f))
+		return 0;
+
+	if (!pmb) {
+		if (pmb_get_flags(td, &flags))
+			return 1;
+		pmb = pmb_open(f->file_name, flags);
+		if (!pmb)
+			return 1;
+	}
+
+	f->real_file_size = pmb->pmb_bsize * pmb->pmb_nblocks;
+
+	fio_file_set_size_known(f);
+
+	if (!FILE_ENG_DATA(f))
+		pmb_close(pmb, true);
+
+	return 0;
+}
+
+static int fio_pmemblk_queue(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	fio_pmemblk_file_t pmb = FILE_ENG_DATA(f);
+
+	unsigned long long off;
+	unsigned long len;
+	void *buf;
+
+	fio_ro_check(td, io_u);
+
+	switch (io_u->ddir) {
+	case DDIR_READ:
+	case DDIR_WRITE:
+		off = io_u->offset;
+		len = io_u->xfer_buflen;
+
+		io_u->error = EINVAL;
+		if (off % pmb->pmb_bsize)
+			break;
+		if (len % pmb->pmb_bsize)
+			break;
+		if ((off + len) / pmb->pmb_bsize > pmb->pmb_nblocks)
+			break;
+
+		io_u->error = 0;
+		buf = io_u->xfer_buf;
+		off /= pmb->pmb_bsize;
+		len /= pmb->pmb_bsize;
+		while (0 < len) {
+			/*
+			 * on reads, run only pmemblk_read(); do not fall
+			 * through to pmemblk_write() when the read succeeds.
+			 */
+			if (io_u->ddir == DDIR_READ) {
+				if (0 != pmemblk_read(pmb->pmb_pool, buf, off)) {
+					io_u->error = errno;
+					break;
+				}
+			} else if (0 != pmemblk_write(pmb->pmb_pool, buf, off)) {
+				io_u->error = errno;
+				break;
+			}
+			buf += pmb->pmb_bsize;
+			off++;
+			len--;
+		}
+		off *= pmb->pmb_bsize;
+		len *= pmb->pmb_bsize;
+		io_u->resid = io_u->xfer_buflen - (off - io_u->offset);
+		break;
+	case DDIR_SYNC:
+	case DDIR_DATASYNC:
+	case DDIR_SYNC_FILE_RANGE:
+		/* we're always sync'd */
+		io_u->error = 0;
+		break;
+	default:
+		io_u->error = EINVAL;
+		break;
+	}
+
+	return FIO_Q_COMPLETED;
+}
+
+static int fio_pmemblk_unlink_file(struct thread_data *td, struct fio_file *f)
+{
+	char *path = NULL;
+	uint64_t bsize = 0;
+	uint64_t fsize = 0;
+
+	/*
+	 * we need our own unlink in case the user has specified
+	 * the block and file sizes in the path name.  we parse
+	 * the file_name to determine the file name we actually used.
+	 */
+
+	pmb_parse_path(f->file_name, &path, &bsize, &fsize);
+	if (!path)
+		return ENOENT;
+
+	unlink(path);
+	free(path);
+	return 0;
+}
+
+static struct ioengine_ops ioengine = {
+	.name = "pmemblk",
+	.version = FIO_IOOPS_VERSION,
+	.queue = fio_pmemblk_queue,
+	.open_file = fio_pmemblk_open_file,
+	.close_file = fio_pmemblk_close_file,
+	.get_file_size = fio_pmemblk_get_file_size,
+	.unlink_file = fio_pmemblk_unlink_file,
+	.flags = FIO_SYNCIO | FIO_DISKLESSIO | FIO_NOEXTEND | FIO_NODISKUTIL,
+};
+
+static void fio_init fio_pmemblk_register(void)
+{
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_pmemblk_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
diff --git a/engines/posixaio.c b/engines/posixaio.c
index 8ab88fb..bddb1ec 100644
--- a/engines/posixaio.c
+++ b/engines/posixaio.c
@@ -93,7 +93,7 @@
 static int fio_posixaio_getevents(struct thread_data *td, unsigned int min,
 				  unsigned int max, const struct timespec *t)
 {
-	struct posixaio_data *pd = td->io_ops->data;
+	struct posixaio_data *pd = td->io_ops_data;
 	os_aiocb_t *suspend_list[SUSPEND_ENTRIES];
 	struct timespec start;
 	int have_timeout = 0;
@@ -109,7 +109,7 @@
 
 	r = 0;
 restart:
-	memset(suspend_list, 0, sizeof(*suspend_list));
+	memset(suspend_list, 0, sizeof(suspend_list));
 	suspend_entries = 0;
 	io_u_qiter(&td->io_u_all, io_u, i) {
 		int err;
@@ -161,7 +161,7 @@
 
 static struct io_u *fio_posixaio_event(struct thread_data *td, int event)
 {
-	struct posixaio_data *pd = td->io_ops->data;
+	struct posixaio_data *pd = td->io_ops_data;
 
 	return pd->aio_events[event];
 }
@@ -169,7 +169,7 @@
 static int fio_posixaio_queue(struct thread_data *td,
 			      struct io_u *io_u)
 {
-	struct posixaio_data *pd = td->io_ops->data;
+	struct posixaio_data *pd = td->io_ops_data;
 	os_aiocb_t *aiocb = &io_u->aiocb;
 	int ret;
 
@@ -198,7 +198,7 @@
 	}
 
 	if (ret) {
-		int aio_err = aio_error(aiocb);
+		int aio_err = errno;
 
 		/*
 		 * At least OSX has a very low limit on the number of pending
@@ -220,7 +220,7 @@
 
 static void fio_posixaio_cleanup(struct thread_data *td)
 {
-	struct posixaio_data *pd = td->io_ops->data;
+	struct posixaio_data *pd = td->io_ops_data;
 
 	if (pd) {
 		free(pd->aio_events);
@@ -236,7 +236,7 @@
 	pd->aio_events = malloc(td->o.iodepth * sizeof(struct io_u *));
 	memset(pd->aio_events, 0, td->o.iodepth * sizeof(struct io_u *));
 
-	td->io_ops->data = pd;
+	td->io_ops_data = pd;
 	return 0;
 }
 
diff --git a/engines/rbd.c b/engines/rbd.c
index 3688577..4bae425 100644
--- a/engines/rbd.c
+++ b/engines/rbd.c
@@ -8,12 +8,25 @@
 #include <rbd/librbd.h>
 
 #include "../fio.h"
+#include "../optgroup.h"
+#ifdef CONFIG_RBD_BLKIN
+#include <zipkin_c.h>
+#endif
+
+#ifdef CONFIG_RBD_POLL
+/* add for poll */
+#include <poll.h>
+#include <sys/eventfd.h>
+#endif
 
 struct fio_rbd_iou {
 	struct io_u *io_u;
 	rbd_completion_t completion;
 	int io_seen;
 	int io_complete;
+#ifdef CONFIG_RBD_BLKIN
+	struct blkin_trace_info info;
+#endif
 };
 
 struct rbd_data {
@@ -22,10 +35,13 @@
 	rbd_image_t image;
 	struct io_u **aio_events;
 	struct io_u **sort_events;
+	int fd; /* add for poll */
+	bool connected;
 };
 
 struct rbd_options {
 	void *pad;
+	char *cluster_name;
 	char *rbd_name;
 	char *pool_name;
 	char *client_name;
@@ -33,6 +49,15 @@
 };
 
 static struct fio_option options[] = {
+	{
+		.name		= "clustername",
+		.lname		= "ceph cluster name",
+		.type		= FIO_OPT_STR_STORE,
+		.help		= "Cluster name for ceph",
+		.off1		= offsetof(struct rbd_options, cluster_name),
+		.category	= FIO_OPT_C_ENGINE,
+		.group		= FIO_OPT_G_RBD,
+	},
 	{
 		.name		= "rbdname",
 		.lname		= "rbd engine rbdname",
@@ -80,13 +105,18 @@
 {
 	struct rbd_data *rbd;
 
-	if (td->io_ops->data)
+	if (td->io_ops_data)
 		return 0;
 
 	rbd = calloc(1, sizeof(struct rbd_data));
 	if (!rbd)
 		goto failed;
 
+	rbd->connected = false;
+
+	/* add for poll, init fd: -1 */
+	rbd->fd = -1;
+
 	rbd->aio_events = calloc(td->o.iodepth, sizeof(struct io_u *));
 	if (!rbd->aio_events)
 		goto failed;
@@ -99,19 +129,79 @@
 	return 0;
 
 failed:
-	if (rbd)
+	if (rbd) {
+		if (rbd->aio_events) 
+			free(rbd->aio_events);
+		if (rbd->sort_events)
+			free(rbd->sort_events);
 		free(rbd);
+	}
 	return 1;
 
 }
 
+#ifdef CONFIG_RBD_POLL
+static bool _fio_rbd_setup_poll(struct rbd_data *rbd)
+{
+	int r;
+
+	/* add for rbd poll */
+	rbd->fd = eventfd(0, EFD_NONBLOCK);
+	if (rbd->fd < 0) {
+		log_err("eventfd failed.\n");
+		return false;
+	}
+
+	r = rbd_set_image_notification(rbd->image, rbd->fd, EVENT_TYPE_EVENTFD);
+	if (r < 0) {
+		log_err("rbd_set_image_notification failed.\n");
+		close(rbd->fd);
+		rbd->fd = -1;
+		return false;
+	}
+
+	return true;
+}
+#else
+static bool _fio_rbd_setup_poll(struct rbd_data *rbd)
+{
+	return true;
+}
+#endif
+
 static int _fio_rbd_connect(struct thread_data *td)
 {
-	struct rbd_data *rbd = td->io_ops->data;
+	struct rbd_data *rbd = td->io_ops_data;
 	struct rbd_options *o = td->eo;
 	int r;
 
-	r = rados_create(&rbd->cluster, o->client_name);
+	if (o->cluster_name) {
+		char *client_name = NULL; 
+
+		/*
+		 * If we specify a cluster name, rados_create2()
+		 * will not assume the 'client.' prefix; the name is
+		 * treated as a full type.id name string.
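+		 * For example, a client_name of "admin" is expanded
+		 * to "client.admin" below, while "client.admin" is
+		 * passed through unchanged.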
+		 */
+		if (o->client_name) {
+			if (!index(o->client_name, '.')) {
+				client_name = calloc(1, strlen("client.") +
+						    strlen(o->client_name) + 1);
+				strcat(client_name, "client.");
+				strcat(client_name, o->client_name);
+			} else {
+				client_name = o->client_name;
+			}
+		}
+
+		r = rados_create2(&rbd->cluster, o->cluster_name,
+				 client_name, 0);
+
+		if (client_name && !index(o->client_name, '.'))
+			free(client_name);
+	} else
+		r = rados_create(&rbd->cluster, o->client_name);
+
 	if (r < 0) {
 		log_err("rados_create failed.\n");
 		goto failed_early;
@@ -140,8 +230,15 @@
 		log_err("rbd_open failed.\n");
 		goto failed_open;
 	}
+
+	if (!_fio_rbd_setup_poll(rbd))
+		goto failed_poll;
+
 	return 0;
 
+failed_poll:
+	rbd_close(rbd->image);
+	rbd->image = NULL;
 failed_open:
 	rados_ioctx_destroy(rbd->io_ctx);
 	rbd->io_ctx = NULL;
@@ -157,6 +254,12 @@
 	if (!rbd)
 		return;
 
+	/* close eventfd */
+	if (rbd->fd != -1) {
+		close(rbd->fd);
+		rbd->fd = -1;
+	}
+
 	/* shutdown everything */
 	if (rbd->image) {
 		rbd_close(rbd->image);
@@ -185,19 +288,19 @@
 	 * a specific error. So we have to assume that it can't do
 	 * partial completions.
 	 */
-	fri->io_complete = 1;
-	
 	ret = rbd_aio_get_return_value(fri->completion);
 	if (ret < 0) {
-		io_u->error = ret;
+		io_u->error = -ret;
 		io_u->resid = io_u->xfer_buflen;
 	} else
 		io_u->error = 0;
+
+	fri->io_complete = 1;
 }
 
 static struct io_u *fio_rbd_event(struct thread_data *td, int event)
 {
-	struct rbd_data *rbd = td->io_ops->data;
+	struct rbd_data *rbd = td->io_ops_data;
 
 	return rbd->aio_events[event];
 }
@@ -253,13 +356,35 @@
 static int rbd_iter_events(struct thread_data *td, unsigned int *events,
 			   unsigned int min_evts, int wait)
 {
-	struct rbd_data *rbd = td->io_ops->data;
+	struct rbd_data *rbd = td->io_ops_data;
 	unsigned int this_events = 0;
 	struct io_u *io_u;
-	int i, sidx;
+	int i, sidx = 0;
 
-	sidx = 0;
+#ifdef CONFIG_RBD_POLL
+	int ret = 0;
+	int event_num = 0;
+	struct fio_rbd_iou *fri = NULL;
+	rbd_completion_t comps[min_evts];
+
+	struct pollfd pfd;
+	pfd.fd = rbd->fd;
+	pfd.events = POLLIN;
+
+	ret = poll(&pfd, 1, -1);
+	if (ret <= 0)
+		return 0;
+
+	assert(pfd.revents & POLLIN);
+
+	event_num = rbd_poll_io_events(rbd->image, comps, min_evts);
+
+	for (i = 0; i < event_num; i++) {
+		fri = rbd_aio_get_arg(comps[i]);
+		io_u = fri->io_u;
+#else
 	io_u_qiter(&td->io_u_all, io_u, i) {
+#endif
 		if (!(io_u->flags & IO_U_F_FLIGHT))
 			continue;
 		if (rbd_io_u_seen(io_u))
@@ -332,7 +457,7 @@
 
 static int fio_rbd_queue(struct thread_data *td, struct io_u *io_u)
 {
-	struct rbd_data *rbd = td->io_ops->data;
+	struct rbd_data *rbd = td->io_ops_data;
 	struct fio_rbd_iou *fri = io_u->engine_data;
 	int r = -1;
 
@@ -349,16 +474,28 @@
 	}
 
 	if (io_u->ddir == DDIR_WRITE) {
+#ifdef CONFIG_RBD_BLKIN
+		blkin_init_trace_info(&fri->info);
+		r = rbd_aio_write_traced(rbd->image, io_u->offset, io_u->xfer_buflen,
+					 io_u->xfer_buf, fri->completion, &fri->info);
+#else
 		r = rbd_aio_write(rbd->image, io_u->offset, io_u->xfer_buflen,
 					 io_u->xfer_buf, fri->completion);
+#endif
 		if (r < 0) {
 			log_err("rbd_aio_write failed.\n");
 			goto failed_comp;
 		}
 
 	} else if (io_u->ddir == DDIR_READ) {
+#ifdef CONFIG_RBD_BLKIN
+		blkin_init_trace_info(&fri->info);
+		r = rbd_aio_read_traced(rbd->image, io_u->offset, io_u->xfer_buflen,
+					io_u->xfer_buf, fri->completion, &fri->info);
+#else
 		r = rbd_aio_read(rbd->image, io_u->offset, io_u->xfer_buflen,
 					io_u->xfer_buf, fri->completion);
+#endif
 
 		if (r < 0) {
 			log_err("rbd_aio_read failed.\n");
@@ -387,7 +524,7 @@
 failed_comp:
 	rbd_aio_release(fri->completion);
 failed:
-	io_u->error = r;
+	io_u->error = -r;
 	td_verror(td, io_u->error, "xfer");
 	return FIO_Q_COMPLETED;
 }
@@ -395,6 +532,10 @@
 static int fio_rbd_init(struct thread_data *td)
 {
 	int r;
+	struct rbd_data *rbd = td->io_ops_data;
+
+	if (rbd->connected)
+		return 0;
 
 	r = _fio_rbd_connect(td);
 	if (r) {
@@ -410,7 +551,7 @@
 
 static void fio_rbd_cleanup(struct thread_data *td)
 {
-	struct rbd_data *rbd = td->io_ops->data;
+	struct rbd_data *rbd = td->io_ops_data;
 
 	if (rbd) {
 		_fio_rbd_disconnect(rbd);
@@ -425,20 +566,15 @@
 	rbd_image_info_t info;
 	struct fio_file *f;
 	struct rbd_data *rbd = NULL;
-	int major, minor, extra;
 	int r;
 
-	/* log version of librbd. No cluster connection required. */
-	rbd_version(&major, &minor, &extra);
-	log_info("rbd engine: RBD version: %d.%d.%d\n", major, minor, extra);
-
 	/* allocate engine specific structure to deal with librbd. */
 	r = _fio_setup_rbd_data(td, &rbd);
 	if (r) {
 		log_err("fio_setup_rbd_data failed.\n");
 		goto cleanup;
 	}
-	td->io_ops->data = rbd;
+	td->io_ops_data = rbd;
 
 	/* librbd does not allow us to run first in the main thread and later
 	 * in a fork child. It needs to be the same process context all the
@@ -455,13 +591,19 @@
 		log_err("fio_rbd_connect failed.\n");
 		goto cleanup;
 	}
+	rbd->connected = true;
 
 	/* get size of the RADOS block device */
 	r = rbd_stat(rbd->image, &info, sizeof(info));
 	if (r < 0) {
 		log_err("rbd_status failed.\n");
-		goto disconnect;
+		goto cleanup;
+	} else if (info.size == 0) {
+		log_err("image size should be larger than zero.\n");
+		r = -EINVAL;
+		goto cleanup;
 	}
+
 	dprint(FD_IO, "rbd-engine: image size: %lu\n", info.size);
 
 	/* taken from "net" engine. Pretend we deal with files,
@@ -476,14 +618,8 @@
 	f = td->files[0];
 	f->real_file_size = info.size;
 
-	/* disconnect, then we were only connected to determine
-	 * the size of the RBD.
-	 */
-	_fio_rbd_disconnect(rbd);
 	return 0;
 
-disconnect:
-	_fio_rbd_disconnect(rbd);
 cleanup:
 	fio_rbd_cleanup(td);
 	return r;
@@ -497,7 +633,7 @@
 static int fio_rbd_invalidate(struct thread_data *td, struct fio_file *f)
 {
 #if defined(CONFIG_RBD_INVAL)
-	struct rbd_data *rbd = td->io_ops->data;
+	struct rbd_data *rbd = td->io_ops_data;
 
 	return rbd_invalidate_cache(rbd->image);
 #else
diff --git a/engines/rdma.c b/engines/rdma.c
index 5081202..10e60dc 100644
--- a/engines/rdma.c
+++ b/engines/rdma.c
@@ -41,6 +41,7 @@
 
 #include "../fio.h"
 #include "../hash.h"
+#include "../optgroup.h"
 
 #include <rdma/rdma_cma.h>
 #include <infiniband/arch.h>
@@ -55,6 +56,77 @@
 	FIO_RDMA_CHA_RECV
 };
 
+struct rdmaio_options {
+	struct thread_data *td;
+	unsigned int port;
+	enum rdma_io_mode verb;
+};
+
+static int str_hostname_cb(void *data, const char *input)
+{
+	struct rdmaio_options *o = data;
+
+	if (o->td->o.filename)
+		free(o->td->o.filename);
+	o->td->o.filename = strdup(input);
+	return 0;
+}
+
+static struct fio_option options[] = {
+	{
+		.name	= "hostname",
+		.lname	= "rdma engine hostname",
+		.type	= FIO_OPT_STR_STORE,
+		.cb	= str_hostname_cb,
+		.help	= "Hostname for RDMA IO engine",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_RDMA,
+	},
+	{
+		.name	= "port",
+		.lname	= "rdma engine port",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct rdmaio_options, port),
+		.minval	= 1,
+		.maxval	= 65535,
+		.help	= "Port to use for RDMA connections",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_RDMA,
+	},
+	{
+		.name	= "verb",
+		.lname	= "RDMA engine verb",
+		.alias	= "proto",
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct rdmaio_options, verb),
+		.help	= "RDMA engine verb",
+		.def	= "write",
+		.posval = {
+			  { .ival = "write",
+			    .oval = FIO_RDMA_MEM_WRITE,
+			    .help = "Memory Write",
+			  },
+			  { .ival = "read",
+			    .oval = FIO_RDMA_MEM_READ,
+			    .help = "Memory Read",
+			  },
+			  { .ival = "send",
+			    .oval = FIO_RDMA_CHA_SEND,
+			    .help = "Posted Send",
+			  },
+			  { .ival = "recv",
+			    .oval = FIO_RDMA_CHA_RECV,
+			    .help = "Posted Receive",
+			  },
+		},
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_RDMA,
+	},
+	{
+		.name	= NULL,
+	},
+};
+
 struct remote_u {
 	uint64_t buf;
 	uint32_t rkey;
@@ -66,6 +138,7 @@
 	uint32_t nr;		/* client: io depth
 				   server: number of records for memory semantic
 				 */
+	uint32_t max_bs;        /* maximum block size */
 	struct remote_u rmt_us[FIO_RDMA_MAX_IO_DEPTH];
 };
 
@@ -118,13 +191,22 @@
 
 static int client_recv(struct thread_data *td, struct ibv_wc *wc)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
+	unsigned int max_bs;
 
 	if (wc->byte_len != sizeof(rd->recv_buf)) {
 		log_err("Received bogus data, size %d\n", wc->byte_len);
 		return 1;
 	}
 
+	max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]);
+	if (max_bs > ntohl(rd->recv_buf.max_bs)) {
+		log_err("fio: Server's block size (%d) must be greater than or "
+			"equal to the client's block size (%d)!\n",
+			ntohl(rd->recv_buf.max_bs), max_bs);
+		return 1;
+	}
+
 	/* store mr info for MEMORY semantic */
 	if ((rd->rdma_protocol == FIO_RDMA_MEM_WRITE) ||
 	    (rd->rdma_protocol == FIO_RDMA_MEM_READ)) {
@@ -150,7 +232,8 @@
 
 static int server_recv(struct thread_data *td, struct ibv_wc *wc)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
+	unsigned int max_bs;
 
 	if (wc->wr_id == FIO_RDMA_MAX_IO_DEPTH) {
 		rd->rdma_protocol = ntohl(rd->recv_buf.mode);
@@ -158,6 +241,15 @@
 		/* CHANNEL semantic, do nothing */
 		if (rd->rdma_protocol == FIO_RDMA_CHA_SEND)
 			rd->rdma_protocol = FIO_RDMA_CHA_RECV;
+
+		max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]);
+		if (max_bs < ntohl(rd->recv_buf.max_bs)) {
+			log_err("fio: Server's block size (%d) must be greater than or "
+				"equal to the client's block size (%d)!\n",
+				max_bs, ntohl(rd->recv_buf.max_bs));
+			return 1;
+		}
+
 	}
 
 	return 0;
@@ -165,7 +257,7 @@
 
 static int cq_event_handler(struct thread_data *td, enum ibv_wc_opcode opcode)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	struct ibv_wc wc;
 	struct rdma_io_u_data *r_io_u_d;
 	int ret;
@@ -186,9 +278,12 @@
 
 		case IBV_WC_RECV:
 			if (rd->is_client == 1)
-				client_recv(td, &wc);
+				ret = client_recv(td, &wc);
 			else
-				server_recv(td, &wc);
+				ret = server_recv(td, &wc);
+
+			if (ret)
+				return -1;
 
 			if (wc.wr_id == FIO_RDMA_MAX_IO_DEPTH)
 				break;
@@ -258,6 +353,7 @@
 		}
 		rd->cq_event_num++;
 	}
+
 	if (ret) {
 		log_err("fio: poll error %d\n", ret);
 		return 1;
@@ -272,7 +368,7 @@
  */
 static int rdma_poll_wait(struct thread_data *td, enum ibv_wc_opcode opcode)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	struct ibv_cq *ev_cq;
 	void *ev_ctx;
 	int ret;
@@ -297,7 +393,7 @@
 	}
 
 	ret = cq_event_handler(td, opcode);
-	if (ret < 1)
+	if (ret == 0)
 		goto again;
 
 	ibv_ack_cq_events(rd->cq, ret);
@@ -309,7 +405,7 @@
 
 static int fio_rdmaio_setup_qp(struct thread_data *td)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	struct ibv_qp_init_attr init_attr;
 	int qp_depth = td->o.iodepth * 2;	/* 2 times of io depth */
 
@@ -319,7 +415,7 @@
 		rd->pd = ibv_alloc_pd(rd->cm_id->verbs);
 
 	if (rd->pd == NULL) {
-		log_err("fio: ibv_alloc_pd fail\n");
+		log_err("fio: ibv_alloc_pd fail: %m\n");
 		return 1;
 	}
 
@@ -328,7 +424,7 @@
 	else
 		rd->channel = ibv_create_comp_channel(rd->cm_id->verbs);
 	if (rd->channel == NULL) {
-		log_err("fio: ibv_create_comp_channel fail\n");
+		log_err("fio: ibv_create_comp_channel fail: %m\n");
 		goto err1;
 	}
 
@@ -342,12 +438,12 @@
 		rd->cq = ibv_create_cq(rd->cm_id->verbs,
 				       qp_depth, rd, rd->channel, 0);
 	if (rd->cq == NULL) {
-		log_err("fio: ibv_create_cq failed\n");
+		log_err("fio: ibv_create_cq failed: %m\n");
 		goto err2;
 	}
 
 	if (ibv_req_notify_cq(rd->cq, 0) != 0) {
-		log_err("fio: ibv_create_cq failed\n");
+		log_err("fio: ibv_req_notify_cq failed: %m\n");
 		goto err3;
 	}
 
@@ -363,13 +459,13 @@
 
 	if (rd->is_client == 0) {
 		if (rdma_create_qp(rd->child_cm_id, rd->pd, &init_attr) != 0) {
-			log_err("fio: rdma_create_qp failed\n");
+			log_err("fio: rdma_create_qp failed: %m\n");
 			goto err3;
 		}
 		rd->qp = rd->child_cm_id->qp;
 	} else {
 		if (rdma_create_qp(rd->cm_id, rd->pd, &init_attr) != 0) {
-			log_err("fio: rdma_create_qp failed\n");
+			log_err("fio: rdma_create_qp failed: %m\n");
 			goto err3;
 		}
 		rd->qp = rd->cm_id->qp;
@@ -389,19 +485,19 @@
 
 static int fio_rdmaio_setup_control_msg_buffers(struct thread_data *td)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 
 	rd->recv_mr = ibv_reg_mr(rd->pd, &rd->recv_buf, sizeof(rd->recv_buf),
 				 IBV_ACCESS_LOCAL_WRITE);
 	if (rd->recv_mr == NULL) {
-		log_err("fio: recv_buf reg_mr failed\n");
+		log_err("fio: recv_buf reg_mr failed: %m\n");
 		return 1;
 	}
 
 	rd->send_mr = ibv_reg_mr(rd->pd, &rd->send_buf, sizeof(rd->send_buf),
 				 0);
 	if (rd->send_mr == NULL) {
-		log_err("fio: send_buf reg_mr failed\n");
+		log_err("fio: send_buf reg_mr failed: %m\n");
 		ibv_dereg_mr(rd->recv_mr);
 		return 1;
 	}
@@ -433,7 +529,7 @@
 				  struct rdma_event_channel *channel,
 				  enum rdma_cm_event_type wait_event)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	struct rdma_cm_event *event;
 	int ret;
 
@@ -465,7 +561,7 @@
 
 static int fio_rdmaio_prep(struct thread_data *td, struct io_u *io_u)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	struct rdma_io_u_data *r_io_u_d;
 
 	r_io_u_d = io_u->engine_data;
@@ -508,7 +604,7 @@
 
 static struct io_u *fio_rdmaio_event(struct thread_data *td, int event)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	struct io_u *io_u;
 	int i;
 
@@ -526,7 +622,7 @@
 static int fio_rdmaio_getevents(struct thread_data *td, unsigned int min,
 				unsigned int max, const struct timespec *t)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	enum ibv_wc_opcode comp_opcode;
 	struct ibv_cq *ev_cq;
 	void *ev_ctx;
@@ -588,7 +684,7 @@
 static int fio_rdmaio_send(struct thread_data *td, struct io_u **io_us,
 			   unsigned int nr)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	struct ibv_send_wr *bad_wr;
 #if 0
 	enum ibv_wc_opcode comp_opcode;
@@ -635,7 +731,7 @@
 		}
 
 		if (ibv_post_send(rd->qp, &r_io_u_d->sq_wr, &bad_wr) != 0) {
-			log_err("fio: ibv_post_send fail\n");
+			log_err("fio: ibv_post_send fail: %m\n");
 			return -1;
 		}
 
@@ -651,7 +747,7 @@
 static int fio_rdmaio_recv(struct thread_data *td, struct io_u **io_us,
 			   unsigned int nr)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	struct ibv_recv_wr *bad_wr;
 	struct rdma_io_u_data *r_io_u_d;
 	int i;
@@ -663,7 +759,7 @@
 			r_io_u_d = io_us[i]->engine_data;
 			if (ibv_post_recv(rd->qp, &r_io_u_d->rq_wr, &bad_wr) !=
 			    0) {
-				log_err("fio: ibv_post_recv fail\n");
+				log_err("fio: ibv_post_recv fail: %m\n");
 				return 1;
 			}
 		}
@@ -671,7 +767,7 @@
 		   || (rd->rdma_protocol == FIO_RDMA_MEM_WRITE)) {
 		/* re-post the rq_wr */
 		if (ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr) != 0) {
-			log_err("fio: ibv_post_recv fail\n");
+			log_err("fio: ibv_post_recv fail: %m\n");
 			return 1;
 		}
 
@@ -687,7 +783,7 @@
 
 static int fio_rdmaio_queue(struct thread_data *td, struct io_u *io_u)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 
 	fio_ro_check(td, io_u);
 
@@ -705,7 +801,7 @@
 static void fio_rdmaio_queued(struct thread_data *td, struct io_u **io_us,
 			      unsigned int nr)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	struct timeval now;
 	unsigned int i;
 
@@ -728,7 +824,7 @@
 
 static int fio_rdmaio_commit(struct thread_data *td)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	struct io_u **io_us;
 	int ret;
 
@@ -760,7 +856,7 @@
 
 static int fio_rdmaio_connect(struct thread_data *td, struct fio_file *f)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	struct rdma_conn_param conn_param;
 	struct ibv_send_wr *bad_wr;
 
@@ -770,7 +866,7 @@
 	conn_param.retry_count = 10;
 
 	if (rdma_connect(rd->cm_id, &conn_param) != 0) {
-		log_err("fio: rdma_connect fail\n");
+		log_err("fio: rdma_connect fail: %m\n");
 		return 1;
 	}
 
@@ -785,14 +881,16 @@
 	rd->send_buf.nr = htonl(td->o.iodepth);
 
 	if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) {
-		log_err("fio: ibv_post_send fail");
+		log_err("fio: ibv_post_send fail: %m\n");
 		return 1;
 	}
 
-	rdma_poll_wait(td, IBV_WC_SEND);
+	if (rdma_poll_wait(td, IBV_WC_SEND) < 0)
+		return 1;
 
 	/* wait for remote MR info from server side */
-	rdma_poll_wait(td, IBV_WC_RECV);
+	if (rdma_poll_wait(td, IBV_WC_RECV) < 0)
+		return 1;
 
 	/* In SEND/RECV test, it's a good practice to setup the iodepth
 	 * of the RECV side deeper than that of the SEND side to
@@ -809,9 +907,10 @@
 
 static int fio_rdmaio_accept(struct thread_data *td, struct fio_file *f)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	struct rdma_conn_param conn_param;
 	struct ibv_send_wr *bad_wr;
+	int ret = 0;
 
 	/* rdma_accept() - then wait for accept success */
 	memset(&conn_param, 0, sizeof(conn_param));
@@ -819,7 +918,7 @@
 	conn_param.initiator_depth = 1;
 
 	if (rdma_accept(rd->child_cm_id, &conn_param) != 0) {
-		log_err("fio: rdma_accept\n");
+		log_err("fio: rdma_accept: %m\n");
 		return 1;
 	}
 
@@ -830,16 +929,17 @@
 	}
 
 	/* wait for request */
-	rdma_poll_wait(td, IBV_WC_RECV);
+	ret = rdma_poll_wait(td, IBV_WC_RECV) < 0;
 
 	if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) {
-		log_err("fio: ibv_post_send fail");
+		log_err("fio: ibv_post_send fail: %m\n");
 		return 1;
 	}
 
-	rdma_poll_wait(td, IBV_WC_SEND);
+	if (rdma_poll_wait(td, IBV_WC_SEND) < 0)
+		return 1;
 
-	return 0;
+	return ret;
 }
 
 static int fio_rdmaio_open_file(struct thread_data *td, struct fio_file *f)
@@ -852,7 +952,7 @@
 
 static int fio_rdmaio_close_file(struct thread_data *td, struct fio_file *f)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	struct ibv_send_wr *bad_wr;
 
 	/* unregister rdma buffer */
@@ -865,7 +965,7 @@
 				     || (rd->rdma_protocol ==
 					 FIO_RDMA_MEM_READ))) {
 		if (ibv_post_send(rd->qp, &rd->sq_wr, &bad_wr) != 0) {
-			log_err("fio: ibv_post_send fail");
+			log_err("fio: ibv_post_send fail: %m\n");
 			return 1;
 		}
 
@@ -908,7 +1008,7 @@
 static int fio_rdmaio_setup_connect(struct thread_data *td, const char *host,
 				    unsigned short port)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	struct ibv_recv_wr *bad_wr;
 	int err;
 
@@ -972,8 +1072,11 @@
 
 static int fio_rdmaio_setup_listen(struct thread_data *td, short port)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 	struct ibv_recv_wr *bad_wr;
+	int state = td->runstate;
+
+	td_set_runstate(td, TD_SETTING_UP);
 
 	rd->addr.sin_family = AF_INET;
 	rd->addr.sin_addr.s_addr = htonl(INADDR_ANY);
@@ -981,15 +1084,17 @@
 
 	/* rdma_listen */
 	if (rdma_bind_addr(rd->cm_id, (struct sockaddr *)&rd->addr) != 0) {
-		log_err("fio: rdma_bind_addr fail\n");
+		log_err("fio: rdma_bind_addr fail: %m\n");
 		return 1;
 	}
 
 	if (rdma_listen(rd->cm_id, 3) != 0) {
-		log_err("fio: rdma_listen fail\n");
+		log_err("fio: rdma_listen fail: %m\n");
 		return 1;
 	}
 
+	log_info("fio: waiting for connection\n");
+
 	/* wait for CONNECT_REQUEST */
 	if (get_next_channel_event
 	    (td, rd->cm_channel, RDMA_CM_EVENT_CONNECT_REQUEST) != 0) {
@@ -1005,10 +1110,11 @@
 
 	/* post recv buf */
 	if (ibv_post_recv(rd->qp, &rd->rq_wr, &bad_wr) != 0) {
-		log_err("fio: ibv_post_recv fail\n");
+		log_err("fio: ibv_post_recv fail: %m\n");
 		return 1;
 	}
 
+	td_set_runstate(td, state);
 	return 0;
 }
 
@@ -1046,13 +1152,64 @@
 	return 0;
 }
 
+static int compat_options(struct thread_data *td)
+{
+	// The original RDMA engine had an ugly '/' separator
+	// on the filename for its options. This function
+	// retains backwards compatibility with it.
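+	// For example (illustrative), filename=192.168.0.1/4001/rdma_write
+	// yields port 4001 and the FIO_RDMA_MEM_WRITE verb, leaving the
+	// host "192.168.0.1" in td->o.filename.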
+
+	struct rdmaio_options *o = td->eo;
+	char *modep, *portp;
+	char *filename = td->o.filename;
+
+	if (!filename)
+		return 0;
+
+	portp = strchr(filename, '/');
+	if (portp == NULL)
+		return 0;
+
+	*portp = '\0';
+	portp++;
+
+	o->port = strtol(portp, NULL, 10);
+	if (!o->port || o->port > 65535)
+		goto bad_host;
+
+	modep = strchr(portp, '/');
+	if (modep != NULL) {
+		*modep = '\0';
+		modep++;
+	}
+
+	if (modep) {
+		if (!strncmp("rdma_write", modep, strlen(modep)) ||
+		    !strncmp("RDMA_WRITE", modep, strlen(modep)))
+			o->verb = FIO_RDMA_MEM_WRITE;
+		else if (!strncmp("rdma_read", modep, strlen(modep)) ||
+			 !strncmp("RDMA_READ", modep, strlen(modep)))
+			o->verb = FIO_RDMA_MEM_READ;
+		else if (!strncmp("send", modep, strlen(modep)) ||
+			 !strncmp("SEND", modep, strlen(modep)))
+			o->verb = FIO_RDMA_CHA_SEND;
+		else
+			goto bad_host;
+	} else
+		o->verb = FIO_RDMA_MEM_WRITE;
+
+
+	return 0;
+
+bad_host:
+	log_err("fio: bad rdma host/port/protocol: %s\n", td->o.filename);
+	return 1;
+}
+
 static int fio_rdmaio_init(struct thread_data *td)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
+	struct rdmaio_options *o = td->eo;
 	unsigned int max_bs;
-	unsigned int port;
-	char host[64], buf[128];
-	char *sep, *portp, *modep;
 	int ret, i;
 
 	if (td_rw(td)) {
@@ -1064,59 +1221,30 @@
 		return 1;
 	}
 
+	if (compat_options(td))
+		return 1;
+
+	if (!o->port) {
+		log_err("fio: no port has been specified which is required "
+			"for the rdma engine\n");
+		return 1;
+	}
+
 	if (check_set_rlimits(td))
 		return 1;
 
-	strcpy(buf, td->o.filename);
-
-	sep = strchr(buf, '/');
-	if (!sep)
-		goto bad_host;
-
-	*sep = '\0';
-	sep++;
-	strcpy(host, buf);
-	if (!strlen(host))
-		goto bad_host;
-
-	modep = NULL;
-	portp = sep;
-	sep = strchr(portp, '/');
-	if (sep) {
-		*sep = '\0';
-		modep = sep + 1;
-	}
-
-	port = strtol(portp, NULL, 10);
-	if (!port || port > 65535)
-		goto bad_host;
-
-	if (modep) {
-		if (!strncmp("rdma_write", modep, strlen(modep)) ||
-		    !strncmp("RDMA_WRITE", modep, strlen(modep)))
-			rd->rdma_protocol = FIO_RDMA_MEM_WRITE;
-		else if (!strncmp("rdma_read", modep, strlen(modep)) ||
-			 !strncmp("RDMA_READ", modep, strlen(modep)))
-			rd->rdma_protocol = FIO_RDMA_MEM_READ;
-		else if (!strncmp("send", modep, strlen(modep)) ||
-			 !strncmp("SEND", modep, strlen(modep)))
-			rd->rdma_protocol = FIO_RDMA_CHA_SEND;
-		else
-			goto bad_host;
-	} else
-		rd->rdma_protocol = FIO_RDMA_MEM_WRITE;
-
+	rd->rdma_protocol = o->verb;
 	rd->cq_event_num = 0;
 
 	rd->cm_channel = rdma_create_event_channel();
 	if (!rd->cm_channel) {
-		log_err("fio: rdma_create_event_channel fail\n");
+		log_err("fio: rdma_create_event_channel fail: %m\n");
 		return 1;
 	}
 
 	ret = rdma_create_id(rd->cm_channel, &rd->cm_id, rd, RDMA_PS_TCP);
 	if (ret) {
-		log_err("fio: rdma_create_id fail\n");
+		log_err("fio: rdma_create_id fail: %m\n");
 		return 1;
 	}
 
@@ -1143,14 +1271,17 @@
 
 	if (td_read(td)) {	/* READ as the server */
 		rd->is_client = 0;
+		td->flags |= TD_F_NO_PROGRESS;
 		/* server rd->rdma_buf_len will be setup after got request */
-		ret = fio_rdmaio_setup_listen(td, port);
+		ret = fio_rdmaio_setup_listen(td, o->port);
 	} else {		/* WRITE as the client */
 		rd->is_client = 1;
-		ret = fio_rdmaio_setup_connect(td, host, port);
+		ret = fio_rdmaio_setup_connect(td, td->o.filename, o->port);
 	}
 
 	max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]);
+	rd->send_buf.max_bs = htonl(max_bs);
+
 	/* register each io_u in the free list */
 	for (i = 0; i < td->io_u_freelist.nr; i++) {
 		struct io_u *io_u = td->io_u_freelist.io_us[i];
@@ -1164,7 +1295,7 @@
 				      IBV_ACCESS_REMOTE_READ |
 				      IBV_ACCESS_REMOTE_WRITE);
 		if (io_u->mr == NULL) {
-			log_err("fio: ibv_reg_mr io_u failed\n");
+			log_err("fio: ibv_reg_mr io_u failed: %m\n");
 			return 1;
 		}
 
@@ -1181,14 +1312,11 @@
 	rd->send_buf.nr = htonl(i);
 
 	return ret;
-bad_host:
-	log_err("fio: bad rdma host/port/protocol: %s\n", td->o.filename);
-	return 1;
 }
 
 static void fio_rdmaio_cleanup(struct thread_data *td)
 {
-	struct rdmaio_data *rd = td->io_ops->data;
+	struct rdmaio_data *rd = td->io_ops_data;
 
 	if (rd)
 		free(rd);
@@ -1198,31 +1326,39 @@
 {
 	struct rdmaio_data *rd;
 
-	if (!td->io_ops->data) {
+	if (!td->files_index) {
+		add_file(td, td->o.filename ?: "rdma", 0, 0);
+		td->o.nr_files = td->o.nr_files ?: 1;
+		td->o.open_files++;
+	}
+
+	if (!td->io_ops_data) {
 		rd = malloc(sizeof(*rd));
 
 		memset(rd, 0, sizeof(*rd));
-		init_rand_seed(&rd->rand_state, (unsigned int) GOLDEN_RATIO_PRIME);
-		td->io_ops->data = rd;
+		init_rand_seed(&rd->rand_state, (unsigned int) GOLDEN_RATIO_PRIME, 0);
+		td->io_ops_data = rd;
 	}
 
 	return 0;
 }
 
 static struct ioengine_ops ioengine_rw = {
-	.name		= "rdma",
-	.version	= FIO_IOOPS_VERSION,
-	.setup		= fio_rdmaio_setup,
-	.init		= fio_rdmaio_init,
-	.prep		= fio_rdmaio_prep,
-	.queue		= fio_rdmaio_queue,
-	.commit		= fio_rdmaio_commit,
-	.getevents	= fio_rdmaio_getevents,
-	.event		= fio_rdmaio_event,
-	.cleanup	= fio_rdmaio_cleanup,
-	.open_file	= fio_rdmaio_open_file,
-	.close_file	= fio_rdmaio_close_file,
-	.flags		= FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO,
+	.name			= "rdma",
+	.version		= FIO_IOOPS_VERSION,
+	.setup			= fio_rdmaio_setup,
+	.init			= fio_rdmaio_init,
+	.prep			= fio_rdmaio_prep,
+	.queue			= fio_rdmaio_queue,
+	.commit			= fio_rdmaio_commit,
+	.getevents		= fio_rdmaio_getevents,
+	.event			= fio_rdmaio_event,
+	.cleanup		= fio_rdmaio_cleanup,
+	.open_file		= fio_rdmaio_open_file,
+	.close_file		= fio_rdmaio_close_file,
+	.flags			= FIO_DISKLESSIO | FIO_UNIDIR | FIO_PIPEIO,
+	.options		= options,
+	.option_struct_size	= sizeof(struct rdmaio_options),
 };
 
 static void fio_init fio_rdmaio_register(void)
diff --git a/engines/sg.c b/engines/sg.c
index 6272b79..2148e87 100644
--- a/engines/sg.c
+++ b/engines/sg.c
@@ -15,8 +15,13 @@
 
 #ifdef FIO_HAVE_SGIO
 
+#define MAX_10B_LBA  0xFFFFFFFFULL
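+// A 10-byte CDB carries a 4-byte LBA, so MAX_10B_LBA is the largest block
+// it can address; e.g. with 512-byte sectors that is roughly 2 TiB, and
+// larger LBAs use the 16-byte commands selected in fio_sgio_prep() below.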
+#define SCSI_TIMEOUT_MS 30000   // 30 second timeout; currently no method to override
+#define MAX_SB 64               // sense block maximum return size
+
 struct sgio_cmd {
-	unsigned char cdb[10];
+	unsigned char cdb[16];      // enhanced from 10 to support 16 byte commands
+	unsigned char sb[MAX_SB];   // add sense block to commands
 	int nr;
 };
 
@@ -41,6 +46,8 @@
 	hdr->interface_id = 'S';
 	hdr->cmdp = sc->cdb;
 	hdr->cmd_len = sizeof(sc->cdb);
+	hdr->sbp = sc->sb;
+	hdr->mx_sb_len = sizeof(sc->sb);
 	hdr->pack_id = io_u->index;
 	hdr->usr_ptr = io_u;
 
@@ -61,12 +68,41 @@
 	return 0;
 }
 
+static int sg_fd_read(int fd, void *data, size_t size)
+{
+	int err = 0;
+
+	while (size) {
+		ssize_t ret;
+
+		ret = read(fd, data, size);
+		if (ret < 0) {
+			if (errno == EAGAIN || errno == EINTR)
+				continue;
+			err = errno;
+			break;
+		} else if (!ret)
+			break;
+		else {
+			data += ret;
+			size -= ret;
+		}
+	}
+
+	if (err)
+		return err;
+	if (size)
+		return EAGAIN;
+
+	return 0;
+}
+
 static int fio_sgio_getevents(struct thread_data *td, unsigned int min,
 			      unsigned int max,
 			      const struct timespec fio_unused *t)
 {
-	struct sgio_data *sd = td->io_ops->data;
-	int left = max, ret, r = 0;
+	struct sgio_data *sd = td->io_ops_data;
+	int left = max, eventNum, ret, r = 0;
 	void *buf = sd->sgbuf;
 	unsigned int i, events;
 	struct fio_file *f;
@@ -90,6 +126,8 @@
 	while (left) {
 		void *p;
 
+		dprint(FD_IO, "sgio_getevents: sd %p: left=%d\n", sd, left);
+
 		do {
 			if (!min)
 				break;
@@ -114,20 +152,21 @@
 		p = buf;
 		events = 0;
 		for_each_file(td, f, i) {
-			ret = read(f->fd, p, left * sizeof(struct sg_io_hdr));
-			if (ret < 0) {
-				if (errno == EAGAIN)
-					continue;
-				r = -errno;
-				td_verror(td, errno, "read");
-				break;
-			} else if (ret) {
-				p += ret;
-				events += ret / sizeof(struct sg_io_hdr);
+			for (eventNum = 0; eventNum < left; eventNum++) {
+				ret = sg_fd_read(f->fd, p, sizeof(struct sg_io_hdr));
+				dprint(FD_IO, "sgio_getevents: ret: %d\n", ret);
+				if (ret) {
+					r = -ret;
+					td_verror(td, r, "sg_read");
+					break;
+				}
+				p += sizeof(struct sg_io_hdr);
+				events++;
+				dprint(FD_IO, "sgio_getevents: events: %d\n", events);
 			}
 		}
 
-		if (r < 0)
+		if (r < 0 && !events)
 			break;
 		if (!events) {
 			usleep(1000);
@@ -139,8 +178,15 @@
 
 		for (i = 0; i < events; i++) {
 			struct sg_io_hdr *hdr = (struct sg_io_hdr *) buf + i;
-
 			sd->events[i] = hdr->usr_ptr;
+
+			/* record if an io error occurred, ignore resid */
+			if (hdr->info & SG_INFO_CHECK) {
+				struct io_u *io_u;
+				io_u = (struct io_u *)(hdr->usr_ptr);
+				memcpy((void*)&(io_u->hdr), (void*)hdr, sizeof(struct sg_io_hdr));
+				sd->events[i]->error = EIO;
+			}
 		}
 	}
 
@@ -160,7 +206,7 @@
 static int fio_sgio_ioctl_doio(struct thread_data *td,
 			       struct fio_file *f, struct io_u *io_u)
 {
-	struct sgio_data *sd = td->io_ops->data;
+	struct sgio_data *sd = td->io_ops_data;
 	struct sg_io_hdr *hdr = &io_u->hdr;
 	int ret;
 
@@ -170,6 +216,10 @@
 	if (ret < 0)
 		return ret;
 
+	/* record if an io error occurred */
+	if (hdr->info & SG_INFO_CHECK)
+		io_u->error = EIO;
+
 	return FIO_Q_COMPLETED;
 }
 
@@ -186,6 +236,11 @@
 		ret = read(f->fd, hdr, sizeof(*hdr));
 		if (ret < 0)
 			return ret;
+
+		/* record if an io error occurred */
+		if (hdr->info & SG_INFO_CHECK)
+			io_u->error = EIO;
+
 		return FIO_Q_COMPLETED;
 	}
 
@@ -195,52 +250,88 @@
 static int fio_sgio_doio(struct thread_data *td, struct io_u *io_u, int do_sync)
 {
 	struct fio_file *f = io_u->file;
+	int ret;
 
-	if (f->filetype == FIO_TYPE_BD)
-		return fio_sgio_ioctl_doio(td, f, io_u);
+	if (f->filetype == FIO_TYPE_BLOCK) {
+		ret = fio_sgio_ioctl_doio(td, f, io_u);
+		td->error = io_u->error;
+	} else {
+		ret = fio_sgio_rw_doio(f, io_u, do_sync);
+		if (do_sync)
+			td->error = io_u->error;
+	}
 
-	return fio_sgio_rw_doio(f, io_u, do_sync);
+	return ret;
 }
 
 static int fio_sgio_prep(struct thread_data *td, struct io_u *io_u)
 {
 	struct sg_io_hdr *hdr = &io_u->hdr;
-	struct sgio_data *sd = td->io_ops->data;
-	int nr_blocks, lba;
+	struct sgio_data *sd = td->io_ops_data;
+	long long nr_blocks, lba;
 
 	if (io_u->xfer_buflen & (sd->bs - 1)) {
 		log_err("read/write not sector aligned\n");
 		return EINVAL;
 	}
 
+	nr_blocks = io_u->xfer_buflen / sd->bs;
+	lba = io_u->offset / sd->bs;
+
 	if (io_u->ddir == DDIR_READ) {
 		sgio_hdr_init(sd, hdr, io_u, 1);
 
 		hdr->dxfer_direction = SG_DXFER_FROM_DEV;
-		hdr->cmdp[0] = 0x28;
+		if (lba < MAX_10B_LBA)
+			hdr->cmdp[0] = 0x28; // read(10)
+		else
+			hdr->cmdp[0] = 0x88; // read(16)
 	} else if (io_u->ddir == DDIR_WRITE) {
 		sgio_hdr_init(sd, hdr, io_u, 1);
 
 		hdr->dxfer_direction = SG_DXFER_TO_DEV;
-		hdr->cmdp[0] = 0x2a;
+		if (lba < MAX_10B_LBA)
+			hdr->cmdp[0] = 0x2a; // write(10)
+		else
+			hdr->cmdp[0] = 0x8a; // write(16)
 	} else {
 		sgio_hdr_init(sd, hdr, io_u, 0);
-
 		hdr->dxfer_direction = SG_DXFER_NONE;
-		hdr->cmdp[0] = 0x35;
+		if (lba < MAX_10B_LBA)
+			hdr->cmdp[0] = 0x35; // synccache(10)
+		else
+			hdr->cmdp[0] = 0x91; // synccache(16)
 	}
 
+	/*
+	 * for synccache, we leave lba and length at 0 to sync all
+	 * blocks on the medium.
+	 */
 	if (hdr->dxfer_direction != SG_DXFER_NONE) {
-		nr_blocks = io_u->xfer_buflen / sd->bs;
-		lba = io_u->offset / sd->bs;
-		hdr->cmdp[2] = (unsigned char) ((lba >> 24) & 0xff);
-		hdr->cmdp[3] = (unsigned char) ((lba >> 16) & 0xff);
-		hdr->cmdp[4] = (unsigned char) ((lba >>  8) & 0xff);
-		hdr->cmdp[5] = (unsigned char) (lba & 0xff);
-		hdr->cmdp[7] = (unsigned char) ((nr_blocks >> 8) & 0xff);
-		hdr->cmdp[8] = (unsigned char) (nr_blocks & 0xff);
+		if (lba < MAX_10B_LBA) {
+			hdr->cmdp[2] = (unsigned char) ((lba >> 24) & 0xff);
+			hdr->cmdp[3] = (unsigned char) ((lba >> 16) & 0xff);
+			hdr->cmdp[4] = (unsigned char) ((lba >>  8) & 0xff);
+			hdr->cmdp[5] = (unsigned char) (lba & 0xff);
+			hdr->cmdp[7] = (unsigned char) ((nr_blocks >> 8) & 0xff);
+			hdr->cmdp[8] = (unsigned char) (nr_blocks & 0xff);
+		} else {
+			hdr->cmdp[2] = (unsigned char) ((lba >> 56) & 0xff);
+			hdr->cmdp[3] = (unsigned char) ((lba >> 48) & 0xff);
+			hdr->cmdp[4] = (unsigned char) ((lba >> 40) & 0xff);
+			hdr->cmdp[5] = (unsigned char) ((lba >> 32) & 0xff);
+			hdr->cmdp[6] = (unsigned char) ((lba >> 24) & 0xff);
+			hdr->cmdp[7] = (unsigned char) ((lba >> 16) & 0xff);
+			hdr->cmdp[8] = (unsigned char) ((lba >>  8) & 0xff);
+			hdr->cmdp[9] = (unsigned char) (lba & 0xff);
+			hdr->cmdp[10] = (unsigned char) ((nr_blocks >> 32) & 0xff);
+			hdr->cmdp[11] = (unsigned char) ((nr_blocks >> 16) & 0xff);
+			hdr->cmdp[12] = (unsigned char) ((nr_blocks >> 8) & 0xff);
+			hdr->cmdp[13] = (unsigned char) (nr_blocks & 0xff);
+		}
 	}
 
+	hdr->timeout = SCSI_TIMEOUT_MS;
 	return 0;
 }
 
@@ -273,42 +364,103 @@
 
 static struct io_u *fio_sgio_event(struct thread_data *td, int event)
 {
-	struct sgio_data *sd = td->io_ops->data;
+	struct sgio_data *sd = td->io_ops_data;
 
 	return sd->events[event];
 }
 
-static int fio_sgio_get_bs(struct thread_data *td, unsigned int *bs)
+static int fio_sgio_read_capacity(struct thread_data *td, unsigned int *bs,
+				  unsigned long long *max_lba)
 {
-	struct sgio_data *sd = td->io_ops->data;
-	struct io_u io_u;
-	struct sg_io_hdr *hdr;
-	unsigned char buf[8];
+	/*
+	 * need to do read capacity operation w/o benefit of sd or
+	 * io_u structures, which are not initialized until later.
+	 */
+	struct sg_io_hdr hdr;
+	unsigned char cmd[16];
+	unsigned char sb[64];
+	unsigned char buf[32];  // read capacity return
 	int ret;
+	int fd = -1;
 
-	memset(&io_u, 0, sizeof(io_u));
-	io_u.file = td->files[0];
+	struct fio_file *f = td->files[0];
 
-	hdr = &io_u.hdr;
-	sgio_hdr_init(sd, hdr, &io_u, 0);
+	/* open file independent of rest of application */
+	fd = open(f->file_name, O_RDONLY);
+	if (fd < 0)
+		return -errno;
+
+	memset(&hdr, 0, sizeof(hdr));
+	memset(cmd, 0, sizeof(cmd));
+	memset(sb, 0, sizeof(sb));
 	memset(buf, 0, sizeof(buf));
 
-	hdr->cmdp[0] = 0x25;
-	hdr->dxfer_direction = SG_DXFER_FROM_DEV;
-	hdr->dxferp = buf;
-	hdr->dxfer_len = sizeof(buf);
+	/* First let's try a 10 byte read capacity. */
+	hdr.interface_id = 'S';
+	hdr.cmdp = cmd;
+	hdr.cmd_len = 10;
+	hdr.sbp = sb;
+	hdr.mx_sb_len = sizeof(sb);
+	hdr.timeout = SCSI_TIMEOUT_MS;
+	hdr.cmdp[0] = 0x25;  // Read Capacity(10)
+	hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+	hdr.dxferp = buf;
+	hdr.dxfer_len = sizeof(buf);
 
-	ret = fio_sgio_doio(td, &io_u, 1);
-	if (ret)
+	ret = ioctl(fd, SG_IO, &hdr);
+	if (ret < 0) {
+		close(fd);
 		return ret;
+	}
 
-	*bs = (buf[4] << 24) | (buf[5] << 16) | (buf[6] << 8) | buf[7];
+	*bs	 = (buf[4] << 24) | (buf[5] << 16) | (buf[6] << 8) | buf[7];
+	*max_lba = ((buf[0] << 24) | (buf[1] << 16) | (buf[2] << 8) | buf[3]) & MAX_10B_LBA;  // for some reason max_lba is being sign extended even though unsigned.
+
+	/*
+	 * If max lba masked by MAX_10B_LBA equals MAX_10B_LBA,
+	 * then we need to retry with the 16 byte Read Capacity command.
+	 */
+	if (*max_lba == MAX_10B_LBA) {
+		hdr.cmd_len = 16;
+		hdr.cmdp[0] = 0x9e; // service action
+		hdr.cmdp[1] = 0x10; // Read Capacity(16)
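+		/* bytes 10-13 hold the allocation length of the response buffer */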
+		hdr.cmdp[10] = (unsigned char) ((sizeof(buf) >> 24) & 0xff);
+		hdr.cmdp[11] = (unsigned char) ((sizeof(buf) >> 16) & 0xff);
+		hdr.cmdp[12] = (unsigned char) ((sizeof(buf) >> 8) & 0xff);
+		hdr.cmdp[13] = (unsigned char) (sizeof(buf) & 0xff);
+
+		hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+		hdr.dxferp = buf;
+		hdr.dxfer_len = sizeof(buf);
+
+		ret = ioctl(fd, SG_IO, &hdr);
+		if (ret < 0) {
+			close(fd);
+			return ret;
+		}
+
+		/* record if an io error occurred */
+		if (hdr.info & SG_INFO_CHECK)
+			td_verror(td, EIO, "fio_sgio_read_capacity");
+
+		*bs = (buf[8] << 24) | (buf[9] << 16) | (buf[10] << 8) | buf[11];
+		*max_lba = ((unsigned long long)buf[0] << 56) |
+				((unsigned long long)buf[1] << 48) |
+				((unsigned long long)buf[2] << 40) |
+				((unsigned long long)buf[3] << 32) |
+				((unsigned long long)buf[4] << 24) |
+				((unsigned long long)buf[5] << 16) |
+				((unsigned long long)buf[6] << 8) |
+				(unsigned long long)buf[7];
+	}
+
+	close(fd);
 	return 0;
 }
 
 static void fio_sgio_cleanup(struct thread_data *td)
 {
-	struct sgio_data *sd = td->io_ops->data;
+	struct sgio_data *sd = td->io_ops_data;
 
 	if (sd) {
 		free(sd->events);
@@ -336,8 +488,8 @@
 	memset(sd->fd_flags, 0, sizeof(int) * td->o.nr_files);
 	sd->sgbuf = malloc(sizeof(struct sg_io_hdr) * td->o.iodepth);
 	memset(sd->sgbuf, 0, sizeof(struct sg_io_hdr) * td->o.iodepth);
-
-	td->io_ops->data = sd;
+	sd->type_checked = 0;
+	td->io_ops_data = sd;
 
 	/*
 	 * we want to do it, regardless of whether odirect is set or not
@@ -348,10 +500,11 @@
 
 static int fio_sgio_type_check(struct thread_data *td, struct fio_file *f)
 {
-	struct sgio_data *sd = td->io_ops->data;
-	unsigned int bs;
+	struct sgio_data *sd = td->io_ops_data;
+	unsigned int bs = 0;
+	unsigned long long max_lba = 0;
 
-	if (f->filetype == FIO_TYPE_BD) {
+	if (f->filetype == FIO_TYPE_BLOCK) {
 		if (ioctl(f->fd, BLKSSZGET, &bs) < 0) {
 			td_verror(td, errno, "ioctl");
 			return 1;
@@ -364,27 +517,38 @@
 			return 1;
 		}
 
-		ret = fio_sgio_get_bs(td, &bs);
-		if (ret)
+		ret = fio_sgio_read_capacity(td, &bs, &max_lba);
+		if (ret) {
+			td_verror(td, td->error, "fio_sgio_read_capacity");
+			log_err("ioengine sg unable to read capacity successfully\n");
 			return 1;
+		}
 	} else {
-		log_err("ioengine sg only works on block devices\n");
+		td_verror(td, EINVAL, "wrong file type");
+		log_err("ioengine sg only works on block or character devices\n");
 		return 1;
 	}
 
 	sd->bs = bs;
+	// Determine size of commands needed based on max_lba
+	if (max_lba >= MAX_10B_LBA) {
+		dprint(FD_IO, "sgio_type_check: using 16 byte read/write "
+			"commands for lba above 0x%016llx/0x%016llx\n",
+			MAX_10B_LBA, max_lba);
+	}
 
-	if (f->filetype == FIO_TYPE_BD) {
+	if (f->filetype == FIO_TYPE_BLOCK) {
 		td->io_ops->getevents = NULL;
 		td->io_ops->event = NULL;
 	}
+	sd->type_checked = 1;
 
 	return 0;
 }
 
 static int fio_sgio_open(struct thread_data *td, struct fio_file *f)
 {
-	struct sgio_data *sd = td->io_ops->data;
+	struct sgio_data *sd = td->io_ops_data;
 	int ret;
 
 	ret = generic_open_file(td, f);
@@ -399,6 +563,251 @@
 	return 0;
 }
 
+/*
+ * Build an error string with details about the driver, host or scsi
+ * error contained in the sg header. The caller will use it as necessary.
+ */
+static char *fio_sgio_errdetails(struct io_u *io_u)
+{
+	struct sg_io_hdr *hdr = &io_u->hdr;
+#define MAXERRDETAIL 1024
+#define MAXMSGCHUNK  128
+	char *msg, msgchunk[MAXMSGCHUNK], *ret = NULL;
+	int i;
+
+	msg = calloc(1, MAXERRDETAIL);
+
+	/*
+	 * can't seem to find sg_err.h, so I'll just echo the define values
+	 * so others can search the internet for clearer clues to their meaning.
+	 */
+	if (hdr->info & SG_INFO_CHECK) {
+		ret = msg;
+		if (hdr->host_status) {
+			snprintf(msgchunk, MAXMSGCHUNK, "SG Host Status: 0x%02x; ", hdr->host_status);
+			strlcat(msg, msgchunk, MAXERRDETAIL);
+			switch (hdr->host_status) {
+			case 0x01:
+				strlcat(msg, "SG_ERR_DID_NO_CONNECT", MAXERRDETAIL);
+				break;
+			case 0x02:
+				strlcat(msg, "SG_ERR_DID_BUS_BUSY", MAXERRDETAIL);
+				break;
+			case 0x03:
+				strlcat(msg, "SG_ERR_DID_TIME_OUT", MAXERRDETAIL);
+				break;
+			case 0x04:
+				strlcat(msg, "SG_ERR_DID_BAD_TARGET", MAXERRDETAIL);
+				break;
+			case 0x05:
+				strlcat(msg, "SG_ERR_DID_ABORT", MAXERRDETAIL);
+				break;
+			case 0x06:
+				strlcat(msg, "SG_ERR_DID_PARITY", MAXERRDETAIL);
+				break;
+			case 0x07:
+				strlcat(msg, "SG_ERR_DID_ERROR (internal error)", MAXERRDETAIL);
+				break;
+			case 0x08:
+				strlcat(msg, "SG_ERR_DID_RESET", MAXERRDETAIL);
+				break;
+			case 0x09:
+				strlcat(msg, "SG_ERR_DID_BAD_INTR (unexpected)", MAXERRDETAIL);
+				break;
+			case 0x0a:
+				strlcat(msg, "SG_ERR_DID_PASSTHROUGH", MAXERRDETAIL);
+				break;
+			case 0x0b:
+				strlcat(msg, "SG_ERR_DID_SOFT_ERROR (driver retry?)", MAXERRDETAIL);
+				break;
+			case 0x0c:
+				strlcat(msg, "SG_ERR_DID_IMM_RETRY", MAXERRDETAIL);
+				break;
+			case 0x0d:
+				strlcat(msg, "SG_ERR_DID_REQUEUE", MAXERRDETAIL);
+				break;
+			case 0x0e:
+				strlcat(msg, "SG_ERR_DID_TRANSPORT_DISRUPTED", MAXERRDETAIL);
+				break;
+			case 0x0f:
+				strlcat(msg, "SG_ERR_DID_TRANSPORT_FAILFAST", MAXERRDETAIL);
+				break;
+			case 0x10:
+				strlcat(msg, "SG_ERR_DID_TARGET_FAILURE", MAXERRDETAIL);
+				break;
+			case 0x11:
+				strlcat(msg, "SG_ERR_DID_NEXUS_FAILURE", MAXERRDETAIL);
+				break;
+			case 0x12:
+				strlcat(msg, "SG_ERR_DID_ALLOC_FAILURE", MAXERRDETAIL);
+				break;
+			case 0x13:
+				strlcat(msg, "SG_ERR_DID_MEDIUM_ERROR", MAXERRDETAIL);
+				break;
+			default:
+				strlcat(msg, "Unknown", MAXERRDETAIL);
+				break;
+			}
+			strlcat(msg, ". ", MAXERRDETAIL);
+		}
+		if (hdr->driver_status) {
+			snprintf(msgchunk, MAXMSGCHUNK, "SG Driver Status: 0x%02x; ", hdr->driver_status);
+			strlcat(msg, msgchunk, MAXERRDETAIL);
+			switch (hdr->driver_status & 0x0F) {
+			case 0x01:
+				strlcat(msg, "SG_ERR_DRIVER_BUSY", MAXERRDETAIL);
+				break;
+			case 0x02:
+				strlcat(msg, "SG_ERR_DRIVER_SOFT", MAXERRDETAIL);
+				break;
+			case 0x03:
+				strlcat(msg, "SG_ERR_DRIVER_MEDIA", MAXERRDETAIL);
+				break;
+			case 0x04:
+				strlcat(msg, "SG_ERR_DRIVER_ERROR", MAXERRDETAIL);
+				break;
+			case 0x05:
+				strlcat(msg, "SG_ERR_DRIVER_INVALID", MAXERRDETAIL);
+				break;
+			case 0x06:
+				strlcat(msg, "SG_ERR_DRIVER_TIMEOUT", MAXERRDETAIL);
+				break;
+			case 0x07:
+				strlcat(msg, "SG_ERR_DRIVER_HARD", MAXERRDETAIL);
+				break;
+			case 0x08:
+				strlcat(msg, "SG_ERR_DRIVER_SENSE", MAXERRDETAIL);
+				break;
+			default:
+				strlcat(msg, "Unknown", MAXERRDETAIL);
+				break;
+			}
+			strlcat(msg, "; ", MAXERRDETAIL);
+			switch (hdr->driver_status & 0xF0) {
+			case 0x10:
+				strlcat(msg, "SG_ERR_SUGGEST_RETRY", MAXERRDETAIL);
+				break;
+			case 0x20:
+				strlcat(msg, "SG_ERR_SUGGEST_ABORT", MAXERRDETAIL);
+				break;
+			case 0x30:
+				strlcat(msg, "SG_ERR_SUGGEST_REMAP", MAXERRDETAIL);
+				break;
+			case 0x40:
+				strlcat(msg, "SG_ERR_SUGGEST_DIE", MAXERRDETAIL);
+				break;
+			case 0x80:
+				strlcat(msg, "SG_ERR_SUGGEST_SENSE", MAXERRDETAIL);
+				break;
+			}
+			strlcat(msg, ". ", MAXERRDETAIL);
+		}
+		if (hdr->status) {
+			snprintf(msgchunk, MAXMSGCHUNK, "SG SCSI Status: 0x%02x; ", hdr->status);
+			strlcat(msg, msgchunk, MAXERRDETAIL);
+			// SCSI 3 status codes
+			switch (hdr->status) {
+			case 0x02:
+				strlcat(msg, "CHECK_CONDITION", MAXERRDETAIL);
+				break;
+			case 0x04:
+				strlcat(msg, "CONDITION_MET", MAXERRDETAIL);
+				break;
+			case 0x08:
+				strlcat(msg, "BUSY", MAXERRDETAIL);
+				break;
+			case 0x10:
+				strlcat(msg, "INTERMEDIATE", MAXERRDETAIL);
+				break;
+			case 0x14:
+				strlcat(msg, "INTERMEDIATE_CONDITION_MET", MAXERRDETAIL);
+				break;
+			case 0x18:
+				strlcat(msg, "RESERVATION_CONFLICT", MAXERRDETAIL);
+				break;
+			case 0x22:
+				strlcat(msg, "COMMAND_TERMINATED", MAXERRDETAIL);
+				break;
+			case 0x28:
+				strlcat(msg, "TASK_SET_FULL", MAXERRDETAIL);
+				break;
+			case 0x30:
+				strlcat(msg, "ACA_ACTIVE", MAXERRDETAIL);
+				break;
+			case 0x40:
+				strlcat(msg, "TASK_ABORTED", MAXERRDETAIL);
+				break;
+			default:
+				strlcat(msg, "Unknown", MAXERRDETAIL);
+				break;
+			}
+			strlcat(msg, ". ", MAXERRDETAIL);
+		}
+		if (hdr->sb_len_wr) {
+			snprintf(msgchunk, MAXMSGCHUNK, "Sense Data (%d bytes):", hdr->sb_len_wr);
+			strlcat(msg, msgchunk, MAXERRDETAIL);
+			for (i = 0; i < hdr->sb_len_wr; i++) {
+				snprintf(msgchunk, MAXMSGCHUNK, " %02x", hdr->sbp[i]);
+				strlcat(msg, msgchunk, MAXERRDETAIL);
+			}
+			strlcat(msg, ". ", MAXERRDETAIL);
+		}
+		if (hdr->resid != 0) {
+			snprintf(msgchunk, MAXMSGCHUNK, "SG Driver: %d bytes out of %d not transferred. ", hdr->resid, hdr->dxfer_len);
+			strlcat(msg, msgchunk, MAXERRDETAIL);
+			ret = msg;
+		}
+	}
+
+	if (!ret)
+		ret = strdup("SG Driver did not report a Host, Driver or Device check");
+
+	return ret;
+}
+
+/*
+ * get max file size from read capacity.
+ */
+static int fio_sgio_get_file_size(struct thread_data *td, struct fio_file *f)
+{
+	/*
+	 * get_file_size is called even before sgio_init, so none of the
+	 * sg_io structures in thread_data are initialized yet.  We
+	 * therefore issue the ReadCapacity without any of those helpers.
+	 * One side effect is that ReadCapacity may get called up to 4
+	 * times on each open: readcap(10) followed by readcap(16) if
+	 * needed, just to get the file size, and then again when
+	 * "type_check" is called during structure initialization.  I'm
+	 * not sure how to prevent this little inefficiency.
+	 */
+	unsigned int bs = 0;
+	unsigned long long max_lba = 0;
+	int ret;
+
+	if (fio_file_size_known(f))
+		return 0;
+
+	if (f->filetype != FIO_TYPE_BLOCK && f->filetype != FIO_TYPE_CHAR) {
+		td_verror(td, EINVAL, "wrong file type");
+		log_err("ioengine sg only works on block or character devices\n");
+		return 1;
+	}
+
+	ret = fio_sgio_read_capacity(td, &bs, &max_lba);
+	if (ret) {
+		td_verror(td, td->error, "fio_sgio_read_capacity");
+		log_err("ioengine sg unable to successfully execute read capacity to get block size and maximum lba\n");
+		return 1;
+	}
+
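+	/*
+	 * Capacity is (max_lba + 1) blocks of bs bytes each; e.g. 512-byte
+	 * blocks with max_lba 0x1fffff give 0x200000 * 512 = 1 GiB.
+	 */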
+	f->real_file_size = (max_lba + 1) * bs;
+	fio_file_set_size_known(f);
+	return 0;
+}
+
+
 static struct ioengine_ops ioengine = {
 	.name		= "sg",
 	.version	= FIO_IOOPS_VERSION,
@@ -406,11 +815,12 @@
 	.prep		= fio_sgio_prep,
 	.queue		= fio_sgio_queue,
 	.getevents	= fio_sgio_getevents,
+	.errdetails	= fio_sgio_errdetails,
 	.event		= fio_sgio_event,
 	.cleanup	= fio_sgio_cleanup,
 	.open_file	= fio_sgio_open,
 	.close_file	= generic_close_file,
-	.get_file_size	= generic_get_file_size,
+	.get_file_size	= fio_sgio_get_file_size,
 	.flags		= FIO_SYNCIO | FIO_RAWIO,
 };
 
diff --git a/engines/skeleton_external.c b/engines/skeleton_external.c
index 63a6f8d..4bebcc4 100644
--- a/engines/skeleton_external.c
+++ b/engines/skeleton_external.c
@@ -109,11 +109,11 @@
 
 /*
  * Hook for opening the given file. Unless the engine has special
- * needs, it usually just provides generic_file_open() as the handler.
+ * needs, it usually just provides generic_open_file() as the handler.
  */
 static int fio_skeleton_open(struct thread_data *td, struct fio_file *f)
 {
-	return generic_file_open(td, f);
+	return generic_open_file(td, f);
 }
 
 /*
@@ -121,12 +121,12 @@
  */
 static int fio_skeleton_close(struct thread_data *td, struct fio_file *f)
 {
-	generic_file_close(td, f);
+	return generic_close_file(td, f);
 }
 
 /*
  * Note that the structure is exported, so that fio can get it via
- * dlsym(..., "ioengine");
+ * dlsym(..., "ioengine"); for (and only for) external engines.
  */
 struct ioengine_ops ioengine = {
 	.name		= "engine_name",
diff --git a/engines/solarisaio.c b/engines/solarisaio.c
index 55a0cb9..151f31d 100644
--- a/engines/solarisaio.c
+++ b/engines/solarisaio.c
@@ -28,7 +28,7 @@
 static int fio_solarisaio_prep(struct thread_data fio_unused *td,
 			    struct io_u *io_u)
 {
-	struct solarisaio_data *sd = td->io_ops->data;
+	struct solarisaio_data *sd = td->io_ops_data;
 
 	io_u->resultp.aio_return = AIO_INPROGRESS;
 	io_u->engine_data = sd;
@@ -75,7 +75,7 @@
 static int fio_solarisaio_getevents(struct thread_data *td, unsigned int min,
 				    unsigned int max, const struct timespec *t)
 {
-	struct solarisaio_data *sd = td->io_ops->data;
+	struct solarisaio_data *sd = td->io_ops_data;
 	struct timeval tv;
 	int ret;
 
@@ -100,7 +100,7 @@
 
 static struct io_u *fio_solarisaio_event(struct thread_data *td, int event)
 {
-	struct solarisaio_data *sd = td->io_ops->data;
+	struct solarisaio_data *sd = td->io_ops_data;
 
 	return sd->aio_events[event];
 }
@@ -108,7 +108,7 @@
 static int fio_solarisaio_queue(struct thread_data fio_unused *td,
 			      struct io_u *io_u)
 {
-	struct solarisaio_data *sd = td->io_ops->data;
+	struct solarisaio_data *sd = td->io_ops_data;
 	struct fio_file *f = io_u->file;
 	off_t off;
 	int ret;
@@ -155,7 +155,7 @@
 
 static void fio_solarisaio_cleanup(struct thread_data *td)
 {
-	struct solarisaio_data *sd = td->io_ops->data;
+	struct solarisaio_data *sd = td->io_ops_data;
 
 	if (sd) {
 		free(sd->aio_events);
@@ -204,7 +204,7 @@
 	fio_solarisaio_init_sigio();
 #endif
 
-	td->io_ops->data = sd;
+	td->io_ops_data = sd;
 	return 0;
 }
 
diff --git a/engines/splice.c b/engines/splice.c
index f35ae17..eba093e 100644
--- a/engines/splice.c
+++ b/engines/splice.c
@@ -28,7 +28,7 @@
  */
 static int fio_splice_read_old(struct thread_data *td, struct io_u *io_u)
 {
-	struct spliceio_data *sd = td->io_ops->data;
+	struct spliceio_data *sd = td->io_ops_data;
 	struct fio_file *f = io_u->file;
 	int ret, ret2, buflen;
 	off_t offset;
@@ -72,7 +72,7 @@
  */
 static int fio_splice_read(struct thread_data *td, struct io_u *io_u)
 {
-	struct spliceio_data *sd = td->io_ops->data;
+	struct spliceio_data *sd = td->io_ops_data;
 	struct fio_file *f = io_u->file;
 	struct iovec iov;
 	int ret , buflen, mmap_len;
@@ -166,7 +166,7 @@
  */
 static int fio_splice_write(struct thread_data *td, struct io_u *io_u)
 {
-	struct spliceio_data *sd = td->io_ops->data;
+	struct spliceio_data *sd = td->io_ops_data;
 	struct iovec iov = {
 		.iov_base = io_u->xfer_buf,
 		.iov_len = io_u->xfer_buflen,
@@ -201,7 +201,7 @@
 
 static int fio_spliceio_queue(struct thread_data *td, struct io_u *io_u)
 {
-	struct spliceio_data *sd = td->io_ops->data;
+	struct spliceio_data *sd = td->io_ops_data;
 	int ret = 0;
 
 	fio_ro_check(td, io_u);
@@ -247,7 +247,7 @@
 
 static void fio_spliceio_cleanup(struct thread_data *td)
 {
-	struct spliceio_data *sd = td->io_ops->data;
+	struct spliceio_data *sd = td->io_ops_data;
 
 	if (sd) {
 		close(sd->pipe[0]);
@@ -284,7 +284,7 @@
 	if (td_read(td))
 		td->o.mem_align = 1;
 
-	td->io_ops->data = sd;
+	td->io_ops_data = sd;
 	return 0;
 }
 
diff --git a/engines/sync.c b/engines/sync.c
index 48aafff..e76bbbb 100644
--- a/engines/sync.c
+++ b/engines/sync.c
@@ -13,11 +13,12 @@
 #include <assert.h>
 
 #include "../fio.h"
+#include "../optgroup.h"
 
 /*
  * Sync engine uses engine_data to store last offset
  */
-#define LAST_POS(f)	((f)->engine_data)
+#define LAST_POS(f)	((f)->engine_pos)
 
 struct syncio_data {
 	struct iovec *iovecs;
@@ -31,6 +32,28 @@
 	enum fio_ddir last_ddir;
 };
 
+#ifdef FIO_HAVE_PWRITEV2
+struct psyncv2_options {
+	void *pad;
+	unsigned int hipri;
+};
+
+static struct fio_option options[] = {
+	{
+		.name	= "hipri",
+		.lname	= "RWF_HIPRI",
+		.type	= FIO_OPT_STR_SET,
+		.off1	= offsetof(struct psyncv2_options, hipri),
+		.help	= "Set RWF_HIPRI for pwritev2/preadv2",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_INVALID,
+	},
+	{
+		.name	= NULL,
+	},
+};
+#endif
+
 static int fio_syncio_prep(struct thread_data *td, struct io_u *io_u)
 {
 	struct fio_file *f = io_u->file;
@@ -74,7 +97,7 @@
 #ifdef CONFIG_PWRITEV
 static int fio_pvsyncio_queue(struct thread_data *td, struct io_u *io_u)
 {
-	struct syncio_data *sd = td->io_ops->data;
+	struct syncio_data *sd = td->io_ops_data;
 	struct iovec *iov = &sd->iovecs[0];
 	struct fio_file *f = io_u->file;
 	int ret;
@@ -98,6 +121,38 @@
 }
 #endif
 
+#ifdef FIO_HAVE_PWRITEV2
+static int fio_pvsyncio2_queue(struct thread_data *td, struct io_u *io_u)
+{
+	struct syncio_data *sd = td->io_ops_data;
+	struct psyncv2_options *o = td->eo;
+	struct iovec *iov = &sd->iovecs[0];
+	struct fio_file *f = io_u->file;
+	int ret, flags = 0;
+
+	fio_ro_check(td, io_u);
+
+	if (o->hipri)
+		flags |= RWF_HIPRI;
+
+	iov->iov_base = io_u->xfer_buf;
+	iov->iov_len = io_u->xfer_buflen;
+
+	if (io_u->ddir == DDIR_READ)
+		ret = preadv2(f->fd, iov, 1, io_u->offset, flags);
+	else if (io_u->ddir == DDIR_WRITE)
+		ret = pwritev2(f->fd, iov, 1, io_u->offset, flags);
+	else if (io_u->ddir == DDIR_TRIM) {
+		do_io_u_trim(td, io_u);
+		return FIO_Q_COMPLETED;
+	} else
+		ret = do_io_u_sync(td, io_u);
+
+	return fio_io_end(td, io_u, ret);
+}
+#endif
+
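+/*
+ * Minimal job sketch for this engine (requires kernel preadv2/pwritev2
+ * support; names and target below are illustrative only):
+ *
+ *	[pvsync2-randread]
+ *	ioengine=pvsync2
+ *	hipri		; set RWF_HIPRI on each preadv2/pwritev2
+ *	rw=randread
+ *	bs=4k
+ *	filename=/dev/nvme0n1
+ */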
+
 static int fio_psyncio_queue(struct thread_data *td, struct io_u *io_u)
 {
 	struct fio_file *f = io_u->file;
@@ -142,7 +197,7 @@
 				 unsigned int max,
 				 const struct timespec fio_unused *t)
 {
-	struct syncio_data *sd = td->io_ops->data;
+	struct syncio_data *sd = td->io_ops_data;
 	int ret;
 
 	if (min) {
@@ -157,14 +212,14 @@
 
 static struct io_u *fio_vsyncio_event(struct thread_data *td, int event)
 {
-	struct syncio_data *sd = td->io_ops->data;
+	struct syncio_data *sd = td->io_ops_data;
 
 	return sd->io_us[event];
 }
 
 static int fio_vsyncio_append(struct thread_data *td, struct io_u *io_u)
 {
-	struct syncio_data *sd = td->io_ops->data;
+	struct syncio_data *sd = td->io_ops_data;
 
 	if (ddir_sync(io_u->ddir))
 		return 0;
@@ -191,7 +246,7 @@
 
 static int fio_vsyncio_queue(struct thread_data *td, struct io_u *io_u)
 {
-	struct syncio_data *sd = td->io_ops->data;
+	struct syncio_data *sd = td->io_ops_data;
 
 	fio_ro_check(td, io_u);
 
@@ -231,7 +286,7 @@
  */
 static int fio_vsyncio_end(struct thread_data *td, ssize_t bytes)
 {
-	struct syncio_data *sd = td->io_ops->data;
+	struct syncio_data *sd = td->io_ops_data;
 	struct io_u *io_u;
 	unsigned int i;
 	int err;
@@ -271,7 +326,7 @@
 
 static int fio_vsyncio_commit(struct thread_data *td)
 {
-	struct syncio_data *sd = td->io_ops->data;
+	struct syncio_data *sd = td->io_ops_data;
 	struct fio_file *f;
 	ssize_t ret;
 
@@ -309,17 +364,19 @@
 	sd->iovecs = malloc(td->o.iodepth * sizeof(struct iovec));
 	sd->io_us = malloc(td->o.iodepth * sizeof(struct io_u *));
 
-	td->io_ops->data = sd;
+	td->io_ops_data = sd;
 	return 0;
 }
 
 static void fio_vsyncio_cleanup(struct thread_data *td)
 {
-	struct syncio_data *sd = td->io_ops->data;
+	struct syncio_data *sd = td->io_ops_data;
 
-	free(sd->iovecs);
-	free(sd->io_us);
-	free(sd);
+	if (sd) {
+		free(sd->iovecs);
+		free(sd->io_us);
+		free(sd);
+	}
 }
 
 static struct ioengine_ops ioengine_rw = {
@@ -372,6 +429,22 @@
 };
 #endif
 
+#ifdef FIO_HAVE_PWRITEV2
+static struct ioengine_ops ioengine_pvrw2 = {
+	.name		= "pvsync2",
+	.version	= FIO_IOOPS_VERSION,
+	.init		= fio_vsyncio_init,
+	.cleanup	= fio_vsyncio_cleanup,
+	.queue		= fio_pvsyncio2_queue,
+	.open_file	= generic_open_file,
+	.close_file	= generic_close_file,
+	.get_file_size	= generic_get_file_size,
+	.flags		= FIO_SYNCIO,
+	.options	= options,
+	.option_struct_size	= sizeof(struct psyncv2_options),
+};
+#endif
+
 static void fio_init fio_syncio_register(void)
 {
 	register_ioengine(&ioengine_rw);
@@ -380,6 +453,9 @@
 #ifdef CONFIG_PWRITEV
 	register_ioengine(&ioengine_pvrw);
 #endif
+#ifdef FIO_HAVE_PWRITEV2
+	register_ioengine(&ioengine_pvrw2);
+#endif
 }
 
 static void fio_exit fio_syncio_unregister(void)
@@ -390,4 +466,7 @@
 #ifdef CONFIG_PWRITEV
 	unregister_ioengine(&ioengine_pvrw);
 #endif
+#ifdef FIO_HAVE_PWRITEV2
+	unregister_ioengine(&ioengine_pvrw2);
+#endif
 }
diff --git a/engines/windowsaio.c b/engines/windowsaio.c
index ec8222c..f5cb048 100644
--- a/engines/windowsaio.c
+++ b/engines/windowsaio.c
@@ -84,7 +84,7 @@
 		}
 	}
 
-	td->io_ops->data = wd;
+	td->io_ops_data = wd;
 
 	if (!rc) {
 		struct thread_ctx *ctx;
@@ -97,7 +97,7 @@
 			rc = 1;
 		}
 
-		wd = td->io_ops->data;
+		wd = td->io_ops_data;
 		wd->iothread_running = TRUE;
 		wd->iocp = hFile;
 
@@ -113,10 +113,15 @@
 
 		if (!rc)
 		{
+			DWORD threadid;
+
 			ctx->iocp = hFile;
 			ctx->wd = wd;
-			wd->iothread = CreateThread(NULL, 0, IoCompletionRoutine, ctx, 0, NULL);
-			if (wd->iothread == NULL)
+			wd->iothread = CreateThread(NULL, 0, IoCompletionRoutine, ctx, 0, &threadid);
+
+			if (wd->iothread != NULL)
+				fio_setaffinity(threadid, td->o.cpumask);
+			else
 				log_err("windowsaio: failed to create io completion thread\n");
 		}
 
@@ -131,7 +136,7 @@
 {
 	struct windowsaio_data *wd;
 
-	wd = td->io_ops->data;
+	wd = td->io_ops_data;
 
 	if (wd != NULL) {
 		wd->iothread_running = FALSE;
@@ -143,7 +148,7 @@
 		free(wd->aio_events);
 		free(wd);
 
-		td->io_ops->data = NULL;
+		td->io_ops_data = NULL;
 	}
 }
 
@@ -203,10 +208,10 @@
 
 	/* Only set up the completion port and thread if we're not just
 	 * querying the device size */
-	if (!rc && td->io_ops->data != NULL) {
+	if (!rc && td->io_ops_data != NULL) {
 		struct windowsaio_data *wd;
 
-		wd = td->io_ops->data;
+		wd = td->io_ops_data;
 
 		if (CreateIoCompletionPort(f->hFile, wd->iocp, 0, 0) == NULL) {
 			log_err("windowsaio: failed to create io completion port\n");
@@ -251,7 +256,7 @@
 
 static struct io_u* fio_windowsaio_event(struct thread_data *td, int event)
 {
-	struct windowsaio_data *wd = td->io_ops->data;
+	struct windowsaio_data *wd = td->io_ops_data;
 	return wd->aio_events[event];
 }
 
@@ -259,7 +264,7 @@
 				    unsigned int max,
 				    const struct timespec *t)
 {
-	struct windowsaio_data *wd = td->io_ops->data;
+	struct windowsaio_data *wd = td->io_ops_data;
 	unsigned int dequeued = 0;
 	struct io_u *io_u;
 	int i;
@@ -284,14 +289,13 @@
 
 			if (fov->io_complete) {
 				fov->io_complete = FALSE;
-				ResetEvent(fov->o.hEvent);
 				wd->aio_events[dequeued] = io_u;
 				dequeued++;
 			}
 
-			if (dequeued >= min)
-				break;
 		}
+		if (dequeued >= min)
+			break;
 
 		if (dequeued < min) {
 			status = WaitForSingleObject(wd->iocomplete_event, mswait);
@@ -310,23 +314,22 @@
 {
 	struct fio_overlapped *o = io_u->engine_data;
 	LPOVERLAPPED lpOvl = &o->o;
-	DWORD iobytes;
 	BOOL success = FALSE;
 	int rc = FIO_Q_COMPLETED;
 
 	fio_ro_check(td, io_u);
 
-	lpOvl->Internal = STATUS_PENDING;
+	lpOvl->Internal = 0;
 	lpOvl->InternalHigh = 0;
 	lpOvl->Offset = io_u->offset & 0xFFFFFFFF;
 	lpOvl->OffsetHigh = io_u->offset >> 32;
 
 	switch (io_u->ddir) {
 	case DDIR_WRITE:
-		success = WriteFile(io_u->file->hFile, io_u->xfer_buf, io_u->xfer_buflen, &iobytes, lpOvl);
+		success = WriteFile(io_u->file->hFile, io_u->xfer_buf, io_u->xfer_buflen, NULL, lpOvl);
 		break;
 	case DDIR_READ:
-		success = ReadFile(io_u->file->hFile, io_u->xfer_buf, io_u->xfer_buflen, &iobytes, lpOvl);
+		success = ReadFile(io_u->file->hFile, io_u->xfer_buf, io_u->xfer_buflen, NULL, lpOvl);
 		break;
 	case DDIR_SYNC:
 	case DDIR_DATASYNC:
@@ -403,7 +406,6 @@
 	struct fio_overlapped *o = io_u->engine_data;
 
 	if (o) {
-		CloseHandle(o->o.hEvent);
 		io_u->engine_data = NULL;
 		free(o);
 	}
@@ -416,13 +418,7 @@
 	o = malloc(sizeof(*o));
 	o->io_complete = FALSE;
 	o->io_u = io_u;
-	o->o.hEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
-	if (o->o.hEvent == NULL) {
-		log_err("windowsaio: failed to create event handle\n");
-		free(o);
-		return 1;
-	}
-
+	o->o.hEvent = NULL;
 	io_u->engine_data = o;
 	return 0;
 }
diff --git a/eta.c b/eta.c
index 167bf5f..adf7f94 100644
--- a/eta.c
+++ b/eta.c
@@ -6,6 +6,7 @@
 #include <string.h>
 
 #include "fio.h"
+#include "lib/pow2.h"
 
 static char __run_str[REAL_MAX_JOBS + 1];
 static char run_str[__THREAD_RUNSTR_SZ(REAL_MAX_JOBS)];
@@ -122,6 +123,11 @@
 	unsigned int d, h, m, s;
 	int disp_hour = 0;
 
+	if (eta_sec == -1) {
+		sprintf(str, "--");
+		return;
+	}
+
 	s = eta_sec % 60;
 	eta_sec /= 60;
 	m = eta_sec % 60;
@@ -145,7 +151,7 @@
 /*
  * Best effort calculation of the estimated pending runtime of a job.
  */
-static int thread_eta(struct thread_data *td)
+static unsigned long thread_eta(struct thread_data *td)
 {
 	unsigned long long bytes_total, bytes_done;
 	unsigned long eta_sec = 0;
@@ -157,6 +163,9 @@
 
 	bytes_total = td->total_io_size;
 
+	if (td->flags & TD_F_NO_PROGRESS)
+		return -1;
+
 	if (td->o.fill_device && td->o.size  == -1ULL) {
 		if (!td->fill_device_size || td->fill_device_size == -1ULL)
 			return 0;
@@ -216,7 +225,11 @@
 			}
 		}
 
-		eta_sec = (unsigned long) (elapsed * (1.0 / perc)) - elapsed;
+		if (perc == 0.0) {
+			eta_sec = timeout;
+		} else {
+			eta_sec = (unsigned long) (elapsed * (1.0 / perc)) - elapsed;
+		}
 
 		if (td->o.timeout &&
 		    eta_sec > (timeout + done_secs - elapsed))
@@ -226,7 +239,7 @@
 			|| td->runstate == TD_SETTING_UP
 			|| td->runstate == TD_RAMP
 			|| td->runstate == TD_PRE_READING) {
-		int t_eta = 0, r_eta = 0;
+		int64_t t_eta = 0, r_eta = 0;
 		unsigned long long rate_bytes;
 
 		/*
@@ -238,10 +251,13 @@
 			uint64_t start_delay = td->o.start_delay;
 			uint64_t ramp_time = td->o.ramp_time;
 
-			t_eta = __timeout + start_delay + ramp_time;
+			t_eta = __timeout + start_delay;
+			if (!td->ramp_time_over) {
+				t_eta += ramp_time;
+			}
 			t_eta /= 1000000ULL;
 
-			if (in_ramp_time(td)) {
+			if ((td->runstate == TD_RAMP) && in_ramp_time(td)) {
 				unsigned long ramp_left;
 
 				ramp_left = mtime_since_now(&td->epoch);
@@ -250,9 +266,16 @@
 					t_eta -= ramp_left;
 			}
 		}
-		rate_bytes = ddir_rw_sum(td->o.rate);
+		rate_bytes = 0;
+		if (td_read(td))
+			rate_bytes  = td->o.rate[DDIR_READ];
+		if (td_write(td))
+			rate_bytes += td->o.rate[DDIR_WRITE];
+		if (td_trim(td))
+			rate_bytes += td->o.rate[DDIR_TRIM];
+
 		if (rate_bytes) {
-			r_eta = (bytes_total / 1024) / rate_bytes;
+			r_eta = bytes_total / rate_bytes;
 			r_eta += (td->o.start_delay / 1000000ULL);
 		}
 
@@ -276,19 +299,24 @@
 
 static void calc_rate(int unified_rw_rep, unsigned long mtime,
 		      unsigned long long *io_bytes,
-		      unsigned long long *prev_io_bytes, unsigned int *rate)
+		      unsigned long long *prev_io_bytes, uint64_t *rate)
 {
 	int i;
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-		unsigned long long diff;
+		unsigned long long diff, this_rate;
 
 		diff = io_bytes[i] - prev_io_bytes[i];
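+		/* bytes moved in the window -> KiB/s, e.g. 10 MiB in 1000 msec is 10240 KiB/s */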
+		if (mtime)
+			this_rate = ((1000 * diff) / mtime) / 1024; /* KiB/s */
+		else
+			this_rate = 0;
+
 		if (unified_rw_rep) {
 			rate[i] = 0;
-			rate[0] += ((1000 * diff) / mtime) / 1024;
+			rate[0] += this_rate;
 		} else
-			rate[i] = ((1000 * diff) / mtime) / 1024;
+			rate[i] = this_rate;
 
 		prev_io_bytes[i] = io_bytes[i];
 	}
@@ -301,14 +329,19 @@
 	int i;
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-		unsigned long long diff;
+		unsigned long long diff, this_iops;
 
 		diff = io_iops[i] - prev_io_iops[i];
+		if (mtime)
+			this_iops = (diff * 1000) / mtime;
+		else
+			this_iops = 0;
+
 		if (unified_rw_rep) {
 			iops[i] = 0;
-			iops[0] += (diff * 1000) / mtime;
+			iops[0] += this_iops;
 		} else
-			iops[i] = (diff * 1000) / mtime;
+			iops[i] = this_iops;
 
 		prev_io_iops[i] = io_iops[i];
 	}
@@ -318,11 +351,11 @@
  * Print status of the jobs we know about. This includes rate estimates,
  * ETA, thread state, etc.
  */
-int calc_thread_status(struct jobs_eta *je, int force)
+bool calc_thread_status(struct jobs_eta *je, int force)
 {
 	struct thread_data *td;
 	int i, unified_rw_rep;
-	unsigned long rate_time, disp_time, bw_avg_time, *eta_secs;
+	uint64_t rate_time, disp_time, bw_avg_time, *eta_secs;
 	unsigned long long io_bytes[DDIR_RWDIR_CNT];
 	unsigned long long io_iops[DDIR_RWDIR_CNT];
 	struct timeval now;
@@ -333,14 +366,14 @@
 	static struct timeval rate_prev_time, disp_prev_time;
 
 	if (!force) {
-		if (output_format != FIO_OUTPUT_NORMAL &&
+		if (!(output_format & FIO_OUTPUT_NORMAL) &&
 		    f_out == stdout)
-			return 0;
+			return false;
 		if (temp_stall_ts || eta_print == FIO_ETA_NEVER)
-			return 0;
+			return false;
 
 		if (!isatty(STDOUT_FILENO) && (eta_print != FIO_ETA_ALWAYS))
-			return 0;
+			return false;
 	}
 
 	if (!ddir_rw_sum(rate_io_bytes))
@@ -348,8 +381,8 @@
 	if (!ddir_rw_sum(disp_io_bytes))
 		fill_start_time(&disp_prev_time);
 
-	eta_secs = malloc(thread_number * sizeof(unsigned long));
-	memset(eta_secs, 0, thread_number * sizeof(unsigned long));
+	eta_secs = malloc(thread_number * sizeof(uint64_t));
+	memset(eta_secs, 0, thread_number * sizeof(uint64_t));
 
 	je->elapsed_sec = (mtime_since_genesis() + 999) / 1000;
 
@@ -407,7 +440,7 @@
 		if (td->runstate > TD_SETTING_UP) {
 			int ddir;
 
-			for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
+			for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
 				if (unified_rw_rep) {
 					io_bytes[0] += td->io_bytes[ddir];
 					io_iops[0] += td->io_blocks[ddir];
@@ -419,19 +452,25 @@
 		}
 	}
 
-	if (exitall_on_terminate)
+	if (exitall_on_terminate) {
 		je->eta_sec = INT_MAX;
-	else
-		je->eta_sec = 0;
-
-	for_each_td(td, i) {
-		if (exitall_on_terminate) {
+		for_each_td(td, i) {
 			if (eta_secs[i] < je->eta_sec)
 				je->eta_sec = eta_secs[i];
-		} else {
-			if (eta_secs[i] > je->eta_sec)
-				je->eta_sec = eta_secs[i];
 		}
+	} else {
+		unsigned long eta_stone = 0;
+
+		je->eta_sec = 0;
+		for_each_td(td, i) {
+			if ((td->runstate == TD_NOT_CREATED) && td->o.stonewall)
+				eta_stone += eta_secs[i];
+			else {
+				if (eta_secs[i] > je->eta_sec)
+					je->eta_sec = eta_secs[i];
+			}
+		}
+		je->eta_sec += eta_stone;
 	}
 
 	free(eta_secs);
@@ -443,9 +482,9 @@
 		calc_rate(unified_rw_rep, rate_time, io_bytes, rate_io_bytes,
 				je->rate);
 		memcpy(&rate_prev_time, &now, sizeof(now));
-		add_agg_sample(je->rate[DDIR_READ], DDIR_READ, 0);
-		add_agg_sample(je->rate[DDIR_WRITE], DDIR_WRITE, 0);
-		add_agg_sample(je->rate[DDIR_TRIM], DDIR_TRIM, 0);
+		add_agg_sample(sample_val(je->rate[DDIR_READ]), DDIR_READ, 0);
+		add_agg_sample(sample_val(je->rate[DDIR_WRITE]), DDIR_WRITE, 0);
+		add_agg_sample(sample_val(je->rate[DDIR_TRIM]), DDIR_TRIM, 0);
 	}
 
 	disp_time = mtime_since(&disp_prev_time, &now);
@@ -454,7 +493,7 @@
 	 * Allow a little slack, the target is to print it every 1000 msecs
 	 */
 	if (!force && disp_time < 900)
-		return 0;
+		return false;
 
 	calc_rate(unified_rw_rep, disp_time, io_bytes, disp_io_bytes, je->rate);
 	calc_iops(unified_rw_rep, disp_time, io_iops, disp_io_iops, je->iops);
@@ -462,12 +501,12 @@
 	memcpy(&disp_prev_time, &now, sizeof(now));
 
 	if (!force && !je->nr_running && !je->nr_pending)
-		return 0;
+		return false;
 
 	je->nr_threads = thread_number;
 	update_condensed_str(__run_str, run_str);
 	memcpy(je->run_str, run_str, strlen(run_str));
-	return 1;
+	return true;
 }
 
 void display_thread_status(struct jobs_eta *je)
@@ -491,19 +530,28 @@
 	}
 
 	p += sprintf(p, "Jobs: %d (f=%d)", je->nr_running, je->files_open);
-	if (je->m_rate[0] || je->m_rate[1] || je->t_rate[0] || je->t_rate[1]) {
+
+	/* rate limits, if any */
+	if (je->m_rate[0] || je->m_rate[1] || je->m_rate[2] ||
+	    je->t_rate[0] || je->t_rate[1] || je->t_rate[2]) {
 		char *tr, *mr;
 
-		mr = num2str(je->m_rate[0] + je->m_rate[1], 4, 0, je->is_pow2, 8);
-		tr = num2str(je->t_rate[0] + je->t_rate[1], 4, 0, je->is_pow2, 8);
-		p += sprintf(p, ", CR=%s/%s KB/s", tr, mr);
+		mr = num2str(je->m_rate[0] + je->m_rate[1] + je->m_rate[2],
+				4, 0, je->is_pow2, N2S_BYTEPERSEC);
+		tr = num2str(je->t_rate[0] + je->t_rate[1] + je->t_rate[2],
+				4, 0, je->is_pow2, N2S_BYTEPERSEC);
+
+		p += sprintf(p, ", %s-%s", mr, tr);
 		free(tr);
 		free(mr);
-	} else if (je->m_iops[0] || je->m_iops[1] || je->t_iops[0] || je->t_iops[1]) {
-		p += sprintf(p, ", CR=%d/%d IOPS",
-					je->t_iops[0] + je->t_iops[1],
-					je->m_iops[0] + je->m_iops[1]);
+	} else if (je->m_iops[0] || je->m_iops[1] || je->m_iops[2] ||
+		   je->t_iops[0] || je->t_iops[1] || je->t_iops[2]) {
+		p += sprintf(p, ", %d-%d IOPS",
+					je->m_iops[0] + je->m_iops[1] + je->m_iops[2],
+					je->t_iops[0] + je->t_iops[1] + je->t_iops[2]);
 	}
+
+	/* current run string, % done, bandwidth, iops, eta */
 	if (je->eta_sec != INT_MAX && je->nr_running) {
 		char perc_str[32];
 		char *iops_str[DDIR_RWDIR_CNT];
@@ -512,8 +560,9 @@
 		int l;
 		int ddir;
 
-		if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running)
-			strcpy(perc_str, "-.-% done");
+		if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running ||
+		    je->eta_sec == -1)
+			strcpy(perc_str, "-.-%");
 		else {
 			double mult = 100.0;
 
@@ -522,28 +571,37 @@
 
 			eta_good = 1;
 			perc *= mult;
-			sprintf(perc_str, "%3.1f%% done", perc);
+			sprintf(perc_str, "%3.1f%%", perc);
 		}
 
-		for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
-			rate_str[ddir] = num2str(je->rate[ddir], 5,
+		for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
+			rate_str[ddir] = num2str(je->rate[ddir], 4,
 						1024, je->is_pow2, je->unit_base);
-			iops_str[ddir] = num2str(je->iops[ddir], 4, 1, 0, 0);
+			iops_str[ddir] = num2str(je->iops[ddir], 4, 1, 0, N2S_NONE);
 		}
 
 		left = sizeof(output) - (p - output) - 1;
 
-		l = snprintf(p, left, ": [%s] [%s] [%s/%s/%s /s] [%s/%s/%s iops] [eta %s]",
+		if (je->rate[DDIR_TRIM] || je->iops[DDIR_TRIM])
+			l = snprintf(p, left,
+				": [%s][%s][r=%s,w=%s,t=%s][r=%s,w=%s,t=%s IOPS][eta %s]",
 				je->run_str, perc_str, rate_str[DDIR_READ],
 				rate_str[DDIR_WRITE], rate_str[DDIR_TRIM],
 				iops_str[DDIR_READ], iops_str[DDIR_WRITE],
 				iops_str[DDIR_TRIM], eta_str);
+		else
+			l = snprintf(p, left,
+				": [%s][%s][r=%s,w=%s][r=%s,w=%s IOPS][eta %s]",
+				je->run_str, perc_str,
+				rate_str[DDIR_READ], rate_str[DDIR_WRITE],
+				iops_str[DDIR_READ], iops_str[DDIR_WRITE],
+				eta_str);
 		p += l;
 		if (l >= 0 && l < linelen_last)
 			p += sprintf(p, "%*s", linelen_last - l, "");
 		linelen_last = l;
 
-		for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
+		for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
 			free(rate_str[ddir]);
 			free(iops_str[ddir]);
 		}
@@ -563,14 +621,14 @@
 	fflush(stdout);
 }
 
-struct jobs_eta *get_jobs_eta(int force, size_t *size)
+struct jobs_eta *get_jobs_eta(bool force, size_t *size)
 {
 	struct jobs_eta *je;
 
 	if (!thread_number)
 		return NULL;
 
-	*size = sizeof(*je) + THREAD_RUNSTR_SZ;
+	*size = sizeof(*je) + THREAD_RUNSTR_SZ + 8;
 	je = malloc(*size);
 	if (!je)
 		return NULL;
@@ -590,7 +648,7 @@
 	struct jobs_eta *je;
 	size_t size;
 
-	je = get_jobs_eta(0, &size);
+	je = get_jobs_eta(false, &size);
 	if (je)
 		display_thread_status(je);
 
diff --git a/examples/backwards-read.fio b/examples/backwards-read.fio
new file mode 100644
index 0000000..0fe35a2
--- /dev/null
+++ b/examples/backwards-read.fio
@@ -0,0 +1,8 @@
+# Demonstrates how to read backwards in a file.
+
+[backwards-read]
+bs=4k
+# seek -8k back for every IO
+rw=read:-8k
+filename=128m
+size=128m
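+# With bs=4k and an 8k backward seek after each IO, every read starts 4k
+# before the previous one, so the file is effectively read back to front.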
diff --git a/examples/basic-verify.fio b/examples/basic-verify.fio
new file mode 100644
index 0000000..7871aeb
--- /dev/null
+++ b/examples/basic-verify.fio
@@ -0,0 +1,12 @@
+# The most basic form of data verification. Write the device randomly
+# in 4K chunks, then read it back and verify the contents.
+[write-and-verify]
+rw=randwrite
+bs=4k
+direct=1
+ioengine=libaio
+iodepth=16
+verify=crc32c
+# Use /dev/XXX. For running this on a file instead, remove the filename
+# option and add a size=32G (or whatever file size you want) instead.
+filename=/dev/XXX
diff --git a/examples/dev-dax.fio b/examples/dev-dax.fio
new file mode 100644
index 0000000..d9f430e
--- /dev/null
+++ b/examples/dev-dax.fio
@@ -0,0 +1,45 @@
+[global]
+bs=2m
+ioengine=dev-dax
+norandommap
+time_based=1
+runtime=30
+group_reporting
+disable_lat=1
+disable_slat=1
+disable_clat=1
+clat_percentiles=0
+cpus_allowed_policy=split
+
+# For the dev-dax engine:
+#
+#   IOs always complete immediately
+#   IOs are always direct
+#
+iodepth=1
+direct=0
+thread=1
+numjobs=16
+#
+# The dev-dax engine does IO to DAX devices, which are special character
+# devices exported by the kernel (e.g. /dev/dax0.0). The device is
+# opened normally and the region is then accessible via mmap. We do
+# not use the O_DIRECT flag because the device is naturally direct
+# access; passing O_DIRECT will result in failure. The engine accesses
+# the underlying NVDIMM directly once the mmap is set up.
+#
+# Check the alignment requirement of your DAX device. Currently the default
+# should be 2M. The block size (bs) should meet the alignment requirement.
+#
+# An example of creating a dev dax device node from pmem:
+# ndctl create-namespace --reconfig=namespace0.0 --mode=dax --force
+#
+filename=/dev/dax0.0
+
+[dev-dax-write]
+rw=randwrite
+stonewall
+
+[dev-dax-read]
+rw=randread
+stonewall
diff --git a/examples/fixed-rate-submission.fio b/examples/fixed-rate-submission.fio
new file mode 100644
index 0000000..076a868
--- /dev/null
+++ b/examples/fixed-rate-submission.fio
@@ -0,0 +1,10 @@
+[fixed-rate-submit]
+size=128m
+rw=read
+ioengine=libaio
+iodepth=32
+direct=1
+# by setting the submit mode to offload, we can guarantee a fixed rate of
+# submission regardless of what the device completion rate is.
+io_submit_mode=offload
+rate_iops=1000
diff --git a/examples/ftruncate.fio b/examples/ftruncate.fio
new file mode 100644
index 0000000..a6ef457
--- /dev/null
+++ b/examples/ftruncate.fio
@@ -0,0 +1,27 @@
+# Example ftruncate engine jobs
+
+[global]
+ioengine=ftruncate
+directory=/scratch
+size=102404k ; 100MiB + 4k
+stonewall
+filename=truncate
+runtime=10s
+time_based
+direct=1
+#
+# The bs option is a stub here: truncation is performed at the current block
+# offset and the blocksize value is ignored.
+bs=4k
+
+# Truncate the file to 4k, then repeatedly grow it back to just over
+# its original size using subsequent truncates.
+[grow-truncate]
+rw=write
+
+# Repeatedly change the file to a random size between 0 and 100MB
+# using truncates.
+[rand-truncate]
+rw=randwrite
+norandommap
+
diff --git a/examples/gpudirect-rdmaio-client.fio b/examples/gpudirect-rdmaio-client.fio
new file mode 100644
index 0000000..1e24624
--- /dev/null
+++ b/examples/gpudirect-rdmaio-client.fio
@@ -0,0 +1,15 @@
+# Example gpudirect rdma client job
+[global]
+ioengine=rdma
+hostname=[hostname]
+port=[port]
+verb=[read/write/send/recv]
+mem=cudamalloc
+gpu_dev_id=0
+bs=1m
+size=100g
+
+[sender]
+rw=write
+iodepth=1
+iodepth_batch_complete=1
diff --git a/examples/gpudirect-rdmaio-server.fio b/examples/gpudirect-rdmaio-server.fio
new file mode 100644
index 0000000..5fc4950
--- /dev/null
+++ b/examples/gpudirect-rdmaio-server.fio
@@ -0,0 +1,12 @@
+# Example rdma server job
+[global]
+ioengine=rdma
+port=[port]
+mem=cudamalloc
+gpu_dev_id=0
+bs=1m
+size=100g
+
+[receiver]
+rw=read
+iodepth=16
diff --git a/examples/jesd219.fio b/examples/jesd219.fio
new file mode 100644
index 0000000..24f16f7
--- /dev/null
+++ b/examples/jesd219.fio
@@ -0,0 +1,20 @@
+# Sample implementation of the JESD219 workload for SSD endurance
+# testing. It uses a specific distribution of block sizes and
+# read/write mix, as well as a specific distribution of where on
+# the device the IO accesses will land. Based on posting from
+# Jeff Furlong <jeff.furlong@hgst.com>
+[JESD219]
+ioengine=libaio
+direct=1
+rw=randrw
+norandommap
+randrepeat=0
+rwmixread=40
+rwmixwrite=60
+iodepth=256
+numjobs=4
+bssplit=512/4:1024/1:1536/1:2048/1:2560/1:3072/1:3584/1:4k/67:8k/10:16k/7:32k/3:64k/3
+blockalign=4k
+random_distribution=zoned:50/5:30/15:20/80
+filename=/dev/nvme0n1
+group_reporting=1
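+
+# The zoned random_distribution above sends 50% of the IO to the first 5%
+# of the device, 30% to the next 15%, and 20% to the remaining 80%. Each
+# bssplit entry is blocksize/weight, e.g. 4k/67 means 67% of IOs use 4k.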
diff --git a/examples/mtd.fio b/examples/mtd.fio
new file mode 100644
index 0000000..ca09735
--- /dev/null
+++ b/examples/mtd.fio
@@ -0,0 +1,21 @@
+[global]
+gtod_reduce=1
+filename=/dev/mtd0
+ioengine=mtd
+ignore_error=,EIO
+blocksize=512,512,16384
+skip_bad=1
+
+[write]
+stonewall
+rw=trim
+
+[write]
+stonewall
+rw=write
+
+[write]
+stonewall
+block_error_percentiles=1
+rw=writetrim
+loops=4
diff --git a/examples/pmemblk.fio b/examples/pmemblk.fio
new file mode 100644
index 0000000..2d5ecfc
--- /dev/null
+++ b/examples/pmemblk.fio
@@ -0,0 +1,71 @@
+[global]
+bs=1m
+ioengine=pmemblk
+norandommap
+time_based=1
+runtime=30
+group_reporting
+disable_lat=1
+disable_slat=1
+disable_clat=1
+clat_percentiles=0
+cpus_allowed_policy=split
+
+# For the pmemblk engine:
+#
+#   IOs always complete immediately
+#   IOs are always direct
+#   Must use threads
+#
+iodepth=1
+direct=1
+thread=1
+numjobs=16
+#
+# Unlink can be used to remove the files when done, but if you are
+# using serial runs with stonewall, and you want the files to be created
+# only once and unlinked only at the very end, then put the unlink=1
+# in the last group.  This is the method demonstrated here.
+#
+# Note that if you have a read-only group and if the files will be
+# newly created, then all of the data will read back as zero and the
+# read will be optimized, yielding performance that is different from
+# that of reading non-zero blocks (or unoptimized zero blocks).
+#
+unlink=0
+#
+# The pmemblk engine does IO to files in a DAX-mounted filesystem.
+# The filesystem should be created on an NVDIMM (e.g /dev/pmem0)
+# and then mounted with the '-o dax' option.  Note that the engine
+# accesses the underlying NVDIMM directly, bypassing the kernel block
+# layer, so the usual filesystem/disk performance monitoring tools such
+# as iostat will not provide useful data.
+#
+# Here we specify a test file on each of two NVDIMMs.  The first
+# number after the file name is the block size in bytes (4096 bytes
+# in this example).  The second number is the size of the file to
+# create in MiB (1 GiB in this example); note that the actual usable
+# space available to fio will be less than this as libpmemblk requires
+# some space for metadata.
+#
+# Currently, the minimum block size is 512 bytes and the minimum file
+# size is about 17 MiB (these are libpmemblk requirements).
+#
+# While both files in this example have the same block size and file
+# size, this is not required.
+#
+filename=/pmem0/fio-test,4096,1024
+filename=/pmem1/fio-test,4096,1024
+
+[pmemblk-write]
+rw=randwrite
+stonewall
+
+[pmemblk-read]
+rw=randread
+stonewall
+#
+# We're done, so unlink the file:
+#
+unlink=1
+
diff --git a/examples/poisson-rate-submission.fio b/examples/poisson-rate-submission.fio
new file mode 100644
index 0000000..4bb28f2
--- /dev/null
+++ b/examples/poisson-rate-submission.fio
@@ -0,0 +1,14 @@
+[poisson-rate-submit]
+size=128m
+rw=randread
+ioengine=libaio
+iodepth=32
+direct=1
+# by setting the submit mode to offload, we can guarantee a fixed rate of
+# submission regardless of what the device completion rate is.
+io_submit_mode=offload
+rate_iops=50
+# Real world random request flow follows Poisson process. To give better
+# insight on latency distribution, we simulate request flow under Poisson
+# process.
+rate_process=poisson
diff --git a/examples/rand-zones.fio b/examples/rand-zones.fio
new file mode 100644
index 0000000..da13fa3
--- /dev/null
+++ b/examples/rand-zones.fio
@@ -0,0 +1,18 @@
+# Sample job file demonstrating how to use zoned random distributions
+# to have skewed random accesses. This example has 50% of the accesses
+# to the first 5% of the file (50/5), 30% to the next 15% (30/15), and
+# finally 20% of the IO will end up in the remaining 80%.
+[zones]
+size=2g
+direct=1
+bs=4k
+rw=randread
+norandommap
+random_distribution=zoned:50/5:30/15:20/80
+
+# The above applies to all of reads/writes/trims. If we wanted to do
+# something differently for writes, let's say 50% for the first 10%
+# and 50% for the remaining 90%, we could do it by adding a new section
+# after a comma.
+
+# random_distribution=zoned:50/5:30/15:20/80,50/10:50/90
diff --git a/examples/rdmaio-client.fio b/examples/rdmaio-client.fio
index 7c660c9..286aa21 100644
--- a/examples/rdmaio-client.fio
+++ b/examples/rdmaio-client.fio
@@ -1,11 +1,13 @@
 # Example rdma client job
 [global]
 ioengine=rdma
-filename=[ip_addr]/[port]/[RDMA_WRITE/RDMA_READ/SEND]
+hostname=[hostname]
+port=[port]
+verb=[read/write/send/recv]
 bs=1m
 size=100g
 
 [sender]
 rw=write
 iodepth=1
-iodepth_batch_complete=1
\ No newline at end of file
+iodepth_batch_complete=1
diff --git a/examples/rdmaio-server.fio b/examples/rdmaio-server.fio
index 9348859..ee30856 100644
--- a/examples/rdmaio-server.fio
+++ b/examples/rdmaio-server.fio
@@ -1,10 +1,10 @@
 # Example rdma server job
 [global]
 ioengine=rdma
-filename=[ip_addr]/[port]
+port=[port]
 bs=1m
 size=100g
 
 [receiver]
 rw=read
-iodepth=16
\ No newline at end of file
+iodepth=16
diff --git a/examples/steadystate.fio b/examples/steadystate.fio
new file mode 100644
index 0000000..26fb808
--- /dev/null
+++ b/examples/steadystate.fio
@@ -0,0 +1,45 @@
+#
+# Example job file for steady state job termination
+# Use --output-format=json for detailed information
+#
+# For Windows, change the file names
+#
+
+[global]
+threads=1
+group_reporting=1
+time_based
+size=128m
+
+[ss-write]
+filename=/dev/null
+rw=write
+bs=128k
+numjobs=4
+runtime=5m
+ss=iops:10%
+ss_dur=30s
+ss_ramp=10s
+#
+# Begin ss detection 10s after job starts
+# Terminate job when largest deviation from mean IOPS is 10%
+# Use a rolling 30s window for deviations
+#
+
+
+[ss-read]
+new_group
+stonewall
+filename=/dev/zero
+rw=randread
+bs=4k
+numjobs=4
+runtime=5m
+ss=bw_slope:1%
+ss_dur=10s
+ss_ramp=5s
+#
+# Begin ss detection 5s after job starts
+# Terminate job when bandwidth slope is less than 1% of avg bw
+# Use a rolling 10s window for bw measurements
+#
diff --git a/examples/waitfor.fio b/examples/waitfor.fio
new file mode 100644
index 0000000..95fad00
--- /dev/null
+++ b/examples/waitfor.fio
@@ -0,0 +1,35 @@
+[global]
+threads=1
+group_reporting=1
+filename=/tmp/data
+filesize=128m
+
+[writers]
+rw=write
+bs=128k
+numjobs=4
+runtime=10
+
+[readers]
+new_group
+wait_for=writers
+rw=randread
+bs=4k
+numjobs=4
+runtime=10
+
+[writers2]
+new_group
+wait_for=readers
+rw=randwrite
+bs=4k
+numjobs=4
+runtime=10
+
+[readers2]
+new_group
+wait_for=writers2
+rw=randread
+bs=4k
+numjobs=4
+runtime=10
diff --git a/file.h b/file.h
index f7a1eae..9801bb5 100644
--- a/file.h
+++ b/file.h
@@ -8,13 +8,14 @@
 #include "lib/zipf.h"
 #include "lib/axmap.h"
 #include "lib/lfsr.h"
+#include "lib/gauss.h"
 
 /*
  * The type of object we are working on
  */
 enum fio_filetype {
 	FIO_TYPE_FILE = 1,		/* plain file */
-	FIO_TYPE_BD,			/* block device */
+	FIO_TYPE_BLOCK,			/* block device */
 	FIO_TYPE_CHAR,			/* character device */
 	FIO_TYPE_PIPE,			/* pipe */
 };
@@ -38,13 +39,20 @@
 };
 
 /*
- * roundrobin available files, or choose one at random, or do each one
- * serially.
+ * How fio chooses which file to service next: uniformly random, one of the
+ * skewed random variants, sequential, or round-robin.
  */
 enum {
-	FIO_FSERVICE_RANDOM	= 1,
-	FIO_FSERVICE_RR		= 2,
-	FIO_FSERVICE_SEQ	= 3,
+	FIO_FSERVICE_RANDOM		= 1,
+	FIO_FSERVICE_RR			= 2,
+	FIO_FSERVICE_SEQ		= 3,
+	__FIO_FSERVICE_NONUNIFORM	= 0x100,
+	FIO_FSERVICE_ZIPF		= __FIO_FSERVICE_NONUNIFORM | 4,
+	FIO_FSERVICE_PARETO		= __FIO_FSERVICE_NONUNIFORM | 5,
+	FIO_FSERVICE_GAUSS		= __FIO_FSERVICE_NONUNIFORM | 6,
+
+	FIO_FSERVICE_SHIFT		= 10,
 };
 
 /*
@@ -75,12 +83,14 @@
 	/*
 	 * filename and possible memory mapping
 	 */
-	char *file_name;
 	unsigned int major, minor;
 	int fileno;
+	int bs;
+	char *file_name;
 
 	/*
 	 * size of the file, offset into file, and io size from that offset
+	 * (be aware io_size is different from thread_options::io_size)
 	 */
 	uint64_t real_file_size;
 	uint64_t file_offset;
@@ -96,9 +106,19 @@
 	uint64_t last_write;
 
 	/*
-	 * For use by the io engine
+	 * Tracks the last iodepth number of completed writes, if data
+	 * verification is enabled
 	 */
-	uint64_t engine_data;
+	uint64_t *last_write_comp;
+	unsigned int last_write_idx;
+
+	/*
+	 * For use by the io engine for offset or private data storage
+	 */
+	union {
+		uint64_t engine_pos;
+		void *engine_data;
+	};
 
 	/*
 	 * if io is protected by a semaphore, this is set
@@ -119,7 +139,10 @@
 	/*
 	 * Used for zipf random distribution
 	 */
-	struct zipf_state zipf;
+	union {
+		struct zipf_state zipf;
+		struct gauss_state gauss;
+	};
 
 	int references;
 	enum fio_file_flags flags;
@@ -127,14 +150,8 @@
 	struct disk_util *du;
 };
 
-#define FILE_ENG_DATA(f)	((void *) (uintptr_t) (f)->engine_data)
-#define FILE_SET_ENG_DATA(f, data)	\
-	((f)->engine_data = (uintptr_t) (data))
-
-struct file_name {
-	struct flist_head list;
-	char *filename;
-};
+#define FILE_ENG_DATA(f)		((f)->engine_data)
+#define FILE_SET_ENG_DATA(f, data)	((f)->engine_data = (data))
 
 #define FILE_FLAG_FNS(name)						\
 static inline void fio_file_set_##name(struct fio_file *f)		\
@@ -175,6 +192,7 @@
 extern int __must_check generic_get_file_size(struct thread_data *, struct fio_file *);
 extern int __must_check file_lookup_open(struct fio_file *f, int flags);
 extern int __must_check pre_read_files(struct thread_data *);
+extern unsigned long long get_rand_file_size(struct thread_data *td);
 extern int add_file(struct thread_data *, const char *, int, int);
 extern int add_file_exclusive(struct thread_data *, const char *);
 extern void get_file(struct fio_file *);
@@ -189,7 +207,8 @@
 extern int get_fileno(struct thread_data *, const char *);
 extern void free_release_files(struct thread_data *);
 extern void filesetup_mem_free(void);
-void fio_file_reset(struct thread_data *, struct fio_file *);
-int fio_files_done(struct thread_data *);
+extern void fio_file_reset(struct thread_data *, struct fio_file *);
+extern bool fio_files_done(struct thread_data *);
+extern bool exists_and_not_regfile(const char *);
 
 #endif
diff --git a/filehash.c b/filehash.c
index 0d61f54..edeeab4 100644
--- a/filehash.c
+++ b/filehash.c
@@ -5,14 +5,19 @@
 #include "flist.h"
 #include "hash.h"
 #include "filehash.h"
+#include "smalloc.h"
+#include "lib/bloom.h"
 
 #define HASH_BUCKETS	512
 #define HASH_MASK	(HASH_BUCKETS - 1)
 
-unsigned int file_hash_size = HASH_BUCKETS * sizeof(struct flist_head);
+#define BLOOM_SIZE	16*1024*1024
+
+static unsigned int file_hash_size = HASH_BUCKETS * sizeof(struct flist_head);
 
 static struct flist_head *file_hash;
 static struct fio_mutex *hash_lock;
+static struct bloom *file_bloom;
 
 static unsigned short hash(const char *name)
 {
@@ -95,6 +100,11 @@
 	return alias;
 }
 
+bool file_bloom_exists(const char *fname, bool set)
+{
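+	/*
+	 * Query the shared bloom filter for fname, optionally recording it
+	 * when 'set' is true. A false return means the name has definitely
+	 * not been seen; true means it very likely has (false positives
+	 * are possible with a bloom filter).
+	 */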
+	return bloom_string(file_bloom, fname, strlen(fname), set);
+}
+
 void file_hash_exit(void)
 {
 	unsigned int i, has_entries = 0;
@@ -107,18 +117,23 @@
 	if (has_entries)
 		log_err("fio: file hash not empty on exit\n");
 
+	sfree(file_hash);
 	file_hash = NULL;
 	fio_mutex_remove(hash_lock);
 	hash_lock = NULL;
+	bloom_free(file_bloom);
+	file_bloom = NULL;
 }
 
-void file_hash_init(void *ptr)
+void file_hash_init(void)
 {
 	unsigned int i;
 
-	file_hash = ptr;
+	file_hash = smalloc(file_hash_size);
+
 	for (i = 0; i < HASH_BUCKETS; i++)
 		INIT_FLIST_HEAD(&file_hash[i]);
 
 	hash_lock = fio_mutex_init(FIO_MUTEX_UNLOCKED);
+	file_bloom = bloom_new(BLOOM_SIZE);
 }
diff --git a/filehash.h b/filehash.h
index f316b20..5fecc3b 100644
--- a/filehash.h
+++ b/filehash.h
@@ -1,14 +1,15 @@
 #ifndef FIO_FILE_HASH_H
 #define FIO_FILE_HASH_H
 
-extern unsigned int file_hash_size;
+#include "lib/types.h"
 
-extern void file_hash_init(void *);
+extern void file_hash_init(void);
 extern void file_hash_exit(void);
 extern struct fio_file *lookup_file_hash(const char *);
 extern struct fio_file *add_file_hash(struct fio_file *);
 extern void remove_file_hash(struct fio_file *);
 extern void fio_file_hash_lock(void);
 extern void fio_file_hash_unlock(void);
+extern bool file_bloom_exists(const char *, bool);
 
 #endif
diff --git a/filelock.c b/filelock.c
index b113007..6e84970 100644
--- a/filelock.c
+++ b/filelock.c
@@ -165,7 +165,7 @@
 	return ff;
 }
 
-static int __fio_lock_file(const char *fname, int trylock)
+static bool __fio_lock_file(const char *fname, int trylock)
 {
 	struct fio_filelock *ff;
 	uint32_t hash;
@@ -180,16 +180,16 @@
 
 	if (!ff) {
 		assert(!trylock);
-		return 1;
+		return true;
 	}
 
 	if (!trylock) {
 		fio_mutex_down(&ff->lock);
-		return 0;
+		return false;
 	}
 
 	if (!fio_mutex_down_trylock(&ff->lock))
-		return 0;
+		return false;
 
 	fio_mutex_down(&fld->lock);
 
@@ -206,13 +206,13 @@
 
 	if (ff) {
 		fio_mutex_down(&ff->lock);
-		return 0;
+		return false;
 	}
 
-	return 1;
+	return true;
 }
 
-int fio_trylock_file(const char *fname)
+bool fio_trylock_file(const char *fname)
 {
 	return __fio_lock_file(fname, 1);
 }
diff --git a/filelock.h b/filelock.h
index 97d13b7..4551bb0 100644
--- a/filelock.h
+++ b/filelock.h
@@ -1,8 +1,10 @@
 #ifndef FIO_LOCK_FILE_H
 #define FIO_LOCK_FILE_H
 
+#include "lib/types.h"
+
 extern void fio_lock_file(const char *);
-extern int fio_trylock_file(const char *);
+extern bool fio_trylock_file(const char *);
 extern void fio_unlock_file(const char *);
 
 extern int fio_filelock_init(void);
diff --git a/filesetup.c b/filesetup.c
index 0fb5589..612e794 100644
--- a/filesetup.c
+++ b/filesetup.c
@@ -24,6 +24,14 @@
 
 static FLIST_HEAD(filename_list);
 
+/*
+ * List entry for filename_list
+ */
+struct file_name {
+	struct flist_head list;
+	char *filename;
+};
+
 static inline void clear_error(struct thread_data *td)
 {
 	td->error = 0;
@@ -52,20 +60,26 @@
 	 */
 	if (td_read(td) ||
 	   (td_write(td) && td->o.overwrite && !td->o.file_append) ||
-	    (td_write(td) && td->io_ops->flags & FIO_NOEXTEND))
+	    (td_write(td) && td_ioengine_flagged(td, FIO_NOEXTEND)))
 		new_layout = 1;
 	if (td_write(td) && !td->o.overwrite && !td->o.file_append)
 		unlink_file = 1;
 
 	if (unlink_file || new_layout) {
+		int ret;
+
 		dprint(FD_FILE, "layout unlink %s\n", f->file_name);
-		if ((td_io_unlink_file(td, f) < 0) && (errno != ENOENT)) {
+
+		ret = td_io_unlink_file(td, f);
+		if (ret != 0 && ret != ENOENT) {
 			td_verror(td, errno, "unlink");
 			return 1;
 		}
 	}
 
-	flags = O_WRONLY | O_CREAT;
+	flags = O_WRONLY;
+	if (td->o.allow_create)
+		flags |= O_CREAT;
 	if (new_layout)
 		flags |= O_TRUNC;
 
@@ -76,7 +90,13 @@
 	dprint(FD_FILE, "open file %s, flags %x\n", f->file_name, flags);
 	f->fd = open(f->file_name, flags, 0644);
 	if (f->fd < 0) {
-		td_verror(td, errno, "open");
+		int err = errno;
+
+		if (err == ENOENT && !td->o.allow_create)
+			log_err("fio: file creation disallowed by "
+					"allow_file_create=0\n");
+		else
+			td_verror(td, err, "open");
 		return 1;
 	}
 
@@ -118,6 +138,9 @@
 	}
 #endif /* CONFIG_POSIX_FALLOCATE */
 
+	/*
+	 * If our jobs don't require regular files initially, we're done.
+	 */
 	if (!new_layout)
 		goto done;
 
@@ -136,11 +159,18 @@
 		}
 	}
 
-	b = malloc(td->o.max_bs[DDIR_WRITE]);
-
 	left = f->real_file_size;
+	bs = td->o.max_bs[DDIR_WRITE];
+	if (bs > left)
+		bs = left;
+
+	b = malloc(bs);
+	if (!b) {
+		td_verror(td, errno, "malloc");
+		goto err;
+	}
+
 	while (left && !td->terminate) {
-		bs = td->o.max_bs[DDIR_WRITE];
 		if (bs > left)
 			bs = left;
 
@@ -205,7 +235,11 @@
 	unsigned int bs;
 	char *b;
 
-	if (td->io_ops->flags & FIO_PIPEIO)
+	if (td_ioengine_flagged(td, FIO_PIPEIO) ||
+	    td_ioengine_flagged(td, FIO_NOIO))
+		return 0;
+
+	if (f->filetype == FIO_TYPE_CHAR)
 		return 0;
 
 	if (!fio_file_open(f)) {
@@ -218,8 +252,17 @@
 
 	old_runstate = td_bump_runstate(td, TD_PRE_READING);
 
+	left = f->io_size;
 	bs = td->o.max_bs[DDIR_READ];
+	if (bs > left)
+		bs = left;
+
 	b = malloc(bs);
+	if (!b) {
+		td_verror(td, errno, "malloc");
+		ret = 1;
+		goto error;
+	}
 	memset(b, 0, bs);
 
 	if (lseek(f->fd, f->file_offset, SEEK_SET) < 0) {
@@ -229,8 +272,6 @@
 		goto error;
 	}
 
-	left = f->io_size;
-
 	while (left && !td->terminate) {
 		if (bs > left)
 			bs = left;
@@ -256,14 +297,16 @@
 	return ret;
 }
 
-static unsigned long long get_rand_file_size(struct thread_data *td)
+unsigned long long get_rand_file_size(struct thread_data *td)
 {
 	unsigned long long ret, sized;
+	uint64_t frand_max;
 	unsigned long r;
 
+	frand_max = rand_max(&td->file_size_state);
 	r = __rand(&td->file_size_state);
 	sized = td->o.file_size_high - td->o.file_size_low;
-	ret = (unsigned long long) ((double) sized * (r / (FRAND_MAX + 1.0)));
+	ret = (unsigned long long) ((double) sized * (r / (frand_max + 1.0)));
 	ret += td->o.file_size_low;
 	ret -= (ret % td->o.rw_min_bs);
 	return ret;
@@ -319,7 +362,7 @@
 	int r;
 
 	if (td->io_ops->open_file(td, f)) {
-		log_err("fio: failed opening blockdev %s for size check\n",
+		log_err("fio: failed opening chardev %s for size check\n",
 			f->file_name);
 		return 1;
 	}
@@ -356,16 +399,38 @@
 
 	if (f->filetype == FIO_TYPE_FILE)
 		ret = file_size(td, f);
-	else if (f->filetype == FIO_TYPE_BD)
+	else if (f->filetype == FIO_TYPE_BLOCK)
 		ret = bdev_size(td, f);
 	else if (f->filetype == FIO_TYPE_CHAR)
 		ret = char_size(td, f);
 	else
-		f->real_file_size = -1;
+		f->real_file_size = -1ULL;
 
+	/*
+	 * On failure, leave ->real_file_size at 0: for regular files that
+	 * may simply mean the initial layout hasn't been done yet.
+	 */
 	if (ret)
 		return ret;
 
+	/*
+	 * If ->real_file_size is -1, a conditional for the message
+	 * "offset extends end" is always true, but it makes no sense,
+	 * so just return the same value here.
+	 */
+	if (f->real_file_size == -1ULL) {
+		log_info("%s: failed to get file size of %s\n", td->o.name,
+					f->file_name);
+		return 1;
+	}
+
+	if (td->o.start_offset && f->file_offset == 0)
+		dprint(FD_FILE, "offset of file %s not initialized yet\n",
+					f->file_name);
+	/*
+	 * ->file_offset normally hasn't been initialized yet, so this
+	 * is basically always false.
+	 */
 	if (f->file_offset > f->real_file_size) {
 		log_err("%s: offset extends end (%llu > %llu)\n", td->o.name,
 					(unsigned long long) f->file_offset,
@@ -381,7 +446,7 @@
 				   unsigned long long off,
 				   unsigned long long len)
 {
-	int ret = 0;
+	int errval = 0, ret = 0;
 
 #ifdef CONFIG_ESX
 	return 0;
@@ -395,15 +460,33 @@
 	if (len == -1ULL || off == -1ULL)
 		return 0;
 
-	dprint(FD_IO, "invalidate cache %s: %llu/%llu\n", f->file_name, off,
-								len);
-
-	if (td->io_ops->invalidate)
+	if (td->io_ops->invalidate) {
+		dprint(FD_IO, "invalidate %s cache %s\n", td->io_ops->name,
+			f->file_name);
 		ret = td->io_ops->invalidate(td, f);
-	else if (f->filetype == FIO_TYPE_FILE)
+		if (ret < 0)
+			errval = -ret;
+	} else if (f->filetype == FIO_TYPE_FILE) {
+		dprint(FD_IO, "declare unneeded cache %s: %llu/%llu\n",
+			f->file_name, off, len);
 		ret = posix_fadvise(f->fd, off, len, POSIX_FADV_DONTNEED);
-	else if (f->filetype == FIO_TYPE_BD) {
+		if (ret)
+			errval = ret;
+	} else if (f->filetype == FIO_TYPE_BLOCK) {
+		int retry_count = 0;
+
+		dprint(FD_IO, "drop page cache %s\n", f->file_name);
 		ret = blockdev_invalidate_cache(f);
+		while (ret < 0 && errno == EAGAIN && retry_count++ < 25) {
+			/*
+			 * Linux multipath devices reject ioctl while
+			 * the maps are being updated. That window can
+			 * last tens of milliseconds; we'll try up to
+			 * a quarter of a second.
+			 */
+			usleep(10000);
+			ret = blockdev_invalidate_cache(f);
+		}
 		if (ret < 0 && errno == EACCES && geteuid()) {
 			if (!root_warn) {
 				log_err("fio: only root may flush block "
@@ -412,8 +495,15 @@
 			}
 			ret = 0;
 		}
-	} else if (f->filetype == FIO_TYPE_CHAR || f->filetype == FIO_TYPE_PIPE)
+		if (ret < 0)
+			errval = errno;
+		else if (ret) /* probably not supported */
+			errval = ret;
+	} else if (f->filetype == FIO_TYPE_CHAR ||
+		   f->filetype == FIO_TYPE_PIPE) {
+		dprint(FD_IO, "invalidate not supported %s\n", f->file_name);
 		ret = 0;
+	}
 
 	/*
 	 * Cache flushing isn't a fatal condition, and we know it will
@@ -421,10 +511,9 @@
 	 * function to flush eg block device caches. So just warn and
 	 * continue on our way.
 	 */
-	if (ret) {
-		log_info("fio: cache invalidation of %s failed: %s\n", f->file_name, strerror(errno));
-		ret = 0;
-	}
+	if (errval)
+		log_info("fio: cache invalidation of %s failed: %s\n",
+			 f->file_name, strerror(errval));
 
 	return 0;
 
@@ -456,7 +545,7 @@
 		f->shadow_fd = -1;
 	}
 
-	f->engine_data = 0;
+	f->engine_pos = 0;
 	return ret;
 }
 
@@ -468,9 +557,6 @@
 	__f = lookup_file_hash(f->file_name);
 	if (__f) {
 		dprint(FD_FILE, "found file in hash %s\n", f->file_name);
-		/*
-		 * racy, need the __f->lock locked
-		 */
 		f->lock = __f->lock;
 		from_hash = 1;
 	} else {
@@ -512,11 +598,6 @@
 
 	dprint(FD_FILE, "fd open %s\n", f->file_name);
 
-	if (td_trim(td) && f->filetype != FIO_TYPE_BD) {
-		log_err("fio: trim only applies to block device\n");
-		return 1;
-	}
-
 	if (!strcmp(f->file_name, "-")) {
 		if (td_rw(td)) {
 			log_err("fio: can't read/write to stdin/out\n");
@@ -544,7 +625,7 @@
 	}
 	if (td->o.sync_io)
 		flags |= O_SYNC;
-	if (td->o.create_on_open)
+	if (td->o.create_on_open && td->o.allow_create)
 		flags |= O_CREAT;
 skip_flags:
 	if (f->filetype != FIO_TYPE_FILE)
@@ -555,7 +636,7 @@
 		if (!read_only)
 			flags |= O_RDWR;
 
-		if (f->filetype == FIO_TYPE_FILE)
+		if (f->filetype == FIO_TYPE_FILE && td->o.allow_create)
 			flags |= O_CREAT;
 
 		if (is_std)
@@ -572,7 +653,8 @@
 			f->fd = dup(STDIN_FILENO);
 		else
 			from_hash = file_lookup_open(f, flags);
-	} else { //td trim
+	} else if (td_trim(td)) {
+		assert(!td_rw(td)); /* should have matched above */
 		flags |= O_RDWR;
 		from_hash = file_lookup_open(f, flags);
 	}
@@ -627,6 +709,10 @@
 	return 0;
 }
 
+/*
+ * This function, which simply calls get_file_size(), is the default
+ * .get_file_size implementation for the majority of I/O engines.
+ */
 int generic_get_file_size(struct thread_data *td, struct fio_file *f)
 {
 	return get_file_size(td, f);
@@ -642,7 +728,7 @@
 	int err = 0;
 
 	for_each_file(td, f, i) {
-		dprint(FD_FILE, "get file size for %p/%d/%p\n", f, i,
+		dprint(FD_FILE, "get file size for %p/%d/%s\n", f, i,
 								f->file_name);
 
 		if (td_io_get_file_size(td, f)) {
@@ -654,6 +740,13 @@
 			clear_error(td);
 		}
 
+		/*
+		 * There are corner cases where we end up with -1 for
+		 * ->real_file_size due to unsupported file type, etc.
+		 * We then just set it to the size option value divided by the
+		 * number of files, similar to the way file ->io_size is set.
+		 * stat(2) failure doesn't set ->real_file_size to -1.
+		 */
 		if (f->real_file_size == -1ULL && td->o.size)
 			f->real_file_size = td->o.size / td->o.nr_files;
 	}
@@ -684,7 +777,7 @@
 		struct stat sb;
 		char buf[256];
 
-		if (f->filetype == FIO_TYPE_BD || f->filetype == FIO_TYPE_CHAR) {
+		if (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_CHAR) {
 			if (f->real_file_size != -1ULL)
 				ret += f->real_file_size;
 			continue;
@@ -727,7 +820,7 @@
 		fm = flist_entry(n, struct fio_mount, list);
 		flist_del(&fm->list);
 
-		sz = get_fs_size(fm->base);
+		sz = get_fs_free_size(fm->base);
 		if (sz && sz != -1ULL)
 			ret += sz;
 
@@ -770,7 +863,9 @@
 		goto done;
 
 	/*
-	 * if ioengine defines a setup() method, it's responsible for
+	 * Find out physical size of files or devices for this thread,
+	 * before we determine I/O size and range of our targets.
+	 * If ioengine defines a setup() method, it's responsible for
 	 * opening the files and setting f->real_file_size to indicate
 	 * the valid range for that file.
 	 */
@@ -788,6 +883,7 @@
 	 */
 	total_size = 0;
 	for_each_file(td, f, i) {
+		f->fileno = i;
 		if (f->real_file_size == -1ULL)
 			total_size = -1ULL;
 		else
@@ -801,7 +897,7 @@
 	 * device/file sizes are zero and no size given, punt
 	 */
 	if ((!total_size || total_size == -1ULL) && !o->size &&
-	    !(td->io_ops->flags & FIO_NOIO) && !o->fill_device &&
+	    !td_ioengine_flagged(td, FIO_NOIO) && !o->fill_device &&
 	    !(o->nr_files && (o->file_size_low || o->file_size_high))) {
 		log_err("%s: you need to specify size=\n", o->name);
 		td_verror(td, EINVAL, "total_file_size");
@@ -810,7 +906,7 @@
 
 	/*
 	 * Calculate per-file size and potential extra size for the
-	 * first files, if needed.
+	 * first files, if needed (i.e. if we don't have a fixed size).
 	 */
 	if (!o->file_size_low && o->nr_files) {
 		uint64_t all_fs;
@@ -832,11 +928,18 @@
 	for_each_file(td, f, i) {
 		f->file_offset = get_start_offset(td, f);
 
+		/*
+		 * Update ->io_size depending on the options specified.
+		 * ->file_size_low being 0 means the filesize option isn't set.
+		 * A non-zero ->file_size_low equal to ->file_size_high means
+		 * the filesize option was given as a fixed size, while a
+		 * non-zero ->file_size_low different from ->file_size_high
+		 * means it was given as a range.
+		 */
 		if (!o->file_size_low) {
 			/*
-			 * no file size range given, file size is equal to
-			 * total size divided by number of files. If that is
-			 * zero, set it to the real file size. If the size
+			 * no file size or range given, file size is equal to
+			 * total size divided by number of files. If the size
 			 * doesn't divide nicely with the min blocksize,
 			 * make the first files bigger.
 			 */
@@ -846,8 +949,22 @@
 				f->io_size += bs;
 			}
 
-			if (!f->io_size)
+			/*
+			 * We normally don't come here for regular files, but
+			 * if the result is 0 for a regular file, set it to the
+			 * real file size: that is the size of an existing
+			 * file, or 0 otherwise. A new file won't be created
+			 * because
+			 * ->io_size + ->file_offset equals ->real_file_size.
+			 */
+			if (!f->io_size) {
+				if (f->file_offset > f->real_file_size)
+					goto err_offset;
 				f->io_size = f->real_file_size - f->file_offset;
+				if (!f->io_size)
+					log_info("fio: file %s may be ignored\n",
+						f->file_name);
+			}
 		} else if (f->real_file_size < o->file_size_low ||
 			   f->real_file_size > o->file_size_high) {
 			if (f->file_offset > o->file_size_low)
@@ -868,23 +985,46 @@
 		if (f->io_size == -1ULL)
 			total_size = -1ULL;
 		else {
-                        if (o->size_percent)
-                                f->io_size = (f->io_size * o->size_percent) / 100;
+                        if (o->size_percent) {
+				f->io_size = (f->io_size * o->size_percent) / 100;
+				f->io_size -= (f->io_size % td_min_bs(td));
+			}
 			total_size += f->io_size;
 		}
 
 		if (f->filetype == FIO_TYPE_FILE &&
 		    (f->io_size + f->file_offset) > f->real_file_size &&
-		    !(td->io_ops->flags & FIO_DISKLESSIO)) {
+		    !td_ioengine_flagged(td, FIO_DISKLESSIO)) {
 			if (!o->create_on_open) {
 				need_extend++;
 				extend_size += (f->io_size + f->file_offset);
+				fio_file_set_extend(f);
 			} else
 				f->real_file_size = f->io_size + f->file_offset;
-			fio_file_set_extend(f);
 		}
 	}
 
+	if (td->o.block_error_hist) {
+		int len;
+
+		assert(td->o.nr_files == 1);	/* checked in fixup_options */
+		f = td->files[0];
+		len = f->io_size / td->o.bs[DDIR_TRIM];
+		if (len > MAX_NR_BLOCK_INFOS || len <= 0) {
+			log_err("fio: cannot calculate block histogram with "
+				"%d trim blocks, maximum %d\n",
+				len, MAX_NR_BLOCK_INFOS);
+			td_verror(td, EINVAL, "block_error_hist");
+			goto err_out;
+		}
+
+		td->ts.nr_block_infos = len;
+		for (i = 0; i < len; i++)
+			td->ts.block_infos[i] =
+				BLOCK_INFO(0, BLOCK_STATE_UNINIT);
+	} else
+		td->ts.nr_block_infos = 0;
+
 	if (!o->size || (total_size && o->size > total_size))
 		o->size = total_size;
 
@@ -894,14 +1034,21 @@
 	}
 
 	/*
-	 * See if we need to extend some files
+	 * See if we need to extend some files, typically needed when our
+	 * target regular files don't exist yet, but our jobs require them
+	 * to exist up front (e.g. for read I/O).
 	 */
 	if (need_extend) {
 		temp_stall_ts = 1;
-		if (output_format == FIO_OUTPUT_NORMAL)
-			log_info("%s: Laying out IO file(s) (%u file(s) /"
-				 " %lluMB)\n", o->name, need_extend,
-					extend_size >> 20);
+		if (output_format & FIO_OUTPUT_NORMAL) {
+			log_info("%s: Laying out IO file%s (%u file%s / %s%lluMiB)\n",
+				 o->name,
+				 need_extend > 1 ? "s" : "",
+				 need_extend,
+				 need_extend > 1 ? "s" : "",
+				 need_extend > 1 ? "total " : "",
+				 extend_size >> 20);
+		}
 
 		for_each_file(td, f, i) {
 			unsigned long long old_len = -1ULL, extend_len = -1ULL;
@@ -948,8 +1095,8 @@
 	 * stored entries.
 	 */
 	if (!o->read_iolog_file) {
-		if (o->io_limit)
-			td->total_io_size = o->io_limit * o->loops;
+		if (o->io_size)
+			td->total_io_size = o->io_size * o->loops;
 		else
 			td->total_io_size = o->size * o->loops;
 	}
@@ -975,10 +1122,11 @@
 	dprint(FD_FILE, "pre_read files\n");
 
 	for_each_file(td, f, i) {
-		pre_read_file(td, f);
+		if (pre_read_file(td, f))
+			return -1;
 	}
 
-	return 1;
+	return 0;
 }
 
 static int __init_rand_distribution(struct thread_data *td, struct fio_file *f)
@@ -998,8 +1146,10 @@
 
 	if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
 		zipf_init(&f->zipf, nranges, td->o.zipf_theta.u.f, seed);
-	else
+	else if (td->o.random_distribution == FIO_RAND_DIST_PARETO)
 		pareto_init(&f->zipf, nranges, td->o.pareto_h.u.f, seed);
+	else if (td->o.random_distribution == FIO_RAND_DIST_GAUSS)
+		gauss_init(&f->gauss, nranges, td->o.gauss_dev.u.f, seed);
 
 	return 1;
 }
@@ -1023,6 +1173,43 @@
 	return 1;
 }
 
+/*
+ * Check if the number of blocks exceeds the randomness capability of
+ * the selected generator. Tausworthe is 32-bit, the others are fully
+ * 64-bit capable.
+ */
+static int check_rand_gen_limits(struct thread_data *td, struct fio_file *f,
+				 uint64_t blocks)
+{
+	if (blocks <= FRAND32_MAX)
+		return 0;
+	if (td->o.random_generator != FIO_RAND_GEN_TAUSWORTHE)
+		return 0;
+
+	/*
+	 * If the user hasn't specified a random generator, switch
+	 * to tausworthe64 with an informational warning. If the user did
+	 * specify one, just warn.
+	 */
+	log_info("fio: file %s exceeds 32-bit tausworthe random generator.\n",
+			f->file_name);
+
+	if (!fio_option_is_set(&td->o, random_generator)) {
+		log_info("fio: Switching to tausworthe64. Use the "
+			 "random_generator= option to get rid of this "
+			 "warning.\n");
+		td->o.random_generator = FIO_RAND_GEN_TAUSWORTHE64;
+		return 0;
+	}
+
+	/*
+	 * Just make this informational, to avoid breaking scripts.
+	 */
+	log_info("fio: Use the random_generator= option to switch to lfsr or "
+			 "tausworthe64.\n");
+	return 0;
+}
+
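A rough sense of scale for this limit (back-of-the-envelope arithmetic, not taken from the fio sources): with rw_min_bs=4k, the block count only exceeds a 32-bit generator's range once the file or device spans more than 2^32 * 4 KiB = 16 TiB, so the automatic switch to tausworthe64 is only seen on very large targets.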
 int init_random_map(struct thread_data *td)
 {
 	unsigned long long blocks;
@@ -1039,6 +1226,9 @@
 
 		blocks = fsize / (unsigned long long) td->o.rw_min_bs;
 
+		if (check_rand_gen_limits(td, f, blocks))
+			return 1;
+
 		if (td->o.random_generator == FIO_RAND_GEN_LFSR) {
 			unsigned long seed;
 
@@ -1134,14 +1324,16 @@
 	else
 		f->filetype = FIO_TYPE_FILE;
 
+#ifdef WIN32
 	/* \\.\ is the device namespace in Windows, where every file is
 	 * a block device */
 	if (strncmp(f->file_name, "\\\\.\\", 4) == 0)
-		f->filetype = FIO_TYPE_BD;
+		f->filetype = FIO_TYPE_BLOCK;
+#endif
 
 	if (!stat(f->file_name, &sb)) {
 		if (S_ISBLK(sb.st_mode))
-			f->filetype = FIO_TYPE_BD;
+			f->filetype = FIO_TYPE_BLOCK;
 		else if (S_ISCHR(sb.st_mode))
 			f->filetype = FIO_TYPE_CHAR;
 		else if (S_ISFIFO(sb.st_mode))
@@ -1149,31 +1341,35 @@
 	}
 }
 
-static int __is_already_allocated(const char *fname)
+static bool __is_already_allocated(const char *fname, bool set)
 {
 	struct flist_head *entry;
-	char *filename;
+	bool ret;
 
-	if (flist_empty(&filename_list))
-		return 0;
+	ret = file_bloom_exists(fname, set);
+	if (!ret)
+		return ret;
 
 	flist_for_each(entry, &filename_list) {
-		filename = flist_entry(entry, struct file_name, list)->filename;
+		struct file_name *fn;
 
-		if (strcmp(filename, fname) == 0)
-			return 1;
+		fn = flist_entry(entry, struct file_name, list);
+
+		if (!strcmp(fn->filename, fname))
+			return true;
 	}
 
-	return 0;
+	return false;
 }
 
-static int is_already_allocated(const char *fname)
+static bool is_already_allocated(const char *fname)
 {
-	int ret;
+	bool ret;
 
 	fio_file_hash_lock();
-	ret = __is_already_allocated(fname);
+	ret = __is_already_allocated(fname, false);
 	fio_file_hash_unlock();
+
 	return ret;
 }
 
@@ -1185,7 +1381,7 @@
 	fn->filename = strdup(fname);
 
 	fio_file_hash_lock();
-	if (!__is_already_allocated(fname)) {
+	if (!__is_already_allocated(fname, true)) {
 		flist_add_tail(&fn->list, &filename_list);
 		fn = NULL;
 	}
@@ -1197,7 +1393,6 @@
 	}
 }
 
-
 static void free_already_allocated(void)
 {
 	struct flist_head *entry, *tmp;
@@ -1223,7 +1418,6 @@
 
 	f = smalloc(sizeof(*f));
 	if (!f) {
-		log_err("fio: smalloc OOM\n");
 		assert(0);
 		return NULL;
 	}
@@ -1234,6 +1428,26 @@
 	return f;
 }
 
+bool exists_and_not_regfile(const char *filename)
+{
+	struct stat sb;
+
+	if (lstat(filename, &sb) == -1)
+		return false;
+
+#ifndef WIN32 /* NOT Windows */
+	if (S_ISREG(sb.st_mode))
+		return false;
+#else
+	/* \\.\ is the device namespace in Windows, where every file
+	 * is a device node */
+	if (S_ISREG(sb.st_mode) && strncmp(filename, "\\\\.\\", 4) != 0)
+		return false;
+#endif
+
+	return true;
+}
+
 int add_file(struct thread_data *td, const char *fname, int numjob, int inc)
 {
 	int cur_files = td->files_index;
@@ -1244,12 +1458,14 @@
 	dprint(FD_FILE, "add file %s\n", fname);
 
 	if (td->o.directory)
-		len = set_name_idx(file_name, td->o.directory, numjob);
+		len = set_name_idx(file_name, PATH_MAX, td->o.directory, numjob,
+					td->o.unique_filename);
 
 	sprintf(file_name + len, "%s", fname);
 
 	/* clean cloned siblings using existing files */
-	if (numjob && is_already_allocated(file_name))
+	if (numjob && is_already_allocated(file_name) &&
+	    !exists_and_not_regfile(fname))
 		return 0;
 
 	f = alloc_new_file(td);
@@ -1280,14 +1496,12 @@
 	/*
 	 * init function, io engine may not be loaded yet
 	 */
-	if (td->io_ops && (td->io_ops->flags & FIO_DISKLESSIO))
+	if (td->io_ops && td_ioengine_flagged(td, FIO_DISKLESSIO))
 		f->real_file_size = -1ULL;
 
 	f->file_name = smalloc_strdup(file_name);
-	if (!f->file_name) {
-		log_err("fio: smalloc OOM\n");
+	if (!f->file_name)
 		assert(0);
-	}
 
 	get_file_type(f);
 
@@ -1490,10 +1704,8 @@
 
 		if (f->file_name) {
 			__f->file_name = smalloc_strdup(f->file_name);
-			if (!__f->file_name) {
-				log_err("fio: smalloc OOM\n");
+			if (!__f->file_name)
 				assert(0);
-			}
 
 			__f->filetype = f->filetype;
 		}
@@ -1549,16 +1761,16 @@
 		lfsr_reset(&f->lfsr, td->rand_seeds[FIO_RAND_BLOCK_OFF]);
 }
 
-int fio_files_done(struct thread_data *td)
+bool fio_files_done(struct thread_data *td)
 {
 	struct fio_file *f;
 	unsigned int i;
 
 	for_each_file(td, f, i)
 		if (!fio_file_done(f))
-			return 0;
+			return false;
 
-	return 1;
+	return true;
 }
 
 /* free memory used in initialization phase only */
diff --git a/fio.1 b/fio.1
index 84d71a0..0167c23 100644
--- a/fio.1
+++ b/fio.1
@@ -1,4 +1,4 @@
-.TH fio 1 "December 2014" "User Manual"
+.TH fio 1 "March 2017" "User Manual"
 .SH NAME
 fio \- flexible I/O tester
 .SH SYNOPSIS
@@ -21,25 +21,29 @@
 Write output to \fIfilename\fR.
 .TP
 .BI \-\-output-format \fR=\fPformat
-Set the reporting format to \fInormal\fR, \fIterse\fR, or \fIjson\fR.
+Set the reporting format to \fInormal\fR, \fIterse\fR, \fIjson\fR, or
+\fIjson+\fR. Multiple formats can be selected, separated by a comma. \fIterse\fR
+is a CSV based format. \fIjson+\fR is like \fIjson\fR, except it adds a full
+dump of the latency buckets.
 .TP
 .BI \-\-runtime \fR=\fPruntime
 Limit run time to \fIruntime\fR seconds.
 .TP
 .B \-\-bandwidth\-log
-Generate per-job bandwidth logs.
+Generate aggregate bandwidth logs.
 .TP
 .B \-\-minimal
 Print statistics in a terse, semicolon-delimited format.
 .TP
 .B \-\-append-terse
 Print statistics in selected mode AND terse, semicolon-delimited format.
+Deprecated, use \-\-output-format instead to select multiple formats.
 .TP
 .B \-\-version
 Display version information and exit.
 .TP
 .BI \-\-terse\-version \fR=\fPversion
-Set terse version output format (Current version 3, or older version 2).
+Set terse version output format (default 3, or 2 or 4).
 .TP
 .B \-\-help
 Display usage information and exit.
@@ -93,7 +97,7 @@
 Background a fio server, writing the pid to the given pid file.
 .TP
 .BI \-\-client \fR=\fPhost
-Instead of running the jobs locally, send and run them on the given host.
+Instead of running the jobs locally, send and run them on the given host or set of hosts.  See client/server section.
 .TP
 .BI \-\-idle\-prof \fR=\fPoption
 Report cpu idleness on a system or percpu basis (\fIoption\fP=system,percpu) or run unit work calibration only (\fIoption\fP=calibrate).
@@ -143,19 +147,77 @@
 String: a sequence of alphanumeric characters.
 .TP
 .I int
-SI integer: a whole number, possibly containing a suffix denoting the base unit
-of the value.  Accepted suffixes are `k', 'M', 'G', 'T', and 'P', denoting
-kilo (1024), mega (1024^2), giga (1024^3), tera (1024^4), and peta (1024^5)
-respectively. If prefixed with '0x', the value is assumed to be base 16
-(hexadecimal). A suffix may include a trailing 'b', for instance 'kb' is
-identical to 'k'. You can specify a base 10 value by using 'KiB', 'MiB','GiB',
-etc. This is useful for disk drives where values are often given in base 10
-values. Specifying '30GiB' will get you 30*1000^3 bytes.
-When specifying times the default suffix meaning changes, still denoting the
-base unit of the value, but accepted suffixes are 'D' (days), 'H' (hours), 'M'
-(minutes), 'S' Seconds, 'ms' (or msec) milli seconds, 'us' (or 'usec') micro
-seconds. Time values without a unit specify seconds.
-The suffixes are not case sensitive.
+Integer. A whole number value, which may contain an integer prefix
+and an integer suffix.
+
+[integer prefix]number[integer suffix]
+
+The optional integer prefix specifies the number's base. The default
+is decimal. 0x specifies hexadecimal.
+
+The optional integer suffix specifies the number's units, and includes
+an optional unit prefix and an optional unit.  For quantities
+of data, the default unit is bytes. For quantities of time,
+the default unit is seconds.
+
+With \fBkb_base=1000\fR, fio follows international standards for unit prefixes.
+To specify power-of-10 decimal values defined in the International
+System of Units (SI):
+.nf
+ki means kilo (K) or 1000
+mi means mega (M) or 1000**2
+gi means giga (G) or 1000**3
+ti means tera (T) or 1000**4
+pi means peta (P) or 1000**5
+.fi
+
+To specify power-of-2 binary values defined in IEC 80000-13:
+.nf
+k means kibi (Ki) or 1024
+m means mebi (Mi) or 1024**2
+g means gibi (Gi) or 1024**3
+t means tebi (Ti) or 1024**4
+p means pebi (Pi) or 1024**5
+.fi
+
+With \fBkb_base=1024\fR (the default), the unit prefixes are opposite from
+those specified in the SI and IEC 80000-13 standards to provide
+compatibility with old scripts.  For example, 4k means 4096.
+
+.nf
+Examples with \fBkb_base=1000\fR:
+4 KiB: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB
+1 MiB: 1048576, 1m, 1024k
+1 MB: 1000000, 1mi, 1000ki
+1 TiB: 1099511627776, 1t, 1024g, 1048576m
+1 TB: 1000000000000, 1ti, 1000gi, 1000000mi
+.fi
+
+.nf
+Examples with \fBkb_base=1024\fR (default):
+4 KiB: 4096, 4096b, 4096B, 4k, 4kb, 4kB, 4K, 4KB
+1 MiB: 1048576, 1m, 1024k
+1 MB: 1000000, 1mi, 1000ki
+1 TiB: 1099511627776, 1t, 1024g, 1048576m
+1 TB: 1000000000000, 1ti, 1000gi, 1000000mi
+.fi
+
+For quantities of data, an optional unit of 'B' may be included
+(e.g.,  'kb' is the same as 'k').
+
+The integer suffix is not case sensitive (e.g., m/mi mean mebi/mega,
+not milli). 'b' and 'B' both mean byte, not bit.
+
+To specify times (units are not case sensitive):
+.nf
+D means days
+H means hours
+M means minutes
+s or sec means seconds (default)
+ms or msec means milliseconds
+us or usec means microseconds
+.fi
+
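To make the suffix arithmetic concrete, here is a minimal standalone C sketch of the default kb_base=1024 interpretation. It is illustrative only: it handles just the plain k/m/g/t suffixes and the 0x prefix, ignores the 'i' forms, the trailing 'b'/'B' and kb_base=1000, and is not fio's actual option parser (that lives in parse.c):

#include <stdio.h>
#include <stdlib.h>

/* Interpret "4k", "1m", "0x1000", ... with kb_base=1024 semantics. */
static unsigned long long parse_size(const char *s)
{
	char *end;
	unsigned long long v = strtoull(s, &end, 0);	/* base 0: 0x -> hex */
	unsigned long long mult = 1;

	switch (*end) {
	case 'k': case 'K': mult = 1ULL << 10; break;
	case 'm': case 'M': mult = 1ULL << 20; break;
	case 'g': case 'G': mult = 1ULL << 30; break;
	case 't': case 'T': mult = 1ULL << 40; break;
	}
	return v * mult;
}

int main(void)
{
	printf("%llu\n", parse_size("4k"));	/* 4096 */
	printf("%llu\n", parse_size("1m"));	/* 1048576 */
	printf("%llu\n", parse_size("0x1000"));	/* 4096 */
	return 0;
}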
 .TP
 .I bool
 Boolean: a true or false value. `0' denotes false, `1' denotes true.
@@ -176,6 +238,14 @@
 May be used to override the job name.  On the command line, this parameter
 has the special purpose of signalling the start of a new job.
 .TP
+.BI wait_for \fR=\fPstr
+Specifies the name of the already defined job to wait for. Only a single
+waitee name may be specified. If set, the job won't be started until all
+workers of the waitee job are done.  Wait_for operates on a job-name basis, so there are
+a few limitations. First, the waitee must be defined prior to the waiter job
+(meaning no forward references). Second, if a job is being referenced as a
+waitee, it must have a unique name (no duplicate waitees).
+.TP
 .BI description \fR=\fPstr
 Human-readable description of the job. It is printed when the job is run, but
 otherwise has no special purpose.
@@ -235,6 +305,11 @@
 .RE
 .P
 .TP
+.BI unique_filename \fR=\fPbool
+To avoid collisions between networked clients, fio defaults to prefixing
+any generated filenames (with a directory specified) with the source of
+the client connecting. To disable this behavior, set this option to 0.
+.TP
 .BI lockfile \fR=\fPstr
 Fio defaults to not locking any files before it does IO to them. If a file or
 file descriptor is shared, fio can serialize IO to that file to make the end
@@ -270,7 +345,7 @@
 Sequential writes.
 .TP
 .B trim
-Sequential trim (Linux block devices only).
+Sequential trims (Linux block devices only).
 .TP
 .B randread
 Random reads.
@@ -279,15 +354,20 @@
 Random writes.
 .TP
 .B randtrim
-Random trim (Linux block devices only).
+Random trims (Linux block devices only).
 .TP
 .B rw, readwrite
 Mixed sequential reads and writes.
 .TP
-.B randrw 
+.B randrw
 Mixed random reads and writes.
+.TP
+.B trimwrite
+Sequential trim and write mixed workload. Blocks will be trimmed first, then
+the same blocks will be written to.
 .RE
 .P
+Fio defaults to read if the option is not specified.
 For mixed I/O, the default split is 50/50. For certain types of io the result
 may still be skewed a bit, since the speed may be different. It is possible to
 specify a number of IO's to do before getting a new offset, this is done by
@@ -331,7 +411,7 @@
 .TP
 .BI unified_rw_reporting \fR=\fPbool
 Fio normally reports statistics on a per data direction basis, meaning that
-read, write, and trim are accounted and reported separately. If this option is
+reads, writes, and trims are accounted and reported separately. If this option is
 set fio sums the results and reports them as "mixed" instead.
 .TP
 .BI randrepeat \fR=\fPbool
@@ -374,9 +454,32 @@
 because ZFS doesn't support it. Default: 'posix'.
 .RE
 .TP
-.BI fadvise_hint \fR=\fPbool
+.BI fadvise_hint \fR=\fPstr
 Use \fBposix_fadvise\fR\|(2) to advise the kernel what I/O patterns
-are likely to be issued. Default: true.
+are likely to be issued. Accepted values are:
+.RS
+.RS
+.TP
+.B 0
+Backwards compatible hint for "no hint".
+.TP
+.B 1
+Backwards compatible hint for "advise with fio workload type". This
+uses \fBFADV_RANDOM\fR for a random workload, and \fBFADV_SEQUENTIAL\fR
+for a sequential workload.
+.TP
+.B sequential
+Advise using \fBFADV_SEQUENTIAL\fR
+.TP
+.B random
+Advise using \fBFADV_RANDOM\fR
+.RE
+.RE
+.TP
+.BI fadvise_stream \fR=\fPint
+Use \fBposix_fadvise\fR\|(2) to advise the kernel what stream ID the
+writes issued belong to. Only supported on Linux. Note, this option
+may change going forward.
 .TP
 .BI size \fR=\fPint
 Total size of I/O for this job.  \fBfio\fR will run until this many bytes have
@@ -418,20 +521,32 @@
 instead. This has identical behavior to setting \fRoffset\fP to the size
 of a file. This option is ignored on non-regular files.
 .TP
-.BI blocksize \fR=\fPint[,int] "\fR,\fB bs" \fR=\fPint[,int]
-Block size for I/O units.  Default: 4k.  Values for reads, writes, and trims
-can be specified separately in the format \fIread\fR,\fIwrite\fR,\fItrim\fR
-either of which may be empty to leave that value at its default. If a trailing
-comma isn't given, the remainder will inherit the last value set.
+.BI blocksize \fR=\fPint[,int][,int] "\fR,\fB bs" \fR=\fPint[,int][,int]
+The block size in bytes for I/O units.  Default: 4096.
+A single value applies to reads, writes, and trims.
+Comma-separated values may be specified for reads, writes, and trims.
+Empty values separated by commas use the default value. A value not
+terminated in a comma applies to subsequent types.
+.nf
+Examples:
+bs=256k    means 256k for reads, writes and trims
+bs=8k,32k  means 8k for reads, 32k for writes and trims
+bs=8k,32k, means 8k for reads, 32k for writes, and default for trims
+bs=,8k     means default for reads, 8k for writes and trims
+bs=,8k,    means default for reads, 8k for writes, and default for trims
+.fi
 .TP
-.BI blocksize_range \fR=\fPirange[,irange] "\fR,\fB bsrange" \fR=\fPirange[,irange]
-Specify a range of I/O block sizes.  The issued I/O unit will always be a
-multiple of the minimum size, unless \fBblocksize_unaligned\fR is set.  Applies
-to both reads and writes if only one range is given, but can be specified
-separately with a comma separating the values. Example: bsrange=1k-4k,2k-8k.
-Also (see \fBblocksize\fR).
+.BI blocksize_range \fR=\fPirange[,irange][,irange] "\fR,\fB bsrange" \fR=\fPirange[,irange][,irange]
+A range of block sizes in bytes for I/O units.
+The issued I/O unit will always be a multiple of the minimum size, unless
+\fBblocksize_unaligned\fR is set.
+Comma-separated ranges may be specified for reads, writes, and trims
+as described in \fBblocksize\fR.
+.nf
+Example: bsrange=1k-4k,2k-8k.
+.fi
 .TP
-.BI bssplit \fR=\fPstr
+.BI bssplit \fR=\fPstr[,str][,str]
 This option allows even finer grained control of the block sizes issued,
 not just even splits between them. With this option, you can weight various
 block sizes for exact control of the issued IO for a job that has mixed
@@ -439,31 +554,31 @@
 optionally adding as many definitions as needed separated by a colon.
 Example: bssplit=4k/10:64k/50:32k/40 would issue 50% 64k blocks, 10% 4k
 blocks and 40% 32k blocks. \fBbssplit\fR also supports giving separate
-splits to reads and writes. The format is identical to what the
-\fBbs\fR option accepts, the read and write parts are separated with a
-comma.
+splits to reads, writes, and trims.
+Comma-separated values may be specified for reads, writes, and trims
+as described in \fBblocksize\fR.
 .TP
-.B blocksize_unaligned\fR,\fP bs_unaligned
-If set, any size in \fBblocksize_range\fR may be used.  This typically won't
+.B blocksize_unaligned\fR,\fB bs_unaligned
+If set, fio will issue I/O units with any size within \fBblocksize_range\fR,
+not just multiples of the minimum size.  This typically won't
 work with direct I/O, as that normally requires sector alignment.
 .TP
-.BI blockalign \fR=\fPint[,int] "\fR,\fB ba" \fR=\fPint[,int]
-At what boundary to align random IO offsets. Defaults to the same as 'blocksize'
-the minimum blocksize given.  Minimum alignment is typically 512b
-for using direct IO, though it usually depends on the hardware block size.
-This option is mutually exclusive with using a random map for files, so it
-will turn off that option.
-.TP
 .BI bs_is_seq_rand \fR=\fPbool
 If this option is set, fio will use the normal read,write blocksize settings as
-sequential,random instead. Any random read or write will use the WRITE
-blocksize settings, and any sequential read or write will use the READ
-blocksize setting.
+sequential,random blocksize settings instead. Any random read or write will
+use the WRITE blocksize settings, and any sequential read or write will use
+the READ blocksize settings.
+.TP
+.BI blockalign \fR=\fPint[,int][,int] "\fR,\fB ba" \fR=\fPint[,int][,int]
+Boundary to which fio will align random I/O units. Default: \fBblocksize\fR.
+Minimum alignment is typically 512b for using direct IO, though it usually
+depends on the hardware block size.  This option is mutually exclusive with
+using a random map for files, so it will turn off that option.
+Comma-separated values may be specified for reads, writes, and trims
+as described in \fBblocksize\fR.
 .TP
 .B zero_buffers
 Initialize buffers with all zeros. Default: fill buffers with random data.
-The resulting IO buffers will not be completely zeroed, unless
-\fPscramble_buffers\fR is also turned off.
 .TP
 .B refill_buffers
 If this option is given, fio will refill the IO buffers on every submit. The
@@ -500,7 +615,26 @@
 of IO buffers is defined by the other options related to buffer contents. The
 setting can be any pattern of bytes, and can be prefixed with 0x for hex
 values. It may also be a string, where the string must then be wrapped with
-"".
+"", e.g.:
+.RS
+.RS
+\fBbuffer_pattern\fR="abcd"
+.RS
+or
+.RE
+\fBbuffer_pattern\fR=-12
+.RS
+or
+.RE
+\fBbuffer_pattern\fR=0xdeadface
+.RE
+.LP
+You can also combine everything together in any order:
+.LP
+.RS
+\fBbuffer_pattern\fR=0xdeadface"abcd"-12
+.RE
+.RE
 .TP
 .BI dedupe_percentage \fR=\fPint
 If set, fio will generate this percentage of identical buffers when writing.
@@ -528,10 +662,24 @@
 .TP
 .B sequential
 Do each file in the set sequentially.
+.TP
+.B zipf
+Use a zipfian distribution to decide what file to access.
+.TP
+.B pareto
+Use a pareto distribution to decide what file to access.
+.TP
+.B gauss
+Use a gaussian (normal) distribution to decide what file to access.
 .RE
 .P
-The number of I/Os to issue before switching to a new file can be specified by
-appending `:\fIint\fR' to the service type.
+For \fBrandom\fR, \fBroundrobin\fR, and \fBsequential\fR, a postfix can be
+appended to tell fio how many I/Os to issue before switching to a new file.
+For example, specifying \fBfile_service_type=random:8\fR would cause fio to
+issue \fI8\fR I/Os before selecting a new file at random. For the non-uniform
+distributions, a floating point postfix can be given to influence how the
+distribution is skewed. See \fBrandom_distribution\fR for a description of how
+that would work.
 .RE
 .TP
 .BI ioengine \fR=\fPstr
@@ -545,6 +693,7 @@
 .TP
 .B psync
 Basic \fBpread\fR\|(2) or \fBpwrite\fR\|(2) I/O.
+Default on all supported operating systems except for Windows.
 .TP
 .B vsync
 Basic \fBreadv\fR\|(2) or \fBwritev\fR\|(2) I/O. Will emulate queuing by
@@ -553,6 +702,9 @@
 .B pvsync
 Basic \fBpreadv\fR\|(2) or \fBpwritev\fR\|(2) I/O.
 .TP
+.B pvsync2
+Basic \fBpreadv2\fR\|(2) or \fBpwritev2\fR\|(2) I/O.
+.TP
 .B libaio
 Linux native asynchronous I/O. This ioengine defines engine specific options.
 .TP
@@ -563,7 +715,7 @@
 Solaris native asynchronous I/O.
 .TP
 .B windowsaio
-Windows native asynchronous I/O.
+Windows native asynchronous I/O. Default on Windows.
 .TP
 .B mmap
 File is memory mapped with \fBmmap\fR\|(2) and data copied using
@@ -573,9 +725,6 @@
 \fBsplice\fR\|(2) is used to transfer the data and \fBvmsplice\fR\|(2) to
 transfer data from user-space to the kernel.
 .TP
-.B syslet-rw
-Use the syslet system calls to make regular read/write asynchronous.
-.TP
 .B sg
 SCSI generic sg v3 I/O. May be either synchronous using the SG_IO ioctl, or if
 the target is an sg character device, we use \fBread\fR\|(2) and
@@ -597,7 +746,8 @@
 .TP
 .B cpuio
 Doesn't transfer any data, but burns CPU cycles according to \fBcpuload\fR and
-\fBcpucycles\fR parameters.
+\fBcpuchunks\fR parameters. A job never finishes unless there is at least one
+non-cpuio job.
 .TP
 .B guasi
 The GUASI I/O engine is the Generic Userspace Asynchronous Syscall Interface
@@ -628,8 +778,8 @@
 request to DDIR_WRITE event
 .TP
 .B rbd
-IO engine supporting direct access to Ceph Rados Block Devices (RBD) via librbd 
-without the need to use the kernel rbd driver. This ioengine defines engine specific 
+IO engine supporting direct access to Ceph Rados Block Devices (RBD) via librbd
+without the need to use the kernel rbd driver. This ioengine defines engine specific
 options.
 .TP
 .B gfapi
@@ -652,6 +802,21 @@
 example job file to create such files, use rw=write option). Please note, you
 might want to set necessary environment variables to work with hdfs/libhdfs
 properly.
+.TP
+.B mtd
+Read, write and erase an MTD character device (e.g., /dev/mtd0). Discards are
+treated as erases. Depending on the underlying device type, the I/O may have
+to go in a certain pattern, e.g., on NAND, writing sequentially to erase blocks
+and discarding before overwriting. The trimwrite mode works well for this
+constraint.
+.TP
+.B pmemblk
+Read and write using filesystem DAX to a file on a filesystem mounted with
+DAX on a persistent memory device through the NVML libpmemblk library.
+.TP
+.B dev-dax
+Read and write using device DAX to a persistent memory device
+(e.g., /dev/dax0.0) through the NVML libpmem library.
 .RE
 .P
 .RE
@@ -665,10 +830,13 @@
 not async on that OS. Keep an eye on the IO depth distribution in the
 fio output to verify that the achieved depth is as expected. Default: 1.
 .TP
-.BI iodepth_batch \fR=\fPint
-Number of I/Os to submit at once.  Default: \fBiodepth\fR.
+.BI iodepth_batch \fR=\fPint "\fR,\fP iodepth_batch_submit" \fR=\fPint
+This defines how many pieces of IO to submit at once. It defaults to 1
+which means that we submit each IO as soon as it is available, but can
+be raised to submit bigger batches of IO at a time. If it is set to 0
+the \fBiodepth\fR value will be used.
 .TP
-.BI iodepth_batch_complete \fR=\fPint
+.BI iodepth_batch_complete_min \fR=\fPint "\fR,\fP iodepth_batch_complete" \fR=\fPint
 This defines how many pieces of IO to retrieve at once. It defaults to 1 which
  means that we'll ask for a minimum of 1 IO in the retrieval process from the
 kernel. The IO retrieval will go on until we hit the limit set by
@@ -676,9 +844,52 @@
 completed events before queuing more IO. This helps reduce IO latency, at the
 cost of more retrieval system calls.
 .TP
+.BI iodepth_batch_complete_max \fR=\fPint
+This defines the maximum number of pieces of IO to
+retrieve at once. This variable should be used along with the
+\fBiodepth_batch_complete_min\fR=int variable, specifying the range
+of min and max amounts of IO which should be retrieved. By default
+it is equal to the \fBiodepth_batch_complete_min\fR value.
+
+Example #1:
+.RS
+.RS
+\fBiodepth_batch_complete_min\fR=1
+.LP
+\fBiodepth_batch_complete_max\fR=<iodepth>
+.RE
+
+which means that we will retrieve at least 1 IO and up to the
+whole submitted queue depth. If no IO has been completed
+yet, we will wait.
+
+Example #2:
+.RS
+\fBiodepth_batch_complete_min\fR=0
+.LP
+\fBiodepth_batch_complete_max\fR=<iodepth>
+.RE
+
+which means that we can retrieve up to the whole submitted
+queue depth, but if no IO has been completed yet, we will
+NOT wait and immediately exit the system call. In this example
+we simply do polling.
+.RE
+.TP
 .BI iodepth_low \fR=\fPint
 Low watermark indicating when to start filling the queue again.  Default:
-\fBiodepth\fR. 
+\fBiodepth\fR.
+.TP
+.BI io_submit_mode \fR=\fPstr
+This option controls how fio submits the IO to the IO engine. The default is
+\fBinline\fR, which means that the fio job threads submit and reap IO directly.
+If set to \fBoffload\fR, the job threads will offload IO submission to a
+dedicated pool of IO threads. This requires some coordination and thus has a
+bit of extra overhead, especially for lower queue depth IO where it can
+increase latencies. The benefit is that fio can manage submission rates
+independently of the device completion rates. This avoids skewed latency
+reporting if IO gets backed up on the device side (the coordinated omission
+problem).
 .TP
 .BI direct \fR=\fPbool
 If true, use non-buffered I/O (usually O_DIRECT).  Default: false.
@@ -780,17 +991,53 @@
 .B pareto
 Pareto distribution
 .TP
+.B gauss
+Normal (gaussian) distribution
+.TP
+.B zoned
+Zoned random distribution
+.TP
+.RE
+When using a \fBzipf\fR or \fBpareto\fR distribution, an input value is also
+needed to define the access pattern. For \fBzipf\fR, this is the zipf theta.
+For \fBpareto\fR, it's the pareto power. Fio includes a test program, genzipf,
+that can be used to visualize what the given input values will yield in terms of
+hit rates. If you wanted to use \fBzipf\fR with a theta of 1.2, you would use
+random_distribution=zipf:1.2 as the option. If a non-uniform model is used,
+fio will disable use of the random map. For the \fBgauss\fR distribution, a
+normal deviation is supplied as a value between 0 and 100.
+.P
+.RS
+For a \fBzoned\fR distribution, fio supports specifying percentages of IO
+access that should fall within what range of the file or device. For example,
+given a criteria of:
+.P
+.RS
+60% of accesses should be to the first 10%
+.RE
+.RS
+30% of accesses should be to the next 20%
+.RE
+.RS
+8% of accesses should be to the next 30%
+.RE
+.RS
+2% of accesses should be to the next 40%
 .RE
 .P
-When using a zipf or pareto distribution, an input value is also needed to
-define the access pattern. For zipf, this is the zipf theta. For pareto,
-it's the pareto power. Fio includes a test program, genzipf, that can be
-used visualize what the given input values will yield in terms of hit rates.
-If you wanted to use zipf with a theta of 1.2, you would use
-random_distribution=zipf:1.2 as the option. If a non-uniform model is used,
-fio will disable use of the random map.
+we can define that through zoning of the random accesses. For the above
+example, the user would do:
+.P
+.RS
+.B random_distribution=zoned:60/10:30/20:8/30:2/40
+.RE
+.P
+similarly to how \fBbssplit\fR works for setting ranges and percentages of block
+sizes. Like \fBbssplit\fR, it's possible to specify separate zones for reads,
+writes, and trims. If just one set is given, it'll apply to all of them.
+.RE
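To make the zone arithmetic concrete, the following standalone C sketch (hypothetical code, not fio's implementation) maps a uniform draw in [0,100) onto the 60/10:30/20:8/30:2/40 example above, returning the target offset as a fraction of the file:

#include <stdio.h>
#include <stdlib.h>

struct zone { double access_pct, size_pct; };

/* Map a uniform draw in [0,100) to an offset fraction of the file. */
static double zoned_offset_fraction(const struct zone *z, int nr, double draw)
{
	double acc = 0.0, start = 0.0;
	int i;

	for (i = 0; i < nr; i++) {
		if (draw < acc + z[i].access_pct) {
			/* Uniform within the chosen zone. */
			double frac = (draw - acc) / z[i].access_pct;

			return start + frac * z[i].size_pct / 100.0;
		}
		acc += z[i].access_pct;
		start += z[i].size_pct / 100.0;
	}
	return start;
}

int main(void)
{
	struct zone z[] = { { 60, 10 }, { 30, 20 }, { 8, 30 }, { 2, 40 } };
	double draw = 100.0 * rand() / ((double) RAND_MAX + 1);

	printf("access lands at %.1f%% into the file\n",
	       100.0 * zoned_offset_fraction(z, 4, draw));
	return 0;
}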
 .TP
-.BI percentage_random \fR=\fPint
+.BI percentage_random \fR=\fPint[,int][,int]
 For a random workload, set how big a percentage should be random. This defaults
 to 100%, in which case the workload is fully random. It can be set from
 anywhere from 0 to 100.  Setting it to 0 would make the workload fully
@@ -818,6 +1065,9 @@
 .B lfsr
 Linear feedback shift register generator
 .TP
+.B tausworthe64
+Strong 64-bit 2^258 cycle random number generator
+.TP
 .RE
 .P
 Tausworthe is a strong random number generator, but it requires tracking on the
@@ -826,7 +1076,9 @@
 computationally expensive. It's not a true random generator, however, though
 for IO purposes it's typically good enough. LFSR only works with single block
 sizes, not with workloads that use multiple block sizes. If used with such a
-workload, fio may read or write some blocks multiple times.
+workload, fio may read or write some blocks multiple times. The default
+value is tausworthe, unless the required space exceeds 2^32 blocks. If it does,
+then tausworthe64 is selected automatically.
 .TP
 .BI nice \fR=\fPint
 Run job with given nice value.  See \fBnice\fR\|(2).
@@ -854,31 +1106,41 @@
 words, this setting effectively caps the queue depth if the latter is larger.
 Default: 1.
 .TP
-.BI rate \fR=\fPint
+.BI rate \fR=\fPint[,int][,int]
 Cap bandwidth used by this job. The number is in bytes/sec, the normal postfix
 rules apply. You can use \fBrate\fR=500k to limit reads and writes to 500k each,
-or you can specify read and writes separately. Using \fBrate\fR=1m,500k would
-limit reads to 1MB/sec and writes to 500KB/sec. Capping only reads or writes
+or you can specify reads, write, and trim limits separately.
+Using \fBrate\fR=1m,500k would
+limit reads to 1MiB/sec and writes to 500KiB/sec. Capping only reads or writes
 can be done with \fBrate\fR=,500k or \fBrate\fR=500k,. The former will only
-limit writes (to 500KB/sec), the latter will only limit reads.
+limit writes (to 500KiB/sec), the latter will only limit reads.
 .TP
-.BI ratemin \fR=\fPint
+.BI rate_min \fR=\fPint[,int][,int]
 Tell \fBfio\fR to do whatever it can to maintain at least the given bandwidth.
 Failing to meet this requirement will cause the job to exit. The same format
-as \fBrate\fR is used for read vs write separation.
+as \fBrate\fR is used for read vs write vs trim separation.
 .TP
-.BI rate_iops \fR=\fPint
+.BI rate_iops \fR=\fPint[,int][,int]
 Cap the bandwidth to this number of IOPS. Basically the same as rate, just
 specified independently of bandwidth. The same format as \fBrate\fR is used for
-read vs write separation. If \fBblocksize\fR is a range, the smallest block
+read vs write vs trim separation. If \fBblocksize\fR is a range, the smallest block
 size is used as the metric.
 .TP
-.BI rate_iops_min \fR=\fPint
+.BI rate_iops_min \fR=\fPint[,int][,int]
 If this rate of I/O is not met, the job will exit. The same format as \fBrate\fR
-is used for read vs write separation.
+is used for read vs write vs trim separation.
 .TP
-.BI ratecycle \fR=\fPint
-Average bandwidth for \fBrate\fR and \fBratemin\fR over this number of
+.BI rate_process \fR=\fPstr
+This option controls how fio manages rated IO submissions. The default is
+\fBlinear\fR, which submits IO in a linear fashion with fixed delays between
+IOs that gets adjusted based on IO completion rates. If this is set to
+\fBpoisson\fR, fio will submit IO based on a more real world random request
+flow, known as the Poisson process
+(https://en.wikipedia.org/wiki/Poisson_process). The lambda will be
+10^6 / IOPS for the given workload.
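For illustration, here is a standalone C sketch of what a Poisson submission schedule looks like (this is not fio's rate engine, and the 1000 IOPS target is just an example): gaps between submissions are drawn from an exponential distribution whose mean is 10^6 / IOPS microseconds. Build with -lm.

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

/* Exponentially distributed gap with mean 1e6 / iops microseconds. */
static double next_delay_usec(double iops)
{
	double u = (rand() + 1.0) / ((double) RAND_MAX + 2.0);	/* in (0,1) */

	return -log(u) * (1e6 / iops);
}

int main(void)
{
	double t = 0.0;
	int i;

	for (i = 0; i < 5; i++) {
		t += next_delay_usec(1000.0);	/* target 1000 IOPS */
		printf("submit I/O #%d at %.1f usec\n", i + 1, t);
	}
	return 0;
}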
+.TP
+.BI rate_cycle \fR=\fPint
+Average bandwidth for \fBrate\fR and \fBrate_min\fR over this number of
 milliseconds.  Default: 1000ms.
 .TP
 .BI latency_target \fR=\fPint
@@ -973,6 +1235,50 @@
 that the \fBramp_time\fR is considered lead in time for a job, thus it will
 increase the total runtime if a special timeout or runtime is specified.
 .TP
+.BI steadystate \fR=\fPstr:float "\fR,\fP ss" \fR=\fPstr:float
+Define the criterion and limit for assessing steady state performance. The
+first parameter designates the criterion whereas the second parameter sets the
+threshold. When the criterion falls below the threshold for the specified
+duration, the job will stop. For example, iops_slope:0.1% will direct fio
+to terminate the job when the least squares regression slope falls below 0.1%
+of the mean IOPS. If group_reporting is enabled this will apply to all jobs in
+the group. All assessments are carried out using only data from the rolling
+collection window. Threshold limits can be expressed as a fixed value or as a
+percentage of the mean in the collection window. Below are the available steady
+state assessment criteria.
+.RS
+.RS
+.TP
+.B iops
+Collect IOPS data. Stop the job if all individual IOPS measurements are within
+the specified limit of the mean IOPS (e.g., iops:2 means that all individual
+IOPS values must be within 2 of the mean, whereas iops:0.2% means that all
+individual IOPS values must be within 0.2% of the mean IOPS to terminate the
+job).
+.TP
+.B iops_slope
+Collect IOPS data and calculate the least squares regression slope. Stop the
+job if the slope falls below the specified limit.
+.TP
+.B bw
+Collect bandwidth data. Stop the job if all individual bandwidth measurements
+are within the specified limit of the mean bandwidth.
+.TP
+.B bw_slope
+Collect bandwidth data and calculate the least squares regression slope. Stop
+the job if the slope falls below the specified limit.
+.RE
+.RE
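For the *_slope criteria above, the check boils down to a least squares regression slope over the per-second samples in the rolling window, compared against a percentage of the window mean. A standalone C sketch of that idea (not fio's steadystate.c implementation; the sample values are invented):

#include <math.h>
#include <stdio.h>

/* True if |least squares slope| < pct% of the window mean. */
static int slope_within_pct(const double *y, int n, double pct)
{
	double mean = 0.0, xbar = (n - 1) / 2.0, num = 0.0, den = 0.0;
	int i;

	for (i = 0; i < n; i++)
		mean += y[i];
	mean /= n;

	for (i = 0; i < n; i++) {
		num += (i - xbar) * (y[i] - mean);
		den += (i - xbar) * (i - xbar);
	}

	return fabs(num / den) < (pct / 100.0) * mean;
}

int main(void)
{
	double iops[] = { 1010, 1003, 998, 1005, 1001, 999 };

	printf("steady: %d\n", slope_within_pct(iops, 6, 0.1));
	return 0;
}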
+.TP
+.BI steadystate_duration \fR=\fPtime "\fR,\fP ss_dur" \fR=\fPtime
+A rolling window of this duration will be used to judge whether steady state
+has been reached. Data will be collected once per second. The default is 0
+which disables steady state detection.
+.TP
+.BI steadystate_ramp_time \fR=\fPtime "\fR,\fP ss_ramp" \fR=\fPtime
+Allow the job to run for the specified duration before beginning data collection
+for checking the steady state job termination criterion. The default is 0.
+.TP
 .BI invalidate \fR=\fPbool
 Invalidate buffer-cache for the file prior to starting I/O.  Default: true.
 .TP
@@ -986,7 +1292,7 @@
 .RS
 .TP
 .B malloc
-Allocate memory with \fBmalloc\fR\|(3).
+Allocate memory with \fBmalloc\fR\|(3). Default memory type.
 .TP
 .B shm
 Use shared memory buffers allocated through \fBshmget\fR\|(2).
@@ -1000,6 +1306,12 @@
 .TP
 .B mmaphuge
 Same as \fBmmap\fR, but use huge files as backing.
+.TP
+.B mmapshared
+Same as \fBmmap\fR, but use a MMAP_SHARED mapping.
+.TP
+.B cudamalloc
+Use GPU memory as the buffers for GPUDirect RDMA benchmark. The ioengine must be \fBrdma\fR.
 .RE
 .P
 The amount of memory allocated is the maximum allowed \fBblocksize\fR for the
@@ -1023,18 +1335,24 @@
 .TP
 .BI hugepage\-size \fR=\fPint
 Defines the size of a huge page.  Must be at least equal to the system setting.
-Should be a multiple of 1MB. Default: 4MB.
+Should be a multiple of 1MiB. Default: 4MiB.
 .TP
 .B exitall
 Terminate all jobs when one finishes.  Default: wait for each job to finish.
 .TP
+.B exitall_on_error \fR=\fPbool
+Terminate all jobs if one job finishes in error.  Default: wait for each job
+to finish.
+.TP
 .BI bwavgtime \fR=\fPint
-Average bandwidth calculations over the given time in milliseconds.  Default:
-500ms.
+Average bandwidth calculations over the given time in milliseconds. If the job
+also does bandwidth logging through \fBwrite_bw_log\fR, then the minimum of
+this option and \fBlog_avg_msec\fR will be used.  Default: 500ms.
 .TP
 .BI iopsavgtime \fR=\fPint
-Average IOPS calculations over the given time in milliseconds.  Default:
-500ms.
+Average IOPS calculations over the given time in milliseconds. If the job
+also does IOPS logging through \fBwrite_iops_log\fR, then the minimum of
+this option and \fBlog_avg_msec\fR will be used.  Default: 500ms.
 .TP
 .BI create_serialize \fR=\fPbool
 If true, serialize file creation for the jobs.  Default: true.
@@ -1050,6 +1368,17 @@
 laid out or updated on disk, only that will be done. The actual job contents
 are not executed.
 .TP
+.BI allow_file_create \fR=\fPbool
+If true, fio is permitted to create files as part of its workload. This is
+the default behavior. If this option is false, then fio will error out if the
+files it needs to use don't already exist. Default: true.
+.TP
+.BI allow_mounted_write \fR=\fPbool
+If this isn't set, fio will abort jobs that are destructive (e.g. that write)
+to what appears to be a mounted device or partition. This should help catch
+cases where a test would inadvertently destroy data on a mounted file system.
+Default: false.
+.TP
 .BI pre_read \fR=\fPbool
 If this is given, files will be pre-read into memory before starting the given
 IO operation. This will also clear the \fR \fBinvalidate\fR flag, since it is
@@ -1060,6 +1389,9 @@
 .BI unlink \fR=\fPbool
 Unlink job files when done.  Default: false.
 .TP
+.BI unlink_each_loop \fR=\fPbool
+Unlink job files after each iteration or loop.  Default: false.
+.TP
 .BI loops \fR=\fPint
 Specifies the number of iterations (runs of the same workload) of this job.
 Default: 1.
@@ -1076,19 +1408,30 @@
 Default: true.
 .TP
 .BI verify \fR=\fPstr
-Method of verifying file contents after each iteration of the job.  Allowed
-values are:
+Method of verifying file contents after each iteration of the job. Each
+verification method also implies verification of a special header, which is
+written to the beginning of each block. This header also includes meta
+information, like the offset of the block, block number, timestamp of when the
+block was written, etc.  \fBverify\fR=str can be combined with the \fBverify_pattern\fR=str
+option.  The allowed values are:
 .RS
 .RS
 .TP
-.B md5 crc16 crc32 crc32c crc32c-intel crc64 crc7 sha256 sha512 sha1 xxhash
+.B md5 crc16 crc32 crc32c crc32c-intel crc64 crc7 sha256 sha512 sha1 sha3-224 sha3-256 sha3-384 sha3-512 xxhash
 Store appropriate checksum in the header of each block. crc32c-intel is
 hardware accelerated SSE4.2 driven, falls back to regular crc32c if
 not supported by the system.
 .TP
 .B meta
-Write extra information about each I/O (timestamp, block number, etc.). The
-block number is verified. See \fBverify_pattern\fR as well.
+This option is deprecated, since meta information is now included in the
+generic verification header and meta verification happens by default.  For
+detailed information see the description of the \fBverify\fR=str setting. This
+option is kept for compatibility's sake with old configurations. Do not use it.
+.TP
+.B pattern
+Verify a strict pattern. Normally fio includes a header with some basic
+information and checksumming, but if this option is set, only the
+specific pattern set with \fBverify_pattern\fR is verified.
 .TP
 .B null
 Pretend to verify.  Used for testing internals.
@@ -1123,7 +1466,18 @@
 fio will fill 1/2/3/4 bytes of the buffer at a time (it can be either a
 decimal or a hex number). The verify_pattern if larger than a 32-bit quantity
 has to be a hex number that starts with either "0x" or "0X". Use with
-\fBverify\fP=meta.
+\fBverify\fP=str. Also, verify_pattern supports the %o format, which means that
+for each block the offset will be written and then verified back, e.g.:
+.RS
+.RS
+\fBverify_pattern\fR=%o
+.RE
+Or use a combination of everything:
+.LP
+.RS
+\fBverify_pattern\fR=0xff%o"abcd"-21
+.RE
+.RE
 .TP
 .BI verify_fatal \fR=\fPbool
 If true, exit the job on the first observed verification failure.  Default:
@@ -1158,8 +1512,8 @@
 .BI verify_backlog_batch \fR=\fPint
 Control how many blocks fio will verify if verify_backlog is set. If not set,
 will default to the value of \fBverify_backlog\fR (meaning the entire queue is
-read back and verified).  If \fBverify_backlog_batch\fR is less than 
-\fBverify_backlog\fR then not all blocks will be verified,  if 
+read back and verified).  If \fBverify_backlog_batch\fR is less than
+\fBverify_backlog\fR then not all blocks will be verified,  if
 \fBverify_backlog_batch\fR is larger than \fBverify_backlog\fR,  some blocks
 will be verified more than once.
 .TP
@@ -1197,8 +1551,12 @@
 Start a new reporting group.  If not given, all jobs in a file will be part
 of the same reporting group, unless separated by a stonewall.
 .TP
+.BI stats \fR=\fPbool
+By default, fio collects and shows final output results for all jobs that run.
+If this option is set to 0, then fio will ignore this job in the final stat output.
+.TP
 .BI numjobs \fR=\fPint
-Number of clones (processes/threads performing the same workload) of this job.  
+Number of clones (processes/threads performing the same workload) of this job.
 Default: 1.
 .TP
 .B group_reporting
@@ -1240,33 +1598,75 @@
 from.  Setting \fBreplay_redirect\fR causes all IOPS to be replayed onto the
 single specified device regardless of the device it was recorded from.
 .TP
+.BI replay_align \fR=\fPint
+Force alignment of IO offsets and lengths in a trace to this power of 2 value.
+.TP
+.BI replay_scale \fR=\fPint
+Scale sector offsets down by this factor when replaying traces.
+.TP
+.BI per_job_logs \fR=\fPbool
+If set, this generates bw/clat/iops logs with per-file private filenames. If
+not set, jobs with identical names will share the log filename. Default: true.
+.TP
 .BI write_bw_log \fR=\fPstr
-If given, write a bandwidth log of the jobs in this job file. Can be used to
-store data of the bandwidth of the jobs in their lifetime. The included
-fio_generate_plots script uses gnuplot to turn these text files into nice
-graphs. See \fBwrite_lat_log\fR for behaviour of given filename. For this
-option, the postfix is _bw.x.log, where x is the index of the job (1..N,
-where N is the number of jobs)
+If given, write a bandwidth log for this job. Can be used to store data of the
+bandwidth of the jobs in their lifetime. The included fio_generate_plots script
+uses gnuplot to turn these text files into nice graphs. See \fBwrite_lat_log\fR
+for behaviour of given filename. For this option, the postfix is _bw.x.log,
+where x is the index of the job (1..N, where N is the number of jobs). If
+\fBper_job_logs\fR is false, then the filename will not include the job index.
+See the \fBLOG FILE FORMATS\fR
+section.
 .TP
 .BI write_lat_log \fR=\fPstr
 Same as \fBwrite_bw_log\fR, but writes I/O completion latencies.  If no
 filename is given with this option, the default filename of
 "jobname_type.x.log" is used, where x is the index of the job (1..N, where
 N is the number of jobs). Even if the filename is given, fio will still
-append the type of log.
+append the type of log. If \fBper_job_logs\fR is false, then the filename will
+not include the job index. See the \fBLOG FILE FORMATS\fR section.
+.TP
+.BI write_hist_log \fR=\fPstr
+Same as \fBwrite_lat_log\fR, but writes I/O completion latency histograms. If
+no filename is given with this option, the default filename of
+"jobname_clat_hist.x.log" is used, where x is the index of the job (1..N, where
+N is the number of jobs). Even if the filename is given, fio will still append
+the type of log. If \fBper_job_logs\fR is false, then the filename will not
+include the job index. See the \fBLOG FILE FORMATS\fR section.
 .TP
 .BI write_iops_log \fR=\fPstr
 Same as \fBwrite_bw_log\fR, but writes IOPS. If no filename is given with this
 option, the default filename of "jobname_type.x.log" is used, where x is the
 index of the job (1..N, where N is the number of jobs). Even if the filename
-is given, fio will still append the type of log.
+is given, fio will still append the type of log. If \fBper_job_logs\fR is false,
+then the filename will not include the job index. See the \fBLOG FILE FORMATS\fR
+section.
 .TP
 .BI log_avg_msec \fR=\fPint
 By default, fio will log an entry in the iops, latency, or bw log for every
 IO that completes. When writing to the disk log, that can quickly grow to a
 very large size. Setting this option makes fio average each log entry
-over the specified period of time, reducing the resolution of the log.
-Defaults to 0.
+over the specified period of time, reducing the resolution of the log. See
+\fBlog_max_value\fR as well.  Defaults to 0, logging all entries.
+.TP
+.BI log_max_value \fR=\fPbool
+If \fBlog_avg_msec\fR is set, fio logs the average over that window. If you
+instead want to log the maximum value, set this option to 1.  Defaults to
+0, meaning that averaged values are logged.
+.TP
+.BI log_hist_msec \fR=\fPint
+Same as \fBlog_avg_msec\fR, but logs entries for completion latency histograms.
+Computing latency percentiles from averages of intervals using \fBlog_avg_msec\fR
+is inaccurate. Setting this option makes fio log histogram entries over the
+specified period of time, reducing log sizes for high IOPS devices while
+retaining percentile accuracy. See \fBlog_hist_coarseness\fR as well. Defaults
+to 0, meaning histogram logging is disabled.
+.TP
+.BI log_hist_coarseness \fR=\fPint
+Integer ranging from 0 to 6, defining the coarseness of the resolution of the
+histogram logs enabled with \fBlog_hist_msec\fR. For each increment in
+coarseness, fio outputs half as many bins. Defaults to 0, for which histogram
+logs contain 1216 latency bins. See the \fBLOG FILE FORMATS\fR section.
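+For example, a coarseness of 3 reduces the number of bins by a factor of
+2^3 = 8, from 1216 to 152 bins per log entry.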
 .TP
 .BI log_offset \fR=\fPbool
 If this is set, the iolog options will include the byte offset for the IO
@@ -1283,11 +1683,25 @@
 normally at the end of a run, by decompressing the chunks and storing them
 in the specified log file. This feature depends on the availability of zlib.
 .TP
+.BI log_compression_cpus \fR=\fPstr
+Define the set of CPUs that are allowed to handle online log compression
+for the IO jobs. This can provide better isolation between performance
+sensitive jobs and background compression work.
+.TP
 .BI log_store_compressed \fR=\fPbool
-If set, and \fBlog\fR_compression is also set, fio will store the log files in
-a compressed format. They can be decompressed with fio, using the
-\fB\-\-inflate-log\fR command line parameter. The files will be stored with a
-\fB\.fz\fR suffix.
+If set, fio will store the log files in a compressed format. They can be
+decompressed with fio, using the \fB\-\-inflate-log\fR command line parameter.
+The files will be stored with a \fB\.fz\fR suffix.
+.TP
+.BI log_unix_epoch \fR=\fPbool
+If set, fio will log Unix timestamps to the log files produced by enabling
+\fBwrite_type_log\fR for each log type, instead of the default zero-based
+timestamps.
+.TP
+.BI block_error_percentiles \fR=\fPbool
+If set, record errors in trim block-sized units from writes and trims and output
+a histogram of how many trims it took to get to errors, and what kind of error
+was encountered.
 .TP
 .BI disable_lat \fR=\fPbool
 Disable measurements of total latency numbers. Useful only for cutting
@@ -1373,8 +1787,8 @@
 Error may be symbol ('ENOSPC', 'ENOMEM') or an integer.
 .br
 Example: ignore_error=EAGAIN,ENOSPC:122 .
-.br	
-This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from WRITE. 
+.br
+This option will ignore EAGAIN from READ, and ENOSPC and 122(EDQUOT) from WRITE.
 .TP
 .BI error_dump \fR=\fPbool
 If set dump every error even if it is non fatal, true by default. If disabled
@@ -1447,9 +1861,9 @@
 Enable the reporting of percentiles of completion latencies.
 .TP
 .BI percentile_list \fR=\fPfloat_list
-Overwrite the default list of percentiles for completion
-latencies. Each number is a floating number in the range (0,100], and
-the maximum length of the list is 20. Use ':' to separate the
+Overwrite the default list of percentiles for completion latencies and the
+block error histogram. Each number is a floating number in the range (0,100],
+and the maximum length of the list is 20. Use ':' to separate the
 numbers. For example, \-\-percentile_list=99.5:99.9 will cause fio to
 report the values of completion latency below which 99.5% and 99.9% of
 the observed latencies fell, respectively.
@@ -1458,13 +1872,13 @@
 used identically to normal parameters, with the caveat that when used on the
 command line, they must come after the ioengine.
 .TP
-.BI (cpu)cpuload \fR=\fPint
+.BI (cpuio)cpuload \fR=\fPint
 Attempt to use the specified percentage of CPU cycles.
 .TP
-.BI (cpu)cpuchunks \fR=\fPint
+.BI (cpuio)cpuchunks \fR=\fPint
 Split the load into cycles of the given time. In microseconds.
 .TP
-.BI (cpu)exit_on_io_done \fR=\fPbool
+.BI (cpuio)exit_on_io_done \fR=\fPbool
 Detect when IO threads are done, then exit.
 .TP
 .BI (libaio)userspace_reap
@@ -1475,6 +1889,10 @@
 enabled when polling for a minimum of 0 events (eg when
 iodepth_batch_complete=0).
 .TP
+.BI (pvsync2)hipri
+Set RWF_HIPRI on IO, indicating to the kernel that it's of
+higher priority than normal.
+.TP
 .BI (net,netsplice)hostname \fR=\fPstr
 The host name or IP address to use for TCP or UDP based IO.
 If the job is a TCP listener or UDP reader, the hostname is not
@@ -1547,7 +1965,7 @@
 File will be used as a block donor (swap extents between files)
 .TP
 .BI (e4defrag,inplace) \fR=\fPint
-Configure donor file block allocation strategy		
+Configure donor file block allocation strategy
 .RS
 .BI 0(default) :
 Preallocate donor's file on init
@@ -1556,6 +1974,9 @@
 allocate space immediately inside defragment event, and free right after event
 .RE
 .TP
+.BI (rbd)clustername \fR=\fPstr
+Specifies the name of the ceph cluster.
+.TP
 .BI (rbd)rbdname \fR=\fPstr
 Specifies the name of the RBD.
 .TP
@@ -1563,13 +1984,18 @@
 Specifies the name of the Ceph pool containing the RBD.
 .TP
 .BI (rbd)clientname \fR=\fPstr
-Specifies the username (without the 'client.' prefix) used to access the Ceph cluster.
+Specifies the username (without the 'client.' prefix) used to access the Ceph
+cluster. If the clustername is specified, the clientname shall be the full
+type.id string. If no type. prefix is given, fio will add 'client.' by default.
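+For example, a clientname of 'admin' is then treated as 'client.admin'.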
+.TP
+.BI (mtd)skipbad \fR=\fPbool
+Skip operations against known bad blocks.
 .SH OUTPUT
 While running, \fBfio\fR will display the status of the created jobs.  For
 example:
 .RS
 .P
-Threads: 1: [_r] [24.8% done] [ 13509/  8334 kb/s] [eta 00h:01m:31s]
+Jobs: 1: [_r] [24.8% done] [ 13509/  8334 kb/s] [eta 00h:01m:31s]
 .RE
 .P
 The characters in the first set of brackets denote the current status of each
@@ -1653,7 +2079,9 @@
 .TP
 .B cpu
 CPU usage statistics. Includes user and system time, number of context switches
-this thread went through and number of major and minor page faults.
+this thread went through and number of major and minor page faults. The CPU
+utilization numbers are averages for the jobs in that reporting group, while
+the context and fault counters are summed.
 .TP
 .B IO depths
 Distribution of I/O depths.  Each depth includes everything less than (or equal)
@@ -1729,7 +2157,7 @@
 .P
 Read status:
 .RS
-.B Total I/O \fR(KB)\fP, bandwidth \fR(KB/s)\fP, IOPS, runtime \fR(ms)\fP
+.B Total I/O \fR(KiB)\fP, bandwidth \fR(KiB/s)\fP, IOPS, runtime \fR(ms)\fP
 .P
 Submission latency:
 .RS
@@ -1755,7 +2183,7 @@
 .P
 Write status:
 .RS
-.B Total I/O \fR(KB)\fP, bandwidth \fR(KB/s)\fP, IOPS, runtime \fR(ms)\fP
+.B Total I/O \fR(KiB)\fP, bandwidth \fR(KiB/s)\fP, IOPS, runtime \fR(ms)\fP
 .P
 Submission latency:
 .RS
@@ -1808,11 +2236,258 @@
 .P
 Error Info (dependent on continue_on_error, default off):
 .RS
-.B total # errors, first error code 
+.B total # errors, first error code
 .RE
 .P
 .B text description (if provided in config - appears on newline)
 .RE
+.SH TRACE FILE FORMAT
+There are two trace file formats that you can encounter. The older (v1) format
+is unsupported since version 1.20-rc3 (March 2008). It is still described
+below in case you get an old trace and want to understand it.
+
+In any case the trace is a simple text file with a single action per line.
+
+.P
+.B Trace file format v1
+.RS
+Each line represents a single io action in the following format:
+
+rw, offset, length
+
+where rw=0/1 for read/write, and the offset and length entries are in bytes.
+
+This format is not supported in Fio versions >= 1.20-rc3.
+
+.RE
+.P
+.B Trace file format v2
+.RS
+The second version of the trace file format was added in Fio version 1.17.
+It allows one to access more than one file per trace and has a bigger set of
+possible file actions.
+
+The first line of the trace file has to be:
+
+\fBfio version 2 iolog\fR
+
+Following this can be lines in two different formats, which are described below.
+The file management format:
+
+\fBfilename action\fR
+
+The filename is given as an absolute path. The action can be one of these:
+
+.P
+.PD 0
+.RS
+.TP
+.B add
+Add the given filename to the trace
+.TP
+.B open
+Open the file with the given filename. The filename has to have been previously
+added with the \fBadd\fR action.
+.TP
+.B close
+Close the file with the given filename. The file must have previously been
+opened.
+.RE
+.PD
+.P
+
+The file io action format:
+
+\fBfilename action offset length\fR
+
+The filename is given as an absolute path, and has to have been added and opened
+before it can be used with this format. The offset and length are given in
+bytes. The action can be one of these:
+
+.P
+.PD 0
+.RS
+.TP
+.B wait
+Wait for 'offset' microseconds. Wait times below 100 microseconds are discarded.  The time is
+relative to the previous wait statement.
+.TP
+.B read
+Read \fBlength\fR bytes beginning from \fBoffset\fR
+.TP
+.B write
+Write \fBlength\fR bytes beginning from \fBoffset\fR
+.TP
+.B sync
+fsync() the file
+.TP
+.B datasync
+fdatasync() the file
+.TP
+.B trim
+trim the given file from the given \fBoffset\fR for \fBlength\fR bytes
+.RE
+.PD
+.P
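+A minimal example of a v2 trace, using a hypothetical file /tmp/testfile, that
+writes and then reads back 4096 bytes at the start of the file, might look
+like this:
+
+fio version 2 iolog
+.br
+/tmp/testfile add
+.br
+/tmp/testfile open
+.br
+/tmp/testfile write 0 4096
+.br
+/tmp/testfile read 0 4096
+.br
+/tmp/testfile close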
+
+.SH CPU IDLENESS PROFILING
+In some cases, we want to understand the CPU overhead of a test. For example,
+we might test patches specifically to see whether they reduce CPU usage.
+fio implements a balloon approach to create a thread per CPU that runs at
+idle priority, meaning that it only runs when nobody else needs the cpu.
+By measuring the amount of work completed by the thread, idleness of each
+CPU can be derived accordingly.
+
+A unit of work is defined as touching a full page of unsigned characters. The
+mean and standard deviation of the time to complete a unit of work are reported
+in the "unit work" section. Options can be chosen to report detailed percpu
+idleness or overall system idleness by aggregating percpu stats.
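+
+Detailed percpu or aggregated system reporting is typically selected on the
+command line, for example via fio's \fB\-\-idle\-prof\fR option:
+
+\fBfio \-\-idle\-prof=percpu <job file>\fR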
+
+.SH VERIFICATION AND TRIGGERS
+Fio is usually run in one of two ways, when data verification is done. The
+first is a normal write job of some sort with verify enabled. When the
+write phase has completed, fio switches to reads and verifies everything
+it wrote. The second model is running just the write phase, and then later
+on running the same job (but with reads instead of writes) to repeat the
+same IO patterns and verify the contents. Both of these methods depend
+on the write phase being completed, as fio otherwise has no idea how much
+data was written.
+
+With verification triggers, fio supports dumping the current write state
+to local files. Then a subsequent read verify workload can load this state
+and know exactly where to stop. This is useful for testing cases where
+power is cut to a server in a managed fashion, for instance.
+
+A verification trigger consists of two things:
+
+.RS
+Storing the write state of each job
+.LP
+Executing a trigger command
+.RE
+
+The write state is relatively small, on the order of hundreds of bytes
+to single kilobytes. It contains information on the number of completions
+done, the last X completions, etc.
+
+A trigger is invoked either through creation (\fBtouch\fR) of a specified
+file in the system, or through a timeout setting. If fio is run with
+\fB\-\-trigger\-file=/tmp/trigger-file\fR, then it will continually check for
+the existence of /tmp/trigger-file. When it sees this file, it will
+fire off the trigger (thus saving state, and executing the trigger
+command).
+
+For client/server runs, there's both a local and remote trigger. If
+fio is running as a server backend, it will send the job states back
+to the client for safe storage, then execute the remote trigger, if
+specified. If a local trigger is specified, the server will still send
+back the write state, but the client will then execute the trigger.
+
+.RE
+.P
+.B Verification trigger example
+.RS
+
+Let's say we want to run a powercut test on the remote machine 'server'.
+Our write workload is in write-test.fio. We want to cut power to 'server'
+at some point during the run, and we'll run this test from the safety
+of our local machine, 'localbox'. On the server, we'll start the fio
+backend normally:
+
+server# \fBfio \-\-server\fR
+
+and on the client, we'll fire off the workload:
+
+localbox$ \fBfio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger\-remote="bash \-c "echo b > /proc/sysrq-trigger""\fR
+
+We set \fB/tmp/my-trigger\fR as the trigger file, and we tell fio to execute
+
+\fBecho b > /proc/sysrq-trigger\fR
+
+on the server once it has received the trigger and sent us the write
+state. This will work, but it's not \fIreally\fR cutting power to the server,
+it's merely abruptly rebooting it. If we have a remote way of cutting
+power to the server through IPMI or similar, we could do that through
+a local trigger command instead. Let's assume we have a script that does
+IPMI reboot of a given hostname, ipmi-reboot. On localbox, we could
+then have run fio with a local trigger instead:
+
+localbox$ \fBfio \-\-client=server \-\-trigger\-file=/tmp/my\-trigger \-\-trigger="ipmi-reboot server"\fR
+
+For this case, fio would wait for the server to send us the write state,
+then execute 'ipmi-reboot server' when that happened.
+
+.RE
+.P
+.B Loading verify state
+.RS
+To load stored write state, the read verification job file must contain
+the verify_state_load option. If that is set, fio will load the previously
+stored state. For a local fio run this is done by loading the files directly,
+and on a client/server run, the server backend will ask the client to send
+the files over and load them from there.
+
+.RE
+
+.SH LOG FILE FORMATS
+
+Fio supports a variety of log file formats, for logging latencies, bandwidth,
+and IOPS. The logs share a common format, which looks like this:
+
+.B time (msec), value, data direction, offset
+
+Time for the log entry is always in milliseconds. The value logged depends
+on the type of log; it will be one of the following:
+
+.P
+.PD 0
+.TP
+.B Latency log
+Value is latency in usecs
+.TP
+.B Bandwidth log
+Value is in KiB/sec
+.TP
+.B IOPS log
+Value is in IOPS
+.PD
+.P
+
+Data direction is one of the following:
+
+.P
+.PD 0
+.TP
+.B 0
+IO is a READ
+.TP
+.B 1
+IO is a WRITE
+.TP
+.B 2
+IO is a TRIM
+.PD
+.P
+
+The \fIoffset\fR is the offset, in bytes, from the start of the file, for that
+particular IO. The logging of the offset can be toggled with \fBlog_offset\fR.
+
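+As an illustrative example, a bandwidth log entry of
+
+.B 1000, 45056, 0, 65536
+
+would describe a read (data direction 0) at byte offset 65536, logged 1000
+milliseconds into the run, with a bandwidth value of 45056 KiB/sec (the
+numbers are purely illustrative).
+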
+If windowed logging is enabled through \fBlog_avg_msec\fR, then fio doesn't log
+individual IOs. Instead it logs the average values over the specified
+period of time. Since \fIdata direction\fR and \fIoffset\fR are per-IO values,
+they aren't applicable if windowed logging is enabled. If windowed logging
+is enabled and \fBlog_max_value\fR is set, then fio logs maximum values in
+that window instead of averages.
+
+For histogram logging the logs look like this:
+
+.B time (msec), data direction, block-size, bin 0, bin 1, ..., bin 1215
+
+Where 'bin i' gives the frequency of IO requests with a latency falling in
+the i-th bin. See \fBlog_hist_coarseness\fR for logging fewer bins.
+
+.RE
+
 .SH CLIENT / SERVER
 Normally you would run fio as a stand-alone application on the machine
 where the IO workload should be generated. However, it is also possible to
@@ -1830,34 +2505,34 @@
 socket. 'hostname' is either a hostname or IP address, and 'port' is the port to
 listen to (only valid for TCP/IP, not a local socket). Some examples:
 
-1) fio \-\-server
+1) \fBfio \-\-server\fR
 
    Start a fio server, listening on all interfaces on the default port (8765).
 
-2) fio \-\-server=ip:hostname,4444
+2) \fBfio \-\-server=ip:hostname,4444\fR
 
    Start a fio server, listening on IP belonging to hostname and on port 4444.
 
-3) fio \-\-server=ip6:::1,4444
+3) \fBfio \-\-server=ip6:::1,4444\fR
 
    Start a fio server, listening on IPv6 localhost ::1 and on port 4444.
 
-4) fio \-\-server=,4444
+4) \fBfio \-\-server=,4444\fR
 
    Start a fio server, listening on all interfaces on port 4444.
 
-5) fio \-\-server=1.2.3.4
+5) \fBfio \-\-server=1.2.3.4\fR
 
    Start a fio server, listening on IP 1.2.3.4 on the default port.
 
-6) fio \-\-server=sock:/tmp/fio.sock
+6) \fBfio \-\-server=sock:/tmp/fio.sock\fR
 
    Start a fio server, listening on the local socket /tmp/fio.sock.
 
 When a server is running, you can connect to it from a client. The client
 is run with:
 
-fio \-\-local-args \-\-client=server \-\-remote-args <job file(s)>
+\fBfio \-\-local-args \-\-client=server \-\-remote-args <job file(s)>\fR
 
 where \-\-local-args are arguments that are local to the client where it is
 running, 'server' is the connect string, and \-\-remote-args and <job file(s)>
@@ -1865,15 +2540,42 @@
 does on the server side, to allow IP/hostname/socket and port strings.
 You can connect to multiple clients as well, to do that you could run:
 
-fio \-\-client=server2 \-\-client=server2 <job file(s)>
+\fBfio \-\-client=server1 \-\-client=server2 <job file(s)>\fR
 
 If the job file is located on the fio server, then you can tell the server
 to load a local file as well. This is done by using \-\-remote-config:
 
-fio \-\-client=server \-\-remote-config /path/to/file.fio
+\fBfio \-\-client=server \-\-remote-config /path/to/file.fio\fR
 
-Then the fio serer will open this local (to the server) job file instead
+Then fio will open this local (to the server) job file instead
 of being passed one from the client.
+
+If you have many servers (for example, 100 VMs/containers), you can input a pathname
+of a file containing host IPs/names as the parameter value for the \-\-client option.
+For example, here is a "host.list" file containing 2 hostnames:
+
+host1.your.dns.domain
+.br
+host2.your.dns.domain
+
+The fio command would then be:
+
+\fBfio \-\-client=host.list <job file>\fR
+
+In this mode, you cannot input server-specific parameters or job files, and all
+servers receive the same job file.
+
+In order to enable fio \-\-client runs utilizing a shared filesystem from multiple hosts,
+fio \-\-client now prepends the IP address of the server to the filename. For example,
+if fio is using directory /mnt/nfs/fio and is writing filename fileio.tmp,
+with a \-\-client hostfile
+containing two hostnames h1 and h2 with IP addresses 192.168.10.120 and 192.168.10.121, then
+fio will create two files:
+
+/mnt/nfs/fio/192.168.10.120.fileio.tmp
+.br
+/mnt/nfs/fio/192.168.10.121.fileio.tmp
+
 .SH AUTHORS
 
 .B fio
@@ -1889,4 +2591,10 @@
 For further documentation see \fBHOWTO\fR and \fBREADME\fR.
 .br
 Sample jobfiles are available in the \fBexamples\fR directory.
+.br
+These are typically located under /usr/share/doc/fio.
 
+\fBHOWTO\fR:  http://git.kernel.dk/?p=fio.git;a=blob_plain;f=HOWTO
+.br
+\fBREADME\fR: http://git.kernel.dk/?p=fio.git;a=blob_plain;f=README
+.br
diff --git a/fio.c b/fio.c
index 9adc29a..7b3a50b 100644
--- a/fio.c
+++ b/fio.c
@@ -30,6 +30,10 @@
 
 int main(int argc, char *argv[], char *envp[])
 {
+	int ret = 1;
+
+	compiletime_assert(TD_NR <= TD_ENG_FLAG_SHIFT, "TD_ENG_FLAG_SHIFT");
+
 	if (initialize_fio(envp))
 		return 1;
 
@@ -37,8 +41,17 @@
 #error "No available clock source!"
 #endif
 
+	if (fio_server_create_sk_key())
+		goto done;
+
 	if (parse_options(argc, argv))
-		return 1;
+		goto done_key;
+
+	/*
+	 * line buffer stdout to avoid output lines from multiple
+	 * threads getting mixed
+	 */
+	setvbuf(stdout, NULL, _IOLBF, 0);
 
 	fio_time_init();
 
@@ -46,8 +59,14 @@
 		set_genesis_time();
 
 		if (fio_start_all_clients())
-			return 1;
-		return fio_handle_clients(&fio_client_ops);
+			goto done_key;
+		ret = fio_handle_clients(&fio_client_ops);
 	} else
-		return fio_backend();
+		ret = fio_backend(NULL);
+
+done_key:
+	fio_server_destroy_sk_key();
+done:
+	deinitialize_fio();
+	return ret;
 }
diff --git a/fio.h b/fio.h
index f688084..e11a039 100644
--- a/fio.h
+++ b/fio.h
@@ -25,21 +25,25 @@
 #include "debug.h"
 #include "file.h"
 #include "io_ddir.h"
-#include "ioengine.h"
+#include "ioengines.h"
 #include "iolog.h"
 #include "helpers.h"
 #include "options.h"
 #include "profile.h"
 #include "fio_time.h"
 #include "gettime.h"
-#include "lib/getopt.h"
+#include "oslib/getopt.h"
 #include "lib/rand.h"
 #include "lib/rbtree.h"
+#include "lib/num2str.h"
 #include "client.h"
 #include "server.h"
 #include "stat.h"
 #include "flow.h"
+#include "io_u.h"
 #include "io_u_queue.h"
+#include "workqueue.h"
+#include "steadystate.h"
 
 #ifdef CONFIG_SOLARISAIO
 #include <sys/asynch.h>
@@ -55,6 +59,10 @@
 #define MPOL_LOCAL MPOL_MAX
 #endif
 
+#ifdef CONFIG_CUDA
+#include <cuda.h>
+#endif
+
 /*
  * offset generator types
  */
@@ -64,17 +72,21 @@
 };
 
 enum {
-	TD_F_VER_BACKLOG	= 1,
-	TD_F_TRIM_BACKLOG	= 2,
-	TD_F_READ_IOLOG		= 4,
-	TD_F_REFILL_BUFFERS	= 8,
-	TD_F_SCRAMBLE_BUFFERS	= 16,
-	TD_F_VER_NONE		= 32,
-	TD_F_PROFILE_OPS	= 64,
-	TD_F_COMPRESS		= 128,
-	TD_F_NOIO		= 256,
-	TD_F_COMPRESS_LOG	= 512,
-	TD_F_VSTATE_SAVED	= 1024,
+	TD_F_VER_BACKLOG	= 1U << 0,
+	TD_F_TRIM_BACKLOG	= 1U << 1,
+	TD_F_READ_IOLOG		= 1U << 2,
+	TD_F_REFILL_BUFFERS	= 1U << 3,
+	TD_F_SCRAMBLE_BUFFERS	= 1U << 4,
+	TD_F_VER_NONE		= 1U << 5,
+	TD_F_PROFILE_OPS	= 1U << 6,
+	TD_F_COMPRESS		= 1U << 7,
+	TD_F_RESERVED		= 1U << 8, /* not used */
+	TD_F_COMPRESS_LOG	= 1U << 9,
+	TD_F_VSTATE_SAVED	= 1U << 10,
+	TD_F_NEED_LOCK		= 1U << 11,
+	TD_F_CHILD		= 1U << 12,
+	TD_F_NO_PROGRESS        = 1U << 13,
+	TD_F_REGROW_LOGS	= 1U << 14,
 };
 
 enum {
@@ -91,17 +103,48 @@
 	FIO_RAND_SEQ_RAND_TRIM_OFF,
 	FIO_RAND_START_DELAY,
 	FIO_DEDUPE_OFF,
+	FIO_RAND_POISSON_OFF,
+	FIO_RAND_ZONE_OFF,
+	FIO_RAND_POISSON2_OFF,
+	FIO_RAND_POISSON3_OFF,
 	FIO_RAND_NR_OFFS,
 };
 
+enum {
+	IO_MODE_INLINE = 0,
+	IO_MODE_OFFLOAD = 1,
+
+	RATE_PROCESS_LINEAR = 0,
+	RATE_PROCESS_POISSON = 1,
+};
+
+enum {
+	F_ADV_NONE = 0,
+	F_ADV_TYPE,
+	F_ADV_RANDOM,
+	F_ADV_SEQUENTIAL,
+};
+
+/*
+ * Per-thread/process specific data. Only used for the network client
+ * for now.
+ */
+void sk_out_assign(struct sk_out *);
+void sk_out_drop(void);
+
+struct zone_split_index {
+	uint8_t size_perc;
+	uint8_t size_perc_prev;
+};
+
 /*
  * This describes a single thread/process executing a fio job.
  */
 struct thread_data {
-	struct thread_options o;
+	struct flist_head opt_list;
 	unsigned long flags;
+	struct thread_options o;
 	void *eo;
-	char verror[FIO_VERROR_SIZE];
 	pthread_t thread;
 	unsigned int thread_number;
 	unsigned int subjob_number;
@@ -112,11 +155,14 @@
 
 	struct io_log *slat_log;
 	struct io_log *clat_log;
+	struct io_log *clat_hist_log;
 	struct io_log *lat_log;
 	struct io_log *bw_log;
 	struct io_log *iops_log;
 
-	struct tp_data *tp_data;
+	struct workqueue log_compress_wq;
+
+	struct thread_data *parent;
 
 	uint64_t stat_io_bytes[DDIR_RWDIR_CNT];
 	struct timeval bw_sample_time;
@@ -124,13 +170,6 @@
 	uint64_t stat_io_blocks[DDIR_RWDIR_CNT];
 	struct timeval iops_sample_time;
 
-	/*
-	 * Tracks the last iodepth number of completed writes, if data
-	 * verification is enabled
-	 */
-	uint64_t *last_write_comp;
-	unsigned int last_write_idx;
-
 	volatile int update_rusage;
 	struct fio_mutex *rusage_sem;
 	struct rusage ru_start;
@@ -147,6 +186,15 @@
 		unsigned int next_file;
 		struct frand_state next_file_state;
 	};
+	union {
+		struct zipf_state next_file_zipf;
+		struct gauss_state next_file_gauss;
+	};
+	union {
+		double zipf_theta;
+		double pareto_h;
+		double gauss_dev;
+	};
 	int error;
 	int sig;
 	int done;
@@ -164,8 +212,6 @@
 	void *iolog_buf;
 	FILE *iolog_f;
 
-	char *sysfs_root;
-
 	unsigned long rand_seeds[FIO_RAND_NR_OFFS];
 
 	struct frand_state bsrange_state;
@@ -176,6 +222,9 @@
 	struct frand_state buf_state;
 	struct frand_state buf_state_prev;
 	struct frand_state dedupe_state;
+	struct frand_state zone_state;
+
+	struct zone_split_index **zone_state_index;
 
 	unsigned int verify_batch;
 	unsigned int trim_batch;
@@ -189,6 +238,13 @@
 	 * to any of the available IO engines.
 	 */
 	struct ioengine_ops *io_ops;
+	int io_ops_init;
+
+	/*
+	 * IO engine private data and dlhandle.
+	 */
+	void *io_ops_data;
+	void *io_ops_dlhandle;
 
 	/*
 	 * Queue depth of io_u's that fio MIGHT do
@@ -227,10 +283,18 @@
 	 * Rate state
 	 */
 	uint64_t rate_bps[DDIR_RWDIR_CNT];
-	long rate_pending_usleep[DDIR_RWDIR_CNT];
+	uint64_t rate_next_io_time[DDIR_RWDIR_CNT];
 	unsigned long rate_bytes[DDIR_RWDIR_CNT];
 	unsigned long rate_blocks[DDIR_RWDIR_CNT];
+	unsigned long long rate_io_issue_bytes[DDIR_RWDIR_CNT];
 	struct timeval lastrate[DDIR_RWDIR_CNT];
+	int64_t last_usec[DDIR_RWDIR_CNT];
+	struct frand_state poisson_state[DDIR_RWDIR_CNT];
+
+	/*
+	 * Enforced rate submission/completion workqueue
+	 */
+	struct workqueue io_wq;
 
 	uint64_t total_io_size;
 	uint64_t fill_device_size;
@@ -248,10 +312,11 @@
 	uint64_t io_blocks[DDIR_RWDIR_CNT];
 	uint64_t this_io_blocks[DDIR_RWDIR_CNT];
 	uint64_t io_bytes[DDIR_RWDIR_CNT];
-	uint64_t io_skip_bytes;
 	uint64_t this_io_bytes[DDIR_RWDIR_CNT];
+	uint64_t io_skip_bytes;
 	uint64_t zone_bytes;
 	struct fio_mutex *mutex;
+	uint64_t bytes_done[DDIR_RWDIR_CNT];
 
 	/*
 	 * State for random io, a bitmap of blocks done vs not done
@@ -260,6 +325,7 @@
 
 	struct timeval start;	/* start of this loop */
 	struct timeval epoch;	/* time job was started */
+	unsigned long long unix_epoch; /* Time job was started, unix epoch based. */
 	struct timeval last_issue;
 	long time_offset;
 	struct timeval tv_cache;
@@ -342,6 +408,22 @@
 	void *prof_data;
 
 	void *pinned_mem;
+
+	struct steadystate_data ss;
+
+	char verror[FIO_VERROR_SIZE];
+
+#ifdef CONFIG_CUDA
+	/*
+	 * for GPU memory management
+	 */
+	int gpu_dev_cnt;
+	int gpu_dev_id;
+	CUdevice  cu_dev;
+	CUcontext cu_ctx;
+	CUdeviceptr dev_mem_ptr;
+#endif	
+
 };
 
 /*
@@ -364,12 +446,23 @@
 	} while (0)
 
 
-#define td_clear_error(td)		\
-	(td)->error = 0;
-#define td_verror(td, err, func)	\
-	__td_verror((td), (err), strerror((err)), (func))
-#define td_vmsg(td, err, msg, func)	\
-	__td_verror((td), (err), (msg), (func))
+#define td_clear_error(td)		do {		\
+	(td)->error = 0;				\
+	if ((td)->parent)				\
+		(td)->parent->error = 0;		\
+} while (0)
+
+#define td_verror(td, err, func)	do {			\
+	__td_verror((td), (err), strerror((err)), (func));	\
+	if ((td)->parent)					\
+		__td_verror((td)->parent, (err), strerror((err)), (func)); \
+} while (0)
+
+#define td_vmsg(td, err, msg, func)	do {			\
+	__td_verror((td), (err), (msg), (func));		\
+	if ((td)->parent)					\
+		__td_verror((td)->parent, (err), (msg), (func));	\
+} while (0)
 
 #define __fio_stringify_1(x)	#x
 #define __fio_stringify(x)	__fio_stringify_1(x)
@@ -387,7 +480,6 @@
 extern int eta_print;
 extern int eta_new_line;
 extern unsigned long done_secs;
-extern char *job_section;
 extern int fio_gtod_offload;
 extern int fio_gtod_cpu;
 extern enum fio_cs fio_clock_source;
@@ -399,12 +491,11 @@
 extern int log_syslog;
 extern int status_interval;
 extern const char fio_version_string[];
-extern int helper_do_stat;
-extern pthread_cond_t helper_cond;
 extern char *trigger_file;
 extern char *trigger_cmd;
 extern char *trigger_remote_cmd;
 extern long long trigger_timeout;
+extern char *aux_path;
 
 extern struct thread_data *threads;
 
@@ -413,13 +504,13 @@
 	assert(!(io_u->ddir == DDIR_WRITE && !td_write(td)));
 }
 
-#define REAL_MAX_JOBS		2048
+#define REAL_MAX_JOBS		4096
 
 static inline int should_fsync(struct thread_data *td)
 {
 	if (td->last_was_sync)
 		return 0;
-	if (td_write(td) || td_rw(td) || td->o.override_sync)
+	if (td_write(td) || td->o.override_sync)
 		return 1;
 
 	return 0;
@@ -432,11 +523,12 @@
 extern int __must_check parse_options(int, char **);
 extern int parse_jobs_ini(char *, int, int, int);
 extern int parse_cmd_line(int, char **, int);
-extern int fio_backend(void);
+extern int fio_backend(struct sk_out *);
 extern void reset_fio_state(void);
-extern void clear_io_state(struct thread_data *);
-extern int fio_options_parse(struct thread_data *, char **, int, int);
+extern void clear_io_state(struct thread_data *, int);
+extern int fio_options_parse(struct thread_data *, char **, int);
 extern void fio_keywords_init(void);
+extern void fio_keywords_exit(void);
 extern int fio_cmd_option_parse(struct thread_data *, const char *, char *);
 extern int fio_cmd_ioengine_option_parse(struct thread_data *, const char *, char *);
 extern void fio_fill_default_options(struct thread_data *);
@@ -444,18 +536,20 @@
 extern void fio_options_set_ioengine_opts(struct option *long_options, struct thread_data *td);
 extern void fio_options_dup_and_init(struct option *);
 extern void fio_options_mem_dupe(struct thread_data *);
-extern void options_mem_dupe(void *data, struct fio_option *options);
 extern void td_fill_rand_seeds(struct thread_data *);
+extern void td_fill_verify_state_seed(struct thread_data *);
 extern void add_job_opts(const char **, int);
-extern char *num2str(uint64_t, int, int, int, int);
 extern int ioengine_load(struct thread_data *);
-extern int parse_dryrun(void);
+extern bool parse_dryrun(void);
 extern int fio_running_or_pending_io_threads(void);
 extern int fio_set_fd_nonblocking(int, const char *);
+extern void sig_show_status(int sig);
+extern struct thread_data *get_global_options(void);
 
 extern uintptr_t page_mask;
 extern uintptr_t page_size;
 extern int initialize_fio(char *envp[]);
+extern void deinitialize_fio(void);
 
 #define FIO_GETOPT_JOB		0x89000000
 #define FIO_GETOPT_IOENGINE	0x98000000
@@ -487,20 +581,44 @@
 	TD_FINISHING,
 	TD_EXITED,
 	TD_REAPED,
+	TD_LAST,
+	TD_NR,
 };
 
+#define TD_ENG_FLAG_SHIFT	16
+#define TD_ENG_FLAG_MASK	((1U << 16) - 1)
+
+static inline enum fio_ioengine_flags td_ioengine_flags(struct thread_data *td)
+{
+	return (enum fio_ioengine_flags)
+		((td->flags >> TD_ENG_FLAG_SHIFT) & TD_ENG_FLAG_MASK);
+}
+
+static inline void td_set_ioengine_flags(struct thread_data *td)
+{
+	td->flags = (~(TD_ENG_FLAG_MASK << TD_ENG_FLAG_SHIFT) & td->flags) |
+		    (td->io_ops->flags << TD_ENG_FLAG_SHIFT);
+}
+
+static inline bool td_ioengine_flagged(struct thread_data *td,
+				       enum fio_ioengine_flags flags)
+{
+	return ((td->flags >> TD_ENG_FLAG_SHIFT) & flags) != 0;
+}
+
 extern void td_set_runstate(struct thread_data *, int);
 extern int td_bump_runstate(struct thread_data *, int);
 extern void td_restore_runstate(struct thread_data *, int);
+extern const char *runstate_to_name(int runstate);
 
 /*
  * Allow 60 seconds for a job to quit on its own, otherwise reap with
  * a vengeance.
  */
-#define FIO_REAP_TIMEOUT	60
+#define FIO_REAP_TIMEOUT	300
 
-#define TERMINATE_ALL		(-1)
-extern void fio_terminate_threads(int);
+#define TERMINATE_ALL		(-1U)
+extern void fio_terminate_threads(unsigned int);
 extern void fio_mark_td_terminate(struct thread_data *);
 
 /*
@@ -512,6 +630,11 @@
 extern void free_io_mem(struct thread_data *);
 extern void free_threads_shm(void);
 
+#ifdef FIO_INTERNAL
+#define PTR_ALIGN(ptr, mask)	\
+	(char *) (((uintptr_t) (ptr) + (mask)) & ~(mask))
+#endif
+
 /*
  * Reset stats after ramp time completes
  */
@@ -525,6 +648,10 @@
 extern int load_blktrace(struct thread_data *, const char *, int);
 #endif
 
+extern int io_queue_event(struct thread_data *td, struct io_u *io_u, int *ret,
+		   enum fio_ddir ddir, uint64_t *bytes_issued, int from_verify,
+		   struct timeval *comp_time);
+
 /*
  * Latency target helpers
  */
@@ -532,6 +659,9 @@
 extern void lat_target_init(struct thread_data *);
 extern void lat_target_reset(struct thread_data *);
 
+/*
+ * Iterates all threads/processes within all the defined jobs
+ */
 #define for_each_td(td, i)	\
 	for ((i) = 0, (td) = &threads[0]; (i) < (int) thread_number; (i)++, (td)++)
 #define for_each_file(td, f, i)	\
@@ -550,17 +680,17 @@
 	}	\
 } while (0)
 
-static inline int fio_fill_issue_time(struct thread_data *td)
+static inline bool fio_fill_issue_time(struct thread_data *td)
 {
 	if (td->o.read_iolog_file ||
 	    !td->o.disable_clat || !td->o.disable_slat || !td->o.disable_bw)
-		return 1;
+		return true;
 
-	return 0;
+	return false;
 }
 
-static inline int __should_check_rate(struct thread_data *td,
-				      enum fio_ddir ddir)
+static inline bool __should_check_rate(struct thread_data *td,
+				       enum fio_ddir ddir)
 {
 	struct thread_options *o = &td->o;
 
@@ -569,24 +699,21 @@
 	 */
 	if (o->rate[ddir] || o->ratemin[ddir] || o->rate_iops[ddir] ||
 	    o->rate_iops_min[ddir])
-		return 1;
+		return true;
 
-	return 0;
+	return false;
 }
 
-static inline int should_check_rate(struct thread_data *td,
-				    uint64_t *bytes_done)
+static inline bool should_check_rate(struct thread_data *td)
 {
-	int ret = 0;
+	if (td->bytes_done[DDIR_READ] && __should_check_rate(td, DDIR_READ))
+		return true;
+	if (td->bytes_done[DDIR_WRITE] && __should_check_rate(td, DDIR_WRITE))
+		return true;
+	if (td->bytes_done[DDIR_TRIM] && __should_check_rate(td, DDIR_TRIM))
+		return true;
 
-	if (bytes_done[DDIR_READ])
-		ret |= __should_check_rate(td, DDIR_READ);
-	if (bytes_done[DDIR_WRITE])
-		ret |= __should_check_rate(td, DDIR_WRITE);
-	if (bytes_done[DDIR_TRIM])
-		ret |= __should_check_rate(td, DDIR_TRIM);
-
-	return ret;
+	return false;
 }
 
 static inline unsigned int td_max_bs(struct thread_data *td)
@@ -605,9 +732,9 @@
 	return min(td->o.min_bs[DDIR_TRIM], min_bs);
 }
 
-static inline int is_power_of_2(uint64_t val)
+static inline bool td_async_processing(struct thread_data *td)
 {
-	return (val != 0 && ((val & (val - 1)) == 0));
+	return (td->flags & TD_F_NEED_LOCK) != 0;
 }
 
 /*
@@ -616,39 +743,67 @@
  */
 static inline void td_io_u_lock(struct thread_data *td)
 {
-	if (td->o.verify_async)
+	if (td_async_processing(td))
 		pthread_mutex_lock(&td->io_u_lock);
 }
 
 static inline void td_io_u_unlock(struct thread_data *td)
 {
-	if (td->o.verify_async)
+	if (td_async_processing(td))
 		pthread_mutex_unlock(&td->io_u_lock);
 }
 
 static inline void td_io_u_free_notify(struct thread_data *td)
 {
-	if (td->o.verify_async)
+	if (td_async_processing(td))
 		pthread_cond_signal(&td->free_cond);
 }
 
+static inline void td_flags_clear(struct thread_data *td, unsigned int *flags,
+				  unsigned int value)
+{
+	if (!td_async_processing(td))
+		*flags &= ~value;
+	else
+		__sync_fetch_and_and(flags, ~value);
+}
+
+static inline void td_flags_set(struct thread_data *td, unsigned int *flags,
+				unsigned int value)
+{
+	if (!td_async_processing(td))
+		*flags |= value;
+	else
+		__sync_fetch_and_or(flags, value);
+}
+
 extern const char *fio_get_arch_string(int);
 extern const char *fio_get_os_string(int);
 
 #ifdef FIO_INTERNAL
-#define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0])))
+#define ARRAY_SIZE(x)    (sizeof((x)) / (sizeof((x)[0])))
+#define FIELD_SIZE(s, f) (sizeof(((typeof(s))0)->f))
 #endif
 
 enum {
-	FIO_OUTPUT_TERSE	= 0,
-	FIO_OUTPUT_JSON,
-	FIO_OUTPUT_NORMAL,
+	__FIO_OUTPUT_TERSE	= 0,
+	__FIO_OUTPUT_JSON	= 1,
+	__FIO_OUTPUT_NORMAL	= 2,
+        __FIO_OUTPUT_JSON_PLUS  = 3,
+	FIO_OUTPUT_NR		= 4,
+
+	FIO_OUTPUT_TERSE	= 1U << __FIO_OUTPUT_TERSE,
+	FIO_OUTPUT_JSON		= 1U << __FIO_OUTPUT_JSON,
+	FIO_OUTPUT_NORMAL	= 1U << __FIO_OUTPUT_NORMAL,
+	FIO_OUTPUT_JSON_PLUS    = 1U << __FIO_OUTPUT_JSON_PLUS,
 };
 
 enum {
 	FIO_RAND_DIST_RANDOM	= 0,
 	FIO_RAND_DIST_ZIPF,
 	FIO_RAND_DIST_PARETO,
+	FIO_RAND_DIST_GAUSS,
+	FIO_RAND_DIST_ZONED,
 };
 
 #define FIO_DEF_ZIPF		1.1
@@ -657,6 +812,7 @@
 enum {
 	FIO_RAND_GEN_TAUSWORTHE = 0,
 	FIO_RAND_GEN_LFSR,
+	FIO_RAND_GEN_TAUSWORTHE64,
 };
 
 enum {
diff --git a/fio_time.h b/fio_time.h
index 79f324a..b49cc82 100644
--- a/fio_time.h
+++ b/fio_time.h
@@ -1,6 +1,8 @@
 #ifndef FIO_TIME_H
 #define FIO_TIME_H
 
+#include "lib/types.h"
+
 struct thread_data;
 extern uint64_t utime_since(const struct timeval *,const  struct timeval *);
 extern uint64_t utime_since_now(const struct timeval *);
@@ -14,8 +16,10 @@
 extern uint64_t usec_sleep(struct thread_data *, unsigned long);
 extern void fill_start_time(struct timeval *);
 extern void set_genesis_time(void);
-extern int ramp_time_over(struct thread_data *);
-extern int in_ramp_time(struct thread_data *);
+extern bool ramp_time_over(struct thread_data *);
+extern bool in_ramp_time(struct thread_data *);
 extern void fio_time_init(void);
+extern void timeval_add_msec(struct timeval *, unsigned int);
+extern void set_epoch_time(struct thread_data *, int);
 
 #endif
diff --git a/flist.h b/flist.h
index d453e79..b4fe6e6 100644
--- a/flist.h
+++ b/flist.h
@@ -177,6 +177,9 @@
 #define flist_first_entry(ptr, type, member) \
 	flist_entry((ptr)->next, type, member)
 
+#define flist_last_entry(ptr, type, member) \
+	flist_entry((ptr)->prev, type, member)
+
 /**
  * flist_for_each	-	iterate over a list
  * @pos:	the &struct flist_head to use as a loop counter.
diff --git a/flow.c b/flow.c
index f9d868d..42b6dd7 100644
--- a/flow.c
+++ b/flow.c
@@ -58,7 +58,7 @@
 	if (!flow) {
 		flow = smalloc(sizeof(*flow));
 		if (!flow) {
-			log_err("fio: smalloc pool exhausted\n");
+			fio_mutex_up(flow_lock);
 			return NULL;
 		}
 		flow->refs = 0;
diff --git a/gclient.c b/gclient.c
index 42bc761..928a1b7 100644
--- a/gclient.c
+++ b/gclient.c
@@ -13,6 +13,7 @@
 #include "graph.h"
 #include "gclient.h"
 #include "printing.h"
+#include "lib/pow2.h"
 
 static void gfio_display_ts(struct fio_client *client, struct thread_stat *ts,
 			    struct group_run_stats *rs);
@@ -47,7 +48,7 @@
 	{ "PrintFile", GTK_STOCK_PRINT, "Print", "<Control>P", NULL, G_CALLBACK(results_print) },
 	{ "CloseFile", GTK_STOCK_CLOSE, "Close", "<Control>W", NULL, G_CALLBACK(results_close) },
 };
-static gint results_nmenu_items = sizeof(results_menu_items) / sizeof(results_menu_items[0]);
+static gint results_nmenu_items = ARRAY_SIZE(results_menu_items);
 
 static const gchar *results_ui_string = " \
 	<ui> \
@@ -279,10 +280,6 @@
 	gdk_threads_leave();
 }
 
-extern int sum_stat_clients;
-extern struct thread_stat client_ts;
-extern struct group_run_stats client_gs;
-
 static int sum_stat_nr;
 
 static void gfio_thread_status_op(struct fio_client *client,
@@ -295,7 +292,7 @@
 	if (sum_stat_clients == 1)
 		return;
 
-	sum_thread_stats(&client_ts, &p->ts, sum_stat_nr);
+	sum_thread_stats(&client_ts, &p->ts, sum_stat_nr == 1);
 	sum_group_stats(&client_gs, &p->rs);
 
 	client_ts.members++;
@@ -367,29 +364,11 @@
 	sprintf(tmp, "%u", je->files_open);
 	gtk_entry_set_text(GTK_ENTRY(ge->eta.files), tmp);
 
-#if 0
-	if (je->m_rate[0] || je->m_rate[1] || je->t_rate[0] || je->t_rate[1]) {
-	if (je->m_rate || je->t_rate) {
-		char *tr, *mr;
-
-		mr = num2str(je->m_rate, 4, 0, i2p);
-		tr = num2str(je->t_rate, 4, 0, i2p);
-		gtk_entry_set_text(GTK_ENTRY(ge->eta);
-		p += sprintf(p, ", CR=%s/%s KB/s", tr, mr);
-		free(tr);
-		free(mr);
-	} else if (je->m_iops || je->t_iops)
-		p += sprintf(p, ", CR=%d/%d IOPS", je->t_iops, je->m_iops);
-
-	gtk_entry_set_text(GTK_ENTRY(ge->eta.cr_bw), "---");
-	gtk_entry_set_text(GTK_ENTRY(ge->eta.cr_iops), "---");
-	gtk_entry_set_text(GTK_ENTRY(ge->eta.cw_bw), "---");
-	gtk_entry_set_text(GTK_ENTRY(ge->eta.cw_iops), "---");
-#endif
-
 	if (je->eta_sec != INT_MAX && je->nr_running) {
 		char *iops_str[DDIR_RWDIR_CNT];
 		char *rate_str[DDIR_RWDIR_CNT];
+		char *rate_alt[DDIR_RWDIR_CNT];
+		char tmp[128];
 		int i;
 
 		if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running)
@@ -400,19 +379,26 @@
 			sprintf(output, "%3.1f%% done", perc);
 		}
 
-		rate_str[0] = num2str(je->rate[0], 5, 10, i2p, 0);
-		rate_str[1] = num2str(je->rate[1], 5, 10, i2p, 0);
-		rate_str[2] = num2str(je->rate[2], 5, 10, i2p, 0);
+		iops_str[0] = num2str(je->iops[0], 4, 1, 0, N2S_PERSEC);
+		iops_str[1] = num2str(je->iops[1], 4, 1, 0, N2S_PERSEC);
+		iops_str[2] = num2str(je->iops[2], 4, 1, 0, N2S_PERSEC);
 
-		iops_str[0] = num2str(je->iops[0], 4, 1, 0, 0);
-		iops_str[1] = num2str(je->iops[1], 4, 1, 0, 0);
-		iops_str[2] = num2str(je->iops[2], 4, 1, 0, 0);
-
-		gtk_entry_set_text(GTK_ENTRY(ge->eta.read_bw), rate_str[0]);
+		rate_str[0] = num2str(je->rate[0], 4, 10, i2p, N2S_BYTEPERSEC);
+		rate_alt[0] = num2str(je->rate[0], 4, 10, !i2p, N2S_BYTEPERSEC);
+		snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[0], rate_alt[0]);
+		gtk_entry_set_text(GTK_ENTRY(ge->eta.read_bw), tmp);
 		gtk_entry_set_text(GTK_ENTRY(ge->eta.read_iops), iops_str[0]);
-		gtk_entry_set_text(GTK_ENTRY(ge->eta.write_bw), rate_str[1]);
+
+		rate_str[1] = num2str(je->rate[1], 4, 10, i2p, N2S_BYTEPERSEC);
+		rate_alt[1] = num2str(je->rate[1], 4, 10, !i2p, N2S_BYTEPERSEC);
+		snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[1], rate_alt[1]);
+		gtk_entry_set_text(GTK_ENTRY(ge->eta.write_bw), tmp);
 		gtk_entry_set_text(GTK_ENTRY(ge->eta.write_iops), iops_str[1]);
-		gtk_entry_set_text(GTK_ENTRY(ge->eta.trim_bw), rate_str[2]);
+
+		rate_str[2] = num2str(je->rate[2], 4, 10, i2p, N2S_BYTEPERSEC);
+		rate_alt[2] = num2str(je->rate[2], 4, 10, !i2p, N2S_BYTEPERSEC);
+		snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[2], rate_alt[2]);
+		gtk_entry_set_text(GTK_ENTRY(ge->eta.trim_bw), tmp);
 		gtk_entry_set_text(GTK_ENTRY(ge->eta.trim_iops), iops_str[2]);
 
 		graph_add_xy_data(ge->graphs.iops_graph, ge->graphs.read_iops, je->elapsed_sec, je->iops[0], iops_str[0]);
@@ -424,6 +410,7 @@
 
 		for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 			free(rate_str[i]);
+			free(rate_alt[i]);
 			free(iops_str[i]);
 		}
 	}
@@ -460,31 +447,13 @@
 		eta_to_str(eta_str, je->eta_sec);
 	}
 
-#if 0
-	if (je->m_rate[0] || je->m_rate[1] || je->t_rate[0] || je->t_rate[1]) {
-	if (je->m_rate || je->t_rate) {
-		char *tr, *mr;
-
-		mr = num2str(je->m_rate, 4, 0, i2p);
-		tr = num2str(je->t_rate, 4, 0, i2p);
-		gtk_entry_set_text(GTK_ENTRY(ui->eta);
-		p += sprintf(p, ", CR=%s/%s KB/s", tr, mr);
-		free(tr);
-		free(mr);
-	} else if (je->m_iops || je->t_iops)
-		p += sprintf(p, ", CR=%d/%d IOPS", je->t_iops, je->m_iops);
-
-	gtk_entry_set_text(GTK_ENTRY(ui->eta.cr_bw), "---");
-	gtk_entry_set_text(GTK_ENTRY(ui->eta.cr_iops), "---");
-	gtk_entry_set_text(GTK_ENTRY(ui->eta.cw_bw), "---");
-	gtk_entry_set_text(GTK_ENTRY(ui->eta.cw_iops), "---");
-#endif
-
 	entry_set_int_value(ui->eta.jobs, je->nr_running);
 
 	if (je->eta_sec != INT_MAX && je->nr_running) {
-		char *iops_str[3];
-		char *rate_str[3];
+		char *iops_str[DDIR_RWDIR_CNT];
+		char *rate_str[DDIR_RWDIR_CNT];
+		char *rate_alt[DDIR_RWDIR_CNT];
+		char tmp[128];
 
 		if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running)
 			strcpy(output, "-.-% done");
@@ -494,19 +463,26 @@
 			sprintf(output, "%3.1f%% done", perc);
 		}
 
-		rate_str[0] = num2str(je->rate[0], 5, 10, i2p, 0);
-		rate_str[1] = num2str(je->rate[1], 5, 10, i2p, 0);
-		rate_str[2] = num2str(je->rate[2], 5, 10, i2p, 0);
+		iops_str[0] = num2str(je->iops[0], 4, 1, 0, N2S_PERSEC);
+		iops_str[1] = num2str(je->iops[1], 4, 1, 0, N2S_PERSEC);
+		iops_str[2] = num2str(je->iops[2], 4, 1, 0, N2S_PERSEC);
 
-		iops_str[0] = num2str(je->iops[0], 4, 1, 0, 0);
-		iops_str[1] = num2str(je->iops[1], 4, 1, 0, 0);
-		iops_str[2] = num2str(je->iops[2], 4, 1, 0, 0);
-
-		gtk_entry_set_text(GTK_ENTRY(ui->eta.read_bw), rate_str[0]);
+		rate_str[0] = num2str(je->rate[0], 4, 10, i2p, N2S_BYTEPERSEC);
+		rate_alt[0] = num2str(je->rate[0], 4, 10, !i2p, N2S_BYTEPERSEC);
+		snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[0], rate_alt[0]);
+		gtk_entry_set_text(GTK_ENTRY(ui->eta.read_bw), tmp);
 		gtk_entry_set_text(GTK_ENTRY(ui->eta.read_iops), iops_str[0]);
-		gtk_entry_set_text(GTK_ENTRY(ui->eta.write_bw), rate_str[1]);
+
+		rate_str[1] = num2str(je->rate[1], 4, 10, i2p, N2S_BYTEPERSEC);
+		rate_alt[1] = num2str(je->rate[1], 4, 10, !i2p, N2S_BYTEPERSEC);
+		snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[1], rate_alt[1]);
+		gtk_entry_set_text(GTK_ENTRY(ui->eta.write_bw), tmp);
 		gtk_entry_set_text(GTK_ENTRY(ui->eta.write_iops), iops_str[1]);
-		gtk_entry_set_text(GTK_ENTRY(ui->eta.trim_bw), rate_str[2]);
+
+		rate_str[2] = num2str(je->rate[2], 4, 10, i2p, N2S_BYTEPERSEC);
+		rate_alt[2] = num2str(je->rate[2], 4, 10, !i2p, N2S_BYTEPERSEC);
+		snprintf(tmp, sizeof(tmp), "%s (%s)", rate_str[2], rate_alt[2]);
+		gtk_entry_set_text(GTK_ENTRY(ui->eta.trim_bw), tmp);
 		gtk_entry_set_text(GTK_ENTRY(ui->eta.trim_iops), iops_str[2]);
 
 		graph_add_xy_data(ui->graphs.iops_graph, ui->graphs.read_iops, je->elapsed_sec, je->iops[0], iops_str[0]);
@@ -518,6 +494,7 @@
 
 		for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 			free(rate_str[i]);
+			free(rate_alt[i]);
 			free(iops_str[i]);
 		}
 	}
@@ -595,6 +572,7 @@
 	struct thread_options *o;
 	char *c1, *c2, *c3, *c4;
 	char tmp[80];
+	int i2p;
 
 	p->thread_number = le32_to_cpu(p->thread_number);
 	p->groupid = le32_to_cpu(p->groupid);
@@ -608,11 +586,13 @@
 	sprintf(tmp, "%s %s", o->odirect ? "direct" : "buffered", ddir_str(o->td_ddir));
 	multitext_add_entry(&ge->eta.iotype, tmp);
 
-	c1 = fio_uint_to_kmg(o->min_bs[DDIR_READ]);
-	c2 = fio_uint_to_kmg(o->max_bs[DDIR_WRITE]);
-	c3 = fio_uint_to_kmg(o->min_bs[DDIR_READ]);
-	c4 = fio_uint_to_kmg(o->max_bs[DDIR_WRITE]);
-	sprintf(tmp, "%s-%s/%s-%s", c1, c2, c3, c4);
+	i2p = is_power_of_2(o->kb_base);
+	c1 = num2str(o->min_bs[DDIR_READ], 4, 1, i2p, N2S_BYTE);
+	c2 = num2str(o->max_bs[DDIR_READ], 4, 1, i2p, N2S_BYTE);
+	c3 = num2str(o->min_bs[DDIR_WRITE], 4, 1, i2p, N2S_BYTE);
+	c4 = num2str(o->max_bs[DDIR_WRITE], 4, 1, i2p, N2S_BYTE);
+
+	sprintf(tmp, "%s-%s,%s-%s", c1, c2, c3, c4);
 	free(c1);
 	free(c2);
 	free(c3);
@@ -692,12 +672,6 @@
 	gdk_threads_leave();
 }
 
-static void gfio_client_iolog(struct fio_client *client, struct cmd_iolog_pdu *pdu)
-{
-	printf("got iolog: name=%s, type=%u, entries=%lu\n", pdu->name, pdu->log_type, (unsigned long) pdu->nr_samples);
-	free(pdu);
-}
-
 static void gfio_add_total_depths_tree(GtkListStore *model,
 				       struct thread_stat *ts, unsigned int len)
 {
@@ -957,10 +931,10 @@
 				      struct thread_stat *ts)
 {
 	double io_u_lat[FIO_IO_U_LAT_U_NR + FIO_IO_U_LAT_M_NR];
-	const char *ranges[] = { "2u", "4u", "10u", "20u", "50u", "100u",
-				 "250u", "500u", "750u", "1m", "2m",
-				 "4m", "10m", "20m", "50m", "100m",
-				 "250m", "500m", "750m", "1s", "2s", ">= 2s" };
+	const char *ranges[] = { "2us", "4us", "10us", "20us", "50us", "100us",
+				 "250us", "500us", "750us", "1ms", "2ms",
+				 "4ms", "10ms", "20ms", "50ms", "100ms",
+				 "250ms", "500ms", "750ms", "1s", "2s", ">= 2s" };
 	int start, end, i;
 	const int total = FIO_IO_U_LAT_U_NR + FIO_IO_U_LAT_M_NR;
 	GtkWidget *frame, *tree_view, *hbox, *completion_vbox, *drawing_area;
@@ -989,7 +963,7 @@
 		return;
 
 	tree_view = gfio_output_lat_buckets(&io_u_lat[start], &ranges[start], end - start + 1);
-	ge->lat_bucket_graph = setup_lat_bucket_graph("Latency Buckets", &io_u_lat[start], &ranges[start], end - start + 1, 700.0, 300.0);
+	ge->lat_bucket_graph = setup_lat_bucket_graph("Latency buckets", &io_u_lat[start], &ranges[start], end - start + 1, 700.0, 300.0);
 
 	frame = gtk_frame_new("Latency buckets");
 	gtk_box_pack_start(GTK_BOX(vbox), frame, FALSE, FALSE, 5);
@@ -1017,11 +991,11 @@
 	char *minp, *maxp;
 	char tmp[64];
 
-	if (!usec_to_msec(&min, &max, &mean, &dev))
+	if (usec_to_msec(&min, &max, &mean, &dev))
 		base = "(msec)";
 
-	minp = num2str(min, 6, 1, 0, 0);
-	maxp = num2str(max, 6, 1, 0, 0);
+	minp = num2str(min, 6, 1, 0, N2S_NONE);
+	maxp = num2str(max, 6, 1, 0, N2S_NONE);
 
 	sprintf(tmp, "%s %s", name, base);
 	frame = gtk_frame_new(tmp);
@@ -1182,7 +1156,8 @@
 	unsigned long long bw, iops;
 	unsigned int flags = 0;
 	double mean[3], dev[3];
-	char *io_p, *bw_p, *iops_p;
+	char *io_p, *io_palt, *bw_p, *bw_palt, *iops_p;
+	char tmp[128];
 	int i2p;
 
 	if (!ts->runtime[ddir])
@@ -1192,11 +1167,9 @@
 	runt = ts->runtime[ddir];
 
 	bw = (1000 * ts->io_bytes[ddir]) / runt;
-	io_p = num2str(ts->io_bytes[ddir], 6, 1, i2p, 8);
-	bw_p = num2str(bw, 6, 1, i2p, ts->unit_base);
 
 	iops = (1000 * (uint64_t)ts->total_io_u[ddir]) / runt;
-	iops_p = num2str(iops, 6, 1, 0, 0);
+	iops_p = num2str(iops, 4, 1, 0, N2S_PERSEC);
 
 	box = gtk_hbox_new(FALSE, 3);
 	gtk_box_pack_start(GTK_BOX(mbox), box, TRUE, FALSE, 3);
@@ -1211,9 +1184,17 @@
 	gtk_box_pack_start(GTK_BOX(main_vbox), box, TRUE, FALSE, 3);
 
 	label = new_info_label_in_frame(box, "IO");
-	gtk_label_set_text(GTK_LABEL(label), io_p);
+	io_p = num2str(ts->io_bytes[ddir], 4, 1, i2p, N2S_BYTE);
+	io_palt = num2str(ts->io_bytes[ddir], 4, 1, !i2p, N2S_BYTE);
+	snprintf(tmp, sizeof(tmp), "%s (%s)", io_p, io_palt);
+	gtk_label_set_text(GTK_LABEL(label), tmp);
+
 	label = new_info_label_in_frame(box, "Bandwidth");
-	gtk_label_set_text(GTK_LABEL(label), bw_p);
+	bw_p = num2str(bw, 4, 1, i2p, ts->unit_base);
+	bw_palt = num2str(bw, 4, 1, !i2p, ts->unit_base);
+	snprintf(tmp, sizeof(tmp), "%s (%s)", bw_p, bw_palt);
+	gtk_label_set_text(GTK_LABEL(label), tmp);
+
 	label = new_info_label_in_frame(box, "IOPS");
 	gtk_label_set_text(GTK_LABEL(label), iops_p);
 	label = new_info_label_in_frame(box, "Runtime (msec)");
@@ -1221,7 +1202,7 @@
 
 	if (calc_lat(&ts->bw_stat[ddir], &min[0], &max[0], &mean[0], &dev[0])) {
 		double p_of_agg = 100.0;
-		const char *bw_str = "KB";
+		const char *bw_str = "KiB/s";
 		char tmp[32];
 
 		if (rs->agg[ddir]) {
@@ -1230,14 +1211,21 @@
 				p_of_agg = 100.0;
 		}
 
-		if (mean[0] > 999999.9) {
-			min[0] /= 1000.0;
-			max[0] /= 1000.0;
-			mean[0] /= 1000.0;
-			dev[0] /= 1000.0;
-			bw_str = "MB";
+		if (mean[0] > 1073741824.9) {
+			min[0] /= 1048576.0;
+			max[0] /= 1048576.0;
+			mean[0] /= 1048576.0;
+			dev[0] /= 1048576.0;
+			bw_str = "GiB/s";
 		}
 
+		if (mean[0] > 1047575.9) {
+			min[0] /= 1024.0;
+			max[0] /= 1024.0;
+			mean[0] /= 1024.0;
+			dev[0] /= 1024.0;
+			bw_str = "MiB/s";
+		}
 		sprintf(tmp, "Bandwidth (%s)", bw_str);
 		frame = gtk_frame_new(tmp);
 		gtk_box_pack_start(GTK_BOX(main_vbox), frame, FALSE, FALSE, 5);
@@ -1287,6 +1275,8 @@
 
 	free(io_p);
 	free(bw_p);
+	free(io_palt);
+	free(bw_palt);
 	free(iops_p);
 }
 
@@ -1393,7 +1383,6 @@
 	.stop			= gfio_client_stop,
 	.start			= gfio_client_start,
 	.job_start		= gfio_client_job_start,
-	.iolog			= gfio_client_iolog,
 	.removed		= gfio_client_removed,
 	.eta_msec		= FIO_CLIENT_DEF_ETA_MSEC,
 	.stay_connected		= 1,
diff --git a/gettime-thread.c b/gettime-thread.c
index 2dc976f..19541b4 100644
--- a/gettime-thread.c
+++ b/gettime-thread.c
@@ -9,9 +9,7 @@
 struct timeval *fio_tv = NULL;
 int fio_gtod_offload = 0;
 static pthread_t gtod_thread;
-#ifdef FIO_HAVE_CPU_AFFINITY
 static os_cpu_mask_t fio_gtod_cpumask;
-#endif
 
 void fio_gtod_init(void)
 {
@@ -73,7 +71,7 @@
 		return 1;
 
 	pthread_attr_init(&attr);
-	pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN);
+	pthread_attr_setstacksize(&attr, 2 * PTHREAD_STACK_MIN);
 	ret = pthread_create(&gtod_thread, &attr, gtod_thread_main, mutex);
 	pthread_attr_destroy(&attr);
 	if (ret) {
@@ -83,7 +81,7 @@
 
 	ret = pthread_detach(gtod_thread);
 	if (ret) {
-		log_err("Can't detatch gtod thread: %s\n", strerror(ret));
+		log_err("Can't detach gtod thread: %s\n", strerror(ret));
 		goto err;
 	}
 
diff --git a/gettime.c b/gettime.c
index d1c8eb9..628aad6 100644
--- a/gettime.c
+++ b/gettime.c
@@ -13,7 +13,8 @@
 #include "hash.h"
 #include "os/os.h"
 
-#if defined(ARCH_HAVE_CPU_CLOCK) && !defined(ARCH_CPU_CLOCK_CYCLES_PER_USEC)
+#if defined(ARCH_HAVE_CPU_CLOCK)
+#ifndef ARCH_CPU_CLOCK_CYCLES_PER_USEC
 static unsigned long cycles_per_usec;
 static unsigned long inv_cycles_per_usec;
 static uint64_t max_cycles_for_mult;
@@ -21,6 +22,7 @@
 #ifdef ARCH_CPU_CLOCK_WRAPS
 static unsigned long long cycles_start, cycles_wrap;
 #endif
+#endif
 int tsc_reliable = 0;
 
 struct tv_valid {
@@ -131,7 +133,9 @@
 #ifdef CONFIG_CLOCK_GETTIME
 static int fill_clock_gettime(struct timespec *ts)
 {
-#ifdef CONFIG_CLOCK_MONOTONIC
+#if defined(CONFIG_CLOCK_MONOTONIC_RAW)
+	return clock_gettime(CLOCK_MONOTONIC_RAW, ts);
+#elif defined(CONFIG_CLOCK_MONOTONIC)
 	return clock_gettime(CLOCK_MONOTONIC, ts);
 #else
 	return clock_gettime(CLOCK_REALTIME, ts);
@@ -228,6 +232,7 @@
 	struct timeval s, e;
 	uint64_t c_s, c_e;
 	enum fio_cs old_cs = fio_clock_source;
+	uint64_t elapsed;
 
 #ifdef CONFIG_CLOCK_GETTIME
 	fio_clock_source = CS_CGETTIME;
@@ -238,8 +243,6 @@
 
 	c_s = get_cpu_clock();
 	do {
-		uint64_t elapsed;
-
 		__fio_gettime(&e);
 
 		elapsed = utime_since(&s, &e);
@@ -250,7 +253,7 @@
 	} while (1);
 
 	fio_clock_source = old_cs;
-	return (c_e - c_s + 127) >> 7;
+	return (c_e - c_s) / elapsed;
 }
 
 #define NR_TIME_ITERS	50
@@ -296,16 +299,11 @@
 	}
 
 	S /= (double) NR_TIME_ITERS;
-	mean /= 10.0;
 
 	for (i = 0; i < NR_TIME_ITERS; i++)
-		dprint(FD_TIME, "cycles[%d]=%llu\n", i,
-					(unsigned long long) cycles[i] / 10);
+		dprint(FD_TIME, "cycles[%d]=%llu\n", i, (unsigned long long) cycles[i]);
 
 	avg /= samples;
-	avg = (avg + 5) / 10;
-	minc /= 10;
-	maxc /= 10;
 	dprint(FD_TIME, "avg: %llu\n", (unsigned long long) avg);
 	dprint(FD_TIME, "min=%llu, max=%llu, mean=%f, S=%f\n",
 			(unsigned long long) minc,
@@ -383,8 +381,7 @@
 
 uint64_t utime_since(const struct timeval *s, const struct timeval *e)
 {
-	long sec, usec;
-	uint64_t ret;
+	int64_t sec, usec;
 
 	sec = e->tv_sec - s->tv_sec;
 	usec = e->tv_usec - s->tv_usec;
@@ -399,22 +396,26 @@
 	if (sec < 0 || (sec == 0 && usec < 0))
 		return 0;
 
-	ret = sec * 1000000ULL + usec;
-
-	return ret;
+	return usec + (sec * 1000000);
 }
 
 uint64_t utime_since_now(const struct timeval *s)
 {
 	struct timeval t;
+#ifdef FIO_DEBUG_TIME
+	void *p = __builtin_return_address(0);
 
+	fio_gettime(&t, p);
+#else
 	fio_gettime(&t, NULL);
+#endif
+
 	return utime_since(s, &t);
 }
 
 uint64_t mtime_since(const struct timeval *s, const struct timeval *e)
 {
-	long sec, usec, ret;
+	long sec, usec;
 
 	sec = e->tv_sec - s->tv_sec;
 	usec = e->tv_usec - s->tv_usec;
@@ -426,19 +427,22 @@
 	if (sec < 0 || (sec == 0 && usec < 0))
 		return 0;
 
-	sec *= 1000UL;
-	usec /= 1000UL;
-	ret = sec + usec;
-
-	return ret;
+	sec *= 1000;
+	usec /= 1000;
+	return sec + usec;
 }
 
 uint64_t mtime_since_now(const struct timeval *s)
 {
 	struct timeval t;
+#ifdef FIO_DEBUG_TIME
 	void *p = __builtin_return_address(0);
 
 	fio_gettime(&t, p);
+#else
+	fio_gettime(&t, NULL);
+#endif
+
 	return mtime_since(s, &t);
 }
 
@@ -481,6 +485,7 @@
 	struct clock_entry *c;
 	os_cpu_mask_t cpu_mask;
 	uint32_t last_seq;
+	unsigned long long first;
 	int i;
 
 	if (fio_cpuset_init(&cpu_mask)) {
@@ -502,6 +507,7 @@
 	pthread_mutex_lock(&t->lock);
 	pthread_mutex_unlock(&t->started);
 
+	first = get_cpu_clock();
 	last_seq = 0;
 	c = &t->entries[0];
 	for (i = 0; i < t->nr_entries; i++, c++) {
@@ -524,7 +530,8 @@
 		unsigned long long clocks;
 
 		clocks = t->entries[i - 1].tsc - t->entries[0].tsc;
-		log_info("cs: cpu%3d: %llu clocks seen\n", t->cpu, clocks);
+		log_info("cs: cpu%3d: %llu clocks seen, first %llu\n", t->cpu,
+							clocks, first);
 	}
 
 	/*
@@ -636,6 +643,8 @@
 
 	qsort(entries, tentries, sizeof(struct clock_entry), clock_cmp);
 
+	/* silence silly gcc */
+	prev = NULL;
 	for (failed = i = 0; i < tentries; i++) {
 		this = &entries[i];
 
diff --git a/gfio.c b/gfio.c
index 42d536e..7c92a50 100644
--- a/gfio.c
+++ b/gfio.c
@@ -449,7 +449,7 @@
 		free(gco);
 	}
 
-	ret = fio_client_send_ini(gc->client, ge->job_file, 0);
+	ret = fio_client_send_ini(gc->client, ge->job_file, false);
 	if (!ret)
 		return 0;
 
@@ -459,10 +459,12 @@
 
 static void *server_thread(void *arg)
 {
+	fio_server_create_sk_key();
 	is_backend = 1;
 	gfio_server_running = 1;
 	fio_start_server(NULL);
 	gfio_server_running = 0;
+	fio_server_destroy_sk_key();
 	return NULL;
 }
 
@@ -1213,7 +1215,7 @@
 {
 	const char *authors[] = {
 		"Jens Axboe <axboe@kernel.dk>",
-		"Stephen Carmeron <stephenmcameron@gmail.com>",
+		"Stephen Cameron <stephenmcameron@gmail.com>",
 		NULL
 	};
 	const char *license[] = {
@@ -1238,7 +1240,7 @@
 		"program-name", "gfio",
 		"comments", "Gtk2 UI for fio",
 		"license", license_trans,
-		"website", "http://git.kernel.dk/?p=fio.git;a=summary",
+		"website", "http://git.kernel.dk/cgit/fio/",
 		"authors", authors,
 		"version", fio_version_string,
 		"copyright", "© 2012 Jens Axboe <axboe@kernel.dk>",
@@ -1269,7 +1271,7 @@
 	{ "Quit", GTK_STOCK_QUIT, NULL,   "<Control>Q", NULL, G_CALLBACK(quit_clicked) },
 	{ "About", GTK_STOCK_ABOUT, NULL,  NULL, NULL, G_CALLBACK(about_dialog) },
 };
-static gint nmenu_items = sizeof(menu_items) / sizeof(menu_items[0]);
+static gint nmenu_items = ARRAY_SIZE(menu_items);
 
 static const gchar *ui_string = " \
 	<ui> \
@@ -1384,7 +1386,7 @@
 	g_signal_connect(ge->eta.names, "changed", G_CALLBACK(combo_entry_changed), ge);
 	g_signal_connect(ge->eta.names, "destroy", G_CALLBACK(combo_entry_destroy), ge);
 	ge->eta.iotype.entry = new_info_entry_in_frame(probe_box, "IO");
-	ge->eta.bs.entry = new_info_entry_in_frame(probe_box, "Blocksize (Read/Write)");
+	ge->eta.bs.entry = new_info_entry_in_frame(probe_box, "Blocksize (Read/Write/Trim)");
 	ge->eta.ioengine.entry = new_info_entry_in_frame(probe_box, "IO Engine");
 	ge->eta.iodepth.entry = new_info_entry_in_frame(probe_box, "IO Depth");
 	ge->eta.jobs = new_info_entry_in_frame(probe_box, "Jobs");
@@ -1393,11 +1395,11 @@
 	probe_box = gtk_hbox_new(FALSE, 3);
 	gtk_box_pack_start(GTK_BOX(probe_frame), probe_box, FALSE, FALSE, 3);
 	ge->eta.read_bw = new_info_entry_in_frame_rgb(probe_box, "Read BW", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B);
-	ge->eta.read_iops = new_info_entry_in_frame_rgb(probe_box, "IOPS", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B);
+	ge->eta.read_iops = new_info_entry_in_frame_rgb(probe_box, "Read IOPS", GFIO_READ_R, GFIO_READ_G, GFIO_READ_B);
 	ge->eta.write_bw = new_info_entry_in_frame_rgb(probe_box, "Write BW", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B);
-	ge->eta.write_iops = new_info_entry_in_frame_rgb(probe_box, "IOPS", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B);
+	ge->eta.write_iops = new_info_entry_in_frame_rgb(probe_box, "Write IOPS", GFIO_WRITE_R, GFIO_WRITE_G, GFIO_WRITE_B);
 	ge->eta.trim_bw = new_info_entry_in_frame_rgb(probe_box, "Trim BW", GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B);
-	ge->eta.trim_iops = new_info_entry_in_frame_rgb(probe_box, "IOPS", GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B);
+	ge->eta.trim_iops = new_info_entry_in_frame_rgb(probe_box, "Trim IOPS", GFIO_TRIM_R, GFIO_TRIM_G, GFIO_TRIM_B);
 
 	/*
 	 * Only add this if we have a commit rate
@@ -1677,7 +1679,7 @@
 	 * Without it, the update that happens in gfio_update_thread_status
 	 * doesn't really happen in a timely fashion, you need expose events
 	 */
-#if !GTK_CHECK_VERSION(2, 24, 0)
+#if !GLIB_CHECK_VERSION(2, 31, 0)
 	if (!g_thread_supported())
 		g_thread_init(NULL);
 #endif
diff --git a/goptions.c b/goptions.c
index c01b6cc..16938ed 100644
--- a/goptions.c
+++ b/goptions.c
@@ -11,6 +11,7 @@
 #include "ghelpers.h"
 #include "gerror.h"
 #include "parse.h"
+#include "optgroup.h"
 
 struct gopt {
 	GtkWidget *box;
@@ -92,10 +93,10 @@
 static GNode *gopt_dep_tree;
 
 static GtkWidget *gopt_get_group_frame(struct gopt_job_view *gjv,
-				       GtkWidget *box, unsigned int groupmask)
+				       GtkWidget *box, uint64_t groupmask)
 {
-	unsigned int mask, group;
-	struct opt_group *og;
+	uint64_t mask, group;
+	const struct opt_group *og;
 	GtkWidget *frame, *hbox;
 	struct gopt_frame_widget *gfw;
 
@@ -107,7 +108,7 @@
 	if (!og)
 		return NULL;
 
-	group = ffz(~groupmask);
+	group = ffz64(~groupmask);
 	gfw = &gjv->g_widgets[group];
 	if (!gfw->vbox[0]) {
 		frame = gtk_frame_new(og->name);
@@ -825,7 +826,7 @@
 				     unsigned long long *p, unsigned int idx)
 {
 	struct gopt_str_val *g;
-	const gchar *postfix[] = { "B", "KB", "MB", "GB", "PB", "TB", "" };
+	const gchar *postfix[] = { "B", "KiB", "MiB", "GiB", "PiB", "TiB", "" };
 	GtkWidget *label;
 	int i;
 
@@ -1135,11 +1136,11 @@
 	 */
 	for (i = 0; fio_options[i].name; i++) {
 		struct fio_option *o = &fio_options[i];
-		unsigned int mask = o->category;
-		struct opt_group *og;
+		uint64_t mask = o->category;
+		const struct opt_group *og;
 
 		while ((og = opt_group_from_mask(&mask)) != NULL) {
-			GtkWidget *vbox = gjv->vboxes[ffz(~og->mask)];
+			GtkWidget *vbox = gjv->vboxes[ffz64(~og->mask)];
 
 			hbox = gtk_hbox_new(FALSE, 3);
 			gtk_box_pack_start(GTK_BOX(vbox), hbox, FALSE, FALSE, 5);
@@ -1177,19 +1178,20 @@
 	return vbox;
 }
 
-static GtkWidget *gopt_add_group_tab(GtkWidget *notebook, struct opt_group *og)
+static GtkWidget *gopt_add_group_tab(GtkWidget *notebook,
+				     const struct opt_group *og)
 {
 	return gopt_add_tab(notebook, og->name);
 }
 
 static void gopt_add_group_tabs(GtkWidget *notebook, struct gopt_job_view *gjv)
 {
-	struct opt_group *og;
+	const struct opt_group *og;
 	unsigned int i;
 
 	i = 0;
 	do {
-		unsigned int mask = (1U << i);
+		uint64_t mask = (1ULL << i);
 
 		og = opt_group_from_mask(&mask);
 		if (!og)
diff --git a/hash.h b/hash.h
index 02b0614..d227b93 100644
--- a/hash.h
+++ b/hash.h
@@ -28,15 +28,31 @@
 #error Define GOLDEN_RATIO_PRIME for your wordsize.
 #endif
 
-#define GR_PRIME_64	0x9e37fffffffc0001ULL
+/*
+ * The above primes are actively bad for hashing, since they are
+ * too sparse. The 32-bit one is mostly ok, the 64-bit one causes
+ * real problems. Besides, the "prime" part is pointless for the
+ * multiplicative hash.
+ *
+ * Although a random odd number will do, it turns out that the golden
+ * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
+ * properties.
+ *
+ * These are the negative, (1 - phi) = (phi^2) = (3 - sqrt(5))/2.
+ * (See Knuth vol 3, section 6.4, exercise 9.)
+ */
+#define GOLDEN_RATIO_32 0x61C88647
+#define GOLDEN_RATIO_64 0x61C8864680B583EBull
 
-static inline unsigned long __hash_long(unsigned long val)
+static inline unsigned long __hash_long(uint64_t val)
 {
-	unsigned long hash = val;
+	uint64_t hash = val;
 
 #if BITS_PER_LONG == 64
+	hash *= GOLDEN_RATIO_64;
+#else
 	/*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
-	unsigned long n = hash;
+	uint64_t n = hash;
 	n <<= 18;
 	hash -= n;
 	n <<= 33;
@@ -49,9 +65,6 @@
 	hash += n;
 	n <<= 2;
 	hash += n;
-#else
-	/* On some cpus multiply is faster, on others gcc will do shifts */
-	hash *= GOLDEN_RATIO_PRIME;
 #endif
 
 	return hash;
@@ -65,7 +78,7 @@
 
 static inline uint64_t __hash_u64(uint64_t val)
 {
-	return val * GR_PRIME_64;
+	return val * GOLDEN_RATIO_64;
 }
 	
 static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
@@ -77,7 +90,7 @@
  * Bob Jenkins jhash
  */
 
-#define JHASH_INITVAL	GOLDEN_RATIO_PRIME
+#define JHASH_INITVAL	GOLDEN_RATIO_32
 
 static inline uint32_t rol32(uint32_t word, uint32_t shift)
 {
diff --git a/helper_thread.c b/helper_thread.c
new file mode 100644
index 0000000..47ec728
--- /dev/null
+++ b/helper_thread.c
@@ -0,0 +1,185 @@
+#include "fio.h"
+#include "smalloc.h"
+#include "helper_thread.h"
+#include "steadystate.h"
+
+static struct helper_data {
+	volatile int exit;
+	volatile int reset;
+	volatile int do_stat;
+	struct sk_out *sk_out;
+	pthread_t thread;
+	pthread_mutex_t lock;
+	pthread_cond_t cond;
+	struct fio_mutex *startup_mutex;
+} *helper_data;
+
+void helper_thread_destroy(void)
+{
+	pthread_cond_destroy(&helper_data->cond);
+	pthread_mutex_destroy(&helper_data->lock);
+	sfree(helper_data);
+}
+
+void helper_reset(void)
+{
+	if (!helper_data)
+		return;
+
+	pthread_mutex_lock(&helper_data->lock);
+
+	if (!helper_data->reset) {
+		helper_data->reset = 1;
+		pthread_cond_signal(&helper_data->cond);
+	}
+
+	pthread_mutex_unlock(&helper_data->lock);
+}
+
+void helper_do_stat(void)
+{
+	if (!helper_data)
+		return;
+
+	pthread_mutex_lock(&helper_data->lock);
+	helper_data->do_stat = 1;
+	pthread_cond_signal(&helper_data->cond);
+	pthread_mutex_unlock(&helper_data->lock);
+}
+
+bool helper_should_exit(void)
+{
+	if (!helper_data)
+		return true;
+
+	return helper_data->exit;
+}
+
+void helper_thread_exit(void)
+{
+	void *ret;
+
+	pthread_mutex_lock(&helper_data->lock);
+	helper_data->exit = 1;
+	pthread_cond_signal(&helper_data->cond);
+	pthread_mutex_unlock(&helper_data->lock);
+
+	pthread_join(helper_data->thread, &ret);
+}
+
+static void *helper_thread_main(void *data)
+{
+	struct helper_data *hd = data;
+	unsigned int msec_to_next_event, next_log, next_ss = STEADYSTATE_MSEC;
+	struct timeval tv, last_du, last_ss;
+	int ret = 0;
+
+	sk_out_assign(hd->sk_out);
+
+	gettimeofday(&tv, NULL);
+	memcpy(&last_du, &tv, sizeof(tv));
+	memcpy(&last_ss, &tv, sizeof(tv));
+
+	fio_mutex_up(hd->startup_mutex);
+
+	msec_to_next_event = DISK_UTIL_MSEC;
+	while (!ret && !hd->exit) {
+		struct timespec ts;
+		struct timeval now;
+		uint64_t since_du, since_ss = 0;
+
+		timeval_add_msec(&tv, msec_to_next_event);
+		ts.tv_sec = tv.tv_sec;
+		ts.tv_nsec = tv.tv_usec * 1000;
+
+		pthread_mutex_lock(&hd->lock);
+		pthread_cond_timedwait(&hd->cond, &hd->lock, &ts);
+
+		gettimeofday(&now, NULL);
+
+		if (hd->reset) {
+			memcpy(&tv, &now, sizeof(tv));
+			memcpy(&last_du, &now, sizeof(last_du));
+			memcpy(&last_ss, &now, sizeof(last_ss));
+			hd->reset = 0;
+		}
+
+		pthread_mutex_unlock(&hd->lock);
+
+		since_du = mtime_since(&last_du, &now);
+		if (since_du >= DISK_UTIL_MSEC || DISK_UTIL_MSEC - since_du < 10) {
+			ret = update_io_ticks();
+			timeval_add_msec(&last_du, DISK_UTIL_MSEC);
+			msec_to_next_event = DISK_UTIL_MSEC;
+			if (since_du >= DISK_UTIL_MSEC)
+				msec_to_next_event -= (since_du - DISK_UTIL_MSEC);
+		} else
+			msec_to_next_event = DISK_UTIL_MSEC - since_du;
+
+		if (hd->do_stat) {
+			hd->do_stat = 0;
+			__show_running_run_stats();
+		}
+
+		next_log = calc_log_samples();
+		if (!next_log)
+			next_log = DISK_UTIL_MSEC;
+
+		if (steadystate_enabled) {
+			since_ss = mtime_since(&last_ss, &now);
+			if (since_ss >= STEADYSTATE_MSEC || STEADYSTATE_MSEC - since_ss < 10) {
+				steadystate_check();
+				timeval_add_msec(&last_ss, since_ss);
+				if (since_ss > STEADYSTATE_MSEC)
+					next_ss = STEADYSTATE_MSEC - (since_ss - STEADYSTATE_MSEC);
+				else
+					next_ss = STEADYSTATE_MSEC;
+			}
+			else
+				next_ss = STEADYSTATE_MSEC - since_ss;
+                }
+
+		msec_to_next_event = min(min(next_log, msec_to_next_event), next_ss);
+		dprint(FD_HELPERTHREAD, "since_ss: %llu, next_ss: %u, next_log: %u, msec_to_next_event: %u\n", (unsigned long long)since_ss, next_ss, next_log, msec_to_next_event);
+
+		if (!is_backend)
+			print_thread_status();
+	}
+
+	fio_writeout_logs(false);
+
+	sk_out_drop();
+	return NULL;
+}
+
+int helper_thread_create(struct fio_mutex *startup_mutex, struct sk_out *sk_out)
+{
+	struct helper_data *hd;
+	int ret;
+
+	hd = smalloc(sizeof(*hd));
+
+	setup_disk_util();
+	steadystate_setup();
+
+	hd->sk_out = sk_out;
+
+	ret = mutex_cond_init_pshared(&hd->lock, &hd->cond);
+	if (ret)
+		return 1;
+
+	hd->startup_mutex = startup_mutex;
+
+	ret = pthread_create(&hd->thread, NULL, helper_thread_main, hd);
+	if (ret) {
+		log_err("Can't create helper thread: %s\n", strerror(ret));
+		return 1;
+	}
+
+	helper_data = hd;
+
+	dprint(FD_MUTEX, "wait on startup_mutex\n");
+	fio_mutex_down(startup_mutex);
+	dprint(FD_MUTEX, "done waiting on startup_mutex\n");
+	return 0;
+}
diff --git a/helper_thread.h b/helper_thread.h
new file mode 100644
index 0000000..78933b1
--- /dev/null
+++ b/helper_thread.h
@@ -0,0 +1,11 @@
+#ifndef FIO_HELPER_THREAD_H
+#define FIO_HELPER_THREAD_H
+
+extern void helper_reset(void);
+extern void helper_do_stat(void);
+extern bool helper_should_exit(void);
+extern void helper_thread_destroy(void);
+extern void helper_thread_exit(void);
+extern int helper_thread_create(struct fio_mutex *, struct sk_out *);
+
+#endif
diff --git a/idletime.c b/idletime.c
index db272fe..4c00d80 100644
--- a/idletime.c
+++ b/idletime.c
@@ -260,7 +260,7 @@
 
 		if ((ret = pthread_detach(ipt->thread))) {
 			/* log error and let the thread spin */
-			log_err("fio: pthread_detatch %s\n", strerror(ret));
+			log_err("fio: pthread_detach %s\n", strerror(ret));
 		}
 	}
 
@@ -428,7 +428,7 @@
 		fio_idle_prof_init();
 		fio_idle_prof_start();
 		fio_idle_prof_stop();
-		show_idle_prof_stats(FIO_OUTPUT_NORMAL, NULL);
+		show_idle_prof_stats(FIO_OUTPUT_NORMAL, NULL, NULL);
 		return 1;
 	} else if (strcmp("system", args) == 0) {
 		ipc.opt = IDLE_PROF_OPT_SYSTEM;
@@ -446,7 +446,8 @@
 #endif
 }
 
-void show_idle_prof_stats(int output, struct json_object *parent)
+void show_idle_prof_stats(int output, struct json_object *parent,
+			  struct buf_output *out)
 {
 	int i, nr_cpus = ipc.nr_cpus;
 	struct json_object *tmp;
@@ -454,23 +455,23 @@
 
 	if (output == FIO_OUTPUT_NORMAL) {
 		if (ipc.opt > IDLE_PROF_OPT_CALI)
-			log_info("\nCPU idleness:\n");
+			log_buf(out, "\nCPU idleness:\n");
 		else if (ipc.opt == IDLE_PROF_OPT_CALI)
-			log_info("CPU idleness:\n");
+			log_buf(out, "CPU idleness:\n");
 
 		if (ipc.opt >= IDLE_PROF_OPT_SYSTEM)
-			log_info("  system: %3.2f%%\n", fio_idle_prof_cpu_stat(-1));
+			log_buf(out, "  system: %3.2f%%\n", fio_idle_prof_cpu_stat(-1));
 
 		if (ipc.opt == IDLE_PROF_OPT_PERCPU) {
-			log_info("  percpu: %3.2f%%", fio_idle_prof_cpu_stat(0));
+			log_buf(out, "  percpu: %3.2f%%", fio_idle_prof_cpu_stat(0));
 			for (i = 1; i < nr_cpus; i++)
-				log_info(", %3.2f%%", fio_idle_prof_cpu_stat(i));
-			log_info("\n");
+				log_buf(out, ", %3.2f%%", fio_idle_prof_cpu_stat(i));
+			log_buf(out, "\n");
 		}
 
 		if (ipc.opt >= IDLE_PROF_OPT_CALI) {
-			log_info("  unit work: mean=%3.2fus,", ipc.cali_mean);
-			log_info(" stddev=%3.2f\n", ipc.cali_stddev);
+			log_buf(out, "  unit work: mean=%3.2fus,", ipc.cali_mean);
+			log_buf(out, " stddev=%3.2f\n", ipc.cali_stddev);
 		}
 
 		/* dynamic mem allocations can now be freed */
@@ -480,7 +481,7 @@
 		return;
 	}
 
-	if ((ipc.opt != IDLE_PROF_OPT_NONE) && (output == FIO_OUTPUT_JSON)) {
+	if ((ipc.opt != IDLE_PROF_OPT_NONE) && (output & FIO_OUTPUT_JSON)) {
 		if (!parent)
 			return;
 
diff --git a/idletime.h b/idletime.h
index bd6dcef..84c1fbb 100644
--- a/idletime.h
+++ b/idletime.h
@@ -2,6 +2,7 @@
 #define FIO_IDLETIME_H
 
 #include "fio.h"
+#include "lib/output_buffer.h"
 
 #define CALIBRATE_RUNS  10
 #define CALIBRATE_SCALE 1000
@@ -54,6 +55,6 @@
 extern void fio_idle_prof_start(void);
 extern void fio_idle_prof_stop(void);
 
-extern void show_idle_prof_stats(int, struct json_object *);
+extern void show_idle_prof_stats(int, struct json_object *, struct buf_output *);
 
 #endif
diff --git a/init.c b/init.c
index 7aedf2b..52a5f03 100644
--- a/init.c
+++ b/init.c
@@ -25,11 +25,13 @@
 #include "server.h"
 #include "idletime.h"
 #include "filelock.h"
+#include "steadystate.h"
 
-#include "lib/getopt.h"
-#include "lib/strcasestr.h"
+#include "oslib/getopt.h"
+#include "oslib/strcasestr.h"
 
 #include "crc/test.h"
+#include "lib/pow2.h"
 
 const char fio_version_string[] = FIO_VERSION;
 
@@ -38,7 +40,6 @@
 static char **ini_file;
 static int max_jobs = FIO_MAX_JOBS;
 static int dump_cmdline;
-static long long def_timeout;
 static int parse_only;
 
 static struct thread_data def_thread;
@@ -48,7 +49,6 @@
 
 int exitall_on_terminate = 0;
 int output_format = FIO_OUTPUT_NORMAL;
-int append_terse_output = 0;
 int eta_print = FIO_ETA_AUTO;
 int eta_new_line = 0;
 FILE *f_out = NULL;
@@ -69,6 +69,8 @@
 char *trigger_cmd = NULL;
 char *trigger_remote_cmd = NULL;
 
+char *aux_path = NULL;
+
 static int prev_group_jobs;
 
 unsigned long fio_debug = 0;
@@ -91,18 +93,13 @@
 		.val		= 'o' | FIO_CLIENT_FLAG,
 	},
 	{
-		.name		= (char *) "timeout",
-		.has_arg	= required_argument,
-		.val		= 't' | FIO_CLIENT_FLAG,
-	},
-	{
 		.name		= (char *) "latency-log",
 		.has_arg	= required_argument,
 		.val		= 'l' | FIO_CLIENT_FLAG,
 	},
 	{
 		.name		= (char *) "bandwidth-log",
-		.has_arg	= required_argument,
+		.has_arg	= no_argument,
 		.val		= 'b' | FIO_CLIENT_FLAG,
 	},
 	{
@@ -112,7 +109,7 @@
 	},
 	{
 		.name		= (char *) "output-format",
-		.has_arg	= optional_argument,
+		.has_arg	= required_argument,
 		.val		= 'F' | FIO_CLIENT_FLAG,
 	},
 	{
@@ -267,6 +264,11 @@
 		.val		= 'J',
 	},
 	{
+		.name		= (char *) "aux-path",
+		.has_arg	= required_argument,
+		.val		= 'K',
+	},
+	{
 		.name		= NULL,
 	},
 };
@@ -292,7 +294,6 @@
 static void free_shm(void)
 {
 	if (threads) {
-		file_hash_exit();
 		flow_exit();
 		fio_debug_jobp = NULL;
 		free_threads_shm();
@@ -303,8 +304,9 @@
 	free(trigger_remote_cmd);
 	trigger_file = trigger_cmd = trigger_remote_cmd = NULL;
 
-	options_free(fio_options, &def_thread);
+	options_free(fio_options, &def_thread.o);
 	fio_filelock_exit();
+	file_hash_exit();
 	scleanup();
 }
 
@@ -316,8 +318,6 @@
  */
 static int setup_thread_area(void)
 {
-	void *hash;
-
 	if (threads)
 		return 0;
 
@@ -328,7 +328,6 @@
 	do {
 		size_t size = max_jobs * sizeof(struct thread_data);
 
-		size += file_hash_size;
 		size += sizeof(unsigned int);
 
 #ifndef CONFIG_NO_SHM
@@ -357,39 +356,91 @@
 		perror("shmat");
 		return 1;
 	}
+	if (shm_attach_to_open_removed())
+		shmctl(shm_id, IPC_RMID, NULL);
 #endif
 
 	memset(threads, 0, max_jobs * sizeof(struct thread_data));
-	hash = (void *) threads + max_jobs * sizeof(struct thread_data);
-	fio_debug_jobp = (void *) hash + file_hash_size;
+	fio_debug_jobp = (void *) threads + max_jobs * sizeof(struct thread_data);
 	*fio_debug_jobp = -1;
-	file_hash_init(hash);
 
 	flow_init();
 
 	return 0;
 }
 
-static void set_cmd_options(struct thread_data *td)
+static void dump_print_option(struct print_option *p)
 {
-	struct thread_options *o = &td->o;
+	const char *delim;
 
-	if (!o->timeout)
-		o->timeout = def_timeout;
+	if (!strcmp("description", p->name))
+		delim = "\"";
+	else
+		delim = "";
+
+	log_info("--%s%s", p->name, p->value ? "" : " ");
+	if (p->value)
+		log_info("=%s%s%s ", delim, p->value, delim);
+}
+
+static void dump_opt_list(struct thread_data *td)
+{
+	struct flist_head *entry;
+	struct print_option *p;
+
+	if (flist_empty(&td->opt_list))
+		return;
+
+	flist_for_each(entry, &td->opt_list) {
+		p = flist_entry(entry, struct print_option, list);
+		dump_print_option(p);
+	}
+}
+
+static void fio_dump_options_free(struct thread_data *td)
+{
+	while (!flist_empty(&td->opt_list)) {
+		struct print_option *p;
+
+		p = flist_first_entry(&td->opt_list, struct print_option, list);
+		flist_del_init(&p->list);
+		free(p->name);
+		free(p->value);
+		free(p);
+	}
+}
+
+static void copy_opt_list(struct thread_data *dst, struct thread_data *src)
+{
+	struct flist_head *entry;
+
+	if (flist_empty(&src->opt_list))
+		return;
+
+	flist_for_each(entry, &src->opt_list) {
+		struct print_option *srcp, *dstp;
+
+		srcp = flist_entry(entry, struct print_option, list);
+		dstp = malloc(sizeof(*dstp));
+		dstp->name = strdup(srcp->name);
+		if (srcp->value)
+			dstp->value = strdup(srcp->value);
+		else
+			dstp->value = NULL;
+		flist_add_tail(&dstp->list, &dst->opt_list);
+	}
 }
 
 /*
  * Return a free job structure.
  */
-static struct thread_data *get_new_job(int global, struct thread_data *parent,
-				       int preserve_eo, const char *jobname)
+static struct thread_data *get_new_job(bool global, struct thread_data *parent,
+				       bool preserve_eo, const char *jobname)
 {
 	struct thread_data *td;
 
-	if (global) {
-		set_cmd_options(&def_thread);
+	if (global)
 		return &def_thread;
-	}
 	if (setup_thread_area()) {
 		log_err("error: failed to setup shm segment\n");
 		return NULL;
@@ -403,7 +454,12 @@
 	td = &threads[thread_number++];
 	*td = *parent;
 
+	INIT_FLIST_HEAD(&td->opt_list);
+	if (parent != &def_thread)
+		copy_opt_list(td, parent);
+
 	td->io_ops = NULL;
+	td->io_ops_init = 0;
 	if (!preserve_eo)
 		td->eo = NULL;
 
@@ -420,10 +476,9 @@
 	if (jobname)
 		td->o.name = strdup(jobname);
 
-	if (!parent->o.group_reporting)
+	if (!parent->o.group_reporting || parent == &def_thread)
 		stat_number++;
 
-	set_cmd_options(td);
 	return td;
 }
 
@@ -439,6 +494,7 @@
 		log_info("fio: %s\n", td->verror);
 
 	fio_options_free(td);
+	fio_dump_options_free(td);
 	if (td->io_ops)
 		free_ioengine(td);
 
@@ -458,14 +514,16 @@
 	if (td->o.rate[ddir])
 		td->rate_bps[ddir] = td->o.rate[ddir];
 	else
-		td->rate_bps[ddir] = td->o.rate_iops[ddir] * bs;
+		td->rate_bps[ddir] = (uint64_t) td->o.rate_iops[ddir] * bs;
 
 	if (!td->rate_bps[ddir]) {
 		log_err("rate lower than supported\n");
 		return -1;
 	}
 
-	td->rate_pending_usleep[ddir] = 0;
+	td->rate_next_io_time[ddir] = 0;
+	td->rate_io_issue_bytes[ddir] = 0;
+	td->last_usec[ddir] = 0;
 	return 0;
 }
 
@@ -496,18 +554,31 @@
 static unsigned long long get_rand_start_delay(struct thread_data *td)
 {
 	unsigned long long delayrange;
+	uint64_t frand_max;
 	unsigned long r;
 
 	delayrange = td->o.start_delay_high - td->o.start_delay;
 
+	frand_max = rand_max(&td->delay_state);
 	r = __rand(&td->delay_state);
-	delayrange = (unsigned long long) ((double) delayrange * (r / (FRAND_MAX + 1.0)));
+	delayrange = (unsigned long long) ((double) delayrange * (r / (frand_max + 1.0)));
 
 	delayrange += td->o.start_delay;
 	return delayrange;
 }
 
 /*
+ * <3 Johannes
+ */
+static unsigned int gcd(unsigned int m, unsigned int n)
+{
+	if (!n)
+		return m;
+
+	return gcd(n, m % n);
+}
+
+/*
  * Lazy way of fixing up options that depend on each other. We could also
  * define option callback handlers, but this is easier.
  */
@@ -516,7 +587,7 @@
 	struct thread_options *o = &td->o;
 	int ret = 0;
 
-#ifndef FIO_HAVE_PSHARED_MUTEX
+#ifndef CONFIG_PSHARED
 	if (!o->use_thread) {
 		log_info("fio: this platform does not support process shared"
 			 " mutexes, forcing use of threads. Use the 'thread'"
@@ -549,7 +620,7 @@
 	/*
 	 * Reads can do overwrites, we always need to pre-create the file
 	 */
-	if (td_read(td) || td_rw(td))
+	if (td_read(td))
 		o->overwrite = 1;
 
 	if (!o->min_bs[DDIR_READ])
@@ -599,7 +670,7 @@
 			"verify limited\n");
 		ret = warnings_fatal;
 	}
-	if (o->bs_unaligned && (o->odirect || td->io_ops->flags & FIO_RAWIO))
+	if (o->bs_unaligned && (o->odirect || td_ioengine_flagged(td, FIO_RAWIO)))
 		log_err("fio: bs_unaligned may not work with raw io\n");
 
 	/*
@@ -620,6 +691,13 @@
 	if (o->iodepth_batch > o->iodepth || !o->iodepth_batch)
 		o->iodepth_batch = o->iodepth;
 
+	/*
+	 * If max batch complete number isn't set or set incorrectly,
+	 * default to the same as iodepth_batch_complete_min
+	 */
+	if (o->iodepth_batch_complete_min > o->iodepth_batch_complete_max)
+		o->iodepth_batch_complete_max = o->iodepth_batch_complete_min;
+
 	if (o->nr_files > td->files_index)
 		o->nr_files = td->files_index;
 
@@ -633,12 +711,12 @@
 		log_err("fio: rate and rate_iops are mutually exclusive\n");
 		ret = 1;
 	}
-	if ((o->rate[DDIR_READ] < o->ratemin[DDIR_READ]) ||
-	    (o->rate[DDIR_WRITE] < o->ratemin[DDIR_WRITE]) ||
-	    (o->rate[DDIR_TRIM] < o->ratemin[DDIR_TRIM]) ||
-	    (o->rate_iops[DDIR_READ] < o->rate_iops_min[DDIR_READ]) ||
-	    (o->rate_iops[DDIR_WRITE] < o->rate_iops_min[DDIR_WRITE]) ||
-	    (o->rate_iops[DDIR_TRIM] < o->rate_iops_min[DDIR_TRIM])) {
+	if ((o->rate[DDIR_READ] && (o->rate[DDIR_READ] < o->ratemin[DDIR_READ])) ||
+	    (o->rate[DDIR_WRITE] && (o->rate[DDIR_WRITE] < o->ratemin[DDIR_WRITE])) ||
+	    (o->rate[DDIR_TRIM] && (o->rate[DDIR_TRIM] < o->ratemin[DDIR_TRIM])) ||
+	    (o->rate_iops[DDIR_READ] && (o->rate_iops[DDIR_READ] < o->rate_iops_min[DDIR_READ])) ||
+	    (o->rate_iops[DDIR_WRITE] && (o->rate_iops[DDIR_WRITE] < o->rate_iops_min[DDIR_WRITE])) ||
+	    (o->rate_iops[DDIR_TRIM] && (o->rate_iops[DDIR_TRIM] < o->rate_iops_min[DDIR_TRIM]))) {
 		log_err("fio: minimum rate exceeds rate\n");
 		ret = 1;
 	}
@@ -660,7 +738,9 @@
 			ret = warnings_fatal;
 		}
 
-		o->refill_buffers = 1;
+		if (!fio_option_is_set(o, refill_buffers))
+			o->refill_buffers = 1;
+
 		if (o->max_bs[DDIR_WRITE] != o->min_bs[DDIR_WRITE] &&
 		    !o->verify_interval)
 			o->verify_interval = o->min_bs[DDIR_WRITE];
@@ -673,11 +753,21 @@
 			o->verify_interval = o->min_bs[DDIR_WRITE];
 		else if (td_read(td) && o->verify_interval > o->min_bs[DDIR_READ])
 			o->verify_interval = o->min_bs[DDIR_READ];
+
+		/*
+		 * Verify interval must be a factor of both min and max
+		 * write size
+		 */
+		if (o->verify_interval % o->min_bs[DDIR_WRITE] ||
+		    o->verify_interval % o->max_bs[DDIR_WRITE])
+			o->verify_interval = gcd(o->min_bs[DDIR_WRITE],
+							o->max_bs[DDIR_WRITE]);
 	}
 
 	if (o->pre_read) {
-		o->invalidate_cache = 0;
-		if (td->io_ops->flags & FIO_PIPEIO) {
+		if (o->invalidate_cache)
+			o->invalidate_cache = 0;
+		if (td_ioengine_flagged(td, FIO_PIPEIO)) {
 			log_info("fio: cannot pre-read files with an IO engine"
 				 " that isn't seekable. Pre-read disabled.\n");
 			ret = warnings_fatal;
@@ -685,7 +775,7 @@
 	}
 
 	if (!o->unit_base) {
-		if (td->io_ops->flags & FIO_BIT_BASED)
+		if (td_ioengine_flagged(td, FIO_BIT_BASED))
 			o->unit_base = 1;
 		else
 			o->unit_base = 8;
@@ -708,7 +798,7 @@
 	 * Windows doesn't support O_DIRECT or O_SYNC with the _open interface,
 	 * so fail if we're passed those flags
 	 */
-	if ((td->io_ops->flags & FIO_SYNCIO) && (td->o.odirect || td->o.sync_io)) {
+	if (td_ioengine_flagged(td, FIO_SYNCIO) && (td->o.odirect || td->o.sync_io)) {
 		log_err("fio: Windows does not support direct or non-buffered io with"
 				" the synchronous ioengines. Use the 'windowsaio' ioengine"
 				" with 'direct=1' and 'iodepth=1' instead.\n");
@@ -718,11 +808,16 @@
 
 	/*
 	 * For fully compressible data, just zero them at init time.
-	 * It's faster than repeatedly filling it.
+	 * It's faster than repeatedly filling it. For non-zero
+	 * compression, we should have refill_buffers set. Set it, unless
+	 * the job file already changed it.
 	 */
-	if (td->o.compress_percentage == 100) {
-		td->o.zero_buffers = 1;
-		td->o.compress_percentage = 0;
+	if (o->compress_percentage) {
+		if (o->compress_percentage == 100) {
+			o->zero_buffers = 1;
+			o->compress_percentage = 0;
+		} else if (!fio_option_is_set(o, refill_buffers))
+			o->refill_buffers = 1;
 	}
 
 	/*
@@ -736,7 +831,8 @@
 	 * If size is set but less than the min block size, complain
 	 */
 	if (o->size && o->size < td_min_bs(td)) {
-		log_err("fio: size too small, must be larger than the IO size: %llu\n", (unsigned long long) o->size);
+		log_err("fio: size too small, must not be less than minimum block size: %llu < %u\n",
+			(unsigned long long) o->size, td_min_bs(td));
 		ret = 1;
 	}
 
@@ -749,10 +845,10 @@
 	/*
 	 * If randseed is set, that overrides randrepeat
 	 */
-	if (td->o.rand_seed)
+	if (fio_option_is_set(&td->o, rand_seed))
 		td->o.rand_repeatable = 0;
 
-	if ((td->io_ops->flags & FIO_NOEXTEND) && td->o.file_append) {
+	if (td_ioengine_flagged(td, FIO_NOEXTEND) && td->o.file_append) {
 		log_err("fio: can't append/extent with IO engine %s\n", td->io_ops->name);
 		ret = 1;
 	}
@@ -767,30 +863,16 @@
 	if (!td->loops)
 		td->loops = 1;
 
+	if (td->o.block_error_hist && td->o.nr_files != 1) {
+		log_err("fio: block error histogram only available "
+			"with a single file per job, but %d files "
+			"provided\n", td->o.nr_files);
+		ret = 1;
+	}
+
 	return ret;
 }
 
-/*
- * This function leaks the buffer
- */
-char *fio_uint_to_kmg(unsigned int val)
-{
-	char *buf = malloc(32);
-	char post[] = { 0, 'K', 'M', 'G', 'P', 'E', 0 };
-	char *p = post;
-
-	do {
-		if (val & 1023)
-			break;
-
-		val >>= 10;
-		p++;
-	} while (*p);
-
-	snprintf(buf, 32, "%u%c", val, *p);
-	return buf;
-}
-
 /* External engines are specified by "external:name.o") */
 static const char *get_engine_name(const char *str)
 {
@@ -805,33 +887,57 @@
 	return p;
 }
 
-static int exists_and_not_file(const char *filename)
+static void init_rand_file_service(struct thread_data *td)
 {
-	struct stat sb;
+	unsigned long nranges = td->o.nr_files << FIO_FSERVICE_SHIFT;
+	const unsigned int seed = td->rand_seeds[FIO_RAND_FILE_OFF];
 
-	if (lstat(filename, &sb) == -1)
-		return 0;
-
-	/* \\.\ is the device namespace in Windows, where every file
-	 * is a device node */
-	if (S_ISREG(sb.st_mode) && strncmp(filename, "\\\\.\\", 4) != 0)
-		return 0;
-
-	return 1;
+	if (td->o.file_service_type == FIO_FSERVICE_ZIPF) {
+		zipf_init(&td->next_file_zipf, nranges, td->zipf_theta, seed);
+		zipf_disable_hash(&td->next_file_zipf);
+	} else if (td->o.file_service_type == FIO_FSERVICE_PARETO) {
+		pareto_init(&td->next_file_zipf, nranges, td->pareto_h, seed);
+		zipf_disable_hash(&td->next_file_zipf);
+	} else if (td->o.file_service_type == FIO_FSERVICE_GAUSS) {
+		gauss_init(&td->next_file_gauss, nranges, td->gauss_dev, seed);
+		gauss_disable_hash(&td->next_file_gauss);
+	}
 }
 
-static void td_fill_rand_seeds_internal(struct thread_data *td)
+void td_fill_verify_state_seed(struct thread_data *td)
 {
-	init_rand_seed(&td->bsrange_state, td->rand_seeds[FIO_RAND_BS_OFF]);
-	init_rand_seed(&td->verify_state, td->rand_seeds[FIO_RAND_VER_OFF]);
-	init_rand_seed(&td->rwmix_state, td->rand_seeds[FIO_RAND_MIX_OFF]);
+	bool use64;
+
+	if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64)
+		use64 = 1;
+	else
+		use64 = 0;
+
+	init_rand_seed(&td->verify_state, td->rand_seeds[FIO_RAND_VER_OFF],
+		use64);
+}
+
+static void td_fill_rand_seeds_internal(struct thread_data *td, bool use64)
+{
+	int i;
+
+	init_rand_seed(&td->bsrange_state, td->rand_seeds[FIO_RAND_BS_OFF], use64);
+	td_fill_verify_state_seed(td);
+	init_rand_seed(&td->rwmix_state, td->rand_seeds[FIO_RAND_MIX_OFF], false);
 
 	if (td->o.file_service_type == FIO_FSERVICE_RANDOM)
-		init_rand_seed(&td->next_file_state, td->rand_seeds[FIO_RAND_FILE_OFF]);
+		init_rand_seed(&td->next_file_state, td->rand_seeds[FIO_RAND_FILE_OFF], use64);
+	else if (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM)
+		init_rand_file_service(td);
 
-	init_rand_seed(&td->file_size_state, td->rand_seeds[FIO_RAND_FILE_SIZE_OFF]);
-	init_rand_seed(&td->trim_state, td->rand_seeds[FIO_RAND_TRIM_OFF]);
-	init_rand_seed(&td->delay_state, td->rand_seeds[FIO_RAND_START_DELAY]);
+	init_rand_seed(&td->file_size_state, td->rand_seeds[FIO_RAND_FILE_SIZE_OFF], use64);
+	init_rand_seed(&td->trim_state, td->rand_seeds[FIO_RAND_TRIM_OFF], use64);
+	init_rand_seed(&td->delay_state, td->rand_seeds[FIO_RAND_START_DELAY], use64);
+	init_rand_seed(&td->poisson_state[0], td->rand_seeds[FIO_RAND_POISSON_OFF], 0);
+	init_rand_seed(&td->poisson_state[1], td->rand_seeds[FIO_RAND_POISSON2_OFF], 0);
+	init_rand_seed(&td->poisson_state[2], td->rand_seeds[FIO_RAND_POISSON3_OFF], 0);
+	init_rand_seed(&td->dedupe_state, td->rand_seeds[FIO_DEDUPE_OFF], false);
+	init_rand_seed(&td->zone_state, td->rand_seeds[FIO_RAND_ZONE_OFF], false);
 
 	if (!td_random(td))
 		return;
@@ -839,14 +945,19 @@
 	if (td->o.rand_repeatable)
 		td->rand_seeds[FIO_RAND_BLOCK_OFF] = FIO_RANDSEED * td->thread_number;
 
-	init_rand_seed(&td->random_state, td->rand_seeds[FIO_RAND_BLOCK_OFF]);
-	init_rand_seed(&td->seq_rand_state[DDIR_READ], td->rand_seeds[FIO_RAND_SEQ_RAND_READ_OFF]);
-	init_rand_seed(&td->seq_rand_state[DDIR_WRITE], td->rand_seeds[FIO_RAND_SEQ_RAND_WRITE_OFF]);
-	init_rand_seed(&td->seq_rand_state[DDIR_TRIM], td->rand_seeds[FIO_RAND_SEQ_RAND_TRIM_OFF]);
+	init_rand_seed(&td->random_state, td->rand_seeds[FIO_RAND_BLOCK_OFF], use64);
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		struct frand_state *s = &td->seq_rand_state[i];
+
+		init_rand_seed(s, td->rand_seeds[FIO_RAND_SEQ_RAND_READ_OFF], false);
+	}
 }
 
 void td_fill_rand_seeds(struct thread_data *td)
 {
+	bool use64;
+
 	if (td->o.allrand_repeatable) {
 		unsigned int i;
 
@@ -855,12 +966,15 @@
 			       	+ i;
 	}
 
-	td_fill_rand_seeds_internal(td);
+	if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64)
+		use64 = 1;
+	else
+		use64 = 0;
 
-	init_rand_seed(&td->buf_state, td->rand_seeds[FIO_RAND_BUF_OFF]);
+	td_fill_rand_seeds_internal(td, use64);
+
+	init_rand_seed(&td->buf_state, td->rand_seeds[FIO_RAND_BUF_OFF], use64);
 	frand_copy(&td->buf_state_prev, &td->buf_state);
-
-	init_rand_seed(&td->dedupe_state, td->rand_seeds[FIO_DEDUPE_OFF]);
 }
 
 /*
@@ -912,7 +1026,7 @@
 		 */
 		if (origeo) {
 			memcpy(td->eo, origeo, td->io_ops->option_struct_size);
-			options_mem_dupe(td->eo, td->io_ops->options);
+			options_mem_dupe(td->io_ops->options, td->eo);
 		} else {
 			memset(td->eo, 0, td->io_ops->option_struct_size);
 			fill_default_options(td->eo, td->io_ops->options);
@@ -920,6 +1034,10 @@
 		*(struct thread_data **)td->eo = td;
 	}
 
+	if (td->o.odirect)
+		td->io_ops->flags |= FIO_RAWIO;
+
+	td_set_ioengine_flags(td);
 	return 0;
 }
 
@@ -949,6 +1067,12 @@
 		td->flags |= TD_F_SCRAMBLE_BUFFERS;
 	if (o->verify != VERIFY_NONE)
 		td->flags |= TD_F_VER_NONE;
+
+	if (o->verify_async || o->io_submit_mode == IO_MODE_OFFLOAD)
+		td->flags |= TD_F_NEED_LOCK;
+
+	if (o->mem_type == MEM_CUDA_MALLOC)
+		td->flags &= ~TD_F_SCRAMBLE_BUFFERS;
 }
 
 static int setup_random_seeds(struct thread_data *td)
@@ -956,19 +1080,15 @@
 	unsigned long seed;
 	unsigned int i;
 
-	if (!td->o.rand_repeatable && !td->o.rand_seed)
+	if (!td->o.rand_repeatable && !fio_option_is_set(&td->o, rand_seed))
 		return init_random_state(td, td->rand_seeds, sizeof(td->rand_seeds));
 
-	if (!td->o.rand_seed)
-		seed = 0x89;
-	else
-		seed = td->o.rand_seed;
-
+	seed = td->o.rand_seed;
 	for (i = 0; i < 4; i++)
 		seed *= 0x9e370001UL;
 
 	for (i = 0; i < FIO_RAND_NR_OFFS; i++) {
-		td->rand_seeds[i] = seed;
+		td->rand_seeds[i] = seed * td->thread_number + i;
 		seed *= 0x9e370001UL;
 	}
 
@@ -1003,7 +1123,7 @@
 
 	if (!o->filename_format || !strlen(o->filename_format)) {
 		sprintf(buf, "%s.%d.%d", jobname, jobnum, filenum);
-		return NULL;
+		return buf;
 	}
 
 	for (f = &fpre_keywords[0]; f->keyword; f++)
@@ -1096,11 +1216,64 @@
 	return buf;
 }
 
-int parse_dryrun(void)
+bool parse_dryrun(void)
 {
 	return dump_cmdline || parse_only;
 }
 
+static void gen_log_name(char *name, size_t size, const char *logtype,
+			 const char *logname, unsigned int num,
+			 const char *suf, int per_job)
+{
+	if (per_job)
+		snprintf(name, size, "%s_%s.%d.%s", logname, logtype, num, suf);
+	else
+		snprintf(name, size, "%s_%s.%s", logname, logtype, suf);
+}
+
+static int check_waitees(char *waitee)
+{
+	struct thread_data *td;
+	int i, ret = 0;
+
+	for_each_td(td, i) {
+		if (td->subjob_number)
+			continue;
+
+		ret += !strcmp(td->o.name, waitee);
+	}
+
+	return ret;
+}
+
+static bool wait_for_ok(const char *jobname, struct thread_options *o)
+{
+	int nw;
+
+	if (!o->wait_for)
+		return true;
+
+	if (!strcmp(jobname, o->wait_for)) {
+		log_err("%s: a job cannot wait for itself (wait_for=%s).\n",
+				jobname, o->wait_for);
+		return false;
+	}
+
+	if (!(nw = check_waitees(o->wait_for))) {
+		log_err("%s: waitee job %s unknown.\n", jobname, o->wait_for);
+		return false;
+	}
+
+	if (nw > 1) {
+		log_err("%s: multiple waitees %s found,\n"
+			"please avoid duplicates when using wait_for option.\n",
+				jobname, o->wait_for);
+		return false;
+	}
+
+	return true;
+}
+
 /*
  * Adds a job to the list of things todo. Sanitizes the various options
  * to make sure we don't have conflicts, and initializes various
@@ -1139,14 +1312,11 @@
 	if (ioengine_load(td))
 		goto err;
 
-	if (o->odirect)
-		td->io_ops->flags |= FIO_RAWIO;
-
 	file_alloced = 0;
 	if (!o->filename && !td->files_index && !o->read_iolog_file) {
 		file_alloced = 1;
 
-		if (o->nr_files == 1 && exists_and_not_file(jobname))
+		if (o->nr_files == 1 && exists_and_not_regfile(jobname))
 			add_file(td, jobname, job_add_num, 0);
 		else {
 			for (i = 0; i < o->nr_files; i++)
@@ -1157,6 +1327,12 @@
 	if (fixup_options(td))
 		goto err;
 
+	/*
+	 * Belongs to fixup_options, but o->name is not necessarily set as yet
+	 */
+	if (!wait_for_ok(jobname, o))
+		goto err;
+
 	flow_init_job(td);
 
 	/*
@@ -1166,7 +1342,7 @@
 	if (td->eo)
 		*(struct thread_data **)td->eo = NULL;
 
-	if (td->io_ops->flags & FIO_DISKLESSIO) {
+	if (td_ioengine_flagged(td, FIO_DISKLESSIO)) {
 		struct fio_file *f;
 
 		for_each_file(td, f, i)
@@ -1190,6 +1366,10 @@
 	if ((o->stonewall || o->new_group) && prev_group_jobs) {
 		prev_group_jobs = 0;
 		groupid++;
+		if (groupid == INT_MAX) {
+			log_err("fio: too many groups defined\n");
+			goto err;
+		}
 	}
 
 	td->groupid = groupid;
@@ -1203,15 +1383,18 @@
 	if (setup_rate(td))
 		goto err;
 
-	if (o->lat_log_file) {
+	if (o->write_lat_log) {
 		struct log_params p = {
 			.td = td,
 			.avg_msec = o->log_avg_msec,
+			.hist_msec = o->log_hist_msec,
+			.hist_coarseness = o->log_hist_coarseness,
 			.log_type = IO_LOG_TYPE_LAT,
 			.log_offset = o->log_offset,
 			.log_gz = o->log_gz,
 			.log_gz_store = o->log_gz_store,
 		};
+		const char *pre = o->lat_log_file ? o->lat_log_file : o->name;
 		const char *suf;
 
 		if (p.log_gz_store)
@@ -1219,77 +1402,134 @@
 		else
 			suf = "log";
 
-		snprintf(logname, sizeof(logname), "%s_lat.%d.%s",
-				o->lat_log_file, td->thread_number, suf);
+		gen_log_name(logname, sizeof(logname), "lat", pre,
+				td->thread_number, suf, o->per_job_logs);
 		setup_log(&td->lat_log, &p, logname);
-		snprintf(logname, sizeof(logname), "%s_slat.%d.%s",
-				o->lat_log_file, td->thread_number, suf);
+
+		gen_log_name(logname, sizeof(logname), "slat", pre,
+				td->thread_number, suf, o->per_job_logs);
 		setup_log(&td->slat_log, &p, logname);
-		snprintf(logname, sizeof(logname), "%s_clat.%d.%s",
-				o->lat_log_file, td->thread_number, suf);
+
+		gen_log_name(logname, sizeof(logname), "clat", pre,
+				td->thread_number, suf, o->per_job_logs);
 		setup_log(&td->clat_log, &p, logname);
 	}
-	if (o->bw_log_file) {
+
+	if (o->write_hist_log) {
 		struct log_params p = {
 			.td = td,
 			.avg_msec = o->log_avg_msec,
+			.hist_msec = o->log_hist_msec,
+			.hist_coarseness = o->log_hist_coarseness,
+			.log_type = IO_LOG_TYPE_HIST,
+			.log_offset = o->log_offset,
+			.log_gz = o->log_gz,
+			.log_gz_store = o->log_gz_store,
+		};
+		const char *pre = o->hist_log_file ? o->hist_log_file : o->name;
+		const char *suf;
+
+#ifndef CONFIG_ZLIB
+		if (td->client_type) {
+			log_err("fio: --write_hist_log requires zlib in client/server mode\n");
+			goto err;
+		}
+#endif
+
+		if (p.log_gz_store)
+			suf = "log.fz";
+		else
+			suf = "log";
+
+		gen_log_name(logname, sizeof(logname), "clat_hist", pre,
+				td->thread_number, suf, o->per_job_logs);
+		setup_log(&td->clat_hist_log, &p, logname);
+	}
+
+	if (o->write_bw_log) {
+		struct log_params p = {
+			.td = td,
+			.avg_msec = o->log_avg_msec,
+			.hist_msec = o->log_hist_msec,
+			.hist_coarseness = o->log_hist_coarseness,
 			.log_type = IO_LOG_TYPE_BW,
 			.log_offset = o->log_offset,
 			.log_gz = o->log_gz,
 			.log_gz_store = o->log_gz_store,
 		};
+		const char *pre = o->bw_log_file ? o->bw_log_file : o->name;
 		const char *suf;
 
+		if (fio_option_is_set(o, bw_avg_time))
+			p.avg_msec = min(o->log_avg_msec, o->bw_avg_time);
+		else
+			o->bw_avg_time = p.avg_msec;
+	
+		p.hist_msec = o->log_hist_msec;
+		p.hist_coarseness = o->log_hist_coarseness;
+
 		if (p.log_gz_store)
 			suf = "log.fz";
 		else
 			suf = "log";
 
-		snprintf(logname, sizeof(logname), "%s_bw.%d.%s",
-				o->bw_log_file, td->thread_number, suf);
+		gen_log_name(logname, sizeof(logname), "bw", pre,
+				td->thread_number, suf, o->per_job_logs);
 		setup_log(&td->bw_log, &p, logname);
 	}
-	if (o->iops_log_file) {
+	if (o->write_iops_log) {
 		struct log_params p = {
 			.td = td,
 			.avg_msec = o->log_avg_msec,
+			.hist_msec = o->log_hist_msec,
+			.hist_coarseness = o->log_hist_coarseness,
 			.log_type = IO_LOG_TYPE_IOPS,
 			.log_offset = o->log_offset,
 			.log_gz = o->log_gz,
 			.log_gz_store = o->log_gz_store,
 		};
+		const char *pre = o->iops_log_file ? o->iops_log_file : o->name;
 		const char *suf;
 
+		if (fio_option_is_set(o, iops_avg_time))
+			p.avg_msec = min(o->log_avg_msec, o->iops_avg_time);
+		else
+			o->iops_avg_time = p.avg_msec;
+	
+		p.hist_msec = o->log_hist_msec;
+		p.hist_coarseness = o->log_hist_coarseness;
+
 		if (p.log_gz_store)
 			suf = "log.fz";
 		else
 			suf = "log";
 
-		snprintf(logname, sizeof(logname), "%s_iops.%d.%s",
-				o->iops_log_file, td->thread_number, suf);
+		gen_log_name(logname, sizeof(logname), "iops", pre,
+				td->thread_number, suf, o->per_job_logs);
 		setup_log(&td->iops_log, &p, logname);
 	}
 
 	if (!o->name)
 		o->name = strdup(jobname);
 
-	if (output_format == FIO_OUTPUT_NORMAL) {
+	if (output_format & FIO_OUTPUT_NORMAL) {
 		if (!job_add_num) {
 			if (is_backend && !recursed)
 				fio_server_send_add_job(td);
 
-			if (!(td->io_ops->flags & FIO_NOIO)) {
+			if (!td_ioengine_flagged(td, FIO_NOIO)) {
 				char *c1, *c2, *c3, *c4;
 				char *c5 = NULL, *c6 = NULL;
+				int i2p = is_power_of_2(o->kb_base);
 
-				c1 = fio_uint_to_kmg(o->min_bs[DDIR_READ]);
-				c2 = fio_uint_to_kmg(o->max_bs[DDIR_READ]);
-				c3 = fio_uint_to_kmg(o->min_bs[DDIR_WRITE]);
-				c4 = fio_uint_to_kmg(o->max_bs[DDIR_WRITE]);
+				c1 = num2str(o->min_bs[DDIR_READ], 4, 1, i2p, N2S_BYTE);
+				c2 = num2str(o->max_bs[DDIR_READ], 4, 1, i2p, N2S_BYTE);
+				c3 = num2str(o->min_bs[DDIR_WRITE], 4, 1, i2p, N2S_BYTE);
+				c4 = num2str(o->max_bs[DDIR_WRITE], 4, 1, i2p, N2S_BYTE);
 
 				if (!o->bs_is_seq_rand) {
-					c5 = fio_uint_to_kmg(o->min_bs[DDIR_TRIM]);
-					c6 = fio_uint_to_kmg(o->max_bs[DDIR_TRIM]);
+					c5 = num2str(o->min_bs[DDIR_TRIM], 4, 1, i2p, N2S_BYTE);
+					c6 = num2str(o->max_bs[DDIR_TRIM], 4, 1, i2p, N2S_BYTE);
 				}
 
 				log_info("%s: (g=%d): rw=%s, ", td->o.name,
@@ -1297,10 +1537,10 @@
 							ddir_str(o->td_ddir));
 
 				if (o->bs_is_seq_rand)
-					log_info("bs(seq/rand)=%s-%s/%s-%s, ",
+					log_info("bs=(R) %s-%s, (W) %s-%s, bs_is_seq_rand, ",
 							c1, c2, c3, c4);
 				else
-					log_info("bs=%s-%s/%s-%s/%s-%s, ",
+					log_info("bs=(R) %s-%s, (W) %s-%s, (T) %s-%s, ",
 							c1, c2, c3, c4, c5, c6);
 
 				log_info("ioengine=%s, iodepth=%u\n",
@@ -1317,13 +1557,16 @@
 			log_info("...\n");
 	}
 
+	if (td_steadystate_init(td))
+		goto err;
+
 	/*
 	 * recurse add identical jobs, clear numjobs and stonewall options
 	 * as they don't apply to sub-jobs
 	 */
 	numjobs = o->numjobs;
 	while (--numjobs) {
-		struct thread_data *td_new = get_new_job(0, td, 1, jobname);
+		struct thread_data *td_new = get_new_job(false, td, true, jobname);
 
 		if (!td_new)
 			goto err;
@@ -1332,6 +1575,8 @@
 		td_new->o.stonewall = 0;
 		td_new->o.new_group = 0;
 		td_new->subjob_number = numjobs;
+		td_new->o.ss_dur = o->ss_dur * 1000000l;
+		td_new->o.ss_limit = o->ss_limit;
 
 		if (file_alloced) {
 			if (td_new->files) {
@@ -1382,16 +1627,16 @@
 			sprintf(jobname, "%s", o[i] + 5);
 		}
 		if (in_global && !td_parent)
-			td_parent = get_new_job(1, &def_thread, 0, jobname);
+			td_parent = get_new_job(true, &def_thread, false, jobname);
 		else if (!in_global && !td) {
 			if (!td_parent)
 				td_parent = &def_thread;
-			td = get_new_job(0, td_parent, 0, jobname);
+			td = get_new_job(false, td_parent, false, jobname);
 		}
 		if (in_global)
-			fio_options_parse(td_parent, (char **) &o[i], 1, 0);
+			fio_options_parse(td_parent, (char **) &o[i], 1);
 		else
-			fio_options_parse(td, (char **) &o[i], 1, 0);
+			fio_options_parse(td, (char **) &o[i], 1);
 		i++;
 	}
 
@@ -1434,11 +1679,11 @@
 /*
  * This is our [ini] type file parser.
  */
-int __parse_jobs_ini(struct thread_data *td,
+static int __parse_jobs_ini(struct thread_data *td,
 		char *file, int is_buf, int stonewall_flag, int type,
 		int nested, char *name, char ***popts, int *aopts, int *nopts)
 {
-	unsigned int global = 0;
+	bool global = false;
 	char *string;
 	FILE *f;
 	char *p;
@@ -1547,7 +1792,7 @@
 				first_sect = 0;
 			}
 
-			td = get_new_job(global, &def_thread, 0, name);
+			td = get_new_job(global, &def_thread, false, name);
 			if (!td) {
 				ret = 1;
 				break;
@@ -1602,15 +1847,48 @@
 			strip_blank_end(p);
 
 			if (!strncmp(p, "include", strlen("include"))) {
-				char *filename = p + strlen("include") + 1;
+				char *filename = p + strlen("include") + 1,
+					*ts, *full_fn = NULL;
 
-				if ((ret = __parse_jobs_ini(td, filename,
-						is_buf, stonewall_flag, type, 1,
-						name, &opts, &alloc_opts, &num_opts))) {
-					log_err("Error %d while parsing include file %s\n",
-						ret, filename);
-					break;
+				/*
+				 * Allow for the include filename
+				 * specification to be relative.
+				 */
+				if (access(filename, F_OK) &&
+				    (ts = strrchr(file, '/'))) {
+					int len = ts - file +
+						strlen(filename) + 2;
+
+					if (!(full_fn = calloc(1, len))) {
+						ret = ENOMEM;
+						break;
+					}
+
+					strncpy(full_fn,
+						file, (ts - file) + 1);
+					strncpy(full_fn + (ts - file) + 1,
+						filename, strlen(filename));
+					full_fn[len - 1] = 0;
+					filename = full_fn;
 				}
+
+				ret = __parse_jobs_ini(td, filename, is_buf,
+						       stonewall_flag, type, 1,
+						       name, &opts,
+						       &alloc_opts, &num_opts);
+
+				if (ret) {
+					log_err("Error %d while parsing "
+						"include file %s\n",
+						ret, filename);
+				}
+
+				if (full_fn)
+					free(full_fn);
+
+				if (ret)
+					break;
+
 				continue;
 			}
 
@@ -1631,10 +1909,13 @@
 			goto out;
 		}
 
-		ret = fio_options_parse(td, opts, num_opts, dump_cmdline);
-		if (!ret)
+		ret = fio_options_parse(td, opts, num_opts);
+		if (!ret) {
+			if (dump_cmdline)
+				dump_opt_list(td);
+
 			ret = add_job(td, name, 0, 0, type);
-		else {
+		} else {
 			log_err("fio: job %s dropped\n", name);
 			put_job(td);
 		}
@@ -1672,6 +1953,7 @@
 static int fill_def_thread(void)
 {
 	memset(&def_thread, 0, sizeof(def_thread));
+	INIT_FLIST_HEAD(&def_thread.opt_list);
 
 	fio_getaffinity(getpid(), &def_thread.o.cpumask);
 	def_thread.o.error_dump = 1;
@@ -1683,24 +1965,59 @@
 	return 0;
 }
 
+static void show_debug_categories(void)
+{
+#ifdef FIO_INC_DEBUG
+	struct debug_level *dl = &debug_levels[0];
+	int curlen, first = 1;
+
+	curlen = 0;
+	while (dl->name) {
+		int has_next = (dl + 1)->name != NULL;
+
+		if (first || curlen + strlen(dl->name) >= 80) {
+			if (!first) {
+				printf("\n");
+				curlen = 0;
+			}
+			curlen += printf("\t\t\t%s", dl->name);
+			curlen += 3 * (8 - 1);
+			if (has_next)
+				curlen += printf(",");
+		} else {
+			curlen += printf("%s", dl->name);
+			if (has_next)
+				curlen += printf(",");
+		}
+		dl++;
+		first = 0;
+	}
+	printf("\n");
+#endif
+}
+
+/*
+ * Following options aren't printed by usage().
+ * --append-terse - Equivalent to --output-format=terse, see f6a7df53.
+ * --latency-log - Deprecated option.
+ */
 static void usage(const char *name)
 {
 	printf("%s\n", fio_version_string);
 	printf("%s [options] [job options] <job file(s)>\n", name);
-	printf("  --debug=options\tEnable debug logging. May be one/more of:\n"
-		"\t\t\tprocess,file,io,mem,blktrace,verify,random,parse,\n"
-		"\t\t\tdiskutil,job,mutex,profile,time,net,rate,compress\n");
+	printf("  --debug=options\tEnable debug logging. May be one/more of:\n");
+	show_debug_categories();
 	printf("  --parse-only\t\tParse options only, don't start any IO\n");
 	printf("  --output\t\tWrite output to file\n");
-	printf("  --runtime\t\tRuntime in seconds\n");
-	printf("  --bandwidth-log\tGenerate per-job bandwidth logs\n");
+	printf("  --bandwidth-log\tGenerate aggregate bandwidth logs\n");
 	printf("  --minimal\t\tMinimal (terse) output\n");
-	printf("  --output-format=x\tOutput format (terse,json,normal)\n");
-	printf("  --terse-version=x\tSet terse version output format to 'x'\n");
+	printf("  --output-format=type\tOutput format (terse,json,json+,normal)\n");
+	printf("  --terse-version=type\tSet terse version output format"
+		" (default 3, or 2 or 4)\n");
 	printf("  --version\t\tPrint version info and exit\n");
 	printf("  --help\t\tPrint this page\n");
 	printf("  --cpuclock-test\tPerform test/validation of CPU clock\n");
-	printf("  --crctest\t\tTest speed of checksum functions\n");
+	printf("  --crctest=type\tTest speed of checksum functions\n");
 	printf("  --cmdhelp=cmd\t\tPrint command help, \"all\" for all of"
 		" them\n");
 	printf("  --enghelp=engine\tPrint ioengine help, or list"
@@ -1716,14 +2033,15 @@
 	printf(" 't' period passed\n");
 	printf("  --readonly\t\tTurn on safety read-only checks, preventing"
 		" writes\n");
-	printf("  --section=name\tOnly run specified section in job file\n");
+	printf("  --section=name\tOnly run specified section in job file,"
+		" multiple sections can be specified\n");
 	printf("  --alloc-size=kb\tSet smalloc pool to this size in kb"
-		" (def 1024)\n");
+		" (def 16384)\n");
 	printf("  --warnings-fatal\tFio parser warnings are fatal\n");
 	printf("  --max-jobs=nr\t\tMaximum number of threads/processes to support\n");
 	printf("  --server=args\t\tStart a backend fio server\n");
 	printf("  --daemonize=pidfile\tBackground fio server, write pid to file\n");
-	printf("  --client=hostname\tTalk to remote backend fio server at hostname\n");
+	printf("  --client=hostname\tTalk to remote backend(s) fio server at hostname\n");
 	printf("  --remote-config=file\tTell fio server to load this local job file\n");
 	printf("  --idle-prof=option\tReport cpu idleness on a system or percpu basis\n"
 		"\t\t\t(option=system,percpu) or run unit work\n"
@@ -1735,6 +2053,7 @@
 	printf("  --trigger-timeout=t\tExecute trigger af this time\n");
 	printf("  --trigger=cmd\t\tSet this command as local trigger\n");
 	printf("  --trigger-remote=cmd\tSet this command as remote trigger\n");
+	printf("  --aux-path=path\tUse this path for fio state generated files\n");
 	printf("\nFio was written by Jens Axboe <jens.axboe@oracle.com>");
 	printf("\n                   Jens Axboe <jaxboe@fusionio.com>");
 	printf("\n                   Jens Axboe <axboe@fb.com>\n");
@@ -1806,6 +2125,14 @@
 	  .help = "Log compression logging",
 	  .shift = FD_COMPRESS,
 	},
+	{ .name = "steadystate",
+	  .help = "Steady state detection logging",
+	  .shift = FD_STEADYSTATE,
+	},
+	{ .name = "helperthread",
+	  .help = "Helper thread logging",
+	  .shift = FD_HELPERTHREAD,
+	},
 	{ .name = NULL, },
 };
 
@@ -1816,6 +2143,9 @@
 	char *opt;
 	int i;
 
+	if (!string)
+		return 0;
+
 	if (!strcmp(string, "?") || !strcmp(string, "help")) {
 		log_info("fio: dumping debug options:");
 		for (i = 0; debug_levels[i].name; i++) {
@@ -1932,16 +2262,47 @@
 		i++;
 	}
 
-	if (best_option != -1)
+	if (best_option != -1 && string_distance_ok(name, best_distance))
 		log_err("Did you mean %s?\n", l_opts[best_option].name);
 }
 
+static int parse_output_format(const char *optarg)
+{
+	char *p, *orig, *opt;
+	int ret = 0;
+
+	p = orig = strdup(optarg);
+
+	output_format = 0;
+
+	while ((opt = strsep(&p, ",")) != NULL) {
+		if (!strcmp(opt, "minimal") ||
+		    !strcmp(opt, "terse") ||
+		    !strcmp(opt, "csv"))
+			output_format |= FIO_OUTPUT_TERSE;
+		else if (!strcmp(opt, "json"))
+			output_format |= FIO_OUTPUT_JSON;
+		else if (!strcmp(opt, "json+"))
+			output_format |= (FIO_OUTPUT_JSON | FIO_OUTPUT_JSON_PLUS);
+		else if (!strcmp(opt, "normal"))
+			output_format |= FIO_OUTPUT_NORMAL;
+		else {
+			log_err("fio: invalid output format %s\n", opt);
+			ret = 1;
+			break;
+		}
+	}
+
+	free(orig);
+	return ret;
+}
+
 int parse_cmd_line(int argc, char *argv[], int client_type)
 {
 	struct thread_data *td = NULL;
 	int c, ini_idx = 0, lidx, ret = 0, do_exit = 0, exit_val = 0;
 	char *ostr = cmd_optstr;
-	void *pid_file = NULL;
+	char *pid_file = NULL;
 	void *cur_client = NULL;
 	int backend = 0;
 
@@ -1960,13 +2321,8 @@
 		switch (c) {
 		case 'a':
 			smalloc_pool_size = atoi(optarg);
-			break;
-		case 't':
-			if (check_str_time(optarg, &def_timeout, 1)) {
-				log_err("fio: failed parsing time %s\n", optarg);
-				do_exit++;
-				exit_val = 1;
-			}
+			smalloc_pool_size <<= 10;
+			sinit();
 			break;
 		case 'l':
 			log_err("fio: --latency-log is deprecated. Use per-job latency log options.\n");
@@ -1976,38 +2332,35 @@
 		case 'b':
 			write_bw_log = 1;
 			break;
-		case 'o':
+		case 'o': {
+			FILE *tmp;
+
 			if (f_out && f_out != stdout)
 				fclose(f_out);
 
-			f_out = fopen(optarg, "w+");
-			if (!f_out) {
-				perror("fopen output");
-				exit(1);
-			}
-			f_err = f_out;
-			break;
-		case 'm':
-			output_format = FIO_OUTPUT_TERSE;
-			break;
-		case 'F':
-			if (!optarg) {
-				log_err("fio: missing --output-format argument\n");
+			tmp = fopen(optarg, "w+");
+			if (!tmp) {
+				log_err("fio: output file open error: %s\n", strerror(errno));
 				exit_val = 1;
 				do_exit++;
 				break;
 			}
-			if (!strcmp(optarg, "minimal") ||
-			    !strcmp(optarg, "terse") ||
-			    !strcmp(optarg, "csv"))
-				output_format = FIO_OUTPUT_TERSE;
-			else if (!strcmp(optarg, "json"))
-				output_format = FIO_OUTPUT_JSON;
-			else
-				output_format = FIO_OUTPUT_NORMAL;
+			f_err = f_out = tmp;
+			break;
+			}
+		case 'm':
+			output_format = FIO_OUTPUT_TERSE;
+			break;
+		case 'F':
+			if (parse_output_format(optarg)) {
+				log_err("fio: failed parsing output-format\n");
+				exit_val = 1;
+				do_exit++;
+				break;
+			}
 			break;
 		case 'f':
-			append_terse_output = 1;
+			output_format |= FIO_OUTPUT_TERSE;
 			break;
 		case 'h':
 			did_arg = 1;
@@ -2128,13 +2481,14 @@
 				if (is_section && skip_this_section(val))
 					continue;
 
-				td = get_new_job(global, &def_thread, 1, NULL);
+				td = get_new_job(global, &def_thread, true, NULL);
 				if (!td || ioengine_load(td)) {
 					if (td) {
 						put_job(td);
 						td = NULL;
 					}
 					do_exit++;
+					exit_val = 1;
 					break;
 				}
 				fio_options_set_ioengine_opts(l_opts, td);
@@ -2153,6 +2507,7 @@
 					td = NULL;
 				}
 				do_exit++;
+				exit_val = 1;
 			}
 
 			if (!ret && !strcmp(opt, "ioengine")) {
@@ -2161,6 +2516,7 @@
 					put_job(td);
 					td = NULL;
 					do_exit++;
+					exit_val = 1;
 					break;
 				}
 				fio_options_set_ioengine_opts(l_opts, td);
@@ -2229,6 +2585,35 @@
 				exit_val = 1;
 				break;
 			}
+			/* if --client parameter contains a pathname */
+			if (0 == access(optarg, R_OK)) {
+				/* file contains a list of host addrs or names */
+				char hostaddr[PATH_MAX] = {0};
+				char formatstr[8];
+				FILE * hostf = fopen(optarg, "r");
+				if (!hostf) {
+					log_err("fio: could not open client list file %s for read\n", optarg);
+					do_exit++;
+					exit_val = 1;
+					break;
+				}
+				sprintf(formatstr, "%%%ds", PATH_MAX - 1);
+				/*
+				 * read at most PATH_MAX-1 chars from each
+				 * record in this file
+				 */
+				while (fscanf(hostf, formatstr, hostaddr) == 1) {
+					/* expect EVERY host in file to be valid */
+					if (fio_client_add(&fio_client_ops, hostaddr, &cur_client)) {
+						log_err("fio: failed adding client %s from file %s\n", hostaddr, optarg);
+						do_exit++;
+						exit_val = 1;
+						break;
+					}
+				}
+				fclose(hostf);
+				break; /* no possibility of job file for "this client only" */
+			}
 			if (fio_client_add(&fio_client_ops, optarg, &cur_client)) {
 				log_err("fio: failed adding client %s\n", optarg);
 				do_exit++;
@@ -2244,14 +2629,14 @@
 				    !strncmp(argv[optind], "-", 1))
 					break;
 
-				if (fio_client_add_ini_file(cur_client, argv[optind], 0))
+				if (fio_client_add_ini_file(cur_client, argv[optind], false))
 					break;
 				optind++;
 			}
 			break;
 		case 'R':
 			did_arg = 1;
-			if (fio_client_add_ini_file(cur_client, optarg, 1)) {
+			if (fio_client_add_ini_file(cur_client, optarg, true)) {
 				do_exit++;
 				exit_val = 1;
 			}
@@ -2293,6 +2678,11 @@
 				free(trigger_remote_cmd);
 			trigger_remote_cmd = strdup(optarg);
 			break;
+		case 'K':
+			if (aux_path)
+				free(aux_path);
+			aux_path = strdup(optarg);
+			break;
 		case 'B':
 			if (check_str_time(optarg, &trigger_timeout, 1)) {
 				log_err("fio: failed parsing time %s\n", optarg);
@@ -2329,7 +2719,7 @@
 		if (!ret) {
 			ret = add_job(td, td->o.name ?: "fio", 0, 0, client_type);
 			if (ret)
-				did_arg = 1;
+				exit(1);
 		}
 	}
 
@@ -2341,9 +2731,6 @@
 	}
 
 out_free:
-	if (pid_file)
-		free(pid_file);
-
 	return ini_idx;
 }
 
@@ -2412,7 +2799,7 @@
 		if (did_arg)
 			return 0;
 
-		log_err("No jobs(s) defined\n\n");
+		log_err("No job(s) defined\n\n");
 
 		if (!did_arg) {
 			usage(argv[0]);
@@ -2422,7 +2809,7 @@
 		return 0;
 	}
 
-	if (output_format == FIO_OUTPUT_NORMAL)
+	if (output_format & FIO_OUTPUT_NORMAL)
 		log_info("%s\n", fio_version_string);
 
 	return 0;
@@ -2432,3 +2819,8 @@
 {
 	memcpy(o, &def_thread.o, sizeof(*o));
 }
+
+struct thread_data *get_global_options(void)
+{
+	return &def_thread;
+}
diff --git a/io_ddir.h b/io_ddir.h
index b16a6b9..613d5fb 100644
--- a/io_ddir.h
+++ b/io_ddir.h
@@ -16,8 +16,9 @@
 
 static inline const char *io_ddir_name(enum fio_ddir ddir)
 {
-	const char *name[] = { "read", "write", "trim", "sync", "datasync",
-				"sync_file_range", "write", };
+	static const char *name[] = { "read", "write", "trim", "sync",
+					"datasync", "sync_file_range",
+					"wait", };
 
 	if (ddir < DDIR_LAST)
 		return name[ddir];
@@ -35,6 +36,7 @@
 	TD_DDIR_RANDWRITE	= TD_DDIR_WRITE | TD_DDIR_RAND,
 	TD_DDIR_RANDRW		= TD_DDIR_RW | TD_DDIR_RAND,
 	TD_DDIR_RANDTRIM	= TD_DDIR_TRIM | TD_DDIR_RAND,
+	TD_DDIR_TRIMWRITE	= TD_DDIR_TRIM | TD_DDIR_WRITE,
 };
 
 #define td_read(td)		((td)->o.td_ddir & TD_DDIR_READ)
@@ -43,6 +45,8 @@
 #define td_rw(td)		(((td)->o.td_ddir & TD_DDIR_RW) == TD_DDIR_RW)
 #define td_random(td)		((td)->o.td_ddir & TD_DDIR_RAND)
 #define file_randommap(td, f)	(!(td)->o.norandommap && fio_file_axmap((f)))
+#define td_trimwrite(td)	(((td)->o.td_ddir & TD_DDIR_TRIMWRITE) \
+					== TD_DDIR_TRIMWRITE)
 
 static inline int ddir_sync(enum fio_ddir ddir)
 {
@@ -57,9 +61,9 @@
 
 static inline const char *ddir_str(enum td_ddir ddir)
 {
-	const char *__str[] = { NULL, "read", "write", "rw", NULL,
+	static const char *__str[] = { NULL, "read", "write", "rw", "rand",
 				"randread", "randwrite", "randrw",
-				"trim", NULL, NULL, NULL, "randtrim" };
+				"trim", NULL, "trimwrite", NULL, "randtrim" };
 
 	return __str[ddir];
 }
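
TD_DDIR_TRIMWRITE above is the OR of the trim and write bits, and td_trimwrite() masks and compares against the full value so it only fires when both are set. A self-contained sketch of that composed-bit test; the individual bit values are assumed for illustration, only the relationship matters.

#include <stdio.h>

/*
 * Sketch of the composed-direction test behind td_trimwrite(). The bit
 * values below are assumed for illustration; the point is that TRIMWRITE
 * is the OR of two bits and the check requires both ("== mask"), unlike
 * a plain "& mask" membership test.
 */
enum {
	DDIR_BIT_READ		= 1 << 0,
	DDIR_BIT_WRITE		= 1 << 1,
	DDIR_BIT_TRIM		= 1 << 2,
	DDIR_BIT_RAND		= 1 << 3,
	DDIR_BIT_TRIMWRITE	= DDIR_BIT_TRIM | DDIR_BIT_WRITE,
};

static int is_trimwrite(unsigned int td_ddir)
{
	return (td_ddir & DDIR_BIT_TRIMWRITE) == DDIR_BIT_TRIMWRITE;
}

int main(void)
{
	printf("%d\n", is_trimwrite(DDIR_BIT_TRIM | DDIR_BIT_WRITE));	/* 1 */
	printf("%d\n", is_trimwrite(DDIR_BIT_WRITE));			/* 0: write alone */
	printf("%d\n", is_trimwrite(DDIR_BIT_TRIM | DDIR_BIT_RAND));	/* 0: trim without write */
	return 0;
}
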
diff --git a/io_u.c b/io_u.c
index f61fee8..fd63119 100644
--- a/io_u.c
+++ b/io_u.c
@@ -12,6 +12,8 @@
 #include "lib/rand.h"
 #include "lib/axmap.h"
 #include "err.h"
+#include "lib/pow2.h"
+#include "minmax.h"
 
 struct io_completion_data {
 	int nr;				/* input */
@@ -25,7 +27,7 @@
  * The ->io_axmap contains a map of blocks we have or have not done io
  * to yet. Used to make sure we cover the entire range in a fair fashion.
  */
-static int random_map_free(struct fio_file *f, const uint64_t block)
+static bool random_map_free(struct fio_file *f, const uint64_t block)
 {
 	return !axmap_isset(f->io_axmap, block);
 }
@@ -60,6 +62,7 @@
 
 	/*
 	 * Hmm, should we make sure that ->io_size <= ->real_file_size?
+	 * -> not for now since there is code assuming it could go either way.
 	 */
 	max_size = f->io_size;
 	if (max_size > f->real_file_size)
@@ -84,22 +87,19 @@
 };
 
 static int __get_next_rand_offset(struct thread_data *td, struct fio_file *f,
-				  enum fio_ddir ddir, uint64_t *b)
+				  enum fio_ddir ddir, uint64_t *b,
+				  uint64_t lastb)
 {
 	uint64_t r;
 
-	if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE) {
-		uint64_t lastb;
-
-		lastb = last_block(td, f, ddir);
-		if (!lastb)
-			return 1;
+	if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE ||
+	    td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64) {
 
 		r = __rand(&td->random_state);
 
 		dprint(FD_RANDOM, "off rand %llu\n", (unsigned long long) r);
 
-		*b = lastb * (r / ((uint64_t) FRAND_MAX + 1.0));
+		*b = lastb * (r / (rand_max(&td->random_state) + 1.0));
 	} else {
 		uint64_t off = 0;
 
@@ -149,6 +149,79 @@
 	return 0;
 }
 
+static int __get_next_rand_offset_gauss(struct thread_data *td,
+					struct fio_file *f, enum fio_ddir ddir,
+					uint64_t *b)
+{
+	*b = gauss_next(&f->gauss);
+	return 0;
+}
+
+static int __get_next_rand_offset_zoned(struct thread_data *td,
+					struct fio_file *f, enum fio_ddir ddir,
+					uint64_t *b)
+{
+	unsigned int v, send, stotal;
+	uint64_t offset, lastb;
+	static int warned;
+	struct zone_split_index *zsi;
+
+	lastb = last_block(td, f, ddir);
+	if (!lastb)
+		return 1;
+
+	if (!td->o.zone_split_nr[ddir]) {
+bail:
+		return __get_next_rand_offset(td, f, ddir, b, lastb);
+	}
+
+	/*
+	 * Generate a value, v, between 1 and 100, both inclusive
+	 */
+	v = rand32_between(&td->zone_state, 1, 100);
+
+	zsi = &td->zone_state_index[ddir][v - 1];
+	stotal = zsi->size_perc_prev;
+	send = zsi->size_perc;
+
+	/*
+	 * Should never happen
+	 */
+	if (send == -1U) {
+		if (!warned) {
+			log_err("fio: bug in zoned generation\n");
+			warned = 1;
+		}
+		goto bail;
+	}
+
+	/*
+	 * 'send' is some percentage below or equal to 100 that
+	 * marks the end of the current IO range. 'stotal' marks
+	 * the start, in percent.
+	 */
+	if (stotal)
+		offset = stotal * lastb / 100ULL;
+	else
+		offset = 0;
+
+	lastb = lastb * (send - stotal) / 100ULL;
+
+	/*
+	 * Generate an index in 0..lastb, which now spans send - stotal percent
+	 */
+	if (__get_next_rand_offset(td, f, ddir, b, lastb) == 1)
+		return 1;
+
+	/*
+	 * Add our start offset, if any
+	 */
+	if (offset)
+		*b += offset;
+
+	return 0;
+}
+
 static int flist_cmp(void *data, struct flist_head *a, struct flist_head *b)
 {
 	struct rand_off *r1 = flist_entry(a, struct rand_off, list);
@@ -160,12 +233,22 @@
 static int get_off_from_method(struct thread_data *td, struct fio_file *f,
 			       enum fio_ddir ddir, uint64_t *b)
 {
-	if (td->o.random_distribution == FIO_RAND_DIST_RANDOM)
-		return __get_next_rand_offset(td, f, ddir, b);
-	else if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
+	if (td->o.random_distribution == FIO_RAND_DIST_RANDOM) {
+		uint64_t lastb;
+
+		lastb = last_block(td, f, ddir);
+		if (!lastb)
+			return 1;
+
+		return __get_next_rand_offset(td, f, ddir, b, lastb);
+	} else if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
 		return __get_next_rand_offset_zipf(td, f, ddir, b);
 	else if (td->o.random_distribution == FIO_RAND_DIST_PARETO)
 		return __get_next_rand_offset_pareto(td, f, ddir, b);
+	else if (td->o.random_distribution == FIO_RAND_DIST_GAUSS)
+		return __get_next_rand_offset_gauss(td, f, ddir, b);
+	else if (td->o.random_distribution == FIO_RAND_DIST_ZONED)
+		return __get_next_rand_offset_zoned(td, f, ddir, b);
 
 	log_err("fio: unknown random distribution: %d\n", td->o.random_distribution);
 	return 1;
@@ -175,30 +258,29 @@
  * Sort the reads for a verify phase in batches of verifysort_nr, if
  * specified.
  */
-static inline int should_sort_io(struct thread_data *td)
+static inline bool should_sort_io(struct thread_data *td)
 {
 	if (!td->o.verifysort_nr || !td->o.do_verify)
-		return 0;
+		return false;
 	if (!td_random(td))
-		return 0;
+		return false;
 	if (td->runstate != TD_VERIFYING)
-		return 0;
-	if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE)
-		return 0;
+		return false;
+	if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE ||
+	    td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE64)
+		return false;
 
-	return 1;
+	return true;
 }
 
-static int should_do_random(struct thread_data *td, enum fio_ddir ddir)
+static bool should_do_random(struct thread_data *td, enum fio_ddir ddir)
 {
 	unsigned int v;
-	unsigned long r;
 
 	if (td->o.perc_rand[ddir] == 100)
-		return 1;
+		return true;
 
-	r = __rand(&td->seq_rand_state[ddir]);
-	v = 1 + (int) (100.0 * (r / (FRAND_MAX + 1.0)));
+	v = rand32_between(&td->seq_rand_state[ddir], 1, 100);
 
 	return v <= td->o.perc_rand[ddir];
 }
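
Several call sites above replace the open-coded "1 + 100.0 * (r / (FRAND_MAX + 1.0))" idiom with rand32_between(state, 1, 100). A standalone sketch of the underlying scaling, using the C library PRNG as a stand-in for fio's random state; pick_1_to_100() is an illustrative name, not the fio helper.

#include <stdio.h>
#include <stdlib.h>

/*
 * Scale a raw draw by (max_draw + 1.0) so the result is uniform over
 * 1..100 inclusive. rand()/RAND_MAX stand in for __rand()/rand_max().
 */
static unsigned int pick_1_to_100(void)
{
	double r = rand();
	double max_draw = RAND_MAX;

	return 1 + (unsigned int) (100.0 * (r / (max_draw + 1.0)));
}

int main(void)
{
	unsigned int lo = 100, hi = 1;

	for (int i = 0; i < 100000; i++) {
		unsigned int v = pick_1_to_100();

		if (v < lo)
			lo = v;
		if (v > hi)
			hi = v;
	}
	printf("observed range: %u..%u\n", lo, hi);	/* expect 1..100 */
	return 0;
}
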
@@ -247,7 +329,8 @@
 	if (!get_next_rand_offset(td, f, ddir, b))
 		return 0;
 
-	if (td->o.time_based) {
+	if (td->o.time_based ||
+	    (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM)) {
 		fio_file_reset(td, f);
 		if (!get_next_rand_offset(td, f, ddir, b))
 			return 0;
@@ -267,14 +350,25 @@
 	assert(ddir_rw(ddir));
 
 	if (f->last_pos[ddir] >= f->io_size + get_start_offset(td, f) &&
-	    o->time_based)
-		f->last_pos[ddir] = f->last_pos[ddir] - f->io_size;
+	    o->time_based) {
+		struct thread_options *o = &td->o;
+		uint64_t io_size = f->io_size + (f->io_size % o->min_bs[ddir]);
+
+		if (io_size > f->last_pos[ddir])
+			f->last_pos[ddir] = 0;
+		else
+			f->last_pos[ddir] = f->last_pos[ddir] - io_size;
+	}
 
 	if (f->last_pos[ddir] < f->real_file_size) {
 		uint64_t pos;
 
-		if (f->last_pos[ddir] == f->file_offset && o->ddir_seq_add < 0)
-			f->last_pos[ddir] = f->real_file_size;
+		if (f->last_pos[ddir] == f->file_offset && o->ddir_seq_add < 0) {
+			if (f->real_file_size > f->io_size)
+				f->last_pos[ddir] = f->io_size;
+			else
+				f->last_pos[ddir] = f->real_file_size;
+		}
 
 		pos = f->last_pos[ddir] - f->file_offset;
 		if (pos && o->ddir_seq_add) {
@@ -283,10 +377,21 @@
 			/*
 			 * If we reach beyond the end of the file
 			 * with holed IO, wrap around to the
-			 * beginning again.
+			 * beginning again. If we're doing backwards IO,
+			 * wrap to the end.
 			 */
-			if (pos >= f->real_file_size)
-				pos = f->file_offset;
+			if (pos >= f->real_file_size) {
+				if (o->ddir_seq_add > 0)
+					pos = f->file_offset;
+				else {
+					if (f->real_file_size > f->io_size)
+						pos = f->io_size;
+					else
+						pos = f->real_file_size;
+
+					pos += o->ddir_seq_add;
+				}
+			}
 		}
 
 		*offset = pos;
@@ -315,7 +420,7 @@
 				*is_random = 1;
 			} else {
 				*is_random = 0;
-				io_u->flags |= IO_U_F_BUSY_OK;
+				io_u_set(td, io_u, IO_U_F_BUSY_OK);
 				ret = get_next_seq_offset(td, f, ddir, &offset);
 				if (ret)
 					ret = get_next_rand_block(td, f, ddir, &b);
@@ -325,7 +430,7 @@
 			ret = get_next_seq_offset(td, f, ddir, &offset);
 		}
 	} else {
-		io_u->flags |= IO_U_F_BUSY_OK;
+		io_u_set(td, io_u, IO_U_F_BUSY_OK);
 		*is_random = 0;
 
 		if (td->o.rw_seq == RW_SEQ_SEQ) {
@@ -413,8 +518,8 @@
 	return __get_next_offset(td, io_u, is_random);
 }
 
-static inline int io_u_fits(struct thread_data *td, struct io_u *io_u,
-			    unsigned int buflen)
+static inline bool io_u_fits(struct thread_data *td, struct io_u *io_u,
+			     unsigned int buflen)
 {
 	struct fio_file *f = io_u->file;
 
@@ -427,7 +532,8 @@
 	int ddir = io_u->ddir;
 	unsigned int buflen = 0;
 	unsigned int minbs, maxbs;
-	unsigned long r;
+	uint64_t frand_max, r;
+	bool power_2;
 
 	assert(ddir_rw(ddir));
 
@@ -446,16 +552,17 @@
 	if (!io_u_fits(td, io_u, minbs))
 		return 0;
 
+	frand_max = rand_max(&td->bsrange_state);
 	do {
 		r = __rand(&td->bsrange_state);
 
 		if (!td->o.bssplit_nr[ddir]) {
 			buflen = 1 + (unsigned int) ((double) maxbs *
-					(r / (FRAND_MAX + 1.0)));
+					(r / (frand_max + 1.0)));
 			if (buflen < minbs)
 				buflen = minbs;
 		} else {
-			long perc = 0;
+			long long perc = 0;
 			unsigned int i;
 
 			for (i = 0; i < td->o.bssplit_nr[ddir]; i++) {
@@ -463,19 +570,19 @@
 
 				buflen = bsp->bs;
 				perc += bsp->perc;
-				if ((r <= ((FRAND_MAX / 100L) * perc)) &&
+				if (!perc)
+					break;
+				if ((r / perc <= frand_max / 100ULL) &&
 				    io_u_fits(td, io_u, buflen))
 					break;
 			}
 		}
 
-		if (td->o.do_verify && td->o.verify != VERIFY_NONE)
-			buflen = (buflen + td->o.verify_interval - 1) &
-				~(td->o.verify_interval - 1);
-
-		if (!td->o.bs_unaligned && is_power_of_2(minbs))
-			buflen = (buflen + minbs - 1) & ~(minbs - 1);
-
+		power_2 = is_power_of_2(minbs);
+		if (!td->o.bs_unaligned && power_2)
+			buflen &= ~(minbs - 1);
+		else if (!td->o.bs_unaligned && !power_2)
+			buflen -= buflen % minbs;
 	} while (!io_u_fits(td, io_u, buflen));
 
 	return buflen;
@@ -510,10 +617,8 @@
 static inline enum fio_ddir get_rand_ddir(struct thread_data *td)
 {
 	unsigned int v;
-	unsigned long r;
 
-	r = __rand(&td->rwmix_state);
-	v = 1 + (int) (100.0 * (r / (FRAND_MAX + 1.0)));
+	v = rand32_between(&td->rwmix_state, 1, 100);
 
 	if (v <= td->o.rwmix[DDIR_READ])
 		return DDIR_READ;
@@ -521,8 +626,10 @@
 	return DDIR_WRITE;
 }
 
-void io_u_quiesce(struct thread_data *td)
+int io_u_quiesce(struct thread_data *td)
 {
+	int completed = 0;
+
 	/*
 	 * We are going to sleep, ensure that we flush anything pending as
 	 * not to skew our latency numbers.
@@ -539,60 +646,64 @@
 	}
 
 	while (td->io_u_in_flight) {
-		int fio_unused ret;
+		int ret;
 
-		ret = io_u_queued_complete(td, 1, NULL);
+		ret = io_u_queued_complete(td, 1);
+		if (ret > 0)
+			completed += ret;
 	}
+
+	if (td->flags & TD_F_REGROW_LOGS)
+		regrow_logs(td);
+
+	return completed;
 }
 
 static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir)
 {
 	enum fio_ddir odir = ddir ^ 1;
 	long usec;
+	uint64_t now;
 
 	assert(ddir_rw(ddir));
+	now = utime_since_now(&td->start);
 
-	if (td->rate_pending_usleep[ddir] <= 0)
+	/*
+	 * If rate_next_io_time is in the past, we need to catch up to the rate
+	 */
+	if (td->rate_next_io_time[ddir] <= now)
 		return ddir;
 
 	/*
-	 * We have too much pending sleep in this direction. See if we
+	 * We are ahead of rate in this direction. See if we
 	 * should switch.
 	 */
 	if (td_rw(td) && td->o.rwmix[odir]) {
 		/*
-		 * Other direction does not have too much pending, switch
+		 * Other direction is behind rate, switch
 		 */
-		if (td->rate_pending_usleep[odir] < 100000)
+		if (td->rate_next_io_time[odir] <= now)
 			return odir;
 
 		/*
-		 * Both directions have pending sleep. Sleep the minimum time
-		 * and deduct from both.
+		 * Both directions are ahead of rate. Sleep the min,
+		 * switching directions if necessary.
 		 */
-		if (td->rate_pending_usleep[ddir] <=
-			td->rate_pending_usleep[odir]) {
-			usec = td->rate_pending_usleep[ddir];
+		if (td->rate_next_io_time[ddir] <=
+			td->rate_next_io_time[odir]) {
+			usec = td->rate_next_io_time[ddir] - now;
 		} else {
-			usec = td->rate_pending_usleep[odir];
+			usec = td->rate_next_io_time[odir] - now;
 			ddir = odir;
 		}
 	} else
-		usec = td->rate_pending_usleep[ddir];
+		usec = td->rate_next_io_time[ddir] - now;
 
-	io_u_quiesce(td);
+	if (td->o.io_submit_mode == IO_MODE_INLINE)
+		io_u_quiesce(td);
 
 	usec = usec_sleep(td, usec);
 
-	td->rate_pending_usleep[ddir] -= usec;
-
-	odir = ddir ^ 1;
-	if (td_rw(td) && __should_check_rate(td, odir))
-		td->rate_pending_usleep[odir] -= usec;
-
-	if (ddir == DDIR_TRIM)
-		return DDIR_TRIM;
-
 	return ddir;
 }
 
@@ -606,28 +717,22 @@
 	enum fio_ddir ddir;
 
 	/*
-	 * see if it's time to fsync
+	 * See if it's time to fsync/fdatasync/sync_file_range first,
+	 * and if not then move on to check regular I/Os.
 	 */
-	if (td->o.fsync_blocks &&
-	   !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks) &&
-	     td->io_issues[DDIR_WRITE] && should_fsync(td))
-		return DDIR_SYNC;
+	if (should_fsync(td)) {
+		if (td->o.fsync_blocks && td->io_issues[DDIR_WRITE] &&
+		    !(td->io_issues[DDIR_WRITE] % td->o.fsync_blocks))
+			return DDIR_SYNC;
 
-	/*
-	 * see if it's time to fdatasync
-	 */
-	if (td->o.fdatasync_blocks &&
-	   !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks) &&
-	     td->io_issues[DDIR_WRITE] && should_fsync(td))
-		return DDIR_DATASYNC;
+		if (td->o.fdatasync_blocks && td->io_issues[DDIR_WRITE] &&
+		    !(td->io_issues[DDIR_WRITE] % td->o.fdatasync_blocks))
+			return DDIR_DATASYNC;
 
-	/*
-	 * see if it's time to sync_file_range
-	 */
-	if (td->sync_file_range_nr &&
-	   !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr) &&
-	     td->io_issues[DDIR_WRITE] && should_fsync(td))
-		return DDIR_SYNC_FILE_RANGE;
+		if (td->sync_file_range_nr && td->io_issues[DDIR_WRITE] &&
+		    !(td->io_issues[DDIR_WRITE] % td->sync_file_range_nr))
+			return DDIR_SYNC_FILE_RANGE;
+	}
 
 	if (td_rw(td)) {
 		/*
@@ -651,8 +756,10 @@
 		ddir = DDIR_READ;
 	else if (td_write(td))
 		ddir = DDIR_WRITE;
-	else
+	else if (td_trim(td))
 		ddir = DDIR_TRIM;
+	else
+		ddir = DDIR_INVAL;
 
 	td->rwmix_ddir = rate_ddir(td, ddir);
 	return td->rwmix_ddir;
@@ -660,13 +767,23 @@
 
 static void set_rw_ddir(struct thread_data *td, struct io_u *io_u)
 {
-	io_u->ddir = io_u->acct_ddir = get_rw_ddir(td);
+	enum fio_ddir ddir = get_rw_ddir(td);
 
-	if (io_u->ddir == DDIR_WRITE && (td->io_ops->flags & FIO_BARRIER) &&
+	if (td_trimwrite(td)) {
+		struct fio_file *f = io_u->file;
+		if (f->last_pos[DDIR_WRITE] == f->last_pos[DDIR_TRIM])
+			ddir = DDIR_TRIM;
+		else
+			ddir = DDIR_WRITE;
+	}
+
+	io_u->ddir = io_u->acct_ddir = ddir;
+
+	if (io_u->ddir == DDIR_WRITE && td_ioengine_flagged(td, FIO_BARRIER) &&
 	    td->o.barrier_blocks &&
 	   !(td->io_issues[DDIR_WRITE] % td->o.barrier_blocks) &&
 	     td->io_issues[DDIR_WRITE])
-		io_u->flags |= IO_U_F_BARRIER;
+		io_u_set(td, io_u, IO_U_F_BARRIER);
 }
 
 void put_file_log(struct thread_data *td, struct fio_file *f)
@@ -679,16 +796,21 @@
 
 void put_io_u(struct thread_data *td, struct io_u *io_u)
 {
+	if (td->parent)
+		td = td->parent;
+
 	td_io_u_lock(td);
 
 	if (io_u->file && !(io_u->flags & IO_U_F_NO_FILE_PUT))
 		put_file_log(td, io_u->file);
 
 	io_u->file = NULL;
-	io_u->flags |= IO_U_F_FREE;
+	io_u_set(td, io_u, IO_U_F_FREE);
 
-	if (io_u->flags & IO_U_F_IN_CUR_DEPTH)
+	if (io_u->flags & IO_U_F_IN_CUR_DEPTH) {
 		td->cur_depth--;
+		assert(!(td->flags & TD_F_CHILD));
+	}
 	io_u_qpush(&td->io_u_freelist, io_u);
 	td_io_u_unlock(td);
 	td_io_u_free_notify(td);
@@ -696,7 +818,7 @@
 
 void clear_io_u(struct thread_data *td, struct io_u *io_u)
 {
-	io_u->flags &= ~IO_U_F_FLIGHT;
+	io_u_clear(td, io_u, IO_U_F_FLIGHT);
 	put_io_u(td, io_u);
 }
 
@@ -707,18 +829,24 @@
 
 	dprint(FD_IO, "requeue %p\n", __io_u);
 
+	if (td->parent)
+		td = td->parent;
+
 	td_io_u_lock(td);
 
-	__io_u->flags |= IO_U_F_FREE;
+	io_u_set(td, __io_u, IO_U_F_FREE);
 	if ((__io_u->flags & IO_U_F_FLIGHT) && ddir_rw(ddir))
 		td->io_issues[ddir]--;
 
-	__io_u->flags &= ~IO_U_F_FLIGHT;
-	if (__io_u->flags & IO_U_F_IN_CUR_DEPTH)
+	io_u_clear(td, __io_u, IO_U_F_FLIGHT);
+	if (__io_u->flags & IO_U_F_IN_CUR_DEPTH) {
 		td->cur_depth--;
+		assert(!(td->flags & TD_F_CHILD));
+	}
 
 	io_u_rpush(&td->io_u_requeues, __io_u);
 	td_io_u_unlock(td);
+	td_io_u_free_notify(td);
 	*io_u = NULL;
 }
 
@@ -726,7 +854,7 @@
 {
 	unsigned int is_random;
 
-	if (td->io_ops->flags & FIO_NOIO)
+	if (td_ioengine_flagged(td, FIO_NOIO))
 		goto out;
 
 	set_rw_ddir(td, io_u);
@@ -771,8 +899,9 @@
 	}
 
 	if (io_u->offset + io_u->buflen > io_u->file->real_file_size) {
-		dprint(FD_IO, "io_u %p, offset too large\n", io_u);
-		dprint(FD_IO, "  off=%llu/%lu > %llu\n",
+		dprint(FD_IO, "io_u %p, offset + buflen exceeds file size\n",
+			io_u);
+		dprint(FD_IO, "  offset=%llu/buflen=%lu > %llu\n",
 			(unsigned long long) io_u->offset, io_u->buflen,
 			(unsigned long long) io_u->file->real_file_size);
 		return 1;
@@ -954,6 +1083,34 @@
 		io_u_mark_lat_msec(td, usec / 1000);
 }
 
+static unsigned int __get_next_fileno_rand(struct thread_data *td)
+{
+	unsigned long fileno;
+
+	if (td->o.file_service_type == FIO_FSERVICE_RANDOM) {
+		uint64_t frand_max = rand_max(&td->next_file_state);
+		unsigned long r;
+
+		r = __rand(&td->next_file_state);
+		return (unsigned int) ((double) td->o.nr_files
+				* (r / (frand_max + 1.0)));
+	}
+
+	if (td->o.file_service_type == FIO_FSERVICE_ZIPF)
+		fileno = zipf_next(&td->next_file_zipf);
+	else if (td->o.file_service_type == FIO_FSERVICE_PARETO)
+		fileno = pareto_next(&td->next_file_zipf);
+	else if (td->o.file_service_type == FIO_FSERVICE_GAUSS)
+		fileno = gauss_next(&td->next_file_gauss);
+	else {
+		log_err("fio: bad file service type: %d\n", td->o.file_service_type);
+		assert(0);
+		return 0;
+	}
+
+	return fileno >> FIO_FSERVICE_SHIFT;
+}
+
 /*
  * Get next file to service by choosing one at random
  */
@@ -966,11 +1123,8 @@
 
 	do {
 		int opened = 0;
-		unsigned long r;
 
-		r = __rand(&td->next_file_state);
-		fno = (unsigned int) ((double) td->o.nr_files
-				* (r / (FRAND_MAX + 1.0)));
+		fno = __get_next_fileno_rand(td);
 
 		f = td->files[fno];
 		if (fio_file_done(f))
@@ -1123,10 +1277,14 @@
 		put_file_log(td, f);
 		td_io_close_file(td, f);
 		io_u->file = NULL;
-		fio_file_set_done(f);
-		td->nr_done_files++;
-		dprint(FD_FILE, "%s: is done (%d of %d)\n", f->file_name,
+		if (td->o.file_service_type & __FIO_FSERVICE_NONUNIFORM)
+			fio_file_reset(td, f);
+		else {
+			fio_file_set_done(f);
+			td->nr_done_files++;
+			dprint(FD_FILE, "%s: is done (%d of %d)\n", f->file_name,
 					td->nr_done_files, td->o.nr_files);
+		}
 	} while (1);
 
 	return 0;
@@ -1152,10 +1310,10 @@
  * We had an IO outside the latency target. Reduce the queue depth. If we
  * are at QD=1, then it's time to give up.
  */
-static int __lat_target_failed(struct thread_data *td)
+static bool __lat_target_failed(struct thread_data *td)
 {
 	if (td->latency_qd == 1)
-		return 1;
+		return true;
 
 	td->latency_qd_high = td->latency_qd;
 
@@ -1172,16 +1330,16 @@
 	 */
 	io_u_quiesce(td);
 	lat_new_cycle(td);
-	return 0;
+	return false;
 }
 
-static int lat_target_failed(struct thread_data *td)
+static bool lat_target_failed(struct thread_data *td)
 {
 	if (td->o.latency_percentile.u.f == 100.0)
 		return __lat_target_failed(td);
 
 	td->latency_failed++;
-	return 0;
+	return false;
 }
 
 void lat_target_init(struct thread_data *td)
@@ -1276,14 +1434,14 @@
  * If latency target is enabled, we might be ramping up or down and not
  * using the full queue depth available.
  */
-int queue_full(const struct thread_data *td)
+bool queue_full(const struct thread_data *td)
 {
 	const int qempty = io_u_qempty(&td->io_u_freelist);
 
 	if (qempty)
-		return 1;
+		return true;
 	if (!td->o.latency_target)
-		return 0;
+		return false;
 
 	return td->cur_depth >= td->latency_qd;
 }
@@ -1311,21 +1469,23 @@
 
 	if (io_u) {
 		assert(io_u->flags & IO_U_F_FREE);
-		io_u->flags &= ~(IO_U_F_FREE | IO_U_F_NO_FILE_PUT |
+		io_u_clear(td, io_u, IO_U_F_FREE | IO_U_F_NO_FILE_PUT |
 				 IO_U_F_TRIMMED | IO_U_F_BARRIER |
 				 IO_U_F_VER_LIST);
 
 		io_u->error = 0;
 		io_u->acct_ddir = -1;
 		td->cur_depth++;
-		io_u->flags |= IO_U_F_IN_CUR_DEPTH;
+		assert(!(td->flags & TD_F_CHILD));
+		io_u_set(td, io_u, IO_U_F_IN_CUR_DEPTH);
 		io_u->ipo = NULL;
-	} else if (td->o.verify_async) {
+	} else if (td_async_processing(td)) {
 		/*
 		 * We ran out, wait for async verify threads to finish and
 		 * return one
 		 */
-		pthread_cond_wait(&td->free_cond, &td->io_u_lock);
+		assert(!(td->flags & TD_F_CHILD));
+		assert(!pthread_cond_wait(&td->free_cond, &td->io_u_lock));
 		goto again;
 	}
 
@@ -1333,10 +1493,10 @@
 	return io_u;
 }
 
-static int check_get_trim(struct thread_data *td, struct io_u *io_u)
+static bool check_get_trim(struct thread_data *td, struct io_u *io_u)
 {
 	if (!(td->flags & TD_F_TRIM_BACKLOG))
-		return 0;
+		return false;
 
 	if (td->trim_entries) {
 		int get_trim = 0;
@@ -1352,17 +1512,17 @@
 			get_trim = 1;
 		}
 
-		if (get_trim && !get_next_trim(td, io_u))
-			return 1;
+		if (get_trim && get_next_trim(td, io_u))
+			return true;
 	}
 
-	return 0;
+	return false;
 }
 
-static int check_get_verify(struct thread_data *td, struct io_u *io_u)
+static bool check_get_verify(struct thread_data *td, struct io_u *io_u)
 {
 	if (!(td->flags & TD_F_VER_BACKLOG))
-		return 0;
+		return false;
 
 	if (td->io_hist_len) {
 		int get_verify = 0;
@@ -1379,11 +1539,11 @@
 
 		if (get_verify && !get_next_verify(td, io_u)) {
 			td->verify_batch--;
-			return 1;
+			return true;
 		}
 	}
 
-	return 0;
+	return false;
 }
 
 /*
@@ -1474,7 +1634,7 @@
 	assert(fio_file_open(f));
 
 	if (ddir_rw(io_u->ddir)) {
-		if (!io_u->buflen && !(td->io_ops->flags & FIO_NOIO)) {
+		if (!io_u->buflen && !td_ioengine_flagged(td, FIO_NOIO)) {
 			dprint(FD_IO, "get_io_u: zero buflen on %p\n", io_u);
 			goto err_put;
 		}
@@ -1486,7 +1646,7 @@
 			if (td->flags & TD_F_REFILL_BUFFERS) {
 				io_u_fill_buffer(td, io_u,
 					td->o.min_bs[DDIR_WRITE],
-					io_u->xfer_buflen);
+					io_u->buflen);
 			} else if ((td->flags & TD_F_SCRAMBLE_BUFFERS) &&
 				   !(td->flags & TD_F_COMPRESS))
 				do_scramble = 1;
@@ -1512,10 +1672,12 @@
 out:
 	assert(io_u->file);
 	if (!td_io_prep(td, io_u)) {
-		if (!td->o.disable_slat)
+		if (!td->o.disable_lat)
 			fio_gettime(&io_u->start_time, NULL);
+
 		if (do_scramble)
 			small_content_scramble(io_u);
+
 		return io_u;
 	}
 err_put:
@@ -1524,7 +1686,7 @@
 	return ERR_PTR(ret);
 }
 
-void io_u_log_error(struct thread_data *td, struct io_u *io_u)
+static void __io_u_log_error(struct thread_data *td, struct io_u *io_u)
 {
 	enum error_type_bit eb = td_error_type(io_u->ddir, io_u->error);
 
@@ -1538,23 +1700,44 @@
 		io_ddir_name(io_u->ddir),
 		io_u->offset, io_u->xfer_buflen);
 
+	if (td->io_ops->errdetails) {
+		char *err = td->io_ops->errdetails(io_u);
+
+		log_err("fio: %s\n", err);
+		free(err);
+	}
+
 	if (!td->error)
 		td_verror(td, io_u->error, "io_u error");
 }
 
-static inline int gtod_reduce(struct thread_data *td)
+void io_u_log_error(struct thread_data *td, struct io_u *io_u)
 {
-	return td->o.disable_clat && td->o.disable_lat && td->o.disable_slat
-		&& td->o.disable_bw;
+	__io_u_log_error(td, io_u);
+	if (td->parent)
+		__io_u_log_error(td->parent, io_u);
+}
+
+static inline bool gtod_reduce(struct thread_data *td)
+{
+	return (td->o.disable_clat && td->o.disable_slat && td->o.disable_bw)
+			|| td->o.gtod_reduce;
 }
 
 static void account_io_completion(struct thread_data *td, struct io_u *io_u,
 				  struct io_completion_data *icd,
 				  const enum fio_ddir idx, unsigned int bytes)
 {
+	const int no_reduce = !gtod_reduce(td);
 	unsigned long lusec = 0;
 
-	if (!gtod_reduce(td))
+	if (td->parent)
+		td = td->parent;
+
+	if (!td->o.stats)
+		return;
+
+	if (no_reduce)
 		lusec = utime_since(&io_u->issue_time, &icd->time);
 
 	if (!td->o.disable_lat) {
@@ -1578,27 +1761,53 @@
 		}
 	}
 
-	if (!td->o.disable_clat) {
-		add_clat_sample(td, idx, lusec, bytes, io_u->offset);
-		io_u_mark_latency(td, lusec);
+	if (ddir_rw(idx)) {
+		if (!td->o.disable_clat) {
+			add_clat_sample(td, idx, lusec, bytes, io_u->offset);
+			io_u_mark_latency(td, lusec);
+		}
+
+		if (!td->o.disable_bw && per_unit_log(td->bw_log))
+			add_bw_sample(td, io_u, bytes, lusec);
+
+		if (no_reduce && per_unit_log(td->iops_log))
+			add_iops_sample(td, io_u, bytes);
 	}
 
-	if (!td->o.disable_bw)
-		add_bw_sample(td, idx, bytes, &icd->time);
-
-	if (!gtod_reduce(td))
-		add_iops_sample(td, idx, bytes, &icd->time);
+	if (td->ts.nr_block_infos && io_u->ddir == DDIR_TRIM) {
+		uint32_t *info = io_u_block_info(td, io_u);
+		if (BLOCK_INFO_STATE(*info) < BLOCK_STATE_TRIM_FAILURE) {
+			if (io_u->ddir == DDIR_TRIM) {
+				*info = BLOCK_INFO(BLOCK_STATE_TRIMMED,
+						BLOCK_INFO_TRIMS(*info) + 1);
+			} else if (io_u->ddir == DDIR_WRITE) {
+				*info = BLOCK_INFO_SET_STATE(BLOCK_STATE_WRITTEN,
+								*info);
+			}
+		}
+	}
 }
 
-static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir)
+static void file_log_write_comp(const struct thread_data *td, struct fio_file *f,
+				uint64_t offset, unsigned int bytes)
 {
-	uint64_t secs, remainder, bps, bytes;
+	int idx;
 
-	bytes = td->this_io_bytes[ddir];
-	bps = td->rate_bps[ddir];
-	secs = bytes / bps;
-	remainder = bytes % bps;
-	return remainder * 1000000 / bps + secs * 1000000;
+	if (!f)
+		return;
+
+	if (f->first_write == -1ULL || offset < f->first_write)
+		f->first_write = offset;
+	if (f->last_write == -1ULL || ((offset + bytes) > f->last_write))
+		f->last_write = offset + bytes;
+
+	if (!f->last_write_comp)
+		return;
+
+	idx = f->last_write_idx++;
+	f->last_write_comp[idx] = offset;
+	if (f->last_write_idx == td->o.iodepth)
+		f->last_write_idx = 0;
 }
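
file_log_write_comp() above keeps a per-file ring of the most recently completed write offsets, wrapping the index at the configured iodepth. A minimal standalone model of that ring; the struct name, depth, and offsets below are illustrative.

#include <stdint.h>
#include <stdio.h>

/*
 * Fixed array of the most recently completed write offsets whose index
 * wraps at the configured depth, so the oldest entries get overwritten.
 */
#define RING_DEPTH 4

struct write_ring {
	uint64_t comp[RING_DEPTH];
	unsigned int idx;
};

static void ring_log(struct write_ring *r, uint64_t offset)
{
	r->comp[r->idx++] = offset;
	if (r->idx == RING_DEPTH)
		r->idx = 0;	/* wrap: next push overwrites the oldest slot */
}

int main(void)
{
	struct write_ring r = { .idx = 0 };

	/* six 4k writes into a depth-4 ring: the two oldest offsets get overwritten */
	for (uint64_t off = 0; off < 6 * 4096; off += 4096)
		ring_log(&r, off);

	for (unsigned int i = 0; i < RING_DEPTH; i++)
		printf("slot %u: %llu\n", i, (unsigned long long) r.comp[i]);
	return 0;
}
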
 
 static void io_completed(struct thread_data *td, struct io_u **io_u_ptr,
@@ -1610,9 +1819,8 @@
 
 	dprint_io_u(io_u, "io complete");
 
-	td_io_u_lock(td);
 	assert(io_u->flags & IO_U_F_FLIGHT);
-	io_u->flags &= ~(IO_U_F_FLIGHT | IO_U_F_BUSY_OK);
+	io_u_clear(td, io_u, IO_U_F_FLIGHT | IO_U_F_BUSY_OK);
 
 	/*
 	 * Mark IO ok to verify
@@ -1629,8 +1837,6 @@
 		}
 	}
 
-	td_io_u_unlock(td);
-
 	if (ddir_sync(ddir)) {
 		td->last_was_sync = 1;
 		if (f) {
@@ -1645,7 +1851,6 @@
 
 	if (!io_u->error && ddir_rw(ddir)) {
 		unsigned int bytes = io_u->buflen - io_u->resid;
-		const enum fio_ddir oddir = ddir ^ 1;
 		int ret;
 
 		td->io_blocks[ddir]++;
@@ -1655,41 +1860,13 @@
 		if (!(io_u->flags & IO_U_F_VER_LIST))
 			td->this_io_bytes[ddir] += bytes;
 
-		if (ddir == DDIR_WRITE) {
-			if (f) {
-				if (f->first_write == -1ULL ||
-				    io_u->offset < f->first_write)
-					f->first_write = io_u->offset;
-				if (f->last_write == -1ULL ||
-				    ((io_u->offset + bytes) > f->last_write))
-					f->last_write = io_u->offset + bytes;
-			}
-			if (td->last_write_comp) {
-				int idx = td->last_write_idx++;
-
-				td->last_write_comp[idx] = io_u->offset;
-				if (td->last_write_idx == td->o.iodepth)
-					td->last_write_idx = 0;
-			}
-		}
+		if (ddir == DDIR_WRITE)
+			file_log_write_comp(td, f, io_u->offset, bytes);
 
 		if (ramp_time_over(td) && (td->runstate == TD_RUNNING ||
-					   td->runstate == TD_VERIFYING)) {
+					   td->runstate == TD_VERIFYING))
 			account_io_completion(td, io_u, icd, ddir, bytes);
 
-			if (__should_check_rate(td, ddir)) {
-				td->rate_pending_usleep[ddir] =
-					(usec_for_io(td, ddir) -
-					 utime_since_now(&td->start));
-			}
-			if (ddir != DDIR_TRIM &&
-			    __should_check_rate(td, oddir)) {
-				td->rate_pending_usleep[oddir] =
-					(usec_for_io(td, oddir) -
-					 utime_since_now(&td->start));
-			}
-		}
-
 		icd->bytes_done[ddir] += bytes;
 
 		if (io_u->end_io) {
@@ -1731,7 +1908,7 @@
 	icd->nr = nr;
 
 	icd->error = 0;
-	for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++)
+	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++)
 		icd->bytes_done[ddir] = 0;
 }
 
@@ -1754,10 +1931,10 @@
 /*
  * Complete a single io_u for the sync engines.
  */
-int io_u_sync_complete(struct thread_data *td, struct io_u *io_u,
-		       uint64_t *bytes)
+int io_u_sync_complete(struct thread_data *td, struct io_u *io_u)
 {
 	struct io_completion_data icd;
+	int ddir;
 
 	init_icd(td, &icd, 1);
 	io_completed(td, &io_u, &icd);
@@ -1770,12 +1947,8 @@
 		return -1;
 	}
 
-	if (bytes) {
-		int ddir;
-
-		for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++)
-			bytes[ddir] += icd.bytes_done[ddir];
-	}
+	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++)
+		td->bytes_done[ddir] += icd.bytes_done[ddir];
 
 	return 0;
 }
@@ -1783,22 +1956,23 @@
 /*
  * Called to complete min_events number of io for the async engines.
  */
-int io_u_queued_complete(struct thread_data *td, int min_evts,
-			 uint64_t *bytes)
+int io_u_queued_complete(struct thread_data *td, int min_evts)
 {
 	struct io_completion_data icd;
 	struct timespec *tvp = NULL;
-	int ret;
+	int ret, ddir;
 	struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, };
 
-	dprint(FD_IO, "io_u_queued_completed: min=%d\n", min_evts);
+	dprint(FD_IO, "io_u_queued_complete: min=%d\n", min_evts);
 
 	if (!min_evts)
 		tvp = &ts;
 	else if (min_evts > td->cur_depth)
 		min_evts = td->cur_depth;
 
-	ret = td_io_getevents(td, min_evts, td->o.iodepth_batch_complete, tvp);
+	/* No worries, td_io_getevents fixes min and max if they are
+	 * set incorrectly */
+	ret = td_io_getevents(td, min_evts, td->o.iodepth_batch_complete_max, tvp);
 	if (ret < 0) {
 		td_verror(td, -ret, "td_io_getevents");
 		return ret;
@@ -1812,14 +1986,10 @@
 		return -1;
 	}
 
-	if (bytes) {
-		int ddir;
+	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++)
+		td->bytes_done[ddir] += icd.bytes_done[ddir];
 
-		for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++)
-			bytes[ddir] += icd.bytes_done[ddir];
-	}
-
-	return 0;
+	return ret;
 }
 
 /*
@@ -1827,10 +1997,14 @@
  */
 void io_u_queued(struct thread_data *td, struct io_u *io_u)
 {
-	if (!td->o.disable_slat) {
+	if (!td->o.disable_slat && ramp_time_over(td) && td->o.stats) {
 		unsigned long slat_time;
 
 		slat_time = utime_since(&io_u->start_time, &io_u->issue_time);
+
+		if (td->parent)
+			td = td->parent;
+
 		add_slat_sample(td, io_u->ddir, slat_time, io_u->xfer_buflen,
 				io_u->offset);
 	}
@@ -1842,15 +2016,15 @@
 static struct frand_state *get_buf_state(struct thread_data *td)
 {
 	unsigned int v;
-	unsigned long r;
 
 	if (!td->o.dedupe_percentage)
 		return &td->buf_state;
-	else if (td->o.dedupe_percentage == 100)
-		return &td->buf_state_prev;
+	else if (td->o.dedupe_percentage == 100) {
+		frand_copy(&td->buf_state_prev, &td->buf_state);
+		return &td->buf_state;
+	}
 
-	r = __rand(&td->dedupe_state);
-	v = 1 + (int) (100.0 * (r / (FRAND_MAX + 1.0)));
+	v = rand32_between(&td->dedupe_state, 1, 100);
 
 	if (v <= td->o.dedupe_percentage)
 		return &td->buf_state_prev;
@@ -1860,7 +2034,9 @@
 
 static void save_buf_state(struct thread_data *td, struct frand_state *rs)
 {
-	if (rs == &td->buf_state)
+	if (td->o.dedupe_percentage == 100)
+		frand_copy(rs, &td->buf_state_prev);
+	else if (rs == &td->buf_state)
 		frand_copy(&td->buf_state_prev, rs);
 }
 
@@ -1869,10 +2045,14 @@
 {
 	struct thread_options *o = &td->o;
 
+	if (o->mem_type == MEM_CUDA_MALLOC)
+		return;
+
 	if (o->compress_percentage || o->dedupe_percentage) {
 		unsigned int perc = td->o.compress_percentage;
 		struct frand_state *rs;
 		unsigned int left = max_bs;
+		unsigned int this_write;
 
 		do {
 			rs = get_buf_state(td);
@@ -1880,26 +2060,28 @@
 			min_write = min(min_write, left);
 
 			if (perc) {
-				unsigned int seg = min_write;
+				this_write = min_not_zero(min_write,
+							td->o.compress_chunk);
 
-				seg = min(min_write, td->o.compress_chunk);
-				if (!seg)
-					seg = min_write;
-
-				fill_random_buf_percentage(rs, buf, perc, seg,
-					min_write, o->buffer_pattern,
-						   o->buffer_pattern_bytes);
-			} else
+				fill_random_buf_percentage(rs, buf, perc,
+					this_write, this_write,
+					o->buffer_pattern,
+					o->buffer_pattern_bytes);
+			} else {
 				fill_random_buf(rs, buf, min_write);
+				this_write = min_write;
+			}
 
-			buf += min_write;
-			left -= min_write;
+			buf += this_write;
+			left -= this_write;
 			save_buf_state(td, rs);
 		} while (left);
 	} else if (o->buffer_pattern_bytes)
 		fill_buffer_pattern(td, buf, max_bs);
-	else
+	else if (o->zero_buffers)
 		memset(buf, 0, max_bs);
+	else
+		fill_random_buf(get_buf_state(td), buf, max_bs);
 }
 
 /*
@@ -1911,3 +2093,61 @@
 	io_u->buf_filled_len = 0;
 	fill_io_buffer(td, io_u->buf, min_write, max_bs);
 }
+
+static int do_sync_file_range(const struct thread_data *td,
+			      struct fio_file *f)
+{
+	off64_t offset, nbytes;
+
+	offset = f->first_write;
+	nbytes = f->last_write - f->first_write;
+
+	if (!nbytes)
+		return 0;
+
+	return sync_file_range(f->fd, offset, nbytes, td->o.sync_file_range);
+}
+
+int do_io_u_sync(const struct thread_data *td, struct io_u *io_u)
+{
+	int ret;
+
+	if (io_u->ddir == DDIR_SYNC) {
+		ret = fsync(io_u->file->fd);
+	} else if (io_u->ddir == DDIR_DATASYNC) {
+#ifdef CONFIG_FDATASYNC
+		ret = fdatasync(io_u->file->fd);
+#else
+		ret = io_u->xfer_buflen;
+		io_u->error = EINVAL;
+#endif
+	} else if (io_u->ddir == DDIR_SYNC_FILE_RANGE)
+		ret = do_sync_file_range(td, io_u->file);
+	else {
+		ret = io_u->xfer_buflen;
+		io_u->error = EINVAL;
+	}
+
+	if (ret < 0)
+		io_u->error = errno;
+
+	return ret;
+}
+
+int do_io_u_trim(const struct thread_data *td, struct io_u *io_u)
+{
+#ifndef FIO_HAVE_TRIM
+	io_u->error = EINVAL;
+	return 0;
+#else
+	struct fio_file *f = io_u->file;
+	int ret;
+
+	ret = os_trim(f->fd, io_u->offset, io_u->xfer_buflen);
+	if (!ret)
+		return io_u->xfer_buflen;
+
+	io_u->error = ret;
+	return 0;
+#endif
+}
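
__get_next_rand_offset_zoned() above maps a 1..100 draw to a zone_split bucket and then confines the generated block to that bucket's slice of the device: the bucket's start percentage becomes a block offset, its width scales lastb, and the uniform index is shifted by that offset. A self-contained sketch of just that mapping; zone_pick() and the constants are illustrative, and rand() stands in for fio's generator.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Map a bucket covering [stotal%, send%) of a lastb-block device to a
 * start offset plus a scaled block count, then pick a uniform index
 * within that count and shift it by the start offset.
 */
static uint64_t zone_pick(uint64_t lastb, unsigned int stotal, unsigned int send)
{
	uint64_t offset = stotal * lastb / 100ULL;		/* first block of the zone */
	uint64_t span = lastb * (send - stotal) / 100ULL;	/* blocks in the zone */
	uint64_t idx = (uint64_t) ((double) span * (rand() / (RAND_MAX + 1.0)));

	return offset + idx;
}

int main(void)
{
	/* a 1000-block file, zone covering 10%..40%: results land in 100..399 */
	for (int i = 0; i < 5; i++)
		printf("%llu\n", (unsigned long long) zone_pick(1000, 10, 40));
	return 0;
}
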
diff --git a/io_u.h b/io_u.h
new file mode 100644
index 0000000..155344d
--- /dev/null
+++ b/io_u.h
@@ -0,0 +1,179 @@
+#ifndef FIO_IO_U
+#define FIO_IO_U
+
+#include "compiler/compiler.h"
+#include "os/os.h"
+#include "log.h"
+#include "io_ddir.h"
+#include "debug.h"
+#include "file.h"
+#include "workqueue.h"
+
+#ifdef CONFIG_LIBAIO
+#include <libaio.h>
+#endif
+#ifdef CONFIG_GUASI
+#include <guasi.h>
+#endif
+
+enum {
+	IO_U_F_FREE		= 1 << 0,
+	IO_U_F_FLIGHT		= 1 << 1,
+	IO_U_F_NO_FILE_PUT	= 1 << 2,
+	IO_U_F_IN_CUR_DEPTH	= 1 << 3,
+	IO_U_F_BUSY_OK		= 1 << 4,
+	IO_U_F_TRIMMED		= 1 << 5,
+	IO_U_F_BARRIER		= 1 << 6,
+	IO_U_F_VER_LIST		= 1 << 7,
+};
+
+/*
+ * The io unit
+ */
+struct io_u {
+	struct timeval start_time;
+	struct timeval issue_time;
+
+	struct fio_file *file;
+	unsigned int flags;
+	enum fio_ddir ddir;
+
+	/*
+	 * For replay workloads, we may want to account as a different
+	 * IO type than what is being submitted.
+	 */
+	enum fio_ddir acct_ddir;
+
+	/*
+	 * Write generation
+	 */
+	unsigned short numberio;
+
+	/*
+	 * Allocated/set buffer and length
+	 */
+	unsigned long buflen;
+	unsigned long long offset;
+	void *buf;
+
+	/*
+	 * Initial seed for generating the buffer contents
+	 */
+	uint64_t rand_seed;
+
+	/*
+	 * IO engine state, may be different from above when we get
+	 * partial transfers / residual data counts
+	 */
+	void *xfer_buf;
+	unsigned long xfer_buflen;
+
+	/*
+	 * Parameter related to pre-filled buffers and
+	 * their size to handle variable block sizes.
+	 */
+	unsigned long buf_filled_len;
+
+	struct io_piece *ipo;
+
+	unsigned int resid;
+	unsigned int error;
+
+	/*
+	 * io engine private data
+	 */
+	union {
+		unsigned int index;
+		unsigned int seen;
+		void *engine_data;
+	};
+
+	union {
+		struct flist_head verify_list;
+		struct workqueue_work work;
+	};
+
+	/*
+	 * Callback for io completion
+	 */
+	int (*end_io)(struct thread_data *, struct io_u **);
+
+	union {
+#ifdef CONFIG_LIBAIO
+		struct iocb iocb;
+#endif
+#ifdef CONFIG_POSIXAIO
+		os_aiocb_t aiocb;
+#endif
+#ifdef FIO_HAVE_SGIO
+		struct sg_io_hdr hdr;
+#endif
+#ifdef CONFIG_GUASI
+		guasi_req_t greq;
+#endif
+#ifdef CONFIG_SOLARISAIO
+		aio_result_t resultp;
+#endif
+#ifdef FIO_HAVE_BINJECT
+		struct b_user_cmd buc;
+#endif
+#ifdef CONFIG_RDMA
+		struct ibv_mr *mr;
+#endif
+		void *mmap_data;
+	};
+};
+
+/*
+ * io unit handling
+ */
+extern struct io_u *__get_io_u(struct thread_data *);
+extern struct io_u *get_io_u(struct thread_data *);
+extern void put_io_u(struct thread_data *, struct io_u *);
+extern void clear_io_u(struct thread_data *, struct io_u *);
+extern void requeue_io_u(struct thread_data *, struct io_u **);
+extern int __must_check io_u_sync_complete(struct thread_data *, struct io_u *);
+extern int __must_check io_u_queued_complete(struct thread_data *, int);
+extern void io_u_queued(struct thread_data *, struct io_u *);
+extern int io_u_quiesce(struct thread_data *);
+extern void io_u_log_error(struct thread_data *, struct io_u *);
+extern void io_u_mark_depth(struct thread_data *, unsigned int);
+extern void fill_io_buffer(struct thread_data *, void *, unsigned int, unsigned int);
+extern void io_u_fill_buffer(struct thread_data *td, struct io_u *, unsigned int, unsigned int);
+void io_u_mark_complete(struct thread_data *, unsigned int);
+void io_u_mark_submit(struct thread_data *, unsigned int);
+bool queue_full(const struct thread_data *);
+
+int do_io_u_sync(const struct thread_data *, struct io_u *);
+int do_io_u_trim(const struct thread_data *, struct io_u *);
+
+#ifdef FIO_INC_DEBUG
+static inline void dprint_io_u(struct io_u *io_u, const char *p)
+{
+	struct fio_file *f = io_u->file;
+
+	dprint(FD_IO, "%s: io_u %p: off=%llu/len=%lu/ddir=%d", p, io_u,
+					(unsigned long long) io_u->offset,
+					io_u->buflen, io_u->ddir);
+	if (f)
+		dprint(FD_IO, "/%s", f->file_name);
+	dprint(FD_IO, "\n");
+}
+#else
+#define dprint_io_u(io_u, p)
+#endif
+
+static inline enum fio_ddir acct_ddir(struct io_u *io_u)
+{
+	if (io_u->acct_ddir != -1)
+		return io_u->acct_ddir;
+
+	return io_u->ddir;
+}
+
+#define io_u_clear(td, io_u, val)	\
+	td_flags_clear((td), &(io_u->flags), (val))
+#define io_u_set(td, io_u, val)		\
+	td_flags_set((td), &(io_u)->flags, (val))
+
+#endif
diff --git a/io_u_queue.c b/io_u_queue.c
index 80a32ba..9994c78 100644
--- a/io_u_queue.c
+++ b/io_u_queue.c
@@ -8,6 +8,7 @@
 		return 1;
 
 	q->nr = 0;
+	q->max = nr;
 	return 0;
 }
 
diff --git a/io_u_queue.h b/io_u_queue.h
index bda40d5..118e593 100644
--- a/io_u_queue.h
+++ b/io_u_queue.h
@@ -8,6 +8,7 @@
 struct io_u_queue {
 	struct io_u **io_us;
 	unsigned int nr;
+	unsigned int max;
 };
 
 static inline struct io_u *io_u_qpop(struct io_u_queue *q)
@@ -25,7 +26,12 @@
 
 static inline void io_u_qpush(struct io_u_queue *q, struct io_u *io_u)
 {
-	q->io_us[q->nr++] = io_u;
+	if (q->nr < q->max) {
+		q->io_us[q->nr++] = io_u;
+		return;
+	}
+
+	assert(0);
 }
 
 static inline int io_u_qempty(const struct io_u_queue *q)
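
The io_u_queue changes above record the queue's capacity at init time and make io_u_qpush() assert instead of writing past the array on overflow. A standalone sketch of the same bounded-push idea; ptr_queue and its helpers are illustrative names, not the fio API.

#include <assert.h>
#include <stdlib.h>

/*
 * The queue records its capacity at init time; an over-push trips an
 * assert instead of silently corrupting memory past the array.
 */
struct ptr_queue {
	void **items;
	unsigned int nr;
	unsigned int max;
};

static int ptrq_init(struct ptr_queue *q, unsigned int nr)
{
	q->items = calloc(nr, sizeof(void *));
	if (!q->items)
		return 1;
	q->nr = 0;
	q->max = nr;
	return 0;
}

static void ptrq_push(struct ptr_queue *q, void *p)
{
	assert(q->nr < q->max);	/* catch overflow at the call site */
	q->items[q->nr++] = p;
}

static void *ptrq_pop(struct ptr_queue *q)
{
	return q->nr ? q->items[--q->nr] : NULL;
}

int main(void)
{
	struct ptr_queue q;
	int x = 42;
	void *got;

	if (ptrq_init(&q, 4))
		return 1;
	ptrq_push(&q, &x);
	got = ptrq_pop(&q);
	free(q.items);
	return got != &x;	/* exits 0 on success */
}
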
diff --git a/ioengine.h b/ioengine.h
deleted file mode 100644
index 85923fc..0000000
--- a/ioengine.h
+++ /dev/null
@@ -1,250 +0,0 @@
-#ifndef FIO_IOENGINE_H
-#define FIO_IOENGINE_H
-
-#include "compiler/compiler.h"
-#include "os/os.h"
-#include "log.h"
-#include "io_ddir.h"
-#include "debug.h"
-#include "file.h"
-
-#ifdef CONFIG_LIBAIO
-#include <libaio.h>
-#endif
-#ifdef CONFIG_GUASI
-#include <guasi.h>
-#endif
-
-#define FIO_IOOPS_VERSION	21
-
-enum {
-	IO_U_F_FREE		= 1 << 0,
-	IO_U_F_FLIGHT		= 1 << 1,
-	IO_U_F_NO_FILE_PUT	= 1 << 2,
-	IO_U_F_IN_CUR_DEPTH	= 1 << 3,
-	IO_U_F_BUSY_OK		= 1 << 4,
-	IO_U_F_TRIMMED		= 1 << 5,
-	IO_U_F_BARRIER		= 1 << 6,
-	IO_U_F_VER_LIST		= 1 << 7,
-};
-
-/*
- * The io unit
- */
-struct io_u {
-	struct timeval start_time;
-	struct timeval issue_time;
-
-	struct fio_file *file;
-	unsigned int flags;
-	enum fio_ddir ddir;
-
-	/*
-	 * For replay workloads, we may want to account as a different
-	 * IO type than what is being submitted.
-	 */
-	enum fio_ddir acct_ddir;
-
-	/*
-	 * Allocated/set buffer and length
-	 */
-	unsigned long buflen;
-	unsigned long long offset;
-	unsigned short numberio;
-	void *buf;
-
-	/*
-	 * Initial seed for generating the buffer contents
-	 */
-	uint64_t rand_seed;
-
-	/*
-	 * IO engine state, may be different from above when we get
-	 * partial transfers / residual data counts
-	 */
-	void *xfer_buf;
-	unsigned long xfer_buflen;
-
-	/*
-	 * Parameter related to pre-filled buffers and
-	 * their size to handle variable block sizes.
-	 */
-	unsigned long buf_filled_len;
-
-	struct io_piece *ipo;
-
-	unsigned int resid;
-	unsigned int error;
-
-	/*
-	 * io engine private data
-	 */
-	union {
-		unsigned int index;
-		unsigned int seen;
-		void *engine_data;
-	};
-
-	struct flist_head verify_list;
-
-	/*
-	 * Callback for io completion
-	 */
-	int (*end_io)(struct thread_data *, struct io_u **);
-
-	union {
-#ifdef CONFIG_LIBAIO
-		struct iocb iocb;
-#endif
-#ifdef CONFIG_POSIXAIO
-		os_aiocb_t aiocb;
-#endif
-#ifdef FIO_HAVE_SGIO
-		struct sg_io_hdr hdr;
-#endif
-#ifdef CONFIG_GUASI
-		guasi_req_t greq;
-#endif
-#ifdef CONFIG_SOLARISAIO
-		aio_result_t resultp;
-#endif
-#ifdef FIO_HAVE_BINJECT
-		struct b_user_cmd buc;
-#endif
-#ifdef CONFIG_RDMA
-		struct ibv_mr *mr;
-#endif
-		void *mmap_data;
-	};
-};
-
-/*
- * io_ops->queue() return values
- */
-enum {
-	FIO_Q_COMPLETED	= 0,		/* completed sync */
-	FIO_Q_QUEUED	= 1,		/* queued, will complete async */
-	FIO_Q_BUSY	= 2,		/* no more room, call ->commit() */
-};
-
-struct ioengine_ops {
-	struct flist_head list;
-	char name[16];
-	int version;
-	int flags;
-	int (*setup)(struct thread_data *);
-	int (*init)(struct thread_data *);
-	int (*prep)(struct thread_data *, struct io_u *);
-	int (*queue)(struct thread_data *, struct io_u *);
-	int (*commit)(struct thread_data *);
-	int (*getevents)(struct thread_data *, unsigned int, unsigned int, const struct timespec *);
-	struct io_u *(*event)(struct thread_data *, int);
-	int (*cancel)(struct thread_data *, struct io_u *);
-	void (*cleanup)(struct thread_data *);
-	int (*open_file)(struct thread_data *, struct fio_file *);
-	int (*close_file)(struct thread_data *, struct fio_file *);
-	int (*invalidate)(struct thread_data *, struct fio_file *);
-	int (*unlink_file)(struct thread_data *, struct fio_file *);
-	int (*get_file_size)(struct thread_data *, struct fio_file *);
-	void (*terminate)(struct thread_data *);
-	int (*io_u_init)(struct thread_data *, struct io_u *);
-	void (*io_u_free)(struct thread_data *, struct io_u *);
-	int option_struct_size;
-	struct fio_option *options;
-	void *data;
-	void *dlhandle;
-};
-
-enum fio_ioengine_flags {
-	FIO_SYNCIO	= 1 << 0,	/* io engine has synchronous ->queue */
-	FIO_RAWIO	= 1 << 1,	/* some sort of direct/raw io */
-	FIO_DISKLESSIO	= 1 << 2,	/* no disk involved */
-	FIO_NOEXTEND	= 1 << 3,	/* engine can't extend file */
-	FIO_NODISKUTIL  = 1 << 4,	/* diskutil can't handle filename */
-	FIO_UNIDIR	= 1 << 5,	/* engine is uni-directional */
-	FIO_NOIO	= 1 << 6,	/* thread does only pseudo IO */
-	FIO_PIPEIO	= 1 << 7,	/* input/output no seekable */
-	FIO_BARRIER	= 1 << 8,	/* engine supports barriers */
-	FIO_MEMALIGN	= 1 << 9,	/* engine wants aligned memory */
-	FIO_BIT_BASED	= 1 << 10,	/* engine uses a bit base (e.g. uses Kbit as opposed to KB) */
-	FIO_FAKEIO	= 1 << 11,	/* engine pretends to do IO */
-};
-
-/*
- * External engine defined symbol to fill in the engine ops structure
- */
-typedef void (*get_ioengine_t)(struct ioengine_ops **);
-
-/*
- * io engine entry points
- */
-extern int __must_check td_io_init(struct thread_data *);
-extern int __must_check td_io_prep(struct thread_data *, struct io_u *);
-extern int __must_check td_io_queue(struct thread_data *, struct io_u *);
-extern int __must_check td_io_sync(struct thread_data *, struct fio_file *);
-extern int __must_check td_io_getevents(struct thread_data *, unsigned int, unsigned int, const struct timespec *);
-extern int __must_check td_io_commit(struct thread_data *);
-extern int __must_check td_io_open_file(struct thread_data *, struct fio_file *);
-extern int td_io_close_file(struct thread_data *, struct fio_file *);
-extern int td_io_unlink_file(struct thread_data *, struct fio_file *);
-extern int __must_check td_io_get_file_size(struct thread_data *, struct fio_file *);
-
-extern struct ioengine_ops *load_ioengine(struct thread_data *, const char *);
-extern void register_ioengine(struct ioengine_ops *);
-extern void unregister_ioengine(struct ioengine_ops *);
-extern void free_ioengine(struct thread_data *);
-extern void close_ioengine(struct thread_data *);
-
-extern int fio_show_ioengine_help(const char *engine);
-
-/*
- * io unit handling
- */
-extern struct io_u *__get_io_u(struct thread_data *);
-extern struct io_u *get_io_u(struct thread_data *);
-extern void put_io_u(struct thread_data *, struct io_u *);
-extern void clear_io_u(struct thread_data *, struct io_u *);
-extern void requeue_io_u(struct thread_data *, struct io_u **);
-extern int __must_check io_u_sync_complete(struct thread_data *, struct io_u *, uint64_t *);
-extern int __must_check io_u_queued_complete(struct thread_data *, int, uint64_t *);
-extern void io_u_queued(struct thread_data *, struct io_u *);
-extern void io_u_quiesce(struct thread_data *);
-extern void io_u_log_error(struct thread_data *, struct io_u *);
-extern void io_u_mark_depth(struct thread_data *, unsigned int);
-extern void fill_io_buffer(struct thread_data *, void *, unsigned int, unsigned int);
-extern void io_u_fill_buffer(struct thread_data *td, struct io_u *, unsigned int, unsigned int);
-void io_u_mark_complete(struct thread_data *, unsigned int);
-void io_u_mark_submit(struct thread_data *, unsigned int);
-int queue_full(const struct thread_data *);
-
-int do_io_u_sync(const struct thread_data *, struct io_u *);
-int do_io_u_trim(const struct thread_data *, struct io_u *);
-
-#ifdef FIO_INC_DEBUG
-static inline void dprint_io_u(struct io_u *io_u, const char *p)
-{
-	struct fio_file *f = io_u->file;
-
-	dprint(FD_IO, "%s: io_u %p: off=%llu/len=%lu/ddir=%d", p, io_u,
-					(unsigned long long) io_u->offset,
-					io_u->buflen, io_u->ddir);
-	if (fio_debug & (1 << FD_IO)) {
-		if (f)
-			log_info("/%s", f->file_name);
-
-		log_info("\n");
-	}
-}
-#else
-#define dprint_io_u(io_u, p)
-#endif
-
-static inline enum fio_ddir acct_ddir(struct io_u *io_u)
-{
-	if (io_u->acct_ddir != -1)
-		return io_u->acct_ddir;
-
-	return io_u->ddir;
-}
-
-#endif
diff --git a/ioengines.c b/ioengines.c
index 00098d6..c90a2ca 100644
--- a/ioengines.c
+++ b/ioengines.c
@@ -22,39 +22,31 @@
 
 static FLIST_HEAD(engine_list);
 
-static int check_engine_ops(struct ioengine_ops *ops)
+static bool check_engine_ops(struct ioengine_ops *ops)
 {
 	if (ops->version != FIO_IOOPS_VERSION) {
 		log_err("bad ioops version %d (want %d)\n", ops->version,
 							FIO_IOOPS_VERSION);
-		return 1;
+		return true;
 	}
 
 	if (!ops->queue) {
 		log_err("%s: no queue handler\n", ops->name);
-		return 1;
+		return true;
 	}
 
 	/*
 	 * sync engines only need a ->queue()
 	 */
 	if (ops->flags & FIO_SYNCIO)
-		return 0;
+		return false;
 
-	if (!ops->event) {
-		log_err("%s: no event handler\n", ops->name);
-		return 1;
-	}
-	if (!ops->getevents) {
-		log_err("%s: no getevents handler\n", ops->name);
-		return 1;
-	}
-	if (!ops->queue) {
-		log_err("%s: no queue handler\n", ops->name);
-		return 1;
+	if (!ops->event || !ops->getevents) {
+		log_err("%s: no event/getevents handler\n", ops->name);
+		return true;
 	}
 
-	return 0;
+	return false;
 }
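
check_engine_ops() above now reports missing event/getevents handlers in one place, and sync engines (FIO_SYNCIO) remain exempt because they only need a queue handler. A self-contained model of that validation rule; mini_ops and SKETCH_SYNCIO are trimmed, illustrative stand-ins for struct ioengine_ops and the real flag.

#include <stdbool.h>
#include <stdio.h>

/*
 * Every engine needs ->queue(); engines without the sync flag also need
 * ->event() and ->getevents().
 */
#define SKETCH_SYNCIO	(1 << 0)

struct mini_ops {
	const char *name;
	int flags;
	int (*queue)(void);
	void *(*event)(int);
	int (*getevents)(unsigned int, unsigned int);
};

static bool ops_invalid(const struct mini_ops *ops)
{
	if (!ops->queue) {
		fprintf(stderr, "%s: no queue handler\n", ops->name);
		return true;
	}
	if (ops->flags & SKETCH_SYNCIO)
		return false;	/* sync engines only need ->queue() */
	if (!ops->event || !ops->getevents) {
		fprintf(stderr, "%s: no event/getevents handler\n", ops->name);
		return true;
	}
	return false;
}

static int null_queue(void)
{
	return 0;
}

int main(void)
{
	struct mini_ops sync_engine = {
		.name = "sync-sketch", .flags = SKETCH_SYNCIO, .queue = null_queue,
	};
	struct mini_ops async_engine = {
		.name = "async-sketch", .queue = null_queue,
	};

	printf("sync engine ok: %d\n", !ops_invalid(&sync_engine));	/* 1 */
	printf("async engine ok: %d\n", !ops_invalid(&async_engine));	/* 0: no event handlers */
	return 0;
}
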
 
 void unregister_ioengine(struct ioengine_ops *ops)
@@ -127,17 +119,18 @@
 		return NULL;
 	}
 
-	ops->dlhandle = dlhandle;
+	td->io_ops_dlhandle = dlhandle;
 	return ops;
 }
 
 struct ioengine_ops *load_ioengine(struct thread_data *td, const char *name)
 {
-	struct ioengine_ops *ops, *ret;
-	char engine[16];
+	struct ioengine_ops *ops;
+	char engine[64];
 
 	dprint(FD_IO, "load ioengine %s\n", name);
 
+	engine[sizeof(engine) - 1] = '\0';
 	strncpy(engine, name, sizeof(engine) - 1);
 
 	/*
@@ -161,11 +154,7 @@
 	if (check_engine_ops(ops))
 		return NULL;
 
-	ret = malloc(sizeof(*ret));
-	memcpy(ret, ops, sizeof(*ret));
-	ret->data = NULL;
-
-	return ret;
+	return ops;
 }
 
 /*
@@ -181,10 +170,9 @@
 		td->eo = NULL;
 	}
 
-	if (td->io_ops->dlhandle)
-		dlclose(td->io_ops->dlhandle);
+	if (td->io_ops_dlhandle)
+		dlclose(td->io_ops_dlhandle);
 
-	free(td->io_ops);
 	td->io_ops = NULL;
 }
 
@@ -194,7 +182,7 @@
 
 	if (td->io_ops->cleanup) {
 		td->io_ops->cleanup(td);
-		td->io_ops->data = NULL;
+		td->io_ops_data = NULL;
 	}
 
 	free_ioengine(td);
@@ -264,13 +252,15 @@
 
 int td_io_queue(struct thread_data *td, struct io_u *io_u)
 {
+	const enum fio_ddir ddir = acct_ddir(io_u);
+	unsigned long buflen = io_u->xfer_buflen;
 	int ret;
 
 	dprint_io_u(io_u, "queue");
 	fio_ro_check(td, io_u);
 
 	assert((io_u->flags & IO_U_F_FLIGHT) == 0);
-	io_u->flags |= IO_U_F_FLIGHT;
+	io_u_set(td, io_u, IO_U_F_FLIGHT);
 
 	assert(fio_file_open(io_u->file));
 
@@ -282,7 +272,7 @@
 	io_u->error = 0;
 	io_u->resid = 0;
 
-	if (td->io_ops->flags & FIO_SYNCIO) {
+	if (td_ioengine_flagged(td, FIO_SYNCIO)) {
 		if (fio_fill_issue_time(td))
 			fio_gettime(&io_u->issue_time, NULL);
 
@@ -294,18 +284,21 @@
 					sizeof(struct timeval));
 	}
 
-	if (ddir_rw(acct_ddir(io_u))) {
-		td->io_issues[acct_ddir(io_u)]++;
-		td->io_issue_bytes[acct_ddir(io_u)] += io_u->xfer_buflen;
+	if (ddir_rw(ddir)) {
+		td->io_issues[ddir]++;
+		td->io_issue_bytes[ddir] += buflen;
+		td->rate_io_issue_bytes[ddir] += buflen;
 	}
 
 	ret = td->io_ops->queue(td, io_u);
 
 	unlock_file(td, io_u->file);
 
-	if (ret == FIO_Q_BUSY && ddir_rw(acct_ddir(io_u))) {
-		td->io_issues[acct_ddir(io_u)]--;
-		td->io_issue_bytes[acct_ddir(io_u)] -= io_u->xfer_buflen;
+	if (ret == FIO_Q_BUSY && ddir_rw(ddir)) {
+		td->io_issues[ddir]--;
+		td->io_issue_bytes[ddir] -= buflen;
+		td->rate_io_issue_bytes[ddir] -= buflen;
+		io_u_clear(td, io_u, IO_U_F_FLIGHT);
 	}
 
 	/*
@@ -325,7 +318,8 @@
 	    td->o.odirect) {
 
 		log_info("fio: first direct IO errored. File system may not "
-			 "support direct IO, or iomem_align= is bad.\n");
+			 "support direct IO, or iomem_align= is bad. Try "
+			 "setting direct=0.\n");
 	}
 
 	if (!td->io_ops->commit || io_u->ddir == DDIR_TRIM) {
@@ -341,10 +335,10 @@
 	} else if (ret == FIO_Q_QUEUED) {
 		int r;
 
-		if (ddir_rw(io_u->ddir)) {
-			td->io_u_queued++;
+		td->io_u_queued++;
+
+		if (ddir_rw(io_u->ddir))
 			td->ts.total_io_u[io_u->ddir]++;
-		}
 
 		if (td->io_u_queued >= td->o.iodepth_batch) {
 			r = td_io_commit(td);
@@ -353,7 +347,7 @@
 		}
 	}
 
-	if ((td->io_ops->flags & FIO_SYNCIO) == 0) {
+	if (!td_ioengine_flagged(td, FIO_SYNCIO)) {
 		if (fio_fill_issue_time(td))
 			fio_gettime(&io_u->issue_time, NULL);
 
@@ -374,17 +368,17 @@
 
 	if (td->io_ops->init) {
 		ret = td->io_ops->init(td);
-		if (ret && td->o.iodepth > 1) {
-			log_err("fio: io engine init failed. Perhaps try"
-				" reducing io depth?\n");
-		}
+		if (ret)
+			log_err("fio: io engine %s init failed.%s\n",
+				td->io_ops->name,
+				td->o.iodepth > 1 ?
+				" Perhaps try reducing io depth?" : "");
+		else
+			td->io_ops_init = 1;
 		if (!td->error)
 			td->error = ret;
 	}
 
-	if (!ret && (td->io_ops->flags & FIO_NOIO))
-		td->flags |= TD_F_NOIO;
-
 	return ret;
 }
 
@@ -448,26 +442,47 @@
 		}
 	}
 
-	if (td->io_ops->flags & FIO_DISKLESSIO)
+	if (td_ioengine_flagged(td, FIO_DISKLESSIO))
 		goto done;
 
 	if (td->o.invalidate_cache && file_invalidate_cache(td, f))
 		goto err;
 
-	if (td->o.fadvise_hint &&
-	    (f->filetype == FIO_TYPE_BD || f->filetype == FIO_TYPE_FILE)) {
+	if (td->o.fadvise_hint != F_ADV_NONE &&
+	    (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_FILE)) {
 		int flags;
 
-		if (td_random(td))
+		if (td->o.fadvise_hint == F_ADV_TYPE) {
+			if (td_random(td))
+				flags = POSIX_FADV_RANDOM;
+			else
+				flags = POSIX_FADV_SEQUENTIAL;
+		} else if (td->o.fadvise_hint == F_ADV_RANDOM)
 			flags = POSIX_FADV_RANDOM;
-		else
+		else if (td->o.fadvise_hint == F_ADV_SEQUENTIAL)
 			flags = POSIX_FADV_SEQUENTIAL;
+		else {
+			log_err("fio: unknown fadvise type %d\n",
+							td->o.fadvise_hint);
+			flags = POSIX_FADV_NORMAL;
+		}
 
 		if (posix_fadvise(f->fd, f->file_offset, f->io_size, flags) < 0) {
 			td_verror(td, errno, "fadvise");
 			goto err;
 		}
 	}
+#ifdef FIO_HAVE_STREAMID
+	if (td->o.fadvise_stream &&
+	    (f->filetype == FIO_TYPE_BLOCK || f->filetype == FIO_TYPE_FILE)) {
+		off_t stream = td->o.fadvise_stream;
+
+		if (posix_fadvise(f->fd, stream, f->io_size, POSIX_FADV_STREAMID) < 0) {
+			td_verror(td, errno, "fadvise streamid");
+			goto err;
+		}
+	}
+#endif
 
 #ifdef FIO_OS_DIRECTIO
 	/*
@@ -479,7 +494,12 @@
 
 		if (ret) {
 			td_verror(td, ret, "fio_set_odirect");
-			log_err("fio: the file system does not seem to support direct IO\n");
+			if (ret == ENOTTY) { /* ENOTTY suggests RAW device or ZFS */
+				log_err("fio: doing directIO to RAW devices or ZFS not supported\n");
+			} else {
+				log_err("fio: the file system does not seem to support direct IO\n");
+			}
+
 			goto err;
 		}
 	}
@@ -517,8 +537,15 @@
 {
 	if (td->io_ops->unlink_file)
 		return td->io_ops->unlink_file(td, f);
-	else
-		return unlink(f->file_name);
+	else {
+		int ret;
+
+		ret = unlink(f->file_name);
+		if (ret < 0)
+			return errno;
+
+		return 0;
+	}
 }
 
 int td_io_get_file_size(struct thread_data *td, struct fio_file *f)
@@ -529,77 +556,19 @@
 	return td->io_ops->get_file_size(td, f);
 }
 
-static int do_sync_file_range(const struct thread_data *td,
-			      struct fio_file *f)
-{
-	off64_t offset, nbytes;
-
-	offset = f->first_write;
-	nbytes = f->last_write - f->first_write;
-
-	if (!nbytes)
-		return 0;
-
-	return sync_file_range(f->fd, offset, nbytes, td->o.sync_file_range);
-}
-
-int do_io_u_sync(const struct thread_data *td, struct io_u *io_u)
-{
-	int ret;
-
-	if (io_u->ddir == DDIR_SYNC) {
-		ret = fsync(io_u->file->fd);
-	} else if (io_u->ddir == DDIR_DATASYNC) {
-#ifdef CONFIG_FDATASYNC
-		ret = fdatasync(io_u->file->fd);
-#else
-		ret = io_u->xfer_buflen;
-		io_u->error = EINVAL;
-#endif
-	} else if (io_u->ddir == DDIR_SYNC_FILE_RANGE)
-		ret = do_sync_file_range(td, io_u->file);
-	else {
-		ret = io_u->xfer_buflen;
-		io_u->error = EINVAL;
-	}
-
-	if (ret < 0)
-		io_u->error = errno;
-
-	return ret;
-}
-
-int do_io_u_trim(const struct thread_data *td, struct io_u *io_u)
-{
-#ifndef FIO_HAVE_TRIM
-	io_u->error = EINVAL;
-	return 0;
-#else
-	struct fio_file *f = io_u->file;
-	int ret;
-
-	ret = os_trim(f->fd, io_u->offset, io_u->xfer_buflen);
-	if (!ret)
-		return io_u->xfer_buflen;
-
-	io_u->error = ret;
-	return 0;
-#endif
-}
-
 int fio_show_ioengine_help(const char *engine)
 {
 	struct flist_head *entry;
 	struct thread_data td;
+	struct ioengine_ops *io_ops;
 	char *sep;
 	int ret = 1;
 
 	if (!engine || !*engine) {
 		log_info("Available IO engines:\n");
 		flist_for_each(entry, &engine_list) {
-			td.io_ops = flist_entry(entry, struct ioengine_ops,
-						list);
-			log_info("\t%s\n", td.io_ops->name);
+			io_ops = flist_entry(entry, struct ioengine_ops, list);
+			log_info("\t%s\n", io_ops->name);
 		}
 		return 0;
 	}
@@ -611,16 +580,16 @@
 
 	memset(&td, 0, sizeof(td));
 
-	td.io_ops = load_ioengine(&td, engine);
-	if (!td.io_ops) {
+	io_ops = load_ioengine(&td, engine);
+	if (!io_ops) {
 		log_info("IO engine %s not found\n", engine);
 		return 1;
 	}
 
-	if (td.io_ops->options)
-		ret = show_cmd_help(td.io_ops->options, sep);
+	if (io_ops->options)
+		ret = show_cmd_help(io_ops->options, sep);
 	else
-		log_info("IO engine %s has no options\n", td.io_ops->name);
+		log_info("IO engine %s has no options\n", io_ops->name);
 
 	free_ioengine(&td);
 
diff --git a/ioengines.h b/ioengines.h
new file mode 100644
index 0000000..f24f4df
--- /dev/null
+++ b/ioengines.h
@@ -0,0 +1,90 @@
+#ifndef FIO_IOENGINE_H
+#define FIO_IOENGINE_H
+
+#include "compiler/compiler.h"
+#include "os/os.h"
+#include "file.h"
+#include "io_u.h"
+
+#define FIO_IOOPS_VERSION	23
+
+/*
+ * io_ops->queue() return values
+ */
+enum {
+	FIO_Q_COMPLETED	= 0,		/* completed sync */
+	FIO_Q_QUEUED	= 1,		/* queued, will complete async */
+	FIO_Q_BUSY	= 2,		/* no more room, call ->commit() */
+};
+
+struct ioengine_ops {
+	struct flist_head list;
+	const char *name;
+	int version;
+	int flags;
+	int (*setup)(struct thread_data *);
+	int (*init)(struct thread_data *);
+	int (*prep)(struct thread_data *, struct io_u *);
+	int (*queue)(struct thread_data *, struct io_u *);
+	int (*commit)(struct thread_data *);
+	int (*getevents)(struct thread_data *, unsigned int, unsigned int, const struct timespec *);
+	struct io_u *(*event)(struct thread_data *, int);
+	char *(*errdetails)(struct io_u *);
+	int (*cancel)(struct thread_data *, struct io_u *);
+	void (*cleanup)(struct thread_data *);
+	int (*open_file)(struct thread_data *, struct fio_file *);
+	int (*close_file)(struct thread_data *, struct fio_file *);
+	int (*invalidate)(struct thread_data *, struct fio_file *);
+	int (*unlink_file)(struct thread_data *, struct fio_file *);
+	int (*get_file_size)(struct thread_data *, struct fio_file *);
+	void (*terminate)(struct thread_data *);
+	int (*iomem_alloc)(struct thread_data *, size_t);
+	void (*iomem_free)(struct thread_data *);
+	int (*io_u_init)(struct thread_data *, struct io_u *);
+	void (*io_u_free)(struct thread_data *, struct io_u *);
+	int option_struct_size;
+	struct fio_option *options;
+};
+
+enum fio_ioengine_flags {
+	FIO_SYNCIO	= 1 << 0,	/* io engine has synchronous ->queue */
+	FIO_RAWIO	= 1 << 1,	/* some sort of direct/raw io */
+	FIO_DISKLESSIO	= 1 << 2,	/* no disk involved */
+	FIO_NOEXTEND	= 1 << 3,	/* engine can't extend file */
+	FIO_NODISKUTIL  = 1 << 4,	/* diskutil can't handle filename */
+	FIO_UNIDIR	= 1 << 5,	/* engine is uni-directional */
+	FIO_NOIO	= 1 << 6,	/* thread does only pseudo IO */
+	FIO_PIPEIO	= 1 << 7,	/* input/output not seekable */
+	FIO_BARRIER	= 1 << 8,	/* engine supports barriers */
+	FIO_MEMALIGN	= 1 << 9,	/* engine wants aligned memory */
+	FIO_BIT_BASED	= 1 << 10,	/* engine uses a bit base (e.g. uses Kbit as opposed to KB) */
+	FIO_FAKEIO	= 1 << 11,	/* engine pretends to do IO */
+};
+
+/*
+ * External engine defined symbol to fill in the engine ops structure
+ */
+typedef void (*get_ioengine_t)(struct ioengine_ops **);
+
+/*
+ * io engine entry points
+ */
+extern int __must_check td_io_init(struct thread_data *);
+extern int __must_check td_io_prep(struct thread_data *, struct io_u *);
+extern int __must_check td_io_queue(struct thread_data *, struct io_u *);
+extern int __must_check td_io_getevents(struct thread_data *, unsigned int, unsigned int, const struct timespec *);
+extern int __must_check td_io_commit(struct thread_data *);
+extern int __must_check td_io_open_file(struct thread_data *, struct fio_file *);
+extern int td_io_close_file(struct thread_data *, struct fio_file *);
+extern int td_io_unlink_file(struct thread_data *, struct fio_file *);
+extern int __must_check td_io_get_file_size(struct thread_data *, struct fio_file *);
+
+extern struct ioengine_ops *load_ioengine(struct thread_data *, const char *);
+extern void register_ioengine(struct ioengine_ops *);
+extern void unregister_ioengine(struct ioengine_ops *);
+extern void free_ioengine(struct thread_data *);
+extern void close_ioengine(struct thread_data *);
+
+extern int fio_show_ioengine_help(const char *engine);
+
+#endif
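
For context on how this header is consumed: an external engine fills in a struct ioengine_ops and exports a get_ioengine() symbol, which load_ioengine() resolves with dlsym(). The sketch below is illustrative only (engine name, file name and the no-op behaviour are made up; a real engine would also supply open_file/close_file and, for async engines, commit/getevents/event):

/* sketch_engine.c - minimal no-op engine sketch against ioengines.h above */
#include "ioengines.h"

static int sketch_queue(struct thread_data *td, struct io_u *io_u)
{
	/* pretend the IO completed immediately and synchronously */
	io_u->error = 0;
	return FIO_Q_COMPLETED;
}

static struct ioengine_ops sketch_ops = {
	.name		= "sketch",
	.version	= FIO_IOOPS_VERSION,
	.flags		= FIO_SYNCIO | FIO_DISKLESSIO | FIO_FAKEIO,
	.queue		= sketch_queue,
};

/* external engines export this; fio looks it up via dlsym() at load time */
void get_ioengine(struct ioengine_ops **ops)
{
	*ops = &sketch_ops;
}

Such an object would be selected with something like ioengine=external:./sketch_engine.o in a job file (path illustrative).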
diff --git a/iolog.c b/iolog.c
index dfa329f..31d674c 100644
--- a/iolog.c
+++ b/iolog.c
@@ -18,7 +18,9 @@
 #include "verify.h"
 #include "trim.h"
 #include "filelock.h"
-#include "lib/tp.h"
+#include "smalloc.h"
+
+static int iolog_flush(struct io_log *log);
 
 static const char iolog_ver2[] = "fio version 2 iolog";
 
@@ -107,6 +109,11 @@
 
 	switch (ipo->file_action) {
 	case FIO_LOG_OPEN_FILE:
+		if (td->o.replay_redirect && fio_file_open(f)) {
+			dprint(FD_FILE, "iolog: ignoring re-open of file %s\n",
+					f->file_name);
+			break;
+		}
 		ret = td_io_open_file(td, f);
 		if (!ret)
 			break;
@@ -270,7 +277,7 @@
 			overlap = 1;
 
 		if (overlap) {
-			dprint(FD_IO, "iolog: overlap %llu/%lu, %llu/%lu",
+			dprint(FD_IO, "iolog: overlap %llu/%lu, %llu/%lu\n",
 				__ipo->offset, __ipo->len,
 				ipo->offset, ipo->len);
 			td->io_hist_len--;
@@ -291,6 +298,18 @@
 {
 	struct io_piece *ipo = io_u->ipo;
 
+	if (td->ts.nr_block_infos) {
+		uint32_t *info = io_u_block_info(td, io_u);
+		if (BLOCK_INFO_STATE(*info) < BLOCK_STATE_TRIM_FAILURE) {
+			if (io_u->ddir == DDIR_TRIM)
+				*info = BLOCK_INFO_SET_STATE(*info,
+						BLOCK_STATE_TRIM_FAILURE);
+			else if (io_u->ddir == DDIR_WRITE)
+				*info = BLOCK_INFO_SET_STATE(*info,
+						BLOCK_STATE_WRITE_FAILURE);
+		}
+	}
+
 	if (!ipo)
 		return;
 
@@ -332,7 +351,7 @@
 	unsigned long long offset;
 	unsigned int bytes;
 	int reads, writes, waits, fileno = 0, file_action = 0; /* stupid gcc */
-	char *fname, *act;
+	char *rfname, *fname, *act;
 	char *str, *p;
 	enum fio_ddir rw;
 
@@ -343,7 +362,7 @@
 	 * for doing verifications.
 	 */
 	str = malloc(4096);
-	fname = malloc(256+16);
+	rfname = fname = malloc(256+16);
 	act = malloc(256+16);
 
 	reads = writes = waits = 0;
@@ -351,8 +370,12 @@
 		struct io_piece *ipo;
 		int r;
 
-		r = sscanf(p, "%256s %256s %llu %u", fname, act, &offset,
+		r = sscanf(p, "%256s %256s %llu %u", rfname, act, &offset,
 									&bytes);
+
+		if (td->o.replay_redirect)
+			fname = td->o.replay_redirect;
+
 		if (r == 4) {
 			/*
 			 * Check action first
@@ -378,8 +401,14 @@
 		} else if (r == 2) {
 			rw = DDIR_INVAL;
 			if (!strcmp(act, "add")) {
-				fileno = add_file(td, fname, 0, 1);
-				file_action = FIO_LOG_ADD_FILE;
+				if (td->o.replay_redirect &&
+				    get_fileno(td, fname) != -1) {
+					dprint(FD_FILE, "iolog: ignoring"
+						" re-add of file %s\n", fname);
+				} else {
+					fileno = add_file(td, fname, 0, 1);
+					file_action = FIO_LOG_ADD_FILE;
+				}
 				continue;
 			} else if (!strcmp(act, "open")) {
 				fileno = get_fileno(td, fname);
@@ -393,7 +422,7 @@
 				continue;
 			}
 		} else {
-			log_err("bad iolog2: %s", p);
+			log_err("bad iolog2: %s\n", p);
 			continue;
 		}
 
@@ -407,6 +436,8 @@
 				continue;
 			writes++;
 		} else if (rw == DDIR_WAIT) {
+			if (td->o.no_stall)
+				continue;
 			waits++;
 		} else if (rw == DDIR_INVAL) {
 		} else if (!ddir_sync(rw)) {
@@ -423,7 +454,12 @@
 		if (rw == DDIR_WAIT) {
 			ipo->delay = offset;
 		} else {
-			ipo->offset = offset;
+			if (td->o.replay_scale)
+				ipo->offset = offset / td->o.replay_scale;
+			else
+				ipo->offset = offset;
+			ipo_bytes_align(td->o.replay_align, ipo);
+
 			ipo->len = bytes;
 			if (rw != DDIR_INVAL && bytes > td->o.max_bs[rw])
 				td->o.max_bs[rw] = bytes;
@@ -437,7 +473,7 @@
 
 	free(str);
 	free(act);
-	free(fname);
+	free(rfname);
 
 	if (writes && read_only) {
 		log_err("fio: <%s> skips replay of %d writes due to"
@@ -562,19 +598,41 @@
 	       const char *filename)
 {
 	struct io_log *l;
+	int i;
+	struct io_u_plat_entry *entry;
+	struct flist_head *list;
 
-	l = calloc(1, sizeof(*l));
-	l->nr_samples = 0;
-	l->max_samples = 1024;
+	l = scalloc(1, sizeof(*l));
+	INIT_FLIST_HEAD(&l->io_logs);
 	l->log_type = p->log_type;
 	l->log_offset = p->log_offset;
 	l->log_gz = p->log_gz;
 	l->log_gz_store = p->log_gz_store;
-	l->log = malloc(l->max_samples * log_entry_sz(l));
 	l->avg_msec = p->avg_msec;
+	l->hist_msec = p->hist_msec;
+	l->hist_coarseness = p->hist_coarseness;
 	l->filename = strdup(filename);
 	l->td = p->td;
 
+	/* Initialize histogram lists for each r/w direction,
+	 * with initial io_u_plat of all zeros:
+	 */
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		list = &l->hist_window[i].list;
+		INIT_FLIST_HEAD(list);
+		entry = calloc(1, sizeof(struct io_u_plat_entry));
+		flist_add(&entry->list, list);
+	}
+
+	if (l->td && l->td->o.io_submit_mode != IO_MODE_OFFLOAD) {
+		struct io_logs *p;
+
+		p = calloc(1, sizeof(*l->pending));
+		p->max_samples = DEF_LOG_ENTRIES;
+		p->log = calloc(p->max_samples, log_entry_sz(l));
+		l->pending = p;
+	}
+
 	if (l->log_offset)
 		l->log_ddir_mask = LOG_OFFSET_SAMPLE_BIT;
 
@@ -582,8 +640,8 @@
 
 	if (l->log_gz && !p->td)
 		l->log_gz = 0;
-	else if (l->log_gz) {
-		pthread_mutex_init(&l->chunk_lock, NULL);
+	else if (l->log_gz || l->log_gz_store) {
+		mutex_init_pshared(&l->chunk_lock);
 		p->td->flags |= TD_F_COMPRESS_LOG;
 	}
 
@@ -618,12 +676,88 @@
 
 void free_log(struct io_log *log)
 {
-	free(log->log);
+	while (!flist_empty(&log->io_logs)) {
+		struct io_logs *cur_log;
+
+		cur_log = flist_first_entry(&log->io_logs, struct io_logs, list);
+		flist_del_init(&cur_log->list);
+		free(cur_log->log);
+		sfree(cur_log);
+	}
+
+	if (log->pending) {
+		free(log->pending->log);
+		free(log->pending);
+		log->pending = NULL;
+	}
+
+	free(log->pending);
 	free(log->filename);
-	free(log);
+	sfree(log);
 }
 
-static void flush_samples(FILE *f, void *samples, uint64_t sample_size)
+unsigned long hist_sum(int j, int stride, unsigned int *io_u_plat,
+		unsigned int *io_u_plat_last)
+{
+	unsigned long sum;
+	int k;
+
+	if (io_u_plat_last) {
+		for (k = sum = 0; k < stride; k++)
+			sum += io_u_plat[j + k] - io_u_plat_last[j + k];
+	} else {
+		for (k = sum = 0; k < stride; k++)
+			sum += io_u_plat[j + k];
+	}
+
+	return sum;
+}
+
+static void flush_hist_samples(FILE *f, int hist_coarseness, void *samples,
+			       uint64_t sample_size)
+{
+	struct io_sample *s;
+	int log_offset;
+	uint64_t i, j, nr_samples;
+	struct io_u_plat_entry *entry, *entry_before;
+	unsigned int *io_u_plat;
+	unsigned int *io_u_plat_before;
+
+	int stride = 1 << hist_coarseness;
+	
+	if (!sample_size)
+		return;
+
+	s = __get_sample(samples, 0, 0);
+	log_offset = (s->__ddir & LOG_OFFSET_SAMPLE_BIT) != 0;
+
+	nr_samples = sample_size / __log_entry_sz(log_offset);
+
+	for (i = 0; i < nr_samples; i++) {
+		s = __get_sample(samples, log_offset, i);
+
+		entry = s->data.plat_entry;
+		io_u_plat = entry->io_u_plat;
+
+		entry_before = flist_first_entry(&entry->list, struct io_u_plat_entry, list);
+		io_u_plat_before = entry_before->io_u_plat;
+
+		fprintf(f, "%lu, %u, %u, ", (unsigned long) s->time,
+						io_sample_ddir(s), s->bs);
+		for (j = 0; j < FIO_IO_U_PLAT_NR - stride; j += stride) {
+			fprintf(f, "%lu, ", hist_sum(j, stride, io_u_plat,
+						io_u_plat_before));
+		}
+		fprintf(f, "%lu\n", (unsigned long)
+		        hist_sum(FIO_IO_U_PLAT_NR - stride, stride, io_u_plat,
+					io_u_plat_before));
+
+		flist_del(&entry_before->list);
+		free(entry_before);
+	}
+}
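
As a small illustration of the stride handling above (a sketch with made-up values and 8 buckets instead of FIO_IO_U_PLAT_NR): hist_coarseness = 2 gives stride = 4, so each output column sums four adjacent io_u_plat buckets, and passing the previous window's buckets turns cumulative counts into per-interval counts.

static void hist_sum_example(void)
{
	unsigned int cur[8]  = { 5, 1, 0, 2, 7, 0, 0, 3 };
	unsigned int last[8] = { 4, 1, 0, 0, 6, 0, 0, 1 };
	int stride = 1 << 2;			/* hist_coarseness = 2 */

	/* cumulative coarse bins: 5+1+0+2 = 8 and 7+0+0+3 = 10 */
	unsigned long a = hist_sum(0, stride, cur, NULL);
	unsigned long b = hist_sum(4, stride, cur, NULL);

	/* deltas against the previous window: 3 and 3 */
	unsigned long da = hist_sum(0, stride, cur, last);
	unsigned long db = hist_sum(4, stride, cur, last);

	(void) a; (void) b; (void) da; (void) db;
}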
+
+void flush_samples(FILE *f, void *samples, uint64_t sample_size)
 {
 	struct io_sample *s;
 	int log_offset;
@@ -641,16 +775,16 @@
 		s = __get_sample(samples, log_offset, i);
 
 		if (!log_offset) {
-			fprintf(f, "%lu, %lu, %u, %u\n",
+			fprintf(f, "%lu, %" PRId64 ", %u, %u\n",
 					(unsigned long) s->time,
-					(unsigned long) s->val,
+					s->data.val,
 					io_sample_ddir(s), s->bs);
 		} else {
 			struct io_sample_offset *so = (void *) s;
 
-			fprintf(f, "%lu, %lu, %u, %u, %llu\n",
+			fprintf(f, "%lu, %" PRId64 ", %u, %u, %llu\n",
 					(unsigned long) s->time,
-					(unsigned long) s->val,
+					s->data.val,
 					io_sample_ddir(s), s->bs,
 					(unsigned long long) so->offset);
 		}
@@ -660,17 +794,11 @@
 #ifdef CONFIG_ZLIB
 
 struct iolog_flush_data {
-	struct tp_work work;
+	struct workqueue_work work;
 	struct io_log *log;
 	void *samples;
-	uint64_t nr_samples;
-};
-
-struct iolog_compress {
-	struct flist_head list;
-	void *buf;
-	size_t len;
-	unsigned int seq;
+	uint32_t nr_samples;
+	bool free;
 };
 
 #define GZ_CHUNK	131072
@@ -697,6 +825,7 @@
 {
 	int wbits = 15;
 
+	memset(stream, 0, sizeof(*stream));
 	stream->zalloc = Z_NULL;
 	stream->zfree = Z_NULL;
 	stream->opaque = Z_NULL;
@@ -731,7 +860,8 @@
 
 	ret = inflateEnd(stream);
 	if (ret != Z_OK)
-		log_err("fio: failed to end log inflation (%d)\n", ret);
+		log_err("fio: failed to end log inflation seq %d (%d)\n",
+				iter->seq, ret);
 
 	flush_samples(f, iter->buf, iter->buf_used);
 	free(iter->buf);
@@ -748,7 +878,7 @@
 {
 	size_t ret;
 
-	dprint(FD_COMPRESS, "inflate chunk size=%lu, seq=%u",
+	dprint(FD_COMPRESS, "inflate chunk size=%lu, seq=%u\n",
 				(unsigned long) ic->len, ic->seq);
 
 	if (ic->seq != iter->seq) {
@@ -795,7 +925,7 @@
 
 	ret = (void *) stream->next_in - ic->buf;
 
-	dprint(FD_COMPRESS, "inflated to size=%lu\n", (unsigned long) ret);
+	dprint(FD_COMPRESS, "inflated to size=%lu\n", (unsigned long) iter->buf_size);
 
 	return ret;
 }
@@ -933,12 +1063,15 @@
 
 #endif
 
-void flush_log(struct io_log *log)
+void flush_log(struct io_log *log, bool do_append)
 {
 	void *buf;
 	FILE *f;
 
-	f = fopen(log->filename, "w");
+	if (!do_append)
+		f = fopen(log->filename, "w");
+	else
+		f = fopen(log->filename, "a");
 	if (!f) {
 		perror("fopen log");
 		return;
@@ -948,7 +1081,20 @@
 
 	inflate_gz_chunks(log, f);
 
-	flush_samples(f, log->log, log->nr_samples * log_entry_sz(log));
+	while (!flist_empty(&log->io_logs)) {
+		struct io_logs *cur_log;
+
+		cur_log = flist_first_entry(&log->io_logs, struct io_logs, list);
+		flist_del_init(&cur_log->list);
+		
+		if (log->td && log == log->td->clat_hist_log)
+			flush_hist_samples(f, log->hist_coarseness, cur_log->log,
+			                   log_sample_sz(log, cur_log));
+		else
+			flush_samples(f, cur_log->log, log_sample_sz(log, cur_log));
+		
+		sfree(cur_log);
+	}
 
 	fclose(f);
 	clear_file_buffer(buf);
@@ -956,8 +1102,8 @@
 
 static int finish_log(struct thread_data *td, struct io_log *log, int trylock)
 {
-	if (td->tp_data)
-		iolog_flush(log, 1);
+	if (td->flags & TD_F_COMPRESS_LOG)
+		iolog_flush(log);
 
 	if (trylock) {
 		if (fio_trylock_file(log->filename))
@@ -965,27 +1111,41 @@
 	} else
 		fio_lock_file(log->filename);
 
-	if (td->client_type == FIO_CLIENT_TYPE_GUI)
+	if (td->client_type == FIO_CLIENT_TYPE_GUI || is_backend)
 		fio_send_iolog(td, log, log->filename);
 	else
-		flush_log(log);
+		flush_log(log, !td->o.per_job_logs);
 
 	fio_unlock_file(log->filename);
 	free_log(log);
 	return 0;
 }
 
+size_t log_chunk_sizes(struct io_log *log)
+{
+	struct flist_head *entry;
+	size_t ret;
+
+	if (flist_empty(&log->chunk_list))
+		return 0;
+
+	ret = 0;
+	pthread_mutex_lock(&log->chunk_lock);
+	flist_for_each(entry, &log->chunk_list) {
+		struct iolog_compress *c;
+
+		c = flist_entry(entry, struct iolog_compress, list);
+		ret += c->len;
+	}
+	pthread_mutex_unlock(&log->chunk_lock);
+	return ret;
+}
+
 #ifdef CONFIG_ZLIB
 
-/*
- * Invoked from our compress helper thread, when logging would have exceeded
- * the specified memory limitation. Compresses the previously stored
- * entries.
- */
-static int gz_work(struct tp_work *work)
+static int gz_work(struct iolog_flush_data *data)
 {
-	struct iolog_flush_data *data;
-	struct iolog_compress *c;
+	struct iolog_compress *c = NULL;
 	struct flist_head list;
 	unsigned int seq;
 	z_stream stream;
@@ -994,8 +1154,7 @@
 
 	INIT_FLIST_HEAD(&list);
 
-	data = container_of(work, struct iolog_flush_data, work);
-
+	memset(&stream, 0, sizeof(stream));
 	stream.zalloc = Z_NULL;
 	stream.zfree = Z_NULL;
 	stream.opaque = Z_NULL;
@@ -1003,7 +1162,7 @@
 	ret = deflateInit(&stream, Z_DEFAULT_COMPRESSION);
 	if (ret != Z_OK) {
 		log_err("fio: failed to init gz stream\n");
-		return 0;
+		goto err;
 	}
 
 	seq = ++data->log->chunk_seq;
@@ -1011,9 +1170,13 @@
 	stream.next_in = (void *) data->samples;
 	stream.avail_in = data->nr_samples * log_entry_sz(data->log);
 
-	dprint(FD_COMPRESS, "deflate input size=%lu, seq=%u\n",
-				(unsigned long) stream.avail_in, seq);
+	dprint(FD_COMPRESS, "deflate input size=%lu, seq=%u, log=%s\n",
+				(unsigned long) stream.avail_in, seq,
+				data->log->filename);
 	do {
+		if (c)
+			dprint(FD_COMPRESS, "seq=%d, chunk=%lu\n", seq,
+				(unsigned long) c->len);
 		c = get_new_chunk(seq);
 		stream.avail_out = GZ_CHUNK;
 		stream.next_out = c->buf;
@@ -1033,9 +1196,26 @@
 	stream.avail_out = GZ_CHUNK - c->len;
 
 	ret = deflate(&stream, Z_FINISH);
-	if (ret == Z_STREAM_END)
-		c->len = GZ_CHUNK - stream.avail_out;
-	else {
+	if (ret < 0) {
+		/*
+		 * Z_BUF_ERROR is special, it just means we need more
+		 * output space. We'll handle that below. Treat any other
+		 * error as fatal.
+		 */
+		if (ret != Z_BUF_ERROR) {
+			log_err("fio: deflate log (%d)\n", ret);
+			flist_del(&c->list);
+			free_chunk(c);
+			goto err;
+		}
+	}
+
+	total -= c->len;
+	c->len = GZ_CHUNK - stream.avail_out;
+	total += c->len;
+	dprint(FD_COMPRESS, "seq=%d, chunk=%lu\n", seq, (unsigned long) c->len);
+
+	if (ret != Z_STREAM_END) {
 		do {
 			c = get_new_chunk(seq);
 			stream.avail_out = GZ_CHUNK;
@@ -1044,6 +1224,8 @@
 			c->len = GZ_CHUNK - stream.avail_out;
 			total += c->len;
 			flist_add_tail(&c->list, &list);
+			dprint(FD_COMPRESS, "seq=%d, chunk=%lu\n", seq,
+				(unsigned long) c->len);
 		} while (ret != Z_STREAM_END);
 	}
 
@@ -1063,12 +1245,8 @@
 
 	ret = 0;
 done:
-	if (work->wait) {
-		work->done = 1;
-		pthread_cond_signal(&work->cv);
-	} else
+	if (data->free)
 		free(data);
-
 	return ret;
 err:
 	while (!flist_empty(&list)) {
@@ -1081,16 +1259,93 @@
 }
 
 /*
- * Queue work item to compress the existing log entries. We copy the
- * samples, and reset the log sample count to 0 (so the logging will
- * continue to use the memory associated with the log). If called with
- * wait == 1, will not return until the log compression has completed.
+ * Invoked from our compress helper thread, when logging would have exceeded
+ * the specified memory limitation. Compresses the previously stored
+ * entries.
  */
-int iolog_flush(struct io_log *log, int wait)
+static int gz_work_async(struct submit_worker *sw, struct workqueue_work *work)
 {
-	struct tp_data *tdat = log->td->tp_data;
+	return gz_work(container_of(work, struct iolog_flush_data, work));
+}
+
+static int gz_init_worker(struct submit_worker *sw)
+{
+	struct thread_data *td = sw->wq->td;
+
+	if (!fio_option_is_set(&td->o, log_gz_cpumask))
+		return 0;
+
+	if (fio_setaffinity(gettid(), td->o.log_gz_cpumask) == -1) {
+		log_err("gz: failed to set CPU affinity\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+static struct workqueue_ops log_compress_wq_ops = {
+	.fn		= gz_work_async,
+	.init_worker_fn	= gz_init_worker,
+	.nice		= 1,
+};
+
+int iolog_compress_init(struct thread_data *td, struct sk_out *sk_out)
+{
+	if (!(td->flags & TD_F_COMPRESS_LOG))
+		return 0;
+
+	workqueue_init(td, &td->log_compress_wq, &log_compress_wq_ops, 1, sk_out);
+	return 0;
+}
+
+void iolog_compress_exit(struct thread_data *td)
+{
+	if (!(td->flags & TD_F_COMPRESS_LOG))
+		return;
+
+	workqueue_exit(&td->log_compress_wq);
+}
+
+/*
+ * Queue work item to compress the existing log entries. We reset the
+ * current log to a small size, and reference the existing log in the
+ * data that we queue for compression. Once compression has been done,
+ * this old log is freed. The synchronous iolog_flush() below drains and
+ * compresses every pending set of samples before returning, while
+ * iolog_cur_flush() hands a single full log off to the compression workqueue.
+ */
+static int iolog_flush(struct io_log *log)
+{
 	struct iolog_flush_data *data;
-	size_t sample_size;
+
+	data = malloc(sizeof(*data));
+	if (!data)
+		return 1;
+
+	data->log = log;
+	data->free = false;
+
+	while (!flist_empty(&log->io_logs)) {
+		struct io_logs *cur_log;
+
+		cur_log = flist_first_entry(&log->io_logs, struct io_logs, list);
+		flist_del_init(&cur_log->list);
+
+		data->samples = cur_log->log;
+		data->nr_samples = cur_log->nr_samples;
+
+		sfree(cur_log);
+
+		gz_work(data);
+	}
+
+	free(data);
+	return 0;
+}
+
+int iolog_cur_flush(struct io_log *log, struct io_logs *cur_log)
+{
+	struct iolog_flush_data *data;
 
 	data = malloc(sizeof(*data));
 	if (!data)
@@ -1098,96 +1353,152 @@
 
 	data->log = log;
 
-	sample_size = log->nr_samples * log_entry_sz(log);
-	data->samples = malloc(sample_size);
-	if (!data->samples) {
-		free(data);
-		return 1;
-	}
+	data->samples = cur_log->log;
+	data->nr_samples = cur_log->nr_samples;
+	data->free = true;
 
-	memcpy(data->samples, log->log, sample_size);
-	data->nr_samples = log->nr_samples;
-	data->work.fn = gz_work;
-	log->nr_samples = 0;
+	cur_log->nr_samples = cur_log->max_samples = 0;
+	cur_log->log = NULL;
 
-	if (wait) {
-		pthread_mutex_init(&data->work.lock, NULL);
-		pthread_cond_init(&data->work.cv, NULL);
-		data->work.wait = 1;
-	} else
-		data->work.wait = 0;
-
-	data->work.prio = 1;
-	tp_queue_work(tdat, &data->work);
-
-	if (wait) {
-		pthread_mutex_lock(&data->work.lock);
-		while (!data->work.done)
-			pthread_cond_wait(&data->work.cv, &data->work.lock);
-		pthread_mutex_unlock(&data->work.lock);
-		free(data);
-	}
-
+	workqueue_enqueue(&log->td->log_compress_wq, &data->work);
 	return 0;
 }
-
 #else
 
-int iolog_flush(struct io_log *log, int wait)
+static int iolog_flush(struct io_log *log)
 {
 	return 1;
 }
 
+int iolog_cur_flush(struct io_log *log, struct io_logs *cur_log)
+{
+	return 1;
+}
+
+int iolog_compress_init(struct thread_data *td, struct sk_out *sk_out)
+{
+	return 0;
+}
+
+void iolog_compress_exit(struct thread_data *td)
+{
+}
+
 #endif
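
For readers unfamiliar with the chunking that gz_work() performs, the underlying zlib pattern is a plain deflate loop with a fixed-size output buffer and Z_FINISH. The self-contained sketch below is not fio code (the function name is made up, and output bytes are simply discarded where fio would queue each chunk on log->chunk_list):

#include <string.h>
#include <zlib.h>

#define CHUNK	131072	/* mirrors GZ_CHUNK */

/* Compress len bytes from src into CHUNK-sized pieces; returns the number
 * of chunks produced, or -1 on error. */
static int deflate_in_chunks(const void *src, unsigned int len)
{
	unsigned char out[CHUNK];
	z_stream stream;
	int ret, nr_chunks = 0;

	memset(&stream, 0, sizeof(stream));
	if (deflateInit(&stream, Z_DEFAULT_COMPRESSION) != Z_OK)
		return -1;

	stream.next_in = (void *) src;
	stream.avail_in = len;

	do {
		stream.next_out = out;
		stream.avail_out = sizeof(out);
		ret = deflate(&stream, Z_FINISH);
		if (ret == Z_STREAM_ERROR) {
			deflateEnd(&stream);
			return -1;
		}
		/* sizeof(out) - stream.avail_out bytes of output are ready */
		nr_chunks++;
	} while (stream.avail_out == 0);

	deflateEnd(&stream);
	return nr_chunks;
}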
 
-static int write_iops_log(struct thread_data *td, int try)
+struct io_logs *iolog_cur_log(struct io_log *log)
 {
-	struct io_log *log = td->iops_log;
+	if (flist_empty(&log->io_logs))
+		return NULL;
 
-	if (!log)
-		return 0;
-
-	return finish_log(td, log, try);
+	return flist_last_entry(&log->io_logs, struct io_logs, list);
 }
 
-static int write_slat_log(struct thread_data *td, int try)
+uint64_t iolog_nr_samples(struct io_log *iolog)
 {
-	struct io_log *log = td->slat_log;
+	struct flist_head *entry;
+	uint64_t ret = 0;
 
-	if (!log)
-		return 0;
+	flist_for_each(entry, &iolog->io_logs) {
+		struct io_logs *cur_log;
 
-	return finish_log(td, log, try);
+		cur_log = flist_entry(entry, struct io_logs, list);
+		ret += cur_log->nr_samples;
+	}
+
+	return ret;
 }
 
-static int write_clat_log(struct thread_data *td, int try)
+static int __write_log(struct thread_data *td, struct io_log *log, int try)
 {
-	struct io_log *log = td->clat_log;
+	if (log)
+		return finish_log(td, log, try);
 
-	if (!log)
-		return 0;
-
-	return finish_log(td, log, try);
+	return 0;
 }
 
-static int write_lat_log(struct thread_data *td, int try)
+static int write_iops_log(struct thread_data *td, int try, bool unit_log)
 {
-	struct io_log *log = td->lat_log;
+	int ret;
 
-	if (!log)
+	if (per_unit_log(td->iops_log) != unit_log)
 		return 0;
 
-	return finish_log(td, log, try);
+	ret = __write_log(td, td->iops_log, try);
+	if (!ret)
+		td->iops_log = NULL;
+
+	return ret;
 }
 
-static int write_bandw_log(struct thread_data *td, int try)
+static int write_slat_log(struct thread_data *td, int try, bool unit_log)
 {
-	struct io_log *log = td->bw_log;
+	int ret;
 
-	if (!log)
+	if (!unit_log)
 		return 0;
 
-	return finish_log(td, log, try);
+	ret = __write_log(td, td->slat_log, try);
+	if (!ret)
+		td->slat_log = NULL;
+
+	return ret;
+}
+
+static int write_clat_log(struct thread_data *td, int try, bool unit_log)
+{
+	int ret;
+
+	if (!unit_log)
+		return 0;
+
+	ret = __write_log(td, td->clat_log, try);
+	if (!ret)
+		td->clat_log = NULL;
+
+	return ret;
+}
+
+static int write_clat_hist_log(struct thread_data *td, int try, bool unit_log)
+{
+	int ret;
+
+	if (!unit_log)
+		return 0;
+
+	ret = __write_log(td, td->clat_hist_log, try);
+	if (!ret)
+		td->clat_hist_log = NULL;
+
+	return ret;
+}
+
+static int write_lat_log(struct thread_data *td, int try, bool unit_log)
+{
+	int ret;
+
+	if (!unit_log)
+		return 0;
+
+	ret = __write_log(td, td->lat_log, try);
+	if (!ret)
+		td->lat_log = NULL;
+
+	return ret;
+}
+
+static int write_bandw_log(struct thread_data *td, int try, bool unit_log)
+{
+	int ret;
+
+	if (per_unit_log(td->bw_log) != unit_log)
+		return 0;
+
+	ret = __write_log(td, td->bw_log, try);
+	if (!ret)
+		td->bw_log = NULL;
+
+	return ret;
 }
 
 enum {
@@ -1196,13 +1507,14 @@
 	SLAT_LOG_MASK	= 4,
 	CLAT_LOG_MASK	= 8,
 	IOPS_LOG_MASK	= 16,
+	CLAT_HIST_LOG_MASK = 32,
 
-	ALL_LOG_NR	= 5,
+	ALL_LOG_NR	= 6,
 };
 
 struct log_type {
 	unsigned int mask;
-	int (*fn)(struct thread_data *, int);
+	int (*fn)(struct thread_data *, int, bool);
 };
 
 static struct log_type log_types[] = {
@@ -1226,9 +1538,13 @@
 		.mask	= IOPS_LOG_MASK,
 		.fn	= write_iops_log,
 	},
+	{
+		.mask	= CLAT_HIST_LOG_MASK,
+		.fn	= write_clat_hist_log,
+	}
 };
 
-void fio_writeout_logs(struct thread_data *td)
+void td_writeout_logs(struct thread_data *td, bool unit_logs)
 {
 	unsigned int log_mask = 0;
 	unsigned int log_left = ALL_LOG_NR;
@@ -1236,7 +1552,7 @@
 
 	old_state = td_bump_runstate(td, TD_FINISHING);
 
-	finalize_logs(td);
+	finalize_logs(td, unit_logs);
 
 	while (log_left) {
 		int prev_log_left = log_left;
@@ -1246,7 +1562,7 @@
 			int ret;
 
 			if (!(log_mask & lt->mask)) {
-				ret = lt->fn(td, log_left != 1);
+				ret = lt->fn(td, log_left != 1, unit_logs);
 				if (!ret) {
 					log_left--;
 					log_mask |= lt->mask;
@@ -1260,3 +1576,12 @@
 
 	td_restore_runstate(td, old_state);
 }
+
+void fio_writeout_logs(bool unit_logs)
+{
+	struct thread_data *td;
+	int i;
+
+	for_each_td(td, i)
+		td_writeout_logs(td, unit_logs);
+}
diff --git a/iolog.h b/iolog.h
index a1e32ae..0733ad3 100644
--- a/iolog.h
+++ b/iolog.h
@@ -4,7 +4,7 @@
 #include "lib/rbtree.h"
 #include "lib/ieee754.h"
 #include "flist.h"
-#include "ioengine.h"
+#include "ioengines.h"
 
 /*
  * Use for maintaining statistics
@@ -18,12 +18,27 @@
 	fio_fp64_t S;
 };
 
+struct io_hist {
+	uint64_t samples;
+	unsigned long hist_last;
+	struct flist_head list;
+};
+
+
+union io_sample_data {
+	uint64_t val;
+	struct io_u_plat_entry *plat_entry;
+};
+
+#define sample_val(value) ((union io_sample_data) { .val = value })
+#define sample_plat(plat) ((union io_sample_data) { .plat_entry = plat })
+
 /*
  * A single data sample
  */
 struct io_sample {
 	uint64_t time;
-	uint64_t val;
+	union io_sample_data data;
 	uint32_t __ddir;
 	uint32_t bs;
 };
@@ -39,6 +54,17 @@
 	IO_LOG_TYPE_SLAT,
 	IO_LOG_TYPE_BW,
 	IO_LOG_TYPE_IOPS,
+	IO_LOG_TYPE_HIST,
+};
+
+#define DEF_LOG_ENTRIES		1024
+#define MAX_LOG_ENTRIES		(1024 * DEF_LOG_ENTRIES)
+
+struct io_logs {
+	struct flist_head list;
+	uint64_t nr_samples;
+	uint64_t max_samples;
+	void *log;
 };
 
 /*
@@ -48,9 +74,14 @@
 	/*
 	 * Entries already logged
 	 */
-	uint64_t nr_samples;
-	uint64_t max_samples;
-	void *log;
+	struct flist_head io_logs;
+	uint32_t cur_log_max;
+
+	/*
+	 * When the current log runs out of space, store events here until
+	 * we have a chance to regrow
+	 */
+	struct io_logs *pending;
 
 	unsigned int log_ddir_mask;
 
@@ -63,7 +94,7 @@
 	/*
 	 * If we fail extending the log, stop collecting more entries.
 	 */
-	unsigned int disabled;
+	bool disabled;
 
 	/*
 	 * Log offsets
@@ -88,6 +119,15 @@
 	unsigned long avg_msec;
 	unsigned long avg_last;
 
+	/*
+	 * Windowed latency histograms, for keeping track of when we need to
+	 * save a copy of the histogram approximately every hist_msec
+	 * milliseconds.
+	 */
+	struct io_hist hist_window[DDIR_RWDIR_CNT];
+	unsigned long hist_msec;
+	unsigned int hist_coarseness;
+
 	pthread_mutex_t chunk_lock;
 	unsigned int chunk_seq;
 	struct flist_head chunk_list;
@@ -119,16 +159,27 @@
 	return __log_entry_sz(log->log_offset);
 }
 
+static inline size_t log_sample_sz(struct io_log *log, struct io_logs *cur_log)
+{
+	return cur_log->nr_samples * log_entry_sz(log);
+}
+
 static inline struct io_sample *__get_sample(void *samples, int log_offset,
 					     uint64_t sample)
 {
-	return samples + sample * __log_entry_sz(log_offset);
+	uint64_t sample_offset = sample * __log_entry_sz(log_offset);
+	return (struct io_sample *) ((char *) samples + sample_offset);
 }
 
+struct io_logs *iolog_cur_log(struct io_log *);
+uint64_t iolog_nr_samples(struct io_log *);
+void regrow_logs(struct thread_data *);
+
 static inline struct io_sample *get_sample(struct io_log *iolog,
+					   struct io_logs *cur_log,
 					   uint64_t sample)
 {
-	return __get_sample(iolog->log, iolog->log_offset, sample);
+	return __get_sample(cur_log->log, iolog->log_offset, sample);
 }
 
 enum {
@@ -183,6 +234,9 @@
 extern void queue_io_piece(struct thread_data *, struct io_piece *);
 extern void prune_io_piece_log(struct thread_data *);
 extern void write_iolog_close(struct thread_data *);
+extern int iolog_compress_init(struct thread_data *, struct sk_out *);
+extern void iolog_compress_exit(struct thread_data *);
+extern size_t log_chunk_sizes(struct io_log *);
 
 #ifdef CONFIG_ZLIB
 extern int iolog_file_inflate(const char *);
@@ -194,6 +248,8 @@
 struct log_params {
 	struct thread_data *td;
 	unsigned long avg_msec;
+	unsigned long hist_msec;
+	int hist_coarseness;
 	int log_type;
 	int log_offset;
 	int log_gz;
@@ -201,27 +257,35 @@
 	int log_compress;
 };
 
-extern void finalize_logs(struct thread_data *td);
-extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long,
-				unsigned int, uint64_t);
-extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long,
-				unsigned int, uint64_t);
-extern void add_slat_sample(struct thread_data *, enum fio_ddir, unsigned long,
-				unsigned int, uint64_t);
-extern void add_bw_sample(struct thread_data *, enum fio_ddir, unsigned int,
-				struct timeval *);
-extern void add_iops_sample(struct thread_data *, enum fio_ddir, unsigned int,
-				struct timeval *);
-extern void init_disk_util(struct thread_data *);
-extern void update_rusage_stat(struct thread_data *);
+static inline bool per_unit_log(struct io_log *log)
+{
+	return log && !log->avg_msec;
+}
+
+static inline bool inline_log(struct io_log *log)
+{
+	return log->log_type == IO_LOG_TYPE_LAT ||
+		log->log_type == IO_LOG_TYPE_CLAT ||
+		log->log_type == IO_LOG_TYPE_SLAT;
+}
+
+static inline void ipo_bytes_align(unsigned int replay_align, struct io_piece *ipo)
+{
+	if (!replay_align)
+		return;
+
+	ipo->offset &= ~(replay_align - (uint64_t)1);
+}
+
+extern void finalize_logs(struct thread_data *td, bool);
 extern void setup_log(struct io_log **, struct log_params *, const char *);
-extern void flush_log(struct io_log *);
+extern void flush_log(struct io_log *, bool);
+extern void flush_samples(FILE *, void *, uint64_t);
+extern unsigned long hist_sum(int, int, unsigned int *, unsigned int *);
 extern void free_log(struct io_log *);
-extern struct io_log *agg_io_log[DDIR_RWDIR_CNT];
-extern int write_bw_log;
-extern void add_agg_sample(unsigned long, enum fio_ddir, unsigned int);
-extern void fio_writeout_logs(struct thread_data *);
-extern int iolog_flush(struct io_log *, int);
+extern void fio_writeout_logs(bool);
+extern void td_writeout_logs(struct thread_data *, bool);
+extern int iolog_cur_flush(struct io_log *, struct io_logs *);
 
 static inline void init_ipo(struct io_piece *ipo)
 {
@@ -229,4 +293,11 @@
 	INIT_FLIST_HEAD(&ipo->trim_list);
 }
 
+struct iolog_compress {
+	struct flist_head list;
+	void *buf;
+	size_t len;
+	unsigned int seq;
+};
+
 #endif
diff --git a/json.c b/json.c
index 6145ee4..e0227ec 100644
--- a/json.c
+++ b/json.c
@@ -40,7 +40,7 @@
 	return value;
 }
 
-static struct json_value *json_create_value_float(float number)
+static struct json_value *json_create_value_float(double number)
 {
 	struct json_value *value = malloc(sizeof(struct json_value));
 
@@ -231,7 +231,7 @@
 	return 0;
 }
 
-static void json_print_array(struct json_array *array);
+static void json_print_array(struct json_array *array, struct buf_output *);
 int json_array_add_value_type(struct json_array *array, int type, ...)
 {
 	struct json_value *value;
@@ -290,70 +290,70 @@
 		return json_array_level(value->parent_array) + 1;
 }
 
-static void json_print_level(int level)
+static void json_print_level(int level, struct buf_output *out)
 {
 	while (level-- > 0)
-		log_info("  ");
+		log_buf(out, "  ");
 }
 
-static void json_print_pair(struct json_pair *pair);
-static void json_print_array(struct json_array *array);
-static void json_print_value(struct json_value *value);
-void json_print_object(struct json_object *obj)
+static void json_print_pair(struct json_pair *pair, struct buf_output *);
+static void json_print_array(struct json_array *array, struct buf_output *);
+static void json_print_value(struct json_value *value, struct buf_output *);
+void json_print_object(struct json_object *obj, struct buf_output *out)
 {
 	int i;
 
-	log_info("{\n");
+	log_buf(out, "{\n");
 	for (i = 0; i < obj->pair_cnt; i++) {
 		if (i > 0)
-			log_info(",\n");
-		json_print_pair(obj->pairs[i]);
+			log_buf(out, ",\n");
+		json_print_pair(obj->pairs[i], out);
 	}
-	log_info("\n");
-	json_print_level(json_object_level(obj));
-	log_info("}");
+	log_buf(out, "\n");
+	json_print_level(json_object_level(obj), out);
+	log_buf(out, "}");
 }
 
-static void json_print_pair(struct json_pair *pair)
+static void json_print_pair(struct json_pair *pair, struct buf_output *out)
 {
-	json_print_level(json_pair_level(pair));
-	log_info("\"%s\" : ", pair->name);
-	json_print_value(pair->value);
+	json_print_level(json_pair_level(pair), out);
+	log_buf(out, "\"%s\" : ", pair->name);
+	json_print_value(pair->value, out);
 }
 
-static void json_print_array(struct json_array *array)
+static void json_print_array(struct json_array *array, struct buf_output *out)
 {
 	int i;
 
-	log_info("[\n");
+	log_buf(out, "[\n");
 	for (i = 0; i < array->value_cnt; i++) {
 		if (i > 0)
-			log_info(",\n");
-		json_print_level(json_value_level(array->values[i]));
-		json_print_value(array->values[i]);
+			log_buf(out, ",\n");
+		json_print_level(json_value_level(array->values[i]), out);
+		json_print_value(array->values[i], out);
 	}
-	log_info("\n");
-	json_print_level(json_array_level(array));
-	log_info("]");
+	log_buf(out, "\n");
+	json_print_level(json_array_level(array), out);
+	log_buf(out, "]");
 }
 
-static void json_print_value(struct json_value *value)
+static void json_print_value(struct json_value *value, struct buf_output *out)
 {
 	switch (value->type) {
 	case JSON_TYPE_STRING:
-		log_info("\"%s\"", value->string);
+		log_buf(out, "\"%s\"", value->string);
 		break;
 	case JSON_TYPE_INTEGER:
-		log_info("%lld", value->integer_number);
+		log_buf(out, "%lld", value->integer_number);
 		break;
 	case JSON_TYPE_FLOAT:
-		log_info("%.2f", value->float_number);
+		log_buf(out, "%f", value->float_number);
 		break;
 	case JSON_TYPE_OBJECT:
-		json_print_object(value->object);
+		json_print_object(value->object, out);
 		break;
 	case JSON_TYPE_ARRAY:
-		json_print_array(value->array);
+		json_print_array(value->array, out);
 		break;
 	}
 }
diff --git a/json.h b/json.h
index 962c11c..d7017e0 100644
--- a/json.h
+++ b/json.h
@@ -1,5 +1,8 @@
 #ifndef __JSON__H
 #define __JSON__H
+
+#include "lib/output_buffer.h"
+
 struct json_object;
 struct json_array;
 struct json_pair;
@@ -76,5 +79,5 @@
 #define json_array_last_value_object(obj) \
 	(obj->values[obj->value_cnt - 1]->object)
 
-void json_print_object(struct json_object *obj);
+void json_print_object(struct json_object *obj, struct buf_output *out);
 #endif
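
The practical effect of threading struct buf_output through the JSON printer is that callers render into a buffer and flush it once, instead of logging line by line. A sketch (emit_json is a made-up helper; root stands for an already built json_object):

#include "json.h"
#include "lib/output_buffer.h"

static void emit_json(struct json_object *root)
{
	struct buf_output out;

	buf_output_init(&out);
	json_print_object(root, &out);	/* accumulates text in out.buf */
	buf_output_flush(&out);		/* hands the buffer to log_info_buf() */
	buf_output_free(&out);
}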
diff --git a/lib/axmap.c b/lib/axmap.c
index 9153df5..2ee3a25 100644
--- a/lib/axmap.c
+++ b/lib/axmap.c
@@ -129,8 +129,8 @@
 	return NULL;
 }
 
-static int axmap_handler(struct axmap *axmap, uint64_t bit_nr,
-			  int (*func)(struct axmap_level *, unsigned long, unsigned int,
+static bool axmap_handler(struct axmap *axmap, uint64_t bit_nr,
+			  bool (*func)(struct axmap_level *, unsigned long, unsigned int,
 			  void *), void *data)
 {
 	struct axmap_level *al;
@@ -144,14 +144,14 @@
 		al = &axmap->levels[i];
 
 		if (func(al, offset, bit, data))
-			return 1;
+			return true;
 	}
 
-	return 0;
+	return false;
 }
 
-static int axmap_handler_topdown(struct axmap *axmap, uint64_t bit_nr,
-	int (*func)(struct axmap_level *, unsigned long, unsigned int, void *),
+static bool axmap_handler_topdown(struct axmap *axmap, uint64_t bit_nr,
+	bool (*func)(struct axmap_level *, unsigned long, unsigned int, void *),
 	void *data)
 {
 	struct axmap_level *al;
@@ -165,20 +165,20 @@
 		al = &axmap->levels[i];
 
 		if (func(al, offset, bit, data))
-			return 1;
+			return true;
 	}
 
-	return 0;
+	return false;
 }
 
-static int axmap_clear_fn(struct axmap_level *al, unsigned long offset,
+static bool axmap_clear_fn(struct axmap_level *al, unsigned long offset,
 			   unsigned int bit, void *unused)
 {
 	if (!(al->map[offset] & (1UL << bit)))
-		return 1;
+		return true;
 
 	al->map[offset] &= ~(1UL << bit);
-	return 0;
+	return false;
 }
 
 void axmap_clear(struct axmap *axmap, uint64_t bit_nr)
@@ -213,7 +213,7 @@
 #endif
 };
 
-static int axmap_set_fn(struct axmap_level *al, unsigned long offset,
+static bool axmap_set_fn(struct axmap_level *al, unsigned long offset,
 			 unsigned int bit, void *__data)
 {
 	struct axmap_set_data *data = __data;
@@ -229,7 +229,7 @@
 	 */
 	overlap = al->map[offset] & mask;
 	if (overlap == mask)
-		return 1;
+		return true;
 
 	while (overlap) {
 		unsigned long clear_mask = ~(1UL << ffz(~overlap));
@@ -290,7 +290,8 @@
 	__axmap_set(axmap, bit_nr, &data);
 }
 
-unsigned int axmap_set_nr(struct axmap *axmap, uint64_t bit_nr, unsigned int nr_bits)
+unsigned int axmap_set_nr(struct axmap *axmap, uint64_t bit_nr,
+			  unsigned int nr_bits)
 {
 	unsigned int set_bits = 0;
 
@@ -315,18 +316,18 @@
 	return set_bits;
 }
 
-static int axmap_isset_fn(struct axmap_level *al, unsigned long offset,
-			    unsigned int bit, void *unused)
+static bool axmap_isset_fn(struct axmap_level *al, unsigned long offset,
+			   unsigned int bit, void *unused)
 {
 	return (al->map[offset] & (1UL << bit)) != 0;
 }
 
-int axmap_isset(struct axmap *axmap, uint64_t bit_nr)
+bool axmap_isset(struct axmap *axmap, uint64_t bit_nr)
 {
 	if (bit_nr <= axmap->nr_bits)
 		return axmap_handler_topdown(axmap, bit_nr, axmap_isset_fn, NULL);
 
-	return 0;
+	return false;
 }
 
 static uint64_t axmap_find_first_free(struct axmap *axmap, unsigned int level,
@@ -384,23 +385,23 @@
 	uint64_t bit;
 };
 
-static int axmap_next_free_fn(struct axmap_level *al, unsigned long offset,
+static bool axmap_next_free_fn(struct axmap_level *al, unsigned long offset,
 			       unsigned int bit, void *__data)
 {
 	struct axmap_next_free_data *data = __data;
 	uint64_t mask = ~bit_masks[(data->bit + 1) & BLOCKS_PER_UNIT_MASK];
 
 	if (!(mask & ~al->map[offset]))
-		return 0;
+		return false;
 
 	if (al->map[offset] != -1UL) {
 		data->level = al->level;
 		data->offset = offset;
-		return 1;
+		return true;
 	}
 
 	data->bit = (data->bit + BLOCKS_PER_UNIT - 1) / BLOCKS_PER_UNIT;
-	return 0;
+	return false;
 }
 
 /*
diff --git a/lib/axmap.h b/lib/axmap.h
index 3705a1d..a7a6f94 100644
--- a/lib/axmap.h
+++ b/lib/axmap.h
@@ -2,6 +2,7 @@
 #define FIO_BITMAP_H
 
 #include <inttypes.h>
+#include "types.h"
 
 struct axmap;
 struct axmap *axmap_new(unsigned long nr_bits);
@@ -10,7 +11,7 @@
 void axmap_clear(struct axmap *axmap, uint64_t bit_nr);
 void axmap_set(struct axmap *axmap, uint64_t bit_nr);
 unsigned int axmap_set_nr(struct axmap *axmap, uint64_t bit_nr, unsigned int nr_bits);
-int axmap_isset(struct axmap *axmap, uint64_t bit_nr);
+bool axmap_isset(struct axmap *axmap, uint64_t bit_nr);
 uint64_t axmap_next_free(struct axmap *axmap, uint64_t bit_nr);
 void axmap_reset(struct axmap *axmap);
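
The switch to bool above does not change how the map is driven; typical usage still looks like the sketch below (bit numbers are arbitrary, and axmap_free() is taken from the part of the header outside this hunk):

#include <assert.h>
#include "lib/axmap.h"

static void axmap_example(void)
{
	struct axmap *map = axmap_new(1024);	/* track 1024 blocks */

	axmap_set(map, 7);
	assert(axmap_isset(map, 7));		/* now returns bool */
	assert(!axmap_isset(map, 8));

	axmap_clear(map, 7);
	assert(!axmap_isset(map, 7));

	axmap_free(map);
}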
 
diff --git a/lib/bloom.c b/lib/bloom.c
index ee4ba0b..bb81dbb 100644
--- a/lib/bloom.c
+++ b/lib/bloom.c
@@ -35,7 +35,7 @@
 
 #define BLOOM_SEED	0x8989
 
-struct bloom_hash hashes[] = {
+static struct bloom_hash hashes[] = {
 	{
 		.seed = BLOOM_SEED,
 		.fn = jhash,
@@ -60,19 +60,17 @@
 
 #define N_HASHES	5
 
-#define MIN_ENTRIES	1073741824UL
-
 struct bloom *bloom_new(uint64_t entries)
 {
 	struct bloom *b;
 	size_t no_uints;
 
+	crc32c_arm64_probe();
 	crc32c_intel_probe();
 
 	b = malloc(sizeof(*b));
 	b->nentries = entries;
 	no_uints = (entries + BITS_PER_INDEX - 1) / BITS_PER_INDEX;
-	no_uints = max((unsigned long) no_uints, MIN_ENTRIES);
 	b->map = calloc(no_uints, sizeof(uint32_t));
 	if (!b->map) {
 		free(b);
@@ -88,14 +86,14 @@
 	free(b);
 }
 
-static int __bloom_check(struct bloom *b, uint32_t *data, unsigned int nwords,
-			 int set)
+static bool __bloom_check(struct bloom *b, const void *data, unsigned int len,
+			  bool set)
 {
 	uint32_t hash[N_HASHES];
 	int i, was_set;
 
 	for (i = 0; i < N_HASHES; i++) {
-		hash[i] = hashes[i].fn(data, nwords, hashes[i].seed);
+		hash[i] = hashes[i].fn(data, len, hashes[i].seed);
 		hash[i] = hash[i] % b->nentries;
 	}
 
@@ -106,14 +104,22 @@
 
 		if (b->map[index] & (1U << bit))
 			was_set++;
-		if (set)
+		else if (set)
 			b->map[index] |= 1U << bit;
+		else
+			break;
 	}
 
 	return was_set == N_HASHES;
 }
 
-int bloom_set(struct bloom *b, uint32_t *data, unsigned int nwords)
+bool bloom_set(struct bloom *b, uint32_t *data, unsigned int nwords)
 {
-	return __bloom_check(b, data, nwords, 1);
+	return __bloom_check(b, data, nwords * sizeof(uint32_t), true);
+}
+
+bool bloom_string(struct bloom *b, const char *data, unsigned int len,
+		  bool set)
+{
+	return __bloom_check(b, data, len, set);
 }
diff --git a/lib/bloom.h b/lib/bloom.h
index 127ed9b..141ead9 100644
--- a/lib/bloom.h
+++ b/lib/bloom.h
@@ -2,11 +2,13 @@
 #define FIO_BLOOM_H
 
 #include <inttypes.h>
+#include "../lib/types.h"
 
 struct bloom;
 
 struct bloom *bloom_new(uint64_t entries);
 void bloom_free(struct bloom *b);
-int bloom_set(struct bloom *b, uint32_t *data, unsigned int nwords);
+bool bloom_set(struct bloom *b, uint32_t *data, unsigned int nwords);
+bool bloom_string(struct bloom *b, const char *data, unsigned int len, bool);
 
 #endif
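
A short usage sketch of the reworked interface (values arbitrary): bloom_set() inserts a word array and reports whether all of its hash bits were already set, while bloom_string() operates on raw bytes and only inserts when its set argument is true.

#include <string.h>
#include "lib/bloom.h"

static void bloom_example(void)
{
	struct bloom *b = bloom_new(1024 * 1024);	/* filter size, in bits */
	uint32_t key[2] = { 0xdead, 0xbeef };

	bloom_set(b, key, 2);		/* false: not seen before */
	bloom_set(b, key, 2);		/* true: all hash bits already set */

	/* query only (set == false), then insert, then query again */
	bloom_string(b, "job1", strlen("job1"), false);	/* almost surely false */
	bloom_string(b, "job1", strlen("job1"), true);
	bloom_string(b, "job1", strlen("job1"), false);	/* true */

	bloom_free(b);
}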
diff --git a/lib/ffz.h b/lib/ffz.h
index eef612d..e2c1b8e 100644
--- a/lib/ffz.h
+++ b/lib/ffz.h
@@ -1,16 +1,16 @@
 #ifndef FIO_FFZ_H
 #define FIO_FFZ_H
 
-static inline int __ffs(unsigned long word)
+#include <inttypes.h>
+
+static inline int ffs64(uint64_t word)
 {
 	int r = 0;
 
-#if BITS_PER_LONG == 64
 	if ((word & 0xffffffff) == 0) {
 		r += 32;
 		word >>= 32;
 	}
-#endif
 	if (!(word & 0xffff)) {
 		word >>= 16;
 		r += 16;
@@ -35,9 +35,20 @@
 	return r;
 }
 
+#ifndef ARCH_HAVE_FFZ
+
 static inline int ffz(unsigned long bitmask)
 {
-	return __ffs(~bitmask);
+	return ffs64(~bitmask);
+}
+
+#else
+#define ffz(bitmask)	arch_ffz(bitmask)
+#endif
+
+static inline int ffz64(uint64_t bitmask)
+{
+	return ffs64(~bitmask);
 }
 
 #endif
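
The helpers above are zero-based bit-index finders: ffs64() returns the index of the lowest set bit and ffz()/ffz64() the index of the lowest clear bit, for example:

#include <assert.h>
#include "lib/ffz.h"

static void ffz_example(void)
{
	assert(ffs64(0x100) == 8);		/* lowest set bit */
	assert(ffz(0xffUL) == 8);		/* lowest clear bit */
	assert(ffz64(0xffffffffULL) == 32);
}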
diff --git a/lib/gauss.c b/lib/gauss.c
new file mode 100644
index 0000000..f974490
--- /dev/null
+++ b/lib/gauss.c
@@ -0,0 +1,64 @@
+#include <math.h>
+#include <string.h>
+#include <stdio.h>
+#include "../hash.h"
+#include "gauss.h"
+
+#define GAUSS_ITERS	12
+
+static int gauss_dev(struct gauss_state *gs)
+{
+	unsigned int r;
+	int vr;
+
+	if (!gs->stddev)
+		return 0;
+
+	r = __rand(&gs->r);
+	vr = gs->stddev * (r / (FRAND32_MAX + 1.0));
+
+	return vr - gs->stddev / 2;
+}
+
+unsigned long long gauss_next(struct gauss_state *gs)
+{
+	unsigned long long sum = 0;
+	int i;
+
+	for (i = 0; i < GAUSS_ITERS; i++)
+		sum += __rand(&gs->r) % (gs->nranges + 1);
+
+	sum = (sum + GAUSS_ITERS - 1) / GAUSS_ITERS;
+
+	if (gs->stddev) {
+		int dev = gauss_dev(gs);
+
+		while (dev + sum >= gs->nranges)
+			dev /= 2;
+		sum += dev;
+	}
+
+	if (!gs->disable_hash)
+		sum = __hash_u64(sum);
+
+	return sum % gs->nranges;
+}
+
+void gauss_init(struct gauss_state *gs, unsigned long nranges, double dev,
+		unsigned int seed)
+{
+	memset(gs, 0, sizeof(*gs));
+	init_rand_seed(&gs->r, seed, 0);
+	gs->nranges = nranges;
+
+	if (dev != 0.0) {
+		gs->stddev = ceil((double) (nranges * 100.0) / dev);
+		if (gs->stddev > nranges / 2)
+			gs->stddev = nranges / 2;
+	}
+}
+
+void gauss_disable_hash(struct gauss_state *gs)
+{
+	gs->disable_hash = true;
+}
diff --git a/lib/gauss.h b/lib/gauss.h
new file mode 100644
index 0000000..478aa14
--- /dev/null
+++ b/lib/gauss.h
@@ -0,0 +1,19 @@
+#ifndef FIO_GAUSS_H
+#define FIO_GAUSS_H
+
+#include <inttypes.h>
+#include "rand.h"
+
+struct gauss_state {
+	struct frand_state r;
+	uint64_t nranges;
+	unsigned int stddev;
+	bool disable_hash;
+};
+
+void gauss_init(struct gauss_state *gs, unsigned long nranges, double dev,
+		unsigned int seed);
+unsigned long long gauss_next(struct gauss_state *gs);
+void gauss_disable_hash(struct gauss_state *gs);
+
+#endif
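
Usage sketch for the new distribution helper (parameters arbitrary; link with -lm). By default gauss_next() hashes the drawn value, which scatters it across the range; calling gauss_disable_hash() keeps the raw values, which are biased toward the middle of [0, nranges):

#include <stdio.h>
#include "lib/gauss.h"

static void gauss_example(void)
{
	struct gauss_state gs;
	int i;

	gauss_init(&gs, 10000, 0.0, 0x1234);	/* 10000 ranges, fixed seed */
	gauss_disable_hash(&gs);		/* keep the raw, centered values */

	for (i = 0; i < 4; i++)
		printf("%llu\n", gauss_next(&gs));	/* clusters around ~5000 */
}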
diff --git a/memalign.c b/lib/memalign.c
similarity index 87%
rename from memalign.c
rename to lib/memalign.c
index cfd6e46..137cc8e 100644
--- a/memalign.c
+++ b/lib/memalign.c
@@ -4,13 +4,13 @@
 
 #include "memalign.h"
 
+#define PTR_ALIGN(ptr, mask)   \
+	(char *)((uintptr_t)((ptr) + (mask)) & ~(mask))
+
 struct align_footer {
 	unsigned int offset;
 };
 
-#define PTR_ALIGN(ptr, mask)	\
-	(char *) (((uintptr_t) ((ptr) + (mask)) & ~(mask)))
-
 void *fio_memalign(size_t alignment, size_t size)
 {
 	struct align_footer *f;
diff --git a/memalign.h b/lib/memalign.h
similarity index 100%
rename from memalign.h
rename to lib/memalign.h
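
The relocated PTR_ALIGN macro rounds a pointer up to the next (mask + 1)-byte boundary, which is what lets fio_memalign() carve an aligned block out of an over-sized allocation. A standalone check of the rounding (the macro is copied here so the snippet compiles on its own; the 4 KiB alignment is just an example):

#include <assert.h>
#include <stdint.h>
#include <stddef.h>

#define PTR_ALIGN(ptr, mask)   \
	(char *)((uintptr_t)((ptr) + (mask)) & ~(mask))

static void ptr_align_example(void)
{
	char buf[2 * 4096];
	size_t mask = 4096 - 1;			/* alignment - 1 */
	char *p = PTR_ALIGN(buf, mask);

	assert(((uintptr_t) p & mask) == 0);	/* 4096-byte aligned */
	assert(p >= buf && p < buf + 4096);	/* within the slack area */
}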
diff --git a/lib/mountcheck.c b/lib/mountcheck.c
new file mode 100644
index 0000000..2fb6fe7
--- /dev/null
+++ b/lib/mountcheck.c
@@ -0,0 +1,85 @@
+#include <stdio.h>
+#include <string.h>
+
+#ifdef CONFIG_GETMNTENT
+#include <mntent.h>
+
+#include "mountcheck.h"
+
+#define MTAB	"/etc/mtab"
+
+int device_is_mounted(const char *dev)
+{
+	FILE *mtab;
+	struct mntent *mnt;
+	int ret = 0;
+
+	mtab = setmntent(MTAB, "r");
+	if (!mtab)
+		return 0;
+
+	while ((mnt = getmntent(mtab)) != NULL) {
+		if (!mnt->mnt_fsname)
+			continue;
+		if (!strcmp(mnt->mnt_fsname, dev)) {
+			ret = 1;
+			break;
+		}
+	}
+
+	endmntent(mtab);
+	return ret;
+}
+
+#elif defined(CONFIG_GETMNTINFO)
+/* for most BSDs */
+#include <sys/param.h>
+#include <sys/mount.h>
+
+int device_is_mounted(const char *dev)
+{
+	struct statfs *st;
+	int i, ret;
+
+	ret = getmntinfo(&st, MNT_NOWAIT);
+	if (ret <= 0)
+		return 0;
+
+	for (i = 0; i < ret; i++) {
+		if (!strcmp(st[i].f_mntfromname, dev))
+			return 1;
+	}
+
+	return 0;
+}
+
+#elif defined(CONFIG_GETMNTINFO_STATVFS)
+/* for NetBSD */
+#include <sys/statvfs.h>
+
+int device_is_mounted(const char *dev)
+{
+	struct statvfs *st;
+	int i, ret;
+
+	ret = getmntinfo(&st, MNT_NOWAIT);
+	if (ret <= 0)
+		return 0;
+
+	for (i = 0; i < ret; i++) {
+		if (!strcmp(st[i].f_mntfromname, dev))
+			return 1;
+	}
+
+	return 0;
+}
+
+#else
+/* others */
+
+int device_is_mounted(const char *dev)
+{
+	return 0;
+}
+
+#endif
diff --git a/lib/mountcheck.h b/lib/mountcheck.h
new file mode 100644
index 0000000..14ec45a
--- /dev/null
+++ b/lib/mountcheck.h
@@ -0,0 +1,6 @@
+#ifndef FIO_MOUNT_CHECK_H
+#define FIO_MOUNT_CHECK_H
+
+extern int device_is_mounted(const char *);
+
+#endif
diff --git a/lib/num2str.c b/lib/num2str.c
index 0ed05f3..8d08841 100644
--- a/lib/num2str.c
+++ b/lib/num2str.c
@@ -2,40 +2,71 @@
 #include <stdio.h>
 #include <string.h>
 
-#include "../fio.h"
+#include "../compiler/compiler.h"
+#include "num2str.h"
 
-#define ARRAY_LENGTH(arr)	sizeof(arr) / sizeof((arr)[0])
+#define ARRAY_SIZE(x)    (sizeof((x)) / (sizeof((x)[0])))
 
-/*
- * Cheesy number->string conversion, complete with carry rounding error.
+/**
+ * num2str() - Cheesy number->string conversion, complete with carry rounding error.
+ * @num: quantity (e.g., number of blocks, bytes or bits)
+ * @maxlen: max number of digits in the output string (not counting prefix and units, but counting .)
+ * @base: multiplier for num (e.g., if num represents Ki, use 1024)
+ * @pow2: select unit prefix - 0=power-of-10 decimal SI, nonzero=power-of-2 binary IEC
+ * @units: select units - N2S_* macros defined in num2str.h
+ * @returns a malloc'd buffer containing "number[<unit prefix>][<units>]"
  */
-char *num2str(uint64_t num, int maxlen, int base, int pow2, int unit_base)
+char *num2str(uint64_t num, int maxlen, int base, int pow2, int units)
 {
-	const char *postfix[] = { "", "K", "M", "G", "P", "E" };
-	const char *byte_postfix[] = { "", "B", "bit" };
+	const char *sistr[] = { "", "k", "M", "G", "T", "P" };
+	const char *iecstr[] = { "", "Ki", "Mi", "Gi", "Ti", "Pi" };
+	const char **unitprefix;
+	const char *unitstr[] = { "", "/s", "B", "bit", "B/s", "bit/s" };
 	const unsigned int thousand[] = { 1000, 1024 };
-	unsigned int modulo, decimals;
-	int byte_post_index = 0, post_index, carry = 0;
-	char tmp[32];
+	unsigned int modulo;
+	int unit_index = 0, post_index, carry = 0;
+	char tmp[32], fmt[32];
 	char *buf;
 
+	compiletime_assert(sizeof(sistr) == sizeof(iecstr), "unit prefix arrays must be identical sizes");
+
 	buf = malloc(128);
+	if (!buf)
+		return NULL;
+
+	if (pow2)
+		unitprefix = iecstr;
+	else
+		unitprefix = sistr;
 
 	for (post_index = 0; base > 1; post_index++)
 		base /= thousand[!!pow2];
 
-	switch (unit_base) {
-	case 1:
-		byte_post_index = 2;
+	switch (units) {
+	case N2S_PERSEC:
+		unit_index = 1;
+		break;
+	case N2S_BYTE:
+		unit_index = 2;
+		break;
+	case N2S_BIT:
+		unit_index = 3;
 		num *= 8;
 		break;
-	case 8:
-		byte_post_index = 1;
+	case N2S_BYTEPERSEC:
+		unit_index = 4;
+		break;
+	case N2S_BITPERSEC:
+		unit_index = 5;
+		num *= 8;
 		break;
 	}
 
+	/*
+	 * Divide by K/Ki until string length of num <= maxlen.
+	 */
 	modulo = -1U;
-	while (post_index < sizeof(postfix)) {
+	while (post_index < sizeof(sistr)) {
 		sprintf(tmp, "%llu", (unsigned long long) num);
 		if (strlen(tmp) <= maxlen)
 			break;
@@ -46,33 +77,38 @@
 		post_index++;
 	}
 
+	/*
+	 * If no modulo, then we're done.
+	 */
 	if (modulo == -1U) {
 done:
-		if (post_index >= ARRAY_LENGTH(postfix))
+		if (post_index >= ARRAY_SIZE(sistr))
 			post_index = 0;
 
 		sprintf(buf, "%llu%s%s", (unsigned long long) num,
-			postfix[post_index], byte_postfix[byte_post_index]);
+			unitprefix[post_index], unitstr[unit_index]);
 		return buf;
 	}
 
+	/*
+	 * If no room for decimals, then we're done.
+	 */
 	sprintf(tmp, "%llu", (unsigned long long) num);
-	decimals = maxlen - strlen(tmp);
-	if (decimals <= 1) {
+	if ((int)(maxlen - strlen(tmp)) <= 1) {
 		if (carry)
 			num++;
 		goto done;
 	}
 
-	do {
-		sprintf(tmp, "%u", modulo);
-		if (strlen(tmp) <= decimals - 1)
-			break;
+	/*
+	 * Fill in everything and return the result.
+	 */
+	assert(maxlen - strlen(tmp) - 1 > 0);
+	assert(modulo < thousand[!!pow2]);
+	sprintf(fmt, "%%.%df", (int)(maxlen - strlen(tmp) - 1));
+	sprintf(tmp, fmt, (double)modulo / (double)thousand[!!pow2]);
 
-		modulo = (modulo + 9) / 10;
-	} while (1);
-
-	sprintf(buf, "%llu.%u%s%s", (unsigned long long) num, modulo,
-			postfix[post_index], byte_postfix[byte_post_index]);
+	sprintf(buf, "%llu.%s%s%s", (unsigned long long) num, &tmp[2],
+			unitprefix[post_index], unitstr[unit_index]);
 	return buf;
 }
diff --git a/lib/num2str.h b/lib/num2str.h
new file mode 100644
index 0000000..81358a1
--- /dev/null
+++ b/lib/num2str.h
@@ -0,0 +1,15 @@
+#ifndef FIO_NUM2STR_H
+#define FIO_NUM2STR_H
+
+#include <inttypes.h>
+
+#define N2S_NONE	0
+#define N2S_BITPERSEC	1	/* match unit_base for bit rates */
+#define N2S_PERSEC	2
+#define N2S_BIT		3
+#define N2S_BYTE	4
+#define N2S_BYTEPERSEC	8	/* match unit_base for byte rates */
+
+extern char *num2str(uint64_t, int, int, int, int);
+
+#endif
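
A quick usage sketch of the reworked conversion (values chosen for illustration; the returned string is heap-allocated and must be freed by the caller):

#include <stdio.h>
#include <stdlib.h>
#include "lib/num2str.h"

static void num2str_example(void)
{
	/* 10 GiB/s expressed in bytes/sec, 4 significant digits, base 1;
	 * pow2 = 1 selects the binary (IEC) prefixes */
	char *s = num2str(10737418240ULL, 4, 1, 1, N2S_BYTEPERSEC);

	if (s) {
		printf("%s\n", s);	/* e.g. "10.0GiB/s" */
		free(s);
	}
}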
diff --git a/lib/output_buffer.c b/lib/output_buffer.c
new file mode 100644
index 0000000..c1fdfc9
--- /dev/null
+++ b/lib/output_buffer.c
@@ -0,0 +1,55 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "output_buffer.h"
+#include "../log.h"
+#include "../minmax.h"
+
+#define BUF_INC	1024
+
+void buf_output_init(struct buf_output *out)
+{
+	out->max_buflen = 0;
+	out->buflen = 0;
+	out->buf = NULL;
+}
+
+void buf_output_free(struct buf_output *out)
+{
+	free(out->buf);
+}
+
+size_t buf_output_add(struct buf_output *out, const char *buf, size_t len)
+{
+	if (out->max_buflen - out->buflen < len) {
+		size_t need = len - (out->max_buflen - out->buflen);
+		size_t old_max = out->max_buflen;
+
+		need = max((size_t) BUF_INC, need);
+		out->max_buflen += need;
+		out->buf = realloc(out->buf, out->max_buflen);
+
+		old_max = max(old_max, out->buflen + len);
+		if (old_max + need > out->max_buflen)
+			need = out->max_buflen - old_max;
+		memset(&out->buf[old_max], 0, need);
+	}
+
+	memcpy(&out->buf[out->buflen], buf, len);
+	out->buflen += len;
+	return len;
+}
+
+size_t buf_output_flush(struct buf_output *out)
+{
+	size_t ret = 0;
+
+	if (out->buflen) {
+		ret = log_info_buf(out->buf, out->buflen);
+		memset(out->buf, 0, out->max_buflen);
+		out->buflen = 0;
+	}
+
+	return ret;
+}
diff --git a/lib/output_buffer.h b/lib/output_buffer.h
new file mode 100644
index 0000000..396002f
--- /dev/null
+++ b/lib/output_buffer.h
@@ -0,0 +1,17 @@
+#ifndef FIO_OUTPUT_BUFFER_H
+#define FIO_OUTPUT_BUFFER_H
+
+#include <unistd.h>
+
+struct buf_output {
+	char *buf;
+	size_t buflen;
+	size_t max_buflen;
+};
+
+void buf_output_init(struct buf_output *out);
+void buf_output_free(struct buf_output *out);
+size_t buf_output_add(struct buf_output *out, const char *buf, size_t len);
+size_t buf_output_flush(struct buf_output *out);
+
+#endif
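
The buffer grows on demand: buf_output_add() reallocs in BUF_INC steps, and buf_output_flush() pushes the accumulated bytes through log_info_buf() and resets the length. A minimal sketch:

#include <string.h>
#include "lib/output_buffer.h"

static void buf_output_example(void)
{
	struct buf_output out;

	buf_output_init(&out);
	buf_output_add(&out, "hello ", strlen("hello "));
	buf_output_add(&out, "world\n", strlen("world\n"));

	/* out.buflen is now 12; flushing emits it and resets buflen to 0 */
	buf_output_flush(&out);
	buf_output_free(&out);
}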
diff --git a/lib/pattern.c b/lib/pattern.c
new file mode 100644
index 0000000..0aeb935
--- /dev/null
+++ b/lib/pattern.c
@@ -0,0 +1,472 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "strntol.h"
+#include "pattern.h"
+#include "../minmax.h"
+#include "../oslib/strcasestr.h"
+
+/**
+ * parse_string() - parses string in double quotes, like "abc"
+ * @beg - string input
+ * @out - output buffer where the parsed string should be put
+ * @out_len - length of the output buffer
+ * @filled - pointer where number of bytes successfully
+ *           parsed will be put
+ *
+ * Returns the end pointer where parsing has been stopped.
+ * In case of parsing error or lack of bytes in output buffer
+ * NULL will be returned.
+ */
+static const char *parse_string(const char *beg, char *out,
+				unsigned int out_len,
+				unsigned int *filled)
+{
+	const char *end;
+
+	if (!out_len)
+		return NULL;
+
+	assert(*beg == '"');
+	beg++;
+	end = strchr(beg, '"');
+	if (!end)
+		return NULL;
+	if (end - beg > out_len)
+		return NULL;
+
+	memcpy(out, beg, end - beg);
+	*filled = end - beg;
+
+	/* Skip the closing quote */
+	return end + 1;
+}
+
+/**
+ * parse_number() - parses numbers
+ * @beg - string input
+ * @out - output buffer where parsed number should be put
+ * @out_len - length of the output buffer
+ * @filled - pointer where number of bytes successfully
+ *           parsed will be put
+ *
+ * Supports decimals in the range [INT_MIN, INT_MAX] and
+ * hexadecimals of any size, which must start with the
+ * prefix 0x or 0X.
+ *
+ * Returns the end pointer where parsing has been stopped.
+ * In case of parsing error or lack of bytes in output buffer
+ * NULL will be returned.
+ */
+static const char *parse_number(const char *beg, char *out,
+				unsigned int out_len,
+				unsigned int *filled)
+{
+	const char *end;
+	unsigned int val;
+	long lval;
+	int num, i;
+
+	if (!out_len)
+		return NULL;
+
+	num = 0;
+	sscanf(beg, "0%*[xX]%*[0-9a-fA-F]%n", &num);
+	if (num == 0) {
+		/* Here we are trying to parse decimal */
+
+		char *_end;
+
+		/* Looking ahead */
+		_end = strcasestr(beg, "0x");
+		if (_end)
+			num = _end - beg;
+		if (num)
+			lval = strntol(beg, num, &_end, 10);
+		else
+			lval = strtol(beg, &_end, 10);
+		if (beg == _end || lval > INT_MAX || lval < INT_MIN)
+			return NULL;
+		end = _end;
+		i = 0;
+		if (!lval) {
+			num    = 0;
+			out[i] = 0x00;
+			i      = 1;
+		} else {
+			val = (unsigned int)lval;
+			for (; val && out_len; out_len--, i++, val >>= 8)
+				out[i] = val & 0xff;
+			if (val)
+				return NULL;
+		}
+	} else {
+		assert(num > 2);
+
+		/* Skip the 0x prefix */
+		num -= 2;
+		beg += 2;
+
+		/* Look back, handle this combined string: 0xff0x14 */
+		if (beg[num] && !strncasecmp(&beg[num - 1], "0x", 2))
+			num--;
+
+		end  = beg + num;
+
+		for (i = 0; num && out_len;
+		     out_len--, i++, num -= 2, beg += 2) {
+			const char *fmt;
+
+			fmt = (num & 1 ? "%1hhx" : "%2hhx");
+			sscanf(beg, fmt, &out[i]);
+			if (num & 1) {
+				num++;
+				beg--;
+			}
+		}
+		if (num)
+			return NULL;
+	}
+
+	*filled = i;
+	return end;
+
+}
+
+/**
+ * parse_format() - parses formats, like %o, etc
+ * @in - string input
+ * @out - output buffer where space for format should be reserved
+ * @parsed - number of bytes which were already parsed so far
+ * @out_len - length of the output buffer
+ * @fmt_desc - format descriptor array, what we expect to find
+ * @fmt_desc_sz - size of the format descriptor array
+ * @fmt - format array, the output
+ * @fmt_sz - size of format array
+ *
+ * This function tries to find formats, e.g.:
+ *   %o - offset of the block
+ *
+ * In case of successful parsing it fills the format param
+ * with the proper offset and the size of the expected value, which
+ * should be pasted into the buffer using the format 'func' callback.
+ *
+ * Returns the end pointer where parsing has been stopped.
+ * In case of parsing error or lack of bytes in output buffer
+ * NULL will be returned.
+ */
+static const char *parse_format(const char *in, char *out, unsigned int parsed,
+				unsigned int out_len, unsigned int *filled,
+				const struct pattern_fmt_desc *fmt_desc,
+				unsigned int fmt_desc_sz,
+				struct pattern_fmt *fmt, unsigned int fmt_sz)
+{
+	int i;
+	struct pattern_fmt *f = NULL;
+	unsigned int len = 0;
+
+	if (!out_len || !fmt_desc || !fmt_desc_sz || !fmt || !fmt_sz)
+		return NULL;
+
+	assert(*in == '%');
+
+	for (i = 0; i < fmt_desc_sz; i++) {
+		const struct pattern_fmt_desc *desc;
+
+		desc = &fmt_desc[i];
+		len  = strlen(desc->fmt);
+		if (0 == strncmp(in, desc->fmt, len)) {
+			fmt->desc = desc;
+			fmt->off  = parsed;
+			f = fmt;
+			break;
+		}
+	}
+
+	if (!f)
+		return NULL;
+	if (f->desc->len > out_len)
+		return NULL;
+
+	memset(out, '\0', f->desc->len);
+	*filled = f->desc->len;
+
+	return in + len;
+}
+
+/**
+ * parse_and_fill_pattern() - Parses combined input, which consists of strings,
+ *                            numbers and pattern formats.
+ * @in - string input
+ * @in_len - size of the input string
+ * @out - output buffer where parsed result will be put
+ * @out_len - lengths of the output buffer
+ * @fmt_desc - array of pattern format descriptors [input]
+ * @fmt_desc_sz - size of the format descriptor array
+ * @fmt - array of pattern formats [output]
+ * @fmt_sz - pointer where the size of the pattern formats array is stored [input];
+ *           after successful parsing this pointer will contain the number
+ *           of parsed formats, if any [output].
+ *
+ * strings:
+ *   byte sequence in double quotes, e.g. "123".
+ *   NOTE: there is no way to escape a quote, so "123\"abc" does not work.
+ *
+ * numbers:
+ *   hexadecimal - sequence of hex bytes starting with a 0x or 0X prefix,
+ *                 e.g. 0xff12ceff1100ff
+ *   decimal     - decimal number in range [INT_MIN, INT_MAX]
+ *
+ * formats:
+ *   %o - offset of block, reserved 8 bytes.
+ *
+ * Explicit examples of combined string:
+ * #1                  #2                 #3        #4
+ *    in="abcd"          in=-1024           in=66     in=0xFF0X1
+ *   out=61 62 63 64    out=00 fc ff ff    out=42    out=ff 01
+ *
+ * #5                                #6
+ *    in=%o                            in="123"0xFFeeCC
+ *   out=00 00 00 00 00 00 00 00      out=31 32 33 ff ec cc
+ *
+ * #7
+ *   in=-100xab"1"%o"2"
+ *  out=f6 ff ff ff ab 31 00 00 00 00 00 00 00 00 32
+ *
+ * #8
+ *    in=%o0xdeadbeef%o
+ *   out=00 00 00 00 00 00 00 00 de ad be ef 00 00 00 00 00 00 00 00
+ *
+ * #9
+ *    in=0xfefefefefefefefefefefefefefefefefefefefefe
+ *   out=fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe fe
+ *
+ * Returns number of bytes filled or err < 0 in case of failure.
+ */
+int parse_and_fill_pattern(const char *in, unsigned int in_len,
+			   char *out, unsigned int out_len,
+			   const struct pattern_fmt_desc *fmt_desc,
+			   unsigned int fmt_desc_sz,
+			   struct pattern_fmt *fmt,
+			   unsigned int *fmt_sz_out)
+{
+	const char *beg, *end, *out_beg = out;
+	unsigned int total = 0, fmt_rem = 0;
+
+	if (!in || !in_len || !out || !out_len)
+		return -EINVAL;
+	if (fmt_sz_out)
+		fmt_rem = *fmt_sz_out;
+
+	beg = in;
+	do {
+		unsigned int filled;
+		int parsed_fmt;
+
+		filled     = 0;
+		parsed_fmt = 0;
+
+		switch (*beg) {
+		case '"':
+			end = parse_string(beg, out, out_len, &filled);
+			break;
+		case '%':
+			end = parse_format(beg, out, out - out_beg, out_len,
+					   &filled, fmt_desc, fmt_desc_sz,
+					   fmt, fmt_rem);
+			parsed_fmt = 1;
+			break;
+		default:
+			end = parse_number(beg, out, out_len, &filled);
+			break;
+		}
+
+		if (!end)
+			return -EINVAL;
+
+		if (parsed_fmt) {
+			assert(fmt_rem);
+			fmt_rem--;
+			fmt++;
+		}
+
+		assert(end - beg <= in_len);
+		in_len -= end - beg;
+		beg     = end;
+
+		assert(filled);
+		assert(filled <= out_len);
+		out_len -= filled;
+		out     += filled;
+		total   += filled;
+
+	} while (in_len);
+
+	if (fmt_sz_out)
+		*fmt_sz_out -= fmt_rem;
+	return total;
+}
+
+/**
+ * dup_pattern() - Duplicates part of the pattern all over the buffer.
+ *
+ * Returns 0 in case of success or errno < 0 in case of failure.
+ */
+static int dup_pattern(char *out, unsigned int out_len, unsigned int pattern_len)
+{
+	unsigned int left, len, off;
+
+	if (out_len <= pattern_len)
+		/* Normal case */
+		return 0;
+
+	off  = pattern_len;
+	left = (out_len - off);
+	len  = min(left, off);
+
+	/* Duplicate leftover */
+	while (left) {
+		memcpy(out + off, out, len);
+		left -= len;
+		off <<= 1;
+		len   = min(left, off);
+	}
+
+	return 0;
+}
+
+/**
+ * cpy_pattern() - Copies pattern to the buffer.
+ *
+ * Copies the pattern repeatedly across the whole buffer.
+ *
+ * Returns 0 in case of success or errno < 0 in case of failure.
+ */
+int cpy_pattern(const char *pattern, unsigned int pattern_len,
+		char *out, unsigned int out_len)
+{
+	unsigned int len;
+
+	if (!pattern || !pattern_len || !out || !out_len)
+		return -EINVAL;
+
+	/* Copy pattern */
+	len = min(pattern_len, out_len);
+	memcpy(out, pattern, len);
+
+	/* Spread filled chunk all over the buffer */
+	return dup_pattern(out, out_len, pattern_len);
+}
+
+/**
+ * cmp_pattern() - Compares pattern and buffer.
+ *
+ * For the sake of performance this function avoids explicit loops.
+ * First it compares the buffer against itself, checking that the
+ * buffer consists of the pattern repeated over its whole size.
+ *
+ * If no difference is found, the function then compares the buffer
+ * against the pattern itself.
+ *
+ * Returns 0 in case of success or errno < 0 in case of failure.
+ */
+int cmp_pattern(const char *pattern, unsigned int pattern_size,
+		unsigned int off, const char *buf, unsigned int len)
+{
+	int rc;
+	unsigned int size;
+
+	/* Find the difference in buffer */
+	if (len > pattern_size) {
+		rc = memcmp(buf, buf + pattern_size, len - pattern_size);
+		if (rc)
+			return -EILSEQ;
+	}
+	/* Compare second part of the pattern with buffer */
+	if (off) {
+		size = min(len, pattern_size - off);
+		rc = memcmp(buf, pattern + off, size);
+		if (rc)
+			return -EILSEQ;
+		buf += size;
+		len -= size;
+	}
+	/* Compare first part of the pattern or the whole pattern
+	 * with buffer */
+	if (len) {
+		size = min(len, (off ? off : pattern_size));
+		rc = memcmp(buf, pattern, size);
+		if (rc)
+			return -EILSEQ;
+	}
+
+	return 0;
+}
+
+/**
+ * paste_format_inplace() - Pastes parsed formats to the pattern.
+ *
+ * This function pastes the formats into the pattern. If @fmt_sz is 0
+ * the function does nothing and the pattern buffer is left untouched.
+ *
+ * Returns 0 in case of success or errno < 0 in case of failure.
+ */
+int paste_format_inplace(char *pattern, unsigned int pattern_len,
+			 struct pattern_fmt *fmt, unsigned int fmt_sz,
+			 void *priv)
+{
+	int i, rc;
+	unsigned int len;
+
+	if (!pattern || !pattern_len || !fmt)
+		return -EINVAL;
+
+	/* Paste formats for first pattern chunk */
+	for (i = 0; i < fmt_sz; i++) {
+		struct pattern_fmt *f;
+
+		f = &fmt[i];
+		if (pattern_len <= f->off)
+			break;
+		len = min(pattern_len - f->off, f->desc->len);
+		rc  = f->desc->paste(pattern + f->off, len, priv);
+		if (rc)
+			return rc;
+	}
+
+	return 0;
+}
+
+/**
+ * paste_format() - Pastes parsed formats to the buffer.
+ *
+ * This function copies the pattern to the buffer, pastes the formats
+ * into it and then duplicates the pattern across the whole buffer.
+ *
+ * Returns 0 in case of success or errno < 0 in case of failure.
+ */
+int paste_format(const char *pattern, unsigned int pattern_len,
+		 struct pattern_fmt *fmt, unsigned int fmt_sz,
+		 char *out, unsigned int out_len, void *priv)
+{
+	int rc;
+	unsigned int len;
+
+	if (!pattern || !pattern_len || !out || !out_len)
+		return -EINVAL;
+
+	/* Copy pattern */
+	len = min(pattern_len, out_len);
+	memcpy(out, pattern, len);
+
+	rc = paste_format_inplace(out, len, fmt, fmt_sz, priv);
+	if (rc)
+		return rc;
+
+	/* Spread filled chunk all over the buffer */
+	return dup_pattern(out, out_len, pattern_len);
+}
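
A minimal sketch of how a caller might drive the new pattern API, assuming fio's lib/pattern.h is on the include path; the input string, the buffer sizes and the helper name pattern_demo() are illustrative only:

#include <string.h>
#include "lib/pattern.h"

int pattern_demo(void)
{
	const char *in = "\"abc\"0xff";	/* "abc" plus the hex byte 0xff */
	char pattern[32], buf[128];
	int len;

	/* no %o descriptors are passed, so fmt_desc/fmt may be NULL */
	len = parse_and_fill_pattern(in, strlen(in), pattern, sizeof(pattern),
				     NULL, 0, NULL, NULL);
	if (len < 0)
		return len;

	/* replicate the 4-byte pattern across the whole 128-byte buffer */
	return cpy_pattern(pattern, len, buf, sizeof(buf));
}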
diff --git a/lib/pattern.h b/lib/pattern.h
new file mode 100644
index 0000000..9f937f0
--- /dev/null
+++ b/lib/pattern.h
@@ -0,0 +1,47 @@
+#ifndef FIO_PARSE_PATTERN_H
+#define FIO_PARSE_PATTERN_H
+
+struct pattern_fmt;
+
+/**
+ * Pattern format description. The input for 'parse_and_fill_pattern'.
+ * Describes a format by its name and the callback that should be
+ * called to paste something into the buffer.
+ */
+struct pattern_fmt_desc {
+	const char  *fmt;
+	unsigned int len;
+	int (*paste)(char *buf, unsigned int len, void *priv);
+};
+
+/**
+ * Pattern format. The output of 'parse_and_fill_pattern'.
+ * Describes the exact position inside the buffer.
+ */
+struct pattern_fmt {
+	unsigned int off;
+	const struct pattern_fmt_desc *desc;
+};
+
+int parse_and_fill_pattern(const char *in, unsigned int in_len,
+			   char *out, unsigned int out_len,
+			   const struct pattern_fmt_desc *fmt_desc,
+			   unsigned int fmt_desc_sz,
+			   struct pattern_fmt *fmt,
+			   unsigned int *fmt_sz_out);
+
+int paste_format_inplace(char *pattern, unsigned int pattern_len,
+			 struct pattern_fmt *fmt, unsigned int fmt_sz,
+			 void *priv);
+
+int paste_format(const char *pattern, unsigned int pattern_len,
+		 struct pattern_fmt *fmt, unsigned int fmt_sz,
+		 char *out, unsigned int out_len, void *priv);
+
+int cpy_pattern(const char *pattern, unsigned int pattern_len,
+		char *out, unsigned int out_len);
+
+int cmp_pattern(const char *pattern, unsigned int pattern_size,
+		unsigned int off, const char *buf, unsigned int len);
+
+#endif
diff --git a/lib/pow2.h b/lib/pow2.h
new file mode 100644
index 0000000..2cbca1a
--- /dev/null
+++ b/lib/pow2.h
@@ -0,0 +1,12 @@
+#ifndef FIO_POW2_H
+#define FIO_POW2_H
+
+#include <inttypes.h>
+#include "types.h"
+
+static inline bool is_power_of_2(uint64_t val)
+{
+	return (val != 0 && ((val & (val - 1)) == 0));
+}
+
+#endif
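
As a quick illustration of the bit trick above (clearing the lowest set bit must leave zero for a power of two), a throwaway check along these lines would hold; the function name is hypothetical:

#include <assert.h>
#include "lib/pow2.h"

void pow2_demo(void)
{
	assert(is_power_of_2(8));	/* 0b1000 & 0b0111 == 0 */
	assert(!is_power_of_2(12));	/* 0b1100 & 0b1011 == 0b1000 */
	assert(!is_power_of_2(0));	/* zero is explicitly excluded */
}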
diff --git a/lib/prio_tree.c b/lib/prio_tree.c
index e18ae32..de3fe1c 100644
--- a/lib/prio_tree.c
+++ b/lib/prio_tree.c
@@ -13,9 +13,12 @@
 
 #include <stdlib.h>
 #include <limits.h>
-#include "../fio.h"
+
+#include "../compiler/compiler.h"
 #include "prio_tree.h"
 
+#define ARRAY_SIZE(x)    (sizeof((x)) / (sizeof((x)[0])))
+
 /*
  * A clever mix of heap and radix trees forms a radix priority search tree (PST)
  * which is useful for storing intervals, e.g, we can consider a vma as a closed
diff --git a/lib/rand.c b/lib/rand.c
index 185b679..3f60a67 100644
--- a/lib/rand.c
+++ b/lib/rand.c
@@ -36,14 +36,17 @@
 #include <string.h>
 #include <assert.h>
 #include "rand.h"
+#include "pattern.h"
 #include "../hash.h"
 
-static inline int __seed(unsigned int x, unsigned int m)
+int arch_random;
+
+static inline uint64_t __seed(uint64_t x, uint64_t m)
 {
 	return (x < m) ? x + m : x;
 }
 
-static void __init_rand(struct frand_state *state, unsigned int seed)
+static void __init_rand32(struct taus88_state *state, unsigned int seed)
 {
 	int cranks = 6;
 
@@ -54,17 +57,43 @@
 	state->s3 = __seed(LCG(state->s2, seed), 15);
 
 	while (cranks--)
-		__rand(state);
+		__rand32(state);
 }
 
-void init_rand(struct frand_state *state)
+static void __init_rand64(struct taus258_state *state, uint64_t seed)
 {
-	__init_rand(state, 1);
+	int cranks = 6;
+
+#define LCG64(x, seed)  ((x) * 6906969069ULL ^ (seed))
+
+	state->s1 = __seed(LCG64((2^31) + (2^17) + (2^7), seed), 1);
+	state->s2 = __seed(LCG64(state->s1, seed), 7);
+	state->s3 = __seed(LCG64(state->s2, seed), 15);
+	state->s4 = __seed(LCG64(state->s3, seed), 33);
+	state->s5 = __seed(LCG64(state->s4, seed), 49);
+
+	while (cranks--)
+		__rand64(state);
 }
 
-void init_rand_seed(struct frand_state *state, unsigned int seed)
+void init_rand(struct frand_state *state, bool use64)
 {
-	__init_rand(state, seed);
+	state->use64 = use64;
+
+	if (!use64)
+		__init_rand32(&state->state32, 1);
+	else
+		__init_rand64(&state->state64, 1);
+}
+
+void init_rand_seed(struct frand_state *state, unsigned int seed, bool use64)
+{
+	state->use64 = use64;
+
+	if (!use64)
+		__init_rand32(&state->state32, seed);
+	else
+		__init_rand64(&state->state64, seed);
 }
 
 void __fill_random_buf(void *buf, unsigned int len, unsigned long seed)
@@ -106,32 +135,6 @@
 	return r;
 }
 
-void fill_pattern(void *p, unsigned int len, char *pattern,
-		  unsigned int pattern_bytes)
-{
-	switch (pattern_bytes) {
-	case 0:
-		assert(0);
-		break;
-	case 1:
-		memset(p, pattern[0], len);
-		break;
-	default: {
-		unsigned int i = 0, size = 0;
-		unsigned char *b = p;
-
-		while (i < len) {
-			size = pattern_bytes;
-			if (size > (len - i))
-				size = len - i;
-			memcpy(b+i, pattern, size);
-			i += size;
-		}
-		break;
-		}
-	}
-}
-
 void __fill_random_buf_percentage(unsigned long seed, void *buf,
 				  unsigned int percentage,
 				  unsigned int segment, unsigned int len,
@@ -141,7 +144,7 @@
 
 	if (percentage == 100) {
 		if (pbytes)
-			fill_pattern(buf, len, pattern, pbytes);
+			(void)cpy_pattern(pattern, pbytes, buf, len);
 		else
 			memset(buf, 0, len);
 		return;
@@ -171,7 +174,7 @@
 			this_len = len;
 
 		if (pbytes)
-			fill_pattern(buf, this_len, pattern, pbytes);
+			(void)cpy_pattern(pattern, pbytes, buf, this_len);
 		else
 			memset(buf, 0, this_len);
 
diff --git a/lib/rand.h b/lib/rand.h
index 089837d..bff4a35 100644
--- a/lib/rand.h
+++ b/lib/rand.h
@@ -1,21 +1,67 @@
 #ifndef FIO_RAND_H
 #define FIO_RAND_H
 
-#define FRAND_MAX	(-1U)
+#include <inttypes.h>
+#include <assert.h>
+#include "types.h"
+#include "../arch/arch.h"
 
-struct frand_state {
+#define FRAND32_MAX	(-1U)
+#define FRAND64_MAX	(-1ULL)
+
+struct taus88_state {
 	unsigned int s1, s2, s3;
 };
 
-static inline void frand_copy(struct frand_state *dst,
-			      struct frand_state *src)
+struct taus258_state {
+	uint64_t s1, s2, s3, s4, s5;
+};
+
+struct frand_state {
+	unsigned int use64;
+	union {
+		struct taus88_state state32;
+		struct taus258_state state64;
+	};
+};
+
+static inline uint64_t rand_max(struct frand_state *state)
+{
+	if (state->use64)
+		return FRAND64_MAX;
+	else
+		return FRAND32_MAX;
+}
+
+static inline void __frand32_copy(struct taus88_state *dst,
+				  struct taus88_state *src)
 {
 	dst->s1 = src->s1;
 	dst->s2 = src->s2;
 	dst->s3 = src->s3;
 }
 
-static inline unsigned int __rand(struct frand_state *state)
+static inline void __frand64_copy(struct taus258_state *dst,
+				  struct taus258_state *src)
+{
+	dst->s1 = src->s1;
+	dst->s2 = src->s2;
+	dst->s3 = src->s3;
+	dst->s4 = src->s4;
+	dst->s5 = src->s5;
+}
+
+static inline void frand_copy(struct frand_state *dst, struct frand_state *src)
+{
+	if (src->use64)
+		__frand64_copy(&dst->state64, &src->state64);
+	else
+		__frand32_copy(&dst->state32, &src->state32);
+
+	dst->use64 = src->use64;
+}
+
+static inline unsigned int __rand32(struct taus88_state *state)
 {
 #define TAUSWORTHE(s,a,b,c,d) ((s&c)<<d) ^ (((s <<a) ^ s)>>b)
 
@@ -26,12 +72,67 @@
 	return (state->s1 ^ state->s2 ^ state->s3);
 }
 
-extern void init_rand(struct frand_state *);
-extern void init_rand_seed(struct frand_state *, unsigned int seed);
+static inline uint64_t __rand64(struct taus258_state *state)
+{
+	uint64_t xval;
+
+	xval = ((state->s1 <<  1) ^ state->s1) >> 53;
+	state->s1 = ((state->s1 & 18446744073709551614ULL) << 10) ^ xval;
+
+	xval = ((state->s2 << 24) ^ state->s2) >> 50;
+	state->s2 = ((state->s2 & 18446744073709551104ULL) <<  5) ^ xval;
+
+	xval = ((state->s3 <<  3) ^ state->s3) >> 23;
+	state->s3 = ((state->s3 & 18446744073709547520ULL) << 29) ^ xval;
+
+	xval = ((state->s4 <<  5) ^ state->s4) >> 24;
+	state->s4 = ((state->s4 & 18446744073709420544ULL) << 23) ^ xval;
+
+	xval = ((state->s5 <<  3) ^ state->s5) >> 33;
+	state->s5 = ((state->s5 & 18446744073701163008ULL) <<  8) ^ xval;
+
+	return (state->s1 ^ state->s2 ^ state->s3 ^ state->s4 ^ state->s5);
+}
+
+static inline uint64_t __rand(struct frand_state *state)
+{
+	if (state->use64)
+		return __rand64(&state->state64);
+	else
+		return __rand32(&state->state32);
+}
+
+static inline double __rand_0_1(struct frand_state *state)
+{
+	if (state->use64) {
+		uint64_t val = __rand64(&state->state64);
+
+		return (val + 1.0) / (FRAND64_MAX + 1.0);
+	} else {
+		uint32_t val = __rand32(&state->state32);
+
+		return (val + 1.0) / (FRAND32_MAX + 1.0);
+	}
+}
+
+/*
+ * Generate a random value between 'start' and 'end', both inclusive
+ */
+static inline int rand32_between(struct frand_state *state, int start, int end)
+{
+	uint32_t r;
+
+	assert(!state->use64);
+
+	r = __rand32(&state->state32);
+	return start + (int) ((double)end * (r / (FRAND32_MAX + 1.0)));
+}
+
+extern void init_rand(struct frand_state *, bool);
+extern void init_rand_seed(struct frand_state *, unsigned int seed, bool);
 extern void __fill_random_buf(void *buf, unsigned int len, unsigned long seed);
 extern unsigned long fill_random_buf(struct frand_state *, void *buf, unsigned int len);
 extern void __fill_random_buf_percentage(unsigned long, void *, unsigned int, unsigned int, unsigned int, char *, unsigned int);
 extern unsigned long fill_random_buf_percentage(struct frand_state *, void *, unsigned int, unsigned int, unsigned int, char *, unsigned int);
-extern void fill_pattern(void *p, unsigned int len, char *pattern, unsigned int pattern_bytes);
 
 #endif
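
A short sketch of the new 32/64-bit random state selection, assuming lib/rand.h as extended above; the seed value and the demo function are made up for illustration:

#include <stdio.h>
#include "lib/rand.h"

void rand_demo(void)
{
	struct frand_state rs32, rs64;

	init_rand_seed(&rs32, 0x1234, false);	/* taus88, 32-bit output */
	init_rand_seed(&rs64, 0x1234, true);	/* taus258, 64-bit output */

	/* __rand() dispatches on use64; rand_max() gives the matching range */
	printf("32-bit: %llu / %llu\n",
	       (unsigned long long) __rand(&rs32),
	       (unsigned long long) rand_max(&rs32));
	printf("64-bit: %llu / %llu\n",
	       (unsigned long long) __rand(&rs64),
	       (unsigned long long) rand_max(&rs64));
}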
diff --git a/lib/seqlock.h b/lib/seqlock.h
new file mode 100644
index 0000000..1ac1eb6
--- /dev/null
+++ b/lib/seqlock.h
@@ -0,0 +1,48 @@
+#ifndef FIO_SEQLOCK_H
+#define FIO_SEQLOCK_H
+
+#include "../arch/arch.h"
+
+struct seqlock {
+	volatile int sequence;
+};
+
+static inline void seqlock_init(struct seqlock *s)
+{
+	s->sequence = 0;
+}
+
+static inline unsigned int read_seqlock_begin(struct seqlock *s)
+{
+	unsigned int seq;
+
+	do {
+		seq = s->sequence;
+		if (!(seq & 1))
+			break;
+		nop;
+	} while (1);
+
+	read_barrier();
+	return seq;
+}
+
+static inline bool read_seqlock_retry(struct seqlock *s, unsigned int seq)
+{
+	read_barrier();
+	return s->sequence != seq;
+}
+
+static inline void write_seqlock_begin(struct seqlock *s)
+{
+	s->sequence++;
+	write_barrier();
+}
+
+static inline void write_seqlock_end(struct seqlock *s)
+{
+	write_barrier();
+	s->sequence++;
+}
+
+#endif
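
A sketch of the intended reader/writer pairing for this seqlock, assuming a single writer as with the kernel primitive it mirrors; the struct and function names are illustrative:

#include "lib/seqlock.h"

struct shared_pair {
	struct seqlock lock;
	unsigned int a, b;
};

/* Writer side: bump the sequence around the update */
void pair_store(struct shared_pair *p, unsigned int a, unsigned int b)
{
	write_seqlock_begin(&p->lock);
	p->a = a;
	p->b = b;
	write_seqlock_end(&p->lock);
}

/* Reader side: retry until a consistent snapshot has been observed */
unsigned int pair_sum(struct shared_pair *p)
{
	unsigned int seq, a, b;

	do {
		seq = read_seqlock_begin(&p->lock);
		a = p->a;
		b = p->b;
	} while (read_seqlock_retry(&p->lock, seq));

	return a + b;
}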
diff --git a/lib/strntol.c b/lib/strntol.c
new file mode 100644
index 0000000..f622c8d
--- /dev/null
+++ b/lib/strntol.c
@@ -0,0 +1,33 @@
+#include <string.h>
+#include <stdlib.h>
+#include <limits.h>
+
+#include "strntol.h"
+
+long strntol(const char *str, size_t sz, char **end, int base)
+{
+	/* Expect that the digit representation of LONG_MAX/MIN is
+	 * not larger than this buffer */
+	char buf[24];
+	long ret;
+	const char *beg = str;
+
+	/* Skip leading spaces */
+	for (; beg && sz && *beg == ' '; beg++, sz--)
+		;
+
+	if (!sz || sz >= sizeof(buf)) {
+		if (end)
+			*end = (char *)str;
+		return 0;
+	}
+
+	memcpy(buf, beg, sz);
+	buf[sz] = '\0';
+	ret = strtol(buf, end, base);
+	if (ret == LONG_MIN || ret == LONG_MAX)
+		return ret;
+	if (end)
+		*end = (char *)str + (*end - buf);
+	return ret;
+}
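
For reference, a small usage sketch of the length-bounded parse; the demo names are assumed:

#include <stdio.h>
#include "lib/strntol.h"

void strntol_demo(void)
{
	const char *s = "123abc";
	char *end;

	/* only the first three characters are considered */
	long v = strntol(s, 3, &end, 10);
	printf("parsed %ld, stopped at \"%s\"\n", v, end);	/* 123, "abc" */
}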
diff --git a/lib/strntol.h b/lib/strntol.h
new file mode 100644
index 0000000..68f5d1b
--- /dev/null
+++ b/lib/strntol.h
@@ -0,0 +1,6 @@
+#ifndef FIO_STRNTOL_H
+#define FIO_STRNTOL_H
+
+long strntol(const char *str, size_t sz, char **end, int base);
+
+#endif
diff --git a/lib/tp.c b/lib/tp.c
deleted file mode 100644
index 7462f5b..0000000
--- a/lib/tp.c
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Basic workqueue like code, that sets up a thread and allows async
- * processing of some sort. Could be extended to allow for multiple
- * worker threads. But right now fio associates one of this per IO
- * thread, so should be enough to have just a single thread doing the
- * work.
- */
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <unistd.h>
-#include <errno.h>
-#include <pthread.h>
-#include <string.h>
-
-#include "../smalloc.h"
-#include "../log.h"
-#include "tp.h"
-
-static void tp_flush_work(struct flist_head *list)
-{
-	struct tp_work *work;
-
-	while (!flist_empty(list)) {
-		int prio;
-
-		work = flist_entry(list->next, struct tp_work, list);
-		flist_del(&work->list);
-
-		prio = work->prio;
-		if (nice(prio) < 0)
-			log_err("fio: nice %s\n", strerror(errno));
-
-		work->fn(work);
-
-		if (nice(prio) < 0)
-			log_err("fio: nice %s\n", strerror(errno));
-	}
-}
-
-static void *tp_thread(void *data)
-{
-	struct tp_data *tdat = data;
-	struct flist_head work_list;
-
-	INIT_FLIST_HEAD(&work_list);
-
-	while (1) {
-		pthread_mutex_lock(&tdat->lock);
-
-		if (!tdat->thread_exit && flist_empty(&tdat->work))
-			pthread_cond_wait(&tdat->cv, &tdat->lock);
-
-		if (!flist_empty(&tdat->work))
-			flist_splice_tail_init(&tdat->work, &work_list);
-
-		pthread_mutex_unlock(&tdat->lock);
-
-		if (flist_empty(&work_list)) {
-			if (tdat->thread_exit)
-				break;
-			continue;
-		}
-
-		tp_flush_work(&work_list);
-	}
-
-	return NULL;
-}
-
-void tp_queue_work(struct tp_data *tdat, struct tp_work *work)
-{
-	work->done = 0;
-
-	pthread_mutex_lock(&tdat->lock);
-	flist_add_tail(&work->list, &tdat->work);
-	pthread_mutex_unlock(&tdat->lock);
-
-	pthread_cond_signal(&tdat->cv);
-}
-
-void tp_init(struct tp_data **tdatp)
-{
-	struct tp_data *tdat;
-	int ret;
-
-	if (*tdatp)
-		return;
-
-	*tdatp = tdat = smalloc(sizeof(*tdat));
-	pthread_mutex_init(&tdat->lock, NULL);
-	INIT_FLIST_HEAD(&tdat->work);
-	pthread_cond_init(&tdat->cv, NULL);
-	pthread_cond_init(&tdat->sleep_cv, NULL);
-
-	ret = pthread_create(&tdat->thread, NULL, tp_thread, tdat);
-	if (ret)
-		log_err("fio: failed to create tp thread\n");
-}
-
-void tp_exit(struct tp_data **tdatp)
-{
-	struct tp_data *tdat = *tdatp;
-	void *ret;
-
-	if (!tdat)
-		return;
-
-	pthread_mutex_lock(&tdat->lock);
-	tdat->thread_exit = 1;
-	pthread_mutex_unlock(&tdat->lock);
-
-	pthread_cond_signal(&tdat->cv);
-
-	pthread_join(tdat->thread, &ret);
-
-	sfree(tdat);
-	*tdatp = NULL;
-}
diff --git a/lib/tp.h b/lib/tp.h
deleted file mode 100644
index 9147cc2..0000000
--- a/lib/tp.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef FIO_TP_H
-#define FIO_TP_H
-
-#include "../flist.h"
-
-struct tp_work;
-typedef int (tp_work_fn)(struct tp_work *);
-
-struct tp_work {
-	struct flist_head list;
-	tp_work_fn *fn;
-	int wait;
-	int prio;
-	pthread_cond_t cv;
-	pthread_mutex_t lock;
-	volatile int done;
-};
-
-struct tp_data {
-	pthread_t thread;
-	pthread_cond_t cv;
-	pthread_mutex_t lock;
-	struct flist_head work;
-	volatile int thread_exit;
-	pthread_cond_t sleep_cv;
-	volatile int sleeping;
-};
-
-extern void tp_init(struct tp_data **);
-extern void tp_exit(struct tp_data **);
-extern void tp_queue_work(struct tp_data *, struct tp_work *);
-
-#endif
diff --git a/lib/types.h b/lib/types.h
new file mode 100644
index 0000000..287a3b4
--- /dev/null
+++ b/lib/types.h
@@ -0,0 +1,16 @@
+#ifndef FIO_TYPES_H
+#define FIO_TYPES_H
+
+#ifndef CONFIG_HAVE_BOOL
+typedef int bool;
+#ifndef false
+#define false	0
+#endif
+#ifndef true
+#define true	1
+#endif
+#else
+#include <stdbool.h>
+#endif
+
+#endif
diff --git a/lib/zipf.c b/lib/zipf.c
index c691bc5..3d535c7 100644
--- a/lib/zipf.c
+++ b/lib/zipf.c
@@ -6,7 +6,6 @@
 #include <sys/types.h>
 #include <fcntl.h>
 #include "ieee754.h"
-#include "../log.h"
 #include "zipf.h"
 #include "../minmax.h"
 #include "../hash.h"
@@ -35,7 +34,7 @@
 	memset(zs, 0, sizeof(*zs));
 	zs->nranges = nranges;
 
-	init_rand_seed(&zs->rand, seed);
+	init_rand_seed(&zs->rand, seed, 0);
 	zs->rand_off = __rand(&zs->rand);
 }
 
@@ -59,7 +58,7 @@
 	alpha = 1.0 / (1.0 - zs->theta);
 	eta = (1.0 - pow(2.0 / n, 1.0 - zs->theta)) / (1.0 - zs->zeta2 / zs->zetan);
 
-	rand_uni = (double) __rand(&zs->rand) / (double) FRAND_MAX;
+	rand_uni = (double) __rand(&zs->rand) / (double) FRAND32_MAX;
 	rand_z = rand_uni * zs->zetan;
 
 	if (rand_z < 1.0)
@@ -69,7 +68,12 @@
 	else
 		val = 1 + (unsigned long long)(n * pow(eta*rand_uni - eta + 1.0, alpha));
 
-	return (__hash_u64(val - 1) + zs->rand_off) % zs->nranges;
+	val--;
+
+	if (!zs->disable_hash)
+		val = __hash_u64(val);
+
+	return (val + zs->rand_off) % zs->nranges;
 }
 
 void pareto_init(struct zipf_state *zs, unsigned long nranges, double h,
@@ -81,8 +85,18 @@
 
 unsigned long long pareto_next(struct zipf_state *zs)
 {
-	double rand = (double) __rand(&zs->rand) / (double) FRAND_MAX;
-	unsigned long long n = zs->nranges - 1;
+	double rand = (double) __rand(&zs->rand) / (double) FRAND32_MAX;
+	unsigned long long n;
 
-	return (__hash_u64(n * pow(rand, zs->pareto_pow)) + zs->rand_off) % zs->nranges;
+	n = (zs->nranges - 1) * pow(rand, zs->pareto_pow);
+
+	if (!zs->disable_hash)
+		n = __hash_u64(n);
+
+	return (n + zs->rand_off) % zs->nranges;
+}
+
+void zipf_disable_hash(struct zipf_state *zs)
+{
+	zs->disable_hash = true;
 }
diff --git a/lib/zipf.h b/lib/zipf.h
index f98ad81..af2d0e6 100644
--- a/lib/zipf.h
+++ b/lib/zipf.h
@@ -12,6 +12,7 @@
 	double pareto_pow;
 	struct frand_state rand;
 	uint64_t rand_off;
+	bool disable_hash;
 };
 
 void zipf_init(struct zipf_state *zs, unsigned long nranges, double theta, unsigned int seed);
@@ -19,5 +20,6 @@
 
 void pareto_init(struct zipf_state *zs, unsigned long nranges, double h, unsigned int seed);
 unsigned long long pareto_next(struct zipf_state *zs);
+void zipf_disable_hash(struct zipf_state *zs);
 
 #endif
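
A sketch of the new zipf_disable_hash() hook on the pareto variant, assuming only the declarations above; the nranges/h/seed values and the demo function are arbitrary:

#include <stdio.h>
#include "lib/zipf.h"

void pareto_demo(void)
{
	struct zipf_state zs;
	int i;

	pareto_init(&zs, 1000, 0.3, 0x8989);

	/* keep hot ranges clustered instead of hashing them apart */
	zipf_disable_hash(&zs);

	for (i = 0; i < 4; i++)
		printf("%llu\n", pareto_next(&zs));
}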
diff --git a/libfio.c b/libfio.c
index 57ce725..8310708 100644
--- a/libfio.c
+++ b/libfio.c
@@ -33,19 +33,17 @@
 #include "smalloc.h"
 #include "os/os.h"
 #include "filelock.h"
+#include "helper_thread.h"
+#include "filehash.h"
 
-/*
- * Just expose an empty list, if the OS does not support disk util stats
- */
-#ifndef FIO_HAVE_DISK_UTIL
 FLIST_HEAD(disk_list);
-#endif
 
 unsigned long arch_flags = 0;
 
 uintptr_t page_mask = 0;
 uintptr_t page_size = 0;
 
+/* see os/os.h */
 static const char *fio_os_strings[os_nr] = {
 	"Invalid",
 	"Linux",
@@ -61,6 +59,7 @@
 	"DragonFly",
 };
 
+/* see arch/arch.h */
 static const char *fio_arch_strings[arch_nr] = {
 	"Invalid",
 	"x86-64",
@@ -74,21 +73,29 @@
 	"arm",
 	"sh",
 	"hppa",
+	"mips",
+	"aarch64",
 	"generic"
 };
 
-static void reset_io_counters(struct thread_data *td)
+static void reset_io_counters(struct thread_data *td, int all)
 {
 	int ddir;
 
-	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
-		td->stat_io_bytes[ddir] = 0;
-		td->this_io_bytes[ddir] = 0;
-		td->stat_io_blocks[ddir] = 0;
-		td->this_io_blocks[ddir] = 0;
-		td->rate_bytes[ddir] = 0;
-		td->rate_blocks[ddir] = 0;
+	if (all) {
+		for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
+			td->stat_io_bytes[ddir] = 0;
+			td->this_io_bytes[ddir] = 0;
+			td->stat_io_blocks[ddir] = 0;
+			td->this_io_blocks[ddir] = 0;
+			td->rate_bytes[ddir] = 0;
+			td->rate_blocks[ddir] = 0;
+			td->bytes_done[ddir] = 0;
+			td->rate_io_issue_bytes[ddir] = 0;
+			td->rate_next_io_time[ddir] = 0;
+		}
 	}
+
 	td->zone_bytes = 0;
 
 	td->last_was_sync = 0;
@@ -101,12 +108,12 @@
 		td->nr_done_files = 0;
 }
 
-void clear_io_state(struct thread_data *td)
+void clear_io_state(struct thread_data *td, int all)
 {
 	struct fio_file *f;
 	unsigned int i;
 
-	reset_io_counters(td);
+	reset_io_counters(td, all);
 
 	close_files(td);
 	for_each_file(td, f, i) {
@@ -115,17 +122,17 @@
 	}
 
 	/*
-	 * Set the same seed to get repeatable runs
+	 * Re-seed the random number generator if rand_repeatable is set
 	 */
-	td_fill_rand_seeds(td);
+	if (td->o.rand_repeatable)
+		td_fill_rand_seeds(td);
 }
 
 void reset_all_stats(struct thread_data *td)
 {
-	struct timeval tv;
 	int i;
 
-	reset_io_counters(td);
+	reset_io_counters(td, 1);
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		td->io_bytes[i] = 0;
@@ -136,11 +143,15 @@
 		td->rwmix_issues = 0;
 	}
 
-	fio_gettime(&tv, NULL);
-	memcpy(&td->epoch, &tv, sizeof(tv));
-	memcpy(&td->start, &tv, sizeof(tv));
+	set_epoch_time(td, td->o.log_unix_epoch);
+	memcpy(&td->start, &td->epoch, sizeof(struct timeval));
+	memcpy(&td->iops_sample_time, &td->epoch, sizeof(struct timeval));
+	memcpy(&td->bw_sample_time, &td->epoch, sizeof(struct timeval));
+	memcpy(&td->ss.prev_time, &td->epoch, sizeof(struct timeval));
 
 	lat_target_reset(td);
+	clear_rusage_stat(td);
+	helper_reset();
 }
 
 void reset_fio_state(void)
@@ -167,13 +178,38 @@
 	return NULL;
 }
 
+static const char *td_runstates[] = {
+	"NOT_CREATED",
+	"CREATED",
+	"INITIALIZED",
+	"RAMP",
+	"SETTING_UP",
+	"RUNNING",
+	"PRE_READING",
+	"VERIFYING",
+	"FSYNCING",
+	"FINISHING",
+	"EXITED",
+	"REAPED",
+};
+
+const char *runstate_to_name(int runstate)
+{
+	compiletime_assert(TD_LAST == 12, "td runstate list");
+	if (runstate >= 0 && runstate < TD_LAST)
+		return td_runstates[runstate];
+
+	return "invalid";
+}
+
 void td_set_runstate(struct thread_data *td, int runstate)
 {
 	if (td->runstate == runstate)
 		return;
 
-	dprint(FD_PROCESS, "pid=%d: runstate %d -> %d\n", (int) td->pid,
-						td->runstate, runstate);
+	dprint(FD_PROCESS, "pid=%d: runstate %s -> %s\n", (int) td->pid,
+						runstate_to_name(td->runstate),
+						runstate_to_name(runstate));
 	td->runstate = runstate;
 }
 
@@ -197,7 +233,7 @@
 	td->terminate = 1;
 }
 
-void fio_terminate_threads(int group_id)
+void fio_terminate_threads(unsigned int group_id)
 {
 	struct thread_data *td;
 	pid_t pid = getpid();
@@ -206,7 +242,7 @@
 	dprint(FD_PROCESS, "terminate group_id=%d\n", group_id);
 
 	for_each_td(td, i) {
-		if (group_id == TERMINATE_ALL || groupid == td->groupid) {
+		if (group_id == TERMINATE_ALL || group_id == td->groupid) {
 			dprint(FD_PROCESS, "setting terminate on %s/%d\n",
 						td->o.name, (int) td->pid);
 
@@ -237,14 +273,18 @@
 {
 	struct thread_data *td;
 	int i;
+	int nr_io_threads = 0;
 
 	for_each_td(td, i) {
-		if (td->flags & TD_F_NOIO)
+		if (td->io_ops_init && td_ioengine_flagged(td, FIO_NOIO))
 			continue;
+		nr_io_threads++;
 		if (td->runstate < TD_EXITED)
 			return 1;
 	}
 
+	if (!nr_io_threads)
+		return -1; /* we only had cpuio threads to begin with */
 	return 0;
 }
 
@@ -266,6 +306,13 @@
 	return flags;
 }
 
+enum {
+	ENDIAN_INVALID_BE = 1,
+	ENDIAN_INVALID_LE,
+	ENDIAN_INVALID_CONFIG,
+	ENDIAN_BROKEN,
+};
+
 static int endian_check(void)
 {
 	union {
@@ -282,16 +329,16 @@
 
 #if defined(CONFIG_LITTLE_ENDIAN)
 	if (be)
-		return 1;
+		return ENDIAN_INVALID_BE;
 #elif defined(CONFIG_BIG_ENDIAN)
 	if (le)
-		return 1;
+		return ENDIAN_INVALID_LE;
 #else
-	return 1;
+	return ENDIAN_INVALID_CONFIG;
 #endif
 
 	if (!le && !be)
-		return 1;
+		return ENDIAN_BROKEN;
 
 	return 0;
 }
@@ -299,6 +346,7 @@
 int initialize_fio(char *envp[])
 {
 	long ps;
+	int err;
 
 	/*
 	 * We need these to be properly 64-bit aligned, otherwise we
@@ -314,8 +362,26 @@
 	compiletime_assert((offsetof(struct thread_options_pack, percentile_list) % 8) == 0, "percentile_list");
 	compiletime_assert((offsetof(struct thread_options_pack, latency_percentile) % 8) == 0, "latency_percentile");
 
-	if (endian_check()) {
+	err = endian_check();
+	if (err) {
 		log_err("fio: endianness settings appear wrong.\n");
+		switch (err) {
+		case ENDIAN_INVALID_BE:
+			log_err("fio: got big-endian when configured for little\n");
+			break;
+		case ENDIAN_INVALID_LE:
+			log_err("fio: got little-endian when configured for big\n");
+			break;
+		case ENDIAN_INVALID_CONFIG:
+			log_err("fio: not configured to any endianness\n");
+			break;
+		case ENDIAN_BROKEN:
+			log_err("fio: failed to detect endianness\n");
+			break;
+		default:
+			assert(0);
+			break;
+		}
 		log_err("fio: please report this to fio@vger.kernel.org\n");
 		return 1;
 	}
@@ -333,6 +399,8 @@
 		return 1;
 	}
 
+	file_hash_init();
+
 	/*
 	 * We need locale for number printing, if it isn't set then just
 	 * go with the US format.
@@ -352,3 +420,8 @@
 	fio_keywords_init();
 	return 0;
 }
+
+void deinitialize_fio(void)
+{
+	fio_keywords_exit();
+}
diff --git a/log.c b/log.c
index c4a3b52..4eb4af5 100644
--- a/log.c
+++ b/log.c
@@ -6,35 +6,32 @@
 
 #include "fio.h"
 
-int log_valist(const char *str, va_list args)
+size_t log_info_buf(const char *buf, size_t len)
+{
+	if (is_backend) {
+		size_t ret = fio_server_text_output(FIO_LOG_INFO, buf, len);
+		if (ret != -1)
+			return ret;
+	}
+
+	if (log_syslog) {
+		syslog(LOG_INFO, "%s", buf);
+		return len;
+	} else
+		return fwrite(buf, len, 1, f_out);
+}
+
+size_t log_valist(const char *str, va_list args)
 {
 	char buffer[1024];
 	size_t len;
 
 	len = vsnprintf(buffer, sizeof(buffer), str, args);
-	len = min(len, sizeof(buffer) - 1);
 
-	if (is_backend)
-		len = fio_server_text_output(FIO_LOG_INFO, buffer, len);
-	if (log_syslog)
-		syslog(LOG_INFO, "%s", buffer);
-	else
-		len = fwrite(buffer, len, 1, f_out);
-
-	return len;
+	return log_info_buf(buffer, min(len, sizeof(buffer) - 1));
 }
 
-int log_local_buf(const char *buf, size_t len)
-{
-	if (log_syslog)
-		syslog(LOG_INFO, "%s", buf);
-	else
-		len = fwrite(buf, len, 1, f_out);
-
-	return len;
-}
-
-int log_info(const char *format, ...)
+size_t log_info(const char *format, ...)
 {
 	char buffer[1024];
 	va_list args;
@@ -43,15 +40,21 @@
 	va_start(args, format);
 	len = vsnprintf(buffer, sizeof(buffer), format, args);
 	va_end(args);
-	len = min(len, sizeof(buffer) - 1);
 
-	if (is_backend)
-		return fio_server_text_output(FIO_LOG_INFO, buffer, len);
-	else if (log_syslog) {
-		syslog(LOG_INFO, "%s", buffer);
-		return len;
-	} else
-		return fwrite(buffer, len, 1, f_out);
+	return log_info_buf(buffer, min(len, sizeof(buffer) - 1));
+}
+
+size_t __log_buf(struct buf_output *buf, const char *format, ...)
+{
+	char buffer[1024];
+	va_list args;
+	size_t len;
+
+	va_start(args, format);
+	len = vsnprintf(buffer, sizeof(buffer), format, args);
+	va_end(args);
+
+	return buf_output_add(buf, buffer, min(len, sizeof(buffer) - 1));
 }
 
 int log_info_flush(void)
@@ -62,7 +65,7 @@
 	return fflush(f_out);
 }
 
-int log_err(const char *format, ...)
+size_t log_err(const char *format, ...)
 {
 	char buffer[1024];
 	va_list args;
@@ -73,9 +76,13 @@
 	va_end(args);
 	len = min(len, sizeof(buffer) - 1);
 
-	if (is_backend)
-		return fio_server_text_output(FIO_LOG_ERR, buffer, len);
-	else if (log_syslog) {
+	if (is_backend) {
+		size_t ret = fio_server_text_output(FIO_LOG_ERR, buffer, len);
+		if (ret != -1)
+			return ret;
+	}
+
+	if (log_syslog) {
 		syslog(LOG_INFO, "%s", buffer);
 		return len;
 	} else {
diff --git a/log.h b/log.h
index e509313..a39dea6 100644
--- a/log.h
+++ b/log.h
@@ -4,15 +4,26 @@
 #include <stdio.h>
 #include <stdarg.h>
 
+#include "lib/output_buffer.h"
+
 extern FILE *f_out;
 extern FILE *f_err;
 
-extern int log_err(const char *format, ...) __attribute__ ((__format__ (__printf__, 1, 2)));
-extern int log_info(const char *format, ...) __attribute__ ((__format__ (__printf__, 1, 2)));
-extern int log_valist(const char *str, va_list);
-extern int log_local_buf(const char *buf, size_t);
+extern size_t log_err(const char *format, ...) __attribute__ ((__format__ (__printf__, 1, 2)));
+extern size_t log_info(const char *format, ...) __attribute__ ((__format__ (__printf__, 1, 2)));
+extern size_t __log_buf(struct buf_output *, const char *format, ...) __attribute__ ((__format__ (__printf__, 2, 3)));
+extern size_t log_valist(const char *str, va_list);
+extern size_t log_info_buf(const char *buf, size_t len);
 extern int log_info_flush(void);
 
+#define log_buf(buf, format, args...)		\
+do {						\
+	if ((buf) != NULL)			\
+		__log_buf(buf, format, ##args);	\
+	else					\
+		log_info(format, ##args);	\
+} while (0)
+
 enum {
 	FIO_LOG_DEBUG	= 1,
 	FIO_LOG_INFO	= 2,
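
A small sketch of how the new log_buf() macro is meant to be used, assuming struct buf_output from lib/output_buffer.h; the function and its arguments are hypothetical:

#include "log.h"

/* With a NULL buf this falls through to log_info(); with a real
 * struct buf_output it appends to that buffer via __log_buf(). */
void report_jobs(struct buf_output *out, int nr_jobs)
{
	log_buf(out, "fio: %d jobs running\n", nr_jobs);
}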
diff --git a/memory.c b/memory.c
index 23a0d94..22a7f5d 100644
--- a/memory.c
+++ b/memory.c
@@ -33,13 +33,13 @@
 	dprint(FD_MEM, "pinning %llu bytes\n", td->o.lockmem);
 
 	/*
-	 * Don't allow mlock of more than real_mem-128MB
+	 * Don't allow mlock of more than real_mem-128MiB
 	 */
 	phys_mem = os_phys_mem();
 	if (phys_mem) {
 		if ((td->o.lockmem + 128 * 1024 * 1024) > phys_mem) {
 			td->o.lockmem = phys_mem - 128 * 1024 * 1024;
-			log_info("fio: limiting mlocked memory to %lluMB\n",
+			log_info("fio: limiting mlocked memory to %lluMiB\n",
 							td->o.lockmem >> 20);
 		}
 	}
@@ -89,7 +89,7 @@
 					" support huge pages.\n");
 			} else if (errno == ENOMEM) {
 				log_err("fio: no huge pages available, do you"
-					" need to alocate some? See HOWTO.\n");
+					" need to allocate some? See HOWTO.\n");
 			}
 		}
 
@@ -146,12 +146,14 @@
 			return 1;
 		}
 		if (td->o.mem_type != MEM_MMAPHUGE &&
+		    td->o.mem_type != MEM_MMAPSHARED &&
 		    ftruncate(td->mmapfd, total_mem) < 0) {
 			td_verror(td, errno, "truncate mmap file");
 			td->orig_buffer = NULL;
 			return 1;
 		}
-		if (td->o.mem_type == MEM_MMAPHUGE)
+		if (td->o.mem_type == MEM_MMAPHUGE ||
+		    td->o.mem_type == MEM_MMAPSHARED)
 			flags |= MAP_SHARED;
 		else
 			flags |= MAP_PRIVATE;
@@ -205,6 +207,78 @@
 	free(td->orig_buffer);
 }
 
+static int alloc_mem_cudamalloc(struct thread_data *td, size_t total_mem)
+{
+#ifdef CONFIG_CUDA
+	CUresult ret;
+	char name[128];
+
+	ret = cuInit(0);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: failed initialize cuda driver api\n");
+		return 1;
+	}
+
+	ret = cuDeviceGetCount(&td->gpu_dev_cnt);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: failed get device count\n");
+		return 1;
+	}
+	dprint(FD_MEM, "found %d GPU devices\n", td->gpu_dev_cnt);
+
+	if (td->gpu_dev_cnt == 0) {
+		log_err("fio: no GPU device found. "
+			"Can not perform GPUDirect RDMA.\n");
+		return 1;
+	}
+
+	td->gpu_dev_id = td->o.gpu_dev_id;
+	ret = cuDeviceGet(&td->cu_dev, td->gpu_dev_id);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: failed get GPU device\n");
+		return 1;
+	}
+
+	ret = cuDeviceGetName(name, sizeof(name), td->gpu_dev_id);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: failed get device name\n");
+		return 1;
+	}
+	dprint(FD_MEM, "dev_id = [%d], device name = [%s]\n", \
+	       td->gpu_dev_id, name);
+
+	ret = cuCtxCreate(&td->cu_ctx, CU_CTX_MAP_HOST, td->cu_dev);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: failed to create cuda context: %d\n", ret);
+		return 1;
+	}
+
+	ret = cuMemAlloc(&td->dev_mem_ptr, total_mem);
+	if (ret != CUDA_SUCCESS) {
+		log_err("fio: cuMemAlloc %zu bytes failed\n", total_mem);
+		return 1;
+	}
+	td->orig_buffer = (void *) td->dev_mem_ptr;
+
+	dprint(FD_MEM, "cudaMalloc %llu %p\n",				\
+	       (unsigned long long) total_mem, td->orig_buffer);
+	return 0;
+#else
+	return -EINVAL;
+#endif
+}
+
+static void free_mem_cudamalloc(struct thread_data *td)
+{
+#ifdef CONFIG_CUDA
+	if (td->dev_mem_ptr != NULL)
+		cuMemFree(td->dev_mem_ptr);
+
+	if (cuCtxDestroy(td->cu_ctx) != CUDA_SUCCESS)
+		log_err("fio: failed to destroy cuda context\n");
+#endif
+}
+
 /*
  * Set up the buffer area we need for io.
  */
@@ -213,13 +287,13 @@
 	size_t total_mem;
 	int ret = 0;
 
-	if (td->io_ops->flags & FIO_NOIO)
+	if (td_ioengine_flagged(td, FIO_NOIO))
 		return 0;
 
 	total_mem = td->orig_buffer_size;
 
 	if (td->o.odirect || td->o.mem_align || td->o.oatomic ||
-	    (td->io_ops->flags & FIO_MEMALIGN)) {
+	    td_ioengine_flagged(td, FIO_MEMALIGN)) {
 		total_mem += page_mask;
 		if (td->o.mem_align && td->o.mem_align > page_size)
 			total_mem += td->o.mem_align - page_size;
@@ -227,12 +301,25 @@
 
 	dprint(FD_MEM, "Alloc %llu for buffers\n", (unsigned long long) total_mem);
 
-	if (td->o.mem_type == MEM_MALLOC)
+	/*
+	 * If the IO engine has hooks to allocate/free memory, use those. But
+	 * error out if the user explicitly asked for something else.
+	 */
+	if (td->io_ops->iomem_alloc) {
+		if (fio_option_is_set(&td->o, mem_type)) {
+			log_err("fio: option 'mem/iomem' conflicts with specified IO engine\n");
+			ret = 1;
+		} else
+			ret = td->io_ops->iomem_alloc(td, total_mem);
+	} else if (td->o.mem_type == MEM_MALLOC)
 		ret = alloc_mem_malloc(td, total_mem);
 	else if (td->o.mem_type == MEM_SHM || td->o.mem_type == MEM_SHMHUGE)
 		ret = alloc_mem_shm(td, total_mem);
-	else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE)
+	else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE ||
+		 td->o.mem_type == MEM_MMAPSHARED)
 		ret = alloc_mem_mmap(td, total_mem);
+	else if (td->o.mem_type == MEM_CUDA_MALLOC)
+		ret = alloc_mem_cudamalloc(td, total_mem);
 	else {
 		log_err("fio: bad mem type: %d\n", td->o.mem_type);
 		ret = 1;
@@ -252,12 +339,18 @@
 	if (td->o.odirect || td->o.oatomic)
 		total_mem += page_mask;
 
-	if (td->o.mem_type == MEM_MALLOC)
+	if (td->io_ops->iomem_alloc) {
+		if (td->io_ops->iomem_free)
+			td->io_ops->iomem_free(td);
+	} else if (td->o.mem_type == MEM_MALLOC)
 		free_mem_malloc(td);
 	else if (td->o.mem_type == MEM_SHM || td->o.mem_type == MEM_SHMHUGE)
 		free_mem_shm(td);
-	else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE)
+	else if (td->o.mem_type == MEM_MMAP || td->o.mem_type == MEM_MMAPHUGE ||
+		 td->o.mem_type == MEM_MMAPSHARED)
 		free_mem_mmap(td, total_mem);
+	else if (td->o.mem_type == MEM_CUDA_MALLOC)
+		free_mem_cudamalloc(td);
 	else
 		log_err("Bad memory type %u\n", td->o.mem_type);
 
diff --git a/minmax.h b/minmax.h
index 97957c8..afc78f0 100644
--- a/minmax.h
+++ b/minmax.h
@@ -17,4 +17,9 @@
 	_x > _y ? _x : _y; })
 #endif
 
+#define min_not_zero(x, y) ({		\
+	typeof(x) __x = (x);		\
+	typeof(y) __y = (y);		\
+	__x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
+
 #endif
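
The new min_not_zero() helper treats a zero operand as "unset"; a quick hypothetical check:

#include <assert.h>
#include "minmax.h"

void min_not_zero_demo(void)
{
	assert(min_not_zero(0, 5) == 5);	/* zero operands are skipped */
	assert(min_not_zero(3, 0) == 3);
	assert(min_not_zero(3, 5) == 3);	/* otherwise plain min() */
}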
diff --git a/mutex.c b/mutex.c
index 53f9651..d8c4825 100644
--- a/mutex.c
+++ b/mutex.c
@@ -22,6 +22,12 @@
 {
 	assert(mutex->magic == FIO_MUTEX_MAGIC);
 	pthread_cond_destroy(&mutex->cond);
+
+	/*
+	 * Ensure any subsequent attempt to grab this mutex will fail
+	 * with an assert, instead of just silently hanging.
+	 */
+	memset(mutex, 0, sizeof(*mutex));
 }
 
 void fio_mutex_remove(struct fio_mutex *mutex)
@@ -30,16 +36,39 @@
 	munmap((void *) mutex, sizeof(*mutex));
 }
 
-int __fio_mutex_init(struct fio_mutex *mutex, int value)
+int cond_init_pshared(pthread_cond_t *cond)
 {
-	pthread_mutexattr_t attr;
-	pthread_condattr_t cond;
+	pthread_condattr_t cattr;
 	int ret;
 
-	mutex->value = value;
-	mutex->magic = FIO_MUTEX_MAGIC;
+	ret = pthread_condattr_init(&cattr);
+	if (ret) {
+		log_err("pthread_condattr_init: %s\n", strerror(ret));
+		return ret;
+	}
 
-	ret = pthread_mutexattr_init(&attr);
+#ifdef CONFIG_PSHARED
+	ret = pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED);
+	if (ret) {
+		log_err("pthread_condattr_setpshared: %s\n", strerror(ret));
+		return ret;
+	}
+#endif
+	ret = pthread_cond_init(cond, &cattr);
+	if (ret) {
+		log_err("pthread_cond_init: %s\n", strerror(ret));
+		return ret;
+	}
+
+	return 0;
+}
+
+int mutex_init_pshared(pthread_mutex_t *mutex)
+{
+	pthread_mutexattr_t mattr;
+	int ret;
+
+	ret = pthread_mutexattr_init(&mattr);
 	if (ret) {
 		log_err("pthread_mutexattr_init: %s\n", strerror(ret));
 		return ret;
@@ -48,28 +77,48 @@
 	/*
 	 * Not all platforms support process shared mutexes (FreeBSD)
 	 */
-#ifdef FIO_HAVE_PSHARED_MUTEX
-	ret = pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
+#ifdef CONFIG_PSHARED
+	ret = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
 	if (ret) {
 		log_err("pthread_mutexattr_setpshared: %s\n", strerror(ret));
 		return ret;
 	}
 #endif
-
-	pthread_condattr_init(&cond);
-#ifdef FIO_HAVE_PSHARED_MUTEX
-	pthread_condattr_setpshared(&cond, PTHREAD_PROCESS_SHARED);
-#endif
-	pthread_cond_init(&mutex->cond, &cond);
-
-	ret = pthread_mutex_init(&mutex->lock, &attr);
+	ret = pthread_mutex_init(mutex, &mattr);
 	if (ret) {
 		log_err("pthread_mutex_init: %s\n", strerror(ret));
 		return ret;
 	}
 
-	pthread_condattr_destroy(&cond);
-	pthread_mutexattr_destroy(&attr);
+	return 0;
+}
+
+int mutex_cond_init_pshared(pthread_mutex_t *mutex, pthread_cond_t *cond)
+{
+	int ret;
+
+	ret = mutex_init_pshared(mutex);
+	if (ret)
+		return ret;
+
+	ret = cond_init_pshared(cond);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+int __fio_mutex_init(struct fio_mutex *mutex, int value)
+{
+	int ret;
+
+	mutex->value = value;
+	mutex->magic = FIO_MUTEX_MAGIC;
+
+	ret = mutex_cond_init_pshared(&mutex->lock, &mutex->cond);
+	if (ret)
+		return ret;
+
 	return 0;
 }
 
@@ -92,12 +141,15 @@
 	return NULL;
 }
 
-static int mutex_timed_out(struct timeval *t, unsigned int seconds)
+static bool mutex_timed_out(struct timeval *t, unsigned int msecs)
 {
-	return mtime_since_now(t) >= seconds * 1000;
+	struct timeval now;
+
+	gettimeofday(&now, NULL);
+	return mtime_since(t, &now) >= msecs;
 }
 
-int fio_mutex_down_timeout(struct fio_mutex *mutex, unsigned int seconds)
+int fio_mutex_down_timeout(struct fio_mutex *mutex, unsigned int msecs)
 {
 	struct timeval tv_s;
 	struct timespec t;
@@ -106,43 +158,50 @@
 	assert(mutex->magic == FIO_MUTEX_MAGIC);
 
 	gettimeofday(&tv_s, NULL);
-	t.tv_sec = tv_s.tv_sec + seconds;
+	t.tv_sec = tv_s.tv_sec;
 	t.tv_nsec = tv_s.tv_usec * 1000;
 
+	t.tv_sec += msecs / 1000;
+	t.tv_nsec += ((msecs * 1000000ULL) % 1000000000);
+	if (t.tv_nsec >= 1000000000) {
+		t.tv_nsec -= 1000000000;
+		t.tv_sec++;
+	}
+
 	pthread_mutex_lock(&mutex->lock);
 
+	mutex->waiters++;
 	while (!mutex->value && !ret) {
-		mutex->waiters++;
-
 		/*
 		 * Some platforms (FreeBSD 9?) seems to return timed out
 		 * way too early, double check.
 		 */
 		ret = pthread_cond_timedwait(&mutex->cond, &mutex->lock, &t);
-		if (ret == ETIMEDOUT && !mutex_timed_out(&tv_s, seconds))
+		if (ret == ETIMEDOUT && !mutex_timed_out(&tv_s, msecs))
 			ret = 0;
-
-		mutex->waiters--;
 	}
+	mutex->waiters--;
 
 	if (!ret) {
 		mutex->value--;
 		pthread_mutex_unlock(&mutex->lock);
+		return 0;
 	}
 
+	pthread_mutex_unlock(&mutex->lock);
 	return ret;
 }
 
-int fio_mutex_down_trylock(struct fio_mutex *mutex)
+bool fio_mutex_down_trylock(struct fio_mutex *mutex)
 {
-	int ret = 1;
+	bool ret = true;
 
 	assert(mutex->magic == FIO_MUTEX_MAGIC);
 
 	pthread_mutex_lock(&mutex->lock);
 	if (mutex->value) {
 		mutex->value--;
-		ret = 0;
+		ret = false;
 	}
 	pthread_mutex_unlock(&mutex->lock);
 
@@ -228,7 +287,7 @@
 		log_err("pthread_rwlockattr_init: %s\n", strerror(ret));
 		goto err;
 	}
-#ifdef FIO_HAVE_PSHARED_MUTEX
+#ifdef CONFIG_PSHARED
 	ret = pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
 	if (ret) {
 		log_err("pthread_rwlockattr_setpshared: %s\n", strerror(ret));
diff --git a/mutex.h b/mutex.h
index 17380de..54009ba 100644
--- a/mutex.h
+++ b/mutex.h
@@ -2,6 +2,7 @@
 #define FIO_MUTEX_H
 
 #include <pthread.h>
+#include "lib/types.h"
 
 #define FIO_MUTEX_MAGIC		0x4d555445U
 #define FIO_RWLOCK_MAGIC	0x52574c4fU
@@ -30,7 +31,7 @@
 extern void fio_mutex_remove(struct fio_mutex *);
 extern void fio_mutex_up(struct fio_mutex *);
 extern void fio_mutex_down(struct fio_mutex *);
-extern int fio_mutex_down_trylock(struct fio_mutex *);
+extern bool fio_mutex_down_trylock(struct fio_mutex *);
 extern int fio_mutex_down_timeout(struct fio_mutex *, unsigned int);
 
 extern void fio_rwlock_read(struct fio_rwlock *);
@@ -39,4 +40,8 @@
 extern struct fio_rwlock *fio_rwlock_init(void);
 extern void fio_rwlock_remove(struct fio_rwlock *);
 
+extern int mutex_init_pshared(pthread_mutex_t *);
+extern int cond_init_pshared(pthread_cond_t *);
+extern int mutex_cond_init_pshared(pthread_mutex_t *, pthread_cond_t *);
+
 #endif
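
A sketch of the exported pshared helpers, assuming the struct lives in memory shared between the fio processes that use it (that placement is the caller's job); names are illustrative:

#include <pthread.h>
#include "mutex.h"

struct shared_state {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int ready;
};

int shared_state_init(struct shared_state *s)
{
	s->ready = 0;
	/* sets PTHREAD_PROCESS_SHARED where CONFIG_PSHARED is available */
	return mutex_cond_init_pshared(&s->lock, &s->cond);
}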
diff --git a/optgroup.c b/optgroup.c
new file mode 100644
index 0000000..122d24e
--- /dev/null
+++ b/optgroup.c
@@ -0,0 +1,205 @@
+#include <stdio.h>
+#include <inttypes.h>
+#include "optgroup.h"
+
+/*
+ * Option grouping
+ */
+static const struct opt_group fio_opt_groups[] = {
+	{
+		.name	= "General",
+		.mask	= FIO_OPT_C_GENERAL,
+	},
+	{
+		.name	= "I/O",
+		.mask	= FIO_OPT_C_IO,
+	},
+	{
+		.name	= "File",
+		.mask	= FIO_OPT_C_FILE,
+	},
+	{
+		.name	= "Statistics",
+		.mask	= FIO_OPT_C_STAT,
+	},
+	{
+		.name	= "Logging",
+		.mask	= FIO_OPT_C_LOG,
+	},
+	{
+		.name	= "Profiles",
+		.mask	= FIO_OPT_C_PROFILE,
+	},
+	{
+		.name	= "I/O engines",
+		.mask	= FIO_OPT_C_ENGINE,
+	},
+	{
+		.name	= NULL,
+	},
+};
+
+static const struct opt_group fio_opt_cat_groups[] = {
+	{
+		.name	= "Rate",
+		.mask	= FIO_OPT_G_RATE,
+	},
+	{
+		.name	= "Zone",
+		.mask	= FIO_OPT_G_ZONE,
+	},
+	{
+		.name	= "Read/write mix",
+		.mask	= FIO_OPT_G_RWMIX,
+	},
+	{
+		.name	= "Verify",
+		.mask	= FIO_OPT_G_VERIFY,
+	},
+	{
+		.name	= "Trim",
+		.mask	= FIO_OPT_G_TRIM,
+	},
+	{
+		.name	= "I/O Logging",
+		.mask	= FIO_OPT_G_IOLOG,
+	},
+	{
+		.name	= "I/O Depth",
+		.mask	= FIO_OPT_G_IO_DEPTH,
+	},
+	{
+		.name	= "I/O Flow",
+		.mask	= FIO_OPT_G_IO_FLOW,
+	},
+	{
+		.name	= "Description",
+		.mask	= FIO_OPT_G_DESC,
+	},
+	{
+		.name	= "Filename",
+		.mask	= FIO_OPT_G_FILENAME,
+	},
+	{
+		.name	= "General I/O",
+		.mask	= FIO_OPT_G_IO_BASIC,
+	},
+	{
+		.name	= "Cgroups",
+		.mask	= FIO_OPT_G_CGROUP,
+	},
+	{
+		.name	= "Runtime",
+		.mask	= FIO_OPT_G_RUNTIME,
+	},
+	{
+		.name	= "Process",
+		.mask	= FIO_OPT_G_PROCESS,
+	},
+	{
+		.name	= "Job credentials / priority",
+		.mask	= FIO_OPT_G_CRED,
+	},
+	{
+		.name	= "Clock settings",
+		.mask	= FIO_OPT_G_CLOCK,
+	},
+	{
+		.name	= "I/O Type",
+		.mask	= FIO_OPT_G_IO_TYPE,
+	},
+	{
+		.name	= "I/O Thinktime",
+		.mask	= FIO_OPT_G_THINKTIME,
+	},
+	{
+		.name	= "Randomizations",
+		.mask	= FIO_OPT_G_RANDOM,
+	},
+	{
+		.name	= "I/O buffers",
+		.mask	= FIO_OPT_G_IO_BUF,
+	},
+	{
+		.name	= "Tiobench profile",
+		.mask	= FIO_OPT_G_TIOBENCH,
+	},
+	{
+		.name	= "Error handling",
+		.mask	= FIO_OPT_G_ERR,
+	},
+	{
+		.name	= "Ext4 defrag I/O engine", /* e4defrag */
+		.mask	= FIO_OPT_G_E4DEFRAG,
+	},
+	{
+		.name	= "Network I/O engine", /* net */
+		.mask	= FIO_OPT_G_NETIO,
+	},
+	{
+		.name	= "RDMA I/O engine", /* rdma */
+		.mask	= FIO_OPT_G_RDMA,
+	},
+	{
+		.name	= "libaio I/O engine", /* libaio */
+		.mask	= FIO_OPT_G_LIBAIO,
+	},
+	{
+		.name	= "ACT Aerospike like benchmark profile",
+		.mask	= FIO_OPT_G_ACT,
+	},
+	{
+		.name	= "Latency profiling",
+		.mask	= FIO_OPT_G_LATPROF,
+	},
+	{
+		.name	= "RBD I/O engine", /* rbd */
+		.mask	= FIO_OPT_G_RBD,
+	},
+	{
+		.name	= "GlusterFS I/O engine", /* gfapi,gfapi_async */
+		.mask	= FIO_OPT_G_GFAPI,
+	},
+	{
+		.name	= "MTD I/O engine", /* mtd */
+		.mask	= FIO_OPT_G_MTD,
+	},
+	{
+		.name	= "libhdfs I/O engine", /* libhdfs */
+		.mask	= FIO_OPT_G_HDFS,
+	},
+	{
+		.name	= NULL,
+	},
+};
+
+static const struct opt_group *group_from_mask(const struct opt_group *ogs,
+					       uint64_t *mask,
+					       uint64_t inv_mask)
+{
+	int i;
+
+	if (*mask == inv_mask || !*mask)
+		return NULL;
+
+	for (i = 0; ogs[i].name; i++) {
+		const struct opt_group *og = &ogs[i];
+
+		if (*mask & og->mask) {
+			*mask &= ~(og->mask);
+			return og;
+		}
+	}
+
+	return NULL;
+}
+
+const struct opt_group *opt_group_from_mask(uint64_t *mask)
+{
+	return group_from_mask(fio_opt_groups, mask, FIO_OPT_C_INVALID);
+}
+
+const struct opt_group *opt_group_cat_from_mask(uint64_t *mask)
+{
+	return group_from_mask(fio_opt_cat_groups, mask, FIO_OPT_G_INVALID);
+}
diff --git a/optgroup.h b/optgroup.h
new file mode 100644
index 0000000..815ac16
--- /dev/null
+++ b/optgroup.h
@@ -0,0 +1,102 @@
+#ifndef FIO_OPT_GROUP_H
+#define FIO_OPT_GROUP_H
+
+struct opt_group {
+	const char *name;
+	uint64_t mask;
+};
+
+enum opt_category {
+	__FIO_OPT_C_GENERAL	= 0,
+	__FIO_OPT_C_IO,
+	__FIO_OPT_C_FILE,
+	__FIO_OPT_C_STAT,
+	__FIO_OPT_C_LOG,
+	__FIO_OPT_C_PROFILE,
+	__FIO_OPT_C_ENGINE,
+	__FIO_OPT_C_NR,
+
+	FIO_OPT_C_GENERAL	= (1ULL << __FIO_OPT_C_GENERAL),
+	FIO_OPT_C_IO		= (1ULL << __FIO_OPT_C_IO),
+	FIO_OPT_C_FILE		= (1ULL << __FIO_OPT_C_FILE),
+	FIO_OPT_C_STAT		= (1ULL << __FIO_OPT_C_STAT),
+	FIO_OPT_C_LOG		= (1ULL << __FIO_OPT_C_LOG),
+	FIO_OPT_C_PROFILE	= (1ULL << __FIO_OPT_C_PROFILE),
+	FIO_OPT_C_ENGINE	= (1ULL << __FIO_OPT_C_ENGINE),
+	FIO_OPT_C_INVALID	= (1ULL << __FIO_OPT_C_NR),
+};
+
+enum opt_category_group {
+	__FIO_OPT_G_RATE	= 0,
+	__FIO_OPT_G_ZONE,
+	__FIO_OPT_G_RWMIX,
+	__FIO_OPT_G_VERIFY,
+	__FIO_OPT_G_TRIM,
+	__FIO_OPT_G_IOLOG,
+	__FIO_OPT_G_IO_DEPTH,
+	__FIO_OPT_G_IO_FLOW,
+	__FIO_OPT_G_DESC,
+	__FIO_OPT_G_FILENAME,
+	__FIO_OPT_G_IO_BASIC,
+	__FIO_OPT_G_CGROUP,
+	__FIO_OPT_G_RUNTIME,
+	__FIO_OPT_G_PROCESS,
+	__FIO_OPT_G_CRED,
+	__FIO_OPT_G_CLOCK,
+	__FIO_OPT_G_IO_TYPE,
+	__FIO_OPT_G_THINKTIME,
+	__FIO_OPT_G_RANDOM,
+	__FIO_OPT_G_IO_BUF,
+	__FIO_OPT_G_TIOBENCH,
+	__FIO_OPT_G_ERR,
+	__FIO_OPT_G_E4DEFRAG,
+	__FIO_OPT_G_NETIO,
+	__FIO_OPT_G_RDMA,
+	__FIO_OPT_G_LIBAIO,
+	__FIO_OPT_G_ACT,
+	__FIO_OPT_G_LATPROF,
+	__FIO_OPT_G_RBD,
+	__FIO_OPT_G_GFAPI,
+	__FIO_OPT_G_MTD,
+	__FIO_OPT_G_HDFS,
+	__FIO_OPT_G_NR,
+
+	FIO_OPT_G_RATE		= (1ULL << __FIO_OPT_G_RATE),
+	FIO_OPT_G_ZONE		= (1ULL << __FIO_OPT_G_ZONE),
+	FIO_OPT_G_RWMIX		= (1ULL << __FIO_OPT_G_RWMIX),
+	FIO_OPT_G_VERIFY	= (1ULL << __FIO_OPT_G_VERIFY),
+	FIO_OPT_G_TRIM		= (1ULL << __FIO_OPT_G_TRIM),
+	FIO_OPT_G_IOLOG		= (1ULL << __FIO_OPT_G_IOLOG),
+	FIO_OPT_G_IO_DEPTH	= (1ULL << __FIO_OPT_G_IO_DEPTH),
+	FIO_OPT_G_IO_FLOW	= (1ULL << __FIO_OPT_G_IO_FLOW),
+	FIO_OPT_G_DESC		= (1ULL << __FIO_OPT_G_DESC),
+	FIO_OPT_G_FILENAME	= (1ULL << __FIO_OPT_G_FILENAME),
+	FIO_OPT_G_IO_BASIC	= (1ULL << __FIO_OPT_G_IO_BASIC),
+	FIO_OPT_G_CGROUP	= (1ULL << __FIO_OPT_G_CGROUP),
+	FIO_OPT_G_RUNTIME	= (1ULL << __FIO_OPT_G_RUNTIME),
+	FIO_OPT_G_PROCESS	= (1ULL << __FIO_OPT_G_PROCESS),
+	FIO_OPT_G_CRED		= (1ULL << __FIO_OPT_G_CRED),
+	FIO_OPT_G_CLOCK		= (1ULL << __FIO_OPT_G_CLOCK),
+	FIO_OPT_G_IO_TYPE	= (1ULL << __FIO_OPT_G_IO_TYPE),
+	FIO_OPT_G_THINKTIME	= (1ULL << __FIO_OPT_G_THINKTIME),
+	FIO_OPT_G_RANDOM	= (1ULL << __FIO_OPT_G_RANDOM),
+	FIO_OPT_G_IO_BUF	= (1ULL << __FIO_OPT_G_IO_BUF),
+	FIO_OPT_G_TIOBENCH	= (1ULL << __FIO_OPT_G_TIOBENCH),
+	FIO_OPT_G_ERR		= (1ULL << __FIO_OPT_G_ERR),
+	FIO_OPT_G_E4DEFRAG	= (1ULL << __FIO_OPT_G_E4DEFRAG),
+	FIO_OPT_G_NETIO		= (1ULL << __FIO_OPT_G_NETIO),
+	FIO_OPT_G_RDMA		= (1ULL << __FIO_OPT_G_RDMA),
+	FIO_OPT_G_LIBAIO	= (1ULL << __FIO_OPT_G_LIBAIO),
+	FIO_OPT_G_ACT		= (1ULL << __FIO_OPT_G_ACT),
+	FIO_OPT_G_LATPROF	= (1ULL << __FIO_OPT_G_LATPROF),
+	FIO_OPT_G_RBD		= (1ULL << __FIO_OPT_G_RBD),
+	FIO_OPT_G_GFAPI		= (1ULL << __FIO_OPT_G_GFAPI),
+	FIO_OPT_G_MTD		= (1ULL << __FIO_OPT_G_MTD),
+	FIO_OPT_G_HDFS		= (1ULL << __FIO_OPT_G_HDFS),
+	FIO_OPT_G_INVALID	= (1ULL << __FIO_OPT_G_NR),
+};
+
+extern const struct opt_group *opt_group_from_mask(uint64_t *mask);
+extern const struct opt_group *opt_group_cat_from_mask(uint64_t *mask);
+
+#endif
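
A sketch of walking a category bitmask with the new helpers; the print function is hypothetical:

#include <stdio.h>
#include <inttypes.h>
#include "optgroup.h"

/* Print every category name encoded in a bitmask, one bit at a time */
void print_opt_categories(uint64_t mask)
{
	const struct opt_group *og;

	while ((og = opt_group_from_mask(&mask)) != NULL)
		printf("%s\n", og->name);
}

/* e.g. print_opt_categories(FIO_OPT_C_IO | FIO_OPT_C_FILE)
 * prints "I/O" and "File".
 */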
diff --git a/options.c b/options.c
index ab6e399..b489e90 100644
--- a/options.c
+++ b/options.c
@@ -8,14 +8,27 @@
 #include <fcntl.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <netinet/in.h>
 
 #include "fio.h"
 #include "verify.h"
 #include "parse.h"
 #include "lib/fls.h"
+#include "lib/pattern.h"
 #include "options.h"
+#include "optgroup.h"
 
-#include "crc/crc32c.h"
+char client_sockaddr_str[INET6_ADDRSTRLEN] = { 0 };
+
+#define cb_data_to_td(data)	container_of(data, struct thread_data, o)
+
+static struct pattern_fmt_desc fmt_desc[] = {
+	{
+		.fmt   = "%o",
+		.len   = FIELD_SIZE(struct io_u *, offset),
+		.paste = paste_blockoff
+	}
+};
 
 /*
  * Check if mmap/mmaphuge has a :/foo/bar/file at the end. If so, return that.
@@ -33,63 +46,36 @@
 	return strdup(p);
 }
 
-static int converthexchartoint(char a)
-{
-	int base;
-
-	switch (a) {
-	case '0'...'9':
-		base = '0';
-		break;
-	case 'A'...'F':
-		base = 'A' - 10;
-		break;
-	case 'a'...'f':
-		base = 'a' - 10;
-		break;
-	default:
-		base = 0;
-	}
-	return a - base;
-}
-
 static int bs_cmp(const void *p1, const void *p2)
 {
 	const struct bssplit *bsp1 = p1;
 	const struct bssplit *bsp2 = p2;
 
-	return bsp1->perc < bsp2->perc;
+	return (int) bsp1->perc - (int) bsp2->perc;
 }
 
-static int bssplit_ddir(struct thread_options *o, int ddir, char *str)
+struct split {
+	unsigned int nr;
+	unsigned int val1[100];
+	unsigned int val2[100];
+};
+
+static int split_parse_ddir(struct thread_options *o, struct split *split,
+			    enum fio_ddir ddir, char *str)
 {
-	struct bssplit *bssplit;
-	unsigned int i, perc, perc_missing;
-	unsigned int max_bs, min_bs;
+	unsigned int i, perc;
 	long long val;
 	char *fname;
 
-	o->bssplit_nr[ddir] = 4;
-	bssplit = malloc(4 * sizeof(struct bssplit));
+	split->nr = 0;
 
 	i = 0;
-	max_bs = 0;
-	min_bs = -1;
 	while ((fname = strsep(&str, ":")) != NULL) {
 		char *perc_str;
 
 		if (!strlen(fname))
 			break;
 
-		/*
-		 * grow struct buffer, if needed
-		 */
-		if (i == o->bssplit_nr[ddir]) {
-			o->bssplit_nr[ddir] <<= 1;
-			bssplit = realloc(bssplit, o->bssplit_nr[ddir]
-						  * sizeof(struct bssplit));
-		}
-
 		perc_str = strstr(fname, "/");
 		if (perc_str) {
 			*perc_str = '\0';
@@ -104,28 +90,53 @@
 
 		if (str_to_decimal(fname, &val, 1, o, 0, 0)) {
 			log_err("fio: bssplit conversion failed\n");
-			free(bssplit);
 			return 1;
 		}
 
-		if (val > max_bs)
-			max_bs = val;
-		if (val < min_bs)
-			min_bs = val;
-
-		bssplit[i].bs = val;
-		bssplit[i].perc = perc;
+		split->val1[i] = val;
+		split->val2[i] = perc;
 		i++;
+		if (i == 100)
+			break;
 	}
 
-	o->bssplit_nr[ddir] = i;
+	split->nr = i;
+	return 0;
+}
+
+static int bssplit_ddir(struct thread_options *o, enum fio_ddir ddir, char *str)
+{
+	unsigned int i, perc, perc_missing;
+	unsigned int max_bs, min_bs;
+	struct split split;
+
+	memset(&split, 0, sizeof(split));
+
+	if (split_parse_ddir(o, &split, ddir, str))
+		return 1;
+	if (!split.nr)
+		return 0;
+
+	max_bs = 0;
+	min_bs = -1;
+	o->bssplit[ddir] = malloc(split.nr * sizeof(struct bssplit));
+	o->bssplit_nr[ddir] = split.nr;
+	for (i = 0; i < split.nr; i++) {
+		if (split.val1[i] > max_bs)
+			max_bs = split.val1[i];
+		if (split.val1[i] < min_bs)
+			min_bs = split.val1[i];
+
+		o->bssplit[ddir][i].bs = split.val1[i];
+		o->bssplit[ddir][i].perc = split.val2[i];
+	}
 
 	/*
 	 * Now check if the percentages add up, and how much is missing
 	 */
 	perc = perc_missing = 0;
 	for (i = 0; i < o->bssplit_nr[ddir]; i++) {
-		struct bssplit *bsp = &bssplit[i];
+		struct bssplit *bsp = &o->bssplit[ddir][i];
 
 		if (bsp->perc == -1U)
 			perc_missing++;
@@ -135,7 +146,8 @@
 
 	if (perc > 100 && perc_missing > 1) {
 		log_err("fio: bssplit percentages add to more than 100%%\n");
-		free(bssplit);
+		free(o->bssplit[ddir]);
+		o->bssplit[ddir] = NULL;
 		return 1;
 	}
 
@@ -147,7 +159,7 @@
 		if (perc_missing == 1 && o->bssplit_nr[ddir] == 1)
 			perc = 100;
 		for (i = 0; i < o->bssplit_nr[ddir]; i++) {
-			struct bssplit *bsp = &bssplit[i];
+			struct bssplit *bsp = &o->bssplit[ddir][i];
 
 			if (bsp->perc == -1U)
 				bsp->perc = (100 - perc) / perc_missing;
@@ -160,59 +172,78 @@
 	/*
 	 * now sort based on percentages, for ease of lookup
 	 */
-	qsort(bssplit, o->bssplit_nr[ddir], sizeof(struct bssplit), bs_cmp);
-	o->bssplit[ddir] = bssplit;
+	qsort(o->bssplit[ddir], o->bssplit_nr[ddir], sizeof(struct bssplit), bs_cmp);
 	return 0;
 }
 
-static int str_bssplit_cb(void *data, const char *input)
+typedef int (split_parse_fn)(struct thread_options *, enum fio_ddir, char *);
+
+static int str_split_parse(struct thread_data *td, char *str, split_parse_fn *fn)
 {
-	struct thread_data *td = data;
-	char *str, *p, *odir, *ddir;
+	char *odir, *ddir;
 	int ret = 0;
 
-	if (parse_dryrun())
-		return 0;
-
-	p = str = strdup(input);
-
-	strip_blank_front(&str);
-	strip_blank_end(str);
-
 	odir = strchr(str, ',');
 	if (odir) {
 		ddir = strchr(odir + 1, ',');
 		if (ddir) {
-			ret = bssplit_ddir(&td->o, DDIR_TRIM, ddir + 1);
+			ret = fn(&td->o, DDIR_TRIM, ddir + 1);
 			if (!ret)
 				*ddir = '\0';
 		} else {
 			char *op;
 
 			op = strdup(odir + 1);
-			ret = bssplit_ddir(&td->o, DDIR_TRIM, op);
+			ret = fn(&td->o, DDIR_TRIM, op);
 
 			free(op);
 		}
 		if (!ret)
-			ret = bssplit_ddir(&td->o, DDIR_WRITE, odir + 1);
+			ret = fn(&td->o, DDIR_WRITE, odir + 1);
 		if (!ret) {
 			*odir = '\0';
-			ret = bssplit_ddir(&td->o, DDIR_READ, str);
+			ret = fn(&td->o, DDIR_READ, str);
 		}
 	} else {
 		char *op;
 
 		op = strdup(str);
-		ret = bssplit_ddir(&td->o, DDIR_WRITE, op);
+		ret = fn(&td->o, DDIR_WRITE, op);
 		free(op);
 
 		if (!ret) {
 			op = strdup(str);
-			ret = bssplit_ddir(&td->o, DDIR_TRIM, op);
+			ret = fn(&td->o, DDIR_TRIM, op);
 			free(op);
 		}
-		ret = bssplit_ddir(&td->o, DDIR_READ, str);
+		if (!ret)
+			ret = fn(&td->o, DDIR_READ, str);
+	}
+
+	return ret;
+}
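
The comma splitting above follows fio's per-direction convention: up to three ','-separated fields map to read, write and trim, and missing trailing fields fall back to the last field given. A hypothetical demo of just that slicing, with no fio structures involved:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char input[] = "4k/50:64k/50,8k/100";	/* read split, then write (and trim) */
	char *odir = strchr(input, ',');
	const char *rd = input, *wr = input, *tr = input;

	if (odir) {
		char *ddir = strchr(odir + 1, ',');

		wr = tr = odir + 1;
		if (ddir) {
			tr = ddir + 1;
			*ddir = '\0';
		}
		*odir = '\0';
	}
	printf("read='%s' write='%s' trim='%s'\n", rd, wr, tr);
	return 0;
}
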
+
+static int str_bssplit_cb(void *data, const char *input)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	char *str, *p;
+	int ret = 0;
+
+	p = str = strdup(input);
+
+	strip_blank_front(&str);
+	strip_blank_end(str);
+
+	ret = str_split_parse(td, str, bssplit_ddir);
+
+	if (parse_dryrun()) {
+		int i;
+
+		for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+			free(td->o.bssplit[i]);
+			td->o.bssplit[i] = NULL;
+			td->o.bssplit_nr[i] = 0;
+		}
 	}
 
 	free(p);
@@ -229,7 +260,7 @@
 			    "EINVAL", "ENFILE", "EMFILE", "ENOTTY",
 			    "ETXTBSY","EFBIG", "ENOSPC", "ESPIPE",
 			    "EROFS","EMLINK", "EPIPE", "EDOM", "ERANGE" };
-	int i = 0, num = sizeof(err) / sizeof(void *);
+	int i = 0, num = sizeof(err) / sizeof(char *);
 
 	while (i < num) {
 		if (!strcmp(err[i], str))
@@ -295,7 +326,7 @@
 
 static int str_ignore_error_cb(void *data, const char *input)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 	char *str, *p, *n;
 	int type = 0, ret = 1;
 
@@ -323,7 +354,7 @@
 
 static int str_rw_cb(void *data, const char *str)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 	struct thread_options *o = &td->o;
 	char *nr;
 
@@ -357,9 +388,10 @@
 
 static int str_mem_cb(void *data, const char *mem)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 
-	if (td->o.mem_type == MEM_MMAPHUGE || td->o.mem_type == MEM_MMAP)
+	if (td->o.mem_type == MEM_MMAPHUGE || td->o.mem_type == MEM_MMAP ||
+	    td->o.mem_type == MEM_MMAPSHARED)
 		td->o.mmapfile = get_opt_postfix(mem);
 
 	return 0;
@@ -367,7 +399,7 @@
 
 static int fio_clock_source_cb(void *data, const char *str)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 
 	fio_clock_source = td->o.clocksource;
 	fio_clock_source_set = 1;
@@ -377,7 +409,7 @@
 
 static int str_rwmix_read_cb(void *data, unsigned long long *val)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 
 	td->o.rwmix[DDIR_READ] = *val;
 	td->o.rwmix[DDIR_WRITE] = 100 - *val;
@@ -386,7 +418,7 @@
 
 static int str_rwmix_write_cb(void *data, unsigned long long *val)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 
 	td->o.rwmix[DDIR_WRITE] = *val;
 	td->o.rwmix[DDIR_READ] = 100 - *val;
@@ -424,7 +456,7 @@
 
 static int str_cpumask_cb(void *data, unsigned long long *val)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 	unsigned int i;
 	long max_cpu;
 	int ret;
@@ -443,9 +475,9 @@
 
 	for (i = 0; i < sizeof(int) * 8; i++) {
 		if ((1 << i) & *val) {
-			if (i > max_cpu) {
+			if (i >= max_cpu) {
 				log_err("fio: CPU %d too large (max=%ld)\n", i,
-								max_cpu);
+								max_cpu - 1);
 				return 1;
 			}
 			dprint(FD_PARSE, "set cpu allowed %d\n", i);
@@ -503,9 +535,9 @@
 				ret = 1;
 				break;
 			}
-			if (icpu > max_cpu) {
+			if (icpu >= max_cpu) {
 				log_err("fio: CPU %d too large (max=%ld)\n",
-							icpu, max_cpu);
+							icpu, max_cpu - 1);
 				ret = 1;
 				break;
 			}
@@ -524,7 +556,7 @@
 
 static int str_cpus_allowed_cb(void *data, const char *input)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 
 	if (parse_dryrun())
 		return 0;
@@ -534,16 +566,32 @@
 
 static int str_verify_cpus_allowed_cb(void *data, const char *input)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
+
+	if (parse_dryrun())
+		return 0;
 
 	return set_cpus_allowed(td, &td->o.verify_cpumask, input);
 }
-#endif
+
+#ifdef CONFIG_ZLIB
+static int str_log_cpus_allowed_cb(void *data, const char *input)
+{
+	struct thread_data *td = cb_data_to_td(data);
+
+	if (parse_dryrun())
+		return 0;
+
+	return set_cpus_allowed(td, &td->o.log_gz_cpumask, input);
+}
+#endif /* CONFIG_ZLIB */
+
+#endif /* FIO_HAVE_CPU_AFFINITY */
 
 #ifdef CONFIG_LIBNUMA
 static int str_numa_cpunodes_cb(void *data, char *input)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 	struct bitmask *verify_bitmask;
 
 	if (parse_dryrun())
@@ -568,7 +616,7 @@
 
 static int str_numa_mpol_cb(void *data, char *input)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 	const char * const policy_types[] =
 		{ "default", "prefer", "bind", "interleave", "local", NULL };
 	int i;
@@ -677,13 +725,78 @@
 
 static int str_fst_cb(void *data, const char *str)
 {
-	struct thread_data *td = data;
-	char *nr = get_opt_postfix(str);
+	struct thread_data *td = cb_data_to_td(data);
+	double val;
+	bool done = false;
+	char *nr;
 
 	td->file_service_nr = 1;
-	if (nr) {
-		td->file_service_nr = atoi(nr);
+
+	switch (td->o.file_service_type) {
+	case FIO_FSERVICE_RANDOM:
+	case FIO_FSERVICE_RR:
+	case FIO_FSERVICE_SEQ:
+		nr = get_opt_postfix(str);
+		if (nr) {
+			td->file_service_nr = atoi(nr);
+			free(nr);
+		}
+		done = true;
+		break;
+	case FIO_FSERVICE_ZIPF:
+		val = FIO_DEF_ZIPF;
+		break;
+	case FIO_FSERVICE_PARETO:
+		val = FIO_DEF_PARETO;
+		break;
+	case FIO_FSERVICE_GAUSS:
+		val = 0.0;
+		break;
+	default:
+		log_err("fio: bad file service type: %d\n", td->o.file_service_type);
+		return 1;
+	}
+
+	if (done)
+		return 0;
+
+	nr = get_opt_postfix(str);
+	if (nr && !str_to_float(nr, &val, 0)) {
+		log_err("fio: file service type random postfix parsing failed\n");
 		free(nr);
+		return 1;
+	}
+
+	free(nr);
+
+	switch (td->o.file_service_type) {
+	case FIO_FSERVICE_ZIPF:
+		if (val == 1.00) {
+			log_err("fio: zipf theta must be different than 1.0\n");
+			return 1;
+		}
+		if (parse_dryrun())
+			return 0;
+		td->zipf_theta = val;
+		break;
+	case FIO_FSERVICE_PARETO:
+		if (val <= 0.00 || val >= 1.00) {
+			log_err("fio: pareto input out of range (0 < input < 1.0)\n");
+			return 1;
+		}
+		if (parse_dryrun())
+			return 0;
+		td->pareto_h = val;
+		break;
+	case FIO_FSERVICE_GAUSS:
+		if (val < 0.00 || val >= 100.00) {
+			log_err("fio: normal deviation out of range (0 <= input < 100.0)\n");
+			return 1;
+		}
+		if (parse_dryrun())
+			return 0;
+		td->gauss_dev = val;
+		break;
 	}
 
 	return 0;
@@ -692,7 +805,7 @@
 #ifdef CONFIG_SYNC_FILE_RANGE
 static int str_sfr_cb(void *data, const char *str)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 	char *nr = get_opt_postfix(str);
 
 	td->sync_file_range_nr = 1;
@@ -705,19 +818,208 @@
 }
 #endif
 
+static int zone_cmp(const void *p1, const void *p2)
+{
+	const struct zone_split *zsp1 = p1;
+	const struct zone_split *zsp2 = p2;
+
+	return (int) zsp2->access_perc - (int) zsp1->access_perc;
+}
+
+static int zone_split_ddir(struct thread_options *o, enum fio_ddir ddir,
+			   char *str)
+{
+	unsigned int i, perc, perc_missing, sperc, sperc_missing;
+	struct split split;
+
+	memset(&split, 0, sizeof(split));
+
+	if (split_parse_ddir(o, &split, ddir, str))
+		return 1;
+	if (!split.nr)
+		return 0;
+
+	o->zone_split[ddir] = malloc(split.nr * sizeof(struct zone_split));
+	o->zone_split_nr[ddir] = split.nr;
+	for (i = 0; i < split.nr; i++) {
+		o->zone_split[ddir][i].access_perc = split.val1[i];
+		o->zone_split[ddir][i].size_perc = split.val2[i];
+	}
+
+	/*
+	 * Now check if the percentages add up, and how much is missing
+	 */
+	perc = perc_missing = 0;
+	sperc = sperc_missing = 0;
+	for (i = 0; i < o->zone_split_nr[ddir]; i++) {
+		struct zone_split *zsp = &o->zone_split[ddir][i];
+
+		if (zsp->access_perc == (uint8_t) -1U)
+			perc_missing++;
+		else
+			perc += zsp->access_perc;
+
+		if (zsp->size_perc == (uint8_t) -1U)
+			sperc_missing++;
+		else
+			sperc += zsp->size_perc;
+
+	}
+
+	if (perc > 100 || sperc > 100) {
+		log_err("fio: zone_split percentages add to more than 100%%\n");
+		free(o->zone_split[ddir]);
+		o->zone_split[ddir] = NULL;
+		return 1;
+	}
+	if (perc < 100) {
+		log_err("fio: access percentages don't add up to 100 for zoned "
+			"random distribution (got=%u)\n", perc);
+		free(o->zone_split[ddir]);
+		o->zone_split[ddir] = NULL;
+		return 1;
+	}
+
+	/*
+	 * If values didn't have a percentage set, divide the remains between
+	 * them.
+	 */
+	if (perc_missing) {
+		if (perc_missing == 1 && o->zone_split_nr[ddir] == 1)
+			perc = 100;
+		for (i = 0; i < o->zone_split_nr[ddir]; i++) {
+			struct zone_split *zsp = &o->zone_split[ddir][i];
+
+			if (zsp->access_perc == (uint8_t) -1U)
+				zsp->access_perc = (100 - perc) / perc_missing;
+		}
+	}
+	if (sperc_missing) {
+		if (sperc_missing == 1 && o->zone_split_nr[ddir] == 1)
+			sperc = 100;
+		for (i = 0; i < o->zone_split_nr[ddir]; i++) {
+			struct zone_split *zsp = &o->zone_split[ddir][i];
+
+			if (zsp->size_perc == (uint8_t) -1U)
+				zsp->size_perc = (100 - sperc) / sperc_missing;
+		}
+	}
+
+	/*
+	 * now sort based on percentages, for ease of lookup
+	 */
+	qsort(o->zone_split[ddir], o->zone_split_nr[ddir], sizeof(struct zone_split), zone_cmp);
+	return 0;
+}
+
+static void __td_zone_gen_index(struct thread_data *td, enum fio_ddir ddir)
+{
+	unsigned int i, j, sprev, aprev;
+
+	td->zone_state_index[ddir] = malloc(sizeof(struct zone_split_index) * 100);
+
+	sprev = aprev = 0;
+	for (i = 0; i < td->o.zone_split_nr[ddir]; i++) {
+		struct zone_split *zsp = &td->o.zone_split[ddir][i];
+
+		for (j = aprev; j < aprev + zsp->access_perc; j++) {
+			struct zone_split_index *zsi = &td->zone_state_index[ddir][j];
+
+			zsi->size_perc = sprev + zsp->size_perc;
+			zsi->size_perc_prev = sprev;
+		}
+
+		aprev += zsp->access_perc;
+		sprev += zsp->size_perc;
+	}
+}
+
+/*
+ * Generate state table for indexes, so we don't have to do it inline from
+ * the hot IO path
+ */
+static void td_zone_gen_index(struct thread_data *td)
+{
+	int i;
+
+	td->zone_state_index = malloc(DDIR_RWDIR_CNT *
+					sizeof(struct zone_split_index *));
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++)
+		__td_zone_gen_index(td, i);
+}
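
To make the generated table concrete, here is a self-contained sketch that expands a two-zone split the way __td_zone_gen_index() does and prints a few lookups; the struct names are local stand-ins for the zone_split/zone_split_index types used above:

#include <stdio.h>

struct zsplit { unsigned int access_perc, size_perc; };
struct zidx { unsigned int size_perc, size_perc_prev; };

int main(void)
{
	/* zoned:80/20:20/80 - 80% of accesses hit the first 20% of the range */
	struct zsplit zones[] = { { 80, 20 }, { 20, 80 } };
	struct zidx table[100];
	unsigned int i, j, aprev = 0, sprev = 0;

	for (i = 0; i < 2; i++) {
		for (j = aprev; j < aprev + zones[i].access_perc; j++) {
			table[j].size_perc = sprev + zones[i].size_perc;
			table[j].size_perc_prev = sprev;
		}
		aprev += zones[i].access_perc;
		sprev += zones[i].size_perc;
	}

	/* A random access bucket (0-99) maps to a [prev%, perc%) size window */
	for (i = 0; i < 100; i += 30)
		printf("bucket %2u -> offset window [%u%%, %u%%)\n", i,
		       table[i].size_perc_prev, table[i].size_perc);
	return 0;
}
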
+
+static int parse_zoned_distribution(struct thread_data *td, const char *input)
+{
+	char *str, *p;
+	int i, ret = 0;
+
+	p = str = strdup(input);
+
+	strip_blank_front(&str);
+	strip_blank_end(str);
+
+	/* We expect the string to start with "zoned:"; bail if not */
+	if (strncmp(str, "zoned:", 6)) {
+		log_err("fio: mismatch in zoned input <%s>\n", str);
+		free(p);
+		return 1;
+	}
+	str += strlen("zoned:");
+
+	ret = str_split_parse(td, str, zone_split_ddir);
+
+	free(p);
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		int j;
+
+		dprint(FD_PARSE, "zone ddir %d (nr=%u):\n", i, td->o.zone_split_nr[i]);
+
+		for (j = 0; j < td->o.zone_split_nr[i]; j++) {
+			struct zone_split *zsp = &td->o.zone_split[i][j];
+
+			dprint(FD_PARSE, "\t%d: %u/%u\n", j, zsp->access_perc,
+								zsp->size_perc);
+		}
+	}
+
+	if (parse_dryrun()) {
+		int i;
+
+		for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+			free(td->o.zone_split[i]);
+			td->o.zone_split[i] = NULL;
+			td->o.zone_split_nr[i] = 0;
+		}
+
+		return ret;
+	}
+
+	if (!ret)
+		td_zone_gen_index(td);
+	else {
+		for (i = 0; i < DDIR_RWDIR_CNT; i++)
+			td->o.zone_split_nr[i] = 0;
+	}
+
+	return ret;
+}
+
 static int str_random_distribution_cb(void *data, const char *str)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 	double val;
 	char *nr;
 
-	if (parse_dryrun())
-		return 0;
-
 	if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
 		val = FIO_DEF_ZIPF;
 	else if (td->o.random_distribution == FIO_RAND_DIST_PARETO)
 		val = FIO_DEF_PARETO;
+	else if (td->o.random_distribution == FIO_RAND_DIST_GAUSS)
+		val = 0.0;
+	else if (td->o.random_distribution == FIO_RAND_DIST_ZONED)
+		return parse_zoned_distribution(td, str);
 	else
 		return 0;
 
@@ -735,18 +1037,102 @@
 			log_err("fio: zipf theta must be different than 1.0\n");
 			return 1;
 		}
+		if (parse_dryrun())
+			return 0;
 		td->o.zipf_theta.u.f = val;
-	} else {
+	} else if (td->o.random_distribution == FIO_RAND_DIST_PARETO) {
 		if (val <= 0.00 || val >= 1.00) {
 			log_err("fio: pareto input out of range (0 < input < 1.0)\n");
 			return 1;
 		}
+		if (parse_dryrun())
+			return 0;
 		td->o.pareto_h.u.f = val;
+	} else {
+		if (val < 0.00 || val >= 100.0) {
+			log_err("fio: normal deviation out of range (0 <= input < 100.0)\n");
+			return 1;
+		}
+		if (parse_dryrun())
+			return 0;
+		td->o.gauss_dev.u.f = val;
 	}
 
 	return 0;
 }
 
+static int str_steadystate_cb(void *data, const char *str)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	double val;
+	char *nr;
+	char *pct;
+	long long ll;
+
+	if (td->o.ss_state != FIO_SS_IOPS && td->o.ss_state != FIO_SS_IOPS_SLOPE &&
+	    td->o.ss_state != FIO_SS_BW && td->o.ss_state != FIO_SS_BW_SLOPE) {
+		/* should be impossible to get here */
+		log_err("fio: unknown steady state criterion\n");
+		return 1;
+	}
+
+	nr = get_opt_postfix(str);
+	if (!nr) {
+		log_err("fio: steadystate threshold must be specified in addition to criterion\n");
+		free(nr);
+		return 1;
+	}
+
+	/* ENHANCEMENT Allow fio to understand size=10.2% and use here */
+	pct = strstr(nr, "%");
+	if (pct) {
+		*pct = '\0';
+		strip_blank_end(nr);
+		if (!str_to_float(nr, &val, 0))	{
+			log_err("fio: could not parse steadystate threshold percentage\n");
+			free(nr);
+			return 1;
+		}
+
+		dprint(FD_PARSE, "set steady state threshold to %f%%\n", val);
+		free(nr);
+		if (parse_dryrun())
+			return 0;
+
+		td->o.ss_state |= __FIO_SS_PCT;
+		td->o.ss_limit.u.f = val;
+	} else if (td->o.ss_state & __FIO_SS_IOPS) {
+		if (!str_to_float(nr, &val, 0)) {
+			log_err("fio: steadystate IOPS threshold postfix parsing failed\n");
+			free(nr);
+			return 1;
+		}
+
+		dprint(FD_PARSE, "set steady state IOPS threshold to %f\n", val);
+		free(nr);
+		if (parse_dryrun())
+			return 0;
+
+		td->o.ss_limit.u.f = val;
+	} else {	/* bandwidth criterion */
+		if (str_to_decimal(nr, &ll, 1, td, 0, 0)) {
+			log_err("fio: steadystate BW threshold postfix parsing failed\n");
+			free(nr);
+			return 1;
+		}
+
+		dprint(FD_PARSE, "set steady state BW threshold to %lld\n", ll);
+		free(nr);
+		if (parse_dryrun())
+			return 0;
+
+		td->o.ss_limit.u.f = (double) ll;
+	}
+
+	td->ss.state = td->o.ss_state;
+	return 0;
+}
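
For reference, the value handled by the callback above is a criterion token followed by a ':'-separated threshold, optionally ending in '%'. The criterion names below ("iops", "iops_slope", "bw") are assumptions inferred from the FIO_SS_* states being checked; the parsing itself is a simplified stand-in, not fio's option machinery:

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *samples[] = { "iops:10%", "iops_slope:5", "bw:2500" };
	unsigned int i;

	for (i = 0; i < 3; i++) {
		char buf[32], *thresh;
		int is_pct;

		snprintf(buf, sizeof(buf), "%s", samples[i]);
		thresh = strchr(buf, ':');
		if (!thresh)
			continue;
		*thresh++ = '\0';	/* split criterion from threshold */
		is_pct = strchr(thresh, '%') != NULL;
		printf("criterion=%s threshold=%s %s\n", buf, thresh,
		       is_pct ? "(percentage of mean)" : "(absolute)");
	}
	return 0;
}
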
+
 /*
  * Return next name in the string. Files are separated with ':'. If the ':'
  * is escaped with a '\', then that ':' is part of the filename and does not
@@ -812,7 +1198,8 @@
  * Returns the directory at the index, indexes > entires will be
  * assigned via modulo division of the index
  */
-int set_name_idx(char *target, char *input, int index)
+int set_name_idx(char *target, size_t tlen, char *input, int index,
+		 bool unique_filename)
 {
 	unsigned int cur_idx;
 	int len;
@@ -824,7 +1211,13 @@
 	for (cur_idx = 0; cur_idx <= index; cur_idx++)
 		fname = get_next_name(&str);
 
-	len = sprintf(target, "%s/", fname);
+	if (client_sockaddr_str[0] && unique_filename) {
+		len = snprintf(target, tlen, "%s/%s.", fname,
+				client_sockaddr_str);
+	} else
+		len = snprintf(target, tlen, "%s/", fname);
+
+	target[tlen - 1] = '\0';
 	free(p);
 
 	return len;
@@ -832,7 +1225,7 @@
 
 static int str_filename_cb(void *data, const char *input)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 	char *fname, *str, *p;
 
 	p = str = strdup(input);
@@ -840,6 +1233,9 @@
 	strip_blank_front(&str);
 	strip_blank_end(str);
 
+	/*
+	 * Ignore what we may already have from nrfiles option.
+	 */
 	if (!td->files_index)
 		td->o.nr_files = 0;
 
@@ -855,7 +1251,7 @@
 
 static int str_directory_cb(void *data, const char fio_unused *unused)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 	struct stat sb;
 	char *dirname, *str, *p;
 	int ret = 0;
@@ -886,7 +1282,7 @@
 
 static int str_opendir_cb(void *data, const char fio_unused *str)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 
 	if (parse_dryrun())
 		return 0;
@@ -897,127 +1293,39 @@
 	return add_dir_files(td, td->o.opendir);
 }
 
-static int pattern_cb(char *pattern, unsigned int max_size,
-		      const char *input, unsigned int *pattern_bytes)
-{
-	long off;
-	int i = 0, j = 0, len, k, base = 10;
-	uint32_t pattern_length;
-	char *loc1, *loc2;
-
-	/*
-	 * Check if it's a string input
-	 */
-	loc1 = strchr(input, '\"');
-	if (loc1) {
-		do {
-			loc1++;
-			if (*loc1 == '\0' || *loc1 == '\"')
-				break;
-
-			pattern[i] = *loc1;
-			i++;
-		} while (i < max_size);
-
-		if (!i)
-			return 1;
-
-		goto fill;
-	}
-
-	/*
-	 * No string, find out if it's decimal or hexidecimal
-	 */
-	loc1 = strstr(input, "0x");
-	loc2 = strstr(input, "0X");
-	if (loc1 || loc2)
-		base = 16;
-	off = strtol(input, NULL, base);
-	if (off != LONG_MAX || errno != ERANGE) {
-		while (off) {
-			pattern[i] = off & 0xff;
-			off >>= 8;
-			i++;
-		}
-	} else {
-		len = strlen(input);
-		k = len - 1;
-		if (base == 16) {
-			if (loc1)
-				j = loc1 - input + 2;
-			else
-				j = loc2 - input + 2;
-		} else
-			return 1;
-		if (len - j < max_size * 2) {
-			while (k >= j) {
-				off = converthexchartoint(input[k--]);
-				if (k >= j)
-					off += (converthexchartoint(input[k--])
-						* 16);
-				pattern[i++] = (char) off;
-			}
-		}
-	}
-
-	/*
-	 * Fill the pattern all the way to the end. This greatly reduces
-	 * the number of memcpy's we have to do when verifying the IO.
-	 */
-fill:
-	pattern_length = i;
-	while (i > 1 && i * 2 <= max_size) {
-		memcpy(&pattern[i], &pattern[0], i);
-		i *= 2;
-	}
-
-	/*
-	 * Fill remainder, if the pattern multiple ends up not being
-	 * max_size.
-	 */
-	while (i > 1 && i < max_size) {
-		unsigned int b = min(pattern_length, max_size - i);
-
-		memcpy(&pattern[i], &pattern[0], b);
-		i += b;
-	}
-
-	if (i == 1) {
-		/*
-		 * The code in verify_io_u_pattern assumes a single byte
-		 * pattern fills the whole verify pattern buffer.
-		 */
-		memset(pattern, pattern[0], max_size);
-	}
-
-	*pattern_bytes = i;
-	return 0;
-}
-
 static int str_buffer_pattern_cb(void *data, const char *input)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 	int ret;
 
-	ret = pattern_cb(td->o.buffer_pattern, MAX_PATTERN_SIZE, input,
-				&td->o.buffer_pattern_bytes);
+	/* FIXME: for now buffer pattern does not support formats */
+	ret = parse_and_fill_pattern(input, strlen(input), td->o.buffer_pattern,
+				     MAX_PATTERN_SIZE, NULL, 0, NULL, NULL);
+	if (ret < 0)
+		return 1;
 
-	if (!ret && td->o.buffer_pattern_bytes) {
-		if (!td->o.compress_percentage)
-			td->o.refill_buffers = 0;
-		td->o.scramble_buffers = 0;
-		td->o.zero_buffers = 0;
-	} else {
-		log_err("fio: failed parsing pattern `%s`\n", input);
-		ret = 1;
-	}
+	assert(ret != 0);
+	td->o.buffer_pattern_bytes = ret;
 
-	return ret;
+	/*
+	 * If this job is doing any reading or has compression set,
+	 * ensure that we refill buffers for writes or we could be
+	 * invalidating the pattern through reads.
+	 */
+	if (!td->o.compress_percentage && !td_read(td))
+		td->o.refill_buffers = 0;
+	else
+		td->o.refill_buffers = 1;
+
+	td->o.scramble_buffers = 0;
+	td->o.zero_buffers = 0;
+
+	return 0;
 }
 
 static int str_buffer_compress_cb(void *data, unsigned long long *il)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 
 	td->flags |= TD_F_COMPRESS;
 	td->o.compress_percentage = *il;
@@ -1026,7 +1334,7 @@
 
 static int str_dedupe_cb(void *data, unsigned long long *il)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 
 	td->flags |= TD_F_COMPRESS;
 	td->o.dedupe_percentage = *il;
@@ -1036,24 +1344,30 @@
 
 static int str_verify_pattern_cb(void *data, const char *input)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 	int ret;
 
-	ret = pattern_cb(td->o.verify_pattern, MAX_PATTERN_SIZE, input,
-				&td->o.verify_pattern_bytes);
+	td->o.verify_fmt_sz = ARRAY_SIZE(td->o.verify_fmt);
+	ret = parse_and_fill_pattern(input, strlen(input), td->o.verify_pattern,
+				     MAX_PATTERN_SIZE, fmt_desc, sizeof(fmt_desc),
+				     td->o.verify_fmt, &td->o.verify_fmt_sz);
+	if (ret < 0)
+		return 1;
 
+	assert(ret != 0);
+	td->o.verify_pattern_bytes = ret;
 	/*
-	 * VERIFY_META could already be set
+	 * VERIFY_* could already be set
 	 */
-	if (!ret && td->o.verify == VERIFY_NONE)
+	if (!fio_option_is_set(&td->o, verify))
 		td->o.verify = VERIFY_PATTERN;
 
-	return ret;
+	return 0;
 }
 
 static int str_gtod_reduce_cb(void *data, int *il)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 	int val = *il;
 
 	td->o.disable_lat = !!val;
@@ -1069,7 +1383,7 @@
 
 static int str_size_cb(void *data, unsigned long long *__val)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 	unsigned long long v = *__val;
 
 	if (parse_is_percent(v)) {
@@ -1081,9 +1395,53 @@
 	return 0;
 }
 
+static int str_write_bw_log_cb(void *data, const char *str)
+{
+	struct thread_data *td = cb_data_to_td(data);
+
+	if (str)
+		td->o.bw_log_file = strdup(str);
+
+	td->o.write_bw_log = 1;
+	return 0;
+}
+
+static int str_write_lat_log_cb(void *data, const char *str)
+{
+	struct thread_data *td = cb_data_to_td(data);
+
+	if (str)
+		td->o.lat_log_file = strdup(str);
+
+	td->o.write_lat_log = 1;
+	return 0;
+}
+
+static int str_write_iops_log_cb(void *data, const char *str)
+{
+	struct thread_data *td = cb_data_to_td(data);
+
+	if (str)
+		td->o.iops_log_file = strdup(str);
+
+	td->o.write_iops_log = 1;
+	return 0;
+}
+
+static int str_write_hist_log_cb(void *data, const char *str)
+{
+	struct thread_data *td = cb_data_to_td(data);
+
+	if (str)
+		td->o.hist_log_file = strdup(str);
+
+	td->o.write_hist_log = 1;
+	return 0;
+}
+
 static int rw_verify(struct fio_option *o, void *data)
 {
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 
 	if (read_only && td_write(td)) {
 		log_err("fio: job <%s> has write bit set, but fio is in"
@@ -1097,7 +1455,7 @@
 static int gtod_cpu_verify(struct fio_option *o, void *data)
 {
 #ifndef FIO_HAVE_CPU_AFFINITY
-	struct thread_data *td = data;
+	struct thread_data *td = cb_data_to_td(data);
 
 	if (td->o.gtod_cpu) {
 		log_err("fio: platform must support CPU affinity for"
@@ -1110,165 +1468,6 @@
 }
 
 /*
- * Option grouping
- */
-static struct opt_group fio_opt_groups[] = {
-	{
-		.name	= "General",
-		.mask	= FIO_OPT_C_GENERAL,
-	},
-	{
-		.name	= "I/O",
-		.mask	= FIO_OPT_C_IO,
-	},
-	{
-		.name	= "File",
-		.mask	= FIO_OPT_C_FILE,
-	},
-	{
-		.name	= "Statistics",
-		.mask	= FIO_OPT_C_STAT,
-	},
-	{
-		.name	= "Logging",
-		.mask	= FIO_OPT_C_LOG,
-	},
-	{
-		.name	= "Profiles",
-		.mask	= FIO_OPT_C_PROFILE,
-	},
-	{
-		.name	= NULL,
-	},
-};
-
-static struct opt_group *__opt_group_from_mask(struct opt_group *ogs, unsigned int *mask,
-					       unsigned int inv_mask)
-{
-	struct opt_group *og;
-	int i;
-
-	if (*mask == inv_mask || !*mask)
-		return NULL;
-
-	for (i = 0; ogs[i].name; i++) {
-		og = &ogs[i];
-
-		if (*mask & og->mask) {
-			*mask &= ~(og->mask);
-			return og;
-		}
-	}
-
-	return NULL;
-}
-
-struct opt_group *opt_group_from_mask(unsigned int *mask)
-{
-	return __opt_group_from_mask(fio_opt_groups, mask, FIO_OPT_C_INVALID);
-}
-
-static struct opt_group fio_opt_cat_groups[] = {
-	{
-		.name	= "Latency profiling",
-		.mask	= FIO_OPT_G_LATPROF,
-	},
-	{
-		.name	= "Rate",
-		.mask	= FIO_OPT_G_RATE,
-	},
-	{
-		.name	= "Zone",
-		.mask	= FIO_OPT_G_ZONE,
-	},
-	{
-		.name	= "Read/write mix",
-		.mask	= FIO_OPT_G_RWMIX,
-	},
-	{
-		.name	= "Verify",
-		.mask	= FIO_OPT_G_VERIFY,
-	},
-	{
-		.name	= "Trim",
-		.mask	= FIO_OPT_G_TRIM,
-	},
-	{
-		.name	= "I/O Logging",
-		.mask	= FIO_OPT_G_IOLOG,
-	},
-	{
-		.name	= "I/O Depth",
-		.mask	= FIO_OPT_G_IO_DEPTH,
-	},
-	{
-		.name	= "I/O Flow",
-		.mask	= FIO_OPT_G_IO_FLOW,
-	},
-	{
-		.name	= "Description",
-		.mask	= FIO_OPT_G_DESC,
-	},
-	{
-		.name	= "Filename",
-		.mask	= FIO_OPT_G_FILENAME,
-	},
-	{
-		.name	= "General I/O",
-		.mask	= FIO_OPT_G_IO_BASIC,
-	},
-	{
-		.name	= "Cgroups",
-		.mask	= FIO_OPT_G_CGROUP,
-	},
-	{
-		.name	= "Runtime",
-		.mask	= FIO_OPT_G_RUNTIME,
-	},
-	{
-		.name	= "Process",
-		.mask	= FIO_OPT_G_PROCESS,
-	},
-	{
-		.name	= "Job credentials / priority",
-		.mask	= FIO_OPT_G_CRED,
-	},
-	{
-		.name	= "Clock settings",
-		.mask	= FIO_OPT_G_CLOCK,
-	},
-	{
-		.name	= "I/O Type",
-		.mask	= FIO_OPT_G_IO_TYPE,
-	},
-	{
-		.name	= "I/O Thinktime",
-		.mask	= FIO_OPT_G_THINKTIME,
-	},
-	{
-		.name	= "Randomizations",
-		.mask	= FIO_OPT_G_RANDOM,
-	},
-	{
-		.name	= "I/O buffers",
-		.mask	= FIO_OPT_G_IO_BUF,
-	},
-	{
-		.name	= "Tiobench profile",
-		.mask	= FIO_OPT_G_TIOBENCH,
-	},
-
-	{
-		.name	= NULL,
-	}
-};
-
-struct opt_group *opt_group_cat_from_mask(unsigned int *mask)
-{
-	return __opt_group_from_mask(fio_opt_cat_groups, mask, FIO_OPT_G_INVALID);
-}
-
-/*
  * Map of job/command line options
  */
 struct fio_option fio_options[FIO_MAX_OPTS] = {
@@ -1276,7 +1475,7 @@
 		.name	= "description",
 		.lname	= "Description of job",
 		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(description),
+		.off1	= offsetof(struct thread_options, description),
 		.help	= "Text job description",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_DESC,
@@ -1285,16 +1484,25 @@
 		.name	= "name",
 		.lname	= "Job name",
 		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(name),
+		.off1	= offsetof(struct thread_options, name),
 		.help	= "Name of this job",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_DESC,
 	},
 	{
+		.name	= "wait_for",
+		.lname	= "Waitee name",
+		.type	= FIO_OPT_STR_STORE,
+		.off1	= offsetof(struct thread_options, wait_for),
+		.help	= "Name of the job this one wants to wait for before starting",
+		.category = FIO_OPT_C_GENERAL,
+		.group	= FIO_OPT_G_DESC,
+	},
+	{
 		.name	= "filename",
 		.lname	= "Filename(s)",
 		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(filename),
+		.off1	= offsetof(struct thread_options, filename),
 		.cb	= str_filename_cb,
 		.prio	= -1, /* must come after "directory" */
 		.help	= "File(s) to use for the workload",
@@ -1305,7 +1513,7 @@
 		.name	= "directory",
 		.lname	= "Directory",
 		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(directory),
+		.off1	= offsetof(struct thread_options, directory),
 		.cb	= str_directory_cb,
 		.help	= "Directory to store files in",
 		.category = FIO_OPT_C_FILE,
@@ -1313,8 +1521,9 @@
 	},
 	{
 		.name	= "filename_format",
+		.lname	= "Filename Format",
 		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(filename_format),
+		.off1	= offsetof(struct thread_options, filename_format),
 		.prio	= -1, /* must come after "directory" */
 		.help	= "Override default $jobname.$jobnum.$filenum naming",
 		.def	= "$jobname.$jobnum.$filenum",
@@ -1322,10 +1531,20 @@
 		.group	= FIO_OPT_G_FILENAME,
 	},
 	{
+		.name	= "unique_filename",
+		.lname	= "Unique Filename",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct thread_options, unique_filename),
+		.help	= "For network clients, prefix file with source IP",
+		.def	= "1",
+		.category = FIO_OPT_C_FILE,
+		.group	= FIO_OPT_G_FILENAME,
+	},
+	{
 		.name	= "lockfile",
 		.lname	= "Lockfile",
 		.type	= FIO_OPT_STR,
-		.off1	= td_var_offset(file_lock_mode),
+		.off1	= offsetof(struct thread_options, file_lock_mode),
 		.help	= "Lock file when doing IO to it",
 		.prio	= 1,
 		.parent	= "filename",
@@ -1353,7 +1572,7 @@
 		.name	= "opendir",
 		.lname	= "Open directory",
 		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(opendir),
+		.off1	= offsetof(struct thread_options, opendir),
 		.cb	= str_opendir_cb,
 		.help	= "Recursively add files from this directory and down",
 		.category = FIO_OPT_C_FILE,
@@ -1365,7 +1584,7 @@
 		.alias	= "readwrite",
 		.type	= FIO_OPT_STR,
 		.cb	= str_rw_cb,
-		.off1	= td_var_offset(td_ddir),
+		.off1	= offsetof(struct thread_options, td_ddir),
 		.help	= "IO direction",
 		.def	= "read",
 		.verify	= rw_verify,
@@ -1408,13 +1627,17 @@
 			    .oval = TD_DDIR_RANDRW,
 			    .help = "Random read and write mix"
 			  },
+			  { .ival = "trimwrite",
+			    .oval = TD_DDIR_TRIMWRITE,
+			    .help = "Trim and write mix, trims preceding writes"
+			  },
 		},
 	},
 	{
 		.name	= "rw_sequencer",
 		.lname	= "RW Sequencer",
 		.type	= FIO_OPT_STR,
-		.off1	= td_var_offset(rw_seq),
+		.off1	= offsetof(struct thread_options, rw_seq),
 		.help	= "IO offset generator modifier",
 		.def	= "sequential",
 		.category = FIO_OPT_C_IO,
@@ -1435,7 +1658,7 @@
 		.name	= "ioengine",
 		.lname	= "IO Engine",
 		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(ioengine),
+		.off1	= offsetof(struct thread_options, ioengine),
 		.help	= "IO engine to use",
 		.def	= FIO_PREFERRED_ENGINE,
 		.category = FIO_OPT_C_IO,
@@ -1455,6 +1678,11 @@
 			    .help = "Use preadv/pwritev",
 			  },
 #endif
+#ifdef FIO_HAVE_PWRITEV2
+			  { .ival = "pvsync2",
+			    .help = "Use preadv2/pwritev2",
+			  },
+#endif
 #ifdef CONFIG_LIBAIO
 			  { .ival = "libaio",
 			    .help = "Linux native asynchronous IO",
@@ -1548,6 +1776,17 @@
 			    .help = "Hadoop Distributed Filesystem (HDFS) engine"
 			  },
 #endif
+#ifdef CONFIG_PMEMBLK
+			  { .ival = "pmemblk",
+			    .help = "NVML libpmemblk based IO engine",
+			  },
+
+#endif
+#ifdef CONFIG_LINUX_DEVDAX
+			  { .ival = "dev-dax",
+			    .help = "DAX Device based IO engine",
+			  },
+#endif
 			  { .ival = "external",
 			    .help = "Load external engine (append name)",
 			  },
@@ -1557,7 +1796,7 @@
 		.name	= "iodepth",
 		.lname	= "IO Depth",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(iodepth),
+		.off1	= offsetof(struct thread_options, iodepth),
 		.help	= "Number of IO buffers to keep in flight",
 		.minval = 1,
 		.interval = 1,
@@ -1570,22 +1809,22 @@
 		.lname	= "IO Depth batch",
 		.alias	= "iodepth_batch_submit",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(iodepth_batch),
+		.off1	= offsetof(struct thread_options, iodepth_batch),
 		.help	= "Number of IO buffers to submit in one go",
 		.parent	= "iodepth",
 		.hide	= 1,
-		.minval	= 1,
 		.interval = 1,
 		.def	= "1",
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_IO_BASIC,
 	},
 	{
-		.name	= "iodepth_batch_complete",
-		.lname	= "IO Depth batch complete",
+		.name	= "iodepth_batch_complete_min",
+		.lname	= "Min IO depth batch complete",
+		.alias	= "iodepth_batch_complete",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(iodepth_batch_complete),
-		.help	= "Number of IO buffers to retrieve in one go",
+		.off1	= offsetof(struct thread_options, iodepth_batch_complete_min),
+		.help	= "Min number of IO buffers to retrieve in one go",
 		.parent	= "iodepth",
 		.hide	= 1,
 		.minval	= 0,
@@ -1595,10 +1834,23 @@
 		.group	= FIO_OPT_G_IO_BASIC,
 	},
 	{
+		.name	= "iodepth_batch_complete_max",
+		.lname	= "Max IO depth batch complete",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct thread_options, iodepth_batch_complete_max),
+		.help	= "Max number of IO buffers to retrieve in one go",
+		.parent	= "iodepth",
+		.hide	= 1,
+		.minval	= 0,
+		.interval = 1,
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_IO_BASIC,
+	},
+	{
 		.name	= "iodepth_low",
 		.lname	= "IO Depth batch low",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(iodepth_low),
+		.off1	= offsetof(struct thread_options, iodepth_low),
 		.help	= "Low water mark for queuing depth",
 		.parent	= "iodepth",
 		.hide	= 1,
@@ -1607,11 +1859,31 @@
 		.group	= FIO_OPT_G_IO_BASIC,
 	},
 	{
+		.name	= "io_submit_mode",
+		.lname	= "IO submit mode",
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct thread_options, io_submit_mode),
+		.help	= "How IO submissions and completions are done",
+		.def	= "inline",
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_IO_BASIC,
+		.posval = {
+			  { .ival = "inline",
+			    .oval = IO_MODE_INLINE,
+			    .help = "Submit and complete IO inline",
+			  },
+			  { .ival = "offload",
+			    .oval = IO_MODE_OFFLOAD,
+			    .help = "Offload submit and complete to threads",
+			  },
+		},
+	},
+	{
 		.name	= "size",
 		.lname	= "Size",
 		.type	= FIO_OPT_STR_VAL,
 		.cb	= str_size_cb,
-		.off1	= td_var_offset(size),
+		.off1	= offsetof(struct thread_options, size),
 		.help	= "Total size of device or files",
 		.interval = 1024 * 1024,
 		.category = FIO_OPT_C_IO,
@@ -1622,7 +1894,8 @@
 		.alias	= "io_limit",
 		.lname	= "IO Size",
 		.type	= FIO_OPT_STR_VAL,
-		.off1	= td_var_offset(io_limit),
+		.off1	= offsetof(struct thread_options, io_size),
+		.help	= "Total size of I/O to be performed",
 		.interval = 1024 * 1024,
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_INVALID,
@@ -1632,7 +1905,7 @@
 		.lname	= "Fill device",
 		.alias	= "fill_fs",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(fill_device),
+		.off1	= offsetof(struct thread_options, fill_device),
 		.help	= "Write until an ENOSPC error occurs",
 		.def	= "0",
 		.category = FIO_OPT_C_FILE,
@@ -1642,8 +1915,8 @@
 		.name	= "filesize",
 		.lname	= "File size",
 		.type	= FIO_OPT_STR_VAL,
-		.off1	= td_var_offset(file_size_low),
-		.off2	= td_var_offset(file_size_high),
+		.off1	= offsetof(struct thread_options, file_size_low),
+		.off2	= offsetof(struct thread_options, file_size_high),
 		.minval = 1,
 		.help	= "Size of individual files",
 		.interval = 1024 * 1024,
@@ -1654,7 +1927,7 @@
 		.name	= "file_append",
 		.lname	= "File append",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(file_append),
+		.off1	= offsetof(struct thread_options, file_append),
 		.help	= "IO will start at the end of the file(s)",
 		.def	= "0",
 		.category = FIO_OPT_C_FILE,
@@ -1665,7 +1938,7 @@
 		.lname	= "IO offset",
 		.alias	= "fileoffset",
 		.type	= FIO_OPT_STR_VAL,
-		.off1	= td_var_offset(start_offset),
+		.off1	= offsetof(struct thread_options, start_offset),
 		.help	= "Start IO from this offset",
 		.def	= "0",
 		.interval = 1024 * 1024,
@@ -1676,7 +1949,7 @@
 		.name	= "offset_increment",
 		.lname	= "IO offset increment",
 		.type	= FIO_OPT_STR_VAL,
-		.off1	= td_var_offset(offset_increment),
+		.off1	= offsetof(struct thread_options, offset_increment),
 		.help	= "What is the increment from one offset to the next",
 		.parent = "offset",
 		.hide	= 1,
@@ -1689,7 +1962,7 @@
 		.name	= "number_ios",
 		.lname	= "Number of IOs to perform",
 		.type	= FIO_OPT_STR_VAL,
-		.off1	= td_var_offset(number_ios),
+		.off1	= offsetof(struct thread_options, number_ios),
 		.help	= "Force job completion after this number of IOs",
 		.def	= "0",
 		.category = FIO_OPT_C_IO,
@@ -1700,12 +1973,12 @@
 		.lname	= "Block size",
 		.alias	= "blocksize",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(bs[DDIR_READ]),
-		.off2	= td_var_offset(bs[DDIR_WRITE]),
-		.off3	= td_var_offset(bs[DDIR_TRIM]),
+		.off1	= offsetof(struct thread_options, bs[DDIR_READ]),
+		.off2	= offsetof(struct thread_options, bs[DDIR_WRITE]),
+		.off3	= offsetof(struct thread_options, bs[DDIR_TRIM]),
 		.minval = 1,
 		.help	= "Block size unit",
-		.def	= "4k",
+		.def	= "4096",
 		.parent = "rw",
 		.hide	= 1,
 		.interval = 512,
@@ -1717,9 +1990,9 @@
 		.lname	= "Block size align",
 		.alias	= "blockalign",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(ba[DDIR_READ]),
-		.off2	= td_var_offset(ba[DDIR_WRITE]),
-		.off3	= td_var_offset(ba[DDIR_TRIM]),
+		.off1	= offsetof(struct thread_options, ba[DDIR_READ]),
+		.off2	= offsetof(struct thread_options, ba[DDIR_WRITE]),
+		.off3	= offsetof(struct thread_options, ba[DDIR_TRIM]),
 		.minval	= 1,
 		.help	= "IO block offset alignment",
 		.parent	= "rw",
@@ -1733,12 +2006,12 @@
 		.lname	= "Block size range",
 		.alias	= "blocksize_range",
 		.type	= FIO_OPT_RANGE,
-		.off1	= td_var_offset(min_bs[DDIR_READ]),
-		.off2	= td_var_offset(max_bs[DDIR_READ]),
-		.off3	= td_var_offset(min_bs[DDIR_WRITE]),
-		.off4	= td_var_offset(max_bs[DDIR_WRITE]),
-		.off5	= td_var_offset(min_bs[DDIR_TRIM]),
-		.off6	= td_var_offset(max_bs[DDIR_TRIM]),
+		.off1	= offsetof(struct thread_options, min_bs[DDIR_READ]),
+		.off2	= offsetof(struct thread_options, max_bs[DDIR_READ]),
+		.off3	= offsetof(struct thread_options, min_bs[DDIR_WRITE]),
+		.off4	= offsetof(struct thread_options, max_bs[DDIR_WRITE]),
+		.off5	= offsetof(struct thread_options, min_bs[DDIR_TRIM]),
+		.off6	= offsetof(struct thread_options, max_bs[DDIR_TRIM]),
 		.minval = 1,
 		.help	= "Set block size range (in more detail than bs)",
 		.parent = "rw",
@@ -1752,7 +2025,7 @@
 		.lname	= "Block size split",
 		.type	= FIO_OPT_STR,
 		.cb	= str_bssplit_cb,
-		.off1	= td_var_offset(bssplit),
+		.off1	= offsetof(struct thread_options, bssplit),
 		.help	= "Set a specific mix of block sizes",
 		.parent	= "rw",
 		.hide	= 1,
@@ -1764,7 +2037,7 @@
 		.lname	= "Block size unaligned",
 		.alias	= "blocksize_unaligned",
 		.type	= FIO_OPT_STR_SET,
-		.off1	= td_var_offset(bs_unaligned),
+		.off1	= offsetof(struct thread_options, bs_unaligned),
 		.help	= "Don't sector align IO buffer sizes",
 		.parent = "rw",
 		.hide	= 1,
@@ -1775,7 +2048,7 @@
 		.name	= "bs_is_seq_rand",
 		.lname	= "Block size division is seq/random (not read/write)",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(bs_is_seq_rand),
+		.off1	= offsetof(struct thread_options, bs_is_seq_rand),
 		.help	= "Consider any blocksize setting to be sequential,random",
 		.def	= "0",
 		.parent = "blocksize",
@@ -1786,7 +2059,7 @@
 		.name	= "randrepeat",
 		.lname	= "Random repeatable",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(rand_repeatable),
+		.off1	= offsetof(struct thread_options, rand_repeatable),
 		.help	= "Use repeatable random IO pattern",
 		.def	= "1",
 		.parent = "rw",
@@ -1798,8 +2071,9 @@
 		.name	= "randseed",
 		.lname	= "The random generator seed",
 		.type	= FIO_OPT_STR_VAL,
-		.off1	= td_var_offset(rand_seed),
+		.off1	= offsetof(struct thread_options, rand_seed),
 		.help	= "Set the random generator seed value",
+		.def	= "0x89",
 		.parent = "rw",
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_RANDOM,
@@ -1808,7 +2082,7 @@
 		.name	= "use_os_rand",
 		.lname	= "Use OS random",
 		.type	= FIO_OPT_DEPRECATED,
-		.off1	= td_var_offset(dep_use_os_rand),
+		.off1	= offsetof(struct thread_options, dep_use_os_rand),
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_RANDOM,
 	},
@@ -1816,7 +2090,7 @@
 		.name	= "norandommap",
 		.lname	= "No randommap",
 		.type	= FIO_OPT_STR_SET,
-		.off1	= td_var_offset(norandommap),
+		.off1	= offsetof(struct thread_options, norandommap),
 		.help	= "Accept potential duplicate random blocks",
 		.parent = "rw",
 		.hide	= 1,
@@ -1828,7 +2102,7 @@
 		.name	= "softrandommap",
 		.lname	= "Soft randommap",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(softrandommap),
+		.off1	= offsetof(struct thread_options, softrandommap),
 		.help	= "Set norandommap if randommap allocation fails",
 		.parent	= "norandommap",
 		.hide	= 1,
@@ -1838,8 +2112,9 @@
 	},
 	{
 		.name	= "random_generator",
+		.lname	= "Random Generator",
 		.type	= FIO_OPT_STR,
-		.off1	= td_var_offset(random_generator),
+		.off1	= offsetof(struct thread_options, random_generator),
 		.help	= "Type of random number generator to use",
 		.def	= "tausworthe",
 		.posval	= {
@@ -1851,14 +2126,20 @@
 			    .oval = FIO_RAND_GEN_LFSR,
 			    .help = "Variable length LFSR",
 			  },
+			  {
+			    .ival = "tausworthe64",
+			    .oval = FIO_RAND_GEN_TAUSWORTHE64,
+			    .help = "64-bit Tausworthe variant",
+			  },
 		},
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_RANDOM,
 	},
 	{
 		.name	= "random_distribution",
+		.lname	= "Random Distribution",
 		.type	= FIO_OPT_STR,
-		.off1	= td_var_offset(random_distribution),
+		.off1	= offsetof(struct thread_options, random_distribution),
 		.cb	= str_random_distribution_cb,
 		.help	= "Random offset distribution generator",
 		.def	= "random",
@@ -1875,6 +2156,15 @@
 			    .oval = FIO_RAND_DIST_PARETO,
 			    .help = "Pareto distribution",
 			  },
+			  { .ival = "normal",
+			    .oval = FIO_RAND_DIST_GAUSS,
+			    .help = "Normal (Gaussian) distribution",
+			  },
+			  { .ival = "zoned",
+			    .oval = FIO_RAND_DIST_ZONED,
+			    .help = "Zoned random distribution",
+			  },
+
 		},
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_RANDOM,
@@ -1883,9 +2173,9 @@
 		.name	= "percentage_random",
 		.lname	= "Percentage Random",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(perc_rand[DDIR_READ]),
-		.off2	= td_var_offset(perc_rand[DDIR_WRITE]),
-		.off3	= td_var_offset(perc_rand[DDIR_TRIM]),
+		.off1	= offsetof(struct thread_options, perc_rand[DDIR_READ]),
+		.off2	= offsetof(struct thread_options, perc_rand[DDIR_WRITE]),
+		.off3	= offsetof(struct thread_options, perc_rand[DDIR_TRIM]),
 		.maxval	= 100,
 		.help	= "Percentage of seq/random mix that should be random",
 		.def	= "100,100,100",
@@ -1903,8 +2193,9 @@
 	},
 	{
 		.name	= "allrandrepeat",
+		.lname	= "All Random Repeat",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(allrand_repeatable),
+		.off1	= offsetof(struct thread_options, allrand_repeatable),
 		.help	= "Use repeatable random numbers for everything",
 		.def	= "0",
 		.category = FIO_OPT_C_IO,
@@ -1915,7 +2206,7 @@
 		.lname	= "Number of files",
 		.alias	= "nr_files",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(nr_files),
+		.off1	= offsetof(struct thread_options, nr_files),
 		.help	= "Split job workload between this number of files",
 		.def	= "1",
 		.interval = 1,
@@ -1926,7 +2217,7 @@
 		.name	= "openfiles",
 		.lname	= "Number of open files",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(open_files),
+		.off1	= offsetof(struct thread_options, open_files),
 		.help	= "Number of files to keep open at the same time",
 		.category = FIO_OPT_C_FILE,
 		.group	= FIO_OPT_G_INVALID,
@@ -1936,7 +2227,7 @@
 		.lname	= "File service type",
 		.type	= FIO_OPT_STR,
 		.cb	= str_fst_cb,
-		.off1	= td_var_offset(file_service_type),
+		.off1	= offsetof(struct thread_options, file_service_type),
 		.help	= "How to select which file to service next",
 		.def	= "roundrobin",
 		.category = FIO_OPT_C_FILE,
@@ -1944,7 +2235,19 @@
 		.posval	= {
 			  { .ival = "random",
 			    .oval = FIO_FSERVICE_RANDOM,
-			    .help = "Choose a file at random",
+			    .help = "Choose a file at random (uniform)",
+			  },
+			  { .ival = "zipf",
+			    .oval = FIO_FSERVICE_ZIPF,
+			    .help = "Zipf randomized",
+			  },
+			  { .ival = "pareto",
+			    .oval = FIO_FSERVICE_PARETO,
+			    .help = "Pareto randomized",
+			  },
+			  { .ival = "gauss",
+			    .oval = FIO_FSERVICE_GAUSS,
+			    .help = "Normal (Gaussian) distribution",
 			  },
 			  { .ival = "roundrobin",
 			    .oval = FIO_FSERVICE_RR,
@@ -1963,7 +2266,7 @@
 		.name	= "fallocate",
 		.lname	= "Fallocate",
 		.type	= FIO_OPT_STR,
-		.off1	= td_var_offset(fallocate_mode),
+		.off1	= offsetof(struct thread_options, fallocate_mode),
 		.help	= "Whether pre-allocation is performed when laying out files",
 		.def	= "posix",
 		.category = FIO_OPT_C_FILE,
@@ -1994,22 +2297,65 @@
 			  },
 		},
 	},
-#endif	/* CONFIG_POSIX_FALLOCATE */
+#else	/* CONFIG_POSIX_FALLOCATE */
+	{
+		.name	= "fallocate",
+		.lname	= "Fallocate",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Your platform does not support fallocate",
+	},
+#endif /* CONFIG_POSIX_FALLOCATE */
 	{
 		.name	= "fadvise_hint",
 		.lname	= "Fadvise hint",
-		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(fadvise_hint),
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct thread_options, fadvise_hint),
+		.posval	= {
+			  { .ival = "0",
+			    .oval = F_ADV_NONE,
+			    .help = "Don't issue fadvise",
+			  },
+			  { .ival = "1",
+			    .oval = F_ADV_TYPE,
+			    .help = "Advise using fio IO pattern",
+			  },
+			  { .ival = "random",
+			    .oval = F_ADV_RANDOM,
+			    .help = "Advise using FADV_RANDOM",
+			  },
+			  { .ival = "sequential",
+			    .oval = F_ADV_SEQUENTIAL,
+			    .help = "Advise using FADV_SEQUENTIAL",
+			  },
+		},
 		.help	= "Use fadvise() to advise the kernel on IO pattern",
 		.def	= "1",
 		.category = FIO_OPT_C_FILE,
 		.group	= FIO_OPT_G_INVALID,
 	},
+#ifdef FIO_HAVE_STREAMID
+	{
+		.name	= "fadvise_stream",
+		.lname	= "Fadvise stream",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct thread_options, fadvise_stream),
+		.help	= "Use fadvise() to set stream ID",
+		.category = FIO_OPT_C_FILE,
+		.group	= FIO_OPT_G_INVALID,
+	},
+#else
+	{
+		.name	= "fadvise_stream",
+		.lname	= "Fadvise stream",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Your platform does not support fadvise stream ID",
+	},
+#endif
 	{
 		.name	= "fsync",
 		.lname	= "Fsync",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(fsync_blocks),
+		.off1	= offsetof(struct thread_options, fsync_blocks),
 		.help	= "Issue fsync for writes every given number of blocks",
 		.def	= "0",
 		.interval = 1,
@@ -2020,7 +2366,7 @@
 		.name	= "fdatasync",
 		.lname	= "Fdatasync",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(fdatasync_blocks),
+		.off1	= offsetof(struct thread_options, fdatasync_blocks),
 		.help	= "Issue fdatasync for writes every given number of blocks",
 		.def	= "0",
 		.interval = 1,
@@ -2031,7 +2377,7 @@
 		.name	= "write_barrier",
 		.lname	= "Write barrier",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(barrier_blocks),
+		.off1	= offsetof(struct thread_options, barrier_blocks),
 		.help	= "Make every Nth write a barrier write",
 		.def	= "0",
 		.interval = 1,
@@ -2062,17 +2408,24 @@
 		},
 		.type	= FIO_OPT_STR_MULTI,
 		.cb	= str_sfr_cb,
-		.off1	= td_var_offset(sync_file_range),
+		.off1	= offsetof(struct thread_options, sync_file_range),
 		.help	= "Use sync_file_range()",
 		.category = FIO_OPT_C_FILE,
 		.group	= FIO_OPT_G_INVALID,
 	},
+#else
+	{
+		.name	= "sync_file_range",
+		.lname	= "Sync file range",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Your platform does not support sync_file_range",
+	},
 #endif
 	{
 		.name	= "direct",
 		.lname	= "Direct I/O",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(odirect),
+		.off1	= offsetof(struct thread_options, odirect),
 		.help	= "Use O_DIRECT IO (negates buffered)",
 		.def	= "0",
 		.inverse = "buffered",
@@ -2083,7 +2436,7 @@
 		.name	= "atomic",
 		.lname	= "Atomic I/O",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(oatomic),
+		.off1	= offsetof(struct thread_options, oatomic),
 		.help	= "Use Atomic IO with O_DIRECT (implies O_DIRECT)",
 		.def	= "0",
 		.category = FIO_OPT_C_IO,
@@ -2093,7 +2446,7 @@
 		.name	= "buffered",
 		.lname	= "Buffered I/O",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(odirect),
+		.off1	= offsetof(struct thread_options, odirect),
 		.neg	= 1,
 		.help	= "Use buffered IO (negates direct)",
 		.def	= "1",
@@ -2105,7 +2458,7 @@
 		.name	= "overwrite",
 		.lname	= "Overwrite",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(overwrite),
+		.off1	= offsetof(struct thread_options, overwrite),
 		.help	= "When writing, set whether to overwrite current data",
 		.def	= "0",
 		.category = FIO_OPT_C_FILE,
@@ -2115,7 +2468,7 @@
 		.name	= "loops",
 		.lname	= "Loops",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(loops),
+		.off1	= offsetof(struct thread_options, loops),
 		.help	= "Number of times to run the job",
 		.def	= "1",
 		.interval = 1,
@@ -2126,7 +2479,7 @@
 		.name	= "numjobs",
 		.lname	= "Number of jobs",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(numjobs),
+		.off1	= offsetof(struct thread_options, numjobs),
 		.help	= "Duplicate this job this many times",
 		.def	= "1",
 		.interval = 1,
@@ -2137,8 +2490,8 @@
 		.name	= "startdelay",
 		.lname	= "Start delay",
 		.type	= FIO_OPT_STR_VAL_TIME,
-		.off1	= td_var_offset(start_delay),
-		.off2	= td_var_offset(start_delay_high),
+		.off1	= offsetof(struct thread_options, start_delay),
+		.off2	= offsetof(struct thread_options, start_delay_high),
 		.help	= "Only start job when this period has passed",
 		.def	= "0",
 		.is_seconds = 1,
@@ -2151,7 +2504,7 @@
 		.lname	= "Runtime",
 		.alias	= "timeout",
 		.type	= FIO_OPT_STR_VAL_TIME,
-		.off1	= td_var_offset(timeout),
+		.off1	= offsetof(struct thread_options, timeout),
 		.help	= "Stop workload when this amount of time has passed",
 		.def	= "0",
 		.is_seconds = 1,
@@ -2163,7 +2516,7 @@
 		.name	= "time_based",
 		.lname	= "Time based",
 		.type	= FIO_OPT_STR_SET,
-		.off1	= td_var_offset(time_based),
+		.off1	= offsetof(struct thread_options, time_based),
 		.help	= "Keep running until runtime/timeout is met",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_RUNTIME,
@@ -2172,7 +2525,7 @@
 		.name	= "verify_only",
 		.lname	= "Verify only",
 		.type	= FIO_OPT_STR_SET,
-		.off1	= td_var_offset(verify_only),
+		.off1	= offsetof(struct thread_options, verify_only),
 		.help	= "Verifies previously written data is still valid",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_RUNTIME,
@@ -2181,7 +2534,7 @@
 		.name	= "ramp_time",
 		.lname	= "Ramp time",
 		.type	= FIO_OPT_STR_VAL_TIME,
-		.off1	= td_var_offset(ramp_time),
+		.off1	= offsetof(struct thread_options, ramp_time),
 		.help	= "Ramp up time before measuring performance",
 		.is_seconds = 1,
 		.is_time = 1,
@@ -2193,7 +2546,7 @@
 		.lname	= "Clock source",
 		.type	= FIO_OPT_STR,
 		.cb	= fio_clock_source_cb,
-		.off1	= td_var_offset(clocksource),
+		.off1	= offsetof(struct thread_options, clocksource),
 		.help	= "What type of timing source to use",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_CLOCK,
@@ -2224,7 +2577,7 @@
 		.lname	= "I/O Memory",
 		.type	= FIO_OPT_STR,
 		.cb	= str_mem_cb,
-		.off1	= td_var_offset(mem_type),
+		.off1	= offsetof(struct thread_options, mem_type),
 		.help	= "Backing type for IO buffers",
 		.def	= "malloc",
 		.category = FIO_OPT_C_IO,
@@ -2250,12 +2603,22 @@
 			    .oval = MEM_MMAP,
 			    .help = "Use mmap(2) (file or anon) for IO buffers",
 			  },
+			  { .ival = "mmapshared",
+			    .oval = MEM_MMAPSHARED,
+			    .help = "Like mmap, but use the shared flag",
+			  },
 #ifdef FIO_HAVE_HUGETLB
 			  { .ival = "mmaphuge",
 			    .oval = MEM_MMAPHUGE,
 			    .help = "Like mmap, but use huge pages",
 			  },
 #endif
+#ifdef CONFIG_CUDA
+			  { .ival = "cudamalloc",
+			    .oval = MEM_CUDA_MALLOC,
+			    .help = "Allocate GPU device memory for GPUDirect RDMA",
+			  },
+#endif
 		  },
 	},
 	{
@@ -2263,7 +2626,7 @@
 		.alias	= "mem_align",
 		.lname	= "I/O memory alignment",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(mem_align),
+		.off1	= offsetof(struct thread_options, mem_align),
 		.minval	= 0,
 		.help	= "IO memory buffer offset alignment",
 		.def	= "0",
@@ -2276,7 +2639,7 @@
 		.name	= "verify",
 		.lname	= "Verify",
 		.type	= FIO_OPT_STR,
-		.off1	= td_var_offset(verify),
+		.off1	= offsetof(struct thread_options, verify),
 		.help	= "Verify data written",
 		.def	= "0",
 		.category = FIO_OPT_C_IO,
@@ -2326,13 +2689,37 @@
 			    .oval = VERIFY_SHA512,
 			    .help = "Use sha512 checksums for verification",
 			  },
+			  { .ival = "sha3-224",
+			    .oval = VERIFY_SHA3_224,
+			    .help = "Use sha3-224 checksums for verification",
+			  },
+			  { .ival = "sha3-256",
+			    .oval = VERIFY_SHA3_256,
+			    .help = "Use sha3-256 checksums for verification",
+			  },
+			  { .ival = "sha3-384",
+			    .oval = VERIFY_SHA3_384,
+			    .help = "Use sha3-384 checksums for verification",
+			  },
+			  { .ival = "sha3-512",
+			    .oval = VERIFY_SHA3_512,
+			    .help = "Use sha3-512 checksums for verification",
+			  },
 			  { .ival = "xxhash",
 			    .oval = VERIFY_XXHASH,
 			    .help = "Use xxhash checksums for verification",
 			  },
+			  /* Meta information is now included in verify_header,
+			   * so 'meta' verification is implied by default. */
 			  { .ival = "meta",
-			    .oval = VERIFY_META,
-			    .help = "Use io information",
+			    .oval = VERIFY_HDR_ONLY,
+			    .help = "Use io information for verification. "
+				    "This is now implied by default, so the option is "
+				    "obsolete; don't use it",
+			  },
+			  { .ival = "pattern",
+			    .oval = VERIFY_PATTERN_NO_HDR,
+			    .help = "Verify strict pattern",
 			  },
 			  {
 			    .ival = "null",
@@ -2345,7 +2732,7 @@
 		.name	= "do_verify",
 		.lname	= "Perform verify step",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(do_verify),
+		.off1	= offsetof(struct thread_options, do_verify),
 		.help	= "Run verification stage after write",
 		.def	= "1",
 		.parent = "verify",
@@ -2357,7 +2744,7 @@
 		.name	= "verifysort",
 		.lname	= "Verify sort",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(verifysort),
+		.off1	= offsetof(struct thread_options, verifysort),
 		.help	= "Sort written verify blocks for read back",
 		.def	= "1",
 		.parent = "verify",
@@ -2367,8 +2754,9 @@
 	},
 	{
 		.name	= "verifysort_nr",
+		.lname	= "Verify Sort Nr",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(verifysort_nr),
+		.off1	= offsetof(struct thread_options, verifysort_nr),
 		.help	= "Pre-load and sort verify blocks for a read workload",
 		.minval	= 0,
 		.maxval	= 131072,
@@ -2381,7 +2769,7 @@
 		.name   = "verify_interval",
 		.lname	= "Verify interval",
 		.type   = FIO_OPT_INT,
-		.off1   = td_var_offset(verify_interval),
+		.off1   = offsetof(struct thread_options, verify_interval),
 		.minval	= 2 * sizeof(struct verify_header),
 		.help   = "Store verify buffer header every N bytes",
 		.parent	= "verify",
@@ -2395,7 +2783,7 @@
 		.lname	= "Verify offset",
 		.type	= FIO_OPT_INT,
 		.help	= "Offset verify header location by N bytes",
-		.off1	= td_var_offset(verify_offset),
+		.off1	= offsetof(struct thread_options, verify_offset),
 		.minval	= sizeof(struct verify_header),
 		.parent	= "verify",
 		.hide	= 1,
@@ -2407,7 +2795,7 @@
 		.lname	= "Verify pattern",
 		.type	= FIO_OPT_STR,
 		.cb	= str_verify_pattern_cb,
-		.off1	= td_var_offset(verify_pattern),
+		.off1	= offsetof(struct thread_options, verify_pattern),
 		.help	= "Fill pattern for IO buffers",
 		.parent	= "verify",
 		.hide	= 1,
@@ -2418,7 +2806,7 @@
 		.name	= "verify_fatal",
 		.lname	= "Verify fatal",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(verify_fatal),
+		.off1	= offsetof(struct thread_options, verify_fatal),
 		.def	= "0",
 		.help	= "Exit on a single verify failure, don't continue",
 		.parent = "verify",
@@ -2430,7 +2818,7 @@
 		.name	= "verify_dump",
 		.lname	= "Verify dump",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(verify_dump),
+		.off1	= offsetof(struct thread_options, verify_dump),
 		.def	= "0",
 		.help	= "Dump contents of good and bad blocks on failure",
 		.parent = "verify",
@@ -2442,7 +2830,7 @@
 		.name	= "verify_async",
 		.lname	= "Verify asynchronously",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(verify_async),
+		.off1	= offsetof(struct thread_options, verify_async),
 		.def	= "0",
 		.help	= "Number of async verifier threads to use",
 		.parent	= "verify",
@@ -2454,7 +2842,7 @@
 		.name	= "verify_backlog",
 		.lname	= "Verify backlog",
 		.type	= FIO_OPT_STR_VAL,
-		.off1	= td_var_offset(verify_backlog),
+		.off1	= offsetof(struct thread_options, verify_backlog),
 		.help	= "Verify after this number of blocks are written",
 		.parent	= "verify",
 		.hide	= 1,
@@ -2465,7 +2853,7 @@
 		.name	= "verify_backlog_batch",
 		.lname	= "Verify backlog batch",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(verify_batch),
+		.off1	= offsetof(struct thread_options, verify_batch),
 		.help	= "Verify this number of IO blocks",
 		.parent	= "verify",
 		.hide	= 1,
@@ -2478,17 +2866,25 @@
 		.lname	= "Async verify CPUs",
 		.type	= FIO_OPT_STR,
 		.cb	= str_verify_cpus_allowed_cb,
-		.off1	= td_var_offset(verify_cpumask),
+		.off1	= offsetof(struct thread_options, verify_cpumask),
 		.help	= "Set CPUs allowed for async verify threads",
 		.parent	= "verify_async",
 		.hide	= 1,
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_VERIFY,
 	},
+#else
+	{
+		.name	= "verify_async_cpus",
+		.lname	= "Async verify CPUs",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Your platform does not support CPU affinities",
+	},
 #endif
 	{
 		.name	= "experimental_verify",
-		.off1	= td_var_offset(experimental_verify),
+		.lname	= "Experimental Verify",
+		.off1	= offsetof(struct thread_options, experimental_verify),
 		.type	= FIO_OPT_BOOL,
 		.help	= "Enable experimental verification",
 		.parent	= "verify",
@@ -2498,7 +2894,7 @@
 	{
 		.name	= "verify_state_load",
 		.lname	= "Load verify state",
-		.off1	= td_var_offset(verify_state),
+		.off1	= offsetof(struct thread_options, verify_state),
 		.type	= FIO_OPT_BOOL,
 		.help	= "Load verify termination state",
 		.parent	= "verify",
@@ -2508,7 +2904,7 @@
 	{
 		.name	= "verify_state_save",
 		.lname	= "Save verify state",
-		.off1	= td_var_offset(verify_state_save),
+		.off1	= offsetof(struct thread_options, verify_state_save),
 		.type	= FIO_OPT_BOOL,
 		.def	= "1",
 		.help	= "Save verify state on termination",
@@ -2521,10 +2917,10 @@
 		.name	= "trim_percentage",
 		.lname	= "Trim percentage",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(trim_percentage),
+		.off1	= offsetof(struct thread_options, trim_percentage),
 		.minval = 0,
 		.maxval = 100,
-		.help	= "Number of verify blocks to discard/trim",
+		.help	= "Number of verify blocks to trim (i.e., discard)",
 		.parent	= "verify",
 		.def	= "0",
 		.interval = 1,
@@ -2536,8 +2932,8 @@
 		.name	= "trim_verify_zero",
 		.lname	= "Verify trim zero",
 		.type	= FIO_OPT_BOOL,
-		.help	= "Verify that trim/discarded blocks are returned as zeroes",
-		.off1	= td_var_offset(trim_zero),
+		.help	= "Verify that trimmed (i.e., discarded) blocks are returned as zeroes",
+		.off1	= offsetof(struct thread_options, trim_zero),
 		.parent	= "trim_percentage",
 		.hide	= 1,
 		.def	= "1",
@@ -2548,7 +2944,7 @@
 		.name	= "trim_backlog",
 		.lname	= "Trim backlog",
 		.type	= FIO_OPT_STR_VAL,
-		.off1	= td_var_offset(trim_backlog),
+		.off1	= offsetof(struct thread_options, trim_backlog),
 		.help	= "Trim after this number of blocks are written",
 		.parent	= "trim_percentage",
 		.hide	= 1,
@@ -2560,7 +2956,7 @@
 		.name	= "trim_backlog_batch",
 		.lname	= "Trim backlog batch",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(trim_batch),
+		.off1	= offsetof(struct thread_options, trim_batch),
 		.help	= "Trim this number of IO blocks",
 		.parent	= "trim_percentage",
 		.hide	= 1,
@@ -2568,12 +2964,37 @@
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_TRIM,
 	},
+#else
+	{
+		.name	= "trim_percentage",
+		.lname	= "Trim percentage",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Fio does not support TRIM on your platform",
+	},
+	{
+		.name	= "trim_verify_zero",
+		.lname	= "Verify trim zero",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Fio does not support TRIM on your platform",
+	},
+	{
+		.name	= "trim_backlog",
+		.lname	= "Trim backlog",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Fio does not support TRIM on your platform",
+	},
+	{
+		.name	= "trim_backlog_batch",
+		.lname	= "Trim backlog batch",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Fio does not support TRIM on your platform",
+	},
 #endif
 	{
 		.name	= "write_iolog",
 		.lname	= "Write I/O log",
 		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(write_iolog_file),
+		.off1	= offsetof(struct thread_options, write_iolog_file),
 		.help	= "Store IO pattern to file",
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_IOLOG,
@@ -2582,7 +3003,7 @@
 		.name	= "read_iolog",
 		.lname	= "Read I/O log",
 		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(read_iolog_file),
+		.off1	= offsetof(struct thread_options, read_iolog_file),
 		.help	= "Playback IO pattern from file",
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_IOLOG,
@@ -2591,7 +3012,7 @@
 		.name	= "replay_no_stall",
 		.lname	= "Don't stall on replay",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(no_stall),
+		.off1	= offsetof(struct thread_options, no_stall),
 		.def	= "0",
 		.parent	= "read_iolog",
 		.hide	= 1,
@@ -2603,7 +3024,7 @@
 		.name	= "replay_redirect",
 		.lname	= "Redirect device for replay",
 		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(replay_redirect),
+		.off1	= offsetof(struct thread_options, replay_redirect),
 		.parent	= "read_iolog",
 		.hide	= 1,
 		.help	= "Replay all I/O onto this device, regardless of trace device",
@@ -2611,10 +3032,32 @@
 		.group	= FIO_OPT_G_IOLOG,
 	},
 	{
+		.name	= "replay_scale",
+		.lname	= "Replay offset scale factor",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct thread_options, replay_scale),
+		.parent	= "read_iolog",
+		.def	= "1",
+		.help	= "Scale IO offsets down by this factor",
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_IOLOG,
+	},
+	{
+		.name	= "replay_align",
+		.lname	= "Replay alignment",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct thread_options, replay_align),
+		.parent	= "read_iolog",
+		.help	= "Force alignment of IO offsets to this blocksize",
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_IOLOG,
+		.pow2	= 1,
+	},
+	{
 		.name	= "exec_prerun",
 		.lname	= "Pre-execute runnable",
 		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(exec_prerun),
+		.off1	= offsetof(struct thread_options, exec_prerun),
 		.help	= "Execute this file prior to running job",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_INVALID,
@@ -2623,7 +3066,7 @@
 		.name	= "exec_postrun",
 		.lname	= "Post-execute runnable",
 		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(exec_postrun),
+		.off1	= offsetof(struct thread_options, exec_postrun),
 		.help	= "Execute this file after running job",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_INVALID,
@@ -2633,17 +3076,24 @@
 		.name	= "ioscheduler",
 		.lname	= "I/O scheduler",
 		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(ioscheduler),
+		.off1	= offsetof(struct thread_options, ioscheduler),
 		.help	= "Use this IO scheduler on the backing device",
 		.category = FIO_OPT_C_FILE,
 		.group	= FIO_OPT_G_INVALID,
 	},
+#else
+	{
+		.name	= "ioscheduler",
+		.lname	= "I/O scheduler",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Your platform does not support IO scheduler switching",
+	},
 #endif
 	{
 		.name	= "zonesize",
 		.lname	= "Zone size",
 		.type	= FIO_OPT_STR_VAL,
-		.off1	= td_var_offset(zone_size),
+		.off1	= offsetof(struct thread_options, zone_size),
 		.help	= "Amount of data to read per zone",
 		.def	= "0",
 		.interval = 1024 * 1024,
@@ -2654,7 +3104,7 @@
 		.name	= "zonerange",
 		.lname	= "Zone range",
 		.type	= FIO_OPT_STR_VAL,
-		.off1	= td_var_offset(zone_range),
+		.off1	= offsetof(struct thread_options, zone_range),
 		.help	= "Give size of an IO zone",
 		.def	= "0",
 		.interval = 1024 * 1024,
@@ -2665,7 +3115,7 @@
 		.name	= "zoneskip",
 		.lname	= "Zone skip",
 		.type	= FIO_OPT_STR_VAL,
-		.off1	= td_var_offset(zone_skip),
+		.off1	= offsetof(struct thread_options, zone_skip),
 		.help	= "Space between IO zones",
 		.def	= "0",
 		.interval = 1024 * 1024,
@@ -2676,7 +3126,7 @@
 		.name	= "lockmem",
 		.lname	= "Lock memory",
 		.type	= FIO_OPT_STR_VAL,
-		.off1	= td_var_offset(lockmem),
+		.off1	= offsetof(struct thread_options, lockmem),
 		.help	= "Lock down this amount of memory (per worker)",
 		.def	= "0",
 		.interval = 1024 * 1024,
@@ -2688,7 +3138,7 @@
 		.lname	= "Read/write mix read",
 		.type	= FIO_OPT_INT,
 		.cb	= str_rwmix_read_cb,
-		.off1	= td_var_offset(rwmix[DDIR_READ]),
+		.off1	= offsetof(struct thread_options, rwmix[DDIR_READ]),
 		.maxval	= 100,
 		.help	= "Percentage of mixed workload that is reads",
 		.def	= "50",
@@ -2702,7 +3152,7 @@
 		.lname	= "Read/write mix write",
 		.type	= FIO_OPT_INT,
 		.cb	= str_rwmix_write_cb,
-		.off1	= td_var_offset(rwmix[DDIR_WRITE]),
+		.off1	= offsetof(struct thread_options, rwmix[DDIR_WRITE]),
 		.maxval	= 100,
 		.help	= "Percentage of mixed workload that is writes",
 		.def	= "50",
@@ -2722,7 +3172,7 @@
 		.name	= "nice",
 		.lname	= "Nice",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(nice),
+		.off1	= offsetof(struct thread_options, nice),
 		.help	= "Set job CPU nice value",
 		.minval	= -19,
 		.maxval	= 20,
@@ -2736,32 +3186,51 @@
 		.name	= "prio",
 		.lname	= "I/O nice priority",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(ioprio),
+		.off1	= offsetof(struct thread_options, ioprio),
 		.help	= "Set job IO priority value",
-		.minval	= 0,
-		.maxval	= 7,
+		.minval	= IOPRIO_MIN_PRIO,
+		.maxval	= IOPRIO_MAX_PRIO,
 		.interval = 1,
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_CRED,
 	},
+#else
+	{
+		.name	= "prio",
+		.lname	= "I/O nice priority",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Your platform does not support IO priorities",
+	},
+#endif
+#ifdef FIO_HAVE_IOPRIO_CLASS
+#ifndef FIO_HAVE_IOPRIO
+#error "FIO_HAVE_IOPRIO_CLASS requires FIO_HAVE_IOPRIO"
+#endif
 	{
 		.name	= "prioclass",
 		.lname	= "I/O nice priority class",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(ioprio_class),
+		.off1	= offsetof(struct thread_options, ioprio_class),
 		.help	= "Set job IO priority class",
-		.minval	= 0,
-		.maxval	= 3,
+		.minval	= IOPRIO_MIN_PRIO_CLASS,
+		.maxval	= IOPRIO_MAX_PRIO_CLASS,
 		.interval = 1,
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_CRED,
 	},
+#else
+	{
+		.name	= "prioclass",
+		.lname	= "I/O nice priority class",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Your platform does not support IO priority classes",
+	},
 #endif
 	{
 		.name	= "thinktime",
 		.lname	= "Thinktime",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(thinktime),
+		.off1	= offsetof(struct thread_options, thinktime),
 		.help	= "Idle time between IO buffers (usec)",
 		.def	= "0",
 		.is_time = 1,
@@ -2772,7 +3241,7 @@
 		.name	= "thinktime_spin",
 		.lname	= "Thinktime spin",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(thinktime_spin),
+		.off1	= offsetof(struct thread_options, thinktime_spin),
 		.help	= "Start think time by spinning this amount (usec)",
 		.def	= "0",
 		.is_time = 1,
@@ -2785,7 +3254,7 @@
 		.name	= "thinktime_blocks",
 		.lname	= "Thinktime blocks",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(thinktime_blocks),
+		.off1	= offsetof(struct thread_options, thinktime_blocks),
 		.help	= "IO buffer period between 'thinktime'",
 		.def	= "1",
 		.parent	= "thinktime",
@@ -2797,20 +3266,21 @@
 		.name	= "rate",
 		.lname	= "I/O rate",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(rate[DDIR_READ]),
-		.off2	= td_var_offset(rate[DDIR_WRITE]),
-		.off3	= td_var_offset(rate[DDIR_TRIM]),
+		.off1	= offsetof(struct thread_options, rate[DDIR_READ]),
+		.off2	= offsetof(struct thread_options, rate[DDIR_WRITE]),
+		.off3	= offsetof(struct thread_options, rate[DDIR_TRIM]),
 		.help	= "Set bandwidth rate",
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_RATE,
 	},
 	{
-		.name	= "ratemin",
+		.name	= "rate_min",
+		.alias	= "ratemin",
 		.lname	= "I/O min rate",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(ratemin[DDIR_READ]),
-		.off2	= td_var_offset(ratemin[DDIR_WRITE]),
-		.off3	= td_var_offset(ratemin[DDIR_TRIM]),
+		.off1	= offsetof(struct thread_options, ratemin[DDIR_READ]),
+		.off2	= offsetof(struct thread_options, ratemin[DDIR_WRITE]),
+		.off3	= offsetof(struct thread_options, ratemin[DDIR_TRIM]),
 		.help	= "Job must meet this rate or it will be shut down",
 		.parent	= "rate",
 		.hide	= 1,
@@ -2821,9 +3291,9 @@
 		.name	= "rate_iops",
 		.lname	= "I/O rate IOPS",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(rate_iops[DDIR_READ]),
-		.off2	= td_var_offset(rate_iops[DDIR_WRITE]),
-		.off3	= td_var_offset(rate_iops[DDIR_TRIM]),
+		.off1	= offsetof(struct thread_options, rate_iops[DDIR_READ]),
+		.off2	= offsetof(struct thread_options, rate_iops[DDIR_WRITE]),
+		.off3	= offsetof(struct thread_options, rate_iops[DDIR_TRIM]),
 		.help	= "Limit IO used to this number of IO operations/sec",
 		.hide	= 1,
 		.category = FIO_OPT_C_IO,
@@ -2833,9 +3303,9 @@
 		.name	= "rate_iops_min",
 		.lname	= "I/O min rate IOPS",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(rate_iops_min[DDIR_READ]),
-		.off2	= td_var_offset(rate_iops_min[DDIR_WRITE]),
-		.off3	= td_var_offset(rate_iops_min[DDIR_TRIM]),
+		.off1	= offsetof(struct thread_options, rate_iops_min[DDIR_READ]),
+		.off2	= offsetof(struct thread_options, rate_iops_min[DDIR_WRITE]),
+		.off3	= offsetof(struct thread_options, rate_iops_min[DDIR_TRIM]),
 		.help	= "Job must meet this rate or it will be shut down",
 		.parent	= "rate_iops",
 		.hide	= 1,
@@ -2843,10 +3313,33 @@
 		.group	= FIO_OPT_G_RATE,
 	},
 	{
-		.name	= "ratecycle",
+		.name	= "rate_process",
+		.lname	= "Rate Process",
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct thread_options, rate_process),
+		.help	= "What process controls how rated IO is managed",
+		.def	= "linear",
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_RATE,
+		.posval = {
+			  { .ival = "linear",
+			    .oval = RATE_PROCESS_LINEAR,
+			    .help = "Linear rate of IO",
+			  },
+			  {
+			    .ival = "poisson",
+			    .oval = RATE_PROCESS_POISSON,
+			    .help = "Rate follows Poisson process",
+			  },
+		},
+		.parent = "rate",
+	},
+	{
+		.name	= "rate_cycle",
+		.alias	= "ratecycle",
 		.lname	= "I/O rate cycle",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(ratecycle),
+		.off1	= offsetof(struct thread_options, ratecycle),
 		.help	= "Window average for rate limits (msec)",
 		.def	= "1000",
 		.parent = "rate",
@@ -2856,8 +3349,9 @@
 	},
 	{
 		.name	= "max_latency",
+		.lname	= "Max Latency",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(max_latency),
+		.off1	= offsetof(struct thread_options, max_latency),
 		.help	= "Maximum tolerated IO latency (usec)",
 		.is_time = 1,
 		.category = FIO_OPT_C_IO,
@@ -2867,7 +3361,7 @@
 		.name	= "latency_target",
 		.lname	= "Latency Target (usec)",
 		.type	= FIO_OPT_STR_VAL_TIME,
-		.off1	= td_var_offset(latency_target),
+		.off1	= offsetof(struct thread_options, latency_target),
 		.help	= "Ramp to max queue depth supporting this latency",
 		.is_time = 1,
 		.category = FIO_OPT_C_IO,
@@ -2877,7 +3371,7 @@
 		.name	= "latency_window",
 		.lname	= "Latency Window (usec)",
 		.type	= FIO_OPT_STR_VAL_TIME,
-		.off1	= td_var_offset(latency_window),
+		.off1	= offsetof(struct thread_options, latency_window),
 		.help	= "Time to sustain latency_target",
 		.is_time = 1,
 		.category = FIO_OPT_C_IO,
@@ -2887,7 +3381,7 @@
 		.name	= "latency_percentile",
 		.lname	= "Latency Percentile",
 		.type	= FIO_OPT_FLOAT_LIST,
-		.off1	= td_var_offset(latency_percentile),
+		.off1	= offsetof(struct thread_options, latency_percentile),
 		.help	= "Percentile of IOs must be below latency_target",
 		.def	= "100",
 		.maxlen	= 1,
@@ -2900,7 +3394,7 @@
 		.name	= "invalidate",
 		.lname	= "Cache invalidate",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(invalidate_cache),
+		.off1	= offsetof(struct thread_options, invalidate_cache),
 		.help	= "Invalidate buffer/page cache prior to running job",
 		.def	= "1",
 		.category = FIO_OPT_C_IO,
@@ -2910,7 +3404,7 @@
 		.name	= "sync",
 		.lname	= "Synchronous I/O",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(sync_io),
+		.off1	= offsetof(struct thread_options, sync_io),
 		.help	= "Use O_SYNC for buffered writes",
 		.def	= "0",
 		.parent = "buffered",
@@ -2922,8 +3416,8 @@
 		.name	= "create_serialize",
 		.lname	= "Create serialize",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(create_serialize),
-		.help	= "Serialize creating of job files",
+		.off1	= offsetof(struct thread_options, create_serialize),
+		.help	= "Serialize creation of job files",
 		.def	= "1",
 		.category = FIO_OPT_C_FILE,
 		.group	= FIO_OPT_G_INVALID,
@@ -2932,7 +3426,7 @@
 		.name	= "create_fsync",
 		.lname	= "Create fsync",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(create_fsync),
+		.off1	= offsetof(struct thread_options, create_fsync),
 		.help	= "fsync file after creation",
 		.def	= "1",
 		.category = FIO_OPT_C_FILE,
@@ -2942,7 +3436,7 @@
 		.name	= "create_on_open",
 		.lname	= "Create on open",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(create_on_open),
+		.off1	= offsetof(struct thread_options, create_on_open),
 		.help	= "Create files when they are opened for IO",
 		.def	= "0",
 		.category = FIO_OPT_C_FILE,
@@ -2950,17 +3444,38 @@
 	},
 	{
 		.name	= "create_only",
+		.lname	= "Create Only",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(create_only),
+		.off1	= offsetof(struct thread_options, create_only),
 		.help	= "Only perform file creation phase",
 		.category = FIO_OPT_C_FILE,
 		.def	= "0",
 	},
 	{
+		.name	= "allow_file_create",
+		.lname	= "Allow file create",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct thread_options, allow_create),
+		.help	= "Permit fio to create files, if they don't exist",
+		.def	= "1",
+		.category = FIO_OPT_C_FILE,
+		.group	= FIO_OPT_G_FILENAME,
+	},
+	{
+		.name	= "allow_mounted_write",
+		.lname	= "Allow mounted write",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct thread_options, allow_mounted_write),
+		.help	= "Allow writes to a mounted partition",
+		.def	= "0",
+		.category = FIO_OPT_C_FILE,
+		.group	= FIO_OPT_G_FILENAME,
+	},
+	{
 		.name	= "pre_read",
 		.lname	= "Pre-read files",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(pre_read),
+		.off1	= offsetof(struct thread_options, pre_read),
 		.help	= "Pre-read files before starting official testing",
 		.def	= "0",
 		.category = FIO_OPT_C_FILE,
@@ -2972,7 +3487,7 @@
 		.lname	= "CPU mask",
 		.type	= FIO_OPT_INT,
 		.cb	= str_cpumask_cb,
-		.off1	= td_var_offset(cpumask),
+		.off1	= offsetof(struct thread_options, cpumask),
 		.help	= "CPU affinity mask",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_CRED,
@@ -2982,7 +3497,7 @@
 		.lname	= "CPUs allowed",
 		.type	= FIO_OPT_STR,
 		.cb	= str_cpus_allowed_cb,
-		.off1	= td_var_offset(cpumask),
+		.off1	= offsetof(struct thread_options, cpumask),
 		.help	= "Set CPUs allowed",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_CRED,
@@ -2991,7 +3506,7 @@
 		.name	= "cpus_allowed_policy",
 		.lname	= "CPUs allowed distribution policy",
 		.type	= FIO_OPT_STR,
-		.off1	= td_var_offset(cpus_allowed_policy),
+		.off1	= offsetof(struct thread_options, cpus_allowed_policy),
 		.help	= "Distribution policy for cpus_allowed",
 		.parent = "cpus_allowed",
 		.prio	= 1,
@@ -3008,32 +3523,78 @@
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_CRED,
 	},
+#else
+	{
+		.name	= "cpumask",
+		.lname	= "CPU mask",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Your platform does not support CPU affinities",
+	},
+	{
+		.name	= "cpus_allowed",
+		.lname	= "CPUs allowed",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Your platform does not support CPU affinities",
+	},
+	{
+		.name	= "cpus_allowed_policy",
+		.lname	= "CPUs allowed distribution policy",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Your platform does not support CPU affinities",
+	},
 #endif
 #ifdef CONFIG_LIBNUMA
 	{
 		.name	= "numa_cpu_nodes",
+		.lname	= "NUMA CPU Nodes",
 		.type	= FIO_OPT_STR,
 		.cb	= str_numa_cpunodes_cb,
-		.off1	= td_var_offset(numa_cpunodes),
+		.off1	= offsetof(struct thread_options, numa_cpunodes),
 		.help	= "NUMA CPU nodes bind",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
 		.name	= "numa_mem_policy",
+		.lname	= "NUMA Memory Policy",
 		.type	= FIO_OPT_STR,
 		.cb	= str_numa_mpol_cb,
-		.off1	= td_var_offset(numa_memnodes),
+		.off1	= offsetof(struct thread_options, numa_memnodes),
 		.help	= "NUMA memory policy setup",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_INVALID,
 	},
+#else
+	{
+		.name	= "numa_cpu_nodes",
+		.lname	= "NUMA CPU Nodes",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Build fio with libnuma-dev(el) to enable this option",
+	},
+	{
+		.name	= "numa_mem_policy",
+		.lname	= "NUMA Memory Policy",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Build fio with libnuma-dev(el) to enable this option",
+	},
+#endif
+#ifdef CONFIG_CUDA
+	{
+		.name	= "gpu_dev_id",
+		.lname	= "GPU device ID",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct thread_options, gpu_dev_id),
+		.help	= "Set GPU device ID for GPUDirect RDMA",
+		.def    = "0",
+		.category = FIO_OPT_C_GENERAL,
+		.group	= FIO_OPT_G_INVALID,
+	},
 #endif
 	{
 		.name	= "end_fsync",
 		.lname	= "End fsync",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(end_fsync),
+		.off1	= offsetof(struct thread_options, end_fsync),
 		.help	= "Include fsync at the end of job",
 		.def	= "0",
 		.category = FIO_OPT_C_FILE,
@@ -3043,7 +3604,7 @@
 		.name	= "fsync_on_close",
 		.lname	= "Fsync on close",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(fsync_on_close),
+		.off1	= offsetof(struct thread_options, fsync_on_close),
 		.help	= "fsync files on close",
 		.def	= "0",
 		.category = FIO_OPT_C_FILE,
@@ -3053,13 +3614,23 @@
 		.name	= "unlink",
 		.lname	= "Unlink file",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(unlink),
+		.off1	= offsetof(struct thread_options, unlink),
 		.help	= "Unlink created files after job has completed",
 		.def	= "0",
 		.category = FIO_OPT_C_FILE,
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
+		.name	= "unlink_each_loop",
+		.lname	= "Unlink file after each loop of a job",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct thread_options, unlink_each_loop),
+		.help	= "Unlink created files after each loop of a job has completed",
+		.def	= "0",
+		.category = FIO_OPT_C_FILE,
+		.group	= FIO_OPT_G_INVALID,
+	},
+	{
 		.name	= "exitall",
 		.lname	= "Exit-all on terminate",
 		.type	= FIO_OPT_STR_SET,
@@ -3069,11 +3640,20 @@
 		.group	= FIO_OPT_G_PROCESS,
 	},
 	{
+		.name	= "exitall_on_error",
+		.lname	= "Exit-all on terminate in error",
+		.type	= FIO_OPT_STR_SET,
+		.off1	= offsetof(struct thread_options, exitall_error),
+		.help	= "Terminate all jobs when one exits in error",
+		.category = FIO_OPT_C_GENERAL,
+		.group	= FIO_OPT_G_PROCESS,
+	},
+	{
 		.name	= "stonewall",
 		.lname	= "Wait for previous",
 		.alias	= "wait_for_previous",
 		.type	= FIO_OPT_STR_SET,
-		.off1	= td_var_offset(stonewall),
+		.off1	= offsetof(struct thread_options, stonewall),
 		.help	= "Insert a hard barrier between this job and previous",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_PROCESS,
@@ -3082,7 +3662,7 @@
 		.name	= "new_group",
 		.lname	= "New group",
 		.type	= FIO_OPT_STR_SET,
-		.off1	= td_var_offset(new_group),
+		.off1	= offsetof(struct thread_options, new_group),
 		.help	= "Mark the start of a new group (for reporting)",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_PROCESS,
@@ -3091,7 +3671,7 @@
 		.name	= "thread",
 		.lname	= "Thread",
 		.type	= FIO_OPT_STR_SET,
-		.off1	= td_var_offset(use_thread),
+		.off1	= offsetof(struct thread_options, use_thread),
 		.help	= "Use threads instead of processes",
 #ifdef CONFIG_NO_SHM
 		.def	= "1",
@@ -3101,10 +3681,21 @@
 		.group	= FIO_OPT_G_PROCESS,
 	},
 	{
+		.name	= "per_job_logs",
+		.lname	= "Per Job Logs",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct thread_options, per_job_logs),
+		.help	= "Include job number in generated log files or not",
+		.def	= "1",
+		.category = FIO_OPT_C_LOG,
+		.group	= FIO_OPT_G_INVALID,
+	},
+	{
 		.name	= "write_bw_log",
 		.lname	= "Write bandwidth log",
-		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(bw_log_file),
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct thread_options, bw_log_file),
+		.cb	= str_write_bw_log_cb,
 		.help	= "Write log of bandwidth during run",
 		.category = FIO_OPT_C_LOG,
 		.group	= FIO_OPT_G_INVALID,
@@ -3112,8 +3703,9 @@
 	{
 		.name	= "write_lat_log",
 		.lname	= "Write latency log",
-		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(lat_log_file),
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct thread_options, lat_log_file),
+		.cb	= str_write_lat_log_cb,
 		.help	= "Write log of latency during run",
 		.category = FIO_OPT_C_LOG,
 		.group	= FIO_OPT_G_INVALID,
@@ -3121,8 +3713,9 @@
 	{
 		.name	= "write_iops_log",
 		.lname	= "Write IOPS log",
-		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(iops_log_file),
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct thread_options, iops_log_file),
+		.cb	= str_write_iops_log_cb,
 		.help	= "Write log of IOPS during run",
 		.category = FIO_OPT_C_LOG,
 		.group	= FIO_OPT_G_INVALID,
@@ -3131,17 +3724,59 @@
 		.name	= "log_avg_msec",
 		.lname	= "Log averaging (msec)",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(log_avg_msec),
+		.off1	= offsetof(struct thread_options, log_avg_msec),
 		.help	= "Average bw/iops/lat logs over this period of time",
 		.def	= "0",
 		.category = FIO_OPT_C_LOG,
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
+		.name	= "log_hist_msec",
+		.lname	= "Log histograms (msec)",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct thread_options, log_hist_msec),
+		.help	= "Dump completion latency histograms at this interval (msec)",
+		.def	= "0",
+		.category = FIO_OPT_C_LOG,
+		.group	= FIO_OPT_G_INVALID,
+	},
+	{
+		.name	= "log_hist_coarseness",
+		.lname	= "Histogram logs coarseness",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct thread_options, log_hist_coarseness),
+		.help	= "Integer in range [0,6]. Higher coarseness outputs"
+			" fewer histogram bins per sample. The number of bins for"
+			" these are [1216, 608, 304, 152, 76, 38, 19] respectively.",
+		.def	= "0",
+		.category = FIO_OPT_C_LOG,
+		.group	= FIO_OPT_G_INVALID,
+	},
+	{
+		.name	= "write_hist_log",
+		.lname	= "Write latency histogram logs",
+		.type	= FIO_OPT_STR,
+		.off1	= offsetof(struct thread_options, hist_log_file),
+		.cb	= str_write_hist_log_cb,
+		.help	= "Write log of latency histograms during run",
+		.category = FIO_OPT_C_LOG,
+		.group	= FIO_OPT_G_INVALID,
+	},
+	{
+		.name	= "log_max_value",
+		.lname	= "Log maximum instead of average",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct thread_options, log_max),
+		.help	= "Log max sample in a window instead of average",
+		.def	= "0",
+		.category = FIO_OPT_C_LOG,
+		.group	= FIO_OPT_G_INVALID,
+	},
+	{
 		.name	= "log_offset",
 		.lname	= "Log offset of IO",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(log_offset),
+		.off1	= offsetof(struct thread_options, log_offset),
 		.help	= "Include offset of IO for each log entry",
 		.def	= "0",
 		.category = FIO_OPT_C_LOG,
@@ -3152,28 +3787,80 @@
 		.name	= "log_compression",
 		.lname	= "Log compression",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(log_gz),
+		.off1	= offsetof(struct thread_options, log_gz),
 		.help	= "Log in compressed chunks of this size",
-		.minval	= 32 * 1024 * 1024ULL,
+		.minval	= 1024ULL,
 		.maxval	= 512 * 1024 * 1024ULL,
 		.category = FIO_OPT_C_LOG,
 		.group	= FIO_OPT_G_INVALID,
 	},
+#ifdef FIO_HAVE_CPU_AFFINITY
+	{
+		.name	= "log_compression_cpus",
+		.lname	= "Log Compression CPUs",
+		.type	= FIO_OPT_STR,
+		.cb	= str_log_cpus_allowed_cb,
+		.off1	= offsetof(struct thread_options, log_gz_cpumask),
+		.parent = "log_compression",
+		.help	= "Limit log compression to these CPUs",
+		.category = FIO_OPT_C_LOG,
+		.group	= FIO_OPT_G_INVALID,
+	},
+#else
+	{
+		.name	= "log_compression_cpus",
+		.lname	= "Log Compression CPUs",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Your platform does not support CPU affinities",
+	},
+#endif
 	{
 		.name	= "log_store_compressed",
 		.lname	= "Log store compressed",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(log_gz_store),
+		.off1	= offsetof(struct thread_options, log_gz_store),
 		.help	= "Store logs in a compressed format",
 		.category = FIO_OPT_C_LOG,
 		.group	= FIO_OPT_G_INVALID,
 	},
+#else
+	{
+		.name	= "log_compression",
+		.lname	= "Log compression",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Install libz-dev(el) to get compression support",
+	},
+	{
+		.name	= "log_store_compressed",
+		.lname	= "Log store compressed",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Install libz-dev(el) to get compression support",
+	},
 #endif
 	{
+		.name = "log_unix_epoch",
+		.lname = "Log epoch unix",
+		.type = FIO_OPT_BOOL,
+		.off1 = offsetof(struct thread_options, log_unix_epoch),
+		.help = "Use Unix time in log files",
+		.category = FIO_OPT_C_LOG,
+		.group = FIO_OPT_G_INVALID,
+	},
+	{
+		.name	= "block_error_percentiles",
+		.lname	= "Block error percentiles",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct thread_options, block_error_hist),
+		.help	= "Record trim block errors and make a histogram",
+		.def	= "0",
+		.category = FIO_OPT_C_LOG,
+		.group	= FIO_OPT_G_INVALID,
+	},
+	{
 		.name	= "bwavgtime",
 		.lname	= "Bandwidth average time",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(bw_avg_time),
+		.off1	= offsetof(struct thread_options, bw_avg_time),
 		.help	= "Time window over which to calculate bandwidth"
 			  " (msec)",
 		.def	= "500",
@@ -3187,7 +3874,7 @@
 		.name	= "iopsavgtime",
 		.lname	= "IOPS average time",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(iops_avg_time),
+		.off1	= offsetof(struct thread_options, iops_avg_time),
 		.help	= "Time window over which to calculate IOPS (msec)",
 		.def	= "500",
 		.parent	= "write_iops_log",
@@ -3200,16 +3887,26 @@
 		.name	= "group_reporting",
 		.lname	= "Group reporting",
 		.type	= FIO_OPT_STR_SET,
-		.off1	= td_var_offset(group_reporting),
+		.off1	= offsetof(struct thread_options, group_reporting),
 		.help	= "Do reporting on a per-group basis",
 		.category = FIO_OPT_C_STAT,
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
+		.name	= "stats",
+		.lname	= "Stats",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct thread_options, stats),
+		.help	= "Enable collection of stats",
+		.def	= "1",
+		.category = FIO_OPT_C_STAT,
+		.group	= FIO_OPT_G_INVALID,
+	},
+	{
 		.name	= "zero_buffers",
 		.lname	= "Zero I/O buffers",
 		.type	= FIO_OPT_STR_SET,
-		.off1	= td_var_offset(zero_buffers),
+		.off1	= offsetof(struct thread_options, zero_buffers),
 		.help	= "Init IO buffers to all zeroes",
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_IO_BUF,
@@ -3218,7 +3915,7 @@
 		.name	= "refill_buffers",
 		.lname	= "Refill I/O buffers",
 		.type	= FIO_OPT_STR_SET,
-		.off1	= td_var_offset(refill_buffers),
+		.off1	= offsetof(struct thread_options, refill_buffers),
 		.help	= "Refill IO buffers on every IO submit",
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_IO_BUF,
@@ -3227,7 +3924,7 @@
 		.name	= "scramble_buffers",
 		.lname	= "Scramble I/O buffers",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(scramble_buffers),
+		.off1	= offsetof(struct thread_options, scramble_buffers),
 		.help	= "Slightly scramble buffers on every IO submit",
 		.def	= "1",
 		.category = FIO_OPT_C_IO,
@@ -3238,7 +3935,7 @@
 		.lname	= "Buffer pattern",
 		.type	= FIO_OPT_STR,
 		.cb	= str_buffer_pattern_cb,
-		.off1	= td_var_offset(buffer_pattern),
+		.off1	= offsetof(struct thread_options, buffer_pattern),
 		.help	= "Fill pattern for IO buffers",
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_IO_BUF,
@@ -3248,7 +3945,7 @@
 		.lname	= "Buffer compression percentage",
 		.type	= FIO_OPT_INT,
 		.cb	= str_buffer_compress_cb,
-		.off1	= td_var_offset(compress_percentage),
+		.off1	= offsetof(struct thread_options, compress_percentage),
 		.maxval	= 100,
 		.minval	= 0,
 		.help	= "How compressible the buffer is (approximately)",
@@ -3260,7 +3957,7 @@
 		.name	= "buffer_compress_chunk",
 		.lname	= "Buffer compression chunk size",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(compress_chunk),
+		.off1	= offsetof(struct thread_options, compress_chunk),
 		.parent	= "buffer_compress_percentage",
 		.hide	= 1,
 		.help	= "Size of compressible region in buffer",
@@ -3273,7 +3970,7 @@
 		.lname	= "Dedupe percentage",
 		.type	= FIO_OPT_INT,
 		.cb	= str_dedupe_cb,
-		.off1	= td_var_offset(dedupe_percentage),
+		.off1	= offsetof(struct thread_options, dedupe_percentage),
 		.maxval	= 100,
 		.minval	= 0,
 		.help	= "Percentage of buffers that are dedupable",
@@ -3285,7 +3982,7 @@
 		.name	= "clat_percentiles",
 		.lname	= "Completion latency percentiles",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(clat_percentiles),
+		.off1	= offsetof(struct thread_options, clat_percentiles),
 		.help	= "Enable the reporting of completion latency percentiles",
 		.def	= "1",
 		.category = FIO_OPT_C_STAT,
@@ -3293,11 +3990,12 @@
 	},
 	{
 		.name	= "percentile_list",
-		.lname	= "Completion latency percentile list",
+		.lname	= "Percentile list",
 		.type	= FIO_OPT_FLOAT_LIST,
-		.off1	= td_var_offset(percentile_list),
-		.off2	= td_var_offset(percentile_precision),
-		.help	= "Specify a custom list of percentiles to report",
+		.off1	= offsetof(struct thread_options, percentile_list),
+		.off2	= offsetof(struct thread_options, percentile_precision),
+		.help	= "Specify a custom list of percentiles to report for "
+			  "completion latency and block errors",
 		.def    = "1:5:10:20:30:40:50:60:70:80:90:95:99:99.5:99.9:99.95:99.99",
 		.maxlen	= FIO_IO_U_LIST_MAX_LEN,
 		.minfp	= 0.0,
@@ -3311,12 +4009,19 @@
 		.name	= "disk_util",
 		.lname	= "Disk utilization",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(do_disk_util),
+		.off1	= offsetof(struct thread_options, do_disk_util),
 		.help	= "Log disk utilization statistics",
 		.def	= "1",
 		.category = FIO_OPT_C_STAT,
 		.group	= FIO_OPT_G_INVALID,
 	},
+#else
+	{
+		.name	= "disk_util",
+		.lname	= "Disk utilization",
+		.type	= FIO_OPT_UNSUPPORTED,
+		.help	= "Your platform does not support disk utilization",
+	},
 #endif
 	{
 		.name	= "gtod_reduce",
@@ -3333,7 +4038,7 @@
 		.name	= "disable_lat",
 		.lname	= "Disable all latency stats",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(disable_lat),
+		.off1	= offsetof(struct thread_options, disable_lat),
 		.help	= "Disable latency numbers",
 		.parent	= "gtod_reduce",
 		.hide	= 1,
@@ -3345,7 +4050,7 @@
 		.name	= "disable_clat",
 		.lname	= "Disable completion latency stats",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(disable_clat),
+		.off1	= offsetof(struct thread_options, disable_clat),
 		.help	= "Disable completion latency numbers",
 		.parent	= "gtod_reduce",
 		.hide	= 1,
@@ -3357,7 +4062,7 @@
 		.name	= "disable_slat",
 		.lname	= "Disable submission latency stats",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(disable_slat),
+		.off1	= offsetof(struct thread_options, disable_slat),
 		.help	= "Disable submission latency numbers",
 		.parent	= "gtod_reduce",
 		.hide	= 1,
@@ -3367,9 +4072,10 @@
 	},
 	{
 		.name	= "disable_bw_measurement",
+		.alias	= "disable_bw",
 		.lname	= "Disable bandwidth stats",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(disable_bw),
+		.off1	= offsetof(struct thread_options, disable_bw),
 		.help	= "Disable bandwidth logging",
 		.parent	= "gtod_reduce",
 		.hide	= 1,
@@ -3381,7 +4087,7 @@
 		.name	= "gtod_cpu",
 		.lname	= "Dedicated gettimeofday() CPU",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(gtod_cpu),
+		.off1	= offsetof(struct thread_options, gtod_cpu),
 		.help	= "Set up dedicated gettimeofday() thread on this CPU",
 		.verify	= gtod_cpu_verify,
 		.category = FIO_OPT_C_GENERAL,
@@ -3389,8 +4095,9 @@
 	},
 	{
 		.name	= "unified_rw_reporting",
+		.lname	= "Unified RW Reporting",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(unified_rw_rep),
+		.off1	= offsetof(struct thread_options, unified_rw_rep),
 		.help	= "Unify reporting across data direction",
 		.def	= "0",
 		.category = FIO_OPT_C_GENERAL,
@@ -3400,7 +4107,7 @@
 		.name	= "continue_on_error",
 		.lname	= "Continue on error",
 		.type	= FIO_OPT_STR,
-		.off1	= td_var_offset(continue_on_error),
+		.off1	= offsetof(struct thread_options, continue_on_error),
 		.help	= "Continue on non-fatal errors during IO",
 		.def	= "none",
 		.category = FIO_OPT_C_GENERAL,
@@ -3442,9 +4149,10 @@
 	},
 	{
 		.name	= "ignore_error",
+		.lname	= "Ignore Error",
 		.type	= FIO_OPT_STR,
 		.cb	= str_ignore_error_cb,
-		.off1	= td_var_offset(ignore_error_nr),
+		.off1	= offsetof(struct thread_options, ignore_error_nr),
 		.help	= "Set a specific list of errors to ignore",
 		.parent	= "rw",
 		.category = FIO_OPT_C_GENERAL,
@@ -3452,8 +4160,9 @@
 	},
 	{
 		.name	= "error_dump",
+		.lname	= "Error Dump",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(error_dump),
+		.off1	= offsetof(struct thread_options, error_dump),
 		.def	= "0",
 		.help	= "Dump info on each error",
 		.category = FIO_OPT_C_GENERAL,
@@ -3463,7 +4172,7 @@
 		.name	= "profile",
 		.lname	= "Profile",
 		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(profile),
+		.off1	= offsetof(struct thread_options, profile),
 		.help	= "Select a specific builtin performance test",
 		.category = FIO_OPT_C_PROFILE,
 		.group	= FIO_OPT_G_INVALID,
@@ -3472,7 +4181,7 @@
 		.name	= "cgroup",
 		.lname	= "Cgroup",
 		.type	= FIO_OPT_STR_STORE,
-		.off1	= td_var_offset(cgroup),
+		.off1	= offsetof(struct thread_options, cgroup),
 		.help	= "Add job to cgroup of this name",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_CGROUP,
@@ -3481,7 +4190,7 @@
 		.name	= "cgroup_nodelete",
 		.lname	= "Cgroup no-delete",
 		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(cgroup_nodelete),
+		.off1	= offsetof(struct thread_options, cgroup_nodelete),
 		.help	= "Do not delete cgroups after job completion",
 		.def	= "0",
 		.parent	= "cgroup",
@@ -3492,7 +4201,7 @@
 		.name	= "cgroup_weight",
 		.lname	= "Cgroup weight",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(cgroup_weight),
+		.off1	= offsetof(struct thread_options, cgroup_weight),
 		.help	= "Use given weight for cgroup",
 		.minval = 100,
 		.maxval	= 1000,
@@ -3504,7 +4213,7 @@
 		.name	= "uid",
 		.lname	= "User ID",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(uid),
+		.off1	= offsetof(struct thread_options, uid),
 		.help	= "Run job with this user ID",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_CRED,
@@ -3513,7 +4222,7 @@
 		.name	= "gid",
 		.lname	= "Group ID",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(gid),
+		.off1	= offsetof(struct thread_options, gid),
 		.help	= "Run job with this group ID",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_CRED,
@@ -3522,28 +4231,28 @@
 		.name	= "kb_base",
 		.lname	= "KB Base",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(kb_base),
+		.off1	= offsetof(struct thread_options, kb_base),
 		.prio	= 1,
 		.def	= "1024",
 		.posval = {
 			  { .ival = "1024",
 			    .oval = 1024,
-			    .help = "Use 1024 as the K base",
+			    .help = "Inputs invert IEC and SI prefixes (for compatibility); outputs prefer binary",
 			  },
 			  { .ival = "1000",
 			    .oval = 1000,
-			    .help = "Use 1000 as the K base",
+			    .help = "Inputs use IEC and SI prefixes; outputs prefer SI",
 			  },
 		},
-		.help	= "How many bytes per KB for reporting (1000 or 1024)",
+		.help	= "Unit prefix interpretation for quantities of data (IEC and SI)",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
 		.name	= "unit_base",
-		.lname	= "Base unit for reporting (Bits or Bytes)",
+		.lname	= "Unit for quantities of data (Bits or Bytes)",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(unit_base),
+		.off1	= offsetof(struct thread_options, unit_base),
 		.prio	= 1,
 		.posval = {
 			  { .ival = "0",
@@ -3567,7 +4276,7 @@
 		.name	= "hugepage-size",
 		.lname	= "Hugepage size",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(hugepage_size),
+		.off1	= offsetof(struct thread_options, hugepage_size),
 		.help	= "When using hugepages, specify size of each page",
 		.def	= __fio_stringify(FIO_HUGE_PAGE),
 		.interval = 1024 * 1024,
@@ -3578,7 +4287,7 @@
 		.name	= "flow_id",
 		.lname	= "I/O flow ID",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(flow_id),
+		.off1	= offsetof(struct thread_options, flow_id),
 		.help	= "The flow index ID to use",
 		.def	= "0",
 		.category = FIO_OPT_C_IO,
@@ -3588,7 +4297,7 @@
 		.name	= "flow",
 		.lname	= "I/O flow weight",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(flow),
+		.off1	= offsetof(struct thread_options, flow),
 		.help	= "Weight for flow control of this job",
 		.parent	= "flow_id",
 		.hide	= 1,
@@ -3600,7 +4309,7 @@
 		.name	= "flow_watermark",
 		.lname	= "I/O flow watermark",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(flow_watermark),
+		.off1	= offsetof(struct thread_options, flow_watermark),
 		.help	= "High watermark for flow control. This option"
 			" should be set to the same value for all threads"
 			" with non-zero flow.",
@@ -3614,7 +4323,7 @@
 		.name	= "flow_sleep",
 		.lname	= "I/O flow sleep",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(flow_sleep),
+		.off1	= offsetof(struct thread_options, flow_sleep),
 		.help	= "How many microseconds to sleep after being held"
 			" back by the flow control mechanism",
 		.parent	= "flow_id",
@@ -3624,6 +4333,76 @@
 		.group	= FIO_OPT_G_IO_FLOW,
 	},
 	{
+		.name	= "skip_bad",
+		.lname	= "Skip operations against bad blocks",
+		.type	= FIO_OPT_BOOL,
+		.off1	= offsetof(struct thread_options, skip_bad),
+		.help	= "Skip operations against known bad blocks.",
+		.hide	= 1,
+		.def	= "0",
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_MTD,
+	},
+	{
+		.name   = "steadystate",
+		.lname  = "Steady state threshold",
+		.alias  = "ss",
+		.type   = FIO_OPT_STR,
+		.off1   = offsetof(struct thread_options, ss_state),
+		.cb	= str_steadystate_cb,
+		.help   = "Define the criterion and limit to judge when a job has reached steady state",
+		.def	= "iops_slope:0.01%",
+		.posval	= {
+			  { .ival = "iops",
+			    .oval = FIO_SS_IOPS,
+			    .help = "maximum mean deviation of IOPS measurements",
+			  },
+			  { .ival = "iops_slope",
+			    .oval = FIO_SS_IOPS_SLOPE,
+			    .help = "slope calculated from IOPS measurements",
+			  },
+			  { .ival = "bw",
+			    .oval = FIO_SS_BW,
+			    .help = "maximum mean deviation of bandwidth measurements",
+			  },
+			  {
+			    .ival = "bw_slope",
+			    .oval = FIO_SS_BW_SLOPE,
+			    .help = "slope calculated from bandwidth measurements",
+			  },
+		},
+		.category = FIO_OPT_C_GENERAL,
+		.group  = FIO_OPT_G_RUNTIME,
+	},
+	{
+		.name   = "steadystate_duration",
+		.lname  = "Steady state duration",
+		.alias  = "ss_dur",
+		.parent	= "steadystate",
+		.type   = FIO_OPT_STR_VAL_TIME,
+		.off1   = offsetof(struct thread_options, ss_dur),
+		.help   = "Stop workload upon attaining steady state for specified duration",
+		.def    = "0",
+		.is_seconds = 1,
+		.is_time = 1,
+		.category = FIO_OPT_C_GENERAL,
+		.group  = FIO_OPT_G_RUNTIME,
+	},
+	{
+		.name   = "steadystate_ramp_time",
+		.lname  = "Steady state ramp time",
+		.alias  = "ss_ramp",
+		.parent	= "steadystate",
+		.type   = FIO_OPT_STR_VAL_TIME,
+		.off1   = offsetof(struct thread_options, ss_ramp_time),
+		.help   = "Delay before initiation of data collection for steady state job termination testing",
+		.def    = "0",
+		.is_seconds = 1,
+		.is_time = 1,
+		.category = FIO_OPT_C_GENERAL,
+		.group  = FIO_OPT_G_RUNTIME,
+	},
+	{
 		.name = NULL,
 	},
 };
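A recurring pattern in the option table above is the #else stub of type FIO_OPT_UNSUPPORTED for every option a platform cannot provide, so the parser still recognises the name and prints a useful message instead of "unknown option". The following is only a minimal sketch of that idea; the demo_* types, the table contents and the handling function are illustrative stand-ins, not fio's real definitions.

/*
 * Keeping a stub entry lets a parser tell "known but unavailable on this
 * platform" apart from "unknown option".
 */
#include <stdio.h>
#include <string.h>

enum demo_opt_type { DEMO_OPT_BOOL, DEMO_OPT_UNSUPPORTED };

struct demo_opt {
	const char *name;
	enum demo_opt_type type;
	const char *help;
};

static const struct demo_opt demo_opts[] = {
	{ "disk_util", DEMO_OPT_UNSUPPORTED,
	  "Your platform does not support disk utilization" },
	{ NULL, DEMO_OPT_BOOL, NULL },
};

static int demo_handle_opt(const char *name)
{
	const struct demo_opt *o;

	for (o = demo_opts; o->name; o++) {
		if (strcmp(o->name, name))
			continue;
		if (o->type == DEMO_OPT_UNSUPPORTED) {
			fprintf(stderr, "%s: %s\n", o->name, o->help);
			return 1;	/* recognised, but unusable here */
		}
		return 0;		/* would be parsed normally */
	}
	fprintf(stderr, "%s: unknown option\n", name);
	return 2;
}

int main(void)
{
	return demo_handle_opt("disk_util") == 1 ? 0 : 1;
}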
@@ -3718,6 +4497,18 @@
 	},
 };
 
+void fio_keywords_exit(void)
+{
+	struct fio_keyword *kw;
+
+	kw = &fio_keywords[0];
+	while (kw->word) {
+		free(kw->replace);
+		kw->replace = NULL;
+		kw++;
+	}
+}
+
 void fio_keywords_init(void)
 {
 	unsigned long long mb_memory;
@@ -3936,14 +4727,14 @@
 		i++;
 	}
 
-	if (best_option != -1)
+	if (best_option != -1 && string_distance_ok(name, best_distance) &&
+	    fio_options[best_option].type != FIO_OPT_UNSUPPORTED)
 		log_err("Did you mean %s?\n", fio_options[best_option].name);
 
 	free(name);
 }
 
-int fio_options_parse(struct thread_data *td, char **opts, int num_opts,
-			int dump_cmdline)
+int fio_options_parse(struct thread_data *td, char **opts, int num_opts)
 {
 	int i, ret, unknown;
 	char **opts_copy;
@@ -3954,7 +4745,7 @@
 	for (ret = 0, i = 0, unknown = 0; i < num_opts; i++) {
 		struct fio_option *o;
 		int newret = parse_option(opts_copy[i], opts[i], fio_options,
-						&o, td, dump_cmdline);
+						&o, &td->o, &td->opt_list);
 
 		if (!newret && o)
 			fio_option_mark_set(&td->o, o);
@@ -3987,7 +4778,7 @@
 			if (td->eo)
 				newret = parse_option(opts_copy[i], opts[i],
 						      td->io_ops->options, &o,
-						      td->eo, dump_cmdline);
+						      td->eo, &td->opt_list);
 
 			ret |= newret;
 			if (!o) {
@@ -4007,7 +4798,7 @@
 {
 	int ret;
 
-	ret = parse_cmd_option(opt, val, fio_options, td);
+	ret = parse_cmd_option(opt, val, fio_options, &td->o, &td->opt_list);
 	if (!ret) {
 		struct fio_option *o;
 
@@ -4022,13 +4813,14 @@
 int fio_cmd_ioengine_option_parse(struct thread_data *td, const char *opt,
 				char *val)
 {
-	return parse_cmd_option(opt, val, td->io_ops->options, td->eo);
+	return parse_cmd_option(opt, val, td->io_ops->options, td->eo,
+					&td->opt_list);
 }
 
 void fio_fill_default_options(struct thread_data *td)
 {
 	td->o.magic = OPT_MAGIC;
-	fill_default_options(td, fio_options);
+	fill_default_options(&td->o, fio_options);
 }
 
 int fio_show_option_help(const char *opt)
@@ -4036,40 +4828,26 @@
 	return show_cmd_help(fio_options, opt);
 }
 
-void options_mem_dupe(void *data, struct fio_option *options)
-{
-	struct fio_option *o;
-	char **ptr;
-
-	for (o = &options[0]; o->name; o++) {
-		if (o->type != FIO_OPT_STR_STORE)
-			continue;
-
-		ptr = td_var(data, o, o->off1);
-		if (*ptr)
-			*ptr = strdup(*ptr);
-	}
-}
-
 /*
  * dupe FIO_OPT_STR_STORE options
  */
 void fio_options_mem_dupe(struct thread_data *td)
 {
-	options_mem_dupe(&td->o, fio_options);
+	options_mem_dupe(fio_options, &td->o);
 
 	if (td->eo && td->io_ops) {
 		void *oldeo = td->eo;
 
 		td->eo = malloc(td->io_ops->option_struct_size);
 		memcpy(td->eo, oldeo, td->io_ops->option_struct_size);
-		options_mem_dupe(td->eo, td->io_ops->options);
+		options_mem_dupe(td->io_ops->options, td->eo);
 	}
 }
 
 unsigned int fio_get_kb_base(void *data)
 {
-	struct thread_options *o = data;
+	struct thread_data *td = cb_data_to_td(data);
+	struct thread_options *o = &td->o;
 	unsigned int kb_base = 0;
 
 	/*
@@ -4165,7 +4943,7 @@
 
 void fio_options_free(struct thread_data *td)
 {
-	options_free(fio_options, td);
+	options_free(fio_options, &td->o);
 	if (td->eo && td->io_ops && td->io_ops->options) {
 		options_free(td->io_ops->options, td->eo);
 		free(td->eo);
@@ -4208,22 +4986,22 @@
 	opt_off = opt - &fio_options[0];
 	index = opt_off / (8 * sizeof(uint64_t));
 	offset = opt_off & ((8 * sizeof(uint64_t)) - 1);
-	return (o->set_options[index] & (1UL << offset)) != 0;
+	return (o->set_options[index] & ((uint64_t)1 << offset)) != 0;
 }
 
-int __fio_option_is_set(struct thread_options *o, unsigned int off1)
+bool __fio_option_is_set(struct thread_options *o, unsigned int off1)
 {
 	struct fio_option *opt, *next;
 
 	next = NULL;
 	while ((opt = find_next_opt(o, next, off1)) != NULL) {
 		if (opt_is_set(o, opt))
-			return 1;
+			return true;
 
 		next = opt;
 	}
 
-	return 0;
+	return false;
 }
 
 void fio_option_mark_set(struct thread_options *o, struct fio_option *opt)
@@ -4233,5 +5011,5 @@
 	opt_off = opt - &fio_options[0];
 	index = opt_off / (8 * sizeof(uint64_t));
 	offset = opt_off & ((8 * sizeof(uint64_t)) - 1);
-	o->set_options[index] |= 1UL << offset;
+	o->set_options[index] |= (uint64_t)1 << offset;
 }
diff --git a/options.h b/options.h
index 36fd35d..83a58e2 100644
--- a/options.h
+++ b/options.h
@@ -4,10 +4,10 @@
 #define FIO_MAX_OPTS		512
 
 #include <string.h>
+#include <inttypes.h>
 #include "parse.h"
 #include "flist.h"
-
-#define td_var_offset(var)	((size_t) &((struct thread_options *)0)->var)
+#include "lib/types.h"
 
 int add_option(struct fio_option *);
 void invalidate_profile_options(const char *);
@@ -17,141 +17,35 @@
 void del_opt_posval(const char *, const char *);
 struct thread_data;
 void fio_options_free(struct thread_data *);
-char *get_name_idx(char *, int);
-int set_name_idx(char *, char *, int);
+int set_name_idx(char *, size_t, char *, int, bool);
+
+extern char client_sockaddr_str[];  /* used with --client option */
 
 extern struct fio_option fio_options[FIO_MAX_OPTS];
 
-extern int __fio_option_is_set(struct thread_options *, unsigned int off);
+extern bool __fio_option_is_set(struct thread_options *, unsigned int off);
 
 #define fio_option_is_set(__td, name)					\
 ({									\
-	const unsigned int off = td_var_offset(name);			\
-	int __r = __fio_option_is_set((__td), off);			\
-	if (__r == -1) {						\
-		dprint(FD_PARSE, "option %s/%u not found in map\n",	\
-				__fio_stringify(name), off);		\
-		__r = 0;						\
-	}								\
+	const unsigned int off = offsetof(struct thread_options, name);	\
+	bool __r = __fio_option_is_set((__td), off);			\
 	__r;								\
 })
 
 extern void fio_option_mark_set(struct thread_options *, struct fio_option *);
 
-static inline int o_match(struct fio_option *o, const char *opt)
+static inline bool o_match(struct fio_option *o, const char *opt)
 {
 	if (!strcmp(o->name, opt))
-		return 1;
+		return true;
 	else if (o->alias && !strcmp(o->alias, opt))
-		return 1;
+		return true;
 
-	return 0;
+	return false;
 }
 
-static inline struct fio_option *find_option(struct fio_option *options,
-					     const char *opt)
-{
-	struct fio_option *o;
-
-	for (o = &options[0]; o->name; o++)
-		if (o_match(o, opt))
-			return o;
-
-	return NULL;
-}
-
-struct opt_group {
-	const char *name;
-	unsigned int mask;
-};
-
-enum opt_category {
-	__FIO_OPT_C_GENERAL	= 0,
-	__FIO_OPT_C_IO,
-	__FIO_OPT_C_FILE,
-	__FIO_OPT_C_STAT,
-	__FIO_OPT_C_LOG,
-	__FIO_OPT_C_PROFILE,
-	__FIO_OPT_C_ENGINE,
-	__FIO_OPT_C_NR,
-
-	FIO_OPT_C_GENERAL	= (1U << __FIO_OPT_C_GENERAL),
-	FIO_OPT_C_IO		= (1U << __FIO_OPT_C_IO),
-	FIO_OPT_C_FILE		= (1U << __FIO_OPT_C_FILE),
-	FIO_OPT_C_STAT		= (1U << __FIO_OPT_C_STAT),
-	FIO_OPT_C_LOG		= (1U << __FIO_OPT_C_LOG),
-	FIO_OPT_C_PROFILE	= (1U << __FIO_OPT_C_PROFILE),
-	FIO_OPT_C_ENGINE	= (1U << __FIO_OPT_C_ENGINE),
-	FIO_OPT_C_INVALID	= (1U << __FIO_OPT_C_NR),
-};
-
-enum opt_category_group {
-	__FIO_OPT_G_RATE	= 0,
-	__FIO_OPT_G_ZONE,
-	__FIO_OPT_G_RWMIX,
-	__FIO_OPT_G_VERIFY,
-	__FIO_OPT_G_TRIM,
-	__FIO_OPT_G_IOLOG,
-	__FIO_OPT_G_IO_DEPTH,
-	__FIO_OPT_G_IO_FLOW,
-	__FIO_OPT_G_DESC,
-	__FIO_OPT_G_FILENAME,
-	__FIO_OPT_G_IO_BASIC,
-	__FIO_OPT_G_CGROUP,
-	__FIO_OPT_G_RUNTIME,
-	__FIO_OPT_G_PROCESS,
-	__FIO_OPT_G_CRED,
-	__FIO_OPT_G_CLOCK,
-	__FIO_OPT_G_IO_TYPE,
-	__FIO_OPT_G_THINKTIME,
-	__FIO_OPT_G_RANDOM,
-	__FIO_OPT_G_IO_BUF,
-	__FIO_OPT_G_TIOBENCH,
-	__FIO_OPT_G_ERR,
-	__FIO_OPT_G_E4DEFRAG,
-	__FIO_OPT_G_NETIO,
-	__FIO_OPT_G_LIBAIO,
-	__FIO_OPT_G_ACT,
-	__FIO_OPT_G_LATPROF,
-        __FIO_OPT_G_RBD,
-        __FIO_OPT_G_GFAPI,
-	__FIO_OPT_G_NR,
-
-	FIO_OPT_G_RATE		= (1U << __FIO_OPT_G_RATE),
-	FIO_OPT_G_ZONE		= (1U << __FIO_OPT_G_ZONE),
-	FIO_OPT_G_RWMIX		= (1U << __FIO_OPT_G_RWMIX),
-	FIO_OPT_G_VERIFY	= (1U << __FIO_OPT_G_VERIFY),
-	FIO_OPT_G_TRIM		= (1U << __FIO_OPT_G_TRIM),
-	FIO_OPT_G_IOLOG		= (1U << __FIO_OPT_G_IOLOG),
-	FIO_OPT_G_IO_DEPTH	= (1U << __FIO_OPT_G_IO_DEPTH),
-	FIO_OPT_G_IO_FLOW	= (1U << __FIO_OPT_G_IO_FLOW),
-	FIO_OPT_G_DESC		= (1U << __FIO_OPT_G_DESC),
-	FIO_OPT_G_FILENAME	= (1U << __FIO_OPT_G_FILENAME),
-	FIO_OPT_G_IO_BASIC	= (1U << __FIO_OPT_G_IO_BASIC),
-	FIO_OPT_G_CGROUP	= (1U << __FIO_OPT_G_CGROUP),
-	FIO_OPT_G_RUNTIME	= (1U << __FIO_OPT_G_RUNTIME),
-	FIO_OPT_G_PROCESS	= (1U << __FIO_OPT_G_PROCESS),
-	FIO_OPT_G_CRED		= (1U << __FIO_OPT_G_CRED),
-	FIO_OPT_G_CLOCK		= (1U << __FIO_OPT_G_CLOCK),
-	FIO_OPT_G_IO_TYPE	= (1U << __FIO_OPT_G_IO_TYPE),
-	FIO_OPT_G_THINKTIME	= (1U << __FIO_OPT_G_THINKTIME),
-	FIO_OPT_G_RANDOM	= (1U << __FIO_OPT_G_RANDOM),
-	FIO_OPT_G_IO_BUF	= (1U << __FIO_OPT_G_IO_BUF),
-	FIO_OPT_G_TIOBENCH	= (1U << __FIO_OPT_G_TIOBENCH),
-	FIO_OPT_G_ERR		= (1U << __FIO_OPT_G_ERR),
-	FIO_OPT_G_E4DEFRAG	= (1U << __FIO_OPT_G_E4DEFRAG),
-	FIO_OPT_G_NETIO		= (1U << __FIO_OPT_G_NETIO),
-	FIO_OPT_G_LIBAIO	= (1U << __FIO_OPT_G_LIBAIO),
-	FIO_OPT_G_ACT		= (1U << __FIO_OPT_G_ACT),
-	FIO_OPT_G_LATPROF	= (1U << __FIO_OPT_G_LATPROF),
-	FIO_OPT_G_RBD		= (1U << __FIO_OPT_G_RBD),
-	FIO_OPT_G_GFAPI		= (1U << __FIO_OPT_G_GFAPI),
-	FIO_OPT_G_INVALID	= (1U << __FIO_OPT_G_NR),
-};
-
-extern struct opt_group *opt_group_from_mask(unsigned int *mask);
-extern struct opt_group *opt_group_cat_from_mask(unsigned int *mask);
-extern struct fio_option *fio_option_find(const char *name);
+extern struct fio_option *find_option(struct fio_option *, const char *);
+extern struct fio_option *fio_option_find(const char *);
 extern unsigned int fio_get_kb_base(void *);
 
 #endif
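The options.h hunk above drops the td_var_offset() macro (a cast through a null pointer) in favour of the standard offsetof(), and fio_option_is_set() now takes that field offset directly. A small sketch of what the offset computation does, using a two-field stand-in struct rather than fio's real struct thread_options:

#include <stddef.h>
#include <stdio.h>

struct thread_options_demo {
	int do_verify;
	unsigned int rate[3];
};

int main(void)
{
	/*
	 * The old macro computed roughly
	 *   (size_t) &((struct thread_options_demo *)0)->rate
	 * which relies on a null-pointer cast; offsetof() yields the same
	 * byte offset in a well-defined way.
	 */
	printf("do_verify at %zu, rate at %zu\n",
	       offsetof(struct thread_options_demo, do_verify),
	       offsetof(struct thread_options_demo, rate));
	return 0;
}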
diff --git a/os/os-aix.h b/os/os-aix.h
index 3d67765..e204d6f 100644
--- a/os/os-aix.h
+++ b/os/os-aix.h
@@ -14,8 +14,6 @@
 #define FIO_USE_GENERIC_RAND
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
 
-#define FIO_HAVE_PSHARED_MUTEX
-
 #define OS_MAP_ANON		MAP_ANON
 #define OS_MSG_DONTWAIT		0
 
@@ -23,7 +21,7 @@
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return EINVAL;
+	return ENOTSUP;
 }
 
 static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
diff --git a/os/os-android.h b/os/os-android.h
index 96ff5ba..1c3eb7d 100644
--- a/os/os-android.h
+++ b/os/os-android.h
@@ -12,20 +12,25 @@
 #include <fcntl.h>
 #include <errno.h>
 #include <sched.h>
+#include <linux/unistd.h>
 #include <linux/major.h>
 #include <asm/byteorder.h>
-#include <byteswap.h>
 
+#include "./os-linux-syscall.h"
 #include "binject.h"
 #include "../file.h"
 
+#ifndef __has_builtin
+  #define __has_builtin(x) 0  /* compatibility with compilers lacking __has_builtin() */
+#endif
+
 #define FIO_HAVE_DISK_UTIL
 #define FIO_HAVE_IOSCHED_SWITCH
 #define FIO_HAVE_IOPRIO
+#define FIO_HAVE_IOPRIO_CLASS
 #define FIO_HAVE_ODIRECT
 #define FIO_HAVE_HUGETLB
 #define FIO_HAVE_BLKTRACE
-#define FIO_HAVE_PSHARED_MUTEX
 #define FIO_HAVE_CL_SIZE
 #define FIO_HAVE_FS_STAT
 #define FIO_HAVE_TRIM
@@ -52,21 +57,28 @@
 #define MAP_HUGETLB 0x40000 /* arch specific */
 #endif
 
-
+#ifndef CONFIG_NO_SHM
 /*
  * The Android NDK doesn't currently export <sys/shm.h>, so define the
  * necessary stuff here.
  */
 
-#include <linux/shm.h>
-#define SHM_HUGETLB    04000
+#if __ANDROID_API__ >= 26
+#define shmat bionic_shmat
+#define shmctl bionic_shmctl
+#define shmdt bionic_shmdt
+#define shmget bionic_shmget
+#endif
+#include <sys/shm.h>
+#undef shmat
+#undef shmctl
+#undef shmdt
+#undef shmget
 
-#define shmid_ds shmid64_ds
-#undef __key
+#define SHM_HUGETLB    04000
 
 #include <stdio.h>
 #include <linux/ashmem.h>
-#include <sys/mman.h>
 
 #define ASHMEM_DEVICE	"/dev/ashmem"
 
@@ -86,14 +98,14 @@
 static inline int shmget (key_t __key, size_t __size, int __shmflg)
 {
 	int fd,ret;
-	char key[11];
-	
+	char keybuf[11];
+
 	fd = open(ASHMEM_DEVICE, O_RDWR);
 	if (fd < 0)
 		return fd;
 
-	sprintf(key,"%d",__key);
-	ret = ioctl(fd, ASHMEM_SET_NAME, key);
+	sprintf(keybuf,"%d",__key);
+	ret = ioctl(fd, ASHMEM_SET_NAME, keybuf);
 	if (ret < 0)
 		goto error;
 
@@ -102,7 +114,7 @@
 		goto error;
 
 	return fd;
-	
+
 error:
 	close(fd);
 	return ret;
@@ -124,6 +136,7 @@
 	size = *ptr;    //find mmap size which we stored at the beginning of the buffer
 	return munmap((void *)ptr, size + sizeof(size_t));
 }
+#endif
 
 #define SPLICE_DEF_SIZE	(64*1024)
 
@@ -143,6 +156,12 @@
 #define IOPRIO_BITS		16
 #define IOPRIO_CLASS_SHIFT	13
 
+#define IOPRIO_MIN_PRIO		0	/* highest priority */
+#define IOPRIO_MAX_PRIO		7	/* lowest priority */
+
+#define IOPRIO_MIN_PRIO_CLASS	0
+#define IOPRIO_MAX_PRIO_CLASS	3
+
 static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio)
 {
 	/*
@@ -215,9 +234,19 @@
 #define FIO_O_NOATIME	0
 #endif
 
+/* Check for GCC or Clang byte swap intrinsics */
+#if (__has_builtin(__builtin_bswap16) && __has_builtin(__builtin_bswap32) \
+     && __has_builtin(__builtin_bswap64)) || (__GNUC__ > 4 \
+     || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) /* fio_swapN */
+#define fio_swap16(x)	__builtin_bswap16(x)
+#define fio_swap32(x)	__builtin_bswap32(x)
+#define fio_swap64(x)	__builtin_bswap64(x)
+#else
+#include <byteswap.h>
 #define fio_swap16(x)	bswap_16(x)
 #define fio_swap32(x)	bswap_32(x)
 #define fio_swap64(x)	bswap_64(x)
+#endif /* fio_swapN */
 
 #define CACHE_LINE_FILE	\
 	"/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size"
@@ -241,7 +270,7 @@
 		return atoi(size);
 }
 
-static inline unsigned long long get_fs_size(const char *path)
+static inline unsigned long long get_fs_free_size(const char *path)
 {
 	unsigned long long ret;
 	struct statfs s;
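The os-android.h changes above prefer the compiler byte-swap intrinsics when either __has_builtin() reports them or GCC is at least 4.8, and fall back to <byteswap.h> otherwise. The stand-alone sketch below exercises the same detection pattern; demo_swap32 is a local stand-in, not fio's fio_swap32.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#ifndef __has_builtin
#define __has_builtin(x) 0	/* older GCC: take the <byteswap.h> fallback */
#endif

#if __has_builtin(__builtin_bswap32) || \
    (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))
#define demo_swap32(x)	__builtin_bswap32(x)
#else
#include <byteswap.h>
#define demo_swap32(x)	bswap_32(x)
#endif

int main(void)
{
	uint32_t v = 0x11223344u;

	printf("0x%08" PRIx32 " -> 0x%08" PRIx32 "\n", v, demo_swap32(v));
	return 0;
}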
diff --git a/os/os-dragonfly.h b/os/os-dragonfly.h
index cc3de31..8a116e6 100644
--- a/os/os-dragonfly.h
+++ b/os/os-dragonfly.h
@@ -4,23 +4,27 @@
 #define	FIO_OS	os_dragonfly
 
 #include <errno.h>
+#include <unistd.h>
 #include <sys/param.h>
-/* XXX hack to avoid confilcts between rbtree.h and <sys/rb.h> */
-#define	rb_node	_rb_node
 #include <sys/sysctl.h>
-#undef rb_node
-#undef rb_left
-#undef rb_right
+#include <sys/statvfs.h>
+#include <sys/diskslice.h>
+#include <sys/ioctl_compat.h>
+#include <sys/usched.h>
+#include <sys/resource.h>
 
 #include "../file.h"
 
 #define FIO_HAVE_ODIRECT
-#define FIO_USE_GENERIC_BDEV_SIZE
 #define FIO_USE_GENERIC_RAND
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
+#define FIO_HAVE_FS_STAT
+#define FIO_HAVE_TRIM
+#define FIO_HAVE_CHARDEV_SIZE
 #define FIO_HAVE_GETTID
-
-#undef	FIO_HAVE_CPU_AFFINITY	/* XXX notyet */
+#define FIO_HAVE_CPU_AFFINITY
+#define FIO_HAVE_IOPRIO
+#define FIO_HAVE_SHM_ATTACH_REMOVED
 
 #define OS_MAP_ANON		MAP_ANON
 
@@ -32,11 +36,155 @@
 #define fio_swap32(x)	bswap32(x)
 #define fio_swap64(x)	bswap64(x)
 
+/* This is supposed to equal (sizeof(cpumask_t)*8) */
+#define FIO_MAX_CPUS	SMP_MAXCPU
+
 typedef off_t off64_t;
+typedef cpumask_t os_cpu_mask_t;
+
+/*
+ * These macros are copied from sys/cpu/x86_64/include/types.h.
+ * It's okay to copy from an arch-dependent header because x86_64 is the only
+ * supported arch, and no other arch is going to be supported any time soon.
+ *
+ * These are supposed to be usable from userspace by defining
+ * _KERNEL_STRUCTURES, but that scheme is broken enough that enabling it
+ * causes compile-time conflicts with other headers. Although the current
+ * upstream code no longer requires _KERNEL_STRUCTURES, they should be kept
+ * here for compatibility with older versions.
+ */
+#ifndef CPUMASK_SIMPLE
+#define CPUMASK_SIMPLE(cpu)		((uint64_t)1 << (cpu))
+#define CPUMASK_TESTBIT(val, i)		((val).ary[((i) >> 6) & 3] & \
+					 CPUMASK_SIMPLE((i) & 63))
+#define CPUMASK_ORBIT(mask, i)		((mask).ary[((i) >> 6) & 3] |= \
+					 CPUMASK_SIMPLE((i) & 63))
+#define CPUMASK_NANDBIT(mask, i)	((mask).ary[((i) >> 6) & 3] &= \
+					 ~CPUMASK_SIMPLE((i) & 63))
+#define CPUMASK_ASSZERO(mask)		do {				\
+					(mask).ary[0] = 0;		\
+					(mask).ary[1] = 0;		\
+					(mask).ary[2] = 0;		\
+					(mask).ary[3] = 0;		\
+					} while(0)
+#endif
+
+/*
+ * Define USCHED_GET_CPUMASK as the macro didn't exist until release 4.5.
+ * usched_set(2) returns EINVAL if the kernel doesn't support it.
+ *
+ * Also note usched_set(2) works only for the current thread regardless of
+ * the command type. It doesn't work against another thread regardless of
+ * a caller's privilege. A caller generally specifies 0 as the pid for the
+ * current thread, though that's the only valid choice. See BUGS in usched_set(2).
+ */
+#ifndef USCHED_GET_CPUMASK
+#define USCHED_GET_CPUMASK	5
+#endif
+
+/* No CPU_COUNT(), but use the default function defined in os/os.h */
+#define fio_cpu_count(mask)             CPU_COUNT((mask))
+
+static inline int fio_cpuset_init(os_cpu_mask_t *mask)
+{
+	CPUMASK_ASSZERO(*mask);
+	return 0;
+}
+
+static inline int fio_cpuset_exit(os_cpu_mask_t *mask)
+{
+	return 0;
+}
+
+static inline void fio_cpu_clear(os_cpu_mask_t *mask, int cpu)
+{
+	CPUMASK_NANDBIT(*mask, cpu);
+}
+
+static inline void fio_cpu_set(os_cpu_mask_t *mask, int cpu)
+{
+	CPUMASK_ORBIT(*mask, cpu);
+}
+
+static inline int fio_cpu_isset(os_cpu_mask_t *mask, int cpu)
+{
+	if (CPUMASK_TESTBIT(*mask, cpu))
+		return 1;
+
+	return 0;
+}
+
+static inline int fio_setaffinity(int pid, os_cpu_mask_t mask)
+{
+	int i, firstcall = 1;
+
+	/* 0 for the current thread, see BUGS in usched_set(2) */
+	pid = 0;
+
+	for (i = 0; i < FIO_MAX_CPUS; i++) {
+		if (!CPUMASK_TESTBIT(mask, i))
+			continue;
+		if (firstcall) {
+			if (usched_set(pid, USCHED_SET_CPU, &i, sizeof(int)))
+				return -1;
+			firstcall = 0;
+		} else {
+			if (usched_set(pid, USCHED_ADD_CPU, &i, sizeof(int)))
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+static inline int fio_getaffinity(int pid, os_cpu_mask_t *mask)
+{
+	/* 0 for the current thread, see BUGS in usched_set(2) */
+	pid = 0;
+
+	if (usched_set(pid, USCHED_GET_CPUMASK, mask, sizeof(*mask)))
+		return -1;
+
+	return 0;
+}
+
+/* fio code is Linux based, so rename macros to Linux style */
+#define IOPRIO_WHO_PROCESS	PRIO_PROCESS
+#define IOPRIO_WHO_PGRP		PRIO_PGRP
+#define IOPRIO_WHO_USER		PRIO_USER
+
+#define IOPRIO_MIN_PRIO		1	/* lowest priority */
+#define IOPRIO_MAX_PRIO		10	/* highest priority */
+
+/*
+ * Prototypes declared in sys/sys/resource.h prevent us from defining
+ * ioprio_set() with 4 arguments, so define fio's ioprio_set() as a macro.
+ * Note that ioprio_set(2) has no notion of a priority class, unlike Linux.
+ */
+#define ioprio_set(which, who, ioprio_class, ioprio)	\
+	ioprio_set(which, who, ioprio)
+
+static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
+{
+	struct partinfo pi;
+
+	if (!ioctl(f->fd, DIOCGPART, &pi)) {
+		*bytes = (unsigned long long) pi.media_size;
+		return 0;
+	}
+
+	*bytes = 0;
+	return errno;
+}
+
+static inline int chardev_size(struct fio_file *f, unsigned long long *bytes)
+{
+	return blockdev_size(f, bytes);
+}
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return EINVAL;
+	return ENOTSUP;
 }
 
 static inline unsigned long long os_phys_mem(void)
@@ -54,8 +202,46 @@
 	return (int) lwp_gettid();
 }
 
+static inline unsigned long long get_fs_free_size(const char *path)
+{
+	unsigned long long ret;
+	struct statvfs s;
+
+	if (statvfs(path, &s) < 0)
+		return -1ULL;
+
+	ret = s.f_frsize;
+	ret *= (unsigned long long) s.f_bfree;
+	return ret;
+}
+
+static inline int os_trim(int fd, unsigned long long start,
+			  unsigned long long len)
+{
+	off_t range[2];
+
+	range[0] = start;
+	range[1] = len;
+
+	if (!ioctl(fd, IOCTLTRIM, range))
+		return 0;
+
+	return errno;
+}
+
 #ifdef MADV_FREE
 #define FIO_MADV_FREE	MADV_FREE
 #endif
 
+static inline int shm_attach_to_open_removed(void)
+{
+	int x;
+	size_t len = sizeof(x);
+
+	if (sysctlbyname("kern.ipc.shm_allow_removed", &x, &len, NULL, 0) < 0)
+		return 0;
+
+	return x > 0 ? 1 : 0;
+}
+
 #endif
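
The CPUMASK_* macros copied into os-dragonfly.h above index a 4 x 64-bit word array. A self-contained illustration follows; demo_mask_t merely mimics DragonFly's cpumask_t layout and is not part of the patch:

#include <assert.h>
#include <stdint.h>

typedef struct { uint64_t ary[4]; } demo_mask_t;	/* stand-in for cpumask_t */

#define CPUMASK_SIMPLE(cpu)	((uint64_t)1 << (cpu))
#define CPUMASK_ORBIT(mask, i)	((mask).ary[((i) >> 6) & 3] |= CPUMASK_SIMPLE((i) & 63))
#define CPUMASK_TESTBIT(val, i)	((val).ary[((i) >> 6) & 3] & CPUMASK_SIMPLE((i) & 63))

int main(void)
{
	demo_mask_t m = { { 0, 0, 0, 0 } };

	CPUMASK_ORBIT(m, 70);			/* CPU 70 -> word 1, bit 6 */
	assert(CPUMASK_TESTBIT(m, 70));
	assert(!CPUMASK_TESTBIT(m, 71));
	return 0;
}
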
diff --git a/os/os-freebsd.h b/os/os-freebsd.h
index 22765ce..c7863b5 100644
--- a/os/os-freebsd.h
+++ b/os/os-freebsd.h
@@ -10,6 +10,7 @@
 #include <sys/socket.h>
 #include <sys/param.h>
 #include <sys/cpuset.h>
+#include <sys/statvfs.h>
 
 #include "../file.h"
 
@@ -17,8 +18,11 @@
 #define FIO_USE_GENERIC_RAND
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
 #define FIO_HAVE_CHARDEV_SIZE
+#define FIO_HAVE_FS_STAT
+#define FIO_HAVE_TRIM
 #define FIO_HAVE_GETTID
 #define FIO_HAVE_CPU_AFFINITY
+#define FIO_HAVE_SHM_ATTACH_REMOVED
 
 #define OS_MAP_ANON		MAP_ANON
 
@@ -78,7 +82,7 @@
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return EINVAL;
+	return ENOTSUP;
 }
 
 static inline unsigned long long os_phys_mem(void)
@@ -99,8 +103,46 @@
 	return (int) lwpid;
 }
 
+static inline unsigned long long get_fs_free_size(const char *path)
+{
+	unsigned long long ret;
+	struct statvfs s;
+
+	if (statvfs(path, &s) < 0)
+		return -1ULL;
+
+	ret = s.f_frsize;
+	ret *= (unsigned long long) s.f_bfree;
+	return ret;
+}
+
+static inline int os_trim(int fd, unsigned long long start,
+			  unsigned long long len)
+{
+	off_t range[2];
+
+	range[0] = start;
+	range[1] = len;
+
+	if (!ioctl(fd, DIOCGDELETE, range))
+		return 0;
+
+	return errno;
+}
+
 #ifdef MADV_FREE
 #define FIO_MADV_FREE	MADV_FREE
 #endif
 
+static inline int shm_attach_to_open_removed(void)
+{
+	int x;
+	size_t len = sizeof(x);
+
+	if (sysctlbyname("kern.ipc.shm_allow_removed", &x, &len, NULL, 0) < 0)
+		return 0;
+
+	return x > 0 ? 1 : 0;
+}
+
 #endif
diff --git a/os/os-hpux.h b/os/os-hpux.h
index 82acd11..6a240b0 100644
--- a/os/os-hpux.h
+++ b/os/os-hpux.h
@@ -22,7 +22,6 @@
 #define FIO_HAVE_ODIRECT
 #define FIO_USE_GENERIC_RAND
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
-#define FIO_HAVE_PSHARED_MUTEX
 #define FIO_HAVE_CHARDEV_SIZE
 
 #define OS_MAP_ANON		MAP_ANONYMOUS
@@ -44,7 +43,7 @@
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return EINVAL;
+	return ENOTSUP;
 }
 
 static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
diff --git a/os/os-linux-syscall.h b/os/os-linux-syscall.h
new file mode 100644
index 0000000..c399b2f
--- /dev/null
+++ b/os/os-linux-syscall.h
@@ -0,0 +1,277 @@
+#ifndef FIO_OS_LINUX_SYSCALL_H
+#define FIO_OS_LINUX_SYSCALL_H
+
+#include "../arch/arch.h"
+
+/* Linux syscalls for x86 */
+#if defined(ARCH_X86_H)
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set		289
+#define __NR_ioprio_get		290
+#endif
+
+#ifndef __NR_fadvise64
+#define __NR_fadvise64		250
+#endif
+
+#ifndef __NR_sys_splice
+#define __NR_sys_splice		313
+#define __NR_sys_tee		315
+#define __NR_sys_vmsplice	316
+#endif
+
+#ifndef __NR_preadv2
+#define __NR_preadv2		378
+#endif
+#ifndef __NR_pwritev2
+#define __NR_pwritev2		379
+#endif
+
+/* Linux syscalls for x86_64 */
+#elif defined(ARCH_X86_64_H)
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set		251
+#define __NR_ioprio_get		252
+#endif
+
+#ifndef __NR_fadvise64
+#define __NR_fadvise64		221
+#endif
+
+#ifndef __NR_sys_splice
+#define __NR_sys_splice		275
+#define __NR_sys_tee		276
+#define __NR_sys_vmsplice	278
+#endif
+
+#ifndef __NR_shmget
+#define __NR_shmget		 29
+#define __NR_shmat		 30
+#define __NR_shmctl		 31
+#define __NR_shmdt		 67
+#endif
+
+#ifndef __NR_preadv2
+#define __NR_preadv2		327
+#endif
+#ifndef __NR_pwritev2
+#define __NR_pwritev2		328
+#endif
+
+/* Linux syscalls for ppc */
+#elif defined(ARCH_PPC_H)
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set		273
+#define __NR_ioprio_get		274
+#endif
+
+#ifndef __NR_fadvise64
+#define __NR_fadvise64		233
+#endif
+
+#ifndef __NR_sys_splice
+#define __NR_sys_splice		283
+#define __NR_sys_tee		284
+#define __NR_sys_vmsplice	285
+#endif
+
+/* Linux syscalls for ia64 */
+#elif defined(ARCH_IA64_H)
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set		1274
+#define __NR_ioprio_get		1275
+#endif
+
+#ifndef __NR_fadvise64
+#define __NR_fadvise64		1234
+#endif
+
+#ifndef __NR_sys_splice
+#define __NR_sys_splice		1297
+#define __NR_sys_tee		1301
+#define __NR_sys_vmsplice	1302
+#endif
+
+#ifndef __NR_preadv2
+#define __NR_preadv2		1348
+#endif
+#ifndef __NR_pwritev2
+#define __NR_pwritev2		1349
+#endif
+
+/* Linux syscalls for alpha */
+#elif defined(ARCH_ALPHA_H)
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set		442
+#define __NR_ioprio_get		443
+#endif
+
+#ifndef __NR_fadvise64
+#define __NR_fadvise64		413
+#endif
+
+#ifndef __NR_sys_splice
+#define __NR_sys_splice		468
+#define __NR_sys_tee		470
+#define __NR_sys_vmsplice	471
+#endif
+
+/* Linux syscalls for s390 */
+#elif defined(ARCH_S390_H)
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set		282
+#define __NR_ioprio_get		283
+#endif
+
+#ifndef __NR_fadvise64
+#define __NR_fadvise64		253
+#endif
+
+#ifndef __NR_sys_splice
+#define __NR_sys_splice		306
+#define __NR_sys_tee		308
+#define __NR_sys_vmsplice	309
+#endif
+
+#ifndef __NR_preadv2
+#define __NR_preadv2		376
+#endif
+#ifndef __NR_pwritev2
+#define __NR_pwritev2		377
+#endif
+
+/* Linux syscalls for sparc */
+#elif defined(ARCH_SPARC_H)
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set		196
+#define __NR_ioprio_get		218
+#endif
+
+#ifndef __NR_fadvise64
+#define __NR_fadvise64		209
+#endif
+
+#ifndef __NR_sys_splice
+#define __NR_sys_splice		232
+#define __NR_sys_tee		280
+#define __NR_sys_vmsplice	25
+#endif
+
+#ifndef __NR_preadv2
+#define __NR_preadv2		358
+#endif
+#ifndef __NR_pwritev2
+#define __NR_pwritev2		359
+#endif
+
+/* Linux syscalls for sparc64 */
+#elif defined(ARCH_SPARC64_H)
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set		196
+#define __NR_ioprio_get		218
+#endif
+
+#ifndef __NR_fadvise64
+#define __NR_fadvise64		209
+#endif
+
+#ifndef __NR_sys_splice
+#define __NR_sys_splice		232
+#define __NR_sys_tee		280
+#define __NR_sys_vmsplice	25
+#endif
+
+#ifndef __NR_preadv2
+#define __NR_preadv2		358
+#endif
+#ifndef __NR_pwritev2
+#define __NR_pwritev2		359
+#endif
+
+/* Linux syscalls for arm */
+#elif defined(ARCH_ARM_H)
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set		314
+#define __NR_ioprio_get		315
+#endif
+
+#ifndef __NR_fadvise64
+#define __NR_fadvise64		270
+#endif
+
+#ifndef __NR_sys_splice
+#define __NR_sys_splice		340
+#define __NR_sys_tee		342
+#define __NR_sys_vmsplice	343
+#endif
+
+#ifndef __NR_preadv2
+#define __NR_preadv2		392
+#endif
+#ifndef __NR_pwritev2
+#define __NR_pwritev2		393
+#endif
+
+/* Linux syscalls for mips */
+#elif defined(ARCH_MIPS64_H)
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set		314
+#define __NR_ioprio_get		315
+#endif
+
+#ifndef __NR_fadvise64
+#define __NR_fadvise64		215
+#endif
+
+#ifndef __NR_sys_splice
+#define __NR_sys_splice		263
+#define __NR_sys_tee		265
+#define __NR_sys_vmsplice	266
+#endif
+
+/* Linux syscalls for sh */
+#elif defined(ARCH_SH_H)
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set		288
+#define __NR_ioprio_get		289
+#endif
+
+#ifndef __NR_fadvise64
+#define __NR_fadvise64		250
+#endif
+
+#ifndef __NR_sys_splice
+#define __NR_sys_splice		313
+#define __NR_sys_tee		315
+#define __NR_sys_vmsplice	316
+#endif
+
+/* Linux syscalls for hppa */
+#elif defined(ARCH_HPPA_H)
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set		267
+#define __NR_ioprio_get		268
+#endif
+
+#ifndef __NR_fadvise64
+#define __NR_fadvise64		236
+#endif
+
+#ifndef __NR_sys_splice
+#define __NR_sys_splice		291
+#define __NR_sys_tee		293
+#define __NR_sys_vmsplice	294
+#endif
+
+/* Linux syscalls for aarch64 */
+#elif defined(ARCH_AARCH64_H)
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set		30
+#define __NR_ioprio_get		31
+#endif
+
+#else
+#warning "Unknown architecture"
+#endif
+
+#endif /* FIO_OS_LINUX_SYSCALL_H */
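
These __NR_* fallbacks exist so the os-linux headers can issue the calls through syscall(2) even when old libc headers don't define the numbers. A hedged sketch of how such a number is consumed (demo_ioprio_set is illustrative only; the real wrapper lives in os-linux.h):

#define _GNU_SOURCE
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_ioprio_set
#define __NR_ioprio_set	251	/* x86_64 value from the table above */
#endif

#define DEMO_IOPRIO_CLASS_SHIFT	13

static inline int demo_ioprio_set(int which, int who, int ioprio_class, int ioprio)
{
	/* pack class and priority the same way os-linux.h does */
	ioprio |= ioprio_class << DEMO_IOPRIO_CLASS_SHIFT;
	return syscall(__NR_ioprio_set, which, who, ioprio);
}
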
diff --git a/os/os-linux.h b/os/os-linux.h
index e193634..ba53590 100644
--- a/os/os-linux.h
+++ b/os/os-linux.h
@@ -6,6 +6,7 @@
 #include <sys/ioctl.h>
 #include <sys/uio.h>
 #include <sys/syscall.h>
+#include <sys/sysmacros.h>
 #include <sys/vfs.h>
 #include <sys/mman.h>
 #include <unistd.h>
@@ -15,21 +16,25 @@
 #include <linux/unistd.h>
 #include <linux/raw.h>
 #include <linux/major.h>
-#include <byteswap.h>
 
+#include "./os-linux-syscall.h"
 #include "binject.h"
 #include "../file.h"
 
+#ifndef __has_builtin         // Optional of course.
+  #define __has_builtin(x) 0  // Compatibility with non-clang compilers.
+#endif
+
 #define FIO_HAVE_CPU_AFFINITY
 #define FIO_HAVE_DISK_UTIL
 #define FIO_HAVE_SGIO
 #define FIO_HAVE_IOPRIO
+#define FIO_HAVE_IOPRIO_CLASS
 #define FIO_HAVE_IOSCHED_SWITCH
 #define FIO_HAVE_ODIRECT
 #define FIO_HAVE_HUGETLB
 #define FIO_HAVE_RAWBIND
 #define FIO_HAVE_BLKTRACE
-#define FIO_HAVE_PSHARED_MUTEX
 #define FIO_HAVE_CL_SIZE
 #define FIO_HAVE_CGROUPS
 #define FIO_HAVE_FS_STAT
@@ -37,6 +42,8 @@
 #define FIO_HAVE_BINJECT
 #define FIO_HAVE_GETTID
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
+#define FIO_HAVE_PWRITEV2
+#define FIO_HAVE_SHM_ATTACH_REMOVED
 
 #ifdef MAP_HUGETLB
 #define FIO_HAVE_MMAP_HUGE
@@ -94,6 +101,12 @@
 #define IOPRIO_BITS		16
 #define IOPRIO_CLASS_SHIFT	13
 
+#define IOPRIO_MIN_PRIO		0	/* highest priority */
+#define IOPRIO_MAX_PRIO		7	/* lowest priority */
+
+#define IOPRIO_MIN_PRIO_CLASS	0
+#define IOPRIO_MAX_PRIO_CLASS	3
+
 static inline int ioprio_set(int which, int who, int ioprio_class, int ioprio)
 {
 	/*
@@ -209,21 +222,19 @@
 #define FIO_MADV_FREE	MADV_REMOVE
 #endif
 
-#if defined(__builtin_bswap16)
+/* Check for GCC or Clang byte swap intrinsics */
+#if (__has_builtin(__builtin_bswap16) && __has_builtin(__builtin_bswap32) \
+     && __has_builtin(__builtin_bswap64)) || (__GNUC__ > 4 \
+     || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)) /* fio_swapN */
 #define fio_swap16(x)	__builtin_bswap16(x)
-#else
-#define fio_swap16(x)	__bswap_16(x)
-#endif
-#if defined(__builtin_bswap32)
 #define fio_swap32(x)	__builtin_bswap32(x)
-#else
-#define fio_swap32(x)	__bswap_32(x)
-#endif
-#if defined(__builtin_bswap64)
 #define fio_swap64(x)	__builtin_bswap64(x)
 #else
-#define fio_swap64(x)	__bswap_64(x)
-#endif
+#include <byteswap.h>
+#define fio_swap16(x)	bswap_16(x)
+#define fio_swap32(x)	bswap_32(x)
+#define fio_swap64(x)	bswap_64(x)
+#endif /* fio_swapN */
 
 #define CACHE_LINE_FILE	\
 	"/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size"
@@ -247,7 +258,7 @@
 		return atoi(size);
 }
 
-static inline unsigned long long get_fs_size(const char *path)
+static inline unsigned long long get_fs_free_size(const char *path)
 {
 	unsigned long long ret;
 	struct statfs s;
@@ -282,4 +293,70 @@
 }
 #endif
 
+#ifndef POSIX_FADV_STREAMID
+#define POSIX_FADV_STREAMID	8
+#endif
+
+#define FIO_HAVE_STREAMID
+
+#ifndef RWF_HIPRI
+#define RWF_HIPRI	0x00000001
+#endif
+#ifndef RWF_DSYNC
+#define RWF_DSYNC	0x00000002
+#endif
+#ifndef RWF_SYNC
+#define RWF_SYNC	0x00000004
+#endif
+
+#ifndef CONFIG_PWRITEV2
+#ifdef __NR_preadv2
+static inline void make_pos_h_l(unsigned long *pos_h, unsigned long *pos_l,
+				off_t offset)
+{
+#if BITS_PER_LONG == 64
+	*pos_l = offset;
+	*pos_h = 0;
+#else
+	*pos_l = offset & 0xffffffff;
+	*pos_h = ((uint64_t) offset) >> 32;
+#endif
+}
+static inline ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt,
+			      off_t offset, unsigned int flags)
+{
+	unsigned long pos_l, pos_h;
+
+	make_pos_h_l(&pos_h, &pos_l, offset);
+	return syscall(__NR_preadv2, fd, iov, iovcnt, pos_l, pos_h, flags);
+}
+static inline ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt,
+			       off_t offset, unsigned int flags)
+{
+	unsigned long pos_l, pos_h;
+
+	make_pos_h_l(&pos_h, &pos_l, offset);
+	return syscall(__NR_pwritev2, fd, iov, iovcnt, pos_l, pos_h, flags);
+}
+#else
+static inline ssize_t preadv2(int fd, const struct iovec *iov, int iovcnt,
+			      off_t offset, unsigned int flags)
+{
+	errno = ENOSYS;
+	return -1;
+}
+static inline ssize_t pwritev2(int fd, const struct iovec *iov, int iovcnt,
+			       off_t offset, unsigned int flags)
+{
+	errno = ENOSYS;
+	return -1;
+}
+#endif /* __NR_preadv2 */
+#endif /* CONFIG_PWRITEV2 */
+
+static inline int shm_attach_to_open_removed(void)
+{
+	return 1;
+}
+
 #endif
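
Usage sketch for the preadv2() emulation added above, assuming the wrapper, RWF_HIPRI, and the usual <sys/uio.h>/<errno.h> context from this header are in scope (demo_hipri_read is a hypothetical helper, not part of the patch):

static ssize_t demo_hipri_read(int fd, void *buf, size_t len, off_t off)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	ssize_t ret;

	ret = preadv2(fd, &iov, 1, off, RWF_HIPRI);	/* polled, high-priority read */
	if (ret < 0 && errno == ENOSYS)
		ret = pread(fd, buf, len, off);		/* kernel predates preadv2 */

	return ret;
}
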
diff --git a/os/os-mac.h b/os/os-mac.h
index d202e99..7de36ea 100644
--- a/os/os-mac.h
+++ b/os/os-mac.h
@@ -35,76 +35,9 @@
 
 typedef off_t off64_t;
 
-/* OS X as of 10.6 doesn't have the timer_* functions. 
- * Emulate the functionality using setitimer and sigaction here
- */
-
-#define MAX_TIMERS 64
-
+#ifndef CONFIG_CLOCKID_T
 typedef unsigned int clockid_t;
-typedef unsigned int timer_t;
-
-struct itimerspec {
-	struct timespec it_value;
-	struct timespec it_interval;
-};
-
-static struct sigevent fio_timers[MAX_TIMERS];
-static unsigned int num_timers = 0;
-
-static void sig_alrm(int signum)
-{
-	union sigval sv;
-	
-	for (int i = 0; i < num_timers; i++) {
-		if (fio_timers[i].sigev_notify_function == NULL)
-			continue;
-		
-		if (fio_timers[i].sigev_notify == SIGEV_THREAD)
-			fio_timers[i].sigev_notify_function(sv);
-		else if (fio_timers[i].sigev_notify == SIGEV_SIGNAL)
-			kill(getpid(), fio_timers[i].sigev_signo);
-	}
-}
-
-static inline int timer_settime(timer_t timerid, int flags,
-				const struct itimerspec *value,
-				struct itimerspec *ovalue)
-{
-	struct sigaction sa;
-	struct itimerval tv;
-	struct itimerval tv_out;
-	int rc;
-	
-	tv.it_interval.tv_sec = value->it_interval.tv_sec;
-	tv.it_interval.tv_usec = value->it_interval.tv_nsec / 1000;
-
-	tv.it_value.tv_sec = value->it_value.tv_sec;
-	tv.it_value.tv_usec = value->it_value.tv_nsec / 1000;
-
-	sa.sa_handler = sig_alrm;
-	sigemptyset(&sa.sa_mask);
-	sa.sa_flags = 0;
-	
-	rc = sigaction(SIGALRM, &sa, NULL);
-
-	if (!rc)
-		rc = setitimer(ITIMER_REAL, &tv, &tv_out);
-	
-	if (!rc && ovalue != NULL) {
-		ovalue->it_interval.tv_sec = tv_out.it_interval.tv_sec;
-		ovalue->it_interval.tv_nsec = tv_out.it_interval.tv_usec * 1000;
-		ovalue->it_value.tv_sec = tv_out.it_value.tv_sec;
-		ovalue->it_value.tv_nsec = tv_out.it_value.tv_usec * 1000;
-	}
-
-	return rc;
-}
-
-static inline int timer_delete(timer_t timer)
-{
-	return 0;
-}
+#endif
 
 #define FIO_OS_DIRECTIO
 static inline int fio_set_odirect(int fd)
@@ -144,7 +77,7 @@
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return EINVAL;
+	return ENOTSUP;
 }
 
 static inline unsigned long long os_phys_mem(void)
diff --git a/os/os-netbsd.h b/os/os-netbsd.h
index 4b0269e..7be02a7 100644
--- a/os/os-netbsd.h
+++ b/os/os-netbsd.h
@@ -6,6 +6,10 @@
 #include <errno.h>
 #include <lwp.h>
 #include <sys/param.h>
+#include <sys/statvfs.h>
+#include <sys/ioctl.h>
+#include <sys/dkio.h>
+#include <sys/disklabel.h>
 /* XXX hack to avoid conflicts between rbtree.h and <sys/rb.h> */
 #define	rb_node	_rb_node
 #include <sys/sysctl.h>
@@ -16,12 +20,12 @@
 #include "../file.h"
 
 #define FIO_HAVE_ODIRECT
-#define FIO_USE_GENERIC_BDEV_SIZE
 #define FIO_USE_GENERIC_RAND
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
+#define FIO_HAVE_FS_STAT
 #define FIO_HAVE_GETTID
 
-#undef	FIO_HAVE_CPU_AFFINITY	/* XXX notyet */
+#undef	FIO_HAVE_CPU_AFFINITY	/* doesn't exist */
 
 #define OS_MAP_ANON		MAP_ANON
 
@@ -35,9 +39,22 @@
 
 typedef off_t off64_t;
 
+static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
+{
+	struct disklabel dl;
+
+	if (!ioctl(f->fd, DIOCGDINFO, &dl)) {
+		*bytes = ((unsigned long long)dl.d_secperunit) * dl.d_secsize;
+		return 0;
+	}
+
+	*bytes = 0;
+	return errno;
+}
+
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return EINVAL;
+	return ENOTSUP;
 }
 
 static inline unsigned long long os_phys_mem(void)
@@ -55,11 +72,21 @@
 	return (int) _lwp_self();
 }
 
+static inline unsigned long long get_fs_free_size(const char *path)
+{
+	unsigned long long ret;
+	struct statvfs s;
+
+	if (statvfs(path, &s) < 0)
+		return -1ULL;
+
+	ret = s.f_frsize;
+	ret *= (unsigned long long) s.f_bfree;
+	return ret;
+}
+
 #ifdef MADV_FREE
 #define FIO_MADV_FREE	MADV_FREE
 #endif
 
-/* XXX NetBSD doesn't have getopt_long_only */
-#define	getopt_long_only	getopt_long
-
 #endif
diff --git a/os/os-openbsd.h b/os/os-openbsd.h
index b1d8e83..d874ee2 100644
--- a/os/os-openbsd.h
+++ b/os/os-openbsd.h
@@ -5,6 +5,11 @@
 
 #include <errno.h>
 #include <sys/param.h>
+#include <sys/statvfs.h>
+#include <sys/ioctl.h>
+#include <sys/dkio.h>
+#include <sys/disklabel.h>
+#include <sys/utsname.h>
 /* XXX hack to avoid conflicts between rbtree.h and <sys/tree.h> */
 #include <sys/sysctl.h>
 #undef RB_BLACK
@@ -14,12 +19,13 @@
 #include "../file.h"
 
 #undef  FIO_HAVE_ODIRECT
-#define FIO_USE_GENERIC_BDEV_SIZE
 #define FIO_USE_GENERIC_RAND
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
+#define FIO_HAVE_FS_STAT
 #define FIO_HAVE_GETTID
+#define FIO_HAVE_SHM_ATTACH_REMOVED
 
-#undef	FIO_HAVE_CPU_AFFINITY	/* XXX notyet */
+#undef	FIO_HAVE_CPU_AFFINITY	/* doesn't exist */
 
 #define OS_MAP_ANON		MAP_ANON
 
@@ -33,9 +39,22 @@
 
 typedef off_t off64_t;
 
+static inline int blockdev_size(struct fio_file *f, unsigned long long *bytes)
+{
+	struct disklabel dl;
+
+	if (!ioctl(f->fd, DIOCGDINFO, &dl)) {
+		*bytes = ((unsigned long long)dl.d_secperunit) * dl.d_secsize;
+		return 0;
+	}
+
+	*bytes = 0;
+	return errno;
+}
+
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return EINVAL;
+	return ENOTSUP;
 }
 
 static inline unsigned long long os_phys_mem(void)
@@ -50,11 +69,54 @@
 
 static inline int gettid(void)
 {
-	return (int) pthread_self();
+	return (int)(intptr_t) pthread_self();
+}
+
+static inline unsigned long long get_fs_free_size(const char *path)
+{
+	unsigned long long ret;
+	struct statvfs s;
+
+	if (statvfs(path, &s) < 0)
+		return -1ULL;
+
+	ret = s.f_frsize;
+	ret *= (unsigned long long) s.f_bfree;
+	return ret;
 }
 
 #ifdef MADV_FREE
 #define FIO_MADV_FREE	MADV_FREE
 #endif
 
+static inline int shm_attach_to_open_removed(void)
+{
+	struct utsname uts;
+	int major, minor;
+
+	if (uname(&uts) == -1)
+		return 0;
+
+	/*
+	 * Return 1 if >= OpenBSD 5.1 according to 97900ebf,
+	 * assuming both major/minor versions are < 10.
+	 */
+	if (uts.release[0] > '9' || uts.release[0] < '0')
+		return 0;
+	if (uts.release[1] != '.')
+		return 0;
+	if (uts.release[2] > '9' || uts.release[2] < '0')
+		return 0;
+
+	major = uts.release[0] - '0';
+	minor = uts.release[2] - '0';
+
+	if (major > 5)
+		return 1;
+	if (major == 5 && minor >= 1)
+		return 1;
+
+	return 0;
+}
+
 #endif
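
The uname-based check above boils down to a single-digit "major.minor" comparison against OpenBSD 5.1. An equivalent standalone form, for illustration only (demo_allows_removed_shm is not part of the patch):

#include <assert.h>

static int demo_allows_removed_shm(const char *release)
{
	int major, minor;

	if (release[0] < '0' || release[0] > '9' || release[1] != '.' ||
	    release[2] < '0' || release[2] > '9')
		return 0;

	major = release[0] - '0';
	minor = release[2] - '0';
	return major > 5 || (major == 5 && minor >= 1);
}

int main(void)
{
	assert(demo_allows_removed_shm("6.2"));
	assert(demo_allows_removed_shm("5.1"));
	assert(!demo_allows_removed_shm("5.0"));
	return 0;
}
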
diff --git a/os/os-solaris.h b/os/os-solaris.h
index 5b78cc2..8f8f53b 100644
--- a/os/os-solaris.h
+++ b/os/os-solaris.h
@@ -16,7 +16,6 @@
 #include "../file.h"
 
 #define FIO_HAVE_CPU_AFFINITY
-#define FIO_HAVE_PSHARED_MUTEX
 #define FIO_HAVE_CHARDEV_SIZE
 #define FIO_USE_GENERIC_BDEV_SIZE
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
@@ -61,7 +60,7 @@
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	return 0;
+	return ENOTSUP;
 }
 
 static inline unsigned long long os_phys_mem(void)
diff --git a/os/os-windows.h b/os/os-windows.h
index 6603635..0c8c42d 100644
--- a/os/os-windows.h
+++ b/os/os-windows.h
@@ -16,9 +16,15 @@
 #include "../file.h"
 #include "../log.h"
 #include "../lib/hweight.h"
+#include "../oslib/strcasestr.h"
 
 #include "windows/posix.h"
 
+/* Cygwin doesn't define rand_r if C99 or newer is being used */
+#if defined(WIN32) && !defined(rand_r)
+int rand_r(unsigned *);
+#endif
+
 #ifndef PTHREAD_STACK_MIN
 #define PTHREAD_STACK_MIN 65535
 #endif
@@ -105,6 +111,7 @@
 int fdatasync(int fildes);
 int lstat(const char * path, struct stat * buf);
 uid_t geteuid(void);
+char* ctime_r(const time_t *t, char *buf);
 int nanosleep(const struct timespec *rqtp, struct timespec *rmtp);
 ssize_t pread(int fildes, void *buf, size_t nbyte, off_t offset);
 ssize_t pwrite(int fildes, const void *buf, size_t nbyte,
@@ -145,9 +152,7 @@
 
 static inline int blockdev_invalidate_cache(struct fio_file *f)
 {
-	/* There's no way to invalidate the cache in Windows
-	 * so just pretend to succeed */
-	return 0;
+	return ENOTSUP;
 }
 
 static inline unsigned long long os_phys_mem(void)
@@ -186,7 +191,7 @@
 	return (bSuccess)? 0 : -1;
 }
 
-static inline void fio_getaffinity(int pid, os_cpu_mask_t *mask)
+static inline int fio_getaffinity(int pid, os_cpu_mask_t *mask)
 {
 	os_cpu_mask_t systemMask;
 
@@ -197,7 +202,10 @@
 		CloseHandle(h);
 	} else {
 		log_err("fio_getaffinity failed: failed to get handle for pid %d\n", pid);
+		return -1;
 	}
+
+	return 0;
 }
 
 static inline void fio_cpu_clear(os_cpu_mask_t *mask, int cpu)
diff --git a/os/os.h b/os/os.h
index 7cb8121..5e3c813 100644
--- a/os/os.h
+++ b/os/os.h
@@ -9,6 +9,7 @@
 #include <stdlib.h>
 
 #include "../arch/arch.h"
+#include "../lib/types.h"
 
 enum {
 	os_linux = 1,
@@ -65,7 +66,11 @@
 #endif
 
 #ifndef CONFIG_STRSEP
-#include "../lib/strsep.h"
+#include "../oslib/strsep.h"
+#endif
+
+#ifndef CONFIG_STRLCAT
+#include "../oslib/strlcat.h"
 #endif
 
 #ifdef MSG_DONTWAIT
@@ -76,15 +81,32 @@
 #define POSIX_FADV_DONTNEED	(0)
 #define POSIX_FADV_SEQUENTIAL	(0)
 #define POSIX_FADV_RANDOM	(0)
+#define POSIX_FADV_NORMAL	(0)
 #endif
 
 #ifndef FIO_HAVE_CPU_AFFINITY
-#define fio_setaffinity(pid, mask)	(0)
-#define fio_getaffinity(pid, mask)	do { } while (0)
 #define fio_cpu_clear(mask, cpu)	do { } while (0)
-#define fio_cpuset_exit(mask)		(-1)
-#define fio_cpus_split(mask, cpu)	(0)
 typedef unsigned long os_cpu_mask_t;
+
+static inline int fio_setaffinity(int pid, os_cpu_mask_t cpumask)
+{
+	return 0;
+}
+
+static inline int fio_getaffinity(int pid, os_cpu_mask_t *cpumask)
+{
+	return -1;
+}
+
+static inline int fio_cpuset_exit(os_cpu_mask_t *mask)
+{
+	return -1;
+}
+
+static inline int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu_index)
+{
+	return 0;
+}
 #else
 extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu);
 #endif
@@ -134,7 +156,7 @@
 #endif
 
 #ifndef FIO_PREFERRED_ENGINE
-#define FIO_PREFERRED_ENGINE	"sync"
+#define FIO_PREFERRED_ENGINE	"psync"
 #endif
 
 #ifndef FIO_OS_PATH_SEPARATOR
@@ -150,7 +172,7 @@
 #endif
 
 #ifndef FIO_MAX_JOBS
-#define FIO_MAX_JOBS		2048
+#define FIO_MAX_JOBS		4096
 #endif
 
 #ifndef CONFIG_SOCKLEN_T
@@ -320,12 +342,20 @@
 #endif
 
 #ifndef FIO_HAVE_FS_STAT
-static inline unsigned long long get_fs_size(const char *path)
+static inline unsigned long long get_fs_free_size(const char *path)
 {
 	return 0;
 }
 #endif
 
+#ifdef __powerpc64__
+#define FIO_HAVE_CPU_ONLINE_SYSCONF
+static inline unsigned int cpus_online(void)
+{
+        return sysconf(_SC_NPROCESSORS_CONF);
+}
+#endif
+
 #ifndef FIO_HAVE_CPU_ONLINE_SYSCONF
 static inline unsigned int cpus_online(void)
 {
@@ -356,4 +386,11 @@
 }
 #endif
 
+#ifndef FIO_HAVE_SHM_ATTACH_REMOVED
+static inline int shm_attach_to_open_removed(void)
+{
+	return 0;
+}
+#endif
+
 #endif
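
os.h follows one pattern throughout this hunk: a per-platform header defines FIO_HAVE_<feature> along with its own implementation, and os.h supplies a conservative fallback otherwise. Sketch with a hypothetical FIO_HAVE_WIDGET capability (not a real fio macro):

#ifndef FIO_HAVE_WIDGET
static inline int widget_supported(void)
{
	return 0;	/* platform header did not claim the feature */
}
#endif
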
diff --git a/os/windows/eula.rtf b/os/windows/eula.rtf
index cc7be7f..1c92932 100755
--- a/os/windows/eula.rtf
+++ b/os/windows/eula.rtf
Binary files differ
diff --git a/os/windows/examples.wxs b/os/windows/examples.wxs
index a21182a..cc2ff5c 100755
--- a/os/windows/examples.wxs
+++ b/os/windows/examples.wxs
@@ -9,46 +9,109 @@
                     <File Source="..\..\examples\aio-read.fio" />
                 </Component>
                 <Component>
+                    <File Source="..\..\examples\backwards-read.fio" />
+                </Component>
+                <Component>
+                    <File Source="..\..\examples\basic-verify.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\cpuio.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\dev-dax.fio" />
+                </Component>
+                <Component>
                     <File Source="..\..\examples\disk-zone-profile.fio" />
                 </Component>
                 <Component>
+                  <File Source="..\..\examples\e4defrag.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\e4defrag2.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\enospc-pressure.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\falloc.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\fixed-rate-submission.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\flow.fio" />
+                </Component>
+                <Component>
                     <File Source="..\..\examples\fsx.fio" />
                 </Component>
                 <Component>
+                  <File Source="..\..\examples\fusion-aw-sync.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\gfapi.fio" />
+                </Component>
+                <Component>
                     <File Source="..\..\examples\iometer-file-access-server.fio" />
                 </Component>
                 <Component>
+                  <File Source="..\..\examples\jesd219.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\latency-profile.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\libhdfs.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\mtd.fio" />
+                </Component>
+                <Component>
                     <File Source="..\..\examples\netio.fio" />
                 </Component>
                 <Component>
                     <File Source="..\..\examples\netio_multicast.fio" />
                 </Component>
                 <Component>
+                  <File Source="..\..\examples\null.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\numa.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\pmemblk.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\poisson-rate-submission.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\rand-zones.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\rbd.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\rdmaio-client.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\rdmaio-server.fio" />
+                </Component>
+                <Component>
+                  <File Source="..\..\examples\ssd-steadystate.fio" />
+                </Component>
+                <Component>
                     <File Source="..\..\examples\ssd-test.fio" />
                 </Component>
                 <Component>
+                  <File Source="..\..\examples\steadystate.fio" />
+                </Component>
+                <Component>
                     <File Source="..\..\examples\surface-scan.fio" />
                 </Component>
                 <Component>
                     <File Source="..\..\examples\tiobench-example.fio" />
                 </Component>
                 <Component>
-                  <File Source="..\..\examples\null.fio" />
-                </Component>
-                <Component>
-                  <File Source="..\..\examples\flow.fio" />
-                </Component>
-                <Component>
-                  <File Source="..\..\examples\cpuio.fio" />
-                </Component>
-                <Component>
-                  <File Source="..\..\examples\falloc.fio" />
-                </Component>
-                <Component>
-                  <File Source="..\..\examples\fusion-aw-sync.fio" />
-                </Component>
-                <Component>
-                  <File Source="..\..\examples\ssd-steadystate.fio" />
+                  <File Source="..\..\examples\waitfor.fio" />
                 </Component>
                 <Component>
                   <File Source="..\..\examples\zipf.fio" />
@@ -59,20 +122,41 @@
         <ComponentGroup Id="examples">
             <ComponentRef Id="_1mbs_clients.fio" />
             <ComponentRef Id="aio_read.fio" />
+            <ComponentRef Id="backwards_read.fio" />
+            <ComponentRef Id="basic_verify.fio" />
+            <ComponentRef Id="cpuio.fio" />
+            <ComponentRef Id="dev_dax.fio" />
             <ComponentRef Id="disk_zone_profile.fio" />
+            <ComponentRef Id="e4defrag.fio" />
+            <ComponentRef Id="e4defrag2.fio" />
+            <ComponentRef Id="enospc_pressure.fio" />
+            <ComponentRef Id="falloc.fio" />
+            <ComponentRef Id="fixed_rate_submission.fio" />
+            <ComponentRef Id="flow.fio" />
             <ComponentRef Id="fsx.fio" />
+            <ComponentRef Id="fusion_aw_sync.fio" />
+            <ComponentRef Id="gfapi.fio" />
             <ComponentRef Id="iometer_file_access_server.fio" />
+            <ComponentRef Id="jesd219.fio" />
+            <ComponentRef Id="latency_profile.fio" />
+            <ComponentRef Id="libhdfs.fio" />
+            <ComponentRef Id="mtd.fio" />
             <ComponentRef Id="netio.fio" />
             <ComponentRef Id="netio_multicast.fio" />
+            <ComponentRef Id="null.fio" />
+            <ComponentRef Id="numa.fio" />
+            <ComponentRef Id="pmemblk.fio" />
+            <ComponentRef Id="poisson_rate_submission.fio" />
+            <ComponentRef Id="rand_zones.fio" />
+            <ComponentRef Id="rbd.fio" />
+            <ComponentRef Id="rdmaio_client.fio" />
+            <ComponentRef Id="rdmaio_server.fio" />
+            <ComponentRef Id="ssd_steadystate.fio" />
             <ComponentRef Id="ssd_test.fio" />
+            <ComponentRef Id="steadystate.fio" />
             <ComponentRef Id="surface_scan.fio" />
             <ComponentRef Id="tiobench_example.fio" />
-            <ComponentRef Id="null.fio" />
-            <ComponentRef Id="flow.fio" />
-            <ComponentRef Id="cpuio.fio" />
-            <ComponentRef Id="falloc.fio" />
-            <ComponentRef Id="fusion_aw_sync.fio" />
-            <ComponentRef Id="ssd_steadystate.fio" />
+            <ComponentRef Id="waitfor.fio" />
             <ComponentRef Id="zipf.fio" />
         </ComponentGroup>
     </Fragment>
diff --git a/os/windows/install.wxs b/os/windows/install.wxs
index 74f1d28..05d2a83 100755
--- a/os/windows/install.wxs
+++ b/os/windows/install.wxs
@@ -10,7 +10,7 @@
 	<Product Id="*"
 	  Codepage="1252" Language="1033"
 	  Manufacturer="fio" Name="fio"
-	  UpgradeCode="2338A332-5511-43CF-B9BD-5C60496CCFCC" Version="2.2.6">
+	  UpgradeCode="2338A332-5511-43CF-B9BD-5C60496CCFCC" Version="2.20">
 		<Package
 		  Description="Flexible IO Tester"
 		  InstallerVersion="301" Keywords="Installer,MSI,Database"
@@ -40,7 +40,7 @@
 							<File Id="COPYING" Name="COPYING.txt" Source="..\..\COPYING"/>
 						</Component>
 						<Component>
-							<File Id="LICENSE" Name="LICENSE.txt" Source="..\..\LICENSE"/>
+							<File Id="MORAL_LICENSE" Name="MORAL-LICENSE.txt" Source="..\..\MORAL-LICENSE"/>
 						</Component>
 						<Directory Id="examples" Name="examples"/>
 					</Directory>
@@ -54,11 +54,11 @@
 		<ComponentRef Id="README"/>
 		<ComponentRef Id="REPORTING_BUGS"/>
 		<ComponentRef Id="COPYING"/>
-		<ComponentRef Id="LICENSE"/>
+		<ComponentRef Id="MORAL_LICENSE"/>
 		<ComponentGroupRef Id="examples"/>
 	</Feature>
 
-	<Property Id="ARPURLINFOABOUT" Value="http://git.kernel.dk/?p=fio.git" />
+	<Property Id="ARPURLINFOABOUT" Value="http://git.kernel.dk/cgit/fio/" />
 	<Property Id='ARPCONTACT'>fio@vger.kernel.org</Property>
 	<Property Id='ARPHELPLINK'>http://www.spinics.net/lists/fio/</Property>
 	<Property Id='ARPURLUPDATEINFO'>http://bluestop.org/fio/</Property>
diff --git a/os/windows/posix.c b/os/windows/posix.c
index d238c64..eae8c86 100755
--- a/os/windows/posix.c
+++ b/os/windows/posix.c
@@ -40,12 +40,6 @@
   const char *pszFormat,
   ...);
 
-int vsprintf_s(
-  char *buffer,
-  size_t numberOfElements,
-  const char *format,
-  va_list argptr);
-
 int win_to_posix_error(DWORD winerr)
 {
 	switch (winerr)
@@ -229,6 +223,32 @@
 	return dl_error;
 }
 
+/* Copied from http://blogs.msdn.com/b/joshpoley/archive/2007/12/19/date-time-formats-and-conversions.aspx */
+void Time_tToSystemTime(time_t dosTime, SYSTEMTIME *systemTime)
+{
+    FILETIME utcFT;
+    LONGLONG jan1970;
+
+    jan1970 = Int32x32To64(dosTime, 10000000) + 116444736000000000;
+    utcFT.dwLowDateTime = (DWORD)jan1970;
+    utcFT.dwHighDateTime = jan1970 >> 32;
+
+    FileTimeToSystemTime((FILETIME*)&utcFT, systemTime);
+}
+
+char* ctime_r(const time_t *t, char *buf)
+{
+    SYSTEMTIME systime;
+    const char * const dayOfWeek[] = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" };
+    const char * const monthOfYear[] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
+
+    Time_tToSystemTime(*t, &systime);
+    /* We don't know how long `buf` is, but assume it's rounded up from the minimum of 25 to 32 */
+    StringCchPrintfA(buf, 31, "%s %s %d %02d:%02d:%02d %04d\n", dayOfWeek[systime.wDayOfWeek % 7], monthOfYear[(systime.wMonth - 1) % 12],
+										 systime.wDay, systime.wHour, systime.wMinute, systime.wSecond, systime.wYear);
+    return buf;
+}
+
 int gettimeofday(struct timeval *restrict tp, void *restrict tzp)
 {
 	FILETIME fileTime;
@@ -278,35 +298,76 @@
 		int fildes, off_t off)
 {
 	DWORD vaProt = 0;
+	DWORD mapAccess = 0;
+	DWORD lenlow;
+	DWORD lenhigh;
+	HANDLE hMap;
 	void* allocAddr = NULL;
 
 	if (prot & PROT_NONE)
 		vaProt |= PAGE_NOACCESS;
 
-	if ((prot & PROT_READ) && !(prot & PROT_WRITE))
+	if ((prot & PROT_READ) && !(prot & PROT_WRITE)) {
 		vaProt |= PAGE_READONLY;
+		mapAccess = FILE_MAP_READ;
+	}
 
-	if (prot & PROT_WRITE)
+	if (prot & PROT_WRITE) {
 		vaProt |= PAGE_READWRITE;
+		mapAccess |= FILE_MAP_WRITE;
+	}
 
-	if ((flags & MAP_ANON) | (flags & MAP_ANONYMOUS))
+	lenlow = len & 0xFFFF;
+	lenhigh = len >> 16;
+	/* If the low DWORD is zero and the high DWORD is non-zero, `CreateFileMapping`
+	   will return ERROR_INVALID_PARAMETER. To avoid this, set both to zero. */
+	if (lenlow == 0) {
+		lenhigh = 0;
+	}
+
+	if (flags & MAP_ANON || flags & MAP_ANONYMOUS)
 	{
 		allocAddr = VirtualAlloc(addr, len, MEM_COMMIT, vaProt);
 		if (allocAddr == NULL)
 			errno = win_to_posix_error(GetLastError());
 	}
+	else
+	{
+		hMap = CreateFileMapping((HANDLE)_get_osfhandle(fildes), NULL, vaProt, lenhigh, lenlow, NULL);
+
+		if (hMap != NULL)
+		{
+			allocAddr = MapViewOfFile(hMap, mapAccess, off >> 16, off & 0xFFFF, len);
+		}
+
+		if (hMap == NULL || allocAddr == NULL)
+			errno = win_to_posix_error(GetLastError());
+
+	}
 
 	return allocAddr;
 }
 
 int munmap(void *addr, size_t len)
 {
-	if (!VirtualFree(addr, 0, MEM_RELEASE)) {
-		errno = win_to_posix_error(GetLastError());
-		return -1;
+	BOOL success;
+
+	/* We may have allocated the memory with either MapViewOfFile or
+		 VirtualAlloc. Therefore, try calling UnmapViewOfFile first, and if that
+		 fails, call VirtualFree. */
+	success = UnmapViewOfFile(addr);
+
+	if (!success)
+	{
+		success = VirtualFree(addr, 0, MEM_RELEASE);
 	}
 
-	return 0;
+	return !success;
+}
+
+int msync(void *addr, size_t len, int flags)
+{
+	return !FlushViewOfFile(addr, len);
 }
 
 int fork(void)
@@ -621,10 +682,19 @@
 
 int nice(int incr)
 {
-	if (incr != 0) {
-		errno = EINVAL;
-		return -1;
-	}
+	DWORD prioclass = NORMAL_PRIORITY_CLASS;
+	
+	if (incr < -15)
+		prioclass = HIGH_PRIORITY_CLASS;
+	else if (incr < 0)
+		prioclass = ABOVE_NORMAL_PRIORITY_CLASS;
+	else if (incr > 15)
+		prioclass = IDLE_PRIORITY_CLASS;
+	else if (incr > 0)
+		prioclass = BELOW_NORMAL_PRIORITY_CLASS;
+	
+	if (!SetPriorityClass(GetCurrentProcess(), prioclass))
+		log_err("fio: SetPriorityClass failed\n");
 
 	return 0;
 }
@@ -667,17 +737,9 @@
 
 int posix_madvise(void *addr, size_t len, int advice)
 {
-	log_err("%s is not implemented\n", __func__);
 	return ENOSYS;
 }
 
-/* Windows doesn't support advice for memory pages. Just ignore it. */
-int msync(void *addr, size_t len, int flags)
-{
-	errno = ENOSYS;
-	return -1;
-}
-
 int fdatasync(int fildes)
 {
 	return fsync(fildes);
@@ -864,7 +926,7 @@
 
 	if (dirp->find_handle == INVALID_HANDLE_VALUE) {
 		char search_pattern[MAX_PATH];
-		StringCchPrintfA(search_pattern, MAX_PATH, "%s\\*", dirp->dirname);
+		StringCchPrintfA(search_pattern, MAX_PATH-1, "%s\\*", dirp->dirname);
 		dirp->find_handle = FindFirstFileA(search_pattern, &find_data);
 		if (dirp->find_handle == INVALID_HANDLE_VALUE)
 			return NULL;
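
The ctime_r() shim added to posix.c above follows the POSIX contract: the caller supplies a buffer of at least 26 bytes and gets back a newline-terminated timestamp. A small usage sketch (demo_print_time is hypothetical):

#include <stdio.h>
#include <time.h>

static void demo_print_time(time_t t)
{
	char buf[32];	/* >= 26 bytes, per POSIX ctime_r() */

	if (ctime_r(&t, buf))
		fputs(buf, stdout);	/* prints a ctime(3)-style line */
}
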
diff --git a/os/windows/posix/include/sys/ioctl.h b/os/windows/posix/include/sys/ioctl.h
new file mode 100644
index 0000000..a42247d
--- /dev/null
+++ b/os/windows/posix/include/sys/ioctl.h
@@ -0,0 +1,7 @@
+#ifndef IOCTL_H
+#define IOCTL_H
+
+/* This file is empty since it only needs to exist on Windows
+   but isn't otherwise used */
+
+#endif /* IOCTL_H */
\ No newline at end of file
diff --git a/lib/getopt.h b/oslib/getopt.h
similarity index 100%
rename from lib/getopt.h
rename to oslib/getopt.h
diff --git a/lib/getopt_long.c b/oslib/getopt_long.c
similarity index 82%
rename from lib/getopt_long.c
rename to oslib/getopt_long.c
index 11d879a..8ec7741 100644
--- a/lib/getopt_long.c
+++ b/oslib/getopt_long.c
@@ -26,14 +26,14 @@
 } pvt;
 
 static inline const char *option_matches(const char *arg_str,
-					 const char *opt_name)
+					 const char *opt_name, int smatch)
 {
 	while (*arg_str != '\0' && *arg_str != '=') {
 		if (*arg_str++ != *opt_name++)
 			return NULL;
 	}
 
-	if (*opt_name)
+	if (*opt_name && !smatch)
 		return NULL;
 
 	return arg_str;
@@ -84,11 +84,37 @@
 		}
 
 		for (lo = longopts; lo->name; lo++) {
-			if ((opt_end = option_matches(carg+2, lo->name)))
+			opt_end = option_matches(carg+2, lo->name, 0);
+			if (opt_end)
 			    break;
 		}
-		if (!opt_end)
-			return '?';
+		/*
+		 * The GNU getopt_long_only() apparently allows a short match,
+		 * if it's unique and if we don't have a full match. Let's
+		 * do the same here, search and see if there is one (and only
+		 * one) short match.
+		 */
+		if (!opt_end) {
+			const struct option *lo_match = NULL;
+
+			for (lo = longopts; lo->name; lo++) {
+				const char *ret;
+
+				ret = option_matches(carg+2, lo->name, 1);
+				if (!ret)
+					continue;
+				if (!opt_end) {
+					opt_end = ret;
+					lo_match = lo;
+				} else {
+					opt_end = NULL;
+					break;
+				}
+			}
+			if (!opt_end)
+				return '?';
+			lo = lo_match;
+		}
 
 		if (longindex)
 			*longindex = lo-longopts;
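
The two-pass search added above first looks for an exact long-option match and only then accepts a unique prefix, mirroring GNU getopt_long_only(). A standalone illustration of that rule (demo_resolve and the option list are made up for the example):

#include <stdio.h>
#include <string.h>

static const char *demo_resolve(const char *arg, const char *const *names, int n)
{
	const char *hit = NULL;
	int i, hits = 0;

	for (i = 0; i < n; i++)
		if (!strcmp(arg, names[i]))
			return names[i];		/* exact match always wins */

	for (i = 0; i < n; i++)
		if (!strncmp(arg, names[i], strlen(arg))) {
			hit = names[i];
			hits++;
		}

	return hits == 1 ? hit : NULL;			/* ambiguous or unknown */
}

int main(void)
{
	static const char *const opts[] = { "verbose", "version", "verify" };

	printf("%s\n", demo_resolve("verb", opts, 3));			/* -> verbose (unique prefix) */
	printf("%s\n", demo_resolve("ver", opts, 3) ? "matched" : "ambiguous");
	return 0;
}
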
diff --git a/lib/inet_aton.c b/oslib/inet_aton.c
similarity index 100%
rename from lib/inet_aton.c
rename to oslib/inet_aton.c
diff --git a/lib/inet_aton.h b/oslib/inet_aton.h
similarity index 100%
rename from lib/inet_aton.h
rename to oslib/inet_aton.h
diff --git a/oslib/libmtd.c b/oslib/libmtd.c
new file mode 100644
index 0000000..24e9db9
--- /dev/null
+++ b/oslib/libmtd.c
@@ -0,0 +1,1425 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2006
+ * Copyright (C) 2009 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Author: Artem Bityutskiy
+ *
+ * MTD library.
+ */
+
+/* Imported from mtd-utils by dehrenberg */
+
+#include <limits.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <inttypes.h>
+
+#include <mtd/mtd-user.h>
+#include "libmtd.h"
+
+#include "libmtd_int.h"
+#include "libmtd_common.h"
+
+/**
+ * mkpath - compose full path from 2 given components.
+ * @path: the first component
+ * @name: the second component
+ *
+ * This function returns the resulting path in case of success and %NULL in
+ * case of failure.
+ */
+static char *mkpath(const char *path, const char *name)
+{
+	char *n;
+	size_t len1 = strlen(path);
+	size_t len2 = strlen(name);
+
+	n = xmalloc(len1 + len2 + 6);
+
+	memcpy(n, path, len1);
+	if (n[len1 - 1] != '/')
+		n[len1++] = '/';
+
+	memcpy(n + len1, name, len2 + 1);
+	return n;
+}
+
+/**
+ * read_data - read data from a file.
+ * @file: the file to read from
+ * @buf: the buffer to read to
+ * @buf_len: buffer length
+ *
+ * This function returns number of read bytes in case of success and %-1 in
+ * case of failure. Note, if the file contains more than @buf_len bytes of
+ * data, this function fails with an %EINVAL error code.
+ */
+static int read_data(const char *file, void *buf, int buf_len)
+{
+	int fd, rd, tmp, tmp1;
+
+	fd = open(file, O_RDONLY | O_CLOEXEC);
+	if (fd == -1)
+		return -1;
+
+	rd = read(fd, buf, buf_len);
+	if (rd == -1) {
+		sys_errmsg("cannot read \"%s\"", file);
+		goto out_error;
+	}
+
+	if (rd == buf_len) {
+		errmsg("contents of \"%s\" is too long", file);
+		errno = EINVAL;
+		goto out_error;
+	}
+
+	((char *)buf)[rd] = '\0';
+
+	/* Make sure all data is read */
+	tmp1 = read(fd, &tmp, 1);
+	if (tmp1 == 1) {
+		sys_errmsg("cannot read \"%s\"", file);
+		goto out_error;
+	}
+	if (tmp1) {
+		errmsg("file \"%s\" contains too much data (> %d bytes)",
+		       file, buf_len);
+		errno = EINVAL;
+		goto out_error;
+	}
+
+	if (close(fd)) {
+		sys_errmsg("close failed on \"%s\"", file);
+		return -1;
+	}
+
+	return rd;
+
+out_error:
+	close(fd);
+	return -1;
+}
+
+/**
+ * read_major - read major and minor numbers from a file.
+ * @file: name of the file to read from
+ * @major: major number is returned here
+ * @minor: minor number is returned here
+ *
+ * This function returns %0 in case of success, and %-1 in case of failure.
+ */
+static int read_major(const char *file, int *major, int *minor)
+{
+	int ret;
+	char buf[50];
+
+	ret = read_data(file, buf, 50);
+	if (ret < 0)
+		return ret;
+
+	ret = sscanf(buf, "%d:%d\n", major, minor);
+	if (ret != 2) {
+		errno = EINVAL;
+		return errmsg("\"%s\" does not have major:minor format", file);
+	}
+
+	if (*major < 0 || *minor < 0) {
+		errno = EINVAL;
+		return errmsg("bad major:minor %d:%d in \"%s\"",
+			      *major, *minor, file);
+	}
+
+	return 0;
+}
+
+/**
+ * dev_get_major - get major and minor numbers of an MTD device.
+ * @lib: libmtd descriptor
+ * @mtd_num: MTD device number
+ * @major: major number is returned here
+ * @minor: minor number is returned here
+ *
+ * This function returns zero in case of success and %-1 in case of failure.
+ */
+static int dev_get_major(struct libmtd *lib, int mtd_num, int *major, int *minor)
+{
+	char file[strlen(lib->mtd_dev) + 50];
+
+	sprintf(file, lib->mtd_dev, mtd_num);
+	return read_major(file, major, minor);
+}
+
+/**
+ * dev_read_data - read data from an MTD device's sysfs file.
+ * @patt: file pattern to read from
+ * @mtd_num: MTD device number
+ * @buf: buffer to read to
+ * @buf_len: buffer length
+ *
+ * This function returns number of read bytes in case of success and %-1 in
+ * case of failure.
+ */
+static int dev_read_data(const char *patt, int mtd_num, void *buf, int buf_len)
+{
+	char file[strlen(patt) + 100];
+
+	sprintf(file, patt, mtd_num);
+	return read_data(file, buf, buf_len);
+}
+
+/**
+ * read_hex_ll - read a hex 'long long' value from a file.
+ * @file: the file to read from
+ * @value: the result is stored here
+ *
+ * This function reads file @file and interprets its contents as a hexadecimal
+ * 'long long' integer. If this is not true, it fails with an %EINVAL error code.
+ * Returns %0 in case of success and %-1 in case of failure.
+ */
+static int read_hex_ll(const char *file, long long *value)
+{
+	int fd, rd;
+	char buf[50];
+
+	fd = open(file, O_RDONLY | O_CLOEXEC);
+	if (fd == -1)
+		return -1;
+
+	rd = read(fd, buf, sizeof(buf));
+	if (rd == -1) {
+		sys_errmsg("cannot read \"%s\"", file);
+		goto out_error;
+	}
+	if (rd == sizeof(buf)) {
+		errmsg("contents of \"%s\" is too long", file);
+		errno = EINVAL;
+		goto out_error;
+	}
+	buf[rd] = '\0';
+
+	if (sscanf(buf, "%llx\n", value) != 1) {
+		errmsg("cannot read integer from \"%s\"\n", file);
+		errno = EINVAL;
+		goto out_error;
+	}
+
+	if (*value < 0) {
+		errmsg("negative value %lld in \"%s\"", *value, file);
+		errno = EINVAL;
+		goto out_error;
+	}
+
+	if (close(fd))
+		return sys_errmsg("close failed on \"%s\"", file);
+
+	return 0;
+
+out_error:
+	close(fd);
+	return -1;
+}
+
+/**
+ * read_pos_ll - read a positive 'long long' value from a file.
+ * @file: the file to read from
+ * @value: the result is stored here
+ *
+ * This function reads file @file and interprets its contents as a positive
+ * 'long long' integer. If this is not true, it fails with %EINVAL error code.
+ * Returns %0 in case of success and %-1 in case of failure.
+ */
+static int read_pos_ll(const char *file, long long *value)
+{
+	int fd, rd;
+	char buf[50];
+
+	fd = open(file, O_RDONLY | O_CLOEXEC);
+	if (fd == -1)
+		return -1;
+
+	rd = read(fd, buf, 50);
+	if (rd == -1) {
+		sys_errmsg("cannot read \"%s\"", file);
+		goto out_error;
+	}
+	if (rd == 50) {
+		errmsg("contents of \"%s\" is too long", file);
+		errno = EINVAL;
+		goto out_error;
+	}
+
+	if (sscanf(buf, "%lld\n", value) != 1) {
+		errmsg("cannot read integer from \"%s\"\n", file);
+		errno = EINVAL;
+		goto out_error;
+	}
+
+	if (*value < 0) {
+		errmsg("negative value %lld in \"%s\"", *value, file);
+		errno = EINVAL;
+		goto out_error;
+	}
+
+	if (close(fd))
+		return sys_errmsg("close failed on \"%s\"", file);
+
+	return 0;
+
+out_error:
+	close(fd);
+	return -1;
+}
+
+/**
+ * read_hex_int - read an 'int' value from a file.
+ * @file: the file to read from
+ * @value: the result is stored here
+ *
+ * This function is the same as 'read_hex_ll()', but it reads an 'int'
+ * value, not 'long long'.
+ */
+static int read_hex_int(const char *file, int *value)
+{
+	long long res;
+
+	if (read_hex_ll(file, &res))
+		return -1;
+
+	/* Make sure the value has correct range */
+	if (res > INT_MAX || res < INT_MIN) {
+		errmsg("value %lld read from file \"%s\" is out of range",
+		       res, file);
+		errno = EINVAL;
+		return -1;
+	}
+
+	*value = res;
+	return 0;
+}
+
+/**
+ * read_pos_int - read a positive 'int' value from a file.
+ * @file: the file to read from
+ * @value: the result is stored here
+ *
+ * This function is the same as 'read_pos_ll()', but it reads an 'int'
+ * value, not 'long long'.
+ */
+static int read_pos_int(const char *file, int *value)
+{
+	long long res;
+
+	if (read_pos_ll(file, &res))
+		return -1;
+
+	/* Make sure the value is not too big */
+	if (res > INT_MAX) {
+		errmsg("value %lld read from file \"%s\" is out of range",
+		       res, file);
+		errno = EINVAL;
+		return -1;
+	}
+
+	*value = res;
+	return 0;
+}
+
+/**
+ * dev_read_hex_int - read a hex 'int' value from an MTD device sysfs file.
+ * @patt: file pattern to read from
+ * @mtd_num: MTD device number
+ * @value: the result is stored here
+ *
+ * This function returns %0 in case of success and %-1 in case of failure.
+ */
+static int dev_read_hex_int(const char *patt, int mtd_num, int *value)
+{
+	char file[strlen(patt) + 50];
+
+	sprintf(file, patt, mtd_num);
+	return read_hex_int(file, value);
+}
+
+/**
+ * dev_read_pos_int - read a positive 'int' value from an MTD device sysfs file.
+ * @patt: file pattern to read from
+ * @mtd_num: MTD device number
+ * @value: the result is stored here
+ *
+ * This function returns %0 in case of success and %-1 in case of failure.
+ */
+static int dev_read_pos_int(const char *patt, int mtd_num, int *value)
+{
+	char file[strlen(patt) + 50];
+
+	sprintf(file, patt, mtd_num);
+	return read_pos_int(file, value);
+}
+
+/**
+ * dev_read_pos_ll - read a positive 'long long' value from an MTD device sysfs file.
+ * @patt: file pattern to read from
+ * @mtd_num: MTD device number
+ * @value: the result is stored here
+ *
+ * This function returns %0 in case of success and %-1 in case of failure.
+ */
+static int dev_read_pos_ll(const char *patt, int mtd_num, long long *value)
+{
+	char file[strlen(patt) + 50];
+
+	sprintf(file, patt, mtd_num);
+	return read_pos_ll(file, value);
+}
+
+/**
+ * type_str2int - convert MTD device type to integer.
+ * @str: MTD device type string to convert
+ *
+ * This function converts MTD device type string @str, read from sysfs, into an
+ * integer.
+ */
+static int type_str2int(const char *str)
+{
+	if (!strcmp(str, "nand"))
+		return MTD_NANDFLASH;
+	if (!strcmp(str, "mlc-nand"))
+		return MTD_MLCNANDFLASH;
+	if (!strcmp(str, "nor"))
+		return MTD_NORFLASH;
+	if (!strcmp(str, "rom"))
+		return MTD_ROM;
+	if (!strcmp(str, "absent"))
+		return MTD_ABSENT;
+	if (!strcmp(str, "dataflash"))
+		return MTD_DATAFLASH;
+	if (!strcmp(str, "ram"))
+		return MTD_RAM;
+	if (!strcmp(str, "ubi"))
+		return MTD_UBIVOLUME;
+	return -1;
+}
+
+/**
+ * dev_node2num - find UBI device number by its character device node.
+ * @lib: MTD library descriptor
+ * @node: name of the MTD device node
+ * @mtd_num: MTD device number is returned here
+ *
+ * This function returns %0 in case of success and %-1 in case of failure.
+ */
+static int dev_node2num(struct libmtd *lib, const char *node, int *mtd_num)
+{
+	struct stat st;
+	int i, mjr, mnr;
+	struct mtd_info info;
+
+	if (stat(node, &st))
+		return sys_errmsg("cannot get information about \"%s\"", node);
+
+	if (!S_ISCHR(st.st_mode)) {
+		errmsg("\"%s\" is not a character device", node);
+		errno = EINVAL;
+		return -1;
+	}
+
+	mjr = major(st.st_rdev);
+	mnr = minor(st.st_rdev);
+
+	if (mtd_get_info((libmtd_t *)lib, &info))
+		return -1;
+
+	for (i = info.lowest_mtd_num; i <= info.highest_mtd_num; i++) {
+		int mjr1, mnr1, ret;
+
+		ret = dev_get_major(lib, i, &mjr1, &mnr1);
+		if (ret) {
+			if (errno == ENOENT)
+				continue;
+			if (!errno)
+				break;
+			return -1;
+		}
+
+		if (mjr1 == mjr && mnr1 == mnr) {
+			errno = 0;
+			*mtd_num = i;
+			return 0;
+		}
+	}
+
+	errno = ENODEV;
+	return -1;
+}
+
+/**
+ * sysfs_is_supported - check whether the MTD sub-system supports sysfs.
+ * @lib: MTD library descriptor
+ *
+ * The Linux kernel MTD subsystem gained MTD support starting from kernel
+ * 2.6.30 and libmtd tries to use sysfs interface if possible, because the NAND
+ * sub-page size is available there (and not available at all in pre-sysfs
+ * kernels).
+ *
+ * Very old kernels did not have "/sys/class/mtd" directory. Not very old
+ * kernels (e.g., 2.6.29) did have "/sys/class/mtd/mtdX" directories, by there
+ * were no files there, e.g., the "name" file was not present. So all we can do
+ * is to check for a "/sys/class/mtd/mtdX/name" file. But this is not a
+ * reliable check, because if this is a new system with no MTD devices - we'll
+ * treat it as a pre-sysfs system.
+ */
+static int sysfs_is_supported(struct libmtd *lib)
+{
+	int fd, num = -1;
+	DIR *sysfs_mtd;
+	char file[strlen(lib->mtd_name) + 10];
+
+	sysfs_mtd = opendir(lib->sysfs_mtd);
+	if (!sysfs_mtd) {
+		if (errno == ENOENT) {
+			errno = 0;
+			return 0;
+		}
+		return sys_errmsg("cannot open \"%s\"", lib->sysfs_mtd);
+	}
+
+	/*
+	 * First of all find an "mtdX" directory. This is needed because there
+	 * may be, for example, mtd1 but no mtd0.
+	 */
+	while (1) {
+		int ret, mtd_num;
+		char tmp_buf[256];
+		struct dirent *dirent;
+
+		dirent = readdir(sysfs_mtd);
+		if (!dirent)
+			break;
+
+		if (strlen(dirent->d_name) >= 255) {
+			errmsg("invalid entry in %s: \"%s\"",
+			       lib->sysfs_mtd, dirent->d_name);
+			errno = EINVAL;
+			closedir(sysfs_mtd);
+			return -1;
+		}
+
+		ret = sscanf(dirent->d_name, MTD_NAME_PATT"%s",
+			     &mtd_num, tmp_buf);
+		if (ret == 1) {
+			num = mtd_num;
+			break;
+		}
+	}
+
+	if (closedir(sysfs_mtd))
+		return sys_errmsg("closedir failed on \"%s\"", lib->sysfs_mtd);
+
+	if (num == -1)
+		/* No mtd device, treat this as pre-sysfs system */
+		return 0;
+
+	sprintf(file, lib->mtd_name, num);
+	fd = open(file, O_RDONLY | O_CLOEXEC);
+	if (fd == -1)
+		return 0;
+
+	if (close(fd)) {
+		sys_errmsg("close failed on \"%s\"", file);
+		return -1;
+	}
+
+	return 1;
+}
+
+libmtd_t libmtd_open(void)
+{
+	struct libmtd *lib;
+
+	lib = xzalloc(sizeof(*lib));
+
+	lib->offs64_ioctls = OFFS64_IOCTLS_UNKNOWN;
+
+	lib->sysfs_mtd = mkpath("/sys", SYSFS_MTD);
+	if (!lib->sysfs_mtd)
+		goto out_error;
+
+	lib->mtd = mkpath(lib->sysfs_mtd, MTD_NAME_PATT);
+	if (!lib->mtd)
+		goto out_error;
+
+	lib->mtd_name = mkpath(lib->mtd, MTD_NAME);
+	if (!lib->mtd_name)
+		goto out_error;
+
+	if (!sysfs_is_supported(lib)) {
+		free(lib->mtd);
+		free(lib->sysfs_mtd);
+		free(lib->mtd_name);
+		lib->mtd_name = lib->mtd = lib->sysfs_mtd = NULL;
+		return lib;
+	}
+
+	lib->mtd_dev = mkpath(lib->mtd, MTD_DEV);
+	if (!lib->mtd_dev)
+		goto out_error;
+
+	lib->mtd_type = mkpath(lib->mtd, MTD_TYPE);
+	if (!lib->mtd_type)
+		goto out_error;
+
+	lib->mtd_eb_size = mkpath(lib->mtd, MTD_EB_SIZE);
+	if (!lib->mtd_eb_size)
+		goto out_error;
+
+	lib->mtd_size = mkpath(lib->mtd, MTD_SIZE);
+	if (!lib->mtd_size)
+		goto out_error;
+
+	lib->mtd_min_io_size = mkpath(lib->mtd, MTD_MIN_IO_SIZE);
+	if (!lib->mtd_min_io_size)
+		goto out_error;
+
+	lib->mtd_subpage_size = mkpath(lib->mtd, MTD_SUBPAGE_SIZE);
+	if (!lib->mtd_subpage_size)
+		goto out_error;
+
+	lib->mtd_oob_size = mkpath(lib->mtd, MTD_OOB_SIZE);
+	if (!lib->mtd_oob_size)
+		goto out_error;
+
+	lib->mtd_region_cnt = mkpath(lib->mtd, MTD_REGION_CNT);
+	if (!lib->mtd_region_cnt)
+		goto out_error;
+
+	lib->mtd_flags = mkpath(lib->mtd, MTD_FLAGS);
+	if (!lib->mtd_flags)
+		goto out_error;
+
+	lib->sysfs_supported = 1;
+	return lib;
+
+out_error:
+	libmtd_close((libmtd_t)lib);
+	return NULL;
+}
+
+void libmtd_close(libmtd_t desc)
+{
+	struct libmtd *lib = (struct libmtd *)desc;
+
+	free(lib->mtd_flags);
+	free(lib->mtd_region_cnt);
+	free(lib->mtd_oob_size);
+	free(lib->mtd_subpage_size);
+	free(lib->mtd_min_io_size);
+	free(lib->mtd_size);
+	free(lib->mtd_eb_size);
+	free(lib->mtd_type);
+	free(lib->mtd_dev);
+	free(lib->mtd_name);
+	free(lib->mtd);
+	free(lib->sysfs_mtd);
+	free(lib);
+}
+
+int mtd_dev_present(libmtd_t desc, int mtd_num) {
+	struct stat st;
+	struct libmtd *lib = (struct libmtd *)desc;
+
+	if (!lib->sysfs_supported) {
+		return legacy_dev_present(mtd_num) == 1;
+	} else {
+		char file[strlen(lib->mtd) + 10];
+
+		sprintf(file, lib->mtd, mtd_num);
+		return !stat(file, &st);
+	}
+}
+
+int mtd_get_info(libmtd_t desc, struct mtd_info *info)
+{
+	DIR *sysfs_mtd;
+	struct dirent *dirent;
+	struct libmtd *lib = (struct libmtd *)desc;
+
+	memset(info, 0, sizeof(struct mtd_info));
+
+	if (!lib->sysfs_supported)
+		return legacy_mtd_get_info(info);
+
+	info->sysfs_supported = 1;
+
+	/*
+	 * We have to scan the MTD sysfs directory to identify how many MTD
+	 * devices are present.
+	 */
+	sysfs_mtd = opendir(lib->sysfs_mtd);
+	if (!sysfs_mtd) {
+		if (errno == ENOENT) {
+			errno = ENODEV;
+			return -1;
+		}
+		return sys_errmsg("cannot open \"%s\"", lib->sysfs_mtd);
+	}
+
+	info->lowest_mtd_num = INT_MAX;
+	while (1) {
+		int mtd_num, ret;
+		char tmp_buf[256];
+
+		errno = 0;
+		dirent = readdir(sysfs_mtd);
+		if (!dirent)
+			break;
+
+		if (strlen(dirent->d_name) >= 255) {
+			errmsg("invalid entry in %s: \"%s\"",
+			       lib->sysfs_mtd, dirent->d_name);
+			errno = EINVAL;
+			goto out_close;
+		}
+
+		ret = sscanf(dirent->d_name, MTD_NAME_PATT"%s",
+			     &mtd_num, tmp_buf);
+		if (ret == 1) {
+			info->mtd_dev_cnt += 1;
+			if (mtd_num > info->highest_mtd_num)
+				info->highest_mtd_num = mtd_num;
+			if (mtd_num < info->lowest_mtd_num)
+				info->lowest_mtd_num = mtd_num;
+		}
+	}
+
+	if (!dirent && errno) {
+		sys_errmsg("readdir failed on \"%s\"", lib->sysfs_mtd);
+		goto out_close;
+	}
+
+	if (closedir(sysfs_mtd))
+		return sys_errmsg("closedir failed on \"%s\"", lib->sysfs_mtd);
+
+	if (info->lowest_mtd_num == INT_MAX)
+		info->lowest_mtd_num = 0;
+
+	return 0;
+
+out_close:
+	closedir(sysfs_mtd);
+	return -1;
+}
+
+int mtd_get_dev_info1(libmtd_t desc, int mtd_num, struct mtd_dev_info *mtd)
+{
+	int ret;
+	struct libmtd *lib = (struct libmtd *)desc;
+
+	memset(mtd, 0, sizeof(struct mtd_dev_info));
+	mtd->mtd_num = mtd_num;
+
+	if (!mtd_dev_present(desc, mtd_num)) {
+		errno = ENODEV;
+		return -1;
+	} else if (!lib->sysfs_supported)
+		return legacy_get_dev_info1(mtd_num, mtd);
+
+	if (dev_get_major(lib, mtd_num, &mtd->major, &mtd->minor))
+		return -1;
+
+	ret = dev_read_data(lib->mtd_name, mtd_num, &mtd->name,
+			    MTD_NAME_MAX + 1);
+	if (ret < 0)
+		return -1;
+	((char *)mtd->name)[ret - 1] = '\0';
+
+	ret = dev_read_data(lib->mtd_type, mtd_num, &mtd->type_str,
+			    MTD_TYPE_MAX + 1);
+	if (ret < 0)
+		return -1;
+	((char *)mtd->type_str)[ret - 1] = '\0';
+
+	if (dev_read_pos_int(lib->mtd_eb_size, mtd_num, &mtd->eb_size))
+		return -1;
+	if (dev_read_pos_ll(lib->mtd_size, mtd_num, &mtd->size))
+		return -1;
+	if (dev_read_pos_int(lib->mtd_min_io_size, mtd_num, &mtd->min_io_size))
+		return -1;
+	if (dev_read_pos_int(lib->mtd_subpage_size, mtd_num, &mtd->subpage_size))
+		return -1;
+	if (dev_read_pos_int(lib->mtd_oob_size, mtd_num, &mtd->oob_size))
+		return -1;
+	if (dev_read_pos_int(lib->mtd_region_cnt, mtd_num, &mtd->region_cnt))
+		return -1;
+	if (dev_read_hex_int(lib->mtd_flags, mtd_num, &ret))
+		return -1;
+	mtd->writable = !!(ret & MTD_WRITEABLE);
+
+	mtd->eb_cnt = mtd->size / mtd->eb_size;
+	mtd->type = type_str2int(mtd->type_str);
+	mtd->bb_allowed = !!(mtd->type == MTD_NANDFLASH ||
+				mtd->type == MTD_MLCNANDFLASH);
+
+	return 0;
+}
+
+int mtd_get_dev_info(libmtd_t desc, const char *node, struct mtd_dev_info *mtd)
+{
+	int mtd_num;
+	struct libmtd *lib = (struct libmtd *)desc;
+
+	if (!lib->sysfs_supported)
+		return legacy_get_dev_info(node, mtd);
+
+	if (dev_node2num(lib, node, &mtd_num))
+		return -1;
+
+	return mtd_get_dev_info1(desc, mtd_num, mtd);
+}
+
+static inline int mtd_ioctl_error(const struct mtd_dev_info *mtd, int eb,
+				  const char *sreq)
+{
+	return sys_errmsg("%s ioctl failed for eraseblock %d (mtd%d)",
+			  sreq, eb, mtd->mtd_num);
+}
+
+static int mtd_valid_erase_block(const struct mtd_dev_info *mtd, int eb)
+{
+	if (eb < 0 || eb >= mtd->eb_cnt) {
+		errmsg("bad eraseblock number %d, mtd%d has %d eraseblocks",
+		       eb, mtd->mtd_num, mtd->eb_cnt);
+		errno = EINVAL;
+		return -1;
+	}
+	return 0;
+}
+
+static int mtd_xlock(const struct mtd_dev_info *mtd, int fd, int eb, int req,
+		     const char *sreq)
+{
+	int ret;
+	struct erase_info_user ei;
+
+	ret = mtd_valid_erase_block(mtd, eb);
+	if (ret)
+		return ret;
+
+	ei.start = eb * mtd->eb_size;
+	ei.length = mtd->eb_size;
+
+	ret = ioctl(fd, req, &ei);
+	if (ret < 0)
+		return mtd_ioctl_error(mtd, eb, sreq);
+
+	return 0;
+}
+#define mtd_xlock(mtd, fd, eb, req) mtd_xlock(mtd, fd, eb, req, #req)
+
+int mtd_lock(const struct mtd_dev_info *mtd, int fd, int eb)
+{
+	return mtd_xlock(mtd, fd, eb, MEMLOCK);
+}
+
+int mtd_unlock(const struct mtd_dev_info *mtd, int fd, int eb)
+{
+	return mtd_xlock(mtd, fd, eb, MEMUNLOCK);
+}
+
+int mtd_erase(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb)
+{
+	int ret;
+	struct libmtd *lib = (struct libmtd *)desc;
+	struct erase_info_user64 ei64;
+	struct erase_info_user ei;
+
+	ret = mtd_valid_erase_block(mtd, eb);
+	if (ret)
+		return ret;
+
+	ei64.start = (__u64)eb * mtd->eb_size;
+	ei64.length = mtd->eb_size;
+
+	if (lib->offs64_ioctls == OFFS64_IOCTLS_SUPPORTED ||
+	    lib->offs64_ioctls == OFFS64_IOCTLS_UNKNOWN) {
+		ret = ioctl(fd, MEMERASE64, &ei64);
+		if (ret == 0)
+			return ret;
+
+		if (errno != ENOTTY ||
+		    lib->offs64_ioctls != OFFS64_IOCTLS_UNKNOWN)
+			return mtd_ioctl_error(mtd, eb, "MEMERASE64");
+
+		/*
+		 * MEMERASE64 support was added in kernel version 2.6.31, so
+		 * probably we are working with older kernel and this ioctl is
+		 * not supported.
+		 */
+		lib->offs64_ioctls = OFFS64_IOCTLS_NOT_SUPPORTED;
+	}
+
+	if (ei64.start + ei64.length > 0xFFFFFFFF) {
+		errmsg("this system can address only %u eraseblocks",
+		       0xFFFFFFFFU / mtd->eb_size);
+		errno = EINVAL;
+		return -1;
+	}
+
+	ei.start = ei64.start;
+	ei.length = ei64.length;
+	ret = ioctl(fd, MEMERASE, &ei);
+	if (ret < 0)
+		return mtd_ioctl_error(mtd, eb, "MEMERASE");
+	return 0;
+}
+
+int mtd_regioninfo(int fd, int regidx, struct region_info_user *reginfo)
+{
+	int ret;
+
+	if (regidx < 0) {
+		errno = ENODEV;
+		return -1;
+	}
+
+	reginfo->regionindex = regidx;
+
+	ret = ioctl(fd, MEMGETREGIONINFO, reginfo);
+	if (ret < 0)
+		return sys_errmsg("%s ioctl failed for erase region %d",
+			"MEMGETREGIONINFO", regidx);
+
+	return 0;
+}
+
+int mtd_is_locked(const struct mtd_dev_info *mtd, int fd, int eb)
+{
+	int ret;
+	erase_info_t ei;
+
+	ei.start = eb * mtd->eb_size;
+	ei.length = mtd->eb_size;
+
+	ret = ioctl(fd, MEMISLOCKED, &ei);
+	if (ret < 0) {
+		if (errno != ENOTTY && errno != EOPNOTSUPP)
+			return mtd_ioctl_error(mtd, eb, "MEMISLOCKED");
+		else
+			errno = EOPNOTSUPP;
+	}
+
+	return ret;
+}
+
+/* Patterns to write to a physical eraseblock when torturing it */
+static uint8_t patterns[] = {0xa5, 0x5a, 0x0};
+
+/**
+ * check_pattern - check if buffer contains only a certain byte pattern.
+ * @buf: buffer to check
+ * @patt: the pattern to check
+ * @size: buffer size in bytes
+ *
+ * This function returns %1 if there are only @patt bytes in @buf, and %0 if
+ * something else was also found.
+ */
+static int check_pattern(const void *buf, uint8_t patt, int size)
+{
+	int i;
+
+	for (i = 0; i < size; i++)
+		if (((const uint8_t *)buf)[i] != patt)
+			return 0;
+	return 1;
+}
+
+int mtd_torture(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb)
+{
+	int err, i, patt_count;
+	void *buf;
+
+	normsg("run torture test for PEB %d", eb);
+	patt_count = ARRAY_SIZE(patterns);
+
+	buf = xmalloc(mtd->eb_size);
+
+	for (i = 0; i < patt_count; i++) {
+		err = mtd_erase(desc, mtd, fd, eb);
+		if (err)
+			goto out;
+
+		/* Make sure the PEB contains only 0xFF bytes */
+		err = mtd_read(mtd, fd, eb, 0, buf, mtd->eb_size);
+		if (err)
+			goto out;
+
+		err = check_pattern(buf, 0xFF, mtd->eb_size);
+		if (err == 0) {
+			errmsg("erased PEB %d, but a non-0xFF byte found", eb);
+			errno = EIO;
+			goto out;
+		}
+
+		/* Write a pattern and check it */
+		memset(buf, patterns[i], mtd->eb_size);
+		err = mtd_write(desc, mtd, fd, eb, 0, buf, mtd->eb_size, NULL,
+				0, 0);
+		if (err)
+			goto out;
+
+		memset(buf, ~patterns[i], mtd->eb_size);
+		err = mtd_read(mtd, fd, eb, 0, buf, mtd->eb_size);
+		if (err)
+			goto out;
+
+		err = check_pattern(buf, patterns[i], mtd->eb_size);
+		if (err == 0) {
+			errmsg("pattern %x checking failed for PEB %d",
+				patterns[i], eb);
+			errno = EIO;
+			goto out;
+		}
+	}
+
+	normsg("PEB %d passed torture test, do not mark it as bad", eb);
+	free(buf);
+	return 0;
+
+out:
+	free(buf);
+	return -1;
+}
+
+int mtd_is_bad(const struct mtd_dev_info *mtd, int fd, int eb)
+{
+	int ret;
+	loff_t seek;
+
+	ret = mtd_valid_erase_block(mtd, eb);
+	if (ret)
+		return ret;
+
+	if (!mtd->bb_allowed)
+		return 0;
+
+	seek = (loff_t)eb * mtd->eb_size;
+	ret = ioctl(fd, MEMGETBADBLOCK, &seek);
+	if (ret == -1)
+		return mtd_ioctl_error(mtd, eb, "MEMGETBADBLOCK");
+	return ret;
+}
+
+int mtd_mark_bad(const struct mtd_dev_info *mtd, int fd, int eb)
+{
+	int ret;
+	loff_t seek;
+
+	if (!mtd->bb_allowed) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	ret = mtd_valid_erase_block(mtd, eb);
+	if (ret)
+		return ret;
+
+	seek = (loff_t)eb * mtd->eb_size;
+	ret = ioctl(fd, MEMSETBADBLOCK, &seek);
+	if (ret == -1)
+		return mtd_ioctl_error(mtd, eb, "MEMSETBADBLOCK");
+	return 0;
+}
+
+int mtd_read(const struct mtd_dev_info *mtd, int fd, int eb, int offs,
+	     void *buf, int len)
+{
+	int ret, rd = 0;
+	off_t seek;
+
+	ret = mtd_valid_erase_block(mtd, eb);
+	if (ret)
+		return ret;
+
+	if (offs < 0 || offs + len > mtd->eb_size) {
+		errmsg("bad offset %d or length %d, mtd%d eraseblock size is %d",
+		       offs, len, mtd->mtd_num, mtd->eb_size);
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* Seek to the beginning of the eraseblock */
+	seek = (off_t)eb * mtd->eb_size + offs;
+	if (lseek(fd, seek, SEEK_SET) != seek)
+		return sys_errmsg("cannot seek mtd%d to offset %"PRIdoff_t,
+				  mtd->mtd_num, seek);
+
+	while (rd < len) {
+		ret = read(fd, buf + rd, len - rd);
+		if (ret < 0)
+			return sys_errmsg("cannot read %d bytes from mtd%d (eraseblock %d, offset %d)",
+					  len, mtd->mtd_num, eb, offs);
+		rd += ret;
+	}
+
+	return 0;
+}
+
+static int legacy_auto_oob_layout(const struct mtd_dev_info *mtd, int fd,
+				  int ooblen, void *oob) {
+	struct nand_oobinfo old_oobinfo;
+	int start, len;
+	uint8_t *tmp_buf;
+
+	/* Read the current oob info */
+	if (ioctl(fd, MEMGETOOBSEL, &old_oobinfo))
+		return sys_errmsg("MEMGETOOBSEL failed");
+
+	tmp_buf = malloc(ooblen);
+	memcpy(tmp_buf, oob, ooblen);
+
+	/*
+	 * We use autoplacement and have the oobinfo with the autoplacement
+	 * information from the kernel available
+	 */
+	if (old_oobinfo.useecc == MTD_NANDECC_AUTOPLACE) {
+		int i, tags_pos = 0;
+		for (i = 0; old_oobinfo.oobfree[i][1]; i++) {
+			/* Set the reserved bytes to 0xff */
+			start = old_oobinfo.oobfree[i][0];
+			len = old_oobinfo.oobfree[i][1];
+			memcpy(oob + start, tmp_buf + tags_pos, len);
+			tags_pos += len;
+		}
+	} else {
+		/* Set at least the ecc byte positions to 0xff */
+		start = old_oobinfo.eccbytes;
+		len = mtd->oob_size - start;
+		memcpy(oob + start, tmp_buf + start, len);
+	}
+	free(tmp_buf);
+
+	return 0;
+}
+
+int mtd_write(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb,
+	      int offs, void *data, int len, void *oob, int ooblen,
+	      uint8_t mode)
+{
+	int ret;
+	off_t seek;
+	struct mtd_write_req ops;
+
+	ret = mtd_valid_erase_block(mtd, eb);
+	if (ret)
+		return ret;
+
+	if (offs < 0 || offs + len > mtd->eb_size) {
+		errmsg("bad offset %d or length %d, mtd%d eraseblock size is %d",
+		       offs, len, mtd->mtd_num, mtd->eb_size);
+		errno = EINVAL;
+		return -1;
+	}
+	if (offs % mtd->subpage_size) {
+		errmsg("write offset %d is not aligned to mtd%d min. I/O size %d",
+		       offs, mtd->mtd_num, mtd->subpage_size);
+		errno = EINVAL;
+		return -1;
+	}
+	if (len % mtd->subpage_size) {
+		errmsg("write length %d is not aligned to mtd%d min. I/O size %d",
+		       len, mtd->mtd_num, mtd->subpage_size);
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* Calculate seek address */
+	seek = (off_t)eb * mtd->eb_size + offs;
+
+	if (oob) {
+		ops.start = seek;
+		ops.len = len;
+		ops.ooblen = ooblen;
+		ops.usr_data = (uint64_t)(unsigned long)data;
+		ops.usr_oob = (uint64_t)(unsigned long)oob;
+		ops.mode = mode;
+
+		ret = ioctl(fd, MEMWRITE, &ops);
+		if (ret == 0)
+			return 0;
+		else if (errno != ENOTTY && errno != EOPNOTSUPP)
+			return mtd_ioctl_error(mtd, eb, "MEMWRITE");
+
+		/* Fall back to old OOB ioctl() if necessary */
+		if (mode == MTD_OPS_AUTO_OOB)
+			if (legacy_auto_oob_layout(mtd, fd, ooblen, oob))
+				return -1;
+		if (mtd_write_oob(desc, mtd, fd, seek, ooblen, oob) < 0)
+			return sys_errmsg("cannot write to OOB");
+	}
+	if (data) {
+		/* Seek to the beginning of the eraseblock */
+		if (lseek(fd, seek, SEEK_SET) != seek)
+			return sys_errmsg("cannot seek mtd%d to offset %"PRIdoff_t,
+					mtd->mtd_num, seek);
+		ret = write(fd, data, len);
+		if (ret != len)
+			return sys_errmsg("cannot write %d bytes to mtd%d "
+					  "(eraseblock %d, offset %d)",
+					  len, mtd->mtd_num, eb, offs);
+	}
+
+	return 0;
+}
+
+static int do_oob_op(libmtd_t desc, const struct mtd_dev_info *mtd, int fd,
+	      uint64_t start, uint64_t length, void *data, unsigned int cmd64,
+	      unsigned int cmd)
+{
+	int ret, oob_offs;
+	struct mtd_oob_buf64 oob64;
+	struct mtd_oob_buf oob;
+	unsigned long long max_offs;
+	const char *cmd64_str, *cmd_str;
+	struct libmtd *lib = (struct libmtd *)desc;
+
+	if (cmd64 ==  MEMREADOOB64) {
+		cmd64_str = "MEMREADOOB64";
+		cmd_str   = "MEMREADOOB";
+	} else {
+		cmd64_str = "MEMWRITEOOB64";
+		cmd_str   = "MEMWRITEOOB";
+	}
+
+	max_offs = (unsigned long long)mtd->eb_cnt * mtd->eb_size;
+	if (start >= max_offs) {
+		errmsg("bad page address %" PRIu64 ", mtd%d has %d eraseblocks (%llu bytes)",
+		       start, mtd->mtd_num, mtd->eb_cnt, max_offs);
+		errno = EINVAL;
+		return -1;
+	}
+
+	oob_offs = start & (mtd->min_io_size - 1);
+	if (oob_offs + length > mtd->oob_size || length == 0) {
+		errmsg("Cannot write %" PRIu64 " OOB bytes to address %" PRIu64 " (OOB offset %u) - mtd%d OOB size is only %d bytes",
+		       length, start, oob_offs, mtd->mtd_num,  mtd->oob_size);
+		errno = EINVAL;
+		return -1;
+	}
+
+	oob64.start = start;
+	oob64.length = length;
+	oob64.usr_ptr = (uint64_t)(unsigned long)data;
+
+	if (lib->offs64_ioctls == OFFS64_IOCTLS_SUPPORTED ||
+	    lib->offs64_ioctls == OFFS64_IOCTLS_UNKNOWN) {
+		ret = ioctl(fd, cmd64, &oob64);
+		if (ret == 0)
+			return ret;
+
+		if (errno != ENOTTY ||
+		    lib->offs64_ioctls != OFFS64_IOCTLS_UNKNOWN) {
+			sys_errmsg("%s ioctl failed for mtd%d, offset %" PRIu64 " (eraseblock %" PRIu64 ")",
+				   cmd64_str, mtd->mtd_num, start, start / mtd->eb_size);
+		}
+
+		/*
+		 * MEMREADOOB64/MEMWRITEOOB64 support was added in kernel
+		 * version 2.6.31, so probably we are working with older kernel
+		 * and these ioctls are not supported.
+		 */
+		lib->offs64_ioctls = OFFS64_IOCTLS_NOT_SUPPORTED;
+	}
+
+	if (oob64.start > 0xFFFFFFFFULL) {
+		errmsg("this system can address only up to address %lu",
+		       0xFFFFFFFFUL);
+		errno = EINVAL;
+		return -1;
+	}
+
+	oob.start = oob64.start;
+	oob.length = oob64.length;
+	oob.ptr = data;
+
+	ret = ioctl(fd, cmd, &oob);
+	if (ret < 0)
+		sys_errmsg("%s ioctl failed for mtd%d, offset %" PRIu64 " (eraseblock %" PRIu64 ")",
+			   cmd_str, mtd->mtd_num, start, start / mtd->eb_size);
+	return ret;
+}
+
+int mtd_read_oob(libmtd_t desc, const struct mtd_dev_info *mtd, int fd,
+		 uint64_t start, uint64_t length, void *data)
+{
+	return do_oob_op(desc, mtd, fd, start, length, data,
+			 MEMREADOOB64, MEMREADOOB);
+}
+
+int mtd_write_oob(libmtd_t desc, const struct mtd_dev_info *mtd, int fd,
+		  uint64_t start, uint64_t length, void *data)
+{
+	return do_oob_op(desc, mtd, fd, start, length, data,
+			 MEMWRITEOOB64, MEMWRITEOOB);
+}
+
+int mtd_write_img(const struct mtd_dev_info *mtd, int fd, int eb, int offs,
+		  const char *img_name)
+{
+	int tmp, ret, in_fd, len, written = 0;
+	off_t seek;
+	struct stat st;
+	char *buf;
+
+	ret = mtd_valid_erase_block(mtd, eb);
+	if (ret)
+		return ret;
+
+	if (offs < 0 || offs >= mtd->eb_size) {
+		errmsg("bad offset %d, mtd%d eraseblock size is %d",
+		       offs, mtd->mtd_num, mtd->eb_size);
+		errno = EINVAL;
+		return -1;
+	}
+	if (offs % mtd->subpage_size) {
+		errmsg("write offset %d is not aligned to mtd%d min. I/O size %d",
+		       offs, mtd->mtd_num, mtd->subpage_size);
+		errno = EINVAL;
+		return -1;
+	}
+
+	in_fd = open(img_name, O_RDONLY | O_CLOEXEC);
+	if (in_fd == -1)
+		return sys_errmsg("cannot open \"%s\"", img_name);
+
+	if (fstat(in_fd, &st)) {
+		sys_errmsg("cannot stat %s", img_name);
+		goto out_close;
+	}
+
+	len = st.st_size;
+	if (len % mtd->subpage_size) {
+		errmsg("size of \"%s\" is %d bytes, which is not aligned to "
+		       "mtd%d min. I/O size %d", img_name, len, mtd->mtd_num,
+		       mtd->subpage_size);
+		errno = EINVAL;
+		goto out_close;
+	}
+	tmp = (offs + len + mtd->eb_size - 1) / mtd->eb_size;
+	if (eb + tmp > mtd->eb_cnt) {
+		errmsg("\"%s\" image size is %d bytes, mtd%d size is %d "
+		       "eraseblocks, the image does not fit if we write it "
+		       "starting from eraseblock %d, offset %d",
+		       img_name, len, mtd->mtd_num, mtd->eb_cnt, eb, offs);
+		errno = EINVAL;
+		goto out_close;
+	}
+
+	/* Seek to the beginning of the eraseblock */
+	seek = (off_t)eb * mtd->eb_size + offs;
+	if (lseek(fd, seek, SEEK_SET) != seek) {
+		sys_errmsg("cannot seek mtd%d to offset %"PRIdoff_t,
+			    mtd->mtd_num, seek);
+		goto out_close;
+	}
+
+	buf = xmalloc(mtd->eb_size);
+
+	while (written < len) {
+		int rd = 0;
+
+		do {
+			ret = read(in_fd, buf, mtd->eb_size - offs - rd);
+			if (ret == -1) {
+				sys_errmsg("cannot read \"%s\"", img_name);
+				goto out_free;
+			}
+			rd += ret;
+		} while (ret && rd < mtd->eb_size - offs);
+
+		ret = write(fd, buf, rd);
+		if (ret != rd) {
+			sys_errmsg("cannot write %d bytes to mtd%d (eraseblock %d, offset %d)",
+				   len, mtd->mtd_num, eb, offs);
+			goto out_free;
+		}
+
+		offs = 0;
+		eb += 1;
+		written += rd;
+	}
+
+	free(buf);
+	close(in_fd);
+	return 0;
+
+out_free:
+	free(buf);
+out_close:
+	close(in_fd);
+	return -1;
+}
+
+int mtd_probe_node(libmtd_t desc, const char *node)
+{
+	struct stat st;
+	struct mtd_info info;
+	int i, mjr, mnr;
+	struct libmtd *lib = (struct libmtd *)desc;
+
+	if (stat(node, &st))
+		return sys_errmsg("cannot get information about \"%s\"", node);
+
+	if (!S_ISCHR(st.st_mode)) {
+		errmsg("\"%s\" is not a character device", node);
+		errno = EINVAL;
+		return -1;
+	}
+
+	mjr = major(st.st_rdev);
+	mnr = minor(st.st_rdev);
+
+	if (mtd_get_info((libmtd_t *)lib, &info))
+		return -1;
+
+	if (!lib->sysfs_supported)
+		return 0;
+
+	for (i = info.lowest_mtd_num; i <= info.highest_mtd_num; i++) {
+		int mjr1, mnr1, ret;
+
+		ret = dev_get_major(lib, i, &mjr1, &mnr1);
+		if (ret) {
+			if (errno == ENOENT)
+				continue;
+			if (!errno)
+				break;
+			return -1;
+		}
+
+		if (mjr1 == mjr && mnr1 == mnr)
+			return 1;
+	}
+
+	errno = 0;
+	return -1;
+}
diff --git a/oslib/libmtd.h b/oslib/libmtd.h
new file mode 100644
index 0000000..b5fd3f3
--- /dev/null
+++ b/oslib/libmtd.h
@@ -0,0 +1,357 @@
+/*
+ * Copyright (C) 2008, 2009 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Author: Artem Bityutskiy
+ *
+ * MTD library.
+ */
+
+/* Imported from mtd-utils by dehrenberg */
+
+#ifndef __LIBMTD_H__
+#define __LIBMTD_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Needed for uint8_t, uint64_t
+#include <stdint.h>
+
+/* Maximum MTD device name length */
+#define MTD_NAME_MAX 127
+/* Maximum MTD device type string length */
+#define MTD_TYPE_MAX 64
+
+/* MTD library descriptor */
+typedef void * libmtd_t;
+
+/* Forward decls */
+struct region_info_user;
+
+/**
+ * struct mtd_info - general MTD information.
+ * @mtd_dev_cnt: count of MTD devices in system
+ * @lowest_mtd_num: lowest MTD device number in system
+ * @highest_mtd_num: highest MTD device number in system
+ * @sysfs_supported: non-zero if sysfs is supported by MTD
+ */
+struct mtd_info
+{
+	int mtd_dev_cnt;
+	int lowest_mtd_num;
+	int highest_mtd_num;
+	unsigned int sysfs_supported:1;
+};
+
+/**
+ * struct mtd_dev_info - information about an MTD device.
+ * @mtd_num: MTD device number
+ * @major: major number of corresponding character device
+ * @minor: minor number of corresponding character device
+ * @type: flash type (constants like %MTD_NANDFLASH defined in mtd-abi.h)
+ * @type_str: static R/O flash type string
+ * @name: device name
+ * @size: device size in bytes
+ * @eb_cnt: count of eraseblocks
+ * @eb_size: eraseblock size
+ * @min_io_size: minimum input/output unit size
+ * @subpage_size: sub-page size
+ * @oob_size: OOB size (zero if the device does not have OOB area)
+ * @region_cnt: count of additional erase regions
+ * @writable: zero if the device is read-only
+ * @bb_allowed: non-zero if the MTD device may have bad eraseblocks
+ */
+struct mtd_dev_info
+{
+	int mtd_num;
+	int major;
+	int minor;
+	int type;
+	char type_str[MTD_TYPE_MAX + 1];
+	char name[MTD_NAME_MAX + 1];
+	long long size;
+	int eb_cnt;
+	int eb_size;
+	int min_io_size;
+	int subpage_size;
+	int oob_size;
+	int region_cnt;
+	unsigned int writable:1;
+	unsigned int bb_allowed:1;
+};
+
+/**
+ * libmtd_open - open MTD library.
+ *
+ * This function initializes and opens the MTD library and returns MTD library
+ * descriptor in case of success and %NULL in case of failure. In case of
+ * failure, errno contains zero if MTD is not present in the system, or
+ * contains the error code if a real error happened.
+ */
+libmtd_t libmtd_open(void);
+
+/**
+ * libmtd_close - close MTD library.
+ * @desc: MTD library descriptor
+ */
+void libmtd_close(libmtd_t desc);
+
+/**
+ * mtd_dev_present - check whether an MTD device is present.
+ * @desc: MTD library descriptor
+ * @mtd_num: MTD device number to check
+ *
+ * This function returns %1 if MTD device is present and %0 if not.
+ */
+int mtd_dev_present(libmtd_t desc, int mtd_num);
+
+/**
+ * mtd_get_info - get general MTD information.
+ * @desc: MTD library descriptor
+ * @info: the MTD device information is returned here
+ *
+ * This function fills the passed @info object with general MTD information and
+ * returns %0 in case of success and %-1 in case of failure. If the MTD
+ * subsystem is not present in the system, errno is set to @ENODEV.
+ */
+int mtd_get_info(libmtd_t desc, struct mtd_info *info);
+
+/**
+ * mtd_get_dev_info - get information about an MTD device.
+ * @desc: MTD library descriptor
+ * @node: name of the MTD device node
+ * @mtd: the MTD device information is returned here
+ *
+ * This function gets information about MTD device defined by the @node device
+ * node file and saves this information in the @mtd object. Returns %0 in case
+ * of success and %-1 in case of failure. If the MTD subsystem is not present
+ * in the system, or the MTD device does not exist, errno is set to @ENODEV.
+ */
+int mtd_get_dev_info(libmtd_t desc, const char *node, struct mtd_dev_info *mtd);
+
+/**
+ * mtd_get_dev_info1 - get information about an MTD device.
+ * @desc: MTD library descriptor
+ * @mtd_num: MTD device number to fetch information about
+ * @mtd: the MTD device information is returned here
+ *
+ * This function is identical to 'mtd_get_dev_info()' except that it accepts
+ * MTD device number, not MTD character device.
+ */
+int mtd_get_dev_info1(libmtd_t desc, int mtd_num, struct mtd_dev_info *mtd);
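+
+/*
+ * Illustrative usage sketch (not a complete program; error handling is kept
+ * minimal and printf() is assumed to be available): open the library,
+ * enumerate the present MTD devices and print their eraseblock geometry.
+ *
+ *	libmtd_t lib = libmtd_open();
+ *	struct mtd_info info;
+ *	struct mtd_dev_info dev;
+ *	int i;
+ *
+ *	if (!lib || mtd_get_info(lib, &info))
+ *		return -1;
+ *	for (i = info.lowest_mtd_num; i <= info.highest_mtd_num; i++) {
+ *		if (!mtd_dev_present(lib, i) || mtd_get_dev_info1(lib, i, &dev))
+ *			continue;
+ *		printf("mtd%d: %d eraseblocks of %d bytes\n",
+ *		       dev.mtd_num, dev.eb_cnt, dev.eb_size);
+ *	}
+ *	libmtd_close(lib);
+ */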
+
+/**
+ * mtd_lock - lock eraseblocks.
+ * @desc: MTD library descriptor
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to lock
+ *
+ * This function locks eraseblock @eb. Returns %0 in case of success and %-1
+ * in case of failure.
+ */
+int mtd_lock(const struct mtd_dev_info *mtd, int fd, int eb);
+
+/**
+ * mtd_unlock - unlock eraseblocks.
+ * @desc: MTD library descriptor
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to unlock
+ *
+ * This function unlocks eraseblock @eb. Returns %0 in case of success and %-1
+ * in case of failure.
+ */
+int mtd_unlock(const struct mtd_dev_info *mtd, int fd, int eb);
+
+/**
+ * mtd_erase - erase an eraseblock.
+ * @desc: MTD library descriptor
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to erase
+ *
+ * This function erases eraseblock @eb of MTD device described by @fd. Returns
+ * %0 in case of success and %-1 in case of failure.
+ */
+int mtd_erase(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb);
+
+/**
+ * mtd_regioninfo - get information about an erase region.
+ * @fd: MTD device node file descriptor
+ * @regidx: index of region to look up
+ * @reginfo: the region information is returned here
+ *
+ * This function gets information about an erase region defined by the
+ * @regidx index and saves this information in the @reginfo object.
+ * Returns %0 in case of success and %-1 in case of failure. If the
+ * @regidx is not valid or unavailable, errno is set to @ENODEV.
+ */
+int mtd_regioninfo(int fd, int regidx, struct region_info_user *reginfo);
+
+/**
+ * mtd_is_locked - see if the specified eraseblock is locked.
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to check
+ *
+ * This function checks to see if eraseblock @eb of MTD device described
+ * by @fd is locked. Returns %0 if it is unlocked, %1 if it is locked, and
+ * %-1 in case of failure. If the ioctl is not supported (support was added in
+ * Linux kernel 2.6.36) or this particular device does not support it, errno is
+ * set to @EOPNOTSUPP.
+ */
+int mtd_is_locked(const struct mtd_dev_info *mtd, int fd, int eb);
+
+/**
+ * mtd_torture - torture an eraseblock.
+ * @desc: MTD library descriptor
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to torture
+ *
+ * This function tortures eraseblock @eb. Returns %0 in case of success and %-1
+ * in case of failure.
+ */
+int mtd_torture(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb);
+
+/**
+ * mtd_is_bad - check if eraseblock is bad.
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to check
+ *
+ * This function checks if eraseblock @eb is bad. Returns %0 if not, %1 if yes,
+ * and %-1 in case of failure.
+ */
+int mtd_is_bad(const struct mtd_dev_info *mtd, int fd, int eb);
+
+/**
+ * mtd_mark_bad - mark an eraseblock as bad.
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to mark as bad
+ *
+ * This function marks eraseblock @eb as bad. Returns %0 in case of success and
+ * %-1 in case of failure.
+ */
+int mtd_mark_bad(const struct mtd_dev_info *mtd, int fd, int eb);
+
+/**
+ * mtd_read - read data from an MTD device.
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to read from
+ * @offs: offset within the eraseblock to read from
+ * @buf: buffer to read data to
+ * @len: how many bytes to read
+ *
+ * This function reads @len bytes of data from eraseblock @eb and offset @offs
+ * of the MTD device defined by @mtd and stores the read data at buffer @buf.
+ * Returns %0 in case of success and %-1 in case of failure.
+ */
+int mtd_read(const struct mtd_dev_info *mtd, int fd, int eb, int offs,
+	     void *buf, int len);
+
+/**
+ * mtd_write - write data to an MTD device.
+ * @desc: MTD library descriptor
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to write to
+ * @offs: offset within the eraseblock to write to
+ * @data: data buffer to write
+ * @len: how many data bytes to write
+ * @oob: OOB buffer to write
+ * @ooblen: how many OOB bytes to write
+ * @mode: write mode (e.g., %MTD_OOB_PLACE, %MTD_OOB_RAW)
+ *
+ * This function writes @len bytes of data to eraseblock @eb and offset @offs
+ * of the MTD device defined by @mtd. Returns %0 in case of success and %-1 in
+ * case of failure.
+ *
+ * Can only write to a single page at a time if writing to OOB.
+ */
+int mtd_write(libmtd_t desc, const struct mtd_dev_info *mtd, int fd, int eb,
+	      int offs, void *data, int len, void *oob, int ooblen,
+	      uint8_t mode);
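+
+/*
+ * Illustrative sketch of the erase/write/read cycle for one eraseblock @eb.
+ * It assumes @lib from libmtd_open(), a @dev populated by mtd_get_dev_info(),
+ * an open @fd on the corresponding /dev/mtdX node and a @buf of dev.eb_size
+ * bytes; error reporting is elided.
+ *
+ *	if (mtd_erase(lib, &dev, fd, eb))
+ *		return -1;
+ *	if (mtd_write(lib, &dev, fd, eb, 0, buf, dev.eb_size, NULL, 0, 0))
+ *		return -1;
+ *	if (mtd_read(&dev, fd, eb, 0, buf, dev.eb_size))
+ *		return -1;
+ */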
+
+/**
+ * mtd_read_oob - read out-of-band area.
+ * @desc: MTD library descriptor
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @start: page-aligned start address
+ * @length: number of OOB bytes to read
+ * @data: read buffer
+ *
+ * This function reads @length OOB bytes starting from address @start on
+ * MTD device described by @fd. The address is specified as page byte offset
+ * from the beginning of the MTD device. This function returns %0 in case of
+ * success and %-1 in case of failure.
+ */
+int mtd_read_oob(libmtd_t desc, const struct mtd_dev_info *mtd, int fd,
+		 uint64_t start, uint64_t length, void *data);
+
+/**
+ * mtd_write_oob - write out-of-band area.
+ * @desc: MTD library descriptor
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @start: page-aligned start address
+ * @length: number of OOB bytes to write
+ * @data: write buffer
+ *
+ * This function writes @length OOB bytes starting from address @start on
+ * MTD device described by @fd. The address is specified as page byte offset
+ * from the beginning of the MTD device. Returns %0 in case of success and %-1
+ * in case of failure.
+ */
+int mtd_write_oob(libmtd_t desc, const struct mtd_dev_info *mtd, int fd,
+		  uint64_t start, uint64_t length, void *data);
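+
+/*
+ * Illustrative sketch (same assumptions as the erase/write/read sketch above):
+ * read the OOB area of the first page of the device into @oobbuf, which must
+ * hold at least dev.oob_size bytes.
+ *
+ *	if (mtd_read_oob(lib, &dev, fd, 0, dev.oob_size, oobbuf))
+ *		return -1;
+ */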
+
+/**
+ * mtd_write_img - write a file to MTD device.
+ * @mtd: MTD device description object
+ * @fd: MTD device node file descriptor
+ * @eb: eraseblock to write to
+ * @offs: offset within the eraseblock to write to
+ * @img_name: the file to write
+ *
+ * This function writes an image @img_name to the MTD device defined by @mtd. @eb
+ * and @offs are the starting eraseblock and offset on the MTD device. Returns
+ * %0 in case of success and %-1 in case of failure.
+ */
+int mtd_write_img(const struct mtd_dev_info *mtd, int fd, int eb, int offs,
+		  const char *img_name);
+
+/**
+ * mtd_probe_node - test MTD node.
+ * @desc: MTD library descriptor
+ * @node: the node to test
+ *
+ * This function tests whether @node is an MTD device node and returns %1 if it
+ * is, and %-1 if it is not (errno is %ENODEV in this case) or if an error
+ * occurred.
+ */
+int mtd_probe_node(libmtd_t desc, const char *node);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __LIBMTD_H__ */
diff --git a/oslib/libmtd_common.h b/oslib/libmtd_common.h
new file mode 100644
index 0000000..9768066
--- /dev/null
+++ b/oslib/libmtd_common.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) Artem Bityutskiy, 2007, 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* Imported from mtd-utils by dehrenberg */
+
+#ifndef __MTD_UTILS_COMMON_H__
+#define __MTD_UTILS_COMMON_H__
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <features.h>
+#include <inttypes.h>
+#include <sys/sysmacros.h>
+
+#ifndef PROGRAM_NAME
+# error "You must define PROGRAM_NAME before including this header"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MIN	/* some C lib headers define this for us */
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+#ifndef MAX
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif
+#define min(a, b) MIN(a, b) /* glue for linux kernel source */
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+#define ALIGN(x,a) __ALIGN_MASK(x,(typeof(x))(a)-1)
+#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
+
+#define min_t(t,x,y) ({ \
+	typeof((x)) _x = (x); \
+	typeof((y)) _y = (y); \
+	(_x < _y) ? _x : _y; \
+})
+
+#define max_t(t,x,y) ({ \
+	typeof((x)) _x = (x); \
+	typeof((y)) _y = (y); \
+	(_x > _y) ? _x : _y; \
+})
+
+#ifndef O_CLOEXEC
+#define O_CLOEXEC 0
+#endif
+
+/* define a print format specifier for off_t */
+#ifdef __USE_FILE_OFFSET64
+#define PRIxoff_t PRIx64
+#define PRIdoff_t PRId64
+#else
+#define PRIxoff_t "l"PRIx32
+#define PRIdoff_t "l"PRId32
+#endif
+
+/* Verbose messages */
+#define bareverbose(verbose, fmt, ...) do {                        \
+	if (verbose)                                               \
+		printf(fmt, ##__VA_ARGS__);                        \
+} while(0)
+#define verbose(verbose, fmt, ...) \
+	bareverbose(verbose, "%s: " fmt "\n", PROGRAM_NAME, ##__VA_ARGS__)
+
+/* Normal messages */
+#define normsg_cont(fmt, ...) do {                                 \
+	printf("%s: " fmt, PROGRAM_NAME, ##__VA_ARGS__);           \
+} while(0)
+#define normsg(fmt, ...) do {                                      \
+	normsg_cont(fmt "\n", ##__VA_ARGS__);                      \
+} while(0)
+
+/* Error messages */
+#define errmsg(fmt, ...)  ({                                                \
+	fprintf(stderr, "%s: error!: " fmt "\n", PROGRAM_NAME, ##__VA_ARGS__); \
+	-1;                                                                 \
+})
+#define errmsg_die(fmt, ...) do {                                           \
+	exit(errmsg(fmt, ##__VA_ARGS__));                                   \
+} while(0)
+
+/* System error messages */
+#define sys_errmsg(fmt, ...)  ({                                            \
+	int _err = errno;                                                   \
+	errmsg(fmt, ##__VA_ARGS__);                                         \
+	fprintf(stderr, "%*serror %d (%s)\n", (int)sizeof(PROGRAM_NAME) + 1,\
+		"", _err, strerror(_err));                                  \
+	-1;                                                                 \
+})
+#define sys_errmsg_die(fmt, ...) do {                                       \
+	exit(sys_errmsg(fmt, ##__VA_ARGS__));                               \
+} while(0)
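+
+/*
+ * Both errmsg() and sys_errmsg() evaluate to -1, so callers can report an
+ * error and bail out in a single statement, for example:
+ *
+ *	if (stat(node, &st))
+ *		return sys_errmsg("cannot stat \"%s\"", node);
+ */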
+
+/* Warnings */
+#define warnmsg(fmt, ...) do {                                                \
+	fprintf(stderr, "%s: warning!: " fmt "\n", PROGRAM_NAME, ##__VA_ARGS__); \
+} while(0)
+
+#if defined(__UCLIBC__)
+/* uClibc versions before 0.9.34 don't have rpmatch() */
+#if __UCLIBC_MAJOR__ == 0 && \
+		(__UCLIBC_MINOR__ < 9 || \
+		(__UCLIBC_MINOR__ == 9 && __UCLIBC_SUBLEVEL__ < 34))
+#undef rpmatch
+#define rpmatch __rpmatch
+static inline int __rpmatch(const char *resp)
+{
+    return (resp[0] == 'y' || resp[0] == 'Y') ? 1 :
+	(resp[0] == 'n' || resp[0] == 'N') ? 0 : -1;
+}
+#endif
+#endif
+
+/**
+ * prompt the user for confirmation
+ */
+static inline bool prompt(const char *msg, bool def)
+{
+	char *line = NULL;
+	size_t len;
+	bool ret = def;
+
+	do {
+		normsg_cont("%s (%c/%c) ", msg, def ? 'Y' : 'y', def ? 'n' : 'N');
+		fflush(stdout);
+
+		while (getline(&line, &len, stdin) == -1) {
+			printf("failed to read prompt; assuming '%s'\n",
+				def ? "yes" : "no");
+			break;
+		}
+
+		if (strcmp("\n", line) != 0) {
+			switch (rpmatch(line)) {
+			case 0: ret = false; break;
+			case 1: ret = true; break;
+			case -1:
+				puts("unknown response; please try again");
+				continue;
+			}
+		}
+		break;
+	} while (1);
+
+	free(line);
+
+	return ret;
+}
+
+static inline int is_power_of_2(unsigned long long n)
+{
+	return (n != 0 && ((n & (n - 1)) == 0));
+}
+
+/**
+ * simple_strtoX - convert a hex/dec/oct string into a number
+ * @snum: buffer to convert
+ * @error: set to 1 when buffer isn't fully consumed
+ *
+ * These functions are similar to the standard strtoX() functions, but they are
+ * a little bit easier to use if you want to convert a full string of digits
+ * into binary form. The typical usage:
+ *
+ * int error = 0;
+ * unsigned long num;
+ *
+ * num = simple_strtoul(str, &error);
+ * if (error || ... if needed, your check that num is not out of range ...)
+ * 	error_happened();
+ */
+#define simple_strtoX(func, type) \
+static inline type simple_##func(const char *snum, int *error) \
+{ \
+	char *endptr; \
+	type ret = func(snum, &endptr, 0); \
+ \
+	if (error && (!*snum || *endptr)) { \
+		errmsg("%s: unable to parse the number '%s'", #func, snum); \
+		*error = 1; \
+	} \
+ \
+	return ret; \
+}
+simple_strtoX(strtol, long int)
+simple_strtoX(strtoll, long long int)
+simple_strtoX(strtoul, unsigned long int)
+simple_strtoX(strtoull, unsigned long long int)
+
+/* Simple version-printing for utils */
+#define common_print_version() \
+do { \
+	printf("%s %s\n", PROGRAM_NAME, VERSION); \
+} while (0)
+
+#include "libmtd_xalloc.h"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !__MTD_UTILS_COMMON_H__ */
diff --git a/oslib/libmtd_int.h b/oslib/libmtd_int.h
new file mode 100644
index 0000000..cbe2ff5
--- /dev/null
+++ b/oslib/libmtd_int.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) International Business Machines Corp., 2006
+ * Copyright (C) 2009 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Author: Artem Bityutskiy
+ *
+ * MTD library.
+ */
+
+/* Imported from mtd-utils by dehrenberg */
+
+#ifndef __LIBMTD_INT_H__
+#define __LIBMTD_INT_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PROGRAM_NAME "libmtd"
+
+#define SYSFS_MTD        "class/mtd"
+#define MTD_NAME_PATT    "mtd%d"
+#define MTD_DEV          "dev"
+#define MTD_NAME         "name"
+#define MTD_TYPE         "type"
+#define MTD_EB_SIZE      "erasesize"
+#define MTD_SIZE         "size"
+#define MTD_MIN_IO_SIZE  "writesize"
+#define MTD_SUBPAGE_SIZE "subpagesize"
+#define MTD_OOB_SIZE     "oobsize"
+#define MTD_REGION_CNT   "numeraseregions"
+#define MTD_FLAGS        "flags"
+
+#define OFFS64_IOCTLS_UNKNOWN       0
+#define OFFS64_IOCTLS_NOT_SUPPORTED 1
+#define OFFS64_IOCTLS_SUPPORTED     2
+
+/**
+ * libmtd - MTD library description data structure.
+ * @sysfs_mtd: MTD directory in sysfs
+ * @mtd: MTD device sysfs directory pattern
+ * @mtd_dev: MTD device major/minor numbers file pattern
+ * @mtd_name: MTD device name file pattern
+ * @mtd_type: MTD device type file pattern
+ * @mtd_eb_size: MTD device eraseblock size file pattern
+ * @mtd_size: MTD device size file pattern
+ * @mtd_min_io_size: minimum I/O unit size file pattern
+ * @mtd_subpage_size: sub-page size file pattern
+ * @mtd_oob_size: MTD device OOB size file pattern
+ * @mtd_region_cnt: count of additional erase regions file pattern
+ * @mtd_flags: MTD device flags file pattern
+ * @sysfs_supported: non-zero if sysfs is supported by MTD
+ * @offs64_ioctls: %OFFS64_IOCTLS_SUPPORTED if 64-bit %MEMERASE64,
+ *                 %MEMREADOOB64, %MEMWRITEOOB64 MTD device ioctls are
+ *                 supported, %OFFS64_IOCTLS_NOT_SUPPORTED if not, and
+ *                 %OFFS64_IOCTLS_UNKNOWN if it is not known yet;
+ *
+ *  Note, we cannot find out whether 64-bit ioctls are supported by MTD when we
+ *  are initializing the library, because this requires an MTD device node.
+ *  Indeed, we have to actually call the ioctl and check for %ENOTTY to find
+ *  out whether it is supported or not.
+ *
+ *  Thus, we leave %offs64_ioctls uninitialized in 'libmtd_open()', and
+ *  initialize it later, when corresponding libmtd function is used, and when
+ *  we actually have a device node and can invoke an ioctl command on it.
+ */
+struct libmtd
+{
+	char *sysfs_mtd;
+	char *mtd;
+	char *mtd_dev;
+	char *mtd_name;
+	char *mtd_type;
+	char *mtd_eb_size;
+	char *mtd_size;
+	char *mtd_min_io_size;
+	char *mtd_subpage_size;
+	char *mtd_oob_size;
+	char *mtd_region_cnt;
+	char *mtd_flags;
+	unsigned int sysfs_supported:1;
+	unsigned int offs64_ioctls:2;
+};
+
+int legacy_libmtd_open(void);
+int legacy_dev_present(int mtd_num);
+int legacy_mtd_get_info(struct mtd_info *info);
+int legacy_get_dev_info(const char *node, struct mtd_dev_info *mtd);
+int legacy_get_dev_info1(int dev_num, struct mtd_dev_info *mtd);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !__LIBMTD_INT_H__ */
diff --git a/oslib/libmtd_legacy.c b/oslib/libmtd_legacy.c
new file mode 100644
index 0000000..38dc2b7
--- /dev/null
+++ b/oslib/libmtd_legacy.c
@@ -0,0 +1,384 @@
+/*
+ * Copyright (C) 2009 Nokia Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * Author: Artem Bityutskiy
+ *
+ * This file is part of the MTD library. It implements support for pre-2.6.30
+ * kernels, where MTD did not have a sysfs interface. The main limitation of
+ * those old kernels is that the sub-page size is not exported to user-space,
+ * so it cannot be determined.
+ */
+
+/* Imported from mtd-utils by dehrenberg */
+
+#include <limits.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <mtd/mtd-user.h>
+
+#include "libmtd.h"
+#include "libmtd_int.h"
+#include "libmtd_common.h"
+
+#define MTD_PROC_FILE "/proc/mtd"
+#define MTD_DEV_PATT  "/dev/mtd%d"
+#define MTD_DEV_MAJOR 90
+
+#define PROC_MTD_FIRST     "dev:    size   erasesize  name\n"
+#define PROC_MTD_FIRST_LEN (sizeof(PROC_MTD_FIRST) - 1)
+#define PROC_MTD_MAX_LEN   4096
+#define PROC_MTD_PATT      "mtd%d: %llx %x"
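+
+/*
+ * PROC_MTD_PATT matches the per-device lines of /proc/mtd, which typically
+ * look like (the device name here is only an example):
+ *
+ *	mtd0: 00100000 00020000 "bootloader"
+ *
+ * i.e. the device number, the hex size, the hex eraseblock size and the
+ * quoted device name.
+ */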
+
+/**
+ * struct proc_parse_info - /proc/mtd parsing information.
+ * @mtd_num: MTD device number
+ * @size: device size
+ * @eb_size: eraseblock size
+ * @name: device name
+ * @buf: contents of /proc/mtd
+ * @data_size: how much data was read into @buf
+ * @pos: next string in @buf to parse
+ */
+struct proc_parse_info
+{
+	int mtd_num;
+	long long size;
+	char name[MTD_NAME_MAX + 1];
+	int eb_size;
+	char *buf;
+	int data_size;
+	char *next;
+};
+
+static int proc_parse_start(struct proc_parse_info *pi)
+{
+	int fd, ret;
+
+	fd = open(MTD_PROC_FILE, O_RDONLY);
+	if (fd == -1)
+		return -1;
+
+	pi->buf = xmalloc(PROC_MTD_MAX_LEN);
+
+	ret = read(fd, pi->buf, PROC_MTD_MAX_LEN);
+	if (ret == -1) {
+		sys_errmsg("cannot read \"%s\"", MTD_PROC_FILE);
+		goto out_free;
+	}
+
+	if (ret < PROC_MTD_FIRST_LEN ||
+	    memcmp(pi->buf, PROC_MTD_FIRST, PROC_MTD_FIRST_LEN)) {
+		errmsg("\"%s\" does not start with \"%s\"", MTD_PROC_FILE,
+		       PROC_MTD_FIRST);
+		goto out_free;
+	}
+
+	pi->data_size = ret;
+	pi->next = pi->buf + PROC_MTD_FIRST_LEN;
+
+	close(fd);
+	return 0;
+
+out_free:
+	free(pi->buf);
+	close(fd);
+	return -1;
+}
+
+static int proc_parse_next(struct proc_parse_info *pi)
+{
+	int ret, len, pos = pi->next - pi->buf;
+	char *p, *p1;
+
+	if (pos >= pi->data_size) {
+		free(pi->buf);
+		return 0;
+	}
+
+	ret = sscanf(pi->next, PROC_MTD_PATT, &pi->mtd_num, &pi->size,
+		     &pi->eb_size);
+	if (ret != 3)
+		return errmsg("\"%s\" pattern not found", PROC_MTD_PATT);
+
+	p = memchr(pi->next, '\"', pi->data_size - pos);
+	if (!p)
+		return errmsg("opening \" not found");
+	p += 1;
+	pos = p - pi->buf;
+	if (pos >= pi->data_size)
+		return errmsg("opening \" not found");
+
+	p1 = memchr(p, '\"', pi->data_size - pos);
+	if (!p1)
+		return errmsg("closing \" not found");
+	pos = p1 - pi->buf;
+	if (pos >= pi->data_size)
+		return errmsg("closing \" not found");
+
+	len = p1 - p;
+	if (len > MTD_NAME_MAX)
+		return errmsg("too long mtd%d device name", pi->mtd_num);
+
+	memcpy(pi->name, p, len);
+	pi->name[len] = '\0';
+
+	if (p1[1] != '\n')
+		return errmsg("newline after closing \" not found");
+	pi->next = p1 + 2;
+	return 1;
+}
+
+/**
+ * legacy_libmtd_open - legacy version of 'libmtd_open()'.
+ *
+ * This function just checks that MTD is present in the system. Returns
+ * zero in case of success and %-1 in case of failure. In case of failure,
+ * errno contains zero if MTD is not present in the system, or contains the
+ * error code if a real error happened. This is similar to the 'libmtd_open()'
+ * return conventions.
+ */
+int legacy_libmtd_open(void)
+{
+	int fd;
+
+	fd = open(MTD_PROC_FILE, O_RDONLY);
+	if (fd == -1) {
+		if (errno == ENOENT)
+			errno = 0;
+		return -1;
+	}
+
+	close(fd);
+	return 0;
+}
+
+/**
+ * legacy_dev_present - legacy version of 'mtd_dev_present()'.
+ * @mtd_num: MTD device number to check
+ *
+ * When the kernel does not provide sysfs files for the MTD subsystem, fall
+ * back to parsing the /proc/mtd file to determine whether an MTD device with
+ * number @mtd_num is present.
+ */
+int legacy_dev_present(int mtd_num)
+{
+	int ret;
+	struct proc_parse_info pi;
+
+	ret = proc_parse_start(&pi);
+	if (ret)
+		return -1;
+
+	while (proc_parse_next(&pi)) {
+		if (pi.mtd_num == mtd_num)
+			return 1;
+	}
+
+	return 0;
+}
+
+/**
+ * legacy_mtd_get_info - legacy version of 'mtd_get_info()'.
+ * @info: the MTD device information is returned here
+ *
+ * This function is similar to 'mtd_get_info()' and has the same conventions.
+ */
+int legacy_mtd_get_info(struct mtd_info *info)
+{
+	int ret;
+	struct proc_parse_info pi;
+
+	ret = proc_parse_start(&pi);
+	if (ret)
+		return -1;
+
+	info->lowest_mtd_num = INT_MAX;
+	while (proc_parse_next(&pi)) {
+		info->mtd_dev_cnt += 1;
+		if (pi.mtd_num > info->highest_mtd_num)
+			info->highest_mtd_num = pi.mtd_num;
+		if (pi.mtd_num < info->lowest_mtd_num)
+			info->lowest_mtd_num = pi.mtd_num;
+	}
+
+	return 0;
+}
+
+/**
+ * legacy_get_dev_info - legacy version of 'mtd_get_dev_info()'.
+ * @node: name of the MTD device node
+ * @mtd: the MTD device information is returned here
+ *
+ * This function is similar to 'mtd_get_dev_info()' and has the same
+ * conventions.
+ */
+int legacy_get_dev_info(const char *node, struct mtd_dev_info *mtd)
+{
+	struct stat st;
+	struct mtd_info_user ui;
+	int fd, ret;
+	loff_t offs = 0;
+	struct proc_parse_info pi;
+
+	if (stat(node, &st)) {
+		sys_errmsg("cannot open \"%s\"", node);
+		if (errno == ENOENT)
+			normsg("MTD subsystem is old and does not support "
+			       "sysfs, so MTD character device nodes have "
+			       "to exist");
+		return -1;
+	}
+
+	if (!S_ISCHR(st.st_mode)) {
+		errno = EINVAL;
+		return errmsg("\"%s\" is not a character device", node);
+	}
+
+	memset(mtd, '\0', sizeof(struct mtd_dev_info));
+	mtd->major = major(st.st_rdev);
+	mtd->minor = minor(st.st_rdev);
+
+	if (mtd->major != MTD_DEV_MAJOR) {
+		errno = EINVAL;
+		return errmsg("\"%s\" has major number %d, MTD devices have "
+			      "major %d", node, mtd->major, MTD_DEV_MAJOR);
+	}
+
+	mtd->mtd_num = mtd->minor / 2;
+
+	fd = open(node, O_RDONLY);
+	if (fd == -1)
+		return sys_errmsg("cannot open \"%s\"", node);
+
+	if (ioctl(fd, MEMGETINFO, &ui)) {
+		sys_errmsg("MEMGETINFO ioctl request failed");
+		goto out_close;
+	}
+
+	ret = ioctl(fd, MEMGETBADBLOCK, &offs);
+	if (ret == -1) {
+		if (errno != EOPNOTSUPP) {
+			sys_errmsg("MEMGETBADBLOCK ioctl failed");
+			goto out_close;
+		}
+		errno = 0;
+		mtd->bb_allowed = 0;
+	} else
+		mtd->bb_allowed = 1;
+
+	mtd->type = ui.type;
+	mtd->size = ui.size;
+	mtd->eb_size = ui.erasesize;
+	mtd->min_io_size = ui.writesize;
+	mtd->oob_size = ui.oobsize;
+
+	if (mtd->min_io_size <= 0) {
+		errmsg("mtd%d (%s) has insane min. I/O unit size %d",
+		       mtd->mtd_num, node, mtd->min_io_size);
+		goto out_close;
+	}
+	if (mtd->eb_size <= 0 || mtd->eb_size < mtd->min_io_size) {
+		errmsg("mtd%d (%s) has insane eraseblock size %d",
+		       mtd->mtd_num, node, mtd->eb_size);
+		goto out_close;
+	}
+	if (mtd->size <= 0 || mtd->size < mtd->eb_size) {
+		errmsg("mtd%d (%s) has insane size %lld",
+		       mtd->mtd_num, node, mtd->size);
+		goto out_close;
+	}
+	mtd->eb_cnt = mtd->size / mtd->eb_size;
+
+	switch(mtd->type) {
+	case MTD_ABSENT:
+		errmsg("mtd%d (%s) is removable and is not present",
+		       mtd->mtd_num, node);
+		goto out_close;
+	case MTD_RAM:
+		strcpy((char *)mtd->type_str, "ram");
+		break;
+	case MTD_ROM:
+		strcpy((char *)mtd->type_str, "rom");
+		break;
+	case MTD_NORFLASH:
+		strcpy((char *)mtd->type_str, "nor");
+		break;
+	case MTD_NANDFLASH:
+		strcpy((char *)mtd->type_str, "nand");
+		break;
+	case MTD_MLCNANDFLASH:
+		strcpy((char *)mtd->type_str, "mlc-nand");
+		break;
+	case MTD_DATAFLASH:
+		strcpy((char *)mtd->type_str, "dataflash");
+		break;
+	case MTD_UBIVOLUME:
+		strcpy((char *)mtd->type_str, "ubi");
+		break;
+	default:
+		goto out_close;
+	}
+
+	if (ui.flags & MTD_WRITEABLE)
+		mtd->writable = 1;
+	mtd->subpage_size = mtd->min_io_size;
+
+	close(fd);
+
+	/*
+	 * Unfortunately, the device name is not available via ioctl, and
+	 * we have to parse /proc/mtd to get it.
+	 */
+	ret = proc_parse_start(&pi);
+	if (ret)
+		return -1;
+
+	while (proc_parse_next(&pi)) {
+		if (pi.mtd_num == mtd->mtd_num) {
+			strcpy((char *)mtd->name, pi.name);
+			return 0;
+		}
+	}
+
+	errmsg("mtd%d not found in \"%s\"", mtd->mtd_num, MTD_PROC_FILE);
+	errno = ENOENT;
+	return -1;
+
+out_close:
+	close(fd);
+	return -1;
+}
+
+/**
+ * legacy_get_dev_info1 - legacy version of 'mtd_get_dev_info1()'.
+ * @node: name of the MTD device node
+ * @mtd: the MTD device information is returned here
+ *
+ * This function is similar to 'mtd_get_dev_info1()' and has the same
+ * conventions.
+ */
+int legacy_get_dev_info1(int mtd_num, struct mtd_dev_info *mtd)
+{
+	char node[sizeof(MTD_DEV_PATT) + 20];
+
+	sprintf(node, MTD_DEV_PATT, mtd_num);
+	return legacy_get_dev_info(node, mtd);
+}
diff --git a/oslib/libmtd_xalloc.h b/oslib/libmtd_xalloc.h
new file mode 100644
index 0000000..532b80f
--- /dev/null
+++ b/oslib/libmtd_xalloc.h
@@ -0,0 +1,106 @@
+/*
+ * memory wrappers
+ *
+ * Copyright (c) Artem Bityutskiy, 2007, 2008
+ * Copyright 2001, 2002 Red Hat, Inc.
+ *           2001 David A. Schleef <ds@lineo.com>
+ *           2002 Axis Communications AB
+ *           2001, 2002 Erik Andersen <andersen@codepoet.org>
+ *           2004 University of Szeged, Hungary
+ *           2006 KaiGai Kohei <kaigai@ak.jp.nec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef __MTD_UTILS_XALLOC_H__
+#define __MTD_UTILS_XALLOC_H__
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Mark these functions as unused so that gcc does not emit warnings
+ * when people include this header but don't use every function.
+ */
+
+__attribute__((unused))
+static void *xmalloc(size_t size)
+{
+	void *ptr = malloc(size);
+
+	if (ptr == NULL && size != 0)
+		sys_errmsg_die("out of memory");
+	return ptr;
+}
+
+__attribute__((unused))
+static void *xcalloc(size_t nmemb, size_t size)
+{
+	void *ptr = calloc(nmemb, size);
+
+	if (ptr == NULL && nmemb != 0 && size != 0)
+		sys_errmsg_die("out of memory");
+	return ptr;
+}
+
+__attribute__((unused))
+static void *xzalloc(size_t size)
+{
+	return xcalloc(1, size);
+}
+
+__attribute__((unused))
+static void *xrealloc(void *ptr, size_t size)
+{
+	ptr = realloc(ptr, size);
+	if (ptr == NULL && size != 0)
+		sys_errmsg_die("out of memory");
+	return ptr;
+}
+
+__attribute__((unused))
+static char *xstrdup(const char *s)
+{
+	char *t;
+
+	if (s == NULL)
+		return NULL;
+	t = strdup(s);
+	if (t == NULL)
+		sys_errmsg_die("out of memory");
+	return t;
+}
+
+#ifdef _GNU_SOURCE
+
+__attribute__((unused))
+static int xasprintf(char **strp, const char *fmt, ...)
+{
+	int cnt;
+	va_list ap;
+
+	va_start(ap, fmt);
+	cnt = vasprintf(strp, fmt, ap);
+	va_end(ap);
+
+	if (cnt == -1)
+		sys_errmsg_die("out of memory");
+
+	return cnt;
+}
+#endif
+
+#endif /* !__MTD_UTILS_XALLOC_H__ */
diff --git a/lib/linux-dev-lookup.c b/oslib/linux-dev-lookup.c
similarity index 92%
rename from lib/linux-dev-lookup.c
rename to oslib/linux-dev-lookup.c
index 4d5f356..5fbccd3 100644
--- a/lib/linux-dev-lookup.c
+++ b/oslib/linux-dev-lookup.c
@@ -5,7 +5,7 @@
 #include <stdio.h>
 #include <unistd.h>
 
-#include "../os/os.h"
+#include "linux-dev-lookup.h"
 
 int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj,
 			   unsigned int min)
@@ -25,7 +25,7 @@
 		if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, ".."))
 			continue;
 
-		sprintf(full_path, "%s%s%s", path, FIO_OS_PATH_SEPARATOR, dir->d_name);
+		sprintf(full_path, "%s/%s", path, dir->d_name);
 		if (lstat(full_path, &st) == -1) {
 			perror("lstat");
 			break;
diff --git a/lib/linux-dev-lookup.h b/oslib/linux-dev-lookup.h
similarity index 100%
rename from lib/linux-dev-lookup.h
rename to oslib/linux-dev-lookup.h
diff --git a/lib/strcasestr.c b/oslib/strcasestr.c
similarity index 91%
rename from lib/strcasestr.c
rename to oslib/strcasestr.c
index 92cf24c..2626609 100644
--- a/lib/strcasestr.c
+++ b/oslib/strcasestr.c
@@ -1,6 +1,8 @@
 #include <ctype.h>
 #include <stddef.h>
 
+#ifndef CONFIG_STRCASESTR
+
 char *strcasestr(const char *s1, const char *s2)
 {
 	const char *s = s1;
@@ -23,3 +25,5 @@
 
 	return *p ? NULL : (char *) s1;
 }
+
+#endif
diff --git a/lib/strcasestr.h b/oslib/strcasestr.h
similarity index 100%
rename from lib/strcasestr.h
rename to oslib/strcasestr.h
diff --git a/oslib/strlcat.c b/oslib/strlcat.c
new file mode 100644
index 0000000..3b33d0e
--- /dev/null
+++ b/oslib/strlcat.c
@@ -0,0 +1,24 @@
+#include <string.h>
+#include "strlcat.h"
+
+size_t strlcat(char *dst, const char *src, size_t size)
+{
+	size_t dstlen;
+	size_t srclen;
+
+	dstlen = strlen(dst);
+
+	/* return if there is no room left to append anything */
+	if (size <= dstlen + 1)
+		return dstlen;
+	size -= dstlen + 1;
+
+	srclen = strlen(src);
+	if (srclen > size)
+		srclen = size;
+
+	memcpy(dst + dstlen, src, srclen);
+	dst[dstlen + srclen] = '\0';
+
+	return dstlen + srclen;
+}
diff --git a/oslib/strlcat.h b/oslib/strlcat.h
new file mode 100644
index 0000000..baeace4
--- /dev/null
+++ b/oslib/strlcat.h
@@ -0,0 +1,6 @@
+#ifndef FIO_STRLCAT_H
+#define FIO_STRLCAT_H
+
+size_t strlcat(char *dst, const char *src, size_t size);
+
+#endif
diff --git a/lib/strsep.c b/oslib/strsep.c
similarity index 100%
rename from lib/strsep.c
rename to oslib/strsep.c
diff --git a/lib/strsep.h b/oslib/strsep.h
similarity index 100%
rename from lib/strsep.h
rename to oslib/strsep.h
diff --git a/parse.c b/parse.c
index 7912212..4d4fddd 100644
--- a/parse.c
+++ b/parse.c
@@ -15,8 +15,10 @@
 #include "parse.h"
 #include "debug.h"
 #include "options.h"
+#include "optgroup.h"
 #include "minmax.h"
 #include "lib/ieee754.h"
+#include "lib/pow2.h"
 
 #ifdef CONFIG_ARITHMETIC
 #include "y.tab.h"
@@ -51,7 +53,7 @@
 }
 
 static void show_option_range(struct fio_option *o,
-				int (*logger)(const char *format, ...))
+			      size_t (*logger)(const char *format, ...))
 {
 	if (o->type == FIO_OPT_FLOAT_LIST) {
 		if (o->minfp == DBL_MIN && o->maxfp == DBL_MAX)
@@ -107,8 +109,9 @@
 		"list of floating point values separated by ':' (opt=5.9:7.8)",
 		"no argument (opt)",
 		"deprecated",
+		"unsupported",
 	};
-	int (*logger)(const char *format, ...);
+	size_t (*logger)(const char *format, ...);
 
 	if (is_err)
 		logger = log_err;
@@ -132,6 +135,7 @@
 	const char *p = str;
 	char *c;
 	unsigned long long mult = 1;
+	int i;
 
 	/*
          * Go forward until we hit a non-digit, or +/- sign
@@ -150,7 +154,7 @@
 	}
 
 	c = strdup(p);
-	for (int i = 0; i < strlen(c); i++)
+	for (i = 0; i < strlen(c); i++)
 		c[i] = tolower(c[i]);
 
 	if (!strncmp("us", c, 2) || !strncmp("usec", c, 4))
@@ -164,7 +168,7 @@
 	else if (!strcmp("h", c))
 		mult = 60 * 60 * 1000000UL;
 	else if (!strcmp("d", c))
-		mult = 24 * 60 * 60 * 1000000UL;
+		mult = 24 * 60 * 60 * 1000000ULL;
 
 	free(c);
 	return mult;
@@ -204,32 +208,50 @@
 		}
 	}
 
+	/* Explicit IEC suffixes ("kib", "mib", ...) use the multiplier that
+	 * kb_base is not using: 1024 if kb_base is 1000, else 1000.
+	 */
 	if (!strncmp("pib", c, 3)) {
 		pow = 5;
-		mult = 1000;
+		if (kb_base == 1000)
+			mult = 1024;
+		else if (kb_base == 1024)
+			mult = 1000;
 	} else if (!strncmp("tib", c, 3)) {
 		pow = 4;
-		mult = 1000;
+		if (kb_base == 1000)
+			mult = 1024;
+		else if (kb_base == 1024)
+			mult = 1000;
 	} else if (!strncmp("gib", c, 3)) {
 		pow = 3;
-		mult = 1000;
+		if (kb_base == 1000)
+			mult = 1024;
+		else if (kb_base == 1024)
+			mult = 1000;
 	} else if (!strncmp("mib", c, 3)) {
 		pow = 2;
-		mult = 1000;
+		if (kb_base == 1000)
+			mult = 1024;
+		else if (kb_base == 1024)
+			mult = 1000;
 	} else if (!strncmp("kib", c, 3)) {
 		pow = 1;
-		mult = 1000;
-	} else if (!strncmp("p", c, 1) || !strncmp("pb", c, 2))
+		if (kb_base == 1000)
+			mult = 1024;
+		else if (kb_base == 1024)
+			mult = 1000;
+	} else if (!strncmp("p", c, 1) || !strncmp("pb", c, 2)) {
 		pow = 5;
-	else if (!strncmp("t", c, 1) || !strncmp("tb", c, 2))
+	} else if (!strncmp("t", c, 1) || !strncmp("tb", c, 2)) {
 		pow = 4;
-	else if (!strncmp("g", c, 1) || !strncmp("gb", c, 2))
+	} else if (!strncmp("g", c, 1) || !strncmp("gb", c, 2)) {
 		pow = 3;
-	else if (!strncmp("m", c, 1) || !strncmp("mb", c, 2))
+	} else if (!strncmp("m", c, 1) || !strncmp("mb", c, 2)) {
 		pow = 2;
-	else if (!strncmp("k", c, 1) || !strncmp("kb", c, 2))
+	} else if (!strncmp("k", c, 1) || !strncmp("kb", c, 2)) {
 		pow = 1;
-	else if (!strncmp("%", c, 1)) {
+	} else if (!strncmp("%", c, 1)) {
 		*percent = 1;
 		free(c);
 		return ret;
@@ -482,6 +504,8 @@
 			if (!vp->ival || vp->ival[0] == '\0')
 				continue;
 			all_skipped = 0;
+			if (!ptr)
+				break;
 			if (!strncmp(vp->ival, ptr, str_match_len(vp, ptr))) {
 				ret = 0;
 				if (o->off1)
@@ -521,6 +545,10 @@
 
 		if (ret)
 			break;
+		if (o->pow2 && !is_power_of_2(ull)) {
+			log_err("%s: must be a power-of-2\n", o->name);
+			return 1;
+		}
 
 		if (o->maxval && ull > o->maxval) {
 			log_err("max value out of range: %llu"
@@ -898,6 +926,25 @@
 	return ret;
 }
 
+struct fio_option *find_option(struct fio_option *options, const char *opt)
+{
+	struct fio_option *o;
+
+	for (o = &options[0]; o->name; o++) {
+		if (!o_match(o, opt))
+			continue;
+		if (o->type == FIO_OPT_UNSUPPORTED) {
+			log_err("Option <%s>: %s\n", o->name, o->help);
+			continue;
+		}
+
+		return o;
+	}
+
+	return NULL;
+}
+
+
 static struct fio_option *get_option(char *opt,
 				     struct fio_option *options, char **post)
 {
@@ -953,8 +1000,27 @@
 	__fio_options = NULL;
 }
 
+static void add_to_dump_list(struct fio_option *o, struct flist_head *dump_list,
+			     const char *post)
+{
+	struct print_option *p;
+
+	if (!dump_list)
+		return;
+
+	p = malloc(sizeof(*p));
+	p->name = strdup(o->name);
+	if (post)
+		p->value = strdup(post);
+	else
+		p->value = NULL;
+
+	flist_add_tail(&p->list, dump_list);
+}
+
 int parse_cmd_option(const char *opt, const char *val,
-		     struct fio_option *options, void *data)
+		     struct fio_option *options, void *data,
+		     struct flist_head *dump_list)
 {
 	struct fio_option *o;
 
@@ -964,16 +1030,18 @@
 		return 1;
 	}
 
-	if (!handle_option(o, val, data))
-		return 0;
+	if (handle_option(o, val, data)) {
+		log_err("fio: failed parsing %s=%s\n", opt, val);
+		return 1;
+	}
 
-	log_err("fio: failed parsing %s=%s\n", opt, val);
-	return 1;
+	add_to_dump_list(o, dump_list, val);
+	return 0;
 }
 
 int parse_option(char *opt, const char *input,
 		 struct fio_option *options, struct fio_option **o, void *data,
-		 int dump_cmdline)
+		 struct flist_head *dump_list)
 {
 	char *post;
 
@@ -999,19 +1067,7 @@
 		return 1;
 	}
 
-	if (dump_cmdline) {
-		const char *delim;
-
-		if (!strcmp("description", (*o)->name))
-			delim = "\"";
-		else
-			delim = "";
-
-		log_info("--%s%s", (*o)->name, post ? "" : " ");
-		if (post)
-			log_info("=%s%s%s ", delim, post, delim);
-	}
-
+	add_to_dump_list(*o, dump_list, post);
 	return 0;
 }
 
@@ -1056,6 +1112,19 @@
 	return i;
 }
 
+/*
+ * Guess whether the edit distance from 'opt' is small enough for the
+ * candidate to be worth suggesting: at most half the length of 'opt'.
+ */
+int string_distance_ok(const char *opt, int distance)
+{
+	size_t len;
+
+	len = strlen(opt);
+	len = (len + 1) / 2;
+	return distance <= len;
+}
+
 static struct fio_option *find_child(struct fio_option *options,
 				     struct fio_option *o)
 {
@@ -1200,10 +1269,12 @@
 			handle_option(o, o->def, data);
 }
 
-void option_init(struct fio_option *o)
+static void option_init(struct fio_option *o)
 {
-	if (o->type == FIO_OPT_DEPRECATED)
+	if (o->type == FIO_OPT_DEPRECATED || o->type == FIO_OPT_UNSUPPORTED)
 		return;
+	if (o->name && !o->lname)
+		log_err("Option %s: missing long option name\n", o->name);
 	if (o->type == FIO_OPT_BOOL) {
 		o->minval = 0;
 		o->maxval = 1;
@@ -1249,6 +1320,23 @@
 	}
 }
 
+void options_mem_dupe(struct fio_option *options, void *data)
+{
+	struct fio_option *o;
+	char **ptr;
+
+	dprint(FD_PARSE, "dup options\n");
+
+	for (o = &options[0]; o->name; o++) {
+		if (o->type != FIO_OPT_STR_STORE)
+			continue;
+
+		ptr = td_var(data, o, o->off1);
+		if (*ptr)
+			*ptr = strdup(*ptr);
+	}
+}
+
 void options_free(struct fio_option *options, void *data)
 {
 	struct fio_option *o;
diff --git a/parse.h b/parse.h
index 15f2e06..fb6abd1 100644
--- a/parse.h
+++ b/parse.h
@@ -1,6 +1,7 @@
 #ifndef FIO_PARSE_H
 #define FIO_PARSE_H
 
+#include <inttypes.h>
 #include "flist.h"
 
 /*
@@ -19,6 +20,7 @@
 	FIO_OPT_FLOAT_LIST,
 	FIO_OPT_STR_SET,
 	FIO_OPT_DEPRECATED,
+	FIO_OPT_UNSUPPORTED,
 };
 
 /*
@@ -69,23 +71,22 @@
 	int (*verify)(struct fio_option *, void *);
 	const char *prof_name;		/* only valid for specific profile */
 	void *prof_opts;
-	unsigned int category;		/* what type of option */
-	unsigned int group;		/* who to group with */
+	uint64_t category;		/* what type of option */
+	uint64_t group;			/* who to group with */
 	void *gui_data;
 	int is_seconds;			/* time value with seconds base */
 	int is_time;			/* time based value */
 	int no_warn_def;
+	int pow2;			/* must be a power-of-2 */
 };
 
-typedef int (str_cb_fn)(void *, char *);
-
-extern int parse_option(char *, const char *, struct fio_option *, struct fio_option **, void *, int);
+extern int parse_option(char *, const char *, struct fio_option *, struct fio_option **, void *, struct flist_head *);
 extern void sort_options(char **, struct fio_option *, int);
-extern int parse_cmd_option(const char *t, const char *l, struct fio_option *, void *);
+extern int parse_cmd_option(const char *t, const char *l, struct fio_option *, void *, struct flist_head *);
 extern int show_cmd_help(struct fio_option *, const char *);
 extern void fill_default_options(void *, struct fio_option *);
-extern void option_init(struct fio_option *);
 extern void options_init(struct fio_option *);
+extern void options_mem_dupe(struct fio_option *, void *);
 extern void options_free(struct fio_option *, void *);
 
 extern void strip_blank_front(char **);
@@ -96,6 +97,7 @@
 extern int str_to_float(const char *str, double *val, int is_time);
 
 extern int string_distance(const char *s1, const char *s2);
+extern int string_distance_ok(const char *s1, int dist);
 
 /*
  * Handlers for the options
@@ -103,18 +105,18 @@
 typedef int (fio_opt_str_fn)(void *, const char *);
 typedef int (fio_opt_str_val_fn)(void *, long long *);
 typedef int (fio_opt_int_fn)(void *, int *);
-typedef int (fio_opt_str_set_fn)(void *);
-
-#define __td_var(start, offset)	((char *) start + (offset))
 
 struct thread_options;
-static inline void *td_var(struct thread_options *to, struct fio_option *o,
-			   unsigned int offset)
+static inline void *td_var(void *to, struct fio_option *o, unsigned int offset)
 {
-	if (o->prof_opts)
-		return __td_var(o->prof_opts, offset);
+	void *ret;
 
-	return __td_var(to, offset);
+	if (o->prof_opts)
+		ret = o->prof_opts;
+	else
+		ret = to;
+
+	return (char *) ret + offset;
 }
 
 static inline int parse_is_percent(unsigned long long val)
@@ -122,4 +124,10 @@
 	return val <= -1ULL && val >= (-1ULL - 100ULL);
 }
 
+struct print_option {
+	struct flist_head list;
+	char *name;
+	char *value;
+};
+
 #endif
diff --git a/profiles/act.c b/profiles/act.c
index 4d2ec5c..643f8a8 100644
--- a/profiles/act.c
+++ b/profiles/act.c
@@ -1,6 +1,7 @@
 #include "../fio.h"
 #include "../profile.h"
 #include "../parse.h"
+#include "../optgroup.h"
 
 /*
  * 1x loads
@@ -129,21 +130,21 @@
 	},
 	{
 		.name	= "read-req-num-512-blocks",
-		.lname	= "Number of 512b blocks to read",
+		.lname	= "Number of 512B blocks to read",
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct act_options, num_read_blocks),
-		.help	= "Number of 512b blocks to read at the time",
+		.help	= "Number of 512B blocks to read at the time",
 		.def	= "3",
 		.category = FIO_OPT_C_PROFILE,
 		.group	= FIO_OPT_G_ACT,
 	},
 	{
 		.name	= "large-block-op-kbytes",
-		.lname	= "Size of large block ops (writes)",
+		.lname	= "Size of large block ops in KiB (writes)",
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct act_options, write_size),
-		.help	= "Size of large block ops (writes)",
-		.def	= "128k",
+		.help	= "Size of large block ops in KiB (writes)",
+		.def	= "131072",
 		.category = FIO_OPT_C_PROFILE,
 		.group	= FIO_OPT_G_ACT,
 	},
@@ -219,7 +220,7 @@
 		return 1;
 	if (act_add_opt("filename=%s", dev))
 		return 1;
-	if (act_add_opt("bs=1M"))
+	if (act_add_opt("bs=1048576"))
 		return 1;
 	if (act_add_opt("zero_buffers"))
 		return 1;
@@ -233,7 +234,7 @@
 		return 1;
 	if (act_add_opt("filename=%s", dev))
 		return 1;
-	if (act_add_opt("bs=4k"))
+	if (act_add_opt("bs=4096"))
 		return 1;
 	if (act_add_opt("ioengine=libaio"))
 		return 1;
diff --git a/profiles/tiobench.c b/profiles/tiobench.c
index b4331d7..9d9885a 100644
--- a/profiles/tiobench.c
+++ b/profiles/tiobench.c
@@ -1,6 +1,7 @@
 #include "../fio.h"
 #include "../profile.h"
 #include "../parse.h"
+#include "../optgroup.h"
 
 static unsigned long long size;
 static unsigned int loops = 1;
@@ -38,7 +39,7 @@
 		.lname	= "Tiobench size",
 		.type	= FIO_OPT_STR_VAL,
 		.off1	= offsetof(struct tiobench_options, size),
-		.help	= "Size in MB",
+		.help	= "Size in MiB",
 		.category = FIO_OPT_C_PROFILE,
 		.group	= FIO_OPT_G_TIOBENCH,
 	},
@@ -48,7 +49,7 @@
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct tiobench_options, bs),
 		.help	= "Block size in bytes",
-		.def	= "4k",
+		.def	= "4096",
 		.category = FIO_OPT_C_PROFILE,
 		.group	= FIO_OPT_G_TIOBENCH,
 	},
@@ -90,7 +91,7 @@
 static int tb_prep_cmdline(void)
 {
 	/*
-	 * tiobench uses size as MB, so multiply up
+	 * tiobench uses size as MiB, so multiply up
 	 */
 	size *= 1024 * 1024ULL;
 	if (size)
diff --git a/rate-submit.c b/rate-submit.c
new file mode 100644
index 0000000..fdbece6
--- /dev/null
+++ b/rate-submit.c
@@ -0,0 +1,249 @@
+/*
+ * Rated submission helpers
+ *
+ * Copyright (C) 2015 Jens Axboe <axboe@kernel.dk>
+ *
+ */
+#include "fio.h"
+#include "ioengines.h"
+#include "lib/getrusage.h"
+#include "rate-submit.h"
+
+static int io_workqueue_fn(struct submit_worker *sw,
+			   struct workqueue_work *work)
+{
+	struct io_u *io_u = container_of(work, struct io_u, work);
+	const enum fio_ddir ddir = io_u->ddir;
+	struct thread_data *td = sw->priv;
+	int ret;
+
+	dprint(FD_RATE, "io_u %p queued by %u\n", io_u, gettid());
+
+	io_u_set(td, io_u, IO_U_F_NO_FILE_PUT);
+
+	td->cur_depth++;
+
+	do {
+		ret = td_io_queue(td, io_u);
+		if (ret != FIO_Q_BUSY)
+			break;
+		ret = io_u_queued_complete(td, 1);
+		if (ret > 0)
+			td->cur_depth -= ret;
+		io_u_clear(td, io_u, IO_U_F_FLIGHT);
+	} while (1);
+
+	dprint(FD_RATE, "io_u %p ret %d by %u\n", io_u, ret, gettid());
+
+	io_queue_event(td, io_u, &ret, ddir, NULL, 0, NULL);
+
+	if (ret == FIO_Q_COMPLETED)
+		td->cur_depth--;
+	else if (ret == FIO_Q_QUEUED) {
+		unsigned int min_evts;
+
+		if (td->o.iodepth == 1)
+			min_evts = 1;
+		else
+			min_evts = 0;
+
+		ret = io_u_queued_complete(td, min_evts);
+		if (ret > 0)
+			td->cur_depth -= ret;
+	} else if (ret == FIO_Q_BUSY) {
+		ret = io_u_queued_complete(td, td->cur_depth);
+		if (ret > 0)
+			td->cur_depth -= ret;
+	}
+
+	return 0;
+}
+
+static bool io_workqueue_pre_sleep_flush_fn(struct submit_worker *sw)
+{
+	struct thread_data *td = sw->priv;
+
+	if (td->io_u_queued || td->cur_depth || td->io_u_in_flight)
+		return true;
+
+	return false;
+}
+
+static void io_workqueue_pre_sleep_fn(struct submit_worker *sw)
+{
+	struct thread_data *td = sw->priv;
+	int ret;
+
+	ret = io_u_quiesce(td);
+	if (ret > 0)
+		td->cur_depth -= ret;
+}
+
+static int io_workqueue_alloc_fn(struct submit_worker *sw)
+{
+	struct thread_data *td;
+
+	td = calloc(1, sizeof(*td));
+	sw->priv = td;
+	return 0;
+}
+
+static void io_workqueue_free_fn(struct submit_worker *sw)
+{
+	free(sw->priv);
+	sw->priv = NULL;
+}
+
+static int io_workqueue_init_worker_fn(struct submit_worker *sw)
+{
+	struct thread_data *parent = sw->wq->td;
+	struct thread_data *td = sw->priv;
+
+	memcpy(&td->o, &parent->o, sizeof(td->o));
+	memcpy(&td->ts, &parent->ts, sizeof(td->ts));
+	td->o.uid = td->o.gid = -1U;
+	dup_files(td, parent);
+	td->eo = parent->eo;
+	fio_options_mem_dupe(td);
+
+	if (ioengine_load(td))
+		goto err;
+
+	td->pid = gettid();
+
+	INIT_FLIST_HEAD(&td->io_log_list);
+	INIT_FLIST_HEAD(&td->io_hist_list);
+	INIT_FLIST_HEAD(&td->verify_list);
+	INIT_FLIST_HEAD(&td->trim_list);
+	INIT_FLIST_HEAD(&td->next_rand_list);
+	td->io_hist_tree = RB_ROOT;
+
+	td->o.iodepth = 1;
+	if (td_io_init(td))
+		goto err_io_init;
+
+	set_epoch_time(td, td->o.log_unix_epoch);
+	fio_getrusage(&td->ru_start);
+	clear_io_state(td, 1);
+
+	td_set_runstate(td, TD_RUNNING);
+	td->flags |= TD_F_CHILD;
+	td->parent = parent;
+	return 0;
+
+err_io_init:
+	close_ioengine(td);
+err:
+	return 1;
+
+}
+
+static void io_workqueue_exit_worker_fn(struct submit_worker *sw,
+					unsigned int *sum_cnt)
+{
+	struct thread_data *td = sw->priv;
+
+	(*sum_cnt)++;
+	sum_thread_stats(&sw->wq->td->ts, &td->ts, *sum_cnt == 1);
+
+	fio_options_free(td);
+	close_and_free_files(td);
+	if (td->io_ops)
+		close_ioengine(td);
+	td_set_runstate(td, TD_EXITED);
+}
+
+#ifdef CONFIG_SFAA
+static void sum_val(uint64_t *dst, uint64_t *src)
+{
+	if (*src) {
+		__sync_fetch_and_add(dst, *src);
+		*src = 0;
+	}
+}
+#else
+static void sum_val(uint64_t *dst, uint64_t *src)
+{
+	if (*src) {
+		*dst += *src;
+		*src = 0;
+	}
+}
+#endif
+
+static void pthread_double_unlock(pthread_mutex_t *lock1,
+				  pthread_mutex_t *lock2)
+{
+#ifndef CONFIG_SFAA
+	pthread_mutex_unlock(lock1);
+	pthread_mutex_unlock(lock2);
+#endif
+}
+
+static void pthread_double_lock(pthread_mutex_t *lock1, pthread_mutex_t *lock2)
+{
+#ifndef CONFIG_SFAA
+	if (lock1 < lock2) {
+		pthread_mutex_lock(lock1);
+		pthread_mutex_lock(lock2);
+	} else {
+		pthread_mutex_lock(lock2);
+		pthread_mutex_lock(lock1);
+	}
+#endif
+}
+
+static void sum_ddir(struct thread_data *dst, struct thread_data *src,
+		     enum fio_ddir ddir)
+{
+	pthread_double_lock(&dst->io_wq.stat_lock, &src->io_wq.stat_lock);
+
+	sum_val(&dst->io_bytes[ddir], &src->io_bytes[ddir]);
+	sum_val(&dst->io_blocks[ddir], &src->io_blocks[ddir]);
+	sum_val(&dst->this_io_blocks[ddir], &src->this_io_blocks[ddir]);
+	sum_val(&dst->this_io_bytes[ddir], &src->this_io_bytes[ddir]);
+	sum_val(&dst->bytes_done[ddir], &src->bytes_done[ddir]);
+
+	pthread_double_unlock(&dst->io_wq.stat_lock, &src->io_wq.stat_lock);
+}
+
+static void io_workqueue_update_acct_fn(struct submit_worker *sw)
+{
+	struct thread_data *src = sw->priv;
+	struct thread_data *dst = sw->wq->td;
+
+	if (td_read(src))
+		sum_ddir(dst, src, DDIR_READ);
+	if (td_write(src))
+		sum_ddir(dst, src, DDIR_WRITE);
+	if (td_trim(src))
+		sum_ddir(dst, src, DDIR_TRIM);
+
+}
+
+static struct workqueue_ops rated_wq_ops = {
+	.fn			= io_workqueue_fn,
+	.pre_sleep_flush_fn	= io_workqueue_pre_sleep_flush_fn,
+	.pre_sleep_fn		= io_workqueue_pre_sleep_fn,
+	.update_acct_fn		= io_workqueue_update_acct_fn,
+	.alloc_worker_fn	= io_workqueue_alloc_fn,
+	.free_worker_fn		= io_workqueue_free_fn,
+	.init_worker_fn		= io_workqueue_init_worker_fn,
+	.exit_worker_fn		= io_workqueue_exit_worker_fn,
+};
+
+int rate_submit_init(struct thread_data *td, struct sk_out *sk_out)
+{
+	if (td->o.io_submit_mode != IO_MODE_OFFLOAD)
+		return 0;
+
+	return workqueue_init(td, &td->io_wq, &rated_wq_ops, td->o.iodepth, sk_out);
+}
+
+void rate_submit_exit(struct thread_data *td)
+{
+	if (td->o.io_submit_mode != IO_MODE_OFFLOAD)
+		return;
+
+	workqueue_exit(&td->io_wq);
+}
diff --git a/rate-submit.h b/rate-submit.h
new file mode 100644
index 0000000..19fde3a
--- /dev/null
+++ b/rate-submit.h
@@ -0,0 +1,7 @@
+#ifndef FIO_RATE_SUBMIT
+#define FIO_RATE_SUBMIT
+
+int rate_submit_init(struct thread_data *, struct sk_out *);
+void rate_submit_exit(struct thread_data *);
+
+#endif
diff --git a/server.c b/server.c
index c249849..1e269c2 100644
--- a/server.c
+++ b/server.c
@@ -21,6 +21,7 @@
 #endif
 
 #include "fio.h"
+#include "options.h"
 #include "server.h"
 #include "crc/crc16.h"
 #include "lib/ieee754.h"
@@ -31,7 +32,24 @@
 
 int exit_backend = 0;
 
-static int server_fd = -1;
+enum {
+	SK_F_FREE	= 1,
+	SK_F_COPY	= 2,
+	SK_F_SIMPLE	= 4,
+	SK_F_VEC	= 8,
+	SK_F_INLINE	= 16,
+};
+
+struct sk_entry {
+	struct flist_head list;	/* link on sk_out->list */
+	int flags;		/* SK_F_* */
+	int opcode;		/* Actual command fields */
+	void *buf;
+	off_t size;
+	uint64_t tag;
+	struct flist_head next;	/* Other sk_entry's, if linked command */
+};
+
 static char *fio_server_arg;
 static char *bind_sock;
 static struct sockaddr_in saddr_in;
@@ -45,6 +63,8 @@
 static unsigned int use_zlib;
 static char me[128];
 
+static pthread_key_t sk_out_key;
+
 struct fio_fork_item {
 	struct flist_head list;
 	int exitval;
@@ -83,8 +103,87 @@
 	"LOAD_FILE",
 	"VTRIGGER",
 	"SENDFILE",
+	"JOB_OPT",
 };
 
+static void sk_lock(struct sk_out *sk_out)
+{
+	fio_mutex_down(&sk_out->lock);
+}
+
+static void sk_unlock(struct sk_out *sk_out)
+{
+	fio_mutex_up(&sk_out->lock);
+}
+
+void sk_out_assign(struct sk_out *sk_out)
+{
+	if (!sk_out)
+		return;
+
+	sk_lock(sk_out);
+	sk_out->refs++;
+	sk_unlock(sk_out);
+	pthread_setspecific(sk_out_key, sk_out);
+}
+
+static void sk_out_free(struct sk_out *sk_out)
+{
+	__fio_mutex_remove(&sk_out->lock);
+	__fio_mutex_remove(&sk_out->wait);
+	__fio_mutex_remove(&sk_out->xmit);
+	sfree(sk_out);
+}
+
+static int __sk_out_drop(struct sk_out *sk_out)
+{
+	if (sk_out) {
+		int refs;
+
+		sk_lock(sk_out);
+		assert(sk_out->refs != 0);
+		refs = --sk_out->refs;
+		sk_unlock(sk_out);
+
+		if (!refs) {
+			sk_out_free(sk_out);
+			pthread_setspecific(sk_out_key, NULL);
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+void sk_out_drop(void)
+{
+	struct sk_out *sk_out;
+
+	sk_out = pthread_getspecific(sk_out_key);
+	__sk_out_drop(sk_out);
+}
+
+static void __fio_init_net_cmd(struct fio_net_cmd *cmd, uint16_t opcode,
+			       uint32_t pdu_len, uint64_t tag)
+{
+	memset(cmd, 0, sizeof(*cmd));
+
+	cmd->version	= __cpu_to_le16(FIO_SERVER_VER);
+	cmd->opcode	= cpu_to_le16(opcode);
+	cmd->tag	= cpu_to_le64(tag);
+	cmd->pdu_len	= cpu_to_le32(pdu_len);
+}
+
+
+static void fio_init_net_cmd(struct fio_net_cmd *cmd, uint16_t opcode,
+			     const void *pdu, uint32_t pdu_len, uint64_t tag)
+{
+	__fio_init_net_cmd(cmd, opcode, pdu_len, tag);
+
+	if (pdu)
+		memcpy(&cmd->payload, pdu, pdu_len);
+}
+
 const char *fio_server_op(unsigned int op)
 {
 	static char buf[32];
@@ -141,13 +240,10 @@
 	if (!total_len)
 		return 0;
 
-	if (errno)
-		return -errno;
-
 	return 1;
 }
 
-int fio_send_data(int sk, const void *p, unsigned int len)
+static int fio_send_data(int sk, const void *p, unsigned int len)
 {
 	struct iovec iov = { .iov_base = (void *) p, .iov_len = len };
 
@@ -156,10 +252,17 @@
 	return fio_sendv_data(sk, &iov, 1);
 }
 
-int fio_recv_data(int sk, void *p, unsigned int len)
+static int fio_recv_data(int sk, void *p, unsigned int len, bool wait)
 {
+	int flags;
+
+	if (wait)
+		flags = MSG_WAITALL;
+	else
+		flags = OS_MSG_DONTWAIT;
+
 	do {
-		int ret = recv(sk, p, len, MSG_WAITALL);
+		int ret = recv(sk, p, len, flags);
 
 		if (ret > 0) {
 			len -= ret;
@@ -169,9 +272,11 @@
 			continue;
 		} else if (!ret)
 			break;
-		else if (errno == EAGAIN || errno == EINTR)
-			continue;
-		else
+		else if (errno == EAGAIN || errno == EINTR) {
+			if (wait)
+				continue;
+			break;
+		} else
 			break;
 	} while (!exit_backend);
 
@@ -220,7 +325,7 @@
 /*
  * Read (and defragment, if necessary) incoming commands
  */
-struct fio_net_cmd *fio_net_recv_cmd(int sk)
+struct fio_net_cmd *fio_net_recv_cmd(int sk, bool wait)
 {
 	struct fio_net_cmd cmd, *tmp, *cmdret = NULL;
 	size_t cmd_size = 0, pdu_offset = 0;
@@ -229,7 +334,7 @@
 	void *pdu = NULL;
 
 	do {
-		ret = fio_recv_data(sk, &cmd, sizeof(cmd));
+		ret = fio_recv_data(sk, &cmd, sizeof(cmd), wait);
 		if (ret)
 			break;
 
@@ -273,7 +378,7 @@
 
 		/* There's payload, get it */
 		pdu = (void *) cmdret->payload + pdu_offset;
-		ret = fio_recv_data(sk, pdu, cmd.pdu_len);
+		ret = fio_recv_data(sk, pdu, cmd.pdu_len, wait);
 		if (ret)
 			break;
 
@@ -348,7 +453,7 @@
 	free(reply);
 }
 
-void fio_net_cmd_crc_pdu(struct fio_net_cmd *cmd, const void *pdu)
+static void fio_net_cmd_crc_pdu(struct fio_net_cmd *cmd, const void *pdu)
 {
 	uint32_t pdu_len;
 
@@ -358,7 +463,7 @@
 	cmd->pdu_crc16 = __cpu_to_le16(fio_crc16(pdu, pdu_len));
 }
 
-void fio_net_cmd_crc(struct fio_net_cmd *cmd)
+static void fio_net_cmd_crc(struct fio_net_cmd *cmd)
 {
 	fio_net_cmd_crc_pdu(cmd, cmd->payload);
 }
@@ -415,6 +520,61 @@
 	return ret;
 }
 
+static struct sk_entry *fio_net_prep_cmd(uint16_t opcode, void *buf,
+					 size_t size, uint64_t *tagptr,
+					 int flags)
+{
+	struct sk_entry *entry;
+
+	entry = smalloc(sizeof(*entry));
+	INIT_FLIST_HEAD(&entry->next);
+	entry->opcode = opcode;
+	if (flags & SK_F_COPY) {
+		entry->buf = smalloc(size);
+		memcpy(entry->buf, buf, size);
+	} else
+		entry->buf = buf;
+
+	entry->size = size;
+	if (tagptr)
+		entry->tag = *tagptr;
+	else
+		entry->tag = 0;
+	entry->flags = flags;
+	return entry;
+}
+
+static int handle_sk_entry(struct sk_out *sk_out, struct sk_entry *entry);
+
+static void fio_net_queue_entry(struct sk_entry *entry)
+{
+	struct sk_out *sk_out = pthread_getspecific(sk_out_key);
+
+	if (entry->flags & SK_F_INLINE)
+		handle_sk_entry(sk_out, entry);
+	else {
+		sk_lock(sk_out);
+		flist_add_tail(&entry->list, &sk_out->list);
+		sk_unlock(sk_out);
+
+		fio_mutex_up(&sk_out->wait);
+	}
+}
+
+static int fio_net_queue_cmd(uint16_t opcode, void *buf, off_t size,
+			     uint64_t *tagptr, int flags)
+{
+	struct sk_entry *entry;
+
+	entry = fio_net_prep_cmd(opcode, buf, size, tagptr, flags);
+	if (entry) {
+		fio_net_queue_entry(entry);
+		return 0;
+	}
+
+	return 1;
+}
+
 static int fio_net_send_simple_stack_cmd(int sk, uint16_t opcode, uint64_t tag)
 {
 	struct fio_net_cmd cmd;
@@ -451,6 +611,13 @@
 	return 0;
 }
 
+static int fio_net_queue_quit(void)
+{
+	dprint(FD_NET, "server: sending quit\n");
+
+	return fio_net_queue_cmd(FIO_NET_CMD_QUIT, NULL, 0, NULL, SK_F_SIMPLE);
+}
+
 int fio_net_send_quit(int sk)
 {
 	dprint(FD_NET, "server: sending quit\n");
@@ -458,8 +625,7 @@
 	return fio_net_send_simple_cmd(sk, FIO_NET_CMD_QUIT, 0, NULL);
 }
 
-static int fio_net_send_ack(int sk, struct fio_net_cmd *cmd, int error,
-			    int signal)
+static int fio_net_send_ack(struct fio_net_cmd *cmd, int error, int signal)
 {
 	struct cmd_end_pdu epdu;
 	uint64_t tag = 0;
@@ -469,13 +635,13 @@
 
 	epdu.error = __cpu_to_le32(error);
 	epdu.signal = __cpu_to_le32(signal);
-	return fio_net_send_cmd(sk, FIO_NET_CMD_STOP, &epdu, sizeof(epdu), &tag, NULL);
+	return fio_net_queue_cmd(FIO_NET_CMD_STOP, &epdu, sizeof(epdu), &tag, SK_F_COPY);
 }
 
-int fio_net_send_stop(int sk, int error, int signal)
+static int fio_net_queue_stop(int error, int signal)
 {
 	dprint(FD_NET, "server: sending stop (%d, %d)\n", error, signal);
-	return fio_net_send_ack(sk, NULL, error, signal);
+	return fio_net_send_ack(NULL, error, signal);
 }
 
 static void fio_server_add_fork_item(pid_t pid, struct flist_head *list)
@@ -526,20 +692,23 @@
 	}
 }
 
-static void fio_server_fork_item_done(struct fio_fork_item *ffi)
+static void fio_server_fork_item_done(struct fio_fork_item *ffi, bool stop)
 {
 	dprint(FD_NET, "pid %u exited, sig=%u, exitval=%d\n", (int) ffi->pid, ffi->signal, ffi->exitval);
 
 	/*
 	 * Fold STOP and QUIT...
 	 */
-	fio_net_send_stop(server_fd, ffi->exitval, ffi->signal);
-	fio_net_send_quit(server_fd);
+	if (stop) {
+		fio_net_queue_stop(ffi->exitval, ffi->signal);
+		fio_net_queue_quit();
+	}
+
 	flist_del(&ffi->list);
 	free(ffi);
 }
 
-static void fio_server_check_fork_items(struct flist_head *list)
+static void fio_server_check_fork_items(struct flist_head *list, bool stop)
 {
 	struct flist_head *entry, *tmp;
 	struct fio_fork_item *ffi;
@@ -550,18 +719,18 @@
 		fio_server_check_fork_item(ffi);
 
 		if (ffi->exited)
-			fio_server_fork_item_done(ffi);
+			fio_server_fork_item_done(ffi, stop);
 	}
 }
 
 static void fio_server_check_jobs(struct flist_head *job_list)
 {
-	fio_server_check_fork_items(job_list);
+	fio_server_check_fork_items(job_list, true);
 }
 
 static void fio_server_check_conns(struct flist_head *conn_list)
 {
-	fio_server_check_fork_items(conn_list);
+	fio_server_check_fork_items(conn_list, false);
 }
 
 static int handle_load_file_cmd(struct fio_net_cmd *cmd)
@@ -576,21 +745,24 @@
 	pdu->client_type = le16_to_cpu(pdu->client_type);
 
 	if (parse_jobs_ini(file_name, 0, 0, pdu->client_type)) {
-		fio_net_send_quit(server_fd);
+		fio_net_queue_quit();
 		return -1;
 	}
 
 	spdu.jobs = cpu_to_le32(thread_number);
 	spdu.stat_outputs = cpu_to_le32(stat_number);
-	fio_net_send_cmd(server_fd, FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, NULL);
+	fio_net_queue_cmd(FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, SK_F_COPY);
 	return 0;
 }
 
-static int handle_run_cmd(struct flist_head *job_list, struct fio_net_cmd *cmd)
+static int handle_run_cmd(struct sk_out *sk_out, struct flist_head *job_list,
+			  struct fio_net_cmd *cmd)
 {
 	pid_t pid;
 	int ret;
 
+	sk_out_assign(sk_out);
+
 	fio_time_init();
 	set_genesis_time();
 
@@ -600,8 +772,9 @@
 		return 0;
 	}
 
-	ret = fio_backend();
+	ret = fio_backend(sk_out);
 	free_threads_shm();
+	sk_out_drop();
 	_exit(ret);
 }
 
@@ -615,13 +788,14 @@
 	pdu->client_type = le32_to_cpu(pdu->client_type);
 
 	if (parse_jobs_ini(buf, 1, 0, pdu->client_type)) {
-		fio_net_send_quit(server_fd);
+		fio_net_queue_quit();
 		return -1;
 	}
 
 	spdu.jobs = cpu_to_le32(thread_number);
 	spdu.stat_outputs = cpu_to_le32(stat_number);
-	fio_net_send_cmd(server_fd, FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, NULL);
+
+	fio_net_queue_cmd(FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, SK_F_COPY);
 	return 0;
 }
 
@@ -652,7 +826,7 @@
 	}
 
 	if (parse_cmd_line(clp->lines, argv, clp->client_type)) {
-		fio_net_send_quit(server_fd);
+		fio_net_queue_quit();
 		free(argv);
 		return -1;
 	}
@@ -661,7 +835,8 @@
 
 	spdu.jobs = cpu_to_le32(thread_number);
 	spdu.stat_outputs = cpu_to_le32(stat_number);
-	fio_net_send_cmd(server_fd, FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, NULL);
+
+	fio_net_queue_cmd(FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, SK_F_COPY);
 	return 0;
 }
 
@@ -698,7 +873,7 @@
 		use_zlib = 0;
 	}
 
-	return fio_net_send_cmd(server_fd, FIO_NET_CMD_PROBE, &probe, sizeof(probe), &tag, NULL);
+	return fio_net_queue_cmd(FIO_NET_CMD_PROBE, &probe, sizeof(probe), &tag, SK_F_COPY);
 }
 
 static int handle_send_eta_cmd(struct fio_net_cmd *cmd)
@@ -708,45 +883,50 @@
 	size_t size;
 	int i;
 
-	je = get_jobs_eta(1, &size);
-	if (!je)
-		return 0;
-
 	dprint(FD_NET, "server sending status\n");
 
-	je->nr_running		= cpu_to_le32(je->nr_running);
-	je->nr_ramp		= cpu_to_le32(je->nr_ramp);
-	je->nr_pending		= cpu_to_le32(je->nr_pending);
-	je->nr_setting_up	= cpu_to_le32(je->nr_setting_up);
-	je->files_open		= cpu_to_le32(je->files_open);
+	/*
+	 * Fake an ETA return if we don't have a local one; otherwise the client
+	 * will end up timing out waiting for a response to the ETA request.
+	 */
+	je = get_jobs_eta(true, &size);
+	if (!je) {
+		size = sizeof(*je);
+		je = calloc(1, size);
+	} else {
+		je->nr_running		= cpu_to_le32(je->nr_running);
+		je->nr_ramp		= cpu_to_le32(je->nr_ramp);
+		je->nr_pending		= cpu_to_le32(je->nr_pending);
+		je->nr_setting_up	= cpu_to_le32(je->nr_setting_up);
+		je->files_open		= cpu_to_le32(je->files_open);
 
-	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
-		je->m_rate[i]	= cpu_to_le32(je->m_rate[i]);
-		je->t_rate[i]	= cpu_to_le32(je->t_rate[i]);
-		je->m_iops[i]	= cpu_to_le32(je->m_iops[i]);
-		je->t_iops[i]	= cpu_to_le32(je->t_iops[i]);
-		je->rate[i]	= cpu_to_le32(je->rate[i]);
-		je->iops[i]	= cpu_to_le32(je->iops[i]);
+		for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+			je->m_rate[i]	= cpu_to_le64(je->m_rate[i]);
+			je->t_rate[i]	= cpu_to_le64(je->t_rate[i]);
+			je->m_iops[i]	= cpu_to_le32(je->m_iops[i]);
+			je->t_iops[i]	= cpu_to_le32(je->t_iops[i]);
+			je->rate[i]	= cpu_to_le64(je->rate[i]);
+			je->iops[i]	= cpu_to_le32(je->iops[i]);
+		}
+
+		je->elapsed_sec		= cpu_to_le64(je->elapsed_sec);
+		je->eta_sec		= cpu_to_le64(je->eta_sec);
+		je->nr_threads		= cpu_to_le32(je->nr_threads);
+		je->is_pow2		= cpu_to_le32(je->is_pow2);
+		je->unit_base		= cpu_to_le32(je->unit_base);
 	}
 
-	je->elapsed_sec		= cpu_to_le64(je->elapsed_sec);
-	je->eta_sec		= cpu_to_le64(je->eta_sec);
-	je->nr_threads		= cpu_to_le32(je->nr_threads);
-	je->is_pow2		= cpu_to_le32(je->is_pow2);
-	je->unit_base		= cpu_to_le32(je->unit_base);
-
-	fio_net_send_cmd(server_fd, FIO_NET_CMD_ETA, je, size, &tag, NULL);
-	free(je);
+	fio_net_queue_cmd(FIO_NET_CMD_ETA, je, size, &tag, SK_F_FREE);
 	return 0;
 }
 
-static int send_update_job_reply(int fd, uint64_t __tag, int error)
+static int send_update_job_reply(uint64_t __tag, int error)
 {
 	uint64_t tag = __tag;
 	uint32_t pdu_error;
 
 	pdu_error = __cpu_to_le32(error);
-	return fio_net_send_cmd(fd, FIO_NET_CMD_UPDATE_JOB, &pdu_error, sizeof(pdu_error), &tag, NULL);
+	return fio_net_queue_cmd(FIO_NET_CMD_UPDATE_JOB, &pdu_error, sizeof(pdu_error), &tag, SK_F_COPY);
 }
 
 static int handle_update_job_cmd(struct fio_net_cmd *cmd)
@@ -760,13 +940,13 @@
 	dprint(FD_NET, "server: updating options for job %u\n", tnumber);
 
 	if (!tnumber || tnumber > thread_number) {
-		send_update_job_reply(server_fd, cmd->tag, ENODEV);
+		send_update_job_reply(cmd->tag, ENODEV);
 		return 0;
 	}
 
 	td = &threads[tnumber - 1];
 	convert_thread_options_to_cpu(&td->o, &pdu->top);
-	send_update_job_reply(server_fd, cmd->tag, 0);
+	send_update_job_reply(cmd->tag, 0);
 	return 0;
 }
 
@@ -785,17 +965,16 @@
 		struct all_io_list state;
 
 		state.threads = cpu_to_le64((uint64_t) 0);
-		fio_net_send_cmd(server_fd, FIO_NET_CMD_VTRIGGER, &state, sizeof(state), NULL, NULL);
-	} else {
-		fio_net_send_cmd(server_fd, FIO_NET_CMD_VTRIGGER, rep, sz, NULL, NULL);
-		free(rep);
-	}
+		fio_net_queue_cmd(FIO_NET_CMD_VTRIGGER, &state, sizeof(state), NULL, SK_F_COPY | SK_F_INLINE);
+	} else
+		fio_net_queue_cmd(FIO_NET_CMD_VTRIGGER, rep, sz, NULL, SK_F_FREE | SK_F_INLINE);
 
 	exec_trigger(buf);
 	return 0;
 }
 
-static int handle_command(struct flist_head *job_list, struct fio_net_cmd *cmd)
+static int handle_command(struct sk_out *sk_out, struct flist_head *job_list,
+			  struct fio_net_cmd *cmd)
 {
 	int ret;
 
@@ -806,7 +985,8 @@
 	switch (cmd->opcode) {
 	case FIO_NET_CMD_QUIT:
 		fio_terminate_threads(TERMINATE_ALL);
-		return -1;
+		ret = 0;
+		break;
 	case FIO_NET_CMD_EXIT:
 		exit_backend = 1;
 		return -1;
@@ -826,7 +1006,7 @@
 		ret = handle_send_eta_cmd(cmd);
 		break;
 	case FIO_NET_CMD_RUN:
-		ret = handle_run_cmd(job_list, cmd);
+		ret = handle_run_cmd(sk_out, job_list, cmd);
 		break;
 	case FIO_NET_CMD_UPDATE_JOB:
 		ret = handle_update_job_cmd(cmd);
@@ -868,19 +1048,151 @@
 	return ret;
 }
 
-static int handle_connection(int sk)
+/*
+ * Send a command with a separate PDU, not inlined in the command
+ */
+static int fio_send_cmd_ext_pdu(int sk, uint16_t opcode, const void *buf,
+				off_t size, uint64_t tag, uint32_t flags)
+{
+	struct fio_net_cmd cmd;
+	struct iovec iov[2];
+	size_t this_len;
+	int ret;
+
+	iov[0].iov_base = (void *) &cmd;
+	iov[0].iov_len = sizeof(cmd);
+
+	do {
+		uint32_t this_flags = flags;
+
+		this_len = size;
+		if (this_len > FIO_SERVER_MAX_FRAGMENT_PDU)
+			this_len = FIO_SERVER_MAX_FRAGMENT_PDU;
+
+		if (this_len < size)
+			this_flags |= FIO_NET_CMD_F_MORE;
+
+		__fio_init_net_cmd(&cmd, opcode, this_len, tag);
+		cmd.flags = __cpu_to_le32(this_flags);
+		fio_net_cmd_crc_pdu(&cmd, buf);
+
+		iov[1].iov_base = (void *) buf;
+		iov[1].iov_len = this_len;
+
+		ret = fio_sendv_data(sk, iov, 2);
+		size -= this_len;
+		buf += this_len;
+	} while (!ret && size);
+
+	return ret;
+}
+
+static void finish_entry(struct sk_entry *entry)
+{
+	if (entry->flags & SK_F_FREE)
+		free(entry->buf);
+	else if (entry->flags & SK_F_COPY)
+		sfree(entry->buf);
+
+	sfree(entry);
+}
+
+static void entry_set_flags(struct sk_entry *entry, struct flist_head *list,
+			    unsigned int *flags)
+{
+	if (!flist_empty(list))
+		*flags = FIO_NET_CMD_F_MORE;
+	else
+		*flags = 0;
+}
+
+static int send_vec_entry(struct sk_out *sk_out, struct sk_entry *first)
+{
+	unsigned int flags;
+	int ret;
+
+	entry_set_flags(first, &first->next, &flags);
+
+	ret = fio_send_cmd_ext_pdu(sk_out->sk, first->opcode, first->buf,
+					first->size, first->tag, flags);
+
+	while (!flist_empty(&first->next)) {
+		struct sk_entry *next;
+
+		next = flist_first_entry(&first->next, struct sk_entry, list);
+		flist_del_init(&next->list);
+
+		entry_set_flags(next, &first->next, &flags);
+
+		ret += fio_send_cmd_ext_pdu(sk_out->sk, next->opcode, next->buf,
+						next->size, next->tag, flags);
+		finish_entry(next);
+	}
+
+	return ret;
+}
+
+static int handle_sk_entry(struct sk_out *sk_out, struct sk_entry *entry)
+{
+	int ret;
+
+	fio_mutex_down(&sk_out->xmit);
+
+	if (entry->flags & SK_F_VEC)
+		ret = send_vec_entry(sk_out, entry);
+	else if (entry->flags & SK_F_SIMPLE) {
+		ret = fio_net_send_simple_cmd(sk_out->sk, entry->opcode,
+						entry->tag, NULL);
+	} else {
+		ret = fio_net_send_cmd(sk_out->sk, entry->opcode, entry->buf,
+					entry->size, &entry->tag, NULL);
+	}
+
+	fio_mutex_up(&sk_out->xmit);
+
+	if (ret)
+		log_err("fio: failed handling cmd %s\n", fio_server_op(entry->opcode));
+
+	finish_entry(entry);
+	return ret;
+}
+
+static int handle_xmits(struct sk_out *sk_out)
+{
+	struct sk_entry *entry;
+	FLIST_HEAD(list);
+	int ret = 0;
+
+	sk_lock(sk_out);
+	if (flist_empty(&sk_out->list)) {
+		sk_unlock(sk_out);
+		return 0;
+	}
+
+	flist_splice_init(&sk_out->list, &list);
+	sk_unlock(sk_out);
+
+	while (!flist_empty(&list)) {
+		entry = flist_entry(list.next, struct sk_entry, list);
+		flist_del(&entry->list);
+		ret += handle_sk_entry(sk_out, entry);
+	}
+
+	return ret;
+}
+
+static int handle_connection(struct sk_out *sk_out)
 {
 	struct fio_net_cmd *cmd = NULL;
 	FLIST_HEAD(job_list);
 	int ret = 0;
 
 	reset_fio_state();
-	server_fd = sk;
 
 	/* read forever */
 	while (!exit_backend) {
 		struct pollfd pfd = {
-			.fd	= sk,
+			.fd	= sk_out->sk,
 			.events	= POLLIN,
 		};
 
@@ -891,7 +1203,9 @@
 			if (!flist_empty(&job_list))
 				timeout = 100;
 
-			ret = poll(&pfd, 1, timeout);
+			handle_xmits(sk_out);
+
+			ret = poll(&pfd, 1, 0);
 			if (ret < 0) {
 				if (errno == EINTR)
 					break;
@@ -899,6 +1213,7 @@
 				break;
 			} else if (!ret) {
 				fio_server_check_jobs(&job_list);
+				fio_mutex_down_timeout(&sk_out->wait, timeout);
 				continue;
 			}
 
@@ -915,13 +1230,13 @@
 		if (ret < 0)
 			break;
 
-		cmd = fio_net_recv_cmd(sk);
+		cmd = fio_net_recv_cmd(sk_out->sk, true);
 		if (!cmd) {
 			ret = -1;
 			break;
 		}
 
-		ret = handle_command(&job_list, cmd);
+		ret = handle_command(sk_out, &job_list, cmd);
 		if (ret)
 			break;
 
@@ -932,10 +1247,51 @@
 	if (cmd)
 		free(cmd);
 
-	close(sk);
+	handle_xmits(sk_out);
+
+	close(sk_out->sk);
+	sk_out->sk = -1;
+	__sk_out_drop(sk_out);
 	_exit(ret);
 }
 
+/* Get the local address this host has bound on the given socket,
+ * whether it is IPv6 or IPv4. */
+
+static int get_my_addr_str(int sk)
+{
+	struct sockaddr_in6 myaddr6 = { 0, };
+	struct sockaddr_in myaddr4 = { 0, };
+	struct sockaddr *sockaddr_p;
+	char *net_addr;
+	socklen_t len;
+	int ret;
+
+	if (use_ipv6) {
+		len = sizeof(myaddr6);
+		sockaddr_p = (struct sockaddr * )&myaddr6;
+		net_addr = (char * )&myaddr6.sin6_addr;
+	} else {
+		len = sizeof(myaddr4);
+		sockaddr_p = (struct sockaddr * )&myaddr4;
+		net_addr = (char * )&myaddr4.sin_addr;
+	}
+
+	ret = getsockname(sk, sockaddr_p, &len);
+	if (ret) {
+		log_err("fio: getsockaddr: %s\n", strerror(errno));
+		return -1;
+	}
+
+	if (!inet_ntop(use_ipv6?AF_INET6:AF_INET, net_addr, client_sockaddr_str, INET6_ADDRSTRLEN - 1)) {
+		log_err("inet_ntop: failed to convert addr to string\n");
+		return -1;
+	}
+
+	dprint(FD_NET, "fio server bound to addr %s\n", client_sockaddr_str);
+	return 0;
+}
+
 static int accept_loop(int listen_sk)
 {
 	struct sockaddr_in addr;
@@ -950,6 +1306,7 @@
 	fio_set_fd_nonblocking(listen_sk, "server");
 
 	while (!exit_backend) {
+		struct sk_out *sk_out;
 		const char *from;
 		char buf[64];
 		pid_t pid;
@@ -999,6 +1356,13 @@
 
 		dprint(FD_NET, "server: connect from %s\n", from);
 
+		sk_out = smalloc(sizeof(*sk_out));
+		sk_out->sk = sk;
+		INIT_FLIST_HEAD(&sk_out->list);
+		__fio_mutex_init(&sk_out->lock, FIO_MUTEX_UNLOCKED);
+		__fio_mutex_init(&sk_out->wait, FIO_MUTEX_LOCKED);
+		__fio_mutex_init(&sk_out->xmit, FIO_MUTEX_UNLOCKED);
+
 		pid = fork();
 		if (pid) {
 			close(sk);
@@ -1006,8 +1370,15 @@
 			continue;
 		}
 
-		/* exits */
-		handle_connection(sk);
+		/* if error, it's already logged, non-fatal */
+		get_my_addr_str(sk);
+
+		/*
+		 * Assign sk_out here; it will be dropped in handle_connection(),
+		 * since that function calls _exit() when done.
+		 */
+		sk_out_assign(sk_out);
+		handle_connection(sk_out);
 	}
 
 	return exitval;
@@ -1015,12 +1386,13 @@
 
 int fio_server_text_output(int level, const char *buf, size_t len)
 {
+	struct sk_out *sk_out = pthread_getspecific(sk_out_key);
 	struct cmd_text_pdu *pdu;
 	unsigned int tlen;
 	struct timeval tv;
 
-	if (server_fd == -1)
-		return log_local_buf(buf, len);
+	if (!sk_out || sk_out->sk == -1)
+		return -1;
 
 	tlen = sizeof(*pdu) + len;
 	pdu = malloc(tlen);
@@ -1034,7 +1406,7 @@
 
 	memcpy(pdu->buf, buf, len);
 
-	fio_net_send_cmd(server_fd, FIO_NET_CMD_TEXT, pdu, tlen, NULL, NULL);
+	fio_net_queue_cmd(FIO_NET_CMD_TEXT, pdu, tlen, NULL, SK_F_COPY);
 	free(pdu);
 	return len;
 }
@@ -1061,7 +1433,7 @@
 		dst->min_run[i]		= cpu_to_le64(src->min_run[i]);
 		dst->max_bw[i]		= cpu_to_le64(src->max_bw[i]);
 		dst->min_bw[i]		= cpu_to_le64(src->min_bw[i]);
-		dst->io_kb[i]		= cpu_to_le64(src->io_kb[i]);
+		dst->iobytes[i]		= cpu_to_le64(src->iobytes[i]);
 		dst->agg[i]		= cpu_to_le64(src->agg[i]);
 	}
 
@@ -1079,6 +1451,8 @@
 {
 	struct cmd_ts_pdu p;
 	int i, j;
+	void *ss_buf;
+	uint64_t *ss_iops, *ss_bw;
 
 	dprint(FD_NET, "server sending end stats\n");
 
@@ -1123,10 +1497,10 @@
 		p.ts.io_u_complete[i]	= cpu_to_le32(ts->io_u_complete[i]);
 	}
 
-	for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) {
+	for (i = 0; i < FIO_IO_U_LAT_U_NR; i++)
 		p.ts.io_u_lat_u[i]	= cpu_to_le32(ts->io_u_lat_u[i]);
+	for (i = 0; i < FIO_IO_U_LAT_M_NR; i++)
 		p.ts.io_u_lat_m[i]	= cpu_to_le32(ts->io_u_lat_m[i]);
-	}
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++)
 		for (j = 0; j < FIO_IO_U_PLAT_NR; j++)
@@ -1158,9 +1532,41 @@
 	p.ts.latency_window	= cpu_to_le64(ts->latency_window);
 	p.ts.latency_percentile.u.i = cpu_to_le64(fio_double_to_uint64(ts->latency_percentile.u.f));
 
+	p.ts.nr_block_infos	= cpu_to_le64(ts->nr_block_infos);
+	for (i = 0; i < p.ts.nr_block_infos; i++)
+		p.ts.block_infos[i] = cpu_to_le32(ts->block_infos[i]);
+
+	p.ts.ss_dur		= cpu_to_le64(ts->ss_dur);
+	p.ts.ss_state		= cpu_to_le32(ts->ss_state);
+	p.ts.ss_head		= cpu_to_le32(ts->ss_head);
+	p.ts.ss_limit.u.i	= cpu_to_le64(fio_double_to_uint64(ts->ss_limit.u.f));
+	p.ts.ss_slope.u.i	= cpu_to_le64(fio_double_to_uint64(ts->ss_slope.u.f));
+	p.ts.ss_deviation.u.i	= cpu_to_le64(fio_double_to_uint64(ts->ss_deviation.u.f));
+	p.ts.ss_criterion.u.i	= cpu_to_le64(fio_double_to_uint64(ts->ss_criterion.u.f));
+
 	convert_gs(&p.rs, rs);
 
-	fio_net_send_cmd(server_fd, FIO_NET_CMD_TS, &p, sizeof(p), NULL, NULL);
+	dprint(FD_NET, "ts->ss_state = %d\n", ts->ss_state);
+	if (ts->ss_state & __FIO_SS_DATA) {
+		dprint(FD_NET, "server sending steadystate ring buffers\n");
+
+		ss_buf = malloc(sizeof(p) + 2*ts->ss_dur*sizeof(uint64_t));
+
+		memcpy(ss_buf, &p, sizeof(p));
+
+		ss_iops = (uint64_t *) ((struct cmd_ts_pdu *)ss_buf + 1);
+		ss_bw = ss_iops + (int) ts->ss_dur;
+		for (i = 0; i < ts->ss_dur; i++) {
+			ss_iops[i] = cpu_to_le64(ts->ss_iops_data[i]);
+			ss_bw[i] = cpu_to_le64(ts->ss_bw_data[i]);
+		}
+
+		fio_net_queue_cmd(FIO_NET_CMD_TS, ss_buf, sizeof(p) + 2*ts->ss_dur*sizeof(uint64_t), NULL, SK_F_COPY);
+
+		free(ss_buf);
+	}
+	else
+		fio_net_queue_cmd(FIO_NET_CMD_TS, &p, sizeof(p), NULL, SK_F_COPY);
 }
 
 void fio_server_send_gs(struct group_run_stats *rs)
@@ -1170,7 +1576,48 @@
 	dprint(FD_NET, "server sending group run stats\n");
 
 	convert_gs(&gs, rs);
-	fio_net_send_cmd(server_fd, FIO_NET_CMD_GS, &gs, sizeof(gs), NULL, NULL);
+	fio_net_queue_cmd(FIO_NET_CMD_GS, &gs, sizeof(gs), NULL, SK_F_COPY);
+}
+
+void fio_server_send_job_options(struct flist_head *opt_list,
+				 unsigned int groupid)
+{
+	struct cmd_job_option pdu;
+	struct flist_head *entry;
+
+	if (flist_empty(opt_list))
+		return;
+
+	flist_for_each(entry, opt_list) {
+		struct print_option *p;
+		size_t len;
+
+		p = flist_entry(entry, struct print_option, list);
+		memset(&pdu, 0, sizeof(pdu));
+
+		if (groupid == -1U) {
+			pdu.global = __cpu_to_le16(1);
+			pdu.groupid = 0;
+		} else {
+			pdu.global = 0;
+			pdu.groupid = cpu_to_le32(groupid);
+		}
+		len = strlen(p->name);
+		if (len >= sizeof(pdu.name)) {
+			len = sizeof(pdu.name) - 1;
+			pdu.truncated = __cpu_to_le16(1);
+		}
+		memcpy(pdu.name, p->name, len);
+		if (p->value) {
+			len = strlen(p->value);
+			if (len >= sizeof(pdu.value)) {
+				len = sizeof(pdu.value) - 1;
+				pdu.truncated = __cpu_to_le16(1);
+			}
+			memcpy(pdu.value, p->value, len);
+		}
+		fio_net_queue_cmd(FIO_NET_CMD_JOB_OPT, &pdu, sizeof(pdu), NULL, SK_F_COPY);
+	}
 }
 
 static void convert_agg(struct disk_util_agg *dst, struct disk_util_agg *src)
@@ -1225,130 +1672,319 @@
 		convert_dus(&pdu.dus, &du->dus);
 		convert_agg(&pdu.agg, &du->agg);
 
-		fio_net_send_cmd(server_fd, FIO_NET_CMD_DU, &pdu, sizeof(pdu), NULL, NULL);
+		fio_net_queue_cmd(FIO_NET_CMD_DU, &pdu, sizeof(pdu), NULL, SK_F_COPY);
 	}
 }
 
-/*
- * Send a command with a separate PDU, not inlined in the command
- */
-static int fio_send_cmd_ext_pdu(int sk, uint16_t opcode, const void *buf,
-				off_t size, uint64_t tag, uint32_t flags)
+#ifdef CONFIG_ZLIB
+
+static inline void __fio_net_prep_tail(z_stream *stream, void *out_pdu,
+					struct sk_entry **last_entry,
+					struct sk_entry *first)
 {
-	struct fio_net_cmd cmd;
-	struct iovec iov[2];
+	unsigned int this_len = FIO_SERVER_MAX_FRAGMENT_PDU - stream->avail_out;
 
-	iov[0].iov_base = (void *) &cmd;
-	iov[0].iov_len = sizeof(cmd);
-	iov[1].iov_base = (void *) buf;
-	iov[1].iov_len = size;
+	*last_entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, out_pdu, this_len,
+				 NULL, SK_F_VEC | SK_F_INLINE | SK_F_FREE);
+	flist_add_tail(&(*last_entry)->list, &first->next);
 
-	__fio_init_net_cmd(&cmd, opcode, size, tag);
-	cmd.flags = __cpu_to_le32(flags);
-	fio_net_cmd_crc_pdu(&cmd, buf);
-
-	return fio_sendv_data(sk, iov, 2);
 }
 
-static int fio_send_iolog_gz(struct cmd_iolog_pdu *pdu, struct io_log *log)
+/*
+ * Deflates the next input given, creating as many new packets in the
+ * linked list as necessary.
+ */
+static int __deflate_pdu_buffer(void *next_in, unsigned int next_sz, void **out_pdu,
+				struct sk_entry **last_entry, z_stream *stream,
+				struct sk_entry *first)
+{
+	int ret;
+
+	stream->next_in = next_in;
+	stream->avail_in = next_sz;
+	do {
+		if (! stream->avail_out) {
+
+			__fio_net_prep_tail(stream, *out_pdu, last_entry, first);
+
+			*out_pdu = malloc(FIO_SERVER_MAX_FRAGMENT_PDU);
+
+			stream->avail_out = FIO_SERVER_MAX_FRAGMENT_PDU;
+			stream->next_out = *out_pdu;
+		}
+
+		ret = deflate(stream, Z_BLOCK);
+
+		if (ret < 0) {
+			free(*out_pdu);
+			return 1;
+		}
+	} while (stream->avail_in);
+
+	return 0;
+}
+
+static int __fio_append_iolog_gz_hist(struct sk_entry *first, struct io_log *log,
+				      struct io_logs *cur_log, z_stream *stream)
+{
+	struct sk_entry *entry;
+	void *out_pdu;
+	int ret, i, j;
+	int sample_sz = log_entry_sz(log);
+
+	out_pdu = malloc(FIO_SERVER_MAX_FRAGMENT_PDU);
+	stream->avail_out = FIO_SERVER_MAX_FRAGMENT_PDU;
+	stream->next_out = out_pdu;
+
+	for (i = 0; i < cur_log->nr_samples; i++) {
+		struct io_sample *s;
+		struct io_u_plat_entry *cur_plat_entry, *prev_plat_entry;
+		unsigned int *cur_plat, *prev_plat;
+
+		s = get_sample(log, cur_log, i);
+		ret = __deflate_pdu_buffer(s, sample_sz, &out_pdu, &entry, stream, first);
+		if (ret)
+			return ret;
+
+		/* Do the subtraction on the server side so the client doesn't
+		 * have to reconstruct our linked list from packets.
+		 */
+		cur_plat_entry  = s->data.plat_entry;
+		prev_plat_entry = flist_first_entry(&cur_plat_entry->list, struct io_u_plat_entry, list);
+		cur_plat  = cur_plat_entry->io_u_plat;
+		prev_plat = prev_plat_entry->io_u_plat;
+
+		for (j = 0; j < FIO_IO_U_PLAT_NR; j++) {
+			cur_plat[j] -= prev_plat[j];
+		}
+
+		flist_del(&prev_plat_entry->list);
+		free(prev_plat_entry);
+
+		ret = __deflate_pdu_buffer(cur_plat_entry, sizeof(*cur_plat_entry),
+					   &out_pdu, &entry, stream, first);
+
+		if (ret)
+			return ret;
+	}
+
+	__fio_net_prep_tail(stream, out_pdu, &entry, first);
+
+	return 0;
+}
+
+static int __fio_append_iolog_gz(struct sk_entry *first, struct io_log *log,
+				 struct io_logs *cur_log, z_stream *stream)
+{
+	unsigned int this_len;
+	void *out_pdu;
+	int ret;
+
+	if (log->log_type == IO_LOG_TYPE_HIST)
+		return __fio_append_iolog_gz_hist(first, log, cur_log, stream);
+
+	stream->next_in = (void *) cur_log->log;
+	stream->avail_in = cur_log->nr_samples * log_entry_sz(log);
+
+	do {
+		struct sk_entry *entry;
+
+		/*
+		 * Dirty - since the log is potentially huge, compress it into
+		 * FIO_SERVER_MAX_FRAGMENT_PDU chunks and let the receiving
+		 * side defragment it.
+		 */
+		out_pdu = malloc(FIO_SERVER_MAX_FRAGMENT_PDU);
+
+		stream->avail_out = FIO_SERVER_MAX_FRAGMENT_PDU;
+		stream->next_out = out_pdu;
+		ret = deflate(stream, Z_BLOCK);
+		/* may be Z_OK, or Z_STREAM_END */
+		if (ret < 0) {
+			free(out_pdu);
+			return 1;
+		}
+
+		this_len = FIO_SERVER_MAX_FRAGMENT_PDU - stream->avail_out;
+
+		entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, out_pdu, this_len,
+					 NULL, SK_F_VEC | SK_F_INLINE | SK_F_FREE);
+		flist_add_tail(&entry->list, &first->next);
+	} while (stream->avail_in);
+
+	return 0;
+}
+
+static int fio_append_iolog_gz(struct sk_entry *first, struct io_log *log)
 {
 	int ret = 0;
-#ifdef CONFIG_ZLIB
 	z_stream stream;
-	void *out_pdu;
 
-	/*
-	 * Dirty - since the log is potentially huge, compress it into
-	 * FIO_SERVER_MAX_FRAGMENT_PDU chunks and let the receiving
-	 * side defragment it.
-	 */
-	out_pdu = malloc(FIO_SERVER_MAX_FRAGMENT_PDU);
-
+	memset(&stream, 0, sizeof(stream));
 	stream.zalloc = Z_NULL;
 	stream.zfree = Z_NULL;
 	stream.opaque = Z_NULL;
 
-	if (deflateInit(&stream, Z_DEFAULT_COMPRESSION) != Z_OK) {
-		ret = 1;
-		goto err;
+	if (deflateInit(&stream, Z_DEFAULT_COMPRESSION) != Z_OK)
+		return 1;
+
+	while (!flist_empty(&log->io_logs)) {
+		struct io_logs *cur_log;
+
+		cur_log = flist_first_entry(&log->io_logs, struct io_logs, list);
+		flist_del_init(&cur_log->list);
+
+		ret = __fio_append_iolog_gz(first, log, cur_log, &stream);
+		if (ret)
+			break;
 	}
 
-	stream.next_in = (void *) log->log;
-	stream.avail_in = log->nr_samples * log_entry_sz(log);
+	ret = deflate(&stream, Z_FINISH);
 
-	do {
-		unsigned int this_len, flags = 0;
+	while (ret != Z_STREAM_END) {
+		struct sk_entry *entry;
+		unsigned int this_len;
+		void *out_pdu;
 
+		out_pdu = malloc(FIO_SERVER_MAX_FRAGMENT_PDU);
 		stream.avail_out = FIO_SERVER_MAX_FRAGMENT_PDU;
 		stream.next_out = out_pdu;
+
 		ret = deflate(&stream, Z_FINISH);
 		/* may be Z_OK, or Z_STREAM_END */
-		if (ret < 0)
-			goto err_zlib;
+		if (ret < 0) {
+			free(out_pdu);
+			break;
+		}
 
 		this_len = FIO_SERVER_MAX_FRAGMENT_PDU - stream.avail_out;
 
-		if (stream.avail_in)
-			flags = FIO_NET_CMD_F_MORE;
+		entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, out_pdu, this_len,
+					 NULL, SK_F_VEC | SK_F_INLINE | SK_F_FREE);
+		flist_add_tail(&entry->list, &first->next);
+	}
 
-		ret = fio_send_cmd_ext_pdu(server_fd, FIO_NET_CMD_IOLOG,
-					   out_pdu, this_len, 0, flags);
-		if (ret)
-			goto err_zlib;
-	} while (stream.avail_in);
+	ret = deflateEnd(&stream);
+	if (ret == Z_OK)
+		return 0;
 
-err_zlib:
-	deflateEnd(&stream);
-err:
-	free(out_pdu);
+	return 1;
+}
+#else
+static int fio_append_iolog_gz(struct sk_entry *first, struct io_log *log)
+{
+	return 1;
+}
 #endif
-	return ret;
+
+static int fio_append_gz_chunks(struct sk_entry *first, struct io_log *log)
+{
+	struct sk_entry *entry;
+	struct flist_head *node;
+
+	pthread_mutex_lock(&log->chunk_lock);
+	flist_for_each(node, &log->chunk_list) {
+		struct iolog_compress *c;
+
+		c = flist_entry(node, struct iolog_compress, list);
+		entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, c->buf, c->len,
+						NULL, SK_F_VEC | SK_F_INLINE);
+		flist_add_tail(&entry->list, &first->next);
+	}
+	pthread_mutex_unlock(&log->chunk_lock);
+
+	return 0;
+}
+
+static int fio_append_text_log(struct sk_entry *first, struct io_log *log)
+{
+	struct sk_entry *entry;
+
+	while (!flist_empty(&log->io_logs)) {
+		struct io_logs *cur_log;
+		size_t size;
+
+		cur_log = flist_first_entry(&log->io_logs, struct io_logs, list);
+		flist_del_init(&cur_log->list);
+
+		size = cur_log->nr_samples * log_entry_sz(log);
+
+		entry = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, cur_log->log, size,
+						NULL, SK_F_VEC | SK_F_INLINE);
+		flist_add_tail(&entry->list, &first->next);
+	}
+
+	return 0;
 }
 
 int fio_send_iolog(struct thread_data *td, struct io_log *log, const char *name)
 {
 	struct cmd_iolog_pdu pdu;
-	int i, ret = 0;
+	struct sk_entry *first;
+	struct flist_head *entry;
+	int ret = 0;
 
-	pdu.nr_samples = cpu_to_le64(log->nr_samples);
+	pdu.nr_samples = cpu_to_le64(iolog_nr_samples(log));
 	pdu.thread_number = cpu_to_le32(td->thread_number);
 	pdu.log_type = cpu_to_le32(log->log_type);
-	pdu.compressed = cpu_to_le32(use_zlib);
+	pdu.log_hist_coarseness = cpu_to_le32(log->hist_coarseness);
+
+	if (!flist_empty(&log->chunk_list))
+		pdu.compressed = __cpu_to_le32(STORE_COMPRESSED);
+	else if (use_zlib)
+		pdu.compressed = __cpu_to_le32(XMIT_COMPRESSED);
+	else
+		pdu.compressed = 0;
 
 	strncpy((char *) pdu.name, name, FIO_NET_NAME_MAX);
 	pdu.name[FIO_NET_NAME_MAX - 1] = '\0';
 
-	for (i = 0; i < log->nr_samples; i++) {
-		struct io_sample *s = get_sample(log, i);
+	/*
+	 * We can't do this for a pre-compressed log, but for that case,
+	 * log->nr_samples is zero anyway.
+	 */
+	flist_for_each(entry, &log->io_logs) {
+		struct io_logs *cur_log;
+		int i;
 
-		s->time		= cpu_to_le64(s->time);
-		s->val		= cpu_to_le64(s->val);
-		s->__ddir	= cpu_to_le32(s->__ddir);
-		s->bs		= cpu_to_le32(s->bs);
+		cur_log = flist_entry(entry, struct io_logs, list);
 
-		if (log->log_offset) {
-			struct io_sample_offset *so = (void *) s;
+		for (i = 0; i < cur_log->nr_samples; i++) {
+			struct io_sample *s = get_sample(log, cur_log, i);
 
-			so->offset = cpu_to_le64(so->offset);
+			s->time		= cpu_to_le64(s->time);
+			s->data.val	= cpu_to_le64(s->data.val);
+			s->__ddir	= cpu_to_le32(s->__ddir);
+			s->bs		= cpu_to_le32(s->bs);
+
+			if (log->log_offset) {
+				struct io_sample_offset *so = (void *) s;
+
+				so->offset = cpu_to_le64(so->offset);
+			}
 		}
 	}
 
 	/*
-	 * Send header first, it's not compressed.
+	 * Assemble header entry first
 	 */
-	ret = fio_send_cmd_ext_pdu(server_fd, FIO_NET_CMD_IOLOG, &pdu,
-					sizeof(pdu), 0, FIO_NET_CMD_F_MORE);
-	if (ret)
-		return ret;
+	first = fio_net_prep_cmd(FIO_NET_CMD_IOLOG, &pdu, sizeof(pdu), NULL, SK_F_VEC | SK_F_INLINE | SK_F_COPY);
 
 	/*
-	 * Now send actual log, compress if we can, otherwise just plain
+	 * Now append actual log entries. If log compression was enabled on
+	 * the job, just send out the compressed chunks directly. If we
+	 * have a plain log, compress if we can, then send. Otherwise, send
+	 * the plain text output.
 	 */
-	if (use_zlib)
-		return fio_send_iolog_gz(&pdu, log);
+	if (!flist_empty(&log->chunk_list))
+		ret = fio_append_gz_chunks(first, log);
+	else if (use_zlib)
+		ret = fio_append_iolog_gz(first, log);
+	else
+		ret = fio_append_text_log(first, log);
 
-	return fio_send_cmd_ext_pdu(server_fd, FIO_NET_CMD_IOLOG, log->log,
-			log->nr_samples * log_entry_sz(log), 0, 0);
+	fio_net_queue_entry(first);
+	return ret;
 }
 
 void fio_server_send_add_job(struct thread_data *td)
@@ -1360,14 +1996,17 @@
 	pdu.groupid = cpu_to_le32(td->groupid);
 	convert_thread_options_to_net(&pdu.top, &td->o);
 
-	fio_net_send_cmd(server_fd, FIO_NET_CMD_ADD_JOB, &pdu, sizeof(pdu), NULL, NULL);
+	fio_net_queue_cmd(FIO_NET_CMD_ADD_JOB, &pdu, sizeof(pdu), NULL,
+				SK_F_COPY);
 }
 
 void fio_server_send_start(struct thread_data *td)
 {
-	assert(server_fd != -1);
+	struct sk_out *sk_out = pthread_getspecific(sk_out_key);
 
-	fio_net_send_simple_cmd(server_fd, FIO_NET_CMD_SERVER_START, 0, NULL);
+	assert(sk_out->sk != -1);
+
+	fio_net_queue_cmd(FIO_NET_CMD_SERVER_START, NULL, 0, NULL, SK_F_SIMPLE);
 }
 
 int fio_server_get_verify_state(const char *name, int threadnumber,
@@ -1378,14 +2017,13 @@
 	struct cmd_reply *rep;
 	uint64_t tag;
 	void *data;
+	int ret;
 
 	dprint(FD_NET, "server: request verify state\n");
 
 	rep = smalloc(sizeof(*rep));
-	if (!rep) {
-		log_err("fio: smalloc pool too small\n");
-		return 1;
-	}
+	if (!rep)
+		return ENOMEM;
 
 	__fio_mutex_init(&rep->lock, FIO_MUTEX_LOCKED);
 	rep->data = NULL;
@@ -1394,24 +2032,27 @@
 	verify_state_gen_name((char *) out.path, sizeof(out.path), name, me,
 				threadnumber);
 	tag = (uint64_t) (uintptr_t) rep;
-	fio_net_send_cmd(server_fd, FIO_NET_CMD_SENDFILE, &out, sizeof(out),
-				&tag, NULL);
+	fio_net_queue_cmd(FIO_NET_CMD_SENDFILE, &out, sizeof(out), &tag,
+				SK_F_COPY);
 
 	/*
 	 * Wait for the backend to receive the reply
 	 */
-	if (fio_mutex_down_timeout(&rep->lock, 10)) {
+	if (fio_mutex_down_timeout(&rep->lock, 10000)) {
 		log_err("fio: timed out waiting for reply\n");
+		ret = ETIMEDOUT;
 		goto fail;
 	}
 
 	if (rep->error) {
-		log_err("fio: failure on receiving state file: %s\n", strerror(rep->error));
+		log_err("fio: failure on receiving state file %s: %s\n",
+				out.path, strerror(rep->error));
+		ret = rep->error;
 fail:
 		*datap = NULL;
 		sfree(rep);
-		fio_net_send_quit(server_fd);
-		return 1;
+		fio_net_queue_quit();
+		return ret;
 	}
 
 	/*
@@ -1419,12 +2060,15 @@
 	 * the header, and the thread_io_list checksum
 	 */
 	s = rep->data + sizeof(struct verify_state_hdr);
-	if (verify_state_hdr(rep->data, s))
+	if (verify_state_hdr(rep->data, s)) {
+		ret = EILSEQ;
 		goto fail;
+	}
 
 	/*
 	 * Don't need the header from now, copy just the thread_io_list
 	 */
+	ret = 0;
 	rep->size -= sizeof(struct verify_state_hdr);
 	data = malloc(rep->size);
 	memcpy(data, s, rep->size);
@@ -1433,7 +2077,7 @@
 	sfree(rep->data);
 	__fio_mutex_remove(&rep->lock);
 	sfree(rep);
-	return 0;
+	return ret;
 }
 
 static int fio_init_server_ip(void)
@@ -1456,16 +2100,15 @@
 
 	opt = 1;
 	if (setsockopt(sk, SOL_SOCKET, SO_REUSEADDR, (void *)&opt, sizeof(opt)) < 0) {
-		log_err("fio: setsockopt: %s\n", strerror(errno));
+		log_err("fio: setsockopt(REUSEADDR): %s\n", strerror(errno));
 		close(sk);
 		return -1;
 	}
 #ifdef SO_REUSEPORT
-	if (setsockopt(sk, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt)) < 0) {
-		log_err("fio: setsockopt: %s\n", strerror(errno));
-		close(sk);
-		return -1;
-	}
+	/*
+	 * Not fatal if this fails, so just ignore it if it does
+	 */
+	setsockopt(sk, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt));
 #endif
 
 	if (use_ipv6) {
@@ -1567,7 +2210,7 @@
 
 	log_info("fio: server listening on %s\n", bind_str);
 
-	if (listen(sk, 0) < 0) {
+	if (listen(sk, 4) < 0) {
 		log_err("fio: listen: %s\n", strerror(errno));
 		close(sk);
 		return -1;
@@ -1758,6 +2401,22 @@
 	sigaction(SIGINT, &act, NULL);
 }
 
+void fio_server_destroy_sk_key(void)
+{
+	pthread_key_delete(sk_out_key);
+}
+
+int fio_server_create_sk_key(void)
+{
+	if (pthread_key_create(&sk_out_key, NULL)) {
+		log_err("fio: can't create sk_out backend key\n");
+		return 1;
+	}
+
+	pthread_setspecific(sk_out_key, NULL);
+	return 0;
+}
+
 static int fio_server(void)
 {
 	int sk, ret;
@@ -1789,8 +2448,12 @@
 
 void fio_server_got_signal(int signal)
 {
+	struct sk_out *sk_out = pthread_getspecific(sk_out_key);
+
+	assert(sk_out);
+
 	if (signal == SIGPIPE)
-		server_fd = -1;
+		sk_out->sk = -1;
 	else {
 		log_info("\nfio: terminating on signal %d\n", signal);
 		exit_backend = 1;
@@ -1864,7 +2527,7 @@
 
 	pid = fork();
 	if (pid < 0) {
-		log_err("fio: failed server fork: %s", strerror(errno));
+		log_err("fio: failed server fork: %s\n", strerror(errno));
 		free(pidfile);
 		return -1;
 	} else if (pid) {
diff --git a/server.h b/server.h
index dc5a69e..5c720d4 100644
--- a/server.h
+++ b/server.h
@@ -12,6 +12,17 @@
 
 #define FIO_NET_PORT 8765
 
+struct sk_out {
+	unsigned int refs;	/* the sk_out is freed when this drops to zero;
+				 * protected by ->lock below */
+
+	int sk;			/* socket fd to talk to client */
+	struct fio_mutex lock;	/* protects refs and the list below */
+	struct flist_head list;	/* list of pending transmit work */
+	struct fio_mutex wait;	/* wake backend when items added to list */
+	struct fio_mutex xmit;	/* held while sending data */
+};
+
 /*
  * On-wire encoding is little endian
  */
@@ -38,7 +49,7 @@
 };
 
 enum {
-	FIO_SERVER_VER			= 42,
+	FIO_SERVER_VER			= 61,
 
 	FIO_SERVER_MAX_FRAGMENT_PDU	= 1024,
 	FIO_SERVER_MAX_CMD_MB		= 2048,
@@ -64,7 +75,8 @@
 	FIO_NET_CMD_LOAD_FILE		= 19,
 	FIO_NET_CMD_VTRIGGER		= 20,
 	FIO_NET_CMD_SENDFILE		= 21,
-	FIO_NET_CMD_NR			= 22,
+	FIO_NET_CMD_JOB_OPT		= 22,
+	FIO_NET_CMD_NR			= 23,
 
 	FIO_NET_CMD_F_MORE		= 1UL << 0,
 
@@ -171,16 +183,30 @@
 	uint8_t buf[0];
 };
 
+enum {
+	XMIT_COMPRESSED		= 1U,
+	STORE_COMPRESSED	= 2U,
+};
+
 struct cmd_iolog_pdu {
 	uint64_t nr_samples;
 	uint32_t thread_number;
 	uint32_t log_type;
 	uint32_t compressed;
 	uint32_t log_offset;
+	uint32_t log_hist_coarseness;
 	uint8_t name[FIO_NET_NAME_MAX];
 	struct io_sample samples[0];
 };
 
+struct cmd_job_option {
+	uint16_t global;
+	uint16_t truncated;
+	uint32_t groupid;
+	uint8_t name[64];
+	uint8_t value[128];
+};
+
 extern int fio_start_server(char *);
 extern int fio_server_text_output(int, const char *, size_t);
 extern int fio_net_send_cmd(int, uint16_t, const void *, off_t, uint64_t *, struct flist_head *);
@@ -196,44 +222,20 @@
 extern void fio_server_send_ts(struct thread_stat *, struct group_run_stats *);
 extern void fio_server_send_gs(struct group_run_stats *);
 extern void fio_server_send_du(void);
-extern void fio_server_idle_loop(void);
+extern void fio_server_send_job_options(struct flist_head *, unsigned int);
 extern int fio_server_get_verify_state(const char *, int, void **);
 
-extern int fio_recv_data(int sk, void *p, unsigned int len);
-extern int fio_send_data(int sk, const void *p, unsigned int len);
-extern void fio_net_cmd_crc(struct fio_net_cmd *);
-extern void fio_net_cmd_crc_pdu(struct fio_net_cmd *, const void *);
-extern struct fio_net_cmd *fio_net_recv_cmd(int sk);
+extern struct fio_net_cmd *fio_net_recv_cmd(int sk, bool wait);
 
 extern int fio_send_iolog(struct thread_data *, struct io_log *, const char *);
 extern void fio_server_send_add_job(struct thread_data *);
 extern void fio_server_send_start(struct thread_data *);
-extern int fio_net_send_stop(int sk, int error, int signal);
 extern int fio_net_send_quit(int sk);
 
+extern int fio_server_create_sk_key(void);
+extern void fio_server_destroy_sk_key(void);
+
 extern int exit_backend;
 extern int fio_net_port;
 
-static inline void __fio_init_net_cmd(struct fio_net_cmd *cmd, uint16_t opcode,
-				      uint32_t pdu_len, uint64_t tag)
-{
-	memset(cmd, 0, sizeof(*cmd));
-
-	cmd->version	= __cpu_to_le16(FIO_SERVER_VER);
-	cmd->opcode	= cpu_to_le16(opcode);
-	cmd->tag	= cpu_to_le64(tag);
-	cmd->pdu_len	= cpu_to_le32(pdu_len);
-}
-
-
-static inline void fio_init_net_cmd(struct fio_net_cmd *cmd, uint16_t opcode,
-				    const void *pdu, uint32_t pdu_len,
-				    uint64_t tag)
-{
-	__fio_init_net_cmd(cmd, opcode, pdu_len, tag);
-
-	if (pdu)
-		memcpy(&cmd->payload, pdu, pdu_len);
-}
-
 #endif
diff --git a/smalloc.c b/smalloc.c
index b460d65..e48cfe8 100644
--- a/smalloc.c
+++ b/smalloc.c
@@ -13,6 +13,7 @@
 #include <limits.h>
 #include <fcntl.h>
 
+#include "fio.h"
 #include "mutex.h"
 #include "arch/arch.h"
 #include "os/os.h"
@@ -26,13 +27,17 @@
 #define SMALLOC_BPL	(SMALLOC_BPB * SMALLOC_BPI)
 
 #define INITIAL_SIZE	16*1024*1024	/* new pool size */
-#define MAX_POOLS	8		/* maximum number of pools to setup */
+#define INITIAL_POOLS	8		/* maximum number of pools to setup */
+
+#define MAX_POOLS	16
 
 #define SMALLOC_PRE_RED		0xdeadbeefU
 #define SMALLOC_POST_RED	0x5aa55aa5U
 
 unsigned int smalloc_pool_size = INITIAL_SIZE;
+#ifdef SMALLOC_REDZONE
 static const int int_mask = sizeof(int) - 1;
+#endif
 
 struct pool {
 	struct fio_mutex *lock;			/* protects this pool */
@@ -54,37 +59,6 @@
 static struct pool mp[MAX_POOLS];
 static unsigned int nr_pools;
 static unsigned int last_pool;
-static struct fio_rwlock *lock;
-
-static inline void pool_lock(struct pool *pool)
-{
-	fio_mutex_down(pool->lock);
-}
-
-static inline void pool_unlock(struct pool *pool)
-{
-	fio_mutex_up(pool->lock);
-}
-
-static inline void global_read_lock(void)
-{
-	fio_rwlock_read(lock);
-}
-
-static inline void global_read_unlock(void)
-{
-	fio_rwlock_unlock(lock);
-}
-
-static inline void global_write_lock(void)
-{
-	fio_rwlock_write(lock);
-}
-
-static inline void global_write_unlock(void)
-{
-	fio_rwlock_unlock(lock);
-}
 
 static inline int ptr_valid(struct pool *pool, void *ptr)
 {
@@ -178,12 +152,15 @@
 	return ffz(word) + start;
 }
 
-static int add_pool(struct pool *pool, unsigned int alloc_size)
+static bool add_pool(struct pool *pool, unsigned int alloc_size)
 {
 	int bitmap_blocks;
 	int mmap_flags;
 	void *ptr;
 
+	if (nr_pools == MAX_POOLS)
+		return false;
+
 #ifdef SMALLOC_REDZONE
 	alloc_size += sizeof(unsigned int);
 #endif
@@ -211,32 +188,31 @@
 	if (ptr == MAP_FAILED)
 		goto out_fail;
 
-	memset(ptr, 0, alloc_size);
 	pool->map = ptr;
 	pool->bitmap = (void *) ptr + (pool->nr_blocks * SMALLOC_BPL);
+	memset(pool->bitmap, 0, bitmap_blocks * sizeof(unsigned int));
 
 	pool->lock = fio_mutex_init(FIO_MUTEX_UNLOCKED);
 	if (!pool->lock)
 		goto out_fail;
 
 	nr_pools++;
-	return 0;
+	return true;
 out_fail:
 	log_err("smalloc: failed adding pool\n");
 	if (pool->map)
 		munmap(pool->map, pool->mmap_size);
-	return 1;
+	return false;
 }
 
 void sinit(void)
 {
-	int i, ret;
+	bool ret;
+	int i;
 
-	lock = fio_rwlock_init();
-
-	for (i = 0; i < MAX_POOLS; i++) {
-		ret = add_pool(&mp[i], INITIAL_SIZE);
-		if (ret)
+	for (i = 0; i < INITIAL_POOLS; i++) {
+		ret = add_pool(&mp[nr_pools], smalloc_pool_size);
+		if (!ret)
 			break;
 	}
 
@@ -265,9 +241,6 @@
 
 	for (i = 0; i < nr_pools; i++)
 		cleanup_pool(&mp[i]);
-
-	if (lock)
-		fio_rwlock_remove(lock);
 }
 
 #ifdef SMALLOC_REDZONE
@@ -276,7 +249,7 @@
 	uintptr_t ptr;
 
 	ptr = (uintptr_t) hdr + hdr->size - sizeof(unsigned int);
-	ptr = (ptr + int_mask) & ~int_mask;
+	ptr = (uintptr_t) PTR_ALIGN(ptr, int_mask);
 
 	return (void *) ptr;
 }
@@ -336,12 +309,12 @@
 	i = offset / SMALLOC_BPL;
 	idx = (offset % SMALLOC_BPL) / SMALLOC_BPB;
 
-	pool_lock(pool);
+	fio_mutex_down(pool->lock);
 	clear_blocks(pool, i, idx, size_to_blocks(hdr->size));
 	if (i < pool->next_non_full)
 		pool->next_non_full = i;
 	pool->free_blocks += size_to_blocks(hdr->size);
-	pool_unlock(pool);
+	fio_mutex_up(pool->lock);
 }
 
 void sfree(void *ptr)
@@ -352,8 +325,6 @@
 	if (!ptr)
 		return;
 
-	global_read_lock();
-
 	for (i = 0; i < nr_pools; i++) {
 		if (ptr_valid(&mp[i], ptr)) {
 			pool = &mp[i];
@@ -361,10 +332,12 @@
 		}
 	}
 
-	global_read_unlock();
+	if (pool) {
+		sfree_pool(pool, ptr);
+		return;
+	}
 
-	assert(pool);
-	sfree_pool(pool, ptr);
+	log_err("smalloc: ptr %p not from smalloc pool\n", ptr);
 }
 
 static void *__smalloc_pool(struct pool *pool, size_t size)
@@ -375,7 +348,7 @@
 	unsigned int last_idx;
 	void *ret = NULL;
 
-	pool_lock(pool);
+	fio_mutex_down(pool->lock);
 
 	nr_blocks = size_to_blocks(size);
 	if (nr_blocks > pool->free_blocks)
@@ -418,7 +391,7 @@
 		ret = pool->map + offset;
 	}
 fail:
-	pool_unlock(pool);
+	fio_mutex_up(pool->lock);
 	return ret;
 }
 
@@ -457,7 +430,6 @@
 	if (size != (unsigned int) size)
 		return NULL;
 
-	global_write_lock();
 	i = last_pool;
 	end_pool = nr_pools;
 
@@ -467,7 +439,6 @@
 
 			if (ptr) {
 				last_pool = i;
-				global_write_unlock();
 				return ptr;
 			}
 		}
@@ -480,19 +451,14 @@
 		break;
 	} while (1);
 
-	global_write_unlock();
+	log_err("smalloc: OOM. Consider using --alloc-size to increase the "
+		"shared memory available.\n");
 	return NULL;
 }
 
 void *scalloc(size_t nmemb, size_t size)
 {
-	void *ret;
-
-	ret = smalloc(nmemb * size);
-	if (ret)
-		memset(ret, 0, nmemb * size);
-
-	return ret;
+	return smalloc(nmemb * size);
 }
 
 char *smalloc_strdup(const char *str)
diff --git a/stat.c b/stat.c
index 6a3610f..5b48413 100644
--- a/stat.c
+++ b/stat.c
@@ -13,9 +13,25 @@
 #include "json.h"
 #include "lib/getrusage.h"
 #include "idletime.h"
+#include "lib/pow2.h"
+#include "lib/output_buffer.h"
+#include "helper_thread.h"
+#include "smalloc.h"
+
+#define LOG_MSEC_SLACK	10
 
 struct fio_mutex *stat_mutex;
 
+void clear_rusage_stat(struct thread_data *td)
+{
+	struct thread_stat *ts = &td->ts;
+
+	fio_getrusage(&td->ru_start);
+	ts->usr_time = ts->sys_time = 0;
+	ts->ctx = 0;
+	ts->minf = ts->majf = 0;
+}
+
 void update_rusage_stat(struct thread_data *td)
 {
 	struct thread_stat *ts = &td->ts;
@@ -82,7 +98,7 @@
  * Convert the given index of the bucket array to the value
  * represented by the bucket
  */
-static unsigned int plat_idx_to_val(unsigned int idx)
+static unsigned long long plat_idx_to_val(unsigned int idx)
 {
 	unsigned int error_bits, k, base;
 
@@ -182,7 +198,8 @@
  * Find and display the p-th percentile of clat
  */
 static void show_clat_percentiles(unsigned int *io_u_plat, unsigned long nr,
-				  fio_fp64_t *plist, unsigned int precision)
+				  fio_fp64_t *plist, unsigned int precision,
+				  struct buf_output *out)
 {
 	unsigned int len, j = 0, minv, maxv;
 	unsigned int *ovals;
@@ -199,10 +216,10 @@
 	 */
 	if (minv > 2000 && maxv > 99999) {
 		scale_down = 1;
-		log_info("    clat percentiles (msec):\n     |");
+		log_buf(out, "    clat percentiles (msec):\n     |");
 	} else {
 		scale_down = 0;
-		log_info("    clat percentiles (usec):\n     |");
+		log_buf(out, "    clat percentiles (usec):\n     |");
 	}
 
 	snprintf(fmt, sizeof(fmt), "%%1.%uf", precision);
@@ -213,7 +230,7 @@
 
 		/* for formatting */
 		if (j != 0 && (j % per_line) == 0)
-			log_info("     |");
+			log_buf(out, "     |");
 
 		/* end of the list */
 		is_last = (j == len - 1);
@@ -226,13 +243,13 @@
 		if (scale_down)
 			ovals[j] = (ovals[j] + 999) / 1000;
 
-		log_info(" %sth=[%5u]%c", fbuf, ovals[j], is_last ? '\n' : ',');
+		log_buf(out, " %sth=[%5u]%c", fbuf, ovals[j], is_last ? '\n' : ',');
 
 		if (is_last)
 			break;
 
 		if ((j % per_line) == per_line - 1)	/* for formatting */
-			log_info("\n");
+			log_buf(out, "\n");
 	}
 
 out:
@@ -240,13 +257,13 @@
 		free(ovals);
 }
 
-int calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max,
-	     double *mean, double *dev)
+bool calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max,
+	      double *mean, double *dev)
 {
 	double n = (double) is->samples;
 
 	if (n == 0)
-		return 0;
+		return false;
 
 	*min = is->min_val;
 	*max = is->max_val;
@@ -257,16 +274,17 @@
 	else
 		*dev = 0;
 
-	return 1;
+	return true;
 }
 
-void show_group_stats(struct group_run_stats *rs)
+void show_group_stats(struct group_run_stats *rs, struct buf_output *out)
 {
-	char *p1, *p2, *p3, *p4;
+	char *io, *agg, *min, *max;
+	char *ioalt, *aggalt, *minalt, *maxalt;
 	const char *str[] = { "   READ", "  WRITE" , "   TRIM"};
 	int i;
 
-	log_info("\nRun status group %d (all jobs):\n", rs->groupid);
+	log_buf(out, "\nRun status group %d (all jobs):\n", rs->groupid);
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		const int i2p = is_power_of_2(rs->kb_base);
@@ -274,22 +292,28 @@
 		if (!rs->max_run[i])
 			continue;
 
-		p1 = num2str(rs->io_kb[i], 6, rs->kb_base, i2p, 8);
-		p2 = num2str(rs->agg[i], 6, rs->kb_base, i2p, rs->unit_base);
-		p3 = num2str(rs->min_bw[i], 6, rs->kb_base, i2p, rs->unit_base);
-		p4 = num2str(rs->max_bw[i], 6, rs->kb_base, i2p, rs->unit_base);
-
-		log_info("%s: io=%s, aggrb=%s/s, minb=%s/s, maxb=%s/s,"
-			 " mint=%llumsec, maxt=%llumsec\n",
+		io = num2str(rs->iobytes[i], 4, 1, i2p, N2S_BYTE);
+		ioalt = num2str(rs->iobytes[i], 4, 1, !i2p, N2S_BYTE);
+		agg = num2str(rs->agg[i], 4, 1, i2p, rs->unit_base);
+		aggalt = num2str(rs->agg[i], 4, 1, !i2p, rs->unit_base);
+		min = num2str(rs->min_bw[i], 4, 1, i2p, rs->unit_base);
+		minalt = num2str(rs->min_bw[i], 4, 1, !i2p, rs->unit_base);
+		max = num2str(rs->max_bw[i], 4, 1, i2p, rs->unit_base);
+		maxalt = num2str(rs->max_bw[i], 4, 1, !i2p, rs->unit_base);
+		log_buf(out, "%s: bw=%s (%s), %s-%s (%s-%s), io=%s (%s), run=%llu-%llumsec\n",
 				rs->unified_rw_rep ? "  MIXED" : str[i],
-				p1, p2, p3, p4,
+				agg, aggalt, min, max, minalt, maxalt, io, ioalt,
 				(unsigned long long) rs->min_run[i],
 				(unsigned long long) rs->max_run[i]);
 
-		free(p1);
-		free(p2);
-		free(p3);
-		free(p4);
+		free(io);
+		free(agg);
+		free(min);
+		free(max);
+		free(ioalt);
+		free(aggalt);
+		free(minalt);
+		free(maxalt);
 	}
 }
 
@@ -342,18 +366,18 @@
 }
 
 static void display_lat(const char *name, unsigned long min, unsigned long max,
-			double mean, double dev)
+			double mean, double dev, struct buf_output *out)
 {
 	const char *base = "(usec)";
 	char *minp, *maxp;
 
-	if (!usec_to_msec(&min, &max, &mean, &dev))
+	if (usec_to_msec(&min, &max, &mean, &dev))
 		base = "(msec)";
 
-	minp = num2str(min, 6, 1, 0, 0);
-	maxp = num2str(max, 6, 1, 0, 0);
+	minp = num2str(min, 6, 1, 0, N2S_NONE);
+	maxp = num2str(max, 6, 1, 0, N2S_NONE);
 
-	log_info("    %s %s: min=%s, max=%s, avg=%5.02f,"
+	log_buf(out, "    %s %s: min=%s, max=%s, avg=%5.02f,"
 		 " stdev=%5.02f\n", name, base, minp, maxp, mean, dev);
 
 	free(minp);
@@ -361,13 +385,13 @@
 }
 
 static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
-			     int ddir)
+			     int ddir, struct buf_output *out)
 {
-	const char *str[] = { "read ", "write", "trim" };
+	const char *str[] = { " read", "write", " trim" };
 	unsigned long min, max, runt;
 	unsigned long long bw, iops;
 	double mean, dev;
-	char *io_p, *bw_p, *iops_p;
+	char *io_p, *bw_p, *bw_p_alt, *iops_p;
 	int i2p;
 
 	assert(ddir_rw(ddir));
@@ -379,37 +403,48 @@
 	runt = ts->runtime[ddir];
 
 	bw = (1000 * ts->io_bytes[ddir]) / runt;
-	io_p = num2str(ts->io_bytes[ddir], 6, 1, i2p, 8);
-	bw_p = num2str(bw, 6, 1, i2p, ts->unit_base);
+	io_p = num2str(ts->io_bytes[ddir], 4, 1, i2p, N2S_BYTE);
+	bw_p = num2str(bw, 4, 1, i2p, ts->unit_base);
+	bw_p_alt = num2str(bw, 4, 1, !i2p, ts->unit_base);
 
 	iops = (1000 * (uint64_t)ts->total_io_u[ddir]) / runt;
-	iops_p = num2str(iops, 6, 1, 0, 0);
+	iops_p = num2str(iops, 4, 1, 0, N2S_NONE);
 
-	log_info("  %s: io=%s, bw=%s/s, iops=%s, runt=%6llumsec\n",
-				rs->unified_rw_rep ? "mixed" : str[ddir],
-				io_p, bw_p, iops_p,
-				(unsigned long long) ts->runtime[ddir]);
+	log_buf(out, "  %s: IOPS=%s, BW=%s (%s)(%s/%llumsec)\n",
+			rs->unified_rw_rep ? "mixed" : str[ddir],
+			iops_p, bw_p, bw_p_alt, io_p,
+			(unsigned long long) ts->runtime[ddir]);
 
 	free(io_p);
 	free(bw_p);
+	free(bw_p_alt);
 	free(iops_p);
 
 	if (calc_lat(&ts->slat_stat[ddir], &min, &max, &mean, &dev))
-		display_lat("slat", min, max, mean, dev);
+		display_lat("slat", min, max, mean, dev, out);
 	if (calc_lat(&ts->clat_stat[ddir], &min, &max, &mean, &dev))
-		display_lat("clat", min, max, mean, dev);
+		display_lat("clat", min, max, mean, dev, out);
 	if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev))
-		display_lat(" lat", min, max, mean, dev);
+		display_lat(" lat", min, max, mean, dev, out);
 
 	if (ts->clat_percentiles) {
 		show_clat_percentiles(ts->io_u_plat[ddir],
 					ts->clat_stat[ddir].samples,
 					ts->percentile_list,
-					ts->percentile_precision);
+					ts->percentile_precision, out);
 	}
 	if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) {
 		double p_of_agg = 100.0, fkb_base = (double)rs->kb_base;
-		const char *bw_str = (rs->unit_base == 1 ? "Kbit" : "KB");
+		const char *bw_str;
+
+		if ((rs->unit_base == 1) && i2p)
+			bw_str = "Kibit";
+		else if (rs->unit_base == 1)
+			bw_str = "kbit";
+		else if (i2p)
+			bw_str = "KiB";
+		else
+			bw_str = "kB";
 
 		if (rs->unit_base == 1) {
 			min *= 8.0;
@@ -429,17 +464,16 @@
 			max /= fkb_base;
 			mean /= fkb_base;
 			dev /= fkb_base;
-			bw_str = (rs->unit_base == 1 ? "Mbit" : "MB");
+			bw_str = (rs->unit_base == 1 ? "Mibit" : "MiB");
 		}
 
-		log_info("    bw (%-4s/s): min=%5lu, max=%5lu, per=%3.2f%%,"
-			 " avg=%5.02f, stdev=%5.02f\n", bw_str, min, max,
-							p_of_agg, mean, dev);
+		log_buf(out, "   bw (%5s/s): min=%5lu, max=%5lu, per=%3.2f%%, avg=%5.02f, stdev=%5.02f\n",
+			bw_str, min, max, p_of_agg, mean, dev);
 	}
 }
 
 static int show_lat(double *io_u_lat, int nr, const char **ranges,
-		    const char *msg)
+		    const char *msg, struct buf_output *out)
 {
 	int new_line = 1, i, line = 0, shown = 0;
 
@@ -449,43 +483,43 @@
 		shown = 1;
 		if (new_line) {
 			if (line)
-				log_info("\n");
-			log_info("    lat (%s) : ", msg);
+				log_buf(out, "\n");
+			log_buf(out, "    lat (%s) : ", msg);
 			new_line = 0;
 			line = 0;
 		}
 		if (line)
-			log_info(", ");
-		log_info("%s%3.2f%%", ranges[i], io_u_lat[i]);
+			log_buf(out, ", ");
+		log_buf(out, "%s%3.2f%%", ranges[i], io_u_lat[i]);
 		line++;
 		if (line == 5)
 			new_line = 1;
 	}
 
 	if (shown)
-		log_info("\n");
+		log_buf(out, "\n");
 
 	return shown;
 }
 
-static void show_lat_u(double *io_u_lat_u)
+static void show_lat_u(double *io_u_lat_u, struct buf_output *out)
 {
 	const char *ranges[] = { "2=", "4=", "10=", "20=", "50=", "100=",
 				 "250=", "500=", "750=", "1000=", };
 
-	show_lat(io_u_lat_u, FIO_IO_U_LAT_U_NR, ranges, "usec");
+	show_lat(io_u_lat_u, FIO_IO_U_LAT_U_NR, ranges, "usec", out);
 }
 
-static void show_lat_m(double *io_u_lat_m)
+static void show_lat_m(double *io_u_lat_m, struct buf_output *out)
 {
 	const char *ranges[] = { "2=", "4=", "10=", "20=", "50=", "100=",
 				 "250=", "500=", "750=", "1000=", "2000=",
 				 ">=2000=", };
 
-	show_lat(io_u_lat_m, FIO_IO_U_LAT_M_NR, ranges, "msec");
+	show_lat(io_u_lat_m, FIO_IO_U_LAT_M_NR, ranges, "msec", out);
 }
 
-static void show_latencies(struct thread_stat *ts)
+static void show_latencies(struct thread_stat *ts, struct buf_output *out)
 {
 	double io_u_lat_u[FIO_IO_U_LAT_U_NR];
 	double io_u_lat_m[FIO_IO_U_LAT_M_NR];
@@ -493,12 +527,185 @@
 	stat_calc_lat_u(ts, io_u_lat_u);
 	stat_calc_lat_m(ts, io_u_lat_m);
 
-	show_lat_u(io_u_lat_u);
-	show_lat_m(io_u_lat_m);
+	show_lat_u(io_u_lat_u, out);
+	show_lat_m(io_u_lat_m, out);
+}
+
+static int block_state_category(int block_state)
+{
+	switch (block_state) {
+	case BLOCK_STATE_UNINIT:
+		return 0;
+	case BLOCK_STATE_TRIMMED:
+	case BLOCK_STATE_WRITTEN:
+		return 1;
+	case BLOCK_STATE_WRITE_FAILURE:
+	case BLOCK_STATE_TRIM_FAILURE:
+		return 2;
+	default:
+		/* Silence compile warning on some BSDs and have a return */
+		assert(0);
+		return -1;
+	}
+}
+
+static int compare_block_infos(const void *bs1, const void *bs2)
+{
+	uint32_t block1 = *(uint32_t *)bs1;
+	uint32_t block2 = *(uint32_t *)bs2;
+	int state1 = BLOCK_INFO_STATE(block1);
+	int state2 = BLOCK_INFO_STATE(block2);
+	int bscat1 = block_state_category(state1);
+	int bscat2 = block_state_category(state2);
+	int cycles1 = BLOCK_INFO_TRIMS(block1);
+	int cycles2 = BLOCK_INFO_TRIMS(block2);
+
+	if (bscat1 < bscat2)
+		return -1;
+	if (bscat1 > bscat2)
+		return 1;
+
+	if (cycles1 < cycles2)
+		return -1;
+	if (cycles1 > cycles2)
+		return 1;
+
+	if (state1 < state2)
+		return -1;
+	if (state1 > state2)
+		return 1;
+
+	assert(block1 == block2);
+	return 0;
+}
+
+static int calc_block_percentiles(int nr_block_infos, uint32_t *block_infos,
+				  fio_fp64_t *plist, unsigned int **percentiles,
+				  unsigned int *types)
+{
+	int len = 0;
+	int i, nr_uninit;
+
+	qsort(block_infos, nr_block_infos, sizeof(uint32_t), compare_block_infos);
+
+	while (len < FIO_IO_U_LIST_MAX_LEN && plist[len].u.f != 0.0)
+		len++;
+
+	if (!len)
+		return 0;
+
+	/*
+	 * Sort the percentile list. Note that it may already be sorted if
+	 * we are using the default values, but since it's a short list this
+	 * isn't a worry. Also note that this does not work for NaN values.
+	 */
+	if (len > 1)
+		qsort((void *)plist, len, sizeof(plist[0]), double_cmp);
+
+	nr_uninit = 0;
+	/* Start only after the uninit entries end */
+	for (nr_uninit = 0;
+	     nr_uninit < nr_block_infos
+		&& BLOCK_INFO_STATE(block_infos[nr_uninit]) == BLOCK_STATE_UNINIT;
+	     nr_uninit ++)
+		;
+
+	if (nr_uninit == nr_block_infos)
+		return 0;
+
+	*percentiles = calloc(len, sizeof(**percentiles));
+
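+	/* Pick the trim count at each percentile of the sorted non-uninit blocks */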
+	for (i = 0; i < len; i++) {
+		int idx = (plist[i].u.f * (nr_block_infos - nr_uninit) / 100)
+				+ nr_uninit;
+		(*percentiles)[i] = BLOCK_INFO_TRIMS(block_infos[idx]);
+	}
+
+	memset(types, 0, sizeof(*types) * BLOCK_STATE_COUNT);
+	for (i = 0; i < nr_block_infos; i++)
+		types[BLOCK_INFO_STATE(block_infos[i])]++;
+
+	return len;
+}
+
+static const char *block_state_names[] = {
+	[BLOCK_STATE_UNINIT] = "unwritten",
+	[BLOCK_STATE_TRIMMED] = "trimmed",
+	[BLOCK_STATE_WRITTEN] = "written",
+	[BLOCK_STATE_TRIM_FAILURE] = "trim failure",
+	[BLOCK_STATE_WRITE_FAILURE] = "write failure",
+};
+
+static void show_block_infos(int nr_block_infos, uint32_t *block_infos,
+			     fio_fp64_t *plist, struct buf_output *out)
+{
+	int len, pos, i;
+	unsigned int *percentiles = NULL;
+	unsigned int block_state_counts[BLOCK_STATE_COUNT];
+
+	len = calc_block_percentiles(nr_block_infos, block_infos, plist,
+				     &percentiles, block_state_counts);
+
+	log_buf(out, "  block lifetime percentiles :\n   |");
+	pos = 0;
+	for (i = 0; i < len; i++) {
+		uint32_t block_info = percentiles[i];
+#define LINE_LENGTH	75
+		char str[LINE_LENGTH];
+		int strln = snprintf(str, LINE_LENGTH, " %3.2fth=%u%c",
+				     plist[i].u.f, block_info,
+				     i == len - 1 ? '\n' : ',');
+		assert(strln < LINE_LENGTH);
+		if (pos + strln > LINE_LENGTH) {
+			pos = 0;
+			log_buf(out, "\n   |");
+		}
+		log_buf(out, "%s", str);
+		pos += strln;
+#undef LINE_LENGTH
+	}
+	if (percentiles)
+		free(percentiles);
+
+	log_buf(out, "        states               :");
+	for (i = 0; i < BLOCK_STATE_COUNT; i++)
+		log_buf(out, " %s=%u%c",
+			 block_state_names[i], block_state_counts[i],
+			 i == BLOCK_STATE_COUNT - 1 ? '\n' : ',');
+}
+
+static void show_ss_normal(struct thread_stat *ts, struct buf_output *out)
+{
+	char *p1, *p1alt, *p2;
+	unsigned long long bw_mean, iops_mean;
+	const int i2p = is_power_of_2(ts->kb_base);
+
+	if (!ts->ss_dur)
+		return;
+
+	bw_mean = steadystate_bw_mean(ts);
+	iops_mean = steadystate_iops_mean(ts);
+
+	p1 = num2str(bw_mean / ts->kb_base, 4, ts->kb_base, i2p, ts->unit_base);
+	p1alt = num2str(bw_mean / ts->kb_base, 4, ts->kb_base, !i2p, ts->unit_base);
+	p2 = num2str(iops_mean, 4, 1, 0, N2S_NONE);
+
+	log_buf(out, "  steadystate  : attained=%s, bw=%s (%s), iops=%s, %s%s=%.3f%s\n",
+		ts->ss_state & __FIO_SS_ATTAINED ? "yes" : "no",
+		p1, p1alt, p2,
+		ts->ss_state & __FIO_SS_IOPS ? "iops" : "bw",
+		ts->ss_state & __FIO_SS_SLOPE ? " slope": " mean dev",
+		ts->ss_criterion.u.f,
+		ts->ss_state & __FIO_SS_PCT ? "%" : "");
+
+	free(p1);
+	free(p1alt);
+	free(p2);
 }
 
 static void show_thread_status_normal(struct thread_stat *ts,
-				      struct group_run_stats *rs)
+				      struct group_run_stats *rs,
+				      struct buf_output *out)
 {
 	double usr_cpu, sys_cpu;
 	unsigned long runtime;
@@ -508,32 +715,34 @@
 
 	if (!ddir_rw_sum(ts->io_bytes) && !ddir_rw_sum(ts->total_io_u))
 		return;
+		
+
 
 	time(&time_p);
 	os_ctime_r((const time_t *) &time_p, time_buf, sizeof(time_buf));
 
 	if (!ts->error) {
-		log_info("%s: (groupid=%d, jobs=%d): err=%2d: pid=%d: %s",
+		log_buf(out, "%s: (groupid=%d, jobs=%d): err=%2d: pid=%d: %s",
 					ts->name, ts->groupid, ts->members,
 					ts->error, (int) ts->pid, time_buf);
 	} else {
-		log_info("%s: (groupid=%d, jobs=%d): err=%2d (%s): pid=%d: %s",
+		log_buf(out, "%s: (groupid=%d, jobs=%d): err=%2d (%s): pid=%d: %s",
 					ts->name, ts->groupid, ts->members,
 					ts->error, ts->verror, (int) ts->pid,
 					time_buf);
 	}
 
 	if (strlen(ts->description))
-		log_info("  Description  : [%s]\n", ts->description);
+		log_buf(out, "  Description  : [%s]\n", ts->description);
 
 	if (ts->io_bytes[DDIR_READ])
-		show_ddir_status(rs, ts, DDIR_READ);
+		show_ddir_status(rs, ts, DDIR_READ, out);
 	if (ts->io_bytes[DDIR_WRITE])
-		show_ddir_status(rs, ts, DDIR_WRITE);
+		show_ddir_status(rs, ts, DDIR_WRITE, out);
 	if (ts->io_bytes[DDIR_TRIM])
-		show_ddir_status(rs, ts, DDIR_TRIM);
+		show_ddir_status(rs, ts, DDIR_TRIM, out);
 
-	show_latencies(ts);
+	show_latencies(ts, out);
 
 	runtime = ts->total_run_time;
 	if (runtime) {
@@ -546,34 +755,34 @@
 		sys_cpu = 0;
 	}
 
-	log_info("  cpu          : usr=%3.2f%%, sys=%3.2f%%, ctx=%llu,"
+	log_buf(out, "  cpu          : usr=%3.2f%%, sys=%3.2f%%, ctx=%llu,"
 		 " majf=%llu, minf=%llu\n", usr_cpu, sys_cpu,
 			(unsigned long long) ts->ctx,
 			(unsigned long long) ts->majf,
 			(unsigned long long) ts->minf);
 
 	stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist);
-	log_info("  IO depths    : 1=%3.1f%%, 2=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%,"
+	log_buf(out, "  IO depths    : 1=%3.1f%%, 2=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%,"
 		 " 16=%3.1f%%, 32=%3.1f%%, >=64=%3.1f%%\n", io_u_dist[0],
 					io_u_dist[1], io_u_dist[2],
 					io_u_dist[3], io_u_dist[4],
 					io_u_dist[5], io_u_dist[6]);
 
 	stat_calc_dist(ts->io_u_submit, ts->total_submit, io_u_dist);
-	log_info("     submit    : 0=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%, 16=%3.1f%%,"
+	log_buf(out, "     submit    : 0=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%, 16=%3.1f%%,"
 		 " 32=%3.1f%%, 64=%3.1f%%, >=64=%3.1f%%\n", io_u_dist[0],
 					io_u_dist[1], io_u_dist[2],
 					io_u_dist[3], io_u_dist[4],
 					io_u_dist[5], io_u_dist[6]);
 	stat_calc_dist(ts->io_u_complete, ts->total_complete, io_u_dist);
-	log_info("     complete  : 0=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%, 16=%3.1f%%,"
+	log_buf(out, "     complete  : 0=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%, 16=%3.1f%%,"
 		 " 32=%3.1f%%, 64=%3.1f%%, >=64=%3.1f%%\n", io_u_dist[0],
 					io_u_dist[1], io_u_dist[2],
 					io_u_dist[3], io_u_dist[4],
 					io_u_dist[5], io_u_dist[6]);
-	log_info("     issued    : total=r=%llu/w=%llu/d=%llu,"
-				 " short=r=%llu/w=%llu/d=%llu,"
-				 " drop=r=%llu/w=%llu/d=%llu\n",
+	log_buf(out, "     issued rwt: total=%llu,%llu,%llu,"
+				 " short=%llu,%llu,%llu,"
+				 " dropped=%llu,%llu,%llu\n",
 					(unsigned long long) ts->total_io_u[0],
 					(unsigned long long) ts->total_io_u[1],
 					(unsigned long long) ts->total_io_u[2],
@@ -584,22 +793,30 @@
 					(unsigned long long) ts->drop_io_u[1],
 					(unsigned long long) ts->drop_io_u[2]);
 	if (ts->continue_on_error) {
-		log_info("     errors    : total=%llu, first_error=%d/<%s>\n",
+		log_buf(out, "     errors    : total=%llu, first_error=%d/<%s>\n",
 					(unsigned long long)ts->total_err_count,
 					ts->first_error,
 					strerror(ts->first_error));
 	}
 	if (ts->latency_depth) {
-		log_info("     latency   : target=%llu, window=%llu, percentile=%.2f%%, depth=%u\n",
+		log_buf(out, "     latency   : target=%llu, window=%llu, percentile=%.2f%%, depth=%u\n",
 					(unsigned long long)ts->latency_target,
 					(unsigned long long)ts->latency_window,
 					ts->latency_percentile.u.f,
 					ts->latency_depth);
 	}
+
+	if (ts->nr_block_infos)
+		show_block_infos(ts->nr_block_infos, ts->block_infos,
+				  ts->percentile_list, out);
+
+	if (ts->ss_dur)
+		show_ss_normal(ts, out);
 }
 
 static void show_ddir_status_terse(struct thread_stat *ts,
-				   struct group_run_stats *rs, int ddir)
+				   struct group_run_stats *rs, int ddir,
+				   struct buf_output *out)
 {
 	unsigned long min, max;
 	unsigned long long bw, iops;
@@ -614,23 +831,23 @@
 	if (ts->runtime[ddir]) {
 		uint64_t runt = ts->runtime[ddir];
 
-		bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024;
+		bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024; /* KiB/s */
 		iops = (1000 * (uint64_t) ts->total_io_u[ddir]) / runt;
 	}
 
-	log_info(";%llu;%llu;%llu;%llu",
+	log_buf(out, ";%llu;%llu;%llu;%llu",
 		(unsigned long long) ts->io_bytes[ddir] >> 10, bw, iops,
 					(unsigned long long) ts->runtime[ddir]);
 
 	if (calc_lat(&ts->slat_stat[ddir], &min, &max, &mean, &dev))
-		log_info(";%lu;%lu;%f;%f", min, max, mean, dev);
+		log_buf(out, ";%lu;%lu;%f;%f", min, max, mean, dev);
 	else
-		log_info(";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0);
+		log_buf(out, ";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0);
 
 	if (calc_lat(&ts->clat_stat[ddir], &min, &max, &mean, &dev))
-		log_info(";%lu;%lu;%f;%f", min, max, mean, dev);
+		log_buf(out, ";%lu;%lu;%f;%f", min, max, mean, dev);
 	else
-		log_info(";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0);
+		log_buf(out, ";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0);
 
 	if (ts->clat_percentiles) {
 		len = calc_clat_percentiles(ts->io_u_plat[ddir],
@@ -642,16 +859,16 @@
 
 	for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) {
 		if (i >= len) {
-			log_info(";0%%=0");
+			log_buf(out, ";0%%=0");
 			continue;
 		}
-		log_info(";%f%%=%u", ts->percentile_list[i].u.f, ovals[i]);
+		log_buf(out, ";%f%%=%u", ts->percentile_list[i].u.f, ovals[i]);
 	}
 
 	if (calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev))
-		log_info(";%lu;%lu;%f;%f", min, max, mean, dev);
+		log_buf(out, ";%lu;%lu;%f;%f", min, max, mean, dev);
 	else
-		log_info(";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0);
+		log_buf(out, ";%lu;%lu;%f;%f", 0UL, 0UL, 0.0, 0.0);
 
 	if (ovals)
 		free(ovals);
@@ -665,9 +882,9 @@
 				p_of_agg = 100.0;
 		}
 
-		log_info(";%lu;%lu;%f%%;%f;%f", min, max, p_of_agg, mean, dev);
+		log_buf(out, ";%lu;%lu;%f%%;%f;%f", min, max, p_of_agg, mean, dev);
 	} else
-		log_info(";%lu;%lu;%f%%;%f;%f", 0UL, 0UL, 0.0, 0.0, 0.0);
+		log_buf(out, ";%lu;%lu;%f%%;%f;%f", 0UL, 0UL, 0.0, 0.0, 0.0);
 }
 
 static void add_ddir_status_json(struct thread_stat *ts,
@@ -680,7 +897,7 @@
 	unsigned int len, minv, maxv;
 	int i;
 	const char *ddirname[] = {"read", "write", "trim"};
-	struct json_object *dir_object, *tmp_object, *percentile_object;
+	struct json_object *dir_object, *tmp_object, *percentile_object, *clat_bins_object;
 	char buf[120];
 	double p_of_agg = 100.0;
 
@@ -698,7 +915,7 @@
 	if (ts->runtime[ddir]) {
 		uint64_t runt = ts->runtime[ddir];
 
-		bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024;
+		bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024; /* KiB/s */
 		iops = (1000.0 * (uint64_t) ts->total_io_u[ddir]) / runt;
 	}
 
@@ -751,6 +968,17 @@
 		json_object_add_value_int(percentile_object, (const char *)buf, ovals[i]);
 	}
 
+	if (output_format & FIO_OUTPUT_JSON_PLUS) {
+		clat_bins_object = json_create_object();
+		json_object_add_value_object(tmp_object, "bins", clat_bins_object);
+		for (i = 0; i < FIO_IO_U_PLAT_NR; i++) {
+			if (ts->io_u_plat[ddir][i]) {
+				snprintf(buf, sizeof(buf), "%llu", plat_idx_to_val(i));
+				json_object_add_value_int(clat_bins_object, (const char *)buf, ts->io_u_plat[ddir][i]);
+			}
+		}
+	}
+
 	if (!calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev)) {
 		min = max = 0;
 		mean = dev = 0.0;
@@ -782,7 +1010,8 @@
 }
 
 static void show_thread_status_terse_v2(struct thread_stat *ts,
-					struct group_run_stats *rs)
+					struct group_run_stats *rs,
+					struct buf_output *out)
 {
 	double io_u_dist[FIO_IO_U_MAP_NR];
 	double io_u_lat_u[FIO_IO_U_LAT_U_NR];
@@ -791,13 +1020,13 @@
 	int i;
 
 	/* General Info */
-	log_info("2;%s;%d;%d", ts->name, ts->groupid, ts->error);
+	log_buf(out, "2;%s;%d;%d", ts->name, ts->groupid, ts->error);
 	/* Log Read Status */
-	show_ddir_status_terse(ts, rs, DDIR_READ);
+	show_ddir_status_terse(ts, rs, DDIR_READ, out);
 	/* Log Write Status */
-	show_ddir_status_terse(ts, rs, DDIR_WRITE);
+	show_ddir_status_terse(ts, rs, DDIR_WRITE, out);
 	/* Log Trim Status */
-	show_ddir_status_terse(ts, rs, DDIR_TRIM);
+	show_ddir_status_terse(ts, rs, DDIR_TRIM, out);
 
 	/* CPU Usage */
 	if (ts->total_run_time) {
@@ -810,7 +1039,7 @@
 		sys_cpu = 0;
 	}
 
-	log_info(";%f%%;%f%%;%llu;%llu;%llu", usr_cpu, sys_cpu,
+	log_buf(out, ";%f%%;%f%%;%llu;%llu;%llu", usr_cpu, sys_cpu,
 						(unsigned long long) ts->ctx,
 						(unsigned long long) ts->majf,
 						(unsigned long long) ts->minf);
@@ -821,30 +1050,31 @@
 	stat_calc_lat_m(ts, io_u_lat_m);
 
 	/* Only show fixed 7 I/O depth levels*/
-	log_info(";%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%",
+	log_buf(out, ";%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%",
 			io_u_dist[0], io_u_dist[1], io_u_dist[2], io_u_dist[3],
 			io_u_dist[4], io_u_dist[5], io_u_dist[6]);
 
 	/* Microsecond latency */
 	for (i = 0; i < FIO_IO_U_LAT_U_NR; i++)
-		log_info(";%3.2f%%", io_u_lat_u[i]);
+		log_buf(out, ";%3.2f%%", io_u_lat_u[i]);
 	/* Millisecond latency */
 	for (i = 0; i < FIO_IO_U_LAT_M_NR; i++)
-		log_info(";%3.2f%%", io_u_lat_m[i]);
+		log_buf(out, ";%3.2f%%", io_u_lat_m[i]);
 	/* Additional output if continue_on_error set - default off*/
 	if (ts->continue_on_error)
-		log_info(";%llu;%d", (unsigned long long) ts->total_err_count, ts->first_error);
-	log_info("\n");
+		log_buf(out, ";%llu;%d", (unsigned long long) ts->total_err_count, ts->first_error);
+	log_buf(out, "\n");
 
 	/* Additional output if description is set */
 	if (strlen(ts->description))
-		log_info(";%s", ts->description);
+		log_buf(out, ";%s", ts->description);
 
-	log_info("\n");
+	log_buf(out, "\n");
 }
 
 static void show_thread_status_terse_v3_v4(struct thread_stat *ts,
-					   struct group_run_stats *rs, int ver)
+					   struct group_run_stats *rs, int ver,
+					   struct buf_output *out)
 {
 	double io_u_dist[FIO_IO_U_MAP_NR];
 	double io_u_lat_u[FIO_IO_U_LAT_U_NR];
@@ -853,15 +1083,15 @@
 	int i;
 
 	/* General Info */
-	log_info("%d;%s;%s;%d;%d", ver, fio_version_string,
+	log_buf(out, "%d;%s;%s;%d;%d", ver, fio_version_string,
 					ts->name, ts->groupid, ts->error);
 	/* Log Read Status */
-	show_ddir_status_terse(ts, rs, DDIR_READ);
+	show_ddir_status_terse(ts, rs, DDIR_READ, out);
 	/* Log Write Status */
-	show_ddir_status_terse(ts, rs, DDIR_WRITE);
+	show_ddir_status_terse(ts, rs, DDIR_WRITE, out);
 	/* Log Trim Status */
 	if (ver == 4)
-		show_ddir_status_terse(ts, rs, DDIR_TRIM);
+		show_ddir_status_terse(ts, rs, DDIR_TRIM, out);
 
 	/* CPU Usage */
 	if (ts->total_run_time) {
@@ -874,7 +1104,7 @@
 		sys_cpu = 0;
 	}
 
-	log_info(";%f%%;%f%%;%llu;%llu;%llu", usr_cpu, sys_cpu,
+	log_buf(out, ";%f%%;%f%%;%llu;%llu;%llu", usr_cpu, sys_cpu,
 						(unsigned long long) ts->ctx,
 						(unsigned long long) ts->majf,
 						(unsigned long long) ts->minf);
@@ -885,46 +1115,84 @@
 	stat_calc_lat_m(ts, io_u_lat_m);
 
 	/* Only show fixed 7 I/O depth levels*/
-	log_info(";%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%",
+	log_buf(out, ";%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%;%3.1f%%",
 			io_u_dist[0], io_u_dist[1], io_u_dist[2], io_u_dist[3],
 			io_u_dist[4], io_u_dist[5], io_u_dist[6]);
 
 	/* Microsecond latency */
 	for (i = 0; i < FIO_IO_U_LAT_U_NR; i++)
-		log_info(";%3.2f%%", io_u_lat_u[i]);
+		log_buf(out, ";%3.2f%%", io_u_lat_u[i]);
 	/* Millisecond latency */
 	for (i = 0; i < FIO_IO_U_LAT_M_NR; i++)
-		log_info(";%3.2f%%", io_u_lat_m[i]);
+		log_buf(out, ";%3.2f%%", io_u_lat_m[i]);
 
 	/* disk util stats, if any */
-	show_disk_util(1, NULL);
+	show_disk_util(1, NULL, out);
 
 	/* Additional output if continue_on_error set - default off*/
 	if (ts->continue_on_error)
-		log_info(";%llu;%d", (unsigned long long) ts->total_err_count, ts->first_error);
+		log_buf(out, ";%llu;%d", (unsigned long long) ts->total_err_count, ts->first_error);
 
 	/* Additional output if description is set */
 	if (strlen(ts->description))
-		log_info(";%s", ts->description);
+		log_buf(out, ";%s", ts->description);
 
-	log_info("\n");
+	log_buf(out, "\n");
+}
+
+static void json_add_job_opts(struct json_object *root, const char *name,
+			      struct flist_head *opt_list, bool num_jobs)
+{
+	struct json_object *dir_object;
+	struct flist_head *entry;
+	struct print_option *p;
+
+	if (flist_empty(opt_list))
+		return;
+
+	dir_object = json_create_object();
+	json_object_add_value_object(root, name, dir_object);
+
+	flist_for_each(entry, opt_list) {
+		const char *pos = "";
+
+		p = flist_entry(entry, struct print_option, list);
+		if (!num_jobs && !strcmp(p->name, "numjobs"))
+			continue;
+		if (p->value)
+			pos = p->value;
+		json_object_add_value_string(dir_object, p->name, pos);
+	}
 }
 
 static struct json_object *show_thread_status_json(struct thread_stat *ts,
-				    struct group_run_stats *rs)
+						   struct group_run_stats *rs,
+						   struct flist_head *opt_list)
 {
 	struct json_object *root, *tmp;
+	struct jobs_eta *je;
 	double io_u_dist[FIO_IO_U_MAP_NR];
 	double io_u_lat_u[FIO_IO_U_LAT_U_NR];
 	double io_u_lat_m[FIO_IO_U_LAT_M_NR];
 	double usr_cpu, sys_cpu;
 	int i;
+	size_t size;
 
 	root = json_create_object();
 	json_object_add_value_string(root, "jobname", ts->name);
 	json_object_add_value_int(root, "groupid", ts->groupid);
 	json_object_add_value_int(root, "error", ts->error);
 
+	/* ETA Info */
+	je = get_jobs_eta(true, &size);
+	if (je) {
+		json_object_add_value_int(root, "eta", je->eta_sec);
+		json_object_add_value_int(root, "elapsed", je->elapsed_sec);
+	}
+
+	if (opt_list)
+		json_add_job_opts(root, "job options", opt_list, true);
+
 	add_ddir_status_json(ts, rs, DDIR_READ, root);
 	add_ddir_status_json(ts, rs, DDIR_WRITE, root);
 	add_ddir_status_json(ts, rs, DDIR_TRIM, root);
@@ -998,33 +1266,128 @@
 	if (strlen(ts->description))
 		json_object_add_value_string(root, "desc", ts->description);
 
+	if (ts->nr_block_infos) {
+		/* Block error histogram and types */
+		int len;
+		unsigned int *percentiles = NULL;
+		unsigned int block_state_counts[BLOCK_STATE_COUNT];
+
+		len = calc_block_percentiles(ts->nr_block_infos, ts->block_infos,
+					     ts->percentile_list,
+					     &percentiles, block_state_counts);
+
+		if (len) {
+			struct json_object *block, *percentile_object, *states;
+			int state;
+			block = json_create_object();
+			json_object_add_value_object(root, "block", block);
+
+			percentile_object = json_create_object();
+			json_object_add_value_object(block, "percentiles",
+						     percentile_object);
+			for (i = 0; i < len; i++) {
+				char buf[20];
+				snprintf(buf, sizeof(buf), "%f",
+					 ts->percentile_list[i].u.f);
+				json_object_add_value_int(percentile_object,
+							  (const char *)buf,
+							  percentiles[i]);
+			}
+
+			states = json_create_object();
+			json_object_add_value_object(block, "states", states);
+			for (state = 0; state < BLOCK_STATE_COUNT; state++) {
+				json_object_add_value_int(states,
+					block_state_names[state],
+					block_state_counts[state]);
+			}
+			free(percentiles);
+		}
+	}
+
+	if (ts->ss_dur) {
+		struct json_object *data;
+		struct json_array *iops, *bw;
+		int i, j, k;
+		char ss_buf[64];
+
+		snprintf(ss_buf, sizeof(ss_buf), "%s%s:%f%s",
+			ts->ss_state & __FIO_SS_IOPS ? "iops" : "bw",
+			ts->ss_state & __FIO_SS_SLOPE ? "_slope" : "",
+			(float) ts->ss_limit.u.f,
+			ts->ss_state & __FIO_SS_PCT ? "%" : "");
+
+		tmp = json_create_object();
+		json_object_add_value_object(root, "steadystate", tmp);
+		json_object_add_value_string(tmp, "ss", ss_buf);
+		json_object_add_value_int(tmp, "duration", (int)ts->ss_dur);
+		json_object_add_value_int(tmp, "attained", (ts->ss_state & __FIO_SS_ATTAINED) > 0);
+
+		snprintf(ss_buf, sizeof(ss_buf), "%f%s", (float) ts->ss_criterion.u.f,
+			ts->ss_state & __FIO_SS_PCT ? "%" : "");
+		json_object_add_value_string(tmp, "criterion", ss_buf);
+		json_object_add_value_float(tmp, "max_deviation", ts->ss_deviation.u.f);
+		json_object_add_value_float(tmp, "slope", ts->ss_slope.u.f);
+
+		data = json_create_object();
+		json_object_add_value_object(tmp, "data", data);
+		bw = json_create_array();
+		iops = json_create_array();
+
+		/*
+		 * If ss was attained or the buffer is not full,
+		 * ss->head points to the first element in the list;
+		 * otherwise it actually points to the second element
+		 * in the list.
+		 */
+		if ((ts->ss_state & __FIO_SS_ATTAINED) || !(ts->ss_state & __FIO_SS_BUFFER_FULL))
+			j = ts->ss_head;
+		else
+			j = ts->ss_head == 0 ? ts->ss_dur - 1 : ts->ss_head - 1;
+		for (i = 0; i < ts->ss_dur; i++) {
+			k = (j + i) % ts->ss_dur;
+			json_array_add_value_int(bw, ts->ss_bw_data[k]);
+			json_array_add_value_int(iops, ts->ss_iops_data[k]);
+		}
+		json_object_add_value_int(data, "bw_mean", steadystate_bw_mean(ts));
+		json_object_add_value_int(data, "iops_mean", steadystate_iops_mean(ts));
+		json_object_add_value_array(data, "iops", iops);
+		json_object_add_value_array(data, "bw", bw);
+	}
+
 	return root;
 }
 
 static void show_thread_status_terse(struct thread_stat *ts,
-				     struct group_run_stats *rs)
+				     struct group_run_stats *rs,
+				     struct buf_output *out)
 {
 	if (terse_version == 2)
-		show_thread_status_terse_v2(ts, rs);
+		show_thread_status_terse_v2(ts, rs, out);
 	else if (terse_version == 3 || terse_version == 4)
-		show_thread_status_terse_v3_v4(ts, rs, terse_version);
+		show_thread_status_terse_v3_v4(ts, rs, terse_version, out);
 	else
 		log_err("fio: bad terse version!? %d\n", terse_version);
 }
 
 struct json_object *show_thread_status(struct thread_stat *ts,
-				       struct group_run_stats *rs)
+				       struct group_run_stats *rs,
+				       struct flist_head *opt_list,
+				       struct buf_output *out)
 {
-	if (output_format == FIO_OUTPUT_TERSE)
-		show_thread_status_terse(ts, rs);
-	else if (output_format == FIO_OUTPUT_JSON)
-		return show_thread_status_json(ts, rs);
-	else
-		show_thread_status_normal(ts, rs);
-	return NULL;
+	struct json_object *ret = NULL;
+
+	if (output_format & FIO_OUTPUT_TERSE)
+		show_thread_status_terse(ts, rs, out);
+	if (output_format & FIO_OUTPUT_JSON)
+		ret = show_thread_status_json(ts, rs, opt_list);
+	if (output_format & FIO_OUTPUT_NORMAL)
+		show_thread_status_normal(ts, rs, out);
+
+	return ret;
 }
 
-static void sum_stat(struct io_stat *dst, struct io_stat *src, int nr)
+static void sum_stat(struct io_stat *dst, struct io_stat *src, bool first)
 {
 	double mean, S;
 
@@ -1039,7 +1402,7 @@
 	 * <http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
 	 *  #Parallel_algorithm>
 	 */
-	if (nr == 1) {
+	if (first) {
 		mean = src->mean.u.f;
 		S = src->S.u.f;
 	} else {
@@ -1073,7 +1436,7 @@
 		if (dst->min_bw[i] && dst->min_bw[i] > src->min_bw[i])
 			dst->min_bw[i] = src->min_bw[i];
 
-		dst->io_kb[i] += src->io_kb[i];
+		dst->iobytes[i] += src->iobytes[i];
 		dst->agg[i] += src->agg[i];
 	}
 
@@ -1083,31 +1446,38 @@
 		dst->unit_base = src->unit_base;
 }
 
-void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, int nr)
+void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src,
+		      bool first)
 {
 	int l, k;
 
 	for (l = 0; l < DDIR_RWDIR_CNT; l++) {
 		if (!dst->unified_rw_rep) {
-			sum_stat(&dst->clat_stat[l], &src->clat_stat[l], nr);
-			sum_stat(&dst->slat_stat[l], &src->slat_stat[l], nr);
-			sum_stat(&dst->lat_stat[l], &src->lat_stat[l], nr);
-			sum_stat(&dst->bw_stat[l], &src->bw_stat[l], nr);
+			sum_stat(&dst->clat_stat[l], &src->clat_stat[l], first);
+			sum_stat(&dst->slat_stat[l], &src->slat_stat[l], first);
+			sum_stat(&dst->lat_stat[l], &src->lat_stat[l], first);
+			sum_stat(&dst->bw_stat[l], &src->bw_stat[l], first);
 
 			dst->io_bytes[l] += src->io_bytes[l];
 
 			if (dst->runtime[l] < src->runtime[l])
 				dst->runtime[l] = src->runtime[l];
 		} else {
-			sum_stat(&dst->clat_stat[0], &src->clat_stat[l], nr);
-			sum_stat(&dst->slat_stat[0], &src->slat_stat[l], nr);
-			sum_stat(&dst->lat_stat[0], &src->lat_stat[l], nr);
-			sum_stat(&dst->bw_stat[0], &src->bw_stat[l], nr);
+			sum_stat(&dst->clat_stat[0], &src->clat_stat[l], first);
+			sum_stat(&dst->slat_stat[0], &src->slat_stat[l], first);
+			sum_stat(&dst->lat_stat[0], &src->lat_stat[l], first);
+			sum_stat(&dst->bw_stat[0], &src->bw_stat[l], first);
 
 			dst->io_bytes[0] += src->io_bytes[l];
 
 			if (dst->runtime[0] < src->runtime[l])
 				dst->runtime[0] = src->runtime[l];
+
+			/*
+			 * We're summing to the same destination, so override
+			 * 'first' after the first iteration of the loop
+			 */
+			first = false;
 		}
 	}
 
@@ -1144,14 +1514,10 @@
 		int m;
 
 		for (m = 0; m < FIO_IO_U_PLAT_NR; m++) {
-			/* HACK to prevent bus error in arm GCC 4.9 */
-			dst->io_u_plat[k][m]+=1;
 			if (!dst->unified_rw_rep)
 				dst->io_u_plat[k][m] += src->io_u_plat[k][m];
 			else
 				dst->io_u_plat[0][m] += src->io_u_plat[k][m];
-			/* HACK to prevent bus error in arm GCC 4.9 */
-			dst->io_u_plat[k][m]-=1;
 		}
 	}
 
@@ -1189,11 +1555,14 @@
 	struct group_run_stats *runstats, *rs;
 	struct thread_data *td;
 	struct thread_stat *threadstats, *ts;
-	int i, j, nr_ts, last_ts, idx;
+	int i, j, k, nr_ts, last_ts, idx;
 	int kb_base_warned = 0;
 	int unit_base_warned = 0;
 	struct json_object *root = NULL;
 	struct json_array *array = NULL;
+	struct buf_output output[FIO_OUTPUT_NR];
+	struct flist_head **opt_lists;
+
 	runstats = malloc(sizeof(struct group_run_stats) * (groupid + 1));
 
 	for (i = 0; i < groupid + 1; i++)
@@ -1212,20 +1581,27 @@
 		}
 		if (last_ts == td->groupid)
 			continue;
+		if (!td->o.stats)
+			continue;
 
 		last_ts = td->groupid;
 		nr_ts++;
 	}
 
 	threadstats = malloc(nr_ts * sizeof(struct thread_stat));
+	opt_lists = malloc(nr_ts * sizeof(struct flist_head *));
 
-	for (i = 0; i < nr_ts; i++)
+	for (i = 0; i < nr_ts; i++) {
 		init_thread_stat(&threadstats[i]);
+		opt_lists[i] = NULL;
+	}
 
 	j = 0;
 	last_ts = -1;
 	idx = 0;
 	for_each_td(td, i) {
+		if (!td->o.stats)
+			continue;
 		if (idx && (!td->o.group_reporting ||
 		    (td->o.group_reporting && last_ts != td->groupid))) {
 			idx = 0;
@@ -1239,6 +1615,7 @@
 		ts->clat_percentiles = td->o.clat_percentiles;
 		ts->percentile_precision = td->o.percentile_precision;
 		memcpy(ts->percentile_list, td->o.percentile_list, sizeof(td->o.percentile_list));
+		opt_lists[j] = &td->opt_list;
 
 		idx++;
 		ts->members++;
@@ -1300,13 +1677,33 @@
 		ts->latency_percentile = td->o.latency_percentile;
 		ts->latency_window = td->o.latency_window;
 
-		sum_thread_stats(ts, &td->ts, idx);
+		ts->nr_block_infos = td->ts.nr_block_infos;
+		for (k = 0; k < ts->nr_block_infos; k++)
+			ts->block_infos[k] = td->ts.block_infos[k];
+
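+		/* idx == 1 means this is the first job folded into this group's stats */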
+		sum_thread_stats(ts, &td->ts, idx == 1);
+
+		if (td->o.ss_dur) {
+			ts->ss_state = td->ss.state;
+			ts->ss_dur = td->ss.dur;
+			ts->ss_head = td->ss.head;
+			ts->ss_bw_data = td->ss.bw_data;
+			ts->ss_iops_data = td->ss.iops_data;
+			ts->ss_limit.u.f = td->ss.limit;
+			ts->ss_slope.u.f = td->ss.slope;
+			ts->ss_deviation.u.f = td->ss.deviation;
+			ts->ss_criterion.u.f = td->ss.criterion;
+		} else
+			ts->ss_dur = ts->ss_state = 0;
 	}
 
 	for (i = 0; i < nr_ts; i++) {
 		unsigned long long bw;
 
 		ts = &threadstats[i];
+		if (ts->groupid == -1)
+			continue;
 		rs = &runstats[ts->groupid];
 		rs->kb_base = ts->kb_base;
 		rs->unit_base = ts->unit_base;
@@ -1321,19 +1718,14 @@
 				rs->max_run[j] = ts->runtime[j];
 
 			bw = 0;
-			if (ts->runtime[j]) {
-				unsigned long runt = ts->runtime[j];
-				unsigned long long kb;
-
-				kb = ts->io_bytes[j] / rs->kb_base;
-				bw = kb * 1000 / runt;
-			}
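+			/* runtime is in msec, so this yields bytes/sec */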
+			if (ts->runtime[j])
+				bw = ts->io_bytes[j] * 1000 / ts->runtime[j];
 			if (bw < rs->min_bw[j])
 				rs->min_bw[j] = bw;
 			if (bw > rs->max_bw[j])
 				rs->max_bw[j] = bw;
 
-			rs->io_kb[j] += ts->io_bytes[j] / rs->kb_base;
+			rs->iobytes[j] += ts->io_bytes[j];
 		}
 	}
 
@@ -1344,55 +1736,74 @@
 
 		for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
 			if (rs->max_run[ddir])
-				rs->agg[ddir] = (rs->io_kb[ddir] * 1000) /
+				rs->agg[ddir] = (rs->iobytes[ddir] * 1000) /
 						rs->max_run[ddir];
 		}
 	}
 
+	for (i = 0; i < FIO_OUTPUT_NR; i++)
+		buf_output_init(&output[i]);
+
 	/*
 	 * don't overwrite last signal output
 	 */
-	if (output_format == FIO_OUTPUT_NORMAL)
-		log_info("\n");
-	else if (output_format == FIO_OUTPUT_JSON) {
+	if (output_format & FIO_OUTPUT_NORMAL)
+		log_buf(&output[__FIO_OUTPUT_NORMAL], "\n");
+	if (output_format & FIO_OUTPUT_JSON) {
+		struct thread_data *global;
 		char time_buf[32];
-		time_t time_p;
+		struct timeval now;
+		unsigned long long ms_since_epoch;
 
-		time(&time_p);
-		os_ctime_r((const time_t *) &time_p, time_buf,
+		gettimeofday(&now, NULL);
+		ms_since_epoch = (unsigned long long)(now.tv_sec) * 1000 +
+		                 (unsigned long long)(now.tv_usec) / 1000;
+
+		os_ctime_r((const time_t *) &now.tv_sec, time_buf,
 				sizeof(time_buf));
-		time_buf[strlen(time_buf) - 1] = '\0';
+		if (time_buf[strlen(time_buf) - 1] == '\n')
+			time_buf[strlen(time_buf) - 1] = '\0';
 
 		root = json_create_object();
 		json_object_add_value_string(root, "fio version", fio_version_string);
-		json_object_add_value_int(root, "timestamp", time_p);
+		json_object_add_value_int(root, "timestamp", now.tv_sec);
+		json_object_add_value_int(root, "timestamp_ms", ms_since_epoch);
 		json_object_add_value_string(root, "time", time_buf);
+		global = get_global_options();
+		json_add_job_opts(root, "global options", &global->opt_list, false);
 		array = json_create_array();
 		json_object_add_value_array(root, "jobs", array);
 	}
 
+	if (is_backend)
+		fio_server_send_job_options(&get_global_options()->opt_list, -1U);
+
 	for (i = 0; i < nr_ts; i++) {
 		ts = &threadstats[i];
 		rs = &runstats[ts->groupid];
 
-		if (is_backend)
+		if (is_backend) {
+			fio_server_send_job_options(opt_lists[i], i);
 			fio_server_send_ts(ts, rs);
-		else if (output_format == FIO_OUTPUT_TERSE)
-			show_thread_status_terse(ts, rs);
-		else if (output_format == FIO_OUTPUT_JSON) {
-			struct json_object *tmp = show_thread_status_json(ts, rs);
-			json_array_add_value_object(array, tmp);
-		} else
-			show_thread_status_normal(ts, rs);
+		} else {
+			if (output_format & FIO_OUTPUT_TERSE)
+				show_thread_status_terse(ts, rs, &output[__FIO_OUTPUT_TERSE]);
+			if (output_format & FIO_OUTPUT_JSON) {
+				struct json_object *tmp = show_thread_status_json(ts, rs, opt_lists[i]);
+				json_array_add_value_object(array, tmp);
+			}
+			if (output_format & FIO_OUTPUT_NORMAL)
+				show_thread_status_normal(ts, rs, &output[__FIO_OUTPUT_NORMAL]);
+		}
 	}
-	if (output_format == FIO_OUTPUT_JSON) {
+	if (!is_backend && (output_format & FIO_OUTPUT_JSON)) {
 		/* disk util stats, if any */
-		show_disk_util(1, root);
+		show_disk_util(1, root, &output[__FIO_OUTPUT_JSON]);
 
-		show_idle_prof_stats(FIO_OUTPUT_JSON, root);
+		show_idle_prof_stats(FIO_OUTPUT_JSON, root, &output[__FIO_OUTPUT_JSON]);
 
-		json_print_object(root);
-		log_info("\n");
+		json_print_object(root, &output[__FIO_OUTPUT_JSON]);
+		log_buf(&output[__FIO_OUTPUT_JSON], "\n");
 		json_free_object(root);
 	}
 
@@ -1402,30 +1813,26 @@
 		rs->groupid = i;
 		if (is_backend)
 			fio_server_send_gs(rs);
-		else if (output_format == FIO_OUTPUT_NORMAL)
-			show_group_stats(rs);
+		else if (output_format & FIO_OUTPUT_NORMAL)
+			show_group_stats(rs, &output[__FIO_OUTPUT_NORMAL]);
 	}
 
 	if (is_backend)
 		fio_server_send_du();
-	else if (output_format == FIO_OUTPUT_NORMAL) {
-		show_disk_util(0, NULL);
-		show_idle_prof_stats(FIO_OUTPUT_NORMAL, NULL);
+	else if (output_format & FIO_OUTPUT_NORMAL) {
+		show_disk_util(0, NULL, &output[__FIO_OUTPUT_NORMAL]);
+		show_idle_prof_stats(FIO_OUTPUT_NORMAL, NULL, &output[__FIO_OUTPUT_NORMAL]);
 	}
 
-	if ( !(output_format == FIO_OUTPUT_TERSE) && append_terse_output) {
-		log_info("\nAdditional Terse Output:\n");
-
-		for (i = 0; i < nr_ts; i++) {
-			ts = &threadstats[i];
-			rs = &runstats[ts->groupid];
-			show_thread_status_terse(ts, rs);
-		}
+	for (i = 0; i < FIO_OUTPUT_NR; i++) {
+		buf_output_flush(&output[i]);
+		buf_output_free(&output[i]);
 	}
 
 	log_info_flush();
 	free(runstats);
 	free(threadstats);
+	free(opt_lists);
 }
 
 void show_run_stats(void)
@@ -1448,19 +1855,19 @@
 	fio_gettime(&tv, NULL);
 
 	for_each_td(td, i) {
-		rt[i] = mtime_since(&td->start, &tv);
-		if (td_read(td) && td->io_bytes[DDIR_READ])
-			td->ts.runtime[DDIR_READ] += rt[i];
-		if (td_write(td) && td->io_bytes[DDIR_WRITE])
-			td->ts.runtime[DDIR_WRITE] += rt[i];
-		if (td_trim(td) && td->io_bytes[DDIR_TRIM])
-			td->ts.runtime[DDIR_TRIM] += rt[i];
-
 		td->update_rusage = 1;
 		td->ts.io_bytes[DDIR_READ] = td->io_bytes[DDIR_READ];
 		td->ts.io_bytes[DDIR_WRITE] = td->io_bytes[DDIR_WRITE];
 		td->ts.io_bytes[DDIR_TRIM] = td->io_bytes[DDIR_TRIM];
 		td->ts.total_run_time = mtime_since(&td->epoch, &tv);
+
+		rt[i] = mtime_since(&td->start, &tv);
+		if (td_read(td) && td->ts.io_bytes[DDIR_READ])
+			td->ts.runtime[DDIR_READ] += rt[i];
+		if (td_write(td) && td->ts.io_bytes[DDIR_WRITE])
+			td->ts.runtime[DDIR_WRITE] += rt[i];
+		if (td_trim(td) && td->ts.io_bytes[DDIR_TRIM])
+			td->ts.runtime[DDIR_TRIM] += rt[i];
 	}
 
 	for_each_td(td, i) {
@@ -1476,11 +1883,11 @@
 	__show_run_stats();
 
 	for_each_td(td, i) {
-		if (td_read(td) && td->io_bytes[DDIR_READ])
+		if (td_read(td) && td->ts.io_bytes[DDIR_READ])
 			td->ts.runtime[DDIR_READ] -= rt[i];
-		if (td_write(td) && td->io_bytes[DDIR_WRITE])
+		if (td_write(td) && td->ts.io_bytes[DDIR_WRITE])
 			td->ts.runtime[DDIR_WRITE] -= rt[i];
-		if (td_trim(td) && td->io_bytes[DDIR_TRIM])
+		if (td_trim(td) && td->ts.io_bytes[DDIR_TRIM])
 			td->ts.runtime[DDIR_TRIM] -= rt[i];
 	}
 
@@ -1564,58 +1971,184 @@
 	is->samples++;
 }
 
-static void __add_log_sample(struct io_log *iolog, unsigned long val,
-			     enum fio_ddir ddir, unsigned int bs,
-			     unsigned long t, uint64_t offset)
+/*
+ * Return a struct io_logs, which is added to the tail of the log
+ * list for 'iolog'.
+ */
+static struct io_logs *get_new_log(struct io_log *iolog)
 {
-	uint64_t nr_samples = iolog->nr_samples;
-	struct io_sample *s;
+	size_t new_size, new_samples;
+	struct io_logs *cur_log;
 
-	if (iolog->disabled)
-		return;
+	/*
+	 * Cap the size at MAX_LOG_ENTRIES, so we don't keep doubling
+	 * forever
+	 */
+	if (!iolog->cur_log_max)
+		new_samples = DEF_LOG_ENTRIES;
+	else {
+		new_samples = iolog->cur_log_max * 2;
+		if (new_samples > MAX_LOG_ENTRIES)
+			new_samples = MAX_LOG_ENTRIES;
+	}
 
-	if (!iolog->nr_samples)
-		iolog->avg_last = t;
+	new_size = new_samples * log_entry_sz(iolog);
 
-	if (iolog->nr_samples == iolog->max_samples) {
-		size_t new_size;
-		void *new_log;
+	cur_log = smalloc(sizeof(*cur_log));
+	if (cur_log) {
+		INIT_FLIST_HEAD(&cur_log->list);
+		cur_log->log = malloc(new_size);
+		if (cur_log->log) {
+			cur_log->nr_samples = 0;
+			cur_log->max_samples = new_samples;
+			flist_add_tail(&cur_log->list, &iolog->io_logs);
+			iolog->cur_log_max = new_samples;
+			return cur_log;
+		}
+		sfree(cur_log);
+	}
 
-		new_size = 2 * iolog->max_samples * log_entry_sz(iolog);
+	return NULL;
+}
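
For reference, a standalone sketch of the capped-doubling policy get_new_log() implements above. DEF_LOG_ENTRIES and MAX_LOG_ENTRIES are defined elsewhere in fio; the 1024 and 1M sample counts below are placeholder assumptions, not the project's actual values.

    #include <stdio.h>
    #include <stddef.h>

    #define SKETCH_DEF_ENTRIES	1024UL
    #define SKETCH_MAX_ENTRIES	(1024UL * 1024UL)

    /* size of the next log chunk: start small, double, never exceed the cap */
    static size_t next_chunk_samples(size_t cur_log_max)
    {
    	size_t new_samples;

    	if (!cur_log_max)
    		return SKETCH_DEF_ENTRIES;

    	new_samples = cur_log_max * 2;
    	if (new_samples > SKETCH_MAX_ENTRIES)
    		new_samples = SKETCH_MAX_ENTRIES;
    	return new_samples;
    }

    int main(void)
    {
    	size_t cur = 0;
    	int i;

    	/* chunk sizes: 1024, 2048, 4096, ... capped at 1M samples */
    	for (i = 0; i < 15; i++) {
    		cur = next_chunk_samples(cur);
    		printf("chunk %d: %zu samples\n", i, cur);
    	}
    	return 0;
    }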
 
-		if (iolog->log_gz && (new_size > iolog->log_gz)) {
-			if (iolog_flush(iolog, 0)) {
-				log_err("fio: failed flushing iolog! Will stop logging.\n");
-				iolog->disabled = 1;
-				return;
-			}
-			nr_samples = iolog->nr_samples;
-		} else {
-			new_log = realloc(iolog->log, new_size);
-			if (!new_log) {
-				log_err("fio: failed extending iolog! Will stop logging.\n");
-				iolog->disabled = 1;
-				return;
-			}
-			iolog->log = new_log;
-			iolog->max_samples <<= 1;
+/*
+ * Add and return a new log chunk, or return current log if big enough
+ */
+static struct io_logs *regrow_log(struct io_log *iolog)
+{
+	struct io_logs *cur_log;
+	int i;
+
+	if (!iolog || iolog->disabled)
+		goto disable;
+
+	cur_log = iolog_cur_log(iolog);
+	if (!cur_log) {
+		cur_log = get_new_log(iolog);
+		if (!cur_log)
+			return NULL;
+	}
+
+	if (cur_log->nr_samples < cur_log->max_samples)
+		return cur_log;
+
+	/*
+	 * No room for a new sample. If we're compressing on the fly, flush
+	 * out the current chunk
+	 */
+	if (iolog->log_gz) {
+		if (iolog_cur_flush(iolog, cur_log)) {
+			log_err("fio: failed flushing iolog! Will stop logging.\n");
+			return NULL;
 		}
 	}
 
-	s = get_sample(iolog, nr_samples);
-
-	s->val = val;
-	s->time = t;
-	io_sample_set_ddir(iolog, s, ddir);
-	s->bs = bs;
-
-	if (iolog->log_offset) {
-		struct io_sample_offset *so = (void *) s;
-
-		so->offset = offset;
+	/*
+	 * Get a new log array, and add to our list
+	 */
+	cur_log = get_new_log(iolog);
+	if (!cur_log) {
+		log_err("fio: failed extending iolog! Will stop logging.\n");
+		return NULL;
 	}
 
-	iolog->nr_samples++;
+	if (!iolog->pending || !iolog->pending->nr_samples)
+		return cur_log;
+
+	/*
+	 * Flush pending items to new log
+	 */
+	for (i = 0; i < iolog->pending->nr_samples; i++) {
+		struct io_sample *src, *dst;
+
+		src = get_sample(iolog, iolog->pending, i);
+		dst = get_sample(iolog, cur_log, i);
+		memcpy(dst, src, log_entry_sz(iolog));
+	}
+	cur_log->nr_samples = iolog->pending->nr_samples;
+
+	iolog->pending->nr_samples = 0;
+	return cur_log;
+disable:
+	if (iolog)
+		iolog->disabled = true;
+	return NULL;
+}
+
+void regrow_logs(struct thread_data *td)
+{
+	regrow_log(td->slat_log);
+	regrow_log(td->clat_log);
+	regrow_log(td->clat_hist_log);
+	regrow_log(td->lat_log);
+	regrow_log(td->bw_log);
+	regrow_log(td->iops_log);
+	td->flags &= ~TD_F_REGROW_LOGS;
+}
+
+static struct io_logs *get_cur_log(struct io_log *iolog)
+{
+	struct io_logs *cur_log;
+
+	cur_log = iolog_cur_log(iolog);
+	if (!cur_log) {
+		cur_log = get_new_log(iolog);
+		if (!cur_log)
+			return NULL;
+	}
+
+	if (cur_log->nr_samples < cur_log->max_samples)
+		return cur_log;
+
+	/*
+	 * Out of space. If we're in IO offload mode, or we're not doing
+	 * per unit logging (hence logging happens outside of the IO thread
+	 * as well), add a new log chunk inline. If we're doing inline
+	 * submissions, flag 'td' as needing a log regrow and we'll take
+	 * care of it on the submission side.
+	 */
+	if (iolog->td->o.io_submit_mode == IO_MODE_OFFLOAD ||
+	    !per_unit_log(iolog))
+		return regrow_log(iolog);
+
+	iolog->td->flags |= TD_F_REGROW_LOGS;
+	assert(iolog->pending->nr_samples < iolog->pending->max_samples);
+	return iolog->pending;
+}
+
+static void __add_log_sample(struct io_log *iolog, union io_sample_data data,
+			     enum fio_ddir ddir, unsigned int bs,
+			     unsigned long t, uint64_t offset)
+{
+	struct io_logs *cur_log;
+
+	if (iolog->disabled)
+		return;
+	if (flist_empty(&iolog->io_logs))
+		iolog->avg_last = t;
+
+	cur_log = get_cur_log(iolog);
+	if (cur_log) {
+		struct io_sample *s;
+
+		s = get_sample(iolog, cur_log, cur_log->nr_samples);
+
+		s->data = data;
+		s->time = t + (iolog->td ? iolog->td->unix_epoch : 0);
+		io_sample_set_ddir(iolog, s, ddir);
+		s->bs = bs;
+
+		if (iolog->log_offset) {
+			struct io_sample_offset *so = (void *) s;
+
+			so->offset = offset;
+		}
+
+		cur_log->nr_samples++;
+		return;
+	}
+
+	iolog->disabled = true;
 }
 
 static inline void reset_io_stat(struct io_stat *ios)
@@ -1638,6 +2171,9 @@
 
 		ts->io_bytes[i] = 0;
 		ts->runtime[i] = 0;
+		ts->total_io_u[i] = 0;
+		ts->short_io_u[i] = 0;
+		ts->drop_io_u[i] = 0;
 
 		for (j = 0; j < FIO_IO_U_PLAT_NR; j++)
 			ts->io_u_plat[i][j] = 0;
@@ -1647,58 +2183,56 @@
 		ts->io_u_map[i] = 0;
 		ts->io_u_submit[i] = 0;
 		ts->io_u_complete[i] = 0;
-		ts->io_u_lat_u[i] = 0;
-		ts->io_u_lat_m[i] = 0;
-		ts->total_submit = 0;
-		ts->total_complete = 0;
 	}
 
-	for (i = 0; i < 3; i++) {
-		ts->total_io_u[i] = 0;
-		ts->short_io_u[i] = 0;
-		ts->drop_io_u[i] = 0;
-	}
+	for (i = 0; i < FIO_IO_U_LAT_U_NR; i++)
+		ts->io_u_lat_u[i] = 0;
+	for (i = 0; i < FIO_IO_U_LAT_M_NR; i++)
+		ts->io_u_lat_m[i] = 0;
+
+	ts->total_submit = 0;
+	ts->total_complete = 0;
 }
 
-static void _add_stat_to_log(struct io_log *iolog, unsigned long elapsed)
+static void __add_stat_to_log(struct io_log *iolog, enum fio_ddir ddir,
+			      unsigned long elapsed, bool log_max)
 {
 	/*
 	 * Note an entry in the log. Use the mean from the logged samples,
 	 * making sure to properly round up. Only write a log entry if we
 	 * had actual samples done.
 	 */
-	if (iolog->avg_window[DDIR_READ].samples) {
-		unsigned long mr;
+	if (iolog->avg_window[ddir].samples) {
+		union io_sample_data data;
 
-		mr = iolog->avg_window[DDIR_READ].mean.u.f + 0.50;
-		__add_log_sample(iolog, mr, DDIR_READ, 0, elapsed, 0);
-	}
-	if (iolog->avg_window[DDIR_WRITE].samples) {
-		unsigned long mw;
+		if (log_max)
+			data.val = iolog->avg_window[ddir].max_val;
+		else
+			data.val = iolog->avg_window[ddir].mean.u.f + 0.50;
 
-		mw = iolog->avg_window[DDIR_WRITE].mean.u.f + 0.50;
-		__add_log_sample(iolog, mw, DDIR_WRITE, 0, elapsed, 0);
-	}
-	if (iolog->avg_window[DDIR_TRIM].samples) {
-		unsigned long mw;
-
-		mw = iolog->avg_window[DDIR_TRIM].mean.u.f + 0.50;
-		__add_log_sample(iolog, mw, DDIR_TRIM, 0, elapsed, 0);
+		__add_log_sample(iolog, data, ddir, 0, elapsed, 0);
 	}
 
-	reset_io_stat(&iolog->avg_window[DDIR_READ]);
-	reset_io_stat(&iolog->avg_window[DDIR_WRITE]);
-	reset_io_stat(&iolog->avg_window[DDIR_TRIM]);
+	reset_io_stat(&iolog->avg_window[ddir]);
 }
 
-static void add_log_sample(struct thread_data *td, struct io_log *iolog,
-			   unsigned long val, enum fio_ddir ddir,
+static void _add_stat_to_log(struct io_log *iolog, unsigned long elapsed,
+			     bool log_max)
+{
+	int ddir;
+
+	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++)
+		__add_stat_to_log(iolog, ddir, elapsed, log_max);
+}
+
+static long add_log_sample(struct thread_data *td, struct io_log *iolog,
+			   union io_sample_data data, enum fio_ddir ddir,
 			   unsigned int bs, uint64_t offset)
 {
 	unsigned long elapsed, this_window;
 
 	if (!ddir_rw(ddir))
-		return;
+		return 0;
 
 	elapsed = mtime_since_now(&td->epoch);
 
@@ -1706,48 +2240,55 @@
 	 * If no time averaging, just add the log sample.
 	 */
 	if (!iolog->avg_msec) {
-		__add_log_sample(iolog, val, ddir, bs, elapsed, offset);
-		return;
+		__add_log_sample(iolog, data, ddir, bs, elapsed, offset);
+		return 0;
 	}
 
 	/*
 	 * Add the sample. If the time period has passed, then
 	 * add that entry to the log and clear.
 	 */
-	add_stat_sample(&iolog->avg_window[ddir], val);
+	add_stat_sample(&iolog->avg_window[ddir], data.val);
 
 	/*
 	 * If period hasn't passed, adding the above sample is all we
 	 * need to do.
 	 */
 	this_window = elapsed - iolog->avg_last;
-	if (this_window < iolog->avg_msec)
-		return;
+	if (elapsed < iolog->avg_last)
+		return iolog->avg_last - elapsed;
+	else if (this_window < iolog->avg_msec) {
+		int diff = iolog->avg_msec - this_window;
 
-	_add_stat_to_log(iolog, elapsed);
+		if (inline_log(iolog) || diff > LOG_MSEC_SLACK)
+			return diff;
+	}
 
-	iolog->avg_last = elapsed;
+	_add_stat_to_log(iolog, elapsed, td->o.log_max != 0);
+
+	iolog->avg_last = elapsed - (this_window - iolog->avg_msec);
+	return iolog->avg_msec;
 }
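
The window bookkeeping at the end of add_log_sample() subtracts the overshoot past avg_msec, so each averaging window still starts on an avg_msec boundary instead of drifting later with every sample. A small sketch of just that realignment, with assumed arrival times:

    #include <stdio.h>

    int main(void)
    {
    	unsigned long avg_msec = 500;
    	unsigned long avg_last = 0;
    	/* assumed sample arrival times in msec */
    	unsigned long arrivals[] = { 120, 480, 530, 990, 1130 };
    	int i;

    	for (i = 0; i < 5; i++) {
    		unsigned long elapsed = arrivals[i];
    		unsigned long this_window = elapsed - avg_last;

    		if (this_window < avg_msec) {
    			printf("t=%lu: window open, %lu ms left\n",
    			       elapsed, avg_msec - this_window);
    			continue;
    		}
    		/* emit the averaged entry here, then realign the window start */
    		avg_last = elapsed - (this_window - avg_msec);
    		printf("t=%lu: emit entry, next window starts at %lu\n",
    		       elapsed, avg_last);
    	}
    	return 0;
    }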
 
-void finalize_logs(struct thread_data *td)
+void finalize_logs(struct thread_data *td, bool unit_logs)
 {
 	unsigned long elapsed;
 
 	elapsed = mtime_since_now(&td->epoch);
 
-	if (td->clat_log)
-		_add_stat_to_log(td->clat_log, elapsed);
-	if (td->slat_log)
-		_add_stat_to_log(td->slat_log, elapsed);
-	if (td->lat_log)
-		_add_stat_to_log(td->lat_log, elapsed);
-	if (td->bw_log)
-		_add_stat_to_log(td->bw_log, elapsed);
-	if (td->iops_log)
-		_add_stat_to_log(td->iops_log, elapsed);
+	if (td->clat_log && unit_logs)
+		_add_stat_to_log(td->clat_log, elapsed, td->o.log_max != 0);
+	if (td->slat_log && unit_logs)
+		_add_stat_to_log(td->slat_log, elapsed, td->o.log_max != 0);
+	if (td->lat_log && unit_logs)
+		_add_stat_to_log(td->lat_log, elapsed, td->o.log_max != 0);
+	if (td->bw_log && (unit_logs == per_unit_log(td->bw_log)))
+		_add_stat_to_log(td->bw_log, elapsed, td->o.log_max != 0);
+	if (td->iops_log && (unit_logs == per_unit_log(td->iops_log)))
+		_add_stat_to_log(td->iops_log, elapsed, td->o.log_max != 0);
 }
 
-void add_agg_sample(unsigned long val, enum fio_ddir ddir, unsigned int bs)
+void add_agg_sample(union io_sample_data data, enum fio_ddir ddir, unsigned int bs)
 {
 	struct io_log *iolog;
 
@@ -1755,7 +2296,7 @@
 		return;
 
 	iolog = agg_io_log[ddir];
-	__add_log_sample(iolog, val, ddir, bs, mtime_since_genesis(), 0);
+	__add_log_sample(iolog, data, ddir, bs, mtime_since_genesis(), 0);
 }
 
 static void add_clat_percentile_sample(struct thread_stat *ts,
@@ -1770,18 +2311,60 @@
 void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
 		     unsigned long usec, unsigned int bs, uint64_t offset)
 {
+	unsigned long elapsed, this_window;
 	struct thread_stat *ts = &td->ts;
+	struct io_log *iolog = td->clat_hist_log;
 
-	if (!ddir_rw(ddir))
-		return;
+	td_io_u_lock(td);
 
 	add_stat_sample(&ts->clat_stat[ddir], usec);
 
 	if (td->clat_log)
-		add_log_sample(td, td->clat_log, usec, ddir, bs, offset);
+		add_log_sample(td, td->clat_log, sample_val(usec), ddir, bs,
+			       offset);
 
 	if (ts->clat_percentiles)
 		add_clat_percentile_sample(ts, usec, ddir);
+
+	if (iolog && iolog->hist_msec) {
+		struct io_hist *hw = &iolog->hist_window[ddir];
+
+		hw->samples++;
+		elapsed = mtime_since_now(&td->epoch);
+		if (!hw->hist_last)
+			hw->hist_last = elapsed;
+		this_window = elapsed - hw->hist_last;
+
+		if (this_window >= iolog->hist_msec) {
+			unsigned int *io_u_plat;
+			struct io_u_plat_entry *dst;
+
+			/*
+			 * Make a byte-for-byte copy of the latency histogram
+			 * stored in td->ts.io_u_plat[ddir], recording it in a
+			 * log sample. Note that the matching call to free() is
+			 * located in iolog.c after printing this sample to the
+			 * log file.
+			 */
+			io_u_plat = (unsigned int *) td->ts.io_u_plat[ddir];
+			dst = malloc(sizeof(struct io_u_plat_entry));
+			memcpy(&(dst->io_u_plat), io_u_plat,
+				FIO_IO_U_PLAT_NR * sizeof(unsigned int));
+			flist_add(&dst->list, &hw->list);
+			__add_log_sample(iolog, sample_plat(dst), ddir, bs,
+						elapsed, offset);
+
+			/*
+			 * Update the last time we recorded as being now, minus
+			 * any drift in time we encountered before actually
+			 * making the record.
+			 */
+			hw->hist_last = elapsed - (this_window - iolog->hist_msec);
+			hw->samples = 0;
+		}
+	}
+
+	td_io_u_unlock(td);
 }
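
A rough sketch of the per-window histogram snapshot done above: copy the live bucket counters into a fresh heap node so the logging side can print and free it later without racing the hot path. The names, the plain linked list, and the 16-bucket size are assumptions for illustration, not fio's actual layout.

    #include <stdlib.h>
    #include <string.h>

    #define SKETCH_PLAT_NR	16

    struct sketch_plat_entry {
    	struct sketch_plat_entry *next;
    	unsigned int io_u_plat[SKETCH_PLAT_NR];
    };

    /* byte-for-byte copy of the live counters, prepended to the list */
    static struct sketch_plat_entry *
    snapshot_plat(const unsigned int *live, struct sketch_plat_entry **head)
    {
    	struct sketch_plat_entry *dst = malloc(sizeof(*dst));

    	if (!dst)
    		return NULL;
    	memcpy(dst->io_u_plat, live, sizeof(dst->io_u_plat));
    	dst->next = *head;
    	*head = dst;
    	return dst;
    }

    int main(void)
    {
    	unsigned int live[SKETCH_PLAT_NR] = { 3, 1, 4, 1, 5 };
    	struct sketch_plat_entry *head = NULL;

    	snapshot_plat(live, &head);	/* one entry per elapsed hist window */
    	free(head);
    	return 0;
    }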
 
 void add_slat_sample(struct thread_data *td, enum fio_ddir ddir,
@@ -1792,10 +2375,14 @@
 	if (!ddir_rw(ddir))
 		return;
 
+	td_io_u_lock(td);
+
 	add_stat_sample(&ts->slat_stat[ddir], usec);
 
 	if (td->slat_log)
-		add_log_sample(td, td->slat_log, usec, ddir, bs, offset);
+		add_log_sample(td, td->slat_log, sample_val(usec), ddir, bs, offset);
+
+	td_io_u_unlock(td);
 }
 
 void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
@@ -1806,88 +2393,169 @@
 	if (!ddir_rw(ddir))
 		return;
 
+	td_io_u_lock(td);
+
 	add_stat_sample(&ts->lat_stat[ddir], usec);
 
 	if (td->lat_log)
-		add_log_sample(td, td->lat_log, usec, ddir, bs, offset);
+		add_log_sample(td, td->lat_log, sample_val(usec), ddir, bs,
+			       offset);
+
+	td_io_u_unlock(td);
 }
 
-void add_bw_sample(struct thread_data *td, enum fio_ddir ddir, unsigned int bs,
-		   struct timeval *t)
+void add_bw_sample(struct thread_data *td, struct io_u *io_u,
+		   unsigned int bytes, unsigned long spent)
 {
 	struct thread_stat *ts = &td->ts;
+	unsigned long rate;
+
+	if (spent)
+		rate = bytes * 1000 / spent;
+	else
+		rate = 0;
+
+	td_io_u_lock(td);
+
+	add_stat_sample(&ts->bw_stat[io_u->ddir], rate);
+
+	if (td->bw_log)
+		add_log_sample(td, td->bw_log, sample_val(rate), io_u->ddir,
+			       bytes, io_u->offset);
+
+	td->stat_io_bytes[io_u->ddir] = td->this_io_bytes[io_u->ddir];
+	td_io_u_unlock(td);
+}
+
+static int __add_samples(struct thread_data *td, struct timeval *parent_tv,
+			 struct timeval *t, unsigned int avg_time,
+			 uint64_t *this_io_bytes, uint64_t *stat_io_bytes,
+			 struct io_stat *stat, struct io_log *log,
+			 bool is_kb)
+{
 	unsigned long spent, rate;
+	enum fio_ddir ddir;
+	unsigned int next, next_log;
 
-	if (!ddir_rw(ddir))
-		return;
+	next_log = avg_time;
 
-	spent = mtime_since(&td->bw_sample_time, t);
-	if (spent < td->o.bw_avg_time)
-		return;
+	spent = mtime_since(parent_tv, t);
+	if (spent < avg_time && avg_time - spent >= LOG_MSEC_SLACK)
+		return avg_time - spent;
+
+	td_io_u_lock(td);
 
 	/*
 	 * Compute both read and write rates for the interval.
 	 */
-	for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
+	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
 		uint64_t delta;
 
-		delta = td->this_io_bytes[ddir] - td->stat_io_bytes[ddir];
+		delta = this_io_bytes[ddir] - stat_io_bytes[ddir];
 		if (!delta)
 			continue; /* No entries for interval */
 
-		if (spent)
-			rate = delta * 1000 / spent / 1024;
-		else
+		if (spent) {
+			if (is_kb)
+				rate = delta * 1000 / spent / 1024; /* KiB/s */
+			else
+				rate = (delta * 1000) / spent;
+		} else
 			rate = 0;
 
-		add_stat_sample(&ts->bw_stat[ddir], rate);
+		add_stat_sample(&stat[ddir], rate);
 
-		if (td->bw_log)
-			add_log_sample(td, td->bw_log, rate, ddir, bs, 0);
+		if (log) {
+			unsigned int bs = 0;
 
-		td->stat_io_bytes[ddir] = td->this_io_bytes[ddir];
+			if (td->o.min_bs[ddir] == td->o.max_bs[ddir])
+				bs = td->o.min_bs[ddir];
+
+			next = add_log_sample(td, log, sample_val(rate), ddir, bs, 0);
+			next_log = min(next_log, next);
+		}
+
+		stat_io_bytes[ddir] = this_io_bytes[ddir];
 	}
 
-	fio_gettime(&td->bw_sample_time, NULL);
+	timeval_add_msec(parent_tv, avg_time);
+
+	td_io_u_unlock(td);
+
+	if (spent <= avg_time)
+		next = avg_time;
+	else
+		next = avg_time - (1 + spent - avg_time);
+
+	return min(next, next_log);
 }
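
The two rate forms computed by __add_samples(), as a standalone sketch with assumed numbers: bandwidth logs (is_kb true) divide the byte delta by 1024 to get KiB/s, while IOPS logs use the block delta directly.

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
    	uint64_t delta_bytes = 52428800;	/* 50 MiB moved ... */
    	uint64_t delta_blocks = 12800;		/* ... as 12800 4 KiB IOs */
    	unsigned long spent_msec = 500;		/* over half a second */

    	uint64_t kib_per_sec = delta_bytes * 1000 / spent_msec / 1024;
    	uint64_t iops = delta_blocks * 1000 / spent_msec;

    	/* prints: 102400 KiB/s, 25600 IOPS */
    	printf("%llu KiB/s, %llu IOPS\n",
    	       (unsigned long long) kib_per_sec,
    	       (unsigned long long) iops);
    	return 0;
    }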
 
-void add_iops_sample(struct thread_data *td, enum fio_ddir ddir, unsigned int bs,
-		     struct timeval *t)
+static int add_bw_samples(struct thread_data *td, struct timeval *t)
+{
+	return __add_samples(td, &td->bw_sample_time, t, td->o.bw_avg_time,
+				td->this_io_bytes, td->stat_io_bytes,
+				td->ts.bw_stat, td->bw_log, true);
+}
+
+void add_iops_sample(struct thread_data *td, struct io_u *io_u,
+		     unsigned int bytes)
 {
 	struct thread_stat *ts = &td->ts;
-	unsigned long spent, iops;
 
-	if (!ddir_rw(ddir))
-		return;
+	td_io_u_lock(td);
 
-	spent = mtime_since(&td->iops_sample_time, t);
-	if (spent < td->o.iops_avg_time)
-		return;
+	add_stat_sample(&ts->iops_stat[io_u->ddir], 1);
 
-	/*
-	 * Compute both read and write rates for the interval.
-	 */
-	for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
-		uint64_t delta;
+	if (td->iops_log)
+		add_log_sample(td, td->iops_log, sample_val(1), io_u->ddir,
+			       bytes, io_u->offset);
 
-		delta = td->this_io_blocks[ddir] - td->stat_io_blocks[ddir];
-		if (!delta)
-			continue; /* No entries for interval */
+	td->stat_io_blocks[io_u->ddir] = td->this_io_blocks[io_u->ddir];
+	td_io_u_unlock(td);
+}
 
-		if (spent)
-			iops = (delta * 1000) / spent;
-		else
-			iops = 0;
+static int add_iops_samples(struct thread_data *td, struct timeval *t)
+{
+	return __add_samples(td, &td->iops_sample_time, t, td->o.iops_avg_time,
+				td->this_io_blocks, td->stat_io_blocks,
+				td->ts.iops_stat, td->iops_log, false);
+}
 
-		add_stat_sample(&ts->iops_stat[ddir], iops);
+/*
+ * Returns msecs to next event
+ */
+int calc_log_samples(void)
+{
+	struct thread_data *td;
+	unsigned int next = ~0U, tmp;
+	struct timeval now;
+	int i;
 
-		if (td->iops_log)
-			add_log_sample(td, td->iops_log, iops, ddir, bs, 0);
+	fio_gettime(&now, NULL);
 
-		td->stat_io_blocks[ddir] = td->this_io_blocks[ddir];
+	for_each_td(td, i) {
+		if (!td->o.stats)
+			continue;
+		if (in_ramp_time(td) ||
+		    !(td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING)) {
+			next = min(td->o.iops_avg_time, td->o.bw_avg_time);
+			continue;
+		}
+		if (!td->bw_log ||
+			(td->bw_log && !per_unit_log(td->bw_log))) {
+			tmp = add_bw_samples(td, &now);
+			if (tmp < next)
+				next = tmp;
+		}
+		if (!td->iops_log ||
+			(td->iops_log && !per_unit_log(td->iops_log))) {
+			tmp = add_iops_samples(td, &now);
+			if (tmp < next)
+				next = tmp;
+		}
 	}
 
-	fio_gettime(&td->iops_sample_time, NULL);
+	return next == ~0U ? 0 : next;
 }
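
A sketch of the scheduling idea behind calc_log_samples(): each log source reports how many msecs remain until it next needs service, and the helper thread sleeps for the minimum of those. The deadlines below are assumed values, purely to show the reduction.

    #include <stdio.h>

    int main(void)
    {
    	unsigned int deadlines[] = { 250, 80, 500 };	/* msec per source */
    	unsigned int next = ~0U;
    	int i;

    	for (i = 0; i < 3; i++)
    		if (deadlines[i] < next)
    			next = deadlines[i];

    	printf("sleep %u msec\n", next == ~0U ? 0 : next);	/* 80 */
    	return 0;
    }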
 
 void stat_init(void)
@@ -1910,6 +2578,16 @@
  */
 void show_running_run_stats(void)
 {
-	helper_do_stat = 1;
-	pthread_cond_signal(&helper_cond);
+	helper_do_stat();
+}
+
+uint32_t *io_u_block_info(struct thread_data *td, struct io_u *io_u)
+{
+	/* Ignore io_u's which span multiple blocks--they will just get
+	 * inaccurate counts. */
+	int idx = (io_u->offset - io_u->file->file_offset)
+			/ td->o.bs[DDIR_TRIM];
+	uint32_t *info = &td->ts.block_infos[idx];
+	assert(idx < td->ts.nr_block_infos);
+	return info;
 }
diff --git a/stat.h b/stat.h
index 8b4416c..aa4ad80 100644
--- a/stat.h
+++ b/stat.h
@@ -2,11 +2,12 @@
 #define FIO_STAT_H
 
 #include "iolog.h"
+#include "lib/output_buffer.h"
 
 struct group_run_stats {
 	uint64_t max_run[DDIR_RWDIR_CNT], min_run[DDIR_RWDIR_CNT];
 	uint64_t max_bw[DDIR_RWDIR_CNT], min_bw[DDIR_RWDIR_CNT];
-	uint64_t io_kb[DDIR_RWDIR_CNT];
+	uint64_t iobytes[DDIR_RWDIR_CNT];
 	uint64_t agg[DDIR_RWDIR_CNT];
 	uint32_t kb_base;
 	uint32_t unit_base;
@@ -112,6 +113,28 @@
 #define FIO_IO_U_LIST_MAX_LEN 20 /* The size of the default and user-specified
 					list of percentiles */
 
+/*
+ * Trim cycle count measurements
+ */
+#define MAX_NR_BLOCK_INFOS	8192
+#define BLOCK_INFO_STATE_SHIFT	29
+#define BLOCK_INFO_TRIMS(block_info)	\
+	((block_info) & ((1 << BLOCK_INFO_STATE_SHIFT) - 1))
+#define BLOCK_INFO_STATE(block_info)		\
+	((block_info) >> BLOCK_INFO_STATE_SHIFT)
+#define BLOCK_INFO(state, trim_cycles)	\
+	((trim_cycles) | ((unsigned int) (state) << BLOCK_INFO_STATE_SHIFT))
+#define BLOCK_INFO_SET_STATE(block_info, state)	\
+	BLOCK_INFO(state, BLOCK_INFO_TRIMS(block_info))
+enum block_info_state {
+	BLOCK_STATE_UNINIT,
+	BLOCK_STATE_TRIMMED,
+	BLOCK_STATE_WRITTEN,
+	BLOCK_STATE_TRIM_FAILURE,
+	BLOCK_STATE_WRITE_FAILURE,
+	BLOCK_STATE_COUNT,
+};
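
A sketch of the block_info packing the macros above define: the trim-cycle count lives in the low 29 bits and the block state in the top 3 bits of a single 32-bit word, so both fit in block_infos[] without a struct. The SK_ names are local stand-ins; only the layout mirrors the header.

    #include <stdio.h>
    #include <stdint.h>

    #define SK_STATE_SHIFT	29
    #define SK_TRIMS(bi)	((bi) & ((1u << SK_STATE_SHIFT) - 1))
    #define SK_STATE(bi)	((bi) >> SK_STATE_SHIFT)
    #define SK_PACK(state, trims) \
    	((trims) | ((uint32_t) (state) << SK_STATE_SHIFT))

    int main(void)
    {
    	uint32_t bi = SK_PACK(2 /* e.g. BLOCK_STATE_WRITTEN */, 17);

    	printf("state=%u trims=%u\n", SK_STATE(bi), SK_TRIMS(bi));
    	/* bump the trim count, keep the state */
    	bi = SK_PACK(SK_STATE(bi), SK_TRIMS(bi) + 1);
    	printf("state=%u trims=%u\n", SK_STATE(bi), SK_TRIMS(bi));
    	return 0;
    }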
+
 #define MAX_PATTERN_SIZE	512
 #define FIO_JOBNAME_SIZE	128
 #define FIO_JOBDESC_SIZE	256
@@ -160,9 +183,9 @@
 	uint32_t io_u_plat[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR];
 	uint32_t pad;
 
-	uint64_t total_io_u[3];
-	uint64_t short_io_u[3];
-	uint64_t drop_io_u[3];
+	uint64_t total_io_u[DDIR_RWDIR_CNT];
+	uint64_t short_io_u[DDIR_RWDIR_CNT];
+	uint64_t drop_io_u[DDIR_RWDIR_CNT];
 	uint64_t total_submit;
 	uint64_t total_complete;
 
@@ -175,18 +198,41 @@
 	 */
 	union {
 		uint16_t continue_on_error;
-		uint64_t pad2;
+		uint32_t pad2;
 	};
-	uint64_t total_err_count;
 	uint32_t first_error;
+	uint64_t total_err_count;
+
+	uint64_t nr_block_infos;
+	uint32_t block_infos[MAX_NR_BLOCK_INFOS];
 
 	uint32_t kb_base;
 	uint32_t unit_base;
 
 	uint32_t latency_depth;
+	uint32_t pad3;
 	uint64_t latency_target;
 	fio_fp64_t latency_percentile;
 	uint64_t latency_window;
+
+	uint64_t ss_dur;
+	uint32_t ss_state;
+	uint32_t ss_head;
+
+	fio_fp64_t ss_limit;
+	fio_fp64_t ss_slope;
+	fio_fp64_t ss_deviation;
+	fio_fp64_t ss_criterion;
+
+	union {
+		uint64_t *ss_iops_data;
+		uint64_t pad4;
+	};
+
+	union {
+		uint64_t *ss_bw_data;
+		uint64_t pad5;
+	};
 } __attribute__((packed));
 
 struct jobs_eta {
@@ -198,9 +244,9 @@
 
 	uint32_t files_open;
 
-	uint32_t m_rate[DDIR_RWDIR_CNT], t_rate[DDIR_RWDIR_CNT];
+	uint64_t m_rate[DDIR_RWDIR_CNT], t_rate[DDIR_RWDIR_CNT];
 	uint32_t m_iops[DDIR_RWDIR_CNT], t_iops[DDIR_RWDIR_CNT];
-	uint32_t rate[DDIR_RWDIR_CNT];
+	uint64_t rate[DDIR_RWDIR_CNT];
 	uint32_t iops[DDIR_RWDIR_CNT];
 	uint64_t elapsed_sec;
 	uint64_t eta_sec;
@@ -214,46 +260,69 @@
 	uint8_t run_str[];
 } __attribute__((packed));
 
+struct io_u_plat_entry {
+	struct flist_head list;
+	unsigned int io_u_plat[FIO_IO_U_PLAT_NR];
+};
+
 extern struct fio_mutex *stat_mutex;
 
-extern struct jobs_eta *get_jobs_eta(int force, size_t *size);
+extern struct jobs_eta *get_jobs_eta(bool force, size_t *size);
 
 extern void stat_init(void);
 extern void stat_exit(void);
 
-extern struct json_object * show_thread_status(struct thread_stat *ts, struct group_run_stats *rs);
-extern void show_group_stats(struct group_run_stats *rs);
-extern int calc_thread_status(struct jobs_eta *je, int force);
+extern struct json_object * show_thread_status(struct thread_stat *ts, struct group_run_stats *rs, struct flist_head *, struct buf_output *);
+extern void show_group_stats(struct group_run_stats *rs, struct buf_output *);
+extern bool calc_thread_status(struct jobs_eta *je, int force);
 extern void display_thread_status(struct jobs_eta *je);
 extern void show_run_stats(void);
 extern void __show_run_stats(void);
 extern void __show_running_run_stats(void);
 extern void show_running_run_stats(void);
 extern void check_for_running_stats(void);
-extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, int nr);
+extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, bool first);
 extern void sum_group_stats(struct group_run_stats *dst, struct group_run_stats *src);
 extern void init_thread_stat(struct thread_stat *ts);
 extern void init_group_run_stat(struct group_run_stats *gs);
 extern void eta_to_str(char *str, unsigned long eta_sec);
-extern int calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max, double *mean, double *dev);
+extern bool calc_lat(struct io_stat *is, unsigned long *min, unsigned long *max, double *mean, double *dev);
 extern unsigned int calc_clat_percentiles(unsigned int *io_u_plat, unsigned long nr, fio_fp64_t *plist, unsigned int **output, unsigned int *maxv, unsigned int *minv);
 extern void stat_calc_lat_m(struct thread_stat *ts, double *io_u_lat);
 extern void stat_calc_lat_u(struct thread_stat *ts, double *io_u_lat);
 extern void stat_calc_dist(unsigned int *map, unsigned long total, double *io_u_dist);
 extern void reset_io_stats(struct thread_data *);
+extern void update_rusage_stat(struct thread_data *);
+extern void clear_rusage_stat(struct thread_data *);
 
-static inline int usec_to_msec(unsigned long *min, unsigned long *max,
-			       double *mean, double *dev)
+extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long,
+				unsigned int, uint64_t);
+extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long,
+				unsigned int, uint64_t);
+extern void add_slat_sample(struct thread_data *, enum fio_ddir, unsigned long,
+				unsigned int, uint64_t);
+extern void add_agg_sample(union io_sample_data, enum fio_ddir, unsigned int);
+extern void add_iops_sample(struct thread_data *, struct io_u *,
+				unsigned int);
+extern void add_bw_sample(struct thread_data *, struct io_u *,
+				unsigned int, unsigned long);
+extern int calc_log_samples(void);
+
+extern struct io_log *agg_io_log[DDIR_RWDIR_CNT];
+extern int write_bw_log;
+
+static inline bool usec_to_msec(unsigned long *min, unsigned long *max,
+				double *mean, double *dev)
 {
 	if (*min > 1000 && *max > 1000 && *mean > 1000.0 && *dev > 1000.0) {
 		*min /= 1000;
 		*max /= 1000;
 		*mean /= 1000.0;
 		*dev /= 1000.0;
-		return 0;
+		return true;
 	}
 
-	return 1;
+	return false;
 }
 /*
  * Worst level condensing would be 1:5, so allow enough room for that
@@ -261,4 +330,6 @@
 #define __THREAD_RUNSTR_SZ(nr)	((nr) * 5)
 #define THREAD_RUNSTR_SZ	__THREAD_RUNSTR_SZ(thread_number)
 
+uint32_t *io_u_block_info(struct thread_data *td, struct io_u *io_u);
+
 #endif
diff --git a/steadystate.c b/steadystate.c
new file mode 100644
index 0000000..98f027c
--- /dev/null
+++ b/steadystate.c
@@ -0,0 +1,363 @@
+#include <stdlib.h>
+
+#include "fio.h"
+#include "steadystate.h"
+#include "helper_thread.h"
+
+bool steadystate_enabled = false;
+
+static void steadystate_alloc(struct thread_data *td)
+{
+	td->ss.bw_data = calloc(td->ss.dur, sizeof(uint64_t));
+	td->ss.iops_data = calloc(td->ss.dur, sizeof(uint64_t));
+
+	td->ss.state |= __FIO_SS_DATA;
+}
+
+void steadystate_setup(void)
+{
+	int i, prev_groupid;
+	struct thread_data *td, *prev_td;
+
+	if (!steadystate_enabled)
+		return;
+
+	/*
+	 * if group reporting is enabled, identify the last td
+	 * for each group and use it for storing steady state
+	 * data
+	 */
+	prev_groupid = -1;
+	prev_td = NULL;
+	for_each_td(td, i) {
+		if (!td->ss.dur)
+			continue;
+
+		if (!td->o.group_reporting) {
+			steadystate_alloc(td);
+			continue;
+		}
+
+		if (prev_groupid != td->groupid) {
+			if (prev_td != NULL) {
+				steadystate_alloc(prev_td);
+			}
+			prev_groupid = td->groupid;
+		}
+		prev_td = td;
+	}
+
+	if (prev_td != NULL && prev_td->o.group_reporting) {
+		steadystate_alloc(prev_td);
+	}
+}
+
+static bool steadystate_slope(uint64_t iops, uint64_t bw,
+			      struct thread_data *td)
+{
+	int i, j;
+	double result;
+	struct steadystate_data *ss = &td->ss;
+	uint64_t new_val;
+
+	ss->bw_data[ss->tail] = bw;
+	ss->iops_data[ss->tail] = iops;
+
+	if (ss->state & __FIO_SS_IOPS)
+		new_val = iops;
+	else
+		new_val = bw;
+
+	if (ss->state & __FIO_SS_BUFFER_FULL || ss->tail - ss->head == ss->dur - 1) {
+		if (!(ss->state & __FIO_SS_BUFFER_FULL)) {
+			/* first time through */
+			for(i = 0, ss->sum_y = 0; i < ss->dur; i++) {
+				if (ss->state & __FIO_SS_IOPS)
+					ss->sum_y += ss->iops_data[i];
+				else
+					ss->sum_y += ss->bw_data[i];
+				j = (ss->head + i) % ss->dur;
+				if (ss->state & __FIO_SS_IOPS)
+					ss->sum_xy += i * ss->iops_data[j];
+				else
+					ss->sum_xy += i * ss->bw_data[j];
+			}
+			ss->state |= __FIO_SS_BUFFER_FULL;
+		} else {		/* easy to update the sums */
+			ss->sum_y -= ss->oldest_y;
+			ss->sum_y += new_val;
+			ss->sum_xy = ss->sum_xy - ss->sum_y + ss->dur * new_val;
+		}
+
+		if (ss->state & __FIO_SS_IOPS)
+			ss->oldest_y = ss->iops_data[ss->head];
+		else
+			ss->oldest_y = ss->bw_data[ss->head];
+
+		/*
+		 * calculate slope as (sum_xy - sum_x * sum_y / n) / (sum_(x^2)
+		 * - (sum_x)^2 / n). This code assumes that all x values are
+		 * equally spaced when they are often off by a few milliseconds.
+		 * This assumption greatly simplifies the calculations.
+		 */
+		ss->slope = (ss->sum_xy - (double) ss->sum_x * ss->sum_y / ss->dur) /
+				(ss->sum_x_sq - (double) ss->sum_x * ss->sum_x / ss->dur);
+		if (ss->state & __FIO_SS_PCT)
+			ss->criterion = 100.0 * ss->slope / (ss->sum_y / ss->dur);
+		else
+			ss->criterion = ss->slope;
+
+		dprint(FD_STEADYSTATE, "sum_y: %llu, sum_xy: %llu, slope: %f, "
+					"criterion: %f, limit: %f\n",
+					(unsigned long long) ss->sum_y,
+					(unsigned long long) ss->sum_xy,
+					ss->slope, ss->criterion, ss->limit);
+
+		result = ss->criterion * (ss->criterion < 0.0 ? -1.0 : 1.0);
+		if (result < ss->limit)
+			return true;
+	}
+
+	ss->tail = (ss->tail + 1) % ss->dur;
+	if (ss->tail <= ss->head)
+		ss->head = (ss->head + 1) % ss->dur;
+
+	return false;
+}
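
A standalone sketch of the least-squares slope used above, assuming the x values are simply 0..n-1 (as the comment notes, real samples are off by a few milliseconds, which the code deliberately ignores). The sample values are made up.

    #include <stdio.h>

    int main(void)
    {
    	double y[] = { 1000, 1010, 996, 1004, 1002 };	/* e.g. IOPS per second */
    	int n = 5, i;
    	double sum_x = 0, sum_y = 0, sum_xy = 0, sum_x_sq = 0, slope;

    	for (i = 0; i < n; i++) {
    		sum_x += i;
    		sum_x_sq += (double) i * i;
    		sum_y += y[i];
    		sum_xy += i * y[i];
    	}
    	slope = (sum_xy - sum_x * sum_y / n) /
    		(sum_x_sq - sum_x * sum_x / n);

    	/* percentage criterion, as with __FIO_SS_PCT: slope vs. the mean */
    	printf("slope=%f criterion=%f%%\n",
    	       slope, 100.0 * slope / (sum_y / n));
    	return 0;
    }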
+
+static bool steadystate_deviation(uint64_t iops, uint64_t bw,
+				  struct thread_data *td)
+{
+	int i;
+	double diff;
+	double mean;
+
+	struct steadystate_data *ss = &td->ss;
+
+	ss->bw_data[ss->tail] = bw;
+	ss->iops_data[ss->tail] = iops;
+
+	if (ss->state & __FIO_SS_BUFFER_FULL || ss->tail - ss->head == ss->dur - 1) {
+		if (!(ss->state & __FIO_SS_BUFFER_FULL)) {
+			/* first time through */
+			for(i = 0, ss->sum_y = 0; i < ss->dur; i++)
+				if (ss->state & __FIO_SS_IOPS)
+					ss->sum_y += ss->iops_data[i];
+				else
+					ss->sum_y += ss->bw_data[i];
+			ss->state |= __FIO_SS_BUFFER_FULL;
+		} else {		/* easy to update the sum */
+			ss->sum_y -= ss->oldest_y;
+			if (ss->state & __FIO_SS_IOPS)
+				ss->sum_y += ss->iops_data[ss->tail];
+			else
+				ss->sum_y += ss->bw_data[ss->tail];
+		}
+
+		if (ss->state & __FIO_SS_IOPS)
+			ss->oldest_y = ss->iops_data[ss->head];
+		else
+			ss->oldest_y = ss->bw_data[ss->head];
+
+		mean = (double) ss->sum_y / ss->dur;
+		ss->deviation = 0.0;
+
+		for (i = 0; i < ss->dur; i++) {
+			if (ss->state & __FIO_SS_IOPS)
+				diff = ss->iops_data[i] - mean;
+			else
+				diff = ss->bw_data[i] - mean;
+			ss->deviation = max(ss->deviation, diff * (diff < 0.0 ? -1.0 : 1.0));
+		}
+
+		if (ss->state & __FIO_SS_PCT)
+			ss->criterion = 100.0 * ss->deviation / mean;
+		else
+			ss->criterion = ss->deviation;
+
+		dprint(FD_STEADYSTATE, "sum_y: %llu, mean: %f, max diff: %f, "
+					"objective: %f, limit: %f\n",
+					(unsigned long long) ss->sum_y, mean,
+					ss->deviation, ss->criterion, ss->limit);
+
+		if (ss->criterion < ss->limit)
+			return true;
+	}
+
+	ss->tail = (ss->tail + 1) % ss->dur;
+	if (ss->tail <= ss->head)
+		ss->head = (ss->head + 1) % ss->dur;
+
+	return false;
+}
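
A similarly stripped-down sketch of the deviation criterion above: the largest absolute distance from the window mean, optionally taken as a percentage of that mean. Values are made up.

    #include <stdio.h>

    int main(void)
    {
    	double y[] = { 1000, 1010, 996, 1004, 1002 };
    	int n = 5, i;
    	double sum = 0, mean, dev = 0, diff;

    	for (i = 0; i < n; i++)
    		sum += y[i];
    	mean = sum / n;

    	for (i = 0; i < n; i++) {
    		diff = y[i] - mean;
    		if (diff < 0.0)
    			diff = -diff;
    		if (diff > dev)
    			dev = diff;
    	}

    	printf("mean=%f max dev=%f (%.3f%%)\n", mean, dev, 100.0 * dev / mean);
    	return 0;
    }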
+
+void steadystate_check(void)
+{
+	int i, j, ddir, prev_groupid, group_ramp_time_over = 0;
+	unsigned long rate_time;
+	struct thread_data *td, *td2;
+	struct timeval now;
+	uint64_t group_bw = 0, group_iops = 0;
+	uint64_t td_iops, td_bytes;
+	bool ret;
+
+	prev_groupid = -1;
+	for_each_td(td, i) {
+		struct steadystate_data *ss = &td->ss;
+
+		if (!ss->dur || td->runstate <= TD_SETTING_UP ||
+		    td->runstate >= TD_EXITED || !ss->state ||
+		    ss->state & __FIO_SS_ATTAINED)
+			continue;
+
+		td_iops = 0;
+		td_bytes = 0;
+		if (!td->o.group_reporting ||
+		    (td->o.group_reporting && td->groupid != prev_groupid)) {
+			group_bw = 0;
+			group_iops = 0;
+			group_ramp_time_over = 0;
+		}
+		prev_groupid = td->groupid;
+
+		fio_gettime(&now, NULL);
+		if (ss->ramp_time && !(ss->state & __FIO_SS_RAMP_OVER)) {
+			/*
+			 * Begin recording data one second after ss->ramp_time
+			 * has elapsed
+			 */
+			if (utime_since(&td->epoch, &now) >= (ss->ramp_time + 1000000L))
+				ss->state |= __FIO_SS_RAMP_OVER;
+		}
+
+		td_io_u_lock(td);
+		for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
+			td_iops += td->io_blocks[ddir];
+			td_bytes += td->io_bytes[ddir];
+		}
+		td_io_u_unlock(td);
+
+		rate_time = mtime_since(&ss->prev_time, &now);
+		memcpy(&ss->prev_time, &now, sizeof(now));
+
+		/*
+		 * Begin monitoring when job starts but don't actually use
+		 * data in checking stopping criterion until ss->ramp_time is
+		 * over. This ensures that we will have a sane value in
+		 * prev_iops/bw the first time through after ss->ramp_time
+		 * is done.
+		 */
+		if (ss->state & __FIO_SS_RAMP_OVER) {
+			group_bw += 1000 * (td_bytes - ss->prev_bytes) / rate_time;
+			group_iops += 1000 * (td_iops - ss->prev_iops) / rate_time;
+			++group_ramp_time_over;
+		}
+		ss->prev_iops = td_iops;
+		ss->prev_bytes = td_bytes;
+
+		if (td->o.group_reporting && !(ss->state & __FIO_SS_DATA))
+			continue;
+
+		/*
+		 * Don't begin checking criterion until ss->ramp_time is over
+		 * for at least one thread in group
+		 */
+		if (!group_ramp_time_over)
+			continue;
+
+		dprint(FD_STEADYSTATE, "steadystate_check() thread: %d, "
+					"groupid: %u, rate_msec: %lu, "
+					"iops: %llu, bw: %llu, head: %d, tail: %d\n",
+					i, td->groupid, rate_time,
+					(unsigned long long) group_iops,
+					(unsigned long long) group_bw,
+					ss->head, ss->tail);
+
+		if (ss->state & __FIO_SS_SLOPE)
+			ret = steadystate_slope(group_iops, group_bw, td);
+		else
+			ret = steadystate_deviation(group_iops, group_bw, td);
+
+		if (ret) {
+			if (td->o.group_reporting) {
+				for_each_td(td2, j) {
+					if (td2->groupid == td->groupid) {
+						td2->ss.state |= __FIO_SS_ATTAINED;
+						fio_mark_td_terminate(td2);
+					}
+				}
+			} else {
+				ss->state |= __FIO_SS_ATTAINED;
+				fio_mark_td_terminate(td);
+			}
+		}
+	}
+}
+
+int td_steadystate_init(struct thread_data *td)
+{
+	struct steadystate_data *ss = &td->ss;
+	struct thread_options *o = &td->o;
+	struct thread_data *td2;
+	int j;
+
+	memset(ss, 0, sizeof(*ss));
+
+	if (o->ss_dur) {
+		steadystate_enabled = true;
+		o->ss_dur /= 1000000L;
+
+		/* put all steady state info in one place */
+		ss->dur = o->ss_dur;
+		ss->limit = o->ss_limit.u.f;
+		ss->ramp_time = o->ss_ramp_time;
+
+		ss->state = o->ss_state;
+		if (!td->ss.ramp_time)
+			ss->state |= __FIO_SS_RAMP_OVER;
+
+		ss->sum_x = o->ss_dur * (o->ss_dur - 1) / 2;
+		ss->sum_x_sq = (o->ss_dur - 1) * (o->ss_dur) * (2*o->ss_dur - 1) / 6;
+	}
+
+	/* make sure that ss options are consistent within reporting group */
+	for_each_td(td2, j) {
+		if (td2->groupid == td->groupid) {
+			struct steadystate_data *ss2 = &td2->ss;
+
+			if (ss2->dur != ss->dur ||
+			    ss2->limit != ss->limit ||
+			    ss2->ramp_time != ss->ramp_time ||
+			    ss2->state != ss->state ||
+			    ss2->sum_x != ss->sum_x ||
+			    ss2->sum_x_sq != ss->sum_x_sq) {
+				td_verror(td, EINVAL, "job rejected: steadystate options must be consistent within reporting groups");
+				return 1;
+			}
+		}
+	}
+
+	return 0;
+}
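
The two closed forms above are for x = 0, 1, ..., dur-1: the sum is dur*(dur-1)/2 and the sum of squares is (dur-1)*dur*(2*dur-1)/6. A quick check against brute force, with dur chosen arbitrarily:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
    	uint64_t dur = 30, i, sum_x = 0, sum_x_sq = 0;

    	for (i = 0; i < dur; i++) {
    		sum_x += i;
    		sum_x_sq += i * i;
    	}
    	printf("loop:   sum_x=%llu sum_x_sq=%llu\n",
    	       (unsigned long long) sum_x, (unsigned long long) sum_x_sq);
    	printf("closed: sum_x=%llu sum_x_sq=%llu\n",
    	       (unsigned long long) (dur * (dur - 1) / 2),
    	       (unsigned long long) ((dur - 1) * dur * (2 * dur - 1) / 6));
    	return 0;
    }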
+
+uint64_t steadystate_bw_mean(struct thread_stat *ts)
+{
+	int i;
+	uint64_t sum;
+
+	for (i = 0, sum = 0; i < ts->ss_dur; i++)
+		sum += ts->ss_bw_data[i];
+
+	return sum / ts->ss_dur;
+}
+
+uint64_t steadystate_iops_mean(struct thread_stat *ts)
+{
+	int i;
+	uint64_t sum;
+
+	for (i = 0, sum = 0; i < ts->ss_dur; i++)
+		sum += ts->ss_iops_data[i];
+
+	return sum / ts->ss_dur;
+}
diff --git a/steadystate.h b/steadystate.h
new file mode 100644
index 0000000..20ccd30
--- /dev/null
+++ b/steadystate.h
@@ -0,0 +1,61 @@
+#ifndef FIO_STEADYSTATE_H
+#define FIO_STEADYSTATE_H
+
+#include "stat.h"
+#include "thread_options.h"
+#include "lib/ieee754.h"
+
+extern void steadystate_check(void);
+extern void steadystate_setup(void);
+extern int td_steadystate_init(struct thread_data *);
+extern uint64_t steadystate_bw_mean(struct thread_stat *);
+extern uint64_t steadystate_iops_mean(struct thread_stat *);
+
+extern bool steadystate_enabled;
+
+struct steadystate_data {
+	double limit;
+	unsigned long long dur;
+	unsigned long long ramp_time;
+
+	uint32_t state;
+
+	unsigned int head;
+	unsigned int tail;
+	uint64_t *iops_data;
+	uint64_t *bw_data;
+
+	double slope;
+	double deviation;
+	double criterion;
+
+	uint64_t sum_y;
+	uint64_t sum_x;
+	uint64_t sum_x_sq;
+	uint64_t sum_xy;
+	uint64_t oldest_y;
+
+	struct timeval prev_time;
+	uint64_t prev_iops;
+	uint64_t prev_bytes;
+};
+
+enum {
+	__FIO_SS_IOPS		= 1,
+	__FIO_SS_BW		= 2,
+	__FIO_SS_SLOPE		= 4,
+	__FIO_SS_ATTAINED	= 8,
+	__FIO_SS_RAMP_OVER	= 16,
+	__FIO_SS_DATA		= 32,
+	__FIO_SS_PCT		= 64,
+	__FIO_SS_BUFFER_FULL	= 128,
+
+	FIO_SS_IOPS		= __FIO_SS_IOPS,
+	FIO_SS_IOPS_SLOPE	= __FIO_SS_IOPS | __FIO_SS_SLOPE,
+	FIO_SS_BW		= __FIO_SS_BW,
+	FIO_SS_BW_SLOPE		= __FIO_SS_BW | __FIO_SS_SLOPE,
+};
+
+#define STEADYSTATE_MSEC	1000
+
+#endif
diff --git a/t/arch.c b/t/arch.c
new file mode 100644
index 0000000..befb7c7
--- /dev/null
+++ b/t/arch.c
@@ -0,0 +1,5 @@
+#include "../arch/arch.h"
+
+unsigned long arch_flags = 0;
+int tsc_reliable;
+int arch_random;
diff --git a/t/btrace2fio.c b/t/btrace2fio.c
index d0b7e09..4cdb38d 100644
--- a/t/btrace2fio.c
+++ b/t/btrace2fio.c
@@ -12,7 +12,7 @@
 #include "../blktrace_api.h"
 #include "../os/os.h"
 #include "../log.h"
-#include "../lib/linux-dev-lookup.h"
+#include "../oslib/linux-dev-lookup.h"
 
 #define TRACE_FIFO_SIZE	8192
 
@@ -62,7 +62,7 @@
 
 	uint64_t first_ttime[DDIR_RWDIR_CNT];
 	uint64_t last_ttime[DDIR_RWDIR_CNT];
-	uint64_t kb[DDIR_RWDIR_CNT];
+	uint64_t kib[DDIR_RWDIR_CNT];
 
 	uint64_t start_delay;
 };
@@ -406,7 +406,7 @@
 
 		i = inflight_find(t->sector + (t->bytes >> 9));
 		if (i) {
-			i->p->o.kb[t_to_rwdir(t)] += (t->bytes >> 10);
+			i->p->o.kib[t_to_rwdir(t)] += (t->bytes >> 10);
 			i->p->o.complete_seen = 1;
 			inflight_remove(i);
 		}
@@ -556,7 +556,7 @@
 	return bsb->nr - bsa->nr;
 }
 
-static unsigned long o_to_kb_rate(struct btrace_out *o, int rw)
+static unsigned long o_to_kib_rate(struct btrace_out *o, int rw)
 {
 	uint64_t usec = (o->last_ttime[rw] - o->first_ttime[rw]) / 1000ULL;
 	uint64_t val;
@@ -568,7 +568,7 @@
 	if (!usec)
 		return 0;
 
-	val = o->kb[rw] * 1000ULL;
+	val = o->kib[rw] * 1000ULL;
 	return val / usec;
 }
 
@@ -623,7 +623,7 @@
 		printf("\tmerges: %lu (perc=%3.2f%%)\n", o->merges[i], perc);
 		perc = ((float) o->seq[i] * 100.0) / (float) o->ios[i];
 		printf("\tseq:    %lu (perc=%3.2f%%)\n", (unsigned long) o->seq[i], perc);
-		printf("\trate:   %lu KB/sec\n", o_to_kb_rate(o, i));
+		printf("\trate:   %lu KiB/sec\n", o_to_kib_rate(o, i));
 
 		for (j = 0; j < o->nr_bs[i]; j++) {
 			struct bs *bs = &o->bs[i][j];
@@ -746,7 +746,7 @@
 		for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 			unsigned long rate;
 
-			rate = o_to_kb_rate(o, i);
+			rate = o_to_kib_rate(o, i);
 			if (i)
 				printf(",");
 			if (rate)
@@ -810,7 +810,7 @@
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		unsigned long this_rate;
 
-		this_rate = o_to_kb_rate(o, i);
+		this_rate = o_to_kib_rate(o, i);
 		if (this_rate < rate_threshold) {
 			remove_ddir(o, i);
 			this_rate = 0;
@@ -926,7 +926,7 @@
 		oa->ios[i] += ob->ios[i];
 		oa->merges[i] += ob->merges[i];
 		oa->seq[i] += ob->seq[i];
-		oa->kb[i] += ob->kb[i];
+		oa->kib[i] += ob->kib[i];
 		oa->first_ttime[i] = min(oa->first_ttime[i], ob->first_ttime[i]);
 		oa->last_ttime[i] = max(oa->last_ttime[i], ob->last_ttime[i]);
 		merge_bs(&oa->bs[i], &oa->nr_bs[i], ob->bs[i], ob->nr_bs[i]);
@@ -937,14 +937,14 @@
 	return 1;
 }
 
-static void check_merges(struct btrace_pid *p, struct flist_head *pid_list)
+static void check_merges(struct btrace_pid *p, struct flist_head *pidlist)
 {
 	struct flist_head *e, *tmp;
 
 	if (p->ignore)
 		return;
 
-	flist_for_each_safe(e, tmp, pid_list) {
+	flist_for_each_safe(e, tmp, pidlist) {
 		struct btrace_pid *pidb;
 
 		pidb = flist_entry(e, struct btrace_pid, pid_list);
@@ -1021,7 +1021,7 @@
 	log_err("\t-n\tNumber IOS threshold to ignore task\n");
 	log_err("\t-f\tFio job file output\n");
 	log_err("\t-d\tUse this file/device for replay\n");
-	log_err("\t-r\tIgnore jobs with less than this KB/sec rate\n");
+	log_err("\t-r\tIgnore jobs with less than this KiB/sec rate\n");
 	log_err("\t-R\tSet rate in fio job (def=%u)\n", set_rate);
 	log_err("\t-D\tCap queue depth at this value (def=%u)\n", max_depth);
 	log_err("\t-c\tCollapse \"identical\" jobs (def=%u)\n", collapse_entries);
diff --git a/t/debug.c b/t/debug.c
index c297d61..bf6f460 100644
--- a/t/debug.c
+++ b/t/debug.c
@@ -2,7 +2,7 @@
 
 FILE *f_err;
 struct timeval *fio_tv = NULL;
-unsigned int fio_debug = 0;
+unsigned long fio_debug = 0;
 
 void __dprint(int type, const char *str, ...)
 {
diff --git a/t/dedupe.c b/t/dedupe.c
index 5b88fcb..1f172a2 100644
--- a/t/dedupe.c
+++ b/t/dedupe.c
@@ -14,17 +14,17 @@
 #include <fcntl.h>
 #include <string.h>
 
-#include "../lib/rbtree.h"
 #include "../flist.h"
 #include "../log.h"
 #include "../mutex.h"
 #include "../smalloc.h"
 #include "../minmax.h"
 #include "../crc/md5.h"
-#include "../memalign.h"
+#include "../lib/memalign.h"
 #include "../os/os.h"
 #include "../gettime.h"
 #include "../fio_time.h"
+#include "../lib/rbtree.h"
 
 #include "../lib/bloom.h"
 #include "debug.h"
@@ -84,7 +84,7 @@
 	uint64_t ret;
 
 	if (S_ISBLK(sb->st_mode)) {
-		unsigned long long bytes;
+		unsigned long long bytes = 0;
 
 		if (blockdev_size(f, &bytes)) {
 			log_err("dedupe: failed getting bdev size\n");
@@ -363,7 +363,7 @@
 		tdiff = mtime_since_now(&last_tv);
 		if (tdiff) {
 			this_items = (this_items * 1000) / (tdiff * 1024);
-			printf("%3.2f%% done (%luKB/sec)\r", perc, this_items);
+			printf("%3.2f%% done (%luKiB/sec)\r", perc, this_items);
 			last_nitems = nitems;
 			fio_gettime(&last_tv, NULL);
 		} else
@@ -537,6 +537,7 @@
 	uint64_t nextents = 0, nchunks = 0;
 	int c, ret;
 
+	arch_init(argv);
 	debug_init();
 
 	while ((c = getopt(argc, argv, "b:t:d:o:c:p:B:")) != -1) {
diff --git a/t/gen-rand.c b/t/gen-rand.c
new file mode 100644
index 0000000..6c31f92
--- /dev/null
+++ b/t/gen-rand.c
@@ -0,0 +1,68 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#include "../lib/types.h"
+#include "../log.h"
+#include "../lib/lfsr.h"
+#include "../lib/axmap.h"
+#include "../smalloc.h"
+#include "../minmax.h"
+#include "../lib/rand.h"
+
+int main(int argc, char *argv[])
+{
+	struct frand_state s;
+	uint64_t i, start, end, nvalues;
+	unsigned long *buckets, index, pass, fail;
+	double p, dev, mean, vmin, vmax;
+
+	if (argc < 4) {
+		log_err("%s: start end nvalues\n", argv[0]);
+		return 1;
+	}
+
+	start = strtoul(argv[1], NULL, 10);
+	end = strtoul(argv[2], NULL, 10);
+
+	if (start >= end) {
+		log_err("%s: start must be smaller than end\n", argv[0]);
+		return 1;
+	}
+	index = 1 + end - start;
+	buckets = calloc(index, sizeof(unsigned long));
+
+	nvalues = strtoul(argv[3], NULL, 10);
+
+	init_rand(&s, false);
+
+	for (i = 0; i < nvalues; i++) {
+		int v = rand32_between(&s, start, end);
+
+		buckets[v - start]++;
+	}
+
+	p = 1.0 / index;
+	dev = sqrt(nvalues * p * (1.0 - p));
+	mean = nvalues * p;
+	vmin = mean - dev;
+	vmax = mean + dev;
+
+	pass = fail = 0;
+	for (i = 0; i < index; i++) {
+		if (buckets[i] < vmin || buckets[i] > vmax) {
+			printf("FAIL bucket%4lu: val=%8lu (%.1f < %.1f > %.1f)\n", (unsigned long) i + 1, buckets[i], vmin, mean, vmax);
+			fail++;
+		} else {
+			printf("PASS bucket%4lu: val=%8lu (%.1f < %.1f > %.1f)\n", (unsigned long) i + 1, buckets[i], vmin, mean, vmax);
+			pass++;
+		}
+	}
+
+	printf("Passes=%lu, Fail=%lu\n", pass, fail);
+
+	return 0;
+}
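
The acceptance band used above, worked for concrete (assumed) numbers: with a uniform generator each bucket count is roughly Binomial(nvalues, p) with p = 1/nbuckets, so the expected count is nvalues*p and the standard deviation is sqrt(nvalues*p*(1-p)); a bucket passes if it lands within one deviation of the mean. Compile with -lm for sqrt().

    #include <stdio.h>
    #include <math.h>

    int main(void)
    {
    	double nvalues = 1000000, nbuckets = 100;
    	double p = 1.0 / nbuckets;
    	double mean = nvalues * p;
    	double dev = sqrt(nvalues * p * (1.0 - p));

    	/* expect 10000.0 per bucket, pass band roughly [9900.5, 10099.5] */
    	printf("expect %.1f per bucket, pass band [%.1f, %.1f]\n",
    	       mean, mean - dev, mean + dev);
    	return 0;
    }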
diff --git a/t/genzipf.c b/t/genzipf.c
index c5f098c..9faec38 100644
--- a/t/genzipf.c
+++ b/t/genzipf.c
@@ -3,10 +3,10 @@
  * what an access pattern would look like.
  *
  * For instance, the following would generate a zipf distribution
- * with theta 1.2, using 100,000 values and split the reporting into
- * 20 buckets:
+ * with theta 1.2, using 262144 (1 GiB / 4096) values and split the
+ * reporting into 20 buckets:
  *
- *	t/genzipf zipf 1.2 100000 20
+ *	./t/fio-genzipf -t zipf -i 1.2 -g 1 -b 4096 -o 20
  *
  * Only the distribution type (zipf or pareto) and spread input need
  * to be given, if not given defaults are used.
@@ -19,11 +19,11 @@
 #include <unistd.h>
 
 #include "../lib/zipf.h"
+#include "../lib/gauss.h"
 #include "../flist.h"
 #include "../hash.h"
 
-#define DEF_NR		1000000
-#define DEF_NR_OUTPUT	23
+#define DEF_NR_OUTPUT	20
 
 struct node {
 	struct flist_head list;
@@ -39,23 +39,34 @@
 	TYPE_NONE = 0,
 	TYPE_ZIPF,
 	TYPE_PARETO,
+	TYPE_NORMAL,
 };
-static const char *dist_types[] = { "None", "Zipf", "Pareto" };
+static const char *dist_types[] = { "None", "Zipf", "Pareto", "Normal" };
+
+enum {
+	OUTPUT_NORMAL,
+	OUTPUT_CSV,
+};
 
 static int dist_type = TYPE_ZIPF;
-static unsigned long gb_size = 500;
+static unsigned long gib_size = 500;
 static unsigned long block_size = 4096;
 static unsigned long output_nranges = DEF_NR_OUTPUT;
 static double percentage;
 static double dist_val;
-static int output_csv = 0;
+static int output_type = OUTPUT_NORMAL;
 
 #define DEF_ZIPF_VAL	1.2
 #define DEF_PARETO_VAL	0.3
 
+static unsigned int hashv(unsigned long long val)
+{
+	return jhash(&val, sizeof(val), 0) & (hash_size - 1);
+}
+
 static struct node *hash_lookup(unsigned long long val)
 {
-	struct flist_head *l = &hash[hash_long(val, hash_bits)];
+	struct flist_head *l = &hash[hashv(val)];
 	struct flist_head *entry;
 	struct node *n;
 
@@ -68,14 +79,13 @@
 	return NULL;
 }
 
-static struct node *hash_insert(struct node *n, unsigned long long val)
+static void hash_insert(struct node *n, unsigned long long val)
 {
-	struct flist_head *l = &hash[hash_long(val, hash_bits)];
+	struct flist_head *l = &hash[hashv(val)];
 
 	n->val = val;
 	n->hits = 1;
 	flist_add_tail(&n->list, l);
-	return n;
 }
 
 static void usage(void)
@@ -83,11 +93,12 @@
 	printf("genzipf: test zipf/pareto values for fio input\n");
 	printf("\t-h\tThis help screen\n");
 	printf("\t-p\tGenerate size of data set that are hit by this percentage\n");
-	printf("\t-t\tDistribution type (zipf or pareto)\n");
-	printf("\t-i\tDistribution algorithm input (zipf theta or pareto power)\n");
+	printf("\t-t\tDistribution type (zipf, pareto, or normal)\n");
+	printf("\t-i\tDistribution algorithm input (zipf theta, pareto power,\n"
+		"\t\tor normal %% deviation)\n");
 	printf("\t-b\tBlock size of a given range (in bytes)\n");
 	printf("\t-g\tSize of data set (in gigabytes)\n");
-	printf("\t-o\tNumber of output columns\n");
+	printf("\t-o\tNumber of output rows\n");
 	printf("\t-c\tOutput ranges in CSV format\n");
 }
 
@@ -112,13 +123,15 @@
 				dist_type = TYPE_ZIPF;
 			else if (!strncmp(optarg, "pareto", 6))
 				dist_type = TYPE_PARETO;
+			else if (!strncmp(optarg, "normal", 6))
+				dist_type = TYPE_NORMAL;
 			else {
 				printf("wrong dist type: %s\n", optarg);
 				return 1;
 			}
 			break;
 		case 'g':
-			gb_size = strtoul(optarg, NULL, 10);
+			gib_size = strtoul(optarg, NULL, 10);
 			break;
 		case 'i':
 			dist_val = atof(optarg);
@@ -128,7 +141,7 @@
 			output_nranges = strtoul(optarg, NULL, 10);
 			break;
 		case 'c':
-			output_csv = 1;
+			output_type = OUTPUT_CSV;
 			break;
 		default:
 			printf("bad option %c\n", c);
@@ -168,29 +181,128 @@
 	return n2->hits - n1->hits;
 }
 
+static void output_csv(struct node *nodes, unsigned long nnodes)
+{
+	unsigned long i;
+
+	printf("rank, count\n");
+	for (i = 0; i < nnodes; i++)
+		printf("%lu, %lu\n", i, nodes[i].hits);
+}
+
+static void output_normal(struct node *nodes, unsigned long nnodes,
+			  unsigned long nranges)
+{
+	unsigned long i, j, cur_vals, interval_step, next_interval, total_vals;
+	unsigned long blocks = percentage * nnodes / 100;
+	double hit_percent_sum = 0;
+	unsigned long long hit_sum = 0;
+	double perc, perc_i;
+	struct output_sum *output_sums;
+
+	interval_step = (nnodes - 1) / output_nranges + 1;
+	next_interval = interval_step;
+	output_sums = malloc(output_nranges * sizeof(struct output_sum));
+
+	for (i = 0; i < output_nranges; i++) {
+		output_sums[i].output = 0.0;
+		output_sums[i].nranges = 0;
+	}
+
+	j = total_vals = cur_vals = 0;
+
+	for (i = 0; i < nnodes; i++) {
+		struct output_sum *os = &output_sums[j];
+		struct node *node = &nodes[i];
+		cur_vals += node->hits;
+		total_vals += node->hits;
+		os->nranges += node->hits;
+		if (i == next_interval - 1 || i == nnodes - 1) {
+			os->output = (double) cur_vals / (double) nranges;
+			os->output *= 100.0;
+			cur_vals = 0;
+			next_interval += interval_step;
+			j++;
+		}
+
+		if (percentage) {
+			if (total_vals >= blocks) {
+				double cs = (double) i * block_size / (1024.0 * 1024.0);
+				char p = 'M';
+
+				if (cs > 1024.0) {
+					cs /= 1024.0;
+					p = 'G';
+				}
+				if (cs > 1024.0) {
+					cs /= 1024.0;
+					p = 'T';
+				}
+
+				printf("%.2f%% of hits satisfied in %.3f%cB of cache\n", percentage, cs, p);
+				percentage = 0.0;
+			}
+		}
+	}
+
+	perc_i = 100.0 / (double)output_nranges;
+	perc = 0.0;
+
+	printf("\n   Rows           Hits %%         Sum %%           # Hits          Size\n");
+	printf("-----------------------------------------------------------------------\n");
+	for (i = 0; i < output_nranges; i++) {
+		struct output_sum *os = &output_sums[i];
+		double gb = (double)os->nranges * block_size / 1024.0;
+		char p = 'K';
+
+		if (gb > 1024.0) {
+			p = 'M';
+			gb /= 1024.0;
+		}
+		if (gb > 1024.0) {
+			p = 'G';
+			gb /= 1024.0;
+		}
+
+		perc += perc_i;
+		hit_percent_sum += os->output;
+		hit_sum += os->nranges;
+		printf("%s %6.2f%%\t%6.2f%%\t\t%6.2f%%\t\t%8u\t%6.2f%c\n",
+			i ? "|->" : "Top", perc, os->output, hit_percent_sum,
+			os->nranges, gb, p);
+	}
+
+	printf("-----------------------------------------------------------------------\n");
+	printf("Total\t\t\t\t\t\t%8llu\n", hit_sum);
+	free(output_sums);
+}
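
The row bucketing in output_normal() uses a ceiling division so every node lands in one of output_nranges rows even when nnodes does not divide evenly. A quick illustration with assumed counts:

    #include <stdio.h>

    int main(void)
    {
    	unsigned long nnodes = 1003, output_nranges = 20;
    	unsigned long interval_step = (nnodes - 1) / output_nranges + 1;

    	/* 51 nodes per row; 20 rows cover 1020 >= 1003 nodes */
    	printf("step=%lu rows=%lu covered=%lu\n", interval_step,
    	       output_nranges, interval_step * output_nranges);
    	return 0;
    }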
+
 int main(int argc, char *argv[])
 {
 	unsigned long offset;
-	unsigned long i, j, k, nr_vals, cur_vals, interval, total_vals, nnodes;
 	unsigned long long nranges;
-	struct output_sum *output_sums;
+	unsigned long nnodes;
 	struct node *nodes;
-	double perc, perc_i;
 	struct zipf_state zs;
+	struct gauss_state gs;
+	int i, j;
 
 	if (parse_options(argc, argv))
 		return 1;
 
-	if( !output_csv )
-		printf("Generating %s distribution with %f input and %lu GB size and %lu block_size.\n", dist_types[dist_type], dist_val, gb_size, block_size);
+	if (output_type != OUTPUT_CSV)
+		printf("Generating %s distribution with %f input and %lu GiB size and %lu block_size.\n",
+		       dist_types[dist_type], dist_val, gib_size, block_size);
 
-	nranges = gb_size * 1024 * 1024 * 1024ULL;
+	nranges = gib_size * 1024 * 1024 * 1024ULL;
 	nranges /= block_size;
 
 	if (dist_type == TYPE_ZIPF)
 		zipf_init(&zs, nranges, dist_val, 1);
-	else
+	else if (dist_type == TYPE_PARETO)
 		pareto_init(&zs, nranges, dist_val, 1);
+	else
+		gauss_init(&gs, nranges, dist_val, 1);
 
 	hash_bits = 0;
 	hash_size = nranges;
@@ -199,19 +311,21 @@
 
 	hash_size = 1 << hash_bits;
 
-	hash = malloc(hash_size * sizeof(struct flist_head));
+	hash = calloc(hash_size, sizeof(struct flist_head));
 	for (i = 0; i < hash_size; i++)
 		INIT_FLIST_HEAD(&hash[i]);
 
 	nodes = malloc(nranges * sizeof(struct node));
 
-	for (nr_vals = i = j = 0; i < nranges; i++) {
+	for (i = j = 0; i < nranges; i++) {
 		struct node *n;
 
 		if (dist_type == TYPE_ZIPF)
 			offset = zipf_next(&zs);
-		else
+		else if (dist_type == TYPE_PARETO)
 			offset = pareto_next(&zs);
+		else
+			offset = gauss_next(&gs);
 
 		n = hash_lookup(offset);
 		if (n)
@@ -220,101 +334,15 @@
 			hash_insert(&nodes[j], offset);
 			j++;
 		}
-
-		nr_vals++;
 	}
 
 	qsort(nodes, j, sizeof(struct node), node_cmp);
 	nnodes = j;
-	nr_vals = nnodes;
 
-	if (output_csv) {
-		printf("rank, count\n");
-		for (k = 0; k < nnodes; k++)
-			printf("%lu, %lu\n", k, nodes[k].hits);
-	} else {
-		interval = (nr_vals + output_nranges - 1) / output_nranges;
-
-		output_sums = malloc(output_nranges * sizeof(struct output_sum));
-		for (i = 0; i < output_nranges; i++) {
-			output_sums[i].output = 0.0;
-			output_sums[i].nranges = 1;
-		}
-
-		total_vals = i = j = cur_vals = 0;
-
-		for (k = 0; k < nnodes; k++) {
-			struct output_sum *os = &output_sums[j];
-			struct node *node = &nodes[k];
-
-			if (i >= interval) {
-				os->output =
-				    (double)(cur_vals + 1) / (double)nranges;
-				os->output *= 100.0;
-				j++;
-				cur_vals = node->hits;
-				interval +=
-				    (nr_vals + output_nranges -
-				     1) / output_nranges;
-			} else {
-				cur_vals += node->hits;
-				os->nranges += node->hits;
-			}
-
-			i++;
-			total_vals += node->hits;
-
-			if (percentage) {
-				unsigned long blocks =
-				    percentage * nranges / 100;
-
-				if (total_vals >= blocks) {
-					double cs =
-					    i * block_size / (1024 * 1024);
-					char p = 'M';
-
-					if (cs > 1024.0) {
-						cs /= 1024.0;
-						p = 'G';
-					}
-					if (cs > 1024.0) {
-						cs /= 1024.0;
-						p = 'T';
-					}
-
-					printf("%.2f%% of hits satisfied in %.3f%cB of cache\n", percentage, cs, p);
-					percentage = 0.0;
-				}
-			}
-		}
-
-		perc_i = 100.0 / (double)output_nranges;
-		perc = 0.0;
-
-		printf("\n   Rows           Hits           No Hits         Size\n");
-		printf("--------------------------------------------------------\n");
-		for (i = 0; i < j; i++) {
-			struct output_sum *os = &output_sums[i];
-			double gb = (double)os->nranges * block_size / 1024.0;
-			char p = 'K';
-
-			if (gb > 1024.0) {
-				p = 'M';
-				gb /= 1024.0;
-			}
-			if (gb > 1024.0) {
-				p = 'G';
-				gb /= 1024.0;
-			}
-
-			perc += perc_i;
-			printf("%s %6.2f%%\t%6.2f%%\t\t%8u\t%6.2f%c\n",
-			       i ? "|->" : "Top", perc, os->output, os->nranges,
-			       gb, p);
-		}
-
-		free(output_sums);
-	}
+	if (output_type == OUTPUT_CSV)
+		output_csv(nodes, nnodes);
+	else
+		output_normal(nodes, nnodes, nranges);
 
 	free(hash);
 	free(nodes);
diff --git a/t/lfsr-test.c b/t/lfsr-test.c
index 901f1a6..7016f26 100644
--- a/t/lfsr-test.c
+++ b/t/lfsr-test.c
@@ -38,6 +38,8 @@
 	void *v = NULL, *v_start;
 	double total, mean;
 
+	arch_init(argv);
+
 	/* Read arguments */
 	switch (argc) {
 		case 5: if (strncmp(argv[4], "verify", 7) == 0)
@@ -78,7 +80,7 @@
 		v_size = numbers * sizeof(uint8_t);
 		v = malloc(v_size);
 		memset(v, 0, v_size);
-		printf("\nVerification table is %lf KBs\n", (double)(v_size) / 1024);
+		printf("\nVerification table is %lf KiB\n", (double)(v_size) / 1024);
 	}
 	v_start = v;
 
@@ -122,7 +124,7 @@
 	if (verify)
 		printf("(slower due to verification)");
 	printf("\n==============================\n");
-	printf("Elapsed: %lf s\n", total / pow(10,9));
+	printf("Elapsed: %lf s\n", total / pow(10,6));
 	printf("Mean:    %lf us\n", mean);
 
 	free(v_start);
diff --git a/t/log.c b/t/log.c
index 1ed3851..929aac6 100644
--- a/t/log.c
+++ b/t/log.c
@@ -2,7 +2,7 @@
 #include <stdarg.h>
 #include "../minmax.h"
 
-int log_err(const char *format, ...)
+size_t log_err(const char *format, ...)
 {
 	char buffer[1024];
 	va_list args;
@@ -16,7 +16,7 @@
 	return fwrite(buffer, len, 1, stderr);
 }
 
-int log_info(const char *format, ...)
+size_t log_info(const char *format, ...)
 {
 	char buffer[1024];
 	va_list args;
diff --git a/t/memlock.c b/t/memlock.c
new file mode 100644
index 0000000..3d3579a
--- /dev/null
+++ b/t/memlock.c
@@ -0,0 +1,58 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+
+static struct thread_data {
+	unsigned long mib;
+} td;
+
+static void *worker(void *data)
+{
+	struct thread_data *td = data;
+	unsigned long index;
+	size_t size;
+	char *buf;
+	int i, first = 1;
+
+	size = td->mib * 1024UL * 1024UL;
+	buf = malloc(size);
+
+	for (i = 0; i < 100000; i++) {
+		for (index = 0; index + 4096 < size; index += 4096)
+			memset(&buf[index+512], 0x89, 512);
+		if (first) {
+			printf("loop%d: did %lu MiB\n", i+1, size/(1024UL*1024UL));
+			first = 0;
+		}
+	}
+	return NULL;
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned long mib, threads;
+	pthread_t *pthreads;
+	int i;
+
+	if (argc < 3) {
+		printf("%s: <MiB per thread> <threads>\n", argv[0]);
+		return 1;
+	}
+
+	mib = strtoul(argv[1], NULL, 10);
+	threads = strtoul(argv[2], NULL, 10);
+
+	pthreads = calloc(threads, sizeof(pthread_t));
+	td.mib = mib;
+
+	for (i = 0; i < threads; i++)
+		pthread_create(&pthreads[i], NULL, worker, &td);
+
+	for (i = 0; i < threads; i++) {
+		void *ret;
+
+		pthread_join(pthreads[i], &ret);
+	}
+	return 0;
+}
diff --git a/t/read-to-pipe-async.c b/t/read-to-pipe-async.c
new file mode 100644
index 0000000..ebdd8f1
--- /dev/null
+++ b/t/read-to-pipe-async.c
@@ -0,0 +1,670 @@
+/*
+ * Read a file and write the contents to stdout. If a given read takes
+ * longer than 'max_us' time, then we schedule a new thread to handle
+ * the next read. This avoids the coordinated omission problem, where
+ * one request appears to take a long time while, in reality, many requests
+ * would have been slow; we just don't notice, since new submissions are
+ * not issued while a single one is held up.
+ *
+ * One test case:
+ *
+ * $ time (./read-to-pipe-async -f randfile.gz | gzip -dc > outfile; sync)
+ *
+ * This will read randfile.gz and log the latencies of doing so, while
+ * piping the output to gzip to decompress it. Any latencies over max_us
+ * are logged when they happen, and latency buckets are displayed at the
+ * end of the run
+ *
+ * gcc -Wall -g -O2 -o read-to-pipe-async read-to-pipe-async.c -lpthread
+ *
+ * Copyright (C) 2016 Jens Axboe
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <inttypes.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "../flist.h"
+
+static int bs = 4096;
+static int max_us = 10000;
+static char *file;
+static int separate_writer = 1;
+
+#define PLAT_BITS	8
+#define PLAT_VAL	(1 << PLAT_BITS)
+#define PLAT_GROUP_NR	19
+#define PLAT_NR		(PLAT_GROUP_NR * PLAT_VAL)
+#define PLAT_LIST_MAX	20
+
+struct stats {
+	unsigned int plat[PLAT_NR];
+	unsigned int nr_samples;
+	unsigned int max;
+	unsigned int min;
+	unsigned int over;
+};
+
+static double plist[PLAT_LIST_MAX] = { 50.0, 75.0, 90.0, 95.0, 99.0, 99.5, 99.9, 99.99, 99.999, 99.9999, };
+
+struct thread_data {
+	int exit;
+	int done;
+	pthread_mutex_t lock;
+	pthread_cond_t cond;
+	pthread_mutex_t done_lock;
+	pthread_cond_t done_cond;
+	pthread_t thread;
+};
+
+struct writer_thread {
+	struct flist_head list;
+	struct flist_head done_list;
+	struct stats s;
+	struct thread_data thread;
+};
+
+struct reader_thread {
+	struct flist_head list;
+	struct flist_head done_list;
+	int started;
+	int busy;
+	int write_seq;
+	struct stats s;
+	struct thread_data thread;
+};
+
+struct work_item {
+	struct flist_head list;
+	void *buf;
+	size_t buf_size;
+	off_t off;
+	int fd;
+	int seq;
+	struct writer_thread *writer;
+	struct reader_thread *reader;
+	pthread_mutex_t lock;
+	pthread_cond_t cond;
+	pthread_t thread;
+};
+
+static struct reader_thread reader_thread;
+static struct writer_thread writer_thread;
+
+uint64_t utime_since(const struct timeval *s, const struct timeval *e)
+{
+	long sec, usec;
+	uint64_t ret;
+
+	sec = e->tv_sec - s->tv_sec;
+	usec = e->tv_usec - s->tv_usec;
+	if (sec > 0 && usec < 0) {
+		sec--;
+		usec += 1000000;
+	}
+
+	if (sec < 0 || (sec == 0 && usec < 0))
+		return 0;
+
+	ret = sec * 1000000ULL + usec;
+
+	return ret;
+}
+
+static struct work_item *find_seq(struct writer_thread *w, unsigned int seq)
+{
+	struct work_item *work;
+	struct flist_head *entry;
+
+	if (flist_empty(&w->list))
+		return NULL;
+
+	flist_for_each(entry, &w->list) {
+		work = flist_entry(entry, struct work_item, list);
+		if (work->seq == seq)
+			return work;
+	}
+
+	return NULL;
+}
+
+static unsigned int plat_val_to_idx(unsigned int val)
+{
+	unsigned int msb, error_bits, base, offset;
+
+	/* Find MSB starting from bit 0 */
+	if (val == 0)
+		msb = 0;
+	else
+		msb = sizeof(val)*8 - __builtin_clz(val) - 1;
+
+	/*
+	 * MSB <= (PLAT_BITS-1), cannot be rounded off. Use
+	 * all bits of the sample as index
+	 */
+	if (msb <= PLAT_BITS)
+		return val;
+
+	/* Compute the number of error bits to discard */
+	error_bits = msb - PLAT_BITS;
+
+	/* Compute the number of buckets before the group */
+	base = (error_bits + 1) << PLAT_BITS;
+
+	/*
+	 * Discard the error bits and apply the mask to find the
+	 * index for the buckets in the group
+	 */
+	offset = (PLAT_VAL - 1) & (val >> error_bits);
+
+	/* Make sure the index does not exceed (array size - 1) */
+	return (base + offset) < (PLAT_NR - 1) ?
+		(base + offset) : (PLAT_NR - 1);
+}
+
+/*
+ * Convert the given index of the bucket array to the value
+ * represented by the bucket
+ */
+static unsigned int plat_idx_to_val(unsigned int idx)
+{
+	unsigned int error_bits, k, base;
+
+	assert(idx < PLAT_NR);
+
+	/* MSB <= (PLAT_BITS-1), cannot be rounded off. Use
+	 * all bits of the sample as index */
+	if (idx < (PLAT_VAL << 1))
+		return idx;
+
+	/* Find the group and compute the minimum value of that group */
+	error_bits = (idx >> PLAT_BITS) - 1;
+	base = 1 << (error_bits + PLAT_BITS);
+
+	/* Find its bucket number of the group */
+	k = idx % PLAT_VAL;
+
+	/* Return the mean of the range of the bucket */
+	return base + ((k + 0.5) * (1 << error_bits));
+}
+
+static void add_lat(struct stats *s, unsigned int us, const char *name)
+{
+	int lat_index = 0;
+
+	if (us > s->max)
+		s->max = us;
+	if (us < s->min)
+		s->min = us;
+
+	if (us > max_us) {
+		fprintf(stderr, "%s latency=%u usec\n", name, us);
+		s->over++;
+	}
+
+	lat_index = plat_val_to_idx(us);
+	__sync_fetch_and_add(&s->plat[lat_index], 1);
+	__sync_fetch_and_add(&s->nr_samples, 1);
+}
+
+static int write_work(struct work_item *work)
+{
+	struct timeval s, e;
+	ssize_t ret;
+
+	gettimeofday(&s, NULL);
+	ret = write(STDOUT_FILENO, work->buf, work->buf_size);
+	gettimeofday(&e, NULL);
+	assert(ret == work->buf_size);
+
+	add_lat(&work->writer->s, utime_since(&s, &e), "write");
+	return work->seq + 1;
+}
+
+static void thread_exiting(struct thread_data *thread)
+{
+	__sync_fetch_and_add(&thread->done, 1);
+	pthread_cond_signal(&thread->done_cond);
+}
+
+static void *writer_fn(void *data)
+{
+	struct writer_thread *wt = data;
+	struct work_item *work;
+	unsigned int seq = 1;
+
+	work = NULL;
+	while (!wt->thread.exit || !flist_empty(&wt->list)) {
+		pthread_mutex_lock(&wt->thread.lock);
+
+		if (work) {
+			flist_add_tail(&work->list, &wt->done_list);
+			work = NULL;
+		}
+
+		work = find_seq(wt, seq);
+		if (work)
+			flist_del_init(&work->list);
+		else
+			pthread_cond_wait(&wt->thread.cond, &wt->thread.lock);
+
+		pthread_mutex_unlock(&wt->thread.lock);
+
+		if (work)
+			seq = write_work(work);
+	}
+
+	thread_exiting(&wt->thread);
+	return NULL;
+}
+
+static void reader_work(struct work_item *work)
+{
+	struct timeval s, e;
+	ssize_t ret;
+	size_t left;
+	void *buf;
+	off_t off;
+
+	gettimeofday(&s, NULL);
+
+	left = work->buf_size;
+	buf = work->buf;
+	off = work->off;
+	while (left) {
+		ret = pread(work->fd, buf, left, off);
+		if (!ret) {
+			fprintf(stderr, "zero read\n");
+			break;
+		} else if (ret < 0) {
+			fprintf(stderr, "errno=%d\n", errno);
+			break;
+		}
+		left -= ret;
+		off += ret;
+		buf += ret;
+	}
+
+	gettimeofday(&e, NULL);
+
+	add_lat(&work->reader->s, utime_since(&s, &e), "read");
+
+	pthread_cond_signal(&work->cond);
+
+	if (separate_writer) {
+		pthread_mutex_lock(&work->writer->thread.lock);
+		flist_add_tail(&work->list, &work->writer->list);
+		pthread_mutex_unlock(&work->writer->thread.lock);
+		pthread_cond_signal(&work->writer->thread.cond);
+	} else {
+		struct reader_thread *rt = work->reader;
+		struct work_item *next = NULL;
+		struct flist_head *entry;
+
+		/*
+		 * Write current work if it matches in sequence.
+		 */
+		if (work->seq == rt->write_seq)
+			goto write_it;
+
+		pthread_mutex_lock(&rt->thread.lock);
+
+		flist_add_tail(&work->list, &rt->done_list);
+
+		/*
+		 * See if the next work item is here, if so, write it
+		 */
+		work = NULL;
+		flist_for_each(entry, &rt->done_list) {
+			next = flist_entry(entry, struct work_item, list);
+			if (next->seq == rt->write_seq) {
+				work = next;
+				flist_del(&work->list);
+				break;
+			}
+		}
+
+		pthread_mutex_unlock(&rt->thread.lock);
+
+		if (work) {
+write_it:
+			write_work(work);
+			__sync_fetch_and_add(&rt->write_seq, 1);
+		}
+	}
+}
+
+static void *reader_one_off(void *data)
+{
+	reader_work(data);
+	return NULL;
+}
+
+static void *reader_fn(void *data)
+{
+	struct reader_thread *rt = data;
+	struct work_item *work;
+
+	while (!rt->thread.exit || !flist_empty(&rt->list)) {
+		work = NULL;
+		pthread_mutex_lock(&rt->thread.lock);
+		if (!flist_empty(&rt->list)) {
+			work = flist_first_entry(&rt->list, struct work_item, list);
+			flist_del_init(&work->list);
+		} else
+			pthread_cond_wait(&rt->thread.cond, &rt->thread.lock);
+		pthread_mutex_unlock(&rt->thread.lock);
+
+		if (work) {
+			__sync_fetch_and_add(&rt->busy, 1);
+			reader_work(work);
+			__sync_fetch_and_sub(&rt->busy, 1);
+		}
+	}
+
+	thread_exiting(&rt->thread);
+	return NULL;
+}
+
+static void queue_work(struct reader_thread *rt, struct work_item *work)
+{
+	if (!rt->started) {
+		pthread_mutex_lock(&rt->thread.lock);
+		flist_add_tail(&work->list, &rt->list);
+		pthread_mutex_unlock(&rt->thread.lock);
+
+		rt->started = 1;
+		pthread_create(&rt->thread.thread, NULL, reader_fn, rt);
+	} else if (!rt->busy && !pthread_mutex_trylock(&rt->thread.lock)) {
+		flist_add_tail(&work->list, &rt->list);
+		pthread_mutex_unlock(&rt->thread.lock);
+
+		pthread_cond_signal(&rt->thread.cond);
+	} else {
+		int ret = pthread_create(&work->thread, NULL, reader_one_off, work);
+		if (ret)
+			fprintf(stderr, "pthread_create=%d\n", ret);
+		else
+			pthread_detach(work->thread);
+	}
+}
+
+static unsigned int calc_percentiles(unsigned int *io_u_plat, unsigned long nr,
+				     unsigned int **output)
+{
+	unsigned long sum = 0;
+	unsigned int len, i, j = 0;
+	unsigned int oval_len = 0;
+	unsigned int *ovals = NULL;
+	int is_last;
+
+	len = 0;
+	while (len < PLAT_LIST_MAX && plist[len] != 0.0)
+		len++;
+
+	if (!len)
+		return 0;
+
+	/*
+	 * Calculate bucket values, note down max and min values
+	 */
+	is_last = 0;
+	for (i = 0; i < PLAT_NR && !is_last; i++) {
+		sum += io_u_plat[i];
+		while (sum >= (plist[j] / 100.0 * nr)) {
+			assert(plist[j] <= 100.0);
+
+			if (j == oval_len) {
+				oval_len += 100;
+				ovals = realloc(ovals, oval_len * sizeof(unsigned int));
+			}
+
+			ovals[j] = plat_idx_to_val(i);
+			is_last = (j == len - 1);
+			if (is_last)
+				break;
+
+			j++;
+		}
+	}
+
+	*output = ovals;
+	return len;
+}
+
+static void show_latencies(struct stats *s, const char *msg)
+{
+	unsigned int *ovals = NULL;
+	unsigned int len, i;
+
+	len = calc_percentiles(s->plat, s->nr_samples, &ovals);
+	if (len) {
+		fprintf(stderr, "Latency percentiles (usec) (%s)\n", msg);
+		for (i = 0; i < len; i++)
+			fprintf(stderr, "\t%2.4fth: %u\n", plist[i], ovals[i]);
+	}
+
+	if (ovals)
+		free(ovals);
+
+	fprintf(stderr, "\tOver=%u, min=%u, max=%u\n", s->over, s->min, s->max);
+}
+
+static void init_thread(struct thread_data *thread)
+{
+	pthread_cond_init(&thread->cond, NULL);
+	pthread_cond_init(&thread->done_cond, NULL);
+	pthread_mutex_init(&thread->lock, NULL);
+	pthread_mutex_init(&thread->done_lock, NULL);
+	thread->exit = 0;
+}
+
+static void exit_thread(struct thread_data *thread,
+			void fn(struct writer_thread *),
+			struct writer_thread *wt)
+{
+	__sync_fetch_and_add(&thread->exit, 1);
+	pthread_cond_signal(&thread->cond);
+
+	while (!thread->done) {
+		pthread_mutex_lock(&thread->done_lock);
+
+		if (fn) {
+			struct timeval tv;
+			struct timespec ts;
+
+			gettimeofday(&tv, NULL);
+			ts.tv_sec = tv.tv_sec + 1;
+			ts.tv_nsec = tv.tv_usec * 1000ULL;
+
+			pthread_cond_timedwait(&thread->done_cond, &thread->done_lock, &ts);
+			fn(wt);
+		} else
+			pthread_cond_wait(&thread->done_cond, &thread->done_lock);
+
+		pthread_mutex_unlock(&thread->done_lock);
+	}
+}
+
+static int usage(char *argv[])
+{
+	fprintf(stderr, "%s: [-b blocksize] [-t max usec] [-w separate writer] -f file\n", argv[0]);
+	return 1;
+}
+
+static int parse_options(int argc, char *argv[])
+{
+	int c;
+
+	while ((c = getopt(argc, argv, "f:b:t:w:")) != -1) {
+		switch (c) {
+		case 'f':
+			file = strdup(optarg);
+			break;
+		case 'b':
+			bs = atoi(optarg);
+			break;
+		case 't':
+			max_us = atoi(optarg);
+			break;
+		case 'w':
+			separate_writer = atoi(optarg);
+			if (!separate_writer)
+				fprintf(stderr, "inline writing is broken\n");
+			break;
+		case '?':
+		default:
+			return usage(argv);
+		}
+	}
+
+	if (!file)
+		return usage(argv);
+
+	return 0;
+}
+
+static void prune_done_entries(struct writer_thread *wt)
+{
+	FLIST_HEAD(list);
+
+	if (flist_empty(&wt->done_list))
+		return;
+
+	if (pthread_mutex_trylock(&wt->thread.lock))
+		return;
+
+	if (!flist_empty(&wt->done_list))
+		flist_splice_init(&wt->done_list, &list);
+	pthread_mutex_unlock(&wt->thread.lock);
+
+	while (!flist_empty(&list)) {
+		struct work_item *work;
+
+		work = flist_first_entry(&list, struct work_item, list);
+		flist_del(&work->list);
+
+		pthread_cond_destroy(&work->cond);
+		pthread_mutex_destroy(&work->lock);
+		free(work->buf);
+		free(work);
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	struct timeval s, re, we;
+	struct reader_thread *rt;
+	struct writer_thread *wt;
+	unsigned long rate;
+	struct stat sb;
+	size_t bytes;
+	off_t off;
+	int fd, seq;
+
+	if (parse_options(argc, argv))
+		return 1;
+
+	fd = open(file, O_RDONLY);
+	if (fd < 0) {
+		perror("open");
+		return 2;
+	}
+
+	if (fstat(fd, &sb) < 0) {
+		perror("stat");
+		return 3;
+	}
+
+	wt = &writer_thread;
+	init_thread(&wt->thread);
+	INIT_FLIST_HEAD(&wt->list);
+	INIT_FLIST_HEAD(&wt->done_list);
+	wt->s.max = 0;
+	wt->s.min = -1U;
+	pthread_create(&wt->thread.thread, NULL, writer_fn, wt);
+
+	rt = &reader_thread;
+	init_thread(&rt->thread);
+	INIT_FLIST_HEAD(&rt->list);
+	INIT_FLIST_HEAD(&rt->done_list);
+	rt->s.max = 0;
+	rt->s.min = -1U;
+	rt->write_seq = 1;
+
+	off = 0;
+	seq = 0;
+	bytes = 0;
+
+	gettimeofday(&s, NULL);
+
+	while (sb.st_size) {
+		struct work_item *work;
+		size_t this_len;
+		struct timespec ts;
+		struct timeval tv;
+
+		prune_done_entries(wt);
+
+		this_len = sb.st_size;
+		if (this_len > bs)
+			this_len = bs;
+
+		work = calloc(1, sizeof(*work));
+		work->buf = malloc(this_len);
+		work->buf_size = this_len;
+		work->off = off;
+		work->fd = fd;
+		work->seq = ++seq;
+		work->writer = wt;
+		work->reader = rt;
+		pthread_cond_init(&work->cond, NULL);
+		pthread_mutex_init(&work->lock, NULL);
+
+		queue_work(rt, work);
+
+		gettimeofday(&tv, NULL);
+		ts.tv_sec = tv.tv_sec;
+		ts.tv_nsec = tv.tv_usec * 1000ULL;
+		ts.tv_nsec += max_us * 1000ULL;
+		if (ts.tv_nsec >= 1000000000ULL) {
+			ts.tv_nsec -= 1000000000ULL;
+			ts.tv_sec++;
+		}
+
+		pthread_mutex_lock(&work->lock);
+		pthread_cond_timedwait(&work->cond, &work->lock, &ts);
+		pthread_mutex_unlock(&work->lock);
+
+		off += this_len;
+		sb.st_size -= this_len;
+		bytes += this_len;
+	}
+
+	exit_thread(&rt->thread, NULL, NULL);
+	gettimeofday(&re, NULL);
+
+	exit_thread(&wt->thread, prune_done_entries, wt);
+	gettimeofday(&we, NULL);
+
+	show_latencies(&rt->s, "READERS");
+	show_latencies(&wt->s, "WRITERS");
+
+	bytes /= 1024;
+	rate = (bytes * 1000UL * 1000UL) / utime_since(&s, &re);
+	fprintf(stderr, "Read rate (KiB/sec) : %lu\n", rate);
+	rate = (bytes * 1000UL * 1000UL) / utime_since(&s, &we);
+	fprintf(stderr, "Write rate (KiB/sec): %lu\n", rate);
+
+	close(fd);
+	return 0;
+}
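The plat_val_to_idx()/plat_idx_to_val() pair above mirrors the style of logarithmic latency histogram used in fio's stat.c (which uses FIO_IO_U_PLAT_BITS=6; this test program uses PLAT_BITS=8): each power-of-two group is split into PLAT_VAL buckets, so only the top PLAT_BITS bits of a sample are kept exactly and the rest becomes a bounded rounding error. A minimal Python re-derivation of that mapping, assuming the constants defined in the file above; the helper names and the worked value of 3000 usec are illustrative only:

    # Illustrative sketch of the bucket mapping in t/read-to-pipe-async.c.
    PLAT_BITS = 8
    PLAT_VAL = 1 << PLAT_BITS            # 256 buckets per power-of-two group
    PLAT_GROUP_NR = 19
    PLAT_NR = PLAT_GROUP_NR * PLAT_VAL   # 4864 buckets in total

    def val_to_idx(val):
        msb = 0 if val == 0 else val.bit_length() - 1
        if msb <= PLAT_BITS:
            return val                   # small values map 1:1
        error_bits = msb - PLAT_BITS     # low-order bits we are willing to lose
        base = (error_bits + 1) << PLAT_BITS
        offset = (PLAT_VAL - 1) & (val >> error_bits)
        return min(base + offset, PLAT_NR - 1)

    def idx_to_val(idx):
        if idx < (PLAT_VAL << 1):
            return idx
        error_bits = (idx >> PLAT_BITS) - 1
        base = 1 << (error_bits + PLAT_BITS)
        k = idx % PLAT_VAL
        return base + (k + 0.5) * (1 << error_bits)   # midpoint of the bucket

    # A 3000 usec sample lands in bucket 1143, whose midpoint is ~3004 usec;
    # the bucket is 2**error_bits = 8 usec wide for values of this size.
    assert val_to_idx(3000) == 1143
    assert idx_to_val(1143) == 3004.0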
diff --git a/t/stest.c b/t/stest.c
index efb256e..04df60d 100644
--- a/t/stest.c
+++ b/t/stest.c
@@ -4,6 +4,7 @@
 
 #include "../smalloc.h"
 #include "../flist.h"
+#include "../arch/arch.h"
 #include "debug.h"
 
 #define MAGIC1	0xa9b1c8d2
@@ -30,7 +31,7 @@
 		srand(MAGIC1);
 #endif
 		nr = total = 0;
-		while (total < 128*1024*1024UL) {
+		while (total < 120*1024*1024UL) {
 			size = 8 * sizeof(struct elem) + (int) (999.0 * (rand() / (RAND_MAX + 1.0)));
 			e = smalloc(size);
 			if (!e) {
@@ -58,25 +59,14 @@
 	return 0;
 }
 
-static int do_specific_alloc(unsigned long size)
-{
-	void *ptr;
-
-	ptr = smalloc(size);
-	sfree(ptr);
-	return 0;
-}
-
 int main(int argc, char *argv[])
 {
+	arch_init(argv);
 	sinit();
 	debug_init();
 
 	do_rand_allocs();
 
-	/* smalloc bug, commit 271067a6 */
-	do_specific_alloc(671386584);
-
 	scleanup();
 	return 0;
 }
diff --git a/t/verify-state.c b/t/verify-state.c
new file mode 100644
index 0000000..9a2c3df
--- /dev/null
+++ b/t/verify-state.c
@@ -0,0 +1,154 @@
+/*
+ * Dump the contents of a verify state file in plain text
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include "../log.h"
+#include "../os/os.h"
+#include "../verify-state.h"
+#include "../crc/crc32c.h"
+#include "debug.h"
+
+static void show_s(struct thread_io_list *s, unsigned int no_s)
+{
+	int i;
+
+	printf("Thread:\t\t%u\n", no_s);
+	printf("Name:\t\t%s\n", s->name);
+	printf("Completions:\t%llu\n", (unsigned long long) s->no_comps);
+	printf("Depth:\t\t%llu\n", (unsigned long long) s->depth);
+	printf("Number IOs:\t%llu\n", (unsigned long long) s->numberio);
+	printf("Index:\t\t%llu\n", (unsigned long long) s->index);
+
+	printf("Completions:\n");
+	if (!s->no_comps)
+		return;
+	for (i = s->no_comps - 1; i >= 0; i--) {
+		printf("\t(file=%2llu) %llu\n",
+				(unsigned long long) s->comps[i].fileno,
+				(unsigned long long) s->comps[i].offset);
+	}
+}
+
+static void show(struct thread_io_list *s, size_t size)
+{
+	int no_s;
+
+	no_s = 0;
+	do {
+		int i;
+
+		s->no_comps = le64_to_cpu(s->no_comps);
+		s->depth = le32_to_cpu(s->depth);
+		s->nofiles = le32_to_cpu(s->nofiles);
+		s->numberio = le64_to_cpu(s->numberio);
+		s->index = le64_to_cpu(s->index);
+
+		for (i = 0; i < s->no_comps; i++) {
+			s->comps[i].fileno = le64_to_cpu(s->comps[i].fileno);
+			s->comps[i].offset = le64_to_cpu(s->comps[i].offset);
+		}
+
+		show_s(s, no_s);
+		no_s++;
+		size -= __thread_io_list_sz(s->depth, s->nofiles);
+		s = (void *) s + __thread_io_list_sz(s->depth, s->nofiles);
+	} while (size != 0);
+}
+
+static void show_verify_state(void *buf, size_t size)
+{
+	struct verify_state_hdr *hdr = buf;
+	struct thread_io_list *s;
+	uint32_t crc;
+
+	hdr->version = le64_to_cpu(hdr->version);
+	hdr->size = le64_to_cpu(hdr->size);
+	hdr->crc = le64_to_cpu(hdr->crc);
+
+	printf("Version:\t0x%x\n", (unsigned int) hdr->version);
+	printf("Size:\t\t%u\n", (unsigned int) hdr->size);
+	printf("CRC:\t\t0x%x\n", (unsigned int) hdr->crc);
+
+	size -= sizeof(*hdr);
+	if (hdr->size != size) {
+		log_err("Size mismatch\n");
+		return;
+	}
+
+	s = buf + sizeof(*hdr);
+	crc = fio_crc32c((unsigned char *) s, hdr->size);
+	if (crc != hdr->crc) {
+		log_err("crc mismatch %x != %x\n", crc, (unsigned int) hdr->crc);
+		return;
+	}
+
+	if (hdr->version == 0x03)
+		show(s, size);
+	else
+		log_err("Unsupported version %d\n", (int) hdr->version);
+}
+
+static int show_file(const char *file)
+{
+	struct stat sb;
+	void *buf;
+	int ret, fd;
+
+	fd = open(file, O_RDONLY);
+	if (fd < 0) {
+		log_err("open %s: %s\n", file, strerror(errno));
+		return 1;
+	}
+
+	if (fstat(fd, &sb) < 0) {
+		log_err("stat: %s\n", strerror(errno));
+		close(fd);
+		return 1;
+	}
+
+	buf = malloc(sb.st_size);
+	ret = read(fd, buf, sb.st_size);
+	if (ret < 0) {
+		log_err("read: %s\n", strerror(errno));
+		close(fd);
+		return 1;
+	} else if (ret != sb.st_size) {
+		log_err("Short read\n");
+		close(fd);
+		return 1;
+	}
+
+	close(fd);
+	show_verify_state(buf, sb.st_size);
+
+	free(buf);
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	int i, ret;
+
+	debug_init();
+
+	if (argc < 2) {
+		log_err("Usage: %s <state file>\n", argv[0]);
+		return 1;
+	}
+
+	ret = 0;
+	for (i = 1; i < argc; i++) {
+		ret = show_file(argv[i]);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
diff --git a/thread_options.h b/thread_options.h
index 611f8e7..d0f3fe9 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -7,6 +7,7 @@
 #include "stat.h"
 #include "gettime.h"
 #include "lib/ieee754.h"
+#include "lib/pattern.h"
 #include "td_error.h"
 
 /*
@@ -18,17 +19,25 @@
 	MEM_SHMHUGE,	/* use shared memory segments with huge pages */
 	MEM_MMAP,	/* use anonymous mmap */
 	MEM_MMAPHUGE,	/* memory mapped huge file */
+	MEM_MMAPSHARED, /* use mmap with shared flag */
+	MEM_CUDA_MALLOC,/* use GPU memory */
 };
 
 #define ERROR_STR_MAX	128
 
 #define BSSPLIT_MAX	64
+#define ZONESPLIT_MAX	64
 
 struct bssplit {
 	uint32_t bs;
 	uint32_t perc;
 };
 
+struct zone_split {
+	uint8_t access_perc;
+	uint8_t size_perc;
+};
+
 #define NR_OPTS_SZ	(FIO_MAX_OPTS / (8 * sizeof(uint64_t)))
 
 #define OPT_MAGIC	0x4f50544e
@@ -38,6 +47,7 @@
 	uint64_t set_options[NR_OPTS_SZ];
 	char *description;
 	char *name;
+	char *wait_for;
 	char *directory;
 	char *filename;
 	char *filename_format;
@@ -53,10 +63,13 @@
 	unsigned int iodepth;
 	unsigned int iodepth_low;
 	unsigned int iodepth_batch;
-	unsigned int iodepth_batch_complete;
+	unsigned int iodepth_batch_complete_min;
+	unsigned int iodepth_batch_complete_max;
+
+	unsigned int unique_filename;
 
 	unsigned long long size;
-	unsigned long long io_limit;
+	unsigned long long io_size;
 	unsigned int size_percent;
 	unsigned int fill_device;
 	unsigned int file_append;
@@ -97,6 +110,8 @@
 	unsigned int verify_offset;
 	char verify_pattern[MAX_PATTERN_SIZE];
 	unsigned int verify_pattern_bytes;
+	struct pattern_fmt verify_fmt[8];
+	unsigned int verify_fmt_sz;
 	unsigned int verify_fatal;
 	unsigned int verify_dump;
 	unsigned int verify_async;
@@ -107,6 +122,7 @@
 	unsigned int verify_state_save;
 	unsigned int use_thread;
 	unsigned int unlink;
+	unsigned int unlink_each_loop;
 	unsigned int do_disk_util;
 	unsigned int override_sync;
 	unsigned int rand_repeatable;
@@ -114,9 +130,13 @@
 	unsigned long long rand_seed;
 	unsigned int dep_use_os_rand;
 	unsigned int log_avg_msec;
+	unsigned int log_hist_msec;
+	unsigned int log_hist_coarseness;
+	unsigned int log_max;
 	unsigned int log_offset;
 	unsigned int log_gz;
 	unsigned int log_gz_store;
+	unsigned int log_unix_epoch;
 	unsigned int norandommap;
 	unsigned int softrandommap;
 	unsigned int bs_unaligned;
@@ -126,9 +146,14 @@
 	unsigned int verify_only;
 
 	unsigned int random_distribution;
+	unsigned int exitall_error;
+
+	struct zone_split *zone_split[DDIR_RWDIR_CNT];
+	unsigned int zone_split_nr[DDIR_RWDIR_CNT];
 
 	fio_fp64_t zipf_theta;
 	fio_fp64_t pareto_h;
+	fio_fp64_t gauss_dev;
 
 	unsigned int random_generator;
 
@@ -146,6 +171,10 @@
 	unsigned long long start_delay_high;
 	unsigned long long timeout;
 	unsigned long long ramp_time;
+	unsigned int ss_state;
+	fio_fp64_t ss_limit;
+	unsigned long long ss_dur;
+	unsigned long long ss_ramp_time;
 	unsigned int overwrite;
 	unsigned int bw_avg_time;
 	unsigned int iops_avg_time;
@@ -164,11 +193,14 @@
 	unsigned int numjobs;
 	os_cpu_mask_t cpumask;
 	os_cpu_mask_t verify_cpumask;
+	os_cpu_mask_t log_gz_cpumask;
 	unsigned int cpus_allowed_policy;
 	char *numa_cpunodes;
 	unsigned short numa_mem_mode;
 	unsigned int numa_mem_prefer_node;
 	char *numa_memnodes;
+	unsigned int gpu_dev_id;
+
 	unsigned int iolog;
 	unsigned int rwmixcycle;
 	unsigned int rwmix[DDIR_RWDIR_CNT];
@@ -177,7 +209,9 @@
 	unsigned int ioprio_class;
 	unsigned int file_service_type;
 	unsigned int group_reporting;
+	unsigned int stats;
 	unsigned int fadvise_hint;
+	unsigned int fadvise_stream;
 	enum fio_fallocate_mode fallocate_mode;
 	unsigned int zero_buffers;
 	unsigned int refill_buffers;
@@ -207,9 +241,16 @@
 
 	char *read_iolog_file;
 	char *write_iolog_file;
+
+	unsigned int write_bw_log;
+	unsigned int write_lat_log;
+	unsigned int write_iops_log;
+	unsigned int write_hist_log;
+
 	char *bw_log_file;
 	char *lat_log_file;
 	char *iops_log_file;
+	char *hist_log_file;
 	char *replay_redirect;
 
 	/*
@@ -218,11 +259,13 @@
 	char *exec_prerun;
 	char *exec_postrun;
 
-	unsigned int rate[DDIR_RWDIR_CNT];
-	unsigned int ratemin[DDIR_RWDIR_CNT];
+	uint64_t rate[DDIR_RWDIR_CNT];
+	uint64_t ratemin[DDIR_RWDIR_CNT];
 	unsigned int ratecycle;
+	unsigned int io_submit_mode;
 	unsigned int rate_iops[DDIR_RWDIR_CNT];
 	unsigned int rate_iops_min[DDIR_RWDIR_CNT];
+	unsigned int rate_process;
 
 	char *ioscheduler;
 
@@ -259,6 +302,17 @@
 	unsigned long long latency_target;
 	unsigned long long latency_window;
 	fio_fp64_t latency_percentile;
+
+	unsigned block_error_hist;
+	unsigned int skip_bad;
+
+	unsigned int replay_align;
+	unsigned int replay_scale;
+
+	unsigned int per_job_logs;
+
+	unsigned int allow_create;
+	unsigned int allow_mounted_write;
 };
 
 #define FIO_TOP_STR_MAX		256
@@ -267,6 +321,7 @@
 	uint64_t set_options[NR_OPTS_SZ];
 	uint8_t description[FIO_TOP_STR_MAX];
 	uint8_t name[FIO_TOP_STR_MAX];
+	uint8_t wait_for[FIO_TOP_STR_MAX];
 	uint8_t directory[FIO_TOP_STR_MAX];
 	uint8_t filename[FIO_TOP_STR_MAX];
 	uint8_t filename_format[FIO_TOP_STR_MAX];
@@ -282,13 +337,15 @@
 	uint32_t iodepth;
 	uint32_t iodepth_low;
 	uint32_t iodepth_batch;
-	uint32_t iodepth_batch_complete;
+	uint32_t iodepth_batch_complete_min;
+	uint32_t iodepth_batch_complete_max;
 
 	uint64_t size;
-	uint64_t io_limit;
+	uint64_t io_size;
 	uint32_t size_percent;
 	uint32_t fill_device;
 	uint32_t file_append;
+	uint32_t unique_filename;
 	uint64_t file_size_low;
 	uint64_t file_size_high;
 	uint64_t start_offset;
@@ -336,6 +393,7 @@
 	uint32_t verify_state_save;
 	uint32_t use_thread;
 	uint32_t unlink;
+	uint32_t unlink_each_loop;
 	uint32_t do_disk_util;
 	uint32_t override_sync;
 	uint32_t rand_repeatable;
@@ -343,9 +401,13 @@
 	uint64_t rand_seed;
 	uint32_t dep_use_os_rand;
 	uint32_t log_avg_msec;
+	uint32_t log_hist_msec;
+	uint32_t log_hist_coarseness;
+	uint32_t log_max;
 	uint32_t log_offset;
 	uint32_t log_gz;
 	uint32_t log_gz_store;
+	uint32_t log_unix_epoch;
 	uint32_t norandommap;
 	uint32_t softrandommap;
 	uint32_t bs_unaligned;
@@ -353,9 +415,14 @@
 	uint32_t bs_is_seq_rand;
 
 	uint32_t random_distribution;
-	uint32_t pad;
+	uint32_t exitall_error;
+
+	struct zone_split zone_split[DDIR_RWDIR_CNT][ZONESPLIT_MAX];
+	uint32_t zone_split_nr[DDIR_RWDIR_CNT];
+
 	fio_fp64_t zipf_theta;
 	fio_fp64_t pareto_h;
+	fio_fp64_t gauss_dev;
 
 	uint32_t random_generator;
 
@@ -373,6 +440,10 @@
 	uint64_t start_delay_high;
 	uint64_t timeout;
 	uint64_t ramp_time;
+	uint64_t ss_dur;
+	uint64_t ss_ramp_time;
+	uint32_t ss_state;
+	fio_fp64_t ss_limit;
 	uint32_t overwrite;
 	uint32_t bw_avg_time;
 	uint32_t iops_avg_time;
@@ -389,8 +460,16 @@
 	uint32_t stonewall;
 	uint32_t new_group;
 	uint32_t numjobs;
+	/*
+	 * We currently can't convert these, so don't enable them
+	 */
+#if 0
 	uint8_t cpumask[FIO_TOP_STR_MAX];
 	uint8_t verify_cpumask[FIO_TOP_STR_MAX];
+	uint8_t log_gz_cpumask[FIO_TOP_STR_MAX];
+#endif
+	uint32_t gpu_dev_id;
+	uint32_t pad;
 	uint32_t cpus_allowed_policy;
 	uint32_t iolog;
 	uint32_t rwmixcycle;
@@ -400,7 +479,9 @@
 	uint32_t ioprio_class;
 	uint32_t file_service_type;
 	uint32_t group_reporting;
+	uint32_t stats;
 	uint32_t fadvise_hint;
+	uint32_t fadvise_stream;
 	uint32_t fallocate_mode;
 	uint32_t zero_buffers;
 	uint32_t refill_buffers;
@@ -426,14 +507,20 @@
 	uint64_t trim_backlog;
 	uint32_t clat_percentiles;
 	uint32_t percentile_precision;
-	uint32_t pad2;
 	fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN];
 
 	uint8_t read_iolog_file[FIO_TOP_STR_MAX];
 	uint8_t write_iolog_file[FIO_TOP_STR_MAX];
+
+	uint32_t write_bw_log;
+	uint32_t write_lat_log;
+	uint32_t write_iops_log;
+	uint32_t write_hist_log;
+
 	uint8_t bw_log_file[FIO_TOP_STR_MAX];
 	uint8_t lat_log_file[FIO_TOP_STR_MAX];
 	uint8_t iops_log_file[FIO_TOP_STR_MAX];
+	uint8_t hist_log_file[FIO_TOP_STR_MAX];
 	uint8_t replay_redirect[FIO_TOP_STR_MAX];
 
 	/*
@@ -442,11 +529,13 @@
 	uint8_t exec_prerun[FIO_TOP_STR_MAX];
 	uint8_t exec_postrun[FIO_TOP_STR_MAX];
 
-	uint32_t rate[DDIR_RWDIR_CNT];
-	uint32_t ratemin[DDIR_RWDIR_CNT];
+	uint64_t rate[DDIR_RWDIR_CNT];
+	uint64_t ratemin[DDIR_RWDIR_CNT];
 	uint32_t ratecycle;
+	uint32_t io_submit_mode;
 	uint32_t rate_iops[DDIR_RWDIR_CNT];
 	uint32_t rate_iops_min[DDIR_RWDIR_CNT];
+	uint32_t rate_process;
 
 	uint8_t ioscheduler[FIO_TOP_STR_MAX];
 
@@ -479,11 +568,22 @@
 	uint64_t number_ios;
 
 	uint32_t sync_file_range;
+	uint32_t pad2;
 
 	uint64_t latency_target;
 	uint64_t latency_window;
-	uint32_t pad3;
 	fio_fp64_t latency_percentile;
+
+	uint32_t block_error_hist;
+	uint32_t skip_bad;
+
+	uint32_t replay_align;
+	uint32_t replay_scale;
+
+	uint32_t per_job_logs;
+
+	uint32_t allow_create;
+	uint32_t allow_mounted_write;
 } __attribute__((packed));
 
 extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top);
diff --git a/time.c b/time.c
index f1833c7..279ee48 100644
--- a/time.c
+++ b/time.c
@@ -6,6 +6,23 @@
 static struct timeval genesis;
 static unsigned long ns_granularity;
 
+void timeval_add_msec(struct timeval *tv, unsigned int msec)
+{
+	unsigned long adj_usec = 1000 * msec;
+
+	tv->tv_usec += adj_usec;
+	if (adj_usec >= 1000000) {
+		unsigned long adj_sec = adj_usec / 1000000;
+
+		tv->tv_usec -= adj_sec * 1000000;
+		tv->tv_sec += adj_sec;
+	}
+	if (tv->tv_usec >= 1000000) {
+		tv->tv_usec -= 1000000;
+		tv->tv_sec++;
+	}
+}
+
 /*
  * busy looping version for the last few usec
  */
@@ -75,27 +92,40 @@
 	return utime_since_now(&genesis);
 }
 
-int in_ramp_time(struct thread_data *td)
+bool in_ramp_time(struct thread_data *td)
 {
 	return td->o.ramp_time && !td->ramp_time_over;
 }
 
-int ramp_time_over(struct thread_data *td)
+static void parent_update_ramp(struct thread_data *td)
+{
+	struct thread_data *parent = td->parent;
+
+	if (!parent || parent->ramp_time_over)
+		return;
+
+	reset_all_stats(parent);
+	parent->ramp_time_over = 1;
+	td_set_runstate(parent, TD_RAMP);
+}
+
+bool ramp_time_over(struct thread_data *td)
 {
 	struct timeval tv;
 
 	if (!td->o.ramp_time || td->ramp_time_over)
-		return 1;
+		return true;
 
 	fio_gettime(&tv, NULL);
 	if (utime_since(&td->epoch, &tv) >= td->o.ramp_time) {
 		td->ramp_time_over = 1;
 		reset_all_stats(td);
 		td_set_runstate(td, TD_RAMP);
-		return 1;
+		parent_update_ramp(td);
+		return true;
 	}
 
-	return 0;
+	return false;
 }
 
 void fio_time_init(void)
@@ -129,6 +159,17 @@
 	fio_gettime(&genesis, NULL);
 }
 
+void set_epoch_time(struct thread_data *td, int log_unix_epoch)
+{
+	fio_gettime(&td->epoch, NULL);
+	if (log_unix_epoch) {
+		struct timeval tv;
+		gettimeofday(&tv, NULL);
+		td->unix_epoch = (unsigned long long)(tv.tv_sec) * 1000 +
+		                 (unsigned long long)(tv.tv_usec) / 1000;
+	}
+}
+
 void fill_start_time(struct timeval *t)
 {
 	memcpy(t, &genesis, sizeof(genesis));
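timeval_add_msec(), added above, normalizes tv_usec twice: once for the whole seconds hidden in the added milliseconds, and once for the carry produced by the original tv_usec. A small Python model of the same arithmetic, treating the timeval as a (sec, usec) pair; the function name matches the C one, but the example figures are made up:

    def timeval_add_msec(sec, usec, msec):
        # Mirrors time.c:timeval_add_msec() above.
        adj_usec = 1000 * msec
        usec += adj_usec
        if adj_usec >= 1000000:          # whole seconds contained in the added msec
            adj_sec = adj_usec // 1000000
            usec -= adj_sec * 1000000
            sec += adj_sec
        if usec >= 1000000:              # carry left over from the original tv_usec
            usec -= 1000000
            sec += 1
        return sec, usec

    # 10.9 s plus 1500 ms ends up as 12.4 s after both normalization steps.
    assert timeval_add_msec(10, 900000, 1500) == (12, 400000)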
diff --git a/tools/fio.service b/tools/fio.service
new file mode 100644
index 0000000..21de0b7
--- /dev/null
+++ b/tools/fio.service
@@ -0,0 +1,10 @@
+[Unit]
+
+Description=flexible I/O tester server
+After=network.target
+
+[Service]
+
+Type=simple
+PIDFile=/run/fio.pid
+ExecStart=/usr/bin/fio --server
diff --git a/tools/fiologparser.py b/tools/fiologparser.py
new file mode 100755
index 0000000..5a95009
--- /dev/null
+++ b/tools/fiologparser.py
@@ -0,0 +1,221 @@
+#!/usr/bin/python
+#
+# fiologparser.py
+#
+# This tool lets you parse multiple fio log files and look at interval
+# statistics even when samples are non-uniform.  For instance:
+#
+# fiologparser.py -s *bw*
+#
+# to see per-interval sums for all bandwidth logs or:
+#
+# fiologparser.py -a *clat*
+#
+# to see per-interval average completion latency.
+
+import argparse
+import math
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-i', '--interval', required=False, type=int, default=1000, help='interval of time in milliseconds.')
+    parser.add_argument('-d', '--divisor', required=False, type=int, default=1, help='divide the results by this value.')
+    parser.add_argument('-f', '--full', dest='full', action='store_true', default=False, help='print full output.')
+    parser.add_argument('-A', '--all', dest='allstats', action='store_true', default=False, 
+                        help='print all stats for each interval.')
+    parser.add_argument('-a', '--average', dest='average', action='store_true', default=False, help='print the average for each interval.')
+    parser.add_argument('-s', '--sum', dest='sum', action='store_true', default=False, help='print the sum for each interval.')
+    parser.add_argument("FILE", help="collectl log output files to parse", nargs="+")
+    args = parser.parse_args()
+
+    return args
+
+def get_ftime(series):
+    ftime = 0
+    for ts in series:
+        if ftime == 0 or ts.last.end < ftime:
+            ftime = ts.last.end
+    return ftime
+
+def print_full(ctx, series):
+    ftime = get_ftime(series)
+    start = 0 
+    end = ctx.interval
+
+    while (start < ftime):
+        end = ftime if ftime < end else end
+        results = [ts.get_value(start, end) for ts in series]
+        print("%s, %s" % (end, ', '.join(["%0.3f" % i for i in results])))
+        start += ctx.interval
+        end += ctx.interval
+
+def print_sums(ctx, series):
+    ftime = get_ftime(series)
+    start = 0
+    end = ctx.interval
+
+    while (start < ftime):
+        end = ftime if ftime < end else end
+        results = [ts.get_value(start, end) for ts in series]
+        print("%s, %0.3f" % (end, sum(results)))
+        start += ctx.interval
+        end += ctx.interval
+
+def print_averages(ctx, series):
+    ftime = get_ftime(series)
+    start = 0
+    end = ctx.interval
+
+    while (start < ftime):
+        end = ftime if ftime < end else end
+        results = [ts.get_value(start, end) for ts in series]
+        print("%s, %0.3f" % (end, float(sum(results))/len(results)))
+        start += ctx.interval
+        end += ctx.interval
+
+# FIXME: this routine is computationally inefficient
+# and has O(N^2) behavior
+# it would be better to make one pass through samples
+# to segment them into a series of time intervals, and
+# then compute stats on each time interval instead.
+# to debug this routine, use
+#   # sort -n -t ',' -k 2 small.log
+# on your input.
+
+def my_extend( vlist, val ):
+    vlist.extend(val)
+    return vlist
+
+array_collapser = lambda vlist, val:  my_extend(vlist, val) 
+
+def print_all_stats(ctx, series):
+    ftime = get_ftime(series)
+    start = 0 
+    end = ctx.interval
+    print('start-time, samples, min, avg, median, 90%, 95%, 99%, max')
+    while (start < ftime):  # for each time interval
+        end = ftime if ftime < end else end
+        sample_arrays = [ s.get_samples(start, end) for s in series ]
+        samplevalue_arrays = []
+        for sample_array in sample_arrays:
+            samplevalue_arrays.append( 
+                [ sample.value for sample in sample_array ] )
+        # collapse list of lists of sample values into list of sample values
+        samplevalues = reduce( array_collapser, samplevalue_arrays, [] )
+        # compute all stats and print them
+        mymin = min(samplevalues)
+        myavg = sum(samplevalues) / float(len(samplevalues))
+        mymedian = median(samplevalues)
+        my90th = percentile(samplevalues, 0.90) 
+        my95th = percentile(samplevalues, 0.95)
+        my99th = percentile(samplevalues, 0.99)
+        mymax = max(samplevalues)
+        print( '%f, %d, %f, %f, %f, %f, %f, %f, %f' % (
+            start, len(samplevalues), 
+            mymin, myavg, mymedian, my90th, my95th, my99th, mymax))
+
+        # advance to next interval
+        start += ctx.interval
+        end += ctx.interval
+
+def median(values):
+    s=sorted(values)
+    return float(s[(len(s)-1)/2]+s[(len(s)/2)])/2
+
+def percentile(values, p):
+    s = sorted(values)
+    k = (len(s)-1) * p
+    f = math.floor(k)
+    c = math.ceil(k)
+    if f == c:
+        return s[int(k)]
+    return (s[int(f)] * (c-k)) + (s[int(c)] * (k-f))
+
+def print_default(ctx, series):
+    ftime = get_ftime(series)
+    start = 0
+    end = ctx.interval
+    averages = []
+    weights = []
+
+    while (start < ftime):
+        end = ftime if ftime < end else end
+        results = [ts.get_value(start, end) for ts in series]
+        averages.append(sum(results)) 
+        weights.append(end-start)
+        start += ctx.interval
+        end += ctx.interval
+
+    total = 0
+    for i in range(0, len(averages)):
+        total += averages[i]*weights[i]
+    print('%0.3f' % (total/sum(weights)))
+ 
+class TimeSeries(object):
+    def __init__(self, ctx, fn):
+        self.ctx = ctx
+        self.last = None 
+        self.samples = []
+        self.read_data(fn)
+
+    def read_data(self, fn):
+        f = open(fn, 'r')
+        p_time = 0
+        for line in f:
+            (time, value, foo, bar) = line.rstrip('\r\n').rsplit(', ')
+            self.add_sample(p_time, int(time), int(value))
+            p_time = int(time)
+ 
+    def add_sample(self, start, end, value):
+        sample = Sample(self.ctx, start, end, value)
+        if not self.last or self.last.end < end:
+            self.last = sample
+        self.samples.append(sample)
+
+    def get_samples(self, start, end):
+        sample_list = []
+        for s in self.samples:
+            if s.start >= start and s.end <= end:
+                sample_list.append(s)
+        return sample_list
+
+    def get_value(self, start, end):
+        value = 0
+        for sample in self.samples:
+            value += sample.get_contribution(start, end)
+        return value
+
+class Sample(object):
+    def __init__(self, ctx, start, end, value):
+       self.ctx = ctx
+       self.start = start
+       self.end = end
+       self.value = value
+
+    def get_contribution(self, start, end):
+       # short circuit if not within the bound
+       if (end < self.start or start > self.end):
+           return 0 
+
+       sbound = self.start if start < self.start else start
+       ebound = self.end if end > self.end else end
+       ratio = float(ebound-sbound) / (end-start) 
+       return self.value*ratio/self.ctx.divisor
+
+
+if __name__ == '__main__':
+    ctx = parse_args()
+    series = []
+    for fn in ctx.FILE:
+       series.append(TimeSeries(ctx, fn)) 
+    if ctx.sum:
+        print_sums(ctx, series)
+    elif ctx.average:
+        print_averages(ctx, series)
+    elif ctx.full:
+        print_full(ctx, series)
+    elif ctx.allstats:
+        print_all_stats(ctx, series)
+    else:
+        print_default(ctx, series)
+
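All of the per-interval reports above funnel through Sample.get_contribution(): a sample's value is scaled by the fraction of the reporting interval it overlaps, so a sample that covers half of an interval contributes half of its value to that interval and nothing outside its own lifetime. A stand-alone sketch of that calculation, assuming divisor=1; the function name and the numbers are illustrative:

    # Same math as Sample.get_contribution() in fiologparser.py, with divisor = 1.
    def contribution(s_start, s_end, value, start, end):
        if end < s_start or start > s_end:
            return 0                     # sample does not touch this interval
        sbound = max(start, s_start)
        ebound = min(end, s_end)
        return value * float(ebound - sbound) / (end - start)

    # A 600-unit sample spanning 500..1500 ms overlaps half of each 1000 ms
    # interval, so it contributes 300 to [0, 1000) and 300 to [1000, 2000).
    assert contribution(500, 1500, 600, 0, 1000) == 300.0
    assert contribution(500, 1500, 600, 1000, 2000) == 300.0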
diff --git a/tools/genfio b/tools/genfio
index 4d32d13..6800452 100755
--- a/tools/genfio
+++ b/tools/genfio
@@ -54,6 +54,8 @@
 					Default is $IODEPTH
 -d disk1[,disk2,disk3,..]	: Run the tests on the selected disks
 					Separated each disk with a comma
+-z filesize                     : Specify the working file size, if you are passing file paths to -d
+                                        Disabled by default
 -r seconds			: Time in seconds per benchmark
 					0 means till the end of the device
 					Default is $RUNTIME seconds
@@ -203,7 +205,7 @@
 }
 
 parse_cmdline() {
-while getopts "hacpsd:b:r:m:x:D:A:B:" opt; do
+while getopts "hacpsd:b:r:m:x:z:D:A:B:" opt; do
   case $opt in
     h)
 	show_help
@@ -260,6 +262,10 @@
     A)
 	echo "exec_postrun=$OPTARG" >> $TEMPLATE
       ;;
+    z)
+	FSIZE=$OPTARG
+	echo "size=$FSIZE" >> $TEMPLATE
+      ;;
     \?)
       echo "Invalid option: -$OPTARG" >&2
       ;;
diff --git a/tools/hist/.gitignore b/tools/hist/.gitignore
new file mode 100644
index 0000000..4f875da
--- /dev/null
+++ b/tools/hist/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+*.ipynb
+.ipynb_checkpoints
diff --git a/tools/hist/fiologparser_hist.py b/tools/hist/fiologparser_hist.py
new file mode 100755
index 0000000..ead5e54
--- /dev/null
+++ b/tools/hist/fiologparser_hist.py
@@ -0,0 +1,388 @@
+#!/usr/bin/env python2.7
+""" 
+    Utility for converting *_clat_hist* files generated by fio into latency statistics.
+    
+    Example usage:
+    
+            $ fiologparser_hist.py *_clat_hist*
+            end-time, samples, min, avg, median, 90%, 95%, 99%, max
+            1000, 15, 192, 1678.107, 1788.859, 1856.076, 1880.040, 1899.208, 1888.000
+            2000, 43, 152, 1642.368, 1714.099, 1816.659, 1845.552, 1888.131, 1888.000
+            4000, 39, 1152, 1546.962, 1545.785, 1627.192, 1640.019, 1691.204, 1744
+            ...
+    
+    @author Karl Cronburg <karl.cronburg@gmail.com>
+"""
+import os
+import sys
+import pandas
+import numpy as np
+
+err = sys.stderr.write
+
+def weighted_percentile(percs, vs, ws):
+    """ Use linear interpolation to calculate the weighted percentile.
+        
+        Value and weight arrays are first sorted by value. The cumulative
+        distribution function (cdf) is then computed, after which np.interp
+        finds the two values closest to our desired weighted percentile(s)
+        and linearly interpolates them.
+        
+        percs  :: List of percentiles we want to calculate
+        vs     :: Array of values we are computing the percentile of
+        ws     :: Array of weights for our corresponding values
+        return :: Array of percentiles
+    """
+    idx = np.argsort(vs)
+    vs, ws = vs[idx], ws[idx] # weights and values sorted by value
+    cdf = 100 * (ws.cumsum() - ws / 2.0) / ws.sum()
+    return np.interp(percs, cdf, vs) # linear interpolation
+
+def weights(start_ts, end_ts, start, end):
+    """ Calculate weights based on fraction of sample falling in the
+        given interval [start,end]. Weights computed using vector / array
+        computation instead of for-loops.
+    
+        Note that samples with zero time length are effectively ignored
+        (we set their weight to zero).
+
+        start_ts :: Array of start times for a set of samples
+        end_ts   :: Array of end times for a set of samples
+        start    :: int
+        end      :: int
+        return   :: Array of weights
+    """
+    sbounds = np.maximum(start_ts, start).astype(float)
+    ebounds = np.minimum(end_ts,   end).astype(float)
+    ws = (ebounds - sbounds) / (end_ts - start_ts)
+    if np.any(np.isnan(ws)):
+      err("WARNING: zero-length sample(s) detected. Log file corrupt"
+          " / bad time values? Ignoring these samples.\n")
+    ws[np.where(np.isnan(ws))] = 0.0;
+    return ws
+
+def weighted_average(vs, ws):
+    return np.sum(vs * ws) / np.sum(ws)
+
+columns = ["end-time", "samples", "min", "avg", "median", "90%", "95%", "99%", "max"]
+percs   = [50, 90, 95, 99]
+
+def fmt_float_list(ctx, num=1):
+  """ Return a comma separated list of float formatters to the required number
+      of decimal places. For instance:
+
+        fmt_float_list(ctx.decimals=4, num=3) == "%.4f, %.4f, %.4f"
+  """
+  return ', '.join(["%%.%df" % ctx.decimals] * num)
+
+# Default values - see beginning of main() for how we detect number columns in
+# the input files:
+__HIST_COLUMNS = 1216
+__NON_HIST_COLUMNS = 3
+__TOTAL_COLUMNS = __HIST_COLUMNS + __NON_HIST_COLUMNS
+    
+def read_chunk(rdr, sz):
+    """ Read the next chunk of size sz from the given reader. """
+    try:
+        """ StopIteration occurs when the pandas reader is empty, and AttributeError
+            occurs if rdr is None due to the file being empty. """
+        new_arr = rdr.read().values
+    except (StopIteration, AttributeError):
+        return None    
+
+    """ Extract array of just the times, and histograms matrix without times column. """
+    times, rws, szs = new_arr[:,0], new_arr[:,1], new_arr[:,2]
+    hists = new_arr[:,__NON_HIST_COLUMNS:]
+    times = times.reshape((len(times),1))
+    arr = np.append(times, hists, axis=1)
+
+    return arr
+
+def get_min(fps, arrs):
+    """ Find the file with the current first row with the smallest start time """
+    return min([fp for fp in fps if not arrs[fp] is None], key=lambda fp: arrs.get(fp)[0][0])
+
+def histogram_generator(ctx, fps, sz):
+    
+    # Create a chunked pandas reader for each of the files:
+    rdrs = {}
+    for fp in fps:
+        try:
+            rdrs[fp] = pandas.read_csv(fp, dtype=int, header=None, chunksize=sz)
+        except ValueError as e:
+            if e.message == 'No columns to parse from file':
+                if ctx.warn: sys.stderr.write("WARNING: Empty input file encountered.\n")
+                rdrs[fp] = None
+            else:
+                raise(e)
+
+    # Initial histograms from disk:
+    arrs = {fp: read_chunk(rdr, sz) for fp,rdr in rdrs.items()}
+    while True:
+
+        try:
+            """ ValueError occurs when nothing more to read """
+            fp = get_min(fps, arrs)
+        except ValueError:
+            return
+        arr = arrs[fp]
+        yield np.insert(arr[0], 1, fps.index(fp))
+        arrs[fp] = arr[1:]
+
+        if arrs[fp].shape[0] == 0:
+            arrs[fp] = read_chunk(rdrs[fp], sz)
+
+def _plat_idx_to_val(idx, edge=0.5, FIO_IO_U_PLAT_BITS=6, FIO_IO_U_PLAT_VAL=64):
+    """ Taken from fio's stat.c for calculating the latency value of a bin
+        from that bin's index.
+        
+            idx  : the value of the index into the histogram bins
+            edge : fractional value in the range [0,1]** indicating how far into
+            the bin we wish to compute the latency value of.
+        
+        ** edge = 0.0 and 1.0 computes the lower and upper latency bounds
+           respectively of the given bin index. """
+
+    # MSB <= (FIO_IO_U_PLAT_BITS-1), cannot be rounded off. Use
+    # all bits of the sample as index
+    if (idx < (FIO_IO_U_PLAT_VAL << 1)):
+        return idx 
+
+    # Find the group and compute the minimum value of that group
+    error_bits = (idx >> FIO_IO_U_PLAT_BITS) - 1 
+    base = 1 << (error_bits + FIO_IO_U_PLAT_BITS)
+
+    # Find its bucket number of the group
+    k = idx % FIO_IO_U_PLAT_VAL
+
+    # Return the mean (if edge=0.5) of the range of the bucket
+    return base + ((k + edge) * (1 << error_bits))
+    
+def plat_idx_to_val_coarse(idx, coarseness, edge=0.5):
+    """ Converts the given *coarse* index into a non-coarse index as used by fio
+        in stat.h:plat_idx_to_val(), subsequently computing the appropriate
+        latency value for that bin.
+        """
+
+    # Multiply the index by the power of 2 coarseness to get the
+    # bin index with a max of 1536 bins (FIO_IO_U_PLAT_GROUP_NR = 24 in stat.h)
+    stride = 1 << coarseness
+    idx = idx * stride
+    lower = _plat_idx_to_val(idx, edge=0.0)
+    upper = _plat_idx_to_val(idx + stride, edge=1.0)
+    return lower + (upper - lower) * edge
+
+def print_all_stats(ctx, end, mn, ss_cnt, vs, ws, mx):
+    ps = weighted_percentile(percs, vs, ws)
+
+    avg = weighted_average(vs, ws)
+    values = [mn, avg] + list(ps) + [mx]
+    row = [end, ss_cnt] + map(lambda x: float(x) / ctx.divisor, values)
+    fmt = "%d, %d, %d, " + fmt_float_list(ctx, 5) + ", %d"
+    print (fmt % tuple(row))
+
+def update_extreme(val, fncn, new_val):
+    """ Calculate min / max in the presence of None values """
+    if val is None: return new_val
+    else: return fncn(val, new_val)
+
+# See beginning of main() for how bin_vals are computed
+bin_vals = []
+lower_bin_vals = [] # lower edge of each bin
+upper_bin_vals = [] # upper edge of each bin 
+
+def process_interval(ctx, samples, iStart, iEnd):
+    """ Construct the weighted histogram for the given interval by scanning
+        through all the histograms and figuring out which of their bins have
+        samples with latencies which overlap with the given interval
+        [iStart,iEnd].
+    """
+    
+    times, files, hists = samples[:,0], samples[:,1], samples[:,2:]
+    iHist = np.zeros(__HIST_COLUMNS)
+    ss_cnt = 0 # number of samples affecting this interval
+    mn_bin_val, mx_bin_val = None, None
+
+    for end_time,file,hist in zip(times,files,hists):
+            
+        # Only look at bins of the current histogram sample which
+        # started before the end of the current time interval [start,end]
+        start_times = (end_time - 0.5 * ctx.interval) - bin_vals / 1000.0
+        idx = np.where(start_times < iEnd)
+        s_ts, l_bvs, u_bvs, hs = start_times[idx], lower_bin_vals[idx], upper_bin_vals[idx], hist[idx]
+
+        # Increment current interval histogram by weighted values of future histogram:
+        ws = hs * weights(s_ts, end_time, iStart, iEnd)
+        iHist[idx] += ws
+    
+        # Update total number of samples affecting current interval histogram:
+        ss_cnt += np.sum(hs)
+        
+        # Update min and max bin values seen if necessary:
+        idx = np.where(hs != 0)[0]
+        if idx.size > 0:
+            mn_bin_val = update_extreme(mn_bin_val, min, l_bvs[max(0,           idx[0]  - 1)])
+            mx_bin_val = update_extreme(mx_bin_val, max, u_bvs[min(len(hs) - 1, idx[-1] + 1)])
+
+    if ss_cnt > 0: print_all_stats(ctx, iEnd, mn_bin_val, ss_cnt, bin_vals, iHist, mx_bin_val)
+
+def guess_max_from_bins(ctx, hist_cols):
+    """ Try to guess the GROUP_NR from given # of histogram
+        columns seen in an input file """
+    max_coarse = 8
+    if ctx.group_nr < 19 or ctx.group_nr > 26:
+        bins = [ctx.group_nr * (1 << 6)]
+    else:
+        bins = [1216,1280,1344,1408,1472,1536,1600,1664]
+    coarses = range(max_coarse + 1)
+    fncn = lambda z: list(map(lambda x: z/2**x if z % 2**x == 0 else -10, coarses))
+    
+    arr = np.transpose(list(map(fncn, bins)))
+    idx = np.where(arr == hist_cols)
+    if len(idx[1]) == 0:
+        table = repr(arr.astype(int)).replace('-10', 'N/A').replace('array','     ')
+        err("Unable to determine bin values from input clat_hist files. Namely \n"
+            "the first line of file '%s' " % ctx.FILE[0] + "has %d \n" % (__TOTAL_COLUMNS,) +
+            "columns of which we assume %d " % (hist_cols,) + "correspond to histogram bins. \n"
+            "This number needs to be equal to one of the following numbers:\n\n"
+            + table + "\n\n"
+            "Possible reasons and corresponding solutions:\n"
+            "  - Input file(s) does not contain histograms.\n"
+            "  - You recompiled fio with a different GROUP_NR. If so please specify this\n"
+            "    new GROUP_NR on the command line with --group_nr\n")
+        exit(1)
+    return bins[idx[1][0]]
+
+def main(ctx):
+
+    if ctx.job_file:
+        try:
+            from configparser import SafeConfigParser, NoOptionError
+        except ImportError:
+            from ConfigParser import SafeConfigParser, NoOptionError
+
+        cp = SafeConfigParser(allow_no_value=True)
+        with open(ctx.job_file, 'r') as fp:
+            cp.readfp(fp)
+
+        if ctx.interval is None:
+            # Auto detect --interval value
+            for s in cp.sections():
+                try:
+                    hist_msec = cp.get(s, 'log_hist_msec')
+                    if hist_msec is not None:
+                        ctx.interval = int(hist_msec)
+                except NoOptionError:
+                    pass
+
+    if ctx.interval is None:
+        ctx.interval = 1000
+
+    # Automatically detect how many columns are in the input files,
+    # calculate the corresponding 'coarseness' parameter used to generate
+    # those files, and calculate the appropriate bin latency values:
+    with open(ctx.FILE[0], 'r') as fp:
+        global bin_vals,lower_bin_vals,upper_bin_vals,__HIST_COLUMNS,__TOTAL_COLUMNS
+        __TOTAL_COLUMNS = len(fp.readline().split(','))
+        __HIST_COLUMNS = __TOTAL_COLUMNS - __NON_HIST_COLUMNS
+
+        max_cols = guess_max_from_bins(ctx, __HIST_COLUMNS)
+        coarseness = int(np.log2(float(max_cols) / __HIST_COLUMNS))
+        bin_vals = np.array(map(lambda x: plat_idx_to_val_coarse(x, coarseness), np.arange(__HIST_COLUMNS)), dtype=float)
+        lower_bin_vals = np.array(map(lambda x: plat_idx_to_val_coarse(x, coarseness, 0.0), np.arange(__HIST_COLUMNS)), dtype=float)
+        upper_bin_vals = np.array(map(lambda x: plat_idx_to_val_coarse(x, coarseness, 1.0), np.arange(__HIST_COLUMNS)), dtype=float)
+
+    fps = [open(f, 'r') for f in ctx.FILE]
+    gen = histogram_generator(ctx, fps, ctx.buff_size)
+
+    print(', '.join(columns))
+
+    try:
+        start, end = 0, ctx.interval
+        arr = np.empty(shape=(0,__TOTAL_COLUMNS - 1))
+        more_data = True
+        while more_data or len(arr) > 0:
+            
+            # Read up to ctx.max_latency (default 20 seconds) of data from end of current interval.
+            while len(arr) == 0 or arr[-1][0] < ctx.max_latency * 1000 + end:
+                try:
+                    new_arr = next(gen)
+                except StopIteration:
+                    more_data = False
+                    break
+                arr = np.append(arr, new_arr.reshape((1,__TOTAL_COLUMNS - 1)), axis=0)
+            arr = arr.astype(int)
+            
+            if arr.size > 0:
+                # Jump immediately to the start of the input, rounding
+                # down to the nearest multiple of the interval (useful when --log_unix_epoch
+                # was used to create these histograms):
+                if start == 0 and arr[0][0] - ctx.max_latency > end:
+                    start = arr[0][0] - ctx.max_latency
+                    start = start - (start % ctx.interval)
+                    end = start + ctx.interval
+
+                process_interval(ctx, arr, start, end)
+                
+                # Update arr to throw away samples we no longer need - samples which
+                # end before the start of the next interval, i.e. the end of the
+                # current interval:
+                idx = np.where(arr[:,0] > end)
+                arr = arr[idx]
+            
+            start += ctx.interval
+            end = start + ctx.interval
+    finally:
+        map(lambda f: f.close(), fps)
+
+
+if __name__ == '__main__':
+    import argparse
+    p = argparse.ArgumentParser()
+    arg = p.add_argument
+    arg("FILE", help='space separated list of latency log filenames', nargs='+')
+    arg('--buff_size',
+        default=10000,
+        type=int,
+        help='number of samples to buffer into numpy at a time')
+
+    arg('--max_latency',
+        default=20,
+        type=float,
+        help='number of seconds of data to process at a time')
+
+    arg('-i', '--interval',
+        type=int,
+        help='interval width (ms), default 1000 ms')
+
+    arg('-d', '--divisor',
+        required=False,
+        type=int,
+        default=1,
+        help='divide the results by this value.')
+
+    arg('--decimals',
+        default=3,
+        type=int,
+        help='number of decimal places to print floats to')
+
+    arg('--warn',
+        dest='warn',
+        action='store_true',
+        default=False,
+        help='print warning messages to stderr')
+
+    arg('--group_nr',
+        default=19,
+        type=int,
+        help='FIO_IO_U_PLAT_GROUP_NR as defined in stat.h')
+
+    arg('--job-file',
+        default=None,
+        type=str,
+        help='Optional argument pointing to the job file used to create the '
+             'given histogram files. Useful for auto-detecting --log_hist_msec and '
+             '--log_unix_epoch (in fio) values.')
+
+    main(p.parse_args())
+
diff --git a/tools/hist/fiologparser_hist.py.1 b/tools/hist/fiologparser_hist.py.1
new file mode 100644
index 0000000..ed22c74
--- /dev/null
+++ b/tools/hist/fiologparser_hist.py.1
@@ -0,0 +1,201 @@
+.TH fiologparser_hist.py 1 "August 18, 2016"
+.SH NAME
+fiologparser_hist.py \- Calculate statistics from fio histograms
+.SH SYNOPSIS
+.B fiologparser_hist.py
+[\fIoptions\fR] [clat_hist_files]...
+.SH DESCRIPTION
+.B fiologparser_hist.py
+is a utility for converting *_clat_hist* files
+generated by fio into a CSV of latency statistics including minimum,
+average, and maximum latency, and the 50th (median), 90th, 95th, and 99th percentiles.
+.SH EXAMPLES
+.PP
+.nf
+$ fiologparser_hist.py *_clat_hist*
+end-time, samples, min, avg, median, 90%, 95%, 99%, max
+1000, 15, 192, 1678.107, 1788.859, 1856.076, 1880.040, 1899.208, 1888.000
+2000, 43, 152, 1642.368, 1714.099, 1816.659, 1845.552, 1888.131, 1888.000
+4000, 39, 1152, 1546.962, 1545.785, 1627.192, 1640.019, 1691.204, 1744
+...
+.fi
+.PP
+
+.SH OPTIONS
+.TP
+.BR \-\-help
+Print these options.
+.TP
+.BR \-\-buff_size \fR=\fPint
+Number of samples to buffer into numpy at a time. Default is 10,000.
+This can be adjusted to help performance.
+.TP
+.BR \-\-max_latency \fR=\fPint
+Number of seconds of data to process at a time. Defaults to 20 seconds,
+in order to handle the 17 second upper bound on latency in histograms
+reported by fio. This should be increased if fio has been
+run with a larger maximum latency. Lowering this when a lower maximum
+latency is known can improve performance. See NOTES for more details.
+.TP
+.BR \-i ", " \-\-interval \fR=\fPint
+Interval at which statistics are reported. Defaults to 1000 ms. This
+should be set to at least the value of \fBlog_hist_msec\fR as given
+to fio.
+.TP
+.BR \-d ", " \-\-divisor \fR=\fPint
+Divide statistics by this value. Defaults to 1. Useful if you want to
+convert latencies from milliseconds to seconds (\fBdivisor\fR=\fP1000\fR).
+.TP
+.BR \-\-warn
+Enables warning messages printed to stderr, useful for debugging.
+.TP
+.BR \-\-group_nr \fR=\fPint
+Set this to the value of \fIFIO_IO_U_PLAT_GROUP_NR\fR as defined in
+\fPstat.h\fR if fio has been recompiled. Defaults to 19, the
+current value used in fio. See NOTES for more details.
+
+.SH NOTES
+end-times are calculated to be uniform increments of the \fB\-\-interval\fR value given,
+regardless of when histogram samples are reported. Of note:
+
+.RS
+Intervals with no samples are omitted. In the example above this means
+"no statistics from 2 to 3 seconds" and "39 samples influenced the statistics
+of the interval from 3 to 4 seconds".
+.LP
+Intervals with a single sample will have the same value for all statistics.
+.RE
+
+.PP
+The number of samples is unweighted, corresponding to the total number of samples
+which have any effect whatsoever on the interval.
+
+Min statistics are computed using the value of the lower boundary of the first bin
+(in increasing bin order) with non-zero samples in it. Similarly for max,
+we take the upper boundary of the last bin with non-zero samples in it.
+This is semantically identical to taking the 0th and 100th percentiles with a
+50% bin-width buffer (because percentiles are computed using mid-points of
+the bins). This enforces the following nice properties:
+
+.RS
+min <= 50th <= 90th <= 95th <= 99th <= max
+.LP
+min and max are strict lower and upper bounds on the actual
+min / max seen by fio (and reported in *_clat.* with averaging turned off).
+.RE
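+
+.PP
+As a rough illustration only (this is a sketch, not the tool's actual code),
+the min / max for a single histogram row could be derived as follows, assuming
+numpy arrays \fIlower_bin_vals\fR / \fIupper_bin_vals\fR of per-bin latency
+boundaries and a \fIhist\fR array of per-bin sample counts:
+
+.RS
+.PP
+.nf
+import numpy as np
+nonzero = np.nonzero(hist)[0]           # indices of bins holding samples
+min_lat = lower_bin_vals[nonzero[0]]    # lower edge of first non-empty bin
+max_lat = upper_bin_vals[nonzero[-1]]   # upper edge of last non-empty bin
+.fi
+.PP
+.RE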
+
+.PP
+Average statistics use a standard weighted arithmetic mean.
+
+Percentile statistics are computed using the weighted percentile method as
+described here: \fIhttps://en.wikipedia.org/wiki/Percentile#Weighted_percentile\fR.
+See the weights() method for details on how weights are computed for individual
+samples. In process_interval() we further multiply by the height of each bin
+to get weighted histograms.
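+
+.PP
+For illustration only, a weighted percentile over bin mid-point values
+\fIvals\fR with weights \fIws\fR can be sketched as below; this follows the
+method described at the link above and is not a copy of the tool's internals:
+
+.RS
+.PP
+.nf
+import numpy as np
+def weighted_percentile(vals, ws, p):
+    # sort the values and carry their weights along
+    order = np.argsort(vals)
+    vals = np.asarray(vals, dtype=float)[order]
+    ws = np.asarray(ws, dtype=float)[order]
+    # cumulative weight mid-points, normalized to (0, 1)
+    cw = (np.cumsum(ws) - 0.5 * ws) / np.sum(ws)
+    return np.interp(p / 100.0, cw, vals)
+
+weighted_percentile([100, 200, 400], [1, 2, 1], 50)   # -> 200.0
+.fi
+.PP
+.RE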
+
+Files given on the command line are assumed to be fio histogram files.
+An individual histogram file can contain the
+histograms for multiple different r/w directions (notably when \fB\-\-rw\fR=\fPrandrw\fR). This
+is accounted for by tracking each r/w direction separately. In the statistics
+reported we ultimately merge *all* histograms (regardless of r/w direction).
+
+The value of *_GROUP_NR in \fIstat.h\fR (and *_BITS) determines how many latency bins
+fio outputs when histogramming is enabled. Namely for the current default of
+GROUP_NR=19, we get 1,216 bins with a maximum latency of approximately 17
+seconds. For certain applications this may not be sufficient. With GROUP_NR=24
+we have 1,536 bins, giving us a maximum latency of 541 seconds (~ 9 minutes). If
+you expect your application to experience latencies greater than 17 seconds,
+you will need to recompile fio with a larger GROUP_NR, e.g. with:
+
+.RS
+.PP
+.nf
+sed -i.bak 's/^#define FIO_IO_U_PLAT_GROUP_NR 19$/#define FIO_IO_U_PLAT_GROUP_NR 24/' stat.h
+make fio
+.fi
+.PP
+.RE
+
+.PP
+Quick reference table for the max latency corresponding to a sampling of
+values for GROUP_NR:
+
+.RS
+.PP
+.nf
+GROUP_NR | # bins | max latency bin value
+19       | 1216   | 16.9 sec
+20       | 1280   | 33.8 sec
+21       | 1344   | 67.6 sec
+22       | 1408   | 2  min, 15 sec
+23       | 1472   | 4  min, 32 sec
+24       | 1536   | 9  min, 4  sec
+25       | 1600   | 18 min, 8  sec
+26       | 1664   | 36 min, 16 sec
+.fi
+.PP
+.RE
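+
+.PP
+The pattern in the table above (64 additional bins and a doubling of the
+maximum latency per GROUP_NR increment) can be extended with a short sketch
+like the following; the 16.9 s base value is simply read off the GROUP_NR=19
+row, so the results are approximate:
+
+.RS
+.PP
+.nf
+for group_nr in range(19, 27):
+    bins = group_nr * 64                   # 19 -> 1216, 20 -> 1280, ...
+    max_lat = 16.9 * 2 ** (group_nr - 19)  # seconds, doubles per step
+    print("%d | %d | %.1f sec" % (group_nr, bins, max_lat))
+.fi
+.PP
+.RE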
+
+.PP
+At present this program automatically detects the number of histogram bins in
+the log files, and adjusts the bin latency values accordingly. In particular if
+you use the \fB\-\-log_hist_coarseness\fR parameter of fio, you get output files with
+a number of bins according to the following table (note that the first
+row matches the \fI# bins\fR column of the table above):
+
+.RS
+.PP
+.nf
+coarse \\ GROUP_NR
+        19     20    21     22     23     24     25     26
+   -------------------------------------------------------
+  0  [[ 1216,  1280,  1344,  1408,  1472,  1536,  1600,  1664],
+  1   [  608,   640,   672,   704,   736,   768,   800,   832],
+  2   [  304,   320,   336,   352,   368,   384,   400,   416],
+  3   [  152,   160,   168,   176,   184,   192,   200,   208],
+  4   [   76,    80,    84,    88,    92,    96,   100,   104],
+  5   [   38,    40,    42,    44,    46,    48,    50,    52],
+  6   [   19,    20,    21,    22,    23,    24,    25,    26],
+  7   [  N/A,    10,   N/A,    11,   N/A,    12,   N/A,    13],
+  8   [  N/A,     5,   N/A,   N/A,   N/A,     6,   N/A,   N/A]]
+.fi
+.PP
+.RE
+
+.PP
+For other values of GROUP_NR and coarseness, this table can be computed like this:
+
+.RS
+.PP
+.nf
+import numpy as np
+bins = [1216,1280,1344,1408,1472,1536,1600,1664]
+max_coarse = 8
+fncn = lambda z: list(map(lambda x: z/2**x if z % 2**x == 0 else np.nan, range(max_coarse + 1)))
+np.transpose(list(map(fncn, bins)))
+.fi
+.PP
+.RE
+
+.PP
+If you have not adjusted GROUP_NR for your (high latency) application, you
+will see the percentiles computed by this tool max out at the maximum latency
+bin value from the first table above, as in this plot (where GROUP_NR=19 and
+thus the red line flattens at a maximum latency of roughly 17 seconds):
+
+.RS
+\fIhttps://www.cronburg.com/fio/max_latency_bin_value_bug.png\fR
+.RE
+
+.PP
+The motivation, design decisions, and implementation process are
+described in further detail here:
+
+.RS
+\fIhttps://www.cronburg.com/fio/cloud-latency-problem-measurement/\fR
+.RE
+
+.SH AUTHOR
+.B fiologparser_hist.py
+and this manual page were written by Karl Cronburg <karl.cronburg@gmail.com>.
+.SH "REPORTING BUGS"
+Report bugs to the \fBfio\fR mailing list <fio@vger.kernel.org>.
diff --git a/tools/hist/half-bins.py b/tools/hist/half-bins.py
new file mode 100755
index 0000000..d592af0
--- /dev/null
+++ b/tools/hist/half-bins.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python2.7
+""" Cut the number bins in half in fio histogram output. Example usage:
+
+        $ half-bins.py -c 2 output_clat_hist.1.log > smaller_clat_hist.1.log
+
+    Which merges e.g. bins [0 .. 3], [4 .. 7], ..., [1212 .. 1215] resulting in
+    304 = 1216 / (2**2) merged bins per histogram sample.
+
+    @author Karl Cronburg <karl.cronburg@gmail.com>
+"""
+import sys
+
+def main(ctx):
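+    # A coarseness of c merges every 2**c consecutive bins: e.g. c=2 gives a stride of 4 (1216 -> 304 bins).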
+    stride = 1 << ctx.coarseness
+    with open(ctx.FILENAME, 'r') as fp:
+        for line in fp.readlines():
+            vals = line.split(', ')
+            sys.stdout.write("%s, %s, %s, " % tuple(vals[:3]))
+
+            hist = list(map(int, vals[3:]))
+            for i in range(0, len(hist) - stride, stride):
+                sys.stdout.write("%d, " % sum(hist[i : i + stride],))
+            sys.stdout.write("%d\n" % sum(hist[len(hist) - stride:]))
+
+if __name__ == '__main__':
+    import argparse
+    p = argparse.ArgumentParser()
+    arg = p.add_argument
+    arg( 'FILENAME', help='clat_hist file for which we will reduce'
+                         ' (by half or more) the number of bins.')
+    arg('-c', '--coarseness',
+       default=1,
+       type=int,
+       help='number of times to reduce number of bins by half, '
+            'e.g. coarseness of 4 merges each 2^4 = 16 consecutive '
+            'bins.')
+    main(p.parse_args())
+
diff --git a/tools/plot/fio2gnuplot b/tools/plot/fio2gnuplot
index 2d64a6e..a703ae3 100755
--- a/tools/plot/fio2gnuplot
+++ b/tools/plot/fio2gnuplot
@@ -31,7 +31,7 @@
 	fio_data_file=[]
 	# For all the local files
 	for file in os.listdir(path):
-	    # If the file math the regexp
+	    # If the file matches the glob
 	    if fnmatch.fnmatch(file, pattern):
 		# Let's consider this file
 		fio_data_file.append(file)
@@ -361,7 +361,7 @@
     print 'fio2gnuplot -ghbiodvk -t <title> -o <outputfile> -p <pattern> -G <type> -m <time> -M <time>'
     print
     print '-h --help                           : Print this help'
-    print '-p <pattern> or --pattern <pattern> : A pattern in regexp to select fio input files'
+    print '-p <pattern> or --pattern <pattern> : A glob pattern to select fio input files'
     print '-b           or --bandwidth         : A predefined pattern for selecting *_bw.log files'
     print '-i           or --iops              : A predefined pattern for selecting *_iops.log files'
     print '-g           or --gnuplot           : Render gnuplot traces before exiting'
@@ -458,7 +458,15 @@
     fio_data_file=find_file('.',pattern)
     if len(fio_data_file) == 0:
 	    print "No log file found with pattern %s!" % pattern
-	    sys.exit(1)
+	    # Try the per-job log file name format used when per_job_logs=1
+            if (pattern == '*_bw.log'):
+                fio_data_file=find_file('.','*_bw.*.log')
+            if (pattern == '*_iops.log'):
+                fio_data_file=find_file('.','*_iops.*.log')
+            if len(fio_data_file) == 0:
+                sys.exit(1)
+            else:
+                print "Using log file per job format instead"
     else:
 	    print "%d files Selected with pattern '%s'" % (len(fio_data_file), pattern)
 
@@ -479,7 +487,7 @@
     #We need to adjust the output filename regarding the pattern required by the user
     if (pattern_set_by_user == True):
 	    gnuplot_output_filename=pattern
-	    # As we do have some regexp in the pattern, let's make this simpliest
+	    # As the pattern may contain glob characters, let's keep this simple
 	    # We do remove the simpliest parts of the expression to get a clear file name
 	    gnuplot_output_filename=gnuplot_output_filename.replace('-*-','-')
 	    gnuplot_output_filename=gnuplot_output_filename.replace('*','-')
@@ -488,6 +496,8 @@
 	    # Insure that we don't have any starting or trailing dash to the filename
 	    gnuplot_output_filename = gnuplot_output_filename[:-1] if gnuplot_output_filename.endswith('-') else gnuplot_output_filename
 	    gnuplot_output_filename = gnuplot_output_filename[1:] if gnuplot_output_filename.startswith('-') else gnuplot_output_filename
+	    if (gnuplot_output_filename == ''):
+            	gnuplot_output_filename='default'	
 
     if parse_global==True:
 	parse_global_files(fio_data_file, global_search)
diff --git a/tools/plot/graph2D.gpm b/tools/plot/graph2D.gpm
index 5cd6ff3..769b754 100644
--- a/tools/plot/graph2D.gpm
+++ b/tools/plot/graph2D.gpm
@@ -1,9 +1,30 @@
 # This Gnuplot file has been generated by eNovance
 
-set title '$0'
+needed_args = 8
+if (exists("ARGC") && ARGC >= needed_args) \
+	found_args = 1; \
+else if (strlen("$$#") < 3 && "$#" >= needed_args) \
+	found_args = 1; \
+	ARG1 = "$0"; \
+	ARG2 = "$1"; \
+	ARG3 = "$2"; \
+	ARG4 = "$3"; \
+	ARG5 = "$4"; \
+	ARG6 = "$5"; \
+	ARG7 = "$6"; \
+	ARG8 = "$7"; \
+else \
+	found_args = 0; \
+	print "Aborting: could not find all arguments"; \
+	exit
+
+avg_num = ARG8 + 0
+avg_str = sprintf("%g", avg_num)
+
+set title ARG1
 
 set terminal png size 1280,1024
-set output '$3.png'
+set output ARG4 . '.png'
 #set terminal x11
 
 #Preparing Axes
@@ -12,7 +33,7 @@
 #set data style lines
 set key top left reverse
 set xlabel "Time (Seconds)"
-set ylabel '$4'
+set ylabel ARG5
 set xrange [0:]
 set yrange [0:]
 
@@ -22,13 +43,13 @@
 set style line 100 lt 7 lw 0.5
 set style line 1 lt 1 lw 3 pt 3 linecolor rgb "green"
 
-plot '$1' using 2:3 with linespoints title '$2', $7 w l ls 1 ti 'Global average value ($7)'
+plot ARG2 using 2:3 with linespoints title ARG3, avg_num w l ls 1 ti 'Global average value (' . avg_str . ')'
 
-set output '$5.png'
-plot '$1' using 2:3 smooth csplines title '$2', $7 w l ls 1 ti 'Global average value ($7)'
+set output ARG6 . '.png'
+plot ARG2 using 2:3 smooth csplines title ARG3, avg_num w l ls 1 ti 'Global average value (' . avg_str . ')'
 
-set output '$6.png'
-plot '$1' using 2:3 smooth bezier title '$2', $7 w l ls 1 ti 'Global average value ($7)'
+set output ARG7 . '.png'
+plot ARG2 using 2:3 smooth bezier title ARG3, avg_num w l ls 1 ti 'Global average value (' . avg_str .')'
 
 #pause -1
 #The End
diff --git a/tools/plot/graph3D.gpm b/tools/plot/graph3D.gpm
index 93f7a4d..ac2cdf6 100644
--- a/tools/plot/graph3D.gpm
+++ b/tools/plot/graph3D.gpm
@@ -1,9 +1,24 @@
 # This Gnuplot file has been generated by eNovance
 
-set title '$0'
+needed_args = 5
+if (exists("ARGC") && ARGC >= needed_args) \
+	found_args = 1; \
+else if (strlen("$$#") < 3 && "$#" >= needed_args) \
+	found_args = 1; \
+	ARG1 = "$0"; \
+	ARG2 = "$1"; \
+	ARG3 = "$2"; \
+	ARG4 = "$3"; \
+	ARG5 = "$4"; \
+else \
+	found_args = 0; \
+	print "Aborting: could not find all arguments"; \
+	exit
+
+set title ARG1
 
 set terminal png size 1280,1024
-set output '$3.png'
+set output ARG4 . '.png'
 #set terminal x11
 #3D Config
 set isosamples 30
@@ -19,7 +34,7 @@
 set key top left reverse
 set ylabel "Disk"
 set xlabel "Time (Seconds)"
-set zlabel '$4'
+set zlabel ARG5
 set cbrange [0:]
 set zrange [0:]
 
@@ -35,7 +50,7 @@
 set size 0.5,0.5
 set view 64,216
 set origin 0,0.5
-splot '$1' using 2:1:3 with linespoints title '$2'
+splot ARG2 using 2:1:3 with linespoints title ARG3
 
 #Top Right View
 set size 0.5,0.5
@@ -43,7 +58,7 @@
 set view 90,0
 set pm3d at s solid hidden3d 100 scansbackward
 set pm3d depthorder
-splot '$1' using 2:1:3 with linespoints title '$2'
+splot ARG2 using 2:1:3 with linespoints title ARG3
 
 #Bottom Right View
 set size 0.5,0.5
@@ -51,13 +66,13 @@
 set view 63,161
 set pm3d at s solid hidden3d 100 scansbackward
 set pm3d depthorder
-splot '$1' using 2:1:3 with linespoints title '$2'
+splot ARG2 using 2:1:3 with linespoints title ARG3
 
 #Bottom Left View
 set size 0.5,0.5
 set origin 0,0
 set pm3d map
-splot '$1' using 2:1:3 with linespoints title '$2'
+splot ARG2 using 2:1:3 with linespoints title ARG3
 
 #Unsetting multiplotting
 unset multiplot
@@ -66,7 +81,7 @@
 #Preparing 3D Interactive view
 set mouse
 set terminal png size 1024,768
-set output '$3-3D.png'
+set output ARG4 . '-3D.png'
 
 #set term x11
 set view 64,216
@@ -74,7 +89,7 @@
 set size 1,1
 set pm3d at bs solid hidden3d 100 scansbackward
 set pm3d depthorder
-splot '$1' using 2:1:3 with linespoints title '$2'
+splot ARG2 using 2:1:3 with linespoints title ARG3
 
 #pause -1
 #The End
diff --git a/tools/plot/math.gpm b/tools/plot/math.gpm
index a01f5a0..0a2aff5 100644
--- a/tools/plot/math.gpm
+++ b/tools/plot/math.gpm
@@ -1,15 +1,32 @@
 # This Gnuplot file has been generated by eNovance
+if (exists("ARGC") && ARGC > 5) \
+	found_args = 1; \
+else if (strlen("$$#") < 3 && "$#" > 5) \
+	found_args = 1; \
+	ARG1 = "$0"; \
+	ARG2 = "$1"; \
+	ARG3 = "$2"; \
+	ARG4 = "$3"; \
+	ARG5 = "$4"; \
+	ARG6 = "$5"; \
+else \
+	found_args = 0; \
+	print "Aborting: could not find all arguments"; \
+	exit
 
-set title '$0'
+avg_num = ARG6 + 0
+avg_str = sprintf("%g", avg_num)
+
+set title ARG1
 
 set terminal png size 1280,1024
-set output '$3.png'
+set output ARG4 . '.png'
 
 set palette rgbformulae 7,5,15
 set style line 100 lt 7 lw 0.5
 set style fill transparent solid 0.9 noborder
 set auto x
-set ylabel '$4'
+set ylabel ARG5
 set xlabel "Disk"
 set yrange [0:]
 set style data histogram
@@ -22,4 +39,4 @@
 set xtic rotate by 45 scale 0 font ",8" autojustify
 set xtics offset 0,-1 border -5,1,5
 set style line 1 lt 1 lw 3 pt 3 linecolor rgb "green"
-plot '$1' using 2:xtic(1) ti col, $5 w l ls 1 ti 'Global average value ($5)'
+plot ARG2 using 2:xtic(1) ti col, avg_num w l ls 1 ti 'Global average value (' . avg_str . ')'
diff --git a/trim.c b/trim.c
index 95c433b..78cf672 100644
--- a/trim.c
+++ b/trim.c
@@ -11,7 +11,7 @@
 #include "trim.h"
 
 #ifdef FIO_HAVE_TRIM
-int get_next_trim(struct thread_data *td, struct io_u *io_u)
+bool get_next_trim(struct thread_data *td, struct io_u *io_u)
 {
 	struct io_piece *ipo;
 
@@ -19,9 +19,9 @@
 	 * this io_u is from a requeue, we already filled the offsets
 	 */
 	if (io_u->file)
-		return 0;
+		return true;
 	if (flist_empty(&td->trim_list))
-		return 1;
+		return false;
 
 	assert(td->trim_entries);
 	ipo = flist_first_entry(&td->trim_list, struct io_piece, trim_list);
@@ -53,7 +53,7 @@
 		if (r) {
 			dprint(FD_VERIFY, "failed file %s open\n",
 					io_u->file->file_name);
-			return 1;
+			return false;
 		}
 	}
 
@@ -64,19 +64,21 @@
 	io_u->xfer_buflen = io_u->buflen;
 
 	dprint(FD_VERIFY, "get_next_trim: ret io_u %p\n", io_u);
-	return 0;
+	return true;
 }
 
-int io_u_should_trim(struct thread_data *td, struct io_u *io_u)
+bool io_u_should_trim(struct thread_data *td, struct io_u *io_u)
 {
 	unsigned long long val;
+	uint64_t frand_max;
 	unsigned long r;
 
 	if (!td->o.trim_percentage)
-		return 0;
+		return false;
 
+	frand_max = rand_max(&td->trim_state);
 	r = __rand(&td->trim_state);
-	val = (FRAND_MAX / 100ULL);
+	val = (frand_max / 100ULL);
 
 	val *= (unsigned long long) td->o.trim_percentage;
 	return r <= val;
diff --git a/trim.h b/trim.h
index 6584606..37f5d7c 100644
--- a/trim.h
+++ b/trim.h
@@ -4,8 +4,8 @@
 #include "fio.h"
 
 #ifdef FIO_HAVE_TRIM
-extern int __must_check get_next_trim(struct thread_data *td, struct io_u *io_u);
-extern int io_u_should_trim(struct thread_data *td, struct io_u *io_u);
+extern bool __must_check get_next_trim(struct thread_data *td, struct io_u *io_u);
+extern bool io_u_should_trim(struct thread_data *td, struct io_u *io_u);
 
 /*
  * Determine whether a given io_u should be logged for verify or
@@ -20,13 +20,13 @@
 }
 
 #else
-static inline int get_next_trim(struct thread_data *td, struct io_u *io_u)
+static inline bool get_next_trim(struct thread_data *td, struct io_u *io_u)
 {
-	return 1;
+	return false;
 }
-static inline int io_u_should_trim(struct thread_data *td, struct io_u *io_u)
+static inline bool io_u_should_trim(struct thread_data *td, struct io_u *io_u)
 {
-	return 0;
+	return false;
 }
 static inline void remove_trim_entry(struct thread_data *td, struct io_piece *ipo)
 {
diff --git a/unit_tests/steadystate_tests.py b/unit_tests/steadystate_tests.py
new file mode 100755
index 0000000..91c79a4
--- /dev/null
+++ b/unit_tests/steadystate_tests.py
@@ -0,0 +1,222 @@
+#!/usr/bin/python
+#
+# steadystate_tests.py
+#
+# Test option parsing and functionality for fio's steady state detection feature.
+#
+# steadystate_tests.py ./fio file-for-read-testing file-for-write-testing
+#
+# REQUIREMENTS
+# Python 2.6+
+# SciPy
+#
+# KNOWN ISSUES
+# only option parsing and read tests are carried out
+# On Windows this script works under Cygwin but not from cmd.exe
+# On Windows I encounter frequent fio problems generating JSON output (nothing to decode)
+# min runtime:
+# if ss attained: min runtime = ss_dur + ss_ramp
+# if not attained: runtime = timeout
+
+import os
+import sys
+import json
+import uuid
+import pprint
+import argparse
+import subprocess
+from scipy import stats
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('fio',
+                        help='path to fio executable')
+    parser.add_argument('--read',
+                        help='target for read testing')
+    parser.add_argument('--write',
+                        help='target for write testing')
+    args = parser.parse_args()
+
+    return args
+
+
+def check(data, iops, slope, pct, limit, dur, criterion):
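+    # Recompute fio's steady state criterion from the raw data series and
+    # return: (recomputed criterion matches fio's to within 0.5%, criterion is
+    # below the limit, mean of the data, recomputed criterion).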
+    measurement = 'iops' if iops else 'bw'
+    data = data[measurement]
+    mean = sum(data) / float(len(data))  # avoid integer division under Python 2
+    if slope:
+        x = range(len(data))
+        m, intercept, r_value, p_value, std_err = stats.linregress(x,data)
+        m = abs(m)
+        if pct:
+            target = m / mean * 100
+            criterion = criterion[:-1]
+        else:
+            target = m
+    else:
+        maxdev = 0
+        for x in data:
+            maxdev = max(abs(mean-x), maxdev)
+        if pct:
+            target = maxdev / mean * 100
+            criterion = criterion[:-1]
+        else:
+            target = maxdev
+
+    criterion = float(criterion)
+    return (abs(target - criterion) / criterion < 0.005), target < limit, mean, target
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    pp = pprint.PrettyPrinter(indent=4)
+
+#
+# test option parsing
+#
+    parsing = [ { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=iops:10", "--ss_ramp=5"],
+                  'output': "set steady state IOPS threshold to 10.000000" },
+                { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=iops:10%", "--ss_ramp=5"],
+                  'output': "set steady state threshold to 10.000000%" },
+                { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=iops:.1%", "--ss_ramp=5"],
+                  'output': "set steady state threshold to 0.100000%" },
+                { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=bw:10%", "--ss_ramp=5"],
+                  'output': "set steady state threshold to 10.000000%" },
+                { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=bw:.1%", "--ss_ramp=5"],
+                  'output': "set steady state threshold to 0.100000%" },
+                { 'args': ["--parse-only", "--debug=parse", "--ss_dur=10s", "--ss=bw:12", "--ss_ramp=5"],
+                  'output': "set steady state BW threshold to 12" },
+              ]
+    for test in parsing:
+        output = subprocess.check_output([args.fio] + test['args'])
+        if test['output'] in output:
+            print "PASSED '{0}' found with arguments {1}".format(test['output'], test['args'])
+        else:
+            print "FAILED '{0}' NOT found with arguments {1}".format(test['output'], test['args'])
+
+#
+# test some read workloads
+#
+# if ss active and attained,
+#   check that runtime is less than job time
+#   check criteria
+#   how to check ramp time?
+#
+# if ss inactive
+#   check that runtime is what was specified
+#
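+# Each entry below describes one job: 's' toggles steady state detection; when
+# True, iops/slope/ss_limit/pct together with ss_dur and ss_ramp describe the
+# --ss, --ss_dur and --ss_ramp arguments constructed further down.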
+    reads = [ {'s': True, 'timeout': 100, 'numjobs': 1, 'ss_dur': 5, 'ss_ramp': 3, 'iops': True, 'slope': True, 'ss_limit': 0.1, 'pct': True},
+              {'s': False, 'timeout': 20, 'numjobs': 2},
+              {'s': True, 'timeout': 100, 'numjobs': 3, 'ss_dur': 10, 'ss_ramp': 5, 'iops': False, 'slope': True, 'ss_limit': 0.1, 'pct': True},
+              {'s': True, 'timeout': 10, 'numjobs': 3, 'ss_dur': 10, 'ss_ramp': 500, 'iops': False, 'slope': True, 'ss_limit': 0.1, 'pct': True},
+            ]
+
+    if args.read is None:
+        if os.name == 'posix':
+            args.read = '/dev/zero'
+            extra = [ "--size=134217728" ]  # 128 MiB
+        else:
+            print "ERROR: file for read testing must be specified on non-posix systems"
+            sys.exit(1)
+    else:
+        extra = []
+
+    jobnum = 0
+    for job in reads:
+
+        tf = uuid.uuid4().hex
+        parameters = [ "--name=job{0}".format(jobnum) ]
+        parameters.extend(extra)
+        parameters.extend([ "--thread",
+                            "--output-format=json",
+                            "--output={0}".format(tf),
+                            "--filename={0}".format(args.read),
+                            "--rw=randrw",
+                            "--rwmixread=100",
+                            "--stonewall",
+                            "--group_reporting",
+                            "--numjobs={0}".format(job['numjobs']),
+                            "--time_based",
+                            "--runtime={0}".format(job['timeout']) ])
+        if job['s']:
+            if job['iops']:
+                ss = 'iops'
+            else:
+                ss = 'bw'
+            if job['slope']:
+                ss += "_slope"
+            ss += ":" + str(job['ss_limit'])
+            if job['pct']:
+                ss += '%'
+            parameters.extend([ '--ss_dur={0}'.format(job['ss_dur']),
+                                '--ss={0}'.format(ss),
+                                '--ss_ramp={0}'.format(job['ss_ramp']) ])
+
+        output = subprocess.call([args.fio] + parameters)
+        with open(tf, 'r') as source:
+            jsondata = json.loads(source.read())
+        os.remove(tf)
+
+        for jsonjob in jsondata['jobs']:
+            line = "job {0}".format(jsonjob['job options']['name'])
+            if job['s']:
+                if jsonjob['steadystate']['attained'] == 1:
+                    # check runtime >= ss_dur + ss_ramp, check criterion, check criterion < limit
+                    mintime = (job['ss_dur'] + job['ss_ramp']) * 1000
+                    actual = jsonjob['read']['runtime']
+                    if mintime > actual:
+                        line = 'FAILED ' + line + ' ss attained, runtime {0} < ss_dur {1} + ss_ramp {2}'.format(actual, job['ss_dur'], job['ss_ramp'])
+                    else:
+                        line = line + ' ss attained, runtime {0} > ss_dur {1} + ss_ramp {2},'.format(actual, job['ss_dur'], job['ss_ramp'])
+                        objsame, met, mean, target = check(data=jsonjob['steadystate']['data'],
+                            iops=job['iops'],
+                            slope=job['slope'],
+                            pct=job['pct'],
+                            limit=job['ss_limit'],
+                            dur=job['ss_dur'],
+                            criterion=jsonjob['steadystate']['criterion'])
+                        if not objsame:
+                            line = 'FAILED ' + line + ' fio criterion {0} != calculated criterion {1} '.format(jsonjob['steadystate']['criterion'], target)
+                        else:
+                            if met:
+                                line = 'PASSED ' + line + ' target {0} < limit {1}'.format(target, job['ss_limit'])
+                            else:
+                                line = 'FAILED ' + line + ' target {0} < limit {1} but fio reports ss not attained '.format(target, job['ss_limit'])
+                else:
+                    # check runtime, confirm criterion calculation, and confirm that criterion was not met
+                    expected = job['timeout'] * 1000
+                    actual = jsonjob['read']['runtime']
+                    if abs(expected - actual) > 10:
+                        line = 'FAILED ' + line + ' ss not attained, expected runtime {0} != actual runtime {1}'.format(expected, actual)
+                    else:
+                        line = line + ' ss not attained, runtime {0} != ss_dur {1} + ss_ramp {2},'.format(actual, job['ss_dur'], job['ss_ramp'])
+                        objsame, met, mean, target = check(data=jsonjob['steadystate']['data'],
+                            iops=job['iops'],
+                            slope=job['slope'],
+                            pct=job['pct'],
+                            limit=job['ss_limit'],
+                            dur=job['ss_dur'],
+                            criterion=jsonjob['steadystate']['criterion'])
+                        if not objsame:
+                            if actual > (job['ss_dur'] + job['ss_ramp'])*1000:
+                                line = 'FAILED ' + line + ' fio criterion {0} != calculated criterion {1} '.format(jsonjob['steadystate']['criterion'], target)
+                            else:
+                                line = 'PASSED ' + line + ' fio criterion {0} == 0.0 since ss_dur + ss_ramp has not elapsed '.format(jsonjob['steadystate']['criterion'])
+                        else:
+                            if met:
+                                line = 'FAILED ' + line + ' target {0} < threshold {1} but fio reports ss not attained '.format(target, job['ss_limit'])
+                            else:
+                                line = 'PASSED ' + line + ' criterion {0} > threshold {1}'.format(target, job['ss_limit'])
+            else:
+                expected = job['timeout'] * 1000
+                actual = jsonjob['read']['runtime']
+                if abs(expected - actual) < 10:
+                    result = 'PASSED '
+                else:
+                    result = 'FAILED '
+                line = result + line + ' no ss, expected runtime {0} ~= actual runtime {1}'.format(expected, actual)
+            print line
+            if 'steadystate' in jsonjob:
+                pp.pprint(jsonjob['steadystate'])
+        jobnum += 1
diff --git a/verify-state.h b/verify-state.h
new file mode 100644
index 0000000..e46265e
--- /dev/null
+++ b/verify-state.h
@@ -0,0 +1,108 @@
+#ifndef FIO_VERIFY_STATE_H
+#define FIO_VERIFY_STATE_H
+
+#include <stdint.h>
+#include <string.h>
+#include <limits.h>
+
+struct thread_rand32_state {
+	uint32_t s[4];
+};
+
+struct thread_rand64_state {
+	uint64_t s[6];
+};
+
+struct thread_rand_state {
+	uint64_t use64;
+	union {
+		struct thread_rand32_state state32;
+		struct thread_rand64_state state64;
+	};
+};
+
+/*
+ * For dumping current write state
+ */
+struct file_comp {
+	uint64_t fileno;
+	uint64_t offset;
+};
+
+struct thread_io_list {
+	uint64_t no_comps;
+	uint32_t depth;
+	uint32_t nofiles;
+	uint64_t numberio;
+	uint64_t index;
+	struct thread_rand_state rand;
+	uint8_t name[64];
+	struct file_comp comps[0];
+};
+
+struct all_io_list {
+	uint64_t threads;
+	struct thread_io_list state[0];
+};
+
+#define VSTATE_HDR_VERSION	0x03
+
+struct verify_state_hdr {
+	uint64_t version;
+	uint64_t size;
+	uint64_t crc;
+};
+
+#define IO_LIST_ALL		0xffffffff
+
+struct io_u;
+extern struct all_io_list *get_all_io_list(int, size_t *);
+extern void __verify_save_state(struct all_io_list *, const char *);
+extern void verify_save_state(int mask);
+extern int verify_load_state(struct thread_data *, const char *);
+extern void verify_free_state(struct thread_data *);
+extern int verify_state_should_stop(struct thread_data *, struct io_u *);
+extern void verify_assign_state(struct thread_data *, void *);
+extern int verify_state_hdr(struct verify_state_hdr *, struct thread_io_list *);
+
+static inline size_t __thread_io_list_sz(uint32_t depth, uint32_t nofiles)
+{
+	return sizeof(struct thread_io_list) + depth * nofiles * sizeof(struct file_comp);
+}
+
+static inline size_t thread_io_list_sz(struct thread_io_list *s)
+{
+	return __thread_io_list_sz(le32_to_cpu(s->depth), le32_to_cpu(s->nofiles));
+}
+
+static inline struct thread_io_list *io_list_next(struct thread_io_list *s)
+{
+	return (void *) s + thread_io_list_sz(s);
+}
+
+static inline void verify_state_gen_name(char *out, size_t size,
+					 const char *name, const char *prefix,
+					 int num)
+{
+	char ename[PATH_MAX];
+	char *ptr;
+
+	/*
+	 * Escape '/', just turn them into '.'
+	 */
+	ptr = ename;
+	do {
+		*ptr = *name;
+		if (*ptr == '\0')
+			break;
+		else if (*ptr == '/')
+			*ptr = '.';
+		ptr++;
+		name++;
+	} while (1);
+
+	snprintf(out, size, "%s-%s-%d-verify.state", prefix, ename, num);
+	out[size - 1] = '\0';
+}
+
+#endif
diff --git a/verify.c b/verify.c
index b6793d7..cadfe9c 100644
--- a/verify.c
+++ b/verify.c
@@ -13,6 +13,7 @@
 #include "trim.h"
 #include "lib/rand.h"
 #include "lib/hweight.h"
+#include "lib/pattern.h"
 
 #include "crc/md5.h"
 #include "crc/crc64.h"
@@ -24,23 +25,31 @@
 #include "crc/sha512.h"
 #include "crc/sha1.h"
 #include "crc/xxhash.h"
+#include "crc/sha3.h"
 
 static void populate_hdr(struct thread_data *td, struct io_u *io_u,
 			 struct verify_header *hdr, unsigned int header_num,
 			 unsigned int header_len);
+static void fill_hdr(struct thread_data *td, struct io_u *io_u,
+		     struct verify_header *hdr, unsigned int header_num,
+		     unsigned int header_len, uint64_t rand_seed);
+static void __fill_hdr(struct thread_data *td, struct io_u *io_u,
+		       struct verify_header *hdr, unsigned int header_num,
+		       unsigned int header_len, uint64_t rand_seed);
 
 void fill_buffer_pattern(struct thread_data *td, void *p, unsigned int len)
 {
-	fill_pattern(p, len, td->o.buffer_pattern, td->o.buffer_pattern_bytes);
+	(void)cpy_pattern(td->o.buffer_pattern, td->o.buffer_pattern_bytes, p, len);
 }
 
-void __fill_buffer(struct thread_options *o, unsigned long seed, void *p,
-		   unsigned int len)
+static void __fill_buffer(struct thread_options *o, unsigned long seed, void *p,
+			  unsigned int len)
 {
 	__fill_random_buf_percentage(seed, p, o->compress_percentage, len, len, o->buffer_pattern, o->buffer_pattern_bytes);
 }
 
-unsigned long fill_buffer(struct thread_data *td, void *p, unsigned int len)
+static unsigned long fill_buffer(struct thread_data *td, void *p,
+				 unsigned int len)
 {
 	struct frand_state *fs = &td->verify_state;
 	struct thread_options *o = &td->o;
@@ -63,13 +72,17 @@
 		return;
 	}
 
-	if (io_u->buf_filled_len >= len) {
+	/* Skip the refill if the buffer was already filled and there is
+	 * no format that needs to be patched into the pattern */
+	if (!td->o.verify_fmt_sz && io_u->buf_filled_len >= len) {
 		dprint(FD_VERIFY, "using already filled verify pattern b=%d len=%u\n",
 			o->verify_pattern_bytes, len);
 		return;
 	}
 
-	fill_pattern(p, len, o->verify_pattern, o->verify_pattern_bytes);
+	(void)paste_format(td->o.verify_pattern, td->o.verify_pattern_bytes,
+			   td->o.verify_fmt, td->o.verify_fmt_sz,
+			   p, len, io_u);
 	io_u->buf_filled_len = len;
 }
 
@@ -132,7 +145,9 @@
 
 	switch (verify_type) {
 	case VERIFY_NONE:
+	case VERIFY_HDR_ONLY:
 	case VERIFY_NULL:
+	case VERIFY_PATTERN:
 		len = 0;
 		break;
 	case VERIFY_MD5:
@@ -158,18 +173,26 @@
 	case VERIFY_SHA512:
 		len = sizeof(struct vhdr_sha512);
 		break;
+	case VERIFY_SHA3_224:
+		len = sizeof(struct vhdr_sha3_224);
+		break;
+	case VERIFY_SHA3_256:
+		len = sizeof(struct vhdr_sha3_256);
+		break;
+	case VERIFY_SHA3_384:
+		len = sizeof(struct vhdr_sha3_384);
+		break;
+	case VERIFY_SHA3_512:
+		len = sizeof(struct vhdr_sha3_512);
+		break;
 	case VERIFY_XXHASH:
 		len = sizeof(struct vhdr_xxhash);
 		break;
-	case VERIFY_META:
-		len = sizeof(struct vhdr_meta);
-		break;
 	case VERIFY_SHA1:
 		len = sizeof(struct vhdr_sha1);
 		break;
-	case VERIFY_PATTERN:
-		len = 0;
-		break;
+	case VERIFY_PATTERN_NO_HDR:
+		return 0;
 	default:
 		log_err("fio: unknown verify header!\n");
 		assert(0);
@@ -178,8 +201,12 @@
 	return len + sizeof(struct verify_header);
 }
 
-static inline unsigned int hdr_size(struct verify_header *hdr)
+static inline unsigned int hdr_size(struct thread_data *td,
+				    struct verify_header *hdr)
 {
+	if (td->o.verify == VERIFY_PATTERN_NO_HDR)
+		return 0;
+
 	return __hdr_size(hdr->verify_type);
 }
 
@@ -223,8 +250,11 @@
 
 	ptr = strdup(f->file_name);
 
-	fname[DUMP_BUF_SZ - 1] = '\0';
-	strncpy(fname, basename(ptr), DUMP_BUF_SZ - 1);
+	memset(fname, 0, sizeof(fname));
+	if (aux_path)
+		sprintf(fname, "%s%s", aux_path, FIO_OS_PATH_SEPARATOR);
+
+	strncpy(fname + strlen(fname), basename(ptr), buf_left - 1);
 
 	buf_left -= strlen(fname);
 	if (buf_left <= 0) {
@@ -265,7 +295,7 @@
  * Dump the contents of the read block and re-generate the correct data
  * and dump that too.
  */
-static void dump_verify_buffers(struct verify_header *hdr, struct vcont *vc)
+static void __dump_verify_buffers(struct verify_header *hdr, struct vcont *vc)
 {
 	struct thread_data *td = vc->td;
 	struct io_u *io_u = vc->io_u;
@@ -301,6 +331,19 @@
 	free(buf);
 }
 
+static void dump_verify_buffers(struct verify_header *hdr, struct vcont *vc)
+{
+	struct thread_data *td = vc->td;
+	struct verify_header shdr;
+
+	if (td->o.verify == VERIFY_PATTERN_NO_HDR) {
+		__fill_hdr(td, vc->io_u, &shdr, 0, vc->io_u->buflen, 0);
+		hdr = &shdr;
+	}
+
+	__dump_verify_buffers(hdr, vc);
+}
+
 static void log_verify_failure(struct verify_header *hdr, struct vcont *vc)
 {
 	unsigned long long offset;
@@ -325,7 +368,7 @@
  */
 static inline void *io_u_verify_off(struct verify_header *hdr, struct vcont *vc)
 {
-	return vc->io_u->buf + vc->hdr_num * hdr->len + hdr_size(hdr);
+	return vc->io_u->buf + vc->hdr_num * hdr->len + hdr_size(vc->td, hdr);
 }
 
 static int verify_io_u_pattern(struct verify_header *hdr, struct vcont *vc)
@@ -334,35 +377,37 @@
 	struct io_u *io_u = vc->io_u;
 	char *buf, *pattern;
 	unsigned int header_size = __hdr_size(td->o.verify);
-	unsigned int len, mod, i, size, pattern_size;
+	unsigned int len, mod, i, pattern_size;
+	int rc;
 
 	pattern = td->o.verify_pattern;
 	pattern_size = td->o.verify_pattern_bytes;
-	if (pattern_size <= 1)
-		pattern_size = MAX_PATTERN_SIZE;
+	assert(pattern_size != 0);
+
+	(void)paste_format_inplace(pattern, pattern_size,
+				   td->o.verify_fmt, td->o.verify_fmt_sz, io_u);
+
 	buf = (void *) hdr + header_size;
 	len = get_hdr_inc(td, io_u) - header_size;
-	mod = header_size % pattern_size;
+	mod = (get_hdr_inc(td, io_u) * vc->hdr_num + header_size) % pattern_size;
 
-	for (i = 0; i < len; i += size) {
-		size = pattern_size - mod;
-		if (size > (len - i))
-			size = len - i;
-		if (memcmp(buf + i, pattern + mod, size))
-			/* Let the slow compare find the first mismatch byte. */
-			break;
-		mod = 0;
-	}
+	rc = cmp_pattern(pattern, pattern_size, mod, buf, len);
+	if (!rc)
+		return 0;
 
-	for (; i < len; i++) {
+	/* Slow path, compare each byte */
+	for (i = 0; i < len; i++) {
 		if (buf[i] != pattern[mod]) {
 			unsigned int bits;
 
 			bits = hweight8(buf[i] ^ pattern[mod]);
-			log_err("fio: got pattern %x, wanted %x. Bad bits %d\n",
-				buf[i], pattern[mod], bits);
+			log_err("fio: got pattern '%02x', wanted '%02x'. Bad bits %d\n",
+				(unsigned char)buf[i],
+				(unsigned char)pattern[mod],
+				bits);
 			log_err("fio: bad pattern block offset %u\n", i);
-			dump_verify_buffers(hdr, vc);
+			vc->name = "pattern";
+			log_verify_failure(hdr, vc);
 			return EILSEQ;
 		}
 		mod++;
@@ -370,44 +415,9 @@
 			mod = 0;
 	}
 
-	return 0;
-}
-
-static int verify_io_u_meta(struct verify_header *hdr, struct vcont *vc)
-{
-	struct thread_data *td = vc->td;
-	struct vhdr_meta *vh = hdr_priv(hdr);
-	struct io_u *io_u = vc->io_u;
-	int ret = EILSEQ;
-
-	dprint(FD_VERIFY, "meta verify io_u %p, len %u\n", io_u, hdr->len);
-
-	if (vh->offset == io_u->offset + vc->hdr_num * td->o.verify_interval)
-		ret = 0;
-
-	if (td->o.verify_pattern_bytes)
-		ret |= verify_io_u_pattern(hdr, vc);
-
-	/*
-	 * For read-only workloads, the program cannot be certain of the
-	 * last numberio written to a block. Checking of numberio will be
-	 * done only for workloads that write data.  For verify_only,
-	 * numberio will be checked in the last iteration when the correct
-	 * state of numberio, that would have been written to each block
-	 * in a previous run of fio, has been reached.
-	 */
-	if ((td_write(td) || td_rw(td)) && (td_min_bs(td) == td_max_bs(td)) &&
-	    !td->o.time_based)
-		if (!td->o.verify_only || td->o.loops == 0)
-			if (vh->numberio != io_u->numberio)
-				ret = EILSEQ;
-
-	if (!ret)
-		return 0;
-
-	vc->name = "meta";
-	log_verify_failure(hdr, vc);
-	return ret;
+	/* Unreachable line */
+	assert(0);
+	return EILSEQ;
 }
 
 static int verify_io_u_xxhash(struct verify_header *hdr, struct vcont *vc)
@@ -420,7 +430,7 @@
 	dprint(FD_VERIFY, "xxhash verify io_u %p, len %u\n", vc->io_u, hdr->len);
 
 	state = XXH32_init(1);
-	XXH32_update(state, p, hdr->len - hdr_size(hdr));
+	XXH32_update(state, p, hdr->len - hdr_size(vc->td, hdr));
 	hash = XXH32_digest(state);
 
 	if (vh->hash == hash)
@@ -434,6 +444,84 @@
 	return EILSEQ;
 }
 
+static int verify_io_u_sha3(struct verify_header *hdr, struct vcont *vc,
+			    struct fio_sha3_ctx *sha3_ctx, uint8_t *sha,
+			    unsigned int sha_size, const char *name)
+{
+	void *p = io_u_verify_off(hdr, vc);
+
+	dprint(FD_VERIFY, "%s verify io_u %p, len %u\n", name, vc->io_u, hdr->len);
+
+	fio_sha3_update(sha3_ctx, p, hdr->len - hdr_size(vc->td, hdr));
+	fio_sha3_final(sha3_ctx);
+
+	if (!memcmp(sha, sha3_ctx->sha, sha_size))
+		return 0;
+
+	vc->name = name;
+	vc->good_crc = sha;
+	vc->bad_crc = sha3_ctx->sha;
+	vc->crc_len = sha_size;
+	log_verify_failure(hdr, vc);
+	return EILSEQ;
+}
+
+static int verify_io_u_sha3_224(struct verify_header *hdr, struct vcont *vc)
+{
+	struct vhdr_sha3_224 *vh = hdr_priv(hdr);
+	uint8_t sha[SHA3_224_DIGEST_SIZE];
+	struct fio_sha3_ctx sha3_ctx = {
+		.sha = sha,
+	};
+
+	fio_sha3_224_init(&sha3_ctx);
+
+	return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha,
+				SHA3_224_DIGEST_SIZE, "sha3-224");
+}
+
+static int verify_io_u_sha3_256(struct verify_header *hdr, struct vcont *vc)
+{
+	struct vhdr_sha3_256 *vh = hdr_priv(hdr);
+	uint8_t sha[SHA3_256_DIGEST_SIZE];
+	struct fio_sha3_ctx sha3_ctx = {
+		.sha = sha,
+	};
+
+	fio_sha3_256_init(&sha3_ctx);
+
+	return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha,
+				SHA3_256_DIGEST_SIZE, "sha3-256");
+}
+
+static int verify_io_u_sha3_384(struct verify_header *hdr, struct vcont *vc)
+{
+	struct vhdr_sha3_384 *vh = hdr_priv(hdr);
+	uint8_t sha[SHA3_384_DIGEST_SIZE];
+	struct fio_sha3_ctx sha3_ctx = {
+		.sha = sha,
+	};
+
+	fio_sha3_384_init(&sha3_ctx);
+
+	return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha,
+				SHA3_384_DIGEST_SIZE, "sha3-384");
+}
+
+static int verify_io_u_sha3_512(struct verify_header *hdr, struct vcont *vc)
+{
+	struct vhdr_sha3_512 *vh = hdr_priv(hdr);
+	uint8_t sha[SHA3_512_DIGEST_SIZE];
+	struct fio_sha3_ctx sha3_ctx = {
+		.sha = sha,
+	};
+
+	fio_sha3_512_init(&sha3_ctx);
+
+	return verify_io_u_sha3(hdr, vc, &sha3_ctx, vh->sha,
+				SHA3_512_DIGEST_SIZE, "sha3-512");
+}
+
 static int verify_io_u_sha512(struct verify_header *hdr, struct vcont *vc)
 {
 	void *p = io_u_verify_off(hdr, vc);
@@ -446,7 +534,7 @@
 	dprint(FD_VERIFY, "sha512 verify io_u %p, len %u\n", vc->io_u, hdr->len);
 
 	fio_sha512_init(&sha512_ctx);
-	fio_sha512_update(&sha512_ctx, p, hdr->len - hdr_size(hdr));
+	fio_sha512_update(&sha512_ctx, p, hdr->len - hdr_size(vc->td, hdr));
 
 	if (!memcmp(vh->sha512, sha512_ctx.buf, sizeof(sha512)))
 		return 0;
@@ -471,7 +559,7 @@
 	dprint(FD_VERIFY, "sha256 verify io_u %p, len %u\n", vc->io_u, hdr->len);
 
 	fio_sha256_init(&sha256_ctx);
-	fio_sha256_update(&sha256_ctx, p, hdr->len - hdr_size(hdr));
+	fio_sha256_update(&sha256_ctx, p, hdr->len - hdr_size(vc->td, hdr));
 	fio_sha256_final(&sha256_ctx);
 
 	if (!memcmp(vh->sha256, sha256_ctx.buf, sizeof(sha256)))
@@ -497,7 +585,7 @@
 	dprint(FD_VERIFY, "sha1 verify io_u %p, len %u\n", vc->io_u, hdr->len);
 
 	fio_sha1_init(&sha1_ctx);
-	fio_sha1_update(&sha1_ctx, p, hdr->len - hdr_size(hdr));
+	fio_sha1_update(&sha1_ctx, p, hdr->len - hdr_size(vc->td, hdr));
 	fio_sha1_final(&sha1_ctx);
 
 	if (!memcmp(vh->sha1, sha1_ctx.H, sizeof(sha1)))
@@ -519,7 +607,7 @@
 
 	dprint(FD_VERIFY, "crc7 verify io_u %p, len %u\n", vc->io_u, hdr->len);
 
-	c = fio_crc7(p, hdr->len - hdr_size(hdr));
+	c = fio_crc7(p, hdr->len - hdr_size(vc->td, hdr));
 
 	if (c == vh->crc7)
 		return 0;
@@ -540,7 +628,7 @@
 
 	dprint(FD_VERIFY, "crc16 verify io_u %p, len %u\n", vc->io_u, hdr->len);
 
-	c = fio_crc16(p, hdr->len - hdr_size(hdr));
+	c = fio_crc16(p, hdr->len - hdr_size(vc->td, hdr));
 
 	if (c == vh->crc16)
 		return 0;
@@ -561,7 +649,7 @@
 
 	dprint(FD_VERIFY, "crc64 verify io_u %p, len %u\n", vc->io_u, hdr->len);
 
-	c = fio_crc64(p, hdr->len - hdr_size(hdr));
+	c = fio_crc64(p, hdr->len - hdr_size(vc->td, hdr));
 
 	if (c == vh->crc64)
 		return 0;
@@ -582,7 +670,7 @@
 
 	dprint(FD_VERIFY, "crc32 verify io_u %p, len %u\n", vc->io_u, hdr->len);
 
-	c = fio_crc32(p, hdr->len - hdr_size(hdr));
+	c = fio_crc32(p, hdr->len - hdr_size(vc->td, hdr));
 
 	if (c == vh->crc32)
 		return 0;
@@ -603,7 +691,7 @@
 
 	dprint(FD_VERIFY, "crc32c verify io_u %p, len %u\n", vc->io_u, hdr->len);
 
-	c = fio_crc32c(p, hdr->len - hdr_size(hdr));
+	c = fio_crc32c(p, hdr->len - hdr_size(vc->td, hdr));
 
 	if (c == vh->crc32)
 		return 0;
@@ -628,7 +716,7 @@
 	dprint(FD_VERIFY, "md5 verify io_u %p, len %u\n", vc->io_u, hdr->len);
 
 	fio_md5_init(&md5_ctx);
-	fio_md5_update(&md5_ctx, p, hdr->len - hdr_size(hdr));
+	fio_md5_update(&md5_ctx, p, hdr->len - hdr_size(vc->td, hdr));
 	fio_md5_final(&md5_ctx);
 
 	if (!memcmp(vh->md5_digest, md5_ctx.hash, sizeof(hash)))
@@ -656,7 +744,7 @@
 
 	if (io_u->flags & IO_U_F_IN_CUR_DEPTH) {
 		td->cur_depth--;
-		io_u->flags &= ~IO_U_F_IN_CUR_DEPTH;
+		io_u_clear(td, io_u, IO_U_F_IN_CUR_DEPTH);
 	}
 	flist_add_tail(&io_u->verify_list, &td->verify_list);
 	*io_u_ptr = NULL;
@@ -666,42 +754,68 @@
 	return 0;
 }
 
+/*
+ * Thanks Rusty, for spending the time so I don't have to.
+ *
+ * http://rusty.ozlabs.org/?p=560
+ */
+static int mem_is_zero(const void *data, size_t length)
+{
+	const unsigned char *p = data;
+	size_t len;
+
+	/* Check first 16 bytes manually */
+	for (len = 0; len < 16; len++) {
+		if (!length)
+			return 1;
+		if (*p)
+			return 0;
+		p++;
+		length--;
+	}
+
+	/* Now we know that's zero, memcmp with self. */
+	return memcmp(data, p, length) == 0;
+}
+
+static int mem_is_zero_slow(const void *data, size_t length, size_t *offset)
+{
+	const unsigned char *p = data;
+
+	*offset = 0;
+	while (length) {
+		if (*p)
+			break;
+		(*offset)++;
+		length--;
+		p++;
+	}
+
+	return !length;
+}
+
 static int verify_trimmed_io_u(struct thread_data *td, struct io_u *io_u)
 {
-	static char zero_buf[1024];
-	unsigned int this_len, len;
-	int ret = 0;
-	void *p;
+	size_t offset;
 
 	if (!td->o.trim_zero)
 		return 0;
 
-	len = io_u->buflen;
-	p = io_u->buf;
-	do {
-		this_len = sizeof(zero_buf);
-		if (this_len > len)
-			this_len = len;
-		if (memcmp(p, zero_buf, this_len)) {
-			ret = EILSEQ;
-			break;
-		}
-		len -= this_len;
-		p += this_len;
-	} while (len);
-
-	if (!ret)
+	if (mem_is_zero(io_u->buf, io_u->buflen))
 		return 0;
 
+	mem_is_zero_slow(io_u->buf, io_u->buflen, &offset);
+
 	log_err("trim: verify failed at file %s offset %llu, length %lu"
 		", block offset %lu\n",
 			io_u->file->file_name, io_u->offset, io_u->buflen,
-			(unsigned long) (p - io_u->buf));
-	return ret;
+			(unsigned long) offset);
+	return EILSEQ;
 }
 
-static int verify_header(struct io_u *io_u, struct verify_header *hdr,
-			 unsigned int hdr_num, unsigned int hdr_len)
+static int verify_header(struct io_u *io_u, struct thread_data *td,
+			 struct verify_header *hdr, unsigned int hdr_num,
+			 unsigned int hdr_len)
 {
 	void *p = hdr;
 	uint32_t crc;
@@ -722,6 +836,30 @@
 			hdr->rand_seed, io_u->rand_seed);
 		goto err;
 	}
+	if (hdr->offset != io_u->offset + hdr_num * td->o.verify_interval) {
+		log_err("verify: bad header offset %"PRIu64
+			", wanted %llu",
+			hdr->offset, io_u->offset);
+		goto err;
+	}
+
+	/*
+	 * For read-only workloads, the program cannot be certain of the
+	 * last numberio written to a block. Checking of numberio will be
+	 * done only for workloads that write data.  For verify_only,
+	 * numberio will be checked in the last iteration when the correct
+	 * state of numberio, that would have been written to each block
+	 * in a previous run of fio, has been reached.
+	 */
+	if (td_write(td) && (td_min_bs(td) == td_max_bs(td)) &&
+	    !td->o.time_based)
+		if (!td->o.verify_only || td->o.loops == 0)
+			if (hdr->numberio != io_u->numberio) {
+				log_err("verify: bad header numberio %"PRIu16
+					", wanted %"PRIu16,
+					hdr->numberio, io_u->numberio);
+				goto err;
+			}
 
 	crc = fio_crc32c(p, offsetof(struct verify_header, crc32));
 	if (crc != hdr->crc32) {
@@ -735,6 +873,11 @@
 	log_err(" at file %s offset %llu, length %u\n",
 		io_u->file->file_name,
 		io_u->offset + hdr_num * hdr_len, hdr_len);
+
+	if (td->o.verify_dump)
+		dump_buf(p, hdr_len, io_u->offset + hdr_num * hdr_len,
+				"hdr_fail", io_u->file);
+
 	return EILSEQ;
 }
 
@@ -752,7 +895,7 @@
 	 * If the IO engine is faking IO (like null), then just pretend
 	 * we verified everything.
 	 */
-	if (td->io_ops->flags & FIO_FAKEIO)
+	if (td_ioengine_flagged(td, FIO_FAKEIO))
 		return 0;
 
 	if (io_u->flags & IO_U_F_TRIMMED) {
@@ -787,9 +930,11 @@
 		if (td->o.verifysort || (td->flags & TD_F_VER_BACKLOG))
 			io_u->rand_seed = hdr->rand_seed;
 
-		ret = verify_header(io_u, hdr, hdr_num, hdr_inc);
-		if (ret)
-			return ret;
+		if (td->o.verify != VERIFY_PATTERN_NO_HDR) {
+			ret = verify_header(io_u, td, hdr, hdr_num, hdr_inc);
+			if (ret)
+				return ret;
+		}
 
 		if (td->o.verify != VERIFY_NONE)
 			verify_type = td->o.verify;
@@ -797,6 +942,12 @@
 			verify_type = hdr->verify_type;
 
 		switch (verify_type) {
+		case VERIFY_HDR_ONLY:
+			/* Header is always verified, check if pattern is left
+			 * for verification. */
+			if (td->o.verify_pattern_bytes)
+				ret = verify_io_u_pattern(hdr, &vc);
+			break;
 		case VERIFY_MD5:
 			ret = verify_io_u_md5(hdr, &vc);
 			break;
@@ -822,16 +973,26 @@
 		case VERIFY_SHA512:
 			ret = verify_io_u_sha512(hdr, &vc);
 			break;
+		case VERIFY_SHA3_224:
+			ret = verify_io_u_sha3_224(hdr, &vc);
+			break;
+		case VERIFY_SHA3_256:
+			ret = verify_io_u_sha3_256(hdr, &vc);
+			break;
+		case VERIFY_SHA3_384:
+			ret = verify_io_u_sha3_384(hdr, &vc);
+			break;
+		case VERIFY_SHA3_512:
+			ret = verify_io_u_sha3_512(hdr, &vc);
+			break;
 		case VERIFY_XXHASH:
 			ret = verify_io_u_xxhash(hdr, &vc);
 			break;
-		case VERIFY_META:
-			ret = verify_io_u_meta(hdr, &vc);
-			break;
 		case VERIFY_SHA1:
 			ret = verify_io_u_sha1(hdr, &vc);
 			break;
 		case VERIFY_PATTERN:
+		case VERIFY_PATTERN_NO_HDR:
 			ret = verify_io_u_pattern(hdr, &vc);
 			break;
 		default:
@@ -851,21 +1012,6 @@
 	return ret;
 }
 
-static void fill_meta(struct verify_header *hdr, struct thread_data *td,
-		      struct io_u *io_u, unsigned int header_num)
-{
-	struct vhdr_meta *vh = hdr_priv(hdr);
-
-	vh->thread = td->thread_number;
-
-	vh->time_sec = io_u->start_time.tv_sec;
-	vh->time_usec = io_u->start_time.tv_usec;
-
-	vh->numberio = io_u->numberio;
-
-	vh->offset = io_u->offset + header_num * td->o.verify_interval;
-}
-
 static void fill_xxhash(struct verify_header *hdr, void *p, unsigned int len)
 {
 	struct vhdr_xxhash *vh = hdr_priv(hdr);
@@ -876,6 +1022,56 @@
 	vh->hash = XXH32_digest(state);
 }
 
+static void fill_sha3(struct fio_sha3_ctx *sha3_ctx, void *p, unsigned int len)
+{
+	fio_sha3_update(sha3_ctx, p, len);
+	fio_sha3_final(sha3_ctx);
+}
+
+static void fill_sha3_224(struct verify_header *hdr, void *p, unsigned int len)
+{
+	struct vhdr_sha3_224 *vh = hdr_priv(hdr);
+	struct fio_sha3_ctx sha3_ctx = {
+		.sha = vh->sha,
+	};
+
+	fio_sha3_224_init(&sha3_ctx);
+	fill_sha3(&sha3_ctx, p, len);
+}
+
+static void fill_sha3_256(struct verify_header *hdr, void *p, unsigned int len)
+{
+	struct vhdr_sha3_256 *vh = hdr_priv(hdr);
+	struct fio_sha3_ctx sha3_ctx = {
+		.sha = vh->sha,
+	};
+
+	fio_sha3_256_init(&sha3_ctx);
+	fill_sha3(&sha3_ctx, p, len);
+}
+
+static void fill_sha3_384(struct verify_header *hdr, void *p, unsigned int len)
+{
+	struct vhdr_sha3_384 *vh = hdr_priv(hdr);
+	struct fio_sha3_ctx sha3_ctx = {
+		.sha = vh->sha,
+	};
+
+	fio_sha3_384_init(&sha3_ctx);
+	fill_sha3(&sha3_ctx, p, len);
+}
+
+static void fill_sha3_512(struct verify_header *hdr, void *p, unsigned int len)
+{
+	struct vhdr_sha3_512 *vh = hdr_priv(hdr);
+	struct fio_sha3_ctx sha3_ctx = {
+		.sha = vh->sha,
+	};
+
+	fio_sha3_512_init(&sha3_ctx);
+	fill_sha3(&sha3_ctx, p, len);
+}
+
 static void fill_sha512(struct verify_header *hdr, void *p, unsigned int len)
 {
 	struct vhdr_sha512 *vh = hdr_priv(hdr);
@@ -958,6 +1154,34 @@
 	fio_md5_final(&md5_ctx);
 }
 
+static void __fill_hdr(struct thread_data *td, struct io_u *io_u,
+		       struct verify_header *hdr, unsigned int header_num,
+		       unsigned int header_len, uint64_t rand_seed)
+{
+	void *p = hdr;
+
+	hdr->magic = FIO_HDR_MAGIC;
+	hdr->verify_type = td->o.verify;
+	hdr->len = header_len;
+	hdr->rand_seed = rand_seed;
+	hdr->offset = io_u->offset + header_num * td->o.verify_interval;
+	hdr->time_sec = io_u->start_time.tv_sec;
+	hdr->time_usec = io_u->start_time.tv_usec;
+	hdr->thread = td->thread_number;
+	hdr->numberio = io_u->numberio;
+	hdr->crc32 = fio_crc32c(p, offsetof(struct verify_header, crc32));
+}
+
+
+static void fill_hdr(struct thread_data *td, struct io_u *io_u,
+		     struct verify_header *hdr, unsigned int header_num,
+		     unsigned int header_len, uint64_t rand_seed)
+{
+
+	if (td->o.verify != VERIFY_PATTERN_NO_HDR)
+		__fill_hdr(td, io_u, hdr, header_num, header_len, rand_seed);
+}
+
 static void populate_hdr(struct thread_data *td, struct io_u *io_u,
 			 struct verify_header *hdr, unsigned int header_num,
 			 unsigned int header_len)
@@ -967,15 +1191,11 @@
 
 	p = (void *) hdr;
 
-	hdr->magic = FIO_HDR_MAGIC;
-	hdr->verify_type = td->o.verify;
-	hdr->len = header_len;
-	hdr->rand_seed = io_u->rand_seed;
-	hdr->crc32 = fio_crc32c(p, offsetof(struct verify_header, crc32));
+	fill_hdr(td, io_u, hdr, header_num, header_len, io_u->rand_seed);
 
-	data_len = header_len - hdr_size(hdr);
+	data_len = header_len - hdr_size(td, hdr);
 
-	data = p + hdr_size(hdr);
+	data = p + hdr_size(td, hdr);
 	switch (td->o.verify) {
 	case VERIFY_MD5:
 		dprint(FD_VERIFY, "fill md5 io_u %p, len %u\n",
@@ -1018,30 +1238,48 @@
 						io_u, hdr->len);
 		fill_sha512(hdr, data, data_len);
 		break;
+	case VERIFY_SHA3_224:
+		dprint(FD_VERIFY, "fill sha3-224 io_u %p, len %u\n",
+						io_u, hdr->len);
+		fill_sha3_224(hdr, data, data_len);
+		break;
+	case VERIFY_SHA3_256:
+		dprint(FD_VERIFY, "fill sha3-256 io_u %p, len %u\n",
+						io_u, hdr->len);
+		fill_sha3_256(hdr, data, data_len);
+		break;
+	case VERIFY_SHA3_384:
+		dprint(FD_VERIFY, "fill sha3-384 io_u %p, len %u\n",
+						io_u, hdr->len);
+		fill_sha3_384(hdr, data, data_len);
+		break;
+	case VERIFY_SHA3_512:
+		dprint(FD_VERIFY, "fill sha3-512 io_u %p, len %u\n",
+						io_u, hdr->len);
+		fill_sha3_512(hdr, data, data_len);
+		break;
 	case VERIFY_XXHASH:
 		dprint(FD_VERIFY, "fill xxhash io_u %p, len %u\n",
 						io_u, hdr->len);
 		fill_xxhash(hdr, data, data_len);
 		break;
-	case VERIFY_META:
-		dprint(FD_VERIFY, "fill meta io_u %p, len %u\n",
-						io_u, hdr->len);
-		fill_meta(hdr, td, io_u, header_num);
-		break;
 	case VERIFY_SHA1:
 		dprint(FD_VERIFY, "fill sha1 io_u %p, len %u\n",
 						io_u, hdr->len);
 		fill_sha1(hdr, data, data_len);
 		break;
+	case VERIFY_HDR_ONLY:
 	case VERIFY_PATTERN:
+	case VERIFY_PATTERN_NO_HDR:
 		/* nothing to do here */
 		break;
 	default:
 		log_err("fio: bad verify type: %d\n", td->o.verify);
 		assert(0);
 	}
-	if (td->o.verify_offset)
-		memswp(p, p + td->o.verify_offset, hdr_size(hdr));
+
+	if (td->o.verify_offset && hdr_size(td, hdr))
+		memswp(p, p + td->o.verify_offset, hdr_size(td, hdr));
 }
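When verify_offset is set, the header is relocated within each block by swapping it with the bytes at that offset, which is why the swap above is now skipped entirely when hdr_size(td, hdr) is zero. A self-contained toy of that swap, with a stand-in for memswp() (names and the example offset are illustrative):

#include <stdio.h>
#include <string.h>

/* Toy stand-in for memswp(): exchange two equal-sized regions.
 * Assumes len <= 64 and that the regions do not overlap. */
static void toy_memswp(void *a, void *b, unsigned int len)
{
	unsigned char tmp[64];

	memcpy(tmp, a, len);
	memcpy(a, b, len);
	memcpy(b, tmp, len);
}

int main(void)
{
	char block[16] = "HDRDATADATADATA";	/* header at offset 0 */

	/* verify_offset=8: the header bytes move into the middle of the block */
	toy_memswp(block, block + 8, 3);
	printf("%.15s\n", block);	/* prints ATADATADHDRDATA */
	return 0;
}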
 
 /*
@@ -1105,10 +1343,10 @@
 		io_u->buflen = ipo->len;
 		io_u->numberio = ipo->numberio;
 		io_u->file = ipo->file;
-		io_u->flags |= IO_U_F_VER_LIST;
+		io_u_set(td, io_u, IO_U_F_VER_LIST);
 
 		if (ipo->flags & IP_F_TRIMMED)
-			io_u->flags |= IO_U_F_TRIMMED;
+			io_u_set(td, io_u, IO_U_F_TRIMMED);
 
 		if (!fio_file_open(io_u->file)) {
 			int r = td_io_open_file(td, io_u->file);
@@ -1147,6 +1385,7 @@
 {
 	if (td->o.verify == VERIFY_CRC32C_INTEL ||
 	    td->o.verify == VERIFY_CRC32C) {
+		crc32c_arm64_probe();
 		crc32c_intel_probe();
 	}
 }
@@ -1192,7 +1431,7 @@
 			io_u = flist_first_entry(&list, struct io_u, verify_list);
 			flist_del_init(&io_u->verify_list);
 
-			io_u->flags |= IO_U_F_NO_FILE_PUT;
+			io_u_set(td, io_u, IO_U_F_NO_FILE_PUT);
 			ret = verify_io_u(td, &io_u);
 
 			put_io_u(td, io_u);
@@ -1227,7 +1466,7 @@
 	pthread_attr_t attr;
 
 	pthread_attr_init(&attr);
-	pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN);
+	pthread_attr_setstacksize(&attr, 2 * PTHREAD_STACK_MIN);
 
 	td->verify_thread_exit = 0;
 
@@ -1278,6 +1517,59 @@
 	td->verify_threads = NULL;
 }
 
+int paste_blockoff(char *buf, unsigned int len, void *priv)
+{
+	struct io_u *io = priv;
+	unsigned long long off;
+
+	typecheck(typeof(off), io->offset);
+	off = cpu_to_le64((uint64_t)io->offset);
+	len = min(len, (unsigned int)sizeof(off));
+	memcpy(buf, &off, len);
+	return 0;
+}
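paste_blockoff() above is the callback that pastes the little-endian block offset into a user-supplied pattern; when the pattern slot is narrower than 8 bytes, only the low-order bytes survive. A self-contained toy with a hypothetical helper showing that truncation; it mimics the shape of the callback, not fio's pattern plumbing:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Same shape as paste_blockoff(): copy at most len bytes of the offset.
 * The sketch copies native byte order and assumes a little-endian host. */
static int toy_paste_off(char *buf, unsigned int len, uint64_t offset)
{
	uint64_t off = offset;

	if (len > sizeof(off))
		len = sizeof(off);
	memcpy(buf, &off, len);
	return 0;
}

int main(void)
{
	char pattern[4] = { 0 };

	/* Only the low 32 bits of the offset fit into a 4-byte pattern slot. */
	toy_paste_off(pattern, sizeof(pattern), 0x11223344aabbccddULL);
	for (unsigned int i = 0; i < sizeof(pattern); i++)
		printf("%02x ", (unsigned char) pattern[i]);
	printf("\n");	/* dd cc bb aa on a little-endian host */
	return 0;
}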
+
+static int __fill_file_completions(struct thread_data *td,
+				   struct thread_io_list *s,
+				   struct fio_file *f, unsigned int *index)
+{
+	unsigned int comps;
+	int i, j;
+
+	if (!f->last_write_comp)
+		return 0;
+
+	if (td->io_blocks[DDIR_WRITE] < td->o.iodepth)
+		comps = td->io_blocks[DDIR_WRITE];
+	else
+		comps = td->o.iodepth;
+
+	j = f->last_write_idx - 1;
+	for (i = 0; i < comps; i++) {
+		if (j == -1)
+			j = td->o.iodepth - 1;
+		s->comps[*index].fileno = __cpu_to_le64(f->fileno);
+		s->comps[*index].offset = cpu_to_le64(f->last_write_comp[j]);
+		(*index)++;
+		j--;
+	}
+
+	return comps;
+}
+
+static int fill_file_completions(struct thread_data *td,
+				 struct thread_io_list *s, unsigned int *index)
+{
+	struct fio_file *f;
+	unsigned int i;
+	int comps = 0;
+
+	for_each_file(td, f, i)
+		comps += __fill_file_completions(td, s, f, index);
+
+	return comps;
+}
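__fill_file_completions() treats f->last_write_comp as a ring of the most recently completed write offsets: it starts at last_write_idx - 1, walks backwards, and wraps at the queue depth. A self-contained sketch of the same walk over a plain array; the names and values below are made up:

#include <stdio.h>

/* Toy ring of completed offsets, newest at (idx - 1), capacity = depth. */
static void dump_recent(const unsigned long long *ring, int depth,
			int idx, int nr_completed)
{
	int comps = nr_completed < depth ? nr_completed : depth;
	int j = idx - 1;

	for (int i = 0; i < comps; i++) {
		if (j == -1)
			j = depth - 1;	/* wrap to the end of the ring */
		printf("comp[%d] = %llu\n", i, ring[j]);
		j--;
	}
}

int main(void)
{
	unsigned long long ring[4] = { 4096, 8192, 12288, 16384 };

	dump_recent(ring, 4, 2, 6);	/* newest first: 8192, 4096, 16384, 12288 */
	return 0;
}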
+
 struct all_io_list *get_all_io_list(int save_mask, size_t *sz)
 {
 	struct all_io_list *rep;
@@ -1299,7 +1591,7 @@
 			continue;
 		td->stop_io = 1;
 		td->flags |= TD_F_VSTATE_SAVED;
-		depth += td->o.iodepth;
+		depth += (td->o.iodepth * td->o.nr_files);
 		nr++;
 	}
 
@@ -1308,45 +1600,42 @@
 
 	*sz = sizeof(*rep);
 	*sz += nr * sizeof(struct thread_io_list);
-	*sz += depth * sizeof(uint64_t);
+	*sz += depth * sizeof(struct file_comp);
 	rep = malloc(*sz);
+	memset(rep, 0, *sz);
 
 	rep->threads = cpu_to_le64((uint64_t) nr);
 
 	next = &rep->state[0];
 	for_each_td(td, i) {
 		struct thread_io_list *s = next;
-		unsigned int comps;
+		unsigned int comps, index = 0;
 
 		if (save_mask != IO_LIST_ALL && (i + 1) != save_mask)
 			continue;
 
-		if (td->last_write_comp) {
-			int j, k;
-
-			if (td->io_blocks[DDIR_WRITE] < td->o.iodepth)
-				comps = td->io_blocks[DDIR_WRITE];
-			else
-				comps = td->o.iodepth;
-
-			k = td->last_write_idx - 1;
-			for (j = 0; j < comps; j++) {
-				if (k == -1)
-					k = td->o.iodepth - 1;
-				s->offsets[j] = cpu_to_le64(td->last_write_comp[k]);
-				k--;
-			}
-		} else
-			comps = 0;
+		comps = fill_file_completions(td, s, &index);
 
 		s->no_comps = cpu_to_le64((uint64_t) comps);
 		s->depth = cpu_to_le64((uint64_t) td->o.iodepth);
+		s->nofiles = cpu_to_le64((uint64_t) td->o.nr_files);
 		s->numberio = cpu_to_le64((uint64_t) td->io_issues[DDIR_WRITE]);
 		s->index = cpu_to_le64((uint64_t) i);
-		s->rand.s[0] = cpu_to_le32(td->random_state.s1);
-		s->rand.s[1] = cpu_to_le32(td->random_state.s2);
-		s->rand.s[2] = cpu_to_le32(td->random_state.s3);
-		s->rand.s[3] = 0;
+		if (td->random_state.use64) {
+			s->rand.state64.s[0] = cpu_to_le64(td->random_state.state64.s1);
+			s->rand.state64.s[1] = cpu_to_le64(td->random_state.state64.s2);
+			s->rand.state64.s[2] = cpu_to_le64(td->random_state.state64.s3);
+			s->rand.state64.s[3] = cpu_to_le64(td->random_state.state64.s4);
+			s->rand.state64.s[4] = cpu_to_le64(td->random_state.state64.s5);
+			s->rand.state64.s[5] = 0;
+			s->rand.use64 = cpu_to_le64((uint64_t)1);
+		} else {
+			s->rand.state32.s[0] = cpu_to_le32(td->random_state.state32.s1);
+			s->rand.state32.s[1] = cpu_to_le32(td->random_state.state32.s2);
+			s->rand.state32.s[2] = cpu_to_le32(td->random_state.state32.s3);
+			s->rand.state32.s[3] = 0;
+			s->rand.use64 = 0;
+		}
 		s->name[sizeof(s->name) - 1] = '\0';
 		strncpy((char *) s->name, td->o.name, sizeof(s->name) - 1);
 		next = io_list_next(s);
@@ -1358,7 +1647,7 @@
 static int open_state_file(const char *name, const char *prefix, int num,
 			   int for_write)
 {
-	char out[64];
+	char out[PATH_MAX];
 	int flags;
 	int fd;
 
@@ -1372,6 +1661,7 @@
 	fd = open(out, flags, 0644);
 	if (fd == -1) {
 		perror("fio: open state file");
+		log_err("fio: state file: %s (for_write=%d)\n", out, for_write);
 		return -1;
 	}
 
@@ -1424,14 +1714,21 @@
 	}
 }
 
-void verify_save_state(void)
+void verify_save_state(int mask)
 {
 	struct all_io_list *state;
 	size_t sz;
 
-	state = get_all_io_list(IO_LIST_ALL, &sz);
+	state = get_all_io_list(mask, &sz);
 	if (state) {
-		__verify_save_state(state, "local");
+		char prefix[PATH_MAX];
+
+		if (aux_path)
+			sprintf(prefix, "%s%slocal", aux_path, FIO_OS_PATH_SEPARATOR);
+		else
+			strcpy(prefix, "local");
+
+		__verify_save_state(state, prefix);
 		free(state);
 	}
 }
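When aux_path is set, the state files simply get that directory prepended to the usual "local" prefix; the final name still comes from verify_state_gen_name() (shown below being moved out of verify.h), which formats "<prefix>-<jobname>-<num>-verify.state". A self-contained sketch of the resulting path, assuming an aux_path of /data/local/tmp, a "/" path separator, and a made-up job name and index:

#include <stdio.h>

int main(void)
{
	const char *aux_path = "/data/local/tmp";	/* assumed example value */
	char prefix[4096], out[4096];

	/* mirrors verify_save_state(): "<aux_path>/local" vs. plain "local" */
	snprintf(prefix, sizeof(prefix), "%s%s%s", aux_path, "/", "local");

	/* mirrors verify_state_gen_name(): prefix-name-num-verify.state */
	snprintf(out, sizeof(out), "%s-%s-%d-verify.state", prefix, "write-job", 1);
	puts(out);	/* /data/local/tmp/local-write-job-1-verify.state */
	return 0;
}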
@@ -1442,20 +1739,31 @@
 		free(td->vstate);
 }
 
-void verify_convert_assign_state(struct thread_data *td,
-				 struct thread_io_list *s)
+void verify_assign_state(struct thread_data *td, void *p)
 {
+	struct thread_io_list *s = p;
 	int i;
 
 	s->no_comps = le64_to_cpu(s->no_comps);
-	s->depth = le64_to_cpu(s->depth);
+	s->depth = le32_to_cpu(s->depth);
+	s->nofiles = le32_to_cpu(s->nofiles);
 	s->numberio = le64_to_cpu(s->numberio);
-	for (i = 0; i < 4; i++)
-		s->rand.s[i] = le32_to_cpu(s->rand.s[i]);
-	for (i = 0; i < s->no_comps; i++)
-		s->offsets[i] = le64_to_cpu(s->offsets[i]);
+	s->rand.use64 = le64_to_cpu(s->rand.use64);
 
-	td->vstate = s;
+	if (s->rand.use64) {
+		for (i = 0; i < 6; i++)
+			s->rand.state64.s[i] = le64_to_cpu(s->rand.state64.s[i]);
+	} else {
+		for (i = 0; i < 4; i++)
+			s->rand.state32.s[i] = le32_to_cpu(s->rand.state32.s[i]);
+	}
+
+	for (i = 0; i < s->no_comps; i++) {
+		s->comps[i].fileno = le64_to_cpu(s->comps[i].fileno);
+		s->comps[i].offset = le64_to_cpu(s->comps[i].offset);
+	}
+
+	td->vstate = p;
 }
 
 int verify_state_hdr(struct verify_state_hdr *hdr, struct thread_io_list *s)
@@ -1478,8 +1786,8 @@
 
 int verify_load_state(struct thread_data *td, const char *prefix)
 {
-	struct thread_io_list *s = NULL;
 	struct verify_state_hdr hdr;
+	void *s = NULL;
 	uint64_t crc;
 	ssize_t ret;
 	int fd;
@@ -1504,7 +1812,8 @@
 	hdr.crc = le64_to_cpu(hdr.crc);
 
 	if (hdr.version != VSTATE_HDR_VERSION) {
-		log_err("fio: bad version in verify state header\n");
+		log_err("fio: unsupported (%u) version in verify state header\n",
+				(unsigned int) hdr.version);
 		goto err;
 	}
 
@@ -1517,7 +1826,7 @@
 		goto err;
 	}
 
-	crc = fio_crc32c((void *)s, hdr.size);
+	crc = fio_crc32c(s, hdr.size);
 	if (crc != hdr.crc) {
 		log_err("fio: verify state is corrupt\n");
 		goto err;
@@ -1525,7 +1834,7 @@
 
 	close(fd);
 
-	verify_convert_assign_state(td, s);
+	verify_assign_state(td, s);
 	return 0;
 err:
 	if (s)
@@ -1540,9 +1849,10 @@
 int verify_state_should_stop(struct thread_data *td, struct io_u *io_u)
 {
 	struct thread_io_list *s = td->vstate;
+	struct fio_file *f = io_u->file;
 	int i;
 
-	if (!s)
+	if (!s || !f)
 		return 0;
 
 	/*
@@ -1559,9 +1869,12 @@
 	 * completed or not. If the IO was seen as completed, then
 	 * lets verify it.
 	 */
-	for (i = 0; i < s->no_comps; i++)
-		if (io_u->offset == s->offsets[i])
+	for (i = 0; i < s->no_comps; i++) {
+		if (s->comps[i].fileno != f->fileno)
+			continue;
+		if (io_u->offset == s->comps[i].offset)
 			return 0;
+	}
 
 	/*
 	 * Not found, we have to stop
diff --git a/verify.h b/verify.h
index 43de887..5aae2e7 100644
--- a/verify.h
+++ b/verify.h
@@ -2,11 +2,15 @@
 #define FIO_VERIFY_H
 
 #include <stdint.h>
+#include "verify-state.h"
 
 #define FIO_HDR_MAGIC	0xacca
 
 enum {
 	VERIFY_NONE = 0,		/* no verification */
+	VERIFY_HDR_ONLY,		/* verify header only, kept for the sake
+					 * of compatibility with old
+					 * configurations that use 'verify=meta' */
 	VERIFY_MD5,			/* md5 sum data blocks */
 	VERIFY_CRC64,			/* crc64 sum data blocks */
 	VERIFY_CRC32,			/* crc32 sum data blocks */
@@ -16,10 +20,14 @@
 	VERIFY_CRC7,			/* crc7 sum data blocks */
 	VERIFY_SHA256,			/* sha256 sum data blocks */
 	VERIFY_SHA512,			/* sha512 sum data blocks */
+	VERIFY_SHA3_224,		/* sha3-224 sum data blocks */
+	VERIFY_SHA3_256,		/* sha3-256 sum data blocks */
+	VERIFY_SHA3_384,		/* sha3-384 sum data blocks */
+	VERIFY_SHA3_512,		/* sha3-512 sum data blocks */
 	VERIFY_XXHASH,			/* xxhash sum data blocks */
-	VERIFY_META,			/* block_num, timestamp etc. */
 	VERIFY_SHA1,			/* sha1 sum data blocks */
 	VERIFY_PATTERN,			/* verify specific patterns */
+	VERIFY_PATTERN_NO_HDR,		/* verify specific patterns, no hdr */
 	VERIFY_NULL,			/* pretend to verify */
 };
 
@@ -33,12 +41,29 @@
 	uint16_t verify_type;
 	uint32_t len;
 	uint64_t rand_seed;
+	uint64_t offset;
+	uint32_t time_sec;
+	uint32_t time_usec;
+	uint16_t thread;
+	uint16_t numberio;
 	uint32_t crc32;
 };
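The header is self-checking: __fill_hdr() in verify.c runs fio_crc32c() over everything up to the crc32 member, so the new offset, time, thread, and numberio fields above are all covered by it. A self-contained sketch of that prefix-CRC idea, using a simplified copy of the struct and a toy checksum standing in for the real CRC32-C:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Simplified stand-in for struct verify_header; field order mirrors the diff. */
struct example_hdr {
	uint16_t magic;
	uint16_t verify_type;
	uint32_t len;
	uint64_t rand_seed;
	uint64_t offset;
	uint32_t time_sec;
	uint32_t time_usec;
	uint16_t thread;
	uint16_t numberio;
	uint32_t crc32;		/* covers every byte before this field */
};

/* Toy checksum standing in for fio_crc32c(); not the real polynomial. */
static uint32_t toy_csum(const void *p, size_t len)
{
	const uint8_t *b = p;
	uint32_t sum = 0;

	while (len--)
		sum = (sum << 5) + sum + *b++;
	return sum;
}

static void seal_hdr(struct example_hdr *hdr)
{
	hdr->crc32 = toy_csum(hdr, offsetof(struct example_hdr, crc32));
}

static int hdr_ok(const struct example_hdr *hdr)
{
	return hdr->crc32 == toy_csum(hdr, offsetof(struct example_hdr, crc32));
}

int main(void)
{
	struct example_hdr hdr;

	memset(&hdr, 0, sizeof(hdr));
	hdr.magic = 0xacca;	/* FIO_HDR_MAGIC */
	hdr.offset = 1 << 20;
	seal_hdr(&hdr);
	return hdr_ok(&hdr) ? 0 : 1;
}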
 
 struct vhdr_md5 {
 	uint32_t md5_digest[4];
 };
+struct vhdr_sha3_224 {
+	uint8_t sha[224 / 8];
+};
+struct vhdr_sha3_256 {
+	uint8_t sha[256 / 8];
+};
+struct vhdr_sha3_384 {
+	uint8_t sha[384 / 8];
+};
+struct vhdr_sha3_512 {
+	uint8_t sha[512 / 8];
+};
 struct vhdr_sha512 {
 	uint8_t sha512[128];
 };
@@ -60,13 +85,6 @@
 struct vhdr_crc7 {
 	uint8_t crc7;
 };
-struct vhdr_meta {
-	uint64_t offset;
-	unsigned char thread;
-	unsigned short numberio;
-	unsigned long time_sec;
-	unsigned long time_usec;
-};
 struct vhdr_xxhash {
 	uint32_t hash;
 };
@@ -88,62 +106,9 @@
 extern int verify_async_init(struct thread_data *);
 extern void verify_async_exit(struct thread_data *);
 
-struct thread_rand_state {
-	uint32_t s[4];
-};
-
 /*
- * For dumping current write state
+ * Callbacks for pasting formats in the pattern buffer
  */
-struct thread_io_list {
-	uint64_t no_comps;
-	uint64_t depth;
-	uint64_t numberio;
-	uint64_t index;
-	struct thread_rand_state rand;
-	uint8_t name[64];
-	uint64_t offsets[0];
-};
-
-struct all_io_list {
-	uint64_t threads;
-	struct thread_io_list state[0];
-};
-
-#define VSTATE_HDR_VERSION	0x01
-
-struct verify_state_hdr {
-	uint64_t version;
-	uint64_t size;
-	uint64_t crc;
-};
-
-#define IO_LIST_ALL		0xffffffff
-extern struct all_io_list *get_all_io_list(int, size_t *);
-extern void __verify_save_state(struct all_io_list *, const char *);
-extern void verify_save_state(void);
-extern int verify_load_state(struct thread_data *, const char *);
-extern void verify_free_state(struct thread_data *);
-extern int verify_state_should_stop(struct thread_data *, struct io_u *);
-extern void verify_convert_assign_state(struct thread_data *, struct thread_io_list *);
-extern int verify_state_hdr(struct verify_state_hdr *, struct thread_io_list *);
-
-static inline size_t thread_io_list_sz(struct thread_io_list *s)
-{
-	return sizeof(*s) + le64_to_cpu(s->depth) * sizeof(uint64_t);
-}
-
-static inline struct thread_io_list *io_list_next(struct thread_io_list *s)
-{
-	return (void *) s + thread_io_list_sz(s);
-}
-
-static inline void verify_state_gen_name(char *out, size_t size,
-					 const char *name, const char *prefix,
-					 int num)
-{
-	snprintf(out, size, "%s-%s-%d-verify.state", prefix, name, num);
-	out[size - 1] = '\0';
-}
+extern int paste_blockoff(char *buf, unsigned int len, void *priv);
 
 #endif
diff --git a/workqueue.c b/workqueue.c
new file mode 100644
index 0000000..1131400
--- /dev/null
+++ b/workqueue.c
@@ -0,0 +1,373 @@
+/*
+ * Generic workqueue offload mechanism
+ *
+ * Copyright (C) 2015 Jens Axboe <axboe@kernel.dk>
+ *
+ */
+#include <unistd.h>
+
+#include "fio.h"
+#include "flist.h"
+#include "workqueue.h"
+#include "smalloc.h"
+
+enum {
+	SW_F_IDLE	= 1 << 0,
+	SW_F_RUNNING	= 1 << 1,
+	SW_F_EXIT	= 1 << 2,
+	SW_F_ACCOUNTED	= 1 << 3,
+	SW_F_ERROR	= 1 << 4,
+};
+
+static struct submit_worker *__get_submit_worker(struct workqueue *wq,
+						 unsigned int start,
+						 unsigned int end,
+						 struct submit_worker **best)
+{
+	struct submit_worker *sw = NULL;
+
+	while (start <= end) {
+		sw = &wq->workers[start];
+		if (sw->flags & SW_F_IDLE)
+			return sw;
+		if (!(*best) || sw->seq < (*best)->seq)
+			*best = sw;
+		start++;
+	}
+
+	return NULL;
+}
+
+static struct submit_worker *get_submit_worker(struct workqueue *wq)
+{
+	unsigned int next = wq->next_free_worker;
+	struct submit_worker *sw, *best = NULL;
+
+	assert(next < wq->max_workers);
+
+	sw = __get_submit_worker(wq, next, wq->max_workers - 1, &best);
+	if (!sw && next)
+		sw = __get_submit_worker(wq, 0, next - 1, &best);
+
+	/*
+	 * No truly idle worker found, use the best match
+	 */
+	if (!sw)
+		sw = best;
+
+	if (sw->index == wq->next_free_worker) {
+		if (sw->index + 1 < wq->max_workers)
+			wq->next_free_worker = sw->index + 1;
+		else
+			wq->next_free_worker = 0;
+	}
+
+	return sw;
+}
+
+static bool all_sw_idle(struct workqueue *wq)
+{
+	int i;
+
+	for (i = 0; i < wq->max_workers; i++) {
+		struct submit_worker *sw = &wq->workers[i];
+
+		if (!(sw->flags & SW_F_IDLE))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Must be serialized wrt workqueue_enqueue() by caller
+ */
+void workqueue_flush(struct workqueue *wq)
+{
+	wq->wake_idle = 1;
+
+	while (!all_sw_idle(wq)) {
+		pthread_mutex_lock(&wq->flush_lock);
+		pthread_cond_wait(&wq->flush_cond, &wq->flush_lock);
+		pthread_mutex_unlock(&wq->flush_lock);
+	}
+
+	wq->wake_idle = 0;
+}
+
+/*
+ * Must be serialized by caller.
+ */
+void workqueue_enqueue(struct workqueue *wq, struct workqueue_work *work)
+{
+	struct submit_worker *sw;
+
+	sw = get_submit_worker(wq);
+	assert(sw);
+
+	pthread_mutex_lock(&sw->lock);
+	flist_add_tail(&work->list, &sw->work_list);
+	sw->seq = ++wq->work_seq;
+	sw->flags &= ~SW_F_IDLE;
+	pthread_mutex_unlock(&sw->lock);
+
+	pthread_cond_signal(&sw->cond);
+}
+
+static void handle_list(struct submit_worker *sw, struct flist_head *list)
+{
+	struct workqueue *wq = sw->wq;
+	struct workqueue_work *work;
+
+	while (!flist_empty(list)) {
+		work = flist_first_entry(list, struct workqueue_work, list);
+		flist_del_init(&work->list);
+		wq->ops.fn(sw, work);
+	}
+}
+
+static void *worker_thread(void *data)
+{
+	struct submit_worker *sw = data;
+	struct workqueue *wq = sw->wq;
+	unsigned int ret = 0;
+	FLIST_HEAD(local_list);
+
+	sk_out_assign(sw->sk_out);
+
+	if (wq->ops.nice) {
+		if (nice(wq->ops.nice) < 0) {
+			log_err("workqueue: nice() failed: %s\n", strerror(errno));
+			ret = 1;
+		}
+	}
+
+	if (!ret)
+		ret = workqueue_init_worker(sw);
+
+	pthread_mutex_lock(&sw->lock);
+	sw->flags |= SW_F_RUNNING;
+	if (ret)
+		sw->flags |= SW_F_ERROR;
+	pthread_mutex_unlock(&sw->lock);
+
+	pthread_mutex_lock(&wq->flush_lock);
+	pthread_cond_signal(&wq->flush_cond);
+	pthread_mutex_unlock(&wq->flush_lock);
+
+	if (sw->flags & SW_F_ERROR)
+		goto done;
+
+	while (1) {
+		pthread_mutex_lock(&sw->lock);
+
+		if (flist_empty(&sw->work_list)) {
+			if (sw->flags & SW_F_EXIT) {
+				pthread_mutex_unlock(&sw->lock);
+				break;
+			}
+
+			if (workqueue_pre_sleep_check(sw)) {
+				pthread_mutex_unlock(&sw->lock);
+				workqueue_pre_sleep(sw);
+				pthread_mutex_lock(&sw->lock);
+			}
+
+			/*
+			 * We dropped and reacquired the lock, check
+			 * state again.
+			 */
+			if (!flist_empty(&sw->work_list))
+				goto handle_work;
+
+			if (sw->flags & SW_F_EXIT) {
+				pthread_mutex_unlock(&sw->lock);
+				break;
+			} else if (!(sw->flags & SW_F_IDLE)) {
+				sw->flags |= SW_F_IDLE;
+				wq->next_free_worker = sw->index;
+				if (wq->wake_idle)
+					pthread_cond_signal(&wq->flush_cond);
+			}
+			if (wq->ops.update_acct_fn)
+				wq->ops.update_acct_fn(sw);
+
+			pthread_cond_wait(&sw->cond, &sw->lock);
+		} else {
+handle_work:
+			flist_splice_init(&sw->work_list, &local_list);
+		}
+		pthread_mutex_unlock(&sw->lock);
+		handle_list(sw, &local_list);
+	}
+
+	if (wq->ops.update_acct_fn)
+		wq->ops.update_acct_fn(sw);
+
+done:
+	sk_out_drop();
+	return NULL;
+}
+
+static void free_worker(struct submit_worker *sw, unsigned int *sum_cnt)
+{
+	struct workqueue *wq = sw->wq;
+
+	workqueue_exit_worker(sw, sum_cnt);
+
+	pthread_cond_destroy(&sw->cond);
+	pthread_mutex_destroy(&sw->lock);
+
+	if (wq->ops.free_worker_fn)
+		wq->ops.free_worker_fn(sw);
+}
+
+static void shutdown_worker(struct submit_worker *sw, unsigned int *sum_cnt)
+{
+	pthread_join(sw->thread, NULL);
+	free_worker(sw, sum_cnt);
+}
+
+void workqueue_exit(struct workqueue *wq)
+{
+	unsigned int shutdown, sum_cnt = 0;
+	struct submit_worker *sw;
+	int i;
+
+	if (!wq->workers)
+		return;
+
+	for (i = 0; i < wq->max_workers; i++) {
+		sw = &wq->workers[i];
+
+		pthread_mutex_lock(&sw->lock);
+		sw->flags |= SW_F_EXIT;
+		pthread_cond_signal(&sw->cond);
+		pthread_mutex_unlock(&sw->lock);
+	}
+
+	do {
+		shutdown = 0;
+		for (i = 0; i < wq->max_workers; i++) {
+			sw = &wq->workers[i];
+			if (sw->flags & SW_F_ACCOUNTED)
+				continue;
+			pthread_mutex_lock(&sw->lock);
+			sw->flags |= SW_F_ACCOUNTED;
+			pthread_mutex_unlock(&sw->lock);
+			shutdown_worker(sw, &sum_cnt);
+			shutdown++;
+		}
+	} while (shutdown && shutdown != wq->max_workers);
+
+	sfree(wq->workers);
+	wq->workers = NULL;
+	pthread_mutex_destroy(&wq->flush_lock);
+	pthread_cond_destroy(&wq->flush_cond);
+	pthread_mutex_destroy(&wq->stat_lock);
+}
+
+static int start_worker(struct workqueue *wq, unsigned int index,
+			struct sk_out *sk_out)
+{
+	struct submit_worker *sw = &wq->workers[index];
+	int ret;
+
+	INIT_FLIST_HEAD(&sw->work_list);
+
+	ret = mutex_cond_init_pshared(&sw->lock, &sw->cond);
+	if (ret)
+		return ret;
+
+	sw->wq = wq;
+	sw->index = index;
+	sw->sk_out = sk_out;
+
+	if (wq->ops.alloc_worker_fn) {
+		ret = wq->ops.alloc_worker_fn(sw);
+		if (ret)
+			return ret;
+	}
+
+	ret = pthread_create(&sw->thread, NULL, worker_thread, sw);
+	if (!ret) {
+		pthread_mutex_lock(&sw->lock);
+		sw->flags = SW_F_IDLE;
+		pthread_mutex_unlock(&sw->lock);
+		return 0;
+	}
+
+	free_worker(sw, NULL);
+	return 1;
+}
+
+int workqueue_init(struct thread_data *td, struct workqueue *wq,
+		   struct workqueue_ops *ops, unsigned int max_workers,
+		   struct sk_out *sk_out)
+{
+	unsigned int running;
+	int i, error;
+	int ret;
+
+	wq->max_workers = max_workers;
+	wq->td = td;
+	wq->ops = *ops;
+	wq->work_seq = 0;
+	wq->next_free_worker = 0;
+
+	ret = mutex_cond_init_pshared(&wq->flush_lock, &wq->flush_cond);
+	if (ret)
+		goto err;
+	ret = mutex_init_pshared(&wq->stat_lock);
+	if (ret)
+		goto err;
+
+	wq->workers = smalloc(wq->max_workers * sizeof(struct submit_worker));
+	if (!wq->workers)
+		goto err;
+
+	for (i = 0; i < wq->max_workers; i++)
+		if (start_worker(wq, i, sk_out))
+			break;
+
+	wq->max_workers = i;
+	if (!wq->max_workers)
+		goto err;
+
+	/*
+	 * Wait for them all to be started and initialized
+	 */
+	error = 0;
+	do {
+		struct submit_worker *sw;
+
+		running = 0;
+		pthread_mutex_lock(&wq->flush_lock);
+		for (i = 0; i < wq->max_workers; i++) {
+			sw = &wq->workers[i];
+			pthread_mutex_lock(&sw->lock);
+			if (sw->flags & SW_F_RUNNING)
+				running++;
+			if (sw->flags & SW_F_ERROR)
+				error++;
+			pthread_mutex_unlock(&sw->lock);
+		}
+
+		if (error || running == wq->max_workers) {
+			pthread_mutex_unlock(&wq->flush_lock);
+			break;
+		}
+
+		pthread_cond_wait(&wq->flush_cond, &wq->flush_lock);
+		pthread_mutex_unlock(&wq->flush_lock);
+	} while (1);
+
+	if (!error)
+		return 0;
+
+err:
+	log_err("Can't create rate workqueue\n");
+	td_verror(td, ESRCH, "workqueue_init");
+	workqueue_exit(wq);
+	return 1;
+}
diff --git a/workqueue.h b/workqueue.h
new file mode 100644
index 0000000..e35c181
--- /dev/null
+++ b/workqueue.h
@@ -0,0 +1,112 @@
+#ifndef FIO_RATE_H
+#define FIO_RATE_H
+
+#include "flist.h"
+
+struct workqueue_work {
+	struct flist_head list;
+};
+
+struct submit_worker {
+	pthread_t thread;
+	pthread_mutex_t lock;
+	pthread_cond_t cond;
+	struct flist_head work_list;
+	unsigned int flags;
+	unsigned int index;
+	uint64_t seq;
+	struct workqueue *wq;
+	void *priv;
+	struct sk_out *sk_out;
+};
+
+typedef int (workqueue_work_fn)(struct submit_worker *, struct workqueue_work *);
+typedef bool (workqueue_pre_sleep_flush_fn)(struct submit_worker *);
+typedef void (workqueue_pre_sleep_fn)(struct submit_worker *);
+typedef int (workqueue_alloc_worker_fn)(struct submit_worker *);
+typedef void (workqueue_free_worker_fn)(struct submit_worker *);
+typedef int (workqueue_init_worker_fn)(struct submit_worker *);
+typedef void (workqueue_exit_worker_fn)(struct submit_worker *, unsigned int *);
+typedef void (workqueue_update_acct_fn)(struct submit_worker *);
+
+struct workqueue_ops {
+	workqueue_work_fn *fn;
+	workqueue_pre_sleep_flush_fn *pre_sleep_flush_fn;
+	workqueue_pre_sleep_fn *pre_sleep_fn;
+
+	workqueue_update_acct_fn *update_acct_fn;
+
+	workqueue_alloc_worker_fn *alloc_worker_fn;
+	workqueue_free_worker_fn *free_worker_fn;
+
+	workqueue_init_worker_fn *init_worker_fn;
+	workqueue_exit_worker_fn *exit_worker_fn;
+
+	unsigned int nice;
+};
+
+struct workqueue {
+	unsigned int max_workers;
+
+	struct thread_data *td;
+	struct workqueue_ops ops;
+
+	uint64_t work_seq;
+	struct submit_worker *workers;
+	unsigned int next_free_worker;
+
+	pthread_cond_t flush_cond;
+	pthread_mutex_t flush_lock;
+	pthread_mutex_t stat_lock;
+	volatile int wake_idle;
+};
+
+int workqueue_init(struct thread_data *td, struct workqueue *wq, struct workqueue_ops *ops, unsigned int max_workers, struct sk_out *sk_out);
+void workqueue_exit(struct workqueue *wq);
+
+void workqueue_enqueue(struct workqueue *wq, struct workqueue_work *work);
+void workqueue_flush(struct workqueue *wq);
+
+static inline bool workqueue_pre_sleep_check(struct submit_worker *sw)
+{
+	struct workqueue *wq = sw->wq;
+
+	if (!wq->ops.pre_sleep_flush_fn)
+		return false;
+
+	return wq->ops.pre_sleep_flush_fn(sw);
+}
+
+static inline void workqueue_pre_sleep(struct submit_worker *sw)
+{
+	struct workqueue *wq = sw->wq;
+
+	if (wq->ops.pre_sleep_fn)
+		wq->ops.pre_sleep_fn(sw);
+}
+
+static inline int workqueue_init_worker(struct submit_worker *sw)
+{
+	struct workqueue *wq = sw->wq;
+
+	if (!wq->ops.init_worker_fn)
+		return 0;
+
+	return wq->ops.init_worker_fn(sw);
+}
+
+static inline void workqueue_exit_worker(struct submit_worker *sw,
+					 unsigned int *sum_cnt)
+{
+	struct workqueue *wq = sw->wq;
+	unsigned int tmp = 1;
+
+	if (!wq->ops.exit_worker_fn)
+		return;
+
+	if (!sum_cnt)
+		sum_cnt = &tmp;
+
+	wq->ops.exit_worker_fn(sw, sum_cnt);
+}
+#endif
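Taken together, workqueue.h gives callers a small offload API: embed a struct workqueue_work in the payload, provide at least .fn in workqueue_ops, start the pool with workqueue_init(), push items with workqueue_enqueue() (serialized by the submitter), and drain with workqueue_flush() before workqueue_exit(). A hedged sketch of a consumer, assuming it is compiled inside the fio tree; the wrapper struct, function names, and the worker count of 2 are illustrative rather than taken from the real user (rate-submit.c):

#include "fio.h"
#include "workqueue.h"

/* Illustrative work item; the list node is embedded first so a cast suffices. */
struct example_work {
	struct workqueue_work wq_work;
	uint64_t payload;
};

static int example_fn(struct submit_worker *sw, struct workqueue_work *work)
{
	struct example_work *ew = (struct example_work *) work;

	log_info("worker %u handled payload %llu\n", sw->index,
		 (unsigned long long) ew->payload);
	return 0;
}

static struct workqueue_ops example_ops = {
	.fn = example_fn,
};

static int example_offload(struct thread_data *td, struct sk_out *sk_out)
{
	struct workqueue wq;
	struct example_work work = { .payload = 42 };

	if (workqueue_init(td, &wq, &example_ops, 2, sk_out))
		return 1;

	workqueue_enqueue(&wq, &work.wq_work);	/* caller-side serialization assumed */
	workqueue_flush(&wq);			/* wait until every worker is idle again */
	workqueue_exit(&wq);
	return 0;
}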