diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..73b2998
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,19 @@
+# Ninja files
+build.ninja
+
+# Build objects and artifacts
+deps/
+build/
+bin/
+lib/
+*.pyc
+*.pyo
+
+# System files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..12a6b33
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,15 @@
+dist: trusty
+language: c
+compiler:
+ - clang
+ - gcc
+script:
+ - mkdir build
+ - cd build
+ - cmake .. -G Ninja
+ - ninja
+ - ctest --verbose
+addons:
+  apt:
+    packages:
+    - ninja-build
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..a74d59d
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,104 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR)
+
+INCLUDE(GNUInstallDirs)
+
+# ---[ Project
+PROJECT(FXdiv C CXX)
+
+# ---[ Options.
+OPTION(FXDIV_USE_INLINE_ASSEMBLY "Allow use of inline assembly in FXdiv" OFF)
+IF("${CMAKE_SOURCE_DIR}" STREQUAL "${PROJECT_SOURCE_DIR}")
+  OPTION(FXDIV_BUILD_TESTS "Build FXdiv unit tests" ON)
+  OPTION(FXDIV_BUILD_BENCHMARKS "Build FXdiv micro-benchmarks" ON)
+ELSE()
+  SET(FXDIV_BUILD_TESTS OFF CACHE BOOL "Build FXdiv unit tests")
+  SET(FXDIV_BUILD_BENCHMARKS OFF CACHE BOOL "Build FXdiv micro-benchmarks")
+ENDIF()
+
+# ---[ CMake options
+IF(FXDIV_BUILD_TESTS)
+  ENABLE_TESTING()
+ENDIF()
+
+# ---[ Download deps
+SET(CONFU_DEPENDENCIES_SOURCE_DIR ${CMAKE_SOURCE_DIR}/deps
+  CACHE PATH "Confu-style dependencies source directory")
+SET(CONFU_DEPENDENCIES_BINARY_DIR ${CMAKE_BINARY_DIR}/deps
+  CACHE PATH "Confu-style dependencies binary directory")
+
+IF(FXDIV_BUILD_TESTS AND NOT DEFINED GOOGLETEST_SOURCE_DIR)
+  MESSAGE(STATUS "Downloading Google Test to ${CONFU_DEPENDENCIES_SOURCE_DIR}/googletest (define GOOGLETEST_SOURCE_DIR to avoid it)")
+  CONFIGURE_FILE(cmake/DownloadGoogleTest.cmake "${CONFU_DEPENDENCIES_BINARY_DIR}/googletest-download/CMakeLists.txt")
+  EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" .
+    WORKING_DIRECTORY "${CONFU_DEPENDENCIES_BINARY_DIR}/googletest-download")
+  EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" --build .
+    WORKING_DIRECTORY "${CONFU_DEPENDENCIES_BINARY_DIR}/googletest-download")
+  SET(GOOGLETEST_SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/googletest" CACHE STRING "Google Test source directory")
+ENDIF()
+
+IF(FXDIV_BUILD_BENCHMARKS AND NOT DEFINED GOOGLEBENCHMARK_SOURCE_DIR)
+  MESSAGE(STATUS "Downloading Google Benchmark to ${CONFU_DEPENDENCIES_SOURCE_DIR}/googlebenchmark (define GOOGLEBENCHMARK_SOURCE_DIR to avoid it)")
+  CONFIGURE_FILE(cmake/DownloadGoogleBenchmark.cmake "${CONFU_DEPENDENCIES_BINARY_DIR}/googlebenchmark-download/CMakeLists.txt")
+  EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" .
+    WORKING_DIRECTORY "${CONFU_DEPENDENCIES_BINARY_DIR}/googlebenchmark-download")
+  EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" --build .
+    WORKING_DIRECTORY "${CONFU_DEPENDENCIES_BINARY_DIR}/googlebenchmark-download")
+  SET(GOOGLEBENCHMARK_SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/googlebenchmark" CACHE STRING "Google Benchmark source directory")
+ENDIF()
+
+# ---[ FXdiv library
+IF(${CMAKE_VERSION} VERSION_LESS "3.0")
+  ADD_LIBRARY(fxdiv STATIC include/fxdiv.h)
+  SET_TARGET_PROPERTIES(fxdiv PROPERTIES LINKER_LANGUAGE C)
+ELSE()
+  ADD_LIBRARY(fxdiv INTERFACE)
+ENDIF()
+TARGET_INCLUDE_DIRECTORIES(fxdiv INTERFACE include)
+IF(NOT FXDIV_USE_INLINE_ASSEMBLY)
+  TARGET_COMPILE_DEFINITIONS(fxdiv INTERFACE FXDIV_USE_INLINE_ASSEMBLY=0)
+ENDIF()
+
+INSTALL(FILES include/fxdiv.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+IF(FXDIV_BUILD_TESTS)
+  # ---[ Build google test
+  IF(NOT TARGET gtest)
+    SET(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+    ADD_SUBDIRECTORY(
+      "${GOOGLETEST_SOURCE_DIR}"
+      "${CONFU_DEPENDENCIES_BINARY_DIR}/googletest")
+  ENDIF()
+
+  ADD_EXECUTABLE(multiply-high-test test/multiply-high.cc)
+  TARGET_LINK_LIBRARIES(multiply-high-test fxdiv gtest gtest_main)
+  ADD_TEST(multiply-high multiply-high-test)
+
+  ADD_EXECUTABLE(quotient-test test/quotient.cc)
+  TARGET_LINK_LIBRARIES(quotient-test fxdiv gtest gtest_main)
+  ADD_TEST(quotient quotient-test)
+ENDIF()
+
+IF(FXDIV_BUILD_BENCHMARKS)
+  # ---[ Build google benchmark
+  IF(NOT TARGET benchmark)
+    SET(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE)
+    ADD_SUBDIRECTORY(
+      "${GOOGLEBENCHMARK_SOURCE_DIR}"
+      "${CONFU_DEPENDENCIES_BINARY_DIR}/googlebenchmark")
+  ENDIF()
+
+  ADD_EXECUTABLE(init-bench bench/init.cc)
+  TARGET_LINK_LIBRARIES(init-bench fxdiv benchmark)
+
+  ADD_EXECUTABLE(multiply-bench bench/multiply.cc)
+  TARGET_LINK_LIBRARIES(multiply-bench fxdiv benchmark)
+
+  ADD_EXECUTABLE(divide-bench bench/divide.cc)
+  TARGET_LINK_LIBRARIES(divide-bench fxdiv benchmark)
+
+  ADD_EXECUTABLE(quotient-bench bench/quotient.cc)
+  TARGET_LINK_LIBRARIES(quotient-bench fxdiv benchmark)
+
+  ADD_EXECUTABLE(round-down-bench bench/round-down.cc)
+  TARGET_LINK_LIBRARIES(round-down-bench fxdiv benchmark)
+ENDIF()
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..f25bb41
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,10 @@
+The MIT License (MIT)
+
+Copyright (c) 2017 Facebook Inc.
+Copyright (c) 2016-2017 Marat Dukhan
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..2e9e231
--- /dev/null
+++ b/README.md
@@ -0,0 +1,60 @@
+# FXdiv
+
+[![MIT License](https://img.shields.io/badge/License-MIT%20License-blue.svg)](https://github.com/Maratyszcza/FXdiv/blob/master/LICENSE)
+[![Build Status](https://img.shields.io/travis/Maratyszcza/FXdiv.svg)](https://travis-ci.org/Maratyszcza/FXdiv)
+
+
+Header-only library for division via fixed-point multiplication by inverse
+
+On modern CPUs and GPUs integer division is several times slower than multiplication. FXdiv implements an algorithm to replace an integer division with a multiplication and two shifts. This algorithm improves performance when an application performs repeated divisions by the same divisor.
+
+## Features
+
+- Integer division for `uint32_t`, `uint64_t`, and `size_t`
+- Header-only library, no installation or build required
+- Compatible with C99, C++, OpenCL, and CUDA
+- Uses platform-specific compiler intrinsics for optimal performance
+- Covered with unit tests and microbenchmarks
+
+## Example
+
+```c
+#include <fxdiv.h>
+
+/* Division of array by a constant: reference implementation */
+void divide_array_c(size_t length, uint32_t array[], uint32_t divisor) {
+  for (size_t i = 0; i < length; i++) {
+    array[i] /= divisor;
+  }
+}
+
+/* Division of array by a constant: implementation with FXdiv */
+void divide_array_fxdiv(size_t length, uint32_t array[], uint32_t divisor) {
+  const struct fxdiv_divisor_uint32_t precomputed_divisor =
+    fxdiv_init_uint32_t(divisor);
+  for (size_t i = 0; i < length; i++) {
+    array[i] = fxdiv_quotient_uint32_t(array[i], precomputed_divisor);
+  }
+}
+```
+
+## Status
+
+Project is in alpha stage. API is unstable. Currently working features:
+
+| Platform        | uint32_t | uint64_t | size_t   |
+| --------------- |:--------:|:--------:|:--------:|
+| x86-64 gcc      | Works    | Works    | Works    |
+| x86-64 MSVC     | Works    | Works    | Works    |
+| x86 gcc         | Works    | Works    | Works    |
+| x86 MSVC        | Works    | Works    | Works    |
+| ARMv7 gcc       | Works    | Works    | Works    |
+| PPC64 gcc       | Works    | Works    | Works    |
+| PNaCl clang     | Works    | Works    | Works    |
+| Asm.js clang    | Works    | Works    | Works    |
+| CUDA            | Untested | Untested | Untested |
+| OpenCL          | Untested | Untested | Untested |
+
+## References
+
+- Granlund, Torbjörn, and Peter L. Montgomery. "Division by invariant integers using multiplication." In ACM SIGPLAN Notices, vol. 29, no. 6, pp. 61-72. ACM, 1994. Available: [gmplib.org/~tege/divcnst-pldi94.pdf](https://gmplib.org/~tege/divcnst-pldi94.pdf)
diff --git a/bench/divide.cc b/bench/divide.cc
new file mode 100644
index 0000000..78aa38c
--- /dev/null
+++ b/bench/divide.cc
@@ -0,0 +1,51 @@
+#include <benchmark/benchmark.h>
+
+#include <fxdiv.h>
+
+static void fxdiv_divide_uint32_t(benchmark::State& state) {
+	const fxdiv_divisor_uint32_t divisor = fxdiv_init_uint32_t(UINT32_C(65537));
+	uint32_t x = 0;
+	while (state.KeepRunning()) {
+		const fxdiv_result_uint32_t result = fxdiv_divide_uint32_t(x++, divisor);
+		benchmark::DoNotOptimize(result);
+	}
+}
+BENCHMARK(fxdiv_divide_uint32_t);
+
+static void fxdiv_divide_uint64_t(benchmark::State& state) {
+	const fxdiv_divisor_uint64_t divisor = fxdiv_init_uint64_t(UINT64_C(4294967311));
+	uint64_t x = 0;
+	while (state.KeepRunning()) {
+		const fxdiv_result_uint64_t result = fxdiv_divide_uint64_t(x++, divisor);
+		benchmark::DoNotOptimize(result);
+	}
+}
+BENCHMARK(fxdiv_divide_uint64_t);
+
+static void native_divide_uint32_t(benchmark::State& state) {
+	uint32_t divisor = UINT32_C(65537);
+	benchmark::DoNotOptimize(&divisor);
+	uint32_t x = 0;
+	while (state.KeepRunning()) {
+		const uint32_t quotient = x / divisor;
+		const uint32_t remainder = x++ % divisor;
+		benchmark::DoNotOptimize(quotient);
+		benchmark::DoNotOptimize(remainder);
+	}
+}
+BENCHMARK(native_divide_uint32_t);
+
+static void native_divide_uint64_t(benchmark::State& state) {
+	uint64_t divisor = UINT64_C(4294967311);
+	benchmark::DoNotOptimize(&divisor);
+	uint64_t x = 0;
+	while (state.KeepRunning()) {
+		const uint64_t quotient = x / divisor;
+		const uint64_t remainder = x++ % divisor;
+		benchmark::DoNotOptimize(quotient);
+		benchmark::DoNotOptimize(remainder);
+	}
+}
+BENCHMARK(native_divide_uint64_t);
+
+BENCHMARK_MAIN();
diff --git a/bench/init.cc b/bench/init.cc
new file mode 100644
index 0000000..060db49
--- /dev/null
+++ b/bench/init.cc
@@ -0,0 +1,23 @@
+#include <benchmark/benchmark.h>
+
+#include <fxdiv.h>
+
+static void init_uint32_t(benchmark::State& state) {
+	uint32_t d = UINT32_C(0x1971DB6B);
+	while (state.KeepRunning()) {
+		const fxdiv_divisor_uint32_t divisor = fxdiv_init_uint32_t(d++);
+		benchmark::DoNotOptimize(divisor);
+	}
+}
+BENCHMARK(init_uint32_t);
+
+static void init_uint64_t(benchmark::State& state) {
+	uint64_t d = UINT64_C(0x425E892B38148FAD);
+	while (state.KeepRunning()) {
+		const fxdiv_divisor_uint64_t divisor = fxdiv_init_uint64_t(d++);
+		benchmark::DoNotOptimize(divisor);
+	}
+}
+BENCHMARK(init_uint64_t);
+
+BENCHMARK_MAIN();
diff --git a/bench/multiply.cc b/bench/multiply.cc
new file mode 100644
index 0000000..c9757a4
--- /dev/null
+++ b/bench/multiply.cc
@@ -0,0 +1,47 @@
+#include <benchmark/benchmark.h>
+
+#include <fxdiv.h>
+
+static void fxdiv_mulext_uint32_t(benchmark::State& state) {
+	uint32_t c = UINT32_C(0x1971DB6B);
+	benchmark::DoNotOptimize(&c);
+	uint32_t d = c;
+	while (state.KeepRunning()) {
+		const uint64_t product = fxdiv_mulext_uint32_t(c, d++);
+		benchmark::DoNotOptimize(product);
+	}
+}
+BENCHMARK(fxdiv_mulext_uint32_t);
+
+static void native_mulext_uint32_t(benchmark::State& state) {
+	uint32_t c = UINT32_C(0x1971DB6B);
+	benchmark::DoNotOptimize(&c);
+	uint32_t d = c;
+	while (state.KeepRunning()) {
+		const uint64_t product = (uint64_t) c * (uint64_t) (d++);
+		benchmark::DoNotOptimize(product);
+	}
+}
+BENCHMARK(native_mulext_uint32_t);
+
+static void fxdiv_mulhi_uint32_t(benchmark::State& state) {
+	const uint32_t c = UINT32_C(0x1971DB6B);
+	uint32_t x = c;
+	while (state.KeepRunning()) {
+		const uint32_t product = fxdiv_mulhi_uint32_t(c, x++);
+		benchmark::DoNotOptimize(product);
+	}
+}
+BENCHMARK(fxdiv_mulhi_uint32_t);
+
+static void fxdiv_mulhi_uint64_t(benchmark::State& state) {
+	const uint64_t c = UINT64_C(0x425E892B38148FAD);
+	uint64_t x = c;
+	while (state.KeepRunning()) {
+		const uint64_t product = fxdiv_mulhi_uint64_t(c, x++);
+		benchmark::DoNotOptimize(product);
+	}
+}
+BENCHMARK(fxdiv_mulhi_uint64_t);
+
+BENCHMARK_MAIN();
diff --git a/bench/quotient.cc b/bench/quotient.cc
new file mode 100644
index 0000000..374ac2b
--- /dev/null
+++ b/bench/quotient.cc
@@ -0,0 +1,47 @@
+#include <benchmark/benchmark.h>
+
+#include <fxdiv.h>
+
+static void fxdiv_quotient_uint32_t(benchmark::State& state) {
+	const fxdiv_divisor_uint32_t divisor = fxdiv_init_uint32_t(UINT32_C(65537));
+	uint32_t x = 0;
+	while (state.KeepRunning()) {
+		uint32_t quotient = fxdiv_quotient_uint32_t(x++, divisor);
+		benchmark::DoNotOptimize(quotient);
+	}
+}
+BENCHMARK(fxdiv_quotient_uint32_t);
+
+static void fxdiv_quotient_uint64_t(benchmark::State& state) {
+	const fxdiv_divisor_uint64_t divisor = fxdiv_init_uint64_t(UINT64_C(4294967311));
+	uint64_t x = 0;
+	while (state.KeepRunning()) {
+		uint64_t quotient = fxdiv_quotient_uint64_t(x++, divisor);
+		benchmark::DoNotOptimize(quotient);
+	}
+}
+BENCHMARK(fxdiv_quotient_uint64_t);
+
+static void native_quotient_uint32_t(benchmark::State& state) {
+	uint32_t divisor = UINT32_C(65537);
+	benchmark::DoNotOptimize(&divisor);
+	uint32_t x = UINT32_MAX;
+	while (state.KeepRunning()) {
+		uint32_t quotient = x-- / divisor;
+		benchmark::DoNotOptimize(quotient);
+	}
+}
+BENCHMARK(native_quotient_uint32_t);
+
+static void native_quotient_uint64_t(benchmark::State& state) {
+	uint64_t divisor = UINT64_C(4294967311);
+    benchmark::DoNotOptimize(&divisor);
+	uint64_t x = UINT64_MAX;
+	while (state.KeepRunning()) {
+		const uint64_t quotient = x-- / divisor;
+		benchmark::DoNotOptimize(quotient);
+	}
+}
+BENCHMARK(native_quotient_uint64_t);
+
+BENCHMARK_MAIN();
diff --git a/bench/round-down.cc b/bench/round-down.cc
new file mode 100644
index 0000000..860ce11
--- /dev/null
+++ b/bench/round-down.cc
@@ -0,0 +1,47 @@
+#include <benchmark/benchmark.h>
+
+#include <fxdiv.h>
+
+static void fxdiv_round_down_uint32_t(benchmark::State& state) {
+	const fxdiv_divisor_uint32_t multiple = fxdiv_init_uint32_t(UINT32_C(65537));
+	uint32_t x = 0;
+	while (state.KeepRunning()) {
+		const uint32_t rounded_x = fxdiv_round_down_uint32_t(x++, multiple);
+		benchmark::DoNotOptimize(rounded_x);
+	}
+}
+BENCHMARK(fxdiv_round_down_uint32_t);
+
+static void fxdiv_round_down_uint64_t(benchmark::State& state) {
+	const fxdiv_divisor_uint64_t multiple = fxdiv_init_uint64_t(UINT64_C(4294967311));
+	uint64_t x = 0;
+	while (state.KeepRunning()) {
+		const uint64_t rounded_x = fxdiv_round_down_uint64_t(x++, multiple);
+		benchmark::DoNotOptimize(rounded_x);
+	}
+}
+BENCHMARK(fxdiv_round_down_uint64_t);
+
+static void native_round_down_uint32_t(benchmark::State& state) {
+	uint32_t multiple = UINT32_C(65537);
+	benchmark::DoNotOptimize(&multiple);
+	uint32_t x = 0;
+	while (state.KeepRunning()) {
+		const uint32_t rounded_x = x++ / multiple * multiple;
+		benchmark::DoNotOptimize(rounded_x);
+	}
+}
+BENCHMARK(native_round_down_uint32_t);
+
+static void native_round_down_uint64_t(benchmark::State& state) {
+	uint64_t multiple = UINT64_C(4294967311);
+	benchmark::DoNotOptimize(&multiple);
+	uint64_t x = 0;
+	while (state.KeepRunning()) {
+		const uint64_t rounded_x = x++ / multiple * multiple;
+		benchmark::DoNotOptimize(rounded_x);
+	}
+}
+BENCHMARK(native_round_down_uint64_t);
+
+BENCHMARK_MAIN();
diff --git a/cmake/DownloadGoogleBenchmark.cmake b/cmake/DownloadGoogleBenchmark.cmake
new file mode 100644
index 0000000..349e7cb
--- /dev/null
+++ b/cmake/DownloadGoogleBenchmark.cmake
@@ -0,0 +1,15 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8.2)
+
+PROJECT(googlebenchmark-download NONE)
+
+INCLUDE(ExternalProject)
+ExternalProject_Add(googlebenchmark
+	URL https://github.com/google/benchmark/archive/v1.2.0.zip
+	URL_HASH SHA256=cc463b28cb3701a35c0855fbcefb75b29068443f1952b64dd5f4f669272e95ea
+	SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/googlebenchmark"
+	BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/googlebenchmark"
+	CONFIGURE_COMMAND ""
+	BUILD_COMMAND ""
+	INSTALL_COMMAND ""
+	TEST_COMMAND ""
+)
diff --git a/cmake/DownloadGoogleTest.cmake b/cmake/DownloadGoogleTest.cmake
new file mode 100644
index 0000000..19f5eb1
--- /dev/null
+++ b/cmake/DownloadGoogleTest.cmake
@@ -0,0 +1,15 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8.2)
+
+PROJECT(googletest-download NONE)
+
+INCLUDE(ExternalProject)
+ExternalProject_Add(googletest
+	URL https://github.com/google/googletest/archive/release-1.8.0.zip
+	URL_HASH SHA256=f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf
+	SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/googletest"
+	BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/googletest"
+	CONFIGURE_COMMAND ""
+	BUILD_COMMAND ""
+	INSTALL_COMMAND ""
+	TEST_COMMAND ""
+)
diff --git a/configure.py b/configure.py
new file mode 100755
index 0000000..e61363f
--- /dev/null
+++ b/configure.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+
+
+import confu
+parser = confu.standard_parser("FXdiv configuration script")
+
+
+def main(args):
+    options = parser.parse_args(args)
+    build = confu.Build.from_options(options)
+
+    build.export_cpath("include", ["fxdiv.h"])
+
+    with build.options(source_dir="test", deps=build.deps.googletest):
+        build.unittest("multiply-high-test", build.cxx("multiply-high.cc"))
+        build.unittest("quotient-test", build.cxx("quotient.cc"))
+
+    with build.options(source_dir="bench", deps=build.deps.googlebenchmark):
+        build.benchmark("init-bench", build.cxx("init.cc"))
+        build.benchmark("multiply-bench", build.cxx("multiply.cc"))
+        build.benchmark("divide-bench", build.cxx("divide.cc"))
+        build.benchmark("quotient-bench", build.cxx("quotient.cc"))
+        build.benchmark("round-down-bench", build.cxx("round-down.cc"))
+
+    return build
+
+
+if __name__ == "__main__":
+    import sys
+    main(sys.argv[1:]).generate()
diff --git a/confu.yaml b/confu.yaml
new file mode 100644
index 0000000..2f1bf46
--- /dev/null
+++ b/confu.yaml
@@ -0,0 +1,6 @@
+name: fxdiv
+title: division via fixed-point multiplication by inverse
+license: MIT
+deps:
+  - name: googletest
+  - name: googlebenchmark
diff --git a/include/fxdiv.h b/include/fxdiv.h
new file mode 100644
index 0000000..21a3dc1
--- /dev/null
+++ b/include/fxdiv.h
@@ -0,0 +1,409 @@
+#pragma once
+#ifndef FXDIV_H
+#define FXDIV_H
+
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+	#include <cstddef>
+	#include <cstdint>
+	#include <climits>
+#elif !defined(__OPENCL_VERSION__)
+	#include <stddef.h>
+	#include <stdint.h>
+	#include <limits.h>
+#endif
+
+#if defined(_MSC_VER)
+	#include <intrin.h>
+#endif
+
+#ifndef FXDIV_USE_INLINE_ASSEMBLY
+	#define FXDIV_USE_INLINE_ASSEMBLY 1
+#endif
+
+static inline uint64_t fxdiv_mulext_uint32_t(uint32_t a, uint32_t b) {
+#if defined(_MSC_VER) && defined(_M_IX86)
+	return (uint64_t) __emulu((unsigned int) a, (unsigned int) b);
+#else
+	return (uint64_t) a * (uint64_t) b;
+#endif
+}
+
+static inline uint32_t fxdiv_mulhi_uint32_t(uint32_t a, uint32_t b) {
+#if defined(__OPENCL_VERSION__)
+	return mul_hi(a, b);
+#elif defined(__CUDA_ARCH__)
+	return (uint32_t) __umulhi((unsigned int) a, (unsigned int) b);
+#elif defined(_MSC_VER) && defined(_M_IX86)
+	return (uint32_t) (__emulu((unsigned int) a, (unsigned int) b) >> 32);
+#elif defined(_MSC_VER) && defined(_M_ARM)
+	return (uint32_t) _MulUnsignedHigh((unsigned long) a, (unsigned long) b);
+#else
+	return (uint32_t) (((uint64_t) a * (uint64_t) b) >> 32);
+#endif
+}
+
+static inline uint64_t fxdiv_mulhi_uint64_t(uint64_t a, uint64_t b) {
+#if defined(__OPENCL_VERSION__)
+	return mul_hi(a, b);
+#elif defined(__CUDA_ARCH__)
+	return (uint64_t) __umul64hi((unsigned long long) a, (unsigned long long) b);
+#elif defined(_MSC_VER) && defined(_M_X64)
+	return (uint64_t) __umulh((unsigned __int64) a, (unsigned __int64) b);
+#elif defined(__GNUC__) && defined(__SIZEOF_INT128__)
+	return (uint64_t) (((((unsigned __int128) a) * ((unsigned __int128) b))) >> 64);
+#else
+	const uint32_t a_lo = (uint32_t) a;
+	const uint32_t a_hi = (uint32_t) (a >> 32);
+	const uint32_t b_lo = (uint32_t) b;
+	const uint32_t b_hi = (uint32_t) (b >> 32);
+
+	const uint64_t t = fxdiv_mulext_uint32_t(a_hi, b_lo) +
+		(uint64_t) fxdiv_mulhi_uint32_t(a_lo, b_lo);
+	return fxdiv_mulext_uint32_t(a_hi, b_hi) + (t >> 32) +
+		((fxdiv_mulext_uint32_t(a_lo, b_hi) + (uint64_t) (uint32_t) t) >> 32);
+#endif
+}
+
+static inline size_t fxdiv_mulhi_size_t(size_t a, size_t b) {
+#if SIZE_MAX == UINT32_MAX
+	return (size_t) fxdiv_mulhi_uint32_t((uint32_t) a, (uint32_t) b);
+#elif SIZE_MAX == UINT64_MAX
+	return (size_t) fxdiv_mulhi_uint64_t((uint64_t) a, (uint64_t) b);
+#else
+	#error Unsupported platform
+#endif
+}
+
+struct fxdiv_divisor_uint32_t {
+	uint32_t value;
+	uint32_t m;
+	uint8_t s1;
+	uint8_t s2;
+};
+
+struct fxdiv_result_uint32_t {
+	uint32_t quotient;
+	uint32_t remainder;
+};
+
+struct fxdiv_divisor_uint64_t {
+	uint64_t value;
+	uint64_t m;
+	uint8_t s1;
+	uint8_t s2;
+};
+
+struct fxdiv_result_uint64_t {
+	uint64_t quotient;
+	uint64_t remainder;
+};
+
+struct fxdiv_divisor_size_t {
+	size_t value;
+	size_t m;
+	uint8_t s1;
+	uint8_t s2;
+};
+
+struct fxdiv_result_size_t {
+	size_t quotient;
+	size_t remainder;
+};
+
+static inline struct fxdiv_divisor_uint32_t fxdiv_init_uint32_t(uint32_t d) {
+	struct fxdiv_divisor_uint32_t result = { d };
+	if (d == 1) {
+		result.m = UINT32_C(1);
+		result.s1 = 0;
+		result.s2 = 0;
+	} else {
+		#if defined(__OPENCL_VERSION__)
+			const uint32_t l_minus_1 = 31 - clz(d - 1);
+		#elif defined(__CUDA_ARCH__)
+			const uint32_t l_minus_1 = 31 - __clz((int) (d - 1));
+		#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
+			unsigned long l_minus_1;
+			_BitScanReverse(&l_minus_1, (unsigned long) (d - 1));
+		#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && FXDIV_USE_INLINE_ASSEMBLY
+			uint32_t l_minus_1;
+			__asm__("BSRL %[d_minus_1], %[l_minus_1]"
+				: [l_minus_1] "=r" (l_minus_1)
+				: [d_minus_1] "r" (d - 1));
+		#elif defined(__GNUC__)
+			const uint32_t l_minus_1 = 31 - __builtin_clz(d - 1);
+		#else
+			/* Based on Algorithm 2 from Hacker's delight */
+
+			uint32_t l_minus_1 = 0;
+			uint32_t x = d - 1;
+			uint32_t y = x >> 16;
+			if (y != 0) {
+				l_minus_1 += 16;
+				x = y;
+			}
+			y = x >> 8;
+			if (y != 0) {
+				l_minus_1 += 8;
+				x = y;
+			}
+			y = x >> 4;
+			if (y != 0) {
+				l_minus_1 += 4;
+				x = y;
+			}
+			y = x >> 2;
+			if (y != 0) {
+				l_minus_1 += 2;
+				x = y;
+			}
+			if ((x & 2) != 0) {
+				l_minus_1 += 1;
+			}
+		#endif
+		uint32_t u_hi = (UINT32_C(2) << (uint32_t) l_minus_1) - d;
+
+		/* Division of 64-bit number u_hi:UINT32_C(0) by 32-bit number d, 32-bit quotient output q */
+		#if defined(__GNUC__) && defined(__i386__) && FXDIV_USE_INLINE_ASSEMBLY
+			uint32_t q;
+			__asm__("DIVL %[d]"
+				: "=a" (q), "+d" (u_hi)
+				: [d] "r" (d), "a" (0));
+		#else
+			const uint32_t q = ((uint64_t) u_hi << 32) / d;
+		#endif
+
+		result.m = q + UINT32_C(1);
+		result.s1 = 1;
+		result.s2 = (uint8_t) l_minus_1;
+	}
+	return result;
+}
+
+static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) {
+	struct fxdiv_divisor_uint64_t result = { d };
+	if (d == 1) {
+		result.m = UINT64_C(1);
+		result.s1 = 0;
+		result.s2 = 0;
+	} else {
+		#if defined(__OPENCL_VERSION__)
+			const uint32_t nlz_d = clz(d);
+			const uint32_t l_minus_1 = 63 - clz(d - 1);
+		#elif defined(__CUDA_ARCH__)
+			const uint32_t nlz_d = __clzll((long long) d);
+			const uint32_t l_minus_1 = 63 - __clzll((long long) (d - 1));
+		#elif defined(_MSC_VER) && defined(_M_X64)
+			unsigned long l_minus_1;
+			_BitScanReverse64(&l_minus_1, (unsigned __int64) (d - 1));
+			unsigned long bsr_d;
+			_BitScanReverse64(&bsr_d, (unsigned __int64) d);
+			const uint32_t nlz_d = bsr_d ^ 0x3F;
+		#elif defined(_MSC_VER) && defined(_M_IX86)
+			const uint64_t d_minus_1 = d - 1;
+			const uint8_t d_is_power_of_2 = (d & d_minus_1) == 0;
+			unsigned long l_minus_1;
+			if ((uint32_t) (d_minus_1 >> 32) == 0) {
+				_BitScanReverse(&l_minus_1, (unsigned long) d_minus_1);
+			} else {
+				_BitScanReverse(&l_minus_1, (unsigned long) (uint32_t) (d_minus_1 >> 32));
+				l_minus_1 += 32;
+			}
+			const uint32_t nlz_d = ((uint8_t) l_minus_1 ^ UINT8_C(0x3F)) - d_is_power_of_2;
+		#elif defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
+			uint64_t l_minus_1;
+			__asm__("BSRQ %[d_minus_1], %[l_minus_1]"
+				: [l_minus_1] "=r" (l_minus_1)
+				: [d_minus_1] "r" (d - 1));
+		#elif defined(__GNUC__)
+			const uint32_t l_minus_1 = 63 - __builtin_clzll(d - 1);
+			const uint32_t nlz_d = __builtin_clzll(d);
+		#else
+			/* Based on Algorithm 2 from Hacker's delight */
+			const uint64_t d_minus_1 = d - 1;
+			const uint32_t d_is_power_of_2 = (d & d_minus_1) == 0;
+			uint64_t l_minus_1 = 0;
+			uint32_t x = d_minus_1;
+			uint32_t y = d_minus_1 >> 32;
+			if (y != 0) {
+				l_minus_1 += 32;
+				x = y;
+			}
+			y = x >> 16;
+			if (y != 0) {
+				l_minus_1 += 16;
+				x = y;
+			}
+			y = x >> 8;
+			if (y != 0) {
+				l_minus_1 += 8;
+				x = y;
+			}
+			y = x >> 4;
+			if (y != 0) {
+				l_minus_1 += 4;
+				x = y;
+			}
+			y = x >> 2;
+			if (y != 0) {
+				l_minus_1 += 2;
+				x = y;
+			}
+			if ((x & 2) != 0) {
+				l_minus_1 += 1;
+			}
+			const uint32_t nlz_d = (l_minus_1 ^ UINT32_C(0x3F)) - d_is_power_of_2;
+		#endif
+		uint64_t u_hi = (UINT64_C(2) << (uint32_t) l_minus_1) - d;
+
+		/* Division of 128-bit number u_hi:UINT64_C(0) by 64-bit number d, 64-bit quotient output q */
+		#if defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
+			uint64_t q;
+			__asm__("DIVQ %[d]"
+				: "=a" (q), "+d" (u_hi)
+				: [d] "r" (d), "a" (UINT64_C(0)));
+		#else
+			/* Implementation based on code from Hacker's delight */
+
+			/* Normalize divisor and shift divident left */
+			d <<= nlz_d;
+			u_hi <<= nlz_d;
+			/* Break divisor up into two 32-bit digits */
+			const uint64_t d_hi = (uint32_t) (d >> 32);
+			const uint32_t d_lo = (uint32_t) d;
+
+			/* Compute the first quotient digit, q1 */
+			uint64_t q1 = u_hi / d_hi;
+			uint64_t r1 = u_hi - q1 * d_hi;
+
+			while ((q1 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q1, d_lo) > (r1 << 32)) {
+				q1 -= 1;
+				r1 += d_hi;
+				if ((r1 >> 32) != 0) {
+					break;
+				}
+			}
+
+			/* Multiply and subtract. */
+			u_hi = (u_hi << 32) - q1 * d;
+
+			/* Compute the second quotient digit, q0 */
+			uint64_t q0 = u_hi / d_hi;
+			uint64_t r0 = u_hi - q0 * d_hi;
+
+			while ((q0 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q0, d_lo) > (r0 << 32)) {
+				q0 -= 1;
+				r0 += d_hi;
+				if ((r0 >> 32) != 0) {
+					break;
+				}
+			}
+			const uint64_t q = (q1 << 32) | (uint32_t) q0;
+		#endif
+		result.m = q + UINT64_C(1);
+		result.s1 = 1;
+		result.s2 = (uint8_t) l_minus_1;
+	}
+	return result;
+}
+
+static inline struct fxdiv_divisor_size_t fxdiv_init_size_t(size_t d) {
+#if SIZE_MAX == UINT32_MAX
+	const struct fxdiv_divisor_uint32_t uint_result = fxdiv_init_uint32_t((uint32_t) d);
+#elif SIZE_MAX == UINT64_MAX
+	const struct fxdiv_divisor_uint64_t uint_result = fxdiv_init_uint64_t((uint64_t) d);
+#else
+	#error Unsupported platform
+#endif
+	struct fxdiv_divisor_size_t size_result = {
+		(size_t) uint_result.value,
+		(size_t) uint_result.m,
+		uint_result.s1,
+		uint_result.s2
+	};
+	return size_result;
+}
+
+static inline uint32_t fxdiv_quotient_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
+	const uint32_t t = fxdiv_mulhi_uint32_t(n, divisor.m);
+	return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
+}
+
+static inline uint64_t fxdiv_quotient_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
+	const uint64_t t = fxdiv_mulhi_uint64_t(n, divisor.m);
+	return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
+}
+
+static inline size_t fxdiv_quotient_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
+#if SIZE_MAX == UINT32_MAX
+	const struct fxdiv_divisor_uint32_t uint32_divisor = {
+		(uint32_t) divisor.value,
+		(uint32_t) divisor.m,
+		divisor.s1,
+		divisor.s2
+	};
+	return fxdiv_quotient_uint32_t((uint32_t) n, uint32_divisor);
+#elif SIZE_MAX == UINT64_MAX
+	const struct fxdiv_divisor_uint64_t uint64_divisor = {
+		(uint64_t) divisor.value,
+		(uint64_t) divisor.m,
+		divisor.s1,
+		divisor.s2
+	};
+	return fxdiv_quotient_uint64_t((uint64_t) n, uint64_divisor);
+#else
+	#error Unsupported platform
+#endif
+}
+
+static inline uint32_t fxdiv_remainder_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
+	const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
+	return n - quotient * divisor.value;
+}
+
+static inline uint64_t fxdiv_remainder_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
+	const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
+	return n - quotient * divisor.value;
+}
+
+static inline size_t fxdiv_remainder_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
+	const size_t quotient = fxdiv_quotient_size_t(n, divisor);
+	return n - quotient * divisor.value;
+}
+
+static inline uint32_t fxdiv_round_down_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t granularity) {
+	const uint32_t quotient = fxdiv_quotient_uint32_t(n, granularity);
+	return quotient * granularity.value;
+}
+
+static inline uint64_t fxdiv_round_down_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t granularity) {
+	const uint64_t quotient = fxdiv_quotient_uint64_t(n, granularity);
+	return quotient * granularity.value;
+}
+
+static inline size_t fxdiv_round_down_size_t(size_t n, const struct fxdiv_divisor_size_t granularity) {
+	const size_t quotient = fxdiv_quotient_size_t(n, granularity);
+	return quotient * granularity.value;
+}
+
+static inline struct fxdiv_result_uint32_t fxdiv_divide_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
+	const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
+	const uint32_t remainder = n - quotient * divisor.value;
+	struct fxdiv_result_uint32_t result = { quotient, remainder };
+	return result;
+}
+
+static inline struct fxdiv_result_uint64_t fxdiv_divide_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
+	const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
+	const uint64_t remainder = n - quotient * divisor.value;
+	struct fxdiv_result_uint64_t result = { quotient, remainder };
+	return result;
+}
+
+static inline struct fxdiv_result_size_t fxdiv_divide_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
+	const size_t quotient = fxdiv_quotient_size_t(n, divisor);
+	const size_t remainder = n - quotient * divisor.value;
+	struct fxdiv_result_size_t result = { quotient, remainder };
+	return result;
+}
+
+#endif /* FXDIV_H */
diff --git a/jni/Android.mk b/jni/Android.mk
new file mode 100644
index 0000000..4e151dc
--- /dev/null
+++ b/jni/Android.mk
@@ -0,0 +1,6 @@
+LOCAL_PATH := $(call my-dir)/..
+
+include $(CLEAR_VARS)
+LOCAL_MODULE := fxdiv
+LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/include
+include $(BUILD_STATIC_LIBRARY)
diff --git a/jni/Application.mk b/jni/Application.mk
new file mode 100644
index 0000000..0f660e3
--- /dev/null
+++ b/jni/Application.mk
@@ -0,0 +1,4 @@
+APP_PLATFORM := android-15
+APP_PIE := true
+APP_STL := c++_static
+APP_ABI := all
diff --git a/test/multiply-high.cc b/test/multiply-high.cc
new file mode 100644
index 0000000..10f9013
--- /dev/null
+++ b/test/multiply-high.cc
@@ -0,0 +1,33 @@
+#include <gtest/gtest.h>
+
+#include <fxdiv.h>
+
+TEST(uint32_t, cases) {
+	EXPECT_EQ(UINT32_C(0),              fxdiv_mulhi_uint32_t(UINT32_C(0),         UINT32_C(0)));
+	EXPECT_EQ(UINT32_C(0),              fxdiv_mulhi_uint32_t(UINT32_C(1),         UINT32_C(1)));
+	EXPECT_EQ(UINT32_C(0),              fxdiv_mulhi_uint32_t(UINT32_C(1),         UINT32_MAX));
+	EXPECT_EQ(UINT32_C(0),              fxdiv_mulhi_uint32_t(UINT32_MAX,          UINT32_C(1)));
+	EXPECT_EQ(UINT32_C(1),              fxdiv_mulhi_uint32_t(UINT32_C(65536),     UINT32_C(65536)));
+	EXPECT_EQ(UINT32_MAX - UINT32_C(1), fxdiv_mulhi_uint32_t(UINT32_MAX,          UINT32_MAX));
+	EXPECT_EQ(UINT32_C(28389652),       fxdiv_mulhi_uint32_t(UINT32_C(123456789), UINT32_C(987654321)));
+	EXPECT_EQ(UINT32_C(28389652),       fxdiv_mulhi_uint32_t(UINT32_C(987654321), UINT32_C(123456789)));
+}
+
+TEST(uint64_t, cases) {
+	EXPECT_EQ(UINT64_C(0),              fxdiv_mulhi_uint64_t(UINT64_C(0),                 UINT64_C(0)));
+	EXPECT_EQ(UINT64_C(0),              fxdiv_mulhi_uint64_t(UINT64_C(1),                 UINT64_C(1)));
+	EXPECT_EQ(UINT64_C(0),              fxdiv_mulhi_uint64_t(UINT64_C(1),                 UINT64_MAX));
+	EXPECT_EQ(UINT64_C(0),              fxdiv_mulhi_uint64_t(UINT64_MAX,                  UINT64_C(1)));
+	EXPECT_EQ(UINT64_C(1),              fxdiv_mulhi_uint64_t(UINT64_C(4294967296),        UINT64_C(4294967296)));
+	EXPECT_EQ(UINT64_MAX - UINT64_C(1), fxdiv_mulhi_uint64_t(UINT64_MAX,                  UINT64_MAX));
+	EXPECT_EQ(UINT64_C(66099812259603), fxdiv_mulhi_uint64_t(UINT64_C(12345678987654321), UINT64_C(98765432123456789)));
+	EXPECT_EQ(UINT64_C(66099812259603), fxdiv_mulhi_uint64_t(UINT64_C(98765432123456789), UINT64_C(12345678987654321)));
+}
+
+TEST(size_t, cases) {
+	EXPECT_EQ(0,            fxdiv_mulhi_size_t(0,        0));
+	EXPECT_EQ(0,            fxdiv_mulhi_size_t(1,        1));
+	EXPECT_EQ(0,            fxdiv_mulhi_size_t(1,        SIZE_MAX));
+	EXPECT_EQ(0,            fxdiv_mulhi_size_t(SIZE_MAX, 1));
+	EXPECT_EQ(SIZE_MAX - 1, fxdiv_mulhi_size_t(SIZE_MAX, SIZE_MAX));
+}
diff --git a/test/quotient.cc b/test/quotient.cc
new file mode 100644
index 0000000..ad5cd8e
--- /dev/null
+++ b/test/quotient.cc
@@ -0,0 +1,233 @@
+#include <gtest/gtest.h>
+
+#include <fxdiv.h>
+
+TEST(uint32_t, cases) {
+	EXPECT_EQ(UINT32_C(42) / UINT32_C(7),
+		fxdiv_quotient_uint32_t(UINT32_C(42),
+			fxdiv_init_uint32_t(UINT32_C(7))));
+	EXPECT_EQ(UINT32_C(42) / UINT32_C(6),
+		fxdiv_quotient_uint32_t(UINT32_C(42),
+			fxdiv_init_uint32_t(UINT32_C(6))));
+	EXPECT_EQ(UINT32_C(42) / UINT32_C(5),
+		fxdiv_quotient_uint32_t(UINT32_C(42),
+			fxdiv_init_uint32_t(UINT32_C(5))));
+	EXPECT_EQ(UINT32_C(1) / UINT32_C(1),
+		fxdiv_quotient_uint32_t(UINT32_C(1),
+			fxdiv_init_uint32_t(UINT32_C(1))));
+	EXPECT_EQ(UINT32_MAX / UINT32_C(1),
+		fxdiv_quotient_uint32_t(UINT32_MAX,
+			fxdiv_init_uint32_t(UINT32_C(1))));
+	EXPECT_EQ(UINT32_MAX / UINT32_MAX,
+		fxdiv_quotient_uint32_t(UINT32_MAX,
+			fxdiv_init_uint32_t(UINT32_MAX)));
+	EXPECT_EQ((UINT32_MAX - 1) / UINT32_MAX,
+		fxdiv_quotient_uint32_t(UINT32_MAX - 1,
+			fxdiv_init_uint32_t(UINT32_MAX)));
+	EXPECT_EQ(UINT32_MAX / (UINT32_MAX - 1),
+		fxdiv_quotient_uint32_t(UINT32_MAX,
+			fxdiv_init_uint32_t(UINT32_MAX - 1)));
+	EXPECT_EQ(UINT32_C(0) / UINT32_C(1),
+		fxdiv_quotient_uint32_t(UINT32_C(0),
+			fxdiv_init_uint32_t(UINT32_C(1))));
+}
+
+TEST(uint32_t, divide_by_1) {
+	const fxdiv_divisor_uint32_t d = fxdiv_init_uint32_t(UINT32_C(1));
+	const uint32_t step = UINT32_C(487);
+	for (uint32_t n = 0; n <= UINT32_MAX - step + 1; n += step) {
+		EXPECT_EQ(n, fxdiv_quotient_uint32_t(n, d));
+	}
+}
+
+TEST(uint32_t, divide_zero) {
+	const uint32_t step = UINT32_C(491);
+	for (uint32_t d = 1; d <= UINT32_MAX - step + 1; d += step) {
+		EXPECT_EQ(0,
+			fxdiv_quotient_uint32_t(0,
+				fxdiv_init_uint32_t(d)));
+	}
+}
+
+TEST(uint32_t, divide_by_n_minus_1) {
+	const uint32_t step = UINT32_C(523);
+	for (uint32_t n = 3; n <= UINT32_MAX - step + 1; n += step) {
+		EXPECT_EQ(UINT32_C(1),
+			fxdiv_quotient_uint32_t(n,
+				fxdiv_init_uint32_t(n - 1)));
+	}
+}
+
+TEST(uint32_t, divide_by_power_of_2) {
+	const uint32_t step = UINT32_C(25183);
+	for (uint32_t p = 0; p < 32; p += 1) {
+		const fxdiv_divisor_uint32_t divisor =
+			fxdiv_init_uint32_t(UINT32_C(1) << p);
+		for (uint32_t n = 0; n <= UINT32_MAX - step + 1; n += step) {
+			EXPECT_EQ(n >> p, fxdiv_quotient_uint32_t(n, divisor));
+		}
+	}
+}
+
+TEST(uint32_t, divide_by_power_of_2_plus_1) {
+	const uint32_t step = UINT32_C(25183);
+	for (uint32_t p = 0; p < 32; p += 1) {
+		const uint32_t d = (UINT32_C(1) << p) + UINT32_C(1);
+		const fxdiv_divisor_uint32_t divisor = fxdiv_init_uint32_t(d);
+		for (uint32_t n = 0; n <= UINT32_MAX - step + 1; n += step) {
+			EXPECT_EQ(n / d, fxdiv_quotient_uint32_t(n, divisor));
+		}
+	}
+}
+
+TEST(uint32_t, divide_by_power_of_2_minus_1) {
+	const uint32_t step = UINT64_C(25183);
+	for (uint32_t p = 0; p < 32; p += 1) {
+		const uint32_t d = (UINT32_C(2) << p) - UINT32_C(1);
+		const fxdiv_divisor_uint32_t divisor = fxdiv_init_uint32_t(d);
+		for (uint32_t n = 0; n <= UINT32_MAX - step + 1; n += step) {
+			EXPECT_EQ(n / d, fxdiv_quotient_uint32_t(n, divisor));
+		}
+	}
+}
+
+TEST(uint32_t, match_native) {
+	const uint32_t stepD = UINT32_C(821603);
+	const uint32_t stepN = UINT32_C(821641);
+	for (uint32_t d = UINT32_MAX; d >= stepD; d -= stepD) {
+		const fxdiv_divisor_uint32_t divisor = fxdiv_init_uint32_t(d);
+		for (uint32_t n = UINT32_MAX; n >= stepN; n -= stepN) {
+			EXPECT_EQ(n / d, fxdiv_quotient_uint32_t(n, divisor));
+		}
+	}
+}
+
+TEST(uint64_t, cases) {
+	EXPECT_EQ(UINT64_C(42) / UINT64_C(7),
+		fxdiv_quotient_uint64_t(UINT64_C(42),
+			fxdiv_init_uint64_t(UINT64_C(7))));
+	EXPECT_EQ(UINT64_C(42) / UINT64_C(6),
+		fxdiv_quotient_uint64_t(UINT64_C(42),
+			fxdiv_init_uint64_t(UINT64_C(6))));
+	EXPECT_EQ(UINT64_C(42) / UINT64_C(5),
+		fxdiv_quotient_uint64_t(UINT64_C(42),
+			fxdiv_init_uint64_t(UINT64_C(5))));
+	EXPECT_EQ(UINT64_C(1) / UINT64_C(1),
+		fxdiv_quotient_uint64_t(UINT64_C(1),
+			fxdiv_init_uint64_t(UINT64_C(1))));
+	EXPECT_EQ(UINT64_MAX / UINT64_C(1),
+		fxdiv_quotient_uint64_t(UINT64_MAX,
+			fxdiv_init_uint64_t(UINT64_C(1))));
+	EXPECT_EQ(UINT64_MAX / UINT64_MAX,
+		fxdiv_quotient_uint64_t(UINT64_MAX,
+			fxdiv_init_uint64_t(UINT64_MAX)));
+	EXPECT_EQ((UINT64_MAX - 1) / UINT64_MAX,
+		fxdiv_quotient_uint64_t(UINT64_MAX - 1,
+			fxdiv_init_uint64_t(UINT64_MAX)));
+	EXPECT_EQ(UINT64_MAX / (UINT64_MAX - 1),
+		fxdiv_quotient_uint64_t(UINT64_MAX,
+			fxdiv_init_uint64_t(UINT64_MAX - 1)));
+	EXPECT_EQ(UINT64_C(0) / UINT64_C(1),
+		fxdiv_quotient_uint64_t(UINT64_C(0),
+			fxdiv_init_uint64_t(UINT64_C(1))));
+}
+
+TEST(uint64_t, divide_by_1) {
+	const fxdiv_divisor_uint64_t d = fxdiv_init_uint64_t(UINT64_C(1));
+	const uint64_t step = UINT64_C(2048116998241);
+	for (uint64_t n = 0; n <= UINT64_MAX - step + 1; n += step) {
+		EXPECT_EQ(n, fxdiv_quotient_uint64_t(n, d));
+	}
+}
+
+TEST(uint64_t, divide_zero) {
+	const uint64_t step = UINT64_C(8624419641811);
+	for (uint64_t d = 1; d <= UINT64_MAX - step + 1; d += step) {
+		EXPECT_EQ(0,
+			fxdiv_quotient_uint64_t(0,
+				fxdiv_init_uint64_t(d)));
+	}
+}
+
+TEST(uint64_t, divide_by_n_minus_1) {
+	const uint64_t step = UINT64_C(8624419641833);
+	for (uint64_t n = 3; n <= UINT64_MAX - step + 1; n += step) {
+		EXPECT_EQ(UINT64_C(1),
+			fxdiv_quotient_uint64_t(n,
+				fxdiv_init_uint64_t(n - 1)));
+	}
+}
+
+TEST(uint64_t, divide_by_power_of_2) {
+	const uint64_t step = UINT64_C(93400375993241);
+	for (uint32_t p = 0; p < 64; p += 1) {
+		const fxdiv_divisor_uint64_t divisor =
+			fxdiv_init_uint64_t(UINT64_C(1) << p);
+		for (uint64_t n = 0; n <= UINT64_MAX - step + 1; n += step) {
+			EXPECT_EQ(n >> p, fxdiv_quotient_uint64_t(n, divisor));
+		}
+	}
+}
+
+TEST(uint64_t, divide_by_power_of_2_plus_1) {
+	const uint64_t step = UINT64_C(93400375993241);
+	for (uint32_t p = 0; p < 64; p += 1) {
+		const uint64_t d = (UINT64_C(1) << p) + UINT64_C(1);
+		const fxdiv_divisor_uint64_t divisor = fxdiv_init_uint64_t(d);
+		for (uint64_t n = 0; n <= UINT64_MAX - step + 1; n += step) {
+			EXPECT_EQ(n / d, fxdiv_quotient_uint64_t(n, divisor));
+		}
+	}
+}
+
+TEST(uint64_t, divide_by_power_of_2_minus_1) {
+	const uint64_t step = UINT64_C(93400375993241);
+	for (uint32_t p = 0; p < 64; p += 1) {
+		const uint64_t d = (UINT64_C(2) << p) - UINT64_C(1);
+		const fxdiv_divisor_uint64_t divisor = fxdiv_init_uint64_t(d);
+		for (uint64_t n = 0; n <= UINT64_MAX - step + 1; n += step) {
+			EXPECT_EQ(n / d, fxdiv_quotient_uint64_t(n, divisor));
+		}
+	}
+}
+
+TEST(uint64_t, match_native) {
+	const uint64_t stepD = UINT64_C(7093600525704701);
+	const uint64_t stepN = UINT64_C(7093600525704677);
+	for (uint64_t d = UINT64_MAX; d >= stepD; d -= stepD) {
+		const fxdiv_divisor_uint64_t divisor = fxdiv_init_uint64_t(d);
+		for (uint64_t n = UINT64_MAX; n >= stepN; n -= stepN) {
+			EXPECT_EQ(n / d, fxdiv_quotient_uint64_t(n, divisor));
+		}
+	}
+}
+
+TEST(size_t, cases) {
+	EXPECT_EQ(42 / 7,
+		fxdiv_quotient_size_t(42,
+			fxdiv_init_size_t(7)));
+	EXPECT_EQ(42 / 6,
+		fxdiv_quotient_size_t(42,
+			fxdiv_init_size_t(6)));
+	EXPECT_EQ(42 / 5,
+		fxdiv_quotient_size_t(42,
+			fxdiv_init_size_t(5)));
+	EXPECT_EQ(1 / 1,
+		fxdiv_quotient_size_t(1,
+			fxdiv_init_size_t(1)));
+	EXPECT_EQ(SIZE_MAX / 1,
+		fxdiv_quotient_size_t(SIZE_MAX,
+			fxdiv_init_size_t(1)));
+	EXPECT_EQ(SIZE_MAX / SIZE_MAX,
+		fxdiv_quotient_size_t(SIZE_MAX,
+			fxdiv_init_size_t(SIZE_MAX)));
+	EXPECT_EQ((SIZE_MAX - 1) / SIZE_MAX,
+		fxdiv_quotient_size_t(SIZE_MAX - 1,
+			fxdiv_init_size_t(SIZE_MAX)));
+	EXPECT_EQ(SIZE_MAX / (SIZE_MAX - 1),
+		fxdiv_quotient_size_t(SIZE_MAX,
+			fxdiv_init_size_t(SIZE_MAX - 1)));
+	EXPECT_EQ(0 / 1,
+		fxdiv_quotient_size_t(0,
+			fxdiv_init_size_t(1)));
+}
