Update rr prebuilts to build 10827144. am: 72266da4be am: 74924e4573 am: 77e9faec15 Original change: https://android-review.googlesource.com/c/platform/tools/rr_prebuilt/+/2755597 Change-Id: Iea5da26ecc3ea6953c7b68f1c5f8918715da5795 Signed-off-by: Automerger Merge Worker <android-build-automerger-merge-worker@system.gserviceaccount.com>
diff --git a/rr/android/x86_64/bin/rr b/rr/android/x86_64/bin/rr new file mode 100755 index 0000000..3128823 --- /dev/null +++ b/rr/android/x86_64/bin/rr Binary files differ
diff --git a/rr/android/x86_64/bin/rr-collect-symbols.py b/rr/android/x86_64/bin/rr-collect-symbols.py new file mode 100755 index 0000000..79c72a7 --- /dev/null +++ b/rr/android/x86_64/bin/rr-collect-symbols.py
@@ -0,0 +1,203 @@ +#!/usr/bin/env python3 + +import errno +import glob +import os +import re +import shutil +import subprocess +import sys +import tempfile +from urllib.request import urlretrieve +from urllib.error import HTTPError, ContentTooShortError + +# Usage: rr-collect-symbols.py <trace-dir> [<url> | <path>] +# +# Given a <url>, downloads the zip/.tar.zst file at <url>, uncompresses it, +# runs "gunzip" on any .gz files, and for any ELF files found whose build-ids +# match the build-id of an ELF file in the trace, moves it into the trace. +# +# Given a <path>, which must contain a .build-id directory with the usual +# structure (e.g. as Ubuntu and Fedora create under /usr/lib/debug), searches +# the directory tree for any ELF files whose build-ids match the build-id of +# an ELF file in the trace and copies them into the trace. <path> defaults to +# "/usr/lib/debug", which will grab any available system debuginfo files +# in Ubuntu and Fedora at least. +# +# This script assumes that the trace-dir has been packed via `rr pack` so all +# relevant files actually appear in the trace-dir. +# It also assumes rr is on the PATH. +# +# The debuginfo files are placed in the trace under a "debug" subdirectory, +# in a ".build-id" subdirectory with the usual structure. +# +# If a debuginfo file contains a .gnu_debugaltlink section then we also +# attempt to find the referenced file and copy it into the trace with the +# same file name as the .debug file, but with a .sup suffix. 
+ +if len(sys.argv) < 2: + print("Usage: rr-collect-symbols.py <trace-dir> [<url> | <path>]", file=sys.stderr) + sys.exit(1) +trace_dir = sys.argv[1] + +if len(sys.argv) < 3: + source = "/usr/lib/debug" +else: + source = sys.argv[2] + +rr_buildid = subprocess.Popen(["rr", "buildid"], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE) + +def build_id_for(file): + global rr_buildid + rr_buildid.stdin.write(("%s\n"%file).encode('utf-8')) + try: + rr_buildid.stdin.flush() + except BrokenPipeError: + print("Can't write to rr, termination code %s"%rr_buildid.returncode, file=sys.stderr) + sys.exit(2) + return rr_buildid.stdout.readline().rstrip().decode('utf-8') + +altref_regex = re.compile(rb"^\s+\[\s*0\]\s+(.*)"); + +def find_altref(file): + proc = subprocess.Popen(["readelf", "-p", ".gnu_debugaltlink", file], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + try: + for line in proc.stdout: + m = altref_regex.match(line) + if m: + return m.group(1).rstrip() + finally: + proc.wait() + return None + +def find_altref_for_trace_file(trace_file, altref): + proc = subprocess.Popen(["rr", "filename", trace_file], stdout=subprocess.PIPE) + try: + for line in proc.stdout: + file = line.rstrip() + altref_file = os.path.join(os.path.dirname(file), altref) + if os.path.isfile(altref_file): + return altref_file + finally: + proc.wait() + return None + +def mkdir_p(path): + try: + os.makedirs(path) + except OSError as exc: + if exc.errno == errno.EEXIST and os.path.isdir(path): + pass + else: + raise + +# 'dst' must be a complete file name, not a directory. +def copy_file(src, dst): + try: + # Remove the destination file in case it's a hard link + # or owned by someone else. + os.remove(dst) + except: + pass + shutil.copy(src, dst) + +# 'dst' must be a complete file name, not a directory +def create_link(src, dst): + try: + # Remove the destination file in case it's wrong. 
+ os.remove(dst) + except: + pass + os.symlink(src, dst) + +def collect_trace_build_ids(): + ret = {} + for file in glob.iglob("%s/mmap_*"%trace_dir): + build_id = build_id_for(file) + if build_id: + ret[build_id] = True + altref = find_altref(file) + if altref: + altref_file = find_altref_for_trace_file(file, altref) + if not altref_file: + print("WARNING: Can't find alt file %s for %s"%(altref, file)) + continue + dir = "%s/debug/.build-id/%s"%(trace_dir, build_id[:2]) + mkdir_p(dir) + copy_file(altref_file, "%s/%s.sup"%(dir, build_id[2:])) + return ret + +trace_build_ids = collect_trace_build_ids() + +def collect_archive(url): + is_tar_zst = url.endswith(".tar.zst") + tmp_dir = tempfile.mkdtemp(dir=trace_dir) + if is_tar_zst: + tmp_file_name = "%s/archive.tar.zst"%tmp_dir + else: + # Assume its a ZIP + tmp_file_name = "%s/archive.zip"%tmp_dir + try: + (file, headers) = urlretrieve(url, tmp_file_name) + except (HTTPError, ContentTooShortError) as exc: + print("Failed to load archive %s: %s"%(url, exc), file=sys.stderr) + sys.exit(2) + if is_tar_zst: + subprocess.check_call(["tar", "-C", tmp_dir, "-I", "zstd", "-xvf", file]) + else: + subprocess.check_call(["unzip", "-d", tmp_dir, file]) + os.remove(file) + + for root, dirs, files in os.walk(tmp_dir): + for name in files: + file = os.path.join(root, name) + if file.endswith(".gz"): + subprocess.check_call(["gunzip", file]) + file = file[:-3] + build_id = build_id_for(file) + if build_id and build_id in trace_build_ids: + dir = "%s/debug/.build-id/%s"%(trace_dir, build_id[:2]) + mkdir_p(dir) + dst = "%s/%s.debug"%(dir, build_id[2:]) + os.rename(file, dst) + else: + os.remove(file) + + shutil.rmtree(tmp_dir) + +def collect_filesystem(path): + for root, dirs, files in os.walk(path): + for name in files: + file = os.path.join(root, name) + if not os.path.islink(file): + build_id = build_id_for(file) + if build_id and build_id in trace_build_ids: + dir = "%s/debug/.build-id/%s"%(trace_dir, build_id[:2]) + mkdir_p(dir) 
+ copy_file(file, "%s/%s.debug"%(dir, build_id[2:])) + altref = find_altref(file) + if altref: + altref = altref.decode('utf-8') + altref_file = os.path.join(os.path.dirname(file), altref) + copy_file(altref_file, "%s/%s.sup"%(dir, build_id[2:])) + if altref.startswith("../../../.dwz/"): + mkdir_p("%s/.dwz"%trace_dir) + src = "../debug/.build-id/%s/%s.sup"%(build_id[:2], build_id[2:]) + create_link(src, "%s/.dwz/%s"%(trace_dir, altref[14:])) + elif altref.startswith("../../.dwz/"): + mkdir_p("%s/debug/.dwz"%trace_dir) + src = "../.build-id/%s/%s.sup"%(build_id[:2], build_id[2:]) + create_link(src, "%s/debug/.dwz/%s"%(trace_dir, altref[11:])) + elif altref.startswith("../.dwz/"): + mkdir_p("%s/debug/.build-id/.dwz"%trace_dir) + src = "../%s/%s.sup"%(build_id[:2], build_id[2:]) + create_link(src, "%s/debug/.build-id/.dwz/%s"%(trace_dir, altref[8:])) + +if re.search("^[^:/]+:", source): + collect_archive(source) +else: + collect_filesystem(source) + +rr_buildid.terminate()
diff --git a/rr/android/x86_64/bin/rr_exec_stub b/rr/android/x86_64/bin/rr_exec_stub new file mode 100755 index 0000000..5136526 --- /dev/null +++ b/rr/android/x86_64/bin/rr_exec_stub Binary files differ
diff --git a/rr/android/x86_64/bin/signal-rr-recording.sh b/rr/android/x86_64/bin/signal-rr-recording.sh new file mode 100755 index 0000000..18a4cfd --- /dev/null +++ b/rr/android/x86_64/bin/signal-rr-recording.sh
@@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +signal=$1 +if [[ "$signal" == "" ]]; then + echo "Usage: $0 <signal>" >&2 + echo "Sends <signal> to all processes being recorded by rr" >&2 + exit 1 +fi + +function signal_descendants { pid=$1 + for child in `ps -o pid= --ppid $pid`; do + echo Sending $signal to $child + kill -s $signal $child + signal_descendants $child + done +} + +for rr_pid in `pidof rr` ; do + if cat /proc/$rr_pid/cmdline | tr '\0' '\n' | head -n2 | tail -n1 | grep -qz '\(^record$\)\|/' ; then + signal_descendants $rr_pid + fi +done
diff --git a/rr/android/x86_64/lib/rr/librrpage.so b/rr/android/x86_64/lib/rr/librrpage.so new file mode 100644 index 0000000..1290a2d --- /dev/null +++ b/rr/android/x86_64/lib/rr/librrpage.so Binary files differ
diff --git a/rr/android/x86_64/lib/rr/librrpreload.so b/rr/android/x86_64/lib/rr/librrpreload.so new file mode 100644 index 0000000..f3e3b1b --- /dev/null +++ b/rr/android/x86_64/lib/rr/librrpreload.so Binary files differ
diff --git a/rr/android/x86_64/share/bash-completion/completions/rr b/rr/android/x86_64/share/bash-completion/completions/rr new file mode 100755 index 0000000..7325165 --- /dev/null +++ b/rr/android/x86_64/share/bash-completion/completions/rr
@@ -0,0 +1,29 @@ +# vi:syntax=sh +# +# completion script for rr commands (to be sourced) + +_rr_subcmd_completion() { + local cmd=$1 + local short_opts=$(rr help $cmd | sed -n 's/\s*-\([a-zA-Z]\),.*/-\1/p') + local long_opts=$(rr help $cmd | sed -n 's/.*--\([^= ]*\).*/--\1/p') + echo "$short_opts" "$long_opts" +} + +_rr_completion() { + COMPREPLY=() + local rr_commands="$(rr --list-commands | cut -s -d ' ' -f 3)" + + # completion for rr + if [ $COMP_CWORD -eq 1 ]; then + COMPREPLY=( $( compgen -W "$rr_commands" -- "${COMP_WORDS[1]}" ) ) + return + fi + + # completion for rr <command>'s options + local cmd="$(echo "${COMP_WORDS[1]}" | tr -d '[:space:]')" + + if [ "$(echo $rr_commands | grep -w "$cmd")" ] ; then + COMPREPLY=( $( compgen -W "$(_rr_subcmd_completion "$cmd")" -- "${COMP_WORDS[COMP_CWORD]}" ) ) + fi +} +complete -F _rr_completion rr
diff --git a/rr/android/x86_64/share/rr/32bit-avx.xml b/rr/android/x86_64/share/rr/32bit-avx.xml new file mode 100644 index 0000000..6eb44fe --- /dev/null +++ b/rr/android/x86_64/share/rr/32bit-avx.xml
@@ -0,0 +1,18 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!DOCTYPE feature SYSTEM "gdb-target.dtd"> +<feature name="org.gnu.gdb.i386.avx"> + <reg name="ymm0h" bitsize="128" type="uint128"/> + <reg name="ymm1h" bitsize="128" type="uint128"/> + <reg name="ymm2h" bitsize="128" type="uint128"/> + <reg name="ymm3h" bitsize="128" type="uint128"/> + <reg name="ymm4h" bitsize="128" type="uint128"/> + <reg name="ymm5h" bitsize="128" type="uint128"/> + <reg name="ymm6h" bitsize="128" type="uint128"/> + <reg name="ymm7h" bitsize="128" type="uint128"/> +</feature>
diff --git a/rr/android/x86_64/share/rr/32bit-core.xml b/rr/android/x86_64/share/rr/32bit-core.xml new file mode 100644 index 0000000..48c5890 --- /dev/null +++ b/rr/android/x86_64/share/rr/32bit-core.xml
@@ -0,0 +1,65 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!DOCTYPE feature SYSTEM "gdb-target.dtd"> +<feature name="org.gnu.gdb.i386.core"> + <flags id="i386_eflags" size="4"> + <field name="CF" start="0" end="0"/> + <field name="" start="1" end="1"/> + <field name="PF" start="2" end="2"/> + <field name="AF" start="4" end="4"/> + <field name="ZF" start="6" end="6"/> + <field name="SF" start="7" end="7"/> + <field name="TF" start="8" end="8"/> + <field name="IF" start="9" end="9"/> + <field name="DF" start="10" end="10"/> + <field name="OF" start="11" end="11"/> + <field name="NT" start="14" end="14"/> + <field name="RF" start="16" end="16"/> + <field name="VM" start="17" end="17"/> + <field name="AC" start="18" end="18"/> + <field name="VIF" start="19" end="19"/> + <field name="VIP" start="20" end="20"/> + <field name="ID" start="21" end="21"/> + </flags> + + <reg name="eax" bitsize="32" type="int32"/> + <reg name="ecx" bitsize="32" type="int32"/> + <reg name="edx" bitsize="32" type="int32"/> + <reg name="ebx" bitsize="32" type="int32"/> + <reg name="esp" bitsize="32" type="data_ptr"/> + <reg name="ebp" bitsize="32" type="data_ptr"/> + <reg name="esi" bitsize="32" type="int32"/> + <reg name="edi" bitsize="32" type="int32"/> + + <reg name="eip" bitsize="32" type="code_ptr"/> + <reg name="eflags" bitsize="32" type="i386_eflags"/> + <reg name="cs" bitsize="32" type="int32"/> + <reg name="ss" bitsize="32" type="int32"/> + <reg name="ds" bitsize="32" type="int32"/> + <reg name="es" bitsize="32" type="int32"/> + <reg name="fs" bitsize="32" type="int32"/> + <reg name="gs" bitsize="32" type="int32"/> + + <reg name="st0" bitsize="80" type="i387_ext"/> + <reg name="st1" bitsize="80" type="i387_ext"/> + <reg name="st2" bitsize="80" 
type="i387_ext"/> + <reg name="st3" bitsize="80" type="i387_ext"/> + <reg name="st4" bitsize="80" type="i387_ext"/> + <reg name="st5" bitsize="80" type="i387_ext"/> + <reg name="st6" bitsize="80" type="i387_ext"/> + <reg name="st7" bitsize="80" type="i387_ext"/> + + <reg name="fctrl" bitsize="32" type="int" group="float"/> + <reg name="fstat" bitsize="32" type="int" group="float"/> + <reg name="ftag" bitsize="32" type="int" group="float"/> + <reg name="fiseg" bitsize="32" type="int" group="float"/> + <reg name="fioff" bitsize="32" type="int" group="float"/> + <reg name="foseg" bitsize="32" type="int" group="float"/> + <reg name="fooff" bitsize="32" type="int" group="float"/> + <reg name="fop" bitsize="32" type="int" group="float"/> +</feature>
diff --git a/rr/android/x86_64/share/rr/32bit-linux.xml b/rr/android/x86_64/share/rr/32bit-linux.xml new file mode 100644 index 0000000..7139db8 --- /dev/null +++ b/rr/android/x86_64/share/rr/32bit-linux.xml
@@ -0,0 +1,11 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!DOCTYPE feature SYSTEM "gdb-target.dtd"> +<feature name="org.gnu.gdb.i386.linux"> + <reg name="orig_eax" bitsize="32" type="int" regnum="41"/> +</feature>
diff --git a/rr/android/x86_64/share/rr/32bit-pkeys.xml b/rr/android/x86_64/share/rr/32bit-pkeys.xml new file mode 100644 index 0000000..6f6723c --- /dev/null +++ b/rr/android/x86_64/share/rr/32bit-pkeys.xml
@@ -0,0 +1,13 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2016-2021 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!DOCTYPE feature SYSTEM "gdb-target.dtd"> +<feature name="org.gnu.gdb.i386.pkeys"> + + <reg name="pkru" bitsize="32" type="uint32"/> + +</feature>
diff --git a/rr/android/x86_64/share/rr/32bit-sse.xml b/rr/android/x86_64/share/rr/32bit-sse.xml new file mode 100644 index 0000000..03b6421 --- /dev/null +++ b/rr/android/x86_64/share/rr/32bit-sse.xml
@@ -0,0 +1,52 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!DOCTYPE feature SYSTEM "gdb-target.dtd"> +<feature name="org.gnu.gdb.i386.sse"> + <vector id="v4f" type="ieee_single" count="4"/> + <vector id="v2d" type="ieee_double" count="2"/> + <vector id="v16i8" type="int8" count="16"/> + <vector id="v8i16" type="int16" count="8"/> + <vector id="v4i32" type="int32" count="4"/> + <vector id="v2i64" type="int64" count="2"/> + <union id="vec128"> + <field name="v4_float" type="v4f"/> + <field name="v2_double" type="v2d"/> + <field name="v16_int8" type="v16i8"/> + <field name="v8_int16" type="v8i16"/> + <field name="v4_int32" type="v4i32"/> + <field name="v2_int64" type="v2i64"/> + <field name="uint128" type="uint128"/> + </union> + <flags id="i386_mxcsr" size="4"> + <field name="IE" start="0" end="0"/> + <field name="DE" start="1" end="1"/> + <field name="ZE" start="2" end="2"/> + <field name="OE" start="3" end="3"/> + <field name="UE" start="4" end="4"/> + <field name="PE" start="5" end="5"/> + <field name="DAZ" start="6" end="6"/> + <field name="IM" start="7" end="7"/> + <field name="DM" start="8" end="8"/> + <field name="ZM" start="9" end="9"/> + <field name="OM" start="10" end="10"/> + <field name="UM" start="11" end="11"/> + <field name="PM" start="12" end="12"/> + <field name="FZ" start="15" end="15"/> + </flags> + + <reg name="xmm0" bitsize="128" type="vec128" regnum="32"/> + <reg name="xmm1" bitsize="128" type="vec128"/> + <reg name="xmm2" bitsize="128" type="vec128"/> + <reg name="xmm3" bitsize="128" type="vec128"/> + <reg name="xmm4" bitsize="128" type="vec128"/> + <reg name="xmm5" bitsize="128" type="vec128"/> + <reg name="xmm6" bitsize="128" type="vec128"/> + <reg name="xmm7" bitsize="128" type="vec128"/> + + <reg name="mxcsr" 
bitsize="32" type="i386_mxcsr" group="vector"/> +</feature>
diff --git a/rr/android/x86_64/share/rr/64bit-avx.xml b/rr/android/x86_64/share/rr/64bit-avx.xml new file mode 100644 index 0000000..5dfe45e --- /dev/null +++ b/rr/android/x86_64/share/rr/64bit-avx.xml
@@ -0,0 +1,26 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!DOCTYPE feature SYSTEM "gdb-target.dtd"> +<feature name="org.gnu.gdb.i386.avx"> + <reg name="ymm0h" bitsize="128" type="uint128"/> + <reg name="ymm1h" bitsize="128" type="uint128"/> + <reg name="ymm2h" bitsize="128" type="uint128"/> + <reg name="ymm3h" bitsize="128" type="uint128"/> + <reg name="ymm4h" bitsize="128" type="uint128"/> + <reg name="ymm5h" bitsize="128" type="uint128"/> + <reg name="ymm6h" bitsize="128" type="uint128"/> + <reg name="ymm7h" bitsize="128" type="uint128"/> + <reg name="ymm8h" bitsize="128" type="uint128"/> + <reg name="ymm9h" bitsize="128" type="uint128"/> + <reg name="ymm10h" bitsize="128" type="uint128"/> + <reg name="ymm11h" bitsize="128" type="uint128"/> + <reg name="ymm12h" bitsize="128" type="uint128"/> + <reg name="ymm13h" bitsize="128" type="uint128"/> + <reg name="ymm14h" bitsize="128" type="uint128"/> + <reg name="ymm15h" bitsize="128" type="uint128"/> +</feature>
diff --git a/rr/android/x86_64/share/rr/64bit-core.xml b/rr/android/x86_64/share/rr/64bit-core.xml new file mode 100644 index 0000000..7cd0673 --- /dev/null +++ b/rr/android/x86_64/share/rr/64bit-core.xml
@@ -0,0 +1,73 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!DOCTYPE feature SYSTEM "gdb-target.dtd"> +<feature name="org.gnu.gdb.i386.core"> + <flags id="i386_eflags" size="4"> + <field name="CF" start="0" end="0"/> + <field name="" start="1" end="1"/> + <field name="PF" start="2" end="2"/> + <field name="AF" start="4" end="4"/> + <field name="ZF" start="6" end="6"/> + <field name="SF" start="7" end="7"/> + <field name="TF" start="8" end="8"/> + <field name="IF" start="9" end="9"/> + <field name="DF" start="10" end="10"/> + <field name="OF" start="11" end="11"/> + <field name="NT" start="14" end="14"/> + <field name="RF" start="16" end="16"/> + <field name="VM" start="17" end="17"/> + <field name="AC" start="18" end="18"/> + <field name="VIF" start="19" end="19"/> + <field name="VIP" start="20" end="20"/> + <field name="ID" start="21" end="21"/> + </flags> + + <reg name="rax" bitsize="64" type="int64"/> + <reg name="rbx" bitsize="64" type="int64"/> + <reg name="rcx" bitsize="64" type="int64"/> + <reg name="rdx" bitsize="64" type="int64"/> + <reg name="rsi" bitsize="64" type="int64"/> + <reg name="rdi" bitsize="64" type="int64"/> + <reg name="rbp" bitsize="64" type="data_ptr"/> + <reg name="rsp" bitsize="64" type="data_ptr"/> + <reg name="r8" bitsize="64" type="int64"/> + <reg name="r9" bitsize="64" type="int64"/> + <reg name="r10" bitsize="64" type="int64"/> + <reg name="r11" bitsize="64" type="int64"/> + <reg name="r12" bitsize="64" type="int64"/> + <reg name="r13" bitsize="64" type="int64"/> + <reg name="r14" bitsize="64" type="int64"/> + <reg name="r15" bitsize="64" type="int64"/> + + <reg name="rip" bitsize="64" type="code_ptr"/> + <reg name="eflags" bitsize="32" type="i386_eflags"/> + <reg name="cs" bitsize="32" type="int32"/> + 
<reg name="ss" bitsize="32" type="int32"/> + <reg name="ds" bitsize="32" type="int32"/> + <reg name="es" bitsize="32" type="int32"/> + <reg name="fs" bitsize="32" type="int32"/> + <reg name="gs" bitsize="32" type="int32"/> + + <reg name="st0" bitsize="80" type="i387_ext"/> + <reg name="st1" bitsize="80" type="i387_ext"/> + <reg name="st2" bitsize="80" type="i387_ext"/> + <reg name="st3" bitsize="80" type="i387_ext"/> + <reg name="st4" bitsize="80" type="i387_ext"/> + <reg name="st5" bitsize="80" type="i387_ext"/> + <reg name="st6" bitsize="80" type="i387_ext"/> + <reg name="st7" bitsize="80" type="i387_ext"/> + + <reg name="fctrl" bitsize="32" type="int" group="float"/> + <reg name="fstat" bitsize="32" type="int" group="float"/> + <reg name="ftag" bitsize="32" type="int" group="float"/> + <reg name="fiseg" bitsize="32" type="int" group="float"/> + <reg name="fioff" bitsize="32" type="int" group="float"/> + <reg name="foseg" bitsize="32" type="int" group="float"/> + <reg name="fooff" bitsize="32" type="int" group="float"/> + <reg name="fop" bitsize="32" type="int" group="float"/> +</feature>
diff --git a/rr/android/x86_64/share/rr/64bit-linux.xml b/rr/android/x86_64/share/rr/64bit-linux.xml new file mode 100644 index 0000000..b4229d0 --- /dev/null +++ b/rr/android/x86_64/share/rr/64bit-linux.xml
@@ -0,0 +1,11 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!DOCTYPE feature SYSTEM "gdb-target.dtd"> +<feature name="org.gnu.gdb.i386.linux"> + <reg name="orig_rax" bitsize="64" type="int" regnum="57"/> +</feature>
diff --git a/rr/android/x86_64/share/rr/64bit-pkeys.xml b/rr/android/x86_64/share/rr/64bit-pkeys.xml new file mode 100644 index 0000000..6f6723c --- /dev/null +++ b/rr/android/x86_64/share/rr/64bit-pkeys.xml
@@ -0,0 +1,13 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2016-2021 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!DOCTYPE feature SYSTEM "gdb-target.dtd"> +<feature name="org.gnu.gdb.i386.pkeys"> + + <reg name="pkru" bitsize="32" type="uint32"/> + +</feature>
diff --git a/rr/android/x86_64/share/rr/64bit-seg.xml b/rr/android/x86_64/share/rr/64bit-seg.xml new file mode 100644 index 0000000..1fa6c9e --- /dev/null +++ b/rr/android/x86_64/share/rr/64bit-seg.xml
@@ -0,0 +1,5 @@ +<!DOCTYPE feature SYSTEM "gdb-target.dtd"> +<feature name="org.gnu.gdb.i386.segments"> + <reg name="fs_base" bitsize="64" type="data_ptr"/> + <reg name="gs_base" bitsize="64" type="data_ptr"/> +</feature>
diff --git a/rr/android/x86_64/share/rr/64bit-sse.xml b/rr/android/x86_64/share/rr/64bit-sse.xml new file mode 100644 index 0000000..eec4b79 --- /dev/null +++ b/rr/android/x86_64/share/rr/64bit-sse.xml
@@ -0,0 +1,60 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!DOCTYPE feature SYSTEM "gdb-target.dtd"> +<feature name="org.gnu.gdb.i386.sse"> + <vector id="v4f" type="ieee_single" count="4"/> + <vector id="v2d" type="ieee_double" count="2"/> + <vector id="v16i8" type="int8" count="16"/> + <vector id="v8i16" type="int16" count="8"/> + <vector id="v4i32" type="int32" count="4"/> + <vector id="v2i64" type="int64" count="2"/> + <union id="vec128"> + <field name="v4_float" type="v4f"/> + <field name="v2_double" type="v2d"/> + <field name="v16_int8" type="v16i8"/> + <field name="v8_int16" type="v8i16"/> + <field name="v4_int32" type="v4i32"/> + <field name="v2_int64" type="v2i64"/> + <field name="uint128" type="uint128"/> + </union> + <flags id="i386_mxcsr" size="4"> + <field name="IE" start="0" end="0"/> + <field name="DE" start="1" end="1"/> + <field name="ZE" start="2" end="2"/> + <field name="OE" start="3" end="3"/> + <field name="UE" start="4" end="4"/> + <field name="PE" start="5" end="5"/> + <field name="DAZ" start="6" end="6"/> + <field name="IM" start="7" end="7"/> + <field name="DM" start="8" end="8"/> + <field name="ZM" start="9" end="9"/> + <field name="OM" start="10" end="10"/> + <field name="UM" start="11" end="11"/> + <field name="PM" start="12" end="12"/> + <field name="FZ" start="15" end="15"/> + </flags> + + <reg name="xmm0" bitsize="128" type="vec128" regnum="40"/> + <reg name="xmm1" bitsize="128" type="vec128"/> + <reg name="xmm2" bitsize="128" type="vec128"/> + <reg name="xmm3" bitsize="128" type="vec128"/> + <reg name="xmm4" bitsize="128" type="vec128"/> + <reg name="xmm5" bitsize="128" type="vec128"/> + <reg name="xmm6" bitsize="128" type="vec128"/> + <reg name="xmm7" bitsize="128" type="vec128"/> + <reg name="xmm8" 
bitsize="128" type="vec128"/> + <reg name="xmm9" bitsize="128" type="vec128"/> + <reg name="xmm10" bitsize="128" type="vec128"/> + <reg name="xmm11" bitsize="128" type="vec128"/> + <reg name="xmm12" bitsize="128" type="vec128"/> + <reg name="xmm13" bitsize="128" type="vec128"/> + <reg name="xmm14" bitsize="128" type="vec128"/> + <reg name="xmm15" bitsize="128" type="vec128"/> + + <reg name="mxcsr" bitsize="32" type="i386_mxcsr" group="vector"/> +</feature>
diff --git a/rr/android/x86_64/share/rr/aarch64-core.xml b/rr/android/x86_64/share/rr/aarch64-core.xml new file mode 100644 index 0000000..ee6a3a6 --- /dev/null +++ b/rr/android/x86_64/share/rr/aarch64-core.xml
@@ -0,0 +1,91 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2009-2020 Free Software Foundation, Inc. + Contributed by ARM Ltd. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!DOCTYPE feature SYSTEM "gdb-target.dtd"> +<feature name="org.gnu.gdb.aarch64.core"> + <reg name="x0" bitsize="64"/> + <reg name="x1" bitsize="64"/> + <reg name="x2" bitsize="64"/> + <reg name="x3" bitsize="64"/> + <reg name="x4" bitsize="64"/> + <reg name="x5" bitsize="64"/> + <reg name="x6" bitsize="64"/> + <reg name="x7" bitsize="64"/> + <reg name="x8" bitsize="64"/> + <reg name="x9" bitsize="64"/> + <reg name="x10" bitsize="64"/> + <reg name="x11" bitsize="64"/> + <reg name="x12" bitsize="64"/> + <reg name="x13" bitsize="64"/> + <reg name="x14" bitsize="64"/> + <reg name="x15" bitsize="64"/> + <reg name="x16" bitsize="64"/> + <reg name="x17" bitsize="64"/> + <reg name="x18" bitsize="64"/> + <reg name="x19" bitsize="64"/> + <reg name="x20" bitsize="64"/> + <reg name="x21" bitsize="64"/> + <reg name="x22" bitsize="64"/> + <reg name="x23" bitsize="64"/> + <reg name="x24" bitsize="64"/> + <reg name="x25" bitsize="64"/> + <reg name="x26" bitsize="64"/> + <reg name="x27" bitsize="64"/> + <reg name="x28" bitsize="64"/> + <reg name="x29" bitsize="64"/> + <reg name="x30" bitsize="64"/> + <reg name="sp" bitsize="64" type="data_ptr"/> + + <reg name="pc" bitsize="64" type="code_ptr"/> + + <flags id="cpsr_flags" size="4"> + <!-- Stack Pointer. --> + <field name="SP" start="0" end="0"/> + + <!-- Exception Level. --> + <field name="EL" start="2" end="3"/> + <!-- Execution state. --> + <field name="nRW" start="4" end="4"/> + + <!-- FIQ interrupt mask. --> + <field name="F" start="6" end="6"/> + <!-- IRQ interrupt mask. --> + <field name="I" start="7" end="7"/> + <!-- SError interrupt mask. --> + <field name="A" start="8" end="8"/> + <!-- Debug exception mask. 
--> + <field name="D" start="9" end="9"/> + + <!-- ARMv8.0-A: Speculative Store Bypass. --> + <field name="SSBS" start="12" end="12"/> + + <!-- Illegal Execution state. --> + <field name="IL" start="20" end="20"/> + <!-- Software Step. --> + <field name="SS" start="21" end="21"/> + <!-- ARMv8.1-A: Privileged Access Never. --> + <field name="PAN" start="22" end="22"/> + <!-- ARMv8.2-A: User Access Override. --> + <field name="UAO" start="23" end="23"/> + <!-- ARMv8.4-A: Data Independent Timing. --> + <field name="DIT" start="24" end="24"/> + <!-- ARMv8.5-A: Tag Check Override. --> + <field name="TCO" start="25" end="25"/> + + <!-- Overflow Condition flag. --> + <field name="V" start="28" end="28"/> + <!-- Carry Condition flag. --> + <field name="C" start="29" end="29"/> + <!-- Zero Condition flag. --> + <field name="Z" start="30" end="30"/> + <!-- Negative Condition flag. --> + <field name="N" start="31" end="31"/> + </flags> + <reg name="cpsr" bitsize="32" type="cpsr_flags"/> + +</feature>
diff --git a/rr/android/x86_64/share/rr/aarch64-fpu.xml b/rr/android/x86_64/share/rr/aarch64-fpu.xml new file mode 100644 index 0000000..eae763c --- /dev/null +++ b/rr/android/x86_64/share/rr/aarch64-fpu.xml
@@ -0,0 +1,88 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2009-2020 Free Software Foundation, Inc. + Contributed by ARM Ltd. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!DOCTYPE feature SYSTEM "gdb-target.dtd"> +<feature name="org.gnu.gdb.aarch64.fpu"> + <vector id="v2d" type="ieee_double" count="2"/> + <vector id="v2u" type="uint64" count="2"/> + <vector id="v2i" type="int64" count="2"/> + <vector id="v4f" type="ieee_single" count="4"/> + <vector id="v4u" type="uint32" count="4"/> + <vector id="v4i" type="int32" count="4"/> + <vector id="v8f" type="ieee_half" count="8"/> + <vector id="v8u" type="uint16" count="8"/> + <vector id="v8i" type="int16" count="8"/> + <vector id="v16u" type="uint8" count="16"/> + <vector id="v16i" type="int8" count="16"/> + <vector id="v1u" type="uint128" count="1"/> + <vector id="v1i" type="int128" count="1"/> + <union id="vnd"> + <field name="f" type="v2d"/> + <field name="u" type="v2u"/> + <field name="s" type="v2i"/> + </union> + <union id="vns"> + <field name="f" type="v4f"/> + <field name="u" type="v4u"/> + <field name="s" type="v4i"/> + </union> + <union id="vnh"> + <field name="f" type="v8f"/> + <field name="u" type="v8u"/> + <field name="s" type="v8i"/> + </union> + <union id="vnb"> + <field name="u" type="v16u"/> + <field name="s" type="v16i"/> + </union> + <union id="vnq"> + <field name="u" type="v1u"/> + <field name="s" type="v1i"/> + </union> + <union id="aarch64v"> + <field name="d" type="vnd"/> + <field name="s" type="vns"/> + <field name="h" type="vnh"/> + <field name="b" type="vnb"/> + <field name="q" type="vnq"/> + </union> + <reg name="v0" bitsize="128" type="aarch64v" regnum="34"/> + <reg name="v1" bitsize="128" type="aarch64v" /> + <reg name="v2" bitsize="128" type="aarch64v" /> + <reg name="v3" bitsize="128" type="aarch64v" /> + <reg name="v4" bitsize="128" 
type="aarch64v" /> + <reg name="v5" bitsize="128" type="aarch64v" /> + <reg name="v6" bitsize="128" type="aarch64v" /> + <reg name="v7" bitsize="128" type="aarch64v" /> + <reg name="v8" bitsize="128" type="aarch64v" /> + <reg name="v9" bitsize="128" type="aarch64v" /> + <reg name="v10" bitsize="128" type="aarch64v"/> + <reg name="v11" bitsize="128" type="aarch64v"/> + <reg name="v12" bitsize="128" type="aarch64v"/> + <reg name="v13" bitsize="128" type="aarch64v"/> + <reg name="v14" bitsize="128" type="aarch64v"/> + <reg name="v15" bitsize="128" type="aarch64v"/> + <reg name="v16" bitsize="128" type="aarch64v"/> + <reg name="v17" bitsize="128" type="aarch64v"/> + <reg name="v18" bitsize="128" type="aarch64v"/> + <reg name="v19" bitsize="128" type="aarch64v"/> + <reg name="v20" bitsize="128" type="aarch64v"/> + <reg name="v21" bitsize="128" type="aarch64v"/> + <reg name="v22" bitsize="128" type="aarch64v"/> + <reg name="v23" bitsize="128" type="aarch64v"/> + <reg name="v24" bitsize="128" type="aarch64v"/> + <reg name="v25" bitsize="128" type="aarch64v"/> + <reg name="v26" bitsize="128" type="aarch64v"/> + <reg name="v27" bitsize="128" type="aarch64v"/> + <reg name="v28" bitsize="128" type="aarch64v"/> + <reg name="v29" bitsize="128" type="aarch64v"/> + <reg name="v30" bitsize="128" type="aarch64v"/> + <reg name="v31" bitsize="128" type="aarch64v"/> + <reg name="fpsr" bitsize="32"/> + <reg name="fpcr" bitsize="32"/> +</feature>
diff --git a/rr/android/x86_64/share/rr/aarch64-pauth.xml b/rr/android/x86_64/share/rr/aarch64-pauth.xml new file mode 100644 index 0000000..2ce14b4 --- /dev/null +++ b/rr/android/x86_64/share/rr/aarch64-pauth.xml
@@ -0,0 +1,13 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2018-2020 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!DOCTYPE feature SYSTEM "gdb-target.dtd"> +<feature name="org.gnu.gdb.aarch64.pauth"> + <reg name="pauth_dmask" bitsize="64"/> + <reg name="pauth_cmask" bitsize="64"/> +</feature> +
diff --git a/rr/android/x86_64/share/rr/amd64-avx-linux.xml b/rr/android/x86_64/share/rr/amd64-avx-linux.xml new file mode 100644 index 0000000..d2dc3bc --- /dev/null +++ b/rr/android/x86_64/share/rr/amd64-avx-linux.xml
@@ -0,0 +1,19 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!-- AMD64 with AVX - Includes Linux-only special "register". --> + +<!DOCTYPE target SYSTEM "gdb-target.dtd"> +<target> + <architecture>i386:x86-64</architecture> + <osabi>GNU/Linux</osabi> + <xi:include href="64bit-core.xml"/> + <xi:include href="64bit-sse.xml"/> + <xi:include href="64bit-linux.xml"/> + <xi:include href="64bit-seg.xml"/> + <xi:include href="64bit-avx.xml"/> +</target>
diff --git a/rr/android/x86_64/share/rr/amd64-linux.xml b/rr/android/x86_64/share/rr/amd64-linux.xml new file mode 100644 index 0000000..aad02a3 --- /dev/null +++ b/rr/android/x86_64/share/rr/amd64-linux.xml
@@ -0,0 +1,19 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!-- AMD64 - Includes Linux-only special "register". --> + +<!DOCTYPE target SYSTEM "gdb-target.dtd"> +<target> + <architecture>i386:x86-64</architecture> + <osabi>GNU/Linux</osabi> + <xi:include href="64bit-core.xml"/> + <xi:include href="64bit-sse.xml"/> + <xi:include href="64bit-linux.xml"/> + <xi:include href="64bit-seg.xml"/> + <xi:include href="64bit-pkeys.xml"/> +</target>
diff --git a/rr/android/x86_64/share/rr/amd64-pkeys-linux.xml b/rr/android/x86_64/share/rr/amd64-pkeys-linux.xml new file mode 100644 index 0000000..1fa5bde --- /dev/null +++ b/rr/android/x86_64/share/rr/amd64-pkeys-linux.xml
@@ -0,0 +1,20 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!-- AMD64 with AVX - Includes Linux-only special "register". --> + +<!DOCTYPE target SYSTEM "gdb-target.dtd"> +<target> + <architecture>i386:x86-64</architecture> + <osabi>GNU/Linux</osabi> + <xi:include href="64bit-core.xml"/> + <xi:include href="64bit-sse.xml"/> + <xi:include href="64bit-linux.xml"/> + <xi:include href="64bit-seg.xml"/> + <xi:include href="64bit-avx.xml"/> + <xi:include href="64bit-pkeys.xml"/> +</target>
diff --git a/rr/android/x86_64/share/rr/i386-avx-linux.xml b/rr/android/x86_64/share/rr/i386-avx-linux.xml new file mode 100644 index 0000000..c957fab --- /dev/null +++ b/rr/android/x86_64/share/rr/i386-avx-linux.xml
@@ -0,0 +1,18 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!-- I386 with AVX- Includes Linux-only special "register". --> + +<!DOCTYPE target SYSTEM "gdb-target.dtd"> +<target> + <architecture>i386</architecture> + <osabi>GNU/Linux</osabi> + <xi:include href="32bit-core.xml"/> + <xi:include href="32bit-sse.xml"/> + <xi:include href="32bit-linux.xml"/> + <xi:include href="32bit-avx.xml"/> +</target>
diff --git a/rr/android/x86_64/share/rr/i386-linux.xml b/rr/android/x86_64/share/rr/i386-linux.xml new file mode 100644 index 0000000..625984e --- /dev/null +++ b/rr/android/x86_64/share/rr/i386-linux.xml
@@ -0,0 +1,18 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!-- I386 with SSE - Includes Linux-only special "register". --> + +<!DOCTYPE target SYSTEM "gdb-target.dtd"> +<target> + <architecture>i386</architecture> + <osabi>GNU/Linux</osabi> + <xi:include href="32bit-core.xml"/> + <xi:include href="32bit-linux.xml"/> + <xi:include href="32bit-sse.xml"/> + <xi:include href="32bit-pkeys.xml"/> +</target>
diff --git a/rr/android/x86_64/share/rr/i386-pkeys-linux.xml b/rr/android/x86_64/share/rr/i386-pkeys-linux.xml new file mode 100644 index 0000000..47f7b2f --- /dev/null +++ b/rr/android/x86_64/share/rr/i386-pkeys-linux.xml
@@ -0,0 +1,19 @@ +<?xml version="1.0"?> +<!-- Copyright (C) 2010-2014 Free Software Foundation, Inc. + + Copying and distribution of this file, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. --> + +<!-- I386 with AVX- Includes Linux-only special "register". --> + +<!DOCTYPE target SYSTEM "gdb-target.dtd"> +<target> + <architecture>i386</architecture> + <osabi>GNU/Linux</osabi> + <xi:include href="32bit-core.xml"/> + <xi:include href="32bit-sse.xml"/> + <xi:include href="32bit-linux.xml"/> + <xi:include href="32bit-avx.xml"/> + <xi:include href="32bit-pkeys.xml"/> +</target>
diff --git a/rr/android/x86_64/share/rr/src/preload/overrides.c b/rr/android/x86_64/share/rr/src/preload/overrides.c new file mode 100644 index 0000000..2f572b3 --- /dev/null +++ b/rr/android/x86_64/share/rr/src/preload/overrides.c
@@ -0,0 +1,334 @@ +/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ + +#define RR_IMPLEMENT_PRELOAD + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif + +#include <dlfcn.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/syscall.h> +#include <unistd.h> + +#include "preload_interface.h" +#include "syscallbuf.h" + +#define PTHREAD_MUTEX_PRIO_INHERIT_NP 32 + +#define DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE 1 +#ifdef __GLIBC_PREREQ +#if __GLIBC_PREREQ(2, 34) +#undef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE +#endif +#endif + +#ifndef __BIONIC__ + +// Use an old version of dlsym so this code still works when built against glibc > 2.34 +// but loaded into a process linking a pre-2.34 glibc. +#ifdef __x86_64__ +__asm__(".symver dlsym,dlsym@GLIBC_2.2.5"); +#elif defined(__i386__) +__asm__(".symver dlsym,dlsym@GLIBC_2.0"); +#endif + +static int (*real_pthread_mutex_init)(void* mutex, const void* attr); +static int (*real_pthread_mutex_lock)(void* mutex); +static int (*real_pthread_mutex_trylock)(void* mutex); +static int (*real_pthread_mutex_timedlock)(void* mutex, + const struct timespec* abstime); +static int (*real_pthread_mutexattr_setprotocol)(void* attr, int protocol); + +static void __attribute__((constructor)) init_override(void) { + real_pthread_mutex_init = dlsym(RTLD_NEXT, "pthread_mutex_init"); + real_pthread_mutex_lock = dlsym(RTLD_NEXT, "pthread_mutex_lock"); + real_pthread_mutex_trylock = dlsym(RTLD_NEXT, "pthread_mutex_trylock"); + real_pthread_mutex_timedlock = dlsym(RTLD_NEXT, "pthread_mutex_timedlock"); + real_pthread_mutexattr_setprotocol = dlsym(RTLD_NEXT, "pthread_mutexattr_setprotocol"); +} + +static void fix_mutex_kind(pthread_mutex_t* mutex) { + /* Disable priority inheritance. 
*/ + mutex->__data.__kind &= ~PTHREAD_MUTEX_PRIO_INHERIT_NP; +} + +#ifdef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE +/* + * We need to able to call directly to __pthread_mutex_lock and + * __pthread_mutex_trylock because setting up our indirect function pointers + * calls dlsym which itself can call pthread_mutex_lock (e.g. via application + * code overriding malloc/calloc to use a pthreads-based implementation). + * So before our pointers are set up, call these. + * + * If we're building against glibc 2.34 *but* we get run against a binary + * linking with glibc < 2.34 *and* the application overrides malloc to use + * pthreads-based synchronization then this won't work and we lose. Let's + * hope this doesn't happen. + */ +extern int __pthread_mutex_init(pthread_mutex_t* mutex, + const pthread_mutexattr_t* attr); +extern int __pthread_mutex_lock(pthread_mutex_t* mutex); +extern int __pthread_mutex_trylock(pthread_mutex_t* mutex); +#endif + +int pthread_mutex_init(pthread_mutex_t* mutex, + const pthread_mutexattr_t* attr) { + int ret; + pthread_mutexattr_t realattr; + + if (attr) { + /* We wish to enforce the use of plain (no PI) mutex to avoid + * needing to handle PI futex() operations. + * We also wish to ensure that pthread_mutexattr_getprotocol() + * still returns the requested protocol. + * So we copy the attribute and force PTHREAD_PRIO_NONE. + */ + memcpy(&realattr, attr, sizeof(realattr)); + // We assume dlsym doesn't call pthread_mutex_init with attributes. + // We avoid calling pthread_mutexattr_setprotocol (and any other pthread functions) + // directly because that won't work when we're built against glibc 2.34 but loaded + // into a process using glibc < 2.34. (pthread functions got a symbol version bump + // in 2.34.) + // + // But note that we can't use dlsym in cases where we would want to use the double + // underscore methods (i.e. glibc < 2.34). There is no double underscore version of + // pthread_mutexattr_setprotocol, so we call it directly. 
+ if (!real_pthread_mutexattr_setprotocol) { +#ifdef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE + ret = pthread_mutexattr_setprotocol(&realattr, PTHREAD_PRIO_NONE); + goto setprotocol; +#else + real_pthread_mutexattr_setprotocol = dlsym(RTLD_NEXT, "pthread_mutexattr_setprotocol"); +#endif + } + ret = real_pthread_mutexattr_setprotocol(&realattr, PTHREAD_PRIO_NONE); +#ifdef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE +setprotocol: +#endif + if (ret) { + return ret; + } + attr = &realattr; + } + if (!real_pthread_mutex_init) { +#ifdef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE + return __pthread_mutex_init(mutex, attr); +#else + real_pthread_mutex_init = dlsym(RTLD_NEXT, "pthread_mutex_init"); +#endif + } + return real_pthread_mutex_init(mutex, attr); +} + +/* Prevent use of lock elision; Haswell's TSX/RTM features used by + lock elision increment the rbc perf counter for instructions which + are later rolled back if the transaction fails. */ +int pthread_mutex_lock(pthread_mutex_t* mutex) { + fix_mutex_kind(mutex); + if (!real_pthread_mutex_lock) { +#ifdef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE + return __pthread_mutex_lock(mutex); +#else + real_pthread_mutex_lock = dlsym(RTLD_NEXT, "pthread_mutex_lock"); +#endif + } + return real_pthread_mutex_lock(mutex); +} + +int pthread_mutex_timedlock(pthread_mutex_t* mutex, + const struct timespec* abstime) { + fix_mutex_kind(mutex); + /* No __pthread_mutex_timedlock stub exists, so we have to use the + * indirect call no matter what. 
+ */ + if (!real_pthread_mutex_timedlock) { + real_pthread_mutex_timedlock = dlsym(RTLD_NEXT, "pthread_mutex_timedlock"); + } + return real_pthread_mutex_timedlock(mutex, abstime); +} + +int pthread_mutex_trylock(pthread_mutex_t* mutex) { + fix_mutex_kind(mutex); + if (!real_pthread_mutex_trylock) { +#ifdef DOUBLE_UNDERSCORE_PTHREAD_LOCK_AVAILABLE + return __pthread_mutex_trylock(mutex); +#else + real_pthread_mutex_trylock = dlsym(RTLD_NEXT, "pthread_mutex_trylock"); +#endif + } + return real_pthread_mutex_trylock(mutex); +} + +#endif + +typedef void* Dlopen(const char* filename, int flags); + +void* dlopen(const char* filename, int flags) { + // Give up our timeslice now. This gives us a full timeslice to + // execute the dlopen(), reducing the chance we'll hit + // https://sourceware.org/bugzilla/show_bug.cgi?id=19329. + Dlopen* f_ptr = (Dlopen*)dlsym(RTLD_NEXT, "dlopen"); + sched_yield(); + return f_ptr(filename, flags); +} + +/** Disable XShm since rr doesn't work with it */ +int XShmQueryExtension(__attribute__((unused)) void* dpy) { return 0; } + +/** Make sure XShmCreateImage returns null in case an application doesn't do + extension checks first. */ +void* XShmCreateImage(__attribute__((unused)) register void* dpy, + __attribute__((unused)) register void* visual, + __attribute__((unused)) unsigned int depth, + __attribute__((unused)) int format, + __attribute__((unused)) char* data, + __attribute__((unused)) void* shminfo, + __attribute__((unused)) unsigned int width, + __attribute__((unused)) unsigned int height) { + return 0; +} + +RR_HIDDEN char impose_syscall_delay; +RR_HIDDEN char impose_spurious_desched; + +/** + * This is for testing purposes only. + */ +void delayed_syscall(struct syscall_info* info) { + impose_syscall_delay = 1; + /* Make sure 'result' is used so it's not optimized out! 
*/ + syscall(info->no, info->args[0], info->args[1], info->args[2], info->args[3], + info->args[4], info->args[5]); + impose_syscall_delay = 0; +} + +/** + * This is for testing purposes only. + * Note that this must be defined outside of the syscallbuf code. + * Otherwise, the signal recording code may expect exit from this function + * to trigger the syscallbuf exit breakpoint. + */ +void* syscallbuf_ptr(void) { + return ((struct preload_thread_locals*)PRELOAD_THREAD_LOCALS_ADDR)->buffer; +} + +/** + * This is for testing purposes only. + */ +void spurious_desched_syscall(struct syscall_info* info) { + impose_spurious_desched = 1; + /* Make sure 'result' is used so it's not optimized out! */ + syscall(info->no, info->args[0], info->args[1], info->args[2], info->args[3], + info->args[4], info->args[5]); + impose_spurious_desched = 0; +} + +/** + * clang's LeakSanitizer has regular threads call sched_yield() in a loop while + * a helper thread ptrace-attaches to them. If we let sched_yield() enter the + * syscallbuf, the helper thread sees that the regular thread SP register + * is pointing to the syscallbuf alt-stack, outside the stack region it + * expects, which causes it to freak out. + * So, override sched_yield() to perform the syscall in a way that can't + * be syscall-buffered. + */ +int sched_yield(void) { +#ifdef __i386__ + // We have no syscall hook for `syscall` followed by `inc %ecx` + int trash; + asm volatile ("int $0x80; inc %0" : "=c"(trash) : "a"(SYS_sched_yield)); +#elif defined(__x86_64__) + // We have no syscall hook for `syscall` followed by `inc %ecx` + int trash; + asm volatile ("syscall; inc %0" : "=c"(trash) : "a"(SYS_sched_yield)); +#elif defined(__aarch64__) + register long x8 __asm__("x8") = SYS_sched_yield; + // We explicitly blacklisted syscall that follows `mov x8, 0xdc` + // to avoid patching clone. Abuse that to prevent this from being patched. 
+ __asm__ __volatile__("b 1f\n\t" + "mov x8, 0xdc\n" + "1:\n\t" + "svc 0\n" + :: "r"(x8) : "x0", "x30"); // x30 = lr +#else +#error "Unknown architecture" +#endif + return 0; +} + +#ifndef __aarch64__ + +/** + * glibc geteuid() can be compiled to instructions ending in "syscall; ret" + * which sometimes can't be hooked. So override it here with something that + * can be hooked. + * This is not an issue on aarch64 since we only need to patch a single instruction. + */ +uid_t geteuid(void) { +#ifdef __i386__ + return syscall(SYS_geteuid32); +#else + return syscall(SYS_geteuid); +#endif +} + +static void libstdcpp_not_found(void) { + const char msg[] = "[rr] Interposition for libstdc++ called but symbol lookups into libstdc++ failed.\n" + "Was libstdc++ loaded with RTLD_LOCAL? Try recording with `-v LD_PRELOAD=libstdc++.so.6`.\n" + "About to crash! "; + syscall(SYS_write, STDERR_FILENO, msg, sizeof(msg)); +} + +/** + * libstdc++3 uses RDRAND. Bypass that with this incredible hack. + */ +void _ZNSt13random_device7_M_initERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE( + void* this, __attribute__((unused)) void* token) { + static void (*assign_string)(void *, char*) = NULL; + static void (*random_init)(void *, void*) = NULL; + if (!assign_string) { + assign_string = (void (*)(void *, char*))dlsym(RTLD_NEXT, + "_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE6assignEPKc"); + if (!assign_string) { + libstdcpp_not_found(); + } + } + assign_string(token, "/dev/urandom"); + if (!random_init) { + random_init = (void (*)(void *, void*))dlsym(RTLD_NEXT, __func__); + if (!random_init) { + libstdcpp_not_found(); + } + } + random_init(this, token); +} + +/** + * gcc 4.8.4 in Ubuntu 14.04-32 + */ +void _ZNSt13random_device7_M_initERKSs(void* this, + __attribute__((unused)) void* token) { + static void (*assign_string)(void *, char*) = NULL; + static void (*random_init)(void *, void*) = NULL; + if (!assign_string) { + assign_string = (void (*)(void *, 
char*))dlsym(RTLD_NEXT, + "_ZNSs6assignEPKc"); + if (!assign_string) { + libstdcpp_not_found(); + } + } + assign_string(token, "/dev/urandom"); + if (!random_init) { + random_init = (void (*)(void *, void*))dlsym(RTLD_NEXT, __func__); + if (!random_init) { + libstdcpp_not_found(); + } + } + random_init(this, token); +} + +#endif
diff --git a/rr/android/x86_64/share/rr/src/preload/preload_interface.h b/rr/android/x86_64/share/rr/src/preload/preload_interface.h new file mode 100644 index 0000000..5266498 --- /dev/null +++ b/rr/android/x86_64/share/rr/src/preload/preload_interface.h
@@ -0,0 +1,750 @@ +/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ + +#ifndef RR_PRELOAD_INTERFACE_H_ +#define RR_PRELOAD_INTERFACE_H_ + +/* Bump this whenever the interface between syscallbuf and rr changes in a way + * that would require changes to replay. So be very careful making changes to + * this file! Many changes would require a bump in this value, and support + * code in rr to handle old protocol versions. And when we bump it we'll need + * to figure out a way to test the old protocol versions. + * To be clear, changes that only affect recording and not replay, such as + * changes to the layout of syscall_patch_hook, do not need to bump this. + * Note also that SYSCALLBUF_PROTOCOL_VERSION is stored in the trace header, so + * replay always has access to the SYSCALLBUF_PROTOCOL_VERSION used during + * recording, even before the preload library is ever loaded. + * + * Version 0: initial rr 5.0.0 release + */ +#define SYSCALLBUF_PROTOCOL_VERSION 0 + +#if defined(RR_IMPLEMENT_PRELOAD) || defined(RR_IMPLEMENT_AUDIT) +/* Avoid using <string.h> library functions */ +static inline int streq(const char* s1, const char* s2) { + while (1) { + if (*s1 != *s2) { + return 0; + } + if (!*s1) { + return 1; + } + ++s1; + ++s2; + } + return 1; +} +static inline size_t rrstrlen(const char* s) { + size_t ret = 0; + while (*s) { + ++s; + ++ret; + } + return ret; +} +#else +#include <string.h> +static inline int streq(const char* s1, const char* s2) { + return !strcmp(s1, s2); +} +static inline size_t rrstrlen(const char* s) { return strlen(s); } +#include "../remote_ptr.h" +#endif + +#include <stdint.h> +#include <stddef.h> + +static inline int strprefix(const char* s1, const char* s2) { + while (1) { + if (!*s1) { + return 1; + } + if (*s1 != *s2) { + return 0; + } + ++s1; + ++s2; + } + return 1; +} + +static inline const char* extract_file_name(const char* s) { + const char* ret = s; + while (*s) { + if (*s == '/') { + ret = s + 1; + } + ++s; 
+ } + return ret; +} + +/* This header file is included by preload.c and various rr .cc files. It + * defines the interface between the preload library and rr. preload.c + * #defines RR_IMPLEMENT_PRELOAD to let us handle situations where rr and + * preload.c need to see slightly different definitions of the same constructs. + * + * preload.c compiles this as C code. All rr modules compile this as C++ code. + * We do not use 'extern "C"' because we don't actually link between C and C++ + * and 'extern "C"' is not compatible with our use of templates below. + */ + +#define SYSCALLBUF_LIB_FILENAME_BASE "librrpreload" +#define SYSCALLBUF_LIB_FILENAME SYSCALLBUF_LIB_FILENAME_BASE ".so" +#define SYSCALLBUF_LIB_FILENAME_PADDED SYSCALLBUF_LIB_FILENAME_BASE ".so:::" +#define SYSCALLBUF_LIB_FILENAME_32 SYSCALLBUF_LIB_FILENAME_BASE "_32.so" + +#define RTLDAUDIT_LIB_FILENAME_BASE "librraudit" +#define RTLDAUDIT_LIB_FILENAME RTLDAUDIT_LIB_FILENAME_BASE ".so" +#define RTLDAUDIT_LIB_FILENAME_PADDED RTLDAUDIT_LIB_FILENAME_BASE ".so:::" +#define RTLDAUDIT_LIB_FILENAME_32 RTLDAUDIT_LIB_FILENAME_BASE "_32.so" + +#define RRPAGE_LIB_FILENAME_BASE "librrpage" +#define RRPAGE_LIB_FILENAME RRPAGE_LIB_FILENAME_BASE ".so" +#define RRPAGE_LIB_FILENAME_32 RRPAGE_LIB_FILENAME_BASE "_32.so" + +/* Set this env var to enable syscall buffering. */ +#define SYSCALLBUF_ENABLED_ENV_VAR "_RR_USE_SYSCALLBUF" + +/* Size of table mapping fd numbers to syscallbuf-disabled flag. 
*/ +#define SYSCALLBUF_FDS_DISABLED_SIZE 16384 + +#define MPROTECT_RECORD_COUNT 1000 + +#if defined(__x86_64__) || defined(__i386__) +#define RR_PAGE_SYSCALL_STUB_SIZE 3 +#define RR_PAGE_SYSCALL_INSTRUCTION_END 2 +#elif defined(__aarch64__) +#define RR_PAGE_SYSCALL_STUB_SIZE 8 +#define RR_PAGE_SYSCALL_INSTRUCTION_END 4 +#else +#error "Must be defined for this architecture" +#endif + +/* Must match generate_rr_page.py */ +#define RR_PAGE_ADDR 0x70000000 +#ifdef __aarch64__ +#define PRELOAD_LIBRARY_PAGE_SIZE 65536 +#else +#define PRELOAD_LIBRARY_PAGE_SIZE 4096 +#endif +#define RR_PAGE_SYSCALL_ADDR(index) \ + ((void*)(RR_PAGE_ADDR + RR_PAGE_SYSCALL_STUB_SIZE * (index))) +#define RR_PAGE_SYSCALL_TRACED RR_PAGE_SYSCALL_ADDR(0) +#define RR_PAGE_SYSCALL_PRIVILEGED_TRACED RR_PAGE_SYSCALL_ADDR(1) +#define RR_PAGE_SYSCALL_UNTRACED RR_PAGE_SYSCALL_ADDR(2) +#define RR_PAGE_SYSCALL_UNTRACED_REPLAY_ONLY RR_PAGE_SYSCALL_ADDR(3) +#define RR_PAGE_SYSCALL_UNTRACED_RECORDING_ONLY RR_PAGE_SYSCALL_ADDR(4) +#define RR_PAGE_SYSCALL_PRIVILEGED_UNTRACED RR_PAGE_SYSCALL_ADDR(5) +#define RR_PAGE_SYSCALL_PRIVILEGED_UNTRACED_REPLAY_ONLY RR_PAGE_SYSCALL_ADDR(6) +#define RR_PAGE_SYSCALL_PRIVILEGED_UNTRACED_RECORDING_ONLY \ + RR_PAGE_SYSCALL_ADDR(7) +#define RR_PAGE_SYSCALL_UNTRACED_REPLAY_ASSIST RR_PAGE_SYSCALL_ADDR(8) +#define RR_PAGE_IN_REPLAY_FLAG (RR_PAGE_ADDR + RR_PAGE_SYSCALL_STUB_SIZE * 9) +#define RR_PAGE_BREAKPOINT_VALUE (RR_PAGE_IN_REPLAY_FLAG + 4) + +/* Not ABI stable - in record page only */ +#define RR_PAGE_FF_BYTES RR_PAGE_BREAKPOINT_VALUE + +/* PRELOAD_THREAD_LOCALS_ADDR should not change. + * Tools depend on this address. 
*/ +#define PRELOAD_THREAD_LOCALS_ADDR (RR_PAGE_ADDR + PRELOAD_LIBRARY_PAGE_SIZE) +#ifdef __aarch64__ +#define PRELOAD_THREAD_LOCAL_SCRATCH2_SIZE (1024 + 8 * 2) +#else +#define PRELOAD_THREAD_LOCAL_SCRATCH2_SIZE 0 +#endif +#define PRELOAD_THREAD_LOCALS_SIZE (144 + PRELOAD_THREAD_LOCAL_SCRATCH2_SIZE) + +#include "rrcalls.h" + +/* Define macros that let us compile a struct definition either "natively" + * (when included by preload.c) or as a template over Arch for use by rr. + */ +#if defined(RR_IMPLEMENT_PRELOAD) || defined(RR_IMPLEMENT_AUDIT) +#define TEMPLATE_ARCH +#define PTR(T) T* +#define PTR_ARCH(T) T* +#define EMBED_STRUCT(T) struct T +#define VOLATILE volatile +#define SIGNED_LONG long +#define UNSIGNED_LONG unsigned long +#else +#define TEMPLATE_ARCH template <typename Arch> +#define PTR(T) typename Arch::template ptr<T> +#define PTR_ARCH(T) typename Arch::template ptr<T<Arch>> +#define EMBED_STRUCT(T) T<Arch> +#define VOLATILE +#define SIGNED_LONG typename Arch::signed_long +#define UNSIGNED_LONG typename Arch::unsigned_long +#endif + +#define PATCH_IS_MULTIPLE_INSTRUCTIONS (1 << 0) +/* The syscall instruction is the last instruction in the patched area + * (rather than the first), which requires special handling. + */ +#define PATCH_SYSCALL_INSTRUCTION_IS_LAST (1 << 1) +/* All instructions in the patch are nop and their execution is thus not + * observable. This may allow more aggressive handling of interfering branches. + */ +#define PATCH_IS_NOP_INSTRUCTIONS (1 << 2) + + +/** + * To support syscall buffering, we replace syscall instructions with a "call" + * instruction that calls a hook in the preload library to handle the syscall. 
+ * Since the call instruction takes more space than the syscall instruction, + * the patch replaces one or more instructions after the syscall instruction as + * well; those instructions are folded into the tail of the hook function + * and we have multiple hook functions, each one corresponding to an + * instruction that follows a syscall instruction. + * Each instance of this struct describes an instruction that can follow a + * syscall and a hook function to patch with. + * + * This is not (and must not ever be) used during replay so we can change it + * without bumping SYSCALLBUF_PROTOCOL_VERSION. + */ +struct syscall_patch_hook { + uint8_t flags; + uint8_t patch_region_length; + /* Avoid any padding or anything that would make the layout arch-specific. */ + uint8_t patch_region_bytes[14]; + uint64_t hook_address; +}; + +/** + * We buffer mprotect syscalls. Their effects need to be noted so we can + * update AddressSpace's cache of memory layout, which stores prot bits. So, + * the preload code builds a list of mprotect_records corresponding to the + * mprotect syscalls that have been buffered. This list is read by rr whenever + * we flush the syscallbuf, and its effects performed. The actual mprotect + * syscalls are performed during recording and replay. + * + * We simplify things by making this arch-independent. + */ +struct mprotect_record { + uint64_t start; + uint64_t size; + int32_t prot; + int32_t padding; +}; + +/** + * Must be arch-independent. + * Variables used to communicate between preload and rr. + * We package these up into a single struct to simplify the preload/rr + * interface. + * You can add to the end of this struct without breaking trace compatibility, + * but don't move existing fields. Do not write to it during replay except for + * the 'in_replay' field. Be careful reading fields during replay as noted + * below, since they don't all exist in all trace versions. + */ +struct preload_globals { + /* RESERVED in current versions of rr. 
+ * + * QUIRK: With UsesGlobalsInReplayQuirk: + * 0 during recording, 1 during replay. Set by rr. + * This MUST NOT be used in conditional branches. It should only be used + * as the condition for conditional moves so that control flow during replay + * does not diverge from control flow during recording. + * We also have to be careful that values different between record and replay + * don't accidentally leak into other memory locations or registers. + * USE WITH CAUTION. + */ + unsigned char reserved_legacy_in_replay; + /* 0 during recording and replay, 1 during diversion. Set by rr. + */ + unsigned char in_diversion; + /* 1 if chaos mode is enabled. DO NOT READ from rr during replay, because + this field is not initialized in old traces. */ + unsigned char in_chaos; + /* The signal to use for desched events */ + unsigned char desched_sig; + /* RESERVED */ + int reserved; + /** + * Set by rr. + * For each fd, indicate a class that is valid for all fds with the given + * number in all tasks that share this address space. For fds >= + * SYSCALLBUF_FDS_DISABLED_SIZE - 1, the class is given by by + * syscallbuf_fd_class[SYSCALLBUF_FDS_DISABLED_SIZE - 1]. See the + */ + VOLATILE char syscallbuf_fd_class[SYSCALLBUF_FDS_DISABLED_SIZE]; + + /* WARNING! SYSCALLBUF_FDS_DISABLED_SIZE can change, so + access to the following fields during replay is dangerous. Use + PRELOAD_GLOBALS_FIELD_AFTER_SYSCALLBUF_FDS_DISABLED or something + like it! */ + /* mprotect records. Set by preload. Us + PRELOAD_GLOBALS_FIELD_AFTER_SYSCALLBUF_FDS_DISABLED to access. */ + struct mprotect_record mprotect_records[MPROTECT_RECORD_COUNT]; + /* Random seed that can be used for various purposes. DO NOT READ from rr + during replay, because this field does not exist in old traces. */ + uint64_t random_seed; + /* RESERVED in current versions of rr. + * + * QUIRK: With UsesGlobalsInReplayQuirk: + * Indicates the value (in 8-byte increments) at which to raise a SIGSEGV + * trap once reached. 
NOTE: This remains constant during record, and is + * used only during replay. The same restrictions as in_replay above apply. + * + * Use PRELOAD_GLOBALS_FIELD_AFTER_SYSCALLBUF_FDS_DISABLED to access during + * replay. */ + uint64_t reserved_legacy_breakpoint_value; + /* Indicates whether or not all tasks in this address space have the same + fd table. Set by rr during record (modifications are recorded). + Read by the syscallbuf. Not read during replay. */ + unsigned char fdt_uniform; + /* The CPU we're bound to, if any; -1 if not bound. Not read during replay. */ + int32_t cpu_binding; +}; + +/** + * Represents syscall params. Makes it simpler to pass them around, + * and avoids pushing/popping all the data for calls. + */ +TEMPLATE_ARCH +struct syscall_info { + SIGNED_LONG no; + SIGNED_LONG args[6]; +}; + +TEMPLATE_ARCH +struct robust_list_info { + PTR(void) head; + uint32_t len; +}; + +TEMPLATE_ARCH +struct rseq_info { + PTR(void) rseq; + uint32_t len; + uint32_t sig; +}; + +/** + * Can be architecture dependent. The rr process does not manipulate + * these except to save and restore the values on task switches so that + * the values are always effectively local to the current task. rr also + * sets the |syscallbuf_stub_alt_stack| field. + * We use this instead of regular libc TLS because sometimes buggy application + * code breaks libc TLS for some tasks. With this approach we can be sure + * thread-locals are usable for any task in any state. + */ +TEMPLATE_ARCH +struct preload_thread_locals { + /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI + * rr depends on. + * Offset of this field is hardcoded in syscall_hook.S and + * assembly_templates.py. + * Pointer to alt-stack used by syscallbuf stubs (allocated at the end of + * the scratch buffer. + */ + PTR(void) syscallbuf_stub_alt_stack; + /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI + * tools can depend on. 
+ * Where syscall result will be (or during replay, has been) saved. + */ + PTR(int64_t) pending_untraced_syscall_result; + /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI + * rr depends on. + * Scratch space used by stub code. + */ + PTR(void) stub_scratch_1; + /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI + * rr depends on. + */ + int32_t alt_stack_nesting_level; + /* Syscall hook saved flags (bottom 16 bits only) */ + int32_t saved_flags; + /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI + * rr depends on. It contains the parameters to the patched syscall, or + * zero if we're not processing a buffered syscall. Do not depend on this + * existing during replay, some traces with SYSCALLBUF_PROTOCOL_VERSION 0 + * don't have it. + */ + PTR_ARCH(const struct syscall_info) original_syscall_parameters; + + /* Nonzero when thread-local state like the syscallbuf has been + * initialized. */ + int32_t thread_inited; + /* The offset of this field MUST NOT CHANGE, it is part of the ABI tools + * depend on. When buffering is enabled, points at the thread's mapped buffer + * segment. At the start of the segment is an object of type |struct + * syscallbuf_hdr|, so |buffer| is also a pointer to the buffer + * header. */ + PTR(uint8_t) buffer; + UNSIGNED_LONG buffer_size; + /* This is used to support the buffering of "may-block" system calls. + * The problem that needs to be addressed can be introduced with a + * simple example; assume that we're buffering the "read" and "write" + * syscalls. + * + * o (Tasks W and R set up a synchronous-IO pipe open between them; W + * "owns" the write end of the pipe; R owns the read end; the pipe + * buffer is full) + * o Task W invokes the write syscall on the pipe + * o Since write is a buffered syscall, the seccomp filter traps W + * directly to the kernel; there's no trace event for W delivered + * to rr. 
+ * o The pipe is full, so W is descheduled by the kernel because W + * can't make progress. + * o rr thinks W is still running and doesn't schedule R. + * + * At this point, progress in the recorded application can only be + * made by scheduling R, but no one tells rr to do that. Oops! + * + * Thus enter the "desched counter". It's a perf_event for the "sw t + * switches" event (which, more precisely, is "sw deschedule"; it + * counts schedule-out, not schedule-in). We program the counter to + * deliver a signal to this task when there's new counter data + * available. And we set up the "sample period", how many descheds + * are triggered before the signal is delivered, to be "1". This + * means that when the counter is armed, the next desched (i.e., the + * next time the desched counter is bumped up) of this task will + * deliver the signal to it. And signal delivery always generates a + * ptrace trap, so rr can deduce that this task was descheduled and + * schedule another. + * + * The description above is sort of an idealized view; there are + * numerous implementation details that are documented in + * handle_signal.c, where they're dealt with. */ + int32_t desched_counter_fd; + int32_t cloned_file_data_fd; + SIGNED_LONG cloned_file_data_offset; + PTR(void) scratch_buf; + UNSIGNED_LONG usable_scratch_size; + + PTR(struct msghdr) notify_control_msg; + + /* The offset of this field MUST NOT CHANGE, it is part of the preload ABI + * rr depends on, on ARM. + */ + uint8_t stub_scratch_2[PRELOAD_THREAD_LOCAL_SCRATCH2_SIZE]; + + /** When the size is non-zero, there has been a buffered set_robust_list + * that must be accounted for. Set by preload code only, read by rr + * only during recording. + */ + EMBED_STRUCT(robust_list_info) robust_list; + + /** True when either a buffered rseq or unbuffered rseq has been called + * for this thread. Set by rr for buffered rseq and preload for unbuffered + * rseq. 
*/ + int32_t rseq_called; + + /** When the len is non-zero, there has been a buffered rseq + * that must be accounted for. Set by preload code only, read by rr + * only during recording. + */ + EMBED_STRUCT(rseq_info) rseq; +}; +#if defined(__aarch64__) && (defined(RR_IMPLEMENT_PRELOAD) || \ + defined(RR_IMPLEMENT_AUDIT)) +// On aarch64, the stub_scratch_2 offset is hardcoded in the syscallbuf code +_Static_assert(offsetof(struct preload_thread_locals, stub_scratch_2) == 8 * 13, + "stub_scratch_2 offset mismatch"); +#endif + +// The set of flags that can be set for each fd in syscallbuf_fds_disabled. +enum syscallbuf_fd_classes { + // fd is invalid, all syscalls will error (syscallbuf internal use only) + FD_CLASS_INVALID = -1, + // The fd is allowed to be completely untraced. No notification to the + // syscall buf is required. + FD_CLASS_UNTRACED = 0x0, + // This is the most conservative option. All operations on this fd are + // always traced. If there is a conflict between other options, this one + // should be chosen. + FD_CLASS_TRACED = 0x1, + // This fd either refers to a /proc/<pid>/mem or is untraced (if this fd + // is shared with another fd table) + FD_CLASS_PROC_MEM = 0x2, +}; + +#define CURRENT_INIT_PRELOAD_PARAMS_VERSION 2 + +/** + * Packs up the parameters passed to |SYS_rrcall_init_preload|. + * We use this struct because it's a little cleaner. + * When evolving this struct, add new fields at the end and don't + * depend on them during replay. + */ +TEMPLATE_ARCH +struct rrcall_init_preload_params { + /* All "In" params. */ + /* The syscallbuf lib's idea of whether buffering is enabled. + * We let the syscallbuf code decide in order to more simply + * replay the same decision that was recorded. 
*/ + int syscallbuf_enabled; + int syscall_patch_hook_count; + PTR(struct syscall_patch_hook) syscall_patch_hooks; + PTR(void) unused; + PTR(void) syscallbuf_code_start; + PTR(void) syscallbuf_code_end; + PTR(void) get_pc_thunks_start; + PTR(void) get_pc_thunks_end; + PTR(void) syscallbuf_final_exit_instruction; + PTR(struct preload_globals) globals; + union { + struct { + /* Address of the first entry of the breakpoint table. + * After processing a syscallbuf record (and unlocking the syscallbuf), + * we call a function in this table corresponding to the record processed. + * rr can set a breakpoint in this table to break on the completion of a + * particular syscallbuf record. + * This method of setting the breakpoint is deprecated. Instead, use the + * interface below. It is retained for compatibility */ + PTR(void) breakpoint_table; + int breakpoint_table_entry_size; + }; + struct { + PTR(void) breakpoint_instr_addr; + // Set to -1 to indicate non-legacy mode + int breakpoint_mode_sentinel; + }; + }; + PTR(void) syscallbuf_syscall_hook; +}; + +/** + * Packs up the inout parameters passed to |SYS_rrcall_init_buffers|. + * We use this struct because there are too many params to pass + * through registers on at least x86. (It's also a little cleaner.) + */ +TEMPLATE_ARCH +struct rrcall_init_buffers_params { + /* The fd we're using to track desched events. */ + int desched_counter_fd; + /* "Out" params. */ + int cloned_file_data_fd; + /* Returned pointer to and size of the shared syscallbuf + * segment. */ + PTR(void) syscallbuf_ptr; + /* Returned pointer to rr's syscall scratch buffer */ + PTR(void) scratch_buf; + uint32_t syscallbuf_size; + uint32_t usable_scratch_size; +}; + +/** + * The syscall buffer comprises an array of these variable-length + * records, along with the header below. + */ +struct syscallbuf_record { + /* Return value from the syscall. This can be a memory + * address, so must be as big as a memory address can be. 
+ * We use 64 bits rather than make syscallbuf_record Arch-specific as that + * gets cumbersome. + */ + int64_t ret; + /* Syscall number. + * + * NB: the x86 linux ABI has 350 syscalls as of 3.9.6 and + * x86-64 defines 313, so this is a pretty safe storage + * allocation. It would be an earth-shattering event if the + * syscall surface were doubled in a short period of time, and + * even then we would have a comfortable cushion. Still, + * + * TODO: static_assert this can hold largest syscall num */ + uint16_t syscallno; + /* Did the tracee arm/disarm the desched notification for this + * syscall? */ + uint8_t desched : 1; + /* Does this record require an assist during replay ? */ + uint8_t replay_assist : 1; + uint8_t _flags_padding : 6; + uint8_t _padding; + /* Size of entire record in bytes: this struct plus extra + * recorded data stored inline after the last field, not + * including padding. + * + * TODO: static_assert this can repr >= buffer size */ + uint32_t size; + /* Extra recorded outparam data starts here. */ + uint8_t extra_data[0]; +}; + +/** + * This struct summarizes the state of the syscall buffer. It happens + * to be located at the start of the buffer. + */ +struct syscallbuf_hdr { + /* The number of valid syscallbuf_record bytes in the buffer, + * not counting this header. + * Make this volatile so that memory writes aren't reordered around + * updates to this field. */ + volatile uint32_t num_rec_bytes; + /* Number of mprotect calls since last syscallbuf flush. The last record in + * the list may not have been applied yet. + */ + volatile uint32_t mprotect_record_count; + /* Number of records whose syscalls have definitely completed. + * May be one less than mprotect_record_count. + */ + volatile uint32_t mprotect_record_count_completed; + /* True if the current syscall should not be committed to the + * buffer, for whatever reason; likely interrupted by + * desched. Set by rr. 
*/ + volatile uint8_t abort_commit; + /* True if, next time we exit the syscall buffer hook, libpreload should + * execute SYS_rrcall_notify_syscall_hook_exit to give rr the opportunity to + * deliver a signal and/or reset the syscallbuf. */ + volatile uint8_t notify_on_syscall_hook_exit; + /* This tracks whether the buffer is currently in use for a + * system call or otherwise unavailable. This is helpful when + * a signal handler runs during a wrapped system call; we don't want + * it to use the buffer for its system calls. The different reasons why the + * buffer could be locked, use different bits of this field and the buffer + * may be used only if all are clear. See enum syscallbuf_locked_why for + * used bits. + */ + volatile uint8_t locked; + /* Nonzero when rr needs to worry about the desched signal. + * When it's zero, the desched signal can safely be + * discarded. */ + volatile uint8_t desched_signal_may_be_relevant; + /* A copy of the task's signal mask. Updated by preload when a buffered + * rt_sigprocmask executes. + */ + volatile uint64_t blocked_sigs; + /* Incremented by preload every time a buffered rt_sigprocmask executes. + * Cleared during syscallbuf reset. + */ + volatile uint32_t blocked_sigs_generation; + /* Nonzero when preload is in the process of calling an untraced + * sigprocmask; the real sigprocmask may or may not match blocked_sigs. + */ + volatile uint8_t in_sigprocmask_critical_section; + /* Nonzero when the syscall was aborted during preparation without doing + * anything. This is set when a user seccomp filter forces a SIGSYS. */ + volatile uint8_t failed_during_preparation; + + struct syscallbuf_record recs[0]; +} __attribute__((__packed__)); +/* TODO: static_assert(sizeof(uint32_t) == + * sizeof(struct syscallbuf_hdr)) */ + +/** + * Each bit of syscallbuf_hdr->locked indicates a reason why the syscallbuf + * is locked. These are all the bits that are currently defined. 
+ */ +enum syscallbuf_locked_why { + /* Used by the tracee, during interruptible syscalls to avoid recursion */ + SYSCALLBUF_LOCKED_TRACEE = 0x1, + /* Used by the tracer to prevent syscall buffering when necessary to preserve + semantics (e.g. for ptracees whose syscalls are being observed) */ + SYSCALLBUF_LOCKED_TRACER = 0x2 +}; + +/** + * Return a pointer to what may be the next syscall record. + * + * THIS POINTER IS NOT GUARANTEED TO BE VALID!!! Caveat emptor. + */ +inline static struct syscallbuf_record* next_record( + struct syscallbuf_hdr* hdr) { + uintptr_t next = (uintptr_t)hdr->recs + hdr->num_rec_bytes; + return (struct syscallbuf_record*)next; +} + +/** + * Return the amount of space that a record of |length| will occupy in + * the buffer if committed, including padding. + */ +inline static long stored_record_size(size_t length) { + /* Round up to a whole number of 64-bit words. */ + return (length + 7) & ~7; +} + +/** + * Return nonzero if an attempted open() of |filename| should be + * blocked. + * + * The background of this hack is that rr doesn't support DRI/DRM + * currently, so we use the blunt stick of refusing to open this + * interface file as a way of disabling it entirely. (In addition to + * tickling xorg.conf, which doesn't entirely do the trick.) It's + * known how to fix this particular, so let's not let this hack grow + * too much by piling on. 
+ */ +inline static int is_blacklisted_filename(const char* filename) { + const char* f; + if (strprefix("/dev/dri/", filename) || streq("/dev/nvidiactl", filename) || + streq("/usr/share/alsa/alsa.conf", filename) || + streq("/dev/nvidia-uvm", filename)) { + return 1; + } + f = extract_file_name(filename); + return strprefix("rr-test-blacklist-file_name", f) || + strprefix("pulse-shm-", f); +} + +inline static int is_blacklisted_memfd(const char* name) { + return streq("pulseaudio", name); +} + +inline static int is_blacklisted_socket(const char* filename) { + /* Blacklist the nscd socket because glibc communicates with the daemon over + * shared memory rr can't handle. + */ + return streq("/var/run/nscd/socket", filename); +} + +inline static int is_gcrypt_deny_file(const char* filename) { + return streq("/etc/gcrypt/hwf.deny", filename); +} + +inline static int is_terminal(const char* filename) { + return strprefix("/dev/tty", filename) || strprefix("/dev/pts", filename); +} + +inline static int is_proc_mem_file(const char* filename) { + if (!strprefix("/proc/", filename)) { + return 0; + } + return streq(filename + rrstrlen(filename) - 4, "/mem"); +} + +inline static int is_proc_fd_dir(const char* filename) { + if (!strprefix("/proc/", filename)) { + return 0; + } + + int len = rrstrlen(filename); + const char* fd_bit = filename + len; + if (*fd_bit == '/') { + fd_bit--; + } + + return strprefix("/fd", fd_bit - 3); +} + +inline static int is_sys_cpu_online_file(const char* filename) { + return streq("/sys/devices/system/cpu/online", filename); +} + +inline static int is_proc_stat_file(const char* filename) { + return streq("/proc/stat", filename); +} + +inline static int is_rr_page_lib(const char* filename) { + return streq(extract_file_name(filename), RRPAGE_LIB_FILENAME) || + streq(extract_file_name(filename), RRPAGE_LIB_FILENAME_32); +} + +/** + * Returns nonzero if an attempted open() of |filename| can be syscall-buffered. 
+ * When this returns zero, the open must be forwarded to the rr process. + * |filename| must be absolute. + * This is imperfect because it doesn't handle hard links and files (re)mounted + * in different places. + */ +inline static int allow_buffered_open(const char* filename) { + return filename && + !is_blacklisted_filename(filename) && !is_gcrypt_deny_file(filename) && + !is_terminal(filename) && !is_proc_mem_file(filename) && + !is_proc_fd_dir(filename) && !is_sys_cpu_online_file(filename) && + !is_proc_stat_file(filename) && !is_rr_page_lib(filename); +} + +#endif /* RR_PRELOAD_INTERFACE_H_ */
diff --git a/rr/android/x86_64/share/rr/src/preload/raw_syscall.S b/rr/android/x86_64/share/rr/src/preload/raw_syscall.S new file mode 100644 index 0000000..4c7b6a3 --- /dev/null +++ b/rr/android/x86_64/share/rr/src/preload/raw_syscall.S
@@ -0,0 +1,176 @@ +#if defined(__i386__) + .text + .globl _raw_syscall + .hidden _raw_syscall + .type _raw_syscall, @function +_raw_syscall: /* syscallno = 4(%esp) */ + .cfi_startproc + pushl %ebx /* syscallno = 8(%esp) */ + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset %ebx, 0 + pushl %esi /* syscallno = 12(%esp) */ + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset %esi, 0 + pushl %edi /* syscallno = 16(%esp) */ + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset %edi, 0 + pushl %ebp /* syscallno = 20(%esp) */ + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset %ebp, 0 + + movl 20(%esp), %eax /* %eax = syscallno */ + movl 24(%esp), %ebx /* %ebx = a0 */ + movl 28(%esp), %ecx /* %ecx = a1 */ + movl 32(%esp), %edx /* %edx = a2 */ + movl 36(%esp), %esi /* %esi = a3 */ + movl 40(%esp), %edi /* %edi = a4 */ + movl 44(%esp), %ebp /* %ebp = a5 */ + + pushl 56(%esp) + .cfi_adjust_cfa_offset 4 + pushl 56(%esp) + .cfi_adjust_cfa_offset 4 + + call *56(%esp) + + addl $8,%esp + .cfi_adjust_cfa_offset -8 + popl %ebp + .cfi_adjust_cfa_offset -4 + .cfi_restore %ebp + popl %edi + .cfi_adjust_cfa_offset -4 + .cfi_restore %edi + popl %esi + .cfi_adjust_cfa_offset -4 + .cfi_restore %esi + popl %ebx + .cfi_adjust_cfa_offset -4 + .cfi_restore %ebx + ret + .cfi_endproc + .size _raw_syscall, . - _raw_syscall + +#elif defined(__x86_64__) + .text + .globl _raw_syscall + .hidden _raw_syscall + .type _raw_syscall, @function +_raw_syscall: + .cfi_startproc + /* Incoming args are in %rdi, %rsi, %rdx, %rcx, %r8, %r9, and 8(%rsp). + Syscall arguments are %rdi, %rsi, %rdx, %r10, %r8, %r9. 
*/ + movq %rdi, %rax /* syscall number */ + movq %rsi, %rdi /* first syscall arg */ + movq %rdx, %rsi /* second syscall arg */ + movq %rcx, %rdx /* third syscall arg */ + movq %r8, %r10 /* fourth syscall arg */ + movq %r9, %r8 /* fifth syscall arg */ + movq 8(%rsp), %r9 /* sixth syscall arg */ + + pushq 32(%rsp) + .cfi_adjust_cfa_offset 8 + pushq 32(%rsp) + .cfi_adjust_cfa_offset 8 + + /* During a system call the kernel makes some user-space-visible + register changes: + a) on entry, %r11 is set to %rflags + b) %rcx is sometimes set to -1 (perhaps because of something rr does) + c) on entry or exit, some flags are sometimes changed + Also, during replay we may perform single-stepping which can set + TF (trace flag). We need to hide this. + + fixup_syscall_registers is responsible for fixing up registers + to hide these effects when we get a ptrace trap from system calls + in the kernel: it clears TF from %r11, forces %rcx to -1, and sets + flags to fixed values (ZF+PF+IF+reserved, same as for "xor reg,reg"). + Task::canonicalize_and_set_regs is responsible for fixing up registers + when we emulate a system call that was traced during recording (by + running to a breakpoint at that system call). It does the above + effects after setting %r11 to %rflags. + + For untraced system calls there is no trap to rr during recording or + replay, so we must handle these issues here. We do not need + untraced system calls to behave exactly the same as traced + system calls, since whether a given system call was traced or not is + the same whether recording or replaying, but it's a good idea to + make them as similar as possible. We do need register values + to be perfectly consistent at every instruction in every replay + whether or not singlestepping is used (because a ReplayTimeline::mark + might be created at any point). During replay, untraced syscall + instructions are replaced with "xor %eax,%eax". 
+ + The following code is harmless for traced syscalls (and needs to be, + because traced syscalls go through here too). + */ + + /* Set %r11 and %rcx to the values we expect them to have after the + system call. + Set flags to ZF+PF+IF+reserved (0x246) first. This simplifies + everything. + This all has to be independent of TF being set at any point during + replay! But the way we're doing it here, it's trivial. + */ + xor %ecx,%ecx + /* At this point, flags are 0x246 + possibly TF. */ + movq $0x246,%r11 + movq $-1,%rcx + + callq *32(%rsp) + + /* At this point, during recording we don't trust the kernel to have + restored flags correctly. It probably doesn't matter, but fix it + anyway. */ + xor %ecx,%ecx + /* At this point, the high 32 bits of %rcx are unknown. Fix that by + setting to -1 to match traced syscalls. */ + movq $-1,%rcx + /* At this point, %r11 is always 0x246 during replay and during + recording (because TF is never set during recording). Nothing to + fix in %r11. */ + + addq $16,%rsp + .cfi_adjust_cfa_offset -16 + ret + .cfi_endproc + .size _raw_syscall, . - _raw_syscall + +#elif defined(__aarch64__) + .text + .globl _raw_syscall + .hidden _raw_syscall + .type _raw_syscall, @function +_raw_syscall: + .cfi_startproc + // The two stack arguments needs to be at sp + 8 and sp + 16 + // but they are currently at sp and sp + 8. + // Since sp needs to be 16 bytes aligned we need to load and push them again. + str x30, [sp, -32]! + .cfi_def_cfa_offset 32 + .cfi_offset x30, -32 + ldp x8, x30, [sp, 32] + stp x8, x30, [sp, 8] + mov x8,x0 + mov x0,x1 + mov x1,x2 + mov x2,x3 + mov x3,x4 + mov x4,x5 + mov x5,x6 + blr x7 + ldr x30, [sp], 32 + .cfi_def_cfa_offset 0 + .cfi_restore x30 + ret + .cfi_endproc + .size _raw_syscall, . - _raw_syscall +#else +#error unknown CPU architecture +#endif /* __i386__/__x86_64__ */ + .global _syscallbuf_code_end + .hidden _syscallbuf_code_end +_syscallbuf_code_end: + + .section .note.GNU-stack,"",@progbits + .previous
diff --git a/rr/android/x86_64/share/rr/src/preload/rr_page.S b/rr/android/x86_64/share/rr/src/preload/rr_page.S new file mode 100644 index 0000000..e0d253e --- /dev/null +++ b/rr/android/x86_64/share/rr/src/preload/rr_page.S
@@ -0,0 +1,100 @@ +// # Layout of the librrpage.so file +// +// The `rr page` is a special page mapped in low memory (at RR_PAGE_ADDR) that +// contains syscall instructions at known ip values. These values must be fixed +// for all processes in a given rr session, since rr cannot adjust the seccomp +// filter that makes use of these values once it has been set. `librrpage.so` +// contains this page, and rr will map it in place at process start and inform +// the process about it by passing it as the address of the vdso. This way +// the tracee's unwinders, as well as GDB will load the librrpage.so symbols and +// unwind info and function correctly if execution is stopped in these locations. +// +// The `librrpage.so` file is made up of five pages: +// 1: The ELF header, dynamic symbol/string table, and eh_frame sections +// 2: The ELF section, symbol string tables (moved here in a post-processing step) +// 3: A fake vdso that rr will ask the kernel to treat as the real vdso +// 4: The rr page to be used during recording +// 5: The rr page to be used during replay +// +// During record, rr will map the first four pages of librrpage.so only. +// During replay, rr will replace the record page by the replay page. 
+// Note however, that we only have one copy of the eh_frame and symbol +// information - we expect all offsets and unwind instructions to match between +// the record and replay versions (anything else would likely result in +// divergences anyway) + +#ifdef __i386__ +#define CALL \ + int $0x80; \ + ret +#define NOCALL \ + xor %eax, %eax; \ + ret +#define TRAP \ + nop; int $3; \ + ret +#define PAGE_ALIGN \ + .align 0x1000 +#define PRELOAD_LIBRARY_PAGE_SIZE 0x1000 +#elif defined(__x86_64__) +#define CALL \ + syscall; \ + ret +#define NOCALL \ + xor %eax, %eax; \ + ret +#define TRAP \ + nop; int $3; \ + ret +#define PAGE_ALIGN \ + .align 0x1000 +#define PRELOAD_LIBRARY_PAGE_SIZE 0x1000 +#elif defined(__aarch64__) +#define CALL \ + svc #0; \ + ret +#define NOCALL \ + movz x0, #0; \ + ret +#define TRAP \ + brk #0; \ + ret +#define PAGE_ALIGN \ + .align 16 +#define PRELOAD_LIBRARY_PAGE_SIZE 0x10000 +#endif + +.section .sh_placeholder, "a" +PAGE_ALIGN +.fill PRELOAD_LIBRARY_PAGE_SIZE, 1, 0xff + +.section .vdso.text, "a", @progbits +PAGE_ALIGN + +#include "rr_vdso.S" + +.section .record.text, "a", @progbits +PAGE_ALIGN + +.global rr_page_start +rr_page_start: + +#define LABEL(name) #name:; +#define STARTPROC(name) #name:; .cfi_startproc +#define STARTPROC_GLOBAL(name) .global #name; #name:; .cfi_startproc +#define CFI_ENDPROC .cfi_endproc +#include "rr_page_instructions.S" + +.section .replay.text, "", @progbits +PAGE_ALIGN +replay_page: +// No CFI instructions or symbols for the replay page - we'll implicitly share +// those of the record copy +#undef LABEL +#undef STARTPROC +#undef CFI_ENDPROC +#define LABEL(name) +#define STARTPROC(name) +#define CFI_ENDPROC +#define IS_REPLAY 1 +#include "rr_page_instructions.S"
diff --git a/rr/android/x86_64/share/rr/src/preload/rr_page.ld b/rr/android/x86_64/share/rr/src/preload/rr_page.ld new file mode 100644 index 0000000..df30100 --- /dev/null +++ b/rr/android/x86_64/share/rr/src/preload/rr_page.ld
@@ -0,0 +1,58 @@ +PHDRS +{ + header PT_LOAD FILEHDR PHDRS; + text PT_LOAD ; + dynamic PT_DYNAMIC ; + note PT_NOTE ; + eh_frame 0x6474e550 ; + replay PT_NULL; +} +SECTIONS +{ + . = 0x70000000 - 3 * 4096 + SIZEOF_HEADERS; + .eh_frame_hdr : { *(.eh_frame_hdr) *(.eh_frame_entry .eh_frame_entry.*) } :header :eh_frame + .eh_frame : { KEEP (*(.eh_frame)) *(.eh_frame.*) } :header :eh_frame + .note.gnu.build-id : { *(.note.gnu.build-id) } :header :note + .note.gnu.property : { *(.note.gnu.property) } :header :note + .hash : { *(.hash) } :header + .gnu.hash : { *(.gnu.hash) } :header + .dynsym : { *(.dynsym) } :header + .dynstr : { *(.dynstr) } :header + .dynamic : { *(.dynamic) } :header :dynamic + .gnu.version : { *(.gnu.version) } :header + .gnu.version_d : { *(.gnu.version_d) } :header + .gnu.version_r : { *(.gnu.version_r) } :header + .got : { *(.got) } :header + .got.plt : { *(.got.plt) } :header + . = 0x70000000 - 2 * 4096; + /* This space in .sh_placeholder is reserved for the section table + symtab/strtab, which ordinarily go after the text sections, + but we need to have before the rr page. + We move it there in a post-processing step, since linker + scripts can't specify these locations for legacy reasons */ + .sh_placeholder : { *(.sh_placeholder) } :header + . = 0x70000000 - 4096; + .vdso.text : { *(.vdso.text) } :text + . = 0x70000000; + .record.text : { *(.record.text) } :text + . = 0x70000000 + 4096; + .replay.text : { *(.replay.text) } :replay + /DISCARD/ : { *(.debug_* ) } +} + +VERSION { + LINUX_2.6 { + global: + gettimeofday; + clock_gettime; + __vdso_gettimeofday; + __vdso_clock_getres; + __vdso_time; + __vdso_clock_gettime; + __vdso_getcpu; + __kernel_clock_getres; + __kernel_rt_sigreturn; + __kernel_gettimeofday; + __kernel_clock_gettime; + }; +}
diff --git a/rr/android/x86_64/share/rr/src/preload/rr_page_instructions.S b/rr/android/x86_64/share/rr/src/preload/rr_page_instructions.S new file mode 100644 index 0000000..a679187 --- /dev/null +++ b/rr/android/x86_64/share/rr/src/preload/rr_page_instructions.S
@@ -0,0 +1,61 @@ +// See rr_page.S + +#ifdef IS_REPLAY +#define REPLAY_ONLY_CALL CALL +#define RECORD_ONLY_CALL NOCALL +#else +#define REPLAY_ONLY_CALL NOCALL +#define RECORD_ONLY_CALL CALL +#endif + +STARTPROC(syscall_traced) + CALL + CFI_ENDPROC +STARTPROC(syscall_priv_traced) + CALL + CFI_ENDPROC +STARTPROC(syscall_untraced) + CALL + CFI_ENDPROC +STARTPROC(syscall_untraced_replay_only) + REPLAY_ONLY_CALL + CFI_ENDPROC +STARTPROC(syscall_untraced_record_only) + RECORD_ONLY_CALL + CFI_ENDPROC +STARTPROC(syscall_priv_untraced) + CALL + CFI_ENDPROC +STARTPROC(syscall_priv_untraced_replay_only) + REPLAY_ONLY_CALL + CFI_ENDPROC +STARTPROC(syscall_priv_untraced_record_only) + RECORD_ONLY_CALL + CFI_ENDPROC +STARTPROC(syscall_untraced_replay_assist) +#ifdef IS_REPLAY + TRAP +#else + CALL +#endif + CFI_ENDPROC + +LABEL(in_replay_flag) +#ifdef IS_REPLAY + .byte 0x01 +#else + .byte 0x00 +#endif +.byte 0x00, 0x00, 0x00 + +// During replay, we put the breakpoint_value here. During record this remains +// as -1, giving us 8 ff bytes at a well known address during record. These are used +// during exit. +LABEL(breakpoint_value) +LABEL(ff_bytes) +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + +// ABI stability ends here. + +#undef REPLAY_ONLY_CALL +#undef RECORD_ONLY_CALL
diff --git a/rr/android/x86_64/share/rr/src/preload/rr_vdso.S b/rr/android/x86_64/share/rr/src/preload/rr_vdso.S new file mode 100644 index 0000000..faa1799 --- /dev/null +++ b/rr/android/x86_64/share/rr/src/preload/rr_vdso.S
@@ -0,0 +1,138 @@ +#ifdef __aarch64__ +#define STARTPROC_GLOBAL(name) .globl #name; .type #name, @function; \ + #name:; .cfi_startproc +#else +#define STARTPROC_GLOBAL(name) .global #name; .type #name, @function; \ + #name:; .cfi_startproc +#endif +#define CFI_ENDPROC .cfi_endproc + +// Older libs don't use the __vdso symbols, but try to look for the syscall +// names directly. Follow the kernel vdso and make them weak aliases +#define WEAK_ALIAS(sym, target) .weak sym; .set sym, target + +#if defined(__x86_64__) + +#define SYSCALL(which) \ + movq $which, %rax; \ + syscall; \ + nop; \ + nop; \ + nop; \ + retq + +STARTPROC_GLOBAL(__vdso_clock_getres) +SYSCALL(229) +CFI_ENDPROC +STARTPROC_GLOBAL(__vdso_getcpu) +SYSCALL(309) +CFI_ENDPROC +STARTPROC_GLOBAL(__vdso_time) +SYSCALL(201) +CFI_ENDPROC +STARTPROC_GLOBAL(__vdso_clock_gettime) +SYSCALL(228) +CFI_ENDPROC +STARTPROC_GLOBAL(__vdso_gettimeofday) +SYSCALL(96) +CFI_ENDPROC + +WEAK_ALIAS(clock_getres, __vdso_clock_getres) +WEAK_ALIAS(getcpu, __vdso_getcpu) +WEAK_ALIAS(time, __vdso_time) +WEAK_ALIAS(clock_gettime, __vdso_clock_gettime) +WEAK_ALIAS(gettimeofday,__vdso_gettimeofday) + +.symver gettimeofday,gettimeofday@LINUX_2.6 +.symver clock_gettime,clock_gettime@LINUX_2.6 +.symver __vdso_gettimeofday,__vdso_gettimeofday@LINUX_2.6 +.symver __vdso_clock_getres,__vdso_clock_getres@LINUX_2.6 +.symver __vdso_time,__vdso_time@LINUX_2.6 +.symver __vdso_clock_gettime,__vdso_clock_gettime@LINUX_2.6 +.symver __vdso_getcpu,__vdso_getcpu@LINUX_2.6 + +#elif defined(__i386__) + +// __vdso functions use the C calling convention, so +// we have to set up the syscall parameters here. +// No x86-32 __vdso functions take more than two parameters. 
+#define SYSCALL(which) \ + push %ebx; \ + .cfi_adjust_cfa_offset 4; \ + .cfi_rel_offset %ebx, 0; \ + mov 8(%esp),%ebx; \ + mov 12(%esp),%ecx; \ + mov $which, %eax; \ + int $0x80; \ + nop; \ + nop; \ + nop; \ + pop %ebx; \ + .cfi_adjust_cfa_offset -4; \ + .cfi_restore %ebx; \ + ret + +// N.B.: We depend on this being the first symbol in the vdso page. +STARTPROC_GLOBAL(__kernel_vsyscall) +int $0x80 +nop +nop +nop +ret +CFI_ENDPROC +STARTPROC_GLOBAL(__vdso_clock_getres) +SYSCALL(266) +CFI_ENDPROC +STARTPROC_GLOBAL(__vdso_time) +SYSCALL(13) +CFI_ENDPROC +STARTPROC_GLOBAL(__vdso_clock_gettime) +SYSCALL(265) +CFI_ENDPROC +STARTPROC_GLOBAL(__vdso_clock_gettime64) +SYSCALL(403) +CFI_ENDPROC +STARTPROC_GLOBAL(__vdso_gettimeofday) +SYSCALL(78) +CFI_ENDPROC + +WEAK_ALIAS(clock_getres, __vdso_clock_getres) +WEAK_ALIAS(time, __vdso_time) +WEAK_ALIAS(clock_gettime, __vdso_clock_gettime) +WEAK_ALIAS(clock_gettime64, __vdso_clock_gettime64) +WEAK_ALIAS(gettimeofday,__vdso_gettimeofday) + +.symver __vdso_gettimeofday,__vdso_gettimeofday@LINUX_2.6 +.symver __vdso_clock_getres,__vdso_clock_getres@LINUX_2.6 +.symver __vdso_time,__vdso_time@LINUX_2.6 +.symver __vdso_clock_gettime,__vdso_clock_gettime@LINUX_2.6 +.symver __vdso_getcpu,__vdso_getcpu@LINUX_2.6 + +#elif defined(__aarch64__) + +#define SYSCALL(which) \ + mov x8, which; \ + svc 0; \ + ret + +STARTPROC_GLOBAL(__kernel_clock_getres) +SYSCALL(114) +CFI_ENDPROC +STARTPROC_GLOBAL(__kernel_rt_sigreturn) +SYSCALL(139) +CFI_ENDPROC +STARTPROC_GLOBAL(__kernel_gettimeofday) +SYSCALL(169) +CFI_ENDPROC +STARTPROC_GLOBAL(__kernel_clock_gettime) +SYSCALL(113) +CFI_ENDPROC + +#else + +#error "VDSO Hooks not defined for this platform" + +#endif + +#undef STARTPROC_GLOBAL +#undef CFI_ENDPROC
diff --git a/rr/android/x86_64/share/rr/src/preload/rrcalls.h b/rr/android/x86_64/share/rr/src/preload/rrcalls.h new file mode 100644 index 0000000..b448495 --- /dev/null +++ b/rr/android/x86_64/share/rr/src/preload/rrcalls.h
@@ -0,0 +1,103 @@ +/* "Magic" (rr-implemented) syscalls that we use to initialize the + * syscallbuf. + * + * NB: magic syscalls must be positive, because with at least linux + * 3.8.0 / eglibc 2.17, rr only gets a trap for the *entry* of invalid + * syscalls, not the exit. rr can't handle that yet. */ +/* TODO: static_assert(LAST_SYSCALL < SYS_rrcall_init_buffers) */ + +#define RR_CALL_BASE 1000 + +/** + * The preload library calls SYS_rrcall_init_preload during its + * initialization. + */ +#define SYS_rrcall_init_preload RR_CALL_BASE +/** + * The preload library calls SYS_rrcall_init_buffers in each thread that + * gets created (including the initial main thread). + */ +#define SYS_rrcall_init_buffers (RR_CALL_BASE + 1) +/** + * The preload library calls SYS_rrcall_notify_syscall_hook_exit when + * unlocking the syscallbuf and notify_after_syscall_hook_exit has been set. + * The word at 4/8(sp) is returned in the syscall result and the word at + * 8/16(sp) is stored in original_syscallno. + */ +#define SYS_rrcall_notify_syscall_hook_exit (RR_CALL_BASE + 2) +/** + * When the preload library detects that control data has been received in a + * syscallbuf'ed recvmsg, it calls this syscall with a pointer to the + * 'struct msg' returned. + */ +#define SYS_rrcall_notify_control_msg (RR_CALL_BASE + 3) +/** + * When rr replay has restored the auxv vectors for a new process (completing + * emulation of exec), it calls this syscall. It takes one parameter, the tid + * of the task that it has restored auxv vectors for. + */ +#define SYS_rrcall_reload_auxv (RR_CALL_BASE + 4) +/** + * When rr replay has flushed a syscallbuf 'mprotect' record, notify any outer + * rr of that flush. The first parameter is the tid of the task, the second + * parameter is the address, the third parameter is the length, and the + * fourth parameter is the prot. 
+ */
+#define SYS_rrcall_mprotect_record (RR_CALL_BASE + 5)
+/**
+ * The audit library calls SYS_rrcall_notify_stap_semaphore_added once a batch
+ * of SystemTap semaphores have been incremented. The first parameter is the
+ * beginning of an address interval containing semaphores (inclusive) and the
+ * second parameter is the end of the address interval (exclusive).
+ *
+ * In practice a particular probe may be listed in an object's notes more than
+ * once, so be prepared to handle overlapping or redundant intervals.
+ */
+#define SYS_rrcall_notify_stap_semaphore_added (RR_CALL_BASE + 6)
+/**
+ * The audit library calls SYS_rrcall_notify_stap_semaphore_removed once a
+ * batch of previously-incremented SystemTap semaphores have been decremented.
+ * The first parameter is the beginning of an address interval containing
+ * semaphores (inclusive) and the second parameter is the end of the address
+ * interval (exclusive).
+ *
+ * In practice a particular probe may be listed in an object's notes more than
+ * once, so be prepared to handle overlapping or redundant intervals.
+ */
+#define SYS_rrcall_notify_stap_semaphore_removed (RR_CALL_BASE + 7)
+/**
+ * This syscall can be used by the application being recorded to check for the
+ * presence of the rr recorder. It is used e.g. to enable nested recording of
+ * rr itself. Use of this syscall should be limited to situations where it is
+ * absolutely necessary to avoid deviation of behavior depending on the
+ * presence or absence of rr.
+ */
+#define SYS_rrcall_check_presence (RR_CALL_BASE + 8)
+/**
+ * Requests that rr detach from this process and re-create outside of its
+ * process tree, such that it may run without seccomp.
+ */
+#define SYS_rrcall_detach_teleport (RR_CALL_BASE + 9)
+/**
+ * Requests that rr reset the time slice signal to the
+ * requested period. Used for testing interaction corner
+ * cases between the time slice signal and other rr behavior.
+ */ +#define SYS_rrcall_arm_time_slice (RR_CALL_BASE + 10) +/** + * Use as + * + * int rr_freeze_tid(pid_t tid, int freeze) { + * return syscall(SYS_rrcall_freeze_tid, tid, freeze, 0, 0, 0, 0); } + * + * With `freeze=1`, requests that rr's Scheduler not schedule task `tid` again + * until unfrozen using `rr_freeze_tid(tid, 0)`. Note that kernel scheduling + * behavior is unaffected. Used for testing Scheduler-sensitive scenarios. + */ +#define SYS_rrcall_freeze_tid (RR_CALL_BASE + 11) +/** + * Requests a simulated (buffered) RDTSC. + * The RDTSC value is returned as a 64-bit value stored in the + * memory location given by the first argument. RAX returns 0. + */ +#define SYS_rrcall_rdtsc (RR_CALL_BASE + 12) \ No newline at end of file
diff --git a/rr/android/x86_64/share/rr/src/preload/syscall_hook.S b/rr/android/x86_64/share/rr/src/preload/syscall_hook.S new file mode 100644 index 0000000..45b4d98 --- /dev/null +++ b/rr/android/x86_64/share/rr/src/preload/syscall_hook.S
@@ -0,0 +1,1047 @@ +#if defined(__aarch64__) + .set preload_thread_locals,0x70010000 +#else + .set preload_thread_locals,0x70001000 +#endif + + .global _syscallbuf_code_start + .hidden _syscallbuf_code_start + + .global _syscallbuf_final_exit_instruction + .hidden _syscallbuf_final_exit_instruction + .type _syscallbuf_final_exit_instruction, @function + +#define DW_OP_CONST4U(val) \ + 0x0c, /* DW_OP_const4u */ \ + /* Individually place bytes */ \ + (val) & 0xFF, \ + ((val) & (0xFF << 0x8)) >> 0x8, \ + ((val) & (0xFF << 0x10)) >> 0x10, \ + ((val) & (0xFF << 0x18)) >> 0x18 + +#define DW_OP_CONST8U(val) \ + 0x0e, /* DW_OP_const8u */ \ + /* Individually place bytes */ \ + (val) & 0xFF, \ + ((val) & (0xFF << 0x8)) >> 0x8, \ + ((val) & (0xFF << 0x10)) >> 0x10, \ + ((val) & (0xFF << 0x18)) >> 0x18, \ + ((val) & (0xFF << 0x20)) >> 0x20, \ + ((val) & (0xFF << 0x28)) >> 0x28, \ + ((val) & (0xFF << 0x30)) >> 0x30, \ + ((val) & (0xFF << 0x38)) >> 0x38 + +#define REG_AT_ADDR32(reg, addr) \ + .cfi_escape 0x10, /* DW_CFA_expression */ \ + reg, \ + 0x05, /* 5 byte expression follows */ \ + DW_OP_CONST4U(addr) +#define REG_AT_ADDR64(reg, addr) \ + .cfi_escape 0x10, /* DW_CFA_expression */ \ + reg, \ + 0x09, /* 9 byte expression follows */ \ + DW_OP_CONST8U(addr) + +// 10 bytes LEB128 is enough to encode 64bit integer and we shouldn't +// really need anything longer than that. +#define COUNT_LEB128(lebs...) \ + _COUNT_LEB128(lebs, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1) +#define _COUNT_LEB128(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N + +#define REG_AT_REG_OFFSET(reg, base, lebs...) 
\ + .cfi_escape 0x10, /* DW_CFA_expression */ \ + reg, \ + (COUNT_LEB128(lebs) + 1), /* 1 byte + LEB128 bytes */ \ + (0x70 + base), /* DW_OP_breg0 + base */ \ + lebs + +#if defined(__i386__) +.text +.set syscallbuf_stub_alt_stack, preload_thread_locals +.set stub_scratch_1, preload_thread_locals + 8 +.set alt_stack_nesting_level, preload_thread_locals + 12 +.set saved_flags, preload_thread_locals + 16 + +.p2align 4 + +_syscallbuf_code_start: +/* Insert a NOP here so we have no symbol clashes. Otherwise + in some configurations (gdb 7.7.1, Ubuntu 14.04) gdb sometimes gets confused. + */ + nop + + +_syscallbuf_final_exit_instruction: + jmp *(stub_scratch_1) + +_syscall_hook_trampoline: + .cfi_startproc + /* Build a |struct syscall_info| by pushing all the syscall + * args and the number onto the stack. */ + /* struct syscall_info info; */ + pushl %ebp /* info.args[5] = $ebp; */ + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset %ebp, 0 + pushl %edi /* info.args[4] = $edi; */ + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset %edi, 0 + pushl %esi /* info.args[3] = $esi; */ + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset %esi, 0 + pushl %edx /* info.args[2] = $edx; */ + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset %edx, 0 + pushl %ecx /* info.args[1] = $ecx; */ + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset %ecx, 0 + pushl %ebx /* info.args[0] = $ebx; */ + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset %ebx, 0 + pushl %eax /* info.no = $eax; */ + .cfi_adjust_cfa_offset 4 + + /* $esp points at &info. Push that pointer on the stack as + * our arg for vsyscall_hook(). + * Use %ebp as our temporary CFA register here. Don't use %ebx or + * any other GP register, since x86-64 gdb 7.7 (at least) treats all GP + * regs other than %esp/%ebp as *signed* and sign-extends their values. + * Having some CFA values sign-extended and others not breaks gdb + * stack walking. 
+ */ + movl %esp, %ebp + .cfi_def_cfa_register %ebp + + /* Align stack to 16 bytes */ + and $0xfffffff0,%esp + + /* Save XMM registers */ + sub $0x80,%esp + movdqa %xmm0,(%esp) + movdqa %xmm1,0x10(%esp) + movdqa %xmm2,0x20(%esp) + movdqa %xmm3,0x30(%esp) + movdqa %xmm4,0x40(%esp) + movdqa %xmm5,0x50(%esp) + movdqa %xmm6,0x60(%esp) + movdqa %xmm7,0x70(%esp) + + sub $12,%esp + pushl %ebp + + call syscall_hook + /* $eax = vsyscall_hook(&info); */ + + movdqa 0x10(%esp),%xmm0 + movdqa 0x20(%esp),%xmm1 + movdqa 0x30(%esp),%xmm2 + movdqa 0x40(%esp),%xmm3 + movdqa 0x50(%esp),%xmm4 + movdqa 0x60(%esp),%xmm5 + movdqa 0x70(%esp),%xmm6 + movdqa 0x80(%esp),%xmm7 + + mov $saved_flags, %esp + popfw + /* From here on, non-application flag changes are not allowed */ + + /* Restore ESP */ + mov %ebp, %esp + .cfi_def_cfa_register %esp + + /* $eax is now the syscall return value. Erase |info.no| from the + * stack so that we can restore the other registers we saved. */ + lea 4(%esp),%esp + .cfi_adjust_cfa_offset -4 + + /* Contract of __kernel_vsyscall() and real syscalls is that even + * callee-save registers aren't touched, so we restore everything + * here. 
*/ + popl %ebx + .cfi_adjust_cfa_offset -4 + .cfi_restore %ebx + popl %ecx + .cfi_adjust_cfa_offset -4 + .cfi_restore %ecx + popl %edx + .cfi_adjust_cfa_offset -4 + .cfi_restore %edx + popl %esi + .cfi_adjust_cfa_offset -4 + .cfi_restore %esi + popl %edi + .cfi_adjust_cfa_offset -4 + .cfi_restore %edi + mov (alt_stack_nesting_level),%ebp + lea -1(%ebp),%ebp + mov %ebp,(alt_stack_nesting_level) + popl %ebp + .cfi_adjust_cfa_offset -4 + .cfi_restore %ebp + + ret + .cfi_endproc + .size _syscall_hook_trampoline, .-_syscall_hook_trampoline + +#define SYSCALLHOOK_START(name) \ + .global name; \ + .hidden name; \ + .type name, @function; \ +name: \ + .cfi_startproc; \ + .cfi_def_cfa_offset 0; \ + .cfi_offset %eip, 0; \ + .cfi_offset %esp, 4 + +#define SYSCALLHOOK_END(name) \ + pop (stub_scratch_1); \ + .cfi_adjust_cfa_offset -4; \ + pop %esp; \ + .cfi_same_value %esp; \ + REG_AT_ADDR32(0x08 /* %eip */, stub_scratch_1); \ + jmp _syscallbuf_final_exit_instruction; \ + .cfi_endproc; \ + .size name, .-name + +SYSCALLHOOK_START(_syscall_hook_trampoline_3d_01_f0_ff_ff) + call _syscall_hook_trampoline + cmpl $0xfffff001,%eax +SYSCALLHOOK_END(_syscall_hook_trampoline_3d_01_f0_ff_ff) + +SYSCALLHOOK_START(_syscall_hook_trampoline_90_90_90) + call _syscall_hook_trampoline +SYSCALLHOOK_END(_syscall_hook_trampoline_90_90_90) + +/* Declare gcc get_pc thunks here so they're in a known region of code */ + + .global _get_pc_thunks_start + .hidden _get_pc_thunks_start +_get_pc_thunks_start: + +#define THUNK(name, reg) \ + .section .text.__x86.get_pc_thunk.name,"axG",@progbits,__x86.get_pc_thunk.name,comdat; \ + .global __x86.get_pc_thunk.name; \ + .hidden __x86.get_pc_thunk.name; \ + .type __x86.get_pc_thunk.name, @function; \ +__x86.get_pc_thunk.name: \ + .cfi_startproc; \ + movl (%esp), %reg; \ + ret; \ + .cfi_endproc + +THUNK(ax, eax) +THUNK(bx, ebx) +THUNK(cx, ecx) +THUNK(dx, edx) +THUNK(si, esi) +THUNK(di, edi) +THUNK(bp, ebp) + + .global _get_pc_thunks_end + .hidden 
_get_pc_thunks_end +_get_pc_thunks_end: + +#elif defined(__x86_64__) + .text + + .set stub_scratch_1, preload_thread_locals + 16 + .set alt_stack_nesting_level, preload_thread_locals + 24 + .set saved_flags, preload_thread_locals + 28 + + .p2align 4 +_syscallbuf_code_start: + +_syscall_hook_trampoline: + .cfi_startproc + /* Save RBX because we need a callee-saves register */ + pushq %rbx + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %rbx, 0 + + /* Build a |struct syscall_info| on the stack by pushing the arguments + and syscall number. */ + pushq %r9 + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %r9, 0 + pushq %r8 + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %r8, 0 + pushq %r10 + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %r10, 0 + pushq %rdx + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %rdx, 0 + pushq %rsi + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %rsi, 0 + pushq %rdi + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %rdi, 0 + pushq %rax + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset %rax, 0 + + /* Align stack */ + mov %rsp,%rbx + .cfi_def_cfa_register %rbx + and $0xfffffffffffffff0,%rsp + + /* Save XMM registers */ + sub $0x80,%rsp + movdqa %xmm0,(%rsp) + movdqa %xmm1,0x10(%rsp) + movdqa %xmm2,0x20(%rsp) + movdqa %xmm3,0x30(%rsp) + movdqa %xmm4,0x40(%rsp) + movdqa %xmm5,0x50(%rsp) + movdqa %xmm6,0x60(%rsp) + movdqa %xmm7,0x70(%rsp) + + /* Save registers that aren't callee-saves preserved by syscall_hook, + and that we aren't already restoring from the syscall args */ + push %rcx + push %r11 + /* stack is 16-byte aligned again for entry to C */ + + /* Call our hook. 
*/ + mov %rbx,%rdi + callq syscall_hook + + pop %r11 + pop %rcx + + /* Restore XMM registers */ + movdqa (%rsp),%xmm0 + movdqa 0x10(%rsp),%xmm1 + movdqa 0x20(%rsp),%xmm2 + movdqa 0x30(%rsp),%xmm3 + movdqa 0x40(%rsp),%xmm4 + movdqa 0x50(%rsp),%xmm5 + movdqa 0x60(%rsp),%xmm6 + movdqa 0x70(%rsp),%xmm7 + + mov $saved_flags, %rsp + popfw + /* From here on, non-application flag changes are not allowed */ + + mov %rbx,%rsp + .cfi_def_cfa_register %rsp + + /* On entrance, we pushed the %rax, the syscall number. But we don't + want to |pop %rax|, as that will overwrite our return value. Skip over it. */ + pop %rdi + .cfi_adjust_cfa_offset -8 + + /* We don't really *need* to restore these, since the kernel could have + trashed them all anyway. But it seems reasonable to do so. */ + pop %rdi + .cfi_adjust_cfa_offset -8 + .cfi_restore %rdi + pop %rsi + .cfi_adjust_cfa_offset -8 + .cfi_restore %rsi + pop %rdx + .cfi_adjust_cfa_offset -8 + .cfi_restore %rdx + pop %r10 + .cfi_adjust_cfa_offset -8 + .cfi_restore %r10 + pop %r8 + .cfi_adjust_cfa_offset -8 + .cfi_restore %r8 + mov (alt_stack_nesting_level),%r9d + lea -1(%r9),%r9 + mov %r9d,(alt_stack_nesting_level) + pop %r9 + .cfi_adjust_cfa_offset -8 + .cfi_restore %r9 + + pop %rbx + .cfi_adjust_cfa_offset -8 + .cfi_restore %rbx + + /* ...and we're done. */ + ret + .cfi_endproc + .size _syscall_hook_trampoline, . - _syscall_hook_trampoline + +_syscallbuf_final_exit_instruction: + jmp *(stub_scratch_1) + +/** + * Ok, bear with me here. When gdb sees our stack switch, it gets suspicious and if + * we're unlucky may decide that our unwind info is broken and abort the unwind. However, + * it decides to allow the unwind to proceed anyway if we happen to be in a function called + * __morestack (because that's what gcc calls its stack switching mechanism). Now, + * GDB does the stack switching comparison based on the CFA. What we thus need to do is keep the + * CFA pointing to the old stack until we get to a function named __morestack. 
We set the CFA for every + * syscallhook to what it will be at the end of the function (which, well, is an ok definition + * of the CFA). Then, we insert a __morestack function (still with the old CFA) that just jumps + * through to the trampoline. This way, we can force gdb's stack switch detection to think the + * stack switch happens between the hook and the common trampoline code (and add a __morestack + * local symbol to the trampoline code to avoid GDB messing with our stack trace). + */ +#define CFA_AT_RSP_OFFSET(offset) \ +.cfi_escape 0x0f, /* DW_CFA_def_cfa_expression */\ + 0x03, /* 3 bytes follow */\ + 0x77, offset, /* DW_OP_breg7, offset */\ + 0x06; /* DW_OP_deref */ + +#define RSP_IS_CFA \ +.cfi_escape 0x16, /* DW_CFA_val_expression */\ + 0x7, /* %rsp */\ + 0; /* 0 bytes follow */ + +#define RSP_IS_CFA_PLUS_OFFSET(offset) \ +.cfi_escape 0x16, /* DW_CFA_val_expression */\ + 0x7, /* %rsp */\ + 2, /* 2 bytes follow */\ + 0x23, /* DW_OP_plus_uconst */\ + offset; + +#define RSP_IS_RSP_PLUS_OFFSET(offset) \ +.cfi_escape 0x16, /* DW_CFA_val_expression */\ + 0x07, /* %rsp */\ + 0x02, /* 2 bytes follow */\ + 0x77, offset; /* DW_OP_breg7, offset */ + +#define RIP_IS_DEREF_RSP(offset) REG_AT_REG_OFFSET(0x10 /* %rip */, 7, offset) + +/** + * On syscallhook entry, the stack has been switched to the end of per-task + * scratch space, then the old RSP and the return address have been pushed. 
+ */ +#define SYSCALLHOOK_START(name) \ + .global name; \ + .hidden name; \ + .type name, @function; \ +name: \ + .cfi_startproc; \ + CFA_AT_RSP_OFFSET(8) \ + RSP_IS_CFA \ + RIP_IS_DEREF_RSP(0) + +#define SYSCALLHOOK_END(name) \ + pop (stub_scratch_1); \ + CFA_AT_RSP_OFFSET(0) \ + REG_AT_ADDR32(0x10 /* %rip */, stub_scratch_1); \ + pop %rsp; \ + .cfi_def_cfa %rsp, 0; \ + jmp _syscallbuf_final_exit_instruction; \ + .cfi_endproc; \ + .size name, .-name + +/* See note above on what __morestack is for */ +.global __morestack +.hidden __morestack +.type __morestack, @function +__morestack: +.cfi_startproc +CFA_AT_RSP_OFFSET(16) +RSP_IS_RSP_PLUS_OFFSET(8) +RIP_IS_DEREF_RSP(0) +callq _syscall_hook_trampoline +/* GDB likes to override valid CFI with its own heuristics if the current + instruction is a retq. This becomes a problem here, because GDB will set + a breakpoint at the next instruction after the callq when continuing out of + `_syscall_hook_trampoline`. This `nop` makes said instruction not a retq, + thus preventing that GDB heuristic from kicking in and letting GDB realize + that it did in fact manage to step out of the `_syscall_hook_trampoline` + frame. 
*/ +nop +retq +.cfi_endproc +.size __morestack, .-__morestack + +SYSCALLHOOK_START(_syscall_hook_trampoline_48_3d_01_f0_ff_ff) + callq __morestack + cmpq $0xfffffffffffff001,%rax +SYSCALLHOOK_END(_syscall_hook_trampoline_48_3d_01_f0_ff_ff) + +SYSCALLHOOK_START(_syscall_hook_trampoline_48_3d_00_f0_ff_ff) + callq __morestack + cmpq $0xfffffffffffff000,%rax +SYSCALLHOOK_END(_syscall_hook_trampoline_48_3d_00_f0_ff_ff) + +SYSCALLHOOK_START(_syscall_hook_trampoline_3d_00_f0_ff_ff) + callq __morestack + cmpl $0xfffff000,%eax +SYSCALLHOOK_END(_syscall_hook_trampoline_3d_00_f0_ff_ff) + +SYSCALLHOOK_START(_syscall_hook_trampoline_48_89_45_f8) + callq __morestack + mov %rax,-8(%rbp) +SYSCALLHOOK_END(_syscall_hook_trampoline_48_89_45_f8) + +SYSCALLHOOK_START(_syscall_hook_trampoline_48_89_c3) + callq __morestack + mov %rax,%rbx +SYSCALLHOOK_END(_syscall_hook_trampoline_48_89_c3) + +SYSCALLHOOK_START(_syscall_hook_trampoline_48_8b_3c_24) + callq __morestack + /* The original instruction after the syscall is movq (%rsp),%rdi. */ + movq 8(%rsp),%rdi + movq (%rdi),%rdi +SYSCALLHOOK_END(_syscall_hook_trampoline_48_8b_3c_24) + +SYSCALLHOOK_START(_syscall_hook_trampoline_5a_5e_c3) + .cfi_offset %rip, 16 + RSP_IS_CFA_PLUS_OFFSET(24) + callq __morestack + /* The original instructions after the syscall are + pop %rdx; pop %rsi; retq. */ + /* We're not returning to the dynamically generated stub, so + we need to fix the stack pointer ourselves. 
*/ + pop %rdx + CFA_AT_RSP_OFFSET(0) + pop %rsp + .cfi_def_cfa %rsp, 0; + pop %rdx + .cfi_adjust_cfa_offset -8 + pop %rsi + .cfi_adjust_cfa_offset -8 + pop (stub_scratch_1) + .cfi_adjust_cfa_offset -8 + jmp _syscallbuf_final_exit_instruction + + .cfi_endproc + .size _syscall_hook_trampoline_5a_5e_c3, .-_syscall_hook_trampoline_5a_5e_c3 + +SYSCALLHOOK_START(_syscall_hook_trampoline_89_c2_f7_da) + call __morestack + mov %eax,%edx + neg %edx +SYSCALLHOOK_END(_syscall_hook_trampoline_89_c2_f7_da) + +SYSCALLHOOK_START(_syscall_hook_trampoline_90_90_90) + call __morestack +SYSCALLHOOK_END(_syscall_hook_trampoline_90_90_90) + +SYSCALLHOOK_START(_syscall_hook_trampoline_ba_01_00_00_00) + call __morestack + mov $1,%edx +SYSCALLHOOK_END(_syscall_hook_trampoline_ba_01_00_00_00) + +SYSCALLHOOK_START(_syscall_hook_trampoline_89_c1_31_d2) + call __morestack + mov %eax,%ecx + xor %edx,%edx +SYSCALLHOOK_END(_syscall_hook_trampoline_89_c1_31_d2) + +SYSCALLHOOK_START(_syscall_hook_trampoline_c3_nop) + .cfi_offset %rip, 16 + RSP_IS_CFA_PLUS_OFFSET(24) + callq __morestack + /* The original instructions after the syscall are + retq; nopl 0x0(%rax,%rax,1) */ + /* We're not returning to the dynamically generated stub, so + we need to fix the stack pointer ourselves. 
*/ + pop %rdx + CFA_AT_RSP_OFFSET(0) + pop %rsp + .cfi_def_cfa %rsp, 0; + pop (stub_scratch_1) + .cfi_adjust_cfa_offset -8 + jmp _syscallbuf_final_exit_instruction + + .cfi_endproc + .size _syscall_hook_trampoline_c3_nop, .-_syscall_hook_trampoline_c3_nop + +SYSCALLHOOK_START(_syscall_hook_trampoline_40_80_f6_81) + xor $0x81, %sil + call __morestack +SYSCALLHOOK_END(_syscall_hook_trampoline_40_80_f6_81) + +SYSCALLHOOK_START(_syscall_hook_trampoline_48_8d_b3_f0_08_00_00) + lea 0x8f0(%rbx),%rsi + callq __morestack +SYSCALLHOOK_END(_syscall_hook_trampoline_48_8d_b3_f0_08_00_00) + +SYSCALLHOOK_START(_syscall_hook_trampoline_49_89_ca) + mov %rcx, %r10 + call __morestack +SYSCALLHOOK_END(_syscall_hook_trampoline_49_89_ca) + +SYSCALLHOOK_START(_syscall_hook_trampoline_48_89_c1) + callq __morestack + mov %rax, %rcx +SYSCALLHOOK_END(_syscall_hook_trampoline_48_89_c1) + +#define MOV_RDX_VARIANTS \ + MOV_RDX_TO_REG(48, d0) \ + MOV_RDX_TO_REG(48, d1) \ + MOV_RDX_TO_REG(48, d2) \ + MOV_RDX_TO_REG(48, d3) \ + MOV_RDX_TO_REG(48, d4) \ + MOV_RDX_TO_REG(48, d5) \ + MOV_RDX_TO_REG(48, d6) \ + MOV_RDX_TO_REG(48, d7) \ + MOV_RDX_TO_REG(49, d0) \ + MOV_RDX_TO_REG(49, d1) \ + MOV_RDX_TO_REG(49, d2) \ + MOV_RDX_TO_REG(49, d3) \ + MOV_RDX_TO_REG(49, d4) \ + MOV_RDX_TO_REG(49, d5) \ + MOV_RDX_TO_REG(49, d6) \ + MOV_RDX_TO_REG(49, d7) + +#define MOV_RDX_TO_REG(rex, op) \ +SYSCALLHOOK_START(_syscall_hook_trampoline_##rex##_89_##op); \ + callq __morestack; \ + .byte 0x##rex, 0x89, 0x##op; \ +SYSCALLHOOK_END(_syscall_hook_trampoline_##rex##_89_##op); + + MOV_RDX_VARIANTS + +SYSCALLHOOK_START(_syscall_hook_trampoline_48_c1_e2_20) + callq __morestack + shl $32, %rdx +SYSCALLHOOK_END(_syscall_hook_trampoline_48_c1_e2_20) + +SYSCALLHOOK_START(_syscall_hook_trampoline_49_8b_44_24_28) + callq __morestack + mov 0x28(%r12),%rax +SYSCALLHOOK_END(_syscall_hook_trampoline_49_8b_44_24_28) + +SYSCALLHOOK_START(_syscall_hook_trampoline_4c_89_f7) + mov %r14, %rdi + callq __morestack 
+SYSCALLHOOK_END(_syscall_hook_trampoline_4c_89_f7) + +SYSCALLHOOK_START(_syscall_hook_trampoline_4c_89_ff) + mov %r15, %rdi + callq __morestack +SYSCALLHOOK_END(_syscall_hook_trampoline_4c_89_ff) + +SYSCALLHOOK_START(_syscall_hook_trampoline_49_c7_c1_ff_ff_ff_ff) + mov $0xffffffffffffffff,%r9 + callq __morestack +SYSCALLHOOK_END(_syscall_hook_trampoline_49_c7_c1_ff_ff_ff_ff) + +SYSCALLHOOK_START(_syscall_hook_trampoline_b8_0e_00_00_00) + mov $0x0e,%eax + callq __morestack +SYSCALLHOOK_END(_syscall_hook_trampoline_b8_0e_00_00_00) + +SYSCALLHOOK_START(_syscall_hook_trampoline_b8_11_01_00_00) + mov $0x111,%eax + callq __morestack +SYSCALLHOOK_END(_syscall_hook_trampoline_b8_11_01_00_00) + +SYSCALLHOOK_START(_syscall_hook_trampoline_b8_ca_00_00_00) + mov $0xca,%eax + callq __morestack +SYSCALLHOOK_END(_syscall_hook_trampoline_b8_ca_00_00_00) + +SYSCALLHOOK_START(_syscall_hook_trampoline_be_18_00_00_00) + mov $0x18,%esi + callq __morestack +SYSCALLHOOK_END(_syscall_hook_trampoline_be_18_00_00_00) + +SYSCALLHOOK_START(_syscall_hook_trampoline_48_89_e5) + /* Previous RSP is stored on the stack above our return address */ + mov 8(%rsp),%rbp + callq __morestack +SYSCALLHOOK_END(_syscall_hook_trampoline_48_89_e5) + +SYSCALLHOOK_START(_syscall_hook_trampoline_48_89_fb) + mov %rdi,%rbx + callq __morestack +SYSCALLHOOK_END(_syscall_hook_trampoline_48_89_fb) + +SYSCALLHOOK_START(_syscall_hook_trampoline_nops) + callq __morestack +SYSCALLHOOK_END(_syscall_hook_trampoline_nops) + +#elif defined(__aarch64__) + .text + + .set syscallbuf_stub_alt_stack, preload_thread_locals + .set stub_scratch_1, preload_thread_locals + 16 + .set alt_stack_nesting_level, preload_thread_locals + 24 + .set stub_scratch_2, preload_thread_locals + 8 * 13 + +// Store a pair of x registers to stack at offset sp. 
+// Assuming that CFA register is sp +#define STPX_STACK(r1, r2, offset) \ + stp x##r1, x##r2, [sp, offset]; \ + .cfi_rel_offset x##r1, offset; \ + .cfi_rel_offset x##r2, offset + 8 + +#define LDPX_STACK(r1, r2, offset) \ + ldp x##r1, x##r2, [sp, offset]; \ + .cfi_same_value x##r1; \ + .cfi_same_value x##r2 + +// Store a pair of q registers to stack at offset sp. +// Assuming that CFA register is sp +#define STPQ_STACK(r1, r2, offset) \ + stp q##r1, q##r2, [sp, offset]; \ + .cfi_rel_offset q##r1, offset; \ + .cfi_rel_offset q##r2, offset + 16 + +#define LDPQ_STACK(r1, r2, offset) \ + ldp q##r1, q##r2, [sp, offset]; \ + .cfi_same_value q##r1; \ + .cfi_same_value q##r2 + +// Mark all temporary registers as same_value except x8 and x15 +#define SAME_VALUE_X_NO8_NO15 \ + .cfi_same_value x0; \ + .cfi_same_value x1; \ + .cfi_same_value x2; \ + .cfi_same_value x3; \ + .cfi_same_value x4; \ + .cfi_same_value x5; \ + .cfi_same_value x6; \ + .cfi_same_value x7; \ + .cfi_same_value x9; \ + .cfi_same_value x10; \ + .cfi_same_value x11; \ + .cfi_same_value x12; \ + .cfi_same_value x13; \ + .cfi_same_value x14; \ + .cfi_same_value x16; \ + .cfi_same_value x17; \ + .cfi_same_value x18 + +#define SAME_VALUE_X \ + SAME_VALUE_X_NO8_NO15; \ + .cfi_same_value x8; \ + .cfi_same_value x15 + +#define SAME_VALUE_ALL_Q \ + .cfi_same_value q0; \ + .cfi_same_value q1; \ + .cfi_same_value q2; \ + .cfi_same_value q3; \ + .cfi_same_value q4; \ + .cfi_same_value q5; \ + .cfi_same_value q6; \ + .cfi_same_value q7; \ + .cfi_same_value q8; \ + .cfi_same_value q9; \ + .cfi_same_value q10; \ + .cfi_same_value q11; \ + .cfi_same_value q12; \ + .cfi_same_value q13; \ + .cfi_same_value q14; \ + .cfi_same_value q15; \ + .cfi_same_value q16; \ + .cfi_same_value q17; \ + .cfi_same_value q18; \ + .cfi_same_value q19; \ + .cfi_same_value q20; \ + .cfi_same_value q21; \ + .cfi_same_value q22; \ + .cfi_same_value q23; \ + .cfi_same_value q24; \ + .cfi_same_value q25; \ + .cfi_same_value q26; \ + 
.cfi_same_value q27; \ + .cfi_same_value q28; \ + .cfi_same_value q29; \ + .cfi_same_value q30; \ + .cfi_same_value q31 + + .p2align 4 +_syscallbuf_code_start: + +_syscall_hook_trampoline: + // stack frame: + // 208-688: q2 - q31 + // 128-200: x10 - x18 + // 112-128: x7, x9 + // 104-112: x6 + // 48-104: syscall_info + // 32-48: x29, x30 + // 0-32: q0, q1 + .cfi_startproc + // GAS correctly put these in CIE as long as they + // appears right after .cfi_startproc + SAME_VALUE_X + SAME_VALUE_ALL_Q + // Store the vector registers at the bottom so that we can take advantage of + // the larger pre-offset that can be encoded in the instruction + // to adjust the stack pointer. + stp q0, q1, [sp, -688]! + .cfi_def_cfa_offset 688 + .cfi_rel_offset q0, 0 + .cfi_rel_offset q1, 0 + 16 + STPX_STACK(29, 30, 32) + /* Build a |struct syscall_info| on the stack by pushing the arguments + and syscall number. */ + STPX_STACK(8, 0, 48) + add x0, sp, 48 // x0 saved, store new argument for syscall_hook in x0. + STPX_STACK(1, 2, 64) + STPX_STACK(3, 4, 80) + STPX_STACK(5, 6, 96) + STPX_STACK(7, 9, 112) + STPX_STACK(10, 11, 128) + STPX_STACK(12, 13, 144) + STPX_STACK(14, 15, 160) + STPX_STACK(16, 17, 176) + str x18, [sp, 192] + .cfi_rel_offset x18, 192 + STPQ_STACK(2, 3, 208) + STPQ_STACK(4, 5, 240) + STPQ_STACK(6, 7, 272) + // function call only maintain the bottom half of v8-v15 + // whereas syscall maintains all the v registers + // so we actually need to save and restore v8-v15 as well... + // (in principle we could save only the upper half but + // that's too much effort especially for the unwind info...) 
+ STPQ_STACK(8, 9, 304) + STPQ_STACK(10, 11, 336) + STPQ_STACK(12, 13, 368) + STPQ_STACK(14, 15, 400) + STPQ_STACK(16, 17, 432) + STPQ_STACK(18, 19, 464) + STPQ_STACK(20, 21, 496) + STPQ_STACK(22, 23, 528) + STPQ_STACK(24, 25, 560) + STPQ_STACK(26, 27, 592) + STPQ_STACK(28, 29, 624) + STPQ_STACK(30, 31, 656) + + bl syscall_hook + + movz x29, #:abs_g1:alt_stack_nesting_level // assume 32bit address + movk x29, #:abs_g0_nc:alt_stack_nesting_level + ldr w30, [x29] + sub w30, w30, 1 + str w30, [x29] + + ldp x29, x30, [sp, 32] + .cfi_same_value x29 + // x30 should not use same_value since it's value is changed + // by the function call instruction + .cfi_restore x30 + ldr x8, [sp, 48] + .cfi_same_value x8 + LDPX_STACK(1, 2, 64) + LDPX_STACK(3, 4, 80) + LDPX_STACK(5, 6, 96) + LDPX_STACK(7, 9, 112) + LDPX_STACK(10, 11, 128) + LDPX_STACK(14, 15, 160) + LDPX_STACK(16, 17, 176) + ldr x18, [sp, 192] + .cfi_same_value x18 + + LDPQ_STACK(2, 3, 208) + LDPQ_STACK(4, 5, 240) + LDPQ_STACK(6, 7, 272) + LDPQ_STACK(8, 9, 304) + LDPQ_STACK(10, 11, 336) + LDPQ_STACK(12, 13, 368) + LDPQ_STACK(14, 15, 400) + LDPQ_STACK(16, 17, 432) + LDPQ_STACK(18, 19, 464) + LDPQ_STACK(20, 21, 496) + LDPQ_STACK(22, 23, 528) + LDPQ_STACK(24, 25, 560) + LDPQ_STACK(26, 27, 592) + LDPQ_STACK(28, 29, 624) + LDPQ_STACK(30, 31, 656) + + ldp q0, q1, [sp], 688 + .cfi_same_value q0 + .cfi_same_value q1 + .cfi_def_cfa_offset 0 + ret + .cfi_endproc + .size _syscall_hook_trampoline, .-_syscall_hook_trampoline + +/** + * On syscallhook entry, we are still on the old stack, + * with x30 (lr) points to right after the blr instruction that got us here. + * The old values of x15 and x30 are saved to [x8], which is the syscall number + * with an offset to land in the stub_scratch_2 area. 
+ */ + .globl _syscall_hook_trampoline_raw + .hidden _syscall_hook_trampoline_raw + .type _syscall_hook_trampoline_raw, @function +_syscall_hook_trampoline_raw: + .cfi_startproc + // GAS correctly put these in CIE as long as they + // appears right after .cfi_startproc + .cfi_return_column 32 // pc + SAME_VALUE_X_NO8_NO15 + SAME_VALUE_ALL_Q + // We define CFA as the value of the stack pointer when we enter this function + // as specified in aadwarf64. + // Since we aren't using the caller stack, none of the registers + // we save will be in the CFA... + .cfi_def_cfa sp, 0 + REG_AT_REG_OFFSET(0x20 /* pc */, 30, 16) + REG_AT_REG_OFFSET(0x0f /* x15 */, 8, + (stub_scratch_2 - preload_thread_locals) | 0x80, 0) + REG_AT_REG_OFFSET(0x1e /* x30 */, 8, + (stub_scratch_2 - preload_thread_locals + 8) | 0x80, 0) + // x8 = x8 - preload_thread_locals + // The last byte of the signed number LEB128 contains the top 4 bits + // from the 32bit negative number (obtained using the shifted 0xF mask) + // and 3 bits of leading ones above it (the `or`ing of the `0x70`). + // The top bit of the byte is 0 signaling the end of the LEB128 encoding. + .cfi_escape 0x16, /* DW_CFA_val_expression */ \ + 0x08, /* x8 */ \ + 0x06, /* length 6 */ \ + 0x78, /* DW_OP_breg8 */ \ + ((-preload_thread_locals) & 0x7F) | 0x80, \ + ((-preload_thread_locals) & (0x7F << 7)) >> 7 | 0x80, \ + ((-preload_thread_locals) & (0x7F << 14)) >> 14 | 0x80, \ + ((-preload_thread_locals) & (0x7F << 21)) >> 21 | 0x80, \ + ((-preload_thread_locals) & ( 0xF << 28)) >> 28 | 0x70 + // old gcc version doesn't want to encode bti + // unless we specify armv8.5-a even though this was in the nop space. 
+ .inst 0xd503245f // bti c + mov x15, preload_thread_locals + // Stash away x30 so that we can have two registers to use again + // we can't use stub_scratch_2 since we might overwrite the data there + str x30, [x15, stub_scratch_1 - preload_thread_locals] + .cfi_escape 0x10, /* DW_CFA_expression */ \ + 0x20, /* pc */ \ + 0x08, /* length 8 */ \ + DW_OP_CONST4U(stub_scratch_1), \ + 0x06, /* DW_OP_deref */ \ + 0x23, /* DW_OP_plus_uconst */ \ + 16 + // Move the register stash region from + // `x8 + stub_scratch_2 - preload_thread_locals` + // (i.e. `stub_scratch_2 + original_x8`) to the start of `stub_scratch_2` + // Do it in the forward order since we know x8 >= stub_scratch_2 + ldr x30, [x8, stub_scratch_2 - preload_thread_locals] + str x30, [x15, stub_scratch_2 - preload_thread_locals] + ldr x30, [x8, stub_scratch_2 - preload_thread_locals + 8] + str x30, [x15, stub_scratch_2 - preload_thread_locals + 8] + // Restore x8 + movk x8, 0, lsl 16 + .cfi_same_value x8 + REG_AT_ADDR32(0x0f /* x15 */, stub_scratch_2) + REG_AT_ADDR32(0x1e /* x30 */, stub_scratch_2 + 8) + + cmp x8, 0xdc // SYS_clone + .cfi_remember_state + b.eq .Lfallback_rawsyscall + + ldr w30, [x15, alt_stack_nesting_level - preload_thread_locals] + cmp w30, 0 + add w30, w30, 1 + str w30, [x15, alt_stack_nesting_level - preload_thread_locals] + + b.ne .Lnest_syscall_hook_trampoline_raw + ldr x30, [x15, syscallbuf_stub_alt_stack - preload_thread_locals] + sub x30, x30, 48 + b .Lstackset_syscall_hook_trampoline_raw +.Lnest_syscall_hook_trampoline_raw: + sub x30, sp, 48 +.Lstackset_syscall_hook_trampoline_raw: + // Now x30 points to the new stack with 48 bytes of space allocated + + // Move sp into a normal register. Otherwise we can't store it + mov x15, sp + // Save sp to new stack. 
+ str x15, [x30, 16] + mov sp, x30 + REG_AT_REG_OFFSET(0x1f /* sp */, 31, 16) + .cfi_escape 0x0f, /* DW_CFA_def_cfa_expression */ \ + 0x03, /* 3 bytes follow */ \ + 0x8f, /* DW_OP_breg31 */ \ + 16, \ + 0x06 /* DW_OP_deref */ + // sp is switched, x15 and x30 are free to use + // [stub_scratch_1] holds the stub address + + // Now we need to construct the stack frame, with everything + // in the scratch area copied over so that we can nest again. + mov x15, preload_thread_locals + // load runtime stub address + ldr x30, [x15, stub_scratch_1 - preload_thread_locals] + // save stub return address + str x30, [sp] + // load syscall return address + ldr x30, [x30, 16] + str x30, [sp, 8] + ldr x30, [x15, stub_scratch_2 - preload_thread_locals] + str x30, [sp, 24] + ldr x30, [x15, stub_scratch_2 - preload_thread_locals + 8] + str x30, [sp, 32] + + // stackframe layout + // 32: original x30 + // 24: original x15 + // 16: original sp + // 8: return address to syscall + // 0: return address to stub + REG_AT_REG_OFFSET(0x20 /* pc */, 31, 8) + REG_AT_REG_OFFSET(0x0f /* x15 */, 31, 24) + REG_AT_REG_OFFSET(0x1e /* x30 */, 31, 32) + + bl _syscall_hook_trampoline + +/** + * The _syscall_hook_trampoline restores all the registers to the previous values + * (minus the register for syscall return value) so we just need to restore + * the registers we’ve overwritten by the end of the stack switch, + * i.e. x15 , x30 and sp. + * x15 and x30 will be restored when we get back to the stub + * so we don’t need to restore them here but we do need to copy their values + * to stub_scratch_2 again so that the stub can restore them + * (since without a valid stack that is still the only memory + * we can use to restore things). + * We also need to store the return address to stub_scratch_1 + * since that’ll help rr with setting breakpoint. 
+ */ + + movz x15, #:abs_g1:stub_scratch_2 // assume 32bit address + movk x15, #:abs_g0_nc:stub_scratch_2 + ldr x30, [sp, 24] // x15 + str x30, [x15] + ldr x30, [sp, 32] // x30 + str x30, [x15, 8] + REG_AT_ADDR32(0x0f /* x15 */, stub_scratch_2) + REG_AT_ADDR32(0x1e /* x30 */, stub_scratch_2 + 8) + ldr x30, [sp, 8] // syscall return address + // tell rr breakpoint handling where we are going + str x30, [x15, stub_scratch_1 - stub_scratch_2] + REG_AT_ADDR32(0x20 /* pc */, stub_scratch_1) + ldr x30, [sp] // stub return address + ldr x15, [sp, 16] // sp + mov sp, x15 + .cfi_restore sp + .cfi_def_cfa sp, 0 + movz x15, #:abs_g1:stub_scratch_2 // assume 32bit address + movk x15, #:abs_g0_nc:stub_scratch_2 +_syscallbuf_final_exit_instruction: + ret + +.Lfallback_rawsyscall: + .cfi_restore_state + // Must not touch sp in this branch. + // Use x15 to remember the return address since we are only copying + // the first two elements of stub_scratch_2 for the child. + ldr x15, [x15, stub_scratch_1 - preload_thread_locals] + REG_AT_REG_OFFSET(0x20 /* pc */, 15, 16) + mov x30, 0x70000000 // RR_PAGE_SYSCALL_TRACED + blr x30 + // stub_scratch_2 content is maintained by rr + // we need to put the syscall return address in stub_scratch_1 + movz x30, #:abs_g1:stub_scratch_2 // assume 32bit address + movk x30, #:abs_g0_nc:stub_scratch_2 + str x15, [x30, 16] // stash away stub address + ldr x15, [x15, 16] // syscall return address + .cfi_register 32, x15 + str x15, [x30, stub_scratch_1 - stub_scratch_2] + REG_AT_ADDR32(0x20 /* pc */, stub_scratch_1) + mov x15, x30 + ldr x30, [x15, 16] + b _syscallbuf_final_exit_instruction + + .cfi_endproc + .size _syscall_hook_trampoline_raw, .-_syscall_hook_trampoline_raw + +#endif /* __aarch64__ */ + + .section .note.GNU-stack,"",@progbits
diff --git a/rr/android/x86_64/share/rr/src/preload/syscallbuf.c b/rr/android/x86_64/share/rr/src/preload/syscallbuf.c new file mode 100644 index 0000000..c201ba7 --- /dev/null +++ b/rr/android/x86_64/share/rr/src/preload/syscallbuf.c
@@ -0,0 +1,4327 @@ +/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */ + +#define RR_IMPLEMENT_PRELOAD + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE 1 +#endif +#include "syscallbuf.h" + +/** + * Buffer syscalls, so that rr can process the entire buffer with one + * trap instead of a trap per call. + * + * This file is compiled into a dso that's PRELOADed in recorded + * applications. The dso replaces libc syscall wrappers with our own + * implementation that saves nondeterministic outparams in a fixed-size + * buffer. When the buffer is full or the recorded application + * invokes an un-buffered syscall or receives a signal, we trap to rr + * and it records the state of the buffer. + * + * During replay, rr simply refills the buffer with the recorded data + * when it reaches the "flush-buffer" events that were recorded. Then + * rr emulates each buffered syscall, and the code here restores the + * client data from the refilled buffer. + * + * The crux of the implementation here is to selectively ptrace-trap + * syscalls. The normal (un-buffered) syscalls generate a ptrace + * trap, and the buffered syscalls trap directly to the kernel. This + * is implemented with a seccomp-bpf which examines the syscall and + * decides how to handle it (see seccomp-bpf.h and Task::spawn). + * + * Because this code runs in the tracee's address space and overrides + * system calls, the code is rather delicate. The following rules + * must be followed + * + * o No rr headers (other than seccomp-bpf.h and rr.h) may be included + * o All syscalls invoked by this code must be called directly, not + * through libc wrappers (which this file may itself indirectly override) + * + * The wrapper functions are named sys_xxxx. Each wrapper normally makes one + * untraced syscall or one traced syscall of the same type, but there are + * exceptions. For example sys_read can make a number of untraced syscalls + * instead of a single untraced syscall. 
A critical rule is that any traced + * or MAY_BLOCK untraced syscall *must* be the last syscall performed by the + * wrapper. + */ + +#include <dlfcn.h> +#include <limits.h> +#include <unistd.h> +#include <asm/errno.h> +#include <asm/ioctls.h> +#include <asm/poll.h> +#include <asm/signal.h> +#include <asm/siginfo.h> +#include <asm/stat.h> +#include <asm/statfs.h> +#include <linux/eventpoll.h> +#include <linux/futex.h> +#include <linux/fcntl.h> +#include <linux/if_packet.h> +#include <linux/ioctl.h> +#include <linux/mman.h> +#include <linux/net.h> +#include <linux/netlink.h> +#include <linux/perf_event.h> +#include <linux/ptrace.h> +#include <linux/quota.h> +#include <linux/resource.h> +#include <linux/stat.h> +#include <linux/socket.h> +#include <linux/stat.h> +#include <linux/time.h> +#include <linux/types.h> +#include <linux/uio.h> +#include <linux/un.h> +#include <linux/utsname.h> +#include <stdarg.h> +#include <stdio.h> +#include <syscall.h> +#include <sysexits.h> +#include <sys/mman.h> +#include <sys/prctl.h> +#include <unistd.h> + +#include "preload_interface.h" +#include "rr/rr.h" + +#ifndef SOL_NETLINK +#define SOL_NETLINK 270 +#endif + +#ifndef BTRFS_IOCTL_MAGIC +#define BTRFS_IOCTL_MAGIC 0x94 +#endif +#ifndef BTRFS_IOC_CLONE_RANGE +struct btrfs_ioctl_clone_range_args { + int64_t src_fd; + uint64_t src_offset; + uint64_t src_length; + uint64_t dest_offset; +}; +#define BTRFS_IOC_CLONE_RANGE \ + _IOW(BTRFS_IOCTL_MAGIC, 13, struct btrfs_ioctl_clone_range_args) +#endif +#ifndef MADV_FREE +#define MADV_FREE 8 +#endif + +#ifndef GRND_NONBLOCK +#define GRND_NONBLOCK 1 +#endif + +struct rr_rseq { + uint32_t cpu_id_start; + uint32_t cpu_id; + uint64_t rseq_cs; + uint32_t flags; +} __attribute__((aligned(32))); + +/* NB: don't include any other local headers here. 
*/ + +#ifdef memcpy +#undef memcpy +#endif +#define memcpy you_must_use_local_memcpy + +static long _traced_init_syscall(int syscallno, long a0, long a1, long a2, + long a3, long a4, long a5) +{ + return syscall(syscallno, a0, a1, a2, a3, a4, a5); +} + +#ifdef syscall +#undef syscall +#endif +#define syscall you_must_use_traced_syscall + +static inline unsigned char *rr_page_replay_flag_addr(void) { + return (unsigned char *)RR_PAGE_IN_REPLAY_FLAG; +} + +/** + * Declaring this to avoid issues with the declaration of f_owner_ex + * across distros. See https://github.com/rr-debugger/rr/issues/2693 */ +struct rr_f_owner_ex { + int type; + int pid; +}; + +#ifndef __ARCH_FLOCK64_PAD +#define __ARCH_FLOCK64_PAD +#endif +struct rr_flock64 { + short l_type; + short l_whence; + __kernel_loff_t l_start; + __kernel_loff_t l_len; + __kernel_pid_t l_pid; + __ARCH_FLOCK64_PAD +}; + +// The alignment of this struct is incorrect, but as long as it's not +// used inside other structures, defining it this way makes the code below +// easier. +typedef uint64_t kernel_sigset_t; + +/* Nonzero when syscall buffering is enabled. */ +static int buffer_enabled; +/* Nonzero after process-global state has been initialized. */ +static int process_inited; + +RR_HIDDEN struct preload_globals globals; + +static struct preload_thread_locals* const thread_locals = + (struct preload_thread_locals*)PRELOAD_THREAD_LOCALS_ADDR; + +/** + * Return a pointer to the buffer header, which happens to occupy the + * initial bytes in the mapped region. + */ +static struct syscallbuf_hdr* buffer_hdr(void) { + return (struct syscallbuf_hdr*)thread_locals->buffer; +} + +/** + * Return a pointer to the byte just after the last valid syscall record in + * the buffer. + */ +static uint8_t* buffer_last(void) { + return (uint8_t*)next_record(buffer_hdr()); +} + +/** + * Return a pointer to the byte just after the very end of the mapped + * region. 
+ */ +static uint8_t* buffer_end(void) { + return thread_locals->buffer + thread_locals->buffer_size; +} + +/** + * Same as libc memcpy(), but usable within syscallbuf transaction + * critical sections. + */ +static void local_memcpy(void* dest, const void* source, int n) { +#if defined(__i386__) || defined(__x86_64__) + /* On modern x86-ish CPUs rep movsb is fast, usually able to move + * 64 bytes at a time. + */ + __asm__ __volatile__("rep movsb\n\t" + : "+S"(source), "+D"(dest), "+c"(n) + : + : "cc", "memory"); +#elif defined(__aarch64__) + long c1; + long c2; + __asm__ __volatile__("subs %4, %2, 16\n\t" + "b.lt 2f\n\t" + "1:\n\t" + "mov %2, %4\n\t" + "ldp %3, %4, [%1], #16\n\t" + "stp %3, %4, [%0], #16\n\t" + "subs %4, %2, #16\n\t" + "b.ge 1b\n" + "2:\n\t" + "tbz %2, 3, 3f\n\t" + "ldr %3, [%1], #8\n\t" + "str %3, [%0], #8\n\t" + "3:\n\t" + "tbz %2, 2, 3f\n\t" + "ldr %w3, [%1], #4\n\t" + "str %w3, [%0], #4\n\t" + "3:\n\t" + "tbz %2, 1, 3f\n\t" + "ldrh %w3, [%1], #2\n\t" + "strh %w3, [%0], #2\n\t" + "3:\n\t" + "tbz %2, 0, 3f\n\t" + "ldrb %w3, [%1]\n\t" + "strb %w3, [%0]\n\t" + "3:\n\t" + : "+r"(dest), "+r"(source), "+r"(n), "=&r"(c1), "=&r"(c2) + : + : "cc", "memory"); +#else +#error Unknown architecture +#endif +} + +/** + * Same as libc memset(), but usable within syscallbuf transaction + * critical sections. + */ +static void local_memset(void* dest, uint8_t c, int n) { +#if defined(__i386__) || defined(__x86_64__) + /* On modern x86-ish CPUs rep stosb is fast, usually able to move + * 64 bytes at a time. 
+ */ + __asm__ __volatile__("rep stosb\n\t" + : "+a"(c), "+D"(dest), "+c"(n) + : + : "cc", "memory"); +#elif defined(__aarch64__) + double v1; + long n2; + __asm__ __volatile__("subs %4, %2, 32\n\t" + "b.lt 2f\n\t" + "dup %3.16b, %w0\n" + "1:\n\t" + "mov %2, %4\n\t" + "stp %q3, %q3, [%1], #32\n\t" + "subs %4, %2, #32\n\t" + "b.ge 1b\n" + "2:\n\t" + "cbz %2, 4f\n" + "3:\n\t" + "strb %w0, [%1], #1\n\t" + "subs %2, %2, #1\n\t" + "b.ne 3b\n" + "4:\n\t" + : "+r"(c), "+r"(dest), "+r"(n), "=x"(v1), "=r"(n2) + : + : "cc", "memory"); +#else +#error Unknown architecture +#endif +} + +/** + * Xorshift* RNG + */ +static int64_t local_random(void) { + uint64_t x = globals.random_seed; + x ^= x >> 12; + x ^= x << 25; + x ^= x >> 27; + globals.random_seed = x; + return x * 0x2545F4914F6CDD1D; +} + +/* The following are wrappers for the syscalls invoked by this library + * itself. These syscalls will generate ptrace traps. + * stack_param_1 and stack_param_2 are pushed onto the stack just before + * the syscall, for SYS_rrcall_notify_syscall_hook_exit which takes stack + * parameters as well as register parameters. + * syscall_instruction is the actual syscall invocation instruction + * (a function which we call with the registers set up appropriately). 
+ */ + +extern RR_HIDDEN long _raw_syscall(int syscallno, long a0, long a1, long a2, + long a3, long a4, long a5, + void* syscall_instruction, + long stack_param_1, long stack_param_2); + +static int privileged_traced_syscall(int syscallno, long a0, long a1, long a2, + long a3, long a4, long a5) { + return _raw_syscall(syscallno, a0, a1, a2, a3, a4, a5, + RR_PAGE_SYSCALL_PRIVILEGED_TRACED, 0, 0); +} +#define privileged_traced_syscall6(no, a0, a1, a2, a3, a4, a5) \ + privileged_traced_syscall(no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, \ + (uintptr_t)a3, (uintptr_t)a4, (uintptr_t)a5) +#define privileged_traced_syscall5(no, a0, a1, a2, a3, a4) \ + privileged_traced_syscall6(no, a0, a1, a2, a3, a4, 0) +#define privileged_traced_syscall4(no, a0, a1, a2, a3) \ + privileged_traced_syscall5(no, a0, a1, a2, a3, 0) +#define privileged_traced_syscall3(no, a0, a1, a2) \ + privileged_traced_syscall4(no, a0, a1, a2, 0) +#define privileged_traced_syscall2(no, a0, a1) \ + privileged_traced_syscall3(no, a0, a1, 0) +#define privileged_traced_syscall1(no, a0) privileged_traced_syscall2(no, a0, 0) +#define privileged_traced_syscall0(no) privileged_traced_syscall1(no, 0) + +/** + * Make a raw traced syscall using the params in |call|. + */ +static long traced_raw_syscall(struct syscall_info* call) { + if (call->no == SYS_rrcall_rdtsc) { + // Handle this specially because the rrcall writes to a memory out-param + // and we need to actually modify the outgoing AX/DX registers instead. + uint32_t tsc[2]; + privileged_traced_syscall1(SYS_rrcall_rdtsc, tsc); + // Overwrite RDX (syscall arg 3) with our TSC value. + call->args[2] = tsc[1]; + return tsc[0]; + } + /* FIXME: pass |call| to avoid pushing these on the stack + * again. */ + return _raw_syscall(call->no, call->args[0], call->args[1], call->args[2], + call->args[3], call->args[4], call->args[5], + RR_PAGE_SYSCALL_TRACED, 0, 0); +} + +/** + * Make a raw traced syscall using the params in |call|, privileged. 
+ */ +static long privileged_traced_raw_syscall(const struct syscall_info* call) { + /* FIXME: pass |call| to avoid pushing these on the stack + * again. */ + return _raw_syscall(call->no, call->args[0], call->args[1], call->args[2], + call->args[3], call->args[4], call->args[5], + RR_PAGE_SYSCALL_PRIVILEGED_TRACED, 0, 0); +} + +#if defined(SYS_fcntl64) +#define RR_FCNTL_SYSCALL SYS_fcntl64 +#else +#define RR_FCNTL_SYSCALL SYS_fcntl +#endif + +static int privileged_traced_fcntl(int fd, int cmd, ...) { + va_list ap; + void* arg; + + va_start(ap, cmd); + arg = va_arg(ap, void*); + va_end(ap); + + return privileged_traced_syscall3(RR_FCNTL_SYSCALL, fd, cmd, arg); +} + +static pid_t privileged_traced_getpid(void) { + return privileged_traced_syscall0(SYS_getpid); +} + +static pid_t privileged_traced_gettid(void) { + return privileged_traced_syscall0(SYS_gettid); +} + +static int privileged_traced_perf_event_open(struct perf_event_attr* attr, + pid_t pid, int cpu, int group_fd, + unsigned long flags) { + return privileged_traced_syscall5(SYS_perf_event_open, attr, pid, cpu, + group_fd, flags); +} + +static __attribute__((noreturn)) void privileged_traced_raise(int sig) { + privileged_traced_syscall2(SYS_kill, privileged_traced_getpid(), sig); + __builtin_unreachable(); +} + +static ssize_t privileged_traced_write(int fd, const void* buf, size_t count) { + return privileged_traced_syscall3(SYS_write, fd, buf, count); +} + +static void logmsg(const char* msg) { + privileged_traced_write(STDERR_FILENO, msg, rrstrlen(msg)); +} + +#define STR_HELPER(x) #x +#define STR(x) STR_HELPER(x) + +#ifndef NDEBUG +#define assert(cond) \ + do { \ + if (!(cond)) { \ + logmsg(__FILE__ ":" STR(__LINE__) ": Assertion `" #cond "' failed.\n"); \ + privileged_traced_raise(SIGABRT); \ + } \ + } while (0) +#else +#define assert(cond) \ + do { \ + __attribute__((unused)) size_t s = sizeof(cond); \ + } while (0) +#endif + +#define fatal(msg) \ + do { \ + logmsg(__FILE__ ":" STR(__LINE__) ": Fatal 
error: " msg "\n"); \ + privileged_traced_raise(SIGABRT); \ + } while (0) + +/** + * Unlike |traced_syscall()|, this helper is implicitly "raw" (returns + * the direct kernel return value), because the syscall hooks have to + * save that raw return value. + * This is only called from syscall wrappers that are doing a proper + * buffered syscall. + */ +static long untraced_syscall_full(int syscallno, long a0, long a1, long a2, + long a3, long a4, long a5, + void* syscall_instruction, + long stack_param_1, long stack_param_2) { + struct syscallbuf_record* rec = (struct syscallbuf_record*)buffer_last(); + /* Ensure tools analyzing the replay can find the pending syscall result */ + thread_locals->pending_untraced_syscall_result = &rec->ret; + long ret = _raw_syscall(syscallno, a0, a1, a2, a3, a4, a5, + syscall_instruction, stack_param_1, stack_param_2); +/* During replay, return the result that's already in the buffer, instead + of what our "syscall" returned. */ +#if defined(__i386__) || defined(__x86_64__) + /* On entry, during recording %eax/%rax are whatever the kernel returned + * but during replay they may be invalid (e.g. 0). During replay, reload + * %eax/%rax from |rec->ret|. At the end of this sequence all registers + * will match between recording and replay. We clobber the temporary + * in_replay register, and the condition codes, to ensure this. + * This all assumes the compiler doesn't create unnecessary temporaries + * holding values like |ret|. Inspection of generated code shows it doesn't. 
+ */ + unsigned char tmp_in_replay = *rr_page_replay_flag_addr(); + __asm__("test %1,%1\n\t" + "cmovne %2,%0\n\t" + "xor %1,%1\n\t" + : "+a"(ret), "+c"(tmp_in_replay) + : "m"(rec->ret) + : "cc"); +#elif defined(__aarch64__) + unsigned char *globals_in_replay = rr_page_replay_flag_addr(); + long *rec_ret = &rec->ret; + __asm__("ldrb %w1, [%1]\n\t" // tmp_in_replay = *rr_page_replay_flag_addr() + "ldr %2, [%2]\n\t" // tmp = rec->ret + "cmp %w1, #0\n\t" + "csel %0, %0, %2, eq\n\t" // ret = tmp_in_replay ? tmp : ret + "subs %1, xzr, xzr\n\t" // clear tmp_in_replay and flag + "mov %2, xzr\n\t" // clear tmp + : "+r"(ret), "+r"(globals_in_replay), "+r"(rec_ret) + : + : "cc"); +#else +#error Unknown architecture +#endif + return ret; +} +#define untraced_syscall_base(no, a0, a1, a2, a3, a4, a5, inst) \ + untraced_syscall_full(no, a0, a1, a2, a3, a4, a5, inst, 0, 0) +#define untraced_syscall6(no, a0, a1, a2, a3, a4, a5) \ + untraced_syscall_base(no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, \ + (uintptr_t)a3, (uintptr_t)a4, (uintptr_t)a5, \ + RR_PAGE_SYSCALL_UNTRACED_RECORDING_ONLY) +#define untraced_syscall5(no, a0, a1, a2, a3, a4) \ + untraced_syscall6(no, a0, a1, a2, a3, a4, 0) +#define untraced_syscall4(no, a0, a1, a2, a3) \ + untraced_syscall5(no, a0, a1, a2, a3, 0) +#define untraced_syscall3(no, a0, a1, a2) untraced_syscall4(no, a0, a1, a2, 0) +#define untraced_syscall2(no, a0, a1) untraced_syscall3(no, a0, a1, 0) +#define untraced_syscall1(no, a0) untraced_syscall2(no, a0, 0) +#define untraced_syscall0(no) untraced_syscall1(no, 0) + +#define untraced_replayed_syscall6(no, a0, a1, a2, a3, a4, a5) \ + untraced_syscall_base(no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, \ + (uintptr_t)a3, (uintptr_t)a4, (uintptr_t)a5, \ + RR_PAGE_SYSCALL_UNTRACED) +#define untraced_replayed_syscall5(no, a0, a1, a2, a3, a4) \ + untraced_replayed_syscall6(no, a0, a1, a2, a3, a4, 0) +#define untraced_replayed_syscall4(no, a0, a1, a2, a3) \ + untraced_replayed_syscall5(no, a0, a1, a2, 
a3, 0) +#define untraced_replayed_syscall3(no, a0, a1, a2) \ + untraced_replayed_syscall4(no, a0, a1, a2, 0) +#define untraced_replayed_syscall2(no, a0, a1) \ + untraced_replayed_syscall3(no, a0, a1, 0) +#define untraced_replayed_syscall1(no, a0) untraced_replayed_syscall2(no, a0, 0) +#define untraced_replayed_syscall0(no) untraced_replayed_syscall1(no, 0) + +static long __attribute__((unused)) +untraced_replay_assist_syscall_base(int syscallno, long a0, long a1, long a2, + long a3, long a4, long a5, + void* syscall_instruction) { + struct syscallbuf_record* rec = (struct syscallbuf_record*)buffer_last(); + rec->replay_assist = 1; + return untraced_syscall_base(syscallno, a0, a1, a2, a3, a4, a5, syscall_instruction); +} + +#define untraced_replay_assist_syscall6(no, a0, a1, a2, a3, a4, a5) \ + untraced_replay_assist_syscall_base( \ + no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, \ + (uintptr_t)a3, (uintptr_t)a4, (uintptr_t)a5, \ + RR_PAGE_SYSCALL_UNTRACED_REPLAY_ASSIST) +#define untraced_replay_assist_syscall5(no, a0, a1, a2, a3, a4) \ + untraced_replay_assist_syscall6(no, a0, a1, a2, a3, a4, 0) +#define untraced_replay_assist_syscall4(no, a0, a1, a2, a3) \ + untraced_replay_assist_syscall5(no, a0, a1, a2, a3, 0) +#define untraced_replay_assist_syscall3(no, a0, a1, a2) \ + untraced_replay_assist_syscall4(no, a0, a1, a2, 0) +#define untraced_replay_assist_syscall2(no, a0, a1) \ + untraced_replay_assist_syscall3(no, a0, a1, 0) +#define untraced_replay_assist_syscall1(no, a0) \ + untraced_replay_assist_syscall2(no, a0, 0) +#define untraced_replay_assist_syscall0(no) \ + untraced_replay_assist_syscall1(no, 0) + +// "Privileged" syscalls are not affected by the application's own seccomp +// filters. 
+#define privileged_untraced_syscall6(no, a0, a1, a2, a3, a4, a5) \ + untraced_syscall_base(no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, \ + (uintptr_t)a3, (uintptr_t)a4, (uintptr_t)a5, \ + RR_PAGE_SYSCALL_PRIVILEGED_UNTRACED_RECORDING_ONLY) +#define privileged_untraced_syscall5(no, a0, a1, a2, a3, a4) \ + privileged_untraced_syscall6(no, a0, a1, a2, a3, a4, 0) +#define privileged_untraced_syscall4(no, a0, a1, a2, a3) \ + privileged_untraced_syscall5(no, a0, a1, a2, a3, 0) +#define privileged_untraced_syscall3(no, a0, a1, a2) \ + privileged_untraced_syscall4(no, a0, a1, a2, 0) +#define privileged_untraced_syscall2(no, a0, a1) \ + privileged_untraced_syscall3(no, a0, a1, 0) +#define privileged_untraced_syscall1(no, a0) \ + privileged_untraced_syscall2(no, a0, 0) +#define privileged_untraced_syscall0(no) privileged_untraced_syscall1(no, 0) + +// "Unrecorded" syscalls are performed during recording only and are "raw"; +// they are not associated with syscallbuf records. +#define privileged_unrecorded_syscall6(no, a0, a1, a2, a3, a4, a5) \ + _raw_syscall(no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, \ + (uintptr_t)a3, (uintptr_t)a4, (uintptr_t)a5, \ + RR_PAGE_SYSCALL_PRIVILEGED_UNTRACED_RECORDING_ONLY, 0, 0) +#define privileged_unrecorded_syscall5(no, a0, a1, a2, a3, a4) \ + privileged_unrecorded_syscall6(no, a0, a1, a2, a3, a4, 0) +#define privileged_unrecorded_syscall4(no, a0, a1, a2, a3) \ + privileged_unrecorded_syscall5(no, a0, a1, a2, a3, 0) +#define privileged_unrecorded_syscall3(no, a0, a1, a2) \ + privileged_unrecorded_syscall4(no, a0, a1, a2, 0) +#define privileged_unrecorded_syscall2(no, a0, a1) \ + privileged_unrecorded_syscall3(no, a0, a1, 0) +#define privileged_unrecorded_syscall1(no, a0) \ + privileged_unrecorded_syscall2(no, a0, 0) +#define privileged_unrecorded_syscall0(no) privileged_unrecorded_syscall1(no, 0) + +#define replay_only_syscall6(no, a0, a1, a2, a3, a4, a5) \ + _raw_syscall(no, (uintptr_t)a0, (uintptr_t)a1, (uintptr_t)a2, 
(uintptr_t)a3, \ + (uintptr_t)a4, (uintptr_t)a5, \ + RR_PAGE_SYSCALL_PRIVILEGED_UNTRACED_REPLAY_ONLY, 0, 0) +#define replay_only_syscall5(no, a0, a1, a2, a3, a4) \ + replay_only_syscall6(no, a0, a1, a2, a3, a4, 0) +#define replay_only_syscall4(no, a0, a1, a2, a3) \ + replay_only_syscall5(no, a0, a1, a2, a3, 0) +#define replay_only_syscall3(no, a0, a1, a2) \ + replay_only_syscall4(no, a0, a1, a2, 0) +#define replay_only_syscall2(no, a0, a1) replay_only_syscall3(no, a0, a1, 0) +#define replay_only_syscall1(no, a0) replay_only_syscall2(no, a0, 0) +#define replay_only_syscall0(no) replay_only_syscall1(no, 0) + +static int privileged_untraced_close(int fd) { + return privileged_unrecorded_syscall1(SYS_close, fd); +} + +static int privileged_untraced_fcntl(int fd, int cmd, ...) { + va_list ap; + void* arg; + + va_start(ap, cmd); + arg = va_arg(ap, void*); + va_end(ap); + + return privileged_unrecorded_syscall3(RR_FCNTL_SYSCALL, fd, cmd, arg); +} + +/** + * Do what's necessary to set up buffers for the caller. + * |untraced_syscall_ip| lets rr know where our untraced syscalls will + * originate from. |addr| is the address of the control socket the + * child expects to connect to. |msg| is a pre-prepared IPC that can + * be used to share fds; |fdptr| is a pointer to the control-message + * data buffer where the fd number being shared will be stored. + * |args_vec| provides the tracer with preallocated space to make + * socketcall syscalls. + * + * Return a pointer to the syscallbuf (with an initialized header + * including the available size), if syscallbuf is enabled. + * + * This is a "magic" syscall implemented by rr. + */ +static void rrcall_init_buffers(struct rrcall_init_buffers_params* args) { + privileged_traced_syscall1(SYS_rrcall_init_buffers, args); +} + +/** + * Return a counter that generates a signal targeted at this task + * every time the task is descheduled |nr_descheds| times. 
+ */ +static int open_desched_event_counter(size_t nr_descheds, pid_t tid) { + struct perf_event_attr attr; + int tmp_fd, fd; + struct rr_f_owner_ex own; + + local_memset(&attr, 0, sizeof(attr)); + attr.size = sizeof(attr); + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_CONTEXT_SWITCHES; + attr.disabled = 1; + attr.sample_period = nr_descheds; + + tmp_fd = privileged_traced_perf_event_open(&attr, 0 /*self*/, -1 /*any cpu*/, + -1, 0); + if (0 > tmp_fd) { + fatal("Failed to perf_event_open"); + } + fd = privileged_traced_fcntl(tmp_fd, F_DUPFD_CLOEXEC, + RR_DESCHED_EVENT_FLOOR_FD); + if (fd > 0) { + if (privileged_untraced_close(tmp_fd)) { + fatal("Failed to close tmp_fd"); + } + } else { + // We may be unable to find an fd above the RR_DESCHED_EVENT_FLOOR_FD (e.g + // because of a low ulimit). In that case, just use the tmp_fd we already + // have. + fd = tmp_fd; + } + if (privileged_untraced_fcntl(fd, F_SETFL, FASYNC)) { + fatal("Failed to fcntl(FASYNC) the desched counter"); + } + own.type = F_OWNER_TID; + own.pid = tid; + if (privileged_untraced_fcntl(fd, F_SETOWN_EX, &own)) { + fatal("Failed to fcntl(SETOWN_EX) the desched counter to this"); + } + if (privileged_untraced_fcntl(fd, F_SETSIG, globals.desched_sig)) { + fatal("Failed to fcntl(SETSIG) the desched counter"); + } + + return fd; +} + +/** + * Initialize thread-local buffering state, if enabled and not already + * initialized. + */ +static void init_thread(void) { + struct rrcall_init_buffers_params args; + + assert(process_inited); + if (thread_locals->thread_inited) { + return; + } + thread_locals->thread_inited = 1; + + /* Do not do any syscall buffering in a DiversionSession! */ + if (!buffer_enabled || globals.in_diversion) { + return; + } + + /* NB: we want this setup emulated during replay. 
*/ + thread_locals->desched_counter_fd = + open_desched_event_counter(1, privileged_traced_gettid()); + + args.desched_counter_fd = thread_locals->desched_counter_fd; + + /* Trap to rr: let the magic begin! + * + * If the desched signal is currently blocked, then the tracer + * will clear our TCB guard and we won't be able to buffer + * syscalls. But the tracee will set the guard when (or if) + * the signal is unblocked. */ + rrcall_init_buffers(&args); + + thread_locals->cloned_file_data_fd = args.cloned_file_data_fd; + /* rr initializes the buffer header. */ + thread_locals->buffer = args.syscallbuf_ptr; + thread_locals->buffer_size = args.syscallbuf_size; + thread_locals->scratch_buf = args.scratch_buf; + thread_locals->usable_scratch_size = args.usable_scratch_size; +} + +// We don't include libc headers, since they include with Linux headers, +// so declared this prototype manually +extern const char* getenv(const char*); + +// getauxval is from glibc 2.16 (2012) - don't assume it exists. +unsigned long getauxval(unsigned long type) __attribute__((weak)); +#ifndef AT_SYSINFO_EHDR +#define AT_SYSINFO_EHDR 33 +#endif + +extern RR_HIDDEN long syscall_hook(struct syscall_info* call); + +/** + * Initialize process-global buffering state, if enabled. + * NOTE: constructors go into a special section by default so this won't + * be counted as syscall-buffering code! 
+ */ +static void __attribute__((constructor)) init_process(void) { + struct rrcall_init_preload_params params; + + extern char _syscallbuf_final_exit_instruction; + extern char _syscallbuf_code_start; + extern char _syscallbuf_code_end; + extern char do_breakpoint_fault_addr; + +#if defined(__i386__) + extern RR_HIDDEN void __morestack(void); + extern RR_HIDDEN void _syscall_hook_trampoline_3d_01_f0_ff_ff(void); + extern RR_HIDDEN void _syscall_hook_trampoline_90_90_90(void); + struct syscall_patch_hook syscall_patch_hooks[] = { + /* pthread_cond_broadcast has 'int 80' followed by + * cmp $-4095,%eax (in glibc-2.18-16.fc20.i686) */ + { 0, + 5, + { 0x3d, 0x01, 0xf0, 0xff, 0xff }, + (uintptr_t)_syscall_hook_trampoline_3d_01_f0_ff_ff }, + /* Our vdso syscall patch has 'int 80' followed by onp; nop; nop */ + { PATCH_IS_MULTIPLE_INSTRUCTIONS, + 3, + { 0x90, 0x90, 0x90 }, + (uintptr_t)_syscall_hook_trampoline_90_90_90 } + }; + extern char _get_pc_thunks_start; + extern char _get_pc_thunks_end; +#elif defined(__x86_64__) + extern RR_HIDDEN void _syscall_hook_trampoline_48_3d_01_f0_ff_ff(void); + extern RR_HIDDEN void _syscall_hook_trampoline_48_3d_00_f0_ff_ff(void); + extern RR_HIDDEN void _syscall_hook_trampoline_3d_00_f0_ff_ff(void); + extern RR_HIDDEN void _syscall_hook_trampoline_48_8b_3c_24(void); + extern RR_HIDDEN void _syscall_hook_trampoline_48_89_45_f8(void); + extern RR_HIDDEN void _syscall_hook_trampoline_48_89_c3(void); + extern RR_HIDDEN void _syscall_hook_trampoline_5a_5e_c3(void); + extern RR_HIDDEN void _syscall_hook_trampoline_89_c2_f7_da(void); + extern RR_HIDDEN void _syscall_hook_trampoline_90_90_90(void); + extern RR_HIDDEN void _syscall_hook_trampoline_ba_01_00_00_00(void); + extern RR_HIDDEN void _syscall_hook_trampoline_89_c1_31_d2(void); + extern RR_HIDDEN void _syscall_hook_trampoline_c3_nop(void); + extern RR_HIDDEN void _syscall_hook_trampoline_40_80_f6_81(void); + extern RR_HIDDEN void _syscall_hook_trampoline_49_89_ca(void); + extern 
RR_HIDDEN void _syscall_hook_trampoline_48_89_c1(void); + extern RR_HIDDEN void _syscall_hook_trampoline_48_c1_e2_20(void); + extern RR_HIDDEN void _syscall_hook_trampoline_49_8b_44_24_28(void); + extern RR_HIDDEN void _syscall_hook_trampoline_4c_89_f7(void); + extern RR_HIDDEN void _syscall_hook_trampoline_4c_89_ff(void); + extern RR_HIDDEN void _syscall_hook_trampoline_49_c7_c1_ff_ff_ff_ff(void); + extern RR_HIDDEN void _syscall_hook_trampoline_b8_0e_00_00_00(void); + extern RR_HIDDEN void _syscall_hook_trampoline_b8_11_01_00_00(void); + extern RR_HIDDEN void _syscall_hook_trampoline_b8_ca_00_00_00(void); + extern RR_HIDDEN void _syscall_hook_trampoline_be_18_00_00_00(void); + extern RR_HIDDEN void _syscall_hook_trampoline_48_89_e5(void); + extern RR_HIDDEN void _syscall_hook_trampoline_48_89_fb(void); + extern RR_HIDDEN void _syscall_hook_trampoline_48_8d_b3_f0_08_00_00(void); + extern RR_HIDDEN void _syscall_hook_trampoline_nops(void); + +#define MOV_RDX_VARIANTS \ + MOV_RDX_TO_REG(48, d0) \ + MOV_RDX_TO_REG(48, d1) \ + MOV_RDX_TO_REG(48, d2) \ + MOV_RDX_TO_REG(48, d3) \ + MOV_RDX_TO_REG(48, d4) \ + MOV_RDX_TO_REG(48, d5) \ + MOV_RDX_TO_REG(48, d6) \ + MOV_RDX_TO_REG(48, d7) \ + MOV_RDX_TO_REG(49, d0) \ + MOV_RDX_TO_REG(49, d1) \ + MOV_RDX_TO_REG(49, d2) \ + MOV_RDX_TO_REG(49, d3) \ + MOV_RDX_TO_REG(49, d4) \ + MOV_RDX_TO_REG(49, d5) \ + MOV_RDX_TO_REG(49, d6) \ + MOV_RDX_TO_REG(49, d7) + +#define MOV_RDX_TO_REG(rex, op) \ + extern RR_HIDDEN void _syscall_hook_trampoline_##rex##_89_##op(void); + MOV_RDX_VARIANTS + + struct syscall_patch_hook syscall_patch_hooks[] = { + /* Many glibc syscall wrappers (e.g. read) have 'syscall' followed + * by + * cmp $-4095,%rax (in glibc-2.18-16.fc20.x86_64) */ + { 0, + 6, + { 0x48, 0x3d, 0x01, 0xf0, 0xff, 0xff }, + (uintptr_t)_syscall_hook_trampoline_48_3d_01_f0_ff_ff }, + /* Many glibc syscall wrappers (e.g. 
__libc_recv) have 'syscall' + * followed by + * cmp $-4096,%rax (in glibc-2.18-16.fc20.x86_64) */ + { 0, + 6, + { 0x48, 0x3d, 0x00, 0xf0, 0xff, 0xff }, + (uintptr_t)_syscall_hook_trampoline_48_3d_00_f0_ff_ff }, + /* glibc-2.35-20.fc36.x86_64 start_thread has 'syscall' + * followed by 'cmp $-4096,%eax' */ + { 0, + 5, + { 0x3d, 0x00, 0xf0, 0xff, 0xff }, + (uintptr_t)_syscall_hook_trampoline_3d_00_f0_ff_ff }, + /* Many glibc syscall wrappers (e.g. read) have 'syscall' followed + * by + * mov (%rsp),%rdi (in glibc-2.18-16.fc20.x86_64) */ + { 0, + 4, + { 0x48, 0x8b, 0x3c, 0x24 }, + (uintptr_t)_syscall_hook_trampoline_48_8b_3c_24 }, + /* Some syscall wrappers have 'syscall' followed + * by + * mov %rax,-8(%rbp) */ + { 0, + 4, + { 0x48, 0x89, 0x45, 0xf8 }, + (uintptr_t)_syscall_hook_trampoline_48_89_45_f8 }, + /* Some syscall wrappers (e.g. read) have 'syscall' followed + * by + * mov %rax,%rbx */ + { 0, + 3, + { 0x48, 0x89, 0xc3 }, + (uintptr_t)_syscall_hook_trampoline_48_89_c3 }, + /* Some RDTSC instructions are followed by 'mov %rax,%rcx'. */ + { 0, + 3, + { 0x48, 0x89, 0xc1 }, + (uintptr_t)_syscall_hook_trampoline_48_89_c1 }, + /* __lll_unlock_wake has 'syscall' followed by + * pop %rdx; pop %rsi; ret */ + { PATCH_IS_MULTIPLE_INSTRUCTIONS, + 3, + { 0x5a, 0x5e, 0xc3 }, + (uintptr_t)_syscall_hook_trampoline_5a_5e_c3 }, + /* posix_fadvise64 has 'syscall' followed by + * mov %eax,%edx; neg %edx (in glibc-2.22-11.fc23.x86_64) */ + { PATCH_IS_MULTIPLE_INSTRUCTIONS, + 4, + { 0x89, 0xc2, 0xf7, 0xda }, + (uintptr_t)_syscall_hook_trampoline_89_c2_f7_da }, + /* Our VDSO vsyscall patches have 'syscall' followed by "nop; nop; + nop" */ + { PATCH_IS_MULTIPLE_INSTRUCTIONS, + 3, + { 0x90, 0x90, 0x90 }, + (uintptr_t)_syscall_hook_trampoline_90_90_90 }, + /* glibc-2.22-17.fc23.x86_64 has 'syscall' followed by 'mov $1,%rdx' + * in + * pthread_barrier_wait. 
+ */ + { 0, + 5, + { 0xba, 0x01, 0x00, 0x00, 0x00 }, + (uintptr_t)_syscall_hook_trampoline_ba_01_00_00_00 }, + /* pthread_sigmask has 'syscall' followed by 'mov %eax,%ecx; xor + %edx,%edx' */ + { PATCH_IS_MULTIPLE_INSTRUCTIONS, + 4, + { 0x89, 0xc1, 0x31, 0xd2 }, + (uintptr_t)_syscall_hook_trampoline_89_c1_31_d2 }, + /* getpid has 'syscall' followed by 'retq; nopl 0x0(%rax,%rax,1) */ + { PATCH_IS_MULTIPLE_INSTRUCTIONS, + 9, + { 0xc3, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }, + (uintptr_t)_syscall_hook_trampoline_c3_nop }, + /* liblsan internal_close has 'syscall' followed by 'retq; nopl 0x0(%rax,%rax,1) */ + { PATCH_IS_MULTIPLE_INSTRUCTIONS, + 6, + { 0xc3, 0x0f, 0x1f, 0x44, 0x00, 0x00 }, + (uintptr_t)_syscall_hook_trampoline_c3_nop }, + /* glibc-2.29-15.fc30.x86_64 getpid has 'syscall' followed by 'retq; nopl 0x0(%rax) */ + { PATCH_IS_MULTIPLE_INSTRUCTIONS, + 5, + { 0xc3, 0x0f, 0x1f, 0x40, 0x00 }, + (uintptr_t)_syscall_hook_trampoline_c3_nop }, + /* liblsan internal_open has 'syscall' followed by 'retq; nopl (%rax) */ + { PATCH_IS_MULTIPLE_INSTRUCTIONS, + 4, + { 0xc3, 0x0f, 0x1f, 0x00 }, + (uintptr_t)_syscall_hook_trampoline_c3_nop }, + /* liblsan internal_dup2 has 'syscall' followed by 'retq; xchg %ax,%ax */ + { PATCH_IS_MULTIPLE_INSTRUCTIONS, + 3, + { 0xc3, 0x66, 0x90 }, + (uintptr_t)_syscall_hook_trampoline_c3_nop }, + /* Go runtime has 'syscall' followed by 'retq; int3; int3 */ + { PATCH_IS_MULTIPLE_INSTRUCTIONS, + 3, + { 0xc3, 0xcc, 0xcc }, + (uintptr_t)_syscall_hook_trampoline_c3_nop }, + /* glibc-2.31 on Ubuntu 20.04 has 'xor $0x81, %sil' followed by 'syscall' */ + { PATCH_SYSCALL_INSTRUCTION_IS_LAST, + 4, + { 0x40, 0x80, 0xf6, 0x81 }, + (uintptr_t)_syscall_hook_trampoline_40_80_f6_81 }, + /* DynamoRIO has 'mov r10, rcx' followed by 'syscall' */ + { + PATCH_SYSCALL_INSTRUCTION_IS_LAST, + 3, + { 0x49, 0x89, 0xca }, + (uintptr_t)_syscall_hook_trampoline_49_89_ca }, + /* Some applications have RDTSC followed by 'mov %rdx,any-reg' */ +#undef 
MOV_RDX_TO_REG +#define MOV_RDX_TO_REG(rex, op) \ + { \ + 0, \ + 3, \ + { 0x##rex, 0x89, 0x##op }, \ + (uintptr_t)_syscall_hook_trampoline_##rex##_89_##op }, + MOV_RDX_VARIANTS + /* Some application has RDTSC followed by 'shl $32,%rdx' */ + { + 0, + 4, + { 0x48, 0xc1, 0xe2, 0x20 }, + (uintptr_t)_syscall_hook_trampoline_48_c1_e2_20 }, + /* glibc-2.35-20.fc36.x86_64 __pthread_create_2_1 application has + syscall followed by 'mov 0x28(%r12),%rax' */ + { + 0, + 5, + { 0x49, 0x8b, 0x44, 0x24, 0x28 }, + (uintptr_t)_syscall_hook_trampoline_49_8b_44_24_28 }, + /* glibc-2.35-20.fc36.x86_64 thread_start has + 'lea 0x8f0(%rbx),%rsi' followed by 'syscall' */ + { PATCH_SYSCALL_INSTRUCTION_IS_LAST, + 7, + { 0x48, 0x8d, 0xb3, 0xf0, 0x08, 0x00, 0x00 }, + (uintptr_t)_syscall_hook_trampoline_48_8d_b3_f0_08_00_00 }, + /* Some application has 'mov %r14,%rdi' followed by 'syscall' */ + { PATCH_SYSCALL_INSTRUCTION_IS_LAST, + 3, + { 0x4c, 0x89, 0xf7 }, + (uintptr_t)_syscall_hook_trampoline_4c_89_f7 }, + /* Some application has 'mov %r15,%rdi' followed by 'syscall' */ + { PATCH_SYSCALL_INSTRUCTION_IS_LAST, + 3, + { 0x4c, 0x89, 0xff }, + (uintptr_t)_syscall_hook_trampoline_4c_89_ff }, + /* Some application has 'mov $0xffffffffffffffff,%r9' followed by 'syscall' */ + { PATCH_SYSCALL_INSTRUCTION_IS_LAST, + 7, + { 0x49, 0xc7, 0xc1, 0xff, 0xff, 0xff, 0xff }, + (uintptr_t)_syscall_hook_trampoline_49_c7_c1_ff_ff_ff_ff }, + /* glibc-2.35-20.fc36.x86_64 __pthread_create_2_1 has + 'mov $0xe,%eax' (sigprocmask) followed by 'syscall' */ + { PATCH_SYSCALL_INSTRUCTION_IS_LAST, + 5, + { 0xb8, 0x0e, 0x00, 0x00, 0x00 }, + (uintptr_t)_syscall_hook_trampoline_b8_0e_00_00_00 }, + /* glibc-2.35-20.fc36.x86_64 thread_start has + 'mov $0x111,%eax' (set_robust_list) followed by 'syscall' */ + { PATCH_SYSCALL_INSTRUCTION_IS_LAST, + 5, + { 0xb8, 0x11, 0x01, 0x00, 0x00 }, + (uintptr_t)_syscall_hook_trampoline_b8_11_01_00_00 }, + /* Some application has 'mov $0xca,%eax' (futex) followed by 'syscall' */ + { 
PATCH_SYSCALL_INSTRUCTION_IS_LAST, + 5, + { 0xb8, 0xca, 0x00, 0x00, 0x00 }, + (uintptr_t)_syscall_hook_trampoline_b8_ca_00_00_00 }, + /* Some application has 'mov $0x18,%esi' (sizeof(robust_list)) followed by 'syscall' */ + { PATCH_SYSCALL_INSTRUCTION_IS_LAST, + 5, + { 0xbe, 0x18, 0x00, 0x00, 0x00 }, + (uintptr_t)_syscall_hook_trampoline_be_18_00_00_00 }, + /* Some application has 'mov %rsp,%rbp' followed by 'rdtsc' */ + { PATCH_SYSCALL_INSTRUCTION_IS_LAST, + 3, + { 0x48, 0x89, 0xe5 }, + (uintptr_t)_syscall_hook_trampoline_48_89_e5 }, + /* Some application has 'mov %rdi,%rbx' followed by 'rdtsc' */ + { PATCH_SYSCALL_INSTRUCTION_IS_LAST, + 3, + { 0x48, 0x89, 0xfb }, + (uintptr_t)_syscall_hook_trampoline_48_89_fb }, + /* Support explicit 5 byte nop (`nopl 0(%ax, %ax, 1)`) before 'rdtsc' or syscall (may ignore interfering branches) */ + { PATCH_SYSCALL_INSTRUCTION_IS_LAST | + PATCH_IS_NOP_INSTRUCTIONS, + 5, + { 0x0f, 0x1f, 0x44, 0x00, 0x00 }, + (uintptr_t)_syscall_hook_trampoline_nops } + }; +#elif defined(__aarch64__) + extern RR_HIDDEN void _syscall_hook_trampoline_raw(void); + struct syscall_patch_hook syscall_patch_hooks[] = { + { 0, 4, { 0x01, 0, 0, 0xd4 }, (uintptr_t)_syscall_hook_trampoline_raw } + }; +#endif + + assert(sizeof(struct preload_thread_locals) <= PRELOAD_THREAD_LOCALS_SIZE); + + if (process_inited) { + return; + } + + // Check if the rr page is mapped. We avoid a syscall if it looks like + // rr places librrpage as the vdso + // Use 1 as size since linux implementation of msync round it up to page size + if ((!getauxval || (getauxval(AT_SYSINFO_EHDR) != RR_PAGE_ADDR - 3*PRELOAD_LIBRARY_PAGE_SIZE)) && + msync((void*)RR_PAGE_ADDR, 1, MS_ASYNC) != 0) { + // The RR page is not mapped - this process is not rr traced. + buffer_enabled = 0; + return; + } + + buffer_enabled = !!getenv(SYSCALLBUF_ENABLED_ENV_VAR); + + if (!buffer_enabled) { + // Don't risk executing the syscall before. 
If there is an external seccomp + // filter that doesn't like unknown syscalls, we risk breaking the recording. + return; + } + + params.syscallbuf_enabled = buffer_enabled; + +#ifdef __i386__ + params.get_pc_thunks_start = &_get_pc_thunks_start; + params.get_pc_thunks_end = &_get_pc_thunks_end; +#else + params.get_pc_thunks_start = NULL; + params.get_pc_thunks_end = NULL; +#endif + params.syscallbuf_code_start = &_syscallbuf_code_start; + params.syscallbuf_code_end = &_syscallbuf_code_end; + params.syscallbuf_final_exit_instruction = + &_syscallbuf_final_exit_instruction; + params.syscall_patch_hook_count = + sizeof(syscall_patch_hooks) / sizeof(syscall_patch_hooks[0]); + params.syscall_patch_hooks = syscall_patch_hooks; + params.globals = &globals; + + globals.fdt_uniform = 1; + params.breakpoint_instr_addr = &do_breakpoint_fault_addr; + params.breakpoint_mode_sentinel = -1; + params.syscallbuf_syscall_hook = (void*)syscall_hook; + + // We must not make any call into the syscall buffer in the init function + // in case a signal is delivered to us during initialization. + // This means that we must not call `_raw_syscall`. + int err = _traced_init_syscall(SYS_rrcall_init_preload, (long)¶ms, + 0, 0, 0, 0, 0); + if (err != 0) { + // Check if the rr tracer is present by looking for the thread local page + // (mapped just after the rr page). If it is not present, we were + // preloaded without rr listening, which is allowed (e.g. after detach). + // Otherwise give an intelligent error message indicating that our connection + // to rr is broken. + // Use 1 as size since linux implementation of msync round it up to page size + if (msync((void*)RR_PAGE_ADDR + PRELOAD_LIBRARY_PAGE_SIZE, 1, MS_ASYNC) == 0) { + fatal("Failed to communicated with rr tracer.\n" + "Perhaps a restrictive seccomp filter is in effect (e.g. 
docker?)?\n" + "Adjust the seccomp filter to allow syscalls above 1000, disable it,\n" + "or try using `rr record -n` (slow)."); + } else { + buffer_enabled = 0; + return; + } + } + + process_inited = 1; +} + +/** + * syscall hooks start here. + * + * !!! NBB !!!: from here on, all code that executes within the + * critical sections of transactions *MUST KEEP $ip IN THE SYSCALLBUF + * CODE*. That means no calls into libc, even for innocent-looking + * functions like |memcpy()|. + * + * How syscall hooks operate: + * + * 1. The rr tracer monkey-patches __kernel_vsyscall() to jump to + * _syscall_hook_trampoline() above. + * 2. When a call is made to __kernel_vsyscall(), it jumps to + * _syscall_hook_trampoline(), where the syscall params are + * packaged up into a call to syscall_hook() below. + * 3. syscall_hook() dispatches to a syscall processor function. + * 4. The syscall processor prepares a new record in the buffer. See + * struct syscallbuf_record for record fields. If the buffer runs + * out of space, the processor function aborts and makes a traced + * syscall, trapping to rr. rr then flushes the buffer. Records + * are directly saved to trace, and a buffer-flush event is + * recorded without execution info because it's a synthetic event. + * 5. Then, the syscall processor redirects all potential output + * for the syscall to the record (and corrects the overall size of + * the record while it does so). + * 6. The syscall is invoked through a asm helper that does *not* + * ptrace-trap to rr. + * 7. The syscall output, written on the buffer, is copied to the + * original pointers provided by the user. Take notice that this + * part saves us the injection of the data on replay, as we only + * need to push the data to the buffer and the wrapper code will + * copy it to the user address for us. + * 8. The return value and overall size are saved to the record. + */ + +/** + * Call this and save the result at the start of every system call we + * want to buffer. 
The result is a pointer into the record space. You + * can add to this pointer to allocate space in the trace record. + * However, do not read or write through this pointer until + * start_commit_syscall() has been called. And you *must* call + * start_commit_syscall() after this is called, otherwise buffering + * state will be inconsistent between syscalls. + * + * See |sys_clock_gettime()| for a simple example of how this helper + * should be used to buffer outparam data. + */ +static void* prep_syscall(void) { + /* We don't need to worry about a race between testing + * |locked| and setting it here. rr recording is responsible + * for ensuring signals are not delivered during + * syscall_buffer prologue and epilogue code. + * + * XXX except for synchronous signals generated in the syscall + * buffer code, while reading/writing user pointers */ + buffer_hdr()->locked |= SYSCALLBUF_LOCKED_TRACEE; + /* "Allocate" space for a new syscall record, not including + * syscall outparam data. */ + return buffer_last() + sizeof(struct syscallbuf_record); +} + +static enum syscallbuf_fd_classes fd_class(int fd) { + if (fd < 0) { + return FD_CLASS_INVALID; + } + if (fd >= SYSCALLBUF_FDS_DISABLED_SIZE - 1) { + fd = SYSCALLBUF_FDS_DISABLED_SIZE - 1; + } + return globals.syscallbuf_fd_class[fd]; +} + +static int is_bufferable_fd(int fd) { + switch (fd_class(fd)) { + case FD_CLASS_INVALID: + case FD_CLASS_UNTRACED: + return 1; + default: + return 0; + } +} + +/** + * Like prep_syscall, but preps a syscall to operate on a particular fd. If + * syscallbuf is disabled for this fd, returns NULL (in which case + * start_commit_syscall will abort cleanly and a traced syscall will be used). + * Allow negative fds to pass through; they'll either trigger an error or + * receive special treatment by the kernel (e.g. AT_FDCWD). 
+ */ +static void* prep_syscall_for_fd(int fd) { + if (!is_bufferable_fd(fd)) { + return NULL; + } + return prep_syscall(); +} + +static void arm_desched_event(void) { + /* Don't trace the ioctl; doing so would trigger a flushing + * ptrace trap, which is exactly what this code is trying to + * avoid! :) Although we don't allocate extra space for these + * ioctl's, we do record that we called them; the replayer + * knows how to skip over them. */ + if ((int)privileged_unrecorded_syscall3(SYS_ioctl, + thread_locals->desched_counter_fd, + PERF_EVENT_IOC_ENABLE, 0)) { + fatal("Failed to ENABLE counter"); + } +} + +static void disarm_desched_event(void) { + /* See above. */ + if ((int)privileged_unrecorded_syscall3(SYS_ioctl, + thread_locals->desched_counter_fd, + PERF_EVENT_IOC_DISABLE, 0)) { + fatal("Failed to DISABLE counter"); + } +} + +/** + * Return 1 if it's ok to proceed with buffering this system call. + * Return 0 if we should trace the system call. + * This must be checked before proceeding with the buffered system call. + */ +/* (Negative numbers so as to not be valid syscall numbers, in case + * the |int| arguments below are passed in the wrong order.) */ +enum { MAY_BLOCK = -1, WONT_BLOCK = -2 }; + +static int fd_write_blocks(int fd) { + if (!globals.fdt_uniform) { + // If we're not uniform, it is possible for this fd to be untraced in one + // of the other tasks that share this fd table. Always assume it could block. 
+ return MAY_BLOCK; + } + switch (fd_class(fd)) { + case FD_CLASS_UNTRACED: + case FD_CLASS_TRACED: + return MAY_BLOCK; + case FD_CLASS_INVALID: + case FD_CLASS_PROC_MEM: + return WONT_BLOCK; + } + fatal("Unknown or corrupted fd class"); +} + +static int start_commit_buffered_syscall(int syscallno, void* record_end, + int blockness) { + void* record_start; + void* stored_end; + struct syscallbuf_record* rec; + + if (!thread_locals->buffer) { + return 0; + } + record_start = buffer_last(); + stored_end = record_start + stored_record_size(record_end - record_start); + rec = record_start; + + if (stored_end < record_start + sizeof(struct syscallbuf_record)) { + /* Either a catastrophic buffer overflow or + * we failed to lock the buffer. Just bail out. */ + return 0; + } + if (stored_end > (void*)buffer_end() - sizeof(struct syscallbuf_record)) { + /* Buffer overflow. + * Unlock the buffer and then execute the system call + * with a trap to rr. Note that we reserve enough + * space in the buffer for the next prep_syscall(). */ + buffer_hdr()->locked &= ~SYSCALLBUF_LOCKED_TRACEE; + return 0; + } + /* Store this breadcrumb so that the tracer can find out what + * syscall we're executing if our registers are in a weird + * state. If we end up aborting this syscall, no worry, this + * will just be overwritten later. + * + * NBB: this *MUST* be set before the desched event is + * armed. */ + rec->syscallno = syscallno; + rec->desched = MAY_BLOCK == blockness; + rec->size = record_end - record_start; + + if (rec->desched) { + pid_t pid = 0; + pid_t tid = 0; + uid_t uid = 0; + if (impose_spurious_desched) { + pid = privileged_unrecorded_syscall0(SYS_getpid); + tid = privileged_unrecorded_syscall0(SYS_gettid); + uid = privileged_unrecorded_syscall0(SYS_getuid); + } + + /* NB: the ordering of the next two statements is + * important. + * + * We set this flag to notify rr that it should pay + * attention to desched signals pending for this task. 
+ * We have to set it *before* we arm the notification + * because we can't set the flag atomically with + * arming the event (too bad there's no ioctl() for + * querying the event enabled-ness state). That's + * important because if the notification is armed, + * then rr must be confident that when it disarms the + * event, the tracee is at an execution point that + * *must not* need the desched event. + * + * If we were to set the flag non-atomically after the + * event was armed, then if a desched signal was + * delivered right at the instruction that set the + * flag, rr wouldn't know that it needed to advance + * the tracee to the untraced syscall entry point. + * (And if rr didn't do /that/, then the syscall might + * block without rr knowing it, and the recording + * session would deadlock.) */ + buffer_hdr()->desched_signal_may_be_relevant = 1; + arm_desched_event(); + if (impose_spurious_desched) { + siginfo_t si; + si.si_code = POLL_IN; + si.si_fd = thread_locals->desched_counter_fd; + si.si_pid = pid; + si.si_uid = uid; + privileged_unrecorded_syscall4(SYS_rt_tgsigqueueinfo, pid, tid, + globals.desched_sig, + &si); + } + } + return 1; +} + +static void force_tick(void) { +#if defined(__i386__) || defined(__x86_64__) + __asm__ __volatile__("je 1f\n\t" + "1:"); +#elif defined(__aarch64__) + __asm__ __volatile__("cbz xzr, 1f\n" + "1:"); +#else +#error Unknown architecture +#endif +} + +static void __attribute__((noinline)) do_breakpoint(size_t value) +{ + char *unsafe_value = ((char*)-1)-0xf; + char **safe_value = &unsafe_value; + uint64_t *breakpoint_value_addr = (uint64_t*)RR_PAGE_BREAKPOINT_VALUE; +#if defined(__i386__) || defined(__x86_64__) + __asm__ __volatile__( + "mov (%1),%1\n\t" + "cmp %0,%1\n\t" + "cmove %3,%2\n\t" + // This will segfault if `value` matches + // the `breakpoint_value` set by rr. We + // detect this segfault and treat it + // specially. 
+ "do_breakpoint_fault_addr:\n\t" + ".global do_breakpoint_fault_addr\n\t" + "mov (%2),%2\n\t" + "xor %1,%1\n\t" + "xor %2,%2\n\t" + "xor %3,%3\n\t" + : "+a"(value), "+D"(breakpoint_value_addr), + "+S"(safe_value), "+c"(unsafe_value) + : + : "cc", "memory"); +#elif defined(__aarch64__) + __asm__ __volatile__("ldr %1, [%1]\n\t" + "cmp %0, %1\n\t" + "csel %0, %3, %2, eq\n\t" + "do_breakpoint_fault_addr:\n\t" + ".global do_breakpoint_fault_addr\n\t" + "ldr %0, [%0]\n\t" + "subs %0, xzr, xzr\n\t" + "mov %1, xzr\n\t" + : "+r"(value), "+r"(breakpoint_value_addr), + "+r"(safe_value), "+r"(unsafe_value) + : + : "cc", "memory"); +#else +#error Unknown architecture +#endif +} + +/** + * Commit the record for a buffered system call. record_end can be + * adjusted downward from what was passed to + * start_commit_buffered_syscall, if not all of the initially + * requested space is needed. The result of this function should be + * returned directly by the kernel syscall hook. + */ +static long commit_raw_syscall(int syscallno, void* record_end, long ret) { + void* record_start = buffer_last(); + struct syscallbuf_record* rec = record_start; + struct syscallbuf_hdr* hdr = buffer_hdr(); + int call_breakpoint = 0; + + assert(record_end >= record_start); + rec->size = record_end - record_start; + + assert(hdr->locked); + + /* NB: the ordering of this statement with the + * |disarm_desched_event()| call below is important. + * + * We clear this flag to notify rr that the may-block syscall + * has finished, so there's no danger of blocking anymore. + * (And thus the desched signal is no longer relevant.) We + * have to clear this *before* disarming the event, because if + * rr sees the flag set, it has to PTRACE_SYSCALL this task to + * ensure it reaches an execution point where the desched + * signal is no longer relevant. We have to use the ioctl() + * that disarms the event as a safe "backstop" that can be hit + * by the PTRACE_SYSCALL. 
+ * + * If we were to clear the flag *after* disarming the event, + * and the signal arrived at the instruction that cleared the + * flag, and rr issued the PTRACE_SYSCALL, then this tracee + * could fly off to any unknown execution point, including an + * iloop. So the recording session could livelock. */ + hdr->desched_signal_may_be_relevant = 0; + + if (rec->syscallno != syscallno) { + fatal("Record syscall number mismatch"); + } + + if (hdr->abort_commit) { + /* We were descheduled in the middle of a may-block + * syscall, and it was recorded as a normal entry/exit + * pair. So don't record the syscall in the buffer or + * replay will go haywire. */ + hdr->abort_commit = 0; + hdr->failed_during_preparation = 0; + /* Clear the return value that rr puts there during replay */ + rec->ret = 0; + } else { + rec->ret = ret; + // Finish 'rec' first before updating num_rec_bytes, since + // rr might read the record anytime after this update. + hdr->num_rec_bytes += stored_record_size(rec->size); + call_breakpoint = 1; + } + + if (rec->desched) { + disarm_desched_event(); + } + /* NBB: for may-block syscalls that are descheduled, the + * tracer uses the previous ioctl() as a stable point to reset + * the record counter. Therefore nothing from here on in the + * current txn must touch the record counter (at least, must + * not assume it's unchanged). */ + + buffer_hdr()->locked &= ~SYSCALLBUF_LOCKED_TRACEE; + + if (call_breakpoint) { + /* Call the breakpoint function corresponding to the record we just + * committed. This function just returns, but during replay it gives rr + * a chance to set a breakpoint for when a specific syscallbuf record + * has been processed. + */ + do_breakpoint(hdr->num_rec_bytes/8); + /* Force a tick now. 
+ * During replay, if an async event (SIGKILL) happens between committing the syscall + * above and before this forced tick, we can detect that because the number of ticks + * recorded for the SIGKILL will be less than or equal to the number of ticks reported + * when the replay hits do_breakpoint. + */ + force_tick(); + } + + return ret; +} + +/** + * |ret_size| is the result of a syscall indicating how much data was returned + * in scratch buffer |buf2|; this function copies that data to |buf| and returns + * a pointer to the end of it. If there is no scratch buffer (|buf2| is NULL) + * just returns |ptr|. + */ +static void* copy_output_buffer(long ret_size, void* ptr, void* buf, + void* buf2) { + if (!buf2) { + return ptr; + } + if (ret_size <= 0 || buffer_hdr()->failed_during_preparation) { + return buf2; + } + local_memcpy(buf, buf2, ret_size); + return buf2 + ret_size; +} + +/** + * Copy an input parameter to the syscallbuf where the kernel needs to + * read and write it. During replay, we do a no-op self-copy in the buffer + * so that the buffered data is not lost. + * This code is written in assembler to ensure that the registers that receive + * values differing between record and replay (%0, rsi/esi, and flags) + * are reset to values that are the same between record and replay immediately + * afterward. This guards against diverging register values leaking into + * later code. + * Use local_memcpy or plain assignment instead if the kernel is not going to + * overwrite the values. 
+ */ +static void memcpy_input_parameter(void* buf, void* src, int size) { +#if defined(__i386__) || defined(__x86_64__) + unsigned char tmp_in_replay = *rr_page_replay_flag_addr(); + __asm__ __volatile__("test %0,%0\n\t" + "cmovne %1,%2\n\t" + "rep movsb\n\t" + "xor %0,%0\n\t" + "xor %2,%2\n\t" + : "+a"(tmp_in_replay), "+D"(buf), "+S"(src), "+c"(size) + : + : "cc", "memory"); +#elif defined(__aarch64__) + long c1; + long c2; + unsigned char *globals_in_replay = rr_page_replay_flag_addr(); + __asm__ __volatile__("ldrb %w3, [%5]\n\t" + "cmp %3, #0\n\t" // eq -> record + "csel %1, %1, %0, eq\n\t" + "subs %4, %2, 16\n\t" + "b.lt 2f\n\t" + "1:\n\t" + "mov %2, %4\n\t" + "ldp %3, %4, [%1], #16\n\t" + "stp %3, %4, [%0], #16\n\t" + "subs %4, %2, #16\n\t" + "b.ge 1b\n" + "2:\n\t" + "tbz %2, 3, 3f\n\t" + "ldr %3, [%1], #8\n\t" + "str %3, [%0], #8\n\t" + "3:\n\t" + "tbz %2, 2, 3f\n\t" + "ldr %w3, [%1], #4\n\t" + "str %w3, [%0], #4\n\t" + "3:\n\t" + "tbz %2, 1, 3f\n\t" + "ldrh %w3, [%1], #2\n\t" + "strh %w3, [%0], #2\n\t" + "3:\n\t" + "tbz %2, 0, 3f\n\t" + "ldrb %w3, [%1]\n\t" + "strb %w3, [%0]\n\t" + "3:\n\t" + "subs %3, xzr, xzr\n\t" + "mov %4, xzr\n\t" + "mov %1, xzr\n\t" + : "+r"(buf), "+r"(src), + "+r"(size), "=&r"(c1), "=&r"(c2), "+r"(globals_in_replay) + : + : "cc", "memory"); +#else +#error Unknown architecture +#endif +} + +#if defined(__i386__) || defined(__x86_64__) +/** + * Perform an RDTSC, writing the output to 'buf', but only if we're in recording mode. + * Otherwise 'buf' is unchanged. + */ +static void rdtsc_recording_only(uint32_t buf[2]) { + unsigned char tmp_in_replay = *rr_page_replay_flag_addr(); + __asm__ __volatile__("test %%eax,%%eax\n\t" + "jne 1f\n\t" + "rdtsc\n\t" + "mov %%eax,(%1)\n\t" + "mov %%edx,4(%1)\n\t" + "1:\n\t" + "xor %%eax,%%eax\n\t" + "xor %%edx,%%edx\n\t" + : "+a"(tmp_in_replay) + : "S"(buf) + : "cc", "memory", "rdx"); +} +#endif + +/** + * During recording, we copy *real to *buf. + * During replay, we copy *buf to *real. 
+ * Behaves like memcpy_input_parameter in terms of hiding differences between + * recording and replay. + */ +static void copy_futex_int(uint32_t* buf, uint32_t* real) { +#if defined(__i386__) || defined(__x86_64__) + uint32_t tmp_in_replay = *rr_page_replay_flag_addr(); + __asm__ __volatile__("test %0,%0\n\t" + "mov %2,%0\n\t" + "cmovne %1,%0\n\t" + "mov %0,%1\n\t" + "mov %0,%2\n\t" + /* This instruction is just to clear flags */ + "xor %0,%0\n\t" + : "+a"(tmp_in_replay) + : "m"(*buf), "m"(*real) + : "cc", "memory"); +#elif defined(__aarch64__) + unsigned char *globals_in_replay = rr_page_replay_flag_addr(); + __asm__ __volatile__("ldrb %w2, [%2]\n\t" + "cmp %w2, #0\n\t" // eq -> record + "csel %2, %1, %0, eq\n\t" + "ldr %w2, [%2]\n\t" + "csel %0, %0, %1, eq\n\t" + "str %w2, [%0]\n\t" + "subs %0, xzr, xzr\n\t" + "mov %2, xzr\n\t" + : "+r"(buf), "+r"(real), "+r"(globals_in_replay) + : + : "cc", "memory"); +#else +#error Unknown architecture +#endif +} + +static int trace_chaos_mode_syscalls = 0; +static int buffer_chaos_mode_syscalls = 0; + +static int force_traced_syscall_for_chaos_mode(void) { + if (!globals.in_chaos) { + return 0; + } + while (1) { + if (buffer_chaos_mode_syscalls) { + --buffer_chaos_mode_syscalls; + return 0; + } + if (trace_chaos_mode_syscalls) { + --trace_chaos_mode_syscalls; + return 1; + } + /* force a run of up to 50 syscalls to be traced */ + trace_chaos_mode_syscalls = (local_random() % 50) + 1; + buffer_chaos_mode_syscalls = (trace_chaos_mode_syscalls - 5) * 10; + if (buffer_chaos_mode_syscalls < 0) { + buffer_chaos_mode_syscalls = 0; + } + } +} + +/* Keep syscalls in alphabetical order, please. */ + +/** + * Call this for syscalls that have no memory effects, don't block, and + * aren't fd-related. 
+ */ +static long sys_generic_nonblocking(struct syscall_info* call) { + void* ptr = prep_syscall(); + long ret; + + if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + ret = untraced_syscall6(call->no, call->args[0], call->args[1], call->args[2], + call->args[3], call->args[4], call->args[5]); + return commit_raw_syscall(call->no, ptr, ret); +} + +/** + * Call this for syscalls that have no memory effects, don't block, and + * have an fd as their first parameter. + */ +static long sys_generic_nonblocking_fd(struct syscall_info* call) { + int fd = call->args[0]; + void* ptr = prep_syscall_for_fd(fd); + long ret; + + if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + ret = untraced_syscall6(call->no, fd, call->args[1], call->args[2], + call->args[3], call->args[4], call->args[5]); + return commit_raw_syscall(call->no, ptr, ret); +} + +/** + * Call this for syscalls that have no memory effects, don't block, and + * have an fd as their first parameter, and should run privileged. 
+ */ +static long privileged_sys_generic_nonblocking_fd(const struct syscall_info* call) { + int fd = call->args[0]; + void* ptr = prep_syscall_for_fd(fd); + long ret; + + if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { + return privileged_traced_raw_syscall(call); + } + ret = privileged_untraced_syscall6(call->no, fd, call->args[1], call->args[2], + call->args[3], call->args[4], call->args[5]); + return commit_raw_syscall(call->no, ptr, ret); +} + +static long sys_clock_gettime(struct syscall_info* call) { + const int syscallno = SYS_clock_gettime; + __kernel_clockid_t clk_id = (__kernel_clockid_t)call->args[0]; + struct timespec* tp = (struct timespec*)call->args[1]; + + void* ptr = prep_syscall(); + struct timespec* tp2 = NULL; + long ret; + + assert(syscallno == call->no); + + if (tp) { + tp2 = ptr; + ptr += sizeof(*tp2); + } + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + ret = untraced_syscall2(syscallno, clk_id, tp2); + if (tp && ret >= 0 && !buffer_hdr()->failed_during_preparation) { + /* This is small and won't get optimized to a memcpy call outside + our library. 
*/ + *tp = *tp2; + } + return commit_raw_syscall(syscallno, ptr, ret); +} + +#ifdef SYS_clock_gettime64 + +static long sys_clock_gettime64(struct syscall_info* call) { + const int syscallno = SYS_clock_gettime64; + __kernel_clockid_t clk_id = (__kernel_clockid_t)call->args[0]; + struct __kernel_timespec* tp = (struct __kernel_timespec*)call->args[1]; + + void* ptr = prep_syscall(); + struct __kernel_timespec* tp2 = NULL; + long ret; + + assert(syscallno == call->no); + + if (tp) { + tp2 = ptr; + ptr += sizeof(*tp2); + } + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + ret = untraced_syscall2(syscallno, clk_id, tp2); + if (tp && ret >= 0 && !buffer_hdr()->failed_during_preparation) { + /* This is small and won't get optimized to a memcpy call outside + our library. */ + *tp = *tp2; + } + return commit_raw_syscall(syscallno, ptr, ret); +} +#endif + +#if defined(SYS_creat) +static long sys_open(struct syscall_info* call); +static long sys_creat(struct syscall_info* call) { + const char* pathname = (const char*)call->args[0]; + __kernel_mode_t mode = call->args[1]; + /* Thus sayeth the man page: + * + * creat() is equivalent to open() with flags equal to + * O_CREAT|O_WRONLY|O_TRUNC. */ + struct syscall_info open_call = + { SYS_open, { (long)pathname, O_CREAT | O_TRUNC | O_WRONLY, mode } }; + return sys_open(&open_call); +} +#endif + +static int sys_fcntl64_no_outparams(struct syscall_info* call) { + const int syscallno = RR_FCNTL_SYSCALL; + int fd = call->args[0]; + int cmd = call->args[1]; + long arg = call->args[2]; + + /* None of the no-outparam fcntl's are known to be + * may-block. 
*/ + void* ptr = prep_syscall_for_fd(fd); + long ret; + + assert(syscallno == call->no); + + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + ret = untraced_syscall3(syscallno, fd, cmd, arg); + return commit_raw_syscall(syscallno, ptr, ret); +} + +static int sys_fcntl64_own_ex(struct syscall_info* call) { + const int syscallno = RR_FCNTL_SYSCALL; + int fd = call->args[0]; + int cmd = call->args[1]; + struct rr_f_owner_ex* owner = (struct rr_f_owner_ex*)call->args[2]; + + /* The OWN_EX fcntl's aren't may-block. */ + void* ptr = prep_syscall_for_fd(fd); + struct rr_f_owner_ex* owner2 = NULL; + long ret; + + assert(syscallno == call->no); + + if (owner) { + owner2 = ptr; + ptr += sizeof(*owner2); + } + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + if (owner2) { + memcpy_input_parameter(owner2, owner, sizeof(*owner2)); + } + ret = untraced_syscall3(syscallno, fd, cmd, owner2); + if (owner2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) { + local_memcpy(owner, owner2, sizeof(*owner)); + } + return commit_raw_syscall(syscallno, ptr, ret); +} + +static int sys_fcntl64_setlk64(struct syscall_info* call) { + if (force_traced_syscall_for_chaos_mode()) { + /* Releasing a lock could unblock a higher priority task */ + return traced_raw_syscall(call); + } + + const int syscallno = RR_FCNTL_SYSCALL; + int fd = call->args[0]; + int cmd = call->args[1]; + struct rr_flock64* lock = (struct rr_flock64*)call->args[2]; + + void* ptr = prep_syscall_for_fd(fd); + struct rr_flock64* lock2 = NULL; + long ret; + + assert(syscallno == call->no); + + if (lock) { + lock2 = ptr; + ptr += sizeof(*lock2); + } + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + if (lock2) { + memcpy_input_parameter(lock2, lock, sizeof(*lock2)); + } + ret = untraced_syscall3(syscallno, fd, cmd, lock2); + if (lock2 && ret >= 0 && 
!buffer_hdr()->failed_during_preparation) { + local_memcpy(lock, lock2, sizeof(*lock)); + } + return commit_raw_syscall(syscallno, ptr, ret); +} + +static int sys_fcntl64_setlkw64(struct syscall_info* call) { + if (force_traced_syscall_for_chaos_mode()) { + /* Releasing a lock could unblock a higher priority task */ + return traced_raw_syscall(call); + } + + const int syscallno = RR_FCNTL_SYSCALL; + int fd = call->args[0]; + int cmd = call->args[1]; + struct rr_flock64* lock = (struct rr_flock64*)call->args[2]; + + void* ptr = prep_syscall_for_fd(fd); + long ret; + + assert(syscallno == call->no); + + if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { + return traced_raw_syscall(call); + } + ret = untraced_syscall3(syscallno, fd, cmd, lock); + return commit_raw_syscall(syscallno, ptr, ret); +} + +#if defined(SYS_fcntl64) +/* 32-bit system */ +static long sys_fcntl64(struct syscall_info* call) +#else +/* 64-bit system */ +static long sys_fcntl(struct syscall_info* call) +#endif +{ + switch (call->args[1]) { + case F_SETFL: + if (call->args[2] == O_DIRECT) { + /* This needs to go to rr so we can disable syscall buffering + on this fd. */ + return traced_raw_syscall(call); + } + /* Falls through. 
*/ + case F_DUPFD: + case F_GETFD: + case F_GETFL: + case F_GETOWN: + case F_SETFD: + case F_SETOWN: + case F_SETSIG: + return sys_fcntl64_no_outparams(call); + + case F_GETOWN_EX: + case F_SETOWN_EX: + return sys_fcntl64_own_ex(call); + +#ifndef F_SETLK64 +#define F_SETLK64 13 +#endif + case F_SETLK64: +#if !defined(SYS_fcntl64) + /* Also uses 64-bit flock format */ + case F_SETLK: +#endif + return sys_fcntl64_setlk64(call); + +#ifndef F_SETLKW64 +#define F_SETLKW64 14 +#endif + case F_SETLKW64: +#if !defined(SYS_fcntl64) + /* Also uses 64-bit flock format */ + case F_SETLKW: +#endif + return sys_fcntl64_setlkw64(call); + + default: + return traced_raw_syscall(call); + } +} + +static long ret_buf_len(long ret, size_t len) { + if (ret < 0) { + return 0; + } + if (len > LONG_MAX) { + return ret; + } + return ret < (long)len ? ret : (long)len; +} + +static long sys_flistxattr(struct syscall_info* call) { + const int syscallno = SYS_flistxattr; + int fd = (int)call->args[0]; + char* buf = (char*)call->args[1]; + size_t size = call->args[2]; + + void* ptr = prep_syscall_for_fd(fd); + void* buf2 = NULL; + long ret; + + assert(syscallno == call->no); + + if (buf && size > 0) { + buf2 = ptr; + ptr += size; + } + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + + ret = untraced_syscall3(syscallno, fd, buf2, size); + ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, buf, buf2); + return commit_raw_syscall(syscallno, ptr, ret); +} + +static long sys_safe_nonblocking_ioctl(struct syscall_info* call) { + const int syscallno = SYS_ioctl; + int fd = call->args[0]; + + void* ptr = prep_syscall_for_fd(fd); + long ret; + + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + ret = untraced_syscall3(syscallno, fd, call->args[1], call->args[2]); + return commit_raw_syscall(syscallno, ptr, ret); +} + +static long sys_ioctl_fionread(struct syscall_info* call) { + const 
int syscallno = SYS_ioctl; + int fd = call->args[0]; + int* value = (int*)call->args[2]; + void* buf = NULL; + + void* ptr = prep_syscall_for_fd(fd); + long ret; + + if (value) { + buf = ptr; + ptr += sizeof(*value); + } + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + ret = untraced_syscall3(syscallno, fd, FIONREAD, buf); + if (buf && ret >= 0 && !buffer_hdr()->failed_during_preparation) { + local_memcpy(value, buf, sizeof(*value)); + } + return commit_raw_syscall(syscallno, ptr, ret); +} + +static long sys_ioctl(struct syscall_info* call) { + switch (call->args[1]) { + case BTRFS_IOC_CLONE_RANGE: + case FIOCLEX: + case FIONCLEX: + return sys_safe_nonblocking_ioctl(call); + case FIONREAD: + return sys_ioctl_fionread(call); + default: + return traced_raw_syscall(call); + } +} + +static long sys_futex(struct syscall_info* call) { + enum { + FUTEX_USES_UADDR2 = 1 << 0, + }; + + /* This can make wakeups a lot more expensive. We assume + that wakeups are only used when some thread is actually waiting, + in which case we're at most doubling the overhead of the combined + wait + wakeup. */ + if (globals.in_chaos) { + return traced_raw_syscall(call); + } + + int op = call->args[1]; + int flags = 0; + switch (FUTEX_CMD_MASK & op) { + case FUTEX_WAKE_BITSET: + case FUTEX_WAKE: + break; + case FUTEX_REQUEUE: + case FUTEX_CMP_REQUEUE: + case FUTEX_WAKE_OP: + flags |= FUTEX_USES_UADDR2; + break; + + /* It turns out not to be worth buffering the FUTEX_WAIT* + * calls. When a WAIT call is made, we know almost for sure + * that the tracee is going to be desched'd (otherwise the + * userspace CAS would have succeeded). This is unlike + * read/write, f.e., where the vast majority of calls aren't + * desched'd and the overhead is worth it. So all that + * buffering WAIT does is add the overhead of arming/disarming + * desched (which is a measurable perf loss). 
+ * + * NB: don't ever try to buffer FUTEX_LOCK_PI; it requires + * special processing in the tracer process (in addition to + * not being worth doing for perf reasons). */ + default: + return traced_raw_syscall(call); + } + + const int syscallno = SYS_futex; + uint32_t* uaddr = (uint32_t*)call->args[0]; + uint32_t val = call->args[2]; + const struct timespec* timeout = (const struct timespec*)call->args[3]; + uint32_t* uaddr2 = (uint32_t*)call->args[4]; + uint32_t val3 = call->args[5]; + + void* ptr = prep_syscall(); + uint32_t* saved_uaddr; + uint32_t* saved_uaddr2 = NULL; + long ret; + + assert(syscallno == call->no); + + /* We have to record the value of the futex at kernel exit, + * but we can't substitute a scratch pointer for the uaddrs: + * the futex identity is the memory cell. There are schemes + * that would allow us to use scratch futexes, but they get + * complicated quickly. */ + saved_uaddr = ptr; + ptr += sizeof(*saved_uaddr); + if (FUTEX_USES_UADDR2 & flags) { + saved_uaddr2 = ptr; + ptr += sizeof(*saved_uaddr2); + } + /* See above; it's not worth buffering may-block futex + * calls. */ + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + + ret = untraced_syscall6(syscallno, uaddr, op, val, timeout, uaddr2, val3); + /* During recording, save the real outparams to the buffer. + * During replay, save the values from the buffer to the real outparams. + * + * The *ONLY* reason it's correct for us to read the outparams + * carelessly is that rr protects this syscallbuf + * transaction as as a critical section. 
*/ + copy_futex_int(saved_uaddr, uaddr); + if (saved_uaddr2) { + copy_futex_int(saved_uaddr2, uaddr2); + } + return commit_raw_syscall(syscallno, ptr, ret); +} + +static long sys_getrandom(struct syscall_info* call) { + void* buf = (void*)call->args[0]; + size_t buf_len = (size_t)call->args[1]; + unsigned int flags = (unsigned int)call->args[2]; + const int syscallno = SYS_getrandom; + + void* ptr = prep_syscall(); + void* buf2 = NULL; + long ret; + + assert(syscallno == call->no); + + if (buf && buf_len > 0) { + buf2 = ptr; + ptr += buf_len; + } + if (!start_commit_buffered_syscall(call->no, ptr, (flags & GRND_NONBLOCK) ? WONT_BLOCK : MAY_BLOCK)) { + return traced_raw_syscall(call); + } + + ret = untraced_syscall3(call->no, buf2, buf_len, flags); + ptr = copy_output_buffer(ret, ptr, buf, buf2); + return commit_raw_syscall(call->no, ptr, ret); +} + +static long sys_generic_getdents(struct syscall_info* call) { + int fd = (int)call->args[0]; + void* buf = (void*)call->args[1]; + unsigned int count = (unsigned int)call->args[2]; + + void* ptr = prep_syscall_for_fd(fd); + void* buf2 = NULL; + long ret; + + if (buf && count > 0) { + buf2 = ptr; + ptr += count; + } + if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + + ret = untraced_syscall3(call->no, fd, buf2, count); + ptr = copy_output_buffer(ret, ptr, buf, buf2); + return commit_raw_syscall(call->no, ptr, ret); +} + +#if defined(SYS_getdents) +static long sys_getdents(struct syscall_info* call) { + return sys_generic_getdents(call); +} +#endif + +static long sys_getdents64(struct syscall_info* call) { + return sys_generic_getdents(call); +} + +static long sys_gettimeofday(struct syscall_info* call) { + const int syscallno = SYS_gettimeofday; + struct timeval* tp = (struct timeval*)call->args[0]; + struct timezone* tzp = (struct timezone*)call->args[1]; + + /* XXX it seems odd that clock_gettime() is spec'd to be + * async-signal-safe while gettimeofday() isn't, 
but that's + * what the docs say! */ + void* ptr = prep_syscall(); + struct timeval* tp2 = NULL; + struct timezone* tzp2 = NULL; + long ret; + + assert(syscallno == call->no); + + if (tp) { + tp2 = ptr; + ptr += sizeof(*tp2); + } + if (tzp) { + tzp2 = ptr; + ptr += sizeof(*tzp2); + } + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + ret = untraced_syscall2(syscallno, tp2, tzp2); + if (ret >= 0 && !buffer_hdr()->failed_during_preparation) { + if (tp) { + /* This is small and won't get optimized to a memcpy call outside + our library. */ + *tp = *tp2; + } + if (tzp) { + /* This is small and won't get optimized to a memcpy call outside + our library. */ + *tzp = *tzp2; + } + } + return commit_raw_syscall(syscallno, ptr, ret); +} + +static long sys_generic_getxattr(struct syscall_info* call) { + const char* path = (const char*)call->args[0]; + const char* name = (const char*)call->args[1]; + void* value = (void*)call->args[2]; + size_t size = call->args[3]; + + void* ptr = prep_syscall(); + void* value2 = NULL; + long ret; + + if (value && size > 0) { + value2 = ptr; + ptr += size; + } + if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + + ret = untraced_syscall4(call->no, path, name, value2, size); + ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, value, value2); + return commit_raw_syscall(call->no, ptr, ret); +} + +static long sys_getxattr(struct syscall_info* call) { + return sys_generic_getxattr(call); +} + +static long sys_lgetxattr(struct syscall_info* call) { + return sys_generic_getxattr(call); +} + +static long sys_fgetxattr(struct syscall_info* call) { + int fd = (int)call->args[0]; + const char* name = (const char*)call->args[1]; + void* value = (void*)call->args[2]; + size_t size = call->args[3]; + + void* ptr = prep_syscall_for_fd(fd); + void* value2 = NULL; + long ret; + + if (value && size > 0) { + value2 = ptr; + ptr += size; + } + if 
(!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + + ret = untraced_syscall4(call->no, fd, name, value2, size); + ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, value, value2); + return commit_raw_syscall(call->no, ptr, ret); +} + +static long sys_generic_listxattr(struct syscall_info* call) { + char* path = (char*)call->args[0]; + char* buf = (char*)call->args[1]; + size_t size = call->args[2]; + + void* ptr = prep_syscall(); + void* buf2 = NULL; + long ret; + + if (buf && size > 0) { + buf2 = ptr; + ptr += size; + } + if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + + ret = untraced_syscall3(call->no, path, buf2, size); + ptr = copy_output_buffer(ret_buf_len(ret, size), ptr, buf, buf2); + return commit_raw_syscall(call->no, ptr, ret); +} + +static long sys_listxattr(struct syscall_info* call) { + return sys_generic_listxattr(call); +} + +static long sys_llistxattr(struct syscall_info* call) { + return sys_generic_listxattr(call); +} + +#if defined(SYS__llseek) +static long sys__llseek(struct syscall_info* call) { + const int syscallno = SYS__llseek; + int fd = call->args[0]; + unsigned long offset_high = call->args[1]; + unsigned long offset_low = call->args[2]; + __kernel_loff_t* result = (__kernel_loff_t*)call->args[3]; + unsigned int whence = call->args[4]; + + void* ptr = prep_syscall_for_fd(fd); + __kernel_loff_t* result2 = NULL; + long ret; + + assert(syscallno == call->no); + + if (result) { + result2 = ptr; + ptr += sizeof(*result2); + } + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + + if (result2) { + memcpy_input_parameter(result2, result, sizeof(*result2)); + } + ret = untraced_syscall5(syscallno, fd, offset_high, offset_low, result2, + whence); + if (result2) { + *result = *result2; + } + return commit_raw_syscall(syscallno, ptr, ret); +} +#endif + +static long 
sys_madvise(struct syscall_info* call) { + const int syscallno = SYS_madvise; + void* addr = (void*)call->args[0]; + size_t length = call->args[1]; + int advice = call->args[2]; + + void* ptr; + long ret; + + switch (advice) { + // Whitelist advice values that we know are OK to pass through to the + // kernel directly. + case MADV_NORMAL: + case MADV_RANDOM: + case MADV_SEQUENTIAL: + case MADV_WILLNEED: + case MADV_DONTNEED: + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: + case MADV_DONTDUMP: + case MADV_DODUMP: + break; + case MADV_FREE: + // See record_syscall. We disallow MADV_FREE because it creates + // nondeterminism. + advice = -1; + break; + default: + return traced_raw_syscall(call); + } + + ptr = prep_syscall(); + + assert(syscallno == call->no); + + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + + /* Ensure this syscall happens during replay. In particular MADV_DONTNEED + * must be executed. 
+ */ + ret = untraced_replayed_syscall3(syscallno, addr, length, advice); + return commit_raw_syscall(syscallno, ptr, ret); +} + +static long sys_mprotect(struct syscall_info* call) { + const int syscallno = SYS_mprotect; + void* addr = (void*)call->args[0]; + size_t length = call->args[1]; + int prot = call->args[2]; + struct mprotect_record* mrec; + + void* ptr; + long ret; + + if ((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)) || !buffer_hdr() || + buffer_hdr()->mprotect_record_count >= MPROTECT_RECORD_COUNT) { + return traced_raw_syscall(call); + } + + ptr = prep_syscall(); + + assert(syscallno == call->no); + + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + + mrec = &globals.mprotect_records[buffer_hdr()->mprotect_record_count++]; + mrec->start = (uint64_t)(uintptr_t)addr; + mrec->size = length; + mrec->prot = prot; + ret = untraced_replayed_syscall3(syscallno, addr, length, prot); + if (ret < 0 && ret != -ENOMEM) { + /* indicate that nothing was mprotected */ + mrec->size = 0; + } + buffer_hdr()->mprotect_record_count_completed++; + + return commit_raw_syscall(syscallno, ptr, ret); +} + +static int supported_open(const char* file_name, int flags) { + if (is_gcrypt_deny_file(file_name)) { + /* This needs to be a traced syscall. We want to return an + open file even if the file doesn't exist and the untraced syscall + returns ENOENT. */ + return 0; + } + if (flags & O_DIRECT) { + /* O_DIRECT needs to go to rr so we can blacklist the file for + syscall buffering. */ + return 0; + } + /* Writeable opens need to go to rr to be checked in case + they could write to a mapped file. + But if they're O_EXCL | O_CREAT, a new file must be created + so that will be fine. 
*/ + return !(flags & (O_RDWR | O_WRONLY)) || + (flags & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT); +} + +static long sys_readlinkat(struct syscall_info* call, int privileged); + +struct check_open_state { + uint8_t did_abort; + uint8_t did_fail_during_preparation; +}; + +static int check_file_open_ok(struct syscall_info* call, int ret, struct check_open_state state) { + /* If we failed during preparation then a SIGSYS or similar prevented the syscall + from doing anything, so there is nothing for us to do here and we shouldn't + try to interpret the "syscall result". */ + if (state.did_fail_during_preparation || ret < 0) { + return ret; + } + char buf[100]; + sprintf(buf, "/proc/self/fd/%d", ret); + char link[PATH_MAX]; + long link_ret; + if (state.did_abort) { + /* Don't add any new syscallbuf records, that won't work. */ + link_ret = privileged_traced_syscall4(SYS_readlinkat, -1, (long)buf, (long)link, sizeof(link)); + } else { + struct syscall_info readlink_call = + { SYS_readlinkat, { -1, (long)buf, (long)link, sizeof(link), 0, 0 } }; + link_ret = sys_readlinkat(&readlink_call, 1); + } + if (link_ret >= 0 && link_ret < (ssize_t)sizeof(link)) { + link[link_ret] = 0; + if (allow_buffered_open(link)) { + return ret; + } + } + /* Clean up by closing the file descriptor we should not have opened and + opening it again, traced this time. + Use a privileged traced syscall for the close to ensure it + can't fail due to lack of privilege. + We expect this to return an error. + We could try an untraced close syscall here, falling back to traced + syscall, but that's a bit more complicated and we're already on + the slow (and hopefully rare) path. 
*/ + privileged_traced_syscall1(SYS_close, ret); + return traced_raw_syscall(call); +} + +static struct check_open_state capture_check_open_state(void) { + struct check_open_state ret; + ret.did_abort = buffer_hdr()->abort_commit; + ret.did_fail_during_preparation = buffer_hdr()->failed_during_preparation; + return ret; +} + +#if defined(SYS_open) +static long sys_open(struct syscall_info* call) { + if (force_traced_syscall_for_chaos_mode()) { + /* Opening a FIFO could unblock a higher priority task */ + return traced_raw_syscall(call); + } + + const int syscallno = SYS_open; + const char* pathname = (const char*)call->args[0]; + int flags = call->args[1]; + __kernel_mode_t mode = call->args[2]; + void* ptr; + long ret; + + assert(syscallno == call->no); + + if (!supported_open(pathname, flags)) { + return traced_raw_syscall(call); + } + + ptr = prep_syscall(); + if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { + return traced_raw_syscall(call); + } + + ret = untraced_syscall3(syscallno, pathname, flags, mode); + struct check_open_state state = capture_check_open_state(); + ret = commit_raw_syscall(syscallno, ptr, ret); + return check_file_open_ok(call, ret, state); +} +#endif + +static long sys_openat(struct syscall_info* call) { + if (force_traced_syscall_for_chaos_mode()) { + /* Opening a FIFO could unblock a higher priority task */ + return traced_raw_syscall(call); + } + + const int syscallno = SYS_openat; + int dirfd = call->args[0]; + const char* pathname = (const char*)call->args[1]; + int flags = call->args[2]; + __kernel_mode_t mode = call->args[3]; + void* ptr; + long ret; + + assert(syscallno == call->no); + + if (!supported_open(pathname, flags)) { + return traced_raw_syscall(call); + } + + ptr = prep_syscall(); + if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) { + return traced_raw_syscall(call); + } + + ret = untraced_syscall4(syscallno, dirfd, pathname, flags, mode); + struct check_open_state state = 
capture_check_open_state(); + ret = commit_raw_syscall(syscallno, ptr, ret); + return check_file_open_ok(call, ret, state); +} + +#if defined(SYS_poll) || defined(SYS_ppoll) +/** + * Make this function external so desched_ticks.py can set a breakpoint on it. + * Make it visibility-"protected" so that our local definition binds to it + * directly and doesn't go through a PLT thunk (which would mean temporarily + * leaving syscallbuf code). + */ +__attribute__((visibility("protected"))) void __before_poll_syscall_breakpoint( + void) {} +#endif + +#if defined(SYS_poll) +static long sys_poll(struct syscall_info* call) { + const int syscallno = SYS_poll; + struct pollfd* fds = (struct pollfd*)call->args[0]; + unsigned int nfds = call->args[1]; + int timeout = call->args[2]; + + void* ptr = prep_syscall(); + struct pollfd* fds2 = NULL; + long ret; + + assert(syscallno == call->no); + + if (fds && nfds > 0) { + fds2 = ptr; + ptr += nfds * sizeof(*fds2); + } + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + if (fds2) { + memcpy_input_parameter(fds2, fds, nfds * sizeof(*fds2)); + } + + __before_poll_syscall_breakpoint(); + + /* Try a no-timeout version of the syscall first. If this doesn't return + anything, and we should have blocked, we'll try again with a traced syscall + which will be the one that blocks. This usually avoids the + need to trigger desched logic, which adds overhead, especially the + rrcall_notify_syscall_hook_exit that gets triggered. */ + ret = untraced_syscall3(syscallno, fds2, nfds, 0); + + if (fds2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) { + /* NB: even when poll returns 0 indicating no pending + * fds, it still sets each .revent outparam to 0. + * (Reasonably.) So we always need to copy on return + * value >= 0. + * It's important that we not copy when there's an error. 
+ * The syscallbuf commit might have been aborted, which means + * during replay fds2 might be non-recorded data, so we'd be + * incorrectly trashing 'fds'. */ + local_memcpy(fds, fds2, nfds * sizeof(*fds)); + } + commit_raw_syscall(syscallno, ptr, ret); + + if (ret != 0 || timeout == 0) { + return ret; + } + /* The syscall didn't return anything, and we should have blocked. + Just perform a raw syscall now since we're almost certain to block. */ + return traced_raw_syscall(call); +} +#endif + +#if defined(SYS_ppoll) +static long sys_ppoll(struct syscall_info* call) { + const int syscallno = SYS_ppoll; + struct pollfd* fds = (struct pollfd*)call->args[0]; + unsigned int nfds = call->args[1]; + const struct timespec *tmo_p = (struct timespec*)call->args[2]; + const kernel_sigset_t *sigmask = (const kernel_sigset_t*)call->args[3]; + size_t sigmask_size = call->args[4]; + + if (sigmask) { + // See ppoll_deliver. ppoll calls that temporarily change the + // sigmask are hard to handle; we may get a signal that we can't + // deliver later because it's blocked by the application. + return traced_raw_syscall(call); + } + + void* ptr = prep_syscall(); + struct pollfd* fds2 = NULL; + long ret; + + assert(syscallno == call->no); + + if (fds && nfds > 0) { + fds2 = ptr; + ptr += nfds * sizeof(*fds2); + } + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + if (fds2) { + memcpy_input_parameter(fds2, fds, nfds * sizeof(*fds2)); + } + + __before_poll_syscall_breakpoint(); + + /* Try a no-timeout version of the syscall first. If this doesn't return + anything, and we should have blocked, we'll try again with a traced syscall + which will be the one that blocks. This usually avoids the + need to trigger desched logic, which adds overhead, especially the + rrcall_notify_syscall_hook_exit that gets triggered. 
*/ + const struct timespec tmo0 = {0, 0}; + ret = untraced_syscall5(syscallno, fds2, nfds, &tmo0, sigmask, sigmask_size); + + if (fds2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) { + /* NB: even when poll returns 0 indicating no pending + * fds, it still sets each .revent outparam to 0. + * (Reasonably.) So we always need to copy on return + * value >= 0. + * It's important that we not copy when there's an error. + * The syscallbuf commit might have been aborted, which means + * during replay fds2 might be non-recorded data, so we'd be + * incorrectly trashing 'fds'. */ + local_memcpy(fds, fds2, nfds * sizeof(*fds)); + } + commit_raw_syscall(syscallno, ptr, ret); + + if (ret != 0 || (tmo_p && tmo_p->tv_sec == 0 && tmo_p->tv_nsec == 0)) { + return ret; + } + /* The syscall didn't return anything, and we should have blocked. + Just perform a raw syscall now since we're almost certain to block. */ + return traced_raw_syscall(call); +} +#endif + +static long sys_epoll_wait(struct syscall_info* call) { + int epfd = call->args[0]; + struct epoll_event* events = (struct epoll_event*)call->args[1]; + int max_events = call->args[2]; + int timeout = call->args[3]; + + void* ptr; + struct epoll_event* events2 = NULL; + long ret; + + ptr = prep_syscall(); + + assert(SYS_epoll_pwait == call->no +#if defined(SYS_epoll_wait) + || SYS_epoll_wait == call->no +#endif + ); + + if (events && max_events > 0) { + events2 = ptr; + ptr += max_events * sizeof(*events2); + } + if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + + /* Try a no-timeout version of the syscall first. If this doesn't return + anything, and we should have blocked, we'll try again with a traced syscall + which will be the one that blocks. This usually avoids the + need to trigger desched logic, which adds overhead, especially the + rrcall_notify_syscall_hook_exit that gets triggered. 
+ N.B.: SYS_epoll_wait only has four arguments, but we don't care + if the last two arguments are garbage */ + ret = untraced_syscall6(call->no, epfd, events2, max_events, 0, + call->args[4] /*sigmask*/, call->args[5] /*sizeof(*sigmask)*/); + + ptr = copy_output_buffer(ret * sizeof(*events2), ptr, events, events2); + ret = commit_raw_syscall(call->no, ptr, ret); + if (timeout == 0 || (ret != EINTR && ret != 0)) { + /* If we got some real results, or a non-EINTR error, we can just + return it directly. + If we got no results and the timeout was 0, we can just return 0. + If we got EINTR and the timeout was 0, a signal must have + interrupted the syscall (not sure if this can happen...). If the signal + needs to be handled, we'll handle it as we exit the syscallbuf. + Returning EINTR is fine because that's what the syscall would have + returned had it run traced. (We didn't enable the desched signal + so no extra signals could have affected our untraced syscall that + could not have been delivered to a traced syscall.) */ + return ret; + } + /* Some timeout was requested and either we got no results or we got + EINTR. + In the former case we just have to wait, so we do a traced syscall. + In the latter case, the syscall must have been interrupted by a + signal (which rr will have handled or stashed, and won't deliver until + we exit syscallbuf code or do a traced syscall). The kernel doesn't + automatically restart the syscall because of a longstanding bug (as of + 4.17 anyway). Doing a traced syscall will allow a stashed signal to be + processed (if necessary) and allow things to proceed normally after that. + Note that if rr decides to deliver a signal to the tracee, that will + itself interrupt the syscall and cause it to return EINTR just as + would happen without rr. 
+ */ + return traced_raw_syscall(call); +} + +struct timespec64 { + uint64_t tv_sec; + uint64_t tv_nsec; +}; + +#ifdef SYS_epoll_pwait2 +static long sys_epoll_pwait2(struct syscall_info* call) { + int epfd = call->args[0]; + struct epoll_event* events = (struct epoll_event*)call->args[1]; + int max_events = call->args[2]; + struct timespec64* timeout = (struct timespec64*)call->args[3]; + + void* ptr; + struct epoll_event* events2 = NULL; + long ret; + + ptr = prep_syscall(); + + assert(SYS_epoll_pwait2 == call->no); + + if (events && max_events > 0) { + events2 = ptr; + ptr += max_events * sizeof(*events2); + } + if (!start_commit_buffered_syscall(call->no, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + + /* Try a no-timeout version of the syscall first. If this doesn't return + anything, and we should have blocked, we'll try again with a traced syscall + which will be the one that blocks. This usually avoids the + need to trigger desched logic, which adds overhead, especially the + rrcall_notify_syscall_hook_exit that gets triggered. */ + struct timespec64 no_timeout = { 0, 0 }; + ret = untraced_syscall6(call->no, epfd, events2, max_events, &no_timeout, + call->args[4] /*sigmask*/, call->args[5] /*sizeof(*sigmask)*/); + + ptr = copy_output_buffer(ret * sizeof(*events2), ptr, events, events2); + ret = commit_raw_syscall(call->no, ptr, ret); + if ((timeout && timeout->tv_sec == 0 && timeout->tv_nsec == 0) || + (ret != EINTR && ret != 0)) { + /* If we got some real results, or a non-EINTR error, we can just + return it directly. + If we got no results and the timeout was 0, we can just return 0. + If we got EINTR and the timeout was 0, a signal must have + interrupted the syscall (not sure if this can happen...). If the signal + needs to be handled, we'll handle it as we exit the syscallbuf. + Returning EINTR is fine because that's what the syscall would have + returned had it run traced. 
   (We didn't enable the desched signal
   so no extra signals could have affected our untraced syscall that
   could not have been delivered to a traced syscall.) */
    return ret;
  }
  /* Some timeout was requested and either we got no results or we got
     EINTR.
     In the former case we just have to wait, so we do a traced syscall.
     In the latter case, the syscall must have been interrupted by a
     signal (which rr will have handled or stashed, and won't deliver until
     we exit syscallbuf code or do a traced syscall). The kernel doesn't
     automatically restart the syscall because of a longstanding bug (as of
     4.17 anyway). Doing a traced syscall will allow a stashed signal to be
     processed (if necessary) and allow things to proceed normally after that.
     Note that if rr decides to deliver a signal to the tracee, that will
     itself interrupt the syscall and cause it to return EINTR just as
     would happen without rr.
   */
  return traced_raw_syscall(call);
}
#endif

/* Reads at least this large (and page-aligned) attempt the btrfs
 * clone-range fast path below instead of copying bytes through the
 * syscall buffer. */
#define CLONE_SIZE_THRESHOLD 0x10000

/* Buffered wrapper for read(2).
 * Fast path: for large page-aligned reads from a bufferable fd, clone the
 * file range into cloned_file_data_fd and replay the read from that clone,
 * so the data bytes never have to be stored in the trace's syscall buffer.
 * Slow path: record the read output bytes into the syscall buffer. */
static long sys_read(struct syscall_info* call) {
  if (force_traced_syscall_for_chaos_mode()) {
    /* Reading from a pipe could unblock a higher priority task */
    return traced_raw_syscall(call);
  }

  const int syscallno = SYS_read;
  int fd = call->args[0];
  void* buf = (void*)call->args[1];
  size_t count = call->args[2];

  void* ptr;
  void* buf2 = NULL;
  long ret;

  /* Try cloning data using CLONE_RANGE ioctl.
   * XXX switch to FIOCLONERANGE when that's more widely available. It's the
   * same ioctl number so it won't affect rr per se but it'd be cleaner code.
   * 64-bit only for now, since lseek and pread64 need special handling for
   * 32-bit.
   * Basically we break down the read into three syscalls lseek, clone and
   * read-from-clone, each of which is individually syscall-buffered.
   * Crucially, the read-from-clone syscall does NOT store data in the syscall
   * buffer; instead, we perform the syscall during replay, assuming that
   * cloned_file_data_fd is open to the same file during replay.
   * Reads that hit EOF are rejected by the CLONE_RANGE ioctl so we take the
   * slow path. That's OK.
   * There is a possible race here: between cloning the data and reading from
   * |fd|, |fd|'s data may be overwritten, in which case the data read during
   * replay will not match the data read during recording, causing divergence.
   * I don't see any performant way to avoid this race; I tried reading from
   * the cloned data instead of |fd|, but that is very slow because readahead
   * doesn't work. (The cloned data file always ends at the current offset so
   * there is nothing to readahead.) However, if an application triggers this
   * race, it's almost certainly a bad bug because Linux can return any
   * interleaving of old+new data for the read even without rr.
   */
  if (buf && count >= CLONE_SIZE_THRESHOLD &&
      thread_locals->cloned_file_data_fd >= 0 && is_bufferable_fd(fd) &&
      sizeof(void*) == 8 && !(count & 4095)) {
    /* Find the current file offset; both offset and length must be
     * page-aligned for the clone ioctl to apply. */
    struct syscall_info lseek_call = { SYS_lseek,
                                       { fd, 0, SEEK_CUR, 0, 0, 0 } };
    off_t lseek_ret = privileged_sys_generic_nonblocking_fd(&lseek_call);
    if (lseek_ret >= 0 && !(lseek_ret & 4095)) {
      struct btrfs_ioctl_clone_range_args ioctl_args;
      int ioctl_ret;
      void* ioctl_ptr = prep_syscall();
      ioctl_args.src_fd = fd;
      ioctl_args.src_offset = lseek_ret;
      ioctl_args.src_length = count;
      ioctl_args.dest_offset = thread_locals->cloned_file_data_offset;

      /* Don't call sys_ioctl here; cloned_file_data_fd has syscall buffering
       * disabled for it so rr can reject attempts to close/dup to it. But
       * we want to allow syscall buffering of this ioctl on it.
       */
      if (!start_commit_buffered_syscall(SYS_ioctl, ioctl_ptr, WONT_BLOCK)) {
        struct syscall_info ioctl_call = { SYS_ioctl,
                                           { thread_locals->cloned_file_data_fd,
                                             BTRFS_IOC_CLONE_RANGE,
                                             (long)&ioctl_args, 0, 0, 0 } };
        ioctl_ret = privileged_traced_raw_syscall(&ioctl_call);
      } else {
        ioctl_ret =
            privileged_untraced_syscall3(SYS_ioctl,
                                         thread_locals->cloned_file_data_fd,
                                         BTRFS_IOC_CLONE_RANGE, &ioctl_args);
        ioctl_ret = commit_raw_syscall(SYS_ioctl, ioctl_ptr, ioctl_ret);
      }

      if (ioctl_ret >= 0) {
        struct syscall_info read_call = { SYS_read,
                                          { fd, (long)buf, count, 0, 0, 0 } };
        thread_locals->cloned_file_data_offset += count;

        /* During replay, redirect |fd| to the cloned-data file so the
         * replayed read below sees the recorded bytes. */
        replay_only_syscall3(SYS_dup3, thread_locals->cloned_file_data_fd, fd, 0);

        ptr = prep_syscall();
        if (count > thread_locals->usable_scratch_size) {
          if (!start_commit_buffered_syscall(SYS_read, ptr, WONT_BLOCK)) {
            return traced_raw_syscall(&read_call);
          }
          ret = untraced_replayed_syscall3(SYS_read, fd, buf, count);
        } else {
          if (!start_commit_buffered_syscall(SYS_read, ptr, MAY_BLOCK)) {
            return traced_raw_syscall(&read_call);
          }
          /* Read via scratch so a blocking read can't tear the caller's
           * buffer; copy out afterwards. */
          ret = untraced_replayed_syscall3(SYS_read, fd,
                                           thread_locals->scratch_buf, count);
          copy_output_buffer(ret, NULL, buf, thread_locals->scratch_buf);
        }
        // Do this now before we finish processing the syscallbuf record.
        // This means the syscall will be executed in
        // ReplaySession::flush_syscallbuf instead of
        // ReplaySession::enter_syscall or something similar.
        replay_only_syscall1(SYS_close, fd);
        ret = commit_raw_syscall(SYS_read, ptr, ret);
        return ret;
      }
    }
  }

  /* Slow path: buffer the output bytes in the syscall record. */
  ptr = prep_syscall_for_fd(fd);

  assert(syscallno == call->no);

  if (buf && count > 0) {
    buf2 = ptr;
    ptr += count;
  }
  if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
    return traced_raw_syscall(call);
  }

  ret = untraced_syscall3(syscallno, fd, buf2, count);
  ptr = copy_output_buffer(ret, ptr, buf, buf2);
  return commit_raw_syscall(syscallno, ptr, ret);
}

/* On x86-32, pread/pwrite take the offset in two registers. We don't bother
 * handling that.
 */
#if !defined(__i386__)
/* Buffered wrapper for pread64(2): record the read bytes into the
 * syscall buffer and copy them out to the caller. */
static long sys_pread64(struct syscall_info* call) {
  const int syscallno = SYS_pread64;
  int fd = call->args[0];
  void* buf = (void*)call->args[1];
  size_t count = call->args[2];
  off_t offset = call->args[3];

  void* ptr;
  void* buf2 = NULL;
  long ret;

  ptr = prep_syscall_for_fd(fd);

  assert(syscallno == call->no);

  if (buf && count > 0) {
    buf2 = ptr;
    ptr += count;
  }
  if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
    return traced_raw_syscall(call);
  }

  ret = untraced_syscall4(syscallno, fd, buf2, count, offset);
  ptr = copy_output_buffer(ret, ptr, buf, buf2);
  return commit_raw_syscall(syscallno, ptr, ret);
}
#endif

#if defined(SYS_readlink)
/* Buffered wrapper for readlink(2); the link target bytes are recorded
 * in the syscall buffer. Never blocks. */
static long sys_readlink(struct syscall_info* call) {
  const int syscallno = SYS_readlink;
  const char* path = (const char*)call->args[0];
  char* buf = (char*)call->args[1];
  int bufsiz = call->args[2];

  void* ptr = prep_syscall();
  char* buf2 = NULL;
  long ret;

  assert(syscallno == call->no);

  if (buf && bufsiz > 0) {
    buf2 = ptr;
    ptr += bufsiz;
  }
  if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
    return traced_raw_syscall(call);
  }

  ret = untraced_syscall3(syscallno, path, buf2, bufsiz);
  ptr = copy_output_buffer(ret, ptr, buf, buf2);
  return commit_raw_syscall(syscallno, ptr, ret);
}
#endif
/* Buffered wrapper for readlinkat(2).
 * |privileged| selects rr's privileged untraced/traced syscall entry points,
 * used when the preload library itself needs the call (as opposed to the
 * tracee application). */
static long sys_readlinkat(struct syscall_info* call, int privileged) {
  const int syscallno = SYS_readlinkat;
  int dirfd = call->args[0];
  const char* path = (const char*)call->args[1];
  char* buf = (char*)call->args[2];
  int bufsiz = call->args[3];

  void* ptr = prep_syscall();
  char* buf2 = NULL;
  long ret;

  assert(syscallno == call->no);

  if (buf && bufsiz > 0) {
    buf2 = ptr;
    ptr += bufsiz;
  }
  if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
    if (privileged) {
      return privileged_traced_raw_syscall(call);
    }
    return traced_raw_syscall(call);
  }

  if (privileged) {
    ret = privileged_untraced_syscall4(syscallno, dirfd, path, buf2, bufsiz);
  } else {
    ret = untraced_syscall4(syscallno, dirfd, path, buf2, bufsiz);
  }
  ptr = copy_output_buffer(ret, ptr, buf, buf2);
  return commit_raw_syscall(syscallno, ptr, ret);
}

#if defined(SYS_socketcall)
/* Buffered wrapper for the SYS_RECV sub-call of socketcall(2) (x86-32).
 * The recv args arrive packed in an array pointed to by call->args[1]. */
static long sys_socketcall_recv(struct syscall_info* call) {
  if (force_traced_syscall_for_chaos_mode()) {
    /* Reading from a socket could unblock a higher priority task */
    return traced_raw_syscall(call);
  }

  const int syscallno = SYS_socketcall;
  long* args = (long*)call->args[1];
  int sockfd = args[0];
  void* buf = (void*)args[1];
  size_t len = args[2];
  unsigned int flags = args[3];
  unsigned long new_args[4];

  void* ptr = prep_syscall_for_fd(sockfd);
  void* buf2 = NULL;
  long ret;

  assert(syscallno == call->no);

  if (buf && len > 0) {
    buf2 = ptr;
    ptr += len;
  }
  if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
    return traced_raw_syscall(call);
  }

  /* Rebuild the packed argument array pointing at our in-buffer output
   * area instead of the caller's buffer. */
  new_args[0] = sockfd;
  new_args[1] = (unsigned long)buf2;
  new_args[2] = len;
  new_args[3] = flags;
  ret = untraced_syscall2(SYS_socketcall, SYS_RECV, new_args);
  /* Account for MSG_TRUNC */
  ptr = copy_output_buffer(ret_buf_len(ret, len), ptr, buf, buf2);
  return commit_raw_syscall(syscallno, ptr, ret);
}

/* Dispatcher for socketcall(2): only SYS_RECV is buffered; everything
 * else falls through to a traced syscall. */
static long sys_socketcall(struct syscall_info* call) {
  switch (call->args[0]) {
    case SYS_RECV:
      return sys_socketcall_recv(call);
    default:
      return traced_raw_syscall(call);
  }
}
#endif

#ifdef SYS_recvfrom
/* Buffered wrapper for recvfrom(2). Records the payload, the source
 * address and the (value-result) address length in the syscall buffer. */
static long sys_recvfrom(struct syscall_info* call) {
  if (force_traced_syscall_for_chaos_mode()) {
    /* Reading from a socket could unblock a higher priority task */
    return traced_raw_syscall(call);
  }

  const int syscallno = SYS_recvfrom;
  int sockfd = call->args[0];
  void* buf = (void*)call->args[1];
  size_t len = call->args[2];
  int flags = call->args[3];
  /* struct sockaddr isn't useful here since some sockaddrs are bigger than
   * it. To avoid making false assumptions, treat the sockaddr parameter
   * as an untyped buffer.
   */
  void* src_addr = (void*)call->args[4];
  socklen_t* addrlen = (socklen_t*)call->args[5];

  void* ptr = prep_syscall_for_fd(sockfd);
  void* buf2 = NULL;
  struct sockaddr* src_addr2 = NULL;
  socklen_t* addrlen2 = NULL;
  long ret;

  assert(syscallno == call->no);
  /* If addrlen is NULL then src_addr must also be null */
  assert(addrlen || !src_addr);

  if (src_addr) {
    src_addr2 = ptr;
    ptr += *addrlen;
  }
  if (addrlen) {
    addrlen2 = ptr;
    ptr += sizeof(*addrlen);
  }
  if (buf && len > 0) {
    buf2 = ptr;
    ptr += len;
  }
  if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
    return traced_raw_syscall(call);
  }
  if (addrlen) {
    /* addrlen is value-result; seed the in-buffer copy with the input. */
    memcpy_input_parameter(addrlen2, addrlen, sizeof(*addrlen));
  }
  ret = untraced_syscall6(syscallno, sockfd, buf2, len, flags, src_addr2,
                          addrlen2);

  if (ret >= 0 && !buffer_hdr()->failed_during_preparation) {
    if (src_addr2) {
      /* The kernel may report a larger address than fits; copy out at
       * most the caller-provided capacity. */
      socklen_t actual_size = *addrlen2;
      if (actual_size > *addrlen) {
        actual_size = *addrlen;
      }
      local_memcpy(src_addr, src_addr2, actual_size);
    }
    if (addrlen2) {
      *addrlen = *addrlen2;
    }
  }
  ptr = copy_output_buffer(ret_buf_len(ret, len), ptr, buf, buf2);
  return commit_raw_syscall(syscallno, ptr, ret);
}
#endif

#ifdef SYS_recvmsg

/* These macros are
   from musl Copyright © 2005-2020 Rich Felker, et al. (MIT LICENSE) */
#define __CMSG_LEN(cmsg) (((cmsg)->cmsg_len + sizeof(long) - 1) & ~(long)(sizeof(long) - 1))
#define __CMSG_NEXT(cmsg) ((unsigned char *)(cmsg) + __CMSG_LEN(cmsg))
#define __MHDR_END(mhdr) ((unsigned char *)(mhdr)->msg_control + (mhdr)->msg_controllen)

#define CMSG_DATA(cmsg) ((unsigned char *) (((struct cmsghdr *)(cmsg)) + 1))
#define CMSG_NXTHDR(mhdr, cmsg) ((cmsg)->cmsg_len < sizeof (struct cmsghdr) || \
    (__CMSG_LEN(cmsg) + sizeof(struct cmsghdr) >= (unsigned long)(__MHDR_END(mhdr) - (unsigned char *)(cmsg))) \
    ? 0 : (struct cmsghdr *)__CMSG_NEXT(cmsg))
#define CMSG_FIRSTHDR(mhdr) ((size_t) (mhdr)->msg_controllen >= sizeof (struct cmsghdr) ? (struct cmsghdr *) (mhdr)->msg_control : (struct cmsghdr *) 0)

/* Kernel-ABI control-message header (we can't rely on libc headers here). */
struct cmsghdr {
  __kernel_size_t cmsg_len;
  int cmsg_level;
  int cmsg_type;
};

/* Kernel-ABI message header. */
struct msghdr /* struct user_msghdr in the kernel */ {
  void* msg_name;
  int msg_namelen;
  struct iovec* msg_iov;
  __kernel_size_t msg_iovlen;
  void* msg_control;
  __kernel_size_t msg_controllen;
  unsigned int msg_flags;
};

#define SCM_RIGHTS 0x01
#define SOL_PACKET 263

/* Returns nonzero iff |msg|'s control messages carry SCM_RIGHTS
 * (i.e. file descriptors were received over the socket). */
static int msg_received_file_descriptors(struct msghdr* msg) {
  struct cmsghdr* cmh;
  for (cmh = CMSG_FIRSTHDR(msg); cmh; cmh = CMSG_NXTHDR(msg, cmh)) {
    if (cmh->cmsg_level == SOL_SOCKET && cmh->cmsg_type == SCM_RIGHTS) {
      return 1;
    }
  }
  return 0;
}

/* Buffered wrapper for recvmsg(2). Builds a shadow msghdr/iovec/control
 * layout inside the syscall buffer, runs the untraced syscall against it,
 * then copies results back to the caller's structures. If fds were
 * received via SCM_RIGHTS, notifies rr via thread_locals->notify_control_msg. */
static long sys_recvmsg(struct syscall_info* call) {
  if (force_traced_syscall_for_chaos_mode()) {
    /* Reading from a socket could unblock a higher priority task */
    return traced_raw_syscall(call);
  }

  const int syscallno = SYS_recvmsg;
  int sockfd = call->args[0];
  struct msghdr* msg = (struct msghdr*)call->args[1];
  int flags = call->args[2];

  void* ptr = prep_syscall_for_fd(sockfd);
  long ret;
  struct msghdr* msg2;
  void* ptr_base = ptr;
  void* ptr_overwritten_end;
  void* ptr_bytes_start;
  void* ptr_end;
  size_t i;

  assert(syscallno == call->no);

  /* Compute final buffer size up front, before writing syscall inputs to the
   * buffer. Thus if we decide not to buffer this syscall, we bail out
   * before trying to write to a buffer that won't be recorded and may be
   * invalid (e.g. overflow).
   */
  ptr += sizeof(struct msghdr) + sizeof(struct iovec) * msg->msg_iovlen;
  if (msg->msg_name) {
    ptr += msg->msg_namelen;
  }
  if (msg->msg_control) {
    ptr += msg->msg_controllen;
  }
  for (i = 0; i < msg->msg_iovlen; ++i) {
    ptr += msg->msg_iov[i].iov_len;
  }
  if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
    return traced_raw_syscall(call);
  }

  /**
   * The kernel only writes to the struct msghdr, and the iov buffers. We must
   * not overwrite that data (except using memcpy_input_parameter) during
   * replay. For the rest of the data, the values we write here during replay
   * are guaranteed to match what was recorded in the buffer.
   * We can't rely on the values we wrote here during recording also being
   * here during replay since the syscall might have been aborted and our
   * written data not recorded.
   */
  msg2 = ptr = ptr_base;
  memcpy_input_parameter(msg2, msg, sizeof(*msg));
  ptr += sizeof(struct msghdr);
  msg2->msg_iov = ptr;
  ptr += sizeof(struct iovec) * msg->msg_iovlen;
  ptr_overwritten_end = ptr;
  if (msg->msg_name) {
    msg2->msg_name = ptr;
    ptr += msg->msg_namelen;
  }
  if (msg->msg_control) {
    msg2->msg_control = ptr;
    ptr += msg->msg_controllen;
  }
  ptr_bytes_start = ptr;
  for (i = 0; i < msg->msg_iovlen; ++i) {
    msg2->msg_iov[i].iov_base = ptr;
    ptr += msg->msg_iov[i].iov_len;
    msg2->msg_iov[i].iov_len = msg->msg_iov[i].iov_len;
  }

  ret = untraced_syscall3(syscallno, sockfd, msg2, flags);

  if (ret >= 0 && !buffer_hdr()->failed_during_preparation) {
    size_t bytes = ret;
    size_t i;
    if (msg->msg_name) {
      local_memcpy(msg->msg_name, msg2->msg_name, msg2->msg_namelen);
    }
    msg->msg_namelen = msg2->msg_namelen;
    if (msg->msg_control) {
      local_memcpy(msg->msg_control, msg2->msg_control, msg2->msg_controllen);
    }
    msg->msg_controllen = msg2->msg_controllen;
    ptr_end = ptr_bytes_start + bytes;
    /* Distribute the received bytes across the caller's iovecs in order. */
    for (i = 0; i < msg->msg_iovlen; ++i) {
      long copy_bytes =
          bytes < msg->msg_iov[i].iov_len ? bytes : msg->msg_iov[i].iov_len;
      local_memcpy(msg->msg_iov[i].iov_base, msg2->msg_iov[i].iov_base,
                   copy_bytes);
      bytes -= copy_bytes;
    }
    msg->msg_flags = msg2->msg_flags;

    if (msg_received_file_descriptors(msg)) {
      /* When we reach a safe point, notify rr that the control message with
       * file descriptors was received.
       */
      thread_locals->notify_control_msg = msg;
    }
  } else {
    /* Allocate record space as least to cover the data we overwrote above.
     * We don't want to start the next record overlapping that data, since then
     * we'll corrupt it during replay.
     */
    ptr_end = ptr_overwritten_end;
  }
  return commit_raw_syscall(syscallno, ptr_end, ret);
}
#endif

#ifdef SYS_sendmsg
/* Buffered wrapper for sendmsg(2). Input-only: nothing needs to be copied
 * back, so the caller's msghdr is passed straight to the untraced syscall. */
static long sys_sendmsg(struct syscall_info* call) {
  if (force_traced_syscall_for_chaos_mode()) {
    /* Sending to a socket could unblock a higher priority task */
    return traced_raw_syscall(call);
  }

  const int syscallno = SYS_sendmsg;
  int sockfd = call->args[0];
  struct msghdr* msg = (struct msghdr*)call->args[1];
  int flags = call->args[2];

  void* ptr = prep_syscall_for_fd(sockfd);
  long ret;

  assert(syscallno == call->no);

  if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
    return traced_raw_syscall(call);
  }

  ret = untraced_syscall3(syscallno, sockfd, msg, flags);

  return commit_raw_syscall(syscallno, ptr, ret);
}
#endif

#ifdef SYS_sendto
/* Buffered wrapper for sendto(2). Input-only, like sys_sendmsg. */
static long sys_sendto(struct syscall_info* call) {
  if (force_traced_syscall_for_chaos_mode()) {
    /* Sending to a socket could unblock a higher priority task */
    return traced_raw_syscall(call);
  }

  const int syscallno = SYS_sendto;
  int sockfd = call->args[0];
  void* buf = (void*)call->args[1];
  size_t len = call->args[2];
  int flags = call->args[3];
  const struct sockaddr* dest_addr = (const struct sockaddr*)call->args[4];
  socklen_t addrlen = call->args[5];

  void* ptr = prep_syscall_for_fd(sockfd);
  long ret;

  assert(syscallno == call->no);

  if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
    return traced_raw_syscall(call);
  }

  ret =
      untraced_syscall6(syscallno, sockfd, buf, len, flags, dest_addr, addrlen);

  return commit_raw_syscall(syscallno, ptr, ret);
}
#endif

#ifdef SYS_setsockopt
/* Buffered wrapper for setsockopt(2). Packet/netlink ring setups are
 * routed to rr so it can intercept (and probably disable) them. */
static long sys_setsockopt(struct syscall_info* call) {
  const int syscallno = SYS_setsockopt;
  int sockfd = call->args[0];
  int level = call->args[1];
  int optname = call->args[2];
  void* optval = (void*)call->args[3];
  socklen_t optlen = (socklen_t)call->args[4];

  if (level == SOL_PACKET &&
      (optname
       == PACKET_RX_RING || optname == PACKET_TX_RING)) {
    // Let rr intercept this (and probably disable it)
    return traced_raw_syscall(call);
  }
  if (level == SOL_NETLINK &&
      (optname == NETLINK_RX_RING || optname == NETLINK_TX_RING)) {
    // Let rr intercept this (and probably disable it)
    return traced_raw_syscall(call);
  }

  void* ptr = prep_syscall_for_fd(sockfd);
  long ret;

  assert(syscallno == call->no);

  if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
    return traced_raw_syscall(call);
  }

  ret = untraced_syscall5(syscallno, sockfd, level, optname, optval, optlen);

  return commit_raw_syscall(syscallno, ptr, ret);
}
#endif

#ifdef SYS_getsockopt
/* Buffered wrapper for getsockopt(2). Both optval and optlen are
 * value-result, so input copies are seeded into the buffer and the
 * results copied back, clamped to the caller's capacity. */
static long sys_getsockopt(struct syscall_info* call) {
  const int syscallno = SYS_getsockopt;
  int sockfd = call->args[0];
  int level = call->args[1];
  int optname = call->args[2];
  void* optval = (void*)call->args[3];
  socklen_t* optlen = (socklen_t*)call->args[4];
  socklen_t* optlen2;
  void* optval2;

  if (!optlen || !optval) {
    return traced_raw_syscall(call);
  }

  void* ptr = prep_syscall_for_fd(sockfd);
  long ret;

  optlen2 = ptr;
  ptr += sizeof(*optlen2);
  optval2 = ptr;
  ptr += *optlen;

  assert(syscallno == call->no);

  if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
    return traced_raw_syscall(call);
  }

  memcpy_input_parameter(optlen2, optlen, sizeof(*optlen2));
  // Some variance of getsockopt does use the initial content of *optval
  // (e.g. SOL_IP + IPT_SO_GET_INFO) so we need to copy it.
  memcpy_input_parameter(optval2, optval, *optlen);

  // We may need to manually restart this syscall due to kernel bug
  // returning a EFAULT when interrupted by signal and we won't have
  // access to the actual arg1 on aarch64 in a normal way in such case.
  // Pass in the arg1 in the stack argument so that we can use it in the tracer.
  ret = untraced_syscall_full(syscallno, sockfd, level, optname,
                              (long)optval2, (long)optlen2, 0,
                              RR_PAGE_SYSCALL_UNTRACED_RECORDING_ONLY, sockfd, 0);

  if (ret >= 0) {
    /* Copy back no more than the smaller of the caller's capacity and
     * the kernel-reported length. */
    socklen_t val_len = *optlen < *optlen2 ? *optlen : *optlen2;
    local_memcpy(optval, optval2, val_len);
    local_memcpy(optlen, optlen2, sizeof(*optlen));
  }

  return commit_raw_syscall(syscallno, ptr, ret);
}
#endif

#ifdef SYS_getsockname
/* Buffered wrapper for getsockname(2); addrlen is value-result. */
static long sys_getsockname(struct syscall_info* call) {
  const int syscallno = SYS_getsockname;
  int sockfd = call->args[0];
  struct sockaddr* addr = (struct sockaddr*)call->args[1];
  socklen_t* addrlen = (socklen_t*)call->args[2];
  socklen_t* addrlen2;
  struct sockaddr* addr2 = NULL;

  void* ptr = prep_syscall_for_fd(sockfd);
  long ret;

  addrlen2 = ptr;
  ptr += sizeof(*addrlen2);
  if (addr) {
    addr2 = ptr;
    ptr += *addrlen;
  }

  assert(syscallno == call->no);

  /* NOTE(review): addrlen2 is always non-NULL here (assigned above), so
   * this guard is vacuous; also the input copy happens before
   * start_commit_buffered_syscall, unlike e.g. sys_recvfrom — presumably
   * harmless since a failed start falls back to the traced path, but
   * worth confirming against the other wrappers' ordering. */
  if (addrlen2) {
    memcpy_input_parameter(addrlen2, addrlen, sizeof(*addrlen2));
  }

  if (!start_commit_buffered_syscall(syscallno, ptr, MAY_BLOCK)) {
    return traced_raw_syscall(call);
  }

  ret = untraced_syscall3(syscallno, sockfd, addr2, addrlen2);

  if (ret >= 0) {
    if (addr) {
      socklen_t addr_len = *addrlen < *addrlen2 ?
*addrlen : *addrlen2; + local_memcpy(addr, addr2, addr_len); + } + local_memcpy(addrlen, addrlen2, sizeof(*addrlen)); + } + + return commit_raw_syscall(syscallno, ptr, ret); +} +#endif + +#ifdef SYS_socketpair +typedef int two_ints[2]; +static long sys_socketpair(struct syscall_info* call) { + const int syscallno = SYS_socketpair; + int domain = call->args[0]; + int type = call->args[1]; + int protocol = call->args[2]; + two_ints* sv = (two_ints*)call->args[3]; + + void* ptr = prep_syscall(); + struct timezone* sv2 = NULL; + long ret; + + assert(syscallno == call->no); + + sv2 = ptr; + ptr += sizeof(*sv2); + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + ret = untraced_syscall4(syscallno, domain, type, protocol, sv2); + if (ret >= 0 && !buffer_hdr()->failed_during_preparation) { + local_memcpy(sv, sv2, sizeof(*sv)); + } + return commit_raw_syscall(syscallno, ptr, ret); +} +#endif + +static long sys_uname(struct syscall_info* call) { + const int syscallno = SYS_uname; + void* buf = (void*)call->args[0]; + + void* ptr = prep_syscall(); + void* buf2; + long ret; + size_t bufsize = sizeof(struct new_utsname); + + assert(syscallno == call->no); + + buf2 = ptr; + ptr += bufsize; + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + ret = untraced_syscall1(syscallno, buf2); + if (ret >= 0 && !buffer_hdr()->failed_during_preparation) { + local_memcpy(buf, buf2, bufsize); + } + return commit_raw_syscall(syscallno, ptr, ret); +} + + +#if defined(SYS_time) +static long sys_time(struct syscall_info* call) { + const int syscallno = SYS_time; + __kernel_time_t* tp = (__kernel_time_t*)call->args[0]; + + void* ptr = prep_syscall(); + long ret; + + assert(syscallno == call->no); + + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + ret = untraced_syscall1(syscallno, NULL); + if (tp) { + /* No error is possible 
       here. */
    *tp = ret;
  }
  return commit_raw_syscall(syscallno, ptr, ret);
}
#endif

/* On x86-32 the 64-bit stat ABI is struct stat64; elsewhere plain
 * struct stat already has 64-bit fields. */
#if defined(__i386__)
typedef struct stat64 stat64_t;
#else
typedef struct stat stat64_t;
#endif
/* Generic buffered wrapper for the stat/fstat/lstat family: arg0 is
 * either a path or an fd, arg1 the output stat buffer. */
static long sys_xstat64(struct syscall_info* call) {
  const int syscallno = call->no;
  /* NB: this arg may be a string or an fd, but for the purposes
   * of this generic helper we don't care. */
  long what = call->args[0];
  stat64_t* buf = (stat64_t*)call->args[1];

  /* Like open(), not arming the desched event because it's not
   * needed for correctness, and there are no data to suggest
   * whether it's a good idea perf-wise. */
  void* ptr = prep_syscall();
  stat64_t* buf2 = NULL;
  long ret;

  if (buf) {
    buf2 = ptr;
    ptr += sizeof(*buf2);
  }
  if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
    return traced_raw_syscall(call);
  }
  ret = untraced_syscall2(syscallno, what, buf2);
  if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
    local_memcpy(buf, buf2, sizeof(*buf));
  }
  return commit_raw_syscall(syscallno, ptr, ret);
}

#ifdef SYS_statx
/* Like sys_xstat64, but with different arguments */
static long sys_statx(struct syscall_info* call) {
  const int syscallno = call->no;
  struct statx* buf = (struct statx*)call->args[4];

  void* ptr = prep_syscall();
  struct statx* buf2 = NULL;
  long ret;

  if (buf) {
    buf2 = ptr;
    ptr += sizeof(*buf2);
  }
  if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
    return traced_raw_syscall(call);
  }
  ret = untraced_syscall5(syscallno,
                          call->args[0], call->args[1], call->args[2], call->args[3],
                          buf2);
  if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
    local_memcpy(buf, buf2, sizeof(*buf));
  }
  return commit_raw_syscall(syscallno, ptr, ret);
}
#endif

/* Buffered wrapper for fstatat(2)/newfstatat(2); the stat buffer is
 * call->args[2]. */
static long sys_fstatat(struct syscall_info* call) {
  const int syscallno = call->no;
  stat64_t* buf = (stat64_t*)call->args[2];

  /* Like stat(), not arming the desched event because it's not
   * needed for correctness, and there are no data to suggest
   * whether it's a good idea perf-wise. */
  void* ptr = prep_syscall();
  stat64_t* buf2 = NULL;
  long ret;

  if (buf) {
    buf2 = ptr;
    ptr += sizeof(*buf2);
  }

  if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
    return traced_raw_syscall(call);
  }
  ret = untraced_syscall4(syscallno,
                          call->args[0], call->args[1], buf2, call->args[3]);
  if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
    local_memcpy(buf, buf2, sizeof(*buf));
  }
  return commit_raw_syscall(syscallno, ptr, ret);
}

/* Buffered wrapper for quotactl(2), restricted to Q_GETQUOTA (the only
 * subcommand whose output shape we handle: a struct if_dqblk). */
static long sys_quotactl(struct syscall_info* call) {
  const int syscallno = call->no;
  int cmd = call->args[0];
  const char* special = (const char*)call->args[1];
  int id = call->args[2];
  void* addr = (void*)call->args[3];

  if ((cmd >> SUBCMDSHIFT) != Q_GETQUOTA) {
    return traced_raw_syscall(call);
  }

  void* ptr = prep_syscall();
  struct if_dqblk* buf2 = NULL;
  long ret;

  if (addr) {
    buf2 = ptr;
    ptr += sizeof(*buf2);
  }
  if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
    return traced_raw_syscall(call);
  }
  ret = untraced_syscall4(syscallno, cmd, special, id, buf2);
  if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
    local_memcpy(addr, buf2, sizeof(*buf2));
  }
  return commit_raw_syscall(syscallno, ptr, ret);
}

/* Generic buffered wrapper for statfs(2)/fstatfs(2): arg0 is a path or
 * an fd, arg1 the output struct statfs. */
static long sys_statfs(struct syscall_info* call) {
  const int syscallno = call->no;
  /* NB: this arg may be a string or an fd, but for the purposes
   * of this generic helper we don't care. */
  long what = call->args[0];
  struct statfs* buf = (struct statfs*)call->args[1];

  /* Like open(), not arming the desched event because it's not
   * needed for correctness, and there are no data to suggest
   * whether it's a good idea perf-wise.
   */
  void* ptr = prep_syscall();
  struct statfs* buf2 = NULL;
  long ret;

  if (buf) {
    buf2 = ptr;
    ptr += sizeof(*buf2);
  }
  if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
    return traced_raw_syscall(call);
  }
  ret = untraced_syscall2(syscallno, what, buf2);
  if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
    local_memcpy(buf, buf2, sizeof(*buf));
  }
  return commit_raw_syscall(syscallno, ptr, ret);
}

/* Buffered wrapper for write(2). Input-only; blocking behavior depends
 * on the fd's class (fd_write_blocks). */
static long sys_write(struct syscall_info* call) {
  if (force_traced_syscall_for_chaos_mode()) {
    /* Writing to a pipe or FIFO could unblock a higher priority task */
    return traced_raw_syscall(call);
  }

  const int syscallno = SYS_write;
  int fd = call->args[0];
  const void* buf = (const void*)call->args[1];
  size_t count = call->args[2];

  void* ptr = prep_syscall_for_fd(fd);
  long ret;

  assert(syscallno == call->no);

  if (!start_commit_buffered_syscall(syscallno, ptr, fd_write_blocks(fd))) {
    return traced_raw_syscall(call);
  }

  ret = untraced_syscall3(syscallno, fd, buf, count);

  return commit_raw_syscall(syscallno, ptr, ret);
}

/* On x86-32, pread/pwrite take the offset in two registers. We don't bother
 * handling that.
+ */ +#if !defined(__i386__) +static long sys_pwrite64(struct syscall_info* call) { + const int syscallno = SYS_pwrite64; + int fd = call->args[0]; + const void* buf = (const void*)call->args[1]; + size_t count = call->args[2]; + off_t offset = call->args[3]; + + enum syscallbuf_fd_classes cls = fd_class(fd); + if (cls == FD_CLASS_TRACED) { + return traced_raw_syscall(call); + } + void* ptr = prep_syscall(); + assert(syscallno == call->no); + + if (!start_commit_buffered_syscall(syscallno, ptr, fd_write_blocks(fd))) { + return traced_raw_syscall(call); + } + + long ret; + if (cls == FD_CLASS_PROC_MEM) { + ret = untraced_replay_assist_syscall4(syscallno, fd, buf, count, offset); + } else { + ret = untraced_syscall4(syscallno, fd, buf, count, offset); + } + + return commit_raw_syscall(syscallno, ptr, ret); +} +#endif + +static long sys_writev(struct syscall_info* call) { + if (force_traced_syscall_for_chaos_mode()) { + /* Writing to a pipe or FIFO could unblock a higher priority task */ + return traced_raw_syscall(call); + } + + int syscallno = SYS_writev; + int fd = call->args[0]; + const struct iovec* iov = (const struct iovec*)call->args[1]; + unsigned long iovcnt = call->args[2]; + + void* ptr = prep_syscall_for_fd(fd); + long ret; + + assert(syscallno == call->no); + + if (!start_commit_buffered_syscall(syscallno, ptr, fd_write_blocks(fd))) { + return traced_raw_syscall(call); + } + + ret = untraced_syscall3(syscallno, fd, iov, iovcnt); + + return commit_raw_syscall(syscallno, ptr, ret); +} + +static long sys_prctl(struct syscall_info* call) { + int syscallno = SYS_prctl; + long option = call->args[0]; + unsigned long arg2 = call->args[1]; + unsigned long arg3 = call->args[2]; + unsigned long arg4 = call->args[3]; + unsigned long arg5 = call->args[4]; + + if (option != PR_SET_NAME) { + return traced_raw_syscall(call); + } + + void* ptr = prep_syscall(); + long ret; + + assert(syscallno == call->no); + + if (!start_commit_buffered_syscall(syscallno, ptr, 
WONT_BLOCK)) { + return traced_raw_syscall(call); + } + + ret = untraced_replay_assist_syscall5(syscallno, option, arg2, arg3, arg4, arg5); + return commit_raw_syscall(syscallno, ptr, ret); +} + +static long sys_set_robust_list(struct syscall_info* call) { + int syscallno = SYS_set_robust_list; + void* head = (void*)call->args[0]; + size_t len = call->args[1]; + long ret; + + assert(syscallno == call->no); + + /* Avoid len values we don't support via our buffering mechanism */ + if (len == 0 || len >= UINT32_MAX) { + return traced_raw_syscall(call); + } + + void* ptr = prep_syscall(); + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + + ret = untraced_syscall2(syscallno, head, len); + if (!ret) { + thread_locals->robust_list.head = head; + thread_locals->robust_list.len = len; + } + return commit_raw_syscall(syscallno, ptr, ret); +} + +#if defined(SYS_rseq) +static long sys_rseq(struct syscall_info* call) { + int syscallno = SYS_rseq; + struct rr_rseq* rseq = (struct rr_rseq*)call->args[0]; + size_t rseq_len = call->args[1]; + int flags = call->args[2]; + uint32_t sig = call->args[3]; + + assert(syscallno == call->no); + + if (flags || ((uintptr_t)rseq & 31) || rseq_len != sizeof(*rseq) || + thread_locals->rseq_called || globals.cpu_binding < 0) { + return traced_raw_syscall(call); + } + + void* ptr = prep_syscall(); + /* Allow buffering only for the simplest case: setting up the + initial rseq, all parameters OK and CPU binding in place. */ + if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) { + return traced_raw_syscall(call); + } + + /* We don't actually need to make a syscall since rr is + going to emulate everything. 
   */
  rseq->cpu_id_start = rseq->cpu_id = globals.cpu_binding;
  thread_locals->rseq_called = 1;
  thread_locals->rseq.rseq = rseq;
  thread_locals->rseq.len = rseq_len;
  thread_locals->rseq.sig = sig;
  /* We do need to commit a syscallbuf record to ensure that flushing
     happens with associated processing. */
  return commit_raw_syscall(syscallno, ptr, 0);
}
#endif

/* Buffered wrapper for ptrace(2); only PTRACE_PEEKDATA is handled, and
 * it is emulated via process_vm_readv. */
static long sys_ptrace(struct syscall_info* call) {
  int syscallno = SYS_ptrace;
  long request = call->args[0];
  pid_t pid = call->args[1];
  void* addr = (void*)call->args[2];
  void* data = (void*)call->args[3];

  if (request != PTRACE_PEEKDATA || !data) {
    return traced_raw_syscall(call);
  }

  /* We try to emulate PTRACE_PEEKDATA using process_vm_readv. That might not
   * work for permissions reasons; if it fails for any reason, we retry with
   * a traced syscall.
   * This does mean that if a process issues a PTRACE_PEEKDATA while not
   * actually ptracing the target, it might succeed under rr whereas normally
   * it would have failed. That's hard to avoid and unlikely to be a real
   * problem in practice (typically it would fail on some other ptrace call like
   * PTRACE_GETREGS before or after the PEEKDATA).
   */
  void* ptr = prep_syscall();
  long ret;
  void* data2;

  assert(syscallno == call->no);
  /* From here on we actually issue process_vm_readv, not ptrace. */
  syscallno = SYS_process_vm_readv;

  data2 = ptr;
  ptr += sizeof(long);

  if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
    return traced_raw_syscall(call);
  }

  struct iovec local_iov = { data2, sizeof(long) };
  struct iovec remote_iov = { addr, sizeof(long) };
  ret = untraced_syscall6(syscallno, pid, &local_iov, 1, &remote_iov, 1, 0);
  if (ret > 0 && !buffer_hdr()->failed_during_preparation) {
    local_memcpy(data, data2, ret);
  }
  commit_raw_syscall(syscallno, ptr, ret);

  /* A short read means the emulation didn't work; fall back to a real,
   * traced ptrace call. */
  if (ret != sizeof(long)) {
    return traced_raw_syscall(call);
  }
  return ret;
}

/* Buffered wrapper for getrusage(2); records the struct rusage result. */
static long sys_getrusage(struct syscall_info* call) {
  const int syscallno = SYS_getrusage;
  int who = (int)call->args[0];
  struct rusage* buf = (struct rusage*)call->args[1];
  void* ptr = prep_syscall();
  long ret;
  struct rusage* buf2 = NULL;

  assert(syscallno == call->no);

  if (buf) {
    buf2 = ptr;
    ptr += sizeof(struct rusage);
  }
  if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
    return traced_raw_syscall(call);
  }

  ret = untraced_syscall2(syscallno, who, buf2);
  if (buf2 && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
    local_memcpy(buf, buf2, sizeof(*buf));
  }
  return commit_raw_syscall(syscallno, ptr, ret);
}

/* Buffered wrapper for rt_sigprocmask(2). Masks out rr's own signals
 * from block requests and keeps the syscallbuf header's view of the
 * blocked set in sync with the kernel's. */
static long sys_rt_sigprocmask(struct syscall_info* call) {
  const int syscallno = SYS_rt_sigprocmask;
  long ret;
  kernel_sigset_t modified_set;
  void* oldset2;
  struct syscallbuf_hdr* hdr;

  if (call->args[3] != sizeof(kernel_sigset_t)) {
    // Unusual sigset size. Bail.
    return traced_raw_syscall(call);
  }

  void* ptr = prep_syscall();

  int how = (int)call->args[0];
  const kernel_sigset_t* set = (const kernel_sigset_t*)call->args[1];
  kernel_sigset_t* oldset = (kernel_sigset_t*)call->args[2];

  oldset2 = ptr;
  ptr += sizeof(kernel_sigset_t);

  if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
    return traced_raw_syscall(call);
  }

  if (set && (how == SIG_BLOCK || how == SIG_SETMASK)) {
    /* Never let the application block the signals rr itself relies on. */
    local_memcpy(&modified_set, set, sizeof(kernel_sigset_t));
    // SIGSTKFLT (PerfCounters::TIME_SLICE_SIGNAL) and
    // SIGPWR(SYSCALLBUF_DESCHED_SIGNAL) are used by rr
    modified_set &=
        ~(((uint64_t)1) << (SIGSTKFLT - 1)) &
        ~(((uint64_t)1) << (globals.desched_sig - 1));
    set = &modified_set;
  }

  hdr = buffer_hdr();
  /* Tell rr we're between issuing the mask change and recording it. */
  hdr->in_sigprocmask_critical_section = 1;

  ret =
      untraced_syscall4(syscallno, how, set, oldset2, sizeof(kernel_sigset_t));
  if (ret >= 0 && !buffer_hdr()->failed_during_preparation) {
    if (oldset) {
      local_memcpy(oldset, oldset2, sizeof(kernel_sigset_t));
    }
    if (set) {
      /* Recompute the new blocked set from the old set plus |how|, and
       * publish it in the header for rr. */
      kernel_sigset_t previous_set;
      local_memcpy(&previous_set, oldset2, sizeof(kernel_sigset_t));
      switch (how) {
        case SIG_UNBLOCK:
          previous_set &= ~*set;
          break;
        case SIG_BLOCK:
          previous_set |= *set;
          break;
        case SIG_SETMASK:
          previous_set = *set;
          break;
      }
      hdr->blocked_sigs = previous_set;
      // We must update the generation last to ensure that an update is not
      // lost.
      ++hdr->blocked_sigs_generation;
    }
  }
  hdr->in_sigprocmask_critical_section = 0;

  commit_raw_syscall(syscallno, ptr, ret);

  if (ret == -EAGAIN) {
    // The rr supervisor emulated EAGAIN because there was a pending signal.
    // Retry using a traced syscall so the pending signal(s) can be delivered.
    return traced_raw_syscall(call);
  }
  return ret;
}

/* Buffered wrapper for sigaltstack(2); records the old stack_t if
 * requested. */
static long sys_sigaltstack(struct syscall_info* call) {
  const int syscallno = SYS_sigaltstack;
  stack_t* ss = (void*)call->args[0];
  stack_t* old_ss = (void*)call->args[1];

  void* ptr = prep_syscall();
  stack_t* old_ss2 = NULL;
  long ret;

  assert(syscallno == call->no);

  if (old_ss) {
    old_ss2 = ptr;
    ptr += sizeof(*old_ss2);
  }
  if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
    return traced_raw_syscall(call);
  }
  ret = untraced_syscall2(syscallno, ss, old_ss2);
  if (old_ss && ret >= 0 && !buffer_hdr()->failed_during_preparation) {
    /* This is small and won't get optimized to a memcpy call outside
       our library. */
    *old_ss = *old_ss2;
  }
  return commit_raw_syscall(syscallno, ptr, ret);
}

/* Handler for rr's synthetic "rdtsc" syscall: executes RDTSC in the
 * tracee (recording only) and records the 64-bit counter in the buffer. */
static long sys_rrcall_rdtsc(struct syscall_info* call) {
#if defined(__i386__) || defined(__x86_64__)
  const int syscallno = SYS_rrcall_rdtsc;
  uint32_t tsc[2];
  void* ptr = prep_syscall();
  void* buf = ptr;
  ptr += 8;
  if (!start_commit_buffered_syscall(syscallno, ptr, WONT_BLOCK)) {
    return traced_raw_syscall(call);
  }

  // Do an RDTSC without context-switching to rr. This is still a lot slower
  // than a plain RDTSC. Maybe we could do something better with RDPMC...
  privileged_unrecorded_syscall5(SYS_prctl, PR_SET_TSC, PR_TSC_ENABLE, 0, 0, 0);
  rdtsc_recording_only(buf);
  privileged_unrecorded_syscall5(SYS_prctl, PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);

  local_memcpy(tsc, buf, sizeof(tsc));
  // Overwrite RDX (syscall arg 3) with our TSC value.
  call->args[2] = tsc[1];
  return commit_raw_syscall(syscallno, ptr, tsc[0]);
#else
  (void)call;
  fatal("RDTSC not supported in this architecture");
  return 0;
#endif
}

/* Central dispatcher: routes a bufferable syscall to its wrapper above.
 * (Continues beyond this chunk.) */
static long syscall_hook_internal(struct syscall_info* call) {
  switch (call->no) {
#define CASE(syscallname)                                                      \
  case SYS_##syscallname:                                                      \
    return sys_##syscallname(call)
#define CASE_GENERIC_NONBLOCKING(syscallname)                                  \
  case SYS_##syscallname:                                                      \
    return sys_generic_nonblocking(call)
#define CASE_GENERIC_NONBLOCKING_FD(syscallname)                               \
  case SYS_##syscallname:                                                      \
    return sys_generic_nonblocking_fd(call)
    CASE(rrcall_rdtsc);
#if defined(SYS_access)
    CASE_GENERIC_NONBLOCKING(access);
#endif
    CASE(clock_gettime);
#if defined(SYS_clock_gettime64)
    CASE(clock_gettime64);
#endif
    CASE_GENERIC_NONBLOCKING_FD(close);
#if defined(SYS_creat)
    CASE(creat);
#endif
    CASE_GENERIC_NONBLOCKING_FD(dup);
#if defined(SYS_epoll_wait)
    case SYS_epoll_wait:
#endif
    case SYS_epoll_pwait:
      return sys_epoll_wait(call);
#if defined(SYS_epoll_pwait2)
    CASE(epoll_pwait2);
#endif
    CASE_GENERIC_NONBLOCKING_FD(fadvise64);
    CASE_GENERIC_NONBLOCKING(fchmod);
#if defined(SYS_fcntl64)
    CASE(fcntl64);
#else
    CASE(fcntl);
#endif
    CASE(fgetxattr);
    CASE(flistxattr);
    CASE_GENERIC_NONBLOCKING_FD(fsetxattr);
    CASE_GENERIC_NONBLOCKING_FD(ftruncate);
    CASE(futex);
#if defined(SYS_getdents)
    CASE(getdents);
#endif
    CASE(getdents64);
    CASE_GENERIC_NONBLOCKING(getegid);
    CASE_GENERIC_NONBLOCKING(geteuid);
    CASE_GENERIC_NONBLOCKING(getgid);
    CASE_GENERIC_NONBLOCKING(getpid);
    CASE_GENERIC_NONBLOCKING(getppid);
    CASE(getrandom);
    CASE(getrusage);
    CASE_GENERIC_NONBLOCKING(gettid);
    CASE(gettimeofday);
    CASE_GENERIC_NONBLOCKING(getuid);
    CASE(getxattr);
    CASE(ioctl);
#if defined(lchown)
    CASE_GENERIC_NONBLOCKING(lchown);
#endif
    CASE(lgetxattr);
    CASE(listxattr);
    CASE(llistxattr);
#if defined(SYS__llseek)
    CASE(_llseek);
#endif
    CASE_GENERIC_NONBLOCKING_FD(lseek);
CASE(madvise); +#if defined(SYS_mkdir) + CASE_GENERIC_NONBLOCKING(mkdir); +#endif +#if defined(SYS_mkdor) + CASE_GENERIC_NONBLOCKING(mknod); +#endif + CASE(mprotect); +#if defined(SYS_open) + CASE(open); +#endif + CASE(openat); +#if defined(SYS_poll) + CASE(poll); +#endif +#if defined(SYS_ppoll) + CASE(ppoll); +#endif + CASE(prctl); +#if !defined(__i386__) + CASE(pread64); + CASE(pwrite64); +#endif + CASE(ptrace); + CASE(quotactl); + CASE(read); +#if defined(SYS_readlink) + CASE(readlink); +#endif + case SYS_readlinkat: + return sys_readlinkat(call, 0); +#if defined(SYS_recvfrom) + CASE(recvfrom); +#endif +#if defined(SYS_recvmsg) + CASE(recvmsg); +#endif +#if defined(SYS_rseq) + CASE(rseq); +#endif +#if defined(SYS_rmdir) + CASE_GENERIC_NONBLOCKING(rmdir); +#endif + CASE(rt_sigprocmask); +#if defined(SYS_sendmsg) + CASE(sendmsg); +#endif +#if defined(SYS_sendto) + CASE(sendto); +#endif + CASE(set_robust_list); +#if defined(SYS_setsockopt) + CASE(setsockopt); +#endif +#if defined(SYS_getsockopt) + CASE(getsockopt); +#endif +#if defined(SYS_getsockname) + CASE(getsockname); +#endif + CASE_GENERIC_NONBLOCKING(setxattr); + CASE(sigaltstack); +#if defined(SYS_socketcall) + CASE(socketcall); +#endif +#if defined(SYS_socketpair) + CASE(socketpair); +#endif +#if defined(SYS_symlink) + CASE_GENERIC_NONBLOCKING(symlink); +#endif +#if defined(SYS_time) + CASE(time); +#endif + CASE_GENERIC_NONBLOCKING(truncate); + CASE(uname); +#if defined(SYS_unlink) + CASE_GENERIC_NONBLOCKING(unlink); +#endif + CASE_GENERIC_NONBLOCKING(unlinkat); + CASE_GENERIC_NONBLOCKING_FD(utimensat); + CASE(write); + CASE(writev); +#if defined(SYS_fstat64) + case SYS_fstat64: +#elif defined(SYS_fstat) + case SYS_fstat: +#endif +#if defined(SYS_lstat64) + case SYS_lstat64: +#elif defined(SYS_lstat) + case SYS_lstat: +#endif +#if defined(SYS_stat64) + case SYS_stat64: +#elif defined(SYS_stat) + case SYS_stat: +#endif + return sys_xstat64(call); +#if defined(SYS_statx) + case SYS_statx: + return 
sys_statx(call); +#endif + case SYS_statfs: + case SYS_fstatfs: + return sys_statfs(call); +#if defined(SYS_newfstatat) + case SYS_newfstatat: +#elif defined(SYS_fstatat64) + case SYS_fstatat64: +#endif + return sys_fstatat(call); +#undef CASE +#undef CASE_GENERIC_NONBLOCKING +#undef CASE_GENERIC_NONBLOCKING_FD + default: + return traced_raw_syscall(call); + } +} + +/* Delay for testing purposes */ +static void do_delay(void) { + int i; + int result = 0; + for (i = 0; i < 10000000; ++i) { + result += i * i; + } + // Make sure result is used so this doesn't get optimized away + impose_syscall_delay = result | 1; +} + +/* Explicitly declare this as hidden so we can call it from + * _syscall_hook_trampoline without doing all sorts of special PIC handling. + */ +RR_HIDDEN long syscall_hook(struct syscall_info* call) { + // Initialize thread-local state if this is the first syscall for this + // thread. + init_thread(); + + if (!thread_locals->buffer || buffer_hdr()->locked) { + /* We may be reentering via a signal handler. Bail. */ + return traced_raw_syscall(call); + } + + thread_locals->original_syscall_parameters = call; + + if (impose_syscall_delay) { + do_delay(); + } + + long result = syscall_hook_internal(call); + if (buffer_hdr() && buffer_hdr()->notify_on_syscall_hook_exit) { + // Sometimes a signal is delivered to interrupt an untraced syscall in + // a non-restartable way (e.g. seccomp SIGSYS). Those signals must be + // handled outside any syscallbuf transactions. We defer them until + // this SYS_rrcall_notify_syscall_hook_exit, which is triggered by rr + // setting notify_on_syscall_hook_exit. The parameters to the + // SYS_rrcall_notify_syscall_hook_exit are magical and fully control + // the syscall parameters and result seen by the signal handler. + // + // SYS_rrcall_notify_syscall_hook_exit will clear + // notify_on_syscall_hook_exit. Clearing it ourselves is tricky to get + // right without races. 
+    //
+    // During recording, this flag is set when the recorder needs to delay
+    // delivery of a signal until we've stopped using the syscallbuf.
+    // During replay, this flag is set when the next event is entering a
+    // SYS_rrcall_notify_syscall_hook_exit.
+    //
+    // The correctness argument is as follows:
+    // Correctness requires that a) replay's setting of the flag happens before
+    // we read the flag in the call to syscall_hook that triggered the
+    // SYS_rrcall_notify_syscall_hook_exit and b) replay's setting of the flag
+    // must happen after we read the flag in the previous execution of
+    // syscall_hook.
+    // Condition a) holds as long as no events are recorded between the
+    // checking of the flag above and the execution of this syscall. This
+    // should be the case; no synchronous signals or syscalls are
+    // triggerable, all async signals other than SYSCALLBUF_DESCHED_SIGNAL
+    // are delayed, and SYSCALLBUF_DESCHED_SIGNAL shouldn't fire since we've
+    // disarmed the desched fd at this point. SYSCALLBUF_FLUSH events may be
+    // emitted when we process the SYS_rrcall_notify_syscall_hook_exit event,
+    // but replay of those events ends at the last flushed syscall, before
+    // we exit syscall_hook_internal.
+    // Condition b) failing would mean no new events were generated between
+    // testing the flag in the previous syscall_hook and the execution of this
+    // SYS_rrcall_notify_syscall_hook_exit. However, every invocation of
+    // syscall_hook_internal generates either a traced syscall or a syscallbuf
+    // record that would be flushed by SYSCALLBUF_FLUSH, so that can't
+    // happen.
+    result = _raw_syscall(SYS_rrcall_notify_syscall_hook_exit, call->args[0],
+                          call->args[1], call->args[2], call->args[3],
+                          call->args[4], call->args[5],
+                          RR_PAGE_SYSCALL_PRIVILEGED_TRACED, result, call->no);
+  }
+  // Do work that can only be safely done after syscallbuf can be flushed
+  if (thread_locals->notify_control_msg) {
+    privileged_traced_syscall1(SYS_rrcall_notify_control_msg,
+                               thread_locals->notify_control_msg);
+    // Clear the pointer so the notification fires at most once per message.
+    thread_locals->notify_control_msg = NULL;
+  }
+  thread_locals->original_syscall_parameters = NULL;
+  return result;
+}
diff --git a/rr/android/x86_64/share/rr/src/preload/syscallbuf.h b/rr/android/x86_64/share/rr/src/preload/syscallbuf.h new file mode 100644 index 0000000..84e87d3 --- /dev/null +++ b/rr/android/x86_64/share/rr/src/preload/syscallbuf.h
@@ -0,0 +1,15 @@
+/* -*- Mode: C; tab-width: 8; c-basic-offset: 2; indent-tabs-mode: nil; -*- */
+
+#ifndef RR_SYSCALLBUF_H_
+#define RR_SYSCALLBUF_H_
+
+struct timespec; /* forward declaration only; the full type is not needed here */
+
+#define RR_HIDDEN __attribute__((visibility("hidden"))) /* keep syscallbuf symbols out of the preload library's dynamic export table */
+
+RR_HIDDEN extern struct preload_globals globals; /* shared recorder/tracee globals (e.g. desched_sig) */
+
+RR_HIDDEN extern char impose_syscall_delay; /* nonzero => syscall_hook spins in do_delay() before dispatch (testing aid) */
+RR_HIDDEN extern char impose_spurious_desched; /* testing flag; presumably forces spurious desched events — confirm in syscallbuf.c */
+
+#endif /* RR_SYSCALLBUF_H_ */