| #!/usr/bin/env bash | 
 | # | 
 | # american fuzzy lop++ - corpus minimization tool | 
 | # --------------------------------------------- | 
 | # | 
 | # Originally written by Michal Zalewski | 
 | # | 
 | # Copyright 2014, 2015 Google Inc. All rights reserved. | 
 | # | 
 | # Licensed under the Apache License, Version 2.0 (the "License"); | 
 | # you may not use this file except in compliance with the License. | 
 | # You may obtain a copy of the License at: | 
 | # | 
 | #   https://www.apache.org/licenses/LICENSE-2.0 | 
 | # | 
 | # This tool tries to find the smallest subset of files in the input directory | 
 | # that still trigger the full range of instrumentation data points seen in | 
 | # the starting corpus. This has two uses: | 
 | # | 
 | #   - Screening large corpora of input files before using them as a seed for | 
 | #     afl-fuzz. The tool will remove functionally redundant files and likely | 
 | #     leave you with a much smaller set. | 
 | # | 
 | #     (In this case, you probably also want to consider running afl-tmin on | 
 | #     the individual files later on to reduce their size.) | 
 | # | 
 | #   - Minimizing the corpus generated organically by afl-fuzz, perhaps when | 
 | #     planning to feed it to more resource-intensive tools. The tool achieves | 
#     this by removing all entries that triggered unique behaviors in the
#     past but have since been made obsolete by later finds.
 | # | 
 | # Note that the tool doesn't modify the files themselves. For that, you want | 
 | # afl-tmin. | 
 | # | 
 | # This script must use bash because other shells may have hardcoded limits on | 
 | # array sizes. | 
 | # | 
 |  | 
 | echo "corpus minimization tool for afl-fuzz by Michal Zalewski" | 
 | echo | 
 |  | 
 | ######### | 
 | # SETUP # | 
 | ######### | 
 |  | 
 | # Process command-line options... | 
 |  | 
 | MEM_LIMIT=none | 
 | TIMEOUT=none | 
 |  | 
unset IN_DIR OUT_DIR STDIN_FILE EXTRA_PAR MEM_LIMIT_GIVEN FRIDA_MODE \
  AFL_CMIN_CRASHES_ONLY AFL_CMIN_ALLOW_ANY QEMU_MODE UNICORN_MODE
 |  | 
 | export AFL_QUIET=1 | 
 |  | 
 | while getopts "+i:o:f:m:t:eOQUCh" opt; do | 
 |  | 
 |   case "$opt" in  | 
 |  | 
 |     "h") | 
 | 	;; | 
 |  | 
 |     "i") | 
 |          IN_DIR="$OPTARG" | 
 |          ;; | 
 |  | 
 |     "o") | 
 |          OUT_DIR="$OPTARG" | 
 |          ;; | 
 |     "f") | 
 |          STDIN_FILE="$OPTARG" | 
 |          ;; | 
 |     "m") | 
 |          MEM_LIMIT="$OPTARG" | 
 |          MEM_LIMIT_GIVEN=1 | 
 |          ;; | 
 |     "t") | 
 |          TIMEOUT="$OPTARG" | 
 |          ;; | 
 |     "e") | 
 |          EXTRA_PAR="$EXTRA_PAR -e" | 
 |          ;; | 
 |     "C") | 
 |          export AFL_CMIN_CRASHES_ONLY=1 | 
 |          ;; | 
 |     "O") | 
 |          EXTRA_PAR="$EXTRA_PAR -O" | 
 |          FRIDA_MODE=1 | 
 |          ;;          | 
 |     "Q") | 
 |          EXTRA_PAR="$EXTRA_PAR -Q" | 
 |          QEMU_MODE=1 | 
 |          ;; | 
 |     "U") | 
 |          EXTRA_PAR="$EXTRA_PAR -U" | 
 |          UNICORN_MODE=1 | 
 |          ;;     | 
 |     "?") | 
 |          exit 1 | 
 |          ;; | 
 |  | 
 |    esac | 
 |  | 
 | done | 
 |  | 
 | shift $((OPTIND-1)) | 
 |  | 
 | TARGET_BIN="$1" | 
 |  | 
 | if [ "$TARGET_BIN" = "" -o "$IN_DIR" = "" -o "$OUT_DIR" = "" ]; then | 
 |  | 
 |   cat 1>&2 <<_EOF_ | 
 | Usage: $0 [ options ] -- /path/to/target_app [ ... ] | 
 |  | 
 | Required parameters: | 
 |  | 
 |   -i dir        - input directory with the starting corpus | 
 |   -o dir        - output directory for minimized files | 
 |  | 
 | Execution control settings: | 
 |  | 
 |   -f file       - location read by the fuzzed program (stdin) | 
 |   -m megs       - memory limit for child process ($MEM_LIMIT MB) | 
 |   -t msec       - run time limit for child process (none) | 
 |   -O            - use binary-only instrumentation (FRIDA mode) | 
 |   -Q            - use binary-only instrumentation (QEMU mode) | 
 |   -U            - use unicorn-based instrumentation (Unicorn mode) | 
 |    | 
 | Minimization settings: | 
 |  | 
 |   -C            - keep crashing inputs, reject everything else | 
 |   -e            - solve for edge coverage only, ignore hit counts | 
 |  | 
 | For additional tips, please consult README.md. | 
 |  | 
 | Environment variables used: | 
AFL_KEEP_TRACES: leave the temporary <out_dir>/.traces directory
 | AFL_NO_FORKSRV: run target via execve instead of using the forkserver | 
 | AFL_PATH: last resort location to find the afl-showmap binary | 
 | AFL_SKIP_BIN_CHECK: skip check for target binary | 
 | _EOF_ | 
 |   exit 1 | 
 | fi | 
 |  | 
# Do a sanity check to discourage the use of /tmp, since we can't really
# handle this safely from a shell script. (The check below is currently
# disabled; if re-enabled, it can be bypassed by setting AFL_ALLOW_TMP.)
 |  | 
 | #if [ "$AFL_ALLOW_TMP" = "" ]; then | 
 | # | 
 | #  echo "$IN_DIR" | grep -qE '^(/var)?/tmp/' | 
 | #  T1="$?" | 
 | # | 
 | #  echo "$TARGET_BIN" | grep -qE '^(/var)?/tmp/' | 
 | #  T2="$?" | 
 | # | 
 | #  echo "$OUT_DIR" | grep -qE '^(/var)?/tmp/' | 
 | #  T3="$?" | 
 | # | 
 | #  echo "$STDIN_FILE" | grep -qE '^(/var)?/tmp/' | 
 | #  T4="$?" | 
 | # | 
 | #  echo "$PWD" | grep -qE '^(/var)?/tmp/' | 
 | #  T5="$?" | 
 | # | 
 | #  if [ "$T1" = "0" -o "$T2" = "0" -o "$T3" = "0" -o "$T4" = "0" -o "$T5" = "0" ]; then | 
 | #    echo "[-] Error: do not use this script in /tmp or /var/tmp." 1>&2 | 
 | #    exit 1 | 
 | #  fi | 
 | # | 
 | #fi | 
 |  | 
 | # If @@ is specified, but there's no -f, let's come up with a temporary input | 
 | # file name. | 
 |  | 
 | TRACE_DIR="$OUT_DIR/.traces" | 
 |  | 
 | if [ "$STDIN_FILE" = "" ]; then | 
 |  | 
 |   if echo "$*" | grep -qF '@@'; then | 
 |     STDIN_FILE="$TRACE_DIR/.cur_input" | 
 |   fi | 
 |  | 
 | fi | 
 |  | 
 | # Check for obvious errors. | 
 |  | 
 | if [ ! "$MEM_LIMIT" = "none" ]; then | 
 |  | 
 |   if [ "$MEM_LIMIT" -lt "5" ]; then | 
 |     echo "[-] Error: dangerously low memory limit." 1>&2 | 
 |     exit 1 | 
 |   fi | 
 |  | 
 | fi | 
 |  | 
 | if [ ! "$TIMEOUT" = "none" ]; then | 
 |  | 
 |   if [ "$TIMEOUT" -lt "10" ]; then | 
 |     echo "[-] Error: dangerously low timeout." 1>&2 | 
 |     exit 1 | 
 |   fi | 
 |  | 
 | fi | 
 |  | 
 | if [ ! -f "$TARGET_BIN" -o ! -x "$TARGET_BIN" ]; then | 
 |  | 
 |   TNEW="`which "$TARGET_BIN" 2>/dev/null`" | 
 |  | 
 |   if [ ! -f "$TNEW" -o ! -x "$TNEW" ]; then | 
 |     echo "[-] Error: binary '$TARGET_BIN' not found or not executable." 1>&2 | 
 |     exit 1 | 
 |   fi | 
 |  | 
 |   TARGET_BIN="$TNEW" | 
 |  | 
 | fi | 
 |  | 
 | if [ "$AFL_SKIP_BIN_CHECK" = "" -a "$QEMU_MODE" = "" -a "$FRIDA_MODE" = "" -a "$UNICORN_MODE" = "" ]; then | 
 |  | 
 |   if ! grep -qF "__AFL_SHM_ID" "$TARGET_BIN"; then | 
 |     echo "[-] Error: binary '$TARGET_BIN' doesn't appear to be instrumented." 1>&2 | 
 |     exit 1 | 
 |   fi | 
 |  | 
 | fi | 
 |  | 
 | if [ ! -d "$IN_DIR" ]; then | 
 |   echo "[-] Error: directory '$IN_DIR' not found." 1>&2 | 
 |   exit 1 | 
 | fi | 
 |  | 
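# If we were handed an afl-fuzz output directory, descend into its default/
# and/or queue/ subdirectory so that we minimize the queue itself.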
 | test -d "$IN_DIR/default" && IN_DIR="$IN_DIR/default" | 
 | test -d "$IN_DIR/queue" && IN_DIR="$IN_DIR/queue" | 
 |  | 
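# Clean up leftovers from any previous run, then make sure the output
# directory is empty (rmdir only succeeds on an empty directory).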
 | find "$OUT_DIR" -name 'id[:_]*' -maxdepth 1 -exec rm -- {} \; 2>/dev/null | 
 | rm -rf "$TRACE_DIR" 2>/dev/null | 
 |  | 
 | rmdir "$OUT_DIR" 2>/dev/null | 
 |  | 
 | if [ -d "$OUT_DIR" ]; then | 
 |   echo "[-] Error: directory '$OUT_DIR' exists and is not empty - delete it first." 1>&2 | 
 |   exit 1 | 
 | fi | 
 |  | 
 | mkdir -m 700 -p "$TRACE_DIR" || exit 1 | 
 |  | 
 | if [ ! "$STDIN_FILE" = "" ]; then | 
 |   rm -f "$STDIN_FILE" || exit 1 | 
 |   touch "$STDIN_FILE" || exit 1 | 
 | fi | 
 |  | 
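# Locate afl-showmap: prefer the copy on $PATH, then the directory this script
# resides in (derived from $0), then the current directory, and finally
# $AFL_PATH as a last resort.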
 | SHOWMAP=`command -v afl-showmap 2>/dev/null` | 
 |  | 
 | if [ -z "$SHOWMAP" ]; then | 
 |   TMP="${0%/afl-cmin.bash}/afl-showmap" | 
 |   if [ -x "$TMP" ]; then | 
 |     SHOWMAP=$TMP | 
 |   fi | 
 | fi | 
 |  | 
 | if [ -z "$SHOWMAP" -a -x "./afl-showmap" ]; then | 
 |   SHOWMAP="./afl-showmap" | 
 | else | 
 |   if [ -n "$AFL_PATH" ]; then | 
 |     SHOWMAP="$AFL_PATH/afl-showmap" | 
 |   fi | 
 | fi | 
 |  | 
 | if [ ! -x "$SHOWMAP" ]; then | 
 |   echo "[-] Error: can't find 'afl-showmap' - please set AFL_PATH." 1>&2 | 
 |   rm -rf "$TRACE_DIR" | 
 |   exit 1 | 
 | fi | 
 |  | 
 | IN_COUNT=$((`ls -- "$IN_DIR" 2>/dev/null | wc -l`)) | 
 |  | 
 | if [ "$IN_COUNT" = "0" ]; then | 
 |   echo "[+] Hmm, no inputs in the target directory. Nothing to be done." | 
 |   rm -rf "$TRACE_DIR" | 
 |   exit 1 | 
 | fi | 
 |  | 
 | FIRST_FILE=`ls "$IN_DIR" | head -1` | 
 |  | 
 | # Make sure that we're not dealing with a directory. | 
 |  | 
 | if [ -d "$IN_DIR/$FIRST_FILE" ]; then | 
 |   echo "[-] Error: The target directory contains subdirectories - please fix." 1>&2 | 
 |   rm -rf "$TRACE_DIR" | 
 |   exit 1 | 
 | fi | 
 |  | 
# Check for the more efficient way to copy files: hard-link into the output
# directory when the filesystem allows it, and fall back to a plain copy
# otherwise.
 |  | 
 | if ln "$IN_DIR/$FIRST_FILE" "$TRACE_DIR/.link_test" 2>/dev/null; then | 
 |   CP_TOOL=ln | 
 | else | 
 |   CP_TOOL=cp | 
 | fi | 
 |  | 
 | # Make sure that we can actually get anything out of afl-showmap before we | 
 | # waste too much time. | 
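# (AFL_CMIN_ALLOW_ANY=1 below asks afl-showmap to keep a trace regardless of
# how the run is classified, so this one-off smoke test should still produce
# output even when -C restricts the real runs to crashing inputs.)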
 |  | 
 | echo "[*] Testing the target binary..." | 
 |  | 
 | if [ "$STDIN_FILE" = "" ]; then | 
 |  | 
 |   AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$FIRST_FILE" | 
 |  | 
 | else | 
 |  | 
 |   cp "$IN_DIR/$FIRST_FILE" "$STDIN_FILE" | 
 |   AFL_CMIN_ALLOW_ANY=1 "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/.run_test" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null | 
 |  | 
 | fi | 
 |  | 
 | FIRST_COUNT=$((`grep -c . "$TRACE_DIR/.run_test"`)) | 
 |  | 
 | if [ "$FIRST_COUNT" -gt "0" ]; then | 
 |  | 
 |   echo "[+] OK, $FIRST_COUNT tuples recorded." | 
 |  | 
 | else | 
 |  | 
 |   echo "[-] Error: no instrumentation output detected (perhaps crash or timeout)." 1>&2 | 
 |   test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR" | 
 |   exit 1 | 
 |  | 
 | fi | 
 |  | 
 | # Let's roll! | 
 |  | 
 | ############################# | 
 | # STEP 1: COLLECTING TRACES # | 
 | ############################# | 
 |  | 
 | echo "[*] Obtaining traces for input files in '$IN_DIR'..." | 
 |  | 
 | ( | 
 |  | 
 |   CUR=0 | 
 |  | 
 |   if [ "$STDIN_FILE" = "" ]; then | 
 |  | 
 |     ls "$IN_DIR" | while read -r fn; do | 
 |  | 
 |       CUR=$((CUR+1)) | 
 |       printf "\\r    Processing file $CUR/$IN_COUNT... " | 
 |  | 
 |       "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -- "$@" <"$IN_DIR/$fn" | 
 |  | 
 |     done | 
 |  | 
 |   else | 
 |  | 
 |     ls "$IN_DIR" | while read -r fn; do | 
 |  | 
 |       CUR=$((CUR+1)) | 
 |       printf "\\r    Processing file $CUR/$IN_COUNT... " | 
 |  | 
 |       cp "$IN_DIR/$fn" "$STDIN_FILE" | 
 |  | 
 |       "$SHOWMAP" -m "$MEM_LIMIT" -t "$TIMEOUT" -o "$TRACE_DIR/$fn" -Z $EXTRA_PAR -H "$STDIN_FILE" -- "$@" </dev/null | 
 |  | 
 |     done | 
 |  | 
 |  | 
 |   fi | 
 |  | 
 | ) | 
 |  | 
 | echo | 
 |  | 
 | ########################## | 
 | # STEP 2: SORTING TUPLES # | 
 | ########################## | 
 |  | 
 | # With this out of the way, we sort all tuples by popularity across all | 
 | # datasets. The reasoning here is that we won't be able to avoid the files | 
 | # that trigger unique tuples anyway, so we will want to start with them and | 
 | # see what's left. | 
 |  | 
 | echo "[*] Sorting trace sets (this may take a while)..." | 
 |  | 
 | ls "$IN_DIR" | sed "s#^#$TRACE_DIR/#" | tr '\n' '\0' | xargs -0 -n 1 cat | \ | 
 |   sort | uniq -c | sort -k 1,1 -n >"$TRACE_DIR/.all_uniq" | 
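
# Each line of .all_uniq is "<count> <tuple>", sorted so that the rarest
# tuples come first; STEP 5 walks the list in that order.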
 |  | 
 | TUPLE_COUNT=$((`grep -c . "$TRACE_DIR/.all_uniq"`)) | 
 |  | 
 | echo "[+] Found $TUPLE_COUNT unique tuples across $IN_COUNT files." | 
 |  | 
 | ##################################### | 
 | # STEP 3: SELECTING CANDIDATE FILES # | 
 | ##################################### | 
 |  | 
 | # The next step is to find the best candidate for each tuple. The "best" | 
 | # part is understood simply as the smallest input that includes a particular | 
# tuple in its trace. Empirical evidence suggests that this produces smaller
# datasets than more involved algorithms that could still be pulled off in
# a shell script.
 |  | 
 | echo "[*] Finding best candidates for each tuple..." | 
 |  | 
 | CUR=0 | 
 |  | 
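# Each line appended to .candidate_list is "<tuple> <file name>"; since ls -rS
# lists files smallest first, the first line recorded for a given tuple names
# the smallest file that covers it.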
 | ls -rS "$IN_DIR" | while read -r fn; do | 
 |  | 
 |   CUR=$((CUR+1)) | 
 |   printf "\\r    Processing file $CUR/$IN_COUNT... " | 
 |  | 
 |   sed "s#\$# $fn#" "$TRACE_DIR/$fn" >>"$TRACE_DIR/.candidate_list" | 
 |  | 
 | done | 
 |  | 
 | echo | 
 |  | 
 | ############################## | 
 | # STEP 4: LOADING CANDIDATES # | 
 | ############################## | 
 |  | 
# At this point, we have a file of tuple-file pairs, sorted by file size
# in ascending order (a consequence of ls -rS). By sorting it keyed only by
# tuple (-k 1,1) and keeping just the first line for every key (-s -u), we
# end up with the smallest file for each tuple.
 |  | 
 | echo "[*] Sorting candidate list (be patient)..." | 
 |  | 
 | sort -k1,1 -s -u "$TRACE_DIR/.candidate_list" | \ | 
 |   sed 's/^/BEST_FILE[/;s/ /]="/;s/$/"/' >"$TRACE_DIR/.candidate_script" | 
 |  | 
 | if [ ! -s "$TRACE_DIR/.candidate_script" ]; then | 
 |   echo "[-] Error: no traces obtained from test cases, check syntax!" 1>&2 | 
 |   test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR" | 
 |   exit 1 | 
 | fi | 
 |  | 
 | # The sed command converted the sorted list to a shell script that populates | 
 | # BEST_FILE[tuple]="fname". Let's load that! | 
 |  | 
 | . "$TRACE_DIR/.candidate_script" | 
 |  | 
 | ########################## | 
 | # STEP 5: WRITING OUTPUT # | 
 | ########################## | 
 |  | 
# The final trick is to grab the top pick for each tuple, unless that tuple is
# already covered thanks to an earlier candidate; all tuples associated with
# the newly-added file are then added to the "already have" list. The loop
# works from the least popular tuples toward the most common ones.
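# (In effect this is a greedy set cover: each not-yet-covered tuple pulls in
# its smallest candidate file, and everything that file covers is crossed off.)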
 |  | 
 | echo "[*] Processing candidates and writing output files..." | 
 |  | 
 | CUR=0 | 
 |  | 
 | touch "$TRACE_DIR/.already_have" | 
 |  | 
 | while read -r cnt tuple; do | 
 |  | 
 |   CUR=$((CUR+1)) | 
 |   printf "\\r    Processing tuple $CUR/$TUPLE_COUNT with count $cnt... " | 
 |  | 
 |   # If we already have this tuple, skip it. | 
 |  | 
 |   grep -q "^$tuple\$" "$TRACE_DIR/.already_have" && continue | 
 |  | 
 |   FN=${BEST_FILE[tuple]} | 
 |  | 
 | #  echo "tuple nr $CUR ($tuple cnt=$cnt) -> $FN" >> "$TRACE_DIR/.log" | 
 |   $CP_TOOL "$IN_DIR/$FN" "$OUT_DIR/$FN" | 
 |  | 
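  # Merge the picked file's tuples into the "already have" set. A full sort -u
  # is only done every fifth pick to keep merge costs down; plain appends in
  # between are fine, since duplicates do not affect the grep check above.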
 |   if [ "$((CUR % 5))" = "0" ]; then | 
 |     sort -u "$TRACE_DIR/$FN" "$TRACE_DIR/.already_have" >"$TRACE_DIR/.tmp" | 
 |     mv -f "$TRACE_DIR/.tmp" "$TRACE_DIR/.already_have" | 
 |   else | 
 |     cat "$TRACE_DIR/$FN" >>"$TRACE_DIR/.already_have" | 
 |   fi | 
 |  | 
 | done <"$TRACE_DIR/.all_uniq" | 
 |  | 
 | echo | 
 |  | 
OUT_COUNT=$((`ls -- "$OUT_DIR" | wc -l`))
 |  | 
 | if [ "$OUT_COUNT" = "1" ]; then | 
 |   echo "[!] WARNING: All test cases had the same traces, check syntax!" | 
 | fi | 
 |  | 
 | echo "[+] Narrowed down to $OUT_COUNT files, saved in '$OUT_DIR'." | 
 | echo | 
 |  | 
 | test "$AFL_KEEP_TRACES" = "" && rm -rf "$TRACE_DIR" | 
 |  | 
 | exit 0 |