AutoHeuristic: util scripts (#133409) This PR introduces scripts that make it easier to use autoheuristic: - `collect_data.sh`: The user can specify things like the number of GPUs to be used and the number of training samples to collect. This script will open one tmux pane per GPU and collect num_training_samples/num_gpus samples per GPU. - `merge_data.py`: This script can be used to merge multiple training data files into a single file. Pull Request resolved: https://github.com/pytorch/pytorch/pull/133409 Approved by: https://github.com/Chillee

commit: 142353eca3ce0c7155b7305982ea933a4644f330 [log] [tgz]
author: Alnis Murtovi <murtovi@meta.com> Tue Aug 13 22:38:56 2024 -0700
committer: PyTorch MergeBot <pytorchmergebot@users.noreply.github.com> Thu Aug 15 10:49:56 2024 +0000
tree: 1b2cef98dcab39c7453bff9482e61153697b7100
parent: b0fc6aa412a81c0073eca570af6199657fe37ff7 [diff]
diff --git a/torchgen/_autoheuristic/benchmark_runner.py b/torchgen/_autoheuristic/benchmark_runner.py
index 00b47f5..999ea48 100644
--- a/torchgen/_autoheuristic/benchmark_runner.py
+++ b/torchgen/_autoheuristic/benchmark_runner.py

@@ -57,7 +57,9 @@
         args = self.parser.parse_args()
         if args.use_heuristic:
             torch._inductor.config.autoheuristic_use = self.name
+            torch._inductor.config.autoheuristic_collect = ""
         else:
+            torch._inductor.config.autoheuristic_use = ""
             torch._inductor.config.autoheuristic_collect = self.name
         torch._inductor.config.autoheuristic_log_path = args.o
         if args.device is not None:

diff --git a/torchgen/_autoheuristic/collect_data.sh b/torchgen/_autoheuristic/collect_data.sh
new file mode 100644
index 0000000..442f612
--- /dev/null
+++ b/torchgen/_autoheuristic/collect_data.sh

@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# This script makes it easy to parallelize data collection across multiple GPUs.
+#
+# One tmux pane is opened per device in <comma_separated_device_numbers>. Each
+# pane activates <CONDA_ENV> and runs <python_command> with
+#   --device <id> -o <OUTPUT_DIR>/data_<id>.txt --num-samples <samples per GPU>
+# so every device writes its own output file.
+
+# Check if tmux is installed
+if ! command -v tmux &> /dev/null; then
+    echo "tmux is not installed. Please install it and try again."
+    exit 1
+fi
+
+# Check if the correct number of arguments is provided
+if [ "$#" -ne 5 ]; then
+    echo "Usage: $0 \"<python_command>\" <comma_separated_device_numbers> <num_samples to generate> <CONDA_ENV> <OUTPUT_DIR>"
+    echo "Example: $0 \"python run.py --a b --b c\" 1,4,5,3 1000 pytorch-3.10 a100"
+    exit 1
+fi
+
+PYTHON_COMMAND=$1
+DEVICE_NUMBERS=$2
+NUM_SAMPLES=$3
+CONDA_ENV=$4
+OUTPUT_DIR=$5
+
+# Validate everything before any tmux state is created, so a bad invocation
+# does not leave a stray detached session behind.
+if ! [[ "$NUM_SAMPLES" =~ ^[1-9][0-9]*$ ]]; then
+    echo "Error: <num_samples> must be a positive integer, got '${NUM_SAMPLES}'."
+    exit 1
+fi
+
+# Split the device numbers
+IFS=',' read -ra DEVICES <<< "$DEVICE_NUMBERS"
+
+NUM_GPUS=${#DEVICES[@]}
+if [ "$NUM_GPUS" -eq 0 ]; then
+    echo "Error: no device numbers were provided."
+    exit 1
+fi
+
+# Round up so that at least NUM_SAMPLES samples are collected in total; plain
+# integer division would drop the remainder (and yield 0 samples per GPU when
+# NUM_SAMPLES < NUM_GPUS).
+NUM_SAMPLES_PER_GPU=$(( (NUM_SAMPLES + NUM_GPUS - 1) / NUM_GPUS ))
+echo "AutoHeuristic will collect ${NUM_SAMPLES} samples split across ${NUM_GPUS} GPUs"
+echo "Each GPU will collect ${NUM_SAMPLES_PER_GPU}"
+
+# The per-device output files are written into OUTPUT_DIR, so it has to exist
+if ! mkdir -p "$OUTPUT_DIR"; then
+    echo "Error: could not create output directory '${OUTPUT_DIR}'."
+    exit 1
+fi
+
+# Create a new tmux session
+SESSION_NAME="parallel_run_$(date +%s)"
+tmux new-session -d -s "$SESSION_NAME"
+
+# Function to create a new pane and run the script
+create_pane() {
+    local device=$1
+    tmux split-window -t "$SESSION_NAME"
+    # Re-tile after every split: repeatedly halving the active pane of a
+    # detached session otherwise fails with "no space for new pane" once
+    # more than a handful of devices are requested.
+    tmux select-layout -t "$SESSION_NAME" tiled
+    tmux send-keys -t "$SESSION_NAME" "conda activate ${CONDA_ENV} && $PYTHON_COMMAND --device $device -o ${OUTPUT_DIR}/data_${device}.txt --num-samples ${NUM_SAMPLES_PER_GPU}" C-m
+}
+
+# Create panes for each device number
+for device in "${DEVICES[@]}"; do
+    create_pane "${device}"
+done
+
+# Remove the first pane (empty one)
+tmux kill-pane -t "$SESSION_NAME.0"
+
+# Arrange panes in a tiled layout
+tmux select-layout -t "$SESSION_NAME" tiled
+
+# Attach to the tmux session
+tmux attach-session -t "$SESSION_NAME"

diff --git a/torchgen/_autoheuristic/generate_heuristic.sh b/torchgen/_autoheuristic/generate_heuristic.sh
new file mode 100644
index 0000000..97696a4
--- /dev/null
+++ b/torchgen/_autoheuristic/generate_heuristic.sh

@@ -0,0 +1,36 @@
+#!/bin/bash
+
+# Driver for the two AutoHeuristic steps:
+#   collect  - run BENCHMARK_SCRIPT on every GPU in GPU_DEVICE_IDS via ../collect_data.sh
+#   generate - merge the per-GPU data files and run TRAIN_SCRIPT on the merged file
+# The helper scripts are referenced as ../collect_data.sh and ../merge_data.py,
+# i.e. relative to the current working directory.
+
+if [ $# -lt 8 ]; then
+    echo "Error: This script requires at least 8 arguments."
+    echo "Usage: $0 <collect|generate> <gpu_device_ids> <conda_env> <num_samples> <output_dir> <heuristic_name> <benchmark_script> <train_script> [extra_train_args]"
+    exit 1
+fi
+
+MODE=$1
+GPU_DEVICE_IDS=$2
+CONDA_ENV=$3
+NUM_SAMPLES=$4
+OUTPUT_DIR=$5
+HEURISTIC_NAME=$6
+BENCHMARK_SCRIPT=$7
+TRAIN_SCRIPT=$8
+# Optional; may hold several flags in one string and is word-split on use
+EXTRA_TRAIN_ARGS=$9
+
+mkdir -p "${OUTPUT_DIR}"
+
+if [ "$MODE" = "collect" ]; then
+    # this will collect data for NUM_SAMPLES samples on the number of GPUs specified in GPU_DEVICE_IDS in parallel
+    bash ../collect_data.sh "python ${BENCHMARK_SCRIPT}" "${GPU_DEVICE_IDS}" "${NUM_SAMPLES}" "${CONDA_ENV}" "${OUTPUT_DIR}"
+elif [ "$MODE" = "generate" ]; then
+    # the bash script above generates one separate txt file per GPU
+    # if GPU_DEVICE_IDS=6,7, it will generate "data_6.txt", "data_7.txt" inside OUTPUT_DIR
+    # these files have to be merged into a single file before we can use AutoHeuristic to learn a heuristic
+    OUTPUT_FILE="${OUTPUT_DIR}/${HEURISTIC_NAME}.txt"
+
+    # Build the input list as an array so paths containing spaces stay intact
+    IFS=',' read -ra DEVICE_IDS <<< "${GPU_DEVICE_IDS}"
+    INPUT_FILES=()
+    for device_id in "${DEVICE_IDS[@]}"; do
+        input_file="${OUTPUT_DIR}/data_${device_id}.txt"
+        if [ ! -f "${input_file}" ]; then
+            echo "Error: expected data file ${input_file} does not exist. Run 'collect' first."
+            exit 1
+        fi
+        INPUT_FILES+=("${input_file}")
+    done
+
+    # Do not train on a stale or missing merged file if the merge step fails
+    python ../merge_data.py "${OUTPUT_FILE}" "${INPUT_FILES[@]}" || exit 1
+
+    # This will learn a heuristic and generate the code into torch/_inductor/autoheuristic/artifacts/_${HEURISTIC_NAME}.py
+    # EXTRA_TRAIN_ARGS is intentionally unquoted so that it expands to separate arguments
+    python "${TRAIN_SCRIPT}" "${OUTPUT_FILE}" --heuristic-name "${HEURISTIC_NAME}" ${EXTRA_TRAIN_ARGS}
+else
+    echo "Error: Invalid mode ${MODE}. Please use 'collect' or 'generate'."
+    exit 1
+fi

diff --git a/torchgen/_autoheuristic/merge_data.py b/torchgen/_autoheuristic/merge_data.py
new file mode 100644
index 0000000..374e77d
--- /dev/null
+++ b/torchgen/_autoheuristic/merge_data.py

@@ -0,0 +1,60 @@
+import sys
+from typing import List
+
+
+def merge_txt_files(file_list: List[str], output_file: str) -> bool:
+    """Merge several data files that share the same two-line metadata header.
+
+    The first two lines of every input file are treated as metadata and must
+    be identical across all files. The merged file consists of that metadata
+    once, followed by the remaining lines of every input file in the order
+    the files were given.
+
+    Args:
+        file_list: Paths of the files to merge.
+        output_file: Path of the merged file to write.
+
+    Returns:
+        True if the merged file was written, False if nothing was written
+        (no inputs, unreadable input, too few lines, metadata mismatch or a
+        write error). A message describing the problem is printed in that case.
+    """
+    if not file_list:
+        print("No input files provided.")
+        return False
+
+    metadata: List[str] = []
+    content: List[str] = []
+
+    # Read metadata and content from all files
+    for file_path in file_list:
+        try:
+            with open(file_path) as file:
+                lines = file.readlines()
+        except OSError as e:
+            print(f"Error reading file {file_path}: {e}")
+            return False
+
+        # A file whose last line lacks a trailing newline would otherwise get
+        # that line glued to the first line of the next file, and a
+        # metadata-only file would be reported as a metadata mismatch.
+        if lines and not lines[-1].endswith("\n"):
+            lines[-1] += "\n"
+
+        if len(lines) < 2:
+            print(f"Error: {file_path} does not have enough lines for metadata.")
+            return False
+
+        file_metadata = lines[:2]
+        file_content = lines[2:]
+
+        if not metadata:
+            metadata = file_metadata
+        elif metadata != file_metadata:
+            print(f"Error: Metadata mismatch in {file_path}")
+            print("Expected metadata:")
+            print("".join(metadata))
+            print(f"Metadata in {file_path}:")
+            print("".join(file_metadata))
+            return False
+
+        content.extend(file_content)
+
+    # Write merged content to output file
+    try:
+        with open(output_file, "w") as outfile:
+            outfile.writelines(metadata)
+            outfile.writelines(content)
+    except OSError as e:
+        print(f"Error writing to output file {output_file}: {e}")
+        return False
+
+    print(f"Successfully merged files into {output_file}")
+    return True
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 3:
+        print(
+            "Usage: python script.py output_file.txt input_file1.txt input_file2.txt ..."
+        )
+        sys.exit(1)
+
+    output_file = sys.argv[1]
+    input_files = sys.argv[2:]
+    # Exit non-zero on failure so calling shell scripts can stop early
+    if not merge_txt_files(input_files, output_file):
+        sys.exit(1)

diff --git a/torchgen/_autoheuristic/requirements.txt b/torchgen/_autoheuristic/requirements.txt
new file mode 100644
index 0000000..fda4dd6
--- /dev/null
+++ b/torchgen/_autoheuristic/requirements.txt

@@ -0,0 +1,2 @@
+pandas
+scikit-learn
commit	142353eca3ce0c7155b7305982ea933a4644f330	[log] [tgz]
author	Alnis Murtovi <murtovi@meta.com>	Tue Aug 13 22:38:56 2024 -0700
committer	PyTorch MergeBot <pytorchmergebot@users.noreply.github.com>	Thu Aug 15 10:49:56 2024 +0000
tree	1b2cef98dcab39c7453bff9482e61153697b7100
parent	b0fc6aa412a81c0073eca570af6199657fe37ff7 [diff]