Create script to upload test aggregation data (#97954)
<!--
copilot:summary
-->
### <samp>🤖 Generated by Copilot at 79f1b37</samp>
This pull request improves the workflow and data processing for uploading contribution and testing statistics to Rockset and S3. It renames and updates a workflow file, removes unused code from a script, and adds a new script to aggregate and upload test results.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/97954
Approved by: https://github.com/huydhn
diff --git a/.github/workflows/upload-contrib-stats.yml b/.github/workflows/nightly-rockset-uploads.yml
similarity index 78%
rename from .github/workflows/upload-contrib-stats.yml
rename to .github/workflows/nightly-rockset-uploads.yml
index 95f8e96..6977b62 100644
--- a/.github/workflows/upload-contrib-stats.yml
+++ b/.github/workflows/nightly-rockset-uploads.yml
@@ -1,4 +1,4 @@
-name: Upload contribution stats
+name: Nightly Upload to rockset
on:
schedule:
@@ -11,11 +11,14 @@
jobs:
- upload-contribution-stats:
+ upload-stats-to-rockset:
runs-on: [self-hosted, linux.2xlarge]
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@master
+ with:
+ fetch-depth: 1
+ submodules: false
- run: |
pip3 install requests==2.26
@@ -32,5 +35,7 @@
max_attempts: 5
retry_wait_seconds: 90
command: |
+ echo "Uploading testing aggregate data" "$(date -d yesterday '+%Y-%m-%d')"
+ python3 -m tools.stats.upload_test_stat_aggregates --date "$(date -d yesterday '+%Y-%m-%d')"
echo "Uploading external contribution stats for" "$(date -d yesterday '+%Y-%m-%d')"
python3 -m tools.stats.upload_external_contrib_stats --startDate "$(date -d yesterday '+%Y-%m-%d')"
\ No newline at end of file
diff --git a/tools/stats/upload_external_contrib_stats.py b/tools/stats/upload_external_contrib_stats.py
index a371591..9196e3a 100644
--- a/tools/stats/upload_external_contrib_stats.py
+++ b/tools/stats/upload_external_contrib_stats.py
@@ -93,7 +93,6 @@
"date": str(period_begin_date),
"pr_count": pr_count,
"user_count": len(users),
- "users": list(users),
}
)
period_begin_date = period_end_date + datetime.timedelta(days=1)
diff --git a/tools/stats/upload_test_stat_aggregates.py b/tools/stats/upload_test_stat_aggregates.py
new file mode 100644
index 0000000..1eb67f4
--- /dev/null
+++ b/tools/stats/upload_test_stat_aggregates.py
@@ -0,0 +1,84 @@
+import argparse
+import ast
+import datetime
+import json
+import os
+import re
+from typing import Any, List, Union
+
+import rockset # type: ignore[import]
+
+from tools.stats.upload_stats_lib import upload_to_s3
+
+
def get_oncall_from_testfile(testfile: str) -> Union[List[str], None]:
    """Return the oncall team(s) that own a test file.

    Reads ``test/<testfile>.py`` and parses the ``# Owner(s): [...]``
    annotation that PyTorch test files carry near the top.  If the file
    cannot be read, or the annotation is missing or malformed, falls back
    to a ``module:``-style label derived from the filename.

    Returns None when the file exists but contains no ``# Owner(s):`` line.
    """
    path = f"test/{testfile}"
    if not path.endswith(".py"):
        path += ".py"
    # get oncall on test file
    try:
        with open(path) as f:
            for line in f:
                if line.startswith("# Owner(s): "):
                    # Non-greedy match so that two bracketed lists on one
                    # line are found separately and flagged as an error,
                    # instead of being swallowed by a single greedy match.
                    possible_lists = re.findall(r"\[.*?\]", line)
                    if len(possible_lists) > 1:
                        raise Exception("More than one list found")
                    elif len(possible_lists) == 0:
                        raise Exception("No oncalls found or file is badly formatted")
                    oncalls = ast.literal_eval(possible_lists[0])
                    return list(oncalls)
    except Exception:
        # Best-effort fallback: derive an owner label from the filename
        # rather than failing the whole upload over one bad test file.
        if "." in testfile:
            return [f"module: {testfile.split('.')[0]}"]
        else:
            return ["module: unmarked"]
    return None
+
+
def get_test_stat_aggregates(date: datetime.date) -> Any:
    """Fetch the day's aggregated test stats from Rockset, tagged with oncalls.

    Executes the ``test_insights_per_daily_upload`` query lambda for the
    given date and annotates every result row with the oncall(s) that own
    its test file.  The result is round-tripped through JSON with
    ``default=str`` so non-serializable values (e.g. datetimes) become
    plain strings suitable for the S3 upload.

    Requires the ``ROCKSET_API_KEY`` environment variable to be set.
    """
    # Initialize the Rockset client with your API key (read once, reused below).
    rockset_api_key = os.environ["ROCKSET_API_KEY"]
    iso_date = date.isoformat()
    rs = rockset.RocksetClient(
        host="api.usw2a1.rockset.com", api_key=rockset_api_key
    )

    # Name of the Rockset query lambda that aggregates the daily test stats.
    lambda_function_name = "test_insights_per_daily_upload"
    query_parameters = [
        rockset.models.QueryParameter(name="startTime", type="string", value=iso_date)
    ]
    api_response = rs.QueryLambdas.execute_query_lambda(
        query_lambda=lambda_function_name,
        version="865e3748f31e9b59",
        parameters=query_parameters,
    )
    # Annotate each row in place with its owning oncall(s).
    for result in api_response["results"]:
        result["oncalls"] = get_oncall_from_testfile(result["test_file"])
    return json.loads(
        json.dumps(api_response["results"], indent=4, sort_keys=True, default=str)
    )
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Upload test stat aggregates to Rockset."
+ )
+ parser.add_argument(
+ "--date",
+ type=datetime.date.fromisoformat,
+ help="Date to upload test stat aggregates for (YYYY-MM-DD). Must be in the last 30 days",
+ required=True,
+ )
+ args = parser.parse_args()
+ if args.date < datetime.datetime.now().date() - datetime.timedelta(days=30):
+ raise ValueError("date must be in the last 30 days")
+ data = get_test_stat_aggregates(date=args.date)
+ upload_to_s3(
+ bucket_name="torchci-aggregated-stats",
+ key=f"test_data_aggregates/{str(args.date)}",
+ docs=data,
+ )