Create a new unstable workflow for periodic jobs (#98858)
And move ROCm distributed job there as it's very flaky in trunk at the moment. Also move ROCm slow job to `slow` workflow as it should be.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/98858
Approved by: https://github.com/malfet, https://github.com/ZainRizvi
diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml
index 28618ff..f90ffd1 100644
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@@ -68,31 +68,6 @@
docker-image: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-cuda11_7-py3_10-gcc7-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
- linux-focal-rocm5_4_2-py3_8-build:
- name: linux-focal-rocm5.4.2-py3.8
- uses: ./.github/workflows/_linux-build.yml
- with:
- build-environment: linux-focal-rocm5.4.2-py3.8
- docker-image-name: pytorch-linux-focal-rocm-n-py3
- test-matrix: |
- { include: [
- { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
- { config: "distributed", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
- { config: "distributed", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
- ]}
-
- linux-focal-rocm5_4_2-py3_8-test:
- name: linux-focal-rocm5.4.2-py3.8
- uses: ./.github/workflows/_rocm-test.yml
- needs: linux-focal-rocm5_4_2-py3_8-build
- with:
- build-environment: linux-focal-rocm5.4.2-py3.8
- docker-image: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.docker-image }}
- test-matrix: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.test-matrix }}
- secrets:
- AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
- AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
-
linux-bionic-cuda11_8-py3_9-gcc7-build:
name: linux-bionic-cuda11.8-py3.9-gcc7
uses: ./.github/workflows/_linux-build.yml
diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml
index 14b01de..ec3d8df 100644
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@@ -82,3 +82,26 @@
build-environment: linux-bionic-py3.8-clang9
docker-image: ${{ needs.linux-bionic-py3_8-clang9-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-bionic-py3_8-clang9-build.outputs.test-matrix }}
+
+ linux-focal-rocm5_4_2-py3_8-build:
+ name: linux-focal-rocm5.4.2-py3.8
+ uses: ./.github/workflows/_linux-build.yml
+ with:
+ build-environment: linux-focal-rocm5.4.2-py3.8
+ docker-image-name: pytorch-linux-focal-rocm-n-py3
+ test-matrix: |
+ { include: [
+ { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
+ ]}
+
+ linux-focal-rocm5_4_2-py3_8-test:
+ name: linux-focal-rocm5.4.2-py3.8
+ uses: ./.github/workflows/_rocm-test.yml
+ needs: linux-focal-rocm5_4_2-py3_8-build
+ with:
+ build-environment: linux-focal-rocm5.4.2-py3.8
+ docker-image: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.docker-image }}
+ test-matrix: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.test-matrix }}
+ secrets:
+ AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
+ AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}
diff --git a/.github/workflows/unstable-periodic.yml b/.github/workflows/unstable-periodic.yml
new file mode 100644
index 0000000..e9a281b
--- /dev/null
+++ b/.github/workflows/unstable-periodic.yml
@@ -0,0 +1,57 @@
+name: unstable-periodic
+
+on:
+ schedule:
+ - cron: 45 0,4,8,12,16,20 * * *
+ - cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
+ push:
+ tags:
+ - ciflow/unstable/*
+ workflow_dispatch:
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
+ cancel-in-progress: true
+
+jobs:
+ # There must be at least one job here to satisfy GitHub action workflow syntax
+ introduction:
+ runs-on: ubuntu-latest
+ continue-on-error: true
+ steps:
+ - name: Introduce PyTorch unstable (periodic) workflow
+ run: |
+ echo "PyTorch unstable workflow is used to host experimental or flaky jobs"
+ echo " that needs to be run periodically, but doesn't impact trunk as part"
+ echo " of the stable periodic workflows."
+ echo
+ echo "In addition, a new label called ciflow/unstable can be attached to the"
+ echo " PR to trigger this workflow. That can be done either manually or"
+ echo " automatically using PyTorch auto-label bot."
+ echo
+ echo "Once the jobs are deemed stable enough (% red signal < 5% and TTS < 3h),"
+ echo " they can graduate and move back to periodic."
+
+ linux-focal-rocm5_4_2-py3_8-build:
+ name: linux-focal-rocm5.4.2-py3.8
+ uses: ./.github/workflows/_linux-build.yml
+ with:
+ build-environment: linux-focal-rocm5.4.2-py3.8
+ docker-image-name: pytorch-linux-focal-rocm-n-py3
+ test-matrix: |
+ { include: [
+ { config: "distributed", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
+ { config: "distributed", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
+ ]}
+
+ linux-focal-rocm5_4_2-py3_8-test:
+ name: linux-focal-rocm5.4.2-py3.8
+ uses: ./.github/workflows/_rocm-test.yml
+ needs: linux-focal-rocm5_4_2-py3_8-build
+ with:
+ build-environment: linux-focal-rocm5.4.2-py3.8
+ docker-image: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.docker-image }}
+ test-matrix: ${{ needs.linux-focal-rocm5_4_2-py3_8-build.outputs.test-matrix }}
+ secrets:
+ AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }}
+ AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }}