Enable caffe2 tests for ROCm jobs (#41604)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/41604
Reviewed By: ezyang
Differential Revision: D22603703
Pulled By: malfet
fbshipit-source-id: 789ccf2bb79668a5a68006bb877b2d88fb569809
diff --git a/.circleci/cimodel/data/pytorch_build_definitions.py b/.circleci/cimodel/data/pytorch_build_definitions.py
index 22aa53a..70fa343 100644
--- a/.circleci/cimodel/data/pytorch_build_definitions.py
+++ b/.circleci/cimodel/data/pytorch_build_definitions.py
@@ -34,7 +34,7 @@
@staticmethod
def is_test_phase(phase):
- return phase in ["test", "test1", "test2"]
+ return "test" in phase
# TODO: Eliminate the special casing for docker paths
# In the short term, we *will* need to support special casing as docker images are merged for caffe2 and pytorch
@@ -235,7 +235,7 @@
elif compiler_name == "rocm":
rocm_version = fc.find_prop("compiler_version")
- restrict_phases = ["build", "test1", "test2"]
+ restrict_phases = ["build", "test1", "test2", "caffe2_test"]
elif compiler_name == "android":
android_ndk_version = fc.find_prop("compiler_version")
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 19bf4ca..47b3472 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -608,12 +608,24 @@
command: |
set -e
+ cat >docker_commands.sh \<<EOL
+ # =================== The following code will be executed inside Docker container ===================
+ set -ex
+ export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}
+ ${PARALLEL_FLAGS}
+ source ./workspace/env
+ cd workspace
+ EOL
if [[ ${BUILD_ENVIRONMENT} == *"multigpu"* ]]; then
- export COMMAND='((echo "export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}" && echo "${PARALLEL_FLAGS}" && echo "source ./workspace/env" && echo "cd workspace && .jenkins/pytorch/multigpu-test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+ echo ".jenkins/pytorch/multigpu-test.sh" >> docker_commands.sh
+ elif [[ ${BUILD_ENVIRONMENT} == *caffe2* ]]; then
+ echo "pip -q install --user -b /tmp/pip_install_onnx \"file:///var/lib/jenkins/workspace/third_party/onnx#egg=onnx\"" >> docker_commands.sh
+ echo ".jenkins/caffe2/test.sh" >> docker_commands.sh
else
- export COMMAND='((echo "export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}" && echo "export CIRCLE_PULL_REQUEST=${CIRCLE_PULL_REQUEST}" && echo "${PARALLEL_FLAGS}" && echo "source ./workspace/env" && echo "cd workspace && .jenkins/pytorch/test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+ echo ".jenkins/pytorch/test.sh" >> docker_commands.sh
fi
- echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+ echo "(cat docker_commands.sh | docker exec -u jenkins -i "$id" bash) 2>&1" > command.sh
+ unbuffer bash command.sh | ts
- run:
name: Report results
no_output_timeout: "5m"
@@ -5733,6 +5745,19 @@
build_environment: "pytorch-linux-xenial-rocm3.5.1-py3.6-test2"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-rocm3.5.1-py3.6:8bdba785b1eac4d297d5f5930f979518012a56e0"
resource_class: pytorch/amd-gpu
+ - pytorch_linux_test:
+ name: pytorch_linux_xenial_rocm3_5_1_py3_6_caffe2_test
+ requires:
+ - pytorch_linux_xenial_rocm3_5_1_py3_6_build
+ filters:
+ branches:
+ only:
+ - master
+ - /ci-all\/.*/
+ - /release\/.*/
+ build_environment: "pytorch-linux-xenial-rocm3.5.1-py3.6-caffe2_test"
+ docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-rocm3.5.1-py3.6:8bdba785b1eac4d297d5f5930f979518012a56e0"
+ resource_class: pytorch/amd-gpu
- pytorch_linux_build:
name: pytorch_linux_xenial_py3_6_gcc5_4_build
build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-build"
diff --git a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml
index 745c9f8..091c98a 100644
--- a/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml
+++ b/.circleci/verbatim-sources/job-specs/pytorch-job-specs.yml
@@ -146,12 +146,24 @@
command: |
set -e
+ cat >docker_commands.sh \<<EOL
+ # =================== The following code will be executed inside Docker container ===================
+ set -ex
+ export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}
+ ${PARALLEL_FLAGS}
+ source ./workspace/env
+ cd workspace
+ EOL
if [[ ${BUILD_ENVIRONMENT} == *"multigpu"* ]]; then
- export COMMAND='((echo "export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}" && echo "${PARALLEL_FLAGS}" && echo "source ./workspace/env" && echo "cd workspace && .jenkins/pytorch/multigpu-test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+ echo ".jenkins/pytorch/multigpu-test.sh" >> docker_commands.sh
+ elif [[ ${BUILD_ENVIRONMENT} == *caffe2* ]]; then
+ echo "pip -q install --user -b /tmp/pip_install_onnx \"file:///var/lib/jenkins/workspace/third_party/onnx#egg=onnx\"" >> docker_commands.sh
+ echo ".jenkins/caffe2/test.sh" >> docker_commands.sh
else
- export COMMAND='((echo "export BUILD_ENVIRONMENT=${BUILD_ENVIRONMENT}" && echo "export CIRCLE_PULL_REQUEST=${CIRCLE_PULL_REQUEST}" && echo "${PARALLEL_FLAGS}" && echo "source ./workspace/env" && echo "cd workspace && .jenkins/pytorch/test.sh") | docker exec -u jenkins -i "$id" bash) 2>&1'
+ echo ".jenkins/pytorch/test.sh" >> docker_commands.sh
fi
- echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts
+ echo "(cat docker_commands.sh | docker exec -u jenkins -i "$id" bash) 2>&1" > command.sh
+ unbuffer bash command.sh | ts
- run:
name: Report results
no_output_timeout: "5m"
diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh
index 21c7875..0d15bf6 100755
--- a/.jenkins/caffe2/test.sh
+++ b/.jenkins/caffe2/test.sh
@@ -138,6 +138,10 @@
# This test has been flaky in ROCm CI (but note the tests are
# cpu-only so should be unrelated to ROCm)
rocm_ignore_test+=("--ignore $caffe2_pypath/python/operator_test/blobs_queue_db_test.py")
+ # This test is skipped on Jenkins (compiled without MKL) and is otherwise known to be flaky
+ rocm_ignore_test+=("--ignore $caffe2_pypath/python/ideep/convfusion_op_test.py")
+ # This test is skipped on Jenkins (compiled without MKL) and causes a segfault on CircleCI
+ rocm_ignore_test+=("--ignore $caffe2_pypath/python/ideep/pool_op_test.py")
fi
# NB: Warnings are disabled because they make it harder to see what
diff --git a/caffe2/python/ideep/elementwise_sum_op_test.py b/caffe2/python/ideep/elementwise_sum_op_test.py
index fb92834..9daf340 100644
--- a/caffe2/python/ideep/elementwise_sum_op_test.py
+++ b/caffe2/python/ideep/elementwise_sum_op_test.py
@@ -44,7 +44,7 @@
batch_size=st.integers(1, 3),
inputs=st.integers(2, 7),
inplace=st.booleans(),
- **mu.gcs)
+ **mu.gcs_cpu_ideep)
def test_elementwise_sum_fallback(self,
size,
input_channels,
@@ -84,7 +84,7 @@
batch_size=st.integers(1, 3),
inputs=st.integers(2, 7),
inplace=st.booleans(),
- **mu.gcs)
+ **mu.gcs_cpu_ideep)
def test_int8_elementwise_sum(self,
size,
input_channels,
diff --git a/caffe2/python/ideep/expanddims_squeeze_op_test.py b/caffe2/python/ideep/expanddims_squeeze_op_test.py
index cdd7cd9..4a4fb73 100644
--- a/caffe2/python/ideep/expanddims_squeeze_op_test.py
+++ b/caffe2/python/ideep/expanddims_squeeze_op_test.py
@@ -33,7 +33,7 @@
@given(
squeeze_dims=st.lists(st.integers(0, 3), min_size=1, max_size=3),
inplace=st.booleans(),
- **mu.gcs
+ **mu.gcs_cpu_ideep
)
def test_squeeze_fallback(self, squeeze_dims, inplace, gc, dc):
shape = [
@@ -92,7 +92,7 @@
@given(
squeeze_dims=st.lists(st.integers(0, 3), min_size=1, max_size=3),
inplace=st.booleans(),
- **mu.gcs
+ **mu.gcs_cpu_ideep
)
def test_expand_dims_fallback(self, squeeze_dims, inplace, gc, dc):
oshape = [
diff --git a/caffe2/python/ideep/fc_op_test.py b/caffe2/python/ideep/fc_op_test.py
index e4a9ee0..389a466 100644
--- a/caffe2/python/ideep/fc_op_test.py
+++ b/caffe2/python/ideep/fc_op_test.py
@@ -261,7 +261,7 @@
self.assertGradientChecks(gc, op, [X, W, b], i, [0])
@given(n=st.integers(2, 5), m=st.integers(2, 5),
- k=st.integers(2, 5), **mu.gcs)
+ k=st.integers(2, 5), **mu.gcs_cpu_ideep)
def test_int8_fc_4_dims(self, n, m, k, gc, dc):
X = np.random.rand(m, k, m, m).astype(np.float32) - 0.5
w = np.random.rand(n, k, m, m).astype(np.float32) - 0.5
diff --git a/caffe2/python/ideep/pool_op_test.py b/caffe2/python/ideep/pool_op_test.py
index 4fa6398..bd7b283 100644
--- a/caffe2/python/ideep/pool_op_test.py
+++ b/caffe2/python/ideep/pool_op_test.py
@@ -49,7 +49,7 @@
input_channels=st.integers(1, 3),
batch_size=st.integers(1, 3),
method=st.sampled_from(["MaxPool", "AveragePool"]),
- **mu.gcs)
+ **mu.gcs_cpu_ideep)
def test_int8_pooling(self, stride, pad, kernel, size,
input_channels, batch_size,
method, gc, dc):
diff --git a/caffe2/python/ideep/relu_op_test.py b/caffe2/python/ideep/relu_op_test.py
index 4e4451d..79ad242 100644
--- a/caffe2/python/ideep/relu_op_test.py
+++ b/caffe2/python/ideep/relu_op_test.py
@@ -35,7 +35,7 @@
input_channels=st.integers(1, 3),
batch_size=st.integers(1, 3),
inplace=st.booleans(),
- **mu.gcs)
+ **mu.gcs_cpu_ideep)
def test_int8_relu(self, size, input_channels, batch_size, inplace, gc, dc):
relu_fp32 = core.CreateOperator(
"Relu",