#!/bin/bash
# shellcheck disable=SC2086,SC2048,SC2068,SC2145,SC2034,SC2207,SC2143
# TODO: Re-enable shellchecks above

set -eux -o pipefail

# Essentially runs pytorch/test/run_test.py, but keeps track of which tests to
# skip in a centralized place.
#
# TODO Except for a few tests, this entire file is a giant TODO. Why are these
# tests failing?
# TODO deal with Windows

# This script expects to be in the pytorch root folder
if [[ ! -d 'test' || ! -f 'test/run_test.py' ]]; then
  echo "run_tests.sh expects to be run from the PyTorch root directory " \
       "but I'm actually in $(pwd)"
  exit 2
fi

# Allow master skip of all tests
if [[ -n "${SKIP_ALL_TESTS:-}" ]]; then
  exit 0
fi

# If given specific test params then just run those
if [[ -n "${RUN_TEST_PARAMS:-}" ]]; then
  echo "$(date) :: Calling user-command $(pwd)/test/run_test.py ${RUN_TEST_PARAMS[@]}"
  python test/run_test.py ${RUN_TEST_PARAMS[@]}
  exit 0
fi

# Function to retry commands that sometimes time out or fail flakily
retry () {
  $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}

# Parameters
##############################################################################
if [[ "$#" != 3 ]]; then
  if [[ -z "${DESIRED_PYTHON:-}" || -z "${DESIRED_CUDA:-}" || -z "${PACKAGE_TYPE:-}" ]]; then
    echo "USAGE: run_tests.sh PACKAGE_TYPE DESIRED_PYTHON DESIRED_CUDA"
    echo "The env variable PACKAGE_TYPE must be set to 'manywheel' or 'libtorch'"
    echo "The env variable DESIRED_PYTHON must be set like '2.7mu' or '3.6m' etc"
    echo "The env variable DESIRED_CUDA must be set like 'cpu' or 'cu80' etc"
    exit 1
  fi
  package_type="$PACKAGE_TYPE"
  py_ver="$DESIRED_PYTHON"
  cuda_ver="$DESIRED_CUDA"
else
  package_type="$1"
  py_ver="$2"
  cuda_ver="$3"
fi

if [[ "$cuda_ver" == 'cpu-cxx11-abi' ]]; then
  cuda_ver="cpu"
fi

# cu80, cu90, cu100, cpu -> 8.0, 9.0, 10.0 (left unset for cpu)
if [[ ${#cuda_ver} -eq 4 ]]; then
  cuda_ver_majmin="${cuda_ver:2:1}.${cuda_ver:3:1}"
elif [[ ${#cuda_ver} -eq 5 ]]; then
  cuda_ver_majmin="${cuda_ver:2:2}.${cuda_ver:4:1}"
fi

NUMPY_PACKAGE=""
if [[ ${py_ver} == "3.10" ]]; then
  PROTOBUF_PACKAGE="protobuf>=3.17.2"
  NUMPY_PACKAGE="numpy>=1.21.2"
else
  PROTOBUF_PACKAGE="protobuf==3.14.0"
fi

# Environment initialization
if [[ "$(uname)" == Darwin ]]; then
  # Install the testing dependencies
  retry pip install -q future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml
else
  retry pip install -qr requirements.txt || true
  retry pip install -q hypothesis protobuf pytest setuptools || true
  numpy_ver=1.15
  case "$(python --version 2>&1)" in
    *2* | *3.5* | *3.6*)
      numpy_ver=1.11
      ;;
  esac
  retry pip install -q "numpy==${numpy_ver}" || true
fi

echo "Testing with:"
pip freeze

##############################################################################
# Smoke tests
##############################################################################
# TODO use check_binary.sh, which requires making sure it runs on Windows
pushd /
echo "Smoke testing imports"
python -c 'import torch'

# Test that MKL is there
if [[ "$(uname)" == 'Darwin' && "$package_type" == *wheel ]]; then
  echo 'Not checking for MKL on Darwin wheel packages'
else
  echo "Checking that MKL is available"
  python -c 'import torch; exit(0 if torch.backends.mkl.is_available() else 1)'
fi
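
# The smoke checks in this section all follow the same pattern: run a one-line
# Python probe and let its exit status drive `set -e`. A minimal sketch of that
# pattern as a helper is shown below for clarity; `smoke_check` is illustrative
# only and is not invoked anywhere in this script.
smoke_check () {
  # $1: human-readable description, $2: Python expression that must be truthy
  echo "Checking that $1"
  python -c "import torch; exit(0 if ($2) else 1)"
}
# e.g. smoke_check "MKL is available" "torch.backends.mkl.is_available()"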
"$GPUS" == *NVIDIA* ]]; then echo "Skip CUDA tests for machines without a Nvidia GPU card" exit 0 fi fi # Test that the version number is consistent during building and testing if [[ "$PYTORCH_BUILD_NUMBER" -gt 1 ]]; then expected_version="${PYTORCH_BUILD_VERSION}.post${PYTORCH_BUILD_NUMBER}" else expected_version="${PYTORCH_BUILD_VERSION}" fi echo "Checking that we are testing the package that is just built" python -c "import torch; exit(0 if torch.__version__ == '$expected_version' else 1)" # Test that CUDA builds are setup correctly if [[ "$cuda_ver" != 'cpu' ]]; then cuda_installed=1 nvidia-smi || cuda_installed=0 if [[ "$cuda_installed" == 0 ]]; then echo "Skip CUDA tests for machines without a Nvidia GPU card" else # Test CUDA archs echo "Checking that CUDA archs are setup correctly" timeout 20 python -c 'import torch; torch.randn([3,5]).cuda()' # These have to run after CUDA is initialized echo "Checking that magma is available" python -c 'import torch; torch.rand(1).cuda(); exit(0 if torch.cuda.has_magma else 1)' echo "Checking that CuDNN is available" python -c 'import torch; exit(0 if torch.backends.cudnn.is_available() else 1)' fi fi # Check that OpenBlas is not linked to on MacOS if [[ "$(uname)" == 'Darwin' ]]; then echo "Checking the OpenBLAS is not linked to" all_dylibs=($(find "$(python -c "import site; print(site.getsitepackages()[0])")"/torch -name '*.dylib')) for dylib in "${all_dylibs[@]}"; do if [[ -n "$(otool -L $dylib | grep -i openblas)" ]]; then echo "Found openblas as a dependency of $dylib" echo "Full dependencies is: $(otool -L $dylib)" exit 1 fi done echo "Checking that OpenMP is available" python -c "import torch; exit(0 if torch.backends.openmp.is_available() else 1)" fi popd # TODO re-enable the other tests after the nightlies are moved to CI. This is # because the binaries keep breaking, often from additional tests, that aren't # real problems. Once these are on circleci and a smoke-binary-build is added # to PRs then this should stop happening and these can be re-enabled. echo "Not running unit tests. Hopefully these problems are caught by CI" exit 0 ############################################################################## # Running unit tests (except not right now) ############################################################################## echo "$(date) :: Starting tests for $package_type package for python$py_ver and $cuda_ver" # We keep track of exact tests to skip, as otherwise we would be hardly running # any tests. But b/c of issues working with pytest/normal-python-test/ and b/c # of special snowflake tests in test/run_test.py we also take special care of # those tests_to_skip=() # # Entire file exclusions ############################################################################## entire_file_exclusions=("-x") # cpp_extensions doesn't work with pytest, so we exclude it from the pytest run # here and then manually run it later. 

# cpp_extensions doesn't work with pytest, so we exclude it from the pytest run
# here and then manually run it later. Note that this only works because the
# entire_file_exclusions flag is only passed to the pytest run.
entire_file_exclusions+=("cpp_extensions")

# TODO temporary line to fix the next day's nightlies, but should be removed
# when the issue is fixed
entire_file_exclusions+=('type_info')

if [[ "$cuda_ver" == 'cpu' ]]; then
  # test/test_cuda.py exits early if the installed torch is not built with
  # CUDA, but the exit doesn't work when running with pytest, so pytest will
  # still try to run all the CUDA tests and then fail
  entire_file_exclusions+=("cuda")
  entire_file_exclusions+=("nccl")
fi

if [[ "$(uname)" == 'Darwin' || "$OSTYPE" == "msys" ]]; then
  # pytest on Mac doesn't like the exits in these files
  entire_file_exclusions+=('c10d')
  entire_file_exclusions+=('distributed')

  # pytest doesn't mind the exit but fails the tests. On Mac we run this
  # later without pytest
  entire_file_exclusions+=('thd_distributed')
fi

#
# Universal flaky tests
##############################################################################

# RendezvousEnvTest sometimes hangs forever. Otherwise it will fail on CUDA with
#   Traceback (most recent call last):
#     File "test_c10d.py", line 179, in test_common_errors
#       next(gen)
#   AssertionError: ValueError not raised
tests_to_skip+=('RendezvousEnvTest and test_common_errors')

# This hung forever once on conda_3.5_cu92
tests_to_skip+=('TestTorch and test_sum_dim')

# test_trace_warn isn't actually flaky, but it doesn't work with pytest so we
# just skip it
tests_to_skip+=('TestJit and test_trace_warn')

#
# Python specific flaky tests
##############################################################################

# test_dataloader.py:721: AssertionError
# Looks like a timeout, but interestingly only appears on Python 3
if [[ "$py_ver" == 3* ]]; then
  tests_to_skip+=('TestDataLoader and test_proper_exit')
fi
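
# Each tests_to_skip entry is a 'TestClass and test_name' expression; the same
# string can be handed straight to pytest's -k filter to reproduce a single
# skipped test in isolation, e.g. (illustrative command, run from the pytorch
# root, with the file name taken from the traceback above):
#   pytest test/test_c10d.py -k 'RendezvousEnvTest and test_common_errors'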

#
# CUDA flaky tests, all package types
##############################################################################

if [[ "$cuda_ver" != 'cpu' ]]; then

  #
  # DistributedDataParallelTest
  # All of these seem to fail
  tests_to_skip+=('DistributedDataParallelTest')

  #
  # RendezvousEnvTest
  #
  #   Traceback (most recent call last):
  #     File "test_c10d.py", line 201, in test_nominal
  #       store0, rank0, size0 = next(gen0)
  #     File "/opt/python/cp36-cp36m/lib/python3.6/site-packages/torch/distributed/rendezvous.py", line 131, in _env_rendezvous_handler
  #       store = TCPStore(master_addr, master_port, start_daemon)
  #   RuntimeError: Address already in use
  tests_to_skip+=('RendezvousEnvTest and test_nominal')

  #
  # TestCppExtension
  #
  #   Traceback (most recent call last):
  #     File "test_cpp_extensions.py", line 134, in test_jit_cudnn_extension
  #       with_cuda=True)
  #     File "/opt/python/cp35-cp35m/lib/python3.5/site-packages/torch/utils/cpp_extension.py", line 552, in load
  #       with_cuda)
  #     File "/opt/python/cp35-cp35m/lib/python3.5/site-packages/torch/utils/cpp_extension.py", line 729, in _jit_compile
  #       return _import_module_from_library(name, build_directory)
  #     File "/opt/python/cp35-cp35m/lib/python3.5/site-packages/torch/utils/cpp_extension.py", line 867, in _import_module_from_library
  #       return imp.load_module(module_name, file, path, description)
  #     File "/opt/python/cp35-cp35m/lib/python3.5/imp.py", line 243, in load_module
  #       return load_dynamic(name, filename, file)
  #     File "/opt/python/cp35-cp35m/lib/python3.5/imp.py", line 343, in load_dynamic
  #       return _load(spec)
  #     File "<frozen importlib._bootstrap>", line 693, in _load
  #     File "<frozen importlib._bootstrap>", line 666, in _load_unlocked
  #     File "<frozen importlib._bootstrap>", line 577, in module_from_spec
  #     File "<frozen importlib._bootstrap>", line 938, in create_module
  #     File "<frozen importlib._bootstrap>", line 222, in _call_with_frames_removed
  #   ImportError: libcudnn.so.7: cannot open shared object file: No such file or directory
  tests_to_skip+=('TestCppExtension and test_jit_cudnn_extension')

  #
  # TestCuda
  #

  # 3.7_cu80
  #   RuntimeError: CUDA error: out of memory
  tests_to_skip+=('TestCuda and test_arithmetic_large_tensor')

  # 3.7_cu80
  #   RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch-nightly_1538097262541/work/aten/src/THC/THCTensorCopy.cu:205
  tests_to_skip+=('TestCuda and test_autogpu')

  #
  # TestDistBackend
  #
  #   Traceback (most recent call last):
  #     File "test_thd_distributed.py", line 1046, in wrapper
  #       self._join_and_reduce(fn)
  #     File "test_thd_distributed.py", line 1108, in _join_and_reduce
  #       self.assertEqual(p.exitcode, first_process.exitcode)
  #     File "/pytorch/test/common.py", line 399, in assertEqual
  #       super(TestCase, self).assertEqual(x, y, message)
  #   AssertionError: None != 77 :
  tests_to_skip+=('TestDistBackend and test_all_gather_group')
  tests_to_skip+=('TestDistBackend and test_all_reduce_group_max')
  tests_to_skip+=('TestDistBackend and test_all_reduce_group_min')
  tests_to_skip+=('TestDistBackend and test_all_reduce_group_sum')
  tests_to_skip+=('TestDistBackend and test_all_reduce_group_product')
  tests_to_skip+=('TestDistBackend and test_barrier_group')
  tests_to_skip+=('TestDistBackend and test_broadcast_group')

  #   Traceback (most recent call last):
  #     File "test_thd_distributed.py", line 1046, in wrapper
  #       self._join_and_reduce(fn)
  #     File "test_thd_distributed.py", line 1108, in _join_and_reduce
  #       self.assertEqual(p.exitcode, first_process.exitcode)
  #     File "/pytorch/test/common.py", line 397, in assertEqual
  #       super(TestCase, self).assertLessEqual(abs(x - y), prec, message)
  #   AssertionError: 12 not less than or equal to 1e-05
  tests_to_skip+=('TestDistBackend and test_barrier')

  #   Traceback (most recent call last):
  #     File "test_distributed.py", line 1267, in wrapper
  #       self._join_and_reduce(fn)
  #     File "test_distributed.py", line 1350, in _join_and_reduce
  #       self.assertEqual(p.exitcode, first_process.exitcode)
  #     File "/pytorch/test/common.py", line 399, in assertEqual
  #       super(TestCase, self).assertEqual(x, y, message)
  #   AssertionError: None != 1
  tests_to_skip+=('TestDistBackend and test_broadcast')

  # Memory leak very similar to all the conda ones below, but appears on manywheel
  # 3.6m_cu80
  #   AssertionError: 1605632 not less than or equal to 1e-05 : __main__.TestEndToEndHybridFrontendModels.test_vae_cuda leaked 1605632 bytes CUDA memory on device 0
  tests_to_skip+=('TestEndToEndHybridFrontendModels and test_vae_cuda')

  # ________________________ TestNN.test_embedding_bag_cuda ________________________
  #
  # self =
  # dtype = torch.float32
  #
  #     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
  #     @repeat_test_for_types(ALL_TENSORTYPES)
  #     @skipIfRocm
  #     def test_embedding_bag_cuda(self, dtype=torch.float):
  #         self._test_EmbeddingBag(True, 'sum', False, dtype)
  #         self._test_EmbeddingBag(True, 'mean', False, dtype)
  #         self._test_EmbeddingBag(True, 'max', False, dtype)
  #         if dtype != torch.half:
  #             # torch.cuda.sparse.HalfTensor is not enabled.
  #             self._test_EmbeddingBag(True, 'sum', True, dtype)
  # >           self._test_EmbeddingBag(True, 'mean', True, dtype)
  #
  # test_nn.py:2144:
  # _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
  # test_nn.py:2062: in _test_EmbeddingBag
  #     _test_vs_Embedding(N, D, B, L)
  # test_nn.py:2059: in _test_vs_Embedding
  #     self.assertEqual(es_weight_grad, e.weight.grad, needed_prec)
  # common.py:373: in assertEqual
  #     assertTensorsEqual(x, y)
  # common.py:365: in assertTensorsEqual
  #     self.assertLessEqual(max_err, prec, message)
  # E   AssertionError: tensor(0.0000, device='cuda:0', dtype=torch.float32) not less than or equal to 2e-05 :
  # 1 failed, 1202 passed, 19 skipped, 2 xfailed, 796 warnings in 1166.73 seconds =
  # Traceback (most recent call last):
  #   File "test/run_test.py", line 391, in <module>
  #     main()
  #   File "test/run_test.py", line 383, in main
  #     raise RuntimeError(message)
  tests_to_skip+=('TestNN and test_embedding_bag_cuda')
fi

##############################################################################
# MacOS specific flaky tests
##############################################################################

if [[ "$(uname)" == 'Darwin' ]]; then
  # TestCppExtensions by default uses a temp folder in /tmp. This doesn't work
  # for this Mac machine because there is only one machine and /tmp is shared.
  # (All the Linux builds are on Docker, so they have their own /tmp.)
  tests_to_skip+=('TestCppExtension')
fi

# Turn the set of tests to skip into an invocation that pytest understands
excluded_tests_logic=''
for exclusion in "${tests_to_skip[@]}"; do
  if [[ -z "$excluded_tests_logic" ]]; then
    # Only true for the first iteration
    excluded_tests_logic="not ($exclusion)"
  else
    excluded_tests_logic="$excluded_tests_logic and not ($exclusion)"
  fi
done
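
# For illustration (list truncated to two entries): with
#   tests_to_skip=('RendezvousEnvTest and test_common_errors' 'TestTorch and test_sum_dim')
# the loop above produces
#   excluded_tests_logic="not (RendezvousEnvTest and test_common_errors) and not (TestTorch and test_sum_dim)"
# which pytest evaluates as a single boolean -k expression to deselect exactly
# those tests.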

##############################################################################
# Run the tests
##############################################################################
echo
echo "$(date) :: Calling 'python test/run_test.py -v -p pytest ${entire_file_exclusions[@]} -- --disable-pytest-warnings -k '$excluded_tests_logic'"

python test/run_test.py -v -p pytest ${entire_file_exclusions[@]} -- --disable-pytest-warnings -k "'" "$excluded_tests_logic" "'"

echo
echo "$(date) :: Finished 'python test/run_test.py -v -p pytest ${entire_file_exclusions[@]} -- --disable-pytest-warnings -k '$excluded_tests_logic'"

# cpp_extensions don't work with pytest, so we run them without pytest here,
# except there's a failure on CUDA builds (documented above), and
# cpp_extensions doesn't work on a shared Mac machine (also documented above)
if [[ "$cuda_ver" == 'cpu' && "$(uname)" != 'Darwin' ]]; then
  echo
  echo "$(date) :: Calling 'python test/run_test.py -v -i cpp_extensions'"
  python test/run_test.py -v -i cpp_extensions
  echo
  echo "$(date) :: Finished 'python test/run_test.py -v -i cpp_extensions'"
fi

# thd_distributed can run on Mac but not in pytest
if [[ "$(uname)" == 'Darwin' ]]; then
  echo
  echo "$(date) :: Calling 'python test/run_test.py -v -i thd_distributed'"
  python test/run_test.py -v -i thd_distributed
  echo
  echo "$(date) :: Finished 'python test/run_test.py -v -i thd_distributed'"
fi