sglang_v0.5.2/pytorch_2.8.0/.ci/pytorch/run_tests.sh

#!/bin/bash
# shellcheck disable=SC2086,SC2048,SC2068,SC2145,SC2034,SC2207,SC2143
# TODO: Re-enable shellchecks above
set -eux -o pipefail
# Essentially runs pytorch/test/run_test.py, but keeps track of which tests to
# skip in a centralized place.
#
# TODO Except for a few tests, this entire file is a giant TODO. Why are these
# tests failing?
# TODO deal with Windows
# This script expects to be in the pytorch root folder
if [[ ! -d 'test' || ! -f 'test/run_test.py' ]]; then
  echo "run_tests.sh expects to be run from the PyTorch root directory" \
       "but I'm actually in $(pwd)"
  exit 2
fi
# Allow master skip of all tests
if [[ -n "${SKIP_ALL_TESTS:-}" ]]; then
exit 0
fi
# If given specific test params then just run those
if [[ -n "${RUN_TEST_PARAMS:-}" ]]; then
echo "$(date) :: Calling user-command $(pwd)/test/run_test.py ${RUN_TEST_PARAMS[@]}"
python test/run_test.py ${RUN_TEST_PARAMS[@]}
exit 0
fi
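# Illustrative invocation of the escape hatch above (hypothetical test
# selection; not executed as part of this script):
#   RUN_TEST_PARAMS="-i test_torch" ./run_tests.sh
# hands "-i test_torch" straight to test/run_test.py and then exits.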
# Function to retry commands that sometimes time out or fail flakily
retry () {
  $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}
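# Each use of retry makes up to five attempts with exponential backoff
# (sleeping 1s, 2s, 4s, 8s between attempts). Example from later in this
# script:
#   retry pip install -qr requirements.txt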
# Parameters
##############################################################################
if [[ "$#" != 3 ]]; then
if [[ -z "${DESIRED_PYTHON:-}" || -z "${DESIRED_CUDA:-}" || -z "${PACKAGE_TYPE:-}" ]]; then
echo "USAGE: run_tests.sh PACKAGE_TYPE DESIRED_PYTHON DESIRED_CUDA"
echo "The env variable PACKAGE_TYPE must be set to 'manywheel' or 'libtorch'"
echo "The env variable DESIRED_PYTHON must be set like '2.7mu' or '3.6m' etc"
echo "The env variable DESIRED_CUDA must be set like 'cpu' or 'cu80' etc"
exit 1
fi
package_type="$PACKAGE_TYPE"
py_ver="$DESIRED_PYTHON"
cuda_ver="$DESIRED_CUDA"
else
package_type="$1"
py_ver="$2"
cuda_ver="$3"
fi
if [[ "$cuda_ver" == 'cpu-cxx11-abi' ]]; then
cuda_ver="cpu"
fi
# cu80, cu90, cu100, cpu
if [[ ${#cuda_ver} -eq 4 ]]; then
  cuda_ver_majmin="${cuda_ver:2:1}.${cuda_ver:3:1}"
elif [[ ${#cuda_ver} -eq 5 ]]; then
  cuda_ver_majmin="${cuda_ver:2:2}.${cuda_ver:4:1}"
fi
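# How the substring arithmetic above plays out (illustrative values):
#   cuda_ver="cu80"  -> length 4 -> "${cuda_ver:2:1}.${cuda_ver:3:1}" -> "8.0"
#   cuda_ver="cu100" -> length 5 -> "${cuda_ver:2:2}.${cuda_ver:4:1}" -> "10.0"
#   cuda_ver="cpu"   -> length 3 -> cuda_ver_majmin is left unset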
NUMPY_PACKAGE=""
if [[ ${py_ver} == "3.10" ]]; then
PROTOBUF_PACKAGE="protobuf>=3.17.2"
NUMPY_PACKAGE="numpy>=1.21.2"
else
PROTOBUF_PACKAGE="protobuf=3.14.0"
fi
# Environment initialization
if [[ "$(uname)" == Darwin ]]; then
  # Install the testing dependencies
  retry pip install -q future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml
else
  retry pip install -qr requirements.txt || true
  retry pip install -q hypothesis protobuf pytest setuptools || true
  numpy_ver=1.15
  case "$(python --version 2>&1)" in
    *2* | *3.5* | *3.6*)
      numpy_ver=1.11
      ;;
  esac
  retry pip install -q "numpy==${numpy_ver}" || true
fi
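# The case patterns above match against the full version string, e.g.
# "Python 3.6.8" matches *3.6* (numpy 1.11), while "Python 3.7.4" matches
# none of the patterns and keeps the default numpy 1.15.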
echo "Testing with:"
pip freeze
##############################################################################
# Smoke tests
##############################################################################
# TODO use check_binary.sh, which requires making sure it runs on Windows
pushd /
echo "Smoke testing imports"
python -c 'import torch'
# Test that MKL is there
if [[ "$(uname)" == 'Darwin' && "$package_type" == *wheel ]]; then
  echo 'Not checking for MKL on Darwin wheel packages'
else
  echo "Checking that MKL is available"
  python -c 'import torch; exit(0 if torch.backends.mkl.is_available() else 1)'
fi
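# Note: the one-liner above follows the exit-code probe pattern reused for
# the remaining smoke tests (magma, cuDNN, OpenMP): exit 0 marks success,
# and under 'set -e' any nonzero exit aborts the script.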
if [[ "$OSTYPE" == "msys" ]]; then
GPUS=$(wmic path win32_VideoController get name)
if [[ ! "$GPUS" == *NVIDIA* ]]; then
echo "Skip CUDA tests for machines without a Nvidia GPU card"
exit 0
fi
fi
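# Illustrative wmic output on a machine that passes the check (hypothetical
# device name):
#   Name
#   NVIDIA GeForce GTX 1080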
# Test that the version number is consistent during building and testing
if [[ "$PYTORCH_BUILD_NUMBER" -gt 1 ]]; then
  expected_version="${PYTORCH_BUILD_VERSION}.post${PYTORCH_BUILD_NUMBER}"
else
  expected_version="${PYTORCH_BUILD_VERSION}"
fi
echo "Checking that we are testing the package that is just built"
python -c "import torch; exit(0 if torch.__version__ == '$expected_version' else 1)"
# Test that CUDA builds are set up correctly
if [[ "$cuda_ver" != 'cpu' ]]; then
  cuda_installed=1
  nvidia-smi || cuda_installed=0
  if [[ "$cuda_installed" == 0 ]]; then
    echo "Skipping CUDA tests on machines without an NVIDIA GPU"
  else
    # Test CUDA archs
    echo "Checking that CUDA archs are set up correctly"
    timeout 20 python -c 'import torch; torch.randn([3,5]).cuda()'
    # These have to run after CUDA is initialized
    echo "Checking that magma is available"
    python -c 'import torch; torch.rand(1).cuda(); exit(0 if torch.cuda.has_magma else 1)'
    echo "Checking that cuDNN is available"
    python -c 'import torch; exit(0 if torch.backends.cudnn.is_available() else 1)'
  fi
fi
# Check that OpenBLAS is not linked against on macOS
if [[ "$(uname)" == 'Darwin' ]]; then
  echo "Checking that OpenBLAS is not linked against"
  all_dylibs=($(find "$(python -c "import site; print(site.getsitepackages()[0])")"/torch -name '*.dylib'))
  for dylib in "${all_dylibs[@]}"; do
    if [[ -n "$(otool -L $dylib | grep -i openblas)" ]]; then
      echo "Found OpenBLAS as a dependency of $dylib"
      echo "Full dependency list: $(otool -L $dylib)"
      exit 1
    fi
  done
  echo "Checking that OpenMP is available"
  python -c "import torch; exit(0 if torch.backends.openmp.is_available() else 1)"
fi
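# For reference, 'otool -L <dylib>' prints one linked library per line, so a
# line such as the following (hypothetical path) would trip the grep above:
#   /usr/local/opt/openblas/lib/libopenblas.0.dylib (compatibility version 0.0.0, current version 0.0.0)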
popd
# TODO re-enable the other tests after the nightlies are moved to CI. This is
# because the binaries keep breaking, often from additional tests that aren't
# real problems. Once these are on CircleCI and a smoke-binary-build is added
# to PRs, this should stop happening and these can be re-enabled.
echo "Not running unit tests. Hopefully these problems are caught by CI"
exit 0
##############################################################################
# Running unit tests (except not right now)
##############################################################################
echo "$(date) :: Starting tests for $package_type package for python$py_ver and $cuda_ver"
# We keep track of the exact tests to skip, as otherwise we would hardly be
# running any tests. But because of issues working with pytest vs. plain
# python tests, and because of special snowflake tests in test/run_test.py,
# we also take special care of those
tests_to_skip=()
#
# Entire file exclusions
##############################################################################
entire_file_exclusions=("-x")
# cpp_extensions doesn't work with pytest, so we exclude it from the pytest
# run here and then run it manually later. Note that this is fine only
# because the entire_file_exclusions flag is only passed to the pytest run
entire_file_exclusions+=("cpp_extensions")
# TODO temporary line to fix the next day's nightlies; should be removed when
# the issue is fixed
entire_file_exclusions+=('type_info')
if [[ "$cuda_ver" == 'cpu' ]]; then
# test/test_cuda.py exits early if the installed torch is not built with
# CUDA, but the exit doesn't work when running with pytest, so pytest will
# still try to run all the CUDA tests and then fail
entire_file_exclusions+=("cuda")
entire_file_exclusions+=("nccl")
fi
if [[ "$(uname)" == 'Darwin' || "$OSTYPE" == "msys" ]]; then
# pytest on Mac doesn't like the exits in these files
entire_file_exclusions+=('c10d')
entire_file_exclusions+=('distributed')
# pytest doesn't mind the exit but fails the tests. On Mac we run this
# later without pytest
entire_file_exclusions+=('thd_distributed')
fi
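# Once collected, the array expands into run_test.py's exclude flag; e.g. on
# a CPU-only Linux build the pytest invocation below effectively starts with
# (illustrative):
#   python test/run_test.py -v -p pytest -x cpp_extensions type_info cuda nccl ...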
#
# Universal flaky tests
##############################################################################
# RendezvousEnvTest sometimes hangs forever
# Otherwise it will fail on CUDA with
# Traceback (most recent call last):
# File "test_c10d.py", line 179, in test_common_errors
# next(gen)
# AssertionError: ValueError not raised
tests_to_skip+=('RendezvousEnvTest and test_common_errors')
# This hung forever once on conda_3.5_cu92
tests_to_skip+=('TestTorch and test_sum_dim')
# test_trace_warn isn't actually flaky, but it doesn't work with pytest so we
# just skip it
tests_to_skip+=('TestJit and test_trace_warn')
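# Note: each tests_to_skip entry is a pytest -k expression, so
# 'TestJit and test_trace_warn' matches only test IDs containing both
# substrings, i.e. that single method rather than all of TestJit.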
#
# Python specific flaky tests
##############################################################################
# test_dataloader.py:721: AssertionError
# looks like a timeout, but interestingly only appears on python 3
if [[ "$py_ver" == 3* ]]; then
tests_to_skip+=('TestDataLoader and test_proper_exit')
fi
#
# CUDA flaky tests, all package types
##############################################################################
if [[ "$cuda_ver" != 'cpu' ]]; then
#
# DistributedDataParallelTest
# All of these seem to fail
tests_to_skip+=('DistributedDataParallelTest')
#
# RendezvousEnvTest
# Traceback (most recent call last):
# File "test_c10d.py", line 201, in test_nominal
# store0, rank0, size0 = next(gen0)
# File "/opt/python/cp36-cp36m/lib/python3.6/site-packages/torch/distributed/rendezvous.py", line 131, in _env_rendezvous_handler
# store = TCPStore(master_addr, master_port, start_daemon)
# RuntimeError: Address already in use
tests_to_skip+=('RendezvousEnvTest and test_nominal')
  #
  # TestCppExtension
  #
  # Traceback (most recent call last):
  #   File "test_cpp_extensions.py", line 134, in test_jit_cudnn_extension
  #     with_cuda=True)
  #   File "/opt/python/cp35-cp35m/lib/python3.5/site-packages/torch/utils/cpp_extension.py", line 552, in load
  #     with_cuda)
  #   File "/opt/python/cp35-cp35m/lib/python3.5/site-packages/torch/utils/cpp_extension.py", line 729, in _jit_compile
  #     return _import_module_from_library(name, build_directory)
  #   File "/opt/python/cp35-cp35m/lib/python3.5/site-packages/torch/utils/cpp_extension.py", line 867, in _import_module_from_library
  #     return imp.load_module(module_name, file, path, description)
  #   File "/opt/python/cp35-cp35m/lib/python3.5/imp.py", line 243, in load_module
  #     return load_dynamic(name, filename, file)
  #   File "/opt/python/cp35-cp35m/lib/python3.5/imp.py", line 343, in load_dynamic
  #     return _load(spec)
  #   File "<frozen importlib._bootstrap>", line 693, in _load
  #   File "<frozen importlib._bootstrap>", line 666, in _load_unlocked
  #   File "<frozen importlib._bootstrap>", line 577, in module_from_spec
  #   File "<frozen importlib._bootstrap_external>", line 938, in create_module
  #   File "<frozen importlib._bootstrap>", line 222, in _call_with_frames_removed
  # ImportError: libcudnn.so.7: cannot open shared object file: No such file or directory
  tests_to_skip+=('TestCppExtension and test_jit_cudnn_extension')
  #
  # TestCuda
  #
  # 3.7_cu80
  # RuntimeError: CUDA error: out of memory
  tests_to_skip+=('TestCuda and test_arithmetic_large_tensor')
  # 3.7_cu80
  # RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch-nightly_1538097262541/work/aten/src/THC/THCTensorCopy.cu:205
  tests_to_skip+=('TestCuda and test_autogpu')
  #
  # TestDistBackend
  #
  # Traceback (most recent call last):
  #   File "test_thd_distributed.py", line 1046, in wrapper
  #     self._join_and_reduce(fn)
  #   File "test_thd_distributed.py", line 1108, in _join_and_reduce
  #     self.assertEqual(p.exitcode, first_process.exitcode)
  #   File "/pytorch/test/common.py", line 399, in assertEqual
  #     super(TestCase, self).assertEqual(x, y, message)
  # AssertionError: None != 77 :
  tests_to_skip+=('TestDistBackend and test_all_gather_group')
  tests_to_skip+=('TestDistBackend and test_all_reduce_group_max')
  tests_to_skip+=('TestDistBackend and test_all_reduce_group_min')
  tests_to_skip+=('TestDistBackend and test_all_reduce_group_sum')
  tests_to_skip+=('TestDistBackend and test_all_reduce_group_product')
  tests_to_skip+=('TestDistBackend and test_barrier_group')
  tests_to_skip+=('TestDistBackend and test_broadcast_group')
  # Traceback (most recent call last):
  #   File "test_thd_distributed.py", line 1046, in wrapper
  #     self._join_and_reduce(fn)
  #   File "test_thd_distributed.py", line 1108, in _join_and_reduce
  #     self.assertEqual(p.exitcode, first_process.exitcode)
  #   File "/pytorch/test/common.py", line 397, in assertEqual
  #     super(TestCase, self).assertLessEqual(abs(x - y), prec, message)
  # AssertionError: 12 not less than or equal to 1e-05
  tests_to_skip+=('TestDistBackend and test_barrier')
  # Traceback (most recent call last):
  #   File "test_distributed.py", line 1267, in wrapper
  #     self._join_and_reduce(fn)
  #   File "test_distributed.py", line 1350, in _join_and_reduce
  #     self.assertEqual(p.exitcode, first_process.exitcode)
  #   File "/pytorch/test/common.py", line 399, in assertEqual
  #     super(TestCase, self).assertEqual(x, y, message)
  # AssertionError: None != 1
  tests_to_skip+=('TestDistBackend and test_broadcast')
  # Memory leak very similar to all the conda ones below, but appears on manywheel
  # 3.6m_cu80
  # AssertionError: 1605632 not less than or equal to 1e-05 : __main__.TestEndToEndHybridFrontendModels.test_vae_cuda leaked 1605632 bytes CUDA memory on device 0
  tests_to_skip+=('TestEndToEndHybridFrontendModels and test_vae_cuda')
  # ________________________ TestNN.test_embedding_bag_cuda ________________________
  #
  # self = <test_nn.TestNN testMethod=test_embedding_bag_cuda>
  # dtype = torch.float32
  #
  #     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
  #     @repeat_test_for_types(ALL_TENSORTYPES)
  #     @skipIfRocm
  #     def test_embedding_bag_cuda(self, dtype=torch.float):
  #         self._test_EmbeddingBag(True, 'sum', False, dtype)
  #         self._test_EmbeddingBag(True, 'mean', False, dtype)
  #         self._test_EmbeddingBag(True, 'max', False, dtype)
  #         if dtype != torch.half:
  #             # torch.cuda.sparse.HalfTensor is not enabled.
  #             self._test_EmbeddingBag(True, 'sum', True, dtype)
  # >           self._test_EmbeddingBag(True, 'mean', True, dtype)
  #
  # test_nn.py:2144:
  # _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
  # test_nn.py:2062: in _test_EmbeddingBag
  #     _test_vs_Embedding(N, D, B, L)
  # test_nn.py:2059: in _test_vs_Embedding
  #     self.assertEqual(es_weight_grad, e.weight.grad, needed_prec)
  # common.py:373: in assertEqual
  #     assertTensorsEqual(x, y)
  # common.py:365: in assertTensorsEqual
  #     self.assertLessEqual(max_err, prec, message)
  # E   AssertionError: tensor(0.0000, device='cuda:0', dtype=torch.float32) not less than or equal to 2e-05 :
  # 1 failed, 1202 passed, 19 skipped, 2 xfailed, 796 warnings in 1166.73 seconds =
  # Traceback (most recent call last):
  #   File "test/run_test.py", line 391, in <module>
  #     main()
  #   File "test/run_test.py", line 383, in main
  #     raise RuntimeError(message)
  tests_to_skip+=('TestNN and test_embedding_bag_cuda')
fi
##############################################################################
# MacOS specific flaky tests
##############################################################################
if [[ "$(uname)" == 'Darwin' ]]; then
# TestCppExtensions by default uses a temp folder in /tmp. This doesn't
# work for this Mac machine cause there is only one machine and /tmp is
# shared. (All the linux builds are on docker so have their own /tmp).
tests_to_skip+=('TestCppExtension')
fi
# Turn the set of tests to skip into an invocation that pytest understands
excluded_tests_logic=''
for exclusion in "${tests_to_skip[@]}"; do
  if [[ -z "$excluded_tests_logic" ]]; then
    # Only true on the first iteration
    excluded_tests_logic="not ($exclusion)"
  else
    excluded_tests_logic="$excluded_tests_logic and not ($exclusion)"
  fi
done
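# With the universal skips above, the resulting expression starts like this
# (illustrative, truncated):
#   not (RendezvousEnvTest and test_common_errors) and not (TestTorch and test_sum_dim) and ...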
##############################################################################
# Run the tests
##############################################################################
echo
echo "$(date) :: Calling 'python test/run_test.py -v -p pytest ${entire_file_exclusions[@]} -- --disable-pytest-warnings -k '$excluded_tests_logic'"
# The -k expression must reach pytest as a single argument
python test/run_test.py -v -p pytest ${entire_file_exclusions[@]} -- --disable-pytest-warnings -k "$excluded_tests_logic"
echo
echo "$(date) :: Finished 'python test/run_test.py -v -p pytest ${entire_file_exclusions[@]} -- --disable-pytest-warnings -k '$excluded_tests_logic'"
# cpp_extensions don't work with pytest, so we run them without pytest here,
# except there's a failure on CUDA builds (documented above), and
# cpp_extensions doesn't work on a shared Mac machine (also documented above)
if [[ "$cuda_ver" == 'cpu' && "$(uname)" != 'Darwin' ]]; then
  echo
  echo "$(date) :: Calling 'python test/run_test.py -v -i cpp_extensions'"
  python test/run_test.py -v -i cpp_extensions
  echo
  echo "$(date) :: Finished 'python test/run_test.py -v -i cpp_extensions'"
fi
# thd_distributed can run on Mac but not in pytest
if [[ "$(uname)" == 'Darwin' ]]; then
  echo
  echo "$(date) :: Calling 'python test/run_test.py -v -i thd_distributed'"
  python test/run_test.py -v -i thd_distributed
  echo
  echo "$(date) :: Finished 'python test/run_test.py -v -i thd_distributed'"
fi