sglang_v0.5.2/pytorch_2.8.0/.ci/pytorch/run_tests.sh

#!/bin/bash
# shellcheck disable=SC2086,SC2048,SC2068,SC2145,SC2034,SC2207,SC2143
# TODO: Re-enable shellchecks above
set -eux -o pipefail
# Essentially runs pytorch/test/run_test.py, but keeps track of which tests to
# skip in a centralized place.
#
# TODO Except for a few tests, this entire file is a giant TODO. Why are these
# tests failing?
# TODO deal with Windows
# This script expects to be in the pytorch root folder
if [[ ! -d 'test' || ! -f 'test/run_test.py' ]]; then
  echo "run_tests.sh expects to be run from the PyTorch root directory" \
       "but I'm actually in $(pwd)"
  exit 2
fi
# Allow master skip of all tests
if [[ -n "${SKIP_ALL_TESTS:-}" ]]; then
exit 0
fi
# If given specific test params then just run those
if [[ -n "${RUN_TEST_PARAMS:-}" ]]; then
echo "$(date) :: Calling user-command $(pwd)/test/run_test.py ${RUN_TEST_PARAMS[@]}"
python test/run_test.py ${RUN_TEST_PARAMS[@]}
exit 0
fi
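# Illustrative invocation of the escape hatch above (hypothetical test
# selection; not executed as part of this script):
#   RUN_TEST_PARAMS="-i test_torch" ./run_tests.sh
# hands "-i test_torch" straight to test/run_test.py and then exits.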
# Function to retry commands that sometimes time out or fail flakily
retry () {
  $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
}
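# Each use of retry makes up to five attempts with exponential backoff
# (sleeping 1s, 2s, 4s, 8s between attempts). Example from later in this
# script:
#   retry pip install -qr requirements.txt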
# Parameters
##############################################################################
if [[ "$#" != 3 ]]; then
if [[ -z "${DESIRED_PYTHON:-}" || -z "${DESIRED_CUDA:-}" || -z "${PACKAGE_TYPE:-}" ]]; then
echo "USAGE: run_tests.sh PACKAGE_TYPE DESIRED_PYTHON DESIRED_CUDA"
echo "The env variable PACKAGE_TYPE must be set to 'manywheel' or 'libtorch'"
echo "The env variable DESIRED_PYTHON must be set like '2.7mu' or '3.6m' etc"
echo "The env variable DESIRED_CUDA must be set like 'cpu' or 'cu80' etc"
exit 1
fi
package_type="$PACKAGE_TYPE"
py_ver="$DESIRED_PYTHON"
cuda_ver="$DESIRED_CUDA"
else
package_type="$1"
py_ver="$2"
cuda_ver="$3"
fi
if [[ "$cuda_ver" == 'cpu-cxx11-abi' ]]; then
cuda_ver="cpu"
fi
# cu80, cu90, cu100, cpu
if [[ ${#cuda_ver} -eq 4 ]]; then
  cuda_ver_majmin="${cuda_ver:2:1}.${cuda_ver:3:1}"
elif [[ ${#cuda_ver} -eq 5 ]]; then
  cuda_ver_majmin="${cuda_ver:2:2}.${cuda_ver:4:1}"
fi
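# How the substring arithmetic above plays out (illustrative values):
#   cuda_ver="cu80"  -> length 4 -> "${cuda_ver:2:1}.${cuda_ver:3:1}" -> "8.0"
#   cuda_ver="cu100" -> length 5 -> "${cuda_ver:2:2}.${cuda_ver:4:1}" -> "10.0"
#   cuda_ver="cpu"   -> length 3 -> cuda_ver_majmin is left unset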
NUMPY_PACKAGE=""
if [[ ${py_ver} == "3.10" ]]; then
PROTOBUF_PACKAGE="protobuf>=3.17.2"
NUMPY_PACKAGE="numpy>=1.21.2"
else
PROTOBUF_PACKAGE="protobuf=3.14.0"
fi
# Environment initialization
if [[ "$(uname)" == Darwin ]]; then
  # Install the testing dependencies
  retry pip install -q future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml
else
  retry pip install -qr requirements.txt || true
  retry pip install -q hypothesis protobuf pytest setuptools || true
  numpy_ver=1.15
  case "$(python --version 2>&1)" in
    *2* | *3.5* | *3.6*)
      numpy_ver=1.11
      ;;
  esac
  retry pip install -q "numpy==${numpy_ver}" || true
fi
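# The case patterns above match against the full version string, e.g.
# "Python 3.6.8" matches *3.6* (numpy 1.11), while "Python 3.7.4" matches
# none of the patterns and keeps the default numpy 1.15.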
echo "Testing with:"
pip freeze
##############################################################################
# Smoke tests
##############################################################################
# TODO use check_binary.sh, which requires making sure it runs on Windows
pushd /
echo "Smoke testing imports"
python -c 'import torch'
# Test that MKL is there
if [[ "$(uname)" == 'Darwin' && "$package_type" == *wheel ]]; then
  echo 'Not checking for MKL on Darwin wheel packages'
else
  echo "Checking that MKL is available"
  python -c 'import torch; exit(0 if torch.backends.mkl.is_available() else 1)'
fi
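# Note: the one-liner above follows the exit-code probe pattern reused for
# the remaining smoke tests (magma, cuDNN, OpenMP): exit 0 marks success,
# and under 'set -e' any nonzero exit aborts the script.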
if [[ "$OSTYPE" == "msys" ]]; then
GPUS=$(wmic path win32_VideoController get name)
if [[ ! "$GPUS" == *NVIDIA* ]]; then
echo "Skip CUDA tests for machines without a Nvidia GPU card"
exit 0
fi
fi
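# Illustrative wmic output on a machine that passes the check (hypothetical
# device name):
#   Name
#   NVIDIA GeForce GTX 1080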
# Test that the version number is consistent during building and testing
if [[ "$PYTORCH_BUILD_NUMBER" -gt 1 ]]; then
  expected_version="${PYTORCH_BUILD_VERSION}.post${PYTORCH_BUILD_NUMBER}"
else
  expected_version="${PYTORCH_BUILD_VERSION}"
fi
echo "Checking that we are testing the package that is just built"
python -c "import torch; exit(0 if torch.__version__ == '$expected_version' else 1)"
# Test that CUDA builds are set up correctly
if [[ "$cuda_ver" != 'cpu' ]]; then
  cuda_installed=1
  nvidia-smi || cuda_installed=0
  if [[ "$cuda_installed" == 0 ]]; then
    echo "Skipping CUDA tests on machines without an NVIDIA GPU"
  else
    # Test CUDA archs
    echo "Checking that CUDA archs are set up correctly"
    timeout 20 python -c 'import torch; torch.randn([3,5]).cuda()'
    # These have to run after CUDA is initialized
    echo "Checking that magma is available"
    python -c 'import torch; torch.rand(1).cuda(); exit(0 if torch.cuda.has_magma else 1)'
    echo "Checking that cuDNN is available"
    python -c 'import torch; exit(0 if torch.backends.cudnn.is_available() else 1)'
  fi
fi
# Check that OpenBLAS is not linked against on macOS
if [[ "$(uname)" == 'Darwin' ]]; then
  echo "Checking that OpenBLAS is not linked against"
  all_dylibs=($(find "$(python -c "import site; print(site.getsitepackages()[0])")"/torch -name '*.dylib'))
  for dylib in "${all_dylibs[@]}"; do
    if [[ -n "$(otool -L $dylib | grep -i openblas)" ]]; then
      echo "Found OpenBLAS as a dependency of $dylib"
      echo "Full dependency list: $(otool -L $dylib)"
      exit 1
    fi
  done
  echo "Checking that OpenMP is available"
  python -c "import torch; exit(0 if torch.backends.openmp.is_available() else 1)"
fi
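# For reference, 'otool -L <dylib>' prints one linked library per line, so a
# line such as the following (hypothetical path) would trip the grep above:
#   /usr/local/opt/openblas/lib/libopenblas.0.dylib (compatibility version 0.0.0, current version 0.0.0)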
popd
# TODO re-enable the other tests after the nightlies are moved to CI. This is
# because the binaries keep breaking, often from additional tests that aren't
# real problems. Once these are on CircleCI and a smoke-binary-build is added
# to PRs, this should stop happening and these can be re-enabled.
echo "Not running unit tests. Hopefully these problems are caught by CI"
exit 0
##############################################################################
# Running unit tests (except not right now)
##############################################################################
echo "$(date) :: Starting tests for $package_type package for python$py_ver and $cuda_ver"
# We keep track of the exact tests to skip, as otherwise we would hardly be
# running any tests. But because of issues working with pytest vs. plain
# python tests, and because of special snowflake tests in test/run_test.py,
# we also take special care of those
tests_to_skip=()
#
# Entire file exclusions
##############################################################################
entire_file_exclusions=("-x")
# cpp_extensions doesn't work with pytest, so we exclude it from the pytest
# run here and then run it manually later. Note that this is fine only
# because the entire_file_exclusions flag is only passed to the pytest run
entire_file_exclusions+=("cpp_extensions")
# TODO temporary line to fix the next day's nightlies; should be removed when
# the issue is fixed
entire_file_exclusions+=('type_info')
if [[ "$cuda_ver" == 'cpu' ]]; then
# test/test_cuda.py exits early if the installed torch is not built with
# CUDA, but the exit doesn't work when running with pytest, so pytest will
# still try to run all the CUDA tests and then fail
entire_file_exclusions+=("cuda")
entire_file_exclusions+=("nccl")
fi
if [[ "$(uname)" == 'Darwin' || "$OSTYPE" == "msys" ]]; then
# pytest on Mac doesn't like the exits in these files
entire_file_exclusions+=('c10d')
entire_file_exclusions+=('distributed')
# pytest doesn't mind the exit but fails the tests. On Mac we run this
# later without pytest
entire_file_exclusions+=('thd_distributed')
fi
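# Once collected, the array expands into run_test.py's exclude flag; e.g. on
# a CPU-only Linux build the pytest invocation below effectively starts with
# (illustrative):
#   python test/run_test.py -v -p pytest -x cpp_extensions type_info cuda nccl ...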
#
# Universal flaky tests
##############################################################################
# RendezvousEnvTest sometimes hangs forever
# Otherwise it will fail on CUDA with
# Traceback (most recent call last):
# File "test_c10d.py", line 179, in test_common_errors
# next(gen)
# AssertionError: ValueError not raised
tests_to_skip+=('RendezvousEnvTest and test_common_errors')
# This hung forever once on conda_3.5_cu92
tests_to_skip+=('TestTorch and test_sum_dim')
# test_trace_warn isn't actually flaky, but it doesn't work with pytest so we
# just skip it
tests_to_skip+=('TestJit and test_trace_warn')
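# Note: each tests_to_skip entry is a pytest -k expression, so
# 'TestJit and test_trace_warn' matches only test IDs containing both
# substrings, i.e. that single method rather than all of TestJit.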
#
# Python specific flaky tests
##############################################################################
# test_dataloader.py:721: AssertionError
# looks like a timeout, but interestingly only appears on python 3
if [[ "$py_ver" == 3* ]]; then
tests_to_skip+=('TestDataLoader and test_proper_exit')
fi
#
# CUDA flaky tests, all package types
##############################################################################
if [[ "$cuda_ver" != 'cpu' ]]; then
#
# DistributedDataParallelTest
# All of these seem to fail
tests_to_skip+=('DistributedDataParallelTest')
#
# RendezvousEnvTest
# Traceback (most recent call last):
# File "test_c10d.py", line 201, in test_nominal
# store0, rank0, size0 = next(gen0)
# File "/opt/python/cp36-cp36m/lib/python3.6/site-packages/torch/distributed/rendezvous.py", line 131, in _env_rendezvous_handler
# store = TCPStore(master_addr, master_port, start_daemon)
# RuntimeError: Address already in use
tests_to_skip+=('RendezvousEnvTest and test_nominal')
  #
  # TestCppExtension
  #
  # Traceback (most recent call last):
  #   File "test_cpp_extensions.py", line 134, in test_jit_cudnn_extension
  #     with_cuda=True)
  #   File "/opt/python/cp35-cp35m/lib/python3.5/site-packages/torch/utils/cpp_extension.py", line 552, in load
  #     with_cuda)
  #   File "/opt/python/cp35-cp35m/lib/python3.5/site-packages/torch/utils/cpp_extension.py", line 729, in _jit_compile
  #     return _import_module_from_library(name, build_directory)
  #   File "/opt/python/cp35-cp35m/lib/python3.5/site-packages/torch/utils/cpp_extension.py", line 867, in _import_module_from_library
  #     return imp.load_module(module_name, file, path, description)
  #   File "/opt/python/cp35-cp35m/lib/python3.5/imp.py", line 243, in load_module
  #     return load_dynamic(name, filename, file)
  #   File "/opt/python/cp35-cp35m/lib/python3.5/imp.py", line 343, in load_dynamic
  #     return _load(spec)
  #   File "<frozen importlib._bootstrap>", line 693, in _load
  #   File "<frozen importlib._bootstrap>", line 666, in _load_unlocked
  #   File "<frozen importlib._bootstrap>", line 577, in module_from_spec
  #   File "<frozen importlib._bootstrap_external>", line 938, in create_module
  #   File "<frozen importlib._bootstrap>", line 222, in _call_with_frames_removed
  # ImportError: libcudnn.so.7: cannot open shared object file: No such file or directory
  tests_to_skip+=('TestCppExtension and test_jit_cudnn_extension')
  #
  # TestCuda
  #
  # 3.7_cu80
  # RuntimeError: CUDA error: out of memory
  tests_to_skip+=('TestCuda and test_arithmetic_large_tensor')
  # 3.7_cu80
  # RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch-nightly_1538097262541/work/aten/src/THC/THCTensorCopy.cu:205
  tests_to_skip+=('TestCuda and test_autogpu')
  #
  # TestDistBackend
  #
  # Traceback (most recent call last):
  #   File "test_thd_distributed.py", line 1046, in wrapper
  #     self._join_and_reduce(fn)
  #   File "test_thd_distributed.py", line 1108, in _join_and_reduce
  #     self.assertEqual(p.exitcode, first_process.exitcode)
  #   File "/pytorch/test/common.py", line 399, in assertEqual
  #     super(TestCase, self).assertEqual(x, y, message)
  # AssertionError: None != 77 :
  tests_to_skip+=('TestDistBackend and test_all_gather_group')
  tests_to_skip+=('TestDistBackend and test_all_reduce_group_max')
  tests_to_skip+=('TestDistBackend and test_all_reduce_group_min')
  tests_to_skip+=('TestDistBackend and test_all_reduce_group_sum')
  tests_to_skip+=('TestDistBackend and test_all_reduce_group_product')
  tests_to_skip+=('TestDistBackend and test_barrier_group')
  tests_to_skip+=('TestDistBackend and test_broadcast_group')
  # Traceback (most recent call last):
  #   File "test_thd_distributed.py", line 1046, in wrapper
  #     self._join_and_reduce(fn)
  #   File "test_thd_distributed.py", line 1108, in _join_and_reduce
  #     self.assertEqual(p.exitcode, first_process.exitcode)
  #   File "/pytorch/test/common.py", line 397, in assertEqual
  #     super(TestCase, self).assertLessEqual(abs(x - y), prec, message)
  # AssertionError: 12 not less than or equal to 1e-05
  tests_to_skip+=('TestDistBackend and test_barrier')
  # Traceback (most recent call last):
  #   File "test_distributed.py", line 1267, in wrapper
  #     self._join_and_reduce(fn)
  #   File "test_distributed.py", line 1350, in _join_and_reduce
  #     self.assertEqual(p.exitcode, first_process.exitcode)
  #   File "/pytorch/test/common.py", line 399, in assertEqual
  #     super(TestCase, self).assertEqual(x, y, message)
  # AssertionError: None != 1
  tests_to_skip+=('TestDistBackend and test_broadcast')
  # Memory leak very similar to all the conda ones below, but appears on manywheel
  # 3.6m_cu80
  # AssertionError: 1605632 not less than or equal to 1e-05 : __main__.TestEndToEndHybridFrontendModels.test_vae_cuda leaked 1605632 bytes CUDA memory on device 0
  tests_to_skip+=('TestEndToEndHybridFrontendModels and test_vae_cuda')
  # ________________________ TestNN.test_embedding_bag_cuda ________________________
  #
  # self = <test_nn.TestNN testMethod=test_embedding_bag_cuda>
  # dtype = torch.float32
  #
  #     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
  #     @repeat_test_for_types(ALL_TENSORTYPES)
  #     @skipIfRocm
  #     def test_embedding_bag_cuda(self, dtype=torch.float):
  #         self._test_EmbeddingBag(True, 'sum', False, dtype)
  #         self._test_EmbeddingBag(True, 'mean', False, dtype)
  #         self._test_EmbeddingBag(True, 'max', False, dtype)
  #         if dtype != torch.half:
  #             # torch.cuda.sparse.HalfTensor is not enabled.
  #             self._test_EmbeddingBag(True, 'sum', True, dtype)
  # >           self._test_EmbeddingBag(True, 'mean', True, dtype)
  #
  # test_nn.py:2144:
  # _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
  # test_nn.py:2062: in _test_EmbeddingBag
  #     _test_vs_Embedding(N, D, B, L)
  # test_nn.py:2059: in _test_vs_Embedding
  #     self.assertEqual(es_weight_grad, e.weight.grad, needed_prec)
  # common.py:373: in assertEqual
  #     assertTensorsEqual(x, y)
  # common.py:365: in assertTensorsEqual
  #     self.assertLessEqual(max_err, prec, message)
  # E   AssertionError: tensor(0.0000, device='cuda:0', dtype=torch.float32) not less than or equal to 2e-05 :
  # 1 failed, 1202 passed, 19 skipped, 2 xfailed, 796 warnings in 1166.73 seconds =
  # Traceback (most recent call last):
  #   File "test/run_test.py", line 391, in <module>
  #     main()
  #   File "test/run_test.py", line 383, in main
  #     raise RuntimeError(message)
  tests_to_skip+=('TestNN and test_embedding_bag_cuda')
fi
##############################################################################
# MacOS specific flaky tests
##############################################################################
if [[ "$(uname)" == 'Darwin' ]]; then
# TestCppExtensions by default uses a temp folder in /tmp. This doesn't
# work for this Mac machine cause there is only one machine and /tmp is
# shared. (All the linux builds are on docker so have their own /tmp).
tests_to_skip+=('TestCppExtension')
fi
# Turn the set of tests to skip into an invocation that pytest understands
excluded_tests_logic=''
for exclusion in "${tests_to_skip[@]}"; do
  if [[ -z "$excluded_tests_logic" ]]; then
    # Only true on the first iteration
    excluded_tests_logic="not ($exclusion)"
  else
    excluded_tests_logic="$excluded_tests_logic and not ($exclusion)"
  fi
done
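# With the universal skips above, the resulting expression starts like this
# (illustrative, truncated):
#   not (RendezvousEnvTest and test_common_errors) and not (TestTorch and test_sum_dim) and ...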
##############################################################################
# Run the tests
##############################################################################
echo
echo "$(date) :: Calling 'python test/run_test.py -v -p pytest ${entire_file_exclusions[@]} -- --disable-pytest-warnings -k '$excluded_tests_logic'"
# The -k expression must reach pytest as a single argument
python test/run_test.py -v -p pytest ${entire_file_exclusions[@]} -- --disable-pytest-warnings -k "$excluded_tests_logic"
echo
echo "$(date) :: Finished 'python test/run_test.py -v -p pytest ${entire_file_exclusions[@]} -- --disable-pytest-warnings -k '$excluded_tests_logic'"
# cpp_extensions don't work with pytest, so we run them without pytest here,
# except there's a failure on CUDA builds (documented above), and
# cpp_extensions doesn't work on a shared Mac machine (also documented above)
if [[ "$cuda_ver" == 'cpu' && "$(uname)" != 'Darwin' ]]; then
  echo
  echo "$(date) :: Calling 'python test/run_test.py -v -i cpp_extensions'"
  python test/run_test.py -v -i cpp_extensions
  echo
  echo "$(date) :: Finished 'python test/run_test.py -v -i cpp_extensions'"
fi
# thd_distributed can run on Mac but not in pytest
if [[ "$(uname)" == 'Darwin' ]]; then
  echo
  echo "$(date) :: Calling 'python test/run_test.py -v -i thd_distributed'"
  python test/run_test.py -v -i thd_distributed
  echo
  echo "$(date) :: Finished 'python test/run_test.py -v -i thd_distributed'"
fi