/*
 * This file is used to test mscclpp_allreduce.cu using mpirun.
 * It is adapted from
 * https://github.com/flashinfer-ai/flashinfer/blob/v0.2.5/src/test_sum_all_reduce.cu
 *
 * Usage:
 *   cd PATH-TO-THIS-FILE
 *   export MPI_HOME=/usr/local/mpi
 *   # export MPI_HOME=/opt/hpcx/ompi/
 *   export MSCCLPP_HOME=/workspace/test/mscclpp
 *
 *   nvcc -O2 -arch=native -std=c++17 test_mscclpp_allreduce.cu \
 *     -o test_mscclpp_allreduce -D_GLIBCXX_USE_CXX11_ABI=0 \
 *     -I${MSCCLPP_HOME}/include -L${MSCCLPP_HOME}/build -lmscclpp \
 *     -lnccl -I${MPI_HOME}/include -L${MPI_HOME}/lib -lmpi
 *
 *   /opt/hpcx/ompi/bin/mpirun --allow-run-as-root -H 127.0.0.1:8 -np 8 \
 *     --map-by ppr:8:node \
 *     --mca btl_openib_warn_no_device_params_found 0 \
 *     --mca btl_tcp_if_include bond0 \
 *     -x NCCL_RUNTIME_CONNECT=0 -x NCCL_IB_GID_INDEX=3 -x NCCL_DEBUG=WARN \
 *     -x LD_PRELOAD=${MSCCLPP_HOME}/build/libmscclpp.so ./test_mscclpp_allreduce
 */
#include <mpi.h>

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <vector>

#ifndef CHECK_CUDA_SUCCESS
#define CHECK_CUDA_SUCCESS(cmd)                                                               \
  do {                                                                                        \
    cudaError_t e = cmd;                                                                      \
    if (e != cudaSuccess) {                                                                   \
      printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, cudaGetErrorString(e));   \
      exit(EXIT_FAILURE);                                                                     \
    }                                                                                         \
  } while (0)
#endif

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include "mscclpp_allreduce.cuh"

template <typename T>
bool isclose(T a, T b, float rtol = 1e-5, float atol = 1e-8) {
  return fabs(a - b) <= (atol + rtol * fabs(b));
}

int main(int argc, char* argv[]) {
  // init mpi
  MPI_Init(&argc, &argv);
  printf("MPI Initialized.\n");

  int nranks, rank;
  // get world size and rank id
  MPI_Comm_size(MPI_COMM_WORLD, &nranks);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  cudaSetDevice(rank);
  printf("nranks: %d, rank: %d\n", nranks, rank);

  // init host and device buffers; rank r fills element i with (i + r)
  using T = float;
  using ReduceT = float;
  const size_t num_elems = 2 * 1024 * 1024;
  std::vector<T> host_buf(num_elems);
  for (uint32_t i = 0; i < num_elems; ++i) {
    host_buf[i] = T(i + rank);
  }
  thrust::device_vector<T> device_buf(host_buf);
  const size_t buf_size_in_bytes = num_elems * sizeof(T);

  std::vector<T> host_result_buf(num_elems);
  thrust::device_vector<T> device_result_buf(host_result_buf);

  std::vector<T> host_scratch_buf(num_elems * 8);
  for (uint32_t i = 0; i < num_elems; ++i) {
    host_scratch_buf[i] = 1;
  }
  thrust::device_vector<T> device_scratch_buf(host_scratch_buf);

  std::vector<T> host_put_buf(num_elems);
  thrust::device_vector<T> device_put_buf(host_put_buf);

  // bootstrap: rank 0 creates the unique id and broadcasts it to all ranks
  mscclpp::UniqueId unique_id;
  if (rank == 0) unique_id = mscclpp::TcpBootstrap::createUniqueId();
  MPI_Bcast(&unique_id, sizeof(unique_id), MPI_BYTE, 0, MPI_COMM_WORLD);

  // map each rank to its node and IB device, assuming 8 GPUs per node
  std::vector<int> rank_to_node(nranks);
  std::vector<int> rank_to_ib(nranks);
  for (int i = 0; i < nranks; i++) {
    rank_to_node[i] = i / 8;
    rank_to_ib[i] = i % 8;
  }

  cudaStream_t s;
  CHECK_CUDA_SUCCESS(cudaStreamCreate(&s));
  CHECK_CUDA_SUCCESS(cudaStreamSynchronize(s));

  if (nranks == 8) {
    auto context = std::make_shared<Msccl1NodeLLcontext>(
        unique_id, rank, nranks, thrust::raw_pointer_cast(device_scratch_buf.data()),
        buf_size_in_bytes * 8, rank_to_node, rank_to_ib);
    printf("rank: %d, Msccl1NodeLLcontext setup.\n", rank);
    MPI_Barrier(MPI_COMM_WORLD);
    context->allreduce(s, thrust::raw_pointer_cast(device_buf.data()),
                       thrust::raw_pointer_cast(device_result_buf.data()), device_buf.size());
  } else if (nranks == 16) {
    // TODO: this branch is untested since there is something wrong with mpirun on my test machine
    auto context = std::make_shared<Msccl2NodeLLcontext>(
        unique_id, rank, nranks, thrust::raw_pointer_cast(device_scratch_buf.data()),
        buf_size_in_bytes * 8, thrust::raw_pointer_cast(device_put_buf.data()),
        buf_size_in_bytes, rank_to_node, rank_to_ib);
    printf("rank: %d, Msccl2NodeLLcontext setup.\n", rank);
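    // Note: compared to the 1-node path, the 2-node context above additionally takes a
    // put buffer (device_put_buf), presumably used for staging cross-node (IB) transfers
    // in the LL allreduce; this is an assumption, since this branch is untested here.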
    MPI_Barrier(MPI_COMM_WORLD);
    context->allreduce(s, thrust::raw_pointer_cast(device_buf.data()),
                       thrust::raw_pointer_cast(device_result_buf.data()), device_buf.size());
  }

  // check result correctness: rank r contributes (i + r) for element i, so the
  // allreduce sum is i * nranks + (0 + 1 + ... + (nranks - 1)) = i * nranks + (nranks - 1) * nranks / 2
  thrust::host_vector<T> host_buf_result = device_result_buf;
  size_t num_results_error_atol_1e_3_rtol_1e_3 = 0;
  bool nan_detected = false;
  for (uint32_t i = 0; i < num_elems; ++i) {
    T expected = T(i * nranks + (nranks - 1) * nranks / 2);
    if (std::isnan(float(host_buf_result[i]))) {
      nan_detected = true;
    }
    if (!isclose(float(host_buf_result[i]), float(expected), 1e-3, 1e-3)) {
      num_results_error_atol_1e_3_rtol_1e_3++;
    }
  }
  float result_accuracy = 1. - float(num_results_error_atol_1e_3_rtol_1e_3) / float(num_elems);
  printf("rank: %d, nan_detected: %d accuracy: %f\n", rank, nan_detected, result_accuracy);

  CHECK_CUDA_SUCCESS(cudaStreamDestroy(s));
  MPI_Finalize();
  return 0;
}