sglang_v0.5.2/nvshmem_src/perftest/perftestRunner.py

#!/usr/bin/env python3
import os
import time
import sys
from subprocess import Popen
import perftestCommon
import logging

# Passed unchanged to perftestCommon.walk_dir by the script entry point.
# NOTE(review): presumably the PE counts to sweep for each test; the name
# suggests start/end/step but the value is a plain list - confirm against
# perftestCommon.
NPE_START_END_STEP = [1, 2, 4, 8, 16]

def post_process(nvshmem_install_path, perftest_install_path, ftesto, fteste):
  """Append environment diagnostics for failed tests to the error log.

  Writes, in this order, to ``fteste``: the output of ``nvidia-smi`` and
  ``nvidia-smi topo -m``, the values of a few environment variables, the
  ``ldd`` output of ``<nvshmem_install_path>/bin/nvshmrun.hydra`` and, for each
  entry in ``perftestCommon.failed_binary_cmdlines_list``, the entry's second
  element followed by the ``ldd`` output of its first element (both prefixed
  with ``perftest_install_path``).

  Args:
    nvshmem_install_path: NVSHMEM install prefix.
    perftest_install_path: prefix prepended to the failed binaries' paths.
    ftesto: unused; kept so existing callers keep working.
    fteste: open file (or file descriptor) that receives all the output.

  Raises:
    OSError: if one of the external tools (echo, nvidia-smi, ldd) cannot be
      started.
  """
  # Child processes write straight to the underlying descriptor, so anything
  # still sitting in the Python-level buffer must be flushed first to keep the
  # log in chronological order.
  flush = getattr(fteste, 'flush', None)
  if callable(flush):
    flush()

  def run_and_wait(cmdline):
    # Waiting for each child keeps the sections in order and reaps the
    # process; this replaces the fixed sleeps previously used for pacing.
    Popen(cmdline, stdout=fteste).wait()

  def echo(text):
    run_and_wait(['echo', text + '\r\n'])

  echo('nvidia-smi')
  run_and_wait(['nvidia-smi'])
  echo('')
  echo('nvidia-smi topo -m')
  run_and_wait(['nvidia-smi', 'topo', '-m'])
  echo('')

  # (variable name, whether an unset variable is reported explicitly)
  reported_variables = (
    ('CUDA_HOME', True),
    ('MPI_HOME', True),
    ('PATH', False),
    ('LD_LIBRARY_PATH', False),
    ('CUDA_VISIBLE_DEVICES', True),
  )
  for variable, report_missing in reported_variables:
    if variable in os.environ:
      echo(os.environ[variable])
    elif report_missing:
      echo(variable + ' not set')

  run_and_wait(['ldd', nvshmem_install_path + '/bin/nvshmrun.hydra'])

  echo('Failed tests : ')
  for binary_cmdline in perftestCommon.failed_binary_cmdlines_list:
    failed_binary = perftest_install_path + binary_cmdline[0]
    echo(perftest_install_path + binary_cmdline[1])
    echo('ldd ' + failed_binary)
    run_and_wait(['ldd', failed_binary])


if __name__ == '__main__':
  # Script entry point: configure logging, then pick one of the invocation
  # modes below based on the raw command-line arguments.
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger('[perftestRunner]')

  # Raw arguments; only used to decide which mode to run in, the actual
  # parsing happens inside the selected branch.
  args_list = sys.argv[1:]

  if "--interact" in args_list or "--partial" in args_list:
    # The terminal menus rely on the third-party "bullet" package, so it is
    # imported only when a menu is actually going to be shown.
    try:
      from bullet import colors
      from bullet import Check, keyhandler, styles
      from bullet import ScrollBar
      from bullet.charDef import NEWLINE_KEY
    except Exception as e:
      # Without bullet no menu can be drawn: tell the user how to fix it,
      # log the underlying error and exit with status 255.
      print("Please pip3 install bullet and use python3.")
      logger.info(e)
      sys.exit(255)

    class MinMaxCheck(Check):
      """Multi-select menu that is only accepted within a selection-count range.

      On top of the base ``Check`` prompt this adds a lower and an upper bound
      on the number of ticked entries, plus an option to start with every
      entry ticked.
      """

      def __init__(self, min_selections=0, max_selections=None, s_all="0", *args, **kwargs):
        """Build the prompt.

        min_selections: fewest ticked entries that may be accepted.
        max_selections: most ticked entries that may be accepted; ``None``
          means "as many as there are choices".
        s_all: any value other than the string "0" pre-ticks every choice.
        Remaining arguments are forwarded to ``Check``.
        """
        super().__init__(*args, **kwargs)
        self.min_selections = min_selections
        self.s_all = s_all
        self.max_selections = len(self.choices) if max_selections is None else max_selections
        if self.s_all != "0":
          self.checked = [True for _ in self.choices]

      @keyhandler.register(NEWLINE_KEY)
      def accept(self):
        """Accept on Enter only when the selection count is within bounds."""
        if not self.valid():
          return None
        return super().accept()

      def valid(self):
        """Return True when the number of ticked entries is within bounds."""
        ticked = len([flag for flag in self.checked if flag])
        return self.min_selections <= ticked <= self.max_selections

    def choose_cases(list_file_case_list, show_text="", select_all="0"):
      """Show a multi-select menu and return whatever its launch() returns.

      list_file_case_list: entries offered to the user.
      show_text: prompt displayed with the menu.
      select_all: forwarded as ``s_all``; any value other than "0" starts
        with every entry ticked.

      The menu is created with ``return_index = True`` and requires at least
      one ticked entry (at most 999).
      """
      fg = colors.foreground
      bg = colors.background
      menu_options = {
        "s_all": select_all,
        "prompt": show_text,
        "min_selections": 1,
        "max_selections": 999,
        "return_index": True,
        "choices": list_file_case_list,
        "check_color": fg["red"],
        "check_on_switch": fg["red"],
        "word_color": fg["black"],
        "word_on_switch": fg["black"],
        "background_color": bg["white"],
        "background_on_switch": bg["yellow"],
      }
      return MinMaxCheck(**menu_options).launch()

    def select_case(show_text, list_to_select):
      """Show a single-choice scrolling menu and return its launch() result.

      show_text: prompt displayed with the menu.
      list_to_select: entries offered to the user.

      The menu is created with ``return_index = True``.
      """
      fg = colors.foreground
      bg = colors.background
      appearance = {
        "height": 7,
        "align": 7,
        "margin": 0,
        "pointer": "->",
        "word_color": fg["black"],
        "word_on_switch": fg["black"],
        "background_color": bg["green"],
        "background_on_switch": bg["yellow"],
        "return_index": True,
      }
      menu = ScrollBar(show_text, list_to_select, **appearance)
      return menu.launch()

  if "--interact" in args_list:
    # Interactive mode: every run setting is picked from a terminal menu
    # (select_case / choose_cases) instead of being given on the command line.
    # Both helpers return a tuple whose first element is the selection, hence
    # the [0] after each call.
    import glob

    # Host
    if not os.path.exists("hosts.txt"):
      print("Please write all nodes address into hosts.txt. Each line contain one node.")
      sys.exit(255)

    # Blank lines are skipped so they cannot end up as empty host names.
    with open("hosts.txt", 'r') as hosts_content:
      h = [line.strip() for line in hosts_content if line.strip()]
    if not h:
      print("Please write all nodes address into hosts.txt. Each line contain one node.")
      sys.exit(255)

    # Select all hosts defaultly
    hosts_list = choose_cases(h, "Please Select nodes.", "1")
    hosts = ",".join(hosts_list[0])
    print("================================")
    print("Test on %s" % hosts)
    print("================================")

    # CUDA_HOME: taken from the environment, otherwise chosen and exported.
    cuda_home = os.environ.get("CUDA_HOME")
    if cuda_home is None:
      cuda_home = select_case(
        "There is no CUDA_HOME. Please select:",
        ["/usr/local/cuda", "/usr/local/cuda-11.0", "/usr/local/cuda-11.8",
         "/usr/local/cuda-12.0", "/usr/local/cuda-12.1", "/usr/local/cuda-12.2",
         "/usr/local/cuda-12.4"])[0]
      os.environ["CUDA_HOME"] = cuda_home

    # GDRCopy
    gdr_home = os.environ.get("GDRCOPY_HOME")
    if gdr_home is None:
      gdr_home = select_case(
        "There is no GDRCOPY_HOME. Please select:",
        ["/usr", "/usr/lib", "/usr/lib64", "/usr/local/nvshmem_gdrcopy", ""])[0]
      os.environ["GDRCOPY_HOME"] = gdr_home

    # MPI_HOME
    mpi_install_path = os.environ.get("MPI_HOME")
    if mpi_install_path is None:
      mpi_install_path = select_case(
        "There is no MPI_HOME. Please select:",
        ["/usr/local/openmpi_nvshmem", "/usr/local/openmpi"])[0]
      os.environ["MPI_HOME"] = mpi_install_path

    # Launcher: menu label -> (numeric code given to walk_dir, log-name tag).
    # NOTE(review): the argparse mode maps shmem -> 2 and nvshmem -> 3, while
    # here nvshmem.hydra -> 2 and oshrun -> 3; confirm against perftestCommon
    # which numbering is intended. The tag also has no leading "-" here,
    # unlike the argparse mode.
    launcher_table = {
      "nvshmem.hydra + mpirun + oshrun": (1, "all"),
      "mpirun": (0, "mpirun"),
      "nvshmem.hydra": (2, "hydra"),
      "oshrun": (3, "oshrun"),
    }
    launcher_label = select_case("Please select test launcher:", ["nvshmem.hydra + mpirun + oshrun", "mpirun", "nvshmem.hydra", "oshrun"])[0]
    launcher_choice, launcher = launcher_table[launcher_label]

    # PE
    pe = select_case("Please select NPE:", ["1", "2", "4", "8", "16", "32"])[0]
    max_pes_per_node = int(pe)

    # NVSHMEM_HOME
    nvshmem_home = os.environ.get('NVSHMEM_HOME', "%s/nvshmem" % os.getcwd())
    nvshmem_install_path = select_case("Please select NVSHMEM_HOME path:", ["/usr/local/nvshmem", nvshmem_home])[0]

    # PERFTEST INSTALL
    perftest_install_p = os.environ.get('PERFTEST_INSTALL', "%s/../perftest/perftest_install" % nvshmem_home)
    perftest_install_path = select_case("Please select PERFTEST_INSTALL path:", ["%s/perf/perf_install" % os.getcwd(), perftest_install_p])[0]

    # TIMEOUT
    timeout_str = select_case("Please select TIMEOUT value(seconds):", ["30", "60", "100", "200", "400", "800", "1600"])[0]
    timeout = int(timeout_str)

    # list files. Sorted for a stable menu order; an empty menu cannot be
    # answered, so stop early when there is nothing to choose from.
    testlist_files = sorted(glob.glob("*.list"))
    if not testlist_files:
      print("No *.list test list file found in %s." % os.getcwd())
      sys.exit(255)
    test_list_name = select_case("Please select testlist file:", testlist_files)[0]

    # bind.sh
    bind_scr = select_case("Run perftest with bind.sh?", ["Yes", "No"])[0]
    if bind_scr == "Yes":
      os.environ["GPUBIND_SCRIPT"] = "../bind.sh"

    # Extra Parameters
    select_extra_parameter_from_list = choose_cases(
                                                   ["Null",
                                                    "NVSHMEM_DEBUG=INFO",
                                                    "NVSHMEM_DEBUG_SUBSYS=ALL",
                                                    "NVSHMEM_DEBUG_SUBSYS=TRANSPORT",
                                                    "NVSHMEM_ENABLE_ERROR_CHECKS=1",
                                                    "NCCL_DEBUG=INFO",
                                                    "NVSHMEM_ENABLE_NIC_PE_MAPPING=1",
                                                    "NVSHMEM_BOOTSTRAP_PMI=PMIX",
                                                    "NVSHMEM_IB_ENABLE_IBGDA=1 NVSHMEM_REMOTE_TRANSPORT=None",
                                                    "NVSHMEM_DISABLE_NCCL=1",
                                                    "NVSHMEM_IB_GID_INDEX=3",
                                                    "NVSHMEM_IB_GID_INDEX=5",
                                                    "NVSHMEM_REMOTE_TRANSPORT=None",
                                                    "NVSHMEM_REMOTE_TRANSPORT=ucx",
                                                    "NVSHMEM_HEAP_KIND=SYSMEM NVSHMEM_ENABLE_RAIL_OPT=1 NVSHMEM_DISABLE_GDRCOPY=1",
                                                    "NVSHMEM_DISABLE_CUDA_VMM=1",
                                                    "NVSHMEM_IB_ENABLE_RELAXED_ORDERING=0",
                                                    ], "Attach extra parameter?")[0]

    # Ticking "Null" (alone or with others) means "no extra parameters".
    if "Null" in select_extra_parameter_from_list:
      extra_parameters_string = ""
    else:
      extra_parameters_string = " ".join(select_extra_parameter_from_list)
    print(extra_parameters_string)

    # Partial
    select_testcase_from_list = select_case("Run all perftest cases in testlist?", ["Yes", "No"])[0]

    if select_testcase_from_list == "No":
      logger.info("Please select cases...")
      with open(test_list_name, 'r') as read_list:
        cases_list = [line.strip("\n") for line in read_list]

      result = choose_cases(cases_list)

      # The chosen subset is written next to the original list and used
      # in its place.
      partial_list_name = "%s_partial" % test_list_name
      with open(partial_list_name, 'w') as partial_list:
        for r in result[0]:
          partial_list.write("%s\n" % r)

      test_list_name = partial_list_name
    else:
      logger.info("Run all perftest in the list file: %s" % test_list_name)

    # Echo the effective test list without going through a shell.
    logger.info("TEST LIST:")
    with open(test_list_name, 'r') as list_preview:
      sys.stdout.write(list_preview.read())
    sys.stdout.flush()

    LD_LP = os.environ.get("LD_LIBRARY_PATH", "")
    nccl_home = os.environ.get("NCCL_HOME", "")

    library_dirs = [
      '%s/lib' % nvshmem_install_path,
      '%s/lib' % mpi_install_path,
      '%s/lib64' % cuda_home,
      '%s/lib64' % gdr_home,
      '%s/lib' % gdr_home,
    ]
    if nccl_home != "":
      library_dirs.append('%s/lib' % nccl_home)
    # An empty entry (trailing ':') would make the dynamic loader search the
    # current directory, so the previous value is appended only when set.
    if LD_LP != "":
      library_dirs.append(LD_LP)
    LD_LIBRARY_PATH = ':'.join(library_dirs)

    os.environ['LD_LIBRARY_PATH'] = LD_LIBRARY_PATH
    print("[SET] LD_LIBRARY_PATH=%s" % LD_LIBRARY_PATH)
    enable_skip = 0

  elif "--nvshmem" in args_list or "--perftest_install" in args_list or "--help" in args_list:

    import argparse

    parser = argparse.ArgumentParser(
      description="NVSHMEM Performance Test Runner", epilog=None,
      formatter_class=argparse.RawDescriptionHelpFormatter)

    parser.add_argument(
      "--mpirun", nargs='?', dest="mpi_install_path",
      default="/usr/local/openmpi_nvshmem", help='MPI HOME')

    parser.add_argument(
      "--nvshmem", nargs='?', dest="nvshmem_install_path",
      help='NVSHMEM HOME')

    parser.add_argument(
      "--launcher", nargs='?', dest="launcher_choice",
      choices=['all', 'mpirun', 'shmem', 'nvshmem'],
      required=True,
      help='Launcher choice: all/shmem/nvshmem/mpirun')

    parser.add_argument(
      "--perftest_install", nargs='?', dest="perftest_install_path",
      required=True, help='The localcation of perf_install folder.')

    parser.add_argument(
      "--test_list", nargs='?', dest="test_list_name",
      required=True, help='Test list file name in the folder.')

    parser.add_argument(
      "--pe", nargs='?', dest="max_pes_per_node",
      required=True, help='Max PEs per node. Int.')

    parser.add_argument(
      "--hosts", nargs='?', dest="hosts",
      required=True, help='Hosts, ip/hostname which can ssh directly. Use comma in different hosts.')

    parser.add_argument(
      "--timeout", nargs='?', dest="timeout",
      required=True, help='Timeout value for each case, unit is second.')

    parser.add_argument(
      "-a", nargs='?', dest="extra_parameters_string",
      required=False, help='Extra parameters. For example -a "NVSHMEM_DEBUG=INFO"')

    parser.add_argument(
      "--partial", dest="partial", action="store_true",
      required=False, help='partial')

    parser.add_argument(
      "--gpubind", nargs='?', dest="bind_scr",
      required=False, help='Bind a script to cmdline to do something. For example, bind gpu to processes.')

    parser.add_argument(
      "--show_data", dest="show_data", action="store_true",
      required=False, help='show perf results on screen. So far we only show minor and max results from each table/case.')

    parser.add_argument(
      "--bootstrap", nargs='?', dest="bootstrap",
      choices=['uid', 'mpi', 'pmi'],
      # NVSHMEM_BOOTSTRAP=MPI
      # NVSHMEMTEST_USE_UID_BOOTSTRAP=1
      default="pmi",
      required=False, help='Select a bootstrap mode')

    parser.add_argument(
      "--bind-to", nargs='?', dest="bind_to_value",
      choices=['none', 'socket', 'numa'],
      default="socket",
      required=False, help='--bind-to value in command line.')

    parser.add_argument(
      "--cases", dest="cases",
      required=False, help='case name list. Use comma join them. If use this option, script will ignore the list files in arguments.')

    args = parser.parse_args()
    enable_skip = 0

    # Args parser
    if args.bind_to_value is None:
      bind_to_value = "socket"
    else:
      bind_to_value = args.bind_to_value
    os.environ["QA_BIND_TO"] = bind_to_value

    if args.bootstrap is None:
      bootstrap = "pmi"
    else:
      bootstrap = args.bootstrap
    os.environ["QA_BOOTSTRAP"] = bootstrap

    if args.launcher_choice is not None:
      launcher = args.launcher_choice
      if launcher == "all":
        launcher_choice = 1
      elif launcher == "shmem":
        launcher_choice = 2
      elif launcher == "nvshmem":
        launcher_choice = 3
      elif launcher == "mpirun":
        launcher_choice = 0
      else:
        logger.error("Unsupported launcher...")
        sys.exit()
      launcher = "-%s" % launcher
      logger.info("launcher_choice is %s" % launcher_choice)
    else:
      logger.info("--launcher <name> required.")
      sys.exit(255)

    if args.mpi_install_path is not None:
      mpi_install_path = args.mpi_install_path
    elif launcher_choice != 3:
      logger.error("--mpirun is required for all/mpirun/shmem launcher. ")
    else:
      pass

    if args.nvshmem_install_path is not None:
      nvshmem_install_path = args.nvshmem_install_path
    elif launcher_choice == 1 or launcher_choice == 3:
      logger.error("--nvshmem is required for all/nvshmem launcher.")
      sys.exit(255)
    else:
      nvshmem_install_path = ""

    if args.perftest_install_path is not None:
      perftest_install_path = args.perftest_install_path

    if args.test_list_name is not None:
      test_list_name = args.test_list_name

    if args.max_pes_per_node is not None:
      max_pes_per_node = int(args.max_pes_per_node)

    if args.hosts is not None:
      hosts = args.hosts

    if args.timeout is not None:
      timeout = int(args.timeout)

    if args.extra_parameters_string is not None:
      extra_parameters_string = args.extra_parameters_string
      logger.info("Will add extra parameters: \"%s\" in perftest commands." % extra_parameters_string)
    else:
      extra_parameters_string = ""

    if args.bind_scr is not None:
      bind_scr = args.bind_scr
      if os.access(bind_scr, os.F_OK):
        logger.info("Find script file: %s" % bind_scr)
      else:
        logger.error("Failed to find %s" % bind_scr)
        sys.exit(245)
    else:
      bind_scr = ""
    os.environ["GPUBIND_SCRIPT"] = bind_scr

    if args.show_data:
      show_data = "Yes"
    else:
      show_data = "No"
    os.environ["SHOW_PERF_DATA"] = show_data

    if args.cases is not None:
      cases_list = args.cases.split(",")
    else:
      cases_list = []

    if args.partial:

      logger.info("Please select cases...")
      cases_list=[]
      with open(test_list_name, 'r') as read_list:
        for i in read_list:
          cases_list.append(i.strip("\n"))

      result = choose_cases(cases_list)
      read_list.close()

      with open("%s_partial" % test_list_name, 'w') as partial_list:
        for r in result[0]:
          partial_list.write("%s\n" % r)
      partial_list.close()

      test_list_name = "%s_partial" % test_list_name
    else:
      if cases_list != []:
        # Put cases into a tempfile and with a file name.
        from tempfile import NamedTemporaryFile
        with NamedTemporaryFile('w+t', delete=False) as cases_f:
          for case in cases_list:
            cases_f.write('%s\n' % case)
        test_list_name = cases_f.name

        logger.info("Temp Cases File is %s" % cases_f.name)
      else:
        logger.info("Run all tests in the list file: %s" % test_list_name)

  else:
    mpi_install_path = '/usr/local/openmpi-3.0.1'
    nvshmem_install_path = '/usr/local/nvshmem'
    perftest_install_path = '/usr/local/nvshmem-perftest'
    launcher_choice = 0
    test_list_name = 'perftest-p2p.list'
    max_pes_per_node = 8
    hosts = 'localhost'
    timeout = 60
    enable_skip = 0
    extra_parameters_string = ""
    launcher=""

    if (len(sys.argv) == 1) or (len(sys.argv) == 9) or (len(sys.argv) == 10):
      if (len(sys.argv) == 9) or (len(sys.argv) == 10):
        mpi_install_path = sys.argv[1]
        nvshmem_install_path = sys.argv[2]
        perftest_install_path = sys.argv[3]
        launcher_choice = int(sys.argv[4])
        test_list_name = sys.argv[5]
        max_pes_per_node = int(sys.argv[6])
        hosts = sys.argv[7]
        timeout = int(sys.argv[8])
        launcher=""

      if (len(sys.argv) == 10):
        extra_parameters_string = sys.argv[9]
        logger.info("Will add extra parameters: \"%s\" in perftest commands." % extra_parameters_string)

    else:
      logger.error('Include libmpi.so and libcudart.so in LD_LIBRARY_PATH and provide these 8 or 9 arguments - \
          1) MPI install path, 2) NVSHMEM install path, 3) perftest install path, 4) all launchers or single launcher (1/0), \
          5) test list name (P2P-PCIE, PCP-NVLink, IB) 6) maximum available GPUs per node to run test, 7) comma separated host list, 8) timeout (in seconds) per test, \
          9) Extra parameters(Eg. NCCL_INFO=WARN)')
      sys.exit()

  # Run the tests. Both log names share one timestamp so the .out/.err pair
  # can never differ when the clock ticks over between the two open() calls.
  log_basename = 'perftest-' + time.strftime("%Y%m%d-%H%M%S") + launcher
  with open(log_basename + '.out', 'w') as ftesto, open(log_basename + '.err', 'w') as fteste:

    perftestCommon.walk_dir(nvshmem_install_path, mpi_install_path, perftest_install_path, launcher_choice, NPE_START_END_STEP, max_pes_per_node, hosts, timeout,
                            enable_skip, test_list_name, extra_parameters_string, ftesto, fteste)
    # Diagnostics are only collected when at least one test failed.
    if perftestCommon.failed_binary_cmdlines_list:
      post_process(nvshmem_install_path, perftest_install_path, ftesto, fteste)

  logger.info("OUT file is %s" % ftesto.name)
  logger.info("ERR file is %s" % fteste.name)

  # Try to remove tempfile.
  # "cases_f" is bound only when --cases created a temporary list file; this
  # code runs at module level, so the name is looked up in globals().
  temp_cases_file = globals().get("cases_f")
  if temp_cases_file is not None:
    try:
      os.remove(temp_cases_file.name)
    except OSError as e:
      # Best effort: a leftover temp file must not fail the run.
      logger.warning("Failed to remove temp cases file %s: %s" % (temp_cases_file.name, e))