/* * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Licensed under the Apache License v2.0 with LLVM Exceptions. * See https://nvidia.github.io/NVTX/LICENSE.txt for license information. */ #pragma once // [Best practices for injection implementions] // Set NVTX_NO_IMPL to make the NVTX headers define the API types and function // prototypes only, not the inline impls. Be sure on GCC to use -Wno-unused-function // to avoid warnings for undefined static prototypes. #define NVTX_NO_IMPL // [Best practices for injection implementions] // Microsoft's compiler issues warning 26812 when compiling a C-style enum in C++ // instead of using the new "enum class" style. Since the NVTX headers are written in // C, the enums defined there will trigger this warning. Use this code to disable it. #if defined(_MSC_VER) #pragma warning (disable : 26812) #endif #include #include #include #include #include namespace NvtxInjectionHelper { //============ Generic utility functions ====================================== inline namespace detail_generic { //--- maxVal --- // Variadic alternative to std::max that doesn't need an initializer list, // doesn't conflict with MSVC's #define for max, and has no trouble with // constexpr usage. 
// Handles having zero parameters passed, returning
// std::numeric_limits<T>::lowest() in that case, as long as the template
// parameter T is explicitly specified.  (lowest() equals min() for integer
// types; for floating-point types min() is the smallest *positive* value,
// which would be wrong as the "maximum of nothing" when all arguments are
// negative.)  Takes arguments by value, which avoids the issue of returning
// a reference to something when called with no parameters.  Example uses:
//
//     maxVal<unsigned>()   // == 0
//     maxVal(3, 7, 5)      // == 7
//     maxVal(-1.5, -2.5)   // == -1.5
template <typename T>
constexpr inline T maxVal() { return std::numeric_limits<T>::lowest(); }

template <typename T, typename... Rest>
constexpr inline T maxVal(T first, Rest... rest)
{
    // T is passed explicitly so the recursion still resolves when "rest" is empty
    const T restMax = maxVal<T>(rest...);
    return (first > restMax) ? first : restMax;
}

//--- tuple size helper ---
// Generic utility for getting the size of a std::tuple, using its value
// as opposed to std::tuple_size<> which takes the tuple's type.  In a
// generic lambda where the parameter's type is "auto", it's extra work
// to figure out the type.
template <typename... Ts>
constexpr inline size_t size_of_tuple(std::tuple<Ts...> const&) { return sizeof...(Ts); }

//--- tuple helpers to loop over items ---
// We need a way to call a function f on each element of a tuple, like this:
//
//     f(std::get<0>(t));
//     f(std::get<1>(t));
//     f(std::get<2>(t));  etc.
//
// We want something like this, where "Is" is a parameter pack of 0,1,2,etc.:
//
//     f(std::get<Is>(t))...;
//
// ...but parameter pack expansion is only allowed within the context of args
// to a function call or a braced init list.  We also must handle the case
// where the tuple is empty, we should discard the results of all the calls
// to f, even if it returns different types for each call.  Easiest way to
// do this is by forwarding the elements of the tuple as args to a helper
// function that calls f on each arg, like this:
//
//     for_each_in_parameter_pack(f, std::get<Is>(t)...);
//
// But we also want perfect forwarding of the function and the tuple.
// The following utilities "for_each_in_tuple", "for_each_in_tuple_helper",
// and "for_each_in_parameter_pack" are provided to allow code such as this
// "loop" over tuple elements.
Note that "thing" in each iteration can be // a different type, because a tuple's elements may be different types, so // generic lambdas are very convenient here: // // for_each_in_tuple(tuple_of_things, // [](auto const& thing) // { // std::cout << thing << std::endl; // } // ); template inline void for_each_in_parameter_pack(F&& f) {} template inline void for_each_in_parameter_pack(F&& f, First const& first, Rest const&... rest) { // Call f on the first argument, and explicitly discard the result by casting to void static_cast(std::forward(f)(first)); // Recurse to call f on the rest of the arguments for_each_in_parameter_pack(std::forward(f), rest...); } // Generic utility for calling a function f for each element of a tuple t template inline void for_each_in_tuple_helper(T const& t, F&& f, std::index_sequence) { for_each_in_parameter_pack( std::forward(f), std::get(t)... ); } template inline void for_each_in_tuple(std::tuple const& t, F&& f) { for_each_in_tuple_helper(t, std::forward(f), std::make_index_sequence()); } } // namespace detail_generic //============ NVTX injection helper internal utilities ======================= inline namespace detail_nvtx { //--- id_t --- // Define generic integer type for holding all modules' callback id enum values. // These are used as indexes into the handler arrays for each module. using id_t = unsigned int; //--- id_v --- // Nickname for std::integral_constant, which is used for all callback enum values. // Using an integral constant allows performing correctness checks at compile time, // which is not possible in C++ with function parameter values, only their types. // Including the value in the type works around this problem. template using id_v = std::integral_constant; //--- NVTX_CBID --- // Macro to succinctly turn an NVTX_CBID_* enum value into a compile-time constant, // using std::integral_constant. 
This makes it possible to perform correctness // checks at compile time, for example ensuring a handler's signature is compatible // with the NVTX API call it is being installed to handle. Syntax is meant to look // familiar. For example, replace: // NVTX_CBID_CORE_MarkA // with: // NVTX_CBID(CORE_MarkA) // when passing CBID values to NvtxInjectionHelper::MakeHandlerTable. #define NVTX_CBID(func) NvtxInjectionHelper::id_v{} //--- EnumTypeToModuleId --- // Template variable to map from call id enum types to module id values (see nvtxTypes.h) // For example, EnumTypeToModuleId == NVTX_CB_MODULE_CORE. template constexpr static NvtxCallbackModule EnumTypeToModuleId = NVTX_CB_MODULE_INVALID; template<> constexpr NvtxCallbackModule EnumTypeToModuleId = NVTX_CB_MODULE_CORE; template<> constexpr NvtxCallbackModule EnumTypeToModuleId = NVTX_CB_MODULE_CUDA; template<> constexpr NvtxCallbackModule EnumTypeToModuleId = NVTX_CB_MODULE_OPENCL; template<> constexpr NvtxCallbackModule EnumTypeToModuleId = NVTX_CB_MODULE_CUDART; template<> constexpr NvtxCallbackModule EnumTypeToModuleId = NVTX_CB_MODULE_CORE2; template<> constexpr NvtxCallbackModule EnumTypeToModuleId = NVTX_CB_MODULE_SYNC; //--- IdToModuleId --- // Helper for EnumTypeToModuleId to convert directly from an integral_constant of a call id enum // to its module id. For example, since NVTX_CBID(CORE_MarkA) is an integral_constant, it cannot // be used directly as in EnumTypeToModuleId, since NVTX_CBID(CORE_MarkA)'s // type is std::integral_constant. This helper extracts // the enum's type from the integral_constant, allowing EnumConstToModuleId. template constexpr static NvtxCallbackModule IdToModuleId = EnumTypeToModuleId; //--- IdToHandlerType // Template using to map from call id values to matching function pointer types. template struct IdToHandlerType { using type = nullptr_t; }; // Macro for defining IdToHandlerType specializations for each id. // mod = module, i.e. 
CORE, CORE2 // func = prefixless function name, i.e. MarkEx, DomainCreateA // impl = impl or fakeimpl, depending on whether or not to use real types or the // nvtxTypes.h "fakeimpl" types, which don't depend on CUDA/OpenCL headers. #define NVTX_ID_TO_TYPE(mod, func, impl) \ template <> struct IdToHandlerType { using type = nvtx##func##_##impl##_fntype; } NVTX_ID_TO_TYPE(CORE, MarkEx , impl); NVTX_ID_TO_TYPE(CORE, MarkA , impl); NVTX_ID_TO_TYPE(CORE, MarkW , impl); NVTX_ID_TO_TYPE(CORE, RangeStartEx , impl); NVTX_ID_TO_TYPE(CORE, RangeStartA , impl); NVTX_ID_TO_TYPE(CORE, RangeStartW , impl); NVTX_ID_TO_TYPE(CORE, RangeEnd , impl); NVTX_ID_TO_TYPE(CORE, RangePushEx , impl); NVTX_ID_TO_TYPE(CORE, RangePushA , impl); NVTX_ID_TO_TYPE(CORE, RangePushW , impl); NVTX_ID_TO_TYPE(CORE, RangePop , impl); NVTX_ID_TO_TYPE(CORE, NameCategoryA, impl); NVTX_ID_TO_TYPE(CORE, NameCategoryW, impl); NVTX_ID_TO_TYPE(CORE, NameOsThreadA, impl); NVTX_ID_TO_TYPE(CORE, NameOsThreadW, impl); NVTX_ID_TO_TYPE(CORE2, DomainMarkEx , impl); NVTX_ID_TO_TYPE(CORE2, DomainRangeStartEx , impl); NVTX_ID_TO_TYPE(CORE2, DomainRangeEnd , impl); NVTX_ID_TO_TYPE(CORE2, DomainRangePushEx , impl); NVTX_ID_TO_TYPE(CORE2, DomainRangePop , impl); NVTX_ID_TO_TYPE(CORE2, DomainResourceCreate , impl); NVTX_ID_TO_TYPE(CORE2, DomainResourceDestroy, impl); NVTX_ID_TO_TYPE(CORE2, DomainNameCategoryA , impl); NVTX_ID_TO_TYPE(CORE2, DomainNameCategoryW , impl); NVTX_ID_TO_TYPE(CORE2, DomainRegisterStringA, impl); NVTX_ID_TO_TYPE(CORE2, DomainRegisterStringW, impl); NVTX_ID_TO_TYPE(CORE2, DomainCreateA , impl); NVTX_ID_TO_TYPE(CORE2, DomainCreateW , impl); NVTX_ID_TO_TYPE(CORE2, DomainDestroy , impl); NVTX_ID_TO_TYPE(CORE2, Initialize , impl); #undef NVTX_ID_TO_TYPE //--- CheckHandlerTypeMatchesId --- // Compile-time check provides easy-to-read error if FuncT isn't compatible with EnumT template constexpr inline void CheckHandlerTypeMatchesId() { using ExpectedFuncT = typename IdToHandlerType::type; 
static_assert(std::is_same(), "NVTX Injection Helper: The provided handler function's signature does not match the NVTX API for the given call id."); } //--- Handler --- // Represents id/handler pair for an NVTX call. Provides: // - the call's id (NVTX_CBID_* enum values) // - handler function pointer // Preserves the type of the function as a template parameter. // Erases the type of the enum, so it's not module-specific anymore. // Allows being constructed and placed into a container at compile time, then // later at run time doing the run-time-only cast of the function pointer. // This enables processing of ids to occur at compile time. template class Handler { public: id_t id; FuncT pfn; template constexpr Handler(id_v e, FuncT pfn_) : id(static_cast(EnumVal)) // Erase enum's type , pfn(pfn_) {} NvtxFunctionPointer Address() const noexcept { return reinterpret_cast(pfn); } }; //--- MakeHandler --- // "Make" function for Handler to automatically deduce types from parameters template constexpr inline Handler MakeHandler(IdT id_, FuncT func) { CheckHandlerTypeMatchesId(); return Handler(id_, func); } //--- ModuleHandlerTable --- // Represents the set of Handlers for one module. Provides: // - the module's id (NVTX_CB_MODULE_* enum values) // - iterable container of id/handler pairs (empty means skip getting etbl for module) // - highest call id value of handler in module (to confirm client has sufficient size) // - a method to assign all the stored handlers into a client's handler table // These objects can be constructed at compile time, including the highest call id used. 
template class ModuleHandlerTable { public: using tuple_t = std::tuple...>; static constexpr NvtxCallbackModule moduleId = mod; tuple_t handlers; id_t highestIdUsed; constexpr ModuleHandlerTable(tuple_t t) : handlers(t) , highestIdUsed(FindHighestId(t)) {} void AssignToClient(NvtxFunctionTable clientTable) const noexcept { for_each_in_tuple(handlers, [clientTable](auto const& handler) { if (handler.id != 0 && handler.pfn != nullptr) { *clientTable[handler.id] = handler.Address(); } } ); } private: template static constexpr id_t FindHighestIdHelper(tuple_t t, std::index_sequence) { return maxVal(std::get(t).id...); } static constexpr id_t FindHighestId(tuple_t t) { return FindHighestIdHelper(t, std::make_index_sequence()); } }; //--- MakeModuleHandlerTuple --- // MakeModuleHandlerTuple takes NvtxCallbackModule "mod" as a template parameter, // and loops over pairs of arguments (an enum and a handler function), building a // tuple of Handler objects for the enums that are in module "mod", and ignoring // ones that aren't. This lets the user pass in handlers for for all modules in // one simple call, and we can build up separate handler tables for each module. // MakeModuleHandlerTuple is recursive, peeling off two arguments in each recursive // case, and having no args be the base case. The recursive case has a pair of // overloads for whether or not the enum's type matches "mod" or not. Since these // overloads are separate functions, it's mutual recursion, so both are declared // first before the definitions. // Base case: no more arguments template constexpr inline auto MakeModuleHandlerTuple() { return std::tuple<>{}; } // Prototypes of recursive cases -- needed since they can call each other template == mod, int> = 0, typename... Args> constexpr inline auto MakeModuleHandlerTuple(IdT, FuncT, Args...); template != mod, int> = 0, typename... 
Args> constexpr inline auto MakeModuleHandlerTuple(IdT, FuncT, Args...); // Recursive case 1: enum's type matches mod, so add it to the tuple template == mod, int>, typename... Args> constexpr inline auto MakeModuleHandlerTuple(IdT id, FuncT f, Args... rest) { // Verify types of id and function, using static_assert to provide a // clear compile error if the types don't meet the requirements. static_assert(IdToModuleId != NVTX_CB_MODULE_INVALID, "MakeHandlerTable arguments must be pairs of IDs and handler functions. IDs must be enums starting with NVTX_CBID_. An invalid ID value was provided."); // Before adding this id/handler pair to the tuple, check to make sure // there's not already an entry in the tuple with the same id. If so, // provide a clear compile-time error message. auto restTuple = MakeModuleHandlerTuple(rest...); return std::tuple_cat( std::make_tuple(MakeHandler(id, f)), restTuple); } // Recursive case 2: id is not in module, so fwd result from remaining args template != mod, int>, typename... Args> constexpr inline auto MakeModuleHandlerTuple(IdT id, FuncT f, Args... rest) { return MakeModuleHandlerTuple(rest...); } //--- MakeModuleHandlerFromTuple --- // Helper function for MakeModuleHandlerTable. Coverts type of Handlers into // a ModuleHandlerTable object. This approach was simpler than building up the // ModuleHandlerTable incrementally, since std::tuple_cat makes it so easy to // build up a tuple. template constexpr inline auto MakeModuleHandlerFromTuple(std::tuple...> t) { return ModuleHandlerTable(t); } //--- "Make" function for ModuleHandlerTable to automatically deduce type --- // First, create a tuple of just the handlers in the argument list in module "mod". // Uses the mutually-recursive MakeModuleHandlerTuple overloads, which only add // handlers into the tuple if the module matches. Then, MakeModuleHandlerFromTuple // converts the tuple into a properly-typed ModuleHandlerTable object. 
template constexpr inline auto MakeModuleHandlerTable(Args... args) { const auto handlerTuple = MakeModuleHandlerTuple(args...); return MakeModuleHandlerFromTuple(handlerTuple); } } // namespace detail_nvtx //============ NVTX injection helper public interface ========================= // Define sentinel-value constants for use in handler implementations namespace ReturnCodes { constexpr auto NVTX_TOOL_ATTACHED_UNUSED_RANGE_ID = static_cast(-1LL); constexpr int NVTX_TOOL_ATTACHED_UNUSED_PUSH_POP_ID = -1; const auto NVTX_TOOL_ATTACHED_UNUSED_DOMAIN_HANDLE = reinterpret_cast(-1LL); const auto NVTX_TOOL_ATTACHED_UNUSED_STRING_HANDLE = reinterpret_cast(-1LL); // Note: In C++20, use bit_cast instead of reinterpret_cast, so the handles // (which are pointer types) can also be made constexpr. } template constexpr inline auto MakeHandlerTable(Args... args) { return std::make_tuple( MakeModuleHandlerTable(args...), MakeModuleHandlerTable(args...), MakeModuleHandlerTable(args...), MakeModuleHandlerTable(args...), MakeModuleHandlerTable(args...), MakeModuleHandlerTable(args...) 
); } enum class InstallResult { Success, ExportTableVersionInfoMissing, ExportTableVersionInfoTooSmall, ClientVersionTooOld, ExportTableCallbacksMissing, ExportTableCallbacksTooSmall, ModuleNotSupported, ModuleTableTooSmall }; template inline InstallResult InstallHandlers( NvtxGetExportTableFunc_t getExportTable, HandlerTableT const& injectionHandlerTable, std::ostringstream* errStream = nullptr, uint32_t* pVersion = nullptr) { uint32_t version = 0; auto pVersionInfo = reinterpret_cast(getExportTable(NVTX_ETID_VERSIONINFO)); if (!pVersionInfo) { if (errStream) *errStream << "Client NVTX instance doesn't support NVTX_ETID_VERSIONINFO"; return InstallResult::ExportTableVersionInfoMissing; } if (pVersionInfo->struct_size < sizeof(*pVersionInfo)) { if (errStream) *errStream << "NvtxExportTableVersionInfo structure size is " << pVersionInfo->struct_size << ", expected " << sizeof(*pVersionInfo) << "!"; return InstallResult::ExportTableVersionInfoTooSmall; } version = pVersionInfo->version; if (version < 2) { if (errStream) *errStream << "client's NVTX version is " << version << ", expected 2+"; return InstallResult::ClientVersionTooOld; } if (pVersion) *pVersion = version; auto pCallbacks = reinterpret_cast(getExportTable(NVTX_ETID_CALLBACKS)); if (!pCallbacks) { if (errStream) *errStream << "Client NVTX instance doesn't support NVTX_ETID_CALLBACKS"; return InstallResult::ExportTableCallbacksMissing; } if (pCallbacks->struct_size < sizeof(*pCallbacks)) { if (errStream) *errStream << "NvtxExportTableCallbacks structure size is " << pCallbacks->struct_size << ", expected " << sizeof(*pCallbacks) << "!"; return InstallResult::ExportTableCallbacksTooSmall; } #if defined(DEBUG) || true // Simple loop to print handler table internal details for_each_in_tuple(injectionHandlerTable, [](auto const& handlerModule) { auto count = size_of_tuple(handlerModule.handlers); printf("Module: %d Count: %d Highest: %d\n", static_cast(handlerModule.moduleId), static_cast(count), 
static_cast(handlerModule.highestIdUsed)); if (count > 0) { for_each_in_tuple(handlerModule.handlers, [](auto const& handler) { auto addr = static_cast(handler.Address()); printf(" Id: %d Address: 0x%llx\n", static_cast(handler.id), addr); } ); } } ); #endif // Loop over module handler tables and install handlers into client bool errors = false; for_each_in_tuple(injectionHandlerTable, [&](auto const& handlerModule) { NvtxFunctionTable clientTable = 0; unsigned int clientTableSize = 0; int success; if (handlerModule.moduleId == NVTX_CB_MODULE_INVALID) return; success = pCallbacks->GetModuleFunctionTable(handlerModule.moduleId, &clientTable, &clientTableSize); if (!success || !clientTable) { if (errStream) *errStream << "Client NVTX instance doesn't support callback module with id " << handlerModule.moduleId; // TODO: return InstallResult::ModuleNotSupported; errors = true; } // Ensure client's table is new enough to support the function pointers we want to register if (clientTableSize <= handlerModule.highestIdUsed) { if (errStream) *errStream << "Size of client NVTX instance's handler table with module id " << handlerModule.moduleId << " too small. Size is " << clientTableSize << ", but injection needs to assign table[" << handlerModule.highestIdUsed << "]"; // TODO: return InstallResult::ModuleTableTooSmall; errors = true; } handlerModule.AssignToClient(clientTable); } ); if (errors) return InstallResult::ModuleNotSupported; return InstallResult::Success; } } // namespace NvtxInjectionHelper