# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:

# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.

# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.

# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from typing import Optional, Tuple, Type, Union

import cuda.bindings.driver as cuda
import cutlass
import cutlass.cute as cute
import cutlass.pipeline as pipeline
import cutlass.torch as cutlass_torch
import cutlass.utils as utils
import cutlass.utils.blackwell_helpers as sm100_utils
import cutlass.utils.blockscaled_layout as blockscaled_utils
import torch
import functools
from cutlass._mlir import ir
from cutlass.cute.nvgpu import cpasync, tcgen05
from cutlass.cute.runtime import from_dlpack

from cutlass.cutlass_dsl import (
    Int32,
    Integer,
    dsl_user_op,
    extract_mlir_values,
    new_from_mlir_values,
)
from flashinfer.utils import get_compute_capability
from cutlass.utils.static_persistent_tile_scheduler import WorkTileInfo
from .utils import get_cutlass_dtype, cutlass_to_torch_dtype, get_num_sm, make_ptr
from typing import Callable, List


class MaskedSchedulerParams:
    def __init__(
        self,
        masked_m: cute.Tensor,
        c: cute.Tensor,
        c_tiler: Tuple[int, int],
        cluster_shape_mnk: cute.Shape,
        *,
        loc=None,
        ip=None,
    ):
        if cluster_shape_mnk[2] != 1:
            raise ValueError(f"unsupported cluster_shape_k {cluster_shape_mnk[2]}")

        gc = cute.zipped_divide(c, tiler=c_tiler)
        problem_shape_ntile_mnl = gc[(0, (None, None, None))].shape
        self.masked_m = masked_m
        self.c = c
        self.c_tiler = c_tiler
        self.problem_shape_ntile_mnl = problem_shape_ntile_mnl
        # cluster_shape_mnk is kept for reconstruction
        self._cluster_shape_mnk = cluster_shape_mnk
        self.cluster_shape_mn = cluster_shape_mnk[:2]
        self._loc = loc

        self.problem_layout_ncluster_mnl = cute.make_layout(
            cute.ceil_div(
                self.problem_shape_ntile_mnl, cluster_shape_mnk[:2], loc=loc, ip=ip
            ),
            loc=loc,
            ip=ip,
        )

    def __extract_mlir_values__(self):
        values, self._values_pos = [], []
        for obj in [
            self.masked_m,
            self.c,
            self.c_tiler,
            self._cluster_shape_mnk,
        ]:
            obj_values = extract_mlir_values(obj)
            values += obj_values
            self._values_pos.append(len(obj_values))
        return values

    def __new_from_mlir_values__(self, values):
        obj_list = []
        for obj, n_items in zip(
            [self.masked_m, self.c, self.c_tiler, self._cluster_shape_mnk],
            self._values_pos,
        ):
            obj_list.append(new_from_mlir_values(obj, values[:n_items]))
            values = values[n_items:]
        return MaskedSchedulerParams(*(tuple(obj_list)), loc=self._loc)

    @dsl_user_op
    def get_grid_shape(
        self, max_active_clusters: Int32, *, loc=None, ip=None
    ) -> Tuple[Integer, Integer, Integer]:
        num_persistent_clusters = max_active_clusters

        return (*self.cluster_shape_mn, num_persistent_clusters)

class MaskedScheduler:
    def __init__(
        self,
        params: MaskedSchedulerParams,
        num_persistent_clusters: Int32,
        current_work_linear_idx: Int32,
        current_batch_idx: Int32,
        accum_tile_m: Int32,
        cta_id_in_cluster: cute.Coord,
        num_tiles_executed: Int32,
    ):
        self.params = params
        self.num_persistent_clusters = num_persistent_clusters
        self._current_work_linear_idx = current_work_linear_idx
        self._current_batch_idx = current_batch_idx
        self._accum_tile_m = accum_tile_m
        self.cta_id_in_cluster = cta_id_in_cluster
        self._num_tiles_executed = num_tiles_executed

    def __extract_mlir_values__(self) -> list[ir.Value]:
        values = extract_mlir_values(self.num_persistent_clusters)
        values.extend(extract_mlir_values(self._current_work_linear_idx))
        values.extend(extract_mlir_values(self._current_batch_idx))
        values.extend(extract_mlir_values(self._accum_tile_m))
        values.extend(extract_mlir_values(self.cta_id_in_cluster))
        values.extend(extract_mlir_values(self._num_tiles_executed))
        return values

    def __new_from_mlir_values__(self, values: list[ir.Value]) -> "MaskedScheduler":
        assert len(values) == 8
        new_num_persistent_clusters = new_from_mlir_values(
            self.num_persistent_clusters, [values[0]]
        )
        new_current_work_linear_idx = new_from_mlir_values(
            self._current_work_linear_idx, [values[1]]
        )
        new_current_batch_idx = new_from_mlir_values(
            self._current_batch_idx, [values[2]]
        )
        new_accum_tile_m = new_from_mlir_values(self._accum_tile_m, [values[3]])
        new_cta_id_in_cluster = new_from_mlir_values(
            self.cta_id_in_cluster, values[4:7]
        )
        new_num_tiles_executed = new_from_mlir_values(
            self._num_tiles_executed, [values[7]]
        )
        return MaskedScheduler(
            self.params,
            new_num_persistent_clusters,
            new_current_work_linear_idx,
            new_current_batch_idx,
            new_accum_tile_m,
            new_cta_id_in_cluster,
            new_num_tiles_executed,
        )

    # called by host
    @dsl_user_op
    @staticmethod
    def create(
        params: MaskedSchedulerParams,
        block_idx: Tuple[Integer, Integer, Integer],
        grid_dim: Tuple[Integer, Integer, Integer],
        *,
        loc=None,
        ip=None,
    ):
        params = params

        # Calculate the number of persistent clusters by dividing the total grid size
        # by the number of CTAs per cluster
        num_persistent_clusters = cute.size(grid_dim, loc=loc, ip=ip) // cute.size(
            params.cluster_shape_mn, loc=loc, ip=ip
        )

        bidx, bidy, bidz = block_idx

        # Initialize the workload index to the cluster's linear index in the grid
        current_work_linear_idx = Int32(bidz)
        current_batch_idx = Int32(0)
        accum_tile_m = Int32(0)

        # CTA id in the cluster
        cta_id_in_cluster = (
            Int32(bidx % params.cluster_shape_mn[0]),
            Int32(bidy % params.cluster_shape_mn[1]),
            Int32(0),
        )
        # Initialize number of tiles executed to zero
        num_tiles_executed = Int32(0)
        return MaskedScheduler(
            params,
            num_persistent_clusters,
            current_work_linear_idx,
            current_batch_idx,
            accum_tile_m,
            cta_id_in_cluster,
            num_tiles_executed,
        )

    # called by host
    @staticmethod
    def get_grid_shape(
        params: MaskedSchedulerParams,
        max_active_clusters: Int32,
        *,
        loc=None,
        ip=None,
    ) -> Tuple[Integer, Integer, Integer]:
        return params.get_grid_shape(max_active_clusters, loc=loc, ip=ip)

    # private method
    @cute.jit
    def _get_current_work_for_linear_idx(
        self,
        current_work_linear_idx: Int32,
    ) -> WorkTileInfo:
        # is_valid = current_work_linear_idx < cute.size(
        #     self.params.problem_layout_ncluster_mnl, loc=loc, ip=ip
        # )
        num_tiles_n = self.params.problem_shape_ntile_mnl[1]
        accum_tile_m = self._accum_tile_m
        batch_idx = self._current_batch_idx

        while (
            (
                accum_tile_m
                + cute.ceil_div(self.params.masked_m[batch_idx], self.params.c_tiler[0])
            )
            * num_tiles_n
            <= current_work_linear_idx
            and batch_idx < self.params.masked_m.shape[0]
        ):
            accum_tile_m += cute.ceil_div(
                self.params.masked_m[batch_idx], self.params.c_tiler[0]
            )
            batch_idx += Int32(1)

        self._accum_tile_m = accum_tile_m
        self._current_batch_idx = batch_idx

        is_valid = self._current_batch_idx < self.params.masked_m.shape[0]
        if is_valid:
            is_valid = (
                self._accum_tile_m
                + cute.ceil_div(
                    self.params.masked_m[self._current_batch_idx],
                    self.params.c_tiler[0],
                )
            ) * num_tiles_n > current_work_linear_idx

        # cur_cluster_coord = self.params.problem_layout_ncluster_mnl.get_hier_coord(
        #     current_work_linear_idx, loc=loc, ip=ip
        # )
        cur_cluster_coord = (
            current_work_linear_idx // num_tiles_n - self._accum_tile_m,
            current_work_linear_idx % num_tiles_n,
            self._current_batch_idx,
        )

        # cur_tile_coord is a tuple of i32 values
        cur_tile_coord = tuple(
            Int32(x) * Int32(z) + Int32(y)
            for x, y, z in zip(
                cur_cluster_coord,
                self.cta_id_in_cluster,
                (*self.params.cluster_shape_mn, Int32(1)),
            )
        )

        return WorkTileInfo(cur_tile_coord, is_valid)

    @dsl_user_op
    def get_current_work(self, *, loc=None, ip=None) -> WorkTileInfo:
        return self._get_current_work_for_linear_idx(
            self._current_work_linear_idx,
        )

    @dsl_user_op
    def initial_work_tile_info(self, *, loc=None, ip=None) -> WorkTileInfo:
        return self.get_current_work(loc=loc, ip=ip)

    @dsl_user_op
    def advance_to_next_work(self, *, advance_count: int = 1, loc=None, ip=None):
        self._current_work_linear_idx += Int32(advance_count) * Int32(
            self.num_persistent_clusters
        )
        self._num_tiles_executed += Int32(1)

    @property
    def num_tiles_executed(self) -> Int32:
        return self._num_tiles_executed


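# Illustrative host-side sketch (not used by the kernel): how MaskedScheduler turns a
# linear work index into a (cluster_m, cluster_n, batch) coordinate before the
# CTA-in-cluster offset is applied. `masked_m`, `tile_m`, and `num_tiles_n` are plain
# Python stand-ins for the device-side Int32 values manipulated above.
def _demo_masked_tile_mapping(
    linear_idx: int, masked_m: List[int], tile_m: int, num_tiles_n: int
):
    def ceil_div(a: int, b: int) -> int:
        return (a + b - 1) // b

    accum_tile_m = 0
    batch_idx = 0
    # Skip whole batches whose m-tiles (times num_tiles_n) lie before linear_idx.
    while (
        batch_idx < len(masked_m)
        and (accum_tile_m + ceil_div(masked_m[batch_idx], tile_m)) * num_tiles_n
        <= linear_idx
    ):
        accum_tile_m += ceil_div(masked_m[batch_idx], tile_m)
        batch_idx += 1
    if batch_idx == len(masked_m):
        return None  # linear_idx is past the last valid tile
    return (
        linear_idx // num_tiles_n - accum_tile_m,  # cluster index in M within the batch
        linear_idx % num_tiles_n,  # cluster index in N
        batch_idx,  # batch index
    )

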
"""
This example provides an experimental implementation of the SM100 batched dense blockscaled GEMM kernel.
Please note that the APIs and implementation details related to this kernel may change in future releases.

A high-performance persistent batched dense blockscaled GEMM example for the NVIDIA Blackwell SM100 architecture
using CUTE DSL.
- Matrix A is MxKxL, where L is the batch dimension; A can be row-major("K") or column-major("M") for the MXF8 input type and only row-major("K") for the MXF4/NVF4 input types
- Matrix B is NxKxL, where L is the batch dimension; B can be row-major("N") or column-major("K") for the MXF8 input type and only row-major("K") for the MXF4/NVF4 input types
- Matrix C is MxNxL, where L is the batch dimension; C can be row-major("N") or column-major("M")
- Matrix SFA layout is filled internally according to the A shape and BlockScaledBasicChunk, and has M x ceil_div(K, sf_vec_size) x L elements
- Matrix SFB layout is filled internally according to the B shape and BlockScaledBasicChunk, and has N x ceil_div(K, sf_vec_size) x L elements

This GEMM kernel supports the following features:
    - Utilizes Tensor Memory Access (TMA) for efficient memory operations
    - Utilizes Blackwell's tcgen05.mma for matrix multiply-accumulate (MMA) operations (including 2cta mma instructions)
    - Implements TMA multicast with cluster to reduce L2 memory traffic
    - Supports persistent tile scheduling to better overlap memory load/store with mma between tiles
    - Supports warp specialization to avoid explicit pipelining between mainloop load and mma

This GEMM works as follows:
1. DMA warp: Load A and B matrices from global memory (GMEM) to shared memory (SMEM) using TMA operations.
2. MMA warp:
    - Load scale factor A/B from shared memory (SMEM) to tensor memory (TMEM) using the tcgen05.cp instruction.
    - Perform matrix multiply-accumulate (MMA) operations using the tcgen05.mma instruction.
3. EPILOGUE warp:
    - Load the completed accumulator from tensor memory (TMEM) to registers (RMEM) using tcgen05.ld.
    - Type convert the C matrix to the output type.
    - Optionally store the C matrix from registers (RMEM) to shared memory (SMEM) to global memory (GMEM) with TMA operations,
      or directly store the C matrix from registers (RMEM) to global memory (GMEM) without TMA operations.

SM100 tcgen05.mma.kind.block_scale instructions operate as follows:
    - Read matrix A from SMEM
    - Read matrix B from SMEM
    - Read scalefactor A from TMEM
    - Read scalefactor B from TMEM
    - Write accumulator to TMEM
The accumulator in TMEM must then be loaded to registers before writing back to GMEM.

Input arguments to this example are shown below:

.. code-block:: bash

    python examples/blackwell/dense_blockscaled_gemm_persistent.py \
        --ab_dtype Float4E2M1FN --sf_dtype Float8E8M0FNU --sf_vec_size 16 \
        --c_dtype Float16 \
        --mma_tiler_mn 256,128 --cluster_shape_mn 2,1 \
        --mnkl 8192,8192,1024,1

To collect performance with the NCU profiler:

.. code-block:: bash

    ncu python examples/blackwell/dense_blockscaled_gemm_persistent.py \
        --ab_dtype Float4E2M1FN --sf_dtype Float8E8M0FNU --sf_vec_size 16 \
        --c_dtype Float16 \
        --mma_tiler_mn 256,128 --cluster_shape_mn 2,1 \
        --mnkl 8192,8192,1024,1 \
        --warmup_iterations 1 --iterations 10 --skip_ref_check


Constraints:
* Supported input data types: mxf8, mxf4, nvf4;
  see the detailed valid dtype combinations in the Sm100BlockScaledPersistentDenseGemmKernel class documentation below
* A/B tensors must have the same data type; mixed data types are not supported (e.g., mxf8 x mxf4)
* MMA tiler M must be 128 or 256 (use_2cta_instrs)
* MMA tiler N must be 128 or 256
* Cluster shape M/N must be positive and a power of 2; total cluster size <= 16
* Cluster shape M must be a multiple of 2 if MMA tiler M is 256 (use_2cta_instrs)
* The contiguous dimension of A/B/C tensors must be at least 16-byte aligned,
  i.e., the number of elements must be a multiple of 16 for Float8 and 32 for Float4, respectively.
"""


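# Small helper sketch (for illustration only; not used by the kernel): the scale-factor
# tensors described above carry one element per sf_vec_size elements along K, so SFA
# covers M x ceil_div(K, sf_vec_size) x L and SFB covers N x ceil_div(K, sf_vec_size) x L.
def _demo_scale_factor_shapes(m: int, n: int, k: int, l: int, sf_vec_size: int):
    num_sf_k = (k + sf_vec_size - 1) // sf_vec_size
    sfa_shape = (m, num_sf_k, l)
    sfb_shape = (n, num_sf_k, l)
    return sfa_shape, sfb_shape

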
class Sm100BlockScaledPersistentDenseGemmKernel:
    """This class implements batched matrix multiplication (C = A x SFA x B x SFB) with support for various data types
    and architectural features specific to Blackwell GPUs with persistent tile scheduling and warp specialization.

    :param sf_vec_size: Scalefactor vector size.
    :type sf_vec_size: int
    :param mma_tiler_mn: Shape of the Matrix Multiply-Accumulate (MMA) tile (M,N)
    :type mma_tiler_mn: Tuple[int, int]
    :param cluster_shape_mn: Cluster dimensions (M,N) for parallel processing
    :type cluster_shape_mn: Tuple[int, int]

    :note: In the current version, A and B tensors must have the same data type
        - i.e., Float8E4M3FN for A and Float8E5M2 for B is not supported

    :note: Supported combinations of A/B data types, SF data types and SF vector sizes:
        - MXF8: A/B: Float8E5M2/Float8E4M3FN + SF: Float8E8M0FNU + sf_vec_size: 32
        - MXF4: A/B: Float4E2M1FN + SF: Float8E8M0FNU + sf_vec_size: 32
        - NVF4: A/B: Float4E2M1FN + SF: Float8E8M0FNU/Float8E4M3FN + sf_vec_size: 16

    :note: Supported accumulator data types:
        - Float32

    :note: Supported C data types:
        - Float32
        - Float16/BFloat16
        - Float8E4M3FN/Float8E5M2
    :note: Constraints:
        - MMA tiler M must be 128 or 256 (use_2cta_instrs)
        - MMA tiler N must be 128/256
        - Cluster shape M must be a multiple of 2 if MMA tiler M is 256
        - Cluster shape M/N must be positive and a power of 2; total cluster size <= 16
        - Cluster shape M/N must also be <= 4 for scale factor multicast, due to the limited size of the scale factors

    Example:
        >>> gemm = Sm100BlockScaledPersistentDenseGemmKernel(
        ...     sf_vec_size=16,
        ...     mma_tiler_mn=(256, 128),
        ...     cluster_shape_mn=(2, 1)
        ... )
        >>> gemm(a_tensor, b_tensor, sfa_tensor, sfb_tensor, c_tensor, max_active_clusters, stream)
    """

    def __init__(
        self,
        sf_vec_size: int,
        mma_tiler_mn: Tuple[int, int],
        cluster_shape_mn: Tuple[int, int],
        sm_version: str,
    ):
        """Initializes the configuration for a Blackwell dense GEMM kernel.

        This configuration includes several key aspects:

        1. MMA Instruction Settings (tcgen05):
            - acc_dtype: Data type for the MMA accumulator, always set to Float32
            - sf_vec_size: Scalefactor A/B vector size.
            - mma_tiler_mn: The (M, N) shape of the MMA instruction tiler.

        2. Cluster Shape:
            - cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster.

        :param sf_vec_size: Scalefactor vector size.
        :type sf_vec_size: int
        :param mma_tiler_mn: Tuple (M, N) shape of the MMA instruction.
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mn: Tuple (ClusterM, ClusterN) shape of the cluster.
        :type cluster_shape_mn: Tuple[int, int]
        """
        assert sm_version == "sm_100", (
            "sm_100 is the only supported SM version for cute-dsl backend."
        )

        self.acc_dtype = cutlass.Float32
        self.sf_vec_size = sf_vec_size
        self.use_2cta_instrs = mma_tiler_mn[0] == 256
        self.cluster_shape_mn = cluster_shape_mn
        # K dimension is deferred in _setup_attributes
        self.mma_tiler = (*mma_tiler_mn, 1)

        self.cta_group = (
            tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
        )

        self.occupancy = 1
        # Set specialized warp ids
        self.epilog_warp_id = (
            0,
            1,
            2,
            3,
        )
        self.mma_warp_id = 4
        self.tma_warp_id = 5
        self.threads_per_cta = 32 * len(
            (self.mma_warp_id, self.tma_warp_id, *self.epilog_warp_id)
        )
        # Set barrier id for cta sync, epilogue sync and tmem ptr sync
        self.cta_sync_bar_id = 0
        self.epilog_sync_bar_id = 1
        self.tmem_ptr_sync_bar_id = 2
        self.smem_capacity = utils.get_smem_capacity_in_bytes(sm_version)
        SM100_TMEM_CAPACITY_COLUMNS = 512
        self.num_tmem_alloc_cols = SM100_TMEM_CAPACITY_COLUMNS

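    # Worked example of the warp layout chosen in __init__ (illustration only): warps
    # 0-3 run the epilogue, warp 4 runs the MMA, and warp 5 runs the TMA loads, so
    # threads_per_cta = 32 * len((4, 5, 0, 1, 2, 3)) = 32 * 6 = 192 threads per CTA.
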
    def _setup_attributes(self):
        """Set up configurations that are dependent on GEMM inputs

        This method configures various attributes based on the input tensor properties
        (data types, leading dimensions) and kernel settings:
        - Configuring tiled MMA
        - Computing MMA/cluster/tile shapes
        - Computing cluster layout
        - Computing multicast CTAs for A/B/SFA/SFB
        - Computing epilogue subtile
        - Setting up A/B/SFA/SFB/C stage counts in shared memory
        - Computing A/B/SFA/SFB/C shared memory layout
        - Computing tensor memory allocation columns
        """
        # Compute mma instruction shapes
        mma_inst_bits_k = 256
        # (MMA_Tile_Shape_M, MMA_Tile_Shape_N, MMA_Inst_Shape_K)
        self.mma_inst_shape_mnk = (
            self.mma_tiler[0],
            self.mma_tiler[1],
            mma_inst_bits_k // self.a_dtype.width,
        )
        # (CTA_Tile_Shape_M, Round_Up(MMA_Tile_Shape_N, 128), MMA_Inst_Shape_K)
        self.mma_inst_shape_mnk_sfb = (
            self.mma_inst_shape_mnk[0] // (2 if self.use_2cta_instrs else 1),
            cute.round_up(self.mma_inst_shape_mnk[1], 128),
            self.mma_inst_shape_mnk[2],
        )

        tiled_mma = sm100_utils.make_blockscaled_trivial_tiled_mma(
            self.a_dtype,
            self.a_major_mode,
            self.b_major_mode,
            self.sf_dtype,
            self.sf_vec_size,
            self.cta_group,
            self.mma_inst_shape_mnk[:2],
        )

        tiled_mma_sfb = sm100_utils.make_blockscaled_trivial_tiled_mma(
            self.a_dtype,
            self.a_major_mode,
            self.b_major_mode,
            self.sf_dtype,
            self.sf_vec_size,
            cute.nvgpu.tcgen05.CtaGroup.ONE,
            self.mma_inst_shape_mnk_sfb[:2],
        )

        # Compute mma/cluster/tile shapes
        mma_inst_tile_k = 4
        self.mma_tiler = (
            self.mma_inst_shape_mnk[0],
            self.mma_inst_shape_mnk[1],
            self.mma_inst_shape_mnk[2] * mma_inst_tile_k,
        )
        self.mma_tiler_sfb = (
            self.mma_inst_shape_mnk_sfb[0],
            self.mma_inst_shape_mnk_sfb[1],
            self.mma_inst_shape_mnk_sfb[2] * mma_inst_tile_k,
        )
        self.cta_tile_shape_mnk = (
            self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape),
            self.mma_tiler[1],
            self.mma_tiler[2],
        )

        # Compute cluster layout
        self.cluster_layout_vmnk = cute.tiled_divide(
            cute.make_layout((*self.cluster_shape_mn, 1)),
            (tiled_mma.thr_id.shape,),
        )
        self.cluster_layout_sfb_vmnk = cute.tiled_divide(
            cute.make_layout((*self.cluster_shape_mn, 1)),
            (tiled_mma_sfb.thr_id.shape,),
        )

        # Compute number of multicast CTAs for A/B
        self.num_mcast_ctas_a = cute.size(self.cluster_layout_vmnk.shape[2])
        self.num_mcast_ctas_b = cute.size(self.cluster_layout_vmnk.shape[1])
        self.num_mcast_ctas_sfb = cute.size(self.cluster_layout_sfb_vmnk.shape[1])
        self.is_a_mcast = self.num_mcast_ctas_a > 1
        self.is_b_mcast = self.num_mcast_ctas_b > 1
        self.is_sfb_mcast = self.num_mcast_ctas_sfb > 1

        # Compute epilogue subtile
        self.epi_tile = sm100_utils.compute_epilogue_tile_shape(
            self.cta_tile_shape_mnk,
            self.use_2cta_instrs,
            self.c_layout,
            self.c_dtype,
        )

        # Setup A/B/C stage count in shared memory and ACC stage count in tensor memory
        self.num_acc_stage, self.num_ab_stage, self.num_c_stage = self._compute_stages(
            tiled_mma,
            self.mma_tiler,
            self.a_dtype,
            self.a_major_mode,
            self.b_dtype,
            self.b_major_mode,
            self.epi_tile,
            self.c_dtype,
            self.c_layout,
            self.sf_dtype,
            self.sf_vec_size,
            self.smem_capacity,
            self.occupancy,
        )

        # Compute A/B/SFA/SFB/C shared memory layout
        self.a_smem_layout_staged = sm100_utils.make_smem_layout_a(
            tiled_mma,
            self.mma_tiler,
            self.a_dtype,
            self.num_ab_stage,
        )
        self.b_smem_layout_staged = sm100_utils.make_smem_layout_b(
            tiled_mma,
            self.mma_tiler,
            self.b_dtype,
            self.num_ab_stage,
        )
        self.sfa_smem_layout_staged = blockscaled_utils.make_smem_layout_sfa(
            tiled_mma,
            self.mma_tiler,
            self.sf_vec_size,
            self.num_ab_stage,
        )
        self.sfb_smem_layout_staged = blockscaled_utils.make_smem_layout_sfb(
            tiled_mma,
            self.mma_tiler,
            self.sf_vec_size,
            self.num_ab_stage,
        )
        self.c_smem_layout_staged = sm100_utils.make_smem_layout_epi(
            self.c_dtype,
            self.c_layout,
            self.epi_tile,
            self.num_c_stage,
        )

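    # Worked example of the K-mode sizing in _setup_attributes (illustration only):
    # each tcgen05 MMA instruction covers mma_inst_bits_k = 256 bits of K, so
    # Float4E2M1FN (4-bit) gives inst K = 256 / 4 = 64 and Float8 (8-bit) gives 32;
    # with mma_inst_tile_k = 4 instructions per tile, the MMA tiler K becomes 256 or
    # 128 elements, respectively.
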
    @cute.jit
    def __call__(
        self,
        a_tensor: cute.Tensor,
        b_tensor: cute.Tensor,
        sfa_tensor: cute.Tensor,
        sfb_tensor: cute.Tensor,
        c_tensor: cute.Tensor,
        masked_m_tensor: cute.Tensor,
        alpha_tensor: Optional[cute.Tensor],
        max_active_clusters: cutlass.Constexpr,
        stream: cuda.CUstream,
    ):
        """Execute the GEMM operation in steps:
        - Setup static attributes before smem/grid/tma computation
        - Setup TMA load/store atoms and tensors
        - Compute grid size with regard to hardware constraints
        - Define shared storage for kernel
        - Launch the kernel synchronously

        :param a_tensor: Input tensor A
        :type a_tensor: cute.Tensor
        :param b_tensor: Input tensor B
        :type b_tensor: cute.Tensor
        :param sfa_tensor: Scale factor tensor A
        :type sfa_tensor: cute.Tensor
        :param sfb_tensor: Scale factor tensor B
        :type sfb_tensor: cute.Tensor
        :param c_tensor: Output tensor C
        :type c_tensor: cute.Tensor
        :param masked_m_tensor: Masked layout tensor M
        :type masked_m_tensor: cute.Tensor
        :param alpha_tensor: Optional 1D tensor of shape (l,) containing per-batch scaling factors.
        :type alpha_tensor: Optional[cute.Tensor]
        :param max_active_clusters: Maximum number of active clusters
        :type max_active_clusters: cutlass.Constexpr
        :param stream: CUDA stream for asynchronous execution
        :type stream: cuda.CUstream
        :raises TypeError: If input data types are incompatible with the MMA instruction.
        """
        # Setup static attributes before smem/grid/tma computation
        self.a_dtype: Type[cutlass.Numeric] = a_tensor.element_type
        self.b_dtype: Type[cutlass.Numeric] = b_tensor.element_type
        self.sf_dtype: Type[cutlass.Numeric] = sfa_tensor.element_type
        self.c_dtype: Type[cutlass.Numeric] = c_tensor.element_type
        self.a_major_mode = utils.LayoutEnum.from_tensor(a_tensor).mma_major_mode()
        self.b_major_mode = utils.LayoutEnum.from_tensor(b_tensor).mma_major_mode()
        self.c_layout = utils.LayoutEnum.from_tensor(c_tensor)

        # Check if input data types are compatible with MMA instruction
        if cutlass.const_expr(self.a_dtype != self.b_dtype):
            raise TypeError(f"Type must match: {self.a_dtype} != {self.b_dtype}")

        # Set up attributes that depend on the GEMM inputs
        self._setup_attributes()

        # Setup sfa/sfb tensor by filling A/B tensor to scale factor atom layout
        # ((Atom_M, Rest_M),(Atom_K, Rest_K),RestL)
        sfa_layout = blockscaled_utils.tile_atom_to_shape_SF(
            a_tensor.shape, self.sf_vec_size
        )
        sfa_tensor = cute.make_tensor(sfa_tensor.iterator, sfa_layout)

        # ((Atom_N, Rest_N),(Atom_K, Rest_K),RestL)
        sfb_layout = blockscaled_utils.tile_atom_to_shape_SF(
            b_tensor.shape, self.sf_vec_size
        )
        sfb_tensor = cute.make_tensor(sfb_tensor.iterator, sfb_layout)

        tiled_mma = sm100_utils.make_blockscaled_trivial_tiled_mma(
            self.a_dtype,
            self.a_major_mode,
            self.b_major_mode,
            self.sf_dtype,
            self.sf_vec_size,
            self.cta_group,
            self.mma_inst_shape_mnk[:2],
        )

        tiled_mma_sfb = sm100_utils.make_blockscaled_trivial_tiled_mma(
            self.a_dtype,
            self.a_major_mode,
            self.b_major_mode,
            self.sf_dtype,
            self.sf_vec_size,
            cute.nvgpu.tcgen05.CtaGroup.ONE,
            self.mma_inst_shape_mnk_sfb[:2],
        )
        atom_thr_size = cute.size(tiled_mma.thr_id.shape)

        # Setup TMA load for A
        a_op = sm100_utils.cluster_shape_to_tma_atom_A(
            self.cluster_shape_mn, tiled_mma.thr_id
        )
        a_smem_layout = cute.slice_(self.a_smem_layout_staged, (None, None, None, 0))
        tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
            a_op,
            a_tensor,
            a_smem_layout,
            self.mma_tiler,
            tiled_mma,
            self.cluster_layout_vmnk.shape,
        )

        # Setup TMA load for B
        b_op = sm100_utils.cluster_shape_to_tma_atom_B(
            self.cluster_shape_mn, tiled_mma.thr_id
        )
        b_smem_layout = cute.slice_(self.b_smem_layout_staged, (None, None, None, 0))
        tma_atom_b, tma_tensor_b = cute.nvgpu.make_tiled_tma_atom_B(
            b_op,
            b_tensor,
            b_smem_layout,
            self.mma_tiler,
            tiled_mma,
            self.cluster_layout_vmnk.shape,
        )

        # Setup TMA load for SFA
        sfa_op = sm100_utils.cluster_shape_to_tma_atom_A(
            self.cluster_shape_mn, tiled_mma.thr_id
        )
        sfa_smem_layout = cute.slice_(
            self.sfa_smem_layout_staged, (None, None, None, 0)
        )
        tma_atom_sfa, tma_tensor_sfa = cute.nvgpu.make_tiled_tma_atom_A(
            sfa_op,
            sfa_tensor,
            sfa_smem_layout,
            self.mma_tiler,
            tiled_mma,
            self.cluster_layout_vmnk.shape,
            internal_type=cutlass.Int16,
        )

        # Setup TMA load for SFB
        sfb_op = sm100_utils.cluster_shape_to_tma_atom_SFB(
            self.cluster_shape_mn, tiled_mma.thr_id
        )
        sfb_smem_layout = cute.slice_(
            self.sfb_smem_layout_staged, (None, None, None, 0)
        )
        tma_atom_sfb, tma_tensor_sfb = cute.nvgpu.make_tiled_tma_atom_B(
            sfb_op,
            sfb_tensor,
            sfb_smem_layout,
            self.mma_tiler_sfb,
            tiled_mma_sfb,
            self.cluster_layout_sfb_vmnk.shape,
            internal_type=cutlass.Int16,
        )

        a_copy_size = cute.size_in_bytes(self.a_dtype, a_smem_layout)
        b_copy_size = cute.size_in_bytes(self.b_dtype, b_smem_layout)
        sfa_copy_size = cute.size_in_bytes(self.sf_dtype, sfa_smem_layout)
        sfb_copy_size = cute.size_in_bytes(self.sf_dtype, sfb_smem_layout)
        self.num_tma_load_bytes = (
            a_copy_size + b_copy_size + sfa_copy_size + sfb_copy_size
        ) * atom_thr_size

        # Setup TMA store for C
        epi_smem_layout = cute.slice_(self.c_smem_layout_staged, (None, None, 0))
        tma_atom_c, tma_tensor_c = cpasync.make_tiled_tma_atom(
            cpasync.CopyBulkTensorTileS2GOp(),
            c_tensor,
            epi_smem_layout,
            self.epi_tile,
        )

        # Compute grid size
        self.tile_sched_params, grid = self._compute_grid(
            masked_m_tensor,  # add masked layout
            c_tensor,
            self.cta_tile_shape_mnk,
            self.cluster_shape_mn,
            max_active_clusters,
        )

        self.buffer_align_bytes = 1024

        # Define shared storage for kernel
        @cute.struct
        class SharedStorage:
            ab_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage]
            ab_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage]
            acc_full_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage]
            acc_empty_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage]
            tmem_dealloc_mbar_ptr: cutlass.Int64
            tmem_holding_buf: cutlass.Int32
            # (EPI_TILE_M, EPI_TILE_N, STAGE)
            sC: cute.struct.Align[
                cute.struct.MemRange[
                    self.c_dtype,
                    cute.cosize(self.c_smem_layout_staged.outer),
                ],
                self.buffer_align_bytes,
            ]
            # (MMA, MMA_M, MMA_K, STAGE)
            sA: cute.struct.Align[
                cute.struct.MemRange[
                    self.a_dtype, cute.cosize(self.a_smem_layout_staged.outer)
                ],
                self.buffer_align_bytes,
            ]
            # (MMA, MMA_N, MMA_K, STAGE)
            sB: cute.struct.Align[
                cute.struct.MemRange[
                    self.b_dtype, cute.cosize(self.b_smem_layout_staged.outer)
                ],
                self.buffer_align_bytes,
            ]
            # (MMA, MMA_M, MMA_K, STAGE)
            sSFA: cute.struct.Align[
                cute.struct.MemRange[
                    self.sf_dtype, cute.cosize(self.sfa_smem_layout_staged)
                ],
                self.buffer_align_bytes,
            ]
            # (MMA, MMA_N, MMA_K, STAGE)
            sSFB: cute.struct.Align[
                cute.struct.MemRange[
                    self.sf_dtype, cute.cosize(self.sfb_smem_layout_staged)
                ],
                self.buffer_align_bytes,
            ]

        self.shared_storage = SharedStorage

        # Launch the kernel synchronously
        self.kernel(
            tiled_mma,
            tiled_mma_sfb,
            tma_atom_a,
            tma_tensor_a,
            tma_atom_b,
            tma_tensor_b,
            tma_atom_sfa,
            tma_tensor_sfa,
            tma_atom_sfb,
            tma_tensor_sfb,
            tma_atom_c,
            tma_tensor_c,
            alpha_tensor,
            self.cluster_layout_vmnk,
            self.cluster_layout_sfb_vmnk,
            self.a_smem_layout_staged,
            self.b_smem_layout_staged,
            self.sfa_smem_layout_staged,
            self.sfb_smem_layout_staged,
            self.c_smem_layout_staged,
            self.epi_tile,
            self.tile_sched_params,
        ).launch(
            grid=grid,
            block=[self.threads_per_cta, 1, 1],
            cluster=(*self.cluster_shape_mn, 1),
            smem=self.shared_storage.size_in_bytes(),  # type: ignore[attr-defined]
            stream=stream,
        )
        return

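    # Launch-shape summary for __call__ (illustration only): the grid is
    # (cluster_shape_m, cluster_shape_n, num_persistent_clusters), matching
    # MaskedSchedulerParams.get_grid_shape; each CTA runs self.threads_per_cta (192)
    # threads, and the cluster is (cluster_shape_m, cluster_shape_n, 1). Every cluster
    # then loops over work tiles handed out by MaskedScheduler until none remain.
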
    # GPU device kernel
    @cute.kernel
    def kernel(
        self,
        tiled_mma: cute.TiledMma,
        tiled_mma_sfb: cute.TiledMma,
        tma_atom_a: cute.CopyAtom,
        mA_mkl: cute.Tensor,
        tma_atom_b: cute.CopyAtom,
        mB_nkl: cute.Tensor,
        tma_atom_sfa: cute.CopyAtom,
        mSFA_mkl: cute.Tensor,
        tma_atom_sfb: cute.CopyAtom,
        mSFB_nkl: cute.Tensor,
        tma_atom_c: Optional[cute.CopyAtom],
        mC_mnl: cute.Tensor,
        alpha: Optional[cute.Tensor],
        cluster_layout_vmnk: cute.Layout,
        cluster_layout_sfb_vmnk: cute.Layout,
        a_smem_layout_staged: cute.ComposedLayout,
        b_smem_layout_staged: cute.ComposedLayout,
        sfa_smem_layout_staged: cute.Layout,
        sfb_smem_layout_staged: cute.Layout,
        c_smem_layout_staged: Union[cute.Layout, cute.ComposedLayout, None],
        epi_tile: cute.Tile,
        tile_sched_params: MaskedSchedulerParams,
    ):
        """
        GPU device kernel performing the persistent batched GEMM computation.
        """
        warp_idx = cute.arch.warp_idx()
        warp_idx = cute.arch.make_warp_uniform(warp_idx)

        #
        # Prefetch tma desc
        #
        if warp_idx == self.tma_warp_id:
            cpasync.prefetch_descriptor(tma_atom_a)
            cpasync.prefetch_descriptor(tma_atom_b)
            cpasync.prefetch_descriptor(tma_atom_sfa)
            cpasync.prefetch_descriptor(tma_atom_sfb)
            cpasync.prefetch_descriptor(tma_atom_c)

        use_2cta_instrs = cute.size(tiled_mma.thr_id.shape) == 2

        #
        # Setup cta/thread coordinates
        #
        # Coords inside cluster
        bidx, bidy, bidz = cute.arch.block_idx()
        mma_tile_coord_v = bidx % cute.size(tiled_mma.thr_id.shape)
        is_leader_cta = mma_tile_coord_v == 0
        cta_rank_in_cluster = cute.arch.make_warp_uniform(
            cute.arch.block_idx_in_cluster()
        )
        block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(
            cta_rank_in_cluster
        )
        block_in_cluster_coord_sfb_vmnk = cluster_layout_sfb_vmnk.get_flat_coord(
            cta_rank_in_cluster
        )
        # Coord inside cta
        tidx, _, _ = cute.arch.thread_idx()

        #
        # Alloc and init: a+b full/empty, accumulator full/empty, tensor memory dealloc barrier
        #
        smem = utils.SmemAllocator()
        storage = smem.allocate(self.shared_storage)

        tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr
        tmem_holding_buf = storage.tmem_holding_buf

        # Initialize mainloop ab_pipeline (barrier) and states
        ab_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
        num_tma_producer = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1
        ab_pipeline_consumer_group = pipeline.CooperativeGroup(
            pipeline.Agent.Thread, num_tma_producer
        )
        ab_pipeline = pipeline.PipelineTmaUmma.create(
            barrier_storage=storage.ab_full_mbar_ptr.data_ptr(),
            num_stages=self.num_ab_stage,
            producer_group=ab_pipeline_producer_group,
            consumer_group=ab_pipeline_consumer_group,
            tx_count=self.num_tma_load_bytes,
            cta_layout_vmnk=cluster_layout_vmnk,
        )

        # Initialize acc_pipeline (barrier) and states
        acc_pipeline_producer_group = pipeline.CooperativeGroup(pipeline.Agent.Thread)
        num_acc_consumer_threads = len(self.epilog_warp_id) * (
            2 if use_2cta_instrs else 1
        )
        acc_pipeline_consumer_group = pipeline.CooperativeGroup(
            pipeline.Agent.Thread, num_acc_consumer_threads
        )
        acc_pipeline = pipeline.PipelineUmmaAsync.create(
            barrier_storage=storage.acc_full_mbar_ptr.data_ptr(),
            num_stages=self.num_acc_stage,
            producer_group=acc_pipeline_producer_group,
            consumer_group=acc_pipeline_consumer_group,
            cta_layout_vmnk=cluster_layout_vmnk,
        )

        # Tensor memory dealloc barrier init
        if use_2cta_instrs:
            if warp_idx == self.tma_warp_id:
                num_tmem_dealloc_threads = 32
                with cute.arch.elect_one():
                    cute.arch.mbarrier_init(
                        tmem_dealloc_mbar_ptr, num_tmem_dealloc_threads
                    )
        cute.arch.mbarrier_init_fence()

        # Cluster arrive after barrier init
        if cute.size(self.cluster_shape_mn) > 1:
            cute.arch.cluster_arrive_relaxed()

        #
        # Setup smem tensor A/B/SFA/SFB/C
        #
        # (EPI_TILE_M, EPI_TILE_N, STAGE)
        sC = storage.sC.get_tensor(
            c_smem_layout_staged.outer, swizzle=c_smem_layout_staged.inner
        )
        # (MMA, MMA_M, MMA_K, STAGE)
        sA = storage.sA.get_tensor(
            a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner
        )
        # (MMA, MMA_N, MMA_K, STAGE)
        sB = storage.sB.get_tensor(
            b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner
        )
        # (MMA, MMA_M, MMA_K, STAGE)
        sSFA = storage.sSFA.get_tensor(sfa_smem_layout_staged)
        # (MMA, MMA_N, MMA_K, STAGE)
        sSFB = storage.sSFB.get_tensor(sfb_smem_layout_staged)

        #
        # Compute multicast mask for A/B/SFA/SFB buffer full
        #
        a_full_mcast_mask = None
        b_full_mcast_mask = None
        sfa_full_mcast_mask = None
        sfb_full_mcast_mask = None
        if cutlass.const_expr(self.is_a_mcast or self.is_b_mcast or use_2cta_instrs):
            a_full_mcast_mask = cpasync.create_tma_multicast_mask(
                cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=2
            )
            b_full_mcast_mask = cpasync.create_tma_multicast_mask(
                cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=1
            )
            sfa_full_mcast_mask = cpasync.create_tma_multicast_mask(
                cluster_layout_vmnk, block_in_cluster_coord_vmnk, mcast_mode=2
            )
            sfb_full_mcast_mask = cpasync.create_tma_multicast_mask(
                cluster_layout_sfb_vmnk, block_in_cluster_coord_sfb_vmnk, mcast_mode=1
            )

        #
        # Local_tile partition global tensors
        #
        # (bM, bK, RestM, RestK, RestL)
        gA_mkl = cute.local_tile(
            mA_mkl, cute.slice_(self.mma_tiler, (None, 0, None)), (None, None, None)
        )
        # (bN, bK, RestN, RestK, RestL)
        gB_nkl = cute.local_tile(
            mB_nkl, cute.slice_(self.mma_tiler, (0, None, None)), (None, None, None)
        )
        # (bM, bK, RestM, RestK, RestL)
        gSFA_mkl = cute.local_tile(
            mSFA_mkl, cute.slice_(self.mma_tiler, (None, 0, None)), (None, None, None)
        )
        # (bN, bK, RestN, RestK, RestL)
        gSFB_nkl = cute.local_tile(
            mSFB_nkl, cute.slice_(self.mma_tiler, (0, None, None)), (None, None, None)
        )
        # (bM, bN, RestM, RestN, RestL)
        gC_mnl = cute.local_tile(
            mC_mnl, cute.slice_(self.mma_tiler, (None, None, 0)), (None, None, None)
        )
        k_block_cnt = cute.size(gA_mkl, mode=[3])

        #
        # Partition global tensor for TiledMMA_A/B/C
        #
        thr_mma = tiled_mma.get_slice(mma_tile_coord_v)
        thr_mma_sfb = tiled_mma_sfb.get_slice(mma_tile_coord_v)
        # (MMA, MMA_M, MMA_K, RestM, RestK, RestL)
        tCgA = thr_mma.partition_A(gA_mkl)
        # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
        tCgB = thr_mma.partition_B(gB_nkl)
        # (MMA, MMA_M, MMA_K, RestM, RestK, RestL)
        tCgSFA = thr_mma.partition_A(gSFA_mkl)
        # (MMA, MMA_N, MMA_K, RestN, RestK, RestL)
        tCgSFB = thr_mma_sfb.partition_B(gSFB_nkl)
        # (MMA, MMA_M, MMA_N, RestM, RestN, RestL)
        tCgC = thr_mma.partition_C(gC_mnl)

        #
        # Partition global/shared tensor for TMA load A/B
        #
        # TMA load A partition_S/D
        a_cta_layout = cute.make_layout(
            cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape
        )
        # ((atom_v, rest_v), STAGE)
        # ((atom_v, rest_v), RestM, RestK, RestL)
        tAsA, tAgA = cpasync.tma_partition(
            tma_atom_a,
            block_in_cluster_coord_vmnk[2],
            a_cta_layout,
            cute.group_modes(sA, 0, 3),
            cute.group_modes(tCgA, 0, 3),
        )
        # TMA load B partition_S/D
        b_cta_layout = cute.make_layout(
            cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape
        )
        # ((atom_v, rest_v), STAGE)
        # ((atom_v, rest_v), RestN, RestK, RestL)
        tBsB, tBgB = cpasync.tma_partition(
            tma_atom_b,
            block_in_cluster_coord_vmnk[1],
            b_cta_layout,
            cute.group_modes(sB, 0, 3),
            cute.group_modes(tCgB, 0, 3),
        )

        # TMA load SFA partition_S/D
        sfa_cta_layout = a_cta_layout
        # ((atom_v, rest_v), STAGE)
        # ((atom_v, rest_v), RestM, RestK, RestL)
        tAsSFA, tAgSFA = cute.nvgpu.cpasync.tma_partition(
            tma_atom_sfa,
            block_in_cluster_coord_vmnk[2],
            sfa_cta_layout,
            cute.group_modes(sSFA, 0, 3),
            cute.group_modes(tCgSFA, 0, 3),
        )
        tAsSFA = cute.filter_zeros(tAsSFA)
        tAgSFA = cute.filter_zeros(tAgSFA)

        # TMA load SFB partition_S/D
        sfb_cta_layout = cute.make_layout(
            cute.slice_(cluster_layout_sfb_vmnk, (0, None, 0, 0)).shape
        )
        # ((atom_v, rest_v), STAGE)
        # ((atom_v, rest_v), RestN, RestK, RestL)
        tBsSFB, tBgSFB = cute.nvgpu.cpasync.tma_partition(
            tma_atom_sfb,
            block_in_cluster_coord_sfb_vmnk[1],
            sfb_cta_layout,
            cute.group_modes(sSFB, 0, 3),
            cute.group_modes(tCgSFB, 0, 3),
        )
        tBsSFB = cute.filter_zeros(tBsSFB)
        tBgSFB = cute.filter_zeros(tBgSFB)

        #
        # Partition shared/tensor memory tensor for TiledMMA_A/B/C
        #
        # (MMA, MMA_M, MMA_K, STAGE)
        tCrA = tiled_mma.make_fragment_A(sA)
        # (MMA, MMA_N, MMA_K, STAGE)
        tCrB = tiled_mma.make_fragment_B(sB)
        # (MMA, MMA_M, MMA_N)
        acc_shape = tiled_mma.partition_shape_C(self.mma_tiler[:2])
        # (MMA, MMA_M, MMA_N, STAGE)
        tCtAcc_fake = tiled_mma.make_fragment_C(
            cute.append(acc_shape, self.num_acc_stage)
        )

        #
        # Cluster wait before tensor memory alloc
        #
        if cute.size(self.cluster_shape_mn) > 1:
            cute.arch.cluster_wait()
        else:
            cute.arch.barrier(
                barrier_id=self.cta_sync_bar_id, number_of_threads=self.threads_per_cta
            )

        #
        # Specialized TMA load warp
        #
        if warp_idx == self.tma_warp_id:
            #
            # Persistent tile scheduling loop
            #
            tile_sched = MaskedScheduler.create(
                tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim()
            )
            work_tile = tile_sched.initial_work_tile_info()

            ab_producer_state = pipeline.make_pipeline_state(
                pipeline.PipelineUserType.Producer, self.num_ab_stage
            )

            while work_tile.is_valid_tile:
                # Get tile coord from tile scheduler
                cur_tile_coord = work_tile.tile_idx
                mma_tile_coord_mnl = (
                    cur_tile_coord[0] // cute.size(tiled_mma.thr_id.shape),
                    cur_tile_coord[1],
                    cur_tile_coord[2],
                )
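                # Example (illustration only): with use_2cta_instrs the MMA is issued
                # per CTA pair, so dividing the CTA tile-m coordinate by
                # cute.size(tiled_mma.thr_id.shape) == 2 maps CTA tiles m = 4 and m = 5
                # to the same MMA tile m = 2; with 1-CTA MMA the division is by 1.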

                #
                # Slice to per mma tile index
                #
                # ((atom_v, rest_v), RestK)
                tAgA_slice = tAgA[
                    (None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])
                ]
                # ((atom_v, rest_v), RestK)
                tBgB_slice = tBgB[
                    (None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])
                ]

                # ((atom_v, rest_v), RestK)
                tAgSFA_slice = tAgSFA[
                    (None, mma_tile_coord_mnl[0], None, mma_tile_coord_mnl[2])
                ]
                # ((atom_v, rest_v), RestK)
                tBgSFB_slice = tBgSFB[
                    (None, mma_tile_coord_mnl[1], None, mma_tile_coord_mnl[2])
                ]

                # Peek (try_wait) AB buffer empty for k_block = prefetch_k_block_cnt
                ab_producer_state.reset_count()
                peek_ab_empty_status = cutlass.Boolean(1)
                if ab_producer_state.count < k_block_cnt:
                    peek_ab_empty_status = ab_pipeline.producer_try_acquire(
                        ab_producer_state
                    )
                #
                # Tma load loop
                #
                for k_block in cutlass.range(0, k_block_cnt, 1, unroll=1):  # noqa: B007
                    # Conditionally wait for AB buffer empty
                    ab_pipeline.producer_acquire(
                        ab_producer_state, peek_ab_empty_status
                    )

                    # TMA load A/B/SFA/SFB
                    cute.copy(
                        tma_atom_a,
                        tAgA_slice[(None, ab_producer_state.count)],
                        tAsA[(None, ab_producer_state.index)],
                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
                        mcast_mask=a_full_mcast_mask,
                    )
                    cute.copy(
                        tma_atom_b,
                        tBgB_slice[(None, ab_producer_state.count)],
                        tBsB[(None, ab_producer_state.index)],
                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
                        mcast_mask=b_full_mcast_mask,
                    )
                    cute.copy(
                        tma_atom_sfa,
                        tAgSFA_slice[(None, ab_producer_state.count)],
                        tAsSFA[(None, ab_producer_state.index)],
                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
                        mcast_mask=sfa_full_mcast_mask,
                    )
                    cute.copy(
                        tma_atom_sfb,
                        tBgSFB_slice[(None, ab_producer_state.count)],
                        tBsSFB[(None, ab_producer_state.index)],
                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
                        mcast_mask=sfb_full_mcast_mask,
                    )

                    # Peek (try_wait) AB buffer empty for k_block = prefetch_k_block_cnt + k_block + 1
                    ab_producer_state.advance()
                    peek_ab_empty_status = cutlass.Boolean(1)
                    if ab_producer_state.count < k_block_cnt:
                        peek_ab_empty_status = ab_pipeline.producer_try_acquire(
                            ab_producer_state
                        )

                #
                # Advance to next tile
                #
                tile_sched.advance_to_next_work()
                work_tile = tile_sched.get_current_work()

            #
            # Wait A/B buffer empty
            #
            ab_pipeline.producer_tail(ab_producer_state)

        #
        # Specialized MMA warp
        #
        if warp_idx == self.mma_warp_id:
            #
            # Barrier sync before retrieving the tensor memory ptr from shared memory
            #
            tmem_ptr_read_threads = 32 * len((self.mma_warp_id, *self.epilog_warp_id))
            cute.arch.barrier(
                barrier_id=self.tmem_ptr_sync_bar_id,
                number_of_threads=tmem_ptr_read_threads,
            )

            #
            # Retrieve the tensor memory ptr and make accumulator/SFA/SFB tensors
            #
            # Make accumulator tmem tensor
            acc_tmem_ptr = cute.arch.retrieve_tmem_ptr(
                self.acc_dtype,
                alignment=16,
                ptr_to_buffer_holding_addr=tmem_holding_buf,
            )
            # (MMA, MMA_M, MMA_N, STAGE)
            tCtAcc_base = cute.make_tensor(acc_tmem_ptr, tCtAcc_fake.layout)

            # Make SFA tmem tensor
            sfa_tmem_ptr = cute.recast_ptr(
                acc_tmem_ptr + tcgen05.find_tmem_tensor_col_offset(tCtAcc_base),
                dtype=self.sf_dtype,
            )
            # (MMA, MMA_M, MMA_K)
            tCtSFA_layout = blockscaled_utils.make_tmem_layout_sfa(
                tiled_mma,
                self.mma_tiler,
                self.sf_vec_size,
                cute.slice_(sfa_smem_layout_staged, (None, None, None, 0)),
            )
            tCtSFA = cute.make_tensor(sfa_tmem_ptr, tCtSFA_layout)

            # Make SFB tmem tensor
            sfb_tmem_ptr = cute.recast_ptr(
                acc_tmem_ptr
                + tcgen05.find_tmem_tensor_col_offset(tCtAcc_base)
                + tcgen05.find_tmem_tensor_col_offset(tCtSFA),
                dtype=self.sf_dtype,
            )
            # (MMA, MMA_N, MMA_K)
            tCtSFB_layout = blockscaled_utils.make_tmem_layout_sfb(
                tiled_mma,
                self.mma_tiler,
                self.sf_vec_size,
                cute.slice_(sfb_smem_layout_staged, (None, None, None, 0)),
            )
            tCtSFB = cute.make_tensor(sfb_tmem_ptr, tCtSFB_layout)
            #
            # Partition for S2T copy of SFA/SFB
            #
            tiled_copy_s2t_sfa, tCsSFA_compact_s2t, tCtSFA_compact_s2t = (
                self.mainloop_s2t_copy_and_partition(sSFA, tCtSFA)
            )
            tiled_copy_s2t_sfb, tCsSFB_compact_s2t, tCtSFB_compact_s2t = (
                self.mainloop_s2t_copy_and_partition(sSFB, tCtSFB)
            )

            #
            # Persistent tile scheduling loop
            #
            tile_sched = MaskedScheduler.create(
                tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim()
            )
            work_tile = tile_sched.initial_work_tile_info()

            ab_consumer_state = pipeline.make_pipeline_state(
                pipeline.PipelineUserType.Consumer, self.num_ab_stage
            )
            acc_producer_state = pipeline.make_pipeline_state(
                pipeline.PipelineUserType.Producer, self.num_acc_stage
            )

            while work_tile.is_valid_tile:
                # Get tile coord from tile scheduler
                cur_tile_coord = work_tile.tile_idx
                mma_tile_coord_mnl = (
                    cur_tile_coord[0] // cute.size(tiled_mma.thr_id.shape),
                    cur_tile_coord[1],
                    cur_tile_coord[2],
                )

                # Set tensor memory buffer for current tile
                # (MMA, MMA_M, MMA_N)
                tCtAcc = tCtAcc_base[(None, None, None, acc_producer_state.index)]

                # Peek (try_wait) AB buffer full for k_block = 0
                ab_consumer_state.reset_count()
                peek_ab_full_status = cutlass.Boolean(1)
                if ab_consumer_state.count < k_block_cnt and is_leader_cta:
                    peek_ab_full_status = ab_pipeline.consumer_try_wait(
                        ab_consumer_state
                    )

                #
                # Wait for accumulator buffer empty
                #
                if is_leader_cta:
                    acc_pipeline.producer_acquire(acc_producer_state)

                #
                # Reset the ACCUMULATE field for each tile
                #
                tiled_mma.set(tcgen05.Field.ACCUMULATE, False)

                #
                # Mma mainloop
                #
                for k_block in cutlass.range_constexpr(k_block_cnt):  # noqa: B007
                    if is_leader_cta:
                        # Conditionally wait for AB buffer full
                        ab_pipeline.consumer_wait(
                            ab_consumer_state, peek_ab_full_status
                        )

                        # Copy SFA/SFB from smem to tmem
                        s2t_stage_coord = (
                            None,
                            None,
                            None,
                            None,
                            ab_consumer_state.index,
                        )
                        tCsSFA_compact_s2t_staged = tCsSFA_compact_s2t[s2t_stage_coord]
                        tCsSFB_compact_s2t_staged = tCsSFB_compact_s2t[s2t_stage_coord]
                        cute.copy(
                            tiled_copy_s2t_sfa,
                            tCsSFA_compact_s2t_staged,
                            tCtSFA_compact_s2t,
                        )
                        cute.copy(
                            tiled_copy_s2t_sfb,
                            tCsSFB_compact_s2t_staged,
                            tCtSFB_compact_s2t,
                        )

                        # tCtAcc += tCrA * tCrSFA * tCrB * tCrSFB
                        num_kphases = cute.size(tCrA, mode=[2])
                        for kphase_idx in cutlass.range(num_kphases, unroll_full=True):
                            kphase_coord = (
                                None,
                                None,
                                kphase_idx,
                                ab_consumer_state.index,
                            )

                            # Set SFA/SFB tensor to tiled_mma
                            sf_kphase_coord = (None, None, kphase_idx)
                            tiled_mma.set(
                                tcgen05.Field.SFA,
                                tCtSFA[sf_kphase_coord].iterator,
                            )
                            tiled_mma.set(
                                tcgen05.Field.SFB,
                                tCtSFB[sf_kphase_coord].iterator,
                            )

                            cute.gemm(
                                tiled_mma,
                                tCtAcc,
                                tCrA[kphase_coord],
                                tCrB[kphase_coord],
                                tCtAcc,
                            )

                            # Enable accumulate on tCtAcc after first kphase
                            tiled_mma.set(tcgen05.Field.ACCUMULATE, True)

                        # Async arrive AB buffer empty
                        ab_pipeline.consumer_release(ab_consumer_state)

                    # Peek (try_wait) AB buffer full for k_block = k_block + 1
                    ab_consumer_state.advance()
                    peek_ab_full_status = cutlass.Boolean(1)
                    if ab_consumer_state.count < k_block_cnt:
                        if is_leader_cta:
                            peek_ab_full_status = ab_pipeline.consumer_try_wait(
                                ab_consumer_state
                            )

                #
                # Async arrive accumulator buffer full
                #
                if is_leader_cta:
                    acc_pipeline.producer_commit(acc_producer_state)
                acc_producer_state.advance()

                #
                # Advance to next tile
                #
                tile_sched.advance_to_next_work()
                work_tile = tile_sched.get_current_work()

            #
            # Wait for accumulator buffer empty
            #
            acc_pipeline.producer_tail(acc_producer_state)
        #
        # Specialized epilogue warps
        #
        if warp_idx < self.mma_warp_id:
            #
            # Alloc tensor memory buffer
            #
            if warp_idx == self.epilog_warp_id[0]:
                cute.arch.alloc_tmem(
                    self.num_tmem_alloc_cols,
                    tmem_holding_buf,
                    is_two_cta=use_2cta_instrs,
                )

            #
            # Barrier sync before retrieving the tensor memory ptr from shared memory
            #
            tmem_ptr_read_threads = 32 * len((self.mma_warp_id, *self.epilog_warp_id))
            cute.arch.barrier(
                barrier_id=self.tmem_ptr_sync_bar_id,
                number_of_threads=tmem_ptr_read_threads,
            )

            #
            # Retrieve the tensor memory ptr and make the accumulator tensor
            #
            acc_tmem_ptr = cute.arch.retrieve_tmem_ptr(
                self.acc_dtype,
                alignment=16,
                ptr_to_buffer_holding_addr=tmem_holding_buf,
            )
            # (MMA, MMA_M, MMA_N, STAGE)
            tCtAcc_base = cute.make_tensor(acc_tmem_ptr, tCtAcc_fake.layout)

            #
            # Partition for epilogue
            #
            epi_tidx = tidx
            tiled_copy_t2r, tTR_tAcc_base, tTR_rAcc = (
                self.epilog_tmem_copy_and_partition(
                    epi_tidx, tCtAcc_base, tCgC, epi_tile, use_2cta_instrs
                )
            )

            tTR_rC = cute.make_fragment(tTR_rAcc.shape, self.c_dtype)
            tiled_copy_r2s, tRS_rC, tRS_sC = self.epilog_smem_copy_and_partition(
                tiled_copy_t2r, tTR_rC, epi_tidx, sC
            )
            tma_atom_c, bSG_sC, bSG_gC_partitioned = (
                self.epilog_gmem_copy_and_partition(
                    epi_tidx, tma_atom_c, tCgC, epi_tile, sC
                )
            )

            #
            # Persistent tile scheduling loop
            #
            tile_sched = MaskedScheduler.create(
                tile_sched_params, cute.arch.block_idx(), cute.arch.grid_dim()
            )
            work_tile = tile_sched.initial_work_tile_info()

            acc_consumer_state = pipeline.make_pipeline_state(
                pipeline.PipelineUserType.Consumer, self.num_acc_stage
            )

            # Threads/warps participating in tma store pipeline
            c_producer_group = pipeline.CooperativeGroup(
                pipeline.Agent.Thread,
                32 * len(self.epilog_warp_id),
                32 * len(self.epilog_warp_id),
            )
            c_pipeline = pipeline.PipelineTmaStore.create(
                num_stages=self.num_c_stage,
                producer_group=c_producer_group,
            )

            while work_tile.is_valid_tile:
                # Get tile coord from tile scheduler
                cur_tile_coord = work_tile.tile_idx
                mma_tile_coord_mnl = (
                    cur_tile_coord[0] // cute.size(tiled_mma.thr_id.shape),
                    cur_tile_coord[1],
                    cur_tile_coord[2],
                )

                #
                # Slice to per mma tile index
                #
                # ((ATOM_V, REST_V), EPI_M, EPI_N)
                bSG_gC = bSG_gC_partitioned[
                    (
                        None,
                        None,
                        None,
                        *mma_tile_coord_mnl,
                    )
                ]

                # Set tensor memory buffer for current tile
                # (T2R, T2R_M, T2R_N, EPI_M, EPI_N)
                tTR_tAcc = tTR_tAcc_base[
                    (None, None, None, None, None, acc_consumer_state.index)
                ]

                #
                # Wait for accumulator buffer full
                #
                acc_pipeline.consumer_wait(acc_consumer_state)

                tTR_tAcc = cute.group_modes(tTR_tAcc, 3, cute.rank(tTR_tAcc))
                bSG_gC = cute.group_modes(bSG_gC, 1, cute.rank(bSG_gC))

                #
                # Store accumulator to global memory in subtiles
                #
                subtile_cnt = cute.size(tTR_tAcc.shape, mode=[3])
                num_prev_subtiles = tile_sched.num_tiles_executed * subtile_cnt
                for subtile_idx in cutlass.range(subtile_cnt):
                    #
                    # Load accumulator from tensor memory buffer to register
                    #
                    tTR_tAcc_mn = tTR_tAcc[(None, None, None, subtile_idx)]
                    cute.copy(tiled_copy_t2r, tTR_tAcc_mn, tTR_rAcc)

                    #
                    # Convert to C type
                    #
                    acc_vec = tiled_copy_r2s.retile(tTR_rAcc).load()
                    if cutlass.const_expr(alpha is not None):
                        acc_vec = acc_vec * alpha[work_tile.tile_idx[2]]

                    acc_vec = acc_vec.to(self.c_dtype)
                    tRS_rC.store(acc_vec)

                    #
                    # Store C to shared memory
                    #
                    c_buffer = (num_prev_subtiles + subtile_idx) % self.num_c_stage
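                    # Example (illustration only): the C smem stages are used
                    # round-robin over every subtile this CTA has ever written, e.g.
                    # with num_c_stage = 2 the c_buffer sequence is 0, 1, 0, 1, ...
                    # across tile boundaries, because num_prev_subtiles carries over
                    # from previously executed tiles.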
|
||
cute.copy(
|
||
tiled_copy_r2s,
|
||
tRS_rC,
|
||
tRS_sC[(None, None, None, c_buffer)],
|
||
)
|
||
# Fence and barrier to make sure shared memory store is visible to TMA store
|
||
cute.arch.fence_proxy(
|
||
cute.arch.ProxyKind.async_shared,
|
||
space=cute.arch.SharedSpace.shared_cta,
|
||
)
|
||
epilog_threads = 32 * len(self.epilog_warp_id)
|
||
cute.arch.barrier(
|
||
barrier_id=self.epilog_sync_bar_id,
|
||
number_of_threads=epilog_threads,
|
||
)
|
||
|
||
#
|
||
# TMA store C to global memory
|
||
#
|
||
if warp_idx == self.epilog_warp_id[0]:
|
||
cute.copy(
|
||
tma_atom_c,
|
||
bSG_sC[(None, c_buffer)],
|
||
bSG_gC[(None, subtile_idx)],
|
||
)
|
||
# Fence and barrier to make sure shared memory store is visible to TMA store
|
||
c_pipeline.producer_commit()
|
||
c_pipeline.producer_acquire()
|
||
cute.arch.barrier(
|
||
barrier_id=self.epilog_sync_bar_id,
|
||
number_of_threads=epilog_threads,
|
||
)
|
||
|
||
#
|
||
# Async arrive accumulator buffer empty
|
||
#
|
||
with cute.arch.elect_one():
|
||
acc_pipeline.consumer_release(acc_consumer_state)
|
||
acc_consumer_state.advance()
|
||
|
||
#
|
||
# Advance to next tile
|
||
#
|
||
tile_sched.advance_to_next_work()
|
||
work_tile = tile_sched.get_current_work()
|
||
|
||
#
|
||
# Dealloc the tensor memory buffer
|
||
#
|
||
if warp_idx == self.epilog_warp_id[0]:
|
||
cute.arch.relinquish_tmem_alloc_permit(is_two_cta=use_2cta_instrs)
|
||
epilog_threads = 32 * len(self.epilog_warp_id)
|
||
cute.arch.barrier(
|
||
barrier_id=self.epilog_sync_bar_id, number_of_threads=epilog_threads
|
||
)
|
||
if warp_idx == self.epilog_warp_id[0]:
|
||
if use_2cta_instrs:
|
||
cute.arch.mbarrier_arrive(
|
||
tmem_dealloc_mbar_ptr, cta_rank_in_cluster ^ 1
|
||
)
|
||
cute.arch.mbarrier_wait(tmem_dealloc_mbar_ptr, 0)
|
||
cute.arch.dealloc_tmem(
|
||
acc_tmem_ptr, self.num_tmem_alloc_cols, is_two_cta=use_2cta_instrs
|
||
)
|
||
#
|
||
# Wait for C store complete
|
||
#
|
||
c_pipeline.producer_tail()
|
||
|
||
    def mainloop_s2t_copy_and_partition(
        self,
        sSF: cute.Tensor,
        tSF: cute.Tensor,
    ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
        """
        Make tiledCopy for smem to tmem load for scale factor tensor, then use it to partition smem memory (source) and tensor memory (destination).

        :param sSF: The scale factor tensor in smem
        :type sSF: cute.Tensor
        :param tSF: The scale factor tensor in tmem
        :type tSF: cute.Tensor

        :return: A tuple containing (tiled_copy_s2t, tCsSF_compact_s2t, tCtSF_compact_s2t) where:
            - tiled_copy_s2t: The tiled copy operation for smem to tmem load for scale factor tensor (s2t)
            - tCsSF_compact_s2t: The partitioned scale factor tensor in smem
            - tCtSF_compact_s2t: The partitioned scale factor tensor in tmem
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        """
        # (MMA, MMA_MN, MMA_K, STAGE)
        tCsSF_compact = cute.filter_zeros(sSF)
        # (MMA, MMA_MN, MMA_K)
        tCtSF_compact = cute.filter_zeros(tSF)

        # Make S2T CopyAtom and tiledCopy
        copy_atom_s2t = cute.make_copy_atom(
            tcgen05.Cp4x32x128bOp(self.cta_group),
            self.sf_dtype,
        )
        tiled_copy_s2t = tcgen05.make_s2t_copy(copy_atom_s2t, tCtSF_compact)
        thr_copy_s2t = tiled_copy_s2t.get_slice(0)

        # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
        tCsSF_compact_s2t_ = thr_copy_s2t.partition_S(tCsSF_compact)
        # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K, STAGE)
        tCsSF_compact_s2t = tcgen05.get_s2t_smem_desc_tensor(
            tiled_copy_s2t, tCsSF_compact_s2t_
        )
        # ((ATOM_V, REST_V), Rest_Tiler, MMA_MN, MMA_K)
        tCtSF_compact_s2t = thr_copy_s2t.partition_D(tCtSF_compact)

        return tiled_copy_s2t, tCsSF_compact_s2t, tCtSF_compact_s2t

    def epilog_tmem_copy_and_partition(
        self,
        tidx: cutlass.Int32,
        tAcc: cute.Tensor,
        gC_mnl: cute.Tensor,
        epi_tile: cute.Tile,
        use_2cta_instrs: Union[cutlass.Boolean, bool],
    ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
        """
        Make tiledCopy for tensor memory load, then use it to partition tensor memory (source) and register array (destination).

        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param tAcc: The accumulator tensor to be copied and partitioned
        :type tAcc: cute.Tensor
        :param gC_mnl: The global tensor C
        :type gC_mnl: cute.Tensor
        :param epi_tile: The epilogue tiler
        :type epi_tile: cute.Tile
        :param use_2cta_instrs: Whether use_2cta_instrs is enabled
        :type use_2cta_instrs: bool

        :return: A tuple containing (tiled_copy_t2r, tTR_tAcc, tTR_rAcc) where:
            - tiled_copy_t2r: The tiled copy operation for tmem to register copy (t2r)
            - tTR_tAcc: The partitioned accumulator tensor
            - tTR_rAcc: The accumulated tensor in register used to hold t2r results
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        """
        # Make tiledCopy for tensor memory load
        copy_atom_t2r = sm100_utils.get_tmem_load_op(
            self.cta_tile_shape_mnk,
            self.c_layout,
            self.c_dtype,
            self.acc_dtype,
            epi_tile,
            use_2cta_instrs,
        )
        # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, STAGE)
        tAcc_epi = cute.flat_divide(
            tAcc[((None, None), 0, 0, None)],
            epi_tile,
        )
        # (EPI_TILE_M, EPI_TILE_N)
        tiled_copy_t2r = tcgen05.make_tmem_copy(
            copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)]
        )

        thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
        # (T2R, T2R_M, T2R_N, EPI_M, EPI_N, STAGE)
        tTR_tAcc = thr_copy_t2r.partition_S(tAcc_epi)

        # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, RestM, RestN, RestL)
        gC_mnl_epi = cute.flat_divide(
            gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile
        )
        # (T2R, T2R_M, T2R_N, EPI_M, EPI_N, RestM, RestN, RestL)
        tTR_gC = thr_copy_t2r.partition_D(gC_mnl_epi)
        # (T2R, T2R_M, T2R_N)
        tTR_rAcc = cute.make_fragment(
            tTR_gC[(None, None, None, 0, 0, 0, 0, 0)].shape, self.acc_dtype
        )
        return tiled_copy_t2r, tTR_tAcc, tTR_rAcc

    def epilog_smem_copy_and_partition(
        self,
        tiled_copy_t2r: cute.TiledCopy,
        tTR_rC: cute.Tensor,
        tidx: cutlass.Int32,
        sC: cute.Tensor,
    ) -> Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]:
        """
        Make tiledCopy for shared memory store, then use it to partition register array (source) and shared memory (destination).

        :param tiled_copy_t2r: The tiled copy operation for tmem to register copy (t2r)
        :type tiled_copy_t2r: cute.TiledCopy
        :param tTR_rC: The partitioned accumulator tensor
        :type tTR_rC: cute.Tensor
        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param sC: The shared memory tensor to be copied and partitioned
        :type sC: cute.Tensor

        :return: A tuple containing (tiled_copy_r2s, tRS_rC, tRS_sC) where:
            - tiled_copy_r2s: The tiled copy operation for register to smem copy (r2s)
            - tRS_rC: The partitioned tensor C (register source)
            - tRS_sC: The partitioned tensor C (smem destination)
        :rtype: Tuple[cute.TiledCopy, cute.Tensor, cute.Tensor]
        """
        copy_atom_r2s = sm100_utils.get_smem_store_op(
            self.c_layout, self.c_dtype, self.acc_dtype, tiled_copy_t2r
        )
        tiled_copy_r2s = cute.make_tiled_copy_D(copy_atom_r2s, tiled_copy_t2r)
        # (R2S, R2S_M, R2S_N, PIPE_D)
        thr_copy_r2s = tiled_copy_r2s.get_slice(tidx)
        tRS_sC = thr_copy_r2s.partition_D(sC)
        # (R2S, R2S_M, R2S_N)
        tRS_rC = tiled_copy_r2s.retile(tTR_rC)
        return tiled_copy_r2s, tRS_rC, tRS_sC

    def epilog_gmem_copy_and_partition(
        self,
        tidx: cutlass.Int32,
        atom: Union[cute.CopyAtom, cute.TiledCopy],
        gC_mnl: cute.Tensor,
        epi_tile: cute.Tile,
        sC: cute.Tensor,
    ) -> Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor]:
        """Make tiledCopy for global memory store, then use it to:
        partition shared memory (source) and global memory (destination) for the TMA store version.

        :param tidx: The thread index in epilogue warp groups
        :type tidx: cutlass.Int32
        :param atom: The copy_atom_c to be used for the TMA store version, or tiled_copy_t2r for the non-TMA store version
        :type atom: cute.CopyAtom or cute.TiledCopy
        :param gC_mnl: The global tensor C
        :type gC_mnl: cute.Tensor
        :param epi_tile: The epilogue tiler
        :type epi_tile: cute.Tile
        :param sC: The shared memory tensor to be copied and partitioned
        :type sC: cute.Tensor

        :return: A tuple containing (tma_atom_c, bSG_sC, bSG_gC) where:
            - tma_atom_c: The TMA copy atom
            - bSG_sC: The partitioned shared memory tensor C
            - bSG_gC: The partitioned global tensor C
        :rtype: Tuple[cute.CopyAtom, cute.Tensor, cute.Tensor]
        """
        # (EPI_TILE_M, EPI_TILE_N, EPI_M, EPI_N, RestM, RestN, RestL)
        gC_epi = cute.flat_divide(
            gC_mnl[((None, None), 0, 0, None, None, None)], epi_tile
        )

        tma_atom_c = atom
        sC_for_tma_partition = cute.group_modes(sC, 0, 2)
        gC_for_tma_partition = cute.group_modes(gC_epi, 0, 2)
        # ((ATOM_V, REST_V), EPI_M, EPI_N)
        # ((ATOM_V, REST_V), EPI_M, EPI_N, RestM, RestN, RestL)
        bSG_sC, bSG_gC = cpasync.tma_partition(
            tma_atom_c,
            0,
            cute.make_layout(1),
            sC_for_tma_partition,
            gC_for_tma_partition,
        )
        return tma_atom_c, bSG_sC, bSG_gC

    @staticmethod
    def _compute_stages(
        tiled_mma: cute.TiledMma,
        mma_tiler_mnk: Tuple[int, int, int],
        a_dtype: Type[cutlass.Numeric],
        a_major_mode: tcgen05.OperandMajorMode,
        b_dtype: Type[cutlass.Numeric],
        b_major_mode: tcgen05.OperandMajorMode,
        epi_tile: cute.Tile,
        c_dtype: Type[cutlass.Numeric],
        c_layout: utils.LayoutEnum,
        sf_dtype: Type[cutlass.Numeric],
        sf_vec_size: int,
        smem_capacity: int,
        occupancy: int,
    ) -> Tuple[int, int, int]:
        """Computes the number of stages for A/B/C operands based on heuristics.

        :param tiled_mma: The tiled MMA object defining the core computation.
        :type tiled_mma: cute.TiledMma
        :param mma_tiler_mnk: The shape (M, N, K) of the MMA tiler.
        :type mma_tiler_mnk: tuple[int, int, int]
        :param a_dtype: Data type of operand A.
        :type a_dtype: type[cutlass.Numeric]
        :param a_major_mode: Major mode of operand A.
        :type a_major_mode: tcgen05.OperandMajorMode
        :param b_dtype: Data type of operand B.
        :type b_dtype: type[cutlass.Numeric]
        :param b_major_mode: Major mode of operand B.
        :type b_major_mode: tcgen05.OperandMajorMode
        :param epi_tile: The epilogue tile shape.
        :type epi_tile: cute.Tile
        :param c_dtype: Data type of operand C (output).
        :type c_dtype: type[cutlass.Numeric]
        :param c_layout: Layout enum of operand C.
        :type c_layout: utils.LayoutEnum
        :param sf_dtype: Data type of the scale factor.
        :type sf_dtype: type[cutlass.Numeric]
        :param sf_vec_size: Scale factor vector size.
        :type sf_vec_size: int
        :param smem_capacity: Total available shared memory capacity in bytes.
        :type smem_capacity: int
        :param occupancy: Target number of CTAs per SM (occupancy).
        :type occupancy: int

        :return: A tuple containing the computed number of stages for:
            (ACC stages, A/B operand stages, C stages)
        :rtype: tuple[int, int, int]
        """
        # ACC stages
        num_acc_stage = 1 if mma_tiler_mnk[1] == 256 else 2

        # Default C stages
        num_c_stage = 2

        # Calculate smem layout and size for one stage of A, B, SFA, SFB and C
        a_smem_layout_stage_one = sm100_utils.make_smem_layout_a(
            tiled_mma,
            mma_tiler_mnk,
            a_dtype,
            1,  # temporary single stage, used only to size the layout
        )
        b_smem_layout_staged_one = sm100_utils.make_smem_layout_b(
            tiled_mma,
            mma_tiler_mnk,
            b_dtype,
            1,  # temporary single stage, used only to size the layout
        )
        sfa_smem_layout_staged_one = blockscaled_utils.make_smem_layout_sfa(
            tiled_mma,
            mma_tiler_mnk,
            sf_vec_size,
            1,  # temporary single stage, used only to size the layout
        )
        sfb_smem_layout_staged_one = blockscaled_utils.make_smem_layout_sfb(
            tiled_mma,
            mma_tiler_mnk,
            sf_vec_size,
            1,  # temporary single stage, used only to size the layout
        )

        c_smem_layout_staged_one = sm100_utils.make_smem_layout_epi(
            c_dtype,
            c_layout,
            epi_tile,
            1,
        )

        ab_bytes_per_stage = (
            cute.size_in_bytes(a_dtype, a_smem_layout_stage_one)
            + cute.size_in_bytes(b_dtype, b_smem_layout_staged_one)
            + cute.size_in_bytes(sf_dtype, sfa_smem_layout_staged_one)
            + cute.size_in_bytes(sf_dtype, sfb_smem_layout_staged_one)
        )
        mbar_helpers_bytes = 1024
        c_bytes_per_stage = cute.size_in_bytes(c_dtype, c_smem_layout_staged_one)
        c_bytes = c_bytes_per_stage * num_c_stage

        # Calculate A/B/SFA/SFB stages:
        # Start with total smem per CTA (capacity / occupancy)
        # Subtract reserved bytes and initial C stages bytes
        # Divide remaining by bytes needed per A/B/SFA/SFB stage
        num_ab_stage = (
            smem_capacity // occupancy - (mbar_helpers_bytes + c_bytes)
        ) // ab_bytes_per_stage
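        # Illustrative sizing (assumed numbers, not taken from a real device query):
        # with smem_capacity of roughly 232 KiB, occupancy 1, 1 KiB reserved for
        # mbarrier helpers, about 32 KiB per A/B/SFA/SFB stage, and two 32 KiB C
        # stages, the formula above gives (232 - (1 + 64)) // 32 = 5 A/B stages.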

        # Refine epilogue stages:
        # Calculate remaining smem after allocating for A/B/SFA/SFB stages and reserved bytes
        # Add remaining unused smem to epilogue
        num_c_stage += (
            smem_capacity
            - occupancy * ab_bytes_per_stage * num_ab_stage
            - occupancy * (mbar_helpers_bytes + c_bytes)
        ) // (occupancy * c_bytes_per_stage)

        return num_acc_stage, num_ab_stage, num_c_stage

    @staticmethod
    def _compute_grid(
        masked_m_tensor: cute.Tensor,
        c: cute.Tensor,
        cta_tile_shape_mnk: Tuple[int, int, int],
        cluster_shape_mn: Tuple[int, int],
        max_active_clusters: cutlass.Constexpr,
    ) -> Tuple[MaskedSchedulerParams, Tuple[int, int, int]]:
        """Use persistent tile scheduler to compute the grid size for the output tensor C.

        :param masked_m_tensor: Per-batch count of valid rows, used by the masked scheduler
        :type masked_m_tensor: cute.Tensor
        :param c: The output tensor C
        :type c: cute.Tensor
        :param cta_tile_shape_mnk: The shape (M, N, K) of the CTA tile.
        :type cta_tile_shape_mnk: tuple[int, int, int]
        :param cluster_shape_mn: Shape of each cluster in M, N dimensions.
        :type cluster_shape_mn: tuple[int, int]
        :param max_active_clusters: Maximum number of active clusters.
        :type max_active_clusters: cutlass.Constexpr

        :return: A tuple containing:
            - tile_sched_params: Parameters for the persistent tile scheduler.
            - grid: Grid shape for kernel launch.
        :rtype: Tuple[MaskedSchedulerParams, tuple[int, int, int]]
        """
        c_tiler = cute.slice_(cta_tile_shape_mnk, (None, None, 0))
        cluster_shape_mnl = (*cluster_shape_mn, 1)

        tile_sched_params = MaskedSchedulerParams(
            masked_m_tensor, c, c_tiler, cluster_shape_mnl
        )
        grid = MaskedScheduler.get_grid_shape(tile_sched_params, max_active_clusters)

        return tile_sched_params, grid

    @staticmethod
    def is_valid_dtypes_and_scale_factor_vec_size(
        ab_dtype: Type[cutlass.Numeric],
        sf_dtype: Type[cutlass.Numeric],
        sf_vec_size: int,
        c_dtype: Type[cutlass.Numeric],
    ) -> bool:
        """
        Check if the dtypes and sf_vec_size are valid combinations

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param sf_dtype: The data type of the scale factor
        :type sf_dtype: Type[cutlass.Numeric]
        :param sf_vec_size: The vector size of the scale factor
        :type sf_vec_size: int
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]

        :return: True if the dtypes and sf_vec_size are valid, False otherwise
        :rtype: bool
        """
        is_valid = True

        # Check valid ab_dtype
        if ab_dtype not in {
            cutlass.Float4E2M1FN,
            cutlass.Float8E5M2,
            cutlass.Float8E4M3FN,
        }:
            is_valid = False

        # Check valid sf_vec_size
        if sf_vec_size not in {16, 32}:
            is_valid = False

        # Check valid sf_dtype
        if sf_dtype not in {cutlass.Float8E8M0FNU, cutlass.Float8E4M3FN}:
            is_valid = False

        # Check valid sf_dtype and sf_vec_size combinations
        if sf_dtype == cutlass.Float8E4M3FN and sf_vec_size == 32:
            is_valid = False
        if ab_dtype in {cutlass.Float8E5M2, cutlass.Float8E4M3FN} and sf_vec_size == 16:
            is_valid = False

        # Check valid c_dtype
        if c_dtype not in {
            cutlass.Float32,
            cutlass.Float16,
            cutlass.BFloat16,
            cutlass.Float8E5M2,
            cutlass.Float8E4M3FN,
        }:
            is_valid = False

        return is_valid

    @staticmethod
    def is_valid_layouts(
        ab_dtype: Type[cutlass.Numeric],
        c_dtype: Type[cutlass.Numeric],
        a_major: str,
        b_major: str,
        c_major: str,
    ) -> bool:
        """
        Check if the tensor majorness (layout) combination is valid for the given dtypes

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]
        :param a_major: The major dimension of the A tensor
        :type a_major: str
        :param b_major: The major dimension of the B tensor
        :type b_major: str
        :param c_major: The major dimension of the C tensor
        :type c_major: str

        :return: True if the layouts are valid, False otherwise
        :rtype: bool
        """
        is_valid = True

        if ab_dtype is cutlass.Float4E2M1FN and not (a_major == "k" and b_major == "k"):
            is_valid = False
        return is_valid

    @staticmethod
    def is_valid_mma_tiler_and_cluster_shape(
        mma_tiler_mn: Tuple[int, int],
        cluster_shape_mn: Tuple[int, int],
    ) -> bool:
        """
        Check if the mma tiler and cluster shape are valid

        :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
        :type cluster_shape_mn: Tuple[int, int]

        :return: True if the mma tiler and cluster shape are valid, False otherwise
        :rtype: bool
        """
        is_valid = True
        # Skip invalid mma tile shape
        if mma_tiler_mn[0] not in [128, 256]:
            is_valid = False
        if mma_tiler_mn[1] not in [128, 256]:
            is_valid = False
        # Skip illegal cluster shape
        if cluster_shape_mn[0] % (2 if mma_tiler_mn[0] == 256 else 1) != 0:
            is_valid = False
        # Skip invalid cluster shape
        is_power_of_2 = lambda x: x > 0 and (x & (x - 1)) == 0
        if (
            cluster_shape_mn[0] * cluster_shape_mn[1] > 16
            or cluster_shape_mn[0] <= 0
            or cluster_shape_mn[1] <= 0
            # Special cluster shape check for scale factor multicasts.
            # Due to limited size of scale factors, we can't multicast among more than 4 CTAs.
            or cluster_shape_mn[0] > 4
            or cluster_shape_mn[1] > 4
            or not is_power_of_2(cluster_shape_mn[0])
            or not is_power_of_2(cluster_shape_mn[1])
        ):
            is_valid = False
        return is_valid

    @staticmethod
    def is_valid_tensor_alignment(
        m: int,
        n: int,
        k: int,
        l: int,
        ab_dtype: Type[cutlass.Numeric],
        c_dtype: Type[cutlass.Numeric],
        a_major: str,
        b_major: str,
        c_major: str,
    ) -> bool:
        """
        Check if the tensor alignment is valid

        :param m: The number of rows in the A tensor
        :type m: int
        :param n: The number of columns in the B tensor
        :type n: int
        :param k: The number of columns in the A tensor
        :type k: int
        :param l: The batch size (number of GEMM problems)
        :type l: int
        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]
        :param a_major: The major axis of the A tensor
        :type a_major: str
        :param b_major: The major axis of the B tensor
        :type b_major: str
        :param c_major: The major axis of the C tensor
        :type c_major: str

        :return: True if the problem shape is valid, False otherwise
        :rtype: bool
        """
        is_valid = True

        def check_contigous_16B_alignment(dtype, is_mode0_major, tensor_shape):
            major_mode_idx = 0 if is_mode0_major else 1
            num_major_elements = tensor_shape[major_mode_idx]
            num_contiguous_elements = 16 * 8 // dtype.width
            return num_major_elements % num_contiguous_elements == 0
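        # The helper above requires the contiguous (major) extent to fill whole
        # 16-byte chunks. Illustrative numbers: one 16 B access holds
        # 16 * 8 / dtype.width elements, so a k-major Float8 operand needs
        # k % 16 == 0 and a k-major Float4E2M1FN operand needs k % 32 == 0.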

        if (
            not check_contigous_16B_alignment(ab_dtype, a_major == "m", (m, k, l))
            or not check_contigous_16B_alignment(ab_dtype, b_major == "n", (n, k, l))
            or not check_contigous_16B_alignment(c_dtype, c_major == "m", (m, n, l))
        ):
            is_valid = False
        return is_valid

    @staticmethod
    def can_implement(
        ab_dtype: Type[cutlass.Numeric],
        sf_dtype: Type[cutlass.Numeric],
        sf_vec_size: int,
        c_dtype: Type[cutlass.Numeric],
        mma_tiler_mn: Tuple[int, int],
        cluster_shape_mn: Tuple[int, int],
        m: int,
        n: int,
        k: int,
        l: int,
        a_major: str,
        b_major: str,
        c_major: str,
    ) -> bool:
        """
        Check if the gemm can be implemented

        :param ab_dtype: The data type of the A and B operands
        :type ab_dtype: Type[cutlass.Numeric]
        :param sf_dtype: The data type of the scale factor tensor
        :type sf_dtype: Type[cutlass.Numeric]
        :param sf_vec_size: The vector size
        :type sf_vec_size: int
        :param c_dtype: The data type of the output tensor
        :type c_dtype: Type[cutlass.Numeric]
        :param mma_tiler_mn: The (M, N) shape of the MMA instruction tiler
        :type mma_tiler_mn: Tuple[int, int]
        :param cluster_shape_mn: The (ClusterM, ClusterN) shape of the CTA cluster
        :type cluster_shape_mn: Tuple[int, int]
        :param m: The number of rows in the A tensor
        :type m: int
        :param n: The number of columns in the B tensor
        :type n: int
        :param k: The number of columns in the A tensor
        :type k: int
        :param l: The batch size (number of GEMM problems)
        :type l: int
        :param a_major: The major axis of the A tensor
        :type a_major: str
        :param b_major: The major axis of the B tensor
        :type b_major: str
        :param c_major: The major axis of the C tensor
        :type c_major: str

        :return: True if the gemm can be implemented, False otherwise
        :rtype: bool
        """
        can_implement = True
        # Skip unsupported types
        if not Sm100BlockScaledPersistentDenseGemmKernel.is_valid_dtypes_and_scale_factor_vec_size(
            ab_dtype, sf_dtype, sf_vec_size, c_dtype
        ):
            can_implement = False
        # Skip unsupported layouts
        if not Sm100BlockScaledPersistentDenseGemmKernel.is_valid_layouts(
            ab_dtype, c_dtype, a_major, b_major, c_major
        ):
            can_implement = False
        # Skip invalid mma tile shape and cluster shape
        if not Sm100BlockScaledPersistentDenseGemmKernel.is_valid_mma_tiler_and_cluster_shape(
            mma_tiler_mn, cluster_shape_mn
        ):
            can_implement = False
        # Skip illegal problem shape for load/store alignment
        if not Sm100BlockScaledPersistentDenseGemmKernel.is_valid_tensor_alignment(
            m, n, k, l, ab_dtype, c_dtype, a_major, b_major, c_major
        ):
            can_implement = False
        return can_implement


@cute.jit
def cvt_sf_MKL_to_M32x4xrm_K4xrk_L(
    sf_ref_tensor: cute.Tensor,
    sf_mma_tensor: cute.Tensor,
):
    """Convert scale factor tensor from MKL layout to mma specification M(32x4xrest_m)xK(4xrest_k)xL layout"""
    # sf_mma_tensor has flattened shape (32, 4, rest_m, 4, rest_k, l)
    # group to ((32, 4, rest_m), (4, rest_k), l)
    sf_mma_tensor = cute.group_modes(sf_mma_tensor, 0, 3)
    sf_mma_tensor = cute.group_modes(sf_mma_tensor, 1, 3)
    for i in cutlass.range(cute.size(sf_ref_tensor)):
        mkl_coord = sf_ref_tensor.layout.get_hier_coord(i)
        sf_mma_tensor[mkl_coord] = sf_ref_tensor[mkl_coord]


@cute.jit
def cvt_sf_MKL_to_M32x4xrm_K4xrk_L_mma_spec(
    sf_mma_tensor: cute.Tensor,
):
    """Convert scale factor tensor from MKL layout to mma specification M(32x4xrest_m)xK(4xrest_k)xL layout"""
    # sf_mma_tensor has flattened shape (32, 4, rest_m, 4, rest_k, l)
    # group to ((32, 4, rest_m), (4, rest_k), l)
    sf_mma_tensor = cute.group_modes(sf_mma_tensor, 0, 3)
    sf_mma_tensor = cute.group_modes(sf_mma_tensor, 1, 3)


# Create scale factor tensor SFA/SFB
def create_scale_factor_tensor(l, mn, k, sf_vec_size, dtype, device):
    def ceil_div(a, b):
        return (a + b - 1) // b

    sf_k = ceil_div(k, sf_vec_size)
    ref_shape = (l, mn, sf_k)

    atom_m = (32, 4)
    atom_k = 4
    mma_shape = (
        l,
        ceil_div(mn, atom_m[0] * atom_m[1]),
        ceil_div(sf_k, atom_k),
        atom_m[0],
        atom_m[1],
        atom_k,
    )
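    # Illustrative example (assumed sizes): mn=256, k=4096, sf_vec_size=32 gives
    # sf_k=128 and mma_shape=(l, 2, 32, 32, 4, 4), i.e. the M extent is folded into
    # (32, 4, rest_m) blocks and the scale-factor K extent into (4, rest_k) blocks.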

    ref_permute_order = (1, 2, 0)
    mma_permute_order = (3, 4, 1, 5, 2, 0)

    # Create f32 ref torch tensor
    ref_f32_torch_tensor_cpu = cutlass_torch.create_and_permute_torch_tensor(
        ref_shape,
        torch.float32,
        permute_order=ref_permute_order,
        init_type=cutlass_torch.TensorInitType.RANDOM,
        init_config=cutlass_torch.RandomInitConfig(
            min_val=1,
            max_val=3,
        ),
    )

    # Create f32 cute torch tensor
    cute_f32_torch_tensor_cpu = cutlass_torch.create_and_permute_torch_tensor(
        mma_shape,
        torch.float32,
        permute_order=mma_permute_order,
        init_type=cutlass_torch.TensorInitType.RANDOM,
        init_config=cutlass_torch.RandomInitConfig(
            min_val=0,
            max_val=1,
        ),
    )

    # convert ref f32 tensor to cute f32 tensor
    cvt_sf_MKL_to_M32x4xrm_K4xrk_L(
        from_dlpack(ref_f32_torch_tensor_cpu),
        from_dlpack(cute_f32_torch_tensor_cpu),
    )
    cute_f32_torch_tensor = cute_f32_torch_tensor_cpu.to(device, non_blocking=True)

    # reshape makes memory contiguous
    ref_f32_torch_tensor_cpu = (
        ref_f32_torch_tensor_cpu.permute(2, 0, 1)
        .unsqueeze(-1)
        .expand(l, mn, sf_k, sf_vec_size)
        .reshape(l, mn, sf_k * sf_vec_size)
        .permute(*ref_permute_order)
    )
    # prune to mkl for reference check.
    ref_f32_torch_tensor_cpu = ref_f32_torch_tensor_cpu[:, :k, :]
    ref_f32_torch_tensor = ref_f32_torch_tensor_cpu.to(device, non_blocking=True)

    # Create dtype cute torch tensor (cpu)
    cute_tensor, cute_torch_tensor = cutlass_torch.cute_tensor_like(
        cute_f32_torch_tensor_cpu,
        dtype,
        is_dynamic_layout=True,
        assumed_align=16,
    )

    # Convert f32 cute tensor to dtype cute tensor
    cute_tensor = cutlass_torch.convert_cute_tensor(
        cute_f32_torch_tensor,
        cute_tensor,
        dtype,
        is_dynamic_layout=True,
    )
    return ref_f32_torch_tensor, cute_tensor, cute_torch_tensor


class MaskedBatchedMatmulCuteDSL:
    def __init__(
        self,
        m: int,
        n: int,
        k: int,
        l: int,
        a_major: str,
        b_major: str,
        c_major: str,
        ab_dtype: Type[cutlass.Numeric],
        sf_dtype: Type[cutlass.Numeric],
        c_dtype: Type[cutlass.Numeric],
        alpha_dtype: Optional[Type[cutlass.Numeric]],
        sf_vec_size: int,
        mma_tiler_mn: Tuple[int, int],
        cluster_shape_mn: Tuple[int, int],
        sm_count: int,
        sm_version: str,
    ):
        self._m = m
        self._n = n
        self._k = k
        self._l = l
        self._a_major = a_major
        self._b_major = b_major
        self._c_major = c_major
        self._ab_dtype = ab_dtype
        self._sf_dtype = sf_dtype
        self._c_dtype = c_dtype
        self._alpha_dtype = alpha_dtype
        self._sf_vec_size = sf_vec_size
        self._mma_tiler_mn = mma_tiler_mn
        self._cluster_shape_mn = cluster_shape_mn

        if not Sm100BlockScaledPersistentDenseGemmKernel.can_implement(
            ab_dtype,
            sf_dtype,
            sf_vec_size,
            c_dtype,
            mma_tiler_mn,
            cluster_shape_mn,
            m,
            n,
            k,
            l,
            a_major,
            b_major,
            c_major,
        ):
            raise TypeError(
                f"MaskedBatchedMatmulCuteDSL: Unsupported with {ab_dtype}, {sf_dtype}, {sf_vec_size}, {c_dtype}, {mma_tiler_mn}, {cluster_shape_mn}, {m}, {n}, {k}, {l}, {a_major}, {b_major}, {c_major}"
            )

        # Compute max active clusters on current device
        hardware_info = cutlass.utils.HardwareInfo()
        self._max_active_clusters = min(
            hardware_info.get_max_active_clusters(
                self._cluster_shape_mn[0] * self._cluster_shape_mn[1]
            ),
            sm_count,
        )
        self._sm_version = sm_version

    @cute.jit
    def __call__(
        self,
        a_ptr: cute.Pointer,
        b_ptr: cute.Pointer,
        sfa_ptr: cute.Pointer,
        sfb_ptr: cute.Pointer,
        c_ptr: cute.Pointer,
        masked_m_ptr: cute.Pointer,
        alpha_ptr: cute.Pointer,
        current_stream: cuda.CUstream,
    ):
        a_tensor = cute.make_tensor(
            a_ptr,
            layout=cute.make_ordered_layout(
                (self._m, self._k, self._l),
                order=(0, 1, 2) if self._a_major == "m" else (1, 0, 2),
            ),
        )
        b_tensor = cute.make_tensor(
            b_ptr,
            layout=cute.make_ordered_layout(
                (self._n, self._k, self._l),
                order=(0, 1, 2) if self._b_major == "n" else (1, 0, 2),
            ),
        )
        c_tensor = cute.make_tensor(
            c_ptr,
            layout=cute.make_ordered_layout(
                (self._m, self._n, self._l),
                order=(0, 1, 2) if self._c_major == "m" else (1, 0, 2),
            ),
        )

        # calculate sf_tensor shape and order
        def ceil_div(a, b):
            return (a + b - 1) // b

        sf_k = ceil_div(self._k, self._sf_vec_size)

        atom_m = (32, 4)
        atom_k = 4
        mma_shape_a = (
            self._l,
            ceil_div(self._m, atom_m[0] * atom_m[1]),
            ceil_div(sf_k, atom_k),
            atom_m[0],
            atom_m[1],
            atom_k,
        )
        mma_shape_b = (
            self._l,
            ceil_div(self._n, atom_m[0] * atom_m[1]),
            ceil_div(sf_k, atom_k),
            atom_m[0],
            atom_m[1],
            atom_k,
        )
        mma_permute_order = (3, 4, 1, 5, 2, 0)

        sfa_tensor = cute.make_tensor(
            sfa_ptr,
            layout=cute.make_ordered_layout(
                mma_shape_a,
                order=mma_permute_order,
            ),
        )
        sfb_tensor = cute.make_tensor(
            sfb_ptr,
            layout=cute.make_ordered_layout(
                mma_shape_b,
                order=mma_permute_order,
            ),
        )
        cvt_sf_MKL_to_M32x4xrm_K4xrk_L_mma_spec(sfa_tensor)
        cvt_sf_MKL_to_M32x4xrm_K4xrk_L_mma_spec(sfb_tensor)

        masked_m_tensor = cute.make_tensor(
            masked_m_ptr,
            layout=cute.make_ordered_layout((self._l,), order=(0,)),
        )

        # Use const_expr for compile-time conditional
        alpha_tensor = (
            cute.make_tensor(
                alpha_ptr,
                layout=cute.make_ordered_layout((self._l,), order=(0,)),
            )
            if cutlass.const_expr(alpha_ptr is not None)
            else None
        )

        Sm100BlockScaledPersistentDenseGemmKernel(
            sf_vec_size=self._sf_vec_size,
            mma_tiler_mn=self._mma_tiler_mn,
            cluster_shape_mn=self._cluster_shape_mn,
            sm_version=self._sm_version,
        )(
            a_tensor,
            b_tensor,
            sfa_tensor,
            sfb_tensor,
            c_tensor,
            masked_m_tensor,
            alpha_tensor,
            self._max_active_clusters,
            current_stream,
        )


@functools.cache
def get_cute_dsl_compiled_masked_gemm_kernel(
    m: int,
    n: int,
    k: int,
    l: int,
    a_major: str,
    b_major: str,
    c_major: str,
    ab_dtype: Type[cutlass.Numeric],
    sf_dtype: Type[cutlass.Numeric],
    c_dtype: Type[cutlass.Numeric],
    alpha_dtype: Optional[Type[cutlass.Numeric]],
    sf_vec_size: int,
    mma_tiler_mn: Tuple[int, int],
    cluster_shape_mn: Tuple[int, int],
    sm_count: int,
    sm_version: str,
) -> Callable:
    def get_cute_pointers(
        input_tensors: Optional[List[torch.Tensor]],
    ) -> List[cute.Pointer]:
        if input_tensors is None:
            (
                a_data_ptr,
                b_data_ptr,
                sfa_data_ptr,
                sfb_data_ptr,
                c_data_ptr,
                masked_m_data_ptr,
                alpha_data_ptr,
            ) = [16 for _ in range(7)]
        else:
            (
                a_tensor_gpu,
                b_tensor_gpu,
                sfa_tensor_gpu,
                sfb_tensor_gpu,
                c_tensor_gpu,
                masked_m_tensor_gpu,
                alpha_tensor_gpu,
            ) = input_tensors
            (
                a_data_ptr,
                b_data_ptr,
                sfa_data_ptr,
                sfb_data_ptr,
                c_data_ptr,
                masked_m_data_ptr,
                alpha_data_ptr,
            ) = (
                a_tensor_gpu.data_ptr(),
                b_tensor_gpu.data_ptr(),
                sfa_tensor_gpu.data_ptr(),
                sfb_tensor_gpu.data_ptr(),
                c_tensor_gpu.data_ptr(),
                masked_m_tensor_gpu.data_ptr(),
                alpha_tensor_gpu.data_ptr() if alpha_tensor_gpu is not None else None,
            )

        a_ptr = make_ptr(
            ab_dtype,
            a_data_ptr,
            cute.AddressSpace.gmem,
            assumed_align=16,
        )
        b_ptr = make_ptr(
            ab_dtype,
            b_data_ptr,
            cute.AddressSpace.gmem,
            assumed_align=16,
        )
        sfa_ptr = make_ptr(
            sf_dtype,
            sfa_data_ptr,
            cute.AddressSpace.gmem,
            assumed_align=16,
        )
        sfb_ptr = make_ptr(
            sf_dtype,
            sfb_data_ptr,
            cute.AddressSpace.gmem,
            assumed_align=16,
        )
        c_ptr = make_ptr(
            c_dtype,
            c_data_ptr,
            cute.AddressSpace.gmem,
            assumed_align=16,
        )
        masked_m_ptr = make_ptr(
            cutlass.Int32,
            masked_m_data_ptr,
            cute.AddressSpace.gmem,
            assumed_align=16,
        )
        alpha_ptr = (
            make_ptr(
                alpha_dtype,
                alpha_data_ptr,
                cute.AddressSpace.gmem,
                assumed_align=16,
            )
            if alpha_data_ptr is not None and alpha_dtype is not None
            else None
        )

        return [a_ptr, b_ptr, sfa_ptr, sfb_ptr, c_ptr, masked_m_ptr, alpha_ptr]

    kernel = cute.compile(
        MaskedBatchedMatmulCuteDSL(
            m=m,
            n=n,
            k=k,
            l=l,
            a_major=a_major,
            b_major=b_major,
            c_major=c_major,
            ab_dtype=ab_dtype,
            sf_dtype=sf_dtype,
            c_dtype=c_dtype,
            alpha_dtype=alpha_dtype,
            sf_vec_size=sf_vec_size,
            mma_tiler_mn=mma_tiler_mn,
            cluster_shape_mn=cluster_shape_mn,
            sm_count=sm_count,
            sm_version=sm_version,
        ),
        *get_cute_pointers(None),
        cutlass_torch.current_stream(),
    )
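    # Note: the compile step above only needs arguments with the right dtypes and
    # layouts, so get_cute_pointers(None) supplies placeholder device addresses
    # (the literal 16). The compiled kernel is cached per shape/dtype by
    # functools.cache and is later invoked with real tensor pointers via tensor_api.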

    def tensor_api(
        a_tensor_gpu: torch.Tensor,
        b_tensor_gpu: torch.Tensor,
        sfa_tensor_gpu: torch.Tensor,
        sfb_tensor_gpu: torch.Tensor,
        masked_m_tensor_gpu: torch.Tensor,
        c_tensor_gpu: Optional[torch.Tensor] = None,
        alpha_tensor_gpu: Optional[torch.Tensor] = None,
    ):
        if c_tensor_gpu is None:
            # fp4 gemm output is not supported
            c_tensor_gpu = torch.empty(
                (l, m, n),
                dtype=cutlass_to_torch_dtype(c_dtype),
                device="cuda",
            )

        # fp4 or fp8 torch tensor to cute tensor
        current_stream = cutlass_torch.current_stream()

        nonlocal kernel
        kernel(
            *get_cute_pointers(
                [
                    a_tensor_gpu,
                    b_tensor_gpu,
                    sfa_tensor_gpu,
                    sfb_tensor_gpu,
                    c_tensor_gpu,
                    masked_m_tensor_gpu,
                    alpha_tensor_gpu,
                ]
            ),
            current_stream,
        )

        return c_tensor_gpu

    return tensor_api


def grouped_gemm_nt_masked(
    lhs: Tuple[torch.Tensor, torch.Tensor],
    rhs: Tuple[torch.Tensor, torch.Tensor],
    out: torch.Tensor,
    masked_m: torch.Tensor,
    *,
    ab_dtype: str,
    sf_dtype: str,
    c_dtype: str,
    sf_vec_size: int,
    sm_count: Optional[int] = None,
    **kwargs,
):
    """
    Executes a masked, batched matrix multiplication (GEMM) with scale factors and optional alpha scaling at output.

    Args:
        lhs (Tuple[torch.Tensor, torch.Tensor]): Tuple containing the left-hand side input tensor (A) and its scale factor tensor (SFA).
            - A should be in (m, k, l) order, but physically (l, m, k). For an fp4 tensor with 8-bit storage, we expect the shape to be (m, k/2, l).
            - SFA should be in (m32, m4, rm, k4, rk, l) order, but physically (l, rm, rk, m32, m4, k4).
        rhs (Tuple[torch.Tensor, torch.Tensor]): Tuple containing the right-hand side input tensor (B) and its scale factor tensor (SFB).
            - B should be in (n, k, l) order, but physically (l, n, k). For an fp4 tensor with 8-bit storage, we expect the shape to be (n, k/2, l).
            - SFB should be in (n32, n4, rn, k4, rk, l) order, but physically (l, rn, rk, n32, n4, k4).
        out (torch.Tensor): Output tensor to store the result, with shape (l, m, n).
        masked_m (torch.Tensor): 1D tensor of shape (l,) specifying the valid row count for each batch (used for masking).
        ab_dtype (str): Data type for A and B matrices. Supported: "float4_e2m1fn", "float8_e4m3fn", "float8_e5m2".
        sf_dtype (str): Data type for scale factors. Supported: "float8_e8m0fnu", "float8_e4m3fn".
        c_dtype (str): Data type for output matrix C. Supported: "float16", "bfloat16", "float32", "float8_e4m3fn", "float8_e5m2".
        sf_vec_size (int): Vector size for scale factors. Typically 16 or 32.
        sm_count (int, optional): Number of SMs to use. Default: max available SMs under the CTA configuration.
        mma_tiler_mn (Tuple[int, int], optional): Shape of the MMA tiler (M, N). Default: (128, 128).
        cluster_shape_mn (Tuple[int, int], optional): Shape of the CTA cluster (ClusterM, ClusterN). Default: (1, 1).
        alpha_dtype (str, optional): Data type for alpha scaling factors.
        alpha (torch.Tensor, optional): Optional 1D tensor of shape (l,) containing per-batch scaling factors. Performs per-batch scaling out = alpha * out.

    Notes:
        - Legend for the input tensors:
            * `l` is the batch size, `m/n` is the number of rows, and `k` is the number of columns.
            * `m/n32`, `m/n4`, `k4` are constant values 32, 4, 4 respectively.
            * `m32 * m4 * rm` should be the same as `M`, which is `m` padded up to the nearest multiple of 128.
            * `n32 * n4 * rn` should be the same as `N`, which is `n` padded up to the nearest multiple of 128.
            * `k4 * rk` should be the same as `K`, which is `k / sf_vec_size` padded up to the nearest multiple of 4.
        - The function applies masking per batch using masked_m.
        - If alpha is provided, each batch output is multiplied by its corresponding alpha value: out = alpha * (A @ B).
        - The result is written to `out`.
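    Example:
        A minimal illustrative call; the tensor names (a_fp8, sfa, b_fp8, sfb,
        masked_m) are placeholders, and the inputs are assumed to be already
        quantized and laid out as described above::

            out = torch.empty((l, m, n), dtype=torch.bfloat16, device="cuda")
            grouped_gemm_nt_masked(
                (a_fp8, sfa),
                (b_fp8, sfb),
                out,
                masked_m,
                ab_dtype="float8_e4m3fn",
                sf_dtype="float8_e8m0fnu",
                c_dtype="bfloat16",
                sf_vec_size=32,
            )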
"""
|
||
|
||
a_torch, sfa_torch = lhs
|
||
b_torch, sfb_torch = rhs
|
||
c_torch = out
|
||
|
||
m, k, l = a_torch.shape
|
||
n, _, _ = b_torch.shape
|
||
|
||
if ab_dtype == "float4_e2m1fn":
|
||
# todo(yingyi): update mnk based on a_major and b_major, and support more major.
|
||
# Note: only support deepgemm-like shape for now
|
||
k = k * 2
|
||
|
||
mma_tiler_mn = kwargs.pop("mma_tiler_mn", (128, 128))
|
||
cluster_shape_mn = kwargs.pop("cluster_shape_mn", (1, 1))
|
||
if sm_count is None:
|
||
sm_count = get_num_sm(a_torch.device)
|
||
|
||
alpha = kwargs.pop("alpha", None)
|
||
alpha_dtype = kwargs.pop("alpha_dtype", None)
|
||
|
||
assert len(kwargs) == 0, f"Unsupported kwargs: {kwargs}"
|
||
|
||
major, minor = get_compute_capability(a_torch.device)
|
||
if major == 11 and minor == 0:
|
||
raise ValueError("SM110 is not supported for cute-dsl backend.")
|
||
|
||
return get_cute_dsl_compiled_masked_gemm_kernel(
|
||
m=m,
|
||
n=n,
|
||
k=k,
|
||
l=l,
|
||
a_major="k",
|
||
b_major="k",
|
||
c_major="n",
|
||
ab_dtype=get_cutlass_dtype(ab_dtype),
|
||
sf_dtype=get_cutlass_dtype(sf_dtype),
|
||
c_dtype=get_cutlass_dtype(c_dtype),
|
||
alpha_dtype=None if alpha is None else get_cutlass_dtype(alpha_dtype),
|
||
sf_vec_size=sf_vec_size,
|
||
mma_tiler_mn=mma_tiler_mn,
|
||
cluster_shape_mn=cluster_shape_mn,
|
||
sm_count=sm_count,
|
||
sm_version=f"sm_{major}{minor}",
|
||
)(
|
||
a_tensor_gpu=a_torch,
|
||
b_tensor_gpu=b_torch,
|
||
sfa_tensor_gpu=sfa_torch,
|
||
sfb_tensor_gpu=sfb_torch,
|
||
c_tensor_gpu=c_torch,
|
||
masked_m_tensor_gpu=masked_m,
|
||
alpha_tensor_gpu=alpha,
|
||
)
|