Commit 54e83db

jwfromm authored and facebook-github-bot committed
Add optional zero_start_index_M argument to triton fp8 rowwise quantization (pytorch#3628)
Summary:
Pull Request resolved: pytorch#3628
X-link: facebookresearch/FBGEMM#705

In MOE models, many rows may be sparsely populated. There's no reason to run quantization on these empty values. This diff adds a new optional argument to fp8 rowwise quantization that allows skipping over the sparse region of rows.

Reviewed By: jasonjk-park, jianyuh, jiawenliu64

Differential Revision: D68797978

fbshipit-source-id: 0142427bb9324592fa29d2e162f1edd8d9fd1c9c
1 parent 5b048ab commit 54e83db
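
For context, a minimal usage sketch of the new argument (illustration only; the shapes, dtype, and import path are assumptions based on the file locations in this commit, not taken from it). Each row of an MoE-style activation carries valid data only in a leading prefix, and zero_start_index_M tells the quantizer how many leading elements per row are populated, so the kernel can skip the zeroed tail:

    import torch
    from fbgemm_gpu.experimental.gemm.triton_gemm.fp8_gemm import quantize_fp8_row  # assumed import path

    # Hypothetical [experts, tokens, hidden] activation with sparsely populated rows.
    x = torch.randn(4, 128, 256, device="cuda", dtype=torch.bfloat16)
    valid_per_row = torch.randint(0, 257, (4, 128), device="cuda")            # nonzero count per row
    x = x * (torch.arange(256, device="cuda") < valid_per_row.unsqueeze(-1))  # zero out each row's tail

    x_fp8, x_scale = quantize_fp8_row(x, zero_start_index_M=valid_per_row)    # quantization skips the zeroed tails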

2 files changed: +81 -11 lines changed

fbgemm_gpu/experimental/gemm/test/fp8_gemm_test.py (+32 -2)
@@ -37,6 +37,7 @@ def _test_quantize_fp8_row(
             use_triton: bool,
             device: torch.device,
             output_device: Optional[torch.device] = None,
+            use_jagged: bool = False,
             use_scale_ub: bool = False,
             transpose_inputs: bool = False,
         ) -> None:
@@ -49,16 +50,30 @@ def _test_quantize_fp8_row(
                 for dim1, dim2 in itertools.combinations(dims, 2):
                     dims_list = list(dims)
                     dims_list[dim1], dims_list[dim2] = dims_list[dim2], dims_list[dim1]
-                    inputs.append(a.permute(dims_list))
+                    inputs.append(a.clone().permute(dims_list))
             scale_ub = (
                 torch.tensor([1200], dtype=torch.float, device=device)
                 if use_scale_ub
                 else None
             )
             for input_a in inputs:
+                # Apply sparsification if specified.
+                zero_start_index_M = None
+                if use_jagged:
+                    m_vals = torch.randint(
+                        0, input_a.shape[-1] + 1, (input_a.shape[:-1])
+                    )
+                    mask = torch.arange(input_a.shape[-1]).expand(
+                        input_a.shape[:-1] + (input_a.shape[-1],)
+                    ) >= m_vals.unsqueeze(-1)
+                    # Set corresponding values to 0.
+                    input_a[mask] = 0.0
+                    # Generate nonzero tensor in same layout as input.
+                    zero_start_index_M = torch.count_nonzero(input_a, dim=-1)
                 a_fp8, a_scale = quantize_fp8_row(
                     input_a,
                     scale_ub=scale_ub,
+                    zero_start_index_M=zero_start_index_M,
                     use_triton=use_triton,
                     output_device=output_device,
                 )
@@ -73,7 +88,10 @@ def _test_quantize_fp8_row(
 
             self.assertTrue(
                 torch.allclose(
-                    input_a.to(device=output_device), a_torch, atol=2e-1, rtol=1e-1
+                    input_a.to(device=output_device),
+                    a_torch,
+                    atol=2e-1,
+                    rtol=1e-1,
                 )
             )
 
@@ -97,6 +115,18 @@ def _test_quantize_fp8_row(
         )
         _test_quantize_fp8_row((4, 2, 3), True, torch.device("cpu"))
         _test_quantize_fp8_row((6, 4, 2, 3), True, torch.device("cpu"))
+        # Test with zero_start_index_M
+        _test_quantize_fp8_row((20, 30), True, torch.device("cuda"), use_jagged=True)
+        _test_quantize_fp8_row(
+            (6, 4, 2, 3), True, torch.device("cuda"), use_jagged=True
+        )
+        _test_quantize_fp8_row(
+            (4, 2, 3),
+            True,
+            torch.device("cuda"),
+            transpose_inputs=True,
+            use_jagged=True,
+        )
 
     def test_scale_fp8_row(self) -> None:
         def _test_scale_fp8_row(
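
To make the jagged setup in the test above concrete, here is a tiny standalone illustration (made-up values, CPU only) of the mask and metadata the test derives before calling quantize_fp8_row:

    import torch

    x = torch.tensor([[1.0, 2.0, 3.0, 4.0],
                      [5.0, 6.0, 7.0, 8.0]])
    m_vals = torch.tensor([2, 3])  # per-row valid lengths (random in the real test)
    mask = torch.arange(x.shape[-1]) >= m_vals.unsqueeze(-1)
    x[mask] = 0.0                                         # -> [[1, 2, 0, 0], [5, 6, 7, 0]]
    zero_start_index_M = torch.count_nonzero(x, dim=-1)   # -> tensor([2, 3])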

fbgemm_gpu/experimental/gemm/triton_gemm/fp8_gemm.py (+49 -9)
@@ -2301,6 +2301,7 @@ def _kernel_quantize_fp8_row(
     A_scale,
     A_fp8,
     scale_ub,
+    zero_start_index_M,
     B,
     M,
     N,
@@ -2313,10 +2314,14 @@ def _kernel_quantize_fp8_row(
     stride_om,
     stride_on,
     stride_ok,
+    stride_zb,
+    stride_zm,
+    stride_zn,
     TL_FP8_DTYPE: tl.constexpr,
     MAX_FP8: tl.constexpr,
     EPS: tl.constexpr,
     CLAMP_MAX: tl.constexpr,
+    JAGGED: tl.constexpr,
     BLOCK_SIZE: tl.constexpr,
     USE_INT64: tl.constexpr,
 ) -> None:
@@ -2347,10 +2352,14 @@ def _kernel_quantize_fp8_row(
         stride_om (int): Stride of m dimension of output.
         stride_on (int): Stride of n dimension of output.
         stride_ok (int): Stride of k dimension of output.
+        stride_zb (int): Stride of b dimension of jagged index.
+        stride_zm (int): Stride of m dimension of jagged index.
+        stride_zn (int): Stride of n dimension of jagged index.
         TL_FP8_DTYPE (tl.dtype): Target fp8 datatype.
         MAX_FP8 (float): Maxmimum expressible value for FP8.
         EPS (float): Epsilon value for numerical stability.
         CLAMP_MAX (bool): Whethar to apply scale_ub.
+        JAGGED (bool): Whether to use jagged indexing.
         BLOCK_SIZE (int): Block size for reduction.
         USE_INT64 (bool): Whether to use int64 indexing for large inputs.
     """
@@ -2371,11 +2380,25 @@ def _kernel_quantize_fp8_row(
         + (pid % (M * N)) % N * stride_on
     )
 
+    if JAGGED:
+        z_offset_base = (
+            pid // (M * N) * stride_zb
+            + (pid % (M * N)) // N * stride_zm
+            + (pid % (M * N)) % N * stride_zn
+        )
+        row_size = tl.load(zero_start_index_M + z_offset_base)
+    else:
+        row_size = K
+
+    blocks = tl.cdiv(row_size, BLOCK_SIZE)
+
     # Calculate max.
     cur_max = 0.0
-    for _k in range(0, tl.cdiv(K, BLOCK_SIZE)):
+    for _k in range(0, blocks):
         a = tl.load(
-            A + a_offset_base + n_offset * stride_ak, mask=n_offset < K, other=0.0
+            A + a_offset_base + n_offset * stride_ak,
+            mask=n_offset < row_size,
+            other=0.0,
         )
         tile_max = tl.max(tl.abs(a))
         cur_max = tl.maximum(tile_max, cur_max)
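
The offset arithmetic above decomposes the flat program id into a (batch, m, n) row coordinate and uses the new strides to locate that row's entry in zero_start_index_M. A plain-Python sketch of the same mapping (illustration only, not kernel code):

    def jagged_row_offset(pid: int, M: int, N: int,
                          stride_zb: int, stride_zm: int, stride_zn: int) -> int:
        # Mirrors z_offset_base in the kernel: find this row's nonzero count.
        b = pid // (M * N)        # batch index
        m = (pid % (M * N)) // N  # row index within the batch
        n = (pid % (M * N)) % N   # inner index within (batch, m)
        return b * stride_zb + m * stride_zm + n * stride_zn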
@@ -2394,7 +2417,9 @@ def _kernel_quantize_fp8_row(
 
     for _k in range(0, tl.cdiv(K, BLOCK_SIZE)):
         a = tl.load(
-            A + a_offset_base + n_offset * stride_ak, mask=n_offset < K, other=0.0
+            A + a_offset_base + n_offset * stride_ak,
+            mask=n_offset < row_size,
+            other=0.0,
         )
         a_fp8 = a * a_scale
         # Clamp A to fp8 range to make sure there's no overflow.
@@ -2403,20 +2428,25 @@ def _kernel_quantize_fp8_row(
         a_fp8 = tl.clamp(a_fp8, -MAX_FP8, MAX_FP8)
         a_fp8.to(TL_FP8_DTYPE)
         tl.store(
-            A_fp8 + a_fp8_offset_base + n_offset * stride_ok, a_fp8, mask=n_offset < K
+            A_fp8 + a_fp8_offset_base + n_offset * stride_ok,
+            a_fp8,
+            mask=n_offset < K,
         )
         n_offset += BLOCK_SIZE
 
 
 def triton_quantize_fp8_row(
-    a: Tensor, scale_ub: Optional[Tensor] = None
+    a: Tensor,
+    scale_ub: Optional[Tensor] = None,
+    zero_start_index_M: Optional[Tensor] = None,
 ) -> Tuple[Tensor, Tensor]:
     """
     Call the triton quantize fp8 row kernel to quantize a tensor to fp8 with row-wise scalings.
 
     Args:
         a (Tensor): higher precision input tensor of 4 dimension.
         scale_ub (Tensor): Maximum allowed value for scale.
+        zero_start_index_M (Tensor): Indicates number of nonzero elements in each row.
 
     Returns:
         torch.Tensor: fp8 scaled tensor.
@@ -2436,6 +2466,7 @@ def triton_quantize_fp8_row(
         a_scale,
         a_fp8,
         scale_ub,
+        zero_start_index_M,
         a.shape[0],
         a.shape[1],
         a.shape[2],
@@ -2448,20 +2479,25 @@ def triton_quantize_fp8_row(
         a_fp8.stride(1),
         a_fp8.stride(2),
         a_fp8.stride(3),
+        zero_start_index_M.stride(0) if zero_start_index_M is not None else None,
+        zero_start_index_M.stride(1) if zero_start_index_M is not None else None,
+        zero_start_index_M.stride(2) if zero_start_index_M is not None else None,
         TL_FP8_DTYPE=tl_dtype,
         MAX_FP8=max_fp8,
         EPS=eps,
         CLAMP_MAX=scale_ub is not None,
+        JAGGED=zero_start_index_M is not None,
         USE_INT64=use_int64,
     )
 
-    return a_fp8, a_scale.view(a.shape[:-1])
+    return a_fp8, a_scale
 
 
 @torch.library.custom_op("triton::quantize_fp8_row", mutates_args=())
 def quantize_fp8_row(
     a: Tensor,
     scale_ub: Optional[Tensor] = None,
+    zero_start_index_M: Optional[Tensor] = None,
     use_triton: bool = True,
     output_device: Optional[torch.device] = None,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -2471,6 +2507,7 @@ def quantize_fp8_row(
     Args:
         a (Tensor): Input high precision tensor. Required to have no more than 4 dimension
         scale_ub (Tensor): Maximum allowed value for scale.
+        zero_start_index_M (Tensor): Indicates number of nonzero elements in each row.
         use_triton (bool): Whether to use triton kernel or pytorch.
         output_device (torch.device): Device to optionally move the scaled tensors to.
 
@@ -2489,8 +2526,11 @@ def quantize_fp8_row(
         a_shape = a.shape
         while a.dim() < 4:
             a = a.unsqueeze(0)
-        a_fp8, a_scale = triton_quantize_fp8_row(a, scale_ub)
-        return a_fp8.view(a_shape), a_scale
+        if zero_start_index_M is not None:
+            while zero_start_index_M.dim() < 3:
+                zero_start_index_M = zero_start_index_M.unsqueeze(0)
+        a_fp8, a_scale = triton_quantize_fp8_row(a, scale_ub, zero_start_index_M)
+        return a_fp8.view(a_shape), a_scale.view(a_shape[:-1])
     # else use pytorch implementation.
     if not output_device:
         output_device = a.device
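
As shown above, the wrapper pads the input to 4-D and the jagged index to 3-D before dispatching to the Triton path, so the kernel always sees one zero_start_index_M entry per row. A small illustration of the implied shape relation (hypothetical sizes, not from the commit):

    import torch

    a = torch.randn(3, 5, 16)                          # [M, N, K] input
    zero_start_index_M = torch.randint(0, 17, (3, 5))  # one entry per row: [M, N]

    while a.dim() < 4:                       # -> [1, 3, 5, 16]
        a = a.unsqueeze(0)
    while zero_start_index_M.dim() < 3:      # -> [1, 3, 5]
        zero_start_index_M = zero_start_index_M.unsqueeze(0)
    assert a.shape[:-1] == zero_start_index_M.shape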
@@ -2513,7 +2553,7 @@ def quantize_fp8_row(
     a_fp8 = a_fp8.to(device=output_device, dtype=pt_dtype)
     a_scale = a_scale.to(output_device)  # pyre-ignore
     del a
-    return a_fp8, (1 / a_scale).view(a_shape.shape[:-1])  # pyre-ignore
+    return a_fp8, (1 / a_scale).view(a_shape[:-1])  # pyre-ignore
 
 
 @quantize_fp8_row.register_fake
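
For intuition about what the jagged path computes, here is a rough pure-PyTorch reference. It is a sketch only, assuming the kernel scales each row by MAX_FP8 / max(|row[:row_size]|) and returns the reciprocal scale, as the non-Triton branch of quantize_fp8_row suggests; it is not the FBGEMM implementation:

    import torch

    def jagged_quantize_fp8_row_ref(a, zero_start_index_M, eps=1e-12):
        # Row-wise fp8 quantization that ignores the zeroed tail of each row.
        max_fp8 = torch.finfo(torch.float8_e4m3fn).max
        k = a.shape[-1]
        valid = torch.arange(k, device=a.device) < zero_start_index_M.unsqueeze(-1)
        row_max = (a.abs() * valid).amax(dim=-1).clamp_min(eps)  # max over the populated prefix
        scale = max_fp8 / row_max                                # per-row quantization scale
        a_fp8 = (a * scale.unsqueeze(-1)).clamp(-max_fp8, max_fp8).to(torch.float8_e4m3fn)
        return a_fp8, 1.0 / scale                                # fp8 tensor, per-row dequant scale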
