medmekk committed
Commit · 4b0f2f8 · 1 Parent(s): 387ff11
update
Browse files
- .gitignore +6 -1
- CMakeLists.txt +3 -3
- build.toml +5 -2
- sage_attention/__init__.py +0 -11
- sage_attention/fused/fused.cu +1 -1
- sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu +1 -1
- sage_attention/utils.cuh +2 -2
- tests/test_core.py +73 -0
- tests/test_quant.py +146 -0
- tests/test_sage_attention.py +55 -2
- torch-ext/sage_attention/__init__.py +4 -0
- torch-ext/sage_attention/_sage_attention_57cb7ec_dirty.abi3.so +2 -2
- torch-ext/sage_attention/core.py +983 -0
- torch-ext/sage_attention/quant.py +74 -45
- torch-ext/sage_attention/quant_per_thread.py +204 -0
- torch-ext/torch_binding.cpp +21 -6
.gitignore CHANGED
@@ -1 +1,6 @@
-.venv/
+.venv/
+tests/__pycache__/
+torch-ext/__pycache__/
+torch-ext/sage_attention/__pycache__/
+torch-ext/sage_attention/fused/__pycache__/
+torch-ext/sage_attention/qattn/__pycache__/
CMakeLists.txt CHANGED
@@ -151,7 +151,7 @@ set_source_files_properties(
   PROPERTIES INCLUDE_DIRECTORIES "${CMAKE_SOURCE_DIR}/.")
 
 if(GPU_LANG STREQUAL "CUDA")
-  cuda_archs_loose_intersection(_qattn_sm90_ARCHS "9.0a" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(_qattn_sm90_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
   message(STATUS "Capabilities for kernel _qattn_sm90: ${_qattn_sm90_ARCHS}")
   set_gencode_flags_for_srcs(SRCS "${_qattn_sm90_SRC}" CUDA_ARCHS "${_qattn_sm90_ARCHS}")
 
@@ -239,7 +239,7 @@ set_source_files_properties(
   PROPERTIES INCLUDE_DIRECTORIES "${CMAKE_SOURCE_DIR}/.")
 
 if(GPU_LANG STREQUAL "CUDA")
-  cuda_archs_loose_intersection(_fused_ARCHS "9.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(_fused_ARCHS "8.0;8.9;9.0;9.0a" "${CUDA_ARCHS}")
   message(STATUS "Capabilities for kernel _fused: ${_fused_ARCHS}")
   set_gencode_flags_for_srcs(SRCS "${_fused_SRC}" CUDA_ARCHS "${_fused_ARCHS}")
 
@@ -281,7 +281,7 @@ set(_qattn_SRC
 
 
 if(GPU_LANG STREQUAL "CUDA")
-  cuda_archs_loose_intersection(_qattn_ARCHS "9.0" "${CUDA_ARCHS}")
+  cuda_archs_loose_intersection(_qattn_ARCHS "8.0;8.9;9.0;9.0a" "${CUDA_ARCHS}")
   message(STATUS "Capabilities for kernel _qattn: ${_qattn_ARCHS}")
   set_gencode_flags_for_srcs(SRCS "${_qattn_SRC}" CUDA_ARCHS "${_qattn_ARCHS}")
 
build.toml CHANGED
@@ -7,6 +7,9 @@ src = [
   "torch-ext/torch_binding.cpp",
   "torch-ext/torch_binding.h",
 ]
+cuda-capabilities = [
+  "8.0", "9.0"
+]
 
 [kernel._qattn]
 depends = ["torch"]
@@ -23,7 +26,7 @@ src = [
   "sage_attention/permuted_smem.cuh",
   "sage_attention/reduction_utils.cuh",
   "sage_attention/wgmma.cuh",
-  "sage_attention/utils.cuh"
+  "sage_attention/utils.cuh",
 ]
 cxx-flags = ["-g", "-O3", "-fopenmp", "-lgomp", "-std=c++17", "-DENABLE_BF16"]
 cuda-flags = [
@@ -98,7 +101,7 @@ cuda-flags = [
 depends = ["torch"]
 backend = "cuda"
 cuda-capabilities = [
-  "9.
+  "9.0",
 ]
 include = ["."]
 src = [
sage_attention/__init__.py DELETED
@@ -1,11 +0,0 @@
-"""Top-level package marker for sage_attention.
-
-This file allows tools that scan the repository (e.g., get_local_kernel)
-to locate the Python package at the repo root during build-time checks.
-
-It intentionally avoids importing heavy runtime dependencies.
-"""
-
-__all__: list[str] = []
-
-
sage_attention/fused/fused.cu CHANGED
@@ -369,7 +369,7 @@ __global__ void MeanScaleKernel(T *__restrict__ input, int8_t *__restrict__ outp
 
   float block_max_val = vllm::blockReduceMax(max_val);
   float block_min_val = vllm::blockReduceMin(min_val);
-  float block_sum_val;
+  float block_sum_val = 0.0f;
 
   if constexpr (sub_mean)
   {
sage_attention/qattn/qk_int_sv_f8_cuda_sm90.cu CHANGED
@@ -45,7 +45,7 @@ CUtensorMap create_tensor_map_4D(T* gmem_ptr, int d1, int d2, int d3, int d4, in
   uint32_t smem_box_shape[5] = {uint32_t(BlockMinorSize), uint32_t(BlockMajorSize), 1, 1, 1};
   uint32_t smem_box_stride[5] = {1, 1, 1, 1, 1};
 
-  auto fn = reinterpret_cast<decltype(&cuTensorMapEncodeTiled)>(getCUDALibrary().sym(
+  auto fn = reinterpret_cast<decltype(&cuTensorMapEncodeTiled)>(getCUDALibrary().sym("cuTensorMapEncodeTiled"));
   if (!fn)
     throw std::runtime_error("Can't get cuTensorMapEncodeTiled");
   CUresult result = fn(
sage_attention/utils.cuh CHANGED
@@ -34,5 +34,5 @@
 #define CHECK_CONTIGUOUS(x) \
   TORCH_CHECK(x.is_contiguous(), "Tensor " #x " must be contiguous")
 #define CHECK_LASTDIM_CONTIGUOUS(x) \
-  TORCH_CHECK(x.stride(-1) == 1,
-              "Tensor " #x " must be contiguous at the last dimension")
+  TORCH_CHECK(x.stride(-1) == 1, \
+              "Tensor " #x " must be contiguous at the last dimension")
tests/test_core.py ADDED
@@ -0,0 +1,73 @@
+import math
+import pytest
+import torch
+
+import sage_attention as sa
+
+
+cuda_available = torch.cuda.is_available()
+
+
+def current_sm():
+    if not cuda_available:
+        return None
+    major, minor = torch.cuda.get_device_capability(0)
+    return f"sm{major}{minor}"
+
+
+@pytest.mark.skipif(not cuda_available, reason="CUDA is required")
+@pytest.mark.parametrize("tensor_layout", ["HND", "NHD"])
+@pytest.mark.parametrize("head_dim", [64, 128])
+@pytest.mark.parametrize("return_lse", [False, True])
+def test_sageattn_runs_and_shapes(tensor_layout, head_dim, return_lse):
+    device = "cuda"
+    dtype = torch.float16
+
+    # Small, nontrivial shapes; pad path will be exercised for head_dim=64
+    if tensor_layout == "HND":
+        q = torch.randn(2, 6, 129, head_dim, dtype=dtype, device=device)
+        k = torch.randn(2, 3, 257, head_dim, dtype=dtype, device=device)
+        v = torch.randn(2, 3, 257, head_dim, dtype=dtype, device=device)
+        expected_o_shape = (2, 6, 129, head_dim)
+        expected_lse_shape = (2, 6, 129)
+    else:
+        q = torch.randn(2, 129, 6, head_dim, dtype=dtype, device=device)
+        k = torch.randn(2, 257, 3, head_dim, dtype=dtype, device=device)
+        v = torch.randn(2, 257, 3, head_dim, dtype=dtype, device=device)
+        expected_o_shape = (2, 129, 6, head_dim)
+        expected_lse_shape = (2, 6, 129)
+
+    sm = current_sm()
+
+    # Some backends may not be compiled on this GPU; skip gracefully if unsupported
+    try:
+        out = sa.sageattn(
+            q, k, v, tensor_layout=tensor_layout, is_causal=False, return_lse=return_lse
+        )
+    except ValueError as e:
+        if "Unsupported CUDA architecture" in str(e):
+            pytest.skip(f"Unsupported arch for this build: {sm}")
+        raise
+
+    if return_lse:
+        o, lse = out
+        assert lse.shape == expected_lse_shape and torch.isfinite(lse).all()
+    else:
+        o = out
+
+    assert o.shape == expected_o_shape
+    assert o.dtype == dtype
+    assert o.device.type == "cuda"
+
+
+@pytest.mark.skipif(not cuda_available, reason="CUDA is required")
+def test_sageattn_raises_on_unsupported_head_dim():
+    device = "cuda"
+    dtype = torch.float16
+    # head_dim > 128 should raise
+    q = torch.randn(1, 2, 8, 192, dtype=dtype, device=device)
+    k = torch.randn(1, 1, 8, 192, dtype=dtype, device=device)
+    v = torch.randn(1, 1, 8, 192, dtype=dtype, device=device)
+
+    with pytest.raises(ValueError):
+        sa.sageattn(q, k, v)
tests/test_quant.py ADDED
@@ -0,0 +1,146 @@
+import math
+import pytest
+import torch
+
+import sage_attention as sa
+
+
+cuda_available = torch.cuda.is_available()
+
+
+@pytest.mark.skipif(not cuda_available, reason="CUDA is required")
+@pytest.mark.parametrize("tensor_layout", ["HND", "NHD"])
+def test_per_block_int8_shapes_and_types(tensor_layout):
+    device = "cuda"
+    dtype = torch.float16
+
+    if tensor_layout == "HND":
+        q = torch.randn(2, 4, 129, 128, dtype=dtype, device=device)
+        k = torch.randn(2, 4, 257, 128, dtype=dtype, device=device)
+        expected_q_scale_shape = (2, 4, math.ceil(129 / 128))
+        expected_k_scale_shape = (2, 4, math.ceil(257 / 64))
+    else:
+        q = torch.randn(2, 129, 4, 128, dtype=dtype, device=device)
+        k = torch.randn(2, 257, 4, 128, dtype=dtype, device=device)
+        expected_q_scale_shape = (2, 4, math.ceil(129 / 128))
+        expected_k_scale_shape = (2, 4, math.ceil(257 / 64))
+
+    km = (
+        torch.randn(2, 4, 128, dtype=dtype, device=device)
+        if tensor_layout == "HND"
+        else torch.randn(2, 4, 128, dtype=dtype, device=device)
+    )
+
+    q_int8, q_scale, k_int8, k_scale = sa.per_block_int8(
+        q, k, km, tensor_layout=tensor_layout
+    )
+
+    assert q_int8.shape == q.shape and q_int8.dtype == torch.int8
+    assert k_int8.shape == k.shape and k_int8.dtype == torch.int8
+    assert q_scale.shape == expected_q_scale_shape and q_scale.dtype == torch.float32
+    assert k_scale.shape == expected_k_scale_shape and k_scale.dtype == torch.float32
+    assert q_int8.device == q.device == k.device == q_scale.device == k_scale.device
+    assert torch.isfinite(q_scale).all()
+    assert torch.isfinite(k_scale).all()
+
+
+@pytest.mark.skipif(not cuda_available, reason="CUDA is required")
+@pytest.mark.parametrize("tensor_layout", ["HND", "NHD"])
+@pytest.mark.parametrize("head_dim", [64, 128])
+def test_per_warp_int8_shapes_and_types(tensor_layout, head_dim):
+    device = "cuda"
+    dtype = torch.float16
+
+    if tensor_layout == "HND":
+        q = torch.randn(1, 2, 130, head_dim, dtype=dtype, device=device)
+        k = torch.randn(1, 2, 70, head_dim, dtype=dtype, device=device)
+        expected_q_scale_shape = (
+            1,
+            2,
+            math.ceil(130 / 128) * (128 // (16 if head_dim == 128 else 32)),
+        )
+        expected_k_scale_shape = (1, 2, math.ceil(70 / 64))
+    else:
+        q = torch.randn(1, 130, 2, head_dim, dtype=dtype, device=device)
+        k = torch.randn(1, 70, 2, head_dim, dtype=dtype, device=device)
+        expected_q_scale_shape = (
+            1,
+            2,
+            math.ceil(130 / 128) * (128 // (16 if head_dim == 128 else 32)),
+        )
+        expected_k_scale_shape = (1, 2, math.ceil(70 / 64))
+
+    q_int8, q_scale, k_int8, k_scale = sa.per_warp_int8(
+        q,
+        k,
+        tensor_layout=tensor_layout,
+        BLKQ=128,
+        WARPQ=(16 if head_dim == 128 else 32),
+        BLKK=64,
+    )
+
+    assert q_int8.shape == q.shape and q_int8.dtype == torch.int8
+    assert k_int8.shape == k.shape and k_int8.dtype == torch.int8
+    assert q_scale.shape == expected_q_scale_shape and q_scale.dtype == torch.float32
+    assert k_scale.shape == expected_k_scale_shape and k_scale.dtype == torch.float32
+    assert torch.isfinite(q_scale).all()
+    assert torch.isfinite(k_scale).all()
+
+
+@pytest.mark.skipif(not cuda_available, reason="CUDA is required")
+@pytest.mark.parametrize("tensor_layout", ["HND", "NHD"])
+def test_sub_mean_properties(tensor_layout):
+    device = "cuda"
+    dtype = torch.float16
+
+    if tensor_layout == "HND":
+        v = torch.randn(2, 3, 65, 128, dtype=dtype, device=device)
+        seq_dim = 2
+        nh_dim = 1
+    else:
+        v = torch.randn(2, 65, 3, 128, dtype=dtype, device=device)
+        seq_dim = 1
+        nh_dim = 2
+
+    v_smoothed, vm = sa.sub_mean(v, tensor_layout=tensor_layout)
+
+    assert v_smoothed.shape == v.shape and v_smoothed.dtype == torch.float16
+    assert vm.shape == (v.size(0), v.size(nh_dim), v.size(-1)) and vm.dtype == v.dtype
+    # The mean along the sequence dimension of smoothed v should be ~0 (in fp16)
+    mean_after = v_smoothed.mean(dim=seq_dim)
+    assert torch.isfinite(mean_after).all()
+    assert (mean_after.abs() < 1e-1).all()
+
+
+@pytest.mark.skipif(not cuda_available, reason="CUDA is required")
+@pytest.mark.parametrize("tensor_layout", ["HND", "NHD"])
+@pytest.mark.parametrize("smooth_v", [True, False])
+def test_per_channel_fp8_shapes_and_outputs(tensor_layout, smooth_v):
+    device = "cuda"
+    dtype = torch.float16
+
+    if tensor_layout == "HND":
+        v = torch.randn(2, 3, 77, 128, dtype=dtype, device=device)
+        kv_len = v.size(2)
+    else:
+        v = torch.randn(2, 77, 3, 128, dtype=dtype, device=device)
+        kv_len = v.size(1)
+
+    v_fp8, v_scale, vm = sa.per_channel_fp8(
+        v, tensor_layout=tensor_layout, smooth_v=smooth_v
+    )
+
+    assert v_fp8.dtype == torch.float8_e4m3fn
+    assert v_scale.shape == (2, 3, 128)
+    if smooth_v:
+        assert vm is not None and vm.shape == (2, 3, 128) and vm.dtype == torch.float32
+    else:
+        assert vm is None
+
+    # Padded seq len should be multiple of 64
+    padded_len = ((kv_len + 63) // 64) * 64
+    if tensor_layout == "HND":
+        assert v_fp8.shape == (2, 3, 128, padded_len)
+    else:
+        assert v_fp8.shape == (2, 128, 3, padded_len)
+    assert torch.isfinite(v_scale).all()
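For context on the scale shapes asserted above: they follow from the block sizes the kernels quantize over (128 query rows per block, 64 key rows per block). The sketch below shows one way per-block scales could be mapped back onto the sequence dimension for an HND tensor; the exact row-to-scale mapping inside `per_block_int8` is an assumption here, and `approx_dequant_per_block` is a hypothetical helper for illustration only, not part of this commit.

```python
import math
import torch


def approx_dequant_per_block(x_int8: torch.Tensor, x_scale: torch.Tensor, blk: int) -> torch.Tensor:
    """Illustrative sketch (HND layout): assume scale i covers rows [i*blk, (i+1)*blk)."""
    b, h, seq, _ = x_int8.shape
    assert x_scale.shape == (b, h, math.ceil(seq / blk))
    # Repeat each block scale across its rows, trim the padded tail, then rescale.
    expanded = x_scale.repeat_interleave(blk, dim=-1)[..., :seq]  # (b, h, seq)
    return x_int8.float() * expanded.unsqueeze(-1)


# e.g. with q_int8, q_scale from sa.per_block_int8(...) and BLKQ=128:
# q_approx = approx_dequant_per_block(q_int8, q_scale, blk=128)
```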
tests/test_sage_attention.py CHANGED
@@ -3,10 +3,63 @@ import torch
 
 print(dir(sage_attention))
 
+# Skip tests gracefully if CUDA is unavailable (e.g., no driver/GPU present)
+if not torch.cuda.is_available():
+    print("CUDA is not available; skipping SageAttention tests.")
+    raise SystemExit(0)
+
+
 def test_per_block_int8():
     q = torch.randn(1, 1024, 1024, 128, dtype=torch.float16, device="cuda")
     k = torch.randn(1, 1024, 1024, 128, dtype=torch.float16, device="cuda")
-
+    km = torch.randn(1, 1024, 128, dtype=torch.float16, device="cuda")
+    q_int8, q_scale, k_int8, k_scale = sage_attention.per_block_int8(q, k, km)
     print(q_int8.shape, q_scale.shape, k_int8.shape, k_scale.shape)
 
-
+
+# test_per_block_int8()
+
+
+def test_per_channel_fp8():
+    v = torch.randn(1, 1024, 1024, 128, dtype=torch.float16, device="cuda")
+    v_fp8, v_scale, vm = sage_attention.per_channel_fp8(
+        v, tensor_layout="HND", smooth_v=True
+    )
+    print(v_fp8.shape, v_scale.shape, vm.shape)
+
+
+def test_sageattn():
+    # The error is about the expected shape of query_scale, which is derived from the kernel's block sizes.
+    # Let's use a shape for q, k, v that matches the kernel's expectations for scale shape.
+    # For HND: (batch, nheads, seqlen, head_dim)
+    # Let's use seqlen = 128, which is a multiple of CTA_Q=128, WARP_Q=16.
+    # This will make div_ceil(qo_len, CTA_Q) = 1, (CTA_Q / WARP_Q) = 8, so scale shape = (1, 1024, 8)
+    q = torch.randn(1, 1024, 128, 128, dtype=torch.float16, device="cuda")
+    k = torch.randn(1, 1024, 128, 128, dtype=torch.float16, device="cuda")
+    v = torch.randn(1, 1024, 128, 128, dtype=torch.float16, device="cuda")
+    # Compare SageAttention to standard attention
+    # o_sage = sage_attention.sageattn(
+    #     q, k, v, tensor_layout="HND", is_causal=False, return_lse=False
+    # )
+    o_sage = sage_attention.sageattn_qk_int8_pv_fp8_cuda(
+        q, k, v,
+        tensor_layout="HND",
+        is_causal=False,
+        qk_quant_gran="per_warp",  # switch from per_warp
+        pv_accum_dtype="fp32+fp32",  # required for sm90
+        return_lse=False,
+    )
+    # Force sync so any async kernel failure surfaces here (not on the next cuBLAS call)
+
+    # Standard attention for comparison
+    # q, k, v: (batch, nheads, seqlen, head_dim)
+    attn_scores = torch.matmul(q, k.transpose(-2, -1)) / (q.shape[-1] ** 0.5)
+    attn_probs = torch.softmax(attn_scores, dim=-1)
+    o_ref = torch.matmul(attn_probs, v)
+
+    print("SageAttention output shape:", o_sage.shape)
+    print("Standard attention output shape:", o_ref.shape)
+    print("Max abs diff:", (o_sage.float() - o_ref).abs().max().item())
+
+
+test_sageattn()
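The shape arithmetic quoted in the comment inside `test_sageattn` can be checked directly. This standalone snippet only reproduces that calculation with the values named in the comment (CTA_Q=128, WARP_Q=16); it does not touch the kernels.

```python
import math

# Values quoted in the comment above.
batch, nheads, qo_len, CTA_Q, WARP_Q = 1, 1024, 128, 128, 16

q_scale_blocks = math.ceil(qo_len / CTA_Q) * (CTA_Q // WARP_Q)
print((batch, nheads, q_scale_blocks))  # -> (1, 1024, 8), the expected query_scale shape
```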
torch-ext/sage_attention/__init__.py CHANGED
@@ -1,8 +1,12 @@
 from .quant import per_block_int8, per_warp_int8, sub_mean, per_channel_fp8
+from .core import sageattn, sageattn_qk_int8_pv_fp8_cuda
+
 
 __all__ = [
     "per_block_int8",
     "per_warp_int8",
     "sub_mean",
     "per_channel_fp8",
+    "sageattn",
+    "sageattn_qk_int8_pv_fp8_cuda",
 ]
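With `sageattn` now exported at package level, the typical entry point looks like the minimal sketch below. Shapes and sizes are placeholders; the layout and dtype requirements follow the docstrings in the new `core.py`.

```python
import torch
import sage_attention as sa

# HND layout: [batch, heads, seq_len, head_dim]; fp16 or bf16 on a CUDA device.
q = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
k = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
v = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")

# Dispatches to the kernel variant for the local GPU (sm80/sm89/sm90/sm120 per core.py).
o = sa.sageattn(q, k, v, tensor_layout="HND", is_causal=False)
print(o.shape)  # same layout as q
```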
torch-ext/sage_attention/_sage_attention_57cb7ec_dirty.abi3.so CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:334785260951d33ffef9a14a96c44ec9d38cbbcde8311b7bd84f9a98d3f43f91
+size 24479576
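The new `core.py` added below selects a kernel from the architecture string built out of `torch.cuda.get_device_capability`, and divides the kernel's logsumexp by the constant 1.44269504 before returning it. The snippet here is only an illustration of those two details (it mirrors names from the new code but is not part of the commit); the interpretation that the division converts a base-2 logsumexp to natural log follows from 1.44269504 being log2(e).

```python
import math
import torch

# Which dispatch branch would this machine take? "sm90" -> sageattn_qk_int8_pv_fp8_cuda_sm90, etc.
major, minor = torch.cuda.get_device_capability(0)
print(f"sm{major}{minor}")

# The 1.44269504 constant in core.py is log2(e); dividing by it maps log2-sums to natural-log sums.
print(math.log2(math.e))  # ~1.4426950408889634
```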
torch-ext/sage_attention/core.py ADDED
@@ -0,0 +1,983 @@
+"""
+Copyright (c) 2024 by SageAttention team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import torch
+import torch.nn.functional as F
+
+from ._ops import ops
+
+
+from .quant import per_warp_int8 as per_warp_int8_cuda
+from .quant import sub_mean
+from .quant import per_channel_fp8
+from .quant_per_thread import per_thread_int8 as per_thread_int8_triton
+
+from typing import Any, List, Literal, Optional, Tuple, Union
+import warnings
+
+
+import subprocess
+import re
+
+
+def get_cuda_version():
+    try:
+        output = subprocess.check_output(["nvcc", "--version"]).decode()
+        match = re.search(r"release (\d+)\.(\d+)", output)
+        if match:
+            major, minor = int(match.group(1)), int(match.group(2))
+            return major, minor
+    except Exception as e:
+        print("Failed to get CUDA version:", e)
+    return None, None
+
+
+def get_cuda_arch_versions():
+    cuda_archs = []
+    for i in range(torch.cuda.device_count()):
+        major, minor = torch.cuda.get_device_capability(i)
+        cuda_archs.append(f"sm{major}{minor}")
+    return cuda_archs
+
+
+def sageattn(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    tensor_layout: str = "HND",
+    is_causal: bool = False,
+    sm_scale: Optional[float] = None,
+    return_lse: bool = False,
+    **kwargs: Any,
+):
+    """
+    Automatically selects the appropriate implementation of the SageAttention kernel based on the GPU compute capability.
+
+    Parameters
+    ----------
+    q : torch.Tensor
+        The query tensor. Shape:
+        - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
+        - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
+
+    k : torch.Tensor
+        The key tensor. Shape:
+        - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
+        - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
+
+    v : torch.Tensor
+        The value tensor. Shape:
+        - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
+        - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
+
+    tensor_layout : str
+        The tensor layout, either "HND" or "NHD".
+        Default: "HND".
+
+    is_causal : bool
+        Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
+        Default: False.
+
+    sm_scale : Optional[float]
+        The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
+
+    return_lse : bool
+        Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
+        Default: False.
+
+    Returns
+    -------
+    torch.Tensor
+        The output tensor. Shape:
+        - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
+        - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
+
+    torch.Tensor
+        The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor).
+        Shape: ``[batch_size, num_qo_heads, qo_len]``.
+        Only returned if `return_lse` is True.
+
+    Note
+    ----
+    - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
+    - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
+    - All tensors must be on the same cuda device.
+    """
+
+    arch = get_cuda_arch_versions()[q.device.index]
+    if arch == "sm80":
+        return sageattn_qk_int8_pv_fp16_cuda(
+            q,
+            k,
+            v,
+            tensor_layout=tensor_layout,
+            is_causal=is_causal,
+            sm_scale=sm_scale,
+            return_lse=return_lse,
+            pv_accum_dtype="fp32",
+        )
+    elif arch == "sm89":
+        return sageattn_qk_int8_pv_fp8_cuda(
+            q,
+            k,
+            v,
+            tensor_layout=tensor_layout,
+            is_causal=is_causal,
+            sm_scale=sm_scale,
+            return_lse=return_lse,
+            pv_accum_dtype="fp32+fp16",
+        )
+    elif arch == "sm90":
+        return sageattn_qk_int8_pv_fp8_cuda_sm90(
+            q,
+            k,
+            v,
+            tensor_layout=tensor_layout,
+            is_causal=is_causal,
+            sm_scale=sm_scale,
+            return_lse=return_lse,
+            pv_accum_dtype="fp32+fp32",
+        )
+    elif arch == "sm120":
+        return sageattn_qk_int8_pv_fp8_cuda(
+            q,
+            k,
+            v,
+            tensor_layout=tensor_layout,
+            is_causal=is_causal,
+            qk_quant_gran="per_warp",
+            sm_scale=sm_scale,
+            return_lse=return_lse,
+            pv_accum_dtype="fp32+fp16",
+        )  # sm120 has accurate fp32 accumulator for fp8 mma and triton kernel is currently not usable on sm120.
+    else:
+        raise ValueError(f"Unsupported CUDA architecture: {arch}")
+
+
+
@torch.compiler.disable
|
| 171 |
+
def sageattn_qk_int8_pv_fp16_cuda(
|
| 172 |
+
q: torch.Tensor,
|
| 173 |
+
k: torch.Tensor,
|
| 174 |
+
v: torch.Tensor,
|
| 175 |
+
tensor_layout: str = "HND",
|
| 176 |
+
is_causal: bool = False,
|
| 177 |
+
qk_quant_gran: str = "per_thread",
|
| 178 |
+
sm_scale: Optional[float] = None,
|
| 179 |
+
pv_accum_dtype: str = "fp32",
|
| 180 |
+
smooth_k: bool = True,
|
| 181 |
+
smooth_v: bool = False,
|
| 182 |
+
return_lse: bool = False,
|
| 183 |
+
**kwargs: Any,
|
| 184 |
+
) -> torch.Tensor:
|
| 185 |
+
"""
|
| 186 |
+
SageAttention with INT8 quantization for Q and K, FP16 PV with FP16/FP32 accumulation, implemented using CUDA.
|
| 187 |
+
|
| 188 |
+
Parameters
|
| 189 |
+
----------
|
| 190 |
+
q : torch.Tensor
|
| 191 |
+
The query tensor. Shape:
|
| 192 |
+
- If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
|
| 193 |
+
- If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
|
| 194 |
+
|
| 195 |
+
k : torch.Tensor
|
| 196 |
+
The key tensor. Shape:
|
| 197 |
+
- If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
|
| 198 |
+
- If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
|
| 199 |
+
|
| 200 |
+
v : torch.Tensor
|
| 201 |
+
The value tensor. Shape:
|
| 202 |
+
- If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
|
| 203 |
+
- If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
|
| 204 |
+
|
| 205 |
+
tensor_layout : str
|
| 206 |
+
The tensor layout, either "HND" or "NHD".
|
| 207 |
+
Default: "HND".
|
| 208 |
+
|
| 209 |
+
is_causal : bool
|
| 210 |
+
Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
|
| 211 |
+
Default: False.
|
| 212 |
+
|
| 213 |
+
qk_quant_gran : str
|
| 214 |
+
The granularity of quantization for Q and K, either "per_warp" or "per_thread".
|
| 215 |
+
Default: "per_thread".
|
| 216 |
+
|
| 217 |
+
sm_scale : Optional[float]
|
| 218 |
+
The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
|
| 219 |
+
|
| 220 |
+
pv_accum_dtype : str
|
| 221 |
+
The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp16", "fp16+fp32" or "fp32".
|
| 222 |
+
- "fp16": PV accumulation is done in fully in FP16. This is the fastest option but may lead to numerical instability. `smooth_v` option will increase the accuracy in cases when the value tensor has a large bias (like in CogVideoX-2b).
|
| 223 |
+
- "fp32": PV accumulation is done in FP32. This is the most accurate option but may be slower than "fp16" due to CUDA core overhead.
|
| 224 |
+
- "fp16+fp32": PV accumulation is done in FP16, but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy.
|
| 225 |
+
Default: "fp32".
|
| 226 |
+
|
| 227 |
+
smooth_k : bool
|
| 228 |
+
Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
|
| 229 |
+
Default: True.
|
| 230 |
+
|
| 231 |
+
smooth_v : bool
|
| 232 |
+
Whether to smooth the value tensor by subtracting the mean along the sequence dimension.
|
| 233 |
+
smooth_v will be ignored if pv_accum_dtype is "fp32" or "fp16+fp32".
|
| 234 |
+
Default: False.
|
| 235 |
+
|
| 236 |
+
return_lse : bool
|
| 237 |
+
Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
|
| 238 |
+
Default: False.
|
| 239 |
+
|
| 240 |
+
Returns
|
| 241 |
+
-------
|
| 242 |
+
torch.Tensor
|
| 243 |
+
The output tensor. Shape:
|
| 244 |
+
- If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
|
| 245 |
+
- If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
|
| 246 |
+
|
| 247 |
+
torch.Tensor
|
| 248 |
+
The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor).
|
| 249 |
+
Shape: ``[batch_size, num_qo_heads, qo_len]``.
|
| 250 |
+
Only returned if `return_lse` is True.
|
| 251 |
+
|
| 252 |
+
Note
|
| 253 |
+
----
|
| 254 |
+
- ``num_qo_heads`` must be divisible by ``num_kv_heads``.
|
| 255 |
+
- The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
|
| 256 |
+
- All tensors must be on the same cuda device.
|
| 257 |
+
- `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
|
| 258 |
+
"""
|
| 259 |
+
|
| 260 |
+
dtype = q.dtype
|
| 261 |
+
assert q.is_cuda, "Input tensors must be on cuda."
|
| 262 |
+
assert dtype in [torch.float16, torch.bfloat16], (
|
| 263 |
+
"Input tensors must be in dtype of torch.float16 or torch.bfloat16"
|
| 264 |
+
)
|
| 265 |
+
assert qk_quant_gran in ["per_warp", "per_thread"], (
|
| 266 |
+
"qk_quant_gran must be either 'per_warp' or 'per_thread'."
|
| 267 |
+
)
|
| 268 |
+
assert q.device == k.device == v.device, "All tensors must be on the same device."
|
| 269 |
+
assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
|
| 270 |
+
|
| 271 |
+
# FIXME(DefTruth): make sage attention work compatible with distributed
|
| 272 |
+
# env, for example, xDiT which launch by torchrun. Without this workaround,
|
| 273 |
+
# sage attention will run into illegal memory access error after first
|
| 274 |
+
# inference step in distributed env for multi gpus inference. This small
|
| 275 |
+
# workaround also make sage attention work compatible with torch.compile
|
| 276 |
+
# through non-fullgraph compile mode.
|
| 277 |
+
torch.cuda.set_device(v.device)
|
| 278 |
+
|
| 279 |
+
_tensor_layout = 0 if tensor_layout == "NHD" else 1
|
| 280 |
+
_is_caual = 1 if is_causal else 0
|
| 281 |
+
_qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
|
| 282 |
+
_return_lse = 1 if return_lse else 0
|
| 283 |
+
|
| 284 |
+
head_dim_og = q.size(-1)
|
| 285 |
+
|
| 286 |
+
if head_dim_og < 64:
|
| 287 |
+
q = torch.nn.functional.pad(q, (0, 64 - head_dim_og))
|
| 288 |
+
k = torch.nn.functional.pad(k, (0, 64 - head_dim_og))
|
| 289 |
+
v = torch.nn.functional.pad(v, (0, 64 - head_dim_og))
|
| 290 |
+
elif head_dim_og > 64 and head_dim_og < 128:
|
| 291 |
+
q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
|
| 292 |
+
k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
|
| 293 |
+
v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
|
| 294 |
+
elif head_dim_og > 128:
|
| 295 |
+
raise ValueError(f"Unsupported head_dim: {head_dim_og}")
|
| 296 |
+
|
| 297 |
+
# assert last dim is contiguous
|
| 298 |
+
assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, (
|
| 299 |
+
"Last dim of qkv must be contiguous."
|
| 300 |
+
)
|
| 301 |
+
|
| 302 |
+
if sm_scale is None:
|
| 303 |
+
sm_scale = head_dim_og**-0.5
|
| 304 |
+
|
| 305 |
+
seq_dim = 1 if _tensor_layout == 0 else 2
|
| 306 |
+
nh_dim = 2 if _tensor_layout == 0 else 1
|
| 307 |
+
|
| 308 |
+
if smooth_k:
|
| 309 |
+
km = k.mean(dim=seq_dim, keepdim=True)
|
| 310 |
+
nqheads = q.size(2)
|
| 311 |
+
nkheads = k.size(2)
|
| 312 |
+
q_per_kv_heads = nqheads // nkheads
|
| 313 |
+
if q_per_kv_heads > 1:
|
| 314 |
+
# nheads_k => nheads_q
|
| 315 |
+
km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim)
|
| 316 |
+
else:
|
| 317 |
+
km_broadcast = km
|
| 318 |
+
if return_lse:
|
| 319 |
+
if tensor_layout == "NHD":
|
| 320 |
+
lse_correction = (
|
| 321 |
+
torch.matmul(
|
| 322 |
+
q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)
|
| 323 |
+
)
|
| 324 |
+
.squeeze(-1)
|
| 325 |
+
.to(torch.float32)
|
| 326 |
+
)
|
| 327 |
+
else:
|
| 328 |
+
lse_correction = (
|
| 329 |
+
torch.matmul(q, km_broadcast.transpose(2, 3))
|
| 330 |
+
.squeeze(-1)
|
| 331 |
+
.to(torch.float32)
|
| 332 |
+
)
|
| 333 |
+
else:
|
| 334 |
+
km = None
|
| 335 |
+
|
| 336 |
+
if qk_quant_gran == "per_warp":
|
| 337 |
+
q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda(
|
| 338 |
+
q,
|
| 339 |
+
k,
|
| 340 |
+
km,
|
| 341 |
+
tensor_layout=tensor_layout,
|
| 342 |
+
BLKQ=128,
|
| 343 |
+
WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32),
|
| 344 |
+
BLKK=64,
|
| 345 |
+
)
|
| 346 |
+
elif qk_quant_gran == "per_thread":
|
| 347 |
+
q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton(
|
| 348 |
+
q,
|
| 349 |
+
k,
|
| 350 |
+
km,
|
| 351 |
+
tensor_layout=tensor_layout,
|
| 352 |
+
BLKQ=128,
|
| 353 |
+
WARPQ=(16 if (q.size(-1) == 128 and pv_accum_dtype == "fp16+fp32") else 32),
|
| 354 |
+
BLKK=64,
|
| 355 |
+
WARPK=64,
|
| 356 |
+
)
|
| 357 |
+
|
| 358 |
+
o = torch.empty(q.size(), dtype=dtype, device=q.device)
|
| 359 |
+
|
| 360 |
+
if pv_accum_dtype in ["fp32", "fp16+fp32"] and smooth_v:
|
| 361 |
+
warnings.warn(f"pv_accum_dtype is {pv_accum_dtype}, smooth_v will be ignored.")
|
| 362 |
+
smooth_v = False
|
| 363 |
+
|
| 364 |
+
if pv_accum_dtype == "fp32":
|
| 365 |
+
v = v.to(torch.float16)
|
| 366 |
+
lse = _qattn_sm80.qk_int8_sv_f16_accum_f32_attn(
|
| 367 |
+
q_int8,
|
| 368 |
+
k_int8,
|
| 369 |
+
v,
|
| 370 |
+
o,
|
| 371 |
+
q_scale,
|
| 372 |
+
k_scale,
|
| 373 |
+
_tensor_layout,
|
| 374 |
+
_is_caual,
|
| 375 |
+
_qk_quant_gran,
|
| 376 |
+
sm_scale,
|
| 377 |
+
_return_lse,
|
| 378 |
+
)
|
| 379 |
+
elif pv_accum_dtype == "fp16":
|
| 380 |
+
if smooth_v:
|
| 381 |
+
smoothed_v, vm = sub_mean(v, tensor_layout=tensor_layout)
|
| 382 |
+
lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_fuse_v_mean_attn(
|
| 383 |
+
q_int8,
|
| 384 |
+
k_int8,
|
| 385 |
+
smoothed_v,
|
| 386 |
+
o,
|
| 387 |
+
q_scale,
|
| 388 |
+
k_scale,
|
| 389 |
+
vm,
|
| 390 |
+
_tensor_layout,
|
| 391 |
+
_is_caual,
|
| 392 |
+
_qk_quant_gran,
|
| 393 |
+
sm_scale,
|
| 394 |
+
_return_lse,
|
| 395 |
+
)
|
| 396 |
+
else:
|
| 397 |
+
v = v.to(torch.float16)
|
| 398 |
+
lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn(
|
| 399 |
+
q_int8,
|
| 400 |
+
k_int8,
|
| 401 |
+
v,
|
| 402 |
+
o,
|
| 403 |
+
q_scale,
|
| 404 |
+
k_scale,
|
| 405 |
+
_tensor_layout,
|
| 406 |
+
_is_caual,
|
| 407 |
+
_qk_quant_gran,
|
| 408 |
+
sm_scale,
|
| 409 |
+
_return_lse,
|
| 410 |
+
)
|
| 411 |
+
elif pv_accum_dtype == "fp16+fp32":
|
| 412 |
+
v = v.to(torch.float16)
|
| 413 |
+
lse = _qattn_sm80.qk_int8_sv_f16_accum_f16_attn_inst_buf(
|
| 414 |
+
q_int8,
|
| 415 |
+
k_int8,
|
| 416 |
+
v,
|
| 417 |
+
o,
|
| 418 |
+
q_scale,
|
| 419 |
+
k_scale,
|
| 420 |
+
_tensor_layout,
|
| 421 |
+
_is_caual,
|
| 422 |
+
_qk_quant_gran,
|
| 423 |
+
sm_scale,
|
| 424 |
+
_return_lse,
|
| 425 |
+
)
|
| 426 |
+
else:
|
| 427 |
+
raise ValueError(f"Unsupported pv_accum_dtype: {pv_accum_dtype}")
|
| 428 |
+
|
| 429 |
+
o = o[..., :head_dim_og]
|
| 430 |
+
|
| 431 |
+
if return_lse:
|
| 432 |
+
return (
|
| 433 |
+
o,
|
| 434 |
+
lse / 1.44269504 + lse_correction * sm_scale
|
| 435 |
+
if smooth_k
|
| 436 |
+
else lse / 1.44269504,
|
| 437 |
+
)
|
| 438 |
+
else:
|
| 439 |
+
return o
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
@torch.compiler.disable
|
| 443 |
+
def sageattn_qk_int8_pv_fp8_cuda(
|
| 444 |
+
q: torch.Tensor,
|
| 445 |
+
k: torch.Tensor,
|
| 446 |
+
v: torch.Tensor,
|
| 447 |
+
tensor_layout: str = "HND",
|
| 448 |
+
is_causal: bool = False,
|
| 449 |
+
qk_quant_gran: str = "per_thread",
|
| 450 |
+
sm_scale: Optional[float] = None,
|
| 451 |
+
pv_accum_dtype: str = "fp32+fp16",
|
| 452 |
+
smooth_k: bool = True,
|
| 453 |
+
smooth_v: bool = False,
|
| 454 |
+
return_lse: bool = False,
|
| 455 |
+
**kwargs: Any,
|
| 456 |
+
) -> torch.Tensor:
|
| 457 |
+
"""
|
| 458 |
+
SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA.
|
| 459 |
+
|
| 460 |
+
Parameters
|
| 461 |
+
----------
|
| 462 |
+
q : torch.Tensor
|
| 463 |
+
The query tensor. Shape:
|
| 464 |
+
- If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
|
| 465 |
+
- If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
|
| 466 |
+
|
| 467 |
+
k : torch.Tensor
|
| 468 |
+
The key tensor. Shape:
|
| 469 |
+
- If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
|
| 470 |
+
- If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
|
| 471 |
+
|
| 472 |
+
v : torch.Tensor
|
| 473 |
+
The value tensor. Shape:
|
| 474 |
+
- If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
|
| 475 |
+
- If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
|
| 476 |
+
|
| 477 |
+
tensor_layout : str
|
| 478 |
+
The tensor layout, either "HND" or "NHD".
|
| 479 |
+
Default: "HND".
|
| 480 |
+
|
| 481 |
+
is_causal : bool
|
| 482 |
+
Whether to apply causal mask to the attention matrix. Only applicable when qo_len == kv_len.
|
| 483 |
+
Default: False.
|
| 484 |
+
|
| 485 |
+
qk_quant_gran : str
|
| 486 |
+
The granularity of quantization for Q and K, either "per_warp" or "per_thread".
|
| 487 |
+
Default: "per_thread".
|
| 488 |
+
|
| 489 |
+
sm_scale : Optional[float]
|
| 490 |
+
The scale used in softmax, if not provided, will be set to ``1.0 / sqrt(head_dim)``.
|
| 491 |
+
|
| 492 |
+
pv_accum_dtype : str
|
| 493 |
+
The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32".
|
| 494 |
+
- "fp32": PV accumulation is done in fully in FP32. However, due to the hardware issue, there are only 22 valid bits in the FP32 accumulator.
|
| 495 |
+
- "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to a FP32 buffer every few iterations. This offers a balance between speed and accuracy.
|
| 496 |
+
Default: "fp32+fp32".
|
| 497 |
+
|
| 498 |
+
smooth_k : bool
|
| 499 |
+
Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
|
| 500 |
+
Default: True.
|
| 501 |
+
|
| 502 |
+
smooth_v : bool
|
| 503 |
+
Whether to smooth the value tensor by subtracting the mean along the sequence dimension.
|
| 504 |
+
smooth_v will be ignored if pv_accum_dtype is "fp32+fp32".
|
| 505 |
+
Default: False.
|
| 506 |
+
|
| 507 |
+
return_lse : bool
|
| 508 |
+
Whether to return the log sum of the exponentiated attention weights. Used for cases like Ring Attention.
|
| 509 |
+
Default: False.
|
| 510 |
+
|
| 511 |
+
Returns
|
| 512 |
+
-------
|
| 513 |
+
torch.Tensor
|
| 514 |
+
The output tensor. Shape:
|
| 515 |
+
- If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
|
| 516 |
+
- If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
|
| 517 |
+
|
| 518 |
+
torch.Tensor
|
| 519 |
+
The logsumexp of each row of the matrix QK^T * scaling (e.g., log of the softmax normalization factor).
|
| 520 |
+
Shape: ``[batch_size, num_qo_heads, qo_len]``.
|
| 521 |
+
Only returned if `return_lse` is True.
|
| 522 |
+
|
| 523 |
+
Note
|
| 524 |
+
----
|
| 525 |
+
- ``num_qo_heads`` must be divisible by ``num_kv_heads``.
|
| 526 |
+
- The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``
|
| 527 |
+
- All tensors must be on the same cuda device.
|
| 528 |
+
- `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
|
| 529 |
+
"""
|
| 530 |
+
|
| 531 |
+
dtype = q.dtype
|
| 532 |
+
assert q.is_cuda, "Input tensors must be on cuda."
|
| 533 |
+
assert dtype in [torch.float16, torch.bfloat16], (
|
| 534 |
+
"Input tensors must be in dtype of torch.float16 or torch.bfloat16"
|
| 535 |
+
)
|
| 536 |
+
assert qk_quant_gran in ["per_warp", "per_thread"], (
|
| 537 |
+
"qk_quant_gran must be either 'per_warp' or 'per_thread'."
|
| 538 |
+
)
|
| 539 |
+
assert q.device == k.device == v.device, "All tensors must be on the same device."
|
| 540 |
+
assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
|
| 541 |
+
|
| 542 |
+
# cuda_major_version, cuda_minor_version = get_cuda_version()
|
| 543 |
+
# if(cuda_major_version, cuda_minor_version) < (12, 8) and pv_accum_dtype == 'fp32+fp16':
|
| 544 |
+
# warnings.warn("cuda version < 12.8, change pv_accum_dtype to 'fp32+fp32'")
|
| 545 |
+
# pv_accum_dtype = 'fp32+fp32'
|
| 546 |
+
|
| 547 |
+
# FIXME(DefTruth): make sage attention work compatible with distributed
|
| 548 |
+
# env, for example, xDiT which launch by torchrun. Without this workaround,
|
| 549 |
+
# sage attention will run into illegal memory access error after first
|
| 550 |
+
# inference step in distributed env for multi gpus inference. This small
|
| 551 |
+
# workaround also make sage attention work compatible with torch.compile
|
| 552 |
+
# through non-fullgraph compile mode.
|
| 553 |
+
torch.cuda.set_device(v.device)
|
| 554 |
+
|
| 555 |
+
_tensor_layout = 0 if tensor_layout == "NHD" else 1
|
| 556 |
+
_is_caual = 1 if is_causal else 0
|
| 557 |
+
_qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
|
| 558 |
+
_return_lse = 1 if return_lse else 0
|
| 559 |
+
|
| 560 |
+
head_dim_og = q.size(-1)
|
| 561 |
+
|
| 562 |
+
if head_dim_og < 64:
|
| 563 |
+
q = torch.nn.functional.pad(q, (0, 64 - head_dim_og))
|
| 564 |
+
k = torch.nn.functional.pad(k, (0, 64 - head_dim_og))
|
| 565 |
+
v = torch.nn.functional.pad(v, (0, 64 - head_dim_og))
|
| 566 |
+
elif head_dim_og > 64 and head_dim_og < 128:
|
| 567 |
+
q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
|
| 568 |
+
k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
|
| 569 |
+
v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
|
| 570 |
+
elif head_dim_og > 128:
|
| 571 |
+
raise ValueError(f"Unsupported head_dim: {head_dim_og}")
|
| 572 |
+
|
| 573 |
+
# assert last dim is contiguous
|
| 574 |
+
assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, (
|
| 575 |
+
"Last dim of qkv must be contiguous."
|
| 576 |
+
)
|
| 577 |
+
|
| 578 |
+
if sm_scale is None:
|
| 579 |
+
sm_scale = head_dim_og**-0.5
|
| 580 |
+
|
| 581 |
+
seq_dim = 1 if _tensor_layout == 0 else 2
|
| 582 |
+
nh_dim = 2 if _tensor_layout == 0 else 1
|
| 583 |
+
|
+    if smooth_k:
+        km = k.mean(dim=seq_dim, keepdim=True)
+        nqheads = q.size(2)
+        nkheads = k.size(2)
+        q_per_kv_heads = nqheads // nkheads
+        if q_per_kv_heads > 1:
+            # nheads_k => nheads_q
+            km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim)
+        else:
+            km_broadcast = km
+        if return_lse:
+            if tensor_layout == "NHD":
+                lse_correction = (
+                    torch.matmul(
+                        q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)
+                    )
+                    .squeeze(-1)
+                    .to(torch.float32)
+                )
+            else:
+                lse_correction = (
+                    torch.matmul(q, km_broadcast.transpose(2, 3))
+                    .squeeze(-1)
+                    .to(torch.float32)
+                )
+    else:
+        km = None
+
+    if qk_quant_gran == "per_warp":
+        q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda(
+            q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64
+        )
+    elif qk_quant_gran == "per_thread":
+        q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton(
+            q, k, km, tensor_layout=tensor_layout, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64
+        )
+
+    o = torch.empty(q.size(), dtype=dtype, device=q.device)
+
+    if pv_accum_dtype == "fp32+fp32" and smooth_v:
+        warnings.warn("pv_accum_dtype is 'fp32+fp32', smooth_v will be ignored.")
+        smooth_v = False
+
+    if pv_accum_dtype == "fp32+fp16" and smooth_v:
+        warnings.warn("pv_accum_dtype is 'fp32+fp16', smooth_v will be ignored.")
+        smooth_v = False
+
+    quant_v_scale_max = 448.0
+    if pv_accum_dtype == "fp32+fp16":
+        quant_v_scale_max = 2.25
+
+    v_fp8, v_scale, vm = per_channel_fp8(
+        v, tensor_layout=tensor_layout, scale_max=quant_v_scale_max, smooth_v=smooth_v
+    )
+    print("before kernel call")
+    if pv_accum_dtype == "fp32":
+        if smooth_v:
+            lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_fuse_v_mean_attn(
+                q_int8,
+                k_int8,
+                v_fp8,
+                o,
+                q_scale,
+                k_scale,
+                v_scale,
+                vm,
+                _tensor_layout,
+                _is_caual,
+                _qk_quant_gran,
+                sm_scale,
+                _return_lse,
+            )
+            torch.cuda.synchronize()
+        else:
+            lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn(
+                q_int8,
+                k_int8,
+                v_fp8,
+                o,
+                q_scale,
+                k_scale,
+                v_scale,
+                _tensor_layout,
+                _is_caual,
+                _qk_quant_gran,
+                sm_scale,
+                _return_lse,
+            )
+            torch.cuda.synchronize()
+    elif pv_accum_dtype == "fp32+fp32":
+        lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf(
+            q_int8,
+            k_int8,
+            v_fp8,
+            o,
+            q_scale,
+            k_scale,
+            v_scale,
+            _tensor_layout,
+            _is_caual,
+            _qk_quant_gran,
+            sm_scale,
+            _return_lse,
+        )
+        torch.cuda.synchronize()
+    elif pv_accum_dtype == "fp32+fp16":
+        lse = ops.qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf(
+            q_int8,
+            k_int8,
+            v_fp8,
+            o,
+            q_scale,
+            k_scale,
+            v_scale,
+            _tensor_layout,
+            _is_caual,
+            _qk_quant_gran,
+            sm_scale,
+            _return_lse,
+        )
+        torch.cuda.synchronize()
+    o = o[..., :head_dim_og]
+    print("after kernel call")
+    if return_lse:
+        return (
+            o,
+            lse / 1.44269504 + lse_correction * sm_scale
+            if smooth_k
+            else lse / 1.44269504,
+        )
+    else:
+        return o
+
+
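A note on the return path above (editorial, not part of the committed file): the kernels report the log-sum-exp in base 2 and compute it against the mean-subtracted keys, so the wrapper divides by 1.44269504 (log2 e) to convert to natural log and, when `smooth_k` is on, adds back `sm_scale * (q @ km)`. The identity being relied on can be checked in plain PyTorch; the snippet below is a sketch with made-up shapes.

import torch

q = torch.randn(1, 1, 4, 8)            # [batch, heads, qo_len, head_dim]
k = torch.randn(1, 1, 4, 8)
sm_scale = 8 ** -0.5

km = k.mean(dim=2, keepdim=True)        # key mean over the sequence dimension
scores = (q @ k.transpose(2, 3)) * sm_scale
scores_smoothed = (q @ (k - km).transpose(2, 3)) * sm_scale

lse_true = torch.logsumexp(scores, dim=-1)
lse_smoothed = torch.logsumexp(scores_smoothed, dim=-1)
correction = (q @ km.transpose(2, 3)).squeeze(-1) * sm_scale

# logsumexp(q·k) == logsumexp(q·(k - km)) + sm_scale * (q·km), row by row
print(torch.allclose(lse_true, lse_smoothed + correction, atol=1e-5))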
+@torch.compiler.disable
+def sageattn_qk_int8_pv_fp8_cuda_sm90(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    tensor_layout: str = "HND",
+    is_causal: bool = False,
+    qk_quant_gran: str = "per_thread",
+    sm_scale: Optional[float] = None,
+    pv_accum_dtype: str = "fp32+fp32",
+    smooth_k: bool = True,
+    return_lse: bool = False,
+    **kwargs: Any,
+) -> torch.Tensor:
+    """
+    SageAttention with INT8 quantization for Q and K, FP8 PV with FP32 accumulation, implemented using CUDA.
+
+    Parameters
+    ----------
+    q : torch.Tensor
+        The query tensor. Shape:
+        - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
+        - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
+
+    k : torch.Tensor
+        The key tensor. Shape:
+        - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
+        - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
+
+    v : torch.Tensor
+        The value tensor. Shape:
+        - If `tensor_layout` is "HND": ``[batch_size, num_kv_heads, kv_len, head_dim]``.
+        - If `tensor_layout` is "NHD": ``[batch_size, kv_len, num_kv_heads, head_dim]``.
+
+    tensor_layout : str
+        The tensor layout, either "HND" or "NHD".
+        Default: "HND".
+
+    is_causal : bool
+        Whether to apply a causal mask to the attention matrix. Only applicable when qo_len == kv_len.
+        Default: False.
+
+    qk_quant_gran : str
+        The granularity of quantization for Q and K, either "per_warp" or "per_thread".
+        Default: "per_thread".
+
+    sm_scale : Optional[float]
+        The scale used in softmax. If not provided, it will be set to ``1.0 / sqrt(head_dim)``.
+
+    pv_accum_dtype : str
+        The dtype of the accumulation of the product of the value tensor and the attention weights, either "fp32" or "fp32+fp32".
+        - "fp32": PV accumulation is done fully in FP32. However, due to a hardware issue, there are only 22 valid bits in the FP32 accumulator.
+        - "fp32+fp32": PV accumulation is done in FP32 (actually FP22), but added to an FP32 buffer every few iterations. This offers a balance between speed and accuracy.
+        Default: "fp32+fp32".
+
+    smooth_k : bool
+        Whether to smooth the key tensor by subtracting the mean along the sequence dimension.
+        Default: True.
+
+    return_lse : bool
+        Whether to return the log-sum-exp of the attention weights. Used for cases like Ring Attention.
+        Default: False.
+
+    Returns
+    -------
+    torch.Tensor
+        The output tensor. Shape:
+        - If `tensor_layout` is "HND": ``[batch_size, num_qo_heads, qo_len, head_dim]``.
+        - If `tensor_layout` is "NHD": ``[batch_size, qo_len, num_qo_heads, head_dim]``.
+
+    torch.Tensor
+        The logsumexp of each row of the matrix QK^T * scaling (i.e., the log of the softmax normalization factor).
+        Shape: ``[batch_size, num_qo_heads, qo_len]``.
+        Only returned if `return_lse` is True.
+
+    Note
+    ----
+    - ``num_qo_heads`` must be divisible by ``num_kv_heads``.
+    - The tensors `q`, `k`, and `v` must have the dtype ``torch.float16`` or ``torch.bfloat16``.
+    - All tensors must be on the same CUDA device.
+    - `smooth_k` will introduce slight overhead but will improve the accuracy under most circumstances.
+    """
+
+    dtype = q.dtype
+    assert q.is_cuda, "Input tensors must be on cuda."
+    assert dtype in [torch.float16, torch.bfloat16], (
+        "Input tensors must be in dtype of torch.float16 or torch.bfloat16"
+    )
+    assert qk_quant_gran in ["per_warp", "per_thread"], (
+        "qk_quant_gran must be either 'per_warp' or 'per_thread'."
+    )
+    assert q.device == k.device == v.device, "All tensors must be on the same device."
+    assert q.dtype == k.dtype == v.dtype, "All tensors must have the same dtype."
+
+    torch.cuda.set_device(v.device)
+
+    _tensor_layout = 0 if tensor_layout == "NHD" else 1
+    _is_caual = 1 if is_causal else 0
+    _qk_quant_gran = 3 if qk_quant_gran == "per_thread" else 2
+    _return_lse = 1 if return_lse else 0
+
+    head_dim_og = q.size(-1)
+
+    if head_dim_og < 64:
+        q = torch.nn.functional.pad(q, (0, 64 - head_dim_og))
+        k = torch.nn.functional.pad(k, (0, 64 - head_dim_og))
+        v = torch.nn.functional.pad(v, (0, 64 - head_dim_og))
+    elif head_dim_og > 64 and head_dim_og < 128:
+        q = torch.nn.functional.pad(q, (0, 128 - head_dim_og))
+        k = torch.nn.functional.pad(k, (0, 128 - head_dim_og))
+        v = torch.nn.functional.pad(v, (0, 128 - head_dim_og))
+    elif head_dim_og > 128:
+        raise ValueError(f"Unsupported head_dim: {head_dim_og}")
+
+    # assert last dim is contiguous
+    assert q.stride(-1) == 1 and k.stride(-1) == 1 and v.stride(-1) == 1, (
+        "Last dim of qkv must be contiguous."
+    )
+
+    if sm_scale is None:
+        sm_scale = head_dim_og**-0.5
+
+    seq_dim = 1 if _tensor_layout == 0 else 2
+    nh_dim = 2 if _tensor_layout == 0 else 1
+
+    if smooth_k:
+        km = k.mean(dim=seq_dim, keepdim=True)
+        nqheads = q.size(2)
+        nkheads = k.size(2)
+        q_per_kv_heads = nqheads // nkheads
+        if q_per_kv_heads > 1:
+            # nheads_k => nheads_q
+            km_broadcast = torch.repeat_interleave(km, q_per_kv_heads, dim=nh_dim)
+        else:
+            km_broadcast = km
+        if return_lse:
+            if tensor_layout == "NHD":
+                lse_correction = (
+                    torch.matmul(
+                        q.transpose(1, 2), km_broadcast.transpose(1, 2).transpose(2, 3)
+                    )
+                    .squeeze(-1)
+                    .to(torch.float32)
+                )
+            else:
+                lse_correction = (
+                    torch.matmul(q, km_broadcast.transpose(2, 3))
+                    .squeeze(-1)
+                    .to(torch.float32)
+                )
+    else:
+        km = None
+
+    if qk_quant_gran == "per_warp":
+        q_int8, q_scale, k_int8, k_scale = per_warp_int8_cuda(
+            q, k, km, tensor_layout=tensor_layout, BLKQ=64, WARPQ=16, BLKK=128
+        )
+    elif qk_quant_gran == "per_thread":
+        q_int8, q_scale, k_int8, k_scale = per_thread_int8_triton(
+            q,
+            k,
+            km,
+            tensor_layout=tensor_layout,
+            BLKQ=64,
+            WARPQ=16,
+            BLKK=128,
+            WARPK=128,
+        )
+
+    o = torch.empty(q.size(), dtype=dtype, device=q.device)
+
+    # pad v to multiple of 128
+    # TODO: modify per_channel_fp8 kernel to handle this
+    kv_len = k.size(seq_dim)
+    v_pad_len = 128 - (kv_len % 128) if kv_len % 128 != 0 else 0
+    if v_pad_len > 0:
+        if tensor_layout == "HND":
+            v = torch.cat(
+                [
+                    v,
+                    torch.zeros(
+                        v.size(0),
+                        v.size(1),
+                        v_pad_len,
+                        v.size(3),
+                        dtype=v.dtype,
+                        device=v.device,
+                    ),
+                ],
+                dim=2,
+            )
+        else:
+            v = torch.cat(
+                [
+                    v,
+                    torch.zeros(
+                        v.size(0),
+                        v_pad_len,
+                        v.size(2),
+                        v.size(3),
+                        dtype=v.dtype,
+                        device=v.device,
+                    ),
+                ],
+                dim=1,
+            )
+
+    v_fp8, v_scale, _ = per_channel_fp8(v, tensor_layout=tensor_layout, smooth_v=False)
+
+    if pv_accum_dtype == "fp32":
+        raise NotImplementedError("Please use pv_accum_dtype='fp32+fp32' for sm90.")
+        lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn(
+            q_int8,
+            k_int8,
+            v_fp8,
+            o,
+            q_scale,
+            k_scale,
+            v_scale,
+            _tensor_layout,
+            _is_caual,
+            _qk_quant_gran,
+            sm_scale,
+            _return_lse,
+        )
+    elif pv_accum_dtype == "fp32+fp32":
+        print(
+            "qint8",
+            q_int8.shape,
+            "qscale",
+            q_scale.shape,
+            "kint8",
+            k_int8.shape,
+            "kscale",
+            k_scale.shape,
+            "vfp8",
+            v_fp8.shape,
+            "vscale",
+            v_scale.shape,
+        )
+        lse = ops.qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90(
+            q_int8,
+            k_int8,
+            v_fp8,
+            o,
+            q_scale,
+            k_scale,
+            v_scale,
+            _tensor_layout,
+            _is_caual,
+            _qk_quant_gran,
+            sm_scale,
+            _return_lse,
+        )
+
+    o = o[..., :head_dim_og]
+
+    if return_lse:
+        return (
+            o,
+            lse / 1.44269504 + lse_correction * sm_scale
+            if smooth_k
+            else lse / 1.44269504,
+        )
+    else:
+        return o
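For reference, a minimal call into the SM90 path defined above could look like the sketch below. This is an editorial illustration rather than part of the commit: the import path `sage_attention.core` is assumed from the file location, the shapes are arbitrary, and it needs a Hopper (SM90) GPU plus the compiled extension.

import torch
from sage_attention.core import sageattn_qk_int8_pv_fp8_cuda_sm90  # import path assumed

q = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")  # HND layout
k = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")
v = torch.randn(1, 8, 1024, 128, dtype=torch.float16, device="cuda")

o, lse = sageattn_qk_int8_pv_fp8_cuda_sm90(
    q, k, v, tensor_layout="HND", is_causal=True, return_lse=True
)
print(o.shape)    # torch.Size([1, 8, 1024, 128])
print(lse.shape)  # torch.Size([1, 8, 1024])

# Loose numerical sanity check against the unquantized reference:
ref = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)
print((o - ref).abs().mean())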
torch-ext/sage_attention/quant.py
CHANGED

@@ -19,14 +19,15 @@ from typing import Optional
 
 from ._ops import ops
 
+
 def per_block_int8(
     q: torch.Tensor,
     k: torch.Tensor,
     km: Optional[torch.Tensor] = None,
     BLKQ: int = 128,
     BLKK: int = 64,
     sm_scale: Optional[float] = None,
-    tensor_layout: str ="HND"
+    tensor_layout: str = "HND",
 ):
     """
     Quantize the query tensor `q` and the key tensor `k` with per block quantization.

@@ -46,9 +47,9 @@ def per_block_int8(
     km : Optional[torch.Tensor]
         The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``.
         Should be of the same dtype as `k` if provided. Default is None.
 
     sm_scale : Optional[float]
         The scale factor for the softmax operation. Default is ``head_dim**-0.5``.
         It will be multiplied by ``1.44269504`` to work together with the triton attention kernel.
 
     tensor_layout : str

@@ -63,7 +64,7 @@ def per_block_int8(
     - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ]`` with `float32` dtype.
     - The quantized key tensor. Shape: Same as `k` but with `int8` dtype.
     - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype.
 
     Note
     ----
     - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16``

@@ -82,34 +83,42 @@ def per_block_int8(
 
     else:
         raise ValueError(f"Unknown tensor layout: {tensor_layout}")
 
     _tensor_layout = 0 if tensor_layout == "NHD" else 1
 
     q_scale = torch.empty(
+        (b, h_qo, (qo_len + BLKQ - 1) // BLKQ), device=q.device, dtype=torch.float32
+    )
+    k_scale = torch.empty(
+        (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32
+    )
 
     if sm_scale is None:
         sm_scale = head_dim**-0.5
 
     sm_scale *= 1.44269504
 
     ops.quant_per_block_int8_cuda(q, q_int8, q_scale, sm_scale, BLKQ, _tensor_layout)
     if km is not None:
         km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2)
         ops.quant_per_block_int8_fuse_sub_mean_cuda(
+            k, km, k_int8, k_scale, BLKK, _tensor_layout
+        )
     else:
+        # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling
+        ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout)
 
     return q_int8, q_scale, k_int8, k_scale
 
+
 def per_warp_int8(
     q: torch.Tensor,
     k: torch.Tensor,
     km: Optional[torch.Tensor] = None,
-    BLKQ: int =128,
-    WARPQ: int =32,
-    BLKK: int =64,
-    tensor_layout: str ="HND"
+    BLKQ: int = 128,
+    WARPQ: int = 32,
+    BLKK: int = 64,
+    tensor_layout: str = "HND",
 ):
     """
     Quantize the query tensor `q` with per warp quantization and the key tensor `k` with per block quantization.

@@ -131,7 +140,7 @@ def per_warp_int8(
     km : Optional[torch.Tensor]
         The mean tensor of `k` along the sequence length dimension. Shape: ``[batch_size, num_kv_heads, head_dim]``.
         Should be of the same dtype as `k` if provided. Default is None.
 
     tensor_layout : str
         The tensor layout, either "HND" or "NHD".
         Default: "HND".

@@ -144,7 +153,7 @@ def per_warp_int8(
     - The scale tensor of the query tensor. Shape: ``[batch_size, num_qo_heads, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ)]`` with `float32` dtype.
     - The quantized key tensor. Shape: Same as `k` but with `int8` dtype.
     - The scale tensor of the key tensor. Shape: ``[batch_size, num_kv_heads, (kv_len + BLKK - 1) // BLKK]`` with `float32` dtype.
 
     Note
     ----
     - The tensors `q` and `k` must have the dtype ``torch.float16`` or ``torch.bfloat16``

@@ -163,26 +172,33 @@ def per_warp_int8(
 
     else:
         raise ValueError(f"Unknown tensor layout: {tensor_layout}")
 
     _tensor_layout = 0 if tensor_layout == "NHD" else 1
 
     q_scale = torch.empty(
+        (b, h_qo, ((qo_len + BLKQ - 1) // BLKQ) * (BLKQ // WARPQ)),
+        device=q.device,
+        dtype=torch.float32,
+    )
+    k_scale = torch.empty(
+        (b, h_kv, (kv_len + BLKK - 1) // BLKK), device=q.device, dtype=torch.float32
+    )
 
     ops.quant_per_warp_int8_cuda(q, q_int8, q_scale, BLKQ, WARPQ, _tensor_layout)
 
     if km is not None:
         km = km.squeeze(1) if _tensor_layout == 0 else km.squeeze(2)
         ops.quant_per_block_int8_fuse_sub_mean_cuda(
+            k, km, k_int8, k_scale, BLKK, _tensor_layout
+        )
     else:
+        # The bound CUDA op expects an sm_scale argument; use 1.0 for K to avoid scaling
+        ops.quant_per_block_int8_cuda(k, k_int8, k_scale, 1.0, BLKK, _tensor_layout)
 
     return q_int8, q_scale, k_int8, k_scale
 
-    tensor_layout: str ="HND"
-):
+
+def sub_mean(v: torch.Tensor, tensor_layout: str = "HND"):
     """
     Calculate the mean of the tensor `v` along the sequence length dimension and subtract it from `v`. Result is stored as fp16.
 

@@ -215,17 +231,18 @@ def sub_mean(
     vm = v.mean(dim=1 if _tensor_layout == 0 else 2)
 
     v_smoothed = torch.empty(v.shape, dtype=torch.float16, device=v.device)
 
     # subtract mean and store the result as fp16
     ops.sub_mean_cuda(v, vm, v_smoothed, _tensor_layout)
 
     return v_smoothed, vm
 
 
 def per_channel_fp8(
     v: torch.Tensor,
-    tensor_layout: str ="HND",
+    tensor_layout: str = "HND",
     scale_max: float = 448.0,
-    smooth_v: bool = True
+    smooth_v: bool = True,
 ):
     """
     Transpose, pad and permute the tensor `v` and quantize it to fp8 with per channel quantization.

@@ -271,27 +288,39 @@ def per_channel_fp8(
     if tensor_layout == "HND":
         b, h_kv, kv_len, head_dim = v.shape
         padded_len = (kv_len + 63) // 64 * 64
         v_transposed_permutted = torch.empty(
+            (b, h_kv, head_dim, padded_len), dtype=v.dtype, device=v.device
+        )
 
     elif tensor_layout == "NHD":
         b, kv_len, h_kv, head_dim = v.shape
         padded_len = (kv_len + 63) // 64 * 64
         v_transposed_permutted = torch.empty(
+            (b, head_dim, h_kv, padded_len), dtype=v.dtype, device=v.device
+        )
+
     ops.transpose_pad_permute_cuda(v, v_transposed_permutted, _tensor_layout)
 
     v_fp8 = torch.empty(
+        v_transposed_permutted.shape, dtype=torch.float8_e4m3fn, device=v.device
+    )
 
     v_scale = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device)
     vm = torch.empty((b, h_kv, head_dim), dtype=torch.float32, device=v.device)
 
     if smooth_v:
         ops.mean_scale_fuse_quant_cuda(
+            v_transposed_permutted,
+            v_fp8,
+            vm,
+            v_scale,
+            kv_len,
+            scale_max,
+            _tensor_layout,
+        )
         return v_fp8, v_scale, vm
     else:
         ops.scale_fuse_quant_cuda(
+            v_transposed_permutted, v_fp8, v_scale, kv_len, scale_max, _tensor_layout
+        )
         return v_fp8, v_scale, None
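The scale shapes documented in the docstrings above can be verified directly. The sketch below is an editorial illustration, not part of the commit; the import path `sage_attention.quant` is assumed from the file location, and a CUDA build of the extension is required.

import torch
from sage_attention.quant import per_warp_int8, per_channel_fp8  # import path assumed

b, h, seq, d = 1, 4, 512, 128
q = torch.randn(b, h, seq, d, dtype=torch.float16, device="cuda")  # HND layout
k = torch.randn(b, h, seq, d, dtype=torch.float16, device="cuda")
v = torch.randn(b, h, seq, d, dtype=torch.float16, device="cuda")

q_int8, q_scale, k_int8, k_scale = per_warp_int8(q, k, BLKQ=128, WARPQ=32, BLKK=64)
print(q_scale.shape)  # (1, 4, ceil(512 / 128) * (128 // 32)) == (1, 4, 16)
print(k_scale.shape)  # (1, 4, ceil(512 / 64)) == (1, 4, 8)

v_fp8, v_scale, vm = per_channel_fp8(v, tensor_layout="HND", smooth_v=True)
print(v_fp8.dtype)    # torch.float8_e4m3fn, laid out as [b, h_kv, head_dim, padded_len]
print(v_scale.shape)  # (1, 4, 128): one scale per channel (head_dim) per kv head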
torch-ext/sage_attention/quant_per_thread.py
ADDED

@@ -0,0 +1,204 @@
+"""
+Copyright (c) 2024 by SageAttention team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import torch
+import triton
+import triton.language as tl
+
+@triton.jit
+def quant_query_per_thread_int8_kernel(Input, Output, Scale, L,
+                                       stride_iz, stride_ih, stride_in,
+                                       stride_oz, stride_oh, stride_on,
+                                       stride_sz, stride_sh,
+                                       C: tl.constexpr, BLK: tl.constexpr):
+    off_blk = tl.program_id(0) // 8
+    off_tld = tl.program_id(0) % 8
+    off_h = tl.program_id(1)
+    off_b = tl.program_id(2)
+
+    offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld
+    offs_k = tl.arange(0, C)
+
+    input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
+    output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
+    scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld
+
+    x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
+    x = x.to(tl.float32)
+    scale = tl.max(tl.abs(x)) / 127. + 0.0000001
+    x_int8 = x / scale
+    x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
+    x_int8 = x_int8.to(tl.int8)
+    tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
+    tl.store(scale_ptrs, scale)
+
+@triton.jit
+def quant_key_per_thread_int8_kernel(Input, Output, Scale, L,
+                                     stride_iz, stride_ih, stride_in,
+                                     stride_oz, stride_oh, stride_on,
+                                     stride_sz, stride_sh,
+                                     C: tl.constexpr, BLK: tl.constexpr):
+    off_blk = tl.program_id(0) // 4
+    off_tld = tl.program_id(0) % 4
+    off_h = tl.program_id(1)
+    off_b = tl.program_id(2)
+
+    # offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2
+    # offs_k = tl.arange(0, C)
+
+    # input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
+    # output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
+    # scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld
+
+    # x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
+    # x = x.to(tl.float32)
+    # scale = tl.max(tl.abs(x)) / 127. + 0.0000001
+    # x_int8 = x / scale
+    # x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
+    # x_int8 = x_int8.to(tl.int8)
+    # tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
+    # tl.store(scale_ptrs, scale)
+
+    offs_n0 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2
+    offs_n1 = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld * 2 + 1
+    offs_k = tl.arange(0, C)
+
+    input_ptrs0 = Input + off_b * stride_iz + off_h * stride_ih + offs_n0[:, None] * stride_in + offs_k[None, :]
+    input_ptrs1 = Input + off_b * stride_iz + off_h * stride_ih + offs_n1[:, None] * stride_in + offs_k[None, :]
+    output_ptrs0 = Output + off_b * stride_oz + off_h * stride_oh + offs_n0[:, None] * stride_on + offs_k[None, :]
+    output_ptrs1 = Output + off_b * stride_oz + off_h * stride_oh + offs_n1[:, None] * stride_on + offs_k[None, :]
+    scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld
+
+    x0 = tl.load(input_ptrs0, mask=offs_n0[:, None] < L)
+    x1 = tl.load(input_ptrs1, mask=offs_n1[:, None] < L)
+    x0 = x0.to(tl.float32)
+    x1 = x1.to(tl.float32)
+    scale = max(tl.max(tl.abs(x0)), tl.max(tl.abs(x1))) / 127. + 0.0000001
+    x0_int8 = x0 / scale
+    x1_int8 = x1 / scale
+    x0_int8 += 0.5 * tl.where(x0_int8 >= 0, 1, -1)
+    x1_int8 += 0.5 * tl.where(x1_int8 >= 0, 1, -1)
+    x0_int8 = x0_int8.to(tl.int8)
+    x1_int8 = x1_int8.to(tl.int8)
+    tl.store(output_ptrs0, x0_int8, mask=offs_n0[:, None] < L)
+    tl.store(output_ptrs1, x1_int8, mask=offs_n1[:, None] < L)
+    tl.store(scale_ptrs, scale)
+
+@triton.jit
+def quant_query_per_thread_int4_kernel(Input, Output, Scale, L,
+                                       stride_iz, stride_ih, stride_in,
+                                       stride_oz, stride_oh, stride_on,
+                                       stride_sz, stride_sh,
+                                       C: tl.constexpr, BLK: tl.constexpr):
+    off_blk = tl.program_id(0) // 8
+    off_tld = tl.program_id(0) % 8
+    off_h = tl.program_id(1)
+    off_b = tl.program_id(2)
+
+    offs_n = off_blk * BLK + tl.arange(0, BLK // 8) * 8 + off_tld
+    offs_k = tl.arange(0, C)
+
+    input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
+    output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
+    scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 8 + off_tld
+
+    x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
+    x = x.to(tl.float32)
+    scale = tl.max(tl.abs(x)) / 7. + 0.0000001
+    x_int8 = x / scale
+    x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
+    x_int8 = x_int8.to(tl.int8)
+    tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
+    tl.store(scale_ptrs, scale)
+
+@triton.jit
+def quant_key_per_thread_int4_kernel(Input, Output, Scale, L,
+                                     stride_iz, stride_ih, stride_in,
+                                     stride_oz, stride_oh, stride_on,
+                                     stride_sz, stride_sh,
+                                     C: tl.constexpr, BLK: tl.constexpr):
+    off_blk = tl.program_id(0) // 4
+    off_tld = tl.program_id(0) % 4
+    off_h = tl.program_id(1)
+    off_b = tl.program_id(2)
+
+    offs_n = off_blk * BLK + tl.cat(tl.arange(0, BLK // 8) * 8, tl.arange(0, BLK // 8) * 8 + 1, True) + off_tld * 2
+    offs_k = tl.arange(0, C)
+
+    input_ptrs = Input + off_b * stride_iz + off_h * stride_ih + offs_n[:, None] * stride_in + offs_k[None, :]
+    output_ptrs = Output + off_b * stride_oz + off_h * stride_oh + offs_n[:, None] * stride_on + offs_k[None, :]
+    scale_ptrs = Scale + off_b * stride_sz + off_h * stride_sh + off_blk * 4 + off_tld
+
+    x = tl.load(input_ptrs, mask=offs_n[:, None] < L)
+    x = x.to(tl.float32)
+    scale = tl.max(tl.abs(x)) / 7. + 0.0000001
+    x_int8 = x / scale
+    x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)
+    x_int8 = x_int8.to(tl.int8)
+    tl.store(output_ptrs, x_int8, mask=offs_n[:, None] < L)
+    tl.store(scale_ptrs, scale)
+
+def per_thread_int8(q, k, km=None, BLKQ=128, WARPQ=32, BLKK=64, WARPK=64, sm_scale=None, tensor_layout="HND"):
+    q_int8 = torch.empty(q.shape, dtype=torch.int8, device=q.device)
+    k_int8 = torch.empty(k.shape, dtype=torch.int8, device=k.device)
+
+    if km is not None:
+        k = k - km
+
+    if tensor_layout == "HND":
+        b, h_qo, qo_len, head_dim = q.shape
+        _, h_kv, kv_len, _ = k.shape
+
+        stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(1), q.stride(2)
+        stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(1), q_int8.stride(2)
+        stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(1), k.stride(2)
+        stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(1), k_int8.stride(2)
+    elif tensor_layout == "NHD":
+        b, qo_len, h_qo, head_dim = q.shape
+        _, kv_len, h_kv, _ = k.shape
+
+        stride_bz_q, stride_h_q, stride_seq_q = q.stride(0), q.stride(2), q.stride(1)
+        stride_bz_qo, stride_h_qo, stride_seq_qo = q_int8.stride(0), q_int8.stride(2), q_int8.stride(1)
+        stride_bz_k, stride_h_k, stride_seq_k = k.stride(0), k.stride(2), k.stride(1)
+        stride_bz_ko, stride_h_ko, stride_seq_ko = k_int8.stride(0), k_int8.stride(2), k_int8.stride(1)
+    else:
+        raise ValueError(f"Unknown tensor layout: {tensor_layout}")
+
+    q_scale = torch.empty((b, h_qo, (qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8), device=q.device, dtype=torch.float32)
+    k_scale = torch.empty((b, h_kv, (kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4), device=q.device, dtype=torch.float32)
+
+    if sm_scale is None:
+        sm_scale = head_dim**-0.5
+
+    grid = ((qo_len + BLKQ - 1) // BLKQ * (BLKQ // WARPQ) * 8, h_qo, b)
+    quant_query_per_thread_int8_kernel[grid](
+        q, q_int8, q_scale, qo_len,
+        stride_bz_q, stride_h_q, stride_seq_q,
+        stride_bz_qo, stride_h_qo, stride_seq_qo,
+        q_scale.stride(0), q_scale.stride(1),
+        C=head_dim, BLK=WARPQ
+    )
+
+    grid = ((kv_len + BLKK - 1) // BLKK * (BLKK // WARPK) * 4, h_kv, b)
+    quant_key_per_thread_int8_kernel[grid](
+        k, k_int8, k_scale, kv_len,
+        stride_bz_k, stride_h_k, stride_seq_k,
+        stride_bz_ko, stride_h_ko, stride_seq_ko,
+        k_scale.stride(0), k_scale.stride(1),
+        C=head_dim, BLK=WARPK
+    )
+
+    return q_int8, q_scale, k_int8, k_scale
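The kernels above quantize symmetrically with a per-thread scale of max(|x|) / 127 (or / 7 for the INT4 variants) and round half away from zero before the truncating cast to int8; that is what the `x_int8 += 0.5 * tl.where(x_int8 >= 0, 1, -1)` line does. A plain-PyTorch rendering of the same rounding rule, as an editorial reference sketch only:

import torch

def quantize_block_int8(x: torch.Tensor):
    # Symmetric INT8 quantization: scale = max(|x|) / 127 (+ epsilon, as in the kernels),
    # then round half away from zero before the truncating int8 cast.
    scale = x.abs().max().float() / 127.0 + 1e-7
    y = x.float() / scale
    y = y + 0.5 * torch.where(y >= 0, torch.ones_like(y), -torch.ones_like(y))
    return y.trunc().to(torch.int8), scale

x = torch.randn(32, 128, dtype=torch.float16)
x_int8, scale = quantize_block_int8(x)
# Reconstruction error is bounded by roughly scale / 2 per element.
print((x_int8.float() * scale - x.float()).abs().max(), scale / 2)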
torch-ext/torch_binding.cpp
CHANGED

@@ -8,6 +8,9 @@ void sm_check_89(torch::Tensor x, std::string op_name) {
   int device_index = x.get_device();
   const auto& prop = at::cuda::getDeviceProperties(device_index);
 
+  std::cerr << "sm_check_89: prop->major: " << prop->major << std::endl;
+  std::cerr << "sm_check_89: prop->minor: " << prop->minor << std::endl;
+
   if (prop->major < 8 || (prop->major == 8 && prop->minor < 9)) {
     TORCH_CHECK(false, op_name + " requires compute capability 8.9+");
   }

@@ -17,11 +20,24 @@ void sm_check_90(torch::Tensor x, std::string op_name) {
   int device_index = x.get_device();
   const auto& prop = at::cuda::getDeviceProperties(device_index);
 
+  std::cerr << "sm_check_90: prop->major: " << prop->major << std::endl;
+  std::cerr << "sm_check_90: prop->minor: " << prop->minor << std::endl;
+
   if (prop->major < 9) {
     TORCH_CHECK(false, op_name + " requires compute capability 9.0+");
   }
 }
 
+void sm_check_80(torch::Tensor x, std::string op_name) {
+  int device_index = x.get_device();
+  const auto& prop = at::cuda::getDeviceProperties(device_index);
+  std::cerr << "sm_check_80: prop->major: " << prop->major << std::endl;
+  std::cerr << "sm_check_80: prop->minor: " << prop->minor << std::endl;
+  if (prop->major < 8) {
+    TORCH_CHECK(false, op_name + " requires compute capability 8.0+");
+  }
+}
+
 // ##############################################################################
 // SM89
 // ##############################################################################

@@ -146,6 +162,7 @@ static at::Tensor qk_int8_sv_f16_accum_f32_attn_wrap(
     at::Tensor q_scale, at::Tensor k_scale,
     int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran,
     double sm_scale, int64_t return_lse) {
+  sm_check_80(q, "qk_int8_sv_f16_accum_f32_attn");
   return qk_int8_sv_f16_accum_f32_attn(
       q, k, v, o, q_scale, k_scale,
       static_cast<int>(tensor_layout), static_cast<int>(is_causal), static_cast<int>(qk_quant_gran),

@@ -157,6 +174,7 @@ static at::Tensor qk_int8_sv_f16_accum_f16_attn_wrap(
     at::Tensor q_scale, at::Tensor k_scale,
     int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran,
     double sm_scale, int64_t return_lse) {
+  sm_check_80(q, "qk_int8_sv_f16_accum_f16_attn");
   return qk_int8_sv_f16_accum_f16_attn(
       q, k, v, o, q_scale, k_scale,
       static_cast<int>(tensor_layout), static_cast<int>(is_causal), static_cast<int>(qk_quant_gran),

@@ -168,6 +186,7 @@ static at::Tensor qk_int8_sv_f16_accum_f16_attn_inst_buf_wrap(
     at::Tensor q_scale, at::Tensor k_scale,
     int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran,
     double sm_scale, int64_t return_lse) {
+  sm_check_80(q, "qk_int8_sv_f16_accum_f16_attn_inst_buf");
   return qk_int8_sv_f16_accum_f16_attn_inst_buf(
       q, k, v, o, q_scale, k_scale,
       static_cast<int>(tensor_layout), static_cast<int>(is_causal), static_cast<int>(qk_quant_gran),

@@ -179,6 +198,7 @@ static at::Tensor qk_int8_sv_f16_accum_f16_fuse_v_mean_attn_wrap(
     at::Tensor q_scale, at::Tensor k_scale, at::Tensor v_mean,
     int64_t tensor_layout, int64_t is_causal, int64_t qk_quant_gran,
     double sm_scale, int64_t return_lse) {
+  sm_check_80(q, "qk_int8_sv_f16_accum_f16_fuse_v_mean_attn");
   return qk_int8_sv_f16_accum_f16_fuse_v_mean_attn(
       q, k, v, o, q_scale, k_scale, v_mean,
       static_cast<int>(tensor_layout), static_cast<int>(is_causal), static_cast<int>(qk_quant_gran),

@@ -238,15 +258,13 @@ static void mean_scale_fuse_quant_cuda_wrap(
 }
 
 TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
-
+  // SM90
   ops.def("qk_int8_sv_f8_accum_f32_attn_inst_buf_sm90(Tensor q, Tensor k, Tensor v, Tensor! o, Tensor q_scale, Tensor k_scale, int tensor_layout, int is_causal, int qk_quant_gran, float sm_scale, int return_lse) -> Tensor");
   ops.impl("qk_int8_sv_f8_accum_f32_attn_inst_buf_sm90", torch::kCUDA, &qk_int8_sv_f8_accum_f32_attn_inst_buf_sm90_wrap);
 
   ops.def("qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90(Tensor q, Tensor k, Tensor v, Tensor! o, Tensor q_scale, Tensor k_scale, Tensor v_scale, int tensor_layout, int is_causal, int qk_quant_gran, float sm_scale, int return_lse) -> Tensor");
   ops.impl("qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90", torch::kCUDA, &qk_int8_sv_f8_accum_f32_fuse_v_scale_attn_inst_buf_sm90_wrap);
-#endif
 
-#if defined(HAS_SM89)
   ops.def("qk_int8_sv_f8_accum_f32_attn_inst_buf(Tensor q, Tensor k, Tensor v, Tensor! o, Tensor q_scale, Tensor k_scale, int tensor_layout, int is_causal, int qk_quant_gran, float sm_scale, int return_lse) -> Tensor");
   ops.impl("qk_int8_sv_f8_accum_f32_attn_inst_buf", torch::kCUDA, &qk_int8_sv_f8_accum_f32_attn_inst_buf_wrap);

@@ -267,9 +285,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 
   ops.def("qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf(Tensor q, Tensor k, Tensor v, Tensor! o, Tensor q_scale, Tensor k_scale, Tensor v_scale, int tensor_layout, int is_causal, int qk_quant_gran, float sm_scale, int return_lse) -> Tensor");
   ops.impl("qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf", torch::kCUDA, &qk_int8_sv_f8_accum_f16_fuse_v_scale_attn_inst_buf_wrap);
-#endif
 
-#if defined(HAS_SM80)
   ops.def("qk_int8_sv_f16_accum_f32_attn(Tensor q, Tensor k, Tensor v, Tensor! o, Tensor q_scale, Tensor k_scale, int tensor_layout, int is_causal, int qk_quant_gran, float sm_scale, int return_lse) -> Tensor");
   ops.impl("qk_int8_sv_f16_accum_f32_attn", torch::kCUDA, &qk_int8_sv_f16_accum_f32_attn_wrap);

@@ -281,7 +297,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
 
   ops.def("qk_int8_sv_f16_accum_f16_fuse_v_mean_attn(Tensor q, Tensor k, Tensor v, Tensor! o, Tensor q_scale, Tensor k_scale, Tensor v_mean, int tensor_layout, int is_causal, int qk_quant_gran, float sm_scale, int return_lse) -> Tensor");
   ops.impl("qk_int8_sv_f16_accum_f16_fuse_v_mean_attn", torch::kCUDA, &qk_int8_sv_f16_accum_f16_fuse_v_mean_attn_wrap);
-#endif
 
   //Fused (available across supported archs)
   ops.def("quant_per_block_int8_cuda(Tensor input, Tensor! output, Tensor scale, float sm_scale, int block_size, int tensor_layout) -> ()");