Commit 2cdde5d

celve and BBuf authored
[Kernel Slimming] Migrate AWQ marlin repack kernel to JIT (sgl-project#18949)
Co-authored-by: Xiaoyu Zhang <35585791+BBuf@users.noreply.github.com>
1 parent e0e0cad commit 2cdde5d

11 files changed: 1336 additions & 1 deletion

sglang/jit_kernel/awq_dequantize.py
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.jit_kernel.utils import cache_once, load_jit, make_cpp_args
+
+if TYPE_CHECKING:
+    from tvm_ffi.module import Module
+
+
+@cache_once
+def _jit_awq_dequantize_module(dtype: torch.dtype) -> Module:
+    args = make_cpp_args(dtype)
+    return load_jit(
+        "awq_dequantize",
+        *args,
+        cuda_files=["gemm/awq_dequantize.cuh"],
+        cuda_wrappers=[("awq_dequantize", f"awq_dequantize<{args}>")],
+    )
+
+
+def awq_dequantize(
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    qzeros: torch.Tensor,
+) -> torch.Tensor:
+    qweight_rows = qweight.shape[0]
+    qweight_cols = qweight.shape[1]
+    output = torch.empty(
+        (qweight_rows, qweight_cols * 8),
+        dtype=scales.dtype,
+        device=scales.device,
+    )
+    module = _jit_awq_dequantize_module(scales.dtype)
+    module.awq_dequantize(output, qweight, scales, qzeros)
+    return output
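
For context, a minimal usage sketch of the new wrapper (not part of the diff; it assumes a CUDA device and an sglang build that ships sglang.jit_kernel). The shapes follow the AWQ int4 convention the kernel expects, as also used by the benchmark below: each int32 in qweight packs eight 4-bit values, and scales/qzeros carry one row per quantization group. All sizes are illustrative.

import torch

from sglang.jit_kernel.awq_dequantize import awq_dequantize

K, packed_cols, group_size = 4096, 64, 128  # illustrative AWQ sizes
qweight = torch.randint(
    0, torch.iinfo(torch.int32).max, (K, packed_cols),
    dtype=torch.int32, device="cuda",
)
scales = torch.rand(
    K // group_size, packed_cols * 8, dtype=torch.float16, device="cuda"
)
qzeros = torch.randint(
    0, torch.iinfo(torch.int32).max, (K // group_size, packed_cols),
    dtype=torch.int32, device="cuda",
)

out = awq_dequantize(qweight, scales, qzeros)
assert out.shape == (K, packed_cols * 8)  # 8 int4 values per packed int32
assert out.dtype == scales.dtype  # output adopts the dtype/device of scales

The output dtype and device come from scales, mirroring how the wrapper allocates its result buffer.
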
sglang/jit_kernel/awq_marlin_repack.py
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.jit_kernel.utils import cache_once, load_jit
+
+if TYPE_CHECKING:
+    from tvm_ffi.module import Module
+
+
+@cache_once
+def _jit_awq_marlin_repack_module() -> Module:
+    return load_jit(
+        "awq_marlin_repack",
+        cuda_files=["gemm/marlin/awq_marlin_repack.cuh"],
+        cuda_wrappers=[("awq_marlin_repack", "awq_marlin_repack")],
+    )
+
+
+def awq_marlin_repack(
+    b_q_weight: torch.Tensor,
+    size_k: int,
+    size_n: int,
+    num_bits: int,
+) -> torch.Tensor:
+    tile_size = 16
+    pack_factor = 32 // num_bits
+    out = torch.empty(
+        (size_k // tile_size, size_n * tile_size // pack_factor),
+        dtype=b_q_weight.dtype,
+        device=b_q_weight.device,
+    )
+    module = _jit_awq_marlin_repack_module()
+    module.awq_marlin_repack(out, b_q_weight, size_k, size_n, num_bits)
+    return out
+
+
+def awq_marlin_moe_repack(
+    b_q_weight: torch.Tensor,
+    perm: torch.Tensor,
+    size_k: int,
+    size_n: int,
+    num_bits: int,
+) -> torch.Tensor:
+    num_experts = b_q_weight.shape[0]
+    assert size_k % 16 == 0
+    output = torch.empty(
+        (num_experts, size_k // 16, size_n * (num_bits // 2)),
+        device=b_q_weight.device,
+        dtype=b_q_weight.dtype,
+    )
+    for e in range(num_experts):
+        output[e] = awq_marlin_repack(b_q_weight[e], size_k, size_n, num_bits)
+    return output
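
Likewise, a shape-level sketch of the repack entry points (not part of the diff; sizes illustrative, CUDA device assumed). For num_bits = 4 the pack factor is 32 // 4 = 8, so a repacked expert has size_n * 16 // 8 = size_n * 2 columns, matching the size_n * (num_bits // 2) buffer that awq_marlin_moe_repack preallocates per expert.

import torch

from sglang.jit_kernel.awq_marlin_repack import awq_marlin_repack

size_k, size_n, num_bits = 1024, 4096, 4
pack_factor = 32 // num_bits  # 8 int4 values per int32

# AWQ packs along N: qweight is (size_k, size_n // pack_factor) int32.
b_q_weight = torch.randint(
    0,
    torch.iinfo(torch.int32).max,
    (size_k, size_n // pack_factor),
    dtype=torch.int32,
    device="cuda",
)

marlin_q = awq_marlin_repack(b_q_weight, size_k, size_n, num_bits)
# Marlin 16x16 tile layout: (size_k // 16, size_n * 16 // pack_factor).
assert marlin_q.shape == (size_k // 16, size_n * (num_bits // 2))

Note that the MoE variant simply loops awq_marlin_repack over the expert dimension; perm is accepted for signature parity with the AOT kernel but is not used in this path.
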
Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
+import itertools
+import os
+
+import torch
+import triton
+import triton.testing
+
+from sglang.jit_kernel.awq_dequantize import awq_dequantize as jit_awq_dequantize
+
+try:
+    from sgl_kernel import awq_dequantize as aot_awq_dequantize
+
+    AOT_AVAILABLE = True
+except ImportError:
+    AOT_AVAILABLE = False
+
+IS_CI = (
+    os.getenv("CI", "false").lower() == "true"
+    or os.getenv("GITHUB_ACTIONS", "false").lower() == "true"
+)
+
+# CI environment uses simplified parameters
+if IS_CI:
+    qweight_row_range = [128]
+    qweight_cols_range = [16]
+else:
+    qweight_row_range = [128, 256, 512, 1024, 3584]
+    qweight_cols_range = [16, 32, 64, 128, 448]
+
+configs = list(itertools.product(qweight_row_range, qweight_cols_range))
+
+
+def check_correctness():
+    if not AOT_AVAILABLE:
+        print("sgl_kernel AOT not available, skipping correctness check")
+        return
+
+    qweight_row, qweight_col = 128, 16
+    device = torch.device("cuda")
+    qweight = torch.randint(
+        0,
+        torch.iinfo(torch.int32).max,
+        (qweight_row, qweight_col),
+        dtype=torch.int32,
+        device=device,
+    )
+    group_size = qweight_row
+    scales_row = qweight_row // group_size
+    scales_col = qweight_col * 8
+    scales = torch.rand(scales_row, scales_col, dtype=torch.float16, device=device)
+    qzeros = torch.randint(
+        0,
+        torch.iinfo(torch.int32).max,
+        (scales_row, qweight_col),
+        dtype=torch.int32,
+        device=device,
+    )
+
+    jit_out = jit_awq_dequantize(qweight, scales, qzeros)
+    aot_out = aot_awq_dequantize(qweight, scales, qzeros)
+    torch.cuda.synchronize()
+    torch.testing.assert_close(jit_out, aot_out, rtol=0, atol=0)
+    print("Correctness check passed (JIT vs AOT)")
+
+
+if AOT_AVAILABLE:
+    line_vals = ["jit", "aot"]
+    line_names = ["JIT Kernel", "AOT Kernel"]
+    styles = [("blue", "-"), ("green", "-")]
+else:
+    line_vals = ["jit"]
+    line_names = ["JIT Kernel"]
+    styles = [("blue", "-")]
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["qweight_row", "qweight_col"],
+        x_vals=configs,
+        line_arg="provider",
+        line_vals=line_vals,
+        line_names=line_names,
+        styles=styles,
+        ylabel="us",
+        plot_name="awq-dequantize-jit-vs-aot",
+        args={},
+    )
+)
+def benchmark(qweight_row, qweight_col, provider):
+    device = torch.device("cuda")
+    qweight = torch.randint(
+        0,
+        torch.iinfo(torch.int32).max,
+        (qweight_row, qweight_col),
+        dtype=torch.int32,
+        device=device,
+    )
+    group_size = qweight_row
+    scales_row = qweight_row // group_size
+    scales_col = qweight_col * 8
+    scales = torch.rand(scales_row, scales_col, dtype=torch.float16, device=device)
+    qzeros = torch.randint(
+        0,
+        torch.iinfo(torch.int32).max,
+        (scales_row, qweight_col),
+        dtype=torch.int32,
+        device=device,
+    )
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "jit":
+        fn = lambda: jit_awq_dequantize(qweight, scales, qzeros)
+    elif provider == "aot":
+        fn = lambda: aot_awq_dequantize(qweight, scales, qzeros)
+    else:
+        raise ValueError(f"Unknown provider: {provider}")
+
+    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)
+    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+
+if __name__ == "__main__":
+    check_correctness()
+    benchmark.run(print_data=True)
Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
+import os
+
+import numpy as np
+import torch
+import triton
+import triton.testing
+from sgl_kernel.scalar_type import scalar_types
+
+from sglang.jit_kernel.awq_marlin_repack import (
+    awq_marlin_moe_repack as jit_awq_marlin_moe_repack,
+)
+from sglang.srt.layers.quantization.utils import pack_cols, quantize_weights
+
+try:
+    from sgl_kernel import awq_marlin_moe_repack as aot_awq_marlin_moe_repack
+
+    AOT_AVAILABLE = True
+except ImportError:
+    AOT_AVAILABLE = False
+
+IS_CI = (
+    os.getenv("CI", "false").lower() == "true"
+    or os.getenv("GITHUB_ACTIONS", "false").lower() == "true"
+)
+
+# Fixed parameters
+NUM_BITS = 4
+GROUP_SIZE = 128
+SIZE_N = 4096
+
+
+def awq_pack(q_w, num_bits, size_k, size_n):
+    if num_bits == 4:
+        interleave = np.array([0, 2, 4, 6, 1, 3, 5, 7])
+    elif num_bits == 8:
+        interleave = np.array([0, 2, 1, 3])
+    else:
+        raise Exception("num_bits must be 4 or 8, got {}".format(num_bits))
+
+    q_w = q_w.reshape((-1, len(interleave)))[:, interleave].ravel()
+    q_w = q_w.reshape((-1, size_n)).contiguous()
+    return pack_cols(q_w, num_bits, size_k, size_n)
+
+
+def make_moe_weights(num_experts, size_k, size_n, num_bits, group_size):
+    pack_factor = 32 // num_bits
+    b_q_weight = torch.empty(
+        (num_experts, size_k, size_n // pack_factor),
+        dtype=torch.int32,
+        device="cuda",
+    )
+    for e in range(num_experts):
+        b_weight = torch.randn((size_k, size_n), dtype=torch.float16, device="cuda")
+        w_ref, q_w, s, zp = quantize_weights(
+            b_weight, scalar_types.uint4, min(group_size, size_k), zero_points=True
+        )
+        b_q_weight[e] = awq_pack(q_w, num_bits, size_k, size_n)
+    perm = torch.empty((num_experts, 0), dtype=torch.int32, device="cuda")
+    return b_q_weight, perm
+
+
+def check_correctness():
+    if not AOT_AVAILABLE:
+        print("sgl_kernel AOT not available, skipping correctness check")
+        return
+
+    num_experts = 4
+    size_k = 1024
+    b_q_weight, perm = make_moe_weights(
+        num_experts, size_k, SIZE_N, NUM_BITS, GROUP_SIZE
+    )
+
+    out_jit = jit_awq_marlin_moe_repack(b_q_weight, perm, size_k, SIZE_N, NUM_BITS)
+    out_aot = aot_awq_marlin_moe_repack(b_q_weight, perm, size_k, SIZE_N, NUM_BITS)
+    torch.cuda.synchronize()
+    torch.testing.assert_close(out_jit, out_aot, rtol=0, atol=0)
+    print("Correctness check passed (JIT vs AOT)")
+
+
+if IS_CI:
+    expert_range = [2, 4]
+else:
+    expert_range = [2, 4, 8, 16]
+
+if AOT_AVAILABLE:
+    line_vals = ["jit", "aot"]
+    line_names = ["JIT Kernel", "AOT Kernel"]
+    styles = [("blue", "-"), ("green", "-")]
+else:
+    line_vals = ["jit"]
+    line_names = ["JIT Kernel"]
+    styles = [("blue", "-")]
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["num_experts"],
+        x_vals=expert_range,
+        line_arg="provider",
+        line_vals=line_vals,
+        line_names=line_names,
+        styles=styles,
+        ylabel="us",
+        plot_name="awq-marlin-moe-repack-performance",
+        args={"size_k": 4096, "size_n": SIZE_N, "num_bits": NUM_BITS},
+    )
+)
+def benchmark(num_experts, size_k, size_n, num_bits, provider):
+    group_size = min(GROUP_SIZE, size_k)
+    b_q_weight, perm = make_moe_weights(
+        num_experts, size_k, size_n, num_bits, group_size
+    )
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "jit":
+        fn = lambda: jit_awq_marlin_moe_repack(
+            b_q_weight, perm, size_k, size_n, num_bits
+        )
+    elif provider == "aot":
+        fn = lambda: aot_awq_marlin_moe_repack(
+            b_q_weight, perm, size_k, size_n, num_bits
+        )
+    else:
+        raise ValueError(f"Unknown provider: {provider}")
+
+    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)
+    return 1000 * ms, 1000 * max_ms, 1000 * min_ms
+
+
+if __name__ == "__main__":
+    check_correctness()
+    benchmark.run(print_data=True)
