vllm.model_executor.layers.quantization.utils.marlin_utils ¶

marlin_moe_intermediate_size ¶

marlin_moe_intermediate_size(
    w1_packed: Tensor, w2_packed: Tensor
)

Given the Marlin-packed weight matrices w1_packed and w2_packed, return the MoE intermediate size N.

Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py

def marlin_moe_intermediate_size(w1_packed: torch.Tensor, w2_packed: torch.Tensor):
    """Return the MoE intermediate size N for Marlin-packed expert weights.

    The result is the length of dimension 1 of ``w2_packed`` multiplied by
    the Marlin tile size (16).

    Args:
        w1_packed: Marlin-packed w1 weights. Accepted as part of the
            signature but not read by this function.
        w2_packed: Marlin-packed w2 weights; only its dimension 1 is used.

    Returns:
        ``w2_packed.size(1) * 16``.
    """
    tile = 16
    packed_dim = w2_packed.shape[1]
    return packed_dim * tile

moe_packed_to_marlin_zero_points ¶

moe_packed_to_marlin_zero_points(
    q_zp_packed: Tensor,
    size_k: int,
    size_n: int,
    num_bits: int,
    is_a_8bit: bool = False,
)

Convert compressed-tensors packed zero points to Marlin format.

Unlike AWQ, compressed-tensors uses standard bit packing without interleaving, so we just unpack and apply Marlin permutation directly.

Source code in vllm/model_executor/layers/quantization/utils/marlin_utils.py

def moe_packed_to_marlin_zero_points(
    q_zp_packed: torch.Tensor,
    size_k: int,
    size_n: int,
    num_bits: int,
    is_a_8bit: bool = False,
):
    """Convert per-expert compressed-tensors packed zero points to Marlin format.

    compressed-tensors stores zero points with plain bit packing (no AWQ-style
    interleaving), so each expert slice is unpacked with ``unpack_cols`` and
    then handed to ``marlin_zero_points``.

    Args:
        q_zp_packed: Packed zero points, indexed by expert along dimension 0.
        size_k: Forwarded unchanged to ``unpack_cols`` and
            ``marlin_zero_points``.
        size_n: Forwarded unchanged to ``unpack_cols`` and
            ``marlin_zero_points``.
        num_bits: Forwarded unchanged to ``unpack_cols`` and
            ``marlin_zero_points``.
        is_a_8bit: Forwarded unchanged to ``marlin_zero_points``.

    Returns:
        A new tensor with the same device and dtype as ``q_zp_packed`` and a
        shape equal to its first three dimensions, holding the converted zero
        points of every expert.
    """
    expert_count = q_zp_packed.shape[0]
    result_shape = (expert_count, q_zp_packed.shape[1], q_zp_packed.shape[2])
    marlin_zp = torch.empty(
        result_shape,
        device=q_zp_packed.device,
        dtype=q_zp_packed.dtype,
    )

    for expert_idx in range(expert_count):
        unpacked = unpack_cols(q_zp_packed[expert_idx], num_bits, size_k, size_n)
        # NOTE(review): assumes marlin_zero_points returns a tensor that fits
        # the per-expert slice of q_zp_packed -- confirm against the helper.
        marlin_zp[expert_idx] = marlin_zero_points(
            unpacked, size_k, size_n, num_bits, is_a_8bit
        )

    return marlin_zp