vllm.v1.kv_offload.cpu.swap_blocks_triton

Triton kernel + tuned constants for the swap_blocks_batch fast path.

swap_blocks_batch

swap_blocks_batch(
    src_addrs: Tensor,
    dst_addrs: Tensor,
    sizes: Tensor,
    is_src_access_order_any: bool = False,
    *,
    bytes_per_chunk: int,
) -> None

Triton implementation of swap_blocks_batch for small CPU->GPU batches.

Source code in vllm/v1/kv_offload/cpu/swap_blocks_triton.py

def swap_blocks_batch(
    src_addrs: torch.Tensor,
    dst_addrs: torch.Tensor,
    sizes: torch.Tensor,
    is_src_access_order_any: bool = False,
    *,
    bytes_per_chunk: int,
) -> None:
    """Triton implementation of ``swap_blocks_batch`` for small CPU->GPU batches.

    The batch size is ``src_addrs.numel()``. When it is below ``MIN_N`` the
    call is forwarded to ``ops.swap_blocks_batch``; otherwise the three
    descriptor tensors are moved to ``"cuda"`` and ``_swap_blocks_kernel``
    is launched on them.

    Args:
        src_addrs: Source descriptor tensor; its element count is the batch
            size used for both the fallback decision and the launch grid.
        dst_addrs: Destination descriptor tensor.
        sizes: Per-descriptor size tensor.
        is_src_access_order_any: Forwarded to ``ops.swap_blocks_batch`` on
            the fallback path. It is not passed to the Triton kernel.
        bytes_per_chunk: Handed to the kernel as ``BYTES_PER_CHUNK``.
    """
    num_descriptors = src_addrs.numel()

    if num_descriptors >= MIN_N:
        # 1-D launch grid: one program per descriptor, capped at NUM_SMS.
        grid = (min(NUM_SMS, num_descriptors),)
        src_dev = src_addrs.to("cuda", non_blocking=True)
        dst_dev = dst_addrs.to("cuda", non_blocking=True)
        sizes_dev = sizes.to("cuda", non_blocking=True)
        _swap_blocks_kernel[grid](
            src_dev,
            dst_dev,
            sizes_dev,
            num_descriptors,
            BYTES_PER_CHUNK=bytes_per_chunk,
        )
        return

    # Not enough descriptors to amortize Triton's launch cost: use the
    # regular op instead.
    ops.swap_blocks_batch(
        src_addrs,
        dst_addrs,
        sizes,
        is_src_access_order_any=is_src_access_order_any,
    )