swap_blocks_batch(
src_addrs: Tensor,
dst_addrs: Tensor,
sizes: Tensor,
is_src_access_order_any: bool = False,
*,
bytes_per_chunk: int,
) -> None
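Triton implementation of swap_blocks_batch for small CPU->GPU batches. Batches with fewer than MIN_N descriptors are too small to amortize Triton's launch cost and are delegated to ops.swap_blocks_batch instead.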
Source code in vllm/v1/kv_offload/cpu/swap_blocks_triton.py
def swap_blocks_batch(
    src_addrs: torch.Tensor,
    dst_addrs: torch.Tensor,
    sizes: torch.Tensor,
    is_src_access_order_any: bool = False,
    *,
    bytes_per_chunk: int,
) -> None:
    """Triton-backed ``swap_blocks_batch`` for small CPU->GPU batches.

    The batch size is the element count of ``src_addrs``. Batches with fewer
    than ``MIN_N`` descriptors are delegated to ``ops.swap_blocks_batch``;
    every other batch is handled by ``_swap_blocks_kernel``.

    Args:
        src_addrs: One entry per copy descriptor (presumably source
            addresses -- confirm against the kernel).
        dst_addrs: Per-descriptor destinations (presumably addresses,
            parallel to ``src_addrs`` -- confirm against the kernel).
        sizes: Per-descriptor sizes (presumably in bytes -- confirm).
        is_src_access_order_any: Forwarded to ``ops.swap_blocks_batch`` on
            the fallback path only; the Triton launch does not receive it.
        bytes_per_chunk: Handed to the kernel as ``BYTES_PER_CHUNK``; not
            used on the fallback path.
    """
    num_descriptors = src_addrs.numel()

    if num_descriptors >= MIN_N:
        # 1-D launch grid, capped at NUM_SMS programs.
        grid = (min(NUM_SMS, num_descriptors),)
        launch = _swap_blocks_kernel[grid]

        # Move the descriptor tensors to the CUDA device before the launch.
        src_dev = src_addrs.to("cuda", non_blocking=True)
        dst_dev = dst_addrs.to("cuda", non_blocking=True)
        sizes_dev = sizes.to("cuda", non_blocking=True)

        launch(
            src_dev,
            dst_dev,
            sizes_dev,
            num_descriptors,
            BYTES_PER_CHUNK=bytes_per_chunk,
        )
        return

    # Not enough descriptors for Triton's launch cost to pay off.
    ops.swap_blocks_batch(
        src_addrs,
        dst_addrs,
        sizes,
        is_src_access_order_any=is_src_access_order_any,
    )
|