vllm.model_executor.models.minicpmo ¶

Inference-only MiniCPM-O model compatible with HuggingFace weights.

MiniCPMO ¶

Bases: MiniCPMOBaseModel, MiniCPMV2_6

MiniCPM-O model with audio support. Different versions use different LLM backbones: - Version 2.6: Uses Qwen2 - Version 4.5: Uses Qwen3

Source code in vllm/model_executor/models/minicpmo.py

@MULTIMODAL_REGISTRY.register_processor(
    MiniCPMOMultiModalProcessor,
    info=MiniCPMOProcessingInfo,
    dummy_inputs=MiniCPMODummyInputsBuilder,
)
class MiniCPMO(MiniCPMOBaseModel, MiniCPMV2_6):
    """
    MiniCPM-O model with audio support.
    Different versions use different LLM backbones:
    - Version 2.6: Uses Qwen2
    - Version 4.5: Uses Qwen3
    """

    def __new__(cls, *, vllm_config: VllmConfig, prefix: str = ""):
        config = vllm_config.model_config.hf_config

        # Determine version from config
        if hasattr(config, "version"):
            try:
                version_str = str(config.version)
                version_parts = version_str.split(".")
                version = tuple(int(x) for x in version_parts[:2])
            except (ValueError, TypeError) as e:
                raise ValueError(
                    f"Invalid model version format in config: {config.version}. "
                    "Expected a dot-separated version string like '4.5'."
                ) from e
        else:
            # Default to 2.6 for backward compatibility
            version = (2, 6)

        # Dispatch class based on version
        instance_cls = _MINICPMO_SUPPORT_VERSION.get(version)
        if instance_cls is None:
            supported_versions = ", ".join(
                [f"{v[0]}.{v[1]}" for v in sorted(_MINICPMO_SUPPORT_VERSION.keys())]
            )
            raise ValueError(
                f"Currently, MiniCPMO only supports versions "
                f"{supported_versions}. Got version: {version}"
            )

        return instance_cls(vllm_config=vllm_config, prefix=prefix)

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        # This __init__ won't be called due to __new__ returning a different class
        pass

MiniCPMO2_6 ¶

Bases: MiniCPMOBaseModel, MiniCPMV2_6

MiniCPM-O 2.6 model with Qwen2 backbone.

Source code in vllm/model_executor/models/minicpmo.py

class MiniCPMO2_6(MiniCPMOBaseModel, MiniCPMV2_6):
    """MiniCPM-O 2.6 model with Qwen2 backbone."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        with self._mark_tower_model(vllm_config, "audio"):
            self.apm = self.init_audio_module(
                vllm_config=vllm_config, prefix=maybe_prefix(prefix, "apm")
            )

MiniCPMO4_5 ¶

Bases: MiniCPMOBaseModel, MiniCPMV4_5

MiniCPM-O 4.5 model with Qwen3 backbone.

Source code in vllm/model_executor/models/minicpmo.py

class MiniCPMO4_5(MiniCPMOBaseModel, MiniCPMV4_5):
    """MiniCPM-O 4.5 model with Qwen3 backbone."""

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        with self._mark_tower_model(vllm_config, "audio"):
            self.apm = self.init_audio_module(
                vllm_config=vllm_config, prefix=maybe_prefix(prefix, "apm")
            )

MiniCPMOAudioEmbeddingInputs ¶

Bases: TensorSchema

Dimensions

bn: Batch size * number of audios
s: Number of slices
h: Hidden size (must match language model backbone)

Length of each slice may vary, so pass it as a list.

Source code in vllm/model_executor/models/minicpmo.py

class MiniCPMOAudioEmbeddingInputs(TensorSchema):
    """
    Dimensions:
        - bn: Batch size * number of audios
        - s: Number of slices
        - h: Hidden size (must match language model backbone)

    Length of each slice may vary, so pass it as a list.
    """

    type: Literal["audio_embeds"] = "audio_embeds"

    audio_embeds: Annotated[
        torch.Tensor | list[torch.Tensor],
        TensorShape("bn", "s", "h", dynamic_dims={"s"}),
    ]

MiniCPMOAudioFeatureInputs ¶

Bases: TensorSchema

Dimensions

bns: Batch size * number of audios * number of slices
bn: Batch size * number of audios
c: Number of channels
l: Length
s: Number of slices

Source code in vllm/model_executor/models/minicpmo.py

class MiniCPMOAudioFeatureInputs(TensorSchema):
    """
    Dimensions:
        - bns: Batch size * number of audios * number of slices
        - bn: Batch size * number of audios
        - c: Number of channels
        - l: Length
        - s: Number of slices
    """

    type: Literal["audio_features"] = "audio_features"

    audio_features: Annotated[
        torch.Tensor | list[torch.Tensor],
        TensorShape("bns", "c", "l", dynamic_dims={"l"}),
    ]
    """
    Slice here means chunk. Audio that is too long will be split into slices,
    which is the same as image. Padding is used therefore `audio_features` is 
    `torch.Tensor`.
    """

    audio_feature_lens: Annotated[
        torch.Tensor | list[torch.Tensor],
        TensorShape("bn", "s"),
    ]
    """
    This should be feature length of each audio slice, 
    which equals to `audio_features.shape[-1]`
    """

audio_feature_lens `instance-attribute` ¶

audio_feature_lens: Annotated[
    Tensor | list[Tensor], TensorShape(bn, s)
]

This should be feature length of each audio slice, which equals to audio_features.shape[-1]

audio_features `instance-attribute` ¶

audio_features: Annotated[
    Tensor | list[Tensor],
    TensorShape(bns, c, l, dynamic_dims={l}),
]

Slice here means chunk. Audio that is too long will be split into slices, which is the same as image. Padding is used therefore audio_features is torch.Tensor.

MiniCPMOBaseModel ¶

Base mixin class for MiniCPM-O models with audio support.

Source code in vllm/model_executor/models/minicpmo.py

class MiniCPMOBaseModel:
    """Base mixin class for MiniCPM-O models with audio support."""

    packed_modules_mapping = {
        "qkv_proj": [
            "q_proj",
            "k_proj",
            "v_proj",
        ],
        "gate_up_proj": [
            "gate_proj",
            "up_proj",
        ],
    }

    @classmethod
    def get_placeholder_str(cls, modality: str, i: int) -> str | None:
        if modality.startswith("image"):
            return "(<image>./</image>)"
        if modality.startswith("video"):
            return "(<video>./</video>)"
        if modality.startswith("audio"):
            return "(<audio>./</audio>)"

        raise ValueError("Only image, video or audio modality is supported")

    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        super().__init__(vllm_config=vllm_config, prefix=prefix)

        with self._mark_tower_model(vllm_config, "audio"):
            self.apm = self.init_audio_module(
                vllm_config=vllm_config, prefix=maybe_prefix(prefix, "apm")
            )

    def init_audio_module(self, *, vllm_config: VllmConfig, prefix: str = ""):
        # Do not use parameters temporarily
        audio_config = self.config.audio_config
        model = MiniCPMWhisperEncoder(audio_config)
        audio_output_dim = int(audio_config.encoder_ffn_dim // 4)
        self.audio_avg_pooler = nn.AvgPool1d(
            self.config.audio_pool_step, stride=self.config.audio_pool_step
        )
        self.audio_projection_layer = MultiModalProjector(
            in_dim=audio_output_dim, out_dim=self.embed_dim
        )
        self.audio_encoder_layer = -1
        return model

    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        loader = AutoWeightsLoader(self, skip_prefixes=["tts"])
        return loader.load_weights(weights)

    def subsequent_chunk_mask(
        self,
        size: int,
        chunk_size: int,
        num_left_chunks: int = -1,
        device: torch.device = CPU_DEVICE,
        num_lookhead: int = 0,
    ) -> torch.Tensor:
        ret = torch.zeros(size, size, device=device, dtype=torch.bool)
        # Vectorized computation of row indices and chunk boundaries
        row_indices = torch.arange(size, device=device)
        chunk_indices = row_indices // chunk_size
        if num_left_chunks < 0:
            # If num_left_chunks < 0, start is always 0 for all rows
            start_indices = torch.zeros_like(row_indices)
        else:
            # Compute start indices vectorially
            start_chunk_indices = torch.clamp(chunk_indices - num_left_chunks, min=0)
            start_indices = start_chunk_indices * chunk_size
        # Compute ending indices vectorially
        end_chunk_indices = chunk_indices + 1
        end_indices = torch.clamp(
            end_chunk_indices * chunk_size + num_lookhead, max=size
        )
        # Create column indices for broadcasting
        col_indices = torch.arange(size, device=device).unsqueeze(0)
        start_indices = start_indices.unsqueeze(1)
        end_indices = end_indices.unsqueeze(1)
        # Vectorized mask creation
        ret = (col_indices >= start_indices) & (col_indices < end_indices)
        return ret

    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
        input_lengths_after_cnn = (input_lengths - 1) // 2 + 1
        input_lengths_after_pooling = (
            input_lengths_after_cnn - self.config.audio_pool_step
        ) // self.config.audio_pool_step + 1
        input_lengths_after_pooling = input_lengths_after_pooling.to(dtype=torch.int32)

        return input_lengths_after_cnn, input_lengths_after_pooling

    def get_audio_hidden_states(
        self, data: MiniCPMOAudioFeatureInputs
    ) -> list[torch.Tensor]:
        chunk_length = self.config.audio_chunk_length

        # (bs, 80, frames) or [], multi audios need filled in advance
        wavforms_raw = data["audio_features"]
        if isinstance(wavforms_raw, list):
            B = len(wavforms_raw)
            C = wavforms_raw[0].shape[-2]
            L = max(item.shape[-1] for item in wavforms_raw)
            device = wavforms_raw[0].device
            dtype = wavforms_raw[0].dtype

            wavforms = torch.zeros((B, C, L), dtype=dtype, device=device)
            for i, wavforms_item in enumerate(wavforms_raw):
                L_item = wavforms_item.shape[-1]
                wavforms[i, ..., :L_item] = wavforms_item
        else:
            wavforms = wavforms_raw

        # list, [[x1, x2], [y1], [z1]]
        audio_feature_lens_raw = data["audio_feature_lens"]
        if isinstance(audio_feature_lens_raw, torch.Tensor):
            audio_feature_lens_raw = audio_feature_lens_raw.unbind(0)

        audio_feature_lens = torch.hstack(audio_feature_lens_raw)
        batch_size, _, max_mel_seq_len = wavforms.shape
        max_seq_len = (max_mel_seq_len - 1) // 2 + 1

        # Create a sequence tensor of shape (batch_size, max_seq_len)
        seq_range = (
            torch.arange(
                0,
                max_seq_len,
                dtype=audio_feature_lens.dtype,
                device=audio_feature_lens.device,
            )
            .unsqueeze(0)
            .expand(batch_size, max_seq_len)
        )
        lengths_expand = audio_feature_lens.unsqueeze(1).expand(batch_size, max_seq_len)
        # Create mask
        padding_mask = seq_range >= lengths_expand  # 1 for padded values

        audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand(
            batch_size, 1, max_seq_len, max_seq_len
        )
        audio_attention_mask = audio_attention_mask_.to(
            dtype=self.apm.conv1.weight.dtype, device=self.apm.conv1.weight.device
        )

        if chunk_length > 0:
            chunk_num_frame = int(chunk_length * 50)
            chunk_mask = self.subsequent_chunk_mask(
                size=max_seq_len,
                chunk_size=chunk_num_frame,
                num_left_chunks=-1,
                device=audio_attention_mask_.device,
            )
            audio_attention_mask_ = torch.logical_or(
                audio_attention_mask_, torch.logical_not(chunk_mask)
            )

        audio_attention_mask[audio_attention_mask_] = float("-inf")
        audio_states = self.apm(
            wavforms, attention_mask=audio_attention_mask
        ).hidden_states[self.audio_encoder_layer]
        audio_embeds = self.audio_projection_layer(audio_states)

        audio_embeds = audio_embeds.transpose(1, 2)
        audio_embeds = self.audio_avg_pooler(audio_embeds)
        audio_embeds = audio_embeds.transpose(1, 2)

        _, feature_lens_after_pooling = self._get_feat_extract_output_lengths(
            audio_feature_lens
        )

        num_audio_tokens = feature_lens_after_pooling

        final_audio_embeds = list[torch.Tensor]()
        idx = 0
        for i in range(len(audio_feature_lens_raw)):
            target_audio_embeds_lst = list[torch.Tensor]()
            for _ in range(len(audio_feature_lens_raw[i])):
                target_audio_embeds_lst.append(
                    audio_embeds[idx, : num_audio_tokens[idx], :]
                )
                idx += 1

            final_audio_embeds.append(torch.cat(target_audio_embeds_lst))

        return final_audio_embeds

    def _parse_and_validate_audio_input(
        self, **kwargs: object
    ) -> MiniCPMOAudioInputs | None:
        audio_features = kwargs.pop("audio_features", None)
        audio_embeds = kwargs.pop("audio_embeds", None)

        if audio_features is None and audio_embeds is None:
            return None

        if audio_embeds is not None:
            return MiniCPMOAudioEmbeddingInputs(
                type="audio_embeds",
                audio_embeds=audio_embeds,
            )

        audio_feature_lens = kwargs.pop("audio_feature_lens")

        return MiniCPMOAudioFeatureInputs(
            type="audio_features",
            audio_features=audio_features,
            audio_feature_lens=audio_feature_lens,
        )

    def _parse_and_validate_multimodal_inputs(self, **kwargs: object) -> dict:
        modalities = super()._parse_and_validate_multimodal_inputs(**kwargs)

        # Preserve the order of modalities if there are multiple of them
        # from the order of kwargs.
        for input_key in kwargs:
            if (
                input_key in ("audio_features", "audio_embeds")
                and "audios" not in modalities
            ):
                modalities["audios"] = self._parse_and_validate_audio_input(**kwargs)

        return modalities

    def _process_audio_input(
        self,
        audio_input: MiniCPMOAudioInputs,
    ) -> torch.Tensor | list[torch.Tensor]:
        if audio_input["type"] == "audio_embeds":
            return audio_input["audio_embeds"]

        return self.get_audio_hidden_states(audio_input)

    def _process_multimodal_inputs(self, modalities: dict):
        multimodal_embeddings = super()._process_multimodal_inputs(modalities)

        for modality in modalities:
            if modality == "audios":
                audio_input = modalities["audios"]
                audio_embeddings = self._process_audio_input(audio_input)
                multimodal_embeddings += tuple(audio_embeddings)

        return multimodal_embeddings

MiniCPMOProcessingInfo ¶

Bases: MiniCPMVProcessingInfo

Source code in vllm/model_executor/models/minicpmo.py

class MiniCPMOProcessingInfo(MiniCPMVProcessingInfo):
    audio_pattern = "(<audio>./</audio>)"

    def get_hf_processor(self, **kwargs: object) -> "MiniCPMOProcessor":
        """Get vendored MiniCPMOProcessor for multimodal (image+audio) inputs.

        Creates a vendored processor that reuses the HF image processor,
        feature extractor, and tokenizer; applies the correct audio pooling
        configuration; and converts numpy arrays in the image processor to
        lists for serialization compatibility. The returned processor is
        compatible with Transformers v5.
        """
        import numpy as np

        hf_processor = self.ctx.get_hf_processor(**kwargs)

        from vllm.transformers_utils.processors.minicpmo import MiniCPMOProcessor

        # Create vendored processor with correct configuration
        vendored_processor = MiniCPMOProcessor(
            image_processor=hf_processor.image_processor,
            feature_extractor=hf_processor.feature_extractor,
            tokenizer=hf_processor.tokenizer,
            pool_step=self.get_default_audio_pool_step(),
        )

        # Convert numpy arrays in image processor to lists for serialization
        image_processor = vendored_processor.image_processor
        for attr in ("mean", "std"):
            val = getattr(image_processor, attr, None)
            if val is not None and isinstance(val, np.ndarray):
                setattr(image_processor, attr, val.tolist())

        return vendored_processor

    def get_data_parser(self):
        return MiniCPMOMultiModalDataParser(
            target_sr=self.get_default_audio_sampling_rate(),
            expected_hidden_size=self._get_expected_hidden_size(),
        )

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        return {**super().get_supported_mm_limits(), "audio": None}

    def get_audio_placeholder(
        self,
        audio_lens: int,
        chunk_input: bool = True,
        chunk_length: int = 1,
    ) -> str:
        hf_processor = self.get_hf_processor()

        return hf_processor.get_audio_placeholder(
            audio_lens,
            chunk_input=chunk_input,
            chunk_length=chunk_length,
        )

    def get_default_audio_pool_step(self) -> int:
        hf_config = self.get_hf_config()
        # MiniCPM-o 4.5 uses pool_step=5, older versions use 2
        return getattr(hf_config, "audio_pool_step", 2)

    def get_default_audio_sampling_rate(self) -> int:
        return 16000

    def get_chunk_length(self) -> int:
        return self.get_hf_config().audio_chunk_length

    def get_max_audio_tokens_per_chunk(self) -> int:
        pool_step = self.get_default_audio_pool_step()
        fbank_feat_in_chunk = 100
        cnn_feat_in_chunk = (fbank_feat_in_chunk - 1) // 2 + 1
        return (cnn_feat_in_chunk - pool_step) // pool_step + 1

    def get_max_audio_chunks_with_most_features(self) -> int:
        return 30

    def get_max_audio_tokens(self) -> int:
        num_chunks = self.get_max_audio_chunks_with_most_features()
        return self.get_max_audio_tokens_per_chunk() * num_chunks

    def get_audio_len_by_num_chunks(self, num_chunks: int) -> int:
        sampling_rate = self.get_default_audio_sampling_rate()
        num_tokens_per_chunk = self.get_max_audio_tokens_per_chunk()
        return int(num_chunks * sampling_rate / num_tokens_per_chunk) + 1

    def get_num_frames_with_most_features(
        self,
        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> int:
        max_images = mm_counts.get("image", 0)
        max_videos = mm_counts.get("video", 0)
        max_audios = mm_counts.get("audio", 0)

        max_image_tokens = self.get_max_image_tokens() * max_images
        max_audio_tokens = self.get_max_audio_tokens() * max_audios
        max_total_frames = self.get_max_video_frames(
            seq_len - max_image_tokens - max_audio_tokens
        )
        max_frames_per_video = min(
            max_total_frames // max(max_videos, 1), _MAX_FRAMES_PER_VIDEO
        )

        return max(max_frames_per_video, 1)

get_hf_processor ¶

get_hf_processor(**kwargs: object) -> MiniCPMOProcessor

Get vendored MiniCPMOProcessor for multimodal (image+audio) inputs.

Creates a vendored processor that reuses the HF image processor, feature extractor, and tokenizer; applies the correct audio pooling configuration; and converts numpy arrays in the image processor to lists for serialization compatibility. The returned processor is compatible with Transformers v5.

Source code in vllm/model_executor/models/minicpmo.py

def get_hf_processor(self, **kwargs: object) -> "MiniCPMOProcessor":
    """Get vendored MiniCPMOProcessor for multimodal (image+audio) inputs.

    Creates a vendored processor that reuses the HF image processor,
    feature extractor, and tokenizer; applies the correct audio pooling
    configuration; and converts numpy arrays in the image processor to
    lists for serialization compatibility. The returned processor is
    compatible with Transformers v5.
    """
    import numpy as np

    hf_processor = self.ctx.get_hf_processor(**kwargs)

    from vllm.transformers_utils.processors.minicpmo import MiniCPMOProcessor

    # Create vendored processor with correct configuration
    vendored_processor = MiniCPMOProcessor(
        image_processor=hf_processor.image_processor,
        feature_extractor=hf_processor.feature_extractor,
        tokenizer=hf_processor.tokenizer,
        pool_step=self.get_default_audio_pool_step(),
    )

    # Convert numpy arrays in image processor to lists for serialization
    image_processor = vendored_processor.image_processor
    for attr in ("mean", "std"):
        val = getattr(image_processor, attr, None)
        if val is not None and isinstance(val, np.ndarray):
            setattr(image_processor, attr, val.tolist())

    return vendored_processor

vllm.model_executor.models.minicpmo ¶

MiniCPMO ¶

MiniCPMO2_6 ¶

MiniCPMO4_5 ¶

MiniCPMOAudioEmbeddingInputs ¶

MiniCPMOAudioFeatureInputs ¶

audio_feature_lens instance-attribute ¶

audio_features instance-attribute ¶

MiniCPMOBaseModel ¶

MiniCPMOProcessingInfo ¶

get_hf_processor ¶

audio_feature_lens `instance-attribute` ¶

audio_features `instance-attribute` ¶