9 changes: 5 additions & 4 deletions .buildkite/test-amd.yaml
@@ -949,7 +949,7 @@ steps:
- pytest -v -s models/multimodal/processing

- label: Multi-Modal Models Test (Standard) # 60min
timeout_in_minutes: 80
timeout_in_minutes: 100
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -976,7 +976,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing

- label: Multi-Modal Models Test (Extended) 2
- label: Multi-Modal Models Test (Extended) 2 #60min
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -1450,8 +1451,8 @@ steps:
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1

- label: Multi-Modal Accuracy Eval (Small Models) # 10min
timeout_in_minutes: 70
- label: Multi-Modal Accuracy Eval (Small Models) # 160min
timeout_in_minutes: 240
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM tests."""
"""Pytest configuration for vLLM multimodal tests."""

import warnings

@@ -9,16 +9,13 @@
from vllm.platforms import current_platform


def pytest_configure(config):
"""Disable Flash/MemEfficient SDP on ROCm to avoid HF
Transformers accuracy issues.
"""
def pytest_collection_modifyitems(config, items):
"""Configure ROCm-specific settings based on collected tests."""
if not current_platform.is_rocm():
return

skip_patterns = ["test_granite_speech.py"]
if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
# Skip disabling SDP for Granite Speech tests on ROCm
return

# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
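The rest of this hunk is truncated above. For context, the hook's job (per the removed docstring) is to disable the Flash and memory-efficient SDP kernels on ROCm. Below is a minimal sketch of such a collection hook using PyTorch's public toggles; it is an illustration under stated assumptions, not the PR's exact code, and the `skip_patterns` filtering over collected items is inferred from the removed lines.

```python
import torch

from vllm.platforms import current_platform


def pytest_collection_modifyitems(config, items):
    """Configure ROCm-specific settings based on collected tests (sketch)."""
    if not current_platform.is_rocm():
        return

    # Keep default SDP behavior for Granite Speech tests on ROCm.
    skip_patterns = ["test_granite_speech.py"]
    if any(pattern in item.nodeid for item in items for pattern in skip_patterns):
        return

    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
    # accuracy issues; the math kernel remains available as a fallback.
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)
```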
25 changes: 23 additions & 2 deletions tests/models/multimodal/generation/test_common.py
@@ -253,8 +253,19 @@
image_size_factors=[(0.25, 0.2, 0.15)],
vllm_runner_kwargs={
"model_impl": "transformers",
# TODO: [ROCm] Revert this once issue #30167 is resolved
**(
{
"mm_processor_kwargs": {
"min_pixels": 256 * 28 * 28,
"max_pixels": 1280 * 28 * 28,
},
}
if current_platform.is_rocm()
else {}
),
},
marks=[large_gpu_mark(min_gb=32)],
marks=[large_gpu_mark(min_gb=80 if current_platform.is_rocm() else 32)],
),
#### Extended model tests
"aria": VLMTestInfo(
@@ -645,7 +656,17 @@
hf_output_post_proc=model_utils.minimax_vl_01_hf_output,
patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner,
auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=80)],
marks=[
large_gpu_mark(min_gb=80),
# TODO: [ROCm] Fix pickle issue with ROCm spawn and tp>1
pytest.mark.skipif(
current_platform.is_rocm(),
reason=(
"ROCm: Model too large for single GPU; "
"multi-GPU blocked by HF _LazyConfigMapping pickle issue with spawn"
),
),
],
),
"molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"],
24 changes: 0 additions & 24 deletions tests/models/multimodal/pooling/conftest.py

This file was deleted.

14 changes: 11 additions & 3 deletions tests/models/multimodal/pooling/test_clip.py
@@ -4,6 +4,8 @@
import pytest
from transformers import CLIPModel

from vllm.platforms import current_platform

from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ...utils import check_embeddings_close

@@ -70,7 +72,9 @@ def _run_test(


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
"dtype", ["float"] if not current_platform.is_rocm() else ["float16"]
)
def test_models_text(
hf_runner,
vllm_runner,
@@ -93,7 +97,9 @@ def test_models_text(


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
"dtype", ["float"] if not current_platform.is_rocm() else ["float16"]
)
def test_models_image(
hf_runner,
vllm_runner,
@@ -118,7 +124,9 @@


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
"dtype", ["float"] if not current_platform.is_rocm() else ["float16"]
)
def test_models_text_image_no_crash(
vllm_runner,
image_assets,
14 changes: 11 additions & 3 deletions tests/models/multimodal/pooling/test_siglip.py
@@ -6,6 +6,8 @@
import pytest
from transformers import SiglipModel

from vllm.platforms import current_platform

from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ...utils import check_embeddings_close

@@ -85,7 +87,9 @@ def _run_test(


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
"dtype", ["float"] if not current_platform.is_rocm() else ["float16"]
)
def test_models_text(
hf_runner,
vllm_runner,
@@ -112,7 +116,9 @@


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
"dtype", ["float"] if not current_platform.is_rocm() else ["float16"]
)
def test_models_image(
hf_runner,
vllm_runner,
@@ -137,7 +143,9 @@


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
"dtype", ["float"] if not current_platform.is_rocm() else ["float16"]
)
def test_models_text_image_no_crash(
vllm_runner,
image_assets,
23 changes: 23 additions & 0 deletions vllm/model_executor/models/qwen3_vl.py
@@ -24,6 +24,7 @@
# limitations under the License.
"""Inference-only Qwen3VL model compatible with HuggingFace weights."""

import os
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
from functools import lru_cache, partial
from itertools import islice
@@ -370,6 +371,28 @@ def __init__(
]
)

if attn_backend_override is None:
from vllm.platforms import current_platform

if current_platform.is_rocm():
# [12/04/2025] Qwen3-VL is only supported with ROCM_AITER_FA
# attention backend for ROCm for now.
# TODO: Add support for other backends [FLASH_ATTN, TORCH_SDPA]
# and test accuracy on ROCm with `Multi-Modal Models Test (Standard)`
# group.
from vllm._aiter_ops import IS_AITER_FOUND
from vllm.platforms.rocm import on_gfx9

if on_gfx9() and IS_AITER_FOUND:
attn_backend_override = AttentionBackendEnum.ROCM_AITER_FA
# Also set the env variable so the ROCm platform picks up ROCM_AITER_FA
# as `selected_backend` in the attention backend getter.
os.environ["VLLM_ATTENTION_BACKEND"] = "ROCM_AITER_FA"
Contributor

critical

Using os.environ to control behavior between different parts of the application introduces global state, which is a potential source of bugs. In a concurrent environment, such as a server handling multiple model-loading requests simultaneously, this can lead to race conditions where one request unintentionally affects another. A safer approach would be to pass this configuration through the VllmConfig or a similar mechanism that avoids global state. For instance, you could add an attn_backend_override field to ModelConfig that can be set by model-specific code and then read by the attention backend selection logic.

Contributor Author

There is no concurrency that involves this part of the code. This behavior is ROCm-specific and is critical for this model for now. Nonetheless, we are going to add the necessary support for the rest of the ROCm-specific attention backends and make this mini-patch obsolete in the future.

logger.info(
"Overriding with ROCM_AITER_FA attention "
"backend for Qwen3-VL model."
)

self.attn_backend = get_vit_attn_backend(
head_size=head_dim,
dtype=torch.get_default_dtype(),
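Regarding the review thread above on setting VLLM_ATTENTION_BACKEND via os.environ: below is a hedged sketch of the config-based alternative the reviewer describes. The attn_backend_override field on a ModelConfig-like object is a hypothetical illustration, not an existing vLLM attribute.

```python
from dataclasses import dataclass
from typing import Optional

from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.platforms import current_platform


@dataclass
class ModelConfigSketch:
    """Illustrative stand-in for vllm.config.ModelConfig with a new field."""

    attn_backend_override: Optional[AttentionBackendEnum] = None


def apply_qwen3_vl_rocm_override(model_config: ModelConfigSketch) -> None:
    """Model-specific code records the override on the config object."""
    if current_platform.is_rocm():
        model_config.attn_backend_override = AttentionBackendEnum.ROCM_AITER_FA


def resolve_selected_backend(
    model_config: ModelConfigSketch,
    selected_backend: Optional[AttentionBackendEnum] = None,
) -> Optional[AttentionBackendEnum]:
    """Backend selection reads the override from config, not from os.environ."""
    if selected_backend is None and model_config.attn_backend_override is not None:
        selected_backend = model_config.attn_backend_override
    return selected_backend
```

This keeps the override scoped to the engine's configuration object rather than process-wide environment state.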
5 changes: 4 additions & 1 deletion vllm/model_executor/models/siglip2navit.py
@@ -190,7 +190,10 @@ def apply_rotary_pos_emb(
) -> tuple[torch.Tensor, torch.Tensor]:
cos = cos.chunk(2, dim=-1)[0].contiguous()
sin = sin.chunk(2, dim=-1)[0].contiguous()
if is_flash_attn_backend and not current_platform.is_xpu():
if is_flash_attn_backend and not (
current_platform.is_xpu() or current_platform.is_rocm()
):
# TODO: [ROCm] Use AITER flash attention's rotary embedding
from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb

apply_rotary_emb_func = apply_rotary_emb
32 changes: 31 additions & 1 deletion vllm/model_executor/models/transformers/multimodal.py
@@ -22,6 +22,7 @@
import torch

from vllm.config.utils import getattr_iter
from vllm.logger import init_logger
from vllm.model_executor.models.interfaces import SupportsMRoPE, SupportsMultiModal
from vllm.model_executor.models.utils import WeightsMapper
from vllm.multimodal import MultiModalKwargsItems
@@ -36,6 +37,7 @@
from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
from vllm.multimodal.processing import BaseMultiModalProcessor, BaseProcessingInfo
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors

if TYPE_CHECKING:
@@ -52,6 +54,8 @@
"inputs_embeds": 0,
}

logger = init_logger(__name__)


class MultiModalProcessingInfo(BaseProcessingInfo):
def get_supported_mm_limits(self):
@@ -345,8 +349,29 @@ def embed_multimodal(self, **kwargs):

num_image_patches = kwargs.pop("num_image_patches")
kwargs.pop("token_type_ids", None) # used only in `forward`

if pixel_values is not None:
vision_embeddings = self.model.get_image_features(pixel_values, **kwargs)
# ROCm: Force math SDP backend for vision encoder to avoid accuracy issues
# with flash_sdp and mem_efficient_sdp
if current_platform.is_rocm():
# TODO: [ROCm] Fix accuracy issues with flash backend
logger.debug(
"ROCm platform detected. Forcing math SDP backend "
"for vision encoder. Currently ROCm platform has "
"accuracy issues with `flash_sdp` and"
"`mem_efficient_sdp` backends. See issue: "
"https://github.com/vllm-project/vllm/issues/30167"
)
with torch.nn.attention.sdpa_kernel(
backends=[torch.nn.attention.SDPBackend.MATH]
):
vision_embeddings = self.model.get_image_features(
pixel_values, **kwargs
)
else:
vision_embeddings = self.model.get_image_features(
pixel_values, **kwargs
)

if isinstance(vision_embeddings, torch.Tensor):
if vision_embeddings.ndim == 2:
Expand All @@ -364,6 +389,11 @@ def embed_multimodal(self, **kwargs):
]

return vision_embeddings
else:
logger.debug(
"No pixel values or image embeddings provided for multimodal embedding."
)
return None

def get_mrope_input_positions(
self,
28 changes: 23 additions & 5 deletions vllm/platforms/rocm.py
@@ -8,6 +8,7 @@
import torch

import vllm.envs as envs
from vllm.attention.backends.abstract import AttentionType
from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.logger import init_logger
from vllm.utils.torch_utils import cuda_device_count_stateless
@@ -200,7 +201,11 @@ def get_vit_attn_backend(
# TODO: Add support for other VL models in their model class.
return AttentionBackendEnum.ROCM_AITER_FA

if on_gfx9() and find_spec("flash_attn") is not None:
if (
on_gfx9()
and find_spec("flash_attn") is not None
and (dtype == torch.float16 or dtype == torch.bfloat16)
):
return AttentionBackendEnum.FLASH_ATTN

return AttentionBackendEnum.TORCH_SDPA
@@ -241,26 +246,34 @@ def get_attn_backend_cls(
)
if selected_backend == AttentionBackendEnum.TRITON_MLA:
if block_size != 1:
logger.info_once("Using Triton MLA backend.")
logger.info_once("Using Triton MLA backend on V1 engine.")
return AttentionBackendEnum.TRITON_MLA.get_path()
raise ValueError(
f" The selected backend, {selected_backend.name},"
f"does not support block size {block_size}."
)
if selected_backend == AttentionBackendEnum.ROCM_AITER_MLA:
logger.info("Using AITER MLA backend.")
logger.info("Using AITER MLA backend on V1 engine.")
return AttentionBackendEnum.ROCM_AITER_MLA.get_path()
if selected_backend == AttentionBackendEnum.ROCM_AITER_TRITON_MLA:
logger.info("Using AITER TRITON MLA backend.")
logger.info("Using AITER TRITON MLA backend on V1 engine.")
return AttentionBackendEnum.ROCM_AITER_TRITON_MLA.get_path()

raise ValueError(
f" The selected backend, {selected_backend.name},"
f"is not MLA type while requested for MLA backend."
)

attn_backend_override = os.environ.get("VLLM_ATTENTION_BACKEND")
if selected_backend is None and attn_backend_override is not None:
logger.info(
"Detected VLLM_ATTENTION_BACKEND=%s (set by model architecture).",
attn_backend_override,
)
selected_backend = AttentionBackendEnum[attn_backend_override]

if selected_backend == AttentionBackendEnum.FLEX_ATTENTION:
logger.info("Using FlexAttention backend.")
logger.info("Using FlexAttention backend on V1 engine.")
return AttentionBackendEnum.FLEX_ATTENTION.get_path()

if selected_backend == AttentionBackendEnum.TRITON_ATTN:
Expand Down Expand Up @@ -313,6 +326,11 @@ def get_attn_backend_cls(
logger.info("Using Aiter Flash Attention backend on V1 engine.")
return AttentionBackendEnum.ROCM_AITER_FA.get_path()

# Priority 5: If model is Encoder-only self-attention type
if attn_type is not None and attn_type in (AttentionType.ENCODER_ONLY):
logger.info("Using FlexAttention backend on V1 engine.")
return AttentionBackendEnum.FLEX_ATTENTION.get_path()
Comment on lines 329 to 332

P1: Encoder-only fallback condition raises at runtime

The new FlexAttention fallback uses attn_type in (AttentionType.ENCODER_ONLY) without a trailing comma, so the right-hand side is a single AttentionType value rather than an iterable. As soon as attn_type is provided (e.g., for encoder-only models such as CLIP/SigLIP on ROCm), evaluating this condition raises TypeError: argument of type 'AttentionType' is not iterable, preventing backend selection and crashing initialization instead of falling back to FlexAttention.

Contributor Author

Comment addressed in 9c9f225


# Default: Triton Unified Attention
logger.info("Using Triton Attention backend on V1 engine.")
return AttentionBackendEnum.TRITON_ATTN.get_path()
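On the encoder-only fallback flagged above: `(AttentionType.ENCODER_ONLY)` is just the enum member in parentheses, not a tuple, so the `in` test tries to iterate a single member. A minimal illustration of the failure and the corrected membership test follows, using a stand-in enum; this is not necessarily the exact change made in 9c9f225.

```python
from enum import Enum


class AttentionTypeSketch(Enum):
    """Stand-in for vLLM's AttentionType, assumed here to be an Enum."""

    DECODER = "decoder"
    ENCODER_ONLY = "encoder_only"


attn_type = AttentionTypeSketch.ENCODER_ONLY

# Buggy form: the right-hand side is a single enum member, so `in` raises
# TypeError ("argument of type ... is not iterable").
try:
    attn_type in (AttentionTypeSketch.ENCODER_ONLY)
except TypeError as exc:
    print(f"TypeError: {exc}")

# Correct forms: a one-element tuple (note the trailing comma) or equality.
assert attn_type in (AttentionTypeSketch.ENCODER_ONLY,)
assert attn_type == AttentionTypeSketch.ENCODER_ONLY
```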