9 changes: 5 additions & 4 deletions .buildkite/test-amd.yaml
@@ -949,7 +949,7 @@ steps:
- pytest -v -s models/multimodal/processing

- label: Multi-Modal Models Test (Standard) # 60min
timeout_in_minutes: 80
timeout_in_minutes: 100
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -976,7 +976,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing

- label: Multi-Modal Models Test (Extended) 2
- label: Multi-Modal Models Test (Extended) 2 #60min
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental]
agent_pool: mi325_1
# grade: Blocking
@@ -1450,8 +1451,8 @@ steps:
commands:
- pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1

- label: Multi-Modal Accuracy Eval (Small Models) # 10min
timeout_in_minutes: 70
- label: Multi-Modal Accuracy Eval (Small Models) # 160min
timeout_in_minutes: 240
mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1
# grade: Blocking
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM tests."""
"""Pytest configuration for vLLM multimodal tests."""

import warnings

@@ -9,16 +9,13 @@
from vllm.platforms import current_platform


def pytest_configure(config):
"""Disable Flash/MemEfficient SDP on ROCm to avoid HF
Transformers accuracy issues.
"""
def pytest_collection_modifyitems(config, items):
"""Configure ROCm-specific settings based on collected tests."""
if not current_platform.is_rocm():
return

skip_patterns = ["test_granite_speech.py"]
if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
# Skip disabling SDP for Granite Speech tests on ROCm
return

# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
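The hunk above is collapsed before the body of the hook is shown; the following is a minimal sketch of how such a conftest hook is commonly completed, assuming PyTorch's global `torch.backends.cuda` SDP toggles rather than the PR's actual (not shown) implementation:

```python
# Hypothetical completion of the collapsed conftest hook above. The PR's real
# body is not visible in this diff, so the SDP toggles below are an assumption
# based on the surrounding comments, not the actual change.
import torch

from vllm.platforms import current_platform


def pytest_collection_modifyitems(config, items):
    """Configure ROCm-specific settings based on collected tests."""
    if not current_platform.is_rocm():
        return

    skip_patterns = ["test_granite_speech.py"]
    if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
        # Keep the default SDP backends for Granite Speech tests on ROCm.
        return

    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers accuracy
    # issues, leaving only the math backend enabled.
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)
```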
25 changes: 23 additions & 2 deletions tests/models/multimodal/generation/test_common.py
@@ -253,8 +253,19 @@
image_size_factors=[(0.25, 0.2, 0.15)],
vllm_runner_kwargs={
"model_impl": "transformers",
# TODO: [ROCm] Revert this once issue #30167 is resolved
**(
{
"mm_processor_kwargs": {
"min_pixels": 256 * 28 * 28,
"max_pixels": 1280 * 28 * 28,
},
}
if current_platform.is_rocm()
else {}
),
},
marks=[large_gpu_mark(min_gb=32)],
marks=[large_gpu_mark(min_gb=80 if current_platform.is_rocm() else 32)],
),
#### Extended model tests
"aria": VLMTestInfo(
@@ -645,7 +656,17 @@
hf_output_post_proc=model_utils.minimax_vl_01_hf_output,
patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner,
auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=80)],
marks=[
large_gpu_mark(min_gb=80),
# TODO: [ROCm] Fix pickle issue with ROCm spawn and tp>1
pytest.mark.skipif(
current_platform.is_rocm(),
reason=(
"ROCm: Model too large for single GPU; "
"multi-GPU blocked by HF _LazyConfigMapping pickle issue with spawn"
),
),
],
),
"molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"],
24 changes: 0 additions & 24 deletions tests/models/multimodal/pooling/conftest.py

This file was deleted.

14 changes: 11 additions & 3 deletions tests/models/multimodal/pooling/test_clip.py
@@ -4,6 +4,8 @@
import pytest
from transformers import CLIPModel

from vllm.platforms import current_platform

from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ...utils import check_embeddings_close

@@ -70,7 +72,9 @@ def _run_test(


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
"dtype", ["float"] if not current_platform.is_rocm() else ["float16"]
)
def test_models_text(
hf_runner,
vllm_runner,
@@ -93,7 +97,9 @@ def test_models_text(


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
"dtype", ["float"] if not current_platform.is_rocm() else ["float16"]
)
def test_models_image(
hf_runner,
vllm_runner,
@@ -118,7 +124,9 @@ def test_models_image(


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
"dtype", ["float"] if not current_platform.is_rocm() else ["float16"]
)
def test_models_text_image_no_crash(
vllm_runner,
image_assets,
14 changes: 11 additions & 3 deletions tests/models/multimodal/pooling/test_siglip.py
@@ -6,6 +6,8 @@
import pytest
from transformers import SiglipModel

from vllm.platforms import current_platform

from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ...utils import check_embeddings_close

@@ -85,7 +87,9 @@ def _run_test(


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
"dtype", ["float"] if not current_platform.is_rocm() else ["float16"]
)
def test_models_text(
hf_runner,
vllm_runner,
@@ -112,7 +116,9 @@ def test_models_text(


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
"dtype", ["float"] if not current_platform.is_rocm() else ["float16"]
)
def test_models_image(
hf_runner,
vllm_runner,
@@ -137,7 +143,9 @@ def test_models_image(


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
"dtype", ["float"] if not current_platform.is_rocm() else ["float16"]
)
def test_models_text_image_no_crash(
vllm_runner,
image_assets,
23 changes: 23 additions & 0 deletions vllm/model_executor/models/qwen3_vl.py
@@ -24,6 +24,7 @@
# limitations under the License.
"""Inference-only Qwen3VL model compatible with HuggingFace weights."""

import os
from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence
from functools import lru_cache, partial
from itertools import islice
@@ -370,6 +371,28 @@ def __init__(
]
)

if attn_backend_override is None:
from vllm.platforms import current_platform

if current_platform.is_rocm():
# [12/04/2025] Qwen3-VL is only supported with ROCM_AITER_FA
# attention backend for ROCm for now.
# TODO: Add support for other backends [FLASH_ATTN, TORCH_SDPA]
# and test accuracy on ROCm with `Multi-Modal Models Test (Standard)`
# group.
from vllm._aiter_ops import IS_AITER_FOUND
from vllm.platforms.rocm import on_gfx9

if on_gfx9() and IS_AITER_FOUND:
attn_backend_override = AttentionBackendEnum.ROCM_AITER_FA
# Also set the env variable so the ROCm platform picks ROCM_AITER_FA
# as `selected_backend` in the attention backend getter.
os.environ["VLLM_ATTENTION_BACKEND"] = "ROCM_AITER_FA"

Review comment (Contributor, critical):

Using `os.environ` to control behavior between different parts of the application introduces global state, which is a potential source of bugs. In a concurrent environment, such as a server handling multiple model-loading requests simultaneously, this can lead to race conditions where one request unintentionally affects another. A safer approach would be to pass this configuration through the `VllmConfig` or a similar mechanism that avoids global state. For instance, you could add an `attn_backend_override` to `ModelConfig` which can be set by model-specific code and then read by the attention backend selection logic.

Reply (Contributor Author):

There is no concurrency that involves this part of the code. This behavior is ROCm-specific and is critical for this model for now. Nonetheless, we are going to add the necessary support for the rest of the ROCm-specific attention backends and make this mini-patch obsolete in the future.
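A minimal sketch of the config-based alternative described in the review comment above, using stand-in types; the `attn_backend_override` field and its wiring are assumptions for illustration, not code from this PR:

```python
# Hypothetical sketch of the reviewer's suggestion: thread the backend choice
# through the model config instead of os.environ. The field name
# `attn_backend_override` and the stand-in classes are assumptions.
from dataclasses import dataclass
from enum import Enum, auto


class AttentionBackendEnum(Enum):  # stand-in for vLLM's real enum
    FLASH_ATTN = auto()
    TORCH_SDPA = auto()
    ROCM_AITER_FA = auto()


@dataclass
class ModelConfig:  # stand-in; the real ModelConfig has many more fields
    attn_backend_override: AttentionBackendEnum | None = None


def choose_vit_attn_backend(model_config: ModelConfig) -> AttentionBackendEnum:
    # Model-specific code sets the override on its own config object, so the
    # choice never leaks to other models through global process state.
    if model_config.attn_backend_override is not None:
        return model_config.attn_backend_override
    return AttentionBackendEnum.FLASH_ATTN


# e.g. in the Qwen3-VL constructor on a supported ROCm GPU:
config = ModelConfig(attn_backend_override=AttentionBackendEnum.ROCM_AITER_FA)
assert choose_vit_attn_backend(config) is AttentionBackendEnum.ROCM_AITER_FA
```

The point of this design is that the override lives on the per-model config object, so nothing outside that model's load path can observe it or race on it.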

logger.info(
"Overriding with ROCM_AITER_FA attention "
"backend for Qwen3-VL model."
)

self.attn_backend = get_vit_attn_backend(
head_size=head_dim,
dtype=torch.get_default_dtype(),
5 changes: 4 additions & 1 deletion vllm/model_executor/models/siglip2navit.py
@@ -190,7 +190,10 @@ def apply_rotary_pos_emb(
) -> tuple[torch.Tensor, torch.Tensor]:
cos = cos.chunk(2, dim=-1)[0].contiguous()
sin = sin.chunk(2, dim=-1)[0].contiguous()
if is_flash_attn_backend and not current_platform.is_xpu():
if is_flash_attn_backend and not (
current_platform.is_xpu() or current_platform.is_rocm()
):
# TODO: [ROCm] Use AITER flash attention's rotary embedding
from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb

apply_rotary_emb_func = apply_rotary_emb
32 changes: 31 additions & 1 deletion vllm/model_executor/models/transformers/multimodal.py
@@ -22,6 +22,7 @@
import torch

from vllm.config.utils import getattr_iter
from vllm.logger import init_logger
from vllm.model_executor.models.interfaces import SupportsMRoPE, SupportsMultiModal
from vllm.model_executor.models.utils import WeightsMapper
from vllm.multimodal import MultiModalKwargsItems
@@ -36,6 +37,7 @@
from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
from vllm.multimodal.processing import BaseMultiModalProcessor, BaseProcessingInfo
from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors

if TYPE_CHECKING:
@@ -52,6 +54,8 @@
"inputs_embeds": 0,
}

logger = init_logger(__name__)


class MultiModalProcessingInfo(BaseProcessingInfo):
def get_supported_mm_limits(self):
@@ -345,8 +349,29 @@ def embed_multimodal(self, **kwargs):

num_image_patches = kwargs.pop("num_image_patches")
kwargs.pop("token_type_ids", None) # used only in `forward`

if pixel_values is not None:
vision_embeddings = self.model.get_image_features(pixel_values, **kwargs)
# ROCm: Force math SDP backend for vision encoder to avoid accuracy issues
# with flash_sdp and mem_efficient_sdp
if current_platform.is_rocm():
# TODO: [ROCm] Fix accuracy issues with flash backend
logger.debug(
"ROCm platform detected. Forcing math SDP backend "
"for vision encoder. Currently ROCm platform has "
"accuracy issues with `flash_sdp` and"
"`mem_efficient_sdp` backends. See issue: "
"https://github.com/vllm-project/vllm/issues/30167"
)
with torch.nn.attention.sdpa_kernel(
backends=[torch.nn.attention.SDPBackend.MATH]
):
vision_embeddings = self.model.get_image_features(
pixel_values, **kwargs
)
else:
vision_embeddings = self.model.get_image_features(
pixel_values, **kwargs
)

if isinstance(vision_embeddings, torch.Tensor):
if vision_embeddings.ndim == 2:
@@ -364,6 +389,11 @@ def get_mrope_input_positions(
]

return vision_embeddings
else:
logger.debug(
"No pixel values or image embeddings provided for multimodal embedding."
)
return None

def get_mrope_input_positions(
self,