[ROCm][CI][Bugfix] Multi-Modal Model Support Fixes and Attention Backend Improvements #30270
base: main
Changes from 10 commits
84c26b1
d4c47f3
9af82a4
f63c60e
e8dc42f
ac960de
5fe66ac
008e755
206b410
ea2111c
9c9f225
8fb6c74
33f9bff
@@ -8,6 +8,7 @@
 import torch

 import vllm.envs as envs
+from vllm.attention.backends.abstract import AttentionType
 from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.logger import init_logger
 from vllm.utils.torch_utils import cuda_device_count_stateless
@@ -200,7 +201,11 @@ def get_vit_attn_backend(
         # TODO: Add support for other VL models in their model class.
         return AttentionBackendEnum.ROCM_AITER_FA

-    if on_gfx9() and find_spec("flash_attn") is not None:
+    if (
+        on_gfx9()
+        and find_spec("flash_attn") is not None
+        and (dtype == torch.float16 or dtype == torch.bfloat16)
+    ):
         return AttentionBackendEnum.FLASH_ATTN

     return AttentionBackendEnum.TORCH_SDPA
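As a reading aid, here is a minimal standalone sketch of the dtype gate this hunk introduces: FLASH_ATTN is only chosen on gfx9 when the `flash_attn` package is importable and the model runs in fp16/bf16; everything else falls back to SDPA. The helper name `pick_vit_attn_backend` and the plain-string return values are illustrative stand-ins, not vLLM's API, and `on_gfx9()` is replaced by a boolean parameter.

```python
# Illustrative sketch only (assumed names), mirroring the condition added above.
from importlib.util import find_spec

import torch


def pick_vit_attn_backend(dtype: torch.dtype, is_gfx9: bool) -> str:
    # FLASH_ATTN requires: gfx9 hardware, an importable flash_attn package,
    # and half-precision activations (fp16 or bf16).
    if (
        is_gfx9
        and find_spec("flash_attn") is not None
        and dtype in (torch.float16, torch.bfloat16)
    ):
        return "FLASH_ATTN"
    # Anything else (e.g. fp32 ViT weights) falls back to torch SDPA.
    return "TORCH_SDPA"


print(pick_vit_attn_backend(torch.float32, is_gfx9=True))    # TORCH_SDPA
print(pick_vit_attn_backend(torch.bfloat16, is_gfx9=False))  # TORCH_SDPA
```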
@@ -241,26 +246,34 @@ def get_attn_backend_cls(
         )
         if selected_backend == AttentionBackendEnum.TRITON_MLA:
             if block_size != 1:
-                logger.info_once("Using Triton MLA backend.")
+                logger.info_once("Using Triton MLA backend on V1 engine.")
                 return AttentionBackendEnum.TRITON_MLA.get_path()
             raise ValueError(
                 f" The selected backend, {selected_backend.name},"
                 f"does not support block size {block_size}."
             )
         if selected_backend == AttentionBackendEnum.ROCM_AITER_MLA:
-            logger.info("Using AITER MLA backend.")
+            logger.info("Using AITER MLA backend on V1 engine.")
             return AttentionBackendEnum.ROCM_AITER_MLA.get_path()
         if selected_backend == AttentionBackendEnum.ROCM_AITER_TRITON_MLA:
-            logger.info("Using AITER TRITON MLA backend.")
+            logger.info("Using AITER TRITON MLA backend on V1 engine.")
             return AttentionBackendEnum.ROCM_AITER_TRITON_MLA.get_path()

         raise ValueError(
             f" The selected backend, {selected_backend.name},"
             f"is not MLA type while requested for MLA backend."
         )

+    attn_backend_override = os.environ.get("VLLM_ATTENTION_BACKEND")
+    if selected_backend is None and attn_backend_override is not None:
+        logger.info(
+            "Detected VLLM_ATTENTION_BACKEND=%s (set by model architecture).",
+            attn_backend_override,
+        )
+        selected_backend = AttentionBackendEnum[attn_backend_override]
+
     if selected_backend == AttentionBackendEnum.FLEX_ATTENTION:
-        logger.info("Using FlexAttention backend.")
+        logger.info("Using FlexAttention backend on V1 engine.")
         return AttentionBackendEnum.FLEX_ATTENTION.get_path()

     if selected_backend == AttentionBackendEnum.TRITON_ATTN:
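The added block above lets model-specific code steer backend selection through the VLLM_ATTENTION_BACKEND environment variable, but only when no backend was selected explicitly. Below is a standalone sketch of that precedence, using plain strings instead of AttentionBackendEnum members; the function name is illustrative, not vLLM's API.

```python
# Sketch of the override precedence, assuming string backend names for brevity.
import os


def resolve_backend(selected_backend: str | None) -> str | None:
    # The diff above additionally logs the override and maps the string through
    # AttentionBackendEnum[...]; here we simply return it.
    override = os.environ.get("VLLM_ATTENTION_BACKEND")
    if selected_backend is None and override is not None:
        return override
    return selected_backend


os.environ["VLLM_ATTENTION_BACKEND"] = "ROCM_AITER_FA"
print(resolve_backend(None))           # ROCM_AITER_FA -- env override applies
print(resolve_backend("TRITON_ATTN"))  # TRITON_ATTN   -- explicit choice wins
```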
@@ -313,6 +326,11 @@ def get_attn_backend_cls(
         logger.info("Using Aiter Flash Attention backend on V1 engine.")
         return AttentionBackendEnum.ROCM_AITER_FA.get_path()

+    # Priority 5: If model is Encoder-only self-attention type
+    if attn_type is not None and attn_type in (AttentionType.ENCODER_ONLY):
+        logger.info("Using FlexAttention backend on V1 engine.")
+        return AttentionBackendEnum.FLEX_ATTENTION.get_path()
+
     # Default: Triton Unified Attention
     logger.info("Using Triton Attention backend on V1 engine.")
     return AttentionBackendEnum.TRITON_ATTN.get_path()

Review comment on lines 329 to 332: The new FlexAttention fallback uses …

Author (Contributor): Comment addressed in 9c9f225
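One detail of the added check that is easy to miss: in `attn_type in (AttentionType.ENCODER_ONLY)` the parentheses do not create a tuple, so the test runs against the single constant itself. Assuming the attention-type constants are plain strings (an assumption here, suggested by their names), that turns the check into a substring test rather than a membership test. The snippet below demonstrates the difference with ordinary strings as stand-ins.

```python
# Demonstration of the one-element-tuple pitfall, using plain strings as stand-ins
# for the attention-type constants (an assumption, not vLLM's actual definitions).
ENCODER_ONLY = "encoder_only"

attn_type = "encoder"  # a different attention type that is a prefix of the constant

print(attn_type in (ENCODER_ONLY))   # True  -- `(x)` is just `x`, so this is a substring test
print(attn_type in (ENCODER_ONLY,))  # False -- a real one-element tuple gives membership semantics
print(attn_type == ENCODER_ONLY)     # False -- simplest exact comparison
```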
Review comment: Using `os.environ` to control behavior between different parts of the application introduces a global state, which is a potential source of bugs. In a concurrent environment, such as a server handling multiple model loading requests simultaneously, this can lead to race conditions where one request unintentionally affects another. A safer approach would be to pass this configuration through the `VllmConfig` or a similar mechanism that avoids global state. For instance, you could add an `attn_backend_override` to `ModelConfig` which can be set by model-specific code and then read by the attention backend selection logic.

Author (Contributor): There is no concurrency that involves this part of the code. This behavior is ROCm-specific and is critical for this model for now. Nonetheless, we are going to add the necessary support for the rest of the ROCm-specific attention backends and make this mini-patch obsolete in the future.
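For completeness, here is a hypothetical sketch of the reviewer's config-based alternative. The `ModelConfigSketch` class and its `attn_backend_override` field are illustrative stand-ins, not vLLM's existing `ModelConfig` API; the point is only that the override travels with the config object instead of process-global environment state.

```python
# Hypothetical sketch of passing the override through a config object.
from dataclasses import dataclass


@dataclass
class ModelConfigSketch:
    model: str
    attn_backend_override: str | None = None  # set by model-specific code


def resolve_backend(selected: str | None, config: ModelConfigSketch) -> str | None:
    # Explicit selection still wins; otherwise use the per-config override.
    if selected is None and config.attn_backend_override is not None:
        return config.attn_backend_override
    return selected


cfg = ModelConfigSketch(model="some/vl-model", attn_backend_override="ROCM_AITER_FA")
print(resolve_backend(None, cfg))  # ROCM_AITER_FA -- no environment variable involved
```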