torch.OutOfMemoryError: CUDA out of memory. Tried ...
Created: April 10, 2025
Model used: O3 Mini High by Chat01
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 842.00 MiB. GPU 0 has a total capacity of 44.52 GiB of which 576.06 MiB is free. Process 1364514 has 6.41 GiB memory in use. Process 1741703 has 37.54 GiB memory in use. Of the allocated memory 36.99 GiB is allocated by PyTorch, and 56.48 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) Explain this.
This error message means that PyTorch tried to allocate 842.00 MiB on the GPU and found there was not enough free memory, so an out-of-memory (OOM) error was raised. In more detail:

GPU memory status: GPU 0 has a total capacity of 44.52 GiB, of which only 576.06 MiB is currently free. Process 1364514 is holding 6.41 GiB and process 1741703 is holding 37.54 GiB.

PyTorch's internal memory management: of the memory in use, 36.99 GiB is allocated by PyTorch and 56.48 MiB is reserved by PyTorch but unallocated; a large reserved-but-unallocated amount can indicate fragmentation.

Cause of the error: the remaining free memory on the device cannot satisfy a single 842.00 MiB allocation, and fragmentation may be making this worse.

Suggested fixes: free the memory held by other processes, reduce the model size or batch size, or tune the allocator, for example:

PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

This setting lets PyTorch manage fragmentation better by allowing memory segments to grow dynamically, which may resolve this kind of failure; see the memory-management documentation linked in the error message.

In short, the error occurs because the GPU's free memory cannot satisfy a single large allocation request, possibly aggravated by fragmentation. You can reduce other processes' use of GPU memory, adjust the model or batch size, or tune the allocation strategy via the environment variable above. A minimal way to apply that setting is sketched below.
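A minimal sketch of applying the setting from Python, assuming it is done before PyTorch initializes its CUDA allocator (setting the variable in the shell before launching the script works just as well):

```python
import os

# Must be set before the first CUDA allocation is made.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch

if torch.cuda.is_available():
    # Check how much memory is actually free on GPU 0 before allocating.
    free_bytes, total_bytes = torch.cuda.mem_get_info(0)
    print(f"GPU 0: {free_bytes / 2**30:.2f} GiB free "
          f"of {total_bytes / 2**30:.2f} GiB total")
```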
```python
import itertools
from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
                    TypedDict, Union)

import torch
import torch.nn as nn
from PIL import Image
from transformers import CLIPVisionConfig, LlavaConfig, SiglipVisionConfig

from vllm.attention import AttentionMetadata
from vllm.config import CacheConfig, MultiModalConfig
from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.sequence import IntermediateTensors
from vllm.utils import is_list_of

from .clip import (CLIPVisionModel, dummy_image_for_clip,
                   dummy_seq_data_for_clip, get_max_clip_image_tokens,
                   input_processor_for_clip)
from .interfaces import SupportsMultiModal
# Assumes a sibling `.siglip` module mirroring `.clip` (as in upstream vLLM);
# the SigLIP branches below rely on these helpers.
from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
                     dummy_seq_data_for_siglip, get_max_siglip_image_tokens,
                     input_processor_for_siglip)
from .utils import (filter_weights, flatten_bn, init_vllm_registered_model,
                    merge_multimodal_embeddings)

class LlavaImagePixelInputs(TypedDict):
    type: Literal["pixel_values"]
    data: torch.Tensor
    """Shape: (batch_size * num_images, num_channels, height, width)"""


class LlavaImageEmbeddingInputs(TypedDict):
    type: Literal["image_embeds"]
    data: torch.Tensor
    """Shape: (batch_size * num_images, image_feature_size, hidden_size)

    `hidden_size` must match the hidden size of language model backbone.
    """


LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageEmbeddingInputs]


class LlavaMultiModalProjector(nn.Module):

    def __init__(self, vision_hidden_size: int, text_hidden_size: int,
                 projector_hidden_act: str):
        super().__init__()

        self.linear_1 = nn.Linear(vision_hidden_size,
                                  text_hidden_size,
                                  bias=True)
        self.act = get_act_fn(projector_hidden_act)
        self.linear_2 = nn.Linear(text_hidden_size,
                                  text_hidden_size,
                                  bias=True)

    def forward(self, image_features: torch.Tensor) -> torch.Tensor:
        hidden_states = self.linear_1(image_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states

def get_max_llava_image_tokens(ctx: InputContext):
    hf_config = ctx.get_hf_config(LlavaConfig)
    vision_config = hf_config.vision_config

    if isinstance(vision_config, CLIPVisionConfig):
        num_image_tokens = get_max_clip_image_tokens(vision_config)
    elif isinstance(vision_config, SiglipVisionConfig):
        num_image_tokens = get_max_siglip_image_tokens(vision_config)
    else:
        msg = f"Unsupported vision config: {type(vision_config)}"
        raise NotImplementedError(msg)

    strategy = hf_config.vision_feature_select_strategy
    if strategy == "default":
        return num_image_tokens - 1
    elif strategy == "full":
        return num_image_tokens
    else:
        raise ValueError(f"Unexpected select feature strategy: {strategy}")

def dummy_data_for_llava(ctx: InputContext, seq_len: int,
                         mm_counts: Mapping[str, int]):
    hf_config = ctx.get_hf_config(LlavaConfig)
    vision_config = hf_config.vision_config
    num_images = mm_counts["image"]

    image_feature_size = get_max_llava_image_tokens(ctx)

    if isinstance(vision_config, CLIPVisionConfig):
        seq_data = dummy_seq_data_for_clip(
            vision_config,
            seq_len,
            num_images,
            image_token_id=hf_config.image_token_index,
            image_feature_size_override=image_feature_size,
        )

        mm_data = dummy_image_for_clip(vision_config, num_images)
        return seq_data, mm_data
    elif isinstance(vision_config, SiglipVisionConfig):
        seq_data = dummy_seq_data_for_siglip(
            vision_config,
            seq_len,
            num_images,
            image_token_id=hf_config.image_token_index,
            image_feature_size_override=image_feature_size,
        )

        mm_data = dummy_image_for_siglip(vision_config, num_images)
        return seq_data, mm_data

    msg = f"Unsupported vision config: {type(vision_config)}"
    raise NotImplementedError(msg)

def input_processor_for_llava(ctx: InputContext, llm_inputs: LLMInputs):
    multi_modal_data = llm_inputs.get("multi_modal_data")
    if multi_modal_data is None or "image" not in multi_modal_data:
        return llm_inputs

    model_config = ctx.model_config
    hf_config = ctx.get_hf_config(LlavaConfig)
    vision_config = hf_config.vision_config

    image_data = multi_modal_data["image"]
    if isinstance(image_data, Image.Image):
        image_feature_size = get_max_llava_image_tokens(ctx)
    elif is_list_of(image_data, Image.Image):
        image_feature_size = [get_max_llava_image_tokens(ctx)
                              ] * len(image_data)
    elif isinstance(image_data, torch.Tensor):
        num_images, image_feature_size, hidden_size = image_data.shape
    elif is_list_of(image_data, torch.Tensor):
        image_feature_size = [item.shape[1] for item in image_data]
    else:
        raise TypeError(f"Invalid image type: {type(image_data)}")

    if isinstance(vision_config, CLIPVisionConfig):
        return input_processor_for_clip(
            model_config,
            vision_config,
            llm_inputs,
            image_token_id=hf_config.image_token_index,
            image_feature_size_override=image_feature_size,
        )
    elif isinstance(vision_config, SiglipVisionConfig):
        return input_processor_for_siglip(
            model_config,
            vision_config,
            llm_inputs,
            image_token_id=hf_config.image_token_index,
            image_feature_size_override=image_feature_size,
        )

    msg = f"Unsupported vision config: {type(vision_config)}"
    raise NotImplementedError(msg)

def _init_vision_tower(hf_config: LlavaConfig):
    vision_config = hf_config.vision_config

    # Initialize the vision tower only up to the required feature layer
    vision_feature_layer = hf_config.vision_feature_layer
    if vision_feature_layer < 0:
        num_hidden_layers = hf_config.vision_config.num_hidden_layers \
            + vision_feature_layer + 1
    else:
        num_hidden_layers = vision_feature_layer + 1

    if isinstance(vision_config, CLIPVisionConfig):
        return CLIPVisionModel(
            vision_config,
            num_hidden_layers_override=num_hidden_layers,
        )
    elif isinstance(vision_config, SiglipVisionConfig):
        return SiglipVisionModel(
            vision_config,
            num_hidden_layers_override=num_hidden_layers,
        )

    msg = f"Unsupported vision config: {type(vision_config)}"
    raise NotImplementedError(msg)

@MULTIMODAL_REGISTRY.register_image_input_mapper()
@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_llava_image_tokens)
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
@INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal):

    def __init__(self,
                 config: LlavaConfig,
                 multimodal_config: MultiModalConfig,
                 cache_config: Optional[CacheConfig] = None,
                 quant_config: Optional[QuantizationConfig] = None) -> None:
        super().__init__()

        self.config = config
        self.multimodal_config = multimodal_config

        # TODO: Optionally initializes this for supporting embeddings.
        self.vision_tower = _init_vision_tower(config)
        self.multi_modal_projector = LlavaMultiModalProjector(
            vision_hidden_size=config.vision_config.hidden_size,
            text_hidden_size=config.text_config.hidden_size,
            projector_hidden_act=config.projector_hidden_act)

        self.language_model = init_vllm_registered_model(
            config.text_config, cache_config, quant_config)

    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
        h = w = self.config.vision_config.image_size
        expected_dims = (3, h, w)
        actual_dims = tuple(data.shape[1:])

        if actual_dims != expected_dims:
            expected_expr = ("batch_size", *map(str, expected_dims))
            raise ValueError(
                f"The expected shape of pixel values is {expected_expr}. "
                f"You supplied {tuple(data.shape)}.")

        return data

    def _parse_and_validate_image_input(
            self, **kwargs: object) -> Optional[LlavaImageInputs]:
        pixel_values = kwargs.pop("pixel_values", None)
        image_embeds = kwargs.pop("image_embeds", None)

        if pixel_values is None and image_embeds is None:
            return None

        if pixel_values is not None:
            if not isinstance(pixel_values, (torch.Tensor, list)):
                raise ValueError("Incorrect type of pixel values. "
                                 f"Got type: {type(pixel_values)}")

            return LlavaImagePixelInputs(
                type="pixel_values",
                data=self._validate_pixel_values(
                    flatten_bn(pixel_values, concat=True)),
            )

        if image_embeds is not None:
            if not isinstance(image_embeds, (torch.Tensor, list)):
                raise ValueError("Incorrect type of image embeddings. "
                                 f"Got type: {type(image_embeds)}")

            return LlavaImageEmbeddingInputs(
                type="image_embeds",
                data=flatten_bn(image_embeds, concat=True),
            )

        raise AssertionError("This line should be unreachable.")

    def _select_image_features(self, image_features: torch.Tensor, *,
                               strategy: str) -> torch.Tensor:
        # Copied from https://github.com/huggingface/transformers/blob/39c3c0a72af6fbda5614dde02ff236069bb79827/src/transformers/models/llava/modeling_llava.py#L421  # noqa
        if strategy == "default":
            return image_features[:, 1:]
        elif strategy == "full":
            return image_features

        raise ValueError(f"Unexpected select feature strategy: {strategy}")

    def _image_pixels_to_features(
        self,
        vision_tower: Union[CLIPVisionModel, SiglipVisionModel],
        pixel_values: torch.Tensor,
    ) -> torch.Tensor:

        # NOTE: we skip the step to select the vision feature layer since
        # this is already done inside the vision tower
        image_features = vision_tower(pixel_values)

        return self._select_image_features(
            image_features,
            strategy=self.config.vision_feature_select_strategy,
        )

    def _process_image_pixels(self,
                              inputs: LlavaImagePixelInputs) -> torch.Tensor:
        assert self.vision_tower is not None

        pixel_values = inputs["data"]

        return self._image_pixels_to_features(self.vision_tower, pixel_values)

    def _process_image_input(self,
                             image_input: LlavaImageInputs) -> torch.Tensor:
        if image_input["type"] == "image_embeds":
            return image_input["data"]

        assert self.vision_tower is not None
        image_features = self._process_image_pixels(image_input)

        return self.multi_modal_projector(image_features)

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
        **kwargs: object,
    ) -> SamplerOutput:
        """Run forward pass for LLaVA-1.5.

        One key thing to understand is the `input_ids` already accounts for
        the positions of the to-be-inserted image embeddings.

        Concretely, consider a text prompt:
        `"USER: <image>\\nWhat's the content of the image?\\nASSISTANT:"`.

        Tokenizer outputs:
        `[1, 3148, 1001, 29901, 29871, 32000, 29871, 13, 5618, 29915, 29879,
        278, 2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566, 29901]`.

        To reserve space in KV cache, we have to insert placeholder tokens
        before they are inputted to the model, so the input processor prepends
        additional image tokens (denoted as `32000`), resulting in:
        `[1, 3148, 1001, 29901, 29871, 32000, ..., 32000, 29871, 13, 5618,
        29915, 29879, 278, 2793, 310, 278, 1967, 29973, 13, 22933, 9047,
        13566, 29901]`.

        We insert 575 tokens so that including the original image token in the
        input, there are a total of 576 (24 * 24) image tokens, which
        corresponds to the number of image tokens inputted to the language
        model, i.e. the number of image tokens outputted by the visual
        encoder.

        This way, the `positions` and `attn_metadata` are consistent
        with the `input_ids`.

        Args:
            input_ids: Flattened (concatenated) input_ids corresponding to a
                batch.
            pixel_values: The pixels in each input image.

        See also:
            :class:`LlavaImageInputs`
        """
        image_input = self._parse_and_validate_image_input(**kwargs)

        if image_input is not None:
            vision_embeddings = self._process_image_input(image_input)
            inputs_embeds = self.language_model.model.get_input_embeddings(
                input_ids)

            inputs_embeds = merge_multimodal_embeddings(
                input_ids, inputs_embeds, vision_embeddings,
                self.config.image_token_index)

            input_ids = None
        else:
            inputs_embeds = None

        hidden_states = self.language_model.model(input_ids,
                                                  positions,
                                                  kv_caches,
                                                  attn_metadata,
                                                  None,
                                                  inputs_embeds=inputs_embeds)

        return hidden_states

    def compute_logits(
        self,
        hidden_states: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[torch.Tensor]:
        return self.language_model.compute_logits(hidden_states,
                                                  sampling_metadata)

    def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
        return self.language_model.sample(logits, sampling_metadata)

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
        # prepare weight iterators for components
        vit_weights, mlp_weights, llm_weights = itertools.tee(weights, 3)

        # load vision encoder
        vit_weights = filter_weights(vit_weights, "vision_tower")
        self.vision_tower.load_weights(vit_weights)

        # load mlp projector
        mlp_weights = filter_weights(mlp_weights, "multi_modal_projector")
        mlp_params_dict = dict(self.multi_modal_projector.named_parameters())
        for name, loaded_weight in mlp_weights:
            param = mlp_params_dict[name]
            weight_loader = getattr(param, "weight_loader",
                                    default_weight_loader)
            weight_loader(param, loaded_weight)

        # load llm backbone
        llm_weights = filter_weights(llm_weights, "language_model")
        self.language_model.load_weights(llm_weights)
```
How do I make this use a specific GPU?
To make sure your model runs on a specific GPU, you generally move the model and its data onto that GPU device. In PyTorch you specify the device with `torch.device` and then transfer models and tensors to it with the `.to(device)` method.

Here are the steps for moving your model and input data to a specific GPU:

First, specify the target GPU with `torch.device`. Assuming you want GPU 0, define the device like this:
```python
device = torch.device('cuda:0')  # select GPU 0
```
Next, move the model to the chosen GPU. You can do this when the model is initialized, or anywhere else in the code, by calling `.to(device)`.

For example, in your `LlavaForConditionalGeneration` model you could do it during initialization:
```python
class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal):

    def __init__(self,
                 config: LlavaConfig,
                 multimodal_config: MultiModalConfig,
                 cache_config: Optional[CacheConfig] = None,
                 quant_config: Optional[QuantizationConfig] = None,
                 device: torch.device = torch.device('cuda:0')) -> None:
        super().__init__()
        self.device = device  # remember the target device
        self.config = config
        self.multimodal_config = multimodal_config

        # Initialize vision_tower, multi_modal_projector and language_model
        # and move each of them to the target device.
        self.vision_tower = _init_vision_tower(config).to(self.device)
        self.multi_modal_projector = LlavaMultiModalProjector(
            vision_hidden_size=config.vision_config.hidden_size,
            text_hidden_size=config.text_config.hidden_size,
            projector_hidden_act=config.projector_hidden_act
        ).to(self.device)
        self.language_model = init_vllm_registered_model(
            config.text_config, cache_config, quant_config
        ).to(self.device)
```
After the model has been moved to the chosen GPU, the input data (such as `input_ids`, `pixel_values`, and so on) must also be moved to the same device. You can do this in the forward pass:
```python
def forward(
    self,
    input_ids: torch.Tensor,
    positions: torch.Tensor,
    kv_caches: List[torch.Tensor],
    attn_metadata: AttentionMetadata,
    intermediate_tensors: Optional[IntermediateTensors] = None,
    **kwargs: object,
) -> SamplerOutput:
    input_ids = input_ids.to(self.device)  # move input_ids to the target device
    positions = positions.to(self.device)  # move positions to the target device

    # Process the image data.
    image_input = self._parse_and_validate_image_input(**kwargs)
    if image_input is not None:
        vision_embeddings = self._process_image_input(image_input)
        # Make sure vision_embeddings lives on the same device.
        vision_embeddings = vision_embeddings.to(self.device)
        inputs_embeds = self.language_model.model.get_input_embeddings(
            input_ids).to(self.device)
        inputs_embeds = merge_multimodal_embeddings(
            input_ids, inputs_embeds, vision_embeddings,
            self.config.image_token_index
        )
        # As in the original implementation, pass embeddings instead of ids.
        input_ids = None
    else:
        inputs_embeds = None

    hidden_states = self.language_model.model(input_ids,
                                              positions,
                                              kv_caches,
                                              attn_metadata,
                                              None,
                                              inputs_embeds=inputs_embeds)
    return hidden_states
```
Make sure everything is moved to the specified GPU with the `.to(device)` method. The most common practice is to put everything (model and data) on the same target device (such as `cuda:0`). If you are using multiple GPUs, you can also parallelize training and inference with `torch.nn.DataParallel` or `torch.nn.parallel.DistributedDataParallel`, as in the sketch below.
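A minimal, self-contained sketch of the `DataParallel` variant; the `nn.Linear` module and the `device_ids` list are illustrative placeholders:

```python
import torch
import torch.nn as nn

# Placeholder module used only to illustrate the wrapping.
model = nn.Linear(1024, 1024)

if torch.cuda.device_count() > 1:
    # Replicate the module across GPUs 0 and 1; outputs are gathered on device_ids[0].
    model = nn.DataParallel(model, device_ids=[0, 1])

model = model.to('cuda:0')

x = torch.randn(8, 1024, device='cuda:0')
y = model(x)  # the batch is split across the listed GPUs
print(y.shape)
```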
If you just want to run on one particular GPU, make sure all the relevant code uses `.to(device)` so that the model and the data are explicitly placed on the correct device, as in the short sketch below.
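A short, self-contained illustration of that pattern, using a small placeholder module instead of the full LLaVA model (constructing a real `LlavaConfig` is outside the scope of this sketch) and assuming at least one CUDA device is available:

```python
import torch
import torch.nn as nn

# Pick the target GPU explicitly; index 1 here is just an example.
device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')

# Placeholder module standing in for the real model.
model = nn.Sequential(nn.Linear(16, 32), nn.GELU(), nn.Linear(32, 4)).to(device)

# Every input tensor must be created on (or moved to) the same device.
x = torch.randn(2, 16).to(device)
out = model(x)

assert out.device == device  # model and data agree on the device
```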