说明:将qwen2.5-vl拆解开,看清楚整个的推理流程。后续再尝试将模型导出到onnx

1. 代码实现

from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
from transformers import AutoConfig, Qwen2VLForConditionalGeneration
from transformers.cache_utils import DynamicCache

# Local checkpoint path of the Qwen2.5-VL 3B instruct model.
model_id = "/path/Qwen2.5-VL-3B-Instruct"
devices = "cpu"  # Change to CPU for ONNX export
dtype = torch.float32

# "eager" attention avoids fused/flash attention kernels, which keeps the
# compute graph simple and traceable for the planned ONNX export.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=dtype,
    attn_implementation="eager",
    device_map=devices,
)


# The processor bundles the tokenizer and the image preprocessor.
processor = AutoProcessor.from_pretrained(model_id)
# Single-turn chat message: one local image plus a Chinese prompt
# ("please describe the image in detail").
# NOTE(review): the directory name "asserts" looks like a typo for
# "assets" — confirm it matches the repository layout.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "asserts/imgs/person.png",
            },
            {"type": "text", "text": "请详细描述图像"},
        ],
    }
]

# Preparation for inference
# Render the chat template to a prompt string (untokenized) with the
# generation prompt appended.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
# Load/normalize the image (and video, if any) referenced by `messages`.
image_inputs, video_inputs = process_vision_info(messages)
# Tokenize text and preprocess pixels into model-ready tensors.
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(devices)

# Pull the tokenized ids and the preprocessed image tensors apart.
input_ids = inputs.input_ids
pixel_values = inputs.pixel_values
image_grid_thw = inputs.image_grid_thw

# Text token embeddings from the decoder's embedding table, and image
# patch embeddings from the vision tower.
inputs_embeds = model.model.embed_tokens(inputs.input_ids)
image_embeds = model.visual(pixel_values, grid_thw=image_grid_thw)

## Splice the image embeddings into the token embeddings at the
## positions of the image placeholder tokens.
n_image_tokens = (input_ids == model.config.image_token_id).sum().item()
n_image_features = image_embeds.shape[0]
# Explicit exception instead of `assert`: asserts are stripped under
# `python -O`, and a token/feature count mismatch must never pass silently.
if n_image_tokens != n_image_features:
    raise ValueError(
        f"Image tokens and image features do not match: "
        f"{n_image_tokens} image tokens vs {n_image_features} image features."
    )
mask = input_ids == model.config.image_token_id
# Broadcast the (batch, seq) placeholder mask across the embedding
# dimension so it selects whole embedding vectors.
image_mask = mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
# masked_scatter fills the masked positions, in order, with image embeddings.
inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

## Build M-RoPE position ids: all tokens belonging to one image share a
## position value along the temporal axis; `rope_deltas` records the
## offset between the position counter and the sequence length, which the
## decode loop below needs to continue positions correctly.
attention_mask = inputs.attention_mask
past_key_values = DynamicCache()
# Positional arguments: (input_ids, image_grid_thw, video_grid_thw,
# second_per_grid_ts, attention_mask) — no video inputs in this demo.
position_ids, rope_deltas = model.get_rope_index(
    input_ids,
    image_grid_thw,
    None,
    None,
    attention_mask,
)
# Cache the delta on the model so the generation loop can reuse it.
model.rope_deltas = rope_deltas

## Language-model prefill: run the full multimodal embedding sequence
## through the decoder once, populating the KV cache in place.
outputs = model.model(
    input_ids=None,  # embeddings are supplied directly instead of ids
    position_ids=position_ids,
    attention_mask=attention_mask,
    past_key_values=past_key_values,  # DynamicCache, filled as a side effect
    inputs_embeds=inputs_embeds,
    use_cache=True,
    output_attentions=False,
    output_hidden_states=False,
    return_dict=True,
    cache_position=None,
)

# Final hidden states -> vocabulary logits via the LM head.
hidden_states = outputs[0]
logits = model.lm_head(hidden_states)

## Greedy decoding loop: feed the last generated token back through the
## decoder one step at a time, reusing the KV cache built during prefill.
tokens = [torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)]

max_new_tokens = 24
# Normalize eos_token_id (may be an int, a list, or None) so generation
# stops at end-of-sequence instead of always spending the full budget.
eos_ids = model.config.eos_token_id
if eos_ids is None:
    eos_ids = []
elif isinstance(eos_ids, int):
    eos_ids = [eos_ids]

new_tokens = 0
# NOTE(review): the `.item()` EOS check assumes batch size 1, which is
# what this demo uses — confirm before batching.
while new_tokens < max_new_tokens and tokens[-1].item() not in eos_ids:
    input_ids = tokens[-1]
    inputs_embeds = model.model.embed_tokens(input_ids)

    batch_size, seq_length, _ = inputs_embeds.shape
    # Decode-step positions continue from (cached length + rope_deltas);
    # expand replicates them across the 3 M-RoPE axes.
    position_ids = torch.arange(seq_length, device=inputs_embeds.device)
    delta = past_key_values.get_seq_length() + model.rope_deltas
    position_ids = position_ids.add(delta)
    position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)

    # The new token attends to every cached position plus itself.
    attention_mask = torch.ones(
        (batch_size, past_key_values.get_seq_length() + seq_length),
        device=inputs_embeds.device,
    )

    outputs = model.model(
        input_ids=None,
        position_ids=position_ids,
        attention_mask=attention_mask,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=True,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        cache_position=None,
    )

    hidden_states = outputs[0]
    logits = model.lm_head(hidden_states)

    next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
    new_tokens += 1

    tokens.append(next_token)

# Concatenate the generated ids and decode them back to text.
tokens = torch.cat(tokens, dim=-1)
output_text = processor.batch_decode(
    tokens, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

例图:
(示例图片:即代码中引用的 asserts/imgs/person.png,原文中在此处展示)
模型推理结果:

‘这张图片展示了一组五位女性,她们站在一个篮球场的背景下。她们都穿着运动装,看起来像是在进行某种体育活动或比赛的宣传。从左到右,她们依次穿着以下服装:\n\n1. 第一位女性穿着一件带有“PHILADELPHIA”字样的白色运动衫,搭配蓝色短裤和白色运动鞋。\n2. 第二位女性穿着一件紫色上衣和黑色裤子,搭配黑色运动鞋。\n3. 第三位女性穿着一件白色上衣和蓝色裙子,搭配黑色运动鞋。\n4. 第四位女性穿着一件黑色上衣和红色裙子’

Logo

火山引擎开发者社区是火山引擎打造的AI技术生态平台,聚焦Agent与大模型开发,提供豆包系列模型(图像/视频/视觉)、智能分析与会话工具,并配套评测集、动手实验室及行业案例库。社区通过技术沙龙、挑战赛等活动促进开发者成长,新用户可领50万Tokens权益,助力构建智能应用。

更多推荐