Refactor VL modules for phi3-vision #2779

Merged: 33 commits, Nov 20, 2024

Commits (33)
c40a8ae  qwen2-vl  (lvhan028, Nov 17, 2024)
e24b303  internvl  (lvhan028, Nov 18, 2024)
dcc454b  qwen2  (lvhan028, Nov 18, 2024)
8407d57  get image_tokens_per_patch for internvl2  (lvhan028, Nov 18, 2024)
ba1ae5a  merge refactor-vl  (lvhan028, Nov 18, 2024)
676c23f  deepseek-vl  (lvhan028, Nov 18, 2024)
e7319c0  cogvlm  (lvhan028, Nov 18, 2024)
cc9a4eb  glm4v  (lvhan028, Nov 18, 2024)
b416a26  update internvl  (lvhan028, Nov 18, 2024)
086eed8  internvl_llava  (lvhan028, Nov 18, 2024)
da86bbe  llava  (lvhan028, Nov 19, 2024)
98dde7b  glm4v  (lvhan028, Nov 19, 2024)
5a06515  upate internvl  (lvhan028, Nov 19, 2024)
4daf4e3  cogvlm  (lvhan028, Nov 19, 2024)
a45ddf4  deepseek  (lvhan028, Nov 19, 2024)
2b8b053  llava_hf  (lvhan028, Nov 19, 2024)
9cff378  rollback llava, internvl-llava  (lvhan028, Nov 19, 2024)
09ebaf6  Merge branch 'refactor-vl' into refactor-vl-for-tm  (lvhan028, Nov 19, 2024)
1132018  refactor qwen  (lvhan028, Nov 19, 2024)
32a5433  update internvl  (lvhan028, Nov 19, 2024)
61ad4a6  update llava_hf  (lvhan028, Nov 19, 2024)
e034874  update qwen2-vl  (lvhan028, Nov 19, 2024)
e6c8a1a  llava_next  (lvhan028, Nov 20, 2024)
a9493eb  update llava_next  (lvhan028, Nov 20, 2024)
8212da5  update llava  (lvhan028, Nov 20, 2024)
1a87001  update llava  (lvhan028, Nov 20, 2024)
5f47aa6  update llava  (lvhan028, Nov 20, 2024)
d958a1e  Merge branch 'refactor-vl' into refactor-vl-for-tm  (lvhan028, Nov 20, 2024)
32cd694  qwen2  (lvhan028, Nov 20, 2024)
b9c8581  Merge branch 'refactor-vl' into refactor-vl-for-tm  (lvhan028, Nov 20, 2024)
c7e8c53  fix internvl  (lvhan028, Nov 20, 2024)
e8eae01  phi3-vision  (lvhan028, Nov 20, 2024)
e3a08ca  Merge branch 'refactor-vl' into refactor-vl-for-tm  (lvhan028, Nov 20, 2024)
lmdeploy/vl/model/phi3_vision.py: 54 changes (35 additions, 19 deletions)
```diff
@@ -1,13 +1,12 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 import warnings
-from typing import List
+from typing import Dict, List
 
 import torch
-from PIL.Image import Image
 from transformers import AutoProcessor
 
-from lmdeploy.vl.model.base import VISION_MODELS, VisonModel
+from lmdeploy.vl.model.llava_hf import VISION_MODELS, LlavaHfVisionModel
 from lmdeploy.vl.model.utils import disable_logging
```
```diff
@@ -119,11 +118,20 @@ def _process_image_embedding(self, pixel_values: torch.Tensor,
 
 
 @VISION_MODELS.register_module()
-class Phi3VisionModel(VisonModel):
+class Phi3VisionModel(LlavaHfVisionModel):
     """Llava hf vision model."""
 
     _arch = 'Phi3VForCausalLM'
 
+    def build_preprocessor(self):
+        processor = AutoProcessor.from_pretrained(self.model_path,
+                                                  trust_remote_code=True)
+        if hasattr(processor, 'tokenizer'):
+            del processor.tokenizer
+            processor.prtokenizer = None
+        self.processor = processor.image_processor
+        self.processor = processor
+
     def build_model(self):
         from accelerate import init_empty_weights, load_checkpoint_and_dispatch
         from accelerate.utils import get_balanced_memory, infer_auto_device_map
```
```diff
@@ -173,23 +181,31 @@ def build_model(self):
 
         model.eval()
         self.model = model
-        # processor
-        processor = AutoProcessor.from_pretrained(self.model_path,
-                                                  trust_remote_code=True)
-        if hasattr(processor, 'tokenizer'):
-            del processor.tokenizer
-            processor.prtokenizer = None
-        self.processor = processor.image_processor
-        self.processor = processor
+
+    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+        """refers to `super.preprocess() for spec."""
+        outputs = []
+        for item in messages[-1]['content']:
+            if item['type'] == 'image':
+                image = item['image'].convert('RGB')
+                result = self.processor.image_processor(image,
+                                                        return_tensors='pt')
+                h = result['image_sizes'][0][0].item() // 336
+                w = result['image_sizes'][0][1].item() // 336
+                image_tokens = int((h * w + 1) * 144 + 1 + (h + 1) * 12)
+                result.update(
+                    dict(image_size=image.size,
+                         image_tokens=image_tokens,
+                         image_token_id=0))
+                outputs.append(result)
+        return outputs
 
     @torch.no_grad()
-    def forward(self, images: List[Image]) -> List[torch.Tensor]:
-        """forward."""
-        process_outputs = self.processor.image_processor(
-            images, return_tensors='pt').to(device=self.model.device,
-                                            dtype=self.model.dtype)
-        pixel_values = process_outputs['pixel_values']
-        image_sizes = process_outputs['image_sizes']
+    def forward(self, inputs: List[Dict]) -> List[torch.Tensor]:
+        pixel_values = [x['pixel_values'] for x in inputs]
+        pixel_values = torch.stack(pixel_values, dim=0)
+        image_sizes = [x['image_sizes'] for x in inputs]
+        image_sizes = torch.stack(image_sizes, dim=0)
         image_features = _process_image_embedding(
             self.model.model.vision_embed_tokens,
             pixel_values=pixel_values,
```
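The magic numbers in the new `preprocess` deserve a gloss. Below is a minimal sketch of the token-count arithmetic, assuming (as the HF Phi-3-vision image processor does) that `image_sizes` holds the padded (height, width) in multiples of 336; the constants 336, 144, and 12 are taken verbatim from the diff, while the tile/newline interpretation in the comments is a reading of them, not something stated in the PR:

```python
# Token-count arithmetic from the new `preprocess` above.
# Assumption: `height` and `width` are the padded image size in pixels,
# both multiples of 336, matching `result['image_sizes']`.

def phi3v_image_tokens(height: int, width: int) -> int:
    """Number of placeholder positions one image occupies in the prompt."""
    h = height // 336  # 336x336 tiles along the height
    w = width // 336   # 336x336 tiles along the width
    # Reading of the constants: h*w local tiles plus 1 global tile at 144
    # tokens each, plus 1 separator token and 12 newline-style tokens per
    # tile row over h + 1 rows.
    return (h * w + 1) * 144 + 1 + (h + 1) * 12

# e.g. an image padded to 672x1344 (h=2, w=4):
assert phi3v_image_tokens(672, 1344) == (2 * 4 + 1) * 144 + 1 + 3 * 12  # 1333
```

So a single image padded to 672x1344 appears to reserve 1333 prompt positions, which is what the engine sees via the `image_tokens` field.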
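For orientation, here is a hypothetical end-to-end driver for the split interface. The constructor signature, the 'text' content item, and the model path are assumptions (taken from the `VisonModel` base class and common usage), not shown in this diff; the image-message schema follows the loop in `preprocess`:

```python
from PIL import Image

from lmdeploy.vl.model.phi3_vision import Phi3VisionModel

# Assumption: constructor signature comes from the VisonModel base class.
model = Phi3VisionModel(model_path='microsoft/Phi-3-vision-128k-instruct')
model.build_preprocessor()  # builds a tokenizer-free AutoProcessor only

# Schema inferred from `preprocess`: it scans messages[-1]['content']
# for items whose type is 'image'.
messages = [dict(role='user',
                 content=[dict(type='text', text='Describe this image.'),
                          dict(type='image',
                               image=Image.open('demo.jpg'))])]

inputs = model.preprocess(messages)
# Each element carries pixel_values, image_sizes, image_size,
# image_tokens and image_token_id for one image.

# Only needed where the vision tower actually runs:
# model.build_model()
# features = model.forward(inputs)  # List[torch.Tensor] image embeddings
```

This is the point of the refactor as the diff presents it: `build_preprocessor` is cheap and suffices for token counting, while the weight-loading `build_model` (and `forward`) is only required in the process that executes the vision encoder.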