-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathminiCPM.py
59 lines (48 loc) · 1.91 KB
/
miniCPM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from pdf2image import convert_from_path
import pathlib
from PyPDF2 import PdfReader
class Extract:
    """Extract text from a PDF (or, for OCR, an image) file.

    Two strategies are offered:

    * ``getText`` concatenates whatever ``PdfReader`` extracts from each page.
    * ``getTextFromImg`` turns the document into page images and asks the
      MiniCPM-V model to transcribe each one.

    The model and tokenizer are only loaded when the instance is created with
    ``method='ocr'``; with any other value they stay ``None`` and only
    ``getText`` is usable.
    """

    # Hugging Face identifier shared by the model and its tokenizer.
    MODEL_ID = 'openbmb/MiniCPM-Llama3-V-2_5'
    # Prompt sent to the model for every page image.
    OCR_PROMPT = '\nWrite down everything in the image as text.\n'
    # Markers appended after each page. They intentionally stay different
    # (trailing space in the text variant) so existing output is unchanged.
    TEXT_PAGE_MARKER = 'new page \n '
    OCR_PAGE_MARKER = 'new page \n'

    def __init__(self, pdf_path, device='cuda', method='text'):
        """Remember the document path and optionally load the OCR model.

        Args:
            pdf_path: Path of the document to read. ``getTextFromImg`` also
                accepts a non-PDF path, which it opens as a single image.
            device: Device the model is moved to when ``method='ocr'``.
            method: ``'ocr'`` loads the model and tokenizer; any other value
                skips loading.
        """
        self.path = pdf_path
        # Always define these so a missing model is detectable instead of
        # surfacing later as an AttributeError.
        self.model = None
        self.tokenizer = None
        # Most recent page image handed to the model (set by getTextFromImg).
        self.f = None

        if method == 'ocr':
            model = AutoModel.from_pretrained(
                self.MODEL_ID, trust_remote_code=True
            )
            self.model = model.to(device=device)
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.MODEL_ID, trust_remote_code=True
            )
            self.model.eval()

    def getText(self) -> str:
        """Return the text of every page, each followed by ``'new page \\n '``.

        Returns:
            The concatenated page texts; an empty string for a document
            without pages.
        """
        reader = PdfReader(self.path)
        # ``or ''`` guards against an extractor that yields None for a page.
        return ''.join(
            (page.extract_text() or '') + self.TEXT_PAGE_MARKER
            for page in reader.pages
        )

    def getTextFromImg(self, dpi=800) -> str:
        """Transcribe the document page by page with the vision model.

        A path whose suffix is ``.pdf`` (any letter case) is rendered with
        ``convert_from_path``; anything else is opened as one RGB image.

        Args:
            dpi: Resolution passed to ``convert_from_path`` for PDFs.

        Returns:
            For each page, ``'\\n' + <model answer> + 'new page \\n'``,
            concatenated in page order.

        Raises:
            RuntimeError: If the instance was not created with
                ``method='ocr'`` and therefore has no model or tokenizer.
        """
        # Fail before the (potentially large) rasterisation work is done.
        if self.model is None or self.tokenizer is None:
            raise RuntimeError(
                "OCR model is not loaded; create the instance with "
                "method='ocr' before calling getTextFromImg()."
            )

        if pathlib.Path(self.path).suffix.lower() == '.pdf':
            images = convert_from_path(self.path, dpi=dpi)
        else:
            # convert() returns a new image, so the file can be closed here.
            with Image.open(self.path) as img:
                images = [img.convert('RGB')]

        chunks = []
        for page in images:
            self.f = page
            # Fresh message list per call in case chat() modifies it.
            msgs = [{'role': 'user', 'content': self.OCR_PROMPT}]
            # NOTE(review): temperature presumably has no effect while
            # sampling=False - confirm against the model's chat() code.
            res = self.model.chat(
                image=page,
                msgs=msgs,
                context=None,
                tokenizer=self.tokenizer,
                sampling=False,
                temperature=0.7
            )
            chunks.append('\n' + res + self.OCR_PAGE_MARKER)
        return ''.join(chunks)