diff --git a/.gitignore b/.gitignore index ea5a563..9fba684 100644 --- a/.gitignore +++ b/.gitignore @@ -134,6 +134,8 @@ dmypy.json test/*.ofd test/*.pdf test/*.json +test/data +test/test 增值税电子专票5 *.ofd *.pdf diff --git a/README.md b/README.md index 8440e89..297253b 100644 --- a/README.md +++ b/README.md @@ -11,8 +11,7 @@ ### 更新 -v0.3.1 解决了一些wps适配 问题。 新增加了pfd转ofd电子版的支持,ofd 转pdf 中签章的解析 - +v0.3.3 解决了一些bug ,去除了对opencv的依赖 环境体积减少50M左右,后续可能会尝试把一些依赖改为选装按需安装。 ### 常见问题 @@ -27,11 +26,9 @@ https://github.com/renoyuan/easyofd/wiki/FAQ 1 环境,后续可能会尝试减少一些第三包的依赖压缩环境体积 -- 主要是opencv 和numpy -2 gui 工具 ,之前提供过一个开箱即用的gui工具 v0.1.0.1版本后就没有更新过了,有时间会更新下。 - -3 功能上 对于pfd2ofd 和 ofd 生成 可能会有一些优化 +2 功能上 对于pfd2ofd 和 ofd 生成 可能会有一些优化 -4 需求收集,若有其他相关easyofd 的需求和建议可以git 上给我提,有意思的需求我会考虑尝试。 +3 需求收集,若有其他相关easyofd 的需求和建议可以git 上给我提,有意思的需求我会考虑尝试。 @@ -52,7 +49,6 @@ https://github.com/renoyuan/easyofd/wiki/FAQ - 关于 jb2格式图片解析 使用了第三方库 jbig2dec 去读取jb2格式图片 参考下面链接安装使用jbig2dec https://github.com/rillian/jbig2dec diff --git a/easyofd/__init__.py b/easyofd/__init__.py index b3b779f..e365c01 100644 --- a/easyofd/__init__.py +++ b/easyofd/__init__.py @@ -1,3 +1,3 @@ from .ofd import OFD -__version__ = "0.3.2" +__version__ = "0.3.4" __all__ = ["OFD"] \ No newline at end of file diff --git a/easyofd/draw/draw_ofd.py b/easyofd/draw/draw_ofd.py index 33f11a1..cff8d21 100644 --- a/easyofd/draw/draw_ofd.py +++ b/easyofd/draw/draw_ofd.py @@ -10,7 +10,6 @@ from datetime import datetime import xmltodict -import cv2 from PIL import Image from loguru import logger @@ -190,6 +189,7 @@ def build_content_res(self, pil_img_list=None, pdf_info_list=None, id_obj: CurId return content_res_list def pil_2_bytes(self, image): + """""" # 创建一个 BytesIO 对象 img_bytesio = BytesIO() @@ -203,24 +203,24 @@ def pil_2_bytes(self, image): img_bytesio.close() return img_bytes - def __call__(self, pdf_bytes, cv2_img_list=None, optional_text=False): + def __call__(self, pdf_bytes=None, pil_img_list=None, optional_text=False): """ + input pdf | imgs if pdf >optional_text or not 0 解析pdf文件 1 构建必要的ofd template 2 转化为 ofd """ pdf_obj = DPFParser() - if optional_text: # 生成可编辑ofd: - pdf_info_list, pfd_res_uuid_map = pdf_obj.extract_text_with_details(pdf_bytes) # 解析pdf - logger.debug(f"pdf_info_list: {pdf_info_list} \n pfd_res_uuid_map {pfd_res_uuid_map}") - - page_pil_img_list = None - - else: # 插入图片ofd - if cv2_img_list: # 读取 图片 - page_pil_img_list = [(self.pil_2_bytes(Image.fromarray(cv2.cvtColor(_img,cv2.COLOR_BGR2RGB))), - _img.shape[1], _img.shape[0]) for _img in cv2_img_list] - else: # 读取 pdf 转图片 + page_pil_img_list = None + + # 插入图片ofd + if pil_img_list: # 读取 图片 + page_pil_img_list = [(self.pil_2_bytes(_img),_img.size[0]/self.OP,_img.size[1]/self.OP) for _img in pil_img_list] + else: # 读取 pdf 转图片 + if optional_text: # 生成可编辑ofd: + pdf_info_list, pfd_res_uuid_map = pdf_obj.extract_text_with_details(pdf_bytes) # 解析pdf + logger.debug(f"pdf_info_list: {pdf_info_list} \n pfd_res_uuid_map {pfd_res_uuid_map}") + else: img_list = pdf_obj.to_img(pdf_bytes) page_pil_img_list = [(self.pil_2_bytes(Image.frombytes("RGB", [_img.width, _img.height], _img.samples)), _img.width/self.OP, _img.height/self.OP) for _img in img_list] @@ -266,6 +266,7 @@ def __call__(self, pdf_bytes, cv2_img_list=None, optional_text=False): if __name__ == "__main__": pdf_p = r"D:\renodoc\技术栈\GBT_33190-2016_电子文件存储与交换格式版式文档.pdf" + pdf_p = r"F:\code\easyofd\test" with open(pdf_p,"rb") as f: content = f.read() diff --git a/easyofd/draw/draw_pdf.py b/easyofd/draw/draw_pdf.py index b430ccb..1c7c9ec 100644 --- a/easyofd/draw/draw_pdf.py +++ b/easyofd/draw/draw_pdf.py @@ -277,66 +277,70 @@ def draw_signature(self, canvas, signatures_page_list, page_size): def draw_line(self,canvas,line_list,page_size): """绘制线条""" # print("绘制",line_list) + + def match_mode(Abbr: list): + """ + 解析AbbreviatedData + 匹配各种线条模式 + S 定义起始 坐标 x, y + M 移动到指定坐标 x, y + L 从当前点移动到指定点 x, y + Q x1 y1 x2 y2 二次贝塞尔曲线 + B x1 y1 x2 y2 x3 y3 三次贝塞尔曲线 + A 到 x,y 的圆弧 并移动到 x,y rx 长轴 ry 短轴 angle 旋转角度 large为1表示 大于180 的弧 为0时表示小于180的弧 swcpp 为1 表示顺时针旋转 0 表示逆时针旋转 + C 当前点和SubPath自动闭合 + """ + relu_list = [] + mode = "" + modes = ["S", "M", "L", "Q", "B", "A", "C"] + mode_dict = {} + for idx, i in enumerate(Abbr): + if i in modes: + mode = i + if mode_dict: + relu_list.append(mode_dict) + mode_dict = {"mode": i, "points": []} + + else: + mode_dict["points"].append(i) + + if idx + 1 == len(Abbr): + relu_list.append(mode_dict) + return relu_list + + + def assemble(relu_list: list): + start_point = {} + acticon = [] + for i in relu_list: + if i.get("mode") == "M": + start_point = i + elif i.get("mode") in ['B', "Q", 'L']: + acticon.append({"start_point": start_point, + "end_point": i + }) + return acticon + + def convert_coord(p_list, direction, page_size, pos): + """坐标转换ofd2pdf""" + new_p_l = [] + for p in p_list: + if direction == "x": + + new_p = (float(pos[0]) + float(p)) * self.OP + else: + new_p = (float(page_size[3]) - float(pos[1]) - float(p)) * self.OP + new_p_l.append(new_p) + return new_p_l + for line in line_list: Abbr = line.get("AbbreviatedData").split(" ") # AbbreviatedData color = line.get("FillColor",[0,0,0]) - def match_mode(Abbr:list): - """ - 解析AbbreviatedData - 匹配各种线条模式 - S 定义起始 坐标 x, y - M 移动到指定坐标 x, y - L 从当前点移动到指定点 x, y - Q x1 y1 x2 y2 二次贝塞尔曲线 - B x1 y1 x2 y2 x3 y3 三次贝塞尔曲线 - A 到 x,y 的圆弧 并移动到 x,y rx 长轴 ry 短轴 angle 旋转角度 large为1表示 大于180 的弧 为0时表示小于180的弧 swcpp 为1 表示顺时针旋转 0 表示逆时针旋转 - C 当前点和SubPath自动闭合 - """ - relu_list = [] - mode = "" - modes = ["S","M","L","Q","B","A","C"] - mode_dict = {} - for idx, i in enumerate(Abbr): - if i in modes: - mode = i - if mode_dict: - relu_list.append(mode_dict) - mode_dict = {"mode": i, "points": []} - - else: - mode_dict["points"].append(i) - - if idx + 1 == len(Abbr): - relu_list.append(mode_dict) - return relu_list - + relu_list = match_mode(Abbr) # TODO 组合 relu_list 1 M L 直线 2 M B*n 三次贝塞尔线 3 M Q*n 二次贝塞尔线 - def assemble(relu_list: list): - start_point = {} - acticon = [] - for i in relu_list: - if i.get("mode") == "M": - start_point = i - elif i.get("mode") in ['B', "Q", 'L']: - acticon.append({"start_point": start_point, - "end_point":i - }) - return acticon - - def convert_coord(p_list,direction, page_size, pos): - """坐标转换ofd2pdf""" - new_p_l = [] - for p in p_list: - if direction == "x": - - new_p = (float(pos[0]) + float(p)) * self.OP - else: - new_p = (float(page_size[3]) - float(pos[1]) - float(p)) * self.OP - new_p_l.append(new_p) - return new_p_l - + # print(relu_list) acticons = assemble(relu_list) @@ -405,20 +409,20 @@ def draw_pdf(self): c.setPageSize((page_size[2]*self.OP, page_size[3]*self.OP)) # 写入图片 - self.draw_img(c, img_list, images, page_size) - - + if img_list: + self.draw_img(c, img_list, images, page_size) # 写入文本 - self.draw_chars(c, text_list, fonts, page_size) + if text_list: + self.draw_chars(c, text_list, fonts, page_size) # 绘制线条 - self.draw_line(c, line_list, page_size) + if line_list: + self.draw_line(c, line_list, page_size) # 绘制签章 - self.draw_signature(c, signatures_page_id.get(page_id), page_size) - - + if signatures_page_id: + self.draw_signature(c, signatures_page_id.get(page_id), page_size) # print("去写入") diff --git a/easyofd/draw/font_tools.py b/easyofd/draw/font_tools.py index e5b0da8..3ef9a84 100644 --- a/easyofd/draw/font_tools.py +++ b/easyofd/draw/font_tools.py @@ -13,14 +13,14 @@ import os import shutil import logging -from io import BytesIO,StringIO +from io import BytesIO, StringIO import string from uuid import uuid1 import random import traceback import logging -import numpy as np + import tempfile import xmltodict from fontTools.ttLib import TTFont as ttLib_TTFont diff --git a/easyofd/draw/pdf_parse.py b/easyofd/draw/pdf_parse.py index 4d8bda1..590b16a 100644 --- a/easyofd/draw/pdf_parse.py +++ b/easyofd/draw/pdf_parse.py @@ -153,7 +153,7 @@ def extract_text_with_details(self, pdf_bytes): # print("details_list",details_list) return details_list, res_uuid_map def to_img(self, buffer_pdf): - """转图片""" + """pdf2img""" pix_list = [] pdfDoc = fitz.open(stream=buffer_pdf) for pg in range(pdfDoc.page_count): @@ -166,6 +166,8 @@ def to_img(self, buffer_pdf): # zoom_x,zoom_y = (1,1) mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate) pix = page.get_pixmap(matrix=mat, alpha=False) + + pix_list.append(pix) return pix_list diff --git a/easyofd/ofd.py b/easyofd/ofd.py index 17b32dc..232c78f 100644 --- a/easyofd/ofd.py +++ b/easyofd/ofd.py @@ -14,8 +14,8 @@ from typing import Any import fitz -import cv2 -import numpy as np + +from PIL import Image from loguru import logger from easyofd.parser_ofd import OFDParser @@ -79,23 +79,24 @@ def pdf2img(self, pdfbytes): zoom_x, zoom_y = 1.6, 1.6 mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate) pix = page.get_pixmap(matrix=mat, alpha=False) - image = np.ndarray((pix.height, pix.width, 3), dtype=np.uint8, buffer=pix.samples) + pil_image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) + # image = np.ndarray((pix.height, pix.width, 3), dtype=np.uint8, buffer=pix.samples) # print(image.shape) # print(image[2]) - image_list.append(image) - logger.info(f"to_jpg") + image_list.append(pil_image) + logger.info(f"pdf2img") return image_list def jpg2ofd(self,imglist:list): """ - imglist: cv2 image list + imglist: pil image list """ - ofd_byte = OFDWrite()(cv2_img_list=imglist) + ofd_byte = OFDWrite()(pil_img_list=imglist) return ofd_byte def jpg2pfd(self,imglist:list): """ - imglist: cv2 image list + imglist: PIL image list 1 构建data 2 DrawPDF(self.data)() """ @@ -105,25 +106,14 @@ def jpg2pfd(self,imglist:list): def to_jpg(self,format="jpg"): """ - return numpy list + return pil list """ - assert self.data,f"data is None" + assert self.data, f"data is None" image_list = [] pdfbytes = self.to_pdf() - - doc = fitz.open(stream=pdfbytes, filetype="pdf") - - for page in doc: - rotate = int(0) - zoom_x, zoom_y = 1.6, 1.6 - mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate) - pix = page.get_pixmap(matrix=mat, alpha=False) - image = np.ndarray((pix.height, pix.width, 3), dtype=np.uint8, buffer=pix.samples) - # print(image.shape) - # print(image[2]) - image_list.append(image) - logger.info(f"to_jpg") + image_list = self.pdf2img(pdfbytes) return image_list + def del_data(self,): diff --git a/easyofd/parser_ofd/img_deal.py b/easyofd/parser_ofd/img_deal.py index 95bc60d..2181000 100644 --- a/easyofd/parser_ofd/img_deal.py +++ b/easyofd/parser_ofd/img_deal.py @@ -1,7 +1,35 @@ #!/usr/bin/env python -#-*- coding: utf-8 -*- -#PROJECT_NAME: easyofd img_deal -#CREATE_TIME: 2024/7/18 11:20 -#E_MAIL: renoyuan@foxmail.com -#AUTHOR: renoyuan -#note: \ No newline at end of file +# -*- coding: utf-8 -*- +# PROJECT_NAME: easyofd img_deal +# CREATE_TIME: 2024/7/18 11:20 +# E_MAIL: renoyuan@foxmail.com +# AUTHOR: renoyuan +# note: img 操作 +from io import BytesIO +class DealImg(object): + def __init__(self): + pass + def resize(self): + """resize img""" + pass + def pil2bytes(self, image): + """pil2bytes""" + # 创建一个 BytesIO 对象 + img_bytesio = BytesIO() + # 将图像保存到 BytesIO 对象 + image.save(img_bytesio, format='PNG') # 你可以根据需要选择其他图像格式 + # 获取 BytesIO 对象中的字节 + img_bytes = img_bytesio.getvalue() + # 关闭 BytesIO 对象 + img_bytesio.close() + return img_bytes + def pil2bytes_io(self, image): + """pil2bytes_io""" + # 创建一个 BytesIO 对象 + img_bytesio = BytesIO() + # 将图像保存到 BytesIO 对象 + image.save(img_bytesio, format='PNG') # 你可以根据需要选择其他图像格式 + return img_bytesio + + + diff --git a/easyofd/parser_ofd/ofd_parser.py b/easyofd/parser_ofd/ofd_parser.py index ec9493b..5746fb7 100644 --- a/easyofd/parser_ofd/ofd_parser.py +++ b/easyofd/parser_ofd/ofd_parser.py @@ -14,10 +14,11 @@ import traceback import base64 import re -import cv2 -from typing import Any + +from typing import Any,List from PIL import Image from loguru import logger +from .img_deal import DealImg from .file_deal import FileRead from .file_parser import (OFDFileParser, DocumentFileParser, ContentFileParser,DocumentResFileParser,PublicResFileParser, SignaturesFileParser,SignatureFileParser) @@ -32,11 +33,12 @@ class OFDParser(object): 2 调用font 注册 字体 """ def __init__(self, ofdb64): + self.img_deal = DealImg() self.ofdb64 = ofdb64 self.file_tree = None self.jbig2dec_path = r"C:/msys64/mingw64/bin/jbig2dec.exe" - def img2data(self,imglist): + def img2data(self,imglist:List[Image]): """ imglist to ofd data @@ -49,10 +51,9 @@ def img2data(self,imglist): page_info_d = {} - for idx, img_numpy in enumerate(imglist): - h, w, _ = img_numpy.shape - _, img_encode = cv2.imencode('.jpg', img_numpy) - img_bytes = img_encode.tobytes() + for idx, img_pil in enumerate(imglist): + w,h = img_pil.size + img_bytes = self.img_deal.pil2bytes(img_pil) imgb64 = str(base64.b64encode(img_bytes),encoding="utf-8") img_info[str(idx)] = { "format": "jpg", diff --git a/requerment.txt b/requerment.txt index 36f2d02..7aabdd4 100644 --- a/requerment.txt +++ b/requerment.txt @@ -1,8 +1,6 @@ reportlab==3.6.11 xmltodict==0.13.0 -numpy==1.26.1 loguru==0.7.2 fontTools==4.43.1 PyMuPDF==1.23.4 -opencv-python>=4.6.0.66 pyasn1>=0.6.0 diff --git a/setup.py b/setup.py index 9c2cc30..5760f98 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,6 @@ "loguru>=0.7.2", "fontTools>=4.43.1", "PyMuPDF>=1.23.4", - "opencv-python>=4.6.0.66", "pyasn1>=0.6.0" ], python_requires='>=3.8', diff --git a/test/demo.py b/test/demo.py index d58d4dd..49b1bd8 100644 --- a/test/demo.py +++ b/test/demo.py @@ -15,8 +15,7 @@ print(pkg_dir) sys.path.insert(0,project_dir) sys.path.insert(0,pkg_dir) -import numpy as np -import cv2 + from easyofd.ofd import OFD @@ -30,12 +29,12 @@ def test_img2(dir_path): imgs_p = os.listdir(dir_path) imgs = [] for img_p in imgs_p: - imgs.append(cv2.imread(os.path.join(dir_path,img_p))) - ofdbytes = ofd = OFD().jpg2ofd(imgs) - pdfbytes = ofd = OFD().jpg2pfd(imgs) - with open(r"img2test.pdf","wb") as f: + imgs.append(Image.open(os.path.join(dir_path, img_p))) # 传入改为pil + ofdbytes = OFD().jpg2ofd(imgs) + pdfbytes = OFD().jpg2pfd(imgs) + with open(r"img2test.pdf", "wb") as f: f.write(pdfbytes) - with open(r"img2test.ofd","wb") as f: + with open(r"img2test.ofd", "wb") as f: f.write(ofdbytes) def test_ofd2(file_path): @@ -59,8 +58,8 @@ def test_ofd2(file_path): f.write(pdf_bytes) for idx, img in enumerate(img_np): - im = Image.fromarray(img) - im.save(f"{file_prefix}_{idx}.jpg") + # im = Image.fromarray(img) + img.save(f"{file_prefix}_{idx}.jpg") def test_pdf2(file_path): """ @@ -77,13 +76,13 @@ def test_pdf2(file_path): with open(f"{file_prefix}.ofd", "wb") as f: f.write(ofd_bytes) for idx, img in enumerate(img_np): - im = Image.fromarray(img) - im.save(f"{file_prefix}_{idx}.jpg") + img.save(f"{file_prefix}_{idx}.jpg") if __name__ == "__main__": - file_path = r"1.ofd" - # file_path = r"E:\download\MyPython\ceshi.pdf" + file_path = r"data/1.ofd" + # file_path = r"F:\code\easyofd\test\img" + file_path = r"E:\download\MyPython\ceshi.pdf" if sys.argv[1] =="ofd2": test_ofd2(file_path) elif sys.argv[1] =="pdf2":