Skip to content

Commit

Permalink
Merge branch 'PaddlePaddle:release/2.6' into api
Browse files Browse the repository at this point in the history
  • Loading branch information
m986883511 authored Sep 30, 2022
2 parents bca7f73 + 7f6c9a7 commit 8afc30c
Show file tree
Hide file tree
Showing 9 changed files with 67 additions and 182 deletions.
84 changes: 37 additions & 47 deletions PPOCRLabel/PPOCRLabel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2449,13 +2449,6 @@ def exportJSON(self):
export PPLabel and CSV to JSON (PubTabNet)
'''
import pandas as pd
from libs.dataPartitionDialog import DataPartitionDialog

# data partition user input
partitionDialog = DataPartitionDialog(parent=self)
partitionDialog.exec()
if partitionDialog.getStatus() == False:
return

# automatically save annotations
self.saveFilestate()
Expand All @@ -2478,28 +2471,19 @@ def exportJSON(self):
labeldict[file] = eval(label)
else:
labeldict[file] = []

# read table recognition output
TableRec_excel_dir = os.path.join(
self.lastOpenDir, 'tableRec_excel_output')

train_split, val_split, test_split = partitionDialog.getDataPartition()
# check validate
if train_split + val_split + test_split > 100:
msg = "The sum of training, validation and testing data should be less than 100%"
QMessageBox.information(self, "Information", msg)
return
print(train_split, val_split, test_split)
train_split, val_split, test_split = float(train_split) / 100., float(val_split) / 100., float(test_split) / 100.
train_id = int(len(labeldict) * train_split)
val_id = int(len(labeldict) * (train_split + val_split))
print('Data partition: train:', train_id,
'validation:', val_id - train_id,
'test:', len(labeldict) - val_id)

TableRec_excel_dir = os.path.join(self.lastOpenDir, 'tableRec_excel_output')
json_results = []
imgid = 0
# save txt
fid = open(
"{}/gt.txt".format(self.lastOpenDir), "w", encoding='utf-8')
for image_path in labeldict.keys():
# load csv annotations
filename, _ = os.path.splitext(os.path.basename(image_path))
csv_path = os.path.join(TableRec_excel_dir, filename + '.xlsx')
csv_path = os.path.join(
TableRec_excel_dir, filename + '.xlsx')
if not os.path.exists(csv_path):
continue

Expand All @@ -2518,28 +2502,31 @@ def exportJSON(self):
cells = []
for anno in labeldict[image_path]:
tokens = list(anno['transcription'])
obb = anno['points']
hbb = OBB2HBB(np.array(obb)).tolist()
cells.append({'tokens': tokens, 'bbox': hbb})

# data split
if imgid < train_id:
split = 'train'
elif imgid < val_id:
split = 'val'
else:
split = 'test'

# save dict
html = {'structure': {'tokens': token_list}, 'cells': cells}
json_results.append({'filename': os.path.basename(image_path), 'split': split, 'imgid': imgid, 'html': html})
imgid += 1

# save json
with open("{}/annotation.json".format(self.lastOpenDir), "w", encoding='utf-8') as fid:
fid.write(json.dumps(json_results, ensure_ascii=False))

msg = 'JSON sucessfully saved in {}/annotation.json'.format(self.lastOpenDir)
cells.append({
'tokens': tokens,
'bbox': anno['points']
})

# 构造标注信息
html = {
'structure': {
'tokens': token_list
},
'cells': cells
}
d = {
'filename': os.path.basename(image_path),
'html': html
}
# 重构HTML
d['gt'] = rebuild_html_from_ppstructure_label(d)
fid.write('{}\n'.format(
json.dumps(
d, ensure_ascii=False)))

# convert to PP-Structure label format
fid.close()
msg = 'JSON sucessfully saved in {}/gt.txt'.format(self.lastOpenDir)
QMessageBox.information(self, "Information", msg)

def autolcm(self):
Expand Down Expand Up @@ -2728,6 +2715,9 @@ def change_box_key(self):

self._update_shape_color(shape)
self.keyDialog.addLabelHistory(key_text)

# save changed shape
self.setDirty()

def undoShapeEdit(self):
self.canvas.restoreShape()
Expand Down
4 changes: 2 additions & 2 deletions PPOCRLabel/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,9 +105,9 @@ python PPOCRLabel.py --kie True # [KIE mode] for [detection + recognition + keyw
#### 1.2.3 Build and Install the Whl Package Locally
Compile and install a new whl package, where 2.1.2 is the version number; you can specify a new version in 'setup.py'.
```bash
cd PaddleOCR/PPOCRLabel
cd ./PPOCRLabel
python3 setup.py bdist_wheel
pip3 install dist/PPOCRLabel-1.0.2-py2.py3-none-any.whl
pip3 install dist/PPOCRLabel-2.1.2-py2.py3-none-any.whl
```


Expand Down
4 changes: 2 additions & 2 deletions PPOCRLabel/README_ch.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,9 @@ python PPOCRLabel.py --lang ch
编译与安装新的whl包,其中2.1.2为版本号,可在 `setup.py` 中指定新版本。

```bash
cd PaddleOCR/PPOCRLabel
cd ./PPOCRLabel
python3 setup.py bdist_wheel
pip3 install dist/PPOCRLabel-1.0.2-py2.py3-none-any.whl -i https://mirror.baidu.com/pypi/simple
pip3 install dist/PPOCRLabel-2.1.2-py2.py3-none-any.whl -i https://mirror.baidu.com/pypi/simple
```


Expand Down
6 changes: 3 additions & 3 deletions PPOCRLabel/libs/canvas.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,8 +611,8 @@ def paintEvent(self, event):

if self.drawing() and not self.prevPoint.isNull() and not self.outOfPixmap(self.prevPoint):
p.setPen(QColor(0, 0, 0))
p.drawLine(self.prevPoint.x(), 0, self.prevPoint.x(), self.pixmap.height())
p.drawLine(0, self.prevPoint.y(), self.pixmap.width(), self.prevPoint.y())
p.drawLine(int(self.prevPoint.x()), 0, int(self.prevPoint.x()), self.pixmap.height())
p.drawLine(0, int(self.prevPoint.y()), self.pixmap.width(), int(self.prevPoint.y()))

self.setAutoFillBackground(True)
if self.verified:
Expand Down Expand Up @@ -909,4 +909,4 @@ def isShapeRestorable(self):
def updateShapeIndex(self):
for i in range(len(self.shapes)):
self.shapes[i].idx = i
self.update()
self.update()
113 changes: 0 additions & 113 deletions PPOCRLabel/libs/dataPartitionDialog.py

This file was deleted.

32 changes: 20 additions & 12 deletions PPOCRLabel/libs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,18 +176,6 @@ def boxPad(box, imgShape, pad : int) -> np.array:
return box


def OBB2HBB(obb) -> np.array:
    """
    Collapse an oriented bounding box into its axis-aligned extent.

    Args:
        obb: 2-D array indexed as ``obb[:, 0]`` and ``obb[:, 1]``
            (presumably one row per corner point, columns x and y).

    Returns:
        A length-4 ``np.int32`` array laid out as
        ``[min(col 0), min(col 1), max(col 0), max(col 1)]``.
    """
    first_col = obb[:, 0]
    second_col = obb[:, 1]
    extents = (
        min(first_col),
        min(second_col),
        max(first_col),
        max(second_col),
    )

    # Element-wise stores into an int32 buffer keep the original casting
    # behaviour for non-integer coordinates.
    hbb = np.zeros(4, dtype=np.int32)
    for slot, value in enumerate(extents):
        hbb[slot] = value
    return hbb


def expand_list(merged, html_list):
'''
Fill blanks according to merged cells
Expand Down Expand Up @@ -232,6 +220,26 @@ def convert_token(html_list):
return token_list


def rebuild_html_from_ppstructure_label(label_info):
    """
    Rebuild a full HTML table string from a PP-Structure style label.

    Args:
        label_info: mapping with ``label_info['html']['structure']['tokens']``
            (list of structure tokens) and ``label_info['html']['cells']``
            (list of dicts, each with a ``'tokens'`` list of cell content).

    Returns:
        The structure tokens with each non-empty cell's text inserted right
        after its opening ``'<td>'`` / ``'>'`` token, joined into one string
        and wrapped in ``<html><body><table>...</table></body></html>``.
        The input token list is copied, not modified.
    """
    from html import escape

    html_info = label_info['html']
    pieces = html_info['structure']['tokens'].copy()

    # Positions of the tokens that close a cell's opening tag.
    slots = [pos for pos, tag in enumerate(pieces) if tag in ('<td>', '>')]

    # Walk back-to-front so an insertion never shifts a slot still to visit.
    for pos, cell in zip(slots[::-1], html_info['cells'][::-1]):
        if not cell['tokens']:
            continue
        # Only single-character tokens are escaped; longer tokens are
        # inserted verbatim.
        text = ''.join(
            token if len(token) != 1 else escape(token)
            for token in cell['tokens']
        )
        pieces.insert(pos + 1, text)

    return '<html><body><table>{}</table></body></html>'.format(
        ''.join(pieces))


def stepsInfo(lang='en'):
if lang == 'ch':
msg = "1. 安装与运行:使用上述命令安装与运行程序。\n" \
Expand Down
2 changes: 1 addition & 1 deletion PPOCRLabel/resources/strings/strings-en.properties
Original file line number Diff line number Diff line change
Expand Up @@ -113,4 +113,4 @@ keyDialogTip=Enter object label
keyChange=Change Box Key
TableRecognition=Table Recognition
cellreRecognition=Cell Re-Recognition
exportJSON=Export Excel Label(PubTabNet)
exportJSON=Export Table Label
2 changes: 1 addition & 1 deletion PPOCRLabel/resources/strings/strings-zh-CN.properties
Original file line number Diff line number Diff line change
Expand Up @@ -113,4 +113,4 @@ keyDialogTip=请输入类型名称
keyChange=更改Box关键字类别
TableRecognition=表格识别
cellreRecognition=单元格重识别
exportJSON=导出表格JSON标注
exportJSON=导出表格标注
2 changes: 1 addition & 1 deletion PPOCRLabel/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def readme():
package_dir={'PPOCRLabel': ''},
include_package_data=True,
entry_points={"console_scripts": ["PPOCRLabel= PPOCRLabel.PPOCRLabel:main"]},
version='2.1.1',
version='2.1.2',
install_requires=requirements,
license='Apache License 2.0',
description='PPOCRLabel is a semi-automatic graphic annotation tool suitable for OCR field, with built-in PPOCR model to automatically detect and re-recognize data. It is written in python3 and pyqt5, supporting rectangular box annotation and four-point annotation modes. Annotations can be directly used for the training of PPOCR detection and recognition models',
Expand Down

0 comments on commit 8afc30c

Please sign in to comment.