-
Notifications
You must be signed in to change notification settings - Fork 24
/
utils.py
46 lines (43 loc) · 1.74 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import re
from bs4 import BeautifulSoup as bs
def format_html(img):
    ''' Formats HTML code from tokenized annotation of img

    The structure tokens in ``img['html']['structure']['tokens']`` are joined
    and wrapped in ``<html><body><table>...</table></body></html>``. Every
    empty cell in that skeleton (an opening ``<td ...>`` tag immediately
    followed by ``</td>``) is then filled, in document order, with the joined
    ``'tokens'`` of the corresponding entry of ``img['html']['cells']``.

    Args:
        img: mapping that provides ``img['html']['structure']['tokens']``
            (an iterable of strings) and ``img['html']['cells']`` (a sequence
            of mappings, each with a ``'tokens'`` iterable of strings).

    Returns:
        The assembled HTML document as a single string. Cell tokens are
        inserted verbatim; no HTML escaping is applied.

    Raises:
        AssertionError: if the number of empty ``<td>`` elements in the
            structure differs from the number of entries in the cell list.
        KeyError: if one of the expected keys is missing from ``img``.
    '''
    html_string = '''<html><body><table>%s</table></body></html>''' % ''.join(img['html']['structure']['tokens'])
    # Group 1 is the opening tag (with optional attributes), group 2 the
    # closing tag; the cell content belongs between the two.
    cell_nodes = list(re.finditer(r'(<td[^<>]*>)(</td>)', html_string))
    cells = img['html']['cells']
    # Raised explicitly instead of using an `assert` statement so the check
    # still runs under `python -O`; otherwise zip() below would silently
    # truncate to the shorter sequence.
    if len(cell_nodes) != len(cells):
        raise AssertionError('Number of cells defined in tags does not match the length of cells')

    # Collect the pieces in one pass and join once, rather than rebuilding
    # the whole string for every cell.
    pieces = []
    cursor = 0
    for node, cell in zip(cell_nodes, cells):
        pieces.append(html_string[cursor:node.end(1)])
        pieces.append(''.join(cell['tokens']))
        cursor = node.start(2)
    pieces.append(html_string[cursor:])
    return ''.join(pieces)
if __name__ == '__main__':
    # Command-line entry point: read the annotation JSON file named by the
    # first argument and print the HTML built for every entry of its
    # top-level 'images' list.
    import json
    import sys

    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit('usage: %s ANNOTATIONS_JSON' % sys.argv[0])
    f = sys.argv[1]
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(f, 'r', encoding='utf-8') as fp:
        annotations = json.load(fp)
    for img in annotations['images']:
        html_string = format_html(img)
        print(html_string)