utils.py

import re
from bs4 import BeautifulSoup as bs

def format_html(img):
    ''' Formats HTML code from tokenized annotation of img
    '''
    # html_string = '''<html>
    #                  <head>
    #                  <meta charset="UTF-8">
    #                  <style>
    #                  table, th, td {
    #                    border: 1px solid black;
    #                    font-size: 10px;
    #                  }
    #                  </style>
    #                  </head>
    #                  <body>
    #                  <table frame="hsides" rules="groups" width="100%%">
    #                      %s
    #                  </table>
    #                  </body>
    #                  </html>''' % ''.join(img['html']['structure']['tokens'])
    html_string = '''<html><body><table>%s</table></body></html>''' % ''.join(img['html']['structure']['tokens'])
    cell_nodes = list(re.finditer(r'(<td[^<>]*>)(</td>)', html_string))
    # print(len(cell_nodes), len(img['html']['cells']))
    assert len(cell_nodes) == len(img['html']['cells']), 'Number of cells defined in tags does not match the length of cells'
    cells = [''.join(c['tokens']) for c in img['html']['cells']]
    offset = 0
    for n, cell in zip(cell_nodes, cells):
        html_string = html_string[:n.end(1) + offset] + cell + html_string[n.start(2) + offset:]
        offset += len(cell)
    # # prettify the html
    # soup = bs(html_string, features="html.parser")
    # html_string = soup.prettify()
    return html_string


if __name__ == '__main__':
    import json
    import sys
    f = sys.argv[1]
    with open(f, 'r') as fp:
        annotations = json.load(fp)
        for img in annotations['images']:
            html_string = format_html(img)
            print(html_string)