-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathpdf2html.py
executable file
·26 lines (23 loc) · 951 Bytes
/
pdf2html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
#!/usr/bin/env python3
"""
PDF to HTML conversion - first step of the process.
Batch processes a folder full of PDFs using pdf2htmlEX
producing a HTML folder.
This HTML uses just CSS positioning for layout. We need
further work to add semantic tags: transcript.py
"""
import glob
import multiprocessing
import os
import subprocess
import sys
import time

import config
def pdf2html(pdf_path):
    """Convert a single PDF to HTML by running pdf2htmlEX.

    The output is written to ``<config.HTML_DIR>/<name>/<name>.html``, where
    ``<name>`` is the PDF's file name without its extension.

    Args:
        pdf_path: Path of the PDF file to convert.

    Returns:
        None. A failed conversion is reported on stderr instead of being
        raised, so one bad PDF does not abort a batch run.
    """
    # os.path handles any path separator and drops only the trailing
    # extension, not every '.pdf' that happens to appear in the name.
    no = os.path.splitext(os.path.basename(pdf_path))[0]
    # --embed cfijo = don't embed Css, Fonts, Images, Js, Outlines (> man pdf2htmlEX)
    # The command is an argument list (no shell), so paths containing spaces
    # or shell metacharacters are passed through to pdf2htmlEX verbatim.
    command = [
        'pdf2htmlEX',
        '--embed-external-font', '0',
        '--external-hint-tool', 'ttfautohint',
        '--process-nontext', '0',
        '--embed', 'cfijo',
        '--dest-dir', os.path.join(config.HTML_DIR, no),
        pdf_path,
        '%s.html' % no,
    ]
    try:
        result = subprocess.run(command, check=False)
    except OSError as err:
        # E.g. pdf2htmlEX is not installed or not executable.
        print('pdf2htmlEX could not be started for %s: %s' % (pdf_path, err),
              file=sys.stderr)
    else:
        if result.returncode != 0:
            print('pdf2htmlEX exited with status %d for %s'
                  % (result.returncode, pdf_path), file=sys.stderr)
    # Short pause after each conversion, kept from the original code
    # (NOTE(review): the reason for the delay is not documented -- confirm).
    time.sleep(.2)
if __name__ == '__main__':
    # Convert every PDF in config.PDF_DIR whose name matches '*_*.pdf',
    # spreading the work over four worker processes. Using the pool as a
    # context manager releases its resources once map() has returned all
    # results, instead of leaving the pool open until interpreter exit.
    with multiprocessing.Pool(4) as p:
        print(p.map(pdf2html, glob.glob(config.PDF_DIR + '/*_*.pdf')))