-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsimple_pdf.py
52 lines (41 loc) · 1.58 KB
/
simple_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/7/6 21:02
# @Author : chen
# @Site :
# @File : simplePDF.py
# @Software: PyCharm
import os
from cStringIO import StringIO
import sys
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
def convert_pdf_2_text(path):
    """Run a PDF file through pdfminer and return the extracted text.

    Every page yielded by ``PDFPage.get_pages`` is processed by a
    ``PDFPageInterpreter`` whose ``TextConverter`` device (``codec='utf-8'``,
    default ``LAParams``) writes into an in-memory buffer.

    :param path: path of the PDF file; it is opened in binary mode.
    :return: the contents accumulated in the in-memory buffer.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(path, 'rb') as fp:
                # Empty page-number set: presumably means "no page filter"
                # (verify against pdfminer's PDFPage.get_pages).
                for page in PDFPage.get_pages(fp, set()):
                    interpreter.process_page(page)
            # Read the buffer before the device and buffer are closed below.
            return retstr.getvalue()
        finally:
            # Release the converter even when opening/parsing the PDF fails.
            device.close()
    finally:
        retstr.close()
def convert_pdf_to_txt(src_filename, dest_filename):
    """Convert a PDF file to text and save it as ``dest_filename + ".txt"``.

    :param src_filename: path of the PDF file to read.
    :param dest_filename: output path without the ``.txt`` suffix; the suffix
        is appended here.
    """
    # Convert before opening the output so that a failed conversion does not
    # leave an empty or truncated .txt file behind.
    text = convert_pdf_2_text(src_filename)
    with open(dest_filename + ".txt", "wb") as f:
        f.write(text)
if __name__ == '__main__':
    # Manual run: convert one hard-coded PDF from src_dir into dest_dir.
    src_dir = u"G:/data/business/AStock/financial_report"
    dest_dir = u"G:/data/business/AStock/third_parse"
    filename = u'S深发展A:2006年年度报告.PDF'
    # Neither directory constant ends with a separator, so plain string
    # concatenation would glue the file name onto the directory name.
    src_file = os.path.join(src_dir, filename)
    # convert_pdf_to_txt appends ".txt" to the destination path itself.
    convert_pdf_to_txt(src_file, os.path.join(dest_dir, filename))