-
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathGet-Info-from-CLT-Admission-Ticket.py
104 lines (94 loc) · 4.41 KB
/
Get-Info-from-CLT-Admission-Ticket.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding:utf-8 -*-
# by 'hollowman6' from Lanzhou University(兰州大学)
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
import os.path
# 正则表达式搜索 Regular expression search
import re
def parse(path):
    """Extract the admission ticket number and candidate name from a PDF.

    The text of every horizontal text box on every page is collected with
    pdfminer, then the values following the ``准考证号:`` (admission ticket
    number) and ``姓名:`` (name) labels are pulled out with regular
    expressions.

    Args:
        path: Path of the PDF file to read.

    Returns:
        A string of the form ``"<ticket number> <name>"``. If a label occurs
        more than once, all of its matches are concatenated; if it does not
        occur, that part is empty.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # The context manager guarantees the file is closed, including when the
    # document is not extractable or pdfminer raises while parsing.
    with open(path, 'rb') as fp:
        # Create a parser on the file object and connect it to a document.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # initialize() is called without a password.
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout-aggregating device and page interpreter.
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        chunks = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            # The device yields the layout object of the page just processed;
            # only its horizontal text boxes contribute text.
            layout = device.get_result()
            chunks.extend(
                item.get_text()
                for item in layout
                if isinstance(item, LTTextBoxHorizontal)
            )

    results = ''.join(chunks)
    ticket_number = ''.join(re.findall(r'准考证号:(.+)\n', results))
    candidate_name = ''.join(re.findall(r'姓名:(.+)\n', results))
    return ticket_number + " " + candidate_name
def get_files(path='Downloads', rule=".pdf"):
    """Recursively collect the files under ``path`` whose path ends with ``rule``.

    Args:
        path: Root directory to walk. Defaults to ``'Downloads'``.
        rule: Suffix the joined file path must end with (case-sensitive).
            Defaults to ``".pdf"``.

    Returns:
        A list of matching paths, each built by joining the containing
        directory with the file name, in the order ``os.walk`` yields them.
    """
    # Build every candidate path lazily, then keep those with the wanted
    # suffix. No local name shadows the builtin ``all``.
    candidates = (
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(path)
        for filename in filenames
    )
    return [candidate for candidate in candidates if candidate.endswith(rule)]
# Exam keywords that may appear in a ticket file name, paired with the exam
# code prefix written to info.txt. A name matching none of them falls back
# to 'CET4_'.
_EXAM_CODES = (
    ('英语六级', 'CET6_'),
    ('俄语四级', 'CRT4_'),
    ('俄语六级', 'CRT6_'),
    ('德语四级', 'PHS4_'),
    ('德语六级', 'PHS6_'),
    ('法语四级', 'TFU4_'),
    ('法语六级', 'TFU6_'),
    ('日语四级', 'CJT4_'),
    ('日语六级', 'CJT6_'),
)


def build_record_prefix(name):
    """Build the exam-code prefix of an info.txt record from a ticket path.

    The file name (directory part ignored) is expected to look like
    ``2019年上半年英语六级笔试准考证(XXX).pdf``: characters 2-3 are the two-digit
    year and character 5 tells the first (``上``) from the second half-year.

    Args:
        name: Path of the admission ticket PDF.

    Returns:
        ``<exam code><two-digit year><term>_DANGCI``, e.g. ``CET6_191_DANGCI``,
        where term is ``'1'`` for the first half-year and ``'2'`` otherwise.
    """
    # Work on the base name so the result does not depend on how long the
    # directory part of the path is.
    base = os.path.basename(name)
    # Slices (not indexing) keep short or unexpected names from raising.
    term = '1' if base[5:6] == '上' else '2'
    code = 'CET4_'
    for keyword, prefix in _EXAM_CODES:
        if keyword in base:
            code = prefix
            break
    return code + base[2:4] + term + '_DANGCI'


if __name__ == '__main__':
    input("使用前请确保所有pdf文件已经解压到Downloads目录下,且文件名格式类似于“2019年上半年英语六级笔试准考证(XXX).pdf”,准备好后按回车键继续...")
    # Scan only after the user confirms, so files extracted while the prompt
    # was showing are included.
    pdflist = get_files()
    # Records are appended; the context manager closes the file even if
    # parsing one of the PDFs fails.
    with open("info.txt", 'a', encoding="UTF-8") as fw:
        print("正在提取中,请稍后...")
        for name in pdflist:
            print(name)
            fw.write(build_record_prefix(name) + ' ' + parse(name) + "\n")
    input("全部完成!按回车键退出...")