-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetFromPDF.py
80 lines (61 loc) · 1.77 KB
/
getFromPDF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pdfplumber
import json
import re
# Path of the PDF to parse; the commented-out call at the end of this file
# passes this value to extract().
syllabus = "theatre.pdf"
# Date shapes looked for in a line, tried in this priority order:
# DD/DD (first digit 0 or 1), DD/D (first digit 0 or 1), D/DD, D/D.
_DATE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"[0-1][0-9][/][0-9][0-9]",
        r"[0-1][0-9][/][0-9]",
        r"[0-9][/][0-9][0-9]",
        r"[0-9][/][0-9]",
    )
)


def _split_date_entry(line):
    """Return ``[date_token, description]`` for *line*, or ``None``.

    The first pattern in ``_DATE_PATTERNS`` found anywhere in the line is
    selected.  The first whitespace-separated token that *starts* with that
    pattern becomes the date; tokens before it are discarded and tokens after
    it are joined into the description.  ``None`` is returned when the line
    has no date or when no token starts with the selected pattern (e.g. the
    date is wrapped in punctuation such as ``(10/12)``).
    """
    pattern = next((p for p in _DATE_PATTERNS if p.search(line)), None)
    if pattern is None:
        return None

    tokens = line.split()
    for position, token in enumerate(tokens):
        if pattern.match(token):
            # Each following token is emitted with a trailing space, so a
            # non-empty description ends in " " -- kept to leave the JSON
            # output unchanged for existing consumers.
            description = "".join(f"{word} " for word in tokens[position + 1:])
            return [token, description]
    return None


def extract(syllabus, output_path='calendar.json'):
    """Extract dated lines from a PDF and write them to a JSON file.

    The text of every page is lower-cased and split into lines.  Each line
    containing a ``digit(s)/digit(s)`` date is reduced to a
    ``[date_token, description]`` pair (see ``_split_date_entry``), and the
    list of pairs is dumped to *output_path* with an indent of 1.

    Args:
        syllabus: Path (or file object) handed to ``pdfplumber.open``.
        output_path: Destination of the JSON output.  Defaults to
            ``'calendar.json'``, the path that was previously hard-coded.

    Returns:
        The list of ``[date_token, description]`` pairs that was written.
    """
    with pdfplumber.open(syllabus) as pdf:
        # extract_text() can return None for a page without a text layer on
        # some pdfplumber versions; treat that as an empty page instead of
        # failing on ``None + "\n"``.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    file_text = "".join(text + "\n" for text in page_texts).lower()

    # List of [date, assignment] pairs, in document order.
    calendar_list = []
    for line in file_text.split("\n"):
        entry = _split_date_entry(line)
        if entry is not None:
            calendar_list.append(entry)

    with open(output_path, 'w') as json_file:
        json.dump(calendar_list, json_file, indent=1)
    print("done")
    return calendar_list
# extract(syllabus)