-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
132 lines (102 loc) · 3.75 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
from __future__ import annotations
import glob
import os
import re
import pickle
import xml.etree.ElementTree as ET
from tqdm import tqdm
import pandas as pd
from src.data import oclc
# Pipeline switches: rebuild the cards table from the page XML files, and/or
# reload the previously pickled table from disk.
LOAD_XMLS = False
LOAD_PICKLE = True

# Raw string literals keep regex escapes such as ``\s`` and ``\.`` from being
# read as (invalid) Python string escapes, which newer interpreters warn about.
# The compiled patterns are unchanged.

# Shelfmark: 1-5 digits, a separator (whitespace/dot, 1-2 chars), 1-3 word
# characters, another separator, then 1-5 word characters.
smark_regex = re.compile(r"[0-9]{1,5}[\s\.]{1,2}[\w]{1,3}[\s\.]{1,2}[\w0-9]{1,5}")
# Author: an upper-case run, whitespace, then a capitalised word in
# parentheses, e.g. "ZHANG (Yu)".
author_regex = re.compile(r"[A-Z]+[\s]+\([A-Z][a-z]+\)")
# ISBN: the literal "ISBN", one whitespace character, then a run of digits,
# hyphens and whitespace.
isbn_regex = re.compile(r"ISBN\s[0-9\-\s]+")
def extractLines(root: ET.Element):
    """Collect the text of every line found under the second child of *root*.

    Children of ``root[1]`` with two or fewer sub-elements are ignored (empty
    text regions). Within each remaining region the first child (coordinate
    data) and the last child are skipped; every element in between is treated
    as a text line. For each line, the ``.text`` of the first child of its
    last child is taken (the text equivalent for the line).

    Returns a flat list of those text values in document order.
    """
    return [
        text_line[-1][0].text
        for region in root[1]
        if len(region) > 2
        for text_line in region[1:-1]
    ]
def extractLinesForVol(vol: list[ET.Element]):
    """Run ``extractLines`` on every XML root in *vol*, showing a progress bar.

    Returns a list with one entry per root, each entry being that root's list
    of line texts, in the same order as *vol*.
    """
    return [extractLines(page_root) for page_root in tqdm(vol)]
def find_author(lines, dummy):
    """Pick out the title and author from the text lines of one card.

    The first line matching ``author_regex`` is taken as the author. The title
    is then chosen relative to that line:

    * author on the third line or later: the title is the lines between the
      first line and the author line, joined with single spaces;
    * author on the second line: the title is the third line;
    * author on the first line: no title is assigned;
    * no author found: the title defaults to the second line.

    Cards too short to contain the expected title line yield ``None`` for the
    title instead of raising ``IndexError``, and lines whose text is ``None``
    are skipped rather than passed to the regex.

    Args:
        lines: Sequence of line texts for one card (entries may be ``None``).
        dummy: Unused; kept so existing callers that pass two values still work.

    Returns:
        A ``(title, author)`` tuple; either element may be ``None``.
    """
    author, title = None, None
    author_idx = None

    for idx, line in enumerate(lines):
        # A line without text cannot hold an author and would break the regex.
        if line is not None and author_regex.search(line):
            author, author_idx = line, idx
            break

    if author:
        if author_idx >= 2:
            # Author follows the second line, where the title is expected to start.
            title = " ".join(part for part in lines[1:author_idx] if part is not None)
        elif author_idx == 1 and len(lines) > 2:
            # Author is the second line, so take the line after it as the title.
            title = lines[2]
    elif len(lines) > 1:
        # No author found: default to the title being the second line.
        title = lines[1]

    return title, author
def isbn_search(x):
    """Return the first ``isbn_regex`` match found in *x*, or ``None`` if there is none.

    The returned string is the full matched text, including the leading
    "ISBN" marker.
    """
    found = isbn_regex.search(x)
    return found.group() if found else None
# Root directory of the P5 Transkribus export; page XML files are read from
# its "page" subfolder.
p5_root = (
    r"G:\DigiSchol\Digital Research and Curator Team\Projects & Proposals\00_Current Projects"
    r"\LibCrowds Convert-a-Card (Adi)\OCR\20230504 TKB Export P5 175 GT pp\1016992\P5_for_Transkribus"
)

if LOAD_XMLS:
    page_xml_loc = os.path.join(p5_root, "page")

    # The directory listing may come back empty; try up to three times
    # before treating the location as unreachable.
    attempts = 0
    while attempts < 3:
        xmls = glob.glob(os.path.join(page_xml_loc, "*.xml"))
        if xmls:
            break
        attempts += 1
    else:
        raise IOError(f"Failed to connect to {page_xml_loc}")

    xmlroots = []
    print(f"\nGetting xml roots from {page_xml_loc}")
    for file in tqdm(xmls):
        fileName = os.fsdecode(file)
        # Same retry policy per file: three attempts, then give up.
        attempts = 0
        while attempts < 3:
            try:
                tree = ET.parse(fileName)
                break
            except FileNotFoundError:
                attempts += 1
        else:
            raise FileNotFoundError(f"Failed to connect to: {fileName}")
        root = tree.getroot()
        xmlroots.append(root)

    # One list of line texts per XML file, in the same order as ``xmls``.
    cards = extractLinesForVol(xmlroots)
    cards_df_v0 = pd.DataFrame(
        data={
            "xml": [os.path.basename(x) for x in xmls],
            "lines": cards,
            # Placeholder second column passed to find_author alongside "lines".
            # NOTE(review): presumably present so the two-value result maps
            # onto two columns - confirm before removing.
            "dummy": [None] * len(cards),
        }
    )

    # Shelfmark is searched for in the first line of each card; spaces are removed.
    # NOTE(review): a card with no lines, or whose first line has no shelfmark
    # match, raises here.
    cards_df_v0["shelfmark"] = cards_df_v0["lines"].transform(lambda x: smark_regex.search(x[0]).group()).str.replace(" ", "")

    # find_author returns (title, author) per row; the two selected columns are
    # renamed to hold that pair.
    t_a = cards_df_v0.loc[:,('lines', 'dummy')].transform(lambda x: find_author(x[0], x[1]), axis=1).rename(columns={"lines":"title", "dummy":"author"})
    cards_df = cards_df_v0.drop(columns="dummy").join(t_a)

    # ISBN is searched for in all of a card's lines concatenated, then the
    # "ISBN " marker and surrounding whitespace are stripped.
    cards_df["ISBN"] = cards_df["lines"].transform(lambda x:isbn_search("".join(x))).str.replace("ISBN ", "").str.strip()

    # Attach previously pickled results as the "worldcat_result" column.
    # The context manager closes the file handle, which the bare
    # pickle.load(open(...)) form left open.
    # NOTE: pickle can execute arbitrary code on load; only read trusted files.
    with open("notebooks\\res.p", "rb") as f:
        res = pickle.load(f)
    cards_df['worldcat_result'] = res

    # Cache the assembled table so later runs can skip the XML parsing.
    with open("notebooks\\cards_df.p", "wb") as f:
        pickle.dump(cards_df, f)
if LOAD_PICKLE:
    # Reload the cached cards table. The context manager closes the file
    # handle, which the bare pickle.load(open(...)) form left open.
    # NOTE: pickle can execute arbitrary code on load; only read trusted files.
    with open("notebooks\\cards_df.p", "rb") as f:
        cards_df = pickle.load(f)
    # cards_df["xml"] = cards_df["xml"].str.decode("utf-8")
# res_dict, res = oclc.OCLC_orig_query("FENG JIAN ZHU YI DE SHENG CHAN FANG SHI", "ZHANG (Yu)")
# # print(result[0])
# print("hello")