-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmodule_names.py
96 lines (75 loc) · 3.72 KB
/
module_names.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import re # use regex to split words on capital letters
from datetime import datetime
# Last-name prefixes recognised by the labeller.  Per the original author's
# figures: 64% of people have no prefix in their name, 35% have one of the
# prefixes listed here, and fewer than 1% have some other prefix.
PREFIXES = [
    'van',
    'de',
    'van der',
    'van den',
    'van de',
    'den',
    'vanden',
    'vander',
    'vande',
]

# Very frequent words; these receive label 4.
FREQ_NAMES = [
    'te',
    'kinderen',
    'dochter',
]

# Human-readable names for the numeric labels assigned to words.
# NOTE(review): the trailing spaces inside some values are part of the
# strings - presumably padding for aligned output; confirm before trimming.
LABELS = {
    1: 'NAME ',
    2: 'PREFIX ',
    3: 'FAKE ',
    4: 'FREQUENT',
    -1: 'NOT_NAME',
    None: 'UNKNOWN',
    5: 'DIGIT ',
}
import import_db
# Patterns are compiled once at import time.  Raw strings keep the regex
# escapes (dot, digit class) from being read as invalid string escapes.
_CAPITALIZED_WORD_RE = re.compile(r'[A-Z][a-z]+')
_INITIAL_RE = re.compile(r'[A-Z]\.')
_DIGITS_RE = re.compile(r'\d+')


def _log_invalid_given_name(unique_id, word):
    """Append an "Invalid Given Name" warning row for *word* to warning_log.csv."""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    row = ";".join(["WARNING", str(unique_id), timestamp, word])
    with open('warning_log.csv', 'a') as f:
        f.write(row + ";;; Invalid Given Name \n")


def _label_sentence_start(labels, index, word, unique_id):
    """Label a sentence-initial word: 1 if it is a known first name, else 3 (and log it)."""
    if word in import_db.list_of_first_names:
        labels[index] = 1
    else:
        labels[index] = 3
        _log_invalid_given_name(unique_id, word)


def extract_names(word_list, unique_id):
    """Label every word of a tokenised text with a name-related category.

    Label codes stored in the result (later passes override earlier ones):
         1   : word starts with a capital letter followed by a lowercase
               letter, or with a capital letter followed by a dot (initial);
               also a sentence-initial word that is a known first name
         2   : last name prefix (one or two words from PREFIXES) standing
               between two words labelled 1
         3   : sentence-initial word that is not in
               import_db.list_of_first_names
         4   : very frequent word (member of FREQ_NAMES)
         5   : word starting with a digit
        -1   : 'Sint' / 'St' - capitalised but not a name
        None : none of the above

    A word is "sentence-initial" when it is the first word of word_list or
    directly follows a '.' token.

    Args:
        word_list: sequence of pre-processed word strings.
        unique_id: identifier written to the warning log; converted with str().

    Returns:
        dict mapping each word index to its label code.  Empty (or None)
        input yields an empty dict.

    Side effects:
        Appends one row to 'warning_log.csv' for every sentence-initial word
        that receives label 3.
    """
    # Guard: the first-word check below indexes word_list[0].
    if not word_list:
        return {}

    labels = {}

    # Pass 1: capitalised words, initials and digits.  The digit and capital
    # patterns cannot both match, since each is anchored at the first character.
    for index, word in enumerate(word_list):
        if _DIGITS_RE.match(word):
            labels[index] = 5
        elif _CAPITALIZED_WORD_RE.match(word) or _INITIAL_RE.match(word):
            labels[index] = 1
        else:
            labels[index] = None

    # Pass 2: last name prefixes.  Labels are updated in place, so a prefix
    # found at an earlier index influences the checks at later indexes.
    last_index = len(word_list) - 1
    for index, word in enumerate(word_list):
        # Both prefix forms require the preceding word to be a name.
        if labels.get(index - 1) != 1:
            continue
        # One-word prefix: name, prefix, name.
        if word in PREFIXES and labels.get(index + 1) == 1:
            labels[index] = 2
        # Two-word prefix: name, prefix part 1, prefix part 2, name.
        if (index < last_index
                and word + " " + word_list[index + 1] in PREFIXES
                and labels.get(index + 2) == 1):
            labels[index] = 2
            labels[index + 1] = 2

    # Pass 3: double check every word that starts a sentence, beginning with
    # the very first word and then each word that follows a '.' token.
    _label_sentence_start(labels, 0, word_list[0], unique_id)
    for index in range(last_index):
        if word_list[index] == '.':
            _label_sentence_start(labels, index + 1, word_list[index + 1], unique_id)

    # Pass 4: frequent words, then 'Sint' / 'St', which gets rid of
    # "Sint Oedenrode", "Sint Janssstraat", etc.
    for index, word in enumerate(word_list):
        if word in FREQ_NAMES:
            labels[index] = 4
        if word in ('Sint', 'St'):
            labels[index] = -1

    return labels