"""
This program detects the language of documents
"""
import os
import re
from sd_core.configuration_handler import ConfigurationHandler
from sd_core.conditional_print import ConditionalPrint
from sd_core.text_file_loader import TextFileLoader
from my_lib.SpatialLanguageIdentificator import SpatialLanguageIdentificator, ComparatorSLI
CODED_CONFIGURATION_PATH = './configurations/language_detector.conf'
config_handler = ConfigurationHandler(first_init=True, fill_unkown_args=True,
                                      coded_configuration_paths=[CODED_CONFIGURATION_PATH])
config = config_handler.get_config()
cpr = ConditionalPrint(config.PRINT_MAIN, config.PRINT_EXCEPTION_LEVEL, config.PRINT_WARNING_LEVEL,
                       leading_tag="main")
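# The configuration is expected to provide at least the keys accessed in this
# script (inferred from the code below; the .conf file format itself is not
# shown here): PRINT_MAIN, PRINT_EXCEPTION_LEVEL, PRINT_WARNING_LEVEL,
# IO_BASE_PATH, INPUT_FILE_FOLDER_PATH, INPUT_FILE_PATH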
# Bibles are used as training and test data (good sources for other languages below)
# en http://www.bibleprotector.com/TEXT-PCE-127.txt King James Version
# en https://ebible.org/find/details.php?id=eng-kjv2006 King James Version used (because the format is similar)
# de https://info2.sermon-online.com/german/MartinLuther-1912/Martin_Luther_Uebersetzung_1912.txt Luther version
# de https://ebible.org/find/details.php?id=deu1912 Luther version used
# fr https://ebible.org/find/details.php?id=fraLSG Louis Segond bible, equivalent to the King James Version
# th https://ebible.org/find/details.php?id=thaKJV Thai King James Version
# load the dataset spatial language identificators saved by 'langdet_create_dataset'
# make sure the tags in this list correspond to the language folder names in your io_data
selected_datasets = ["de", "en", "fr", "sp", "th"]
identifiers_loaded = []
for lang_code in selected_datasets:
    current_sli = SpatialLanguageIdentificator.load_spatial_language_identificator(config.IO_BASE_PATH, lang_code)
    current_sli.normalize_spatial_language_identificator()  # create the normalized sli
    identifiers_loaded.append(current_sli)
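# Sketch of the expected io_data layout: one folder per language tag from the
# list above (the exact tree is an assumption based on the loader call and the
# folder-name comment, not confirmed by this script alone):
#   <config.IO_BASE_PATH>/
#       de/  en/  fr/  sp/  th/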
# create SLI comparator for later distance calculations
sli_comparator = ComparatorSLI()
# loader used to read txt and pdf input files
text_file_loader = TextFileLoader()
# get list of input files
all_input_files_in_path = os.listdir(config.INPUT_FILE_FOLDER_PATH)
all_input_complete_paths = []
for file in all_input_files_in_path:
    path_complete = os.path.join(config.INPUT_FILE_FOLDER_PATH, file)
    all_input_complete_paths.append(path_complete)
specific_input_file_path = config.INPUT_FILE_PATH
combined_files_list = [specific_input_file_path]
combined_files_list.extend(all_input_complete_paths)
def preprocess_input_text(text):  # very similar to the method in create_dataset
    text_list = text.split('\n')
    if len(text_list) >= 2:
        text_list_new = text_list[2:]  # remove the first two lines, because they contain the chapter header
    else:
        text_list_new = text_list
    # join the list back into one string; a space keeps words at line breaks separated
    text_joined = " ".join(text_list_new)
    # text_joined = text_joined.lower()  # capitalization should be a good cue for identification, so it is kept
    return text_joined
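# A quick doctest-style sketch of preprocess_input_text (text is illustrative):
# >>> preprocess_input_text("Genesis\nChapter 1\nIn the beginning\nGod created")
# 'In the beginning God created'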
def tokenize_text(preprocessed_text):
    tokenized_text = re.split(r'\s|-', preprocessed_text)  # split at whitespace and dashes
    return tokenized_text
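# A quick doctest-style sketch of tokenize_text, showing that dashed compounds
# are split into separate tokens:
# >>> tokenize_text("in the well-known city")
# ['in', 'the', 'well', 'known', 'city']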
def process_a_file(file_path):
    # determine the input file format
    file_format = text_file_loader.check_file_format(file_path)
    input_text = ""
    # get the text from the file
    if file_format.is_pdf:
        input_text, pdf_metadata = text_file_loader.load_pdf_text(file_path)
    elif file_format.is_txt:
        input_text = text_file_loader.load_txt_text(file_path)
    # preprocessing steps for the given input
    p_input_text = preprocess_input_text(input_text)
    tokenized_text = tokenize_text(p_input_text)
    # create an sli for the input
    input_sli = SpatialLanguageIdentificator(file_path, "input")
    input_sli.update_spatial_language_identificator(tokenized_text)
    input_sli.sort_spatial_language_identificator()
    input_sli.normalize_spatial_language_identificator()
    results = dict()
    for identifier in identifiers_loaded:
        distance_matrix, distance = sli_comparator.compare_slis(input_sli, identifier, print_output=False)
        results[identifier.language_code] = distance
    # normalize the results to a 0 to 100 'likeliness' scale (smaller distance -> higher likeliness)
    maximum_distance = max(results.values())
    results_normalized = dict()
    for key in results:
        result = results[key]
        result_normalized = result / maximum_distance
        result_normalized = 100 - (result_normalized * 100)
        results_normalized[key] = result_normalized
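    # Worked example of the normalization above (distance values are illustrative only):
    # distances {'de': 2.0, 'en': 5.0} give maximum_distance = 5.0, so
    # 'de' -> 100 - (2.0 / 5.0) * 100 = 60.0 and 'en' -> 100 - (5.0 / 5.0) * 100 = 0.0,
    # i.e. the smallest distance yields the highest likeliness.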
    # sort by normalized likeliness, best match first
    sorted_results_normalized = sorted(results_normalized.items(), key=lambda kv: kv[1], reverse=True)
    # print the results
    print("Results______________________________________________________")
    print("{0:<10} {1:<25}".format("Input:", input_sli.base_path))
    print("{0:<10} {1:<25}".format("Det. lang:", sorted_results_normalized[0][0]))
    print("_____________________________________________________________")
    # print the table headline
    print("{0:<10} {1:<25} {2:<25}".format("language", "distance", "likeliness"))
    # print the comparison table, one row per language
    for key, result_normalized in sorted_results_normalized:
        result = results[key]
        print("{0:<10} {1:<25} {2:<25}".format(key, result, result_normalized))
    print("")
    print("")
# iterate over the combined files list and calculate results for each file
for file_path in combined_files_list:
    process_a_file(file_path)
cpr.print("done")