From 1eebdeaa21e82c639a61d76afd152de78436d00a Mon Sep 17 00:00:00 2001 From: Christian Newman Date: Wed, 15 May 2024 22:48:40 -0400 Subject: [PATCH] Some custom modifications to handle a different data set --- CMakeLists.txt | 4 ++-- HTTPRequest | 2 +- ensemble_tagger_implementation/ensemble_functions.py | 9 ++++++--- ensemble_tagger_implementation/preprocess_identifiers.py | 4 +++- .../tagger_config/model_config.yml | 4 ++-- srcSAXEventDispatch | 2 +- tagger_multiple_requests.py | 7 ++++--- 7 files changed, 19 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e5c845a..b0f2584 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ project(SEPosTagging) # find needed libraries find_package(LibXml2 REQUIRED) -find_package(PythonLibs REQUIRED) +find_package(Python3 REQUIRED) set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_FLAGS "-O3 -Wno-reorder -Wunused-variable -Wunused-parameter") @@ -47,4 +47,4 @@ include_directories(/usr/local/include src) add_subdirectory(srcSAXEventDispatch) -add_subdirectory(src) \ No newline at end of file +add_subdirectory(src) diff --git a/HTTPRequest b/HTTPRequest index a9f085c..44d97a5 160000 --- a/HTTPRequest +++ b/HTTPRequest @@ -1 +1 @@ -Subproject commit a9f085c0279c28c051e890182afad2f3f850ebe1 +Subproject commit 44d97a57f5cab434e5205e69cf719753689d8b81 diff --git a/ensemble_tagger_implementation/ensemble_functions.py b/ensemble_tagger_implementation/ensemble_functions.py index 404b6df..525afbf 100644 --- a/ensemble_tagger_implementation/ensemble_functions.py +++ b/ensemble_tagger_implementation/ensemble_functions.py @@ -28,7 +28,9 @@ def Process_identifier_with_swum(identifier_data, context_of_identifier): swum_string = "{identifier_type} {identifier_name}".format(identifier_name = split_identifier_name, identifier_type = identifier_type_and_name[0]) swum_process = subprocess.Popen(['java', '-jar', '../SWUM/SWUM_POS/swum.jar', swum_string, '2', 'true'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) else: - split_identifier_name = split_identifier_name+'('+identifier_data.split('(')[1] + split_identifier_name = split_identifier_name+'()' + print("NAME\n") + print(split_identifier_name) swum_string = " {identifier_type} {identifier_name}".format(identifier_name = split_identifier_name, identifier_type = identifier_type_and_name[0]) swum_process = subprocess.Popen(['java', '-jar', '../SWUM/SWUM_POS/swum.jar', swum_string, '1', 'true'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) @@ -49,8 +51,9 @@ def Process_identifier_with_posse(identifier_data, context_of_identifier): posse_process = subprocess.Popen(['../POSSE/Scripts/mainParser.pl', 'C', posse_string], stdout=subprocess.PIPE, stderr=subprocess.PIPE) else: posse_process = subprocess.Popen(['../POSSE/Scripts/mainParser.pl', 'M', posse_string], stdout=subprocess.PIPE, stderr=subprocess.PIPE) - + posse_out, posse_err = posse_process.communicate() + print(posse_err.decode('utf-8').strip()) posse_out_parsed = Parse_posse(posse_out.decode('utf-8').strip(), split_identifier_name_raw) return posse_out_parsed @@ -120,4 +123,4 @@ def Annotate_word(swum_tag, posse_tag, stanford_tag, normalized_length, code_con y_pred = clf.predict(df_features) return (y_pred[0]) -#read_from_cmd_line() \ No newline at end of file +#read_from_cmd_line() diff --git a/ensemble_tagger_implementation/preprocess_identifiers.py b/ensemble_tagger_implementation/preprocess_identifiers.py index aa6aa32..e76f4e3 100644 --- a/ensemble_tagger_implementation/preprocess_identifiers.py +++ b/ensemble_tagger_implementation/preprocess_identifiers.py @@ -26,6 +26,8 @@ def Parse_swum(swum_output, split_identifier_name): identifier = code_context[1].split('-')[1].split() raw_grammar_pattern = re.findall('([A-Z]+)', ' '.join(identifier)) else: + print("CONTEXT\n") + print(code_context) identifier = code_context[1].split('@')[1].split('|') raw_grammar_pattern = re.findall('([A-Z]+)', ' '.join(identifier)) @@ -88,4 +90,4 @@ def Parse_stanford(stanford_output, split_identifier_name): return("{identifier_names},{grammar_pattern}" .format(identifier_names=' '.join(split_identifier_name), - grammar_pattern=' '.join(grammar_pattern))) \ No newline at end of file + grammar_pattern=' '.join(grammar_pattern))) diff --git a/ensemble_tagger_implementation/tagger_config/model_config.yml b/ensemble_tagger_implementation/tagger_config/model_config.yml index bd4d289..aeb54bd 100644 --- a/ensemble_tagger_implementation/tagger_config/model_config.yml +++ b/ensemble_tagger_implementation/tagger_config/model_config.yml @@ -1,6 +1,6 @@ models: DTCP: - 'models/model_DecisionTreeClassifier_training_set_conj.pkl' + 'models/model_DecisionTreeClassifier.pkl' RFCP: 'models/model_RandomForestClassifier_training_set_conj.pkl' DTCA: @@ -14,4 +14,4 @@ models: DTNA: 'models/model_DecisionTreeClassifier_training_set_norm_other.pkl' RFNA: - 'models/model_RandomForestClassifier_training_set_norm_other.pkl' \ No newline at end of file + 'models/model_RandomForestClassifier_training_set_norm_other.pkl' diff --git a/srcSAXEventDispatch b/srcSAXEventDispatch index 2aeac9a..7605fa5 160000 --- a/srcSAXEventDispatch +++ b/srcSAXEventDispatch @@ -1 +1 @@ -Subproject commit 2aeac9a5101f08840956dd66cb827ed4f23efed0 +Subproject commit 7605fa505ab00e2c1eaba4e39b99a9ff98e0379f diff --git a/tagger_multiple_requests.py b/tagger_multiple_requests.py index 3f08524..f865a24 100644 --- a/tagger_multiple_requests.py +++ b/tagger_multiple_requests.py @@ -7,18 +7,19 @@ import csv import requests +import re with open('names_for_tagger.csv', 'rt') as dataset: - dataset_rows = csv.reader(dataset) + dataset_rows = csv.reader(dataset, dialect='excel') for line in dataset_rows: identifier_type = line[0] identifier_name = line[1] identifier_context = line[2] - + identifier_type = re.sub(r'[^a-zA-Z0-9]', '', identifier_type) try: r = requests.get(f'http://127.0.0.1:5000/{identifier_type}/{identifier_name}/{identifier_context}') print (r.text) except Exception as error: print(error) - continue \ No newline at end of file + continue