From 1eebdeaa21e82c639a61d76afd152de78436d00a Mon Sep 17 00:00:00 2001
From: Christian Newman <cdvnse@rit.edu>
Date: Wed, 15 May 2024 22:48:40 -0400
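Subject: [PATCH] Some custom modifications to handle a different data set

- CMakeLists.txt: use find_package(Python3) instead of
  find_package(PythonLibs).
- Update the HTTPRequest and srcSAXEventDispatch submodule pointers.
- ensemble_functions.py: in the else branch of
  Process_identifier_with_swum, append '()' to the split name instead of
  the parameter list taken from identifier_data; print the resulting
  name and the POSSE stderr output.
- preprocess_identifiers.py: print code_context in the else branch of
  Parse_swum.
- model_config.yml: point DTCP at models/model_DecisionTreeClassifier.pkl.
- tagger_multiple_requests.py: pass dialect='excel' to csv.reader and
  strip non-alphanumeric characters from identifier_type before issuing
  the request.
- Add missing trailing newlines to the touched files.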

---
 CMakeLists.txt                                           | 4 ++--
 HTTPRequest                                              | 2 +-
 ensemble_tagger_implementation/ensemble_functions.py     | 9 ++++++---
 ensemble_tagger_implementation/preprocess_identifiers.py | 4 +++-
 .../tagger_config/model_config.yml                       | 4 ++--
 srcSAXEventDispatch                                      | 2 +-
 tagger_multiple_requests.py                              | 7 ++++---
 7 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e5c845a..b0f2584 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -24,7 +24,7 @@ project(SEPosTagging)
 
 # find needed libraries
 find_package(LibXml2 REQUIRED)
-find_package(PythonLibs REQUIRED)
+find_package(Python3 REQUIRED)
 
 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_CXX_FLAGS "-O3 -Wno-reorder -Wunused-variable -Wunused-parameter")
@@ -47,4 +47,4 @@ include_directories(/usr/local/include
                     src)
 
 add_subdirectory(srcSAXEventDispatch)
-add_subdirectory(src)
\ No newline at end of file
+add_subdirectory(src)
diff --git a/HTTPRequest b/HTTPRequest
index a9f085c..44d97a5 160000
--- a/HTTPRequest
+++ b/HTTPRequest
@@ -1 +1 @@
-Subproject commit a9f085c0279c28c051e890182afad2f3f850ebe1
+Subproject commit 44d97a57f5cab434e5205e69cf719753689d8b81
diff --git a/ensemble_tagger_implementation/ensemble_functions.py b/ensemble_tagger_implementation/ensemble_functions.py
index 404b6df..525afbf 100644
--- a/ensemble_tagger_implementation/ensemble_functions.py
+++ b/ensemble_tagger_implementation/ensemble_functions.py
@@ -28,7 +28,9 @@ def Process_identifier_with_swum(identifier_data, context_of_identifier):
         swum_string = "{identifier_type} {identifier_name}".format(identifier_name = split_identifier_name, identifier_type = identifier_type_and_name[0])
         swum_process = subprocess.Popen(['java', '-jar', '../SWUM/SWUM_POS/swum.jar', swum_string, '2', 'true'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     else:
-        split_identifier_name = split_identifier_name+'('+identifier_data.split('(')[1]
+        split_identifier_name = split_identifier_name+'()'
+        print("NAME\n")
+        print(split_identifier_name)
         swum_string = " {identifier_type} {identifier_name}".format(identifier_name = split_identifier_name, identifier_type = identifier_type_and_name[0])
         swum_process = subprocess.Popen(['java', '-jar', '../SWUM/SWUM_POS/swum.jar', swum_string, '1', 'true'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
@@ -49,8 +51,9 @@ def Process_identifier_with_posse(identifier_data, context_of_identifier):
         posse_process = subprocess.Popen(['../POSSE/Scripts/mainParser.pl', 'C', posse_string], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     else:
         posse_process = subprocess.Popen(['../POSSE/Scripts/mainParser.pl', 'M', posse_string], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    
+   
     posse_out, posse_err = posse_process.communicate()
+    print(posse_err.decode('utf-8').strip())
     posse_out_parsed = Parse_posse(posse_out.decode('utf-8').strip(), split_identifier_name_raw)
     return posse_out_parsed
 
@@ -120,4 +123,4 @@ def Annotate_word(swum_tag, posse_tag, stanford_tag, normalized_length, code_con
     y_pred = clf.predict(df_features)
     return (y_pred[0])
 
-#read_from_cmd_line()
\ No newline at end of file
+#read_from_cmd_line()
diff --git a/ensemble_tagger_implementation/preprocess_identifiers.py b/ensemble_tagger_implementation/preprocess_identifiers.py
index aa6aa32..e76f4e3 100644
--- a/ensemble_tagger_implementation/preprocess_identifiers.py
+++ b/ensemble_tagger_implementation/preprocess_identifiers.py
@@ -26,6 +26,8 @@ def Parse_swum(swum_output, split_identifier_name):
         identifier = code_context[1].split('-')[1].split()
         raw_grammar_pattern = re.findall('([A-Z]+)', ' '.join(identifier))
     else:
+        print("CONTEXT\n")
+        print(code_context)
         identifier = code_context[1].split('@')[1].split('|')
         raw_grammar_pattern = re.findall('([A-Z]+)', ' '.join(identifier))
     
@@ -88,4 +90,4 @@ def Parse_stanford(stanford_output, split_identifier_name):
     
     return("{identifier_names},{grammar_pattern}"
           .format(identifier_names=' '.join(split_identifier_name), 
-            grammar_pattern=' '.join(grammar_pattern)))
\ No newline at end of file
+            grammar_pattern=' '.join(grammar_pattern)))
diff --git a/ensemble_tagger_implementation/tagger_config/model_config.yml b/ensemble_tagger_implementation/tagger_config/model_config.yml
index bd4d289..aeb54bd 100644
--- a/ensemble_tagger_implementation/tagger_config/model_config.yml
+++ b/ensemble_tagger_implementation/tagger_config/model_config.yml
@@ -1,6 +1,6 @@
 models:
   DTCP:
-    'models/model_DecisionTreeClassifier_training_set_conj.pkl'
+    'models/model_DecisionTreeClassifier.pkl'
   RFCP:
     'models/model_RandomForestClassifier_training_set_conj.pkl'
   DTCA:
@@ -14,4 +14,4 @@ models:
   DTNA:
     'models/model_DecisionTreeClassifier_training_set_norm_other.pkl'
   RFNA:
-    'models/model_RandomForestClassifier_training_set_norm_other.pkl'
\ No newline at end of file
+    'models/model_RandomForestClassifier_training_set_norm_other.pkl'
diff --git a/srcSAXEventDispatch b/srcSAXEventDispatch
index 2aeac9a..7605fa5 160000
--- a/srcSAXEventDispatch
+++ b/srcSAXEventDispatch
@@ -1 +1 @@
-Subproject commit 2aeac9a5101f08840956dd66cb827ed4f23efed0
+Subproject commit 7605fa505ab00e2c1eaba4e39b99a9ff98e0379f
diff --git a/tagger_multiple_requests.py b/tagger_multiple_requests.py
index 3f08524..f865a24 100644
--- a/tagger_multiple_requests.py
+++ b/tagger_multiple_requests.py
@@ -7,18 +7,19 @@
 
 import csv
 import requests
+import re
 
 with open('names_for_tagger.csv', 'rt') as dataset:
-    dataset_rows = csv.reader(dataset)
+    dataset_rows = csv.reader(dataset, dialect='excel')
 
     for line in dataset_rows:
         identifier_type = line[0]
         identifier_name = line[1]
         identifier_context = line[2]
-        
+        identifier_type = re.sub(r'[^a-zA-Z0-9]', '', identifier_type) 
         try:
             r = requests.get(f'http://127.0.0.1:5000/{identifier_type}/{identifier_name}/{identifier_context}')
             print (r.text)
         except Exception as error:
             print(error)
-            continue
\ No newline at end of file
+            continue