Some custom modifications to handle a different data set

SCANL · May 16, 2024 · 1eebdea
1 parent 391a437
commit 1eebdea
Show file tree

Hide file tree

Showing 7 changed files with 19 additions and 13 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -24,7 +24,7 @@ project(SEPosTagging)
 
 # find needed libraries
 find_package(LibXml2 REQUIRED)
-find_package(PythonLibs REQUIRED)
+find_package(Python3 REQUIRED)
 
 set(CMAKE_CXX_STANDARD 14)
 set(CMAKE_CXX_FLAGS "-O3 -Wno-reorder -Wunused-variable -Wunused-parameter")
@@ -47,4 +47,4 @@ include_directories(/usr/local/include
                     src)
 
 add_subdirectory(srcSAXEventDispatch)
-add_subdirectory(src)
+add_subdirectory(src)
diff --git a/HTTPRequest b/HTTPRequest
diff --git a/ensemble_tagger_implementation/ensemble_functions.py b/ensemble_tagger_implementation/ensemble_functions.py
@@ -28,7 +28,9 @@ def Process_identifier_with_swum(identifier_data, context_of_identifier):
         swum_string = "{identifier_type} {identifier_name}".format(identifier_name = split_identifier_name, identifier_type = identifier_type_and_name[0])
         swum_process = subprocess.Popen(['java', '-jar', '../SWUM/SWUM_POS/swum.jar', swum_string, '2', 'true'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     else:
-        split_identifier_name = split_identifier_name+'('+identifier_data.split('(')[1]
+        split_identifier_name = split_identifier_name+'()'
+        print("NAME\n")
+        print(split_identifier_name)
         swum_string = " {identifier_type} {identifier_name}".format(identifier_name = split_identifier_name, identifier_type = identifier_type_and_name[0])
         swum_process = subprocess.Popen(['java', '-jar', '../SWUM/SWUM_POS/swum.jar', swum_string, '1', 'true'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
@@ -49,8 +51,9 @@ def Process_identifier_with_posse(identifier_data, context_of_identifier):
         posse_process = subprocess.Popen(['../POSSE/Scripts/mainParser.pl', 'C', posse_string], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
     else:
         posse_process = subprocess.Popen(['../POSSE/Scripts/mainParser.pl', 'M', posse_string], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    
+
     posse_out, posse_err = posse_process.communicate()
+    print(posse_err.decode('utf-8').strip())
     posse_out_parsed = Parse_posse(posse_out.decode('utf-8').strip(), split_identifier_name_raw)
     return posse_out_parsed
 
@@ -120,4 +123,4 @@ def Annotate_word(swum_tag, posse_tag, stanford_tag, normalized_length, code_con
     y_pred = clf.predict(df_features)
     return (y_pred[0])
 
-#read_from_cmd_line()
+#read_from_cmd_line()
diff --git a/ensemble_tagger_implementation/preprocess_identifiers.py b/ensemble_tagger_implementation/preprocess_identifiers.py
@@ -26,6 +26,8 @@ def Parse_swum(swum_output, split_identifier_name):
         identifier = code_context[1].split('-')[1].split()
         raw_grammar_pattern = re.findall('([A-Z]+)', ' '.join(identifier))
     else:
+        print("CONTEXT\n")
+        print(code_context)
         identifier = code_context[1].split('@')[1].split('|')
         raw_grammar_pattern = re.findall('([A-Z]+)', ' '.join(identifier))
 
@@ -88,4 +90,4 @@ def Parse_stanford(stanford_output, split_identifier_name):
 
     return("{identifier_names},{grammar_pattern}"
           .format(identifier_names=' '.join(split_identifier_name), 
-            grammar_pattern=' '.join(grammar_pattern)))
+            grammar_pattern=' '.join(grammar_pattern)))
diff --git a/ensemble_tagger_implementation/tagger_config/model_config.yml b/ensemble_tagger_implementation/tagger_config/model_config.yml
@@ -1,6 +1,6 @@
 models:
   DTCP:
-    'models/model_DecisionTreeClassifier_training_set_conj.pkl'
+    'models/model_DecisionTreeClassifier.pkl'
   RFCP:
     'models/model_RandomForestClassifier_training_set_conj.pkl'
   DTCA:
@@ -14,4 +14,4 @@ models:
   DTNA:
     'models/model_DecisionTreeClassifier_training_set_norm_other.pkl'
   RFNA:
-    'models/model_RandomForestClassifier_training_set_norm_other.pkl'
+    'models/model_RandomForestClassifier_training_set_norm_other.pkl'
diff --git a/srcSAXEventDispatch b/srcSAXEventDispatch
diff --git a/tagger_multiple_requests.py b/tagger_multiple_requests.py
@@ -7,18 +7,19 @@
 
 import csv
 import requests
+import re
 
 with open('names_for_tagger.csv', 'rt') as dataset:
-    dataset_rows = csv.reader(dataset)
+    dataset_rows = csv.reader(dataset, dialect='excel')
 
     for line in dataset_rows:
         identifier_type = line[0]
         identifier_name = line[1]
         identifier_context = line[2]
-
+        identifier_type = re.sub(r'[^a-zA-Z0-9]', '', identifier_type) 
         try:
             r = requests.get(f'http://127.0.0.1:5000/{identifier_type}/{identifier_name}/{identifier_context}')
             print (r.text)
         except Exception as error:
             print(error)
-            continue
+            continue
+36 −0		.circleci/config.yml
+30 −13		.gitignore
+3 −0		.gitmodules
+19 −0		CMakeLists.txt
+36 −24		README.md
+61 −10		azure-pipelines.yml
+3 −7		example/Makefile
+28 −0		example/example.sln
+9 −5		example/example.vcxproj
+16 −42		example/example.xcodeproj/project.pbxproj
+7 −0		example/example.xcodeproj/project.xcworkspace/contents.xcworkspacedata
+104 −0		example/main.cpp
+1 −0		external/Catch2
+1,038 −389		include/HTTPRequest.hpp
+14 −0		sonar-project.properties
+0 −85		test/main.cpp
+21 −0		tests/CMakeLists.txt
+28 −0		tests/Makefile
+95 −0		tests/encoding.cpp
+2 −0		tests/main.cpp
+484 −0		tests/parsing.cpp
+1 −1		tests/tests.sln
+167 −0		tests/tests.vcxproj
+320 −0		tests/tests.xcodeproj/project.pbxproj
+1 −1		tests/tests.xcodeproj/project.xcworkspace/contents.xcworkspacedata