Skip to content

Commit

Permalink
Some custom modifications to handle a different data set
Browse files (browse the repository at this point in the history)
  • Loading branch information
Christian Newman committed May 16, 2024
1 parent 391a437 commit 1eebdea
Show file tree
Hide file tree
Showing 7 changed files with 19 additions and 13 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ project(SEPosTagging)

# find needed libraries
find_package(LibXml2 REQUIRED)
find_package(PythonLibs REQUIRED)
find_package(Python3 REQUIRED)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_FLAGS "-O3 -Wno-reorder -Wunused-variable -Wunused-parameter")
Expand All @@ -47,4 +47,4 @@ include_directories(/usr/local/include
src)

add_subdirectory(srcSAXEventDispatch)
add_subdirectory(src)
add_subdirectory(src)
9 changes: 6 additions & 3 deletions ensemble_tagger_implementation/ensemble_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,9 @@ def Process_identifier_with_swum(identifier_data, context_of_identifier):
swum_string = "{identifier_type} {identifier_name}".format(identifier_name = split_identifier_name, identifier_type = identifier_type_and_name[0])
swum_process = subprocess.Popen(['java', '-jar', '../SWUM/SWUM_POS/swum.jar', swum_string, '2', 'true'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
else:
split_identifier_name = split_identifier_name+'('+identifier_data.split('(')[1]
split_identifier_name = split_identifier_name+'()'
print("NAME\n")
print(split_identifier_name)
swum_string = " {identifier_type} {identifier_name}".format(identifier_name = split_identifier_name, identifier_type = identifier_type_and_name[0])
swum_process = subprocess.Popen(['java', '-jar', '../SWUM/SWUM_POS/swum.jar', swum_string, '1', 'true'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

Expand All @@ -49,8 +51,9 @@ def Process_identifier_with_posse(identifier_data, context_of_identifier):
posse_process = subprocess.Popen(['../POSSE/Scripts/mainParser.pl', 'C', posse_string], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
else:
posse_process = subprocess.Popen(['../POSSE/Scripts/mainParser.pl', 'M', posse_string], stdout=subprocess.PIPE, stderr=subprocess.PIPE)

posse_out, posse_err = posse_process.communicate()
print(posse_err.decode('utf-8').strip())
posse_out_parsed = Parse_posse(posse_out.decode('utf-8').strip(), split_identifier_name_raw)
return posse_out_parsed

Expand Down Expand Up @@ -120,4 +123,4 @@ def Annotate_word(swum_tag, posse_tag, stanford_tag, normalized_length, code_con
y_pred = clf.predict(df_features)
return (y_pred[0])

#read_from_cmd_line()
#read_from_cmd_line()
4 changes: 3 additions & 1 deletion ensemble_tagger_implementation/preprocess_identifiers.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ def Parse_swum(swum_output, split_identifier_name):
identifier = code_context[1].split('-')[1].split()
raw_grammar_pattern = re.findall('([A-Z]+)', ' '.join(identifier))
else:
print("CONTEXT\n")
print(code_context)
identifier = code_context[1].split('@')[1].split('|')
raw_grammar_pattern = re.findall('([A-Z]+)', ' '.join(identifier))

Expand Down Expand Up @@ -88,4 +90,4 @@ def Parse_stanford(stanford_output, split_identifier_name):

return("{identifier_names},{grammar_pattern}"
.format(identifier_names=' '.join(split_identifier_name),
grammar_pattern=' '.join(grammar_pattern)))
grammar_pattern=' '.join(grammar_pattern)))
4 changes: 2 additions & 2 deletions ensemble_tagger_implementation/tagger_config/model_config.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
models:
DTCP:
'models/model_DecisionTreeClassifier_training_set_conj.pkl'
'models/model_DecisionTreeClassifier.pkl'
RFCP:
'models/model_RandomForestClassifier_training_set_conj.pkl'
DTCA:
Expand All @@ -14,4 +14,4 @@ models:
DTNA:
'models/model_DecisionTreeClassifier_training_set_norm_other.pkl'
RFNA:
'models/model_RandomForestClassifier_training_set_norm_other.pkl'
'models/model_RandomForestClassifier_training_set_norm_other.pkl'
2 changes: 1 addition & 1 deletion srcSAXEventDispatch
7 changes: 4 additions & 3 deletions tagger_multiple_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,19 @@

import csv
import requests
import re

with open('names_for_tagger.csv', 'rt') as dataset:
dataset_rows = csv.reader(dataset)
dataset_rows = csv.reader(dataset, dialect='excel')

for line in dataset_rows:
identifier_type = line[0]
identifier_name = line[1]
identifier_context = line[2]

identifier_type = re.sub(r'[^a-zA-Z0-9]', '', identifier_type)
try:
r = requests.get(f'http://127.0.0.1:5000/{identifier_type}/{identifier_name}/{identifier_context}')
print (r.text)
except Exception as error:
print(error)
continue
continue

0 comments on commit 1eebdea

Please sign in to comment.