create_KG_evaluation_csv.py
import os
import re
import csv
from rdflib import Graph, Namespace


def KG_indvidual_csv_creation(config):
    # Folder containing the per-CQ answer text files
    folder_path = config.get('Paths', 'Ans_to_cq_v1_summary')
    # Path to the file containing the competency questions
    cq_file = config.get('Paths', 'CQs_path')
    # Pattern to extract the publication and CQ numbers from filenames
    pattern = re.compile(r'Publication(\d+)_CQ(\d+)')
    # Mapping of competency questions to ontology concepts
    cq_ontology_mapping = {
        "What data formats are used in the deep learning pipeline?": ["DataFormat"],
        "What are the sources of input data for the deep learning pipeline?": ["Source", "InputData"],
        "How was raw data collected in terms of methods and tools?": ["Method", "Tool"],
        "Is the source code openly accessible, and if so, what is the repository link?": ["RepositoryLink"],
        "What preprocessing steps are involved before training the deep learning model?": ["PreprocessingStep"],
        "Are there transformations or augmentations applied to the input data?": ["TransformationAugmentation"],
        "Does the paper discuss data bias or ethical implications?": ["Bias", "EthicalImplication"],
        "What is the architecture of the deep learning model in the pipeline?": ["Architecture", "Model"],
        "How was the model selected for a specific task?": ["ModelSelectionProcess", "Model"],
        "What were the considerations in the model selection process?": ["Consideration"],
        "How many models are used in the pipeline?": ["NumberOfModels"],
        "Are the models considered state-of-the-art?": ["StateOfTheArt", "Model"],
        "How is the model initialized?": ["Initialization", "Model"],
        "Are there specific weight configurations used during initialization?": ["WeightConfiguration"],
        "Are there optimization algorithms or learning rate schedules used during training?": ["OptimizationAlgorithm", "LearningRateSchedule"],
        "What is the convergence criteria or stopping condition for the training process?": ["ConvergenceCriteria", "TrainingProcedure"],
        "Which software frameworks or libraries are used to build the model?": ["SoftwareFrameworkLibrary"],
        "What hardware infrastructures are used for model training?": ["HardwareInfrastructure"],
        "What hyperparameters are used in the model?": ["Hyperparameter"],
        "Why were those specific hyperparameters selected?": ["Hyperparameter"],
        "Are the provided hyperparameters fine-tuned?": ["FineTuning"],
        "What metrics are used to evaluate the model?": ["Metric"],
        "Did the authors use different metrics for different problems?": ["Metric"],
        "Is there sufficient information to reproduce the deep learning pipeline?": ["Reproducibility"],
        "What measures are taken to explain model predictions?": ["Explanation", "PredictionClassification"],
        "What is the versioning strategy for trained models?": ["VersioningStrategy"],
        "How are different versions of datasets managed?": ["DatasetVersion"],
        "How are updates to datasets documented?": ["UpdateFrequency"],
        "What annotations or labels are associated with the data?": ["PredictionClassification"],
        "How are these annotations or labels used in the model?": [],
        "What predictions or classifications are generated by the deep learning model?": ["PredictionClassification"],
        "How is uncertainty or confidence in model predictions captured?": ["UncertaintyConfidence"],
        "Are there post-processing steps applied to the model's output?": ["PostProcessing"],
        "Is the trained model deployed, and if not, what is the reason?": ["Deployment"],
        "What hardware and software are used for model deployment?": ["SoftwareFrameworkLibrary", "HardwareInfrastructure"],
        "How often are model weights updated by retraining with new data?": [],
        "What ethical considerations are taken into account during development and deployment?": ["EthicalImplication"],
        "How is bias in the data addressed?": ["BiasAddressing"],
        "Is there transparency in the decision-making process regarding bias?": ["Transparency"],
        "Are privacy and security measures implemented in handling sensitive data?": ["PrivacySecurityMeasure", "SensitiveData"]
    }
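    # NOTE: the lookups below use the CQ text read from cq_file verbatim as
    # dictionary keys, so the questions in that file must match the keys
    # above exactly (including punctuation); unmatched questions yield no rows.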
    # Read the competency questions from the file
    with open(cq_file, 'r') as f:
        cq_lines = [line.strip() for line in f.readlines()]
    # Nested dictionary: publication number -> CQ number -> list of answers
    publication_dict = {}
    # Iterate through files in the folder
    for filename in os.listdir(folder_path):
        # Only process text files
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            # Extract the publication and CQ numbers from the filename
            match = pattern.search(filename)
            if match:
                publication_no = int(match.group(1))
                cq_no = int(match.group(2))
                # Read the content of the file
                with open(file_path, 'r') as file:
                    content = file.read().strip()
                # Add content to the nested dictionary
                publication_dict.setdefault(publication_no, {}).setdefault(cq_no, []).append(content)
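                # e.g. a hypothetical 'Publication3_CQ12.txt' ends up as
                # publication_dict[3][12] == ['<file text>']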
    # Define RDF namespaces
    ontology_ns = Namespace("https://w3id.org/dlprovenance#")
    RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
    # Output folder for the per-publication evaluation CSVs
    folder_path = config.get('Paths', 'KG_individual_Eval_path')
    os.makedirs(folder_path, exist_ok=True)
    # Iterate through the publications and write each one to its own CSV file
    for publication_no, cq_data in publication_dict.items():
        print(publication_no)
        version = config.get('Paths', 'prompt_cq_ans_version')
        kg_files_directory = config.get('Paths', 'kg_files_path')
        kg_eval_filename_format = config.get('Paths', 'kg_eval_filename_format')
        # Path to the KG file for this publication
        kg_file = os.path.join(kg_files_directory, f'{version}', f'Publication_{publication_no}.ttl')
        # Parse the KG file
        g = Graph()
        g.parse(kg_file, format="ttl")
        # Create a separate CSV file for each publication number
        csv_file = os.path.join(folder_path, f'{version}', kg_eval_filename_format.format(publication_no=publication_no))
        os.makedirs(os.path.dirname(csv_file), exist_ok=True)
        # Write content to the CSV file
        with open(csv_file, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile)
            # Write the header row
            writer.writerow(['Publication No.', 'CQ No.', 'CQ', 'CQ LLM Answer', 'Ontology Concept', 'KG Individual', 'KG Individual Label'])
            for cq_no in range(1, 41):  # There are 40 CQs in total
                # Get the answers for this CQ, falling back to empty strings if absent
                for content in cq_data.get(cq_no, [''] * 3):
                    # Find the ontology concepts associated with the current CQ
                    ontology_concepts = cq_ontology_mapping.get(cq_lines[cq_no - 1], [])
                    for ontology_concept in ontology_concepts:
                        # Every KG individual typed with this concept ...
                        for subj, pred, obj in g.triples((None, RDF.type, ontology_ns[ontology_concept])):
                            # ... and each of its rdfs:label values becomes one CSV row
                            for s, p, o in g.triples((subj, RDFS.label, None)):
                                rdfs_label = str(o)
                                writer.writerow([publication_no, cq_no, cq_lines[cq_no - 1], content, ontology_concept, s, rdfs_label])
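

# Example usage (a minimal sketch, not part of the original script): the
# function expects a configparser-style object with a [Paths] section; the
# 'config.ini' filename below is an assumption for illustration.
if __name__ == "__main__":
    import configparser

    config = configparser.ConfigParser()
    config.read("config.ini")  # hypothetical config file providing the [Paths] keys read above
    KG_indvidual_csv_creation(config)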