-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgraph_embedding_evaluator.py
203 lines (177 loc) · 8.97 KB
/
graph_embedding_evaluator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import os
import subprocess
import evalmetrics
import networkx as nx
import time
from sklearn.manifold import TSNE
from subprocess import CalledProcessError, PIPE
from evalne.utils import preprocess as pp
import visualization
class GraphEval:
    """Run embedding methods on graph datasets and collect metric results.

    For every dataset/method/repetition an embedding is either loaded from
    disk or produced by calling the method's external command. The embedding
    is then scored with the configured metrics, and the scores are gathered
    in ``self.results`` (a pandas DataFrame) and written to CSV files below
    ``output_path``.
    """

    def __init__(self, datasets, methods, metrics,
                 output_path, repetitions=1, emb_dimension=2,
                 visualize=True, load_embeddings=None):
        """Store the configuration and set up the (empty) result table.

        Args:
            datasets: Mapping of dataset name to a dataset dictionary. The
                methods of this class read the keys "file", "delimiter",
                "comments", "directed", "name" and "edgefile" from it and
                store the loaded graph under "G".
            methods: Mapping of method name to a dictionary with the keys
                "command", "parameters", "emb_dimension" and, optionally,
                "use_tsne".
            metrics: Mapping of metric name to a dictionary with the keys
                "method" (attribute name looked up on ``evalmetrics``) and
                "args" (keyword arguments for that function).
            output_path: Directory that receives result files.
            repetitions: Number of repetitions per dataset/method pair.
            emb_dimension: Dimension every final embedding must have.
            visualize: Whether to draw the embedding of the first repetition.
            load_embeddings: Optional directory to read/write embeddings
                from; defaults to ``output_path``.
        """
        self.datasets = datasets
        self.methods = methods
        self.metrics = metrics
        self.repetitions = repetitions
        self.output_path = output_path
        self.emb_dimension = emb_dimension
        self.visualize = visualize

        # Initialize result dataframe
        cols = ['dataset', 'method', 'rep', 'parameters',
                'embedding', 'embedding_file', 'runtime']
        self.results = pd.DataFrame(data=None, columns=cols)

        # The "glam" metric returns several values at once (see
        # compute_metrics), so it is stored as four separate columns.
        metric_names = list(self.metrics.keys())
        if "glam" in metric_names:
            metric_names.remove("glam")
            metric_names += ["crosslessness", "edge_length_cv",
                             "min_angle", "shape_gabriel"]
        self.store_columns = (["method", "dataset", "rep", "runtime"]
                              + metric_names + ["parameters"])

        if load_embeddings is not None:
            self.emb_path = load_embeddings
        else:
            self.emb_path = output_path

    def _write_results(self, path):
        """Write the columns listed in ``self.store_columns`` to *path*.

        Columns that are not (yet) present in ``self.results`` are written
        as empty columns instead of raising a ``KeyError``.
        """
        # NOTE: earlier code passed a string as ``header``. pandas only
        # treats list-like values as column aliases, so the string acted
        # like ``True`` and was never written to the file. ``header=True``
        # keeps the on-disk format exactly as it was.
        self.results.reindex(columns=self.store_columns).to_csv(
            path,
            index=False,
            float_format='%.4f',
            header=True)

    def update_results(self, rows, store_intermediate=False):
        """Append *rows* to ``self.results``.

        Args:
            rows: A dict (one row), a pandas Series (one row), a DataFrame,
                or an iterable of dicts.
            store_intermediate: If true, also write the current results to
                "results_intermediate.txt" in ``output_path``.
        """
        if isinstance(rows, pd.DataFrame):
            new_rows = rows
        elif isinstance(rows, pd.Series):
            new_rows = rows.to_frame().T
        elif isinstance(rows, dict):
            new_rows = pd.DataFrame([rows])
        else:
            new_rows = pd.DataFrame(list(rows))

        if self.results.empty:
            # Avoid concatenating with an empty frame; keep the predefined
            # column order and add any new columns after it.
            columns = list(self.results.columns)
            columns += [c for c in new_rows.columns if c not in columns]
            self.results = (new_rows.reindex(columns=columns)
                            .reset_index(drop=True))
        else:
            # DataFrame.append was removed in pandas 2.0; concat is the
            # supported replacement.
            self.results = pd.concat([self.results, new_rows],
                                     ignore_index=True, sort=False)

        if store_intermediate:
            self._write_results(
                os.path.join(self.output_path, "results_intermediate.txt"))

    def save_results(self):
        """Write "results.txt" and remove the intermediate results file."""
        output = os.path.join(self.output_path, "results.txt")
        self._write_results(output)
        print(f"Saved results to {output}.")

        intermediate_results = os.path.join(self.output_path,
                                            "results_intermediate.txt")
        if os.path.isfile(intermediate_results):
            os.remove(intermediate_results)

    def evaluate_all(self):
        """Load and process every dataset, saving results after each one."""
        for data_name, data_dict in self.datasets.items():
            print(f"\n\nDataset {data_name}")
            data_dir = os.path.join(self.output_path, data_name)
            # makedirs also creates a missing output_path and tolerates an
            # already existing directory.
            os.makedirs(data_dir, exist_ok=True)
            self.load_graph(data_dict)
            self.process_dataset(data_name, data_dict)
            self.save_results()
        self.cleanup_edgefiles()

    def compute_metrics(self, data_dict, embedding):
        """Evaluate all configured metrics on *embedding*.

        Each metric function is looked up by name on ``evalmetrics`` and
        called with the graph, the embedding, the metric's "args" and the
        dataset dictionary.

        Returns:
            dict mapping result names to values. The result of the "glam"
            metric is merged into the dict directly (it is expected to be
            a mapping itself); all other results are stored under the
            metric's name.
        """
        results = {}
        for metric_name, metric_dict in self.metrics.items():
            metric_method = getattr(evalmetrics, metric_dict["method"])
            res = metric_method(data_dict["G"], embedding,
                                **metric_dict["args"],
                                data_dict=data_dict)
            if metric_name == "glam":
                results.update(res)
            else:
                results[metric_name] = res
        return results

    def cleanup_edgefiles(self):
        """Delete the file referenced by each dataset's "edgefile" entry."""
        for data_dict in self.datasets.values():
            if "edgefile" in data_dict:
                tmp_edgefile = data_dict["edgefile"]
                if os.path.exists(tmp_edgefile):
                    os.remove(tmp_edgefile)

    def compute_visualization(self, G, embedding, emb_prefix, format="pdf"):
        """Draw the edges and nodes of *G* at the positions in *embedding*.

        Args:
            G: Graph handed to the ``visualization`` helpers.
            embedding: Array with at most two columns.
            emb_prefix: Path prefix; "_edge_vis" and "_vis" are appended
                for the two output files.
            format: Output format passed on to the drawing functions.
        """
        print("Visualizing results...")
        assert embedding.shape[1] <= 2, "Embeddings can only be visulized in two dimensions."
        visualization.draw_edges(G, embedding,
                                 filename=emb_prefix + "_edge_vis",
                                 format=format)
        visualization.draw_nodes(G, embedding,
                                 filename=emb_prefix + "_vis",
                                 format=format)

    def preprocess_graph(self, data_dict, edgelist_filepath=None):
        """Load a graph with EvalNE, preprocess it and store it as "G".

        Args:
            data_dict: Dataset dictionary; "file", "delimiter" and
                "directed" are read, "G" is written.
            edgelist_filepath: Optional directory. If given, the edge list
                is saved there as "<name>.txt" with a header line holding
                the node and edge counts.
        """
        # Load and preprocess the network
        G = pp.load_graph(data_dict["file"],
                          delimiter=data_dict["delimiter"],
                          directed=data_dict["directed"])
        G, _ = pp.prep_graph(G)
        data_dict["G"] = G
        print("Number of nodes {}".format(len(G.nodes)))
        print("Number of edges {}".format(len(G.edges)))
        if edgelist_filepath is not None:
            # NOTE(review): "file" is set to the directory, not to the
            # edge list written below - confirm this is intended.
            data_dict["file"] = edgelist_filepath
            header = str(G.number_of_nodes()) + ' ' + str(len(G.edges))
            np.savetxt(
                os.path.join(edgelist_filepath, data_dict['name'] + ".txt"),
                # Materialize the edge view so numpy receives a plain
                # sequence of (u, v) pairs.
                list(G.edges()),
                delimiter=",", fmt="%d", header=header)

    def load_graph(self, data_dict):
        """
        Load graph from edgelist without preprocessing.

        Reads "file" and "delimiter" (and "comments", defaulting to "#")
        from *data_dict* and stores an undirected networkx graph as "G".
        """
        # ndmin=2 keeps a file with a single edge as one (u, v) row
        # instead of a flat array of two scalars.
        E = np.loadtxt(data_dict["file"],
                       delimiter=data_dict["delimiter"],
                       comments=data_dict.get("comments", "#"),
                       dtype=int, ndmin=2)
        G = nx.Graph()
        G.add_edges_from(E)
        data_dict["G"] = G
        print("Number of nodes {}".format(len(G.nodes)))
        print("Number of edges {}".format(len(G.edges)))

    def _run_method(self, method_dict, data_dict, emb_filepath, parameters):
        """Call the method's external command and return its wall time."""
        # NOTE(review): the command is a shell string and the paths are
        # inserted unquoted; paths with spaces or shell metacharacters
        # will break. Only use trusted configuration values here.
        command = (method_dict["command"]
                   + " --inputgraph " + data_dict["file"]
                   + " --output " + emb_filepath
                   + " --dimension " + str(method_dict["emb_dimension"]))
        if parameters:
            command = command + " " + parameters
        start = time.time()
        try:
            subprocess.run(command, shell=True, stderr=PIPE,
                           check=True, encoding='utf8')
        except CalledProcessError as err:
            print(f"Method call {command} failed with following output:\n{err.stderr}")
            # Same effect as exit(1) but without relying on the builtin
            # injected by the ``site`` module.
            raise SystemExit(1) from err
        return time.time() - start

    def _reduce_with_tsne(self, embedding, method_dict, emb_filepath):
        """Reduce *embedding* with t-SNE; return (embedding, seconds).

        The original embedding file is kept under a "_dim<k>" suffix and
        the reduced embedding is written to *emb_filepath*.
        """
        pretsne_filename = (emb_filepath[:-4] + "_dim"
                            + str(method_dict["emb_dimension"]) + ".txt")
        # os.replace overwrites a stale file on every platform.
        os.replace(emb_filepath, pretsne_filename)
        print(f"Using t-SNE to reduce dimensionality from {method_dict['emb_dimension']} to {self.emb_dimension}")
        tsne_start = time.time()
        embedding = TSNE(n_components=self.emb_dimension).fit_transform(embedding)
        elapsed = time.time() - tsne_start
        np.savetxt(emb_filepath, embedding, delimiter=",")
        return embedding, elapsed

    def process_dataset(self, data_name, data_dict):
        """Embed and evaluate one dataset with every configured method.

        For each method and repetition the embedding is read from
        ``<emb_path>/<data_name>/<method_name>/`` if it exists there,
        otherwise the method's command is executed to create it (runtime
        is recorded as 0 for cached embeddings). One result row is added
        per repetition, and the first repetition is visualized if
        ``self.visualize`` is set.

        Args:
            data_name: Name of the dataset, used in paths and results.
            data_dict: Dataset dictionary holding at least "file" and "G".

        Raises:
            SystemExit: If a method's command exits with a non-zero status.
            AssertionError: If an embedding does not have
                ``self.emb_dimension`` columns.
        """
        for method_name, method_dict in self.methods.items():
            emb_dir = os.path.join(self.emb_path, data_name, method_name)
            # emb_path may differ from output_path, so the dataset level
            # directory is not guaranteed to exist yet.
            os.makedirs(emb_dir, exist_ok=True)
            parameters = method_dict["parameters"]
            for r in range(1, self.repetitions + 1):
                print(
                    f"\nMethod {method_name}, rep {r} of {self.repetitions}")
                emb_prefix = f"{data_name}_{method_name}_{r}"
                emb_filename = emb_prefix + "_embedding.txt"
                emb_filepath = os.path.join(emb_dir, emb_filename)

                freshly_computed = not os.path.exists(emb_filepath)
                if freshly_computed:
                    exec_time = self._run_method(method_dict, data_dict,
                                                 emb_filepath, parameters)
                else:
                    print("Reading embedding from {}.".format(emb_filepath))
                    exec_time = 0
                # ndmin=2 keeps single-row/column embeddings 2-D so that
                # shape[1] below is always defined.
                embedding = np.loadtxt(emb_filepath, delimiter=",", ndmin=2)

                # A cached embedding that already has the final dimension
                # is the output of an earlier t-SNE run. Reducing it again
                # would overwrite the stored high-dimensional embedding and
                # apply t-SNE to already reduced data.
                needs_tsne = method_dict.get("use_tsne", False) and (
                    freshly_computed
                    or embedding.shape[1] != self.emb_dimension)
                if needs_tsne:
                    embedding, tsne_time = self._reduce_with_tsne(
                        embedding, method_dict, emb_filepath)
                    exec_time += tsne_time

                # Check that embeddings all have final dimension
                assert embedding.shape[1] == self.emb_dimension, f"Embedding is expected to have {self.emb_dimension} dimensions."

                res = {'rep': r,
                       'dataset': data_name,
                       'method': method_name,
                       'embedding': embedding,
                       'embedding_file': os.path.join(data_name, method_name,
                                                      emb_filename),
                       'runtime': exec_time,
                       'parameters': parameters,
                       'dimension': method_dict["emb_dimension"]}
                metric_results = self.compute_metrics(data_dict, embedding)
                res.update(metric_results)
                self.update_results(res, store_intermediate=True)

                if self.visualize and r == 1:
                    self.compute_visualization(
                        data_dict["G"],
                        embedding,
                        os.path.join(self.output_path, data_name, emb_prefix))