# -*- coding: utf-8 -*-
"""
Created on Thu Jul 20 14:59:13 2023
@author: Pandadada
"""
# # Tutorial
# In[1]:
# import numpy as np
# import os
# import sys
# sys.path.append(os.path.realpath(
# 'E:/Master Thesis/comparisons/fornax2/fornax'))
import sys
# sys.modules[__name__].__dict__.clear()
import os
# os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import networkx as nx
import fornax
import random
import string
import pickle
import time
from lib1.data_loader import load_local_data, histog, build_noisy_circular_graph
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
from IPython.core.display import SVG
# Add project root dir
# ROOT_DIR = os.path.abspath("../../")
# sys.path.append(ROOT_DIR)
from sqlalchemy import text
# To install the dependencies for this notebook:
#
# ```bash
# conda env create -f environment.yml
# source activate fornax_tutorial
# ```
#
# To run this notebook from the project root:
#
# ```bash
# cd docs/tutorial
# jupyter-notebook
# ```
# In this tutorial we will:
#
# * Load a graph of superheros and their teams from csv files
#
# * Search for nodes in the graph using a string similarity function
#
# * Use fornax to search for nodes using string similarity and fuzzy graph matching
#
# The data in this tutorial was generated using the preceding notebook: `Tutorial1.ipynb`.
#
# ## Introduction
#
# `nodes.csv` and `edges.csv` contain a graph of superheroes and their teams, along with alternative names (or aliases) for those heroes and teams.
#
# The image below uses the example of Iron Man, who is known as "Tony" to his friends.
# Iron Man is a member of the Avengers, a.k.a. Earth's Mightiest Superheroes.
# Other heroes are also members of The Avengers, and they will also have aliases.
# Other heroes will also be members of other teams, and so on and so forth.
#
#
# All of these heroes, teams and aliases together make up our target graph: the graph which we will search using fornax.
# In[2]:
# SVG('../img/iron_man.svg')
# SVG('E:/Master Thesis/comparisons/iron_man.svg')
# Let's load the data into the notebook using pandas.
# In[3]:
#%%
# # used for converting csv values in nodes.csv
# mapping = {
#     '0': 'hero',
#     '1': 'team',
#     '2': 'hero_alias',
#     '3': 'team_alias'
# }
# nodes_df = pd.read_csv(
#     './nodes.csv',
#     # rename the columns as targets as this will form the target graph
#     # (the graph which we will be searching)
#     names=['target_label', 'target_type', 'target_id'],
#     # ignore the header
#     header=0,
#     converters={
#         # convert target_type from numeric values to
#         # literal string representations for ease of reading
#         'target_type': lambda key: mapping.get(key)
#     }
# )
# # contains pairs of target node ids
# edges_df = pd.read_csv('./edges.csv')
# # We can see that the target nodes have a label (the hero's primary name).
# # The target_type column will be one of `hero`, `team`, `hero_alias`, `team_alias`, the four types of nodes in the graph.
# #
# # (Note that by hero we mean a person in a comic book who has superpowers, regardless of whether they are good or bad.)
# # In[4]:
# nodes_df['target_label'].head()
# # Edges are pairs of `target_id` values.
# # Note that fornax deals with undirected graphs so there is no need to add the edge in the reverse direction.
# # Doing so will cause an exception as the edge will be considered a duplicate.
# # In[5]:
# edges_df.head()
#%%
# ## Label similarity
#
# For some motivation, before using fornax, let us search for nodes just using their labels.
# Let's search for nodes similar to `guardians`, `star` and `groot`.
#
# We will create a function that, given a pair of labels, returns a score where:
#
# $$0 \leq score \leq 1$$
#
# Second, we'll create a search function that returns the rows from our table of target nodes that have a non-zero similarity score.
# In[6]:
# dataset_n='mutag'
# dataset_n='protein'
# dataset_n='protein_notfull'
# dataset_n='aids'
# dataset_n='ptc' # color is not a tuple
# dataset_n='cox2'
dataset_n = 'bzr'
# dataset_n = 'firstmm'
dataset_name = 'BZR'
# dataset_name = 'FIRSTMM_DB'
path = 'E:/Master Thesis/dataset/data/'
X, label = load_local_data(path, dataset_n, wl=0)
NumG = len(X)
N = 6  # size of the query graph
# NumQ = NumG
# NumQ = 5
NumQ_for_each_graph = 10
Is_create_query = 0
mean_fea = 0  # mean of the Gaussian noise added to the node features
std_fea = 0   # standard deviation of the zero-mean Gaussian feature noise
# str_mean = 0
# str_std = 0.1
# Generate a random string of a given length
def random_string(length):
    letters = string.ascii_lowercase
    return ''.join(random.choice(letters) for _ in range(length))
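# Example (hypothetical output; the result varies with the RNG state):
#   random_string(5) -> e.g. 'qzvxa'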
# # Create a list of 97 random strings
# target_labels = query_labels + [random_string(random.randint(1, 10)) for _ in range(97)]
# target_labels = query_labels + [random.choice(query_labels) for _ in range(100-5)]
# target_labels = ['guardians', 'groot', 'star', 'guardians2']
Eps = 1
missing_files_count = 0
# Cost =np.zeros(NumQ)
# Ratio = np.zeros(NumQ)
# Time_execute = np.zeros(NumQ)
# Time_match = np.zeros(NumQ)
# Time_opt = np.zeros(NumQ)
# Time_total = np.zeros(NumQ)
# Time_total_2 = np.zeros(NumQ)
Cost = np.zeros((NumG, NumQ_for_each_graph))
Ratio = np.zeros((NumG, NumQ_for_each_graph))
Time_execute = np.zeros((NumG, NumQ_for_each_graph))
Time_match = np.zeros((NumG, NumQ_for_each_graph))
Time_opt = np.zeros((NumG, NumQ_for_each_graph))
Time_total = np.zeros((NumG, NumQ_for_each_graph))
Time_total_2 = np.zeros((NumG, NumQ_for_each_graph))
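# Each metric is recorded per (target graph, query) pair; failed runs are stored
# as NaN and excluded from the overall averages below via np.nanmean/np.nanstd.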
for num in range(NumG):
    print("num =", num)
    for numq in range(NumQ_for_each_graph):
        print("numq =", numq)
        #%%
        if Is_create_query:
            #%% Randomly select a graph
            random_number = random.randint(0, NumG - 1)  # randomly select a graph
            random_number = 0
            # g3 = random.choice(X).nx_graph
            x = X[random_number].nx_graph
            target_nodes = x.nodes()  # [dict]
            target_id = sorted(list(x.nodes.keys()))  # [list] list of node_id
            target_labels = []
            for i in range(len(target_id)):
                key = target_id[i]
                target_labels.append(target_nodes[key]['attr_name'])
            target_edges = list(x.edges())
            #%% create connected query graphs by BFS
            # Randomly select a graph
            random_number = random.randint(0, NumG - 1)  # randomly select a graph
            random_number = 0
            # g3 = random.choice(X).nx_graph
            g3 = X[random_number].nx_graph
            while N > len(g3.nodes()):
                print("The required query is too big")
                g3 = random.choice(X).nx_graph
            # Randomly select a starting node
            start_node = random.choice(list(g3.nodes()))
            # Initialize a subgraph with the starting node
            subgraph = nx.Graph()
            subgraph.add_node(start_node, attr_name=g3.nodes[start_node].get("attr_name", None))
            # Use a breadth-first search (BFS) to add connected nodes to the subgraph
            queue = [start_node]
            while len(subgraph) < N and queue:
                current_node = queue.pop(0)
                neighbors = list(g3.neighbors(current_node))
                random.shuffle(neighbors)  # Shuffle neighbors for randomness
                for neighbor in neighbors:
                    if neighbor not in subgraph and len(subgraph) < N:
                        subgraph.add_node(neighbor, attr_name=g3.nodes[neighbor].get("attr_name", None))
                        subgraph.add_edge(current_node, neighbor)
                        queue.append(neighbor)
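            # The resulting subgraph is connected by construction: every new node is
            # attached via an edge to a node already in the subgraph, and the BFS
            # stops once N nodes are collected (or the queue empties on small graphs).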
            # query = subgraph
            query_nodes = subgraph.nodes()  # [dict]
            query_id = sorted(list(subgraph.nodes.keys()))  # [list] list of node_id; query_id has to be remapped for fornax.Connection (ids always start from zero)
            query_labels = []
            for i in range(len(query_id)):
                key = query_id[i]
                query_labels.append(query_nodes[key]['attr_name'])
            query_edges = list(subgraph.edges())
        else:
            # target
            x = X[num].nx_graph
            target_nodes = x.nodes()  # [dict]
            target_id = sorted(list(x.nodes.keys()))  # [list] list of node_id
            target_labels = []
            for i in range(len(target_id)):
                key = target_id[i]
                target_labels.append(target_nodes[key]['attr_name'])
            target_edges = list(x.edges())
            # query
            # clean
            # Construct the file path
            # file_name = str(num) + '.pickle'
            file_name = str(num) + '_' + str(numq) + '.pickle'
            folder_path_1 = "E:\\Master Thesis\\dataset\\data\\" + dataset_name + "\\query_noise_fea_0_0"
            file_path_1 = os.path.join(folder_path_1, file_name)
            # Check if the file exists
            if not os.path.exists(file_path_1):
                # File does not exist
                print(f"File {file_name} does not exist.")
                missing_files_count += 1
            # Load the pickle file
            with open(file_path_1, 'rb') as file1:
                g2_nodummy_original = pickle.load(file1)
            query_nodes_original = g2_nodummy_original.nodes()  # [dict]
            query_id_original = sorted(list(g2_nodummy_original.nodes.keys()))  # [list] list of node_id; query_id has to be remapped for fornax.Connection (ids always start from zero)
            query_labels_original = []
            for i in range(len(query_id_original)):
                key = query_id_original[i]
                query_labels_original.append(query_nodes_original[key]['attr_name'])
            query_edges_original = list(g2_nodummy_original.edges())
            # noisy
            folder_path_2 = "E:\\Master Thesis\\dataset\\data\\" + dataset_name + "\\query_noise_fea_" + str(mean_fea) + "_" + str(std_fea)
            file_path_2 = os.path.join(folder_path_2, file_name)
            with open(file_path_2, 'rb') as file2:
                g2_nodummy = pickle.load(file2)
            query_nodes = g2_nodummy.nodes()  # [dict]
            query_id = sorted(list(g2_nodummy.nodes.keys()))  # [list] list of node_id; query_id has to be remapped for fornax.Connection (ids always start from zero)
            query_labels = []
            for i in range(len(query_id)):
                key = query_id[i]
                query_labels.append(query_nodes[key]['attr_name'])
            query_edges = list(g2_nodummy.edges())
        #%% change the ids of this graph
        def relabel_graph(ids, edges):
            # Create a mapping from old node IDs to new node IDs
            old_to_new = {node: idx for idx, node in enumerate(ids)}
            # Create a list of new edges with updated node IDs
            new_edges = [(old_to_new[start], old_to_new[end]) for start, end in edges]
            # Create a list of new ids, simply using the range of new IDs
            new_ids = list(range(len(ids)))
            return new_ids, new_edges
        target_id, target_edges = relabel_graph(target_id, target_edges)
        query_id, query_edges = relabel_graph(query_id, query_edges)
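        # Worked example of the relabelling (hypothetical ids):
        #   relabel_graph([5, 9, 12], [(5, 9), (9, 12)]) -> ([0, 1, 2], [(0, 1), (1, 2)])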
        #%%
        # query_labels = ['guardians', 'groot', 'star', 'a', 'a']  # 4 features, 5 nodes
        # create nodes_df by hand
        # Create a dictionary with the desired data
        data1 = {
            'Index': list(range(len(target_labels))),
            'target_label': target_labels,
            'target_type': ['hero'] * len(target_labels),
            'target_id': target_id
            # 'target_id': list(x + 1000 for x in range(len(target_labels)))
            # 'target_id': list(range(len(target_labels)))
            # 'target_id': [1081675, 1276753309, 74671434, 1295400389]
        }
        # Create the DataFrame
        nodes_df = pd.DataFrame(data1)
        #%%
        # start_range = 1000
        # end_range = 1100
        # # Create a list of integers in the specified range
        # integers = list(range(start_range, end_range + 1))
        # # Shuffle the integers to ensure randomness
        # # random.shuffle(integers)
        # # Create two lists
        # # list_a = [1000, 1000]
        # # list_b = [1001, 1002]
        # list_a = []
        # list_b = []
        # for a in range(1000, 1100):
        #     # a = random.choice(integers)
        #     # b = random.choice(integers)
        #     # a = integers[_+1]
        #     for b in range(a + 1, 1100):
        #         list_a.append(a)
        #         list_b.append(b)
        # # Data for the DataFrame (the ids should be the same as in data1)
        # data2 = {
        #     'Index': list(range(len(list_a))),  # first column, with indexes starting from 0
        #     # 'start': [100, 100, 100, 100],
        #     # 'end': [101, 102, 103, 101]
        #     'start': list_a,
        #     'end': list_b
        #     # 'start': [0, 0, 0],
        #     # 'end': [1, 2, 3]
        #     # 'start': [1081675, 1081675, 1081675],
        #     # 'end': [1276753309, 74671434, 1295400389]
        # }
        start_ids = [edge[0] for edge in target_edges]
        end_ids = [edge[1] for edge in target_edges]
        data2 = {
            'Index': list(range(len(start_ids))),  # edge indices, starting from 0
            'start': start_ids,  # start node id of each edge
            'end': end_ids       # end node id of each edge
        }
        # Create the DataFrame
        edges_df = pd.DataFrame(data2)
        #%%
        # def node_scoring_function(first: str, second: str):
        #     """ node scoring function takes two strings and returns a
        #     score in the range 0 <= score <= 1
        #     """
        #     first_, second_ = sorted((first.lower(), second.lower()), key=len)
        #     # if first is not a substring of second: score = 0
        #     if first_ not in second_:
        #         return 0
        #     # otherwise use the relative difference between
        #     # the two lengths
        #     score = len(second_) - len(first_)
        #     score /= max(len(first_), len(second_))
        #     score = 1. - score
        #     return score
        # def node_scoring_function(f1, f2, fea_metric):  # return 0 when two features are the same
        #     if fea_metric == 'jaccard':
        #         set1 = set(f1)
        #         set2 = set(f2)
        #         intersection = len(set1.intersection(set2))
        #         union = len(set1.union(set2))
        #         return 1.0 - intersection / union
        #     elif fea_metric == 'sqeuclidean':
        #         if f1.shape != f2.shape:
        #             return 1.0
        #         else:
        #             return 1 - 1 / (1 + np.sum((f1 - f2) ** 2))
        # def node_scoring_function(f1, f2):  # return 0 when two features are the same
        #     # fea_metric == 'sqeuclidean'
        #     f1 = np.array(f1)
        #     f2 = np.array(f2)
        #     if f1.shape != f2.shape:
        #         return 1.0
        #     else:
        #         return 1 - 1 / (1 + np.sum((f1 - f2) ** 2))
        def node_scoring_function(f1, f2):  # returns 1 when two features are identical
            # squared-Euclidean similarity (fea_metric == 'sqeuclidean'): 1 / (1 + ||f1 - f2||^2)
            f1 = np.array(f1)
            f2 = np.array(f2)
            if f1.shape != f2.shape:
                return 0
            else:
                return 1 / (1 + np.sum((f1 - f2) ** 2))
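        # Sanity checks with hypothetical feature vectors:
        #   node_scoring_function([1.0, 0.0], [1.0, 0.0]) -> 1.0  (identical features)
        #   node_scoring_function([1.0, 0.0], [0.0, 0.0]) -> 0.5  (1 / (1 + 1))
        #   node_scoring_function([1.0], [1.0, 0.0])      -> 0    (shape mismatch)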
        # In[7]:
        # def search(query_id: int, query_label: str):
        def search(query_id, query_label, Eps):
            # compute all of the scores
            scores = nodes_df['target_label'].apply(
                node_scoring_function,
                args=(query_label,)
            )
            # replace exact zeros with a tiny value to keep all labels in the candidate list
            small_value = 1e-20
            scores = scores.replace(0, small_value)
            # create a boolean mask
            # epsilon = 1e-1  # for a noisy query
            # epsilon = 1e-9  # for a clean query
            epsilon = Eps
            mask = scores > 1 - epsilon
            # mask = scores > 0
            # # If we do not want to set constraints on the features:
            # mask = scores >= 0
            # grab the non-zero scoring nodes
            matches = nodes_df[mask].copy()
            # add extra columns
            matches['score'] = scores[mask]
            # matches['query_label'] = query_label
            matches['query_label'] = [query_label] * len(matches)
            matches['query_id'] = query_id
            return matches
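        # Usage sketch (one call per query node): with Eps = 1 the mask reduces to
        # scores > 0, so every target node remains a candidate, e.g.
        #   search(0, query_labels[0], Eps).head()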
        # ### Aside:
        # Note that these search functions are not terribly efficient.
        # They involve repeated full scans of the target nodes table.
        # If we were searching a larger graph we could use a search tree as an index, or an external string-matching service or database. However, since this is a tutorial, the above functions are simpler and more reproducible.
        # This is important as we will be using these search results with fornax.
        # In[8]:
        # Examining the table below we can see that we have a conundrum.
        # There are 22 nodes with varying similarity to `star` and 4 nodes similar to `galaxy`.
        # In[9]:
        # find the target nodes similar to each query label
        start_match = time.time()
        matches = pd.concat(search(id_, label, Eps) for id_, label in enumerate(query_labels))
        end_match = time.time()
        Time_match[num, numq] = end_match - start_match
        print("TimeMatch", Time_match[num, numq])
        # unique_count = matches['query_id'].nunique()  # no need to add this
        # if unique_count < N:
        #     print("not enough nodes in the candidate set")
        #     Ratio[num] = np.nan
        #     Time[num] = np.nan
        #     continue
        # Fornax enables a more powerful type of search.
        # By specifying 'guardians', 'star' and 'groot' as nodes in a graph,
        # and by specifying the relationships between them,
        # we can search for nodes in our target graph with the same relationships.
        # ## Creating a target graph
        #
        # Fornax behaves much like a database. In fact it uses SQLite or PostgreSQL to store and index graph data.
        # To insert a new graph into fornax we use the following three steps:
        # 1. create a new graph
        # 2. add nodes and node metadata
        # 3. add edges and edge metadata
        #
        # The object `GraphHandle` is much like a file handle: it does not hold the graph itself but is an accessor to it.
        # If the `GraphHandle` goes out of scope the graph will still persist until it is explicitly deleted, much like a file.
        # In[10]:
        try:
            with fornax.Connection('sqlite:///mydb1.sqlite') as conn:
                target_graph = fornax.GraphHandle.create(conn)
                target_graph.add_nodes(
                    # use id_src to set a custom id on each node
                    id_src=nodes_df['target_id'],
                    # use other keyword arguments to attach arbitrary metadata to each node
                    label=nodes_df['target_label'],
                    # the type keyword is reserved, so we use target_type
                    target_type=nodes_df['target_type']
                    # metadata must be JSON serialisable
                )
                target_graph.add_edges(edges_df['start'], edges_df['end'])
                # target_graph.graph_id = None
            # We can use the `graph_id` to access our graph in the future.
            # In[11]:
            with fornax.Connection('sqlite:///mydb1.sqlite') as conn:
                another_target_graph_handle = fornax.GraphHandle.read(conn, target_graph.graph_id)
                print(another_target_graph_handle == target_graph)
            # ## Creating a query graph
            #
            # Let's imagine that we suspect `groot` is directly related to `guardians` and `star` is also directly related to `guardians`.
            # For example `groot` and `star` could both be members of a team called `guardians`.
            # Let's create another small graph that represents this situation:
            # In[12]:
            with fornax.Connection('sqlite:///mydb1.sqlite') as conn:
                # create a new graph
                query_graph = fornax.GraphHandle.create(conn)
                # insert the query nodes; in the tutorial these were
                # 'guardians' (id=0), 'star' (id=1), 'groot' (id=2)
                query_graph.add_nodes(label=query_labels)
                # alternatively:
                # query_graph.add_nodes(id_src=query_labels)
                # since id_src can use any unique hashable items
                # edges = [
                #     (0, 1),  # edge between groot and guardians
                #     (0, 2)   # edge between star and guardians
                # ]
                # if the target graph is fully connected, so is the query graph:
                # edges = [(0, 1), (0, 2), (0, 3), (0, 4),
                #          (1, 2), (1, 3), (1, 4),
                #          (2, 3), (2, 4),
                #          (3, 4)]
                edges = query_edges
                # # Create a mapping from old query_id values to new ones
                # query_id_mapping = {old_id: new_id for new_id, old_id in enumerate(set(query_id))}
                # # Update query_id using the mapping
                # query_id = [query_id_mapping[old_id] for old_id in query_id]
                # # Update the edges using the updated query_id values
                # edges = [(query_id_mapping[source], query_id_mapping[target]) for source, target in edges]
                sources, targets = zip(*edges)
                query_graph.add_edges(sources, targets)
                # query_graph.graph_id = None
            # ## Search
            #
            # We can create a query in an analogous way to creating graphs, using a `QueryHandle`,
            # a handle to a query stored in the fornax database.
            # To create a useful query we need to insert the similarity scores computed above.
            # Fornax will use these scores and the graph edges to execute the query.
            # In[13]:
            with fornax.Connection('sqlite:///mydb1.sqlite') as conn:
                query = fornax.QueryHandle.create(conn, query_graph, target_graph)
                query.add_matches(matches['query_id'], matches['target_id'], matches['score'])
                # query.query_id = None
            # Finally we can execute the query using a variety of options.
            # Here we request only the single best match (n=1) between the query graph and the target graph.
            # In[14]:
            with fornax.Connection('sqlite:///mydb1.sqlite') as conn:
                start_time = time.time()
                results = query.execute(n=1, hopping_distance=1)  # top-n results
                end_time = time.time()
                Time_execute[num, numq] = end_time - start_time
            # ## Visualise
            #
            # `query.execute` returns an object describing the search result.
            # Of primary interest is the `graphs` field, which contains a list of graphs in `node_link_graph` format.
            # We can use networkx to draw these graphs and visualise the results.
            # In[15]:
            def draw(graph):
                """ function for drawing a graph using matplotlib and networkx """
                # each graph is already in node_link_graph format
                G = nx.json_graph.node_link_graph(graph)
                labels = {node['id']: node['label'] for node in graph['nodes']}
                node_colour = ['r' if node['type'] == 'query' else 'b' for node in graph['nodes']]
                pos = nx.spring_layout(G)
                nx.draw_networkx_nodes(G, pos, node_size=600, node_color=node_colour, alpha=.3)
                edgelist = [(e['source'], e['target']) for e in graph['links'] if e['type'] != 'match']
                nx.draw_networkx_edges(G, pos, width=3, edgelist=edgelist, edge_color='grey', alpha=.3)
                edgelist = [(e['source'], e['target']) for e in graph['links'] if e['type'] == 'match']
                nx.draw_networkx_edges(G, pos, width=3, edgelist=edgelist, style='dashed', edge_color='pink')
                nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif', labels=labels)
            # Result 1 contains the best match. The three query nodes (in red) best match the three target nodes (in blue). The dashed lines show which pairs of query and target nodes matched each other. The blue nodes are a subgraph of the target graph. Note that the result does not describe the whole target graph because in principle it can be very large.
            #
            # Here we can see that the blue subgraph has exactly the same shape as the red query graph. However, the labels are not exactly the same (e.g. `guardians != Guardians of the Galaxy`) so the result scores less than the maximum score of 1.
            # However, we can see that our query graph is really similar to Groot and Star-Lord from Guardians of the Galaxy.
            # Since this is the best match, we know that no later result will score higher, because the results are ordered.
            # In[16]:
            # for i, graph in enumerate(results['graphs'][:1]):
            #     # plt.title('Result {0}, score: {1:.2f}'.format(1, 1. - graph['cost']))
            #     plt.title('Result {0}, score: {1:.2f}'.format(1, graph['cost']))
            #     draw(graph)
            #     plt.xlim(-1.2, 1.2)
            #     plt.ylim(-1.2, 1.2)
            #     plt.axis('off')
            #     plt.show()
            # Results 2-4 have a lower score because `star` matches to a different node not adjacent to Guardians of the Galaxy. Further inspection would show that `star` has matched aliases of Star-Lord which are near Guardians of the Galaxy but not adjacent to it.
            # In[17]:
            # for i, graph in enumerate(results['graphs'][1:4]):
            #     # plt.title('Result {0}, score: {1:.2f}'.format(i+2, 1. - graph['cost']))
            #     plt.title('Result {0}, score: {1:.2f}'.format(i+2, graph['cost']))
            #     draw(graph)
            #     plt.xlim(-1.2, 1.2)
            #     plt.ylim(-1.2, 1.2)
            #     plt.axis('off')
            #     plt.show()
            # The final match pairs `guardians` and `star` to two nodes that do not have similar edges in the target graph. `groot` is not found in the target graph. The result gets a much lower score than the preceding results, and we can be sure that any additional results will also be poor because the results are ordered.
            # In[18]:
            # for i, graph in enumerate(results['graphs'][4:]):
            #     # plt.title('Result {0}, score: {1:.2f}'.format(i+5, 1. - graph['cost']))
            #     plt.title('Result {0}, score: {1:.2f}'.format(i+5, graph['cost']))
            #     draw(graph)
            #     plt.xlim(-1.2, 1.2)
            #     plt.ylim(-1.2, 1.2)
            #     plt.axis('off')
            #     plt.show()
            #%%
            for i, graph in enumerate(results['graphs'][:1]):
                edgelist = [(e['source'], e['target']) for e in graph['links'] if e['type'] == 'match']
                labels = {node['id']: node['label'] for node in graph['nodes']}
                Cost[num, numq] = graph['cost']
                ratio = 1
                #%%
                # Step 1: Create lists list_A (query side) and list_B (target side) from the match edges
                list_A = [labels[node_id] for node_id, _ in edgelist]
                list_B = [labels[node_id] for _, node_id in edgelist]
                # Step 2: Find indices of elements in list_A that are in query_labels
                indices = [j for j, label in enumerate(list_A) if label in query_labels]
                # Step 3: Reorder query_labels_original based on those indices
                query_labels_original_reorder = [query_labels_original[j] for j in indices]
                # Step 4: Compare list_B and query_labels_original_reorder
                matching_count = sum(1 for x, y in zip(list_B, query_labels_original_reorder) if x == y)
                # Print the count of matching elements
                print("Number of matching elements between list_B and query_labels_original_reorder:", matching_count)
                #%% Ratio of nodes that are matched correctly
                ratio = matching_count / len(edgelist)
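                # Worked example with hypothetical numbers: if the best result has
                # 6 match edges and 4 matched target labels agree with the original
                # (noise-free) query labels, then ratio = 4 / 6 ≈ 0.67.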
            #%%
            Ratio[num, numq] = ratio
            Time_opt[num, numq] = results['time']
            print('ratio', Ratio[num, numq])
            print('time_execute', Time_execute[num, numq])
            print('time_match', Time_match[num, numq])
            print('cost', Cost[num, numq])
            Time_total[num, numq] = Time_execute[num, numq] + Time_match[num, numq]
            Time_total_2[num, numq] = results['time_total'] + Time_match[num, numq]
            print('time_total', Time_total[num, numq])
            print('time_total_2', Time_total_2[num, numq])
            with fornax.Connection('sqlite:///mydb1.sqlite') as conn:  # open the connection first
                # cursor = conn.cursor()
                # cursor.execute('DELETE FROM graphs')
                query.delete()
                target_graph.delete()
                query_graph.delete()
                # fornax.conn.close()
                # conn.commit()
                sql_statement = text('DELETE FROM match;')
                conn.session.execute(sql_statement)
                conn.session.commit()
                conn.close()
        except Exception:
            print("Did not succeed")
            Ratio[num, numq] = 0
            Time_match[num, numq] = np.nan  # not truly nan; we just leave this case out of the averages
            Time_execute[num, numq] = np.nan
            Time_opt[num, numq] = np.nan
            Time_total[num, numq] = np.nan
            Time_total_2[num, numq] = np.nan
            Cost[num, numq] = np.nan
            with fornax.Connection('sqlite:///mydb1.sqlite') as conn:  # open the connection first
                # cursor = conn.cursor()
                # cursor.execute('DELETE FROM graphs')
                # query.delete()
                target_graph.delete()
                # query_graph.delete()
                # fornax.conn.close()
                # conn.commit()
                sql_statement = text('DELETE FROM match;')
                conn.session.execute(sql_statement)
                conn.session.commit()
                conn.close()
#%%
# file_path_1 = "E:/Master Thesis/results/nema/Ratio.npy"
# file_path_2 = "E:/Master Thesis/results/nema/Time.npy"
# np.save(file_path_1, Ratio)
# np.save(file_path_2, Time)
# directory = "E:/Master Thesis/results/nema/local"
# file_path_1 = os.path.join(directory, "Ratio_noise_0.1_eps_"+str(Eps)+".npy")
# file_path_2 = os.path.join(directory, "Time_noise_0.1_eps_"+str(Eps)+".npy")
# file_path_3 = os.path.join(directory, "TimeMatch_noise_0.1_eps_"+str(Eps)+".npy")
# os.makedirs(directory, exist_ok=True)
# np.save(file_path_1, Ratio)
# np.save(file_path_2, Time)
# np.save(file_path_3, Time_match)
#%%
print("Overall")
print('the ratio of correct nodes', np.nanmean(Ratio))
print('Time_execute', np.nanmean(Time_execute))
print('Time_match', np.nanmean(Time_match))
print('Time_opt', np.nanmean(Time_opt))
print('Time_total', np.nanmean(Time_total))
print('Time_total_new', np.nanmean(Time_total_2))
print('cost_mean', np.nanmean(Cost / N))
print('cost_std', np.nanstd(Cost / N))
#%%
# # Create a cursor object to execute SQL statements
# cursor = conn.cursor()
# # Replace 'old_graph_id' with the current identifier of the graph you want to change
# old_graph_id = target_graph.graph_id
# # Replace 'new_graph_id' with the new identifier you want to set for the graph
# new_graph_id = 0
# # Execute the UPDATE statement for nodes table (replace 'nodes_table' with the actual table name)
# cursor.execute("UPDATE nodes_table SET graph_id = ? WHERE graph_id = ?", (new_graph_id, old_graph_id))
# # Execute the UPDATE statement for edges table (replace 'edges_table' with the actual table name)
# cursor.execute("UPDATE edges_table SET graph_id = ? WHERE graph_id = ?", (new_graph_id, old_graph_id))
# # Commit the changes to make the modification permanent
# conn.commit()
# # Close the cursor and the connection
# cursor.close()
# conn.close()