-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlwm_miningAndExtension.c
314 lines (262 loc) · 12 KB
/
lwm_miningAndExtension.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
#include "lwm_miningAndExtension.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include "graph.h"
#include "searchTree.h"
#include "cs_Parsing.h"
#include "cs_Tree.h"
#include "treeEnumeration.h"
#include "lwm_embeddingOperators.h"
//#include "lwm_initAndCollect.h"
/**
* Get a SupportSet for each IntSet in the input.
*
* This method intersects all support sets of the parent patterns of a pattern p and
* returns a list of data structures that the iterative subtree iso algorithm can use
* to compute the support set of p.
*
* This implementation returns the data structures (cubes and pointers to the parent)
* of the parent that has id parentIdToKeep.
*/
struct SupportSet* getCandidateSupportSuperSet(struct IntSet* parentIds, struct SupportSet* previousLevelSupportLists, int parentIdToKeep) {
assert(parentIds != NULL);
assert(previousLevelSupportLists != NULL);
assert(containsInt(parentIds, parentIdToKeep));
// obtain support sets of parents
struct SupportSet* parentSupportSets = getSupportSetsOfPatterns(previousLevelSupportLists, parentIds);
// move desired parent support set to head of list to keep the correct subtreeIsoDataStores
parentSupportSets = supportSetChangeHead(parentSupportSets, parentIdToKeep);
// compute a candidate support set that is the intersection of the support sets of the apriori parents of the extension
struct SupportSet* candidateSupportSuperSet = intersectSupportSets(parentSupportSets);
// garbage collection
while (parentSupportSets) {
struct SupportSet* tmp = parentSupportSets->next;
free(parentSupportSets);
parentSupportSets = tmp;
}
return candidateSupportSuperSet;
}
static void _extendPreviousLevel(// input
struct SupportSet* previousLevelSupportLists,
struct Vertex* previousLevelSearchTree,
struct ShallowGraph* extensionEdges,
size_t threshold,
// output
struct SupportSet** resultCandidateSupportSuperSets,
struct Graph** resultCandidates,
FILE* logStream,
// memory management
struct GraphPool* gp,
struct ShallowGraphPool* sgp) {
assert(previousLevelSupportLists != NULL);
assert(previousLevelSearchTree != NULL);
assert(extensionEdges != NULL);
*resultCandidateSupportSuperSets = NULL;
*resultCandidates = NULL;
struct Vertex* currentLevelCandidateSearchTree = getVertex(gp->vertexPool);
int nAllGeneratedExtensions = 0;
int nAllUniqueGeneratedExtensions = 0;
int nAllExtensionsPostApriori = 0;
int nAllExtensionsPostIntersectionFilter = 0;
int nAddedToOutput = 0;
int nDumped = 0;
// generate a list of extensions of all frequent patterns
// filter these extensions using an apriori property
// for each extension, compute a list of ids of their apriori parents
for (struct SupportSet* frequentPatternSupportList=previousLevelSupportLists; frequentPatternSupportList!=NULL; frequentPatternSupportList=frequentPatternSupportList->next) {
struct Graph* frequentPattern = frequentPatternSupportList->first->data.h;
// extend frequent pattern
struct Graph* listOfExtensions = extendPatternOnOuterShells(frequentPattern, extensionEdges, gp, sgp);
// struct Graph* listOfExtensions = extendPatternOnLeaves(frequentPattern, extensionEdges, gp);
// struct Graph* listOfExtensions = extendPatternByLargerEdgesTMP(frequentPattern, extensionEdges, gp);
for (struct Graph* extension=popGraph(&listOfExtensions); extension!=NULL; extension=popGraph(&listOfExtensions)) {
// count number of generated extensions
++nAllGeneratedExtensions;
/* filter out patterns that were already enumerated as the extension of some other pattern
and are in the search tree */
struct ShallowGraph* string = canonicalStringOfTree(extension, sgp);
int previousNumberOfDistinctPatterns = currentLevelCandidateSearchTree->d;
addToSearchTree(currentLevelCandidateSearchTree, string, gp, sgp);
struct IntSet* aprioriParentIdSet;
if (previousNumberOfDistinctPatterns == currentLevelCandidateSearchTree->d) {
aprioriParentIdSet = NULL;
} else {
++nAllUniqueGeneratedExtensions;
aprioriParentIdSet = aprioriCheckExtensionReturnList(extension, previousLevelSearchTree, gp, sgp);
}
if (aprioriParentIdSet) {
// count number of apriori survivors
++nAllExtensionsPostApriori;
// get (hopefully small) superset of the support set of the extension
struct SupportSet* extensionSupportSuperSet = getCandidateSupportSuperSet(aprioriParentIdSet, previousLevelSupportLists, frequentPattern->number);
dumpIntSet(aprioriParentIdSet);
// check if support superset is larger than the threshold
if (extensionSupportSuperSet->size >= threshold) {
// count number of intersection/threshold survivors
++nAllExtensionsPostIntersectionFilter;
// add extension and support super set to list of candidates for next level
extension->next = *resultCandidates;
*resultCandidates = extension;
extensionSupportSuperSet->next = *resultCandidateSupportSuperSets;
*resultCandidateSupportSuperSets = extensionSupportSuperSet;
++nAddedToOutput;
} else {
// dump extension and support superset
dumpGraph(gp, extension);
dumpSupportSetCopy(extensionSupportSuperSet);
++nDumped;
}
} else {
// dump extension that does not fulfill apriori property
dumpGraph(gp, extension);
++nDumped;
}
}
}
dumpSearchTree(gp, currentLevelCandidateSearchTree);
fprintf(logStream, "generated extensions: %i\n"
"unique extensions: %i\n"
"apriori filtered extensions: %i\n"
"intersection filtered extensions: %i\n",
nAllGeneratedExtensions, nAllUniqueGeneratedExtensions, nAllExtensionsPostApriori, nAllExtensionsPostIntersectionFilter);
assert(nAddedToOutput + nDumped == nAllGeneratedExtensions);
}
static struct SupportSet* _BFSgetNextLevel(// input
struct SupportSet* previousLevelSupportLists,
struct Vertex* previousLevelSearchTree,
size_t threshold,
struct ShallowGraph* frequentEdges,
// embedding operator function pointer,
struct SubtreeIsoDataStore (*embeddingOperator)(struct SubtreeIsoDataStore, struct Graph*, double, struct GraphPool*, struct ShallowGraphPool*),
double importance,
// output
struct Vertex** currentLevelSearchTree,
FILE* logStream,
// memory management
struct GraphPool* gp,
struct ShallowGraphPool* sgp) {
assert(previousLevelSupportLists != NULL);
assert(previousLevelSearchTree != NULL);
assert(frequentEdges != NULL);
struct SupportSet* currentLevelCandidateSupportSets;
struct Graph* currentLevelCandidates;
_extendPreviousLevel(previousLevelSupportLists, previousLevelSearchTree, frequentEdges, threshold,
¤tLevelCandidateSupportSets, ¤tLevelCandidates, logStream,
gp, sgp);
//iterate over all patterns in candidateSupports
struct SupportSet* actualSupportLists = NULL;
struct SupportSet* actualSupportListsTail = NULL;
struct SupportSet* candidateSupport = NULL;
struct Graph* candidate = NULL;
for (candidateSupport=currentLevelCandidateSupportSets, candidate=currentLevelCandidates; candidateSupport!=NULL; candidateSupport=candidateSupport->next, candidate=candidate->next) {
struct SupportSet* currentActualSupport = getSupportSet();
//iterate over all graphs in the support
for (struct SupportSetElement* e=candidateSupport->first; e!=NULL; e=e->next) {
// create actual support list for candidate pattern
struct SubtreeIsoDataStore result = embeddingOperator(e->data, candidate, importance, gp, sgp);
if (result.foundIso) {
appendSupportSetData(currentActualSupport, result);
} else {
dumpNewCube(result.S, result.g->n);
}
}
// filter out candidates with support < threshold
if (currentActualSupport->size < threshold) {
// mark h as infrequent
candidate->activity = 0;
dumpSupportSet(currentActualSupport);
} else {
// mark h as frequent
candidate->activity = currentActualSupport->size;
// add to output list, maintaining order. necessary
if (actualSupportListsTail) {
actualSupportListsTail->next = currentActualSupport;
actualSupportListsTail = currentActualSupport;
} else {
actualSupportLists = currentActualSupport;
actualSupportListsTail = currentActualSupport;
}
}
}
// garbage collection
candidateSupport = currentLevelCandidateSupportSets;
while (candidateSupport) {
struct SupportSet* tmp = candidateSupport->next;
dumpSupportSetCopy(candidateSupport);
candidateSupport = tmp;
}
// add frequent extensions to current level search tree output, set their numbers correctly
// dump those candidates that are not frequent
int nAllFrequentExtensions = 0;
candidate = currentLevelCandidates;
while (candidate) {
struct Graph* tmp = candidate->next;
candidate->next = NULL;
if (candidate->activity == 0) {
dumpGraph(gp, candidate);
} else {
++nAllFrequentExtensions;
struct ShallowGraph* cString = canonicalStringOfTree(candidate, sgp);
cString->data = candidate->activity;
addMultiSetToSearchTree(*currentLevelSearchTree, cString, gp, sgp);
candidate->number = (*currentLevelSearchTree)->lowPoint;
}
candidate = tmp;
}
fprintf(logStream, "frequent patterns: %i\n", nAllFrequentExtensions);
return actualSupportLists;
}
void BFSStrategy(size_t startPatternSize,
size_t maxPatternSize,
size_t threshold,
struct Vertex* initialFrequentPatterns,
struct SupportSet* supportSets,
struct ShallowGraph* extensionEdges,
// embedding operator function pointer,
struct SubtreeIsoDataStore (*embeddingOperator)(struct SubtreeIsoDataStore, struct Graph*, double, struct GraphPool*, struct ShallowGraphPool*),
double importance,
FILE* featureStream,
FILE* patternStream,
FILE* logStream,
struct GraphPool* gp,
struct ShallowGraphPool* sgp) {
// levelwise search for patterns with more than one vertex:
struct Vertex* previousLevelSearchTree = shallowCopySearchTree(initialFrequentPatterns, gp);
struct SupportSet* previousLevelSupportSets = supportSets;
struct Vertex* currentLevelSearchTree = previousLevelSearchTree; // initialization for garbage collection in case of maxPatternSize == 1
struct SupportSet* currentLevelSupportSets = previousLevelSupportSets; // initialization for garbage collection in case of maxPatternSize == 1
for (size_t p=startPatternSize+1; (p<=maxPatternSize) && (previousLevelSearchTree->number>0); ++p) {
fprintf(logStream, "Processing patterns with %zu vertices:\n", p); fflush(logStream);
currentLevelSearchTree = getVertex(gp->vertexPool);
offsetSearchTreeIds(currentLevelSearchTree, previousLevelSearchTree->lowPoint);
currentLevelSupportSets = _BFSgetNextLevel(previousLevelSupportSets, previousLevelSearchTree, threshold, extensionEdges, embeddingOperator, importance, ¤tLevelSearchTree, logStream, gp, sgp);
printStringsInSearchTree(currentLevelSearchTree, patternStream, sgp);
fflush(patternStream);
printSupportSetsSparse(currentLevelSupportSets, featureStream);
fflush(featureStream);
// garbage collection:
// what is now all previousLevel... data structures will not be used at all in the next iteration
dumpSearchTree(gp, previousLevelSearchTree);
while (previousLevelSupportSets) {
struct SupportSet* tmp = previousLevelSupportSets->next;
// ...hence, we also dump the pattern graphs completely, which we can't do in a DFS mining approach.
dumpSupportSetWithPattern(previousLevelSupportSets, gp);
previousLevelSupportSets = tmp;
}
// previous level = current level
previousLevelSearchTree = currentLevelSearchTree;
previousLevelSupportSets = currentLevelSupportSets;
}
// madness(currentLevelSupportSets, currentLevelSearchTree, extensionEdges, maxPatternSize, threshold, &previousLevelSupportSets, &previousLevelSearchTree, featureStream, patternStream, logStream, gp, sgp);
// garbage collection
dumpSearchTree(gp, previousLevelSearchTree);
while (previousLevelSupportSets) {
struct SupportSet* tmp = previousLevelSupportSets->next;
// we also dump the pattern graphs completely, which we can't do in a DFS mining approach.
dumpSupportSetWithPattern(previousLevelSupportSets, gp);
previousLevelSupportSets = tmp;
}
}