-
Notifications
You must be signed in to change notification settings - Fork 3
/
BooleanMerge.py
85 lines (76 loc) · 3.39 KB
/
BooleanMerge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
'''
The BooleanMerge module does a hard conjunction (AND) for posting lists
'''
import math
DOC_ID_INDEX = 0
TF_INDEX = 1
POSTINGS_INDEX = 2
def get_docID(list, index):
return list[index][DOC_ID_INDEX]
def has_skip(list, index, skip_dist):
return index + skip_dist < len(list) and index % skip_dist == 0
def get_skip_docID(list, index, skip_dist):
return list[index + skip_dist][DOC_ID_INDEX]
def get_skip_position(list, index, skip_dist):
return list[index + skip_dist]
def get_intersected_posting_lists(single, biword, triword):
'''
To intersect the postings lists of single words, biwords and triwords, the postings lists are
combined. The merge_n_lists function then retrieves the docIDs which are common
to all the postings lists. The postings lists are then filtered to remove docIDs which are not
contained in the list of common doc IDs. Each posting list is a list of postings, where each posting
is in either the form [docID, tf, position list] or [docID, tf].
:param single: postings lists of single words.
:param biword: postings lists of biwords.
:param triword: postings lists of triwords.
:return:
'''
lists = single + biword + triword
common_doc_IDs = merge_n_lists(lists)
reduced_single = list(map(lambda plist: list(filter(lambda list: list[0] in common_doc_IDs, plist)), single))
reduced_biword = list(map(lambda plist: list(filter(lambda list: list[0] in common_doc_IDs, plist)), biword))
reduced_triword = list(map(lambda plist: list(filter(lambda list: list[0] in common_doc_IDs, plist)), triword))
return reduced_single, reduced_biword, reduced_triword
def merge_n_lists(lists):
'''
Returns a list of docIDs that occur in all lists. This is done by sorted the lists by length and
then intersecting the lists pairwise.
:param lists: a list of posting lists.
'''
lists = list(map(lambda x: [len(x), x], lists))
lists = sorted(lists, key=lambda x: x[0], reverse=True)
if len(lists) <= 1:
return lists
merged_list = intersect(lists[0][1], lists[1][1], lists[0][0], lists[1][0])
for i in range(2, len(lists)):
merged_list = intersect(merged_list, lists[i][1], len(merged_list), lists[i][0])
docID_list = list(map(lambda x: x[0], merged_list))
return docID_list
def intersect(listA, listB, A_length, B_length):
'''
Intersects two lists making use of dynamically generated skip pointers.
'''
result = []
i, j = 0, 0
skip_dist_A = int(math.sqrt(A_length))
skip_dist_B = int(math.sqrt(B_length))
while i < A_length and j < B_length:
docID_A = get_docID(listA, i)
docID_B = get_docID(listB, j)
if docID_A == docID_B:
result.append(listA[i])
i += 1
j += 1
elif docID_A < docID_B:
if has_skip(listA, i, skip_dist_A) and get_skip_docID(listA, i, skip_dist_A) <= docID_B:
while has_skip(listA, i, skip_dist_A) and get_skip_docID(listA, i, skip_dist_A) <= docID_B:
i += skip_dist_A
else:
i += 1
elif docID_A > docID_B:
if has_skip(listB, j, skip_dist_B) and get_skip_docID(listB, j, skip_dist_B) <= docID_A:
while has_skip(listB, j, skip_dist_B) and get_skip_docID(listB, j, skip_dist_B) <= docID_A:
j += skip_dist_B
else:
j += 1
return result