-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
executable file
·72 lines (59 loc) · 1.92 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from itertools import chain, combinations
import re
def cleanTags(text):
    '''
    This function strips the enclosing <script>/<style> tags from a webpage text.
    Only an opening tag at the very start of the text and a closing tag at the
    very end (or just before a final newline) are removed; tags that appear
    anywhere else in the text are left untouched.
    Tag names are matched case-insensitively because HTML tag names are not
    case sensitive (e.g. <SCRIPT> and <script> are the same element).
    :param text: A string containing webpage text
    :return: A string containing webpage text with the enclosing tags removed
    '''
    tags = [
        r"^<script[^>]*>",
        r"</script>$",
        r"^<style[^>]*>",
        r"</style>$",
    ]
    for tag in tags:
        # IGNORECASE so that upper/mixed-case tags are stripped as well.
        text = re.sub(tag, "", text, flags=re.IGNORECASE)
    return text
def isEmptyLine(text):
    '''
    This function tells whether a webpage text line holds nothing but blank characters.
    The characters treated as blank are: space, newline, carriage return,
    tab and the non-breaking space (U+00A0).
    :param text: A string containing webpage text line
    :return: True if the line is empty or made only of blank characters, or else False
    '''
    blank_chars = " \n\r\t\xa0"
    # Stripping the blank characters from both ends leaves an empty string
    # exactly when every character of the line is one of them.
    return not text.strip(blank_chars)
def makeTransactions(banner, annotations):
    '''
    This function builds one transaction per annotation of a banner.
    Each transaction is a list that starts with the banner and is followed by
    the items of the annotation, skipping any item that is None.
    :param banner: The banner to place first in every transaction
    :param annotations: An iterable of annotations; each annotation is itself
                        an iterable of items (None items are ignored)
    :return: A list of transactions, one for each annotation
    '''
    return [
        [banner] + [entry for entry in annotation if entry is not None]
        for annotation in annotations
    ]
def makeAllTransactions(bannerToAnnotationsMap):
    '''
    This function builds the transactions of every banner in the given map.
    The transactions of each banner are produced by makeTransactions and
    concatenated into a single flat list, following the iteration order of the map.
    :param bannerToAnnotationsMap: A dictionary that maps banners to their annotations
    :return: A list of all transactions
    '''
    return [
        transaction
        for banner, annotations in bannerToAnnotationsMap.items()
        for transaction in makeTransactions(banner, annotations)
    ]