The majestic suffix arrays will top charts
KR-Ravindra committed Oct 27, 2023
1 parent 81b74c9 commit e0037e1
Showing 3 changed files with 55 additions and 14 deletions.
62 changes: 50 additions & 12 deletions parser/parser.py
@@ -134,10 +134,19 @@ def display(self, node=None, prefix=''):
        else:
            for char, child in node.children.items():
                self.display(child, prefix + char)


+def construct_suffix_tree(scrapped_content):
+    from suffix_tree import Tree
+    return Tree({"A": scrapped_content})
+
+
+def suffix_tree(suffix_tree, pattern):
+    return len(suffix_tree.find_all(pattern))
+
def construct_suffix_array(text):
-    suffixes = [(text[i:], i) for i in range(len(text))]
-    suffixes.sort(key=lambda x: x[0])
+    n = len(text)
+    suffixes = [(text[i:], i) for i in range(n)]
+    suffixes.sort()
    suffix_array = [item[1] for item in suffixes]
    return suffix_array
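
For reference, here is a quick sanity check of the rewritten construct_suffix_array. This is a minimal standalone sketch; the "banana" input is an arbitrary example, not taken from the repository:

def construct_suffix_array(text):
    # Same logic as the committed version: pair each suffix with its
    # starting offset, sort lexicographically, keep only the offsets.
    n = len(text)
    suffixes = [(text[i:], i) for i in range(n)]
    suffixes.sort()
    return [item[1] for item in suffixes]

# Sorted suffixes of "banana": a, ana, anana, banana, na, nana
print(construct_suffix_array("banana"))  # [5, 3, 1, 0, 4, 2]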

@@ -159,19 +168,13 @@ def search_pattern_with_suffix_array(text, pattern, suffix_array): #Using bst again
            while i <= right and text[suffix_array[i]:suffix_array[i] + m] == pattern:
                positions.append(suffix_array[i])
                i += 1
-            return positions
+            return len(positions)
        elif text[suffix:suffix + m] < pattern:
            left = mid + 1
        else:
            right = mid - 1
-    return positions
+    return len(positions)
-
-def suffix_array(text, pattern):
-    print("Invoked Suffix Array")
-    suffix_array_structure = construct_suffix_array(text)
-    occurrences = search_pattern_with_suffix_array(text, pattern, suffix_array_structure)
-    return len(occurrences)


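Since the sorted suffix array puts every suffix that starts with a given pattern in one contiguous block, the same count can be obtained with two stock binary searches. A minimal sketch of that observation (count_with_bisect is a hypothetical helper, not part of this commit; the key= argument to bisect requires Python 3.10+):

from bisect import bisect_left, bisect_right

def count_with_bisect(text, pattern, suffix_array):
    m = len(pattern)
    prefix = lambda i: text[i:i + m]  # compare only the first m characters
    lo = bisect_left(suffix_array, pattern, key=prefix)   # first match
    hi = bisect_right(suffix_array, pattern, key=prefix)  # one past last match
    return hi - lo

sa = construct_suffix_array("banana")  # helper from the diff above
print(count_with_bisect("banana", "ana", sa))  # 2 ("ana" starts at offsets 1 and 3)
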
# ************************************************************************************************************************************

@@ -254,6 +257,41 @@ def get_keywords(algo_choice, scrapped_content):
    keywords_and_count = []
    existing_keywords = []
    start_time = time.time()
+    if algo_choice == "suffix_array":
+        logger.info("Triggered Suffix Arrays")
+        suffix_array = construct_suffix_array(scrapped_content)
+        for each_word in scrapped_content.split(" "):
+            if each_word == "":
+                continue
+            elif not each_word.isalpha():
+                continue
+            else:
+                if each_word not in existing_keywords:
+                    occurences = search_pattern_with_suffix_array(scrapped_content, each_word, suffix_array)
+                    keywords_and_count.append({"keyword": each_word, "count": occurences})
+                    existing_keywords.append(each_word)
+        return keywords_and_count, elapsed_time + (time.time()-start_time)
+    if algo_choice == "suffix_tree":
+        logger.info("Triggered Suffix Trees")
+        start_time = time.time()
+        keywords_and_count = []
+        existing_keywords = []
+        constructed_suffix_tree = construct_suffix_tree(scrapped_content)
+        try:
+            for each_word in scrapped_content.split(" "):
+                if each_word == "":
+                    continue
+                elif not each_word.isalpha():
+                    continue
+                else:
+                    if each_word not in existing_keywords:
+                        occurences = suffix_tree(constructed_suffix_tree, each_word)
+                        keywords_and_count.append({"keyword": each_word, "count": occurences})
+                        existing_keywords.append(each_word)
+            return keywords_and_count, time.time() - start_time
+        except Exception as e:
+            logger.error(f"Error while parsing suffix tree: {e}")
+            return None
    for eachword in scrapped_content.split(" "):
        if eachword == "":
            continue
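
One note on the two new branches above: the duplicate check each_word not in existing_keywords scans a Python list, so the loop is quadratic in the number of distinct words. A set-based variant of the same traversal keeps it linear; this is a hypothetical refactor sketch, not code from the commit (count_fn stands in for either counting helper):

def keyword_counts(scrapped_content, count_fn):
    seen = set()  # O(1) membership checks instead of scanning a list
    keywords_and_count = []
    for each_word in scrapped_content.split(" "):
        if each_word and each_word.isalpha() and each_word not in seen:
            seen.add(each_word)
            keywords_and_count.append(
                {"keyword": each_word, "count": count_fn(scrapped_content, each_word)}
            )
    return keywords_and_count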
@@ -335,7 +373,7 @@ async def keyword_recommendations_api(request: Request):
async def multialgo_api(request: Request):
    payload = await request.json()
    url = payload['url'].strip('/') if payload['url'].endswith('/') else payload['url']
-    algo_choices = ["rabin_karp", "naive", "kmp"]
+    algo_choices = ["rabin_karp", "naive", "kmp", "suffix_array", "suffix_tree"]
    final_response = {"data": []}
    wait_iterator = 0
    try:
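The suffix-tree helpers added above wrap the suffix-tree package pinned in requirements.txt below. A minimal usage sketch of the two calls the commit relies on (Tree construction and find_all), assuming suffix-tree==0.1.2 is installed:

from suffix_tree import Tree  # pip install suffix-tree==0.1.2

# construct_suffix_tree() keys the whole document under the single id "A";
# the suffix_tree() helper then counts matches via find_all().
tree = Tree({"A": "banana"})
print(len(tree.find_all("ana")))  # expected: 2 (suffixes at offsets 1 and 3)
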
5 changes: 5 additions & 0 deletions parser/requirements.txt
@@ -1,18 +1,22 @@
annotated-types==0.6.0
antlr4-python3-runtime==4.13.1
anyio==3.7.1
automaton-tools==0.1.8
beautifulsoup4==4.12.2
better-profanity==0.7.0
bs4==0.0.1
certifi==2023.7.22
charset-normalizer==3.3.1
click==8.1.7
csuffixtree==0.3.6
fastapi==0.104.0
futures==3.0.5
google-search-results==2.4.2
goslate==1.5.4
h11==0.14.0
idna==3.4
Jinja2==3.1.2
jsonify==0.5
lxml==4.9.3
MarkupSafe==2.1.3
numpy==1.26.1
@@ -32,6 +36,7 @@ six==1.16.0
sniffio==1.3.0
soupsieve==2.5
starlette==0.27.0
suffix-tree==0.1.2
typing_extensions==4.8.0
tzdata==2023.3
urllib3==2.0.7
2 changes: 0 additions & 2 deletions scrapper/scrapper.py
@@ -11,8 +11,6 @@
from nltk.tag import pos_tag


-
-
nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')
