From e0037e1a7812f9674f9028a1ec01648884b91641 Mon Sep 17 00:00:00 2001
From: Ravindra <42912207+KR-Ravindra@users.noreply.github.com>
Date: Thu, 26 Oct 2023 21:16:47 -0700
Subject: [PATCH] The majestic suffix arrays will top charts

---
 parser/parser.py        | 62 +++++++++++++++++++++++++++++++++--------
 parser/requirements.txt |  5 ++++
 scrapper/scrapper.py    |  2 --
 3 files changed, 55 insertions(+), 14 deletions(-)

diff --git a/parser/parser.py b/parser/parser.py
index 2b6581c..09cbdd2 100644
--- a/parser/parser.py
+++ b/parser/parser.py
@@ -134,10 +134,19 @@ def display(self, node=None, prefix=''):
         else:
             for char, child in node.children.items():
                 self.display(child, prefix + char)
-    
+
+def construct_suffix_tree(scrapped_content):
+    from suffix_tree import Tree
+    return Tree({"A": scrapped_content})
+
+
+def suffix_tree(suffix_tree, pattern):
+    return len(suffix_tree.find_all(pattern))
+
 def construct_suffix_array(text):
-    suffixes = [(text[i:], i) for i in range(len(text))]
-    suffixes.sort(key=lambda x: x[0])
+    n = len(text)
+    suffixes = [(text[i:], i) for i in range(n)]
+    suffixes.sort()
     suffix_array = [item[1] for item in suffixes]
     return suffix_array
 
@@ -159,19 +168,13 @@ def search_pattern_with_suffix_array(text, pattern, suffix_array): #Using bst ag
             while i <= right and text[suffix_array[i]:suffix_array[i] + m] == pattern:
                 positions.append(suffix_array[i])
                 i += 1
-            return positions
+            return len(positions)
         elif text[suffix:suffix + m] < pattern:
             left = mid + 1
         else:
             right = mid - 1
-    return positions
+    return len(positions)
 
-def suffix_array(text, pattern):
-    print("Invoked Suffix Array")
-    suffix_array_structure = construct_suffix_array(text)
-    occurrences = search_pattern_with_suffix_array(text, pattern, suffix_array_structure)
-    return len(occurrences)
-
 
 # ************************************************************************************************************************************
 
@@ -254,6 +257,41 @@ def get_keywords(algo_choice, scrapped_content):
     keywords_and_count = []
     existing_keywords = []
     start_time = time.time()
+    if algo_choice == "suffix_array":
+        logger.info("Triggered Suffix Arrays")
+        suffix_array = construct_suffix_array(scrapped_content)
+        for each_word in scrapped_content.split(" "):
+            if each_word == "":
+                continue
+            elif not each_word.isalpha():
+                continue
+            else:
+                if each_word not in existing_keywords:
+                    occurences = search_pattern_with_suffix_array(scrapped_content, each_word, suffix_array)
+                    keywords_and_count.append({"keyword": each_word, "count": occurences})
+                    existing_keywords.append(each_word)
+        return keywords_and_count, elapsed_time + (time.time()-start_time)
+    if algo_choice == "suffix_tree":
+        logger.info("Triggered Suffix Trees")
+        start_time = time.time()
+        keywords_and_count = []
+        existing_keywords = []
+        constructed_suffix_tree = construct_suffix_tree(scrapped_content)
+        try:
+            for each_word in scrapped_content.split(" "):
+                if each_word == "":
+                    continue
+                elif not each_word.isalpha():
+                    continue
+                else:
+                    if each_word not in existing_keywords:
+                        occurences = suffix_tree(constructed_suffix_tree, each_word)
+                        keywords_and_count.append({"keyword": each_word, "count": occurences})
+                        existing_keywords.append(each_word)
+            return keywords_and_count, time.time() - start_time
+        except Exception as e:
+            logger.error(f"Error while parsing suffix tree: {e}")
+            return None
     for eachword in scrapped_content.split(" "):
         if eachword == "":
             continue
@@ -335,7 +373,7 @@ async def keyword_recommendations_api(request: Request):
 async def multialgo_api(request: Request):
     payload = await request.json()
     url = payload['url'].strip('/') if payload['url'].endswith('/') else payload['url']
-    algo_choices = ["rabin_karp", "naive", "kmp"]
+    algo_choices = ["rabin_karp", "naive", "kmp", "suffix_array", "suffix_tree"]
     final_response = {"data": []}
     wait_iterator = 0
     try:
diff --git a/parser/requirements.txt b/parser/requirements.txt
index a67e6ae..a7630ea 100644
--- a/parser/requirements.txt
+++ b/parser/requirements.txt
@@ -1,11 +1,14 @@
 annotated-types==0.6.0
+antlr4-python3-runtime==4.13.1
 anyio==3.7.1
+automaton-tools==0.1.8
 beautifulsoup4==4.12.2
 better-profanity==0.7.0
 bs4==0.0.1
 certifi==2023.7.22
 charset-normalizer==3.3.1
 click==8.1.7
+csuffixtree==0.3.6
 fastapi==0.104.0
 futures==3.0.5
 google-search-results==2.4.2
@@ -13,6 +16,7 @@ goslate==1.5.4
 h11==0.14.0
 idna==3.4
 Jinja2==3.1.2
+jsonify==0.5
 lxml==4.9.3
 MarkupSafe==2.1.3
 numpy==1.26.1
@@ -32,6 +36,7 @@ six==1.16.0
 sniffio==1.3.0
 soupsieve==2.5
 starlette==0.27.0
+suffix-tree==0.1.2
 typing_extensions==4.8.0
 tzdata==2023.3
 urllib3==2.0.7
diff --git a/scrapper/scrapper.py b/scrapper/scrapper.py
index eb6dbd2..06525b8 100644
--- a/scrapper/scrapper.py
+++ b/scrapper/scrapper.py
@@ -11,8 +11,6 @@ from nltk.tag import pos_tag
 
 
 
-
-
 nltk.download('punkt')
 nltk.download('words')
 nltk.download('stopwords')
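For reviewers, a minimal usage sketch of the new suffix-array helpers in parser/parser.py (illustrative only: it assumes the repository root is on the import path so that parser/parser.py is importable as parser.parser, and the sample text is invented). Note that search_pattern_with_suffix_array now returns an occurrence count rather than the list of match positions.

    # Illustrative sketch; the parser.parser import path and the sample text are assumptions.
    from parser.parser import construct_suffix_array, search_pattern_with_suffix_array

    text = "banana bandana banana"
    sa = construct_suffix_array(text)  # sort every suffix, keep the start offsets (naive O(n^2 log n) build)
    count = search_pattern_with_suffix_array(text, "banana", sa)  # binary search over the sorted suffixes
    print(count)  # occurrence count reported by the search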