From ba38c2ecc145af18edce98bd15b7954907e4f2c8 Mon Sep 17 00:00:00 2001 From: Ravindra <42912207+KR-Ravindra@users.noreply.github.com> Date: Wed, 25 Oct 2023 23:30:15 -0700 Subject: [PATCH 1/9] The massive backend push --- docker-compose.yml | 33 ++++ parser/parser.py | 374 ++++++++++++++++++++++++++++++++++++++ parser/requirements.txt | 38 ++++ parser/string_algo.py | 217 ++++++++++++++++++++++ scrapper/mock.json | 34 ---- scrapper/requirements.txt | 13 ++ scrapper/scrapper.py | 116 ++++++++---- 7 files changed, 760 insertions(+), 65 deletions(-) create mode 100644 docker-compose.yml create mode 100644 parser/parser.py create mode 100644 parser/requirements.txt create mode 100644 parser/string_algo.py delete mode 100644 scrapper/mock.json diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..90a6192 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,33 @@ +version: "3.9" +services: + # frontend: + # image: node:20 + # ports: + # - 3000:3000 + # volumes: + # - ./:/app + # working_dir: /app + # command: sh -c "npm i && npm start" + + # scrapper: + # image: python:3.11 + # ports: + # - 8000:8000 + # volumes: + # - ./:/app + # working_dir: /app/scrapper + # command: sh -c "pip3 install -r requirements.txt && python3 main.py" + + # parser: + # image: python:3.11 + # ports: + # - 8001:8000 + # volumes: + # - ./:/app + # working_dir: /app/parser + # command: sh -c "pip3 install -r requirements.txt && python3 parser.py" + + redis: + image: redis:6.2 + ports: + - 6379:6379 \ No newline at end of file diff --git a/parser/parser.py b/parser/parser.py new file mode 100644 index 0000000..4c81eff --- /dev/null +++ b/parser/parser.py @@ -0,0 +1,374 @@ +from fastapi import FastAPI, Request +import uvicorn +from fastapi.middleware.cors import CORSMiddleware +import logging + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +app = FastAPI() +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +# Algorithms *************************************************************************************************************************** + +class RabinKarp: + def __init__(self, text, pattern): + self.text = text + self.pattern = pattern + self.text_length = len(text) + self.pattern_length = len(pattern) + self.hash_value = 0 + self.pattern_hash_value = 0 + self.window = [] + self.base = 256 + self.prime = 101 + self.occurrences = [] + def calculate_hash_value(self, string, length): + value = 0 + for i in range(length): + value = (self.base * value + ord(string[i])) % self.prime + return value + def recalculate_hash_value(self, old_hash, old_char, new_char): + new_hash = (self.base * (old_hash - ord(old_char) * (self.base **(self.pattern_length - 1))) + ord(new_char)) % self.prime + return new_hash + def search_pattern(self): + self.pattern_hash_value = self.calculate_hash_value(self.pattern, + self.pattern_length) + self.hash_value = self.calculate_hash_value(self.text, self.pattern_length) + pattern_found = False + for i in range(self.text_length - self.pattern_length + 1): + if self.pattern_hash_value == self.hash_value: + for j in range(self.pattern_length): + if self.text[i + j] != self.pattern[j]: + break + else: + self.occurrences.append(i) + pattern_found = True + if i < self.text_length - self.pattern_length: + self.hash_value = self.recalculate_hash_value(self.hash_value, self.text[i], self.text[i + self.pattern_length]) + if not pattern_found: + 
print("Pattern not found in the text.") + return len(self.occurrences) + + +def rabin_karp(text, pattern): + rk_search = RabinKarp(text, pattern) + return rk_search.search_pattern() + + +def naive(text, pattern): + n = len(text) + m = len(pattern) + occurrences = [] + for i in range(n - m + 1): + if text[i:i+m] == pattern: + occurrences.append(i) + return len(occurrences) + +def compute_prefix_function(pattern): + m = len(pattern) + pi = [0] * m + j = 0 + for i in range(1, m): + while j > 0 and pattern[i] != pattern[j]: + j = pi[j-1] + if pattern[i] == pattern[j]: + j += 1 + pi[i] = j + return pi + +def kmp(text, pattern): + n = len(text) + m = len(pattern) + pi = compute_prefix_function(pattern) + j = 0 + occurrences = [] + for i in range(n): + while j > 0 and text[i] != pattern[j]: + j = pi[j-1] + if text[i] == pattern[j]: + j += 1 + if j == m: + occurrences.append(i - m + 1) + j = pi[j-1] + return len(occurrences) + + +class TrieNode: + def __init__(self): + self.children = {} + self.is_end_of_word = False + +class Trie: + def __init__(self): + self.root = TrieNode() + def insert(self, word): + node = self.root + for char in word: + if char not in node.children: + node.children[char] = TrieNode() + node = node.children[char] + node.is_end_of_word = True + + +class SuffixTree: + def __init__(self, text): + self.text = text + self.trie = Trie() + self.build() + def build(self): + for i in range(len(self.text)): + self.trie.insert(self.text[i:]) + def display(self, node=None, prefix=''): + node = node or self.trie.root + if not node.children: + print(prefix) + else: + for char, child in node.children.items(): + self.display(child, prefix + char) + +def construct_suffix_array(text): + suffixes = [(text[i:], i) for i in range(len(text))] + suffixes.sort(key=lambda x: x[0]) + suffix_array = [item[1] for item in suffixes] + return suffix_array + +def search_pattern_with_suffix_array(text, pattern, suffix_array): #Using bst against suffix array + n = len(text) + m = len(pattern) + left, right = 0, n - 1 + positions = [] + while left <= right: + mid = (left + right) // 2 + suffix = suffix_array[mid] + if text[suffix:suffix + m] == pattern: + positions.append(suffix) + i = mid - 1 + while i >= left and text[suffix_array[i]:suffix_array[i] + m] == pattern: + positions.append(suffix_array[i]) + i -= 1 + i = mid + 1 + while i <= right and text[suffix_array[i]:suffix_array[i] + m] == pattern: + positions.append(suffix_array[i]) + i += 1 + return positions + elif text[suffix:suffix + m] < pattern: + left = mid + 1 + else: + right = mid - 1 + return positions + +def suffix_array(text, pattern): + print("Invoked Suffix Array") + suffix_array_structure = construct_suffix_array(text) + occurrences = search_pattern_with_suffix_array(text, pattern, suffix_array_structure) + return len(occurrences) + + +# ************************************************************************************************************************************ + + +# Cache Store ********************************************************************************************************************** + +import redis +import json + +def get_redis_connection(): + return redis.Redis(host="localhost", port=6379, db=0) + +def push_to_redis(key, response): + logger.info(f"Pushing to Cache Store {response}") + try: + redis_connection = get_redis_connection() + redis_connection.hset(key, "response", json.dumps(response)) + return True + except Exception as e: + logger.error(f"Error while pushing to Redis: {e}") + +def check_in_redis(key): + 
logger.info("Checking in our precious Cache Store") + try: + redis_connection = get_redis_connection() + response = redis_connection.hget(key, "response") + if response: + logger.info("Match found, returning from Cache Store") + return json.loads(response) + else: + return False + except Exception as e: + logger.error(f"Error while checking in Redis: {e}") + return False + +# ************************************************************************************************************************************ + +# Recommendations Generator *********************************************************************************************************** +import requests +import better_profanity + +def get_seo_recommendation(keyword): + url = "https://www.spyfu.com/NsaApi/RelatedKeyword/GetPhraseMatchedKeywords" + payload = f"{{\"query\":\"{keyword}\",\"pageSize\":10,\"isOverview\":true,\"countryCode\":\"US\"}}" + headers = { + 'content-type': 'application/json;charset=UTF-8', + 'Cookie': 'ASP.NET_SessionId=rutmlg02sfx4yakg0nd0asxw' + } + + response = requests.request("POST", url, headers=headers, data=payload) + alternate_keywords = [] + for each in response.json()["keywords"]: + if not better_profanity.profanity.contains_profanity(each["keyword"]): + alternate_keywords.append(each["keyword"]) + return alternate_keywords + + +def get_suggested_replacements(keyword): + url = f"https://api.datamuse.com/words?rel_syn={keyword}" + response = requests.get(url) + if response.status_code == 200: + synonyms = [word['word'] for word in response.json()][:2] + return synonyms + else: + return None + +def generate_recommendations(keywords_and_count): + for each in keywords_and_count: + each["mostSearchedAlternatives"] = get_seo_recommendation(each["originalKeyword"]) + each["probableReplacements"] = get_suggested_replacements(each["originalKeyword"]) + return keywords_and_count + + +# ************************************************************************************************************************************ + + +# Parsing Engine *************************************************************************************************************************** +import time +def get_keywords(algo_choice, scrapped_content): + keywords_and_count = [] + existing_keywords = [] + start_time = time.time() + for eachword in scrapped_content.split(" "): + if eachword == "": + continue + elif not eachword.isalpha(): + continue + else: + if eachword not in existing_keywords: + keywords_and_count.append({"originalKeyword": eachword, "count": eval(f"{algo_choice}(scrapped_content, eachword)")}) + existing_keywords.append(eachword) + return keywords_and_count, time.time() - start_time + +def get_top_keywords(keywords_and_count): + keywords_and_count.sort(key=lambda x: x["count"], reverse=True) + try: + return keywords_and_count[:20] + except Exception as exc: + return exc +# ****************************************************************************************************************************************** + +# API Endpoints *************************************************************************************************************************** + + +@app.post('/api/v1/keyword/') +async def keyword_api(request: Request): + payload = await request.json() + url = payload['url'].strip('/') if payload['url'].endswith('/') else payload['url'] + try: + while True: + data = check_in_redis(url) + if data: + logger.info("Found in Cache Store, Checking if this algo is already executed") + algo_exists = check_in_redis(url + 
payload["algoChoice"]) + if algo_exists: + logger.info("Cache Store already has recorded this algo, here you go!") + return algo_exists + break + else: + logger.info("Let's give that scrapper engine, a tad bit more time") + time.sleep(10) + logger.info("Calling for parsing") + keywords, execution_time = get_keywords(payload["algoChoice"],data["scrapedContent"]) + final_response = { "topKeywordListings": get_top_keywords(keywords), "alogirthmExecutionTime": execution_time} + logger.info("Quickly pushing to Cache Store") + push_to_redis(url + payload["algoChoice"],final_response) + return final_response + except Exception as e: + return {"503": f"{e}"} + +@app.post('/api/v1/keyword-recommendations/') +async def keyword_recommendations_api(request: Request): + payload = await request.json() + url = payload['url'].strip('/') if payload['url'].endswith('/') else payload['url'] + try: + data = check_in_redis(url) + if data: + logger.info("Found in Cache Store, Checking if this algo is already executed") + existing_algo_data = check_in_redis(url + payload["algoChoice"]) + if existing_algo_data: + logger.info("Cache store found this entry, checking if recommendations already exists") + if existing_algo_data["topKeywordListings"][0].get("mostSearchedAlternatives"): + logger.info("Recommendations exist, returning my precious data without changes") + return existing_algo_data + all_keywords = existing_algo_data["topKeywordListings"] + modified_keywords = generate_recommendations(all_keywords) + existing_algo_data["topKeywordListings"] = modified_keywords + logger.info("Revalidating the cache with recommendations") + push_to_redis(url + payload["algoChoice"],existing_algo_data) + return existing_algo_data + else: + return {"503": "Please run the keyword algo first"} + except Exception as e: + return {"503": f"{e}"} + +@app.post('/api/v1/multi-algo/') +async def multialgo_api(request: Request): + payload = await request.json() + url = payload['url'].strip('/') if payload['url'].endswith('/') else payload['url'] + algo_choices = ["rabin_karp", "naive", "kmp"] + final_response = {"data": []} + try: + while True: + data = check_in_redis(url) + if data: + logger.info("Multi algo found in Cache Store, Checking if this function for multi-algo is already executed") + algo_exists = check_in_redis(url + "multi-algo") + if algo_exists: + logger.info("Cache Store already has recorded this multi-algo, here you go!") + return algo_exists + break + else: + logger.info("Let's give that scrapper engine, a tad bit more time") + time.sleep(10) + for each_algo in algo_choices: + logger.info("Checking if said algo exists") + logger.info(f"Running for {each_algo}") + algo_exists = check_in_redis(url + each_algo) + if algo_exists: + logger.info("Cache Store already has recorded this algo, here you go!") + final_response["data"].append({"algoName": each_algo, "algoExecutionTime": algo_exists["alogirthmExecutionTime"]}) + else: + logger.info("Calling for parsing") + keywords, execution_time = get_keywords(each_algo,data["scrapedContent"]) + intermediate_response = { "topKeywordListings": get_top_keywords(keywords), "alogirthmExecutionTime": execution_time} + logger.info("Quickly pushing to Cache Store") + push_to_redis(url + each_algo,intermediate_response) + final_response["data"].append({"algoName": each_algo, "algoExecutionTime": execution_time}) + print(final_response) + push_to_redis(url + "multi-algo",final_response) + return final_response + except Exception as e: + return {"503": f"{e}"} + + +# 
************************************************************************************************************************************ + + +if __name__ == '__main__': + uvicorn.run("parser:app", host='0.0.0.0', port=8001, reload=True) \ No newline at end of file diff --git a/parser/requirements.txt b/parser/requirements.txt new file mode 100644 index 0000000..a67e6ae --- /dev/null +++ b/parser/requirements.txt @@ -0,0 +1,38 @@ +annotated-types==0.6.0 +anyio==3.7.1 +beautifulsoup4==4.12.2 +better-profanity==0.7.0 +bs4==0.0.1 +certifi==2023.7.22 +charset-normalizer==3.3.1 +click==8.1.7 +fastapi==0.104.0 +futures==3.0.5 +google-search-results==2.4.2 +goslate==1.5.4 +h11==0.14.0 +idna==3.4 +Jinja2==3.1.2 +lxml==4.9.3 +MarkupSafe==2.1.3 +numpy==1.26.1 +pandas==2.1.1 +pydantic==2.4.2 +pydantic_core==2.10.1 +PyDictionary==2.0.1 +pyseoanalyzer==4.0.7 +python-dateutil==2.8.2 +pytrends==4.9.2 +pytz==2023.3.post1 +redis==5.0.1 +requests==2.31.0 +seo-keyword-research-tool==0.1.9 +serpapi==0.1.4 +six==1.16.0 +sniffio==1.3.0 +soupsieve==2.5 +starlette==0.27.0 +typing_extensions==4.8.0 +tzdata==2023.3 +urllib3==2.0.7 +uvicorn==0.23.2 diff --git a/parser/string_algo.py b/parser/string_algo.py new file mode 100644 index 0000000..c451a57 --- /dev/null +++ b/parser/string_algo.py @@ -0,0 +1,217 @@ +class RabinKarp: + def __init__(self, text, pattern): + self.text = text + self.pattern = pattern + self.text_length = len(text) + self.pattern_length = len(pattern) + self.hash_value = 0 + self.pattern_hash_value = 0 + self.window = [] + self.base = 256 + self.prime = 101 + self.occurrences = [] + def calculate_hash_value(self, string, length): + value = 0 + for i in range(length): + value = (self.base * value + ord(string[i])) % self.prime + return value + def recalculate_hash_value(self, old_hash, old_char, new_char): + new_hash = (self.base * (old_hash - ord(old_char) * (self.base **(self.pattern_length - 1))) + ord(new_char)) % self.prime + return new_hash + def search_pattern(self): + self.pattern_hash_value = self.calculate_hash_value(self.pattern, + self.pattern_length) + self.hash_value = self.calculate_hash_value(self.text, self.pattern_length) + pattern_found = False + for i in range(self.text_length - self.pattern_length + 1): + if self.pattern_hash_value == self.hash_value: + for j in range(self.pattern_length): + if self.text[i + j] != self.pattern[j]: + break + else: + print(f"Pattern found at index {i}") + self.occurrences.append(i) + pattern_found = True + if i < self.text_length - self.pattern_length: + self.hash_value = self.recalculate_hash_value(self.hash_value, self.text[i], self.text[i + self.pattern_length]) + if not pattern_found: + print("Pattern not found in the text.") + return len(self.occurrences) + +def rabin_karp(text, pattern): + rk_search = RabinKarp(text, pattern) + return rk_search.search_pattern() + + +def naive_string_matching(text, pattern): + n = len(text) + m = len(pattern) + occurrences = [] + for i in range(n - m + 1): + if text[i:i+m] == pattern: + occurrences.append(i) + return len(occurrences) + +def compute_prefix_function(pattern): + m = len(pattern) + pi = [0] * m + j = 0 + for i in range(1, m): + while j > 0 and pattern[i] != pattern[j]: + j = pi[j-1] + if pattern[i] == pattern[j]: + j += 1 + pi[i] = j + return pi + +def kmp_search(text, pattern): + n = len(text) + m = len(pattern) + pi = compute_prefix_function(pattern) + j = 0 + occurrences = [] + for i in range(n): + while j > 0 and text[i] != pattern[j]: + j = pi[j-1] + if text[i] == pattern[j]: + j += 1 + if j 
== m: + occurrences.append(i - m + 1) + j = pi[j-1] + return len(occurrences) + + +class TrieNode: + def __init__(self): + self.children = {} + self.is_end_of_word = False + +class Trie: + def __init__(self): + self.root = TrieNode() + def insert(self, word): + node = self.root + for char in word: + if char not in node.children: + node.children[char] = TrieNode() + node = node.children[char] + node.is_end_of_word = True + + +class SuffixTree: + def __init__(self, text): + self.text = text + self.trie = Trie() + self.build() + def build(self): + for i in range(len(self.text)): + self.trie.insert(self.text[i:]) + def display(self, node=None, prefix=''): + node = node or self.trie.root + if not node.children: + print(prefix) + else: + for char, child in node.children.items(): + self.display(child, prefix + char) + + + + +def suffix_tree(text, pattern): + print("Invoked Suffix Tree") + suffix_tree = SuffixTree(text) + occurrences = suffix_tree.search(pattern) + return len(occurrences) + +def construct_suffix_array(text): + suffixes = [(text[i:], i) for i in range(len(text))] + suffixes.sort(key=lambda x: x[0]) + suffix_array = [item[1] for item in suffixes] + return suffix_array + +def search_pattern_with_suffix_array(text, pattern, suffix_array): #Using bst against suffix array + n = len(text) + m = len(pattern) + left, right = 0, n - 1 + positions = [] + while left <= right: + mid = (left + right) // 2 + suffix = suffix_array[mid] + if text[suffix:suffix + m] == pattern: + positions.append(suffix) + i = mid - 1 + while i >= left and text[suffix_array[i]:suffix_array[i] + m] == pattern: + positions.append(suffix_array[i]) + i -= 1 + i = mid + 1 + while i <= right and text[suffix_array[i]:suffix_array[i] + m] == pattern: + positions.append(suffix_array[i]) + i += 1 + return positions + elif text[suffix:suffix + m] < pattern: + left = mid + 1 + else: + right = mid - 1 + return positions + +def suffix_array(text, pattern): + print("Invoked Suffix Array") + suffix_array_structure = construct_suffix_array(text) + occurrences = search_pattern_with_suffix_array(text, pattern, suffix_array_structure) + return len(occurrences) + + +import time +def get_keywords(algo_choice, scrapped_content): + keywords_and_count = [] + existing_keywords = [] + start_time = time.time() + for eachword in scrapped_content.split(" "): + if eachword == "": + continue + elif not eachword.isalpha(): + continue + else: + if eachword not in existing_keywords: + keywords_and_count.append({"keyword": eachword, "count": eval(f"{algo_choice}(scrapped_content, eachword)")}) + existing_keywords.append(eachword) + return keywords_and_count, time.time() - start_time + +if __name__ == "__main__": + print(get_keywords("suffix_tree", "hello this the i am the pokemon the poki")) + # # Suffix Array + # text = input("Enter the text: ") + # suffix_array = construct_suffix_array(text) + # print("Suffix Array:", suffix_array) + + # # Suffix Tree + # text = input("Enter the text: ") + # suffix_tree = SuffixTree(text) + # suffix_tree.display() + + # # Rabin-Karp + # text = input("Enter the text: ") + # pattern = input("Enter the pattern: ") + # rk_search = RabinKarp(text, pattern) + # print(rk_search.search_pattern()) + + + + # # Naive String Matching + # text = input("Enter the text: ") + # pattern = input("Enter the pattern: ") + # occurrences = naive_string_matching(text, pattern) + # if occurrences: + # print(f'Pattern found at indices: {occurrences}') + # else: + # print('Pattern not found in the text.') + + # # KMP + # text = input("Enter 
the text: ") + # pattern = input("Enter the pattern: ") + # # Find and display occurrences + # occurrences = kmp_search(text, pattern) + # if occurrences: + # print(f'Pattern found at indices: {occurrences}') + # else: + # print('Pattern not found in the text.') \ No newline at end of file diff --git a/scrapper/mock.json b/scrapper/mock.json deleted file mode 100644 index 26f96b4..0000000 --- a/scrapper/mock.json +++ /dev/null @@ -1,34 +0,0 @@ -# Scraping Service - -{ - "scrapedContent": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Donec et odio pellentesque diam volutpat commodo sed egestas egestas. Neque volutpat ac tincidunt vitae semper quis lectus. Tortor dignissim convallis aenean et tortor at risus. Cras pulvinar mattis nunc sed blandit libero volutpat sed cras. Urna nec tincidunt praesent semper. Fusce id velit ut tortor. Hac habitasse platea dictumst quisque sagittis purus sit amet. Accumsan sit amet nulla facilisi morbi tempus. Volutpat odio facilisis mauris sit amet massa vitae. Sit amet facilisis magna etiam tempor orci eu. Scelerisque purus semper eget duis at tellus at urna. Elit eget gravida cum sociis natoque penatibus et magnis dis. Sed faucibus turpis in eu mi bibendum neque egestas congue. Viverra maecenas accumsan lacus vel facilisis volutpat. Commodo quis imperdiet massa tincidunt nunc pulvinar sapien. Eget nunc lobortis mattis aliquam faucibus purus in massa. Enim neque volutpat ac tincidunt vitae semper quis. Pharetra convallis posuere morbi leo urna molestie at elementum eu. Amet consectetur adipiscing elit ut aliquam purus sit amet. Sed tempus urna et pharetra pharetra massa massa ultricies mi. Feugiat sed lectus vestibulum mattis ullamcorper velit. Ridiculus mus mauris vitae ultricies leo integer. Mauris ultrices eros in cursus turpis. Turpis egestas pretium aenean pharetra magna ac. Ultrices eros in cursus turpis massa tincidunt dui ut ornare. Eu volutpat odio facilisis mauris sit. Odio eu feugiat pretium nibh ipsum consequat. Nibh tortor id aliquet lectus proin. Sed blandit libero volutpat sed cras ornare arcu. Risus commodo viverra maecenas accumsan lacus vel facilisis. Lectus arcu bibendum at varius vel pharetra. Condimentum vitae sapien pellentesque habitant morbi tristique senectus. Aliquam sem fringilla ut morbi tincidunt. Amet dictum sit amet justo donec enim diam. Convallis convallis tellus id interdum velit laoreet id. Malesuada bibendum arcu vitae elementum. Cursus eget nunc scelerisque viverra mauris in aliquam. Ut sem viverra aliquet eget. Dolor sit amet consectetur adipiscing elit. Tellus at urna condimentum mattis pellentesque id nibh tortor id. Penatibus et magnis dis parturient montes nascetur ridiculus. Nec nam aliquam sem et tortor consequat. Tincidunt vitae semper quis lectus. Eget nulla facilisi etiam dignissim. Fringilla urna porttitor rhoncus dolor purus non enim praesent. Ultricies integer quis auctor elit sed. Consequat nisl vel pretium lectus quam id leo in. Vitae turpis massa sed elementum tempus egestas sed sed risus. At lectus urna duis convallis. Donec ac odio tempor orci dapibus ultrices in. Vitae semper quis lectus nulla. Mollis aliquam ut porttitor leo a diam sollicitudin tempor. Eu facilisis sed odio morbi quis commodo odio aenean. Nulla porttitor massa id neque. Quam viverra orci sagittis eu volutpat odio facilisis. Imperdiet nulla malesuada pellentesque elit eget gravida cum. Sit amet massa vitae tortor condimentum lacinia quis vel eros. 
Sit amet massa vitae tortor condimentum. Risus viverra adipiscing at in tellus integer. Ut diam quam nulla porttitor massa id. Justo nec ultrices dui sapien eget mi proin. Commodo odio aenean sed adipiscing diam donec adipiscing tristique risus. Vivamus arcu felis bibendum ut tristique et egestas quis ipsum. Nec ultrices dui sapien eget. Ullamcorper malesuada proin libero nunc consequat interdum varius. Id diam vel quam elementum pulvinar etiam non quam lacus. Volutpat maecenas volutpat blandit aliquam etiam. Pellentesque diam volutpat commodo sed egestas egestas fringilla phasellus faucibus. Interdum velit euismod in pellentesque massa placerat duis ultricies. Porttitor lacus luctus accumsan tortor posuere ac. Quam lacus suspendisse faucibus interdum. Nunc faucibus a pellentesque sit amet porttitor eget dolor. Eget mi proin sed libero enim sed faucibus turpis. Id consectetur purus ut faucibus pulvinar elementum integer enim. Nisl nunc mi ipsum faucibus vitae aliquet nec ullamcorper sit. Ut etiam sit amet nisl purus in mollis nunc. Sed euismod nisi porta lorem mollis aliquam ut porttitor leo. Vulputate ut pharetra sit amet aliquam id. Adipiscing commodo elit at imperdiet dui accumsan sit amet nulla. Vestibulum sed arcu non odio. Tellus rutrum tellus pellentesque eu tincidunt tortor aliquam. Morbi tristique senectus et netus et malesuada fames." -} - -# keyword service (time in seconds) - -{ - "topKeywordListings": ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth", "eleventh", "twelveth", "thirteenth", "fourteenth", "fifteenth"], - "alogirthmExecutingTime": 0.893473 -} - -# comparision service for multi algo - -{ - "data": [ - { - "algoName": "algo1", - "algoExecutingTime": 0.893473, - "topKeywordListings": ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth", "eleventh", "twelveth", "thirteenth", "fourteenth", "fifteenth"] - }, - { - "algoName": "algo2", - "algoExecutingTime": 0.893473, - "topKeywordListings": ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth", "eleventh", "twelveth", "thirteenth", "fourteenth", "fifteenth"] - }, - { - "algoName": "algo3", - "algoExecutingTime": 0.893473, - "topKeywordListings": ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth", "eleventh", "twelveth", "thirteenth", "fourteenth", "fifteenth"] - } - ] -} \ No newline at end of file diff --git a/scrapper/requirements.txt b/scrapper/requirements.txt index f848169..f4b0157 100644 --- a/scrapper/requirements.txt +++ b/scrapper/requirements.txt @@ -1,12 +1,25 @@ annotated-types==0.6.0 anyio==3.7.1 +beautifulsoup4==4.12.2 +bs4==0.0.1 +certifi==2023.7.22 +charset-normalizer==3.3.1 click==8.1.7 fastapi==0.104.0 h11==0.14.0 idna==3.4 +joblib==1.3.2 +nltk==3.8.1 +numpy==1.26.1 pydantic==2.4.2 pydantic_core==2.10.1 +redis==5.0.1 +regex==2023.10.3 +requests==2.31.0 sniffio==1.3.0 +soupsieve==2.5 starlette==0.27.0 +tqdm==4.66.1 typing_extensions==4.8.0 +urllib3==2.0.7 uvicorn==0.23.2 diff --git a/scrapper/scrapper.py b/scrapper/scrapper.py index 89622d2..cb218f7 100644 --- a/scrapper/scrapper.py +++ b/scrapper/scrapper.py @@ -3,10 +3,28 @@ from fastapi.middleware.cors import CORSMiddleware import uvicorn import time +import logging +import nltk +import numpy +from bs4 import BeautifulSoup +from nltk.corpus import stopwords +from nltk.tag import pos_tag -app = FastAPI() + +nltk.download('punkt') +nltk.download('words') +nltk.download('stopwords') 
+nltk.download('pos_tag') +nltk.download('averaged_perceptron_tagger') +nltk.download('maxent_ne_chunker') + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +app = FastAPI() app.add_middleware( CORSMiddleware, allow_origins=["*"], @@ -15,7 +33,47 @@ allow_headers=["*"], ) -from bs4 import BeautifulSoup + + + +# Cache Store ********************************************************************************************************************** + +import redis +import json +from scrapper import logger + +def get_redis_connection(): + return redis.Redis(host="localhost", port=6379, db=0) + +def push_to_redis(key, response): + logger.info("Pushing to Cache Store") + try: + redis_connection = get_redis_connection() + redis_connection.hset(key, "response", json.dumps(response)) + return True + except Exception as e: + logger.error(f"Error while pushing to Redis: {e}") + +def check_in_redis(links, key): + logger.info("Checking in our precious Cache Store") + try: + redis_connection = get_redis_connection() + response = redis_connection.hget(key, "response") + if response: + logger.info("Found a match, Checking if this is latest") + if json.loads(response)["scrappedUrls"] == links: + logger.info("Perfect match found, returning from Cache Store") + return json.loads(response) + else: + return False + except Exception as e: + logger.error(f"Error while checking in Redis: {e}") + return False + +# ************************************************************************************************************************************ + + +# Scraping Engine *************************************************************************************************************************** def get_urls(base_url): response = requests.get(base_url) @@ -23,7 +81,7 @@ def get_urls(base_url): links = set() for link in soup.find_all('a'): href = link.get('href') - if href and not href.startswith('http') and '.' not in href: + if href and not href.startswith('http') and '.' 
not in href and len(links) < 5: links.add(base_url+href) return list(links) @@ -35,35 +93,23 @@ def scrape(url): def scrape_all(url): links = get_urls(url) text = '' + cache = check_in_redis(links,url) + if cache: + logger.info("Found in Cache") + return True, cache["scrappedUrls"], cache["scrapedContent"] + logger.info("Not Found in Cache, Scraping New") for link in links: text += scrape(link) - return links, text - -import nltk -import numpy -from nltk.tokenize import word_tokenize -from nltk.corpus import stopwords -from nltk.tag import pos_tag -from nltk.chunk import ne_chunk - -nltk.download('punkt') -nltk.download('words') -nltk.download('stopwords') -nltk.download('pos_tag') -nltk.download('averaged_perceptron_tagger') -nltk.download('maxent_ne_chunker') + return False, links, text def remove_pronouns_nouns(text): - # Tokenize the text into words - words = word_tokenize(text) - - # Remove stop words from the words list + import re + text = re.sub(r'[^\w\s]', ' ', text) + words = text.split() + words = [word for word in words if len(word) > 2 and not word.isdigit()] stop_words = set(stopwords.words('english')) words = [word for word in words if word.lower() not in stop_words] - - # Tag the parts of speech of each word tagged_words = pos_tag(words) - filtered_words = [] removed_words = [] for word, tag in tagged_words: @@ -77,19 +123,27 @@ def remove_pronouns_nouns(text): return filtered_text, removed_text +# ********************************************************************************************************************************************* + + +# API Endpoints ********************************************************************************************************************************* + @app.post('/api/v1/scraping/') async def root(request: Request): payload = await request.json() + url = payload['url'].strip('/') if payload['url'].endswith('/') else payload['url'] try: - scrapped_urls, scrapped_text = scrape_all(payload['url']) + cacheExists, scrapped_urls, scrapped_text = scrape_all(url) except Exception as e: return {"503": f"{e}"} final_text, removed_text = remove_pronouns_nouns(scrapped_text) - return { - "scrapedContent": final_text, - "scrappedUrls": scrapped_urls, - "removedContent": removed_text - } + response = { "scrapedContent": final_text, "scrappedUrls": scrapped_urls, "removedContent": removed_text, "returnedFromCache": True if cacheExists else False } + if not cacheExists: + logger.info("That's new to me, populating Cache Store right away!") + push_to_redis(url,response) + return response + +# ************************************************************************************************************************************************ if __name__ == '__main__': uvicorn.run("scrapper:app", host='0.0.0.0', port=8000, reload=True) \ No newline at end of file From 23e8a9f8734f4fcb69d814489fdbcd673c96476f Mon Sep 17 00:00:00 2001 From: Ravindra <42912207+KR-Ravindra@users.noreply.github.com> Date: Wed, 25 Oct 2023 23:31:31 -0700 Subject: [PATCH 2/9] Yet to do, enhancements on scrapping; suffix_array and suffix_tree; deployment Co-authored-by: Ravindra <42912207+KR-Ravindra@users.noreply.github.com> Co-authored-by: PallaviKhedle --- parser/string_algo.py | 217 ------------------------------------------ 1 file changed, 217 deletions(-) delete mode 100644 parser/string_algo.py diff --git a/parser/string_algo.py b/parser/string_algo.py deleted file mode 100644 index c451a57..0000000 --- a/parser/string_algo.py +++ /dev/null @@ -1,217 +0,0 @@ -class RabinKarp: - def 
__init__(self, text, pattern): - self.text = text - self.pattern = pattern - self.text_length = len(text) - self.pattern_length = len(pattern) - self.hash_value = 0 - self.pattern_hash_value = 0 - self.window = [] - self.base = 256 - self.prime = 101 - self.occurrences = [] - def calculate_hash_value(self, string, length): - value = 0 - for i in range(length): - value = (self.base * value + ord(string[i])) % self.prime - return value - def recalculate_hash_value(self, old_hash, old_char, new_char): - new_hash = (self.base * (old_hash - ord(old_char) * (self.base **(self.pattern_length - 1))) + ord(new_char)) % self.prime - return new_hash - def search_pattern(self): - self.pattern_hash_value = self.calculate_hash_value(self.pattern, - self.pattern_length) - self.hash_value = self.calculate_hash_value(self.text, self.pattern_length) - pattern_found = False - for i in range(self.text_length - self.pattern_length + 1): - if self.pattern_hash_value == self.hash_value: - for j in range(self.pattern_length): - if self.text[i + j] != self.pattern[j]: - break - else: - print(f"Pattern found at index {i}") - self.occurrences.append(i) - pattern_found = True - if i < self.text_length - self.pattern_length: - self.hash_value = self.recalculate_hash_value(self.hash_value, self.text[i], self.text[i + self.pattern_length]) - if not pattern_found: - print("Pattern not found in the text.") - return len(self.occurrences) - -def rabin_karp(text, pattern): - rk_search = RabinKarp(text, pattern) - return rk_search.search_pattern() - - -def naive_string_matching(text, pattern): - n = len(text) - m = len(pattern) - occurrences = [] - for i in range(n - m + 1): - if text[i:i+m] == pattern: - occurrences.append(i) - return len(occurrences) - -def compute_prefix_function(pattern): - m = len(pattern) - pi = [0] * m - j = 0 - for i in range(1, m): - while j > 0 and pattern[i] != pattern[j]: - j = pi[j-1] - if pattern[i] == pattern[j]: - j += 1 - pi[i] = j - return pi - -def kmp_search(text, pattern): - n = len(text) - m = len(pattern) - pi = compute_prefix_function(pattern) - j = 0 - occurrences = [] - for i in range(n): - while j > 0 and text[i] != pattern[j]: - j = pi[j-1] - if text[i] == pattern[j]: - j += 1 - if j == m: - occurrences.append(i - m + 1) - j = pi[j-1] - return len(occurrences) - - -class TrieNode: - def __init__(self): - self.children = {} - self.is_end_of_word = False - -class Trie: - def __init__(self): - self.root = TrieNode() - def insert(self, word): - node = self.root - for char in word: - if char not in node.children: - node.children[char] = TrieNode() - node = node.children[char] - node.is_end_of_word = True - - -class SuffixTree: - def __init__(self, text): - self.text = text - self.trie = Trie() - self.build() - def build(self): - for i in range(len(self.text)): - self.trie.insert(self.text[i:]) - def display(self, node=None, prefix=''): - node = node or self.trie.root - if not node.children: - print(prefix) - else: - for char, child in node.children.items(): - self.display(child, prefix + char) - - - - -def suffix_tree(text, pattern): - print("Invoked Suffix Tree") - suffix_tree = SuffixTree(text) - occurrences = suffix_tree.search(pattern) - return len(occurrences) - -def construct_suffix_array(text): - suffixes = [(text[i:], i) for i in range(len(text))] - suffixes.sort(key=lambda x: x[0]) - suffix_array = [item[1] for item in suffixes] - return suffix_array - -def search_pattern_with_suffix_array(text, pattern, suffix_array): #Using bst against suffix array - n = len(text) - m = 
len(pattern) - left, right = 0, n - 1 - positions = [] - while left <= right: - mid = (left + right) // 2 - suffix = suffix_array[mid] - if text[suffix:suffix + m] == pattern: - positions.append(suffix) - i = mid - 1 - while i >= left and text[suffix_array[i]:suffix_array[i] + m] == pattern: - positions.append(suffix_array[i]) - i -= 1 - i = mid + 1 - while i <= right and text[suffix_array[i]:suffix_array[i] + m] == pattern: - positions.append(suffix_array[i]) - i += 1 - return positions - elif text[suffix:suffix + m] < pattern: - left = mid + 1 - else: - right = mid - 1 - return positions - -def suffix_array(text, pattern): - print("Invoked Suffix Array") - suffix_array_structure = construct_suffix_array(text) - occurrences = search_pattern_with_suffix_array(text, pattern, suffix_array_structure) - return len(occurrences) - - -import time -def get_keywords(algo_choice, scrapped_content): - keywords_and_count = [] - existing_keywords = [] - start_time = time.time() - for eachword in scrapped_content.split(" "): - if eachword == "": - continue - elif not eachword.isalpha(): - continue - else: - if eachword not in existing_keywords: - keywords_and_count.append({"keyword": eachword, "count": eval(f"{algo_choice}(scrapped_content, eachword)")}) - existing_keywords.append(eachword) - return keywords_and_count, time.time() - start_time - -if __name__ == "__main__": - print(get_keywords("suffix_tree", "hello this the i am the pokemon the poki")) - # # Suffix Array - # text = input("Enter the text: ") - # suffix_array = construct_suffix_array(text) - # print("Suffix Array:", suffix_array) - - # # Suffix Tree - # text = input("Enter the text: ") - # suffix_tree = SuffixTree(text) - # suffix_tree.display() - - # # Rabin-Karp - # text = input("Enter the text: ") - # pattern = input("Enter the pattern: ") - # rk_search = RabinKarp(text, pattern) - # print(rk_search.search_pattern()) - - - - # # Naive String Matching - # text = input("Enter the text: ") - # pattern = input("Enter the pattern: ") - # occurrences = naive_string_matching(text, pattern) - # if occurrences: - # print(f'Pattern found at indices: {occurrences}') - # else: - # print('Pattern not found in the text.') - - # # KMP - # text = input("Enter the text: ") - # pattern = input("Enter the pattern: ") - # # Find and display occurrences - # occurrences = kmp_search(text, pattern) - # if occurrences: - # print(f'Pattern found at indices: {occurrences}') - # else: - # print('Pattern not found in the text.') \ No newline at end of file From 14ad4e5d0abcfe21671f82c588bd5812e91a9e35 Mon Sep 17 00:00:00 2001 From: Ravindra <42912207+KR-Ravindra@users.noreply.github.com> Date: Thu, 26 Oct 2023 14:10:19 -0700 Subject: [PATCH 3/9] few deployment changes --- .github/workflows/deploy.yml | 78 ++++++++++++++++++++++++++++++++++++ Dockerfile | 17 ++++++++ docker-compose.yml | 48 +++++++++++----------- parser/parser.py | 2 +- scrapper/scrapper.py | 2 +- 5 files changed, 121 insertions(+), 26 deletions(-) create mode 100644 .github/workflows/deploy.yml create mode 100644 Dockerfile diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..d777298 --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,78 @@ + name: Build and Push Docker + + on: + push: + branches: ['main', 'scrapper'] + + env: + REGISTRY: docker.io + IMAGE_NAME: ${{ github.repository }} + CONTAINER_NAME: django + + + jobs: + build-and-push-image: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + steps: + 
+ - name: Checkout repository + uses: actions/checkout@v3 + + - name: Log in to the Container registry + uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + + - name: Extract metadata (tags, labels) for Docker + id: meta2 + uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38 + with: + images: ${{ env.REGISTRY }}/maxconformance/proxy + + - name: Build and push Docker image + uses: docker/build-push-action@v4.1.1 + with: + context: . + push: true + target: builder + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + + - name: Build and push Docker Proxy image + uses: docker/build-push-action@v4.1.1 + with: + context: . + push: true + target: deployer + tags: ${{ steps.meta2.outputs.tags }} + labels: ${{ steps.meta2.outputs.labels }} + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS }} + aws-region: ap-south-1 + + # - name: Deploy Amazon ECS task definition + # uses: aws-actions/amazon-ecs-deploy-task-definition@v1 + # with: + # task-definition: .deploy/ecs-task-definition.json + # service: mc-application + # cluster: ProdCluster + # force-new-deployment: true + - name: Update ECS Service + run: | + aws ecs update-service --cluster ProdCluster --service mc-application --desired-count 1 --force-new-deployment \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..5c99dee --- /dev/null +++ b/Dockerfile @@ -0,0 +1,17 @@ +FROM node:20.6.0-buster as frontend +ADD . /app +WORKDIR /app +RUN npm install +CMD npm start + +FROM python:3.11.5-bullseye as parser +ADD . /app +WORKDIR /app/parser +RUN pip install -r requirements.txt +CMD python parser.py + +FROM python:3.11.5-bullseye as scrapper +ADD . 
/app +WORKDIR /app/scrapper +RUN pip install -r requirements.txt +CMD python scrapper.py \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 90a6192..b6447b8 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,31 +1,31 @@ version: "3.9" services: - # frontend: - # image: node:20 - # ports: - # - 3000:3000 - # volumes: - # - ./:/app - # working_dir: /app - # command: sh -c "npm i && npm start" + frontend: + image: krravindra/algo-frontend:latest + ports: + - 3000:3000 + # volumes: + # - ./:/app + # working_dir: /app + # command: sh -c "npm i && npm start" - # scrapper: - # image: python:3.11 - # ports: - # - 8000:8000 - # volumes: - # - ./:/app - # working_dir: /app/scrapper - # command: sh -c "pip3 install -r requirements.txt && python3 main.py" + scrapper: + image: krravindra/scrapper:latest + ports: + - 8000:8000 + # volumes: + # - ./:/app + # working_dir: /app/scrapper + # command: sh -c "pip3 install -r requirements.txt && python3 main.py" - # parser: - # image: python:3.11 - # ports: - # - 8001:8000 - # volumes: - # - ./:/app - # working_dir: /app/parser - # command: sh -c "pip3 install -r requirements.txt && python3 parser.py" + parser: + image: krravindra/parser:latest + ports: + - 8001:8001 + # volumes: + # - ./:/app + # working_dir: /app/parser + # command: sh -c "pip3 install -r requirements.txt && python3 parser.py" redis: image: redis:6.2 diff --git a/parser/parser.py b/parser/parser.py index 4c81eff..6212acd 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -182,7 +182,7 @@ def suffix_array(text, pattern): import json def get_redis_connection(): - return redis.Redis(host="localhost", port=6379, db=0) + return redis.Redis(host="redis", port=6379, db=0) def push_to_redis(key, response): logger.info(f"Pushing to Cache Store {response}") diff --git a/scrapper/scrapper.py b/scrapper/scrapper.py index cb218f7..b5cfabe 100644 --- a/scrapper/scrapper.py +++ b/scrapper/scrapper.py @@ -43,7 +43,7 @@ from scrapper import logger def get_redis_connection(): - return redis.Redis(host="localhost", port=6379, db=0) + return redis.Redis(host="redis", port=6379, db=0) def push_to_redis(key, response): logger.info("Pushing to Cache Store") From 1a997daba6fdaad4c9bf0f4cbd52460cde9b1a0c Mon Sep 17 00:00:00 2001 From: Ravindra <42912207+KR-Ravindra@users.noreply.github.com> Date: Thu, 26 Oct 2023 16:14:13 -0700 Subject: [PATCH 4/9] Errors are errors not hurdels --- parser/parser.py | 24 ++++++++++++++++-------- scrapper/scrapper.py | 19 +++++++++++-------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/parser/parser.py b/parser/parser.py index 6212acd..2b6581c 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -1,4 +1,4 @@ -from fastapi import FastAPI, Request +from fastapi import FastAPI, Request, HTTPException import uvicorn from fastapi.middleware.cors import CORSMiddleware import logging @@ -268,7 +268,7 @@ def get_keywords(algo_choice, scrapped_content): def get_top_keywords(keywords_and_count): keywords_and_count.sort(key=lambda x: x["count"], reverse=True) try: - return keywords_and_count[:20] + return keywords_and_count[:12] except Exception as exc: return exc # ****************************************************************************************************************************************** @@ -281,6 +281,7 @@ async def keyword_api(request: Request): payload = await request.json() url = payload['url'].strip('/') if payload['url'].endswith('/') else payload['url'] try: + wait_iterator = 0 while True: data = 
check_in_redis(url) if data: @@ -292,7 +293,10 @@ async def keyword_api(request: Request): break else: logger.info("Let's give that scrapper engine, a tad bit more time") - time.sleep(10) + if wait_iterator > 3: + raise HTTPException(status_code=503, detail="Scrapper Engine is taking too long, please try again later") + wait_iterator += 1 + time.sleep(5) logger.info("Calling for parsing") keywords, execution_time = get_keywords(payload["algoChoice"],data["scrapedContent"]) final_response = { "topKeywordListings": get_top_keywords(keywords), "alogirthmExecutionTime": execution_time} @@ -300,7 +304,7 @@ async def keyword_api(request: Request): push_to_redis(url + payload["algoChoice"],final_response) return final_response except Exception as e: - return {"503": f"{e}"} + raise HTTPException(status_code=503, detail="Hello, I am the parser engine, Scrapper is taking too long, please try again later") @app.post('/api/v1/keyword-recommendations/') async def keyword_recommendations_api(request: Request): @@ -323,9 +327,9 @@ async def keyword_recommendations_api(request: Request): push_to_redis(url + payload["algoChoice"],existing_algo_data) return existing_algo_data else: - return {"503": "Please run the keyword algo first"} + raise HTTPException(status_code=503, detail="Scrapper Engine is taking too long, please try again later") except Exception as e: - return {"503": f"{e}"} + raise HTTPException(status_code=503, detail="Hello, I am the parser engine, Scrapper is taking too long, please try again later") @app.post('/api/v1/multi-algo/') async def multialgo_api(request: Request): @@ -333,6 +337,7 @@ async def multialgo_api(request: Request): url = payload['url'].strip('/') if payload['url'].endswith('/') else payload['url'] algo_choices = ["rabin_karp", "naive", "kmp"] final_response = {"data": []} + wait_iterator = 0 try: while True: data = check_in_redis(url) @@ -345,7 +350,10 @@ async def multialgo_api(request: Request): break else: logger.info("Let's give that scrapper engine, a tad bit more time") - time.sleep(10) + if wait_iterator > 3: + raise HTTPException(status_code=503, detail="Scrapper Engine is taking too long, please try again later") + wait_iterator += 1 + time.sleep(5) for each_algo in algo_choices: logger.info("Checking if said algo exists") logger.info(f"Running for {each_algo}") @@ -364,7 +372,7 @@ async def multialgo_api(request: Request): push_to_redis(url + "multi-algo",final_response) return final_response except Exception as e: - return {"503": f"{e}"} + raise HTTPException(status_code=503, detail="Hello, I am the parser engine, Scrapper is taking too long, please try again later") # ************************************************************************************************************************************ diff --git a/scrapper/scrapper.py b/scrapper/scrapper.py index b5cfabe..eb6dbd2 100644 --- a/scrapper/scrapper.py +++ b/scrapper/scrapper.py @@ -1,4 +1,4 @@ -from fastapi import FastAPI, Request +from fastapi import FastAPI, Request, HTTPException import requests from fastapi.middleware.cors import CORSMiddleware import uvicorn @@ -134,14 +134,17 @@ async def root(request: Request): url = payload['url'].strip('/') if payload['url'].endswith('/') else payload['url'] try: cacheExists, scrapped_urls, scrapped_text = scrape_all(url) + + final_text, removed_text = remove_pronouns_nouns(scrapped_text) + response = { "scrapedContent": final_text, "scrappedUrls": scrapped_urls, "removedContent": removed_text, "returnedFromCache": True if cacheExists else False } + if 
not cacheExists: + logger.info("That's new to me, populating Cache Store right away!") + push_to_redis(url,response) + return response + except Exception as e: - return {"503": f"{e}"} - final_text, removed_text = remove_pronouns_nouns(scrapped_text) - response = { "scrapedContent": final_text, "scrappedUrls": scrapped_urls, "removedContent": removed_text, "returnedFromCache": True if cacheExists else False } - if not cacheExists: - logger.info("That's new to me, populating Cache Store right away!") - push_to_redis(url,response) - return response + logger.error(f"Error while scraping: {e}") + raise HTTPException(status_code=500, detail="Error while scraping") # ************************************************************************************************************************************************ From d58d05a2fd6806d14ef38a12528beb08c2b26c1d Mon Sep 17 00:00:00 2001 From: Ravindra <42912207+KR-Ravindra@users.noreply.github.com> Date: Thu, 26 Oct 2023 17:13:43 -0700 Subject: [PATCH 5/9] Analyzer API added --- seo_analyzer/analyzer.py | 33 +++++++++++++++++++++++++++++++++ seo_analyzer/requirements.txt | 23 +++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 seo_analyzer/analyzer.py create mode 100644 seo_analyzer/requirements.txt diff --git a/seo_analyzer/analyzer.py b/seo_analyzer/analyzer.py new file mode 100644 index 0000000..0074409 --- /dev/null +++ b/seo_analyzer/analyzer.py @@ -0,0 +1,33 @@ +from fastapi import FastAPI, Request, HTTPException +from fastapi.middleware.cors import CORSMiddleware +import uvicorn +app = FastAPI() +from seoanalyzer import analyze + + +app = FastAPI() +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +def analyze_url(url): + output = analyze(url, follow_links=False, analyze_headings=True, analyze_extra_tags=True) + return output + + +@app.post('/api/v1/analyzer/') +async def root(request: Request): + try: + payload = await request.json() + return analyze_url(payload['url']) + except Exception as e: + raise HTTPException(status_code=500, detail="Error while analyzing") + + +if __name__ == '__main__': + uvicorn.run("analyzer:app", host='0.0.0.0', port=8000, reload=True) \ No newline at end of file diff --git a/seo_analyzer/requirements.txt b/seo_analyzer/requirements.txt new file mode 100644 index 0000000..499b339 --- /dev/null +++ b/seo_analyzer/requirements.txt @@ -0,0 +1,23 @@ +annotated-types==0.6.0 +anyio==3.7.1 +beautifulsoup4==4.12.2 +certifi==2023.7.22 +charset-normalizer==3.3.1 +click==8.1.7 +fastapi==0.104.0 +h11==0.14.0 +idna==3.4 +Jinja2==3.1.2 +lxml==4.9.3 +MarkupSafe==2.1.3 +pydantic==2.4.2 +pydantic_core==2.10.1 +pyseoanalyzer==4.0.7 +requests==2.31.0 +sitemap==20191121 +sniffio==1.3.0 +soupsieve==2.5 +starlette==0.27.0 +typing_extensions==4.8.0 +urllib3==2.0.7 +uvicorn==0.23.2 From 6c76e8732b8a058d6ee8a5b298327b346e5571a0 Mon Sep 17 00:00:00 2001 From: Ravindra <42912207+KR-Ravindra@users.noreply.github.com> Date: Thu, 26 Oct 2023 17:15:43 -0700 Subject: [PATCH 6/9] Analyzer changes for deployment --- Dockerfile | 8 +++++++- {seo_analyzer => analyzer}/analyzer.py | 2 +- {seo_analyzer => analyzer}/requirements.txt | 0 docker-compose.yml | 19 ++++++++++++++++++- 4 files changed, 26 insertions(+), 3 deletions(-) rename {seo_analyzer => analyzer}/analyzer.py (92%) rename {seo_analyzer => analyzer}/requirements.txt (100%) diff --git a/Dockerfile b/Dockerfile index 5c99dee..78021ef 100644 --- a/Dockerfile +++ b/Dockerfile @@ 
-14,4 +14,10 @@ FROM python:3.11.5-bullseye as scrapper ADD . /app WORKDIR /app/scrapper RUN pip install -r requirements.txt -CMD python scrapper.py \ No newline at end of file +CMD python scrapper.py + +FROM python:3.11.5-bullseye as analyzer +ADD . /app +WORKDIR /app/analyzer +RUN pip install -r requirements.txt +CMD python analyzer.py \ No newline at end of file diff --git a/seo_analyzer/analyzer.py b/analyzer/analyzer.py similarity index 92% rename from seo_analyzer/analyzer.py rename to analyzer/analyzer.py index 0074409..4c95a38 100644 --- a/seo_analyzer/analyzer.py +++ b/analyzer/analyzer.py @@ -30,4 +30,4 @@ async def root(request: Request): if __name__ == '__main__': - uvicorn.run("analyzer:app", host='0.0.0.0', port=8000, reload=True) \ No newline at end of file + uvicorn.run("analyzer:app", host='0.0.0.0', port=8002, reload=True) \ No newline at end of file diff --git a/seo_analyzer/requirements.txt b/analyzer/requirements.txt similarity index 100% rename from seo_analyzer/requirements.txt rename to analyzer/requirements.txt diff --git a/docker-compose.yml b/docker-compose.yml index b6447b8..02a2157 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -26,8 +26,25 @@ services: # - ./:/app # working_dir: /app/parser # command: sh -c "pip3 install -r requirements.txt && python3 parser.py" + + + analyzer: + image: krravindra/analyzer:latest + ports: + - 8002:8002 redis: image: redis:6.2 ports: - - 6379:6379 \ No newline at end of file + - 6379:6379 + + + proxy: + image: nginx:latest + ports: + - 80:80 + - 443:443 + volumes: + - ./app.conf:/etc/nginx/conf.d/app.conf + - ./final.crt:/etc/nginx/final.crt + - ./server.key:/etc/nginx/server.key \ No newline at end of file From e0037e1a7812f9674f9028a1ec01648884b91641 Mon Sep 17 00:00:00 2001 From: Ravindra <42912207+KR-Ravindra@users.noreply.github.com> Date: Thu, 26 Oct 2023 21:16:47 -0700 Subject: [PATCH 7/9] The majestic suffix arrays will top charts --- parser/parser.py | 62 +++++++++++++++++++++++++++++++++-------- parser/requirements.txt | 5 ++++ scrapper/scrapper.py | 2 -- 3 files changed, 55 insertions(+), 14 deletions(-) diff --git a/parser/parser.py b/parser/parser.py index 2b6581c..09cbdd2 100644 --- a/parser/parser.py +++ b/parser/parser.py @@ -134,10 +134,19 @@ def display(self, node=None, prefix=''): else: for char, child in node.children.items(): self.display(child, prefix + char) - + +def construct_suffix_tree(scrapped_content): + from suffix_tree import Tree + return Tree({"A": scrapped_content}) + + +def suffix_tree(suffix_tree, pattern): + return len(suffix_tree.find_all(pattern)) + def construct_suffix_array(text): - suffixes = [(text[i:], i) for i in range(len(text))] - suffixes.sort(key=lambda x: x[0]) + n = len(text) + suffixes = [(text[i:], i) for i in range(n)] + suffixes.sort() suffix_array = [item[1] for item in suffixes] return suffix_array @@ -159,19 +168,13 @@ def search_pattern_with_suffix_array(text, pattern, suffix_array): #Using bst ag while i <= right and text[suffix_array[i]:suffix_array[i] + m] == pattern: positions.append(suffix_array[i]) i += 1 - return positions + return len(positions) elif text[suffix:suffix + m] < pattern: left = mid + 1 else: right = mid - 1 - return positions + return len(positions) -def suffix_array(text, pattern): - print("Invoked Suffix Array") - suffix_array_structure = construct_suffix_array(text) - occurrences = search_pattern_with_suffix_array(text, pattern, suffix_array_structure) - return len(occurrences) - # 
************************************************************************************************************************************ @@ -254,6 +257,41 @@ def get_keywords(algo_choice, scrapped_content): keywords_and_count = [] existing_keywords = [] start_time = time.time() + if algo_choice == "suffix_array": + logger.info("Triggered Suffix Arrays") + suffix_array = construct_suffix_array(scrapped_content) + for each_word in scrapped_content.split(" "): + if each_word == "": + continue + elif not each_word.isalpha(): + continue + else: + if each_word not in existing_keywords: + occurences = search_pattern_with_suffix_array(scrapped_content, each_word, suffix_array) + keywords_and_count.append({"keyword": each_word, "count": occurences}) + existing_keywords.append(each_word) + return keywords_and_count, elapsed_time + (time.time()-start_time) + if algo_choice == "suffix_tree": + logger.info("Triggered Suffix Trees") + start_time = time.time() + keywords_and_count = [] + existing_keywords = [] + constructed_suffix_tree = construct_suffix_tree(scrapped_content) + try: + for each_word in scrapped_content.split(" "): + if each_word == "": + continue + elif not each_word.isalpha(): + continue + else: + if each_word not in existing_keywords: + occurences = suffix_tree(constructed_suffix_tree, each_word) + keywords_and_count.append({"keyword": each_word, "count": occurences}) + existing_keywords.append(each_word) + return keywords_and_count, time.time() - start_time + except Exception as e: + logger.error(f"Error while parsing suffix tree: {e}") + return None for eachword in scrapped_content.split(" "): if eachword == "": continue @@ -335,7 +373,7 @@ async def keyword_recommendations_api(request: Request): async def multialgo_api(request: Request): payload = await request.json() url = payload['url'].strip('/') if payload['url'].endswith('/') else payload['url'] - algo_choices = ["rabin_karp", "naive", "kmp"] + algo_choices = ["rabin_karp", "naive", "kmp", "suffix_array", "suffix_tree"] final_response = {"data": []} wait_iterator = 0 try: diff --git a/parser/requirements.txt b/parser/requirements.txt index a67e6ae..a7630ea 100644 --- a/parser/requirements.txt +++ b/parser/requirements.txt @@ -1,11 +1,14 @@ annotated-types==0.6.0 +antlr4-python3-runtime==4.13.1 anyio==3.7.1 +automaton-tools==0.1.8 beautifulsoup4==4.12.2 better-profanity==0.7.0 bs4==0.0.1 certifi==2023.7.22 charset-normalizer==3.3.1 click==8.1.7 +csuffixtree==0.3.6 fastapi==0.104.0 futures==3.0.5 google-search-results==2.4.2 @@ -13,6 +16,7 @@ goslate==1.5.4 h11==0.14.0 idna==3.4 Jinja2==3.1.2 +jsonify==0.5 lxml==4.9.3 MarkupSafe==2.1.3 numpy==1.26.1 @@ -32,6 +36,7 @@ six==1.16.0 sniffio==1.3.0 soupsieve==2.5 starlette==0.27.0 +suffix-tree==0.1.2 typing_extensions==4.8.0 tzdata==2023.3 urllib3==2.0.7 diff --git a/scrapper/scrapper.py b/scrapper/scrapper.py index eb6dbd2..06525b8 100644 --- a/scrapper/scrapper.py +++ b/scrapper/scrapper.py @@ -11,8 +11,6 @@ from nltk.tag import pos_tag - - nltk.download('punkt') nltk.download('words') nltk.download('stopwords') From 280e2598cb772d6e1c8556bd2a4331747e564f0f Mon Sep 17 00:00:00 2001 From: Ravindra <42912207+KR-Ravindra@users.noreply.github.com> Date: Thu, 26 Oct 2023 21:27:41 -0700 Subject: [PATCH 8/9] Added suffx stuff to frontend --- src/components/homePage/index.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/components/homePage/index.js b/src/components/homePage/index.js index 89eb61a..1d60ade 100644 --- a/src/components/homePage/index.js +++ 
b/src/components/homePage/index.js @@ -147,7 +147,8 @@ function HomePage() {