diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
new file mode 100644
index 0000000..d777298
--- /dev/null
+++ b/.github/workflows/deploy.yml
@@ -0,0 +1,78 @@
+ name: Build and Push Docker
+
+ on:
+ push:
+ branches: ['main', 'scrapper']
+
+ env:
+ REGISTRY: docker.io
+ IMAGE_NAME: ${{ github.repository }}
+ CONTAINER_NAME: django
+
+
+ jobs:
+ build-and-push-image:
+ runs-on: ubuntu-latest
+ permissions:
+ contents: read
+ packages: write
+
+ steps:
+
+ - name: Checkout repository
+ uses: actions/checkout@v3
+
+ - name: Log in to the Container registry
+ uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
+ with:
+ registry: ${{ env.REGISTRY }}
+ username: ${{ secrets.DOCKERHUB_USERNAME }} # assumption: Docker Hub credentials stored as repo secrets; GITHUB_TOKEN only authenticates against ghcr.io, not docker.io
+ password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+ - name: Extract metadata (tags, labels) for Docker
+ id: meta
+ uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
+ with:
+ images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+
+ - name: Extract metadata (tags, labels) for Docker proxy image
+ id: meta2
+ uses: docker/metadata-action@98669ae865ea3cffbcbaa878cf57c20bbf1c6c38
+ with:
+ images: ${{ env.REGISTRY }}/maxconformance/proxy
+
+ - name: Build and push Docker image
+ uses: docker/build-push-action@v4.1.1
+ with:
+ context: .
+ push: true
+ target: builder
+ tags: ${{ steps.meta.outputs.tags }}
+ labels: ${{ steps.meta.outputs.labels }}
+
+ - name: Build and push Docker Proxy image
+ uses: docker/build-push-action@v4.1.1
+ with:
+ context: .
+ push: true
+ target: deployer
+ tags: ${{ steps.meta2.outputs.tags }}
+ labels: ${{ steps.meta2.outputs.labels }}
+
+ - name: Configure AWS credentials
+ uses: aws-actions/configure-aws-credentials@v1
+ with:
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS }}
+ aws-region: ap-south-1
+
+ # - name: Deploy Amazon ECS task definition
+ # uses: aws-actions/amazon-ecs-deploy-task-definition@v1
+ # with:
+ # task-definition: .deploy/ecs-task-definition.json
+ # service: mc-application
+ # cluster: ProdCluster
+ # force-new-deployment: true
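+ # --force-new-deployment rolls the service so new tasks pull the image tags pushed above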
+ - name: Update ECS Service
+ run: |
+ aws ecs update-service --cluster ProdCluster --service mc-application --desired-count 1 --force-new-deployment
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..78021ef
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,23 @@
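+# Multi-stage Dockerfile: each stage below packages one service and is selected at build time with docker build --target <stage>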
+FROM node:20.6.0-buster as frontend
+ADD . /app
+WORKDIR /app
+RUN npm install
+CMD npm start
+
+FROM python:3.11.5-bullseye as parser
+ADD . /app
+WORKDIR /app/parser
+RUN pip install -r requirements.txt
+CMD python parser.py
+
+FROM python:3.11.5-bullseye as scrapper
+ADD . /app
+WORKDIR /app/scrapper
+RUN pip install -r requirements.txt
+CMD python scrapper.py
+
+FROM python:3.11.5-bullseye as analyzer
+ADD . /app
+WORKDIR /app/analyzer
+RUN pip install -r requirements.txt
+CMD python analyzer.py
\ No newline at end of file
diff --git a/analyzer/analyzer.py b/analyzer/analyzer.py
new file mode 100644
index 0000000..4c95a38
--- /dev/null
+++ b/analyzer/analyzer.py
@@ -0,0 +1,33 @@
+from fastapi import FastAPI, Request, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+import uvicorn
+from seoanalyzer import analyze
+
+
+app = FastAPI()
+app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"],
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+)
+
+
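+# Single-page SEO audit via pyseoanalyzer; follow_links=False keeps the analysis to the submitted URL only.
+# Example request body for the endpoint below: {"url": "https://example.com"}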
+def analyze_url(url):
+ output = analyze(url, follow_links=False, analyze_headings=True, analyze_extra_tags=True)
+ return output
+
+
+@app.post('/api/v1/analyzer/')
+async def root(request: Request):
+ try:
+ payload = await request.json()
+ return analyze_url(payload['url'])
+ except Exception as e:
+ raise HTTPException(status_code=500, detail="Error while analyzing")
+
+
+if __name__ == '__main__':
+ uvicorn.run("analyzer:app", host='0.0.0.0', port=8002, reload=True)
\ No newline at end of file
diff --git a/analyzer/requirements.txt b/analyzer/requirements.txt
new file mode 100644
index 0000000..499b339
--- /dev/null
+++ b/analyzer/requirements.txt
@@ -0,0 +1,23 @@
+annotated-types==0.6.0
+anyio==3.7.1
+beautifulsoup4==4.12.2
+certifi==2023.7.22
+charset-normalizer==3.3.1
+click==8.1.7
+fastapi==0.104.0
+h11==0.14.0
+idna==3.4
+Jinja2==3.1.2
+lxml==4.9.3
+MarkupSafe==2.1.3
+pydantic==2.4.2
+pydantic_core==2.10.1
+pyseoanalyzer==4.0.7
+requests==2.31.0
+sitemap==20191121
+sniffio==1.3.0
+soupsieve==2.5
+starlette==0.27.0
+typing_extensions==4.8.0
+urllib3==2.0.7
+uvicorn==0.23.2
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..02a2157
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,50 @@
+version: "3.9"
+services:
+ frontend:
+ image: krravindra/algo-frontend:latest
+ ports:
+ - 3000:3000
+ # volumes:
+ # - ./:/app
+ # working_dir: /app
+ # command: sh -c "npm i && npm start"
+
+ scrapper:
+ image: krravindra/scrapper:latest
+ ports:
+ - 8000:8000
+ # volumes:
+ # - ./:/app
+ # working_dir: /app/scrapper
+ # command: sh -c "pip3 install -r requirements.txt && python3 main.py"
+
+ parser:
+ image: krravindra/parser:latest
+ ports:
+ - 8001:8001
+ # volumes:
+ # - ./:/app
+ # working_dir: /app/parser
+ # command: sh -c "pip3 install -r requirements.txt && python3 parser.py"
+
+
+ analyzer:
+ image: krravindra/analyzer:latest
+ ports:
+ - 8002:8002
+
+ redis:
+ image: redis:6.2
+ ports:
+ - 6379:6379
+
+
+ proxy:
+ image: nginx:latest
+ ports:
+ - 80:80
+ - 443:443
+ volumes:
+ - ./app.conf:/etc/nginx/conf.d/app.conf
+ - ./final.crt:/etc/nginx/final.crt
+ - ./server.key:/etc/nginx/server.key
\ No newline at end of file
diff --git a/parser/parser.py b/parser/parser.py
new file mode 100644
index 0000000..e8f62a3
--- /dev/null
+++ b/parser/parser.py
@@ -0,0 +1,420 @@
+from fastapi import FastAPI, Request, HTTPException
+import uvicorn
+from fastapi.middleware.cors import CORSMiddleware
+import logging
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+app = FastAPI()
+app.add_middleware(
+ CORSMiddleware,
+ allow_origins=["*"],
+ allow_credentials=True,
+ allow_methods=["*"],
+ allow_headers=["*"],
+)
+
+
+# Algorithms ***************************************************************************************************************************
+
+class RabinKarp:
+ def __init__(self, text, pattern):
+ self.text = text
+ self.pattern = pattern
+ self.text_length = len(text)
+ self.pattern_length = len(pattern)
+ self.hash_value = 0
+ self.pattern_hash_value = 0
+ self.window = []
+ self.base = 256
+ self.prime = 101
+ self.occurrences = []
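+    # Polynomial rolling hash: each pattern_length window is hashed as a base-256 number modulo a
+    # small prime, so the hash can be updated in O(1) as the window slides across the text.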
+ def calculate_hash_value(self, string, length):
+ value = 0
+ for i in range(length):
+ value = (self.base * value + ord(string[i])) % self.prime
+ return value
+ def recalculate_hash_value(self, old_hash, old_char, new_char):
+ new_hash = (self.base * (old_hash - ord(old_char) * (self.base **(self.pattern_length - 1))) + ord(new_char)) % self.prime
+ return new_hash
+ def search_pattern(self):
+        self.pattern_hash_value = self.calculate_hash_value(self.pattern, self.pattern_length)
+ self.hash_value = self.calculate_hash_value(self.text, self.pattern_length)
+ pattern_found = False
+ for i in range(self.text_length - self.pattern_length + 1):
+ if self.pattern_hash_value == self.hash_value:
+ for j in range(self.pattern_length):
+ if self.text[i + j] != self.pattern[j]:
+ break
+ else:
+ self.occurrences.append(i)
+ pattern_found = True
+ if i < self.text_length - self.pattern_length:
+ self.hash_value = self.recalculate_hash_value(self.hash_value, self.text[i], self.text[i + self.pattern_length])
+ if not pattern_found:
+ print("Pattern not found in the text.")
+ return len(self.occurrences)
+
+
+def rabin_karp(text, pattern):
+ rk_search = RabinKarp(text, pattern)
+ return rk_search.search_pattern()
+
+
+def naive(text, pattern):
+ n = len(text)
+ m = len(pattern)
+ occurrences = []
+ for i in range(n - m + 1):
+ if text[i:i+m] == pattern:
+ occurrences.append(i)
+ return len(occurrences)
+
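+# KMP failure function: pi[i] is the length of the longest proper prefix of pattern[:i+1] that is
+# also a suffix, letting the matcher skip re-comparisons after a mismatch.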
+def compute_prefix_function(pattern):
+ m = len(pattern)
+ pi = [0] * m
+ j = 0
+ for i in range(1, m):
+ while j > 0 and pattern[i] != pattern[j]:
+ j = pi[j-1]
+ if pattern[i] == pattern[j]:
+ j += 1
+ pi[i] = j
+ return pi
+
+def kmp(text, pattern):
+ n = len(text)
+ m = len(pattern)
+ pi = compute_prefix_function(pattern)
+ j = 0
+ occurrences = []
+ for i in range(n):
+ while j > 0 and text[i] != pattern[j]:
+ j = pi[j-1]
+ if text[i] == pattern[j]:
+ j += 1
+ if j == m:
+ occurrences.append(i - m + 1)
+ j = pi[j-1]
+ return len(occurrences)
+
+
+class TrieNode:
+ def __init__(self):
+ self.children = {}
+ self.is_end_of_word = False
+
+class Trie:
+ def __init__(self):
+ self.root = TrieNode()
+ def insert(self, word):
+ node = self.root
+ for char in word:
+ if char not in node.children:
+ node.children[char] = TrieNode()
+ node = node.children[char]
+ node.is_end_of_word = True
+
+
+class SuffixTree:
+ def __init__(self, text):
+ self.text = text
+ self.trie = Trie()
+ self.build()
+ def build(self):
+ for i in range(len(self.text)):
+ self.trie.insert(self.text[i:])
+ def display(self, node=None, prefix=''):
+ node = node or self.trie.root
+ if not node.children:
+ print(prefix)
+ else:
+ for char, child in node.children.items():
+ self.display(child, prefix + char)
+
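+# construct_suffix_tree below delegates to the suffix_tree package's generalized suffix tree; the Trie-based classes above are not referenced by the parsing endpoints.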
+def construct_suffix_tree(scrapped_content):
+ from suffix_tree import Tree
+ return Tree({"A": scrapped_content})
+
+
+def suffix_tree(tree, pattern):
+    return len(tree.find_all(pattern))
+
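+# Naive suffix-array construction: sort every suffix lexicographically and keep each start index
+# (O(n^2 log n) with direct string comparisons).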
+def construct_suffix_array(text):
+ n = len(text)
+ suffixes = [(text[i:], i) for i in range(n)]
+ suffixes.sort()
+ suffix_array = [item[1] for item in suffixes]
+ return suffix_array
+
+def search_pattern_with_suffix_array(text, pattern, suffix_array):  # binary search over the sorted suffix array
+ n = len(text)
+ m = len(pattern)
+ left, right = 0, n - 1
+ positions = []
+ while left <= right:
+ mid = (left + right) // 2
+ suffix = suffix_array[mid]
+ if text[suffix:suffix + m] == pattern:
+ positions.append(suffix)
+ i = mid - 1
+ while i >= left and text[suffix_array[i]:suffix_array[i] + m] == pattern:
+ positions.append(suffix_array[i])
+ i -= 1
+ i = mid + 1
+ while i <= right and text[suffix_array[i]:suffix_array[i] + m] == pattern:
+ positions.append(suffix_array[i])
+ i += 1
+ return len(positions)
+ elif text[suffix:suffix + m] < pattern:
+ left = mid + 1
+ else:
+ right = mid - 1
+ return len(positions)
+
+
+# ************************************************************************************************************************************
+
+
+# Cache Store **********************************************************************************************************************
+
+import redis
+import json
+
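+# Cache layout: the scrapper stores the cleaned page text under the bare URL, while the parser
+# stores per-algorithm results under url + algoChoice and the comparison run under url + "multi-algo".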
+def get_redis_connection():
+ return redis.Redis(host="redis", port=6379, db=0)
+
+def push_to_redis(key, response):
+ logger.info(f"Pushing to Cache Store {response}")
+ try:
+ redis_connection = get_redis_connection()
+ redis_connection.hset(key, "response", json.dumps(response))
+ return True
+ except Exception as e:
+ logger.error(f"Error while pushing to Redis: {e}")
+
+def check_in_redis(key):
+ logger.info("Checking in our precious Cache Store")
+ try:
+ redis_connection = get_redis_connection()
+ response = redis_connection.hget(key, "response")
+ if response:
+ logger.info("Match found, returning from Cache Store")
+ return json.loads(response)
+ else:
+ return False
+ except Exception as e:
+ logger.error(f"Error while checking in Redis: {e}")
+ return False
+
+# ************************************************************************************************************************************
+
+# Recommendations Generator ***********************************************************************************************************
+import requests
+import better_profanity
+
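+# Recommendations combine two external services: SpyFu's related-keyword endpoint for frequently
+# searched alternatives (profanity-filtered) and the Datamuse API for up to two synonyms per keyword.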
+def get_seo_recommendation(keyword):
+ url = "https://www.spyfu.com/NsaApi/RelatedKeyword/GetPhraseMatchedKeywords"
+ payload = f"{{\"query\":\"{keyword}\",\"pageSize\":10,\"isOverview\":true,\"countryCode\":\"US\"}}"
+ headers = {
+ 'content-type': 'application/json;charset=UTF-8',
+ 'Cookie': 'ASP.NET_SessionId=rutmlg02sfx4yakg0nd0asxw'
+ }
+
+ response = requests.request("POST", url, headers=headers, data=payload)
+ alternate_keywords = []
+ for each in response.json()["keywords"]:
+ if not better_profanity.profanity.contains_profanity(each["keyword"]):
+ alternate_keywords.append(each["keyword"])
+ return alternate_keywords
+
+
+def get_suggested_replacements(keyword):
+ url = f"https://api.datamuse.com/words?rel_syn={keyword}"
+ response = requests.get(url)
+ if response.status_code == 200:
+ synonyms = [word['word'] for word in response.json()][:2]
+ return synonyms
+ else:
+ return None
+
+def generate_recommendations(keywords_and_count):
+ for each in keywords_and_count:
+ each["mostSearchedAlternatives"] = get_seo_recommendation(each["originalKeyword"])
+ each["probableReplacements"] = get_suggested_replacements(each["originalKeyword"])
+ return keywords_and_count
+
+
+# ************************************************************************************************************************************
+
+
+# Parsing Engine ***************************************************************************************************************************
+import time
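+# get_keywords dispatches on algoChoice: suffix_array and suffix_tree build their index once and
+# query it per word, while rabin_karp / naive / kmp rescan the text for each candidate keyword.
+# It returns the per-keyword counts along with the wall-clock time spent in the chosen algorithm.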
+def get_keywords(algo_choice, scrapped_content):
+ keywords_and_count = []
+ existing_keywords = []
+ start_time = time.time()
+ if algo_choice == "suffix_array":
+ logger.info("Triggered Suffix Arrays")
+ suffix_array = construct_suffix_array(scrapped_content)
+ for each_word in scrapped_content.split(" "):
+ if each_word == "":
+ continue
+ elif not each_word.isalpha():
+ continue
+ else:
+ if each_word not in existing_keywords:
+                    occurrences = search_pattern_with_suffix_array(scrapped_content, each_word, suffix_array)
+                    keywords_and_count.append({"keyword": each_word, "count": occurrences})
+ existing_keywords.append(each_word)
+ return keywords_and_count, (time.time()-start_time)
+ if algo_choice == "suffix_tree":
+ logger.info("Triggered Suffix Trees")
+ start_time = time.time()
+ keywords_and_count = []
+ existing_keywords = []
+ constructed_suffix_tree = construct_suffix_tree(scrapped_content)
+ try:
+ for each_word in scrapped_content.split(" "):
+ if each_word == "":
+ continue
+ elif not each_word.isalpha():
+ continue
+ else:
+ if each_word not in existing_keywords:
+                        occurrences = suffix_tree(constructed_suffix_tree, each_word)
+                        keywords_and_count.append({"keyword": each_word, "count": occurrences})
+ existing_keywords.append(each_word)
+ return keywords_and_count, time.time() - start_time
+ except Exception as e:
+ logger.error(f"Error while parsing suffix tree: {e}")
+ return None
+    # Remaining algorithm choices map straight to the matcher functions defined above.
+    algo_functions = {"rabin_karp": rabin_karp, "naive": naive, "kmp": kmp}
+    for eachword in scrapped_content.split(" "):
+        if eachword == "":
+            continue
+        elif not eachword.isalpha():
+            continue
+        else:
+            if eachword not in existing_keywords:
+                keywords_and_count.append({"originalKeyword": eachword, "count": algo_functions[algo_choice](scrapped_content, eachword)})
+                existing_keywords.append(eachword)
+    return keywords_and_count, time.time() - start_time
+
+def get_top_keywords(keywords_and_count):
+ keywords_and_count.sort(key=lambda x: x["count"], reverse=True)
+ try:
+ return keywords_and_count[:12]
+ except Exception as exc:
+ return exc
+# ******************************************************************************************************************************************
+
+# API Endpoints ***************************************************************************************************************************
+
+
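+# Polls the cache for the scrapper's output (up to four 5-second waits), runs the requested
+# algorithm, and caches the ranked result under url + algoChoice.
+# Example request body: {"url": "https://example.com", "algoChoice": "kmp"}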
+@app.post('/api/v1/keyword/')
+async def keyword_api(request: Request):
+ payload = await request.json()
+    url = payload['url'].rstrip('/')
+ try:
+ wait_iterator = 0
+ while True:
+ data = check_in_redis(url)
+ if data:
+ logger.info("Found in Cache Store, Checking if this algo is already executed")
+ algo_exists = check_in_redis(url + payload["algoChoice"])
+ if algo_exists:
+ logger.info("Cache Store already has recorded this algo, here you go!")
+ return algo_exists
+ break
+ else:
+ logger.info("Let's give that scrapper engine, a tad bit more time")
+ if wait_iterator > 3:
+ raise HTTPException(status_code=503, detail="Scrapper Engine is taking too long, please try again later")
+ wait_iterator += 1
+ time.sleep(5)
+ logger.info("Calling for parsing")
+ keywords, execution_time = get_keywords(payload["algoChoice"],data["scrapedContent"])
+ final_response = { "topKeywordListings": get_top_keywords(keywords), "alogirthmExecutionTime": execution_time}
+ logger.info("Quickly pushing to Cache Store")
+ push_to_redis(url + payload["algoChoice"],final_response)
+ return final_response
+ except Exception as e:
+ raise HTTPException(status_code=503, detail="Hello, I am the parser engine, Scrapper is taking too long, please try again later")
+
+@app.post('/api/v1/keyword-recommendations/')
+async def keyword_recommendations_api(request: Request):
+ payload = await request.json()
+    url = payload['url'].rstrip('/')
+ try:
+ data = check_in_redis(url)
+ if data:
+ logger.info("Found in Cache Store, Checking if this algo is already executed")
+ existing_algo_data = check_in_redis(url + payload["algoChoice"])
+ if existing_algo_data:
+ logger.info("Cache store found this entry, checking if recommendations already exists")
+ if existing_algo_data["topKeywordListings"][0].get("mostSearchedAlternatives"):
+ logger.info("Recommendations exist, returning my precious data without changes")
+ return existing_algo_data
+ all_keywords = existing_algo_data["topKeywordListings"]
+ modified_keywords = generate_recommendations(all_keywords)
+ existing_algo_data["topKeywordListings"] = modified_keywords
+ logger.info("Revalidating the cache with recommendations")
+ push_to_redis(url + payload["algoChoice"],existing_algo_data)
+                return existing_algo_data
+            else:
+                raise HTTPException(status_code=404, detail="No cached keyword results for this algorithm yet, run the keyword extraction first")
+        else:
+            raise HTTPException(status_code=503, detail="Scrapper Engine is taking too long, please try again later")
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(status_code=503, detail="Hello, I am the parser engine, Scrapper is taking too long, please try again later")
+
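+# Runs all five matchers over the cached page text, returning only per-algorithm execution times
+# and caching the comparison under url + "multi-algo". Example request body: {"url": "https://example.com"}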
+@app.post('/api/v1/multi-algo/')
+async def multialgo_api(request: Request):
+ payload = await request.json()
+    url = payload['url'].rstrip('/')
+ algo_choices = ["rabin_karp", "naive", "kmp", "suffix_array", "suffix_tree"]
+ final_response = {"data": []}
+ wait_iterator = 0
+ try:
+ while True:
+ data = check_in_redis(url)
+ if data:
+ logger.info("Multi algo found in Cache Store, Checking if this function for multi-algo is already executed")
+ algo_exists = check_in_redis(url + "multi-algo")
+ if algo_exists:
+ logger.info("Cache Store already has recorded this multi-algo, here you go!")
+ return algo_exists
+ break
+ else:
+ logger.info("Let's give that scrapper engine, a tad bit more time")
+ if wait_iterator > 3:
+ raise HTTPException(status_code=503, detail="Scrapper Engine is taking too long, please try again later")
+ wait_iterator += 1
+ time.sleep(5)
+ for each_algo in algo_choices:
+ logger.info("Checking if said algo exists")
+ logger.info(f"Running for {each_algo}")
+ algo_exists = check_in_redis(url + each_algo)
+ if algo_exists:
+ logger.info("Cache Store already has recorded this algo, here you go!")
+ final_response["data"].append({"algoName": each_algo, "algoExecutionTime": algo_exists["alogirthmExecutionTime"]})
+ else:
+ logger.info("Calling for parsing")
+ keywords, execution_time = get_keywords(each_algo,data["scrapedContent"])
+ intermediate_response = { "topKeywordListings": get_top_keywords(keywords), "alogirthmExecutionTime": execution_time}
+ logger.info("Quickly pushing to Cache Store")
+ push_to_redis(url + each_algo,intermediate_response)
+ final_response["data"].append({"algoName": each_algo, "algoExecutionTime": execution_time})
+ print(final_response)
+ push_to_redis(url + "multi-algo",final_response)
+ return final_response
+ except Exception as e:
+ raise HTTPException(status_code=503, detail="Hello, I am the parser engine, Scrapper is taking too long, please try again later")
+
+
+# ************************************************************************************************************************************
+
+
+if __name__ == '__main__':
+ uvicorn.run("parser:app", host='0.0.0.0', port=8001, reload=True)
\ No newline at end of file
diff --git a/parser/requirements.txt b/parser/requirements.txt
new file mode 100644
index 0000000..a7630ea
--- /dev/null
+++ b/parser/requirements.txt
@@ -0,0 +1,43 @@
+annotated-types==0.6.0
+antlr4-python3-runtime==4.13.1
+anyio==3.7.1
+automaton-tools==0.1.8
+beautifulsoup4==4.12.2
+better-profanity==0.7.0
+bs4==0.0.1
+certifi==2023.7.22
+charset-normalizer==3.3.1
+click==8.1.7
+csuffixtree==0.3.6
+fastapi==0.104.0
+futures==3.0.5
+google-search-results==2.4.2
+goslate==1.5.4
+h11==0.14.0
+idna==3.4
+Jinja2==3.1.2
+jsonify==0.5
+lxml==4.9.3
+MarkupSafe==2.1.3
+numpy==1.26.1
+pandas==2.1.1
+pydantic==2.4.2
+pydantic_core==2.10.1
+PyDictionary==2.0.1
+pyseoanalyzer==4.0.7
+python-dateutil==2.8.2
+pytrends==4.9.2
+pytz==2023.3.post1
+redis==5.0.1
+requests==2.31.0
+seo-keyword-research-tool==0.1.9
+serpapi==0.1.4
+six==1.16.0
+sniffio==1.3.0
+soupsieve==2.5
+starlette==0.27.0
+suffix-tree==0.1.2
+typing_extensions==4.8.0
+tzdata==2023.3
+urllib3==2.0.7
+uvicorn==0.23.2
diff --git a/scrapper/mock.json b/scrapper/mock.json
deleted file mode 100644
index 26f96b4..0000000
--- a/scrapper/mock.json
+++ /dev/null
@@ -1,34 +0,0 @@
-# Scraping Service
-
-{
- "scrapedContent": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Donec et odio pellentesque diam volutpat commodo sed egestas egestas. Neque volutpat ac tincidunt vitae semper quis lectus. Tortor dignissim convallis aenean et tortor at risus. Cras pulvinar mattis nunc sed blandit libero volutpat sed cras. Urna nec tincidunt praesent semper. Fusce id velit ut tortor. Hac habitasse platea dictumst quisque sagittis purus sit amet. Accumsan sit amet nulla facilisi morbi tempus. Volutpat odio facilisis mauris sit amet massa vitae. Sit amet facilisis magna etiam tempor orci eu. Scelerisque purus semper eget duis at tellus at urna. Elit eget gravida cum sociis natoque penatibus et magnis dis. Sed faucibus turpis in eu mi bibendum neque egestas congue. Viverra maecenas accumsan lacus vel facilisis volutpat. Commodo quis imperdiet massa tincidunt nunc pulvinar sapien. Eget nunc lobortis mattis aliquam faucibus purus in massa. Enim neque volutpat ac tincidunt vitae semper quis. Pharetra convallis posuere morbi leo urna molestie at elementum eu. Amet consectetur adipiscing elit ut aliquam purus sit amet. Sed tempus urna et pharetra pharetra massa massa ultricies mi. Feugiat sed lectus vestibulum mattis ullamcorper velit. Ridiculus mus mauris vitae ultricies leo integer. Mauris ultrices eros in cursus turpis. Turpis egestas pretium aenean pharetra magna ac. Ultrices eros in cursus turpis massa tincidunt dui ut ornare. Eu volutpat odio facilisis mauris sit. Odio eu feugiat pretium nibh ipsum consequat. Nibh tortor id aliquet lectus proin. Sed blandit libero volutpat sed cras ornare arcu. Risus commodo viverra maecenas accumsan lacus vel facilisis. Lectus arcu bibendum at varius vel pharetra. Condimentum vitae sapien pellentesque habitant morbi tristique senectus. Aliquam sem fringilla ut morbi tincidunt. Amet dictum sit amet justo donec enim diam. Convallis convallis tellus id interdum velit laoreet id. Malesuada bibendum arcu vitae elementum. Cursus eget nunc scelerisque viverra mauris in aliquam. Ut sem viverra aliquet eget. Dolor sit amet consectetur adipiscing elit. Tellus at urna condimentum mattis pellentesque id nibh tortor id. Penatibus et magnis dis parturient montes nascetur ridiculus. Nec nam aliquam sem et tortor consequat. Tincidunt vitae semper quis lectus. Eget nulla facilisi etiam dignissim. Fringilla urna porttitor rhoncus dolor purus non enim praesent. Ultricies integer quis auctor elit sed. Consequat nisl vel pretium lectus quam id leo in. Vitae turpis massa sed elementum tempus egestas sed sed risus. At lectus urna duis convallis. Donec ac odio tempor orci dapibus ultrices in. Vitae semper quis lectus nulla. Mollis aliquam ut porttitor leo a diam sollicitudin tempor. Eu facilisis sed odio morbi quis commodo odio aenean. Nulla porttitor massa id neque. Quam viverra orci sagittis eu volutpat odio facilisis. Imperdiet nulla malesuada pellentesque elit eget gravida cum. Sit amet massa vitae tortor condimentum lacinia quis vel eros. Sit amet massa vitae tortor condimentum. Risus viverra adipiscing at in tellus integer. Ut diam quam nulla porttitor massa id. Justo nec ultrices dui sapien eget mi proin. Commodo odio aenean sed adipiscing diam donec adipiscing tristique risus. Vivamus arcu felis bibendum ut tristique et egestas quis ipsum. Nec ultrices dui sapien eget. Ullamcorper malesuada proin libero nunc consequat interdum varius. Id diam vel quam elementum pulvinar etiam non quam lacus. 
Volutpat maecenas volutpat blandit aliquam etiam. Pellentesque diam volutpat commodo sed egestas egestas fringilla phasellus faucibus. Interdum velit euismod in pellentesque massa placerat duis ultricies. Porttitor lacus luctus accumsan tortor posuere ac. Quam lacus suspendisse faucibus interdum. Nunc faucibus a pellentesque sit amet porttitor eget dolor. Eget mi proin sed libero enim sed faucibus turpis. Id consectetur purus ut faucibus pulvinar elementum integer enim. Nisl nunc mi ipsum faucibus vitae aliquet nec ullamcorper sit. Ut etiam sit amet nisl purus in mollis nunc. Sed euismod nisi porta lorem mollis aliquam ut porttitor leo. Vulputate ut pharetra sit amet aliquam id. Adipiscing commodo elit at imperdiet dui accumsan sit amet nulla. Vestibulum sed arcu non odio. Tellus rutrum tellus pellentesque eu tincidunt tortor aliquam. Morbi tristique senectus et netus et malesuada fames."
-}
-
-# keyword service (time in seconds)
-
-{
- "topKeywordListings": ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth", "eleventh", "twelveth", "thirteenth", "fourteenth", "fifteenth"],
- "alogirthmExecutingTime": 0.893473
-}
-
-# comparision service for multi algo
-
-{
- "data": [
- {
- "algoName": "algo1",
- "algoExecutingTime": 0.893473,
- "topKeywordListings": ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth", "eleventh", "twelveth", "thirteenth", "fourteenth", "fifteenth"]
- },
- {
- "algoName": "algo2",
- "algoExecutingTime": 0.893473,
- "topKeywordListings": ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth", "eleventh", "twelveth", "thirteenth", "fourteenth", "fifteenth"]
- },
- {
- "algoName": "algo3",
- "algoExecutingTime": 0.893473,
- "topKeywordListings": ["first", "second", "third", "fourth", "fifth", "sixth", "seventh", "eighth", "ninth", "tenth", "eleventh", "twelveth", "thirteenth", "fourteenth", "fifteenth"]
- }
- ]
-}
\ No newline at end of file
diff --git a/scrapper/requirements.txt b/scrapper/requirements.txt
index f848169..f4b0157 100644
--- a/scrapper/requirements.txt
+++ b/scrapper/requirements.txt
@@ -1,12 +1,25 @@
annotated-types==0.6.0
anyio==3.7.1
+beautifulsoup4==4.12.2
+bs4==0.0.1
+certifi==2023.7.22
+charset-normalizer==3.3.1
click==8.1.7
fastapi==0.104.0
h11==0.14.0
idna==3.4
+joblib==1.3.2
+nltk==3.8.1
+numpy==1.26.1
pydantic==2.4.2
pydantic_core==2.10.1
+redis==5.0.1
+regex==2023.10.3
+requests==2.31.0
sniffio==1.3.0
+soupsieve==2.5
starlette==0.27.0
+tqdm==4.66.1
typing_extensions==4.8.0
+urllib3==2.0.7
uvicorn==0.23.2
diff --git a/scrapper/scrapper.py b/scrapper/scrapper.py
index 89622d2..06525b8 100644
--- a/scrapper/scrapper.py
+++ b/scrapper/scrapper.py
@@ -1,12 +1,28 @@
-from fastapi import FastAPI, Request
+from fastapi import FastAPI, Request, HTTPException
import requests
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
import time
+import logging
+import nltk
+import numpy
+from bs4 import BeautifulSoup
+from nltk.corpus import stopwords
+from nltk.tag import pos_tag
-app = FastAPI()
+nltk.download('punkt')
+nltk.download('words')
+nltk.download('stopwords')
+nltk.download('averaged_perceptron_tagger')
+nltk.download('maxent_ne_chunker')
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+app = FastAPI()
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
@@ -15,7 +31,47 @@
allow_headers=["*"],
)
-from bs4 import BeautifulSoup
+
+
+
+# Cache Store **********************************************************************************************************************
+
+import redis
+import json
+
+def get_redis_connection():
+ return redis.Redis(host="redis", port=6379, db=0)
+
+def push_to_redis(key, response):
+ logger.info("Pushing to Cache Store")
+ try:
+ redis_connection = get_redis_connection()
+ redis_connection.hset(key, "response", json.dumps(response))
+ return True
+ except Exception as e:
+ logger.error(f"Error while pushing to Redis: {e}")
+
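+# A cached entry is reused only when the link set scraped previously matches the links discovered now.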
+def check_in_redis(links, key):
+ logger.info("Checking in our precious Cache Store")
+ try:
+ redis_connection = get_redis_connection()
+        response = redis_connection.hget(key, "response")
+        if response:
+            logger.info("Found a match, Checking if this is latest")
+            cached = json.loads(response)
+            if cached["scrappedUrls"] == links:
+                logger.info("Perfect match found, returning from Cache Store")
+                return cached
+        return False
+ except Exception as e:
+ logger.error(f"Error while checking in Redis: {e}")
+ return False
+
+# ************************************************************************************************************************************
+
+
+# Scraping Engine ***************************************************************************************************************************
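+# get_urls collects at most five same-site relative links from the landing page; scrape_all concatenates the scraped text of each.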
def get_urls(base_url):
response = requests.get(base_url)
@@ -23,7 +79,7 @@ def get_urls(base_url):
links = set()
for link in soup.find_all('a'):
href = link.get('href')
- if href and not href.startswith('http') and '.' not in href:
+ if href and not href.startswith('http') and '.' not in href and len(links) < 5:
links.add(base_url+href)
return list(links)
@@ -35,35 +91,23 @@ def scrape(url):
def scrape_all(url):
links = get_urls(url)
text = ''
+ cache = check_in_redis(links,url)
+ if cache:
+ logger.info("Found in Cache")
+ return True, cache["scrappedUrls"], cache["scrapedContent"]
+ logger.info("Not Found in Cache, Scraping New")
for link in links:
text += scrape(link)
- return links, text
-
-import nltk
-import numpy
-from nltk.tokenize import word_tokenize
-from nltk.corpus import stopwords
-from nltk.tag import pos_tag
-from nltk.chunk import ne_chunk
-
-nltk.download('punkt')
-nltk.download('words')
-nltk.download('stopwords')
-nltk.download('pos_tag')
-nltk.download('averaged_perceptron_tagger')
-nltk.download('maxent_ne_chunker')
+ return False, links, text
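+# Cleans scraped text before keyword counting: strips punctuation, short and numeric tokens, and
+# English stop words, then filters the remaining words by part-of-speech tag.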
def remove_pronouns_nouns(text):
- # Tokenize the text into words
- words = word_tokenize(text)
-
- # Remove stop words from the words list
+ import re
+ text = re.sub(r'[^\w\s]', ' ', text)
+ words = text.split()
+ words = [word for word in words if len(word) > 2 and not word.isdigit()]
stop_words = set(stopwords.words('english'))
words = [word for word in words if word.lower() not in stop_words]
-
- # Tag the parts of speech of each word
tagged_words = pos_tag(words)
-
filtered_words = []
removed_words = []
for word, tag in tagged_words:
@@ -77,19 +121,30 @@ def remove_pronouns_nouns(text):
return filtered_text, removed_text
+# *********************************************************************************************************************************************
+
+
+# API Endpoints *********************************************************************************************************************************
+
@app.post('/api/v1/scraping/')
async def root(request: Request):
payload = await request.json()
+    url = payload['url'].rstrip('/')
try:
- scrapped_urls, scrapped_text = scrape_all(payload['url'])
+ cacheExists, scrapped_urls, scrapped_text = scrape_all(url)
+
+ final_text, removed_text = remove_pronouns_nouns(scrapped_text)
+ response = { "scrapedContent": final_text, "scrappedUrls": scrapped_urls, "removedContent": removed_text, "returnedFromCache": True if cacheExists else False }
+ if not cacheExists:
+ logger.info("That's new to me, populating Cache Store right away!")
+ push_to_redis(url,response)
+ return response
+
except Exception as e:
- return {"503": f"{e}"}
- final_text, removed_text = remove_pronouns_nouns(scrapped_text)
- return {
- "scrapedContent": final_text,
- "scrappedUrls": scrapped_urls,
- "removedContent": removed_text
- }
+ logger.error(f"Error while scraping: {e}")
+ raise HTTPException(status_code=500, detail="Error while scraping")
+
+# ************************************************************************************************************************************************
if __name__ == '__main__':
uvicorn.run("scrapper:app", host='0.0.0.0', port=8000, reload=True)
\ No newline at end of file
diff --git a/src/components/homePage/index.js b/src/components/homePage/index.js
index 89eb61a..1d60ade 100644
--- a/src/components/homePage/index.js
+++ b/src/components/homePage/index.js
@@ -147,7 +147,8 @@ function HomePage() {
-
+
+