Skip to content

Commit

Permalink
Add metadata extraction, fix hashing
Browse files Browse the repository at this point in the history
  • Loading branch information
charitarthchugh committed Aug 6, 2023
1 parent 4f33e78 commit 04f6e9f
Showing 1 changed file with 41 additions and 6 deletions.
47 changes: 41 additions & 6 deletions backend/bookie_backend/bookied/utils/extract-metadata.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,50 @@
import hashlib
import requests
from bs4 import BeautifulSoup
from typing import Optional
from io import BytesIO
import urllib


def _fetch_favicon(hostname, timeout=10.0) -> Optional[bytes]:
    """Return the bytes of the first icon favicongrabber.com lists for *hostname*.

    Best-effort: returns None when *hostname* is empty, the lookup or the
    download fails, the response has no icons, or the icon body is empty.
    """
    if not hostname:
        return None
    try:
        lookup = requests.get(
            f"https://favicongrabber.com/api/grab/{hostname}", timeout=timeout
        )
        lookup.raise_for_status()
        icons = lookup.json().get("icons") or []
        if not icons:
            return None
        icon = requests.get(icons[0]["src"], timeout=timeout)
        # Do not store an HTTP error page as if it were image data.
        icon.raise_for_status()
    except (
        requests.RequestException,
        ValueError,
        KeyError,
        TypeError,
        AttributeError,
    ) as e:
        print(e)
        return None
    return icon.content or None


def extract_metadata(url, timeout=10.0) -> Optional[dict]:
    """Fetch *url* and return basic page metadata.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds passed to each ``requests.get`` call so a stalled
            server cannot block the caller indefinitely.

    Returns:
        A dict with the keys ``"title"`` (text of the ``<title>`` tag),
        ``"description"`` (``content`` of the ``og:description`` meta tag),
        ``"favicon"`` (raw icon bytes) and ``"url"``; any of the first three
        is None when unavailable. Returns None when *url* is empty or the
        page cannot be fetched or parsed.
    """
    # A bare ``import urllib`` does not guarantee the ``parse`` submodule is
    # loaded, so import the function explicitly.
    from urllib.parse import urlparse

    if not url:
        return None

    # Deliberate best-effort boundary: this function reports failure by
    # returning None rather than raising.
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, "html.parser")
        title_tag = soup.find("title")
        description_tag = soup.find("meta", property="og:description")

        title = None
        if title_tag is not None and title_tag.string is not None:
            # Convert to a plain str so the result does not keep the whole
            # parse tree alive.
            title = str(title_tag.string)

        # .get() tolerates a meta tag that has no ``content`` attribute.
        description = description_tag.get("content") if description_tag else None

        # A favicon failure must not discard the title/description.
        favicon = _fetch_favicon(urlparse(url).hostname, timeout=timeout)

        return {
            "title": title,
            "description": description,
            "favicon": favicon,
            "url": url,
        }
    except Exception as e:
        print(e)
        return None

def hash(input) -> Optional[str]:
    """Return the MD5 hex digest of *input*, or None when *input* is None.

    *input* may be bytes-like or a ``str``; text is encoded as UTF-8 before
    hashing because hashlib only accepts bytes.

    NOTE: the function and parameter names shadow the ``hash`` and ``input``
    builtins; they are kept unchanged so existing callers keep working.
    """
    if input is None:
        return None
    data = input.encode("utf-8") if isinstance(input, str) else input
    return hashlib.md5(data).hexdigest()

def md5(content) -> Optional[str]:
    """Return the MD5 hex digest of *content*, or None when *content* is None.

    *content* may be any bytes-like object (e.g. downloaded favicon data) or
    a ``str``; text is encoded as UTF-8 before hashing because hashlib only
    accepts bytes.
    """
    if content is None:
        return None
    if isinstance(content, str):
        content = content.encode("utf-8")
    # Hash the buffer directly; wrapping it in BytesIO only added a copy.
    return hashlib.md5(content).hexdigest()


if __name__ == "__main__":
    # Manual smoke test: hash a sample URL, then fetch metadata for a page
    # and print the digest of its favicon.
    hash_icon = "https://www.lfaticon.com/free-icon/url_1078454"
    print(hash(hash_icon))
    meta = extract_metadata("https://www.youtube.com/watch?v=9bZkp7q19f0")
    # extract_metadata returns None when the fetch fails, so guard before
    # indexing into the result.
    favicon = meta["favicon"] if meta else None
    if favicon:
        print(md5(favicon))

0 comments on commit 04f6e9f

Please sign in to comment.