Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resolve "Proxy" #63

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions backend/src/crawling/crawling_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@
async def get_page(url: Url):
    """Crawl a page and return its HTML with every link routed through a CORS proxy.

    Args:
        url: Request body whose ``url`` field is the page to crawl.

    Returns:
        Page: the crawled URL together with the proxied HTML.
    """
    page_url = url.url
    html, page_title = await crawling_service.parse(page_url)
    # redirect_to_proxy() converts relative paths to absolute ones itself
    # before prefixing them with the proxy, so the old standalone
    # fix_relative_paths() call (renamed to add_base_href in the service)
    # is no longer needed here.
    html = crawling_service.redirect_to_proxy(
        html, page_url, "https://cors-anywhere.herokuapp.com/"
    )
    # NOTE(review): writes the crawled page to disk on every request —
    # confirm this is intentional and not leftover debugging output.
    save_to_html(data=html, filename=page_title)
    return Page(url=page_url, html=html)


Expand Down
61 changes: 60 additions & 1 deletion backend/src/crawling/crawling_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,71 @@ async def parse(url):


def add_base_href(html: str, url: str) -> str:
    """Prepend a ``<base href="...">`` tag pointing at *url*'s origin.

    The origin (scheme + host[:port]) is extracted from *url* so that
    relative links inside *html* resolve against the crawled site.

    Args:
        html: Raw HTML document text.
        url: Absolute URL the document was fetched from.

    Returns:
        The HTML with the ``<base>`` tag prepended on its own line.

    Raises:
        AttributeError: if *url* does not match the http(s) pattern
            (``re.search`` returns ``None``).
    """
    # NOTE(review): the pattern is loose — the unescaped "." matches any
    # character — but in practice it captures scheme://host[:port],
    # stopping at the first path slash.
    base_url = re.search(r"(?P<url>https?://[/\w+$]+.[/\w+$][^/]*)", url).group("url")
    base_href = '<base href="' + base_url + '">'
    return base_href + "\n" + html


def relative_to_absolute_paths(html: str, url: str) -> str:
    """Rewrite root-relative ``href``/``src`` attributes in *html* as absolute URLs.

    Every ``href="/...`` and ``src="/...`` occurrence is prefixed with the
    origin (scheme + host[:port]) extracted from *url*, so the links
    resolve against the crawled site instead of ours.

    Args:
        html: Raw HTML document text.
        url: Absolute URL the document was fetched from.

    Returns:
        The rewritten HTML.

    Raises:
        AttributeError: if *url* does not match the http(s) pattern.
    """
    base_url = re.search(r"(?P<url>https?://[/\w+$]+.[/\w+$][^/]*)", url).group("url")
    # Literal substring replacement; the attribute's own leading "/" is
    # kept and base_url never ends with one, giving a single slash.
    # (The previous index-splicing version inserted base_url + "/" while
    # also keeping the original "/", emitting 'href="https://host//path"'.)
    html = html.replace('href="/', 'href="' + base_url + '/')
    html = html.replace('src="/', 'src="' + base_url + '/')
    return html


def redirect_to_proxy(html: str, url: str, proxy_url: str):
    """Route every absolute ``href``/``src`` link in *html* through *proxy_url*.

    Relative links are first made absolute, then each ``href="http...`` /
    ``src="http...`` attribute value is prefixed with the proxy URL
    (CORS-proxy style: the proxy URL is followed by the original
    absolute URL).
    """
    absolute_html = relative_to_absolute_paths(html, url)
    # Inserting proxy_url right after the opening quote, only where the
    # value starts with "http", is a plain substring substitution.
    proxied = absolute_html.replace('href="http', 'href="' + proxy_url + 'http')
    proxied = proxied.replace('src="http', 'src="' + proxy_url + 'http')
    return proxied



def get_crawls_by_user(db: Session, user_email: str):
user = user_service.get_user_by_email(db, user_email)
return list(
Expand Down
10 changes: 5 additions & 5 deletions backend/src/database/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@
"postgresql://${DB_USER}:${DB_PASSWORD}@${DB_HOST}:${DB_PORT}/${DB_NAME}"
)
# Fill the connection-string template from the (renamed) DB_* settings.
# The stale pre-rename keywords (USER, PASSWORD, ...) were duplicates of
# these and made the call a SyntaxError (keyword argument repeated).
DATABASE_URL = database_url_template.substitute(
    DB_USER=settings.DB_USER,
    DB_PASSWORD=settings.DB_PASSWORD,
    DB_NAME=settings.DB_NAME,
    DB_HOST=settings.DB_HOST,
    DB_PORT=settings.as_int("DB_PORT"),
)

# Create SQLAlchemy engine:
Expand Down