Merge pull request #60 from seowings/main
GUI and Crawl improvements for #22 #55
seowings authored Dec 25, 2023
2 parents ec8511d + 07503bf commit 7a47a57
Showing 20 changed files with 669 additions and 294 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -8,6 +8,11 @@ Python Package for Converting WordPress Installation to a Static Website.
[![pypi_version](https://img.shields.io/pypi/v/staticwordpress.svg?style=flat-square "Available on PyPi - the Python Package Index")](https://pypi.python.org/pypi/staticwordpress)
[![supported_python_versions](https://img.shields.io/pypi/pyversions/staticwordpress.svg?style=flat-square "Supported Python Version")](https://pypi.python.org/pypi/staticwordpress)


## Desktop Version

![staticwordpress-gui](docs/img/staticwordpress-gui.png)

## How to Install static-wordpress?

### Windows Installer
Binary file added docs/img/staticwordpress-gui.png
4 changes: 4 additions & 0 deletions docs/index.md
@@ -8,6 +8,10 @@ Python Package for Converting WordPress Installation to a Static Website.
[![pypi_version](https://img.shields.io/pypi/v/staticwordpress.svg?style=flat-square "Available on PyPi - the Python Package Index")](https://pypi.python.org/pypi/staticwordpress)
[![supported_python_versions](https://img.shields.io/pypi/pyversions/staticwordpress.svg?style=flat-square "Supported Python Version")](https://pypi.python.org/pypi/staticwordpress)

## Desktop Version

![staticwordpress-gui](docs/img/staticwordpress-gui.png)

## How to Install static-wordpress?

### Windows Installer
2 changes: 1 addition & 1 deletion src/staticwordpress/core/constants.py
@@ -38,7 +38,7 @@

VERSION_MAJOR = 0
VERSION_MINOR = 0
VERSION_REVISION = 5
VERSION_REVISION = 6
VERISON = f"{VERSION_MAJOR}.{VERSION_MINOR}.{VERSION_REVISION}"

SHARE_FOLDER_PATH = Path(
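The revision bump flows into the composed version string (spelled `VERISON` in the source). A quick check of the result, assuming the package is importable:

```python
from staticwordpress.core.constants import VERISON

# VERSION_MAJOR.VERSION_MINOR.VERSION_REVISION -> "0.0.6" after this change
print(VERISON)
```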
34 changes: 19 additions & 15 deletions src/staticwordpress/core/crawler.py
@@ -54,30 +54,30 @@


class Crawler:
def __init__(self, loc_: str, type_: URL = URL.FOLDER, scheme_="") -> None:
def __init__(self, loc_: str, typ_: URL = URL.FOLDER, scheme_: str = "") -> None:
loc_ = parse.unquote(loc_).replace("\/", "/")
if not any([loc_.startswith(f"{scheme}://") for scheme in CONFIGS["SCHEMES"]]):
loc_ = f"{CONFIGS['DEFAULT_SCHEME']}://{loc_}"

if CONFIGS["CLEAN"]["URL"]:
loc_ = get_clean_url(loc_, "", scheme_)

self._type = type_
self._typ = typ_
self._loc = loc_
self._urlparse = parse.urlparse(self._loc)

file_ext = self._urlparse.path.split(".")[-1].upper()
if file_ext:
for keys in CONFIGS["FORMATS"]:
if file_ext in CONFIGS["FORMATS"][keys]:
self._type = URL[keys]
self._typ = URL[keys]

if any(
[exclule_url in self._urlparse.path for exclule_url in CONFIGS["EXCLUDE"]]
):
self._type = URL.NONE
self._typ = URL.NONE

if self._type == URL.FOLDER:
if self._typ == URL.FOLDER:
self._loc = (
f"{self._loc}{'/' if not self._urlparse.path.endswith('/') else ''}"
)
@@ -92,6 +92,10 @@ def __init__(self, loc_: str, type_: URL = URL.FOLDER, scheme_="") -> None:
def hash(self) -> str:
return self._hash

@property
def typ(self) -> str:
return self._typ

@property
def external_links(self) -> list:
return self._externals_links
@@ -187,7 +191,7 @@ def is_valid(self) -> bool:
len(self._urlparse.scheme) > 0,
len(self._urlparse.netloc) > 0,
len(self._urlparse.netloc.split(".")) > 1,
len(self._urlparse.path) > 0 or self._type == URL.HOME,
len(self._urlparse.path) > 0 or self._typ == URL.HOME,
]
)

@@ -200,7 +204,7 @@ def fetch(self) -> None:
if self.is_valid:
self._response = get_remote_content(self._urlparse)

if self._type in [URL.FOLDER, URL.HTML, URL.JS, URL.HOME]:
if self._typ in [URL.FOLDER, URL.HTML, URL.JS, URL.HOME]:
extracted_urls = set(
[link[0] for link in re.findall(LINK_REGEX, self._response.text)]
)
@@ -220,16 +224,16 @@ def save(self, full_output_folder: Path, dst_url: str = "") -> str:
full_output_path = Path(f"{full_output_folder}/{folder_path}")

if self._response.status_code == 404:
self._type = URL.HTML
self._typ = URL.HTML
full_output_path = full_output_folder / Path("404.html")

if self._type in [URL.FOLDER, URL.HOME]:
if self._typ in [URL.FOLDER, URL.HOME]:
full_output_path = full_output_path / Path("index.html")

if self._type not in [URL.NONE]:
if self._typ not in [URL.NONE]:
full_output_path.parent.mkdir(parents=True, exist_ok=True)

if self._type in [
if self._typ in [
URL.HTML,
URL.XML,
URL.FOLDER,
@@ -250,15 +254,15 @@ def save(self, full_output_folder: Path, dst_url: str = "") -> str:
with open(full_output_path, "w", encoding="utf-8") as f:
f.write(_text)

elif self._type in [URL.IMAGE, URL.PDF, URL.BINARY]:
elif self._typ in [URL.IMAGE, URL.PDF, URL.BINARY]:
with open(full_output_path, "wb") as file:
file.write(self._response.content)

elif self._type in [URL.JSON]:
elif self._typ in [URL.JSON]:
with open(full_output_path, "w", encoding="utf-8") as file:
json.dump(json.loads(self._response.text), file, indent=4)

elif self._type == URL.ZIP:
elif self._typ == URL.ZIP:
headers = CaseInsensitiveDict()
headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
headers["Pragma"] = "no-cache"
Expand All @@ -273,7 +277,7 @@ def save(self, full_output_folder: Path, dst_url: str = "") -> str:
fd.write(chunk)
current_session.cookies.clear()

elif self._type == URL.FONTS:
elif self._typ == URL.FONTS:
totalbits = 0
if self._response.status_code == 200:
with open(full_output_path, "wb") as f:
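Renaming `type_`/`_type` to `typ_`/`_typ` avoids shadowing the built-in `type`, and the new `typ` property exposes the detected resource kind without reaching into private state. A minimal usage sketch, assuming the `URL` enum lives in `core.constants` as the crawler's defaults suggest; the address is a placeholder and `fetch()` performs a real HTTP request:

```python
from staticwordpress.core.constants import URL
from staticwordpress.core.crawler import Crawler

# Placeholder location; fetch() issues a live request and records the response.
page = Crawler(loc_="https://example.com/blog/", typ_=URL.FOLDER)
page.fetch()

# The detected kind is now readable via the public property instead of _type/_typ.
if page.typ in (URL.FOLDER, URL.HOME, URL.HTML):
    print(page.status_code, page.typ)
    print(page.internal_links)  # links a workflow would queue for further crawling
```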
4 changes: 4 additions & 0 deletions src/staticwordpress/core/redirects.py
@@ -85,6 +85,10 @@ class Redirects:
def __init__(self) -> None:
self._items = dict()

@property
def items(self) -> dict():
return self._items

def add_redirect(self, redirect_: Redirect) -> None:
if redirect_.hash not in self._items:
self._items[redirect_.hash] = redirect_
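The read-only `items` property lets callers inspect collected redirects without touching `_items` directly. A minimal sketch; in practice the instance is populated during a crawl rather than by hand:

```python
from staticwordpress.core.redirects import Redirects

redirects = Redirects()
# ... add_redirect() calls happen while the site is being processed ...

# The new property exposes the internal dict keyed by redirect hash.
print(f"{len(redirects.items)} redirects collected")
for redirect_hash in redirects.items:
    print(redirect_hash)
```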
20 changes: 20 additions & 0 deletions src/staticwordpress/core/search.py
@@ -68,6 +68,26 @@ def __init__(self, search_page_: Path = None, dst_url_="") -> None:
)
self._dst_url = dst_url_

@property
def search_index(self) -> list:
return self._search_index

@property
def search_path(self) -> str:
return self._search_path

@property
def search_path_lunr(self) -> Path:
return self._search_path_lunr

@property
def search_path_script(self) -> Path:
return self._search_path_script

@property
def dst_url(self) -> str:
return self._dst_url

def update(self, soup_: BeautifulSoup, output_path_: str) -> None:
"""Update search page by adding new tags
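The new properties expose search state that previously required touching private attributes; `search_index`, `search_path`, `search_path_lunr`, and `search_path_script` follow the same read-only pattern as `dst_url`. A minimal sketch with a placeholder destination URL:

```python
from staticwordpress.core.search import Search

# search_page_ is left at its default here, as Workflow() does internally.
search = Search(dst_url_="https://example.org")

# Readable without reaching into the underscored attribute.
print(search.dst_url)  # "https://example.org"
```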
6 changes: 2 additions & 4 deletions src/staticwordpress/core/utils.py
@@ -32,6 +32,7 @@
import stat
import shutil
from urllib import parse
from urllib.request import urlopen
from pathlib import Path
from zipfile import ZipFile
from functools import lru_cache
@@ -228,10 +229,7 @@ def is_url_valid(url_: str) -> bool:
url_parsed_ = parse.urlparse(url_)

if all([url_parsed_.scheme, url_parsed_.netloc]):
from urllib.request import urlopen

# print(url_parsed_)
# # return get_remote_content(url_parsed_, max_retires=1).status_code < 399
# return get_remote_content(url_parsed_, max_retires=1).status_code < 399
try:
return urlopen(url_).getcode() < 399
except:
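With `urlopen` imported at module level and the commented-out alternatives removed, `is_url_valid()` behaves as before: it parses the URL, treats a response code below 400 as valid, and swallows request errors. A quick sketch with placeholder addresses; the first call performs a live request:

```python
from staticwordpress.core.utils import is_url_valid

print(is_url_valid("https://example.com"))  # True when the address answers with a non-error status
print(is_url_valid("not-a-valid-url"))      # falsy: no scheme or netloc to request
```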
65 changes: 43 additions & 22 deletions src/staticwordpress/core/workflow.py
@@ -74,7 +74,7 @@ def __init__(self):
self._project = Project()
self._redirects = Redirects()
self._search = Search()
self._crawler = Crawler(loc_="", type_=URL.NONE)
self._crawler = Crawler(loc_="", typ_=URL.NONE)
self._urls = dict()
self._github = None
self._keep_running = True
@@ -83,6 +83,30 @@ def __init__(self):
def sitemap(self) -> str:
return self._project.sitemap

@property
def project(self):
return self._project

@property
def redirects(self) -> Redirects:
return self._redirects

@property
def search(self) -> Search:
return self._search

@property
def crawler(self) -> Crawler:
return self._crawler

@property
def urls(self) -> dict:
return self._urls

@property
def github(self):
return self._github

def clear(self):
self._urls = dict()

@@ -121,6 +145,7 @@ def create_project(
self._project.src_url = src_url_

# TODO: Add Support for GH Repo ??? Do We need it?
# for now keep it like this.
if all(
[
self._project.gh_token != "",
@@ -155,23 +180,18 @@ def set_project(self, project_: Project) -> None:
if self._project.src_type == SOURCE.ZIP:
self._project.update_ss()

def start_calculations(self) -> None:
self._keep_running = True
logging.warn(f"Background Processings is Starting")

def stop_calculations(self) -> None:
self._keep_running = False
logging.warn(f"Background Processings will Stop. Please wait!")

def open_project(self) -> None:
pass

def save_project(self) -> None:
pass

def close_project(self) -> None:
pass

def download_zip_file(self) -> None:
if self._keep_running:
rm_dir_tree(self._project.output, delete_root_=False)
self._crawler = Crawler(loc_=self._project.zip_file_url, type_=URL.ZIP)
self._crawler = Crawler(loc_=self._project.zip_file_url, typ_=URL.ZIP)
self._crawler.fetch()
self._crawler.save(full_output_folder=self._project.output)

@@ -270,7 +290,7 @@ def add_404_page(self) -> None:
if self._keep_running:
self._crawler = Crawler(
loc_=self._project._404_url,
type_=URL.HTML,
typ_=URL.HTML,
scheme_=self._project.scheme,
)
self._crawler.fetch()
@@ -297,25 +317,25 @@ def crawl_sitemap(self) -> None:
if self._keep_running:
self.crawl_url(loc_=sitemap_path)

def crawl_url(self, loc_) -> None:
current_url = Crawler(loc_=loc_, scheme_=self._project.scheme)
if current_url.hash not in self._urls:
current_url.fetch()
full_output_path = current_url.save(
def crawl_url(self, loc_: str) -> None:
current_crawler = Crawler(loc_=loc_, scheme_=self._project.scheme)
if current_crawler.hash not in self._urls:
time.sleep(self._project.delay + random.random() / 100)
current_crawler.fetch()
full_output_path = current_crawler.save(
self._project.output, dst_url=self._project.dst_url
)
self._urls[current_url._hash] = current_url
self._urls[current_crawler._hash] = current_crawler

custom_message = "Saved"
if current_url.status_code >= 400 or current_url._type == URL.NONE:
if current_crawler.status_code >= 400 or current_crawler._typ == URL.NONE:
custom_message = "Ignored"

logging.info(
f"{custom_message}: {current_url.status_code} {current_url._type} {full_output_path}"
f"{custom_message}: {current_crawler.status_code} {current_crawler._typ} {full_output_path}"
)

for internal_link in current_url.internal_links:
time.sleep(self._project.delay + random.random() / 100)
for internal_link in current_crawler.internal_links:
if self._keep_running:
self.crawl_url(internal_link)

@@ -326,6 +346,7 @@ def verify_project_name(self) -> bool:

def verify_src_url(self) -> bool:
logging.info(f"Verifying Source Url!")
# TODO: replace with urllib implementation ???
current_url = Crawler(loc_=self._project.src_url, scheme_=self._project.scheme)
current_url.fetch()
return current_url.status_code < 399 # non error status codes
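The new `Workflow` properties make the project, crawler, redirects, search, crawled URL map, and GitHub handle readable from outside (and from the embedded console), and `crawl_url()` now sleeps for `project.delay` plus a small random jitter before each fetch. A sketch of inspecting a freshly constructed workflow; nothing here starts a crawl:

```python
from staticwordpress.core.workflow import Workflow

wf = Workflow()

# Read-only views of internal state added in this change.
print(type(wf.project).__name__)  # Project
print(len(wf.urls))               # 0 before any crawl; keyed by URL hash afterwards
print(len(wf.redirects.items))    # 0 until redirects are collected
print(wf.crawler.typ)             # URL.NONE for the placeholder crawler
print(wf.github)                  # None until a GitHub upload is configured
```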
16 changes: 14 additions & 2 deletions src/staticwordpress/gui/editor.py
@@ -24,10 +24,18 @@
"""


# +++++++++++++++++++++++++++++++++++++++++++++++++++++
# INTERNAL IMPORTS
# +++++++++++++++++++++++++++++++++++++++++++++++++++++

from collections import namedtuple


# +++++++++++++++++++++++++++++++++++++++++++++++++++++
# 3rd PARTY LIBRARY IMPORTS
# +++++++++++++++++++++++++++++++++++++++++++++++++++++

from PyQt5.QtCore import QSize
from qtconsole.rich_jupyter_widget import RichJupyterWidget
from qtconsole.inprocess import QtInProcessKernelManager
from IPython.lib import guisupport
@@ -39,7 +47,7 @@


class SWIPythonWidget(RichJupyterWidget):
def __init__(self, interface_: dict = {"iface": None}, *args, **kwargs):
def __init__(self, interface_: dict = {}, *args, **kwargs):
super(SWIPythonWidget, self).__init__(*args, **kwargs)

self.ipython_kernal_manager = QtInProcessKernelManager()
@@ -51,11 +59,15 @@ def __init__(self, interface_: dict = {"iface": None}, *args, **kwargs):
for module in import_custom_modules:
self._execute(module, hidden=True)

self.ipython_kernal_manager.kernel.shell.push(interface_)
SWInterface = namedtuple("SWInterface", interface_.keys())(**interface_)
self.ipython_kernal_manager.kernel.shell.push({"iface": SWInterface})

def stop():
self.kernel_client.stop_channels()
self.ipython_kernal_manager.shutdown_kernel()
guisupport.get_app_qt4().exit()

self.exit_requested.connect(stop)

def sizeHint(self):
return QSize(620, 75)
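Wrapping the `interface_` dict in a namedtuple means the console now receives a single `iface` object whose attributes are the dict's keys, instead of each key being pushed as a separate shell variable. A standalone sketch of the same trick with placeholder keys and values:

```python
from collections import namedtuple

# Mirrors what SWIPythonWidget now pushes into the kernel as `iface`.
interface_ = {"project": "demo-project", "workflow": "demo-workflow"}  # placeholders
iface = namedtuple("SWInterface", interface_.keys())(**interface_)

print(iface.project)   # "demo-project"
print(iface.workflow)  # "demo-workflow"
```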