Allow custom translation pickers to be supplied
freddyheppell committed Oct 13, 2023
1 parent efaf1ab commit 1c776d2
Showing 6 changed files with 61 additions and 17 deletions.
17 changes: 15 additions & 2 deletions src/extractor/extract.py
@@ -16,6 +16,7 @@
 )
 from extractor.extractors.tags import load_tags
 from extractor.extractors.users import load_users
+from extractor.parse.translations import PickerListType
 from extractor.scrape.crawler import ScrapeCrawl
 from extractor.util.file import prefix_filename

@@ -31,21 +32,28 @@ class WPExtractor:
     users: Optional[DataFrame]
     pages: Optional[DataFrame]
     scrape_url_mapping: Dict[str, Path]
+    translation_pickers: Optional[PickerListType]

     def __init__(
-        self, json_root: Path, scrape_root: Path, json_prefix: Optional[str] = None
+        self,
+        json_root: Path,
+        scrape_root: Path,
+        json_prefix: Optional[str] = None,
+        translation_pickers: Optional[PickerListType] = None,
     ):
         """Create a new extractor.

         Args:
             json_root: Path to directory of JSON files
             scrape_root: Path to scrape directory
             json_prefix: Prefix of files in ``json_root``
+            translation_pickers: Supply a custom list of translation pickers
         """
         self.json_root = json_root
         self.scrape_root = scrape_root
         self.json_prefix = json_prefix
         self.link_registry = LinkRegistry()
+        self.translation_pickers = translation_pickers

     def extract(self) -> None:
         """Perform the extraction."""
@@ -78,7 +86,12 @@ def _crawl_scrape(self):

     def _extract_posts(self):
         json_file = self.json_root / self._prefix_filename("posts.json")
-        self.posts = load_posts(json_file, self.link_registry, self.scrape_url_mapping)
+        self.posts = load_posts(
+            path=json_file,
+            link_registry=self.link_registry,
+            scrape_urls_files=self.scrape_url_mapping,
+            translation_pickers=self.translation_pickers,
+        )

     def _extract_media(self):
         json_file = self.json_root / self._prefix_filename("media.json")
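
Taken together, these changes let a caller thread a custom picker list through WPExtractor down to post extraction. A minimal usage sketch, assuming hypothetical directory paths and a hypothetical MyThemePicker subclass (only WPExtractor, LangPicker and the translation_pickers argument come from this commit):

from pathlib import Path

from extractor.extract import WPExtractor
from extractor.parse.translations import LangPicker


class MyThemePicker(LangPicker):
    """Hypothetical picker for a theme the built-in pickers don't recognise."""

    # matches() / extract() omitted here; see the picker sketch after the
    # _extractor.py changes below.


extractor = WPExtractor(
    json_root=Path("export/json"),        # placeholder path
    scrape_root=Path("export/scrape"),    # placeholder path
    translation_pickers=[MyThemePicker],  # omit or pass None to keep the defaults
)
extractor.extract()
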
11 changes: 8 additions & 3 deletions src/extractor/extractors/posts.py
@@ -11,7 +11,7 @@
 from extractor.extractors.io import load_df
 from extractor.parse.content import extract_content_data
 from extractor.parse.html import extract_html_text, parse_html
-from extractor.parse.translations import extract_translations
+from extractor.parse.translations import PickerListType, extract_translations
 from extractor.parse.translations._resolver import TranslationLink
 from extractor.scrape.scrape import load_scrape
 from extractor.util.locale import extract_locale
@@ -54,7 +54,10 @@


 def load_posts(
-    path: Path, link_registry: LinkRegistry, scrape_urls_files: Dict[str, Path]
+    path: Path,
+    link_registry: LinkRegistry,
+    scrape_urls_files: Dict[str, Path],
+    translation_pickers: Optional[PickerListType],
 ) -> Optional[pd.DataFrame]:
     """Load the posts from a JSON file.
@@ -64,6 +67,7 @@ def load_posts(
         path: The path to the JSON file
         link_registry: The Link Registry to populate
         scrape_urls_files: A dictionary of site URLs to scrape file paths
+        translation_pickers: Custom list of translation pickers.

     Returns:
         A dataframe of the posts.
@@ -99,7 +103,8 @@ def load_posts(
         lambda link: load_scrape(scrape_urls_files, link)
     )
     posts_df[["language", "translations"]] = posts_df.apply(
-        lambda r: extract_translations(r["scrape_bs"], r["link"]), axis=1
+        lambda r: extract_translations(r["scrape_bs"], r["link"], translation_pickers),
+        axis=1,
     )

     link_registry.add_linkables(
2 changes: 2 additions & 0 deletions src/extractor/parse/translations/__init__.py
@@ -1 +1,3 @@
+from ._extractor import PickerListType  # noqa: F401
 from ._extractor import extract_translations  # noqa: F401
+from ._pickers import LangPicker  # noqa: F401
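
These re-exports make the package root the supported import surface: extract_translations, the LangPicker base class and the PickerListType alias are all available without reaching into the underscore-prefixed modules. A small typing sketch, where MyThemePicker stands in for any user-defined subclass:

from extractor.parse.translations import LangPicker, PickerListType


class MyThemePicker(LangPicker):
    ...  # hypothetical subclass, as in the earlier sketch


# Picker lists hold the classes themselves, not instances;
# extract_translations instantiates each one with the parsed page document.
custom_pickers: PickerListType = [MyThemePicker]
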
27 changes: 18 additions & 9 deletions src/extractor/parse/translations/_extractor.py
@@ -1,40 +1,49 @@
 import logging
-from typing import Optional
+from typing import List, Optional, Type

 import pandas as pd
 from bs4 import BeautifulSoup

 import extractor.parse.translations._pickers as pickers

-RESOLVERS = [pickers.Polylang, pickers.GenericLangSwitcher]
+PICKERS = [pickers.Polylang, pickers.GenericLangSwitcher]
+PickerListType = List[Type[pickers.LangPicker]]

 PageTranslationData = pd.Series


 def extract_translations(
-    page_doc: Optional[BeautifulSoup], link: str
+    page_doc: Optional[BeautifulSoup],
+    link: str,
+    translation_pickers: Optional[PickerListType],
 ) -> PageTranslationData:
     """Get a list of URLs linked as translations.

     Args:
         page_doc: The full scrape document
         link: The link to the post
+        translation_pickers: A list of translation pickers to use

     Returns:
         The doc's language and list of translation links
     """
+    if translation_pickers is None:
+        translation_pickers = PICKERS
+
     if page_doc is None:
         return pd.Series([None, []])
-    for resolver_class in RESOLVERS:
-        resolver = resolver_class(page_doc)
+    for picker_class in translation_pickers:
+        picker = picker_class(page_doc)

-        if not resolver.matches():
+        if not picker.matches():
             continue

-        resolver.extract()
+        picker.extract()

-        return pd.Series([resolver.current_language, resolver.translations])
+        return pd.Series([picker.current_language, picker.translations])

-    logging.debug(f'No resolvers matched "{link}", unable to extract translations.')
+    logging.debug(
+        f'No translation pickers matched "{link}", unable to extract translations.'
+    )

     return pd.Series([None, []])
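
The rewritten loop also pins down the contract a custom picker must satisfy: it is constructed with the parsed page, matches() decides whether it applies, and after extract() its current_language and translations attributes are read. Below is a sketch of such a picker under those assumptions; the CSS selectors, the attribute handling and the concrete types stored in current_language and translations are illustrative, since the real LangPicker base class in _pickers.py is not part of this diff:

from bs4 import BeautifulSoup

from extractor.parse.translations import LangPicker, extract_translations


class FooterLangPicker(LangPicker):
    """Hypothetical picker for a theme that lists translations in a footer menu."""

    def __init__(self, page_doc: BeautifulSoup):
        # The real base class may already do this; repeated here to stay self-contained.
        self.page_doc = page_doc
        self.current_language = None
        self.translations = []

    def matches(self) -> bool:
        # Only claim the page if the theme's language menu is present.
        return self.page_doc.select_one("ul.footer-languages") is not None

    def extract(self) -> None:
        # Populate the two attributes extract_translations reads after a match.
        for anchor in self.page_doc.select("ul.footer-languages a[hreflang]"):
            if "current" in (anchor.get("class") or []):
                self.current_language = anchor["hreflang"]
            else:
                self.translations.append(anchor["href"])


html = """<ul class="footer-languages">
  <li><a class="current" hreflang="en" href="https://example.org/hello/">English</a></li>
  <li><a hreflang="fr" href="https://example.org/fr/bonjour/">Français</a></li>
</ul>"""

# Pass the custom picker list explicitly; passing None instead falls back to the
# built-in PICKERS (Polylang and GenericLangSwitcher).
language, translations = extract_translations(
    BeautifulSoup(html, "html.parser"),
    "https://example.org/hello/",
    [FooterLangPicker],
)
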
17 changes: 16 additions & 1 deletion src/extractor/scrape/crawler.py
@@ -8,6 +8,9 @@

 from extractor.scrape.processor import extract_self_url, self_url_strainer

+# Increment to invalidate old caches
+SCRAPE_CRAWL_VERSION = 1
+

 class ScrapeCrawl:
     """Crawl a scraped website to get original page URLs.
@@ -53,11 +56,23 @@ def _get_cache_path(self) -> Path:

     def _export(self):
         with open(self._get_cache_path(), "w") as f:
-            json.dump({"found": self.found_pages, "failed": self.failed_docs}, f)
+            json.dump(
+                {
+                    "found": self.found_pages,
+                    "failed": self.failed_docs,
+                    "version": SCRAPE_CRAWL_VERSION,
+                },
+                f,
+            )

     def _import(self):
         with open(self._get_cache_path(), "r") as f:
             data = json.load(f)
+
+        if "version" not in data or data["version"] != SCRAPE_CRAWL_VERSION:
+            logging.info("Scrape crawl cache is out of date, re-crawling.")
+            return
+
         self.found_pages = data["found"]
         self.failed_docs = data["failed"]
         self.crawled = True
4 changes: 2 additions & 2 deletions tests/extractors/test_posts.py
@@ -17,7 +17,7 @@
 from extractor.parse.translations._resolver import TranslationLink


-def mock_translation_extractor(post_bs: BeautifulSoup, link: str):
+def mock_translation_extractor(post_bs: BeautifulSoup, link: str, translation_pickers):
     id_meta = post_bs.find("meta", attrs={"name": "post_id_for_mock"})
     post_id = int(id_meta["content"])
     if post_id == 1:
@@ -73,7 +73,7 @@ def _scrape_path(slug):
 def posts_df_and_registry(_do_mock_translation_extractor, datadir, scrape_urls_files):
     link_registry = LinkRegistry()
     return (
-        load_posts(datadir / "posts.json", link_registry, scrape_urls_files),
+        load_posts(datadir / "posts.json", link_registry, scrape_urls_files, None),
         link_registry,
     )
