Skip to content

Commit

Permalink
Implement media resolution
Browse files Browse the repository at this point in the history
  • Loading branch information
freddyheppell committed Nov 17, 2023
1 parent f1ad368 commit e975d4d
Show file tree
Hide file tree
Showing 12 changed files with 157 additions and 17 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ The scrape should be in a 'merged pages' format, i.e. the pages of the list endp
The following files should be placed in a directory. Their names may be prefixed by a consistent string (e.g. to record the date).

| File Name | Endpoint |
| ----------------- | -------------------------------------- |
|-------------------|----------------------------------------|
| `categories.json` | [`/wp/v2/categories`][categories_path] |
| `comments.json` | [`/wp/v2/comments`][comments_path] |
| `media.json` | [`/wp/v2/media`][media_path] |
Expand Down
5 changes: 3 additions & 2 deletions src/extractor/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from pandas import DataFrame

from extractor.extractors.categories import load_categories
from extractor.extractors.data.links import LinkRegistry
from extractor.extractors.io import export_df
from extractor.extractors.media import load_media
Expand Down Expand Up @@ -95,15 +96,15 @@ def _extract_posts(self):

def _extract_media(self):
json_file = self.json_root / self._prefix_filename("media.json")
self.media = load_media(json_file)
self.media = load_media(json_file, self.link_registry)

def _extract_tags(self):
    """Load the (optionally prefixed) ``tags.json`` file into ``self.tags``.

    The link registry held on this instance is handed to the loader.
    """
    tags_path = self.json_root / self._prefix_filename("tags.json")
    self.tags = load_tags(tags_path, self.link_registry)

def _extract_categories(self):
json_file = self.json_root / self._prefix_filename("categories.json")
self.categories = load_tags(json_file, self.link_registry)
self.categories = load_categories(json_file, self.link_registry)

def _extract_users(self):
json_file = self.json_root / self._prefix_filename("users.json")
Expand Down
2 changes: 1 addition & 1 deletion src/extractor/extractors/categories.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def load_categories(path: Path, link_registry: LinkRegistry) -> Optional[pd.Data
categories_df = categories_df[categories_df.columns.intersection(EXPORT_COLUMNS)]

link_registry.add_linkables(
"tag", categories_df["link"].to_list(), categories_df.index.to_list()
"category", categories_df["link"].to_list(), categories_df.index.to_list()
)

return categories_df
54 changes: 52 additions & 2 deletions src/extractor/extractors/data/images.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import logging
import re
from dataclasses import dataclass
from typing import Optional
from typing import List, Optional

from extractor.extractors.data.links import Linkable
from extractor.extractors.data.links import Linkable, LinkRegistry


@dataclass
Expand All @@ -18,3 +20,51 @@ class ResolvableMediaUse(MediaUse):
"""An instance of media that can be resolved against known media."""

destination: Optional[Linkable] = None


def resolve_image(
    registry: LinkRegistry, image: ResolvableMediaUse
) -> ResolvableMediaUse:
    """Resolve the internal links of a media use.

    The image URL is first looked up exactly as written. If that finds
    nothing, a WordPress-style size suffix (``-<width>x<height>`` directly
    before a dot) is removed and the lookup is retried, so that thumbnail
    URLs resolve to the media item they were generated from.

    Args:
        registry: A filled link registry
        image: A media use

    Returns:
        The same media use object. Its ``destination`` is set in place when
        the registry knows the image; otherwise it is returned unchanged.
    """
    # Already resolved: leave it alone.
    if image.destination is not None:
        return image

    # Only URLs that contain "wp-content" are looked up.
    if "wp-content" not in image.src:
        return image

    # Try the URL as written first, so that an upload whose own file name
    # ends in something like "-1920x1080" is not rewritten before lookup.
    src = image.src
    linkable = registry.query_link(src)

    if linkable is None:
        # Remove dimensions from image URL
        # e.g. test-image-300x200.jpg -> test-image.jpg
        # Any digit count is accepted so sizes such as 300x75 or 96x96
        # (not just 3-4 digit dimensions) are stripped as well.
        src = re.sub(r"-\d+x\d+\.", ".", image.src)
        if src != image.src:
            linkable = registry.query_link(src)

    if linkable is None:
        logging.debug(f'Could not resolve image "{src}"')
        return image

    image.destination = linkable

    return image


def resolve_images(
    registry: LinkRegistry, images: List[ResolvableMediaUse]
) -> List[ResolvableMediaUse]:
    """Resolve every media use in a list against the link registry.

    Args:
        registry: A filled link registry
        images: A list of media uses

    Returns:
        A new list holding the result of ``resolve_image`` for each input
        media use, in the same order.
    """
    resolved: List[ResolvableMediaUse] = []
    for media_use in images:
        resolved.append(resolve_image(registry, media_use))
    return resolved
10 changes: 8 additions & 2 deletions src/extractor/extractors/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pandas as pd
from bs4 import Tag

from extractor.extractors.data.links import LinkRegistry
from extractor.extractors.io import load_df
from extractor.parse.html import extract_html_text

Expand Down Expand Up @@ -33,6 +34,7 @@
"modified_gmt",
"post",
"slug",
"source_url",
"title.rendered",
"title.text",
"yoast_head_json.og_url",
Expand All @@ -46,19 +48,19 @@
"post": "post_id",
"title.rendered": "title.html",
"yoast_head_json.title": "page_title",
"media_details.file": "file_path",
"media_details.parent_image.attachment_id": "parent_image_id",
"yoast_head_json.og_url": "og_url",
}


def load_media(path: Path) -> Optional[pd.DataFrame]:
def load_media(path: Path, link_registry: LinkRegistry) -> Optional[pd.DataFrame]:
"""Load media from a JSON file.
The JSON file is expected to be in the response format of the WordPress media API.
Args:
path: The path to the JSON file
link_registry: A link registry to populate
Returns:
A dataframe of the media
Expand Down Expand Up @@ -88,6 +90,10 @@ def load_media(path: Path) -> Optional[pd.DataFrame]:

media_df = media_df.rename(columns=RENAME_COLUMNS)

link_registry.add_linkables(
"media", media_df["source_url"].to_list(), media_df.index.to_list()
)

return media_df


Expand Down
18 changes: 18 additions & 0 deletions src/extractor/extractors/posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pandas import DataFrame
from tqdm.auto import tqdm

from extractor.extractors.data.images import resolve_images
from extractor.extractors.data.link_resolver import resolve_links
from extractor.extractors.data.links import LinkRegistry
from extractor.extractors.io import load_df
Expand Down Expand Up @@ -141,6 +142,23 @@ def resolve_post_links(registry: LinkRegistry, posts_df: DataFrame) -> DataFrame
return posts_df


def resolve_post_media(registry: LinkRegistry, posts_df: DataFrame) -> DataFrame:
    """Resolve the images of every post against the link registry.

    Args:
        registry: A filled link registry
        posts_df: The processed posts dataframe

    Returns:
        The same dataframe, with its ``images`` column replaced by the
        resolved media uses.
    """

    def _resolve_row(media):
        # One cell of the "images" column is passed to the resolver whole.
        return resolve_images(registry, media)

    resolved_column = posts_df["images"].apply(_resolve_row)
    posts_df["images"] = resolved_column

    return posts_df


def ensure_translations_undirected(posts_df: DataFrame) -> DataFrame:
"""Create translation relationships if they are not bidirectional.
Expand Down
30 changes: 30 additions & 0 deletions tests/extractors/data/test_images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from extractor.extractors.data.images import ResolvableMediaUse, resolve_image
from extractor.extractors.data.links import LinkRegistry


def test_image_resolver():
    """An image URL registered as media resolves to that registry entry."""
    image_url = "https://example.org/wp-content/uploads/2022/12/test-image.jpg"

    registry = LinkRegistry()
    registry.add_linkable(image_url, "media", 1)

    media_use = ResolvableMediaUse(image_url, "alt", None)
    resolved = resolve_image(registry, media_use)

    assert resolved.destination == registry.links[0]


def test_image_resolver_with_dimensions():
    """A URL carrying a ``-300x200`` size suffix resolves to the registered image."""
    upload_dir = "https://example.org/wp-content/uploads/2022/12/"

    registry = LinkRegistry()
    registry.add_linkable(upload_dir + "test-image.jpg", "media", 1)

    sized_use = ResolvableMediaUse(upload_dir + "test-image-300x200.jpg", "alt", None)
    resolved = resolve_image(registry, sized_use)

    assert resolved.destination == registry.links[0]
22 changes: 20 additions & 2 deletions tests/extractors/test_media.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,20 @@
import pandas as pd
import pytest

from extractor.extractors.data.links import LinkRegistry
from extractor.extractors.media import load_media


@pytest.fixture
def media_df(datadir):
return load_media(datadir / "media.json")
def media_df_and_registry(datadir):
link_registry = LinkRegistry()
return load_media(datadir / "media.json", link_registry), link_registry


@pytest.fixture
def media_df(media_df_and_registry):
    """Provide only the dataframe half of the ``media_df_and_registry`` pair."""
    frame, _registry = media_df_and_registry
    return frame


def test_media_times(media_df):
Expand Down Expand Up @@ -45,3 +53,13 @@ def test_title_extraction(media_df):

def test_columns_removed(media_df):
assert "_links" not in media_df.columns


def test_adds_link_registry(media_df_and_registry):
    """Loading the media fixture registers exactly one link, for the image URL."""
    _, registry = media_df_and_registry
    expected_url = "https://example.org/wp-content/uploads/2022/12/test-image.jpg"

    assert len(registry.links) == 1

    only_link = registry.links[0]
    assert only_link.link == expected_url
12 changes: 6 additions & 6 deletions tests/extractors/test_media/media.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"slug": "test-image",
"status": "inherit",
"type": "attachment",
"link": "https://waronfakes.com/some-post/attachment/test-image/",
"link": "https://example.org/some-post/attachment/test-image/",
"title": {
"rendered": "test-image"
},
Expand All @@ -29,22 +29,22 @@
},
"og_locale": "en_US",
"og_type": "article",
"og_title": "40-5 - waronfakes.com",
"og_title": "40-5 - example.org",
"og_url": "https://example.org/wp-content/uploads/2022/12/test-image.jpg",
"og_site_name": "waronfakes.com",
"og_site_name": "example.org",
"og_image": [
{
"width": 500,
"height": 500,
"url": "https://waronfakes.com/wp-content/uploads/2022/12/test-image.jpg",
"url": "https://example.org/wp-content/uploads/2022/12/test-image.jpg",
"type": "image/jpeg"
}
],
"twitter_card": "summary_large_image",
"schema": "..."
},
"description": {
"rendered": "<p class=\"attachment\"><a href='https://waronfakes.com/wp-content/uploads/2022/12/test-image.jpg'><img /></a> Some Text</p>\n"
"rendered": "<p class=\"attachment\"><a href='https://example.org/wp-content/uploads/2022/12/test-image.jpg'><img /></a> Some Text</p>\n"
},
"caption": {
"rendered": "<p>Some <em>caption text</em></p>"
Expand Down Expand Up @@ -79,7 +79,7 @@
"width": 500,
"height": 500,
"mime_type": "image/jpeg",
"source_url": "https://waronfakes.com/wp-content/uploads/2022/12/test-image.jpg"
"source_url": "https://example.org/wp-content/uploads/2022/12/test-image.jpg"
}
},
"image_meta": {
Expand Down
16 changes: 16 additions & 0 deletions tests/extractors/test_posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from extractor.extractors.posts import (
ensure_translations_undirected,
load_posts,
resolve_post_media,
resolve_post_translations,
)
from extractor.parse.translations._resolver import TranslationLink
Expand Down Expand Up @@ -183,3 +184,18 @@ def test_translations_bidirectional(posts_df_and_registry):
assert posts_df.loc[2]["translations"][0].destination == Linkable(
link="https://example.org/an-example-post/", data_type="post", idx=1
)


def test_resolves_media(posts_df_and_registry):
    """The first image of post 1 resolves to the media item registered for its URL."""
    posts_df, registry = posts_df_and_registry
    media_url = "https://example.org/wp-content/uploads/2022/12/test-image.jpg"
    registry.add_linkable(media_url, "media", 1)

    posts_df = resolve_post_media(registry, posts_df)

    first_image = posts_df.loc[1]["images"][0]
    expected = Linkable(link=media_url, data_type="media", idx=1)
    assert first_image.destination == expected
2 changes: 1 addition & 1 deletion tests/extractors/test_posts/posts.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"rendered": "An Example Post"
},
"content": {
"rendered": "<p>This is an example post.</p><p>It has two paragraphs</p>"
"rendered": "<p>This is an example post.</p><p>It has two paragraphs</p><img src=\"https://example.org/wp-content/uploads/2022/12/test-image.jpg\" alt=\"an image\">"
},
"excerpt": {
"rendered": "<p>An excerpt about this post</p>"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@
<body>
<p>This is an example post</p>
<p>It has two paragraphs.</p>
<img src="https://example.org/wp-content/uploads/2022/12/test-image.jpg" alt="an image">
</body>
</html>

0 comments on commit e975d4d

Please sign in to comment.