Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add published_at to the extracted attributes. #58

Merged
merged 1 commit into from
Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 19 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ summary = Readability.summarize(url)
summary.title
#=> "Why I’m betting on Elixir"

summary.published_at
#=> ~U[2015-02-23 16:53:27.006Z]

summary.authors
#=> ["Ken Mazaika"]

Expand All @@ -62,6 +65,9 @@ summary.article_text
### Extract the title.
Readability.title(html)

### Extract the publication date.
Readability.published_at(html)

### Extract authors.
Readability.authors(html)

Expand Down Expand Up @@ -93,11 +99,11 @@ url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
summary = Readability.summarize(url, [clean_conditionally: false])
```

* `:min_text_length` \\\\ 25
* `:remove_unlikely_candidates` \\\\ true
* `:weight_classes` \\\\ true
* `:clean_conditionally` \\\\ true
* `:retry_length` \\\\ 250
- `:min_text_length` \\\\ 25
- `:remove_unlikely_candidates` \\\\ true
- `:weight_classes` \\\\ true
- `:clean_conditionally` \\\\ true
- `:retry_length` \\\\ 250

**You can find other algorithm and regex options in `readability.ex`**

Expand All @@ -109,16 +115,17 @@ To run the test suite:

## Todo

* [x] Extract authors
* [x] More configurable
* [x] Summarize function
* [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
- [x] Extract authors
- [x] More configurable
- [x] Summarize function
- [ ] Convert relative paths into absolute paths of `img#src` and `a#href`

## Contributions are welcome!

Check out [the main features milestone](https://github.com/keepcosmos/readability/milestones) and features of related projects below

**Contributing**

1. **Fork** the repo on GitHub
2. **Clone** the project to your own machine
3. **Commit** changes to your own branch
Expand All @@ -127,12 +134,11 @@ Check out [the main features milestone](https://github.com/keepcosmos/readabilit

NOTE: Be sure to merge the latest from "upstream" before making a pull request!


## Related and Inspired Projects

* [readability.js](https://github.com/mozilla/readability) is a standalone version of the readability library used for Firefox Reader View.
* [newspaper](https://github.com/codelucas/newspaper) is an advanced news extraction, article extraction, and content curation library for Python.
* [ruby-readability](https://github.com/cantino/ruby-readability) is a tool for extracting the primary readable content of a webpage.
- [readability.js](https://github.com/mozilla/readability) is a standalone version of the readability library used for Firefox Reader View.
- [newspaper](https://github.com/codelucas/newspaper) is an advanced news extraction, article extraction, and content curation library for Python.
- [ruby-readability](https://github.com/cantino/ruby-readability) is a tool for extracting the primary readable content of a webpage.

## Copyright and License

Expand Down
23 changes: 23 additions & 0 deletions lib/readability.ex
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ defmodule Readability do
# Extract title
Readability.title(html)

# Extract the publication date.
Readability.published_at(html)

# Extract authors.
Readability.authors(html)

Expand All @@ -31,6 +34,7 @@ defmodule Readability do
alias Readability.ArticleBuilder
alias Readability.AuthorFinder
alias Readability.Helper
alias Readability.PublishedAtFinder
alias Readability.Summary
alias Readability.TitleFinder

Expand Down Expand Up @@ -91,6 +95,7 @@ defmodule Readability do
%Summary{
title: title(html_tree),
authors: authors(html_tree),
published_at: published_at(html_tree),
article_html: readable_html(article_tree),
article_text: readable_text(article_tree)
}
Expand Down Expand Up @@ -166,6 +171,24 @@ defmodule Readability do
# Raw HTML is parsed into a Floki document first and then dispatched to the
# tree clause.
def authors(html) when is_binary(html) do
  html
  |> Floki.parse_document!()
  |> authors()
end

# Anything that is not a binary is treated as an already parsed tree and is
# handed to the author finder unchanged.
def authors(html_tree) do
  AuthorFinder.find(html_tree)
end

@doc """
Extract the publication date of an article.

Accepts either a raw HTML string or an already parsed HTML tree. A raw string
is parsed with `Floki.parse_document!/1` before the lookup; a tree is passed
to `Readability.PublishedAtFinder.find/1` as is.

Returns a `DateTime`, a `Date`, or `nil`, exactly as returned by
`Readability.PublishedAtFinder.find/1`.

## Example

    iex> datetime = Readability.published_at(html_str)
    %DateTime{}

"""
@spec published_at(binary | html_tree) :: %DateTime{} | %Date{} | nil
def published_at(raw_html) when is_binary(raw_html) do
  # Use the bang variant, like `authors/1` does: `Floki.parse_document/1`
  # returns `{:ok, tree}`, and piping that tagged tuple on would hand the
  # finder a tuple that is not an HTML tree.
  raw_html
  |> Floki.parse_document!()
  |> published_at()
end

def published_at(html_tree), do: PublishedAtFinder.find(html_tree)

@doc """
Using a variety of metrics (content score, classname, element types), find the content that is
most likely to be the stuff a user wants to read.
Expand Down
67 changes: 67 additions & 0 deletions lib/readability/published_at_finder.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
defmodule Readability.PublishedAtFinder do
  @moduledoc """
  Extract the publication date of an article from a parsed HTML tree.

  Candidate values are looked up in this order:

    1. the `content` of `meta[property='article:published_time']` or
       `meta[property='article:published']`
    2. the `datetime` attribute of `time` elements
    3. `data-datetime` attributes

  For each source only the first value found is considered.
  """

  @type html_tree :: tuple | list

  # Lookup sources, in priority order.
  @strategies [:meta_tag, :time_element, :data_attribute]

  @doc """
  Extract the publication date.

  Walks the lookup sources in priority order and returns the first candidate
  that parses as an ISO 8601 datetime (`DateTime`) or, failing that, as an
  ISO 8601 date (`Date`). A source whose candidate is missing, empty or not
  parseable is skipped, so it cannot hide a valid value from a later source.

  Returns `nil` when no source yields a parseable value.
  """
  @spec find(html_tree) :: %DateTime{} | %Date{} | nil
  def find(html_tree) do
    # Parsing happens inside `find_value/2` on purpose: a nil parse result
    # makes the search continue with the next strategy.
    Enum.find_value(@strategies, fn strategy ->
      strategy
      |> candidate(html_tree)
      |> parse()
    end)
  end

  # Returns the first trimmed raw value for the given strategy, or nil.
  defp candidate(:meta_tag, html_tree) do
    selector = "meta[property='article:published_time'], meta[property='article:published']"

    html_tree
    |> Floki.attribute(selector, "content")
    |> first_trimmed()
  end

  defp candidate(:time_element, html_tree) do
    html_tree
    |> Floki.attribute("time", "datetime")
    |> first_trimmed()
  end

  defp candidate(:data_attribute, html_tree) do
    html_tree
    |> Floki.attribute("[data-datetime]", "data-datetime")
    |> first_trimmed()
  end

  # Only the first value is ever used, so only that one is trimmed.
  defp first_trimmed([]), do: nil
  defp first_trimmed([value | _rest]), do: String.trim(value)

  # Tries a full datetime first and falls back to a plain date.
  defp parse(nil), do: nil
  defp parse(value), do: parse(:datetime, value) || parse(:date, value)

  defp parse(:datetime, value) do
    case DateTime.from_iso8601(value) do
      {:ok, datetime, _offset} -> datetime
      _ -> nil
    end
  end

  defp parse(:date, value) do
    case Date.from_iso8601(value) do
      {:ok, date} -> date
      _ -> nil
    end
  end
end
2 changes: 1 addition & 1 deletion lib/readability/summary.ex
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
defmodule Readability.Summary do
@moduledoc false
defstruct title: nil, authors: [], article_html: nil, article_text: nil
defstruct title: nil, authors: [], article_html: nil, article_text: nil, published_at: nil
end
33 changes: 33 additions & 0 deletions test/readability/published_at_finder_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
defmodule Readability.PublishedAtFinderTest do
  use ExUnit.Case, async: true

  alias Readability.PublishedAtFinder

  # Reads and parses the named fixture, then runs the finder on the result.
  defp published_at_in(fixture_name) do
    fixture_name
    |> TestHelper.read_parse_fixture()
    |> PublishedAtFinder.find()
  end

  test "extracting bbc format published at" do
    assert is_nil(published_at_in("bbc.html"))
  end

  test "extracting buzzfeed format published at" do
    assert is_nil(published_at_in("buzzfeed.html"))
  end

  test "extracting elixir format published at" do
    assert is_nil(published_at_in("elixir.html"))
  end

  test "extracting medium format published at" do
    expected = ~U[2015-01-31 22:58:05.645Z]

    assert published_at_in("medium.html") == expected
  end

  test "extracting nytimes format published at" do
    expected = ~D[2016-03-16]

    assert published_at_in("nytimes.html") == expected
  end
end
Loading