keepcosmos · Valian · Nov 5, 2024 · Nov 2, 2024
diff --git a/README.md b/README.md
@@ -40,6 +40,9 @@ summary = Readability.summarize(url)
 summary.title
 #=> "Why I’m betting on Elixir"
 
+summary.published_at
+#=> ~U[2015-02-23 16:53:27.006Z]
+
 summary.authors
 #=> ["Ken Mazaika"]
 
@@ -62,6 +65,9 @@ summary.article_text
 ### Extract the title.
 Readability.title(html)
 
+### Extract the published-at date.
+Readability.published_at(html)
+
 ### Extract authors.
 Readability.authors(html)
 
@@ -93,11 +99,11 @@ url = "https://medium.com/@kenmazaika/why-im-betting-on-elixir-7c8f847b58"
 summary = Readability.summarize(url, [clean_conditionally: false])
 ```
 
-* `:min_text_length` \\\\ 25
-* `:remove_unlikely_candidates` \\\\ true
-* `:weight_classes` \\\\ true
-* `:clean_conditionally` \\\\ true
-* `:retry_length` \\\\ 250
+- `:min_text_length` \\\\ 25
+- `:remove_unlikely_candidates` \\\\ true
+- `:weight_classes` \\\\ true
+- `:clean_conditionally` \\\\ true
+- `:retry_length` \\\\ 250
 
 **You can find other algorithm and regex options in `readability.ex`**
 
@@ -109,16 +115,17 @@ To run the test suite:
 
 ## Todo
 
-* [x] Extract authors
-* [x] More configurable
-* [x] Summarize function
-* [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
+- [x] Extract authors
+- [x] More configurable
+- [x] Summarize function
+- [ ] Convert relative paths into absolute paths of `img#src` and `a#href`
 
 ## Contributions are welcome!
 
 Check out [the main features milestone](https://github.com/keepcosmos/readability/milestones) and features of related projects below
 
 **Contributing**
+
 1. **Fork** the repo on GitHub
 2. **Clone** the project to your own machine
 3. **Commit** changes to your own branch
@@ -127,12 +134,11 @@ Check out [the main features milestone](https://github.com/keepcosmos/readabilit
 
 NOTE: Be sure to merge the latest from "upstream" before making a pull request!
 
-
 ## Related and Inspired Projects
 
-* [readability.js](https://github.com/mozilla/readability) is a standalone version of the readability library used for Firefox Reader View.
-* [newspaper](https://github.com/codelucas/newspaper) is an advanced news extraction, article extraction, and content curation library for Python.
-* [ruby-readability](https://github.com/cantino/ruby-readability) is a tool for extracting the primary readable content of a webpage.
+- [readability.js](https://github.com/mozilla/readability) is a standalone version of the readability library used for Firefox Reader View.
+- [newspaper](https://github.com/codelucas/newspaper) is an advanced news extraction, article extraction, and content curation library for Python.
+- [ruby-readability](https://github.com/cantino/ruby-readability) is a tool for extracting the primary readable content of a webpage.
 
 ## Copyright and License
 

diff --git a/lib/readability.ex b/lib/readability.ex
@@ -13,6 +13,9 @@ defmodule Readability do
   # Extract title
   Readability.title(html)
 
+  # Extract published at
+  Readability.published_at(html)
+
   # Extract authors.
   Readability.authors(html)
 
@@ -31,6 +34,7 @@ defmodule Readability do
   alias Readability.ArticleBuilder
   alias Readability.AuthorFinder
   alias Readability.Helper
+  alias Readability.PublishedAtFinder
   alias Readability.Summary
   alias Readability.TitleFinder
 
@@ -91,6 +95,7 @@ defmodule Readability do
         %Summary{
           title: title(html_tree),
           authors: authors(html_tree),
+          published_at: published_at(html_tree),
           article_html: readable_html(article_tree),
           article_text: readable_text(article_tree)
         }
@@ -166,6 +171,24 @@ defmodule Readability do
   def authors(html) when is_binary(html), do: html |> Floki.parse_document!() |> authors
   def authors(html_tree), do: AuthorFinder.find(html_tree)
 
+  @doc """
+  Extract the published-at timestamp: a `DateTime`, a `Date`, or `nil` if none is found.
+
+  ## Example
+
+      iex> datetime = Readability.published_at(html_str)
+      %DateTime{}
+
+  """
+  @spec published_at(binary | html_tree) :: DateTime.t() | Date.t() | nil
+  def published_at(raw_html) when is_binary(raw_html) do
+    # Use the bang variant, as authors/1 does: the finder needs the bare tree,
+    # not the `{:ok, tree}` tuple returned by Floki.parse_document/1.
+    raw_html |> Floki.parse_document!() |> published_at()
+  end
+
+  def published_at(html_tree), do: PublishedAtFinder.find(html_tree)
+
   @doc """
   Using a variety of metrics (content score, classname, element types), find the content that is
   most likely to be the stuff a user wants to read.

diff --git a/lib/readability/published_at_finder.ex b/lib/readability/published_at_finder.ex
@@ -0,0 +1,67 @@
+defmodule Readability.PublishedAtFinder do
+  @moduledoc """
+  Extract the publication date of an article from its parsed HTML tree.
+
+  Candidates are read, in order, from publication meta tags, `<time datetime>`
+  elements and `data-datetime` attributes; only ISO 8601 values are understood.
+  """
+
+  @type html_tree :: tuple | list
+
+  @strategies [:meta_tag, :time_element, :data_attribute]
+
+  @doc """
+  Return the first candidate that parses as an ISO 8601 datetime or date.
+
+  Blank or unparseable candidates are skipped, so a later candidate or
+  strategy can still supply the value. Returns `nil` when nothing parses.
+  """
+  @spec find(html_tree) :: DateTime.t() | Date.t() | nil
+  def find(html_tree) do
+    Enum.find_value(@strategies, fn strategy ->
+      strategy
+      |> candidates(html_tree)
+      |> Enum.find_value(&parse/1)
+    end)
+  end
+
+  # Trimmed `content` values of the publication meta tags, in document order.
+  defp candidates(:meta_tag, html_tree) do
+    selector = "meta[property='article:published_time'], meta[property='article:published']"
+
+    html_tree
+    |> Floki.attribute(selector, "content")
+    |> Enum.map(&String.trim/1)
+  end
+
+  # Trimmed `datetime` attributes of every `<time>` element.
+  defp candidates(:time_element, html_tree) do
+    html_tree
+    |> Floki.attribute("time", "datetime")
+    |> Enum.map(&String.trim/1)
+  end
+
+  # Trimmed `data-datetime` attributes of any element carrying one.
+  defp candidates(:data_attribute, html_tree) do
+    html_tree
+    |> Floki.attribute("[data-datetime]", "data-datetime")
+    |> Enum.map(&String.trim/1)
+  end
+
+  # Prefer a full datetime; fall back to a plain date. `nil` if neither fits.
+  defp parse(value), do: parse(:datetime, value) || parse(:date, value)
+
+  defp parse(:datetime, value) do
+    case DateTime.from_iso8601(value) do
+      {:ok, datetime, _offset} -> datetime
+      _ -> nil
+    end
+  end
+
+  defp parse(:date, value) do
+    case Date.from_iso8601(value) do
+      {:ok, date} -> date
+      _ -> nil
+    end
+  end
+end
diff --git a/lib/readability/summary.ex b/lib/readability/summary.ex
@@ -1,4 +1,4 @@
 defmodule Readability.Summary do
   @moduledoc false
-  defstruct title: nil, authors: [], article_html: nil, article_text: nil
+  defstruct title: nil, authors: [], article_html: nil, article_text: nil, published_at: nil
 end
diff --git a/test/readability/published_at_finder_test.exs b/test/readability/published_at_finder_test.exs
@@ -0,0 +1,33 @@
+defmodule Readability.PublishedAtFinderTest do
+  use ExUnit.Case, async: true
+
+  alias Readability.PublishedAtFinder
+
+  test "extracting bbc format published at" do
+    # This fixture is expected to yield no published-at value.
+    tree = TestHelper.read_parse_fixture("bbc.html")
+    assert is_nil(PublishedAtFinder.find(tree))
+  end
+
+  test "extracting buzzfeed format published at" do
+    # This fixture is expected to yield no published-at value.
+    tree = TestHelper.read_parse_fixture("buzzfeed.html")
+    assert is_nil(PublishedAtFinder.find(tree))
+  end
+
+  test "extracting elixir format published at" do
+    # This fixture is expected to yield no published-at value.
+    tree = TestHelper.read_parse_fixture("elixir.html")
+    assert is_nil(PublishedAtFinder.find(tree))
+  end
+
+  test "extracting medium format published at" do
+    published_at = "medium.html" |> TestHelper.read_parse_fixture() |> PublishedAtFinder.find()
+    assert published_at == ~U[2015-01-31 22:58:05.645Z]
+  end
+
+  test "extracting nytimes format published at" do
+    published_at = "nytimes.html" |> TestHelper.read_parse_fixture() |> PublishedAtFinder.find()
+    assert published_at == ~D[2016-03-16]
+  end
+end