Skip to content

Commit

Permalink
WIP: GitHub Actions CI (#3)
Browse files Browse the repository at this point in the history
Add GitHub Actions-based CI. Formatted the code.
  • Loading branch information
oltarasenko authored Apr 8, 2024
1 parent d05c5ff commit 7622c9a
Show file tree
Hide file tree
Showing 11 changed files with 153 additions and 38 deletions.
5 changes: 5 additions & 0 deletions .formatter.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Used by "mix format"
[
inputs: ["{mix,.formatter}.exs", "{config,lib,test}/**/*.{ex,exs}"],
line_length: 80
]
29 changes: 29 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
name: Elixir GitHub Actions

on:
push:
branches:
- master
pull_request:
branches:
- '*'

jobs:
build:
runs-on: ubuntu-latest

steps:
- name: Checkout Repository
uses: actions/checkout@v3


- name: Setup Elixir
uses: erlef/setup-beam@v1
with:
version-file: .tool-versions
version-type: strict

- run: mix deps.get
- run: mix compile --all-warnings --warnings-as-errors
- run: mix format --check-formatted
- run: mix test
1 change: 1 addition & 0 deletions .tool-versions
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
elixir 1.16.2-otp-26
erlang 26.0.2
7 changes: 5 additions & 2 deletions lib/gollum.ex
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ defmodule Gollum do
:uncrawlable
```
"""
@spec crawlable?(binary, binary, keyword) :: :crawlable | :uncrawlable | :undefined
@spec crawlable?(binary, binary, keyword) ::
:crawlable | :uncrawlable | :undefined
def crawlable?(user_agent, url, opts \\ []) do
name = opts[:name] || Gollum.Cache

Expand All @@ -50,7 +51,9 @@ defmodule Gollum do
path = uri.path || "/"

case Cache.fetch(host, name: name) do
{:error, _} -> :crawlable
{:error, _} ->
:crawlable

:ok ->
host
|> Cache.get(name: name)
Expand Down
5 changes: 4 additions & 1 deletion lib/gollum/application.ex
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ defmodule Gollum.Application do
{Gollum.Cache, opts()}
]

Supervisor.start_link(children, strategy: :one_for_one, name: Gollum.Supervisor)
Supervisor.start_link(children,
strategy: :one_for_one,
name: Gollum.Supervisor
)
end
end
16 changes: 12 additions & 4 deletions lib/gollum/cache.ex
Original file line number Diff line number Diff line change
Expand Up @@ -125,8 +125,14 @@ defmodule Gollum.Cache do
@doc false
def handle_call({:fetch, host, fetch_opts}, from, {store, pending, opts}) do
case pending[host] do
nil -> do_possible_fetch({host, [{:from, from} | fetch_opts]}, {store, pending, opts})
froms -> {:noreply, {store, %{pending | host => [from | froms]}, opts}}
nil ->
do_possible_fetch(
{host, [{:from, from} | fetch_opts]},
{store, pending, opts}
)

froms ->
{:noreply, {store, %{pending | host => [from | froms]}, opts}}
end
end

Expand All @@ -151,9 +157,11 @@ defmodule Gollum.Cache do

with {:force, false} <- {:force, fetch_opts[:force] || @force},
{:exists, {_data, time}} <- {:exists, store[host]},
{:lazy_refresh, true} <- {:lazy_refresh, opts[:lazy_refresh] || @lazy_refresh},
{:lazy_refresh, true} <-
{:lazy_refresh, opts[:lazy_refresh] || @lazy_refresh},
refresh_secs = opts[:refresh_secs] || @refresh_secs,
{:refresh_secs, true} <- {:refresh_secs, cur_time - time > refresh_secs} do
{:refresh_secs, true} <-
{:refresh_secs, cur_time - time > refresh_secs} do
do_fetch({host, fetch_opts}, {store, pending, opts})
else
{:force, true} ->
Expand Down
8 changes: 6 additions & 2 deletions lib/gollum/fetcher.ex
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,24 @@ defmodule Gollum.Fetcher do
@spec fetch(binary, keyword) :: {:ok, binary} | {:error, term}
def fetch(domain, opts) do
headers = [
{"User-Agent", opts[:user_agent] || "Gollum"},
{"User-Agent", opts[:user_agent] || "Gollum"}
]

other_opts = [
follow_redirect: true,
ssl: [{:versions, [:'tlsv1.2']}],
ssl: [{:versions, [:"tlsv1.2"]}]
]

opts = Keyword.merge(opts, other_opts)

# Make the request via HTTPoison and return the ok | error tuple
case HTTPoison.get("#{domain}/robots.txt", headers, opts) do
{:ok, %HTTPoison.Response{status_code: 200, body: body}} ->
{:ok, body}

{:ok, _response} ->
{:error, :no_robots_file}

{:error, %HTTPoison.Error{reason: reason}} ->
{:error, reason}
end
Expand Down
10 changes: 7 additions & 3 deletions lib/gollum/host.ex
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ defmodule Gollum.Host do
:uncrawlable
```
"""
@spec crawlable?(Gollum.Host.t(), binary, binary) :: :crawlable | :uncrawlable | :undefined
@spec crawlable?(Gollum.Host.t(), binary, binary) ::
:crawlable | :uncrawlable | :undefined
def crawlable?(%Gollum.Host{rules: rules}, user_agent, path) do
# Determine the user agent
key =
Expand All @@ -85,7 +86,9 @@ defmodule Gollum.Host do

defp sanitize_user_agent_map(nil), do: %{allowed: [], disallowed: []}
defp sanitize_user_agent_map(%{allowed: _, disallowed: _} = map), do: map
defp sanitize_user_agent_map(%{allowed: allowed}), do: %{allowed: allowed, disallowed: []}

defp sanitize_user_agent_map(%{allowed: allowed}),
do: %{allowed: allowed, disallowed: []}

defp sanitize_user_agent_map(%{disallowed: disallowed}),
do: %{allowed: [], disallowed: disallowed}
Expand Down Expand Up @@ -176,5 +179,6 @@ defmodule Gollum.Host do
defp do_match_group(<<ch::utf8, lhs::binary>>, <<ch::utf8, rhs::binary>>),
do: do_match_group(lhs, rhs)

defp do_match_group(<<_ch::utf8, lhs::binary>>, rhs), do: do_match_group(lhs, rhs)
defp do_match_group(<<_ch::utf8, lhs::binary>>, rhs),
do: do_match_group(lhs, rhs)
end
20 changes: 15 additions & 5 deletions lib/gollum/parser.ex
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,17 @@ defmodule Gollum.Parser do
# Tokenize a single line of the robots.txt.
defp tokenize(line) do
cond do
result = Regex.run(~r/^allow:?\s(.+)$/i, line) -> {:allow, Enum.at(result, 1)}
result = Regex.run(~r/^disallow:?\s(.+)$/i, line) -> {:disallow, Enum.at(result, 1)}
result = Regex.run(~r/^user-agent:?\s(.+)$/i, line) -> {:user_agent, Enum.at(result, 1)}
true -> :unknown
result = Regex.run(~r/^allow:?\s(.+)$/i, line) ->
{:allow, Enum.at(result, 1)}

result = Regex.run(~r/^disallow:?\s(.+)$/i, line) ->
{:disallow, Enum.at(result, 1)}

result = Regex.run(~r/^user-agent:?\s(.+)$/i, line) ->
{:user_agent, Enum.at(result, 1)}

true ->
:unknown
end
end

Expand Down Expand Up @@ -71,7 +78,10 @@ defmodule Gollum.Parser do
# Add a disallowed field
defp do_parse([{:disallow, path} | tokens], {agents, rules}, accum) do
[path | _] = String.split(URI.encode(URI.decode(path)), "#")
rules = Map.put(rules || %{}, :disallowed, [path | rules[:disallowed] || []])

rules =
Map.put(rules || %{}, :disallowed, [path | rules[:disallowed] || []])

do_parse(tokens, {agents, rules}, accum)
end

Expand Down
87 changes: 66 additions & 21 deletions test/gollum/host_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -164,18 +164,30 @@ defmodule Gollum.HostTest do
dIsAlLoW: /
"""

host_upper = Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_upper))
host_lower = Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_lower))
host_camel = Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_camel))
host_upper =
Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_upper))

host_lower =
Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_lower))

host_camel =
Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_camel))

path_allowed = "/x/y"
path_disallowed = "/a/b"

assert Host.crawlable?(host_upper, "FooBot", path_allowed) == :crawlable
assert Host.crawlable?(host_lower, "FooBot", path_allowed) == :crawlable
assert Host.crawlable?(host_camel, "FooBot", path_allowed) == :crawlable
refute Host.crawlable?(host_upper, "FooBot", path_disallowed) == :crawlable
refute Host.crawlable?(host_lower, "FooBot", path_disallowed) == :crawlable
refute Host.crawlable?(host_camel, "FooBot", path_disallowed) == :crawlable

refute Host.crawlable?(host_upper, "FooBot", path_disallowed) ==
:crawlable

refute Host.crawlable?(host_lower, "FooBot", path_disallowed) ==
:crawlable

refute Host.crawlable?(host_camel, "FooBot", path_disallowed) ==
:crawlable
end

# User-agent line values are case insensitive. See REP RFC section "The
Expand All @@ -200,24 +212,43 @@ defmodule Gollum.HostTest do
Disallow: /
"""

host_upper = Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_upper))
host_lower = Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_lower))
host_camel = Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_camel))
host_upper =
Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_upper))

host_lower =
Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_lower))

host_camel =
Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_camel))

path_allowed = "/x/y"
path_disallowed = "/a/b"

assert Host.crawlable?(host_upper, "Foo Bar", path_allowed) == :crawlable
assert Host.crawlable?(host_lower, "Foo Bar", path_allowed) == :crawlable
assert Host.crawlable?(host_camel, "Foo Bar", path_allowed) == :crawlable
refute Host.crawlable?(host_upper, "Foo Bar", path_disallowed) == :crawlable
refute Host.crawlable?(host_lower, "Foo Bar", path_disallowed) == :crawlable
refute Host.crawlable?(host_camel, "Foo Bar", path_disallowed) == :crawlable

refute Host.crawlable?(host_upper, "Foo Bar", path_disallowed) ==
:crawlable

refute Host.crawlable?(host_lower, "Foo Bar", path_disallowed) ==
:crawlable

refute Host.crawlable?(host_camel, "Foo Bar", path_disallowed) ==
:crawlable

assert Host.crawlable?(host_upper, "foo bar", path_allowed) == :crawlable
assert Host.crawlable?(host_lower, "foo bar", path_allowed) == :crawlable
assert Host.crawlable?(host_camel, "foo bar", path_allowed) == :crawlable
refute Host.crawlable?(host_upper, "foo bar", path_disallowed) == :crawlable
refute Host.crawlable?(host_lower, "foo bar", path_disallowed) == :crawlable
refute Host.crawlable?(host_camel, "foo bar", path_disallowed) == :crawlable

refute Host.crawlable?(host_upper, "foo bar", path_disallowed) ==
:crawlable

refute Host.crawlable?(host_lower, "foo bar", path_disallowed) ==
:crawlable

refute Host.crawlable?(host_camel, "foo bar", path_disallowed) ==
:crawlable
end

# If no group matches the user-agent, crawlers must obey the first group with a
Expand All @@ -244,11 +275,17 @@ defmodule Gollum.HostTest do
disallow: /
"""

host_empty = Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_empty))
host_global = Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_global))
host_empty =
Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_empty))

host_global =
Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_global))

host_only_specific =
Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_only_specific))
Host.new(
"http://foo.bar/",
Gollum.Parser.parse(robotstxt_only_specific)
)

path = "/x/y"

Expand All @@ -273,10 +310,16 @@ defmodule Gollum.HostTest do
"""

host_lowercase_url =
Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_lowercase_url))
Host.new(
"http://foo.bar/",
Gollum.Parser.parse(robotstxt_lowercase_url)
)

host_uppercase_url =
Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt_uppercase_url))
Host.new(
"http://foo.bar/",
Gollum.Parser.parse(robotstxt_uppercase_url)
)

path = "/x/y"

Expand Down Expand Up @@ -445,7 +488,9 @@ defmodule Gollum.HostTest do
""",
host <- Host.new("http://foo.bar/", Gollum.Parser.parse(robotstxt)) do
assert Host.crawlable?(host, "FooBot", "/foo/bar/baz") == :crawlable
assert Host.crawlable?(host, "FooBot", "/foo/bar/%62%61%7A") == :crawlable

assert Host.crawlable?(host, "FooBot", "/foo/bar/%62%61%7A") ==
:crawlable
end
end

Expand Down
3 changes: 3 additions & 0 deletions test/support/mock_fetcher.ex
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,16 @@ defmodule MockFetcher do
def fetch("ok", _opts) do
{:ok, "User-agent: Hello\nAllow: /hello\nDisallow: /hey"}
end

def fetch("delay_ok", _opts) do
:timer.sleep(100)
{:ok, "User-agent: Hello\nAllow: /hello\nDisallow: /hey"}
end

def fetch("error", _opts) do
{:error, :no_robots_file}
end

def fetch("http://example.com", _opts) do
{:ok, "User-agent: Hello\nAllow: /hello\nDisallow: /hey"}
end
Expand Down

0 comments on commit 7622c9a

Please sign in to comment.