Add option to strip metadata when parsing PO files (#141)
maennchen authored Sep 10, 2024
1 parent deb65ba commit bd94b21
Showing 4 changed files with 83 additions and 32 deletions.
12 changes: 11 additions & 1 deletion lib/expo/po.ex
@@ -6,7 +6,17 @@ defmodule Expo.PO do
   alias Expo.Messages
   alias Expo.PO.{DuplicateMessagesError, Parser, SyntaxError}
 
-  @type parse_option :: {:file, Path.t()}
+  @typedoc """
+  Parsing option.
+
+    * `:file` (`t:Path.t/0`) - the path to use in error messages when using `parse_string/2`.
+      If not present, errors don't have a path.
+
+    * `:strip_meta` (`t:boolean/0`) - include only messages (no comments or other metadata)
+      from the `.po` file, to reduce memory usage when the meta information is not needed.
+      Defaults to `false`.
+  """
+  @type parse_option :: {:file, Path.t()} | {:strip_meta, boolean()}
 
   @doc """
   Dumps a `Expo.Messages` struct as iodata.
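
The new option is consumed by `Expo.PO.parse_string/2` (and, since the `parse_option` type is shared, presumably by the file-based parsing functions as well). A minimal sketch of the intended use — the `.po` snippet is illustrative, and the empty `comments` list follows from the test added further below:

    iex> source = """
    ...> # translator note
    ...> msgid "hello"
    ...> msgstr "bonjour"
    ...> """
    iex> {:ok, po} = Expo.PO.parse_string(source, strip_meta: true)
    iex> hd(po.messages).comments
    []
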
6 changes: 3 additions & 3 deletions lib/expo/po/parser.ex
@@ -12,7 +12,7 @@ defmodule Expo.PO.Parser do
   def parse(content, opts) do
     content = prune_bom(content, Keyword.get(opts, :file, "nofile"))
 
-    with {:ok, tokens} <- tokenize(content),
+    with {:ok, tokens} <- tokenize(content, opts),
          {:ok, po} <- parse_tokens(tokens),
          {:ok, po} <- check_for_duplicates(po) do
       {:ok, %Messages{po | file: Keyword.get(opts, :file)}}
@@ -22,8 +22,8 @@
     end
   end
 
-  defp tokenize(content) do
-    case Tokenizer.tokenize(content) do
+  defp tokenize(content, opts) do
+    case Tokenizer.tokenize(content, opts) do
       {:ok, tokens} -> {:ok, tokens}
       {:error, line, message} -> {:error, %SyntaxError{line: line, reason: message}}
     end
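
The parser change is purely plumbing: `opts` is threaded through to the tokenizer unchanged, and tokenizer errors still come back wrapped in `%SyntaxError{}`. A hedged sketch of how the two result shapes look to a caller (the input string is made up):

    case Expo.PO.parse_string("msgid \"a\"\nmsgstr \"b\"\n", strip_meta: true) do
      # Success: a %Expo.Messages{} struct whose messages carry no metadata.
      {:ok, %Expo.Messages{messages: messages}} -> length(messages)
      # Failure: an exception struct, e.g. the %Expo.PO.SyntaxError{} built above.
      {:error, error} -> raise error
    end
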
65 changes: 39 additions & 26 deletions lib/expo/po/tokenizer.ex
@@ -52,9 +52,11 @@
     * `{:str, 6, "foo"}`
   """
-  @spec tokenize(binary) :: {:ok, [token]} | {:error, pos_integer, binary}
-  def tokenize(str) do
-    tokenize_line(str, _line = 1, _tokens_acc = [])
+  @spec tokenize(binary, [Expo.PO.parse_option()]) ::
+          {:ok, [token]} | {:error, pos_integer, binary}
+  def tokenize(str, opts \\ []) do
+    strip_meta? = Keyword.get(opts, :strip_meta, false)
+    tokenize_line(str, _line = 1, strip_meta?, _tokens_acc = [])
   end
 
   # Reverse str_lines strings.
@@ -86,79 +88,85 @@
   end
 
   # End of file.
-  defp tokenize_line(<<>>, line, acc) do
+  defp tokenize_line(<<>>, line, _strip_meta?, acc) do
     {:ok, [{:"$end", line} | acc] |> Enum.reverse() |> postprocess_tokens()}
   end
 
   # Go to the next line.
-  defp tokenize_line(<<?\n, rest::binary>>, line, acc) do
-    tokenize_line(rest, line + 1, acc)
+  defp tokenize_line(<<?\n, rest::binary>>, line, strip_meta?, acc) do
+    tokenize_line(rest, line + 1, strip_meta?, acc)
   end
 
   # Skip other whitespace.
-  defp tokenize_line(<<char, rest::binary>>, line, acc)
+  defp tokenize_line(<<char, rest::binary>>, line, strip_meta?, acc)
        when char in @whitespace_no_nl do
-    tokenize_line(rest, line, acc)
+    tokenize_line(rest, line, strip_meta?, acc)
   end
 
+  # Skip meta information when strip_meta is enabled.
+  defp tokenize_line(<<?#, rest::binary>>, line, true, acc) do
+    from_next_line = discard_until_nl(rest)
+    tokenize_line(from_next_line, line, true, acc)
+  end
+
   # Obsolete comment.
-  defp tokenize_line(<<"#~", rest::binary>>, line, acc) do
-    tokenize_line(rest, line, [{:obsolete, line} | acc])
+  defp tokenize_line(<<"#~", rest::binary>>, line, strip_meta?, acc) do
+    tokenize_line(rest, line, strip_meta?, [{:obsolete, line} | acc])
   end
 
   # Previous comment.
-  defp tokenize_line(<<"#|", rest::binary>>, line, acc) do
-    tokenize_line(rest, line, [{:previous, line} | acc])
+  defp tokenize_line(<<"#|", rest::binary>>, line, strip_meta?, acc) do
+    tokenize_line(rest, line, strip_meta?, [{:previous, line} | acc])
   end
 
   # Normal comment.
-  defp tokenize_line(<<?#, _rest::binary>> = rest, line, acc) do
+  defp tokenize_line(<<?#, _rest::binary>> = rest, line, strip_meta?, acc) do
     {contents, rest} = to_eol_or_eof(rest, "")
-    tokenize_line(rest, line, [{:comment, line, contents} | acc])
+    tokenize_line(rest, line, strip_meta?, [{:comment, line, contents} | acc])
   end
 
   # Keywords.
   for kw <- @string_keywords do
-    defp tokenize_line(unquote(kw) <> <<char, rest::binary>>, line, acc)
+    defp tokenize_line(unquote(kw) <> <<char, rest::binary>>, line, strip_meta?, acc)
          when char in @whitespace do
       acc = [{unquote(String.to_existing_atom(kw)), line} | acc]
-      tokenize_line(rest, line, acc)
+      tokenize_line(rest, line, strip_meta?, acc)
     end
 
-    defp tokenize_line(unquote(kw) <> _rest, line, _acc) do
+    defp tokenize_line(unquote(kw) <> _rest, line, _strip_meta?, _acc) do
       {:error, line, "no space after '#{unquote(kw)}'"}
     end
   end
 
   # `msgstr`.
-  defp tokenize_line("msgstr[" <> <<rest::binary>>, line, acc) do
+  defp tokenize_line("msgstr[" <> <<rest::binary>>, line, strip_meta?, acc) do
     case tokenize_plural_form(rest, "") do
       {:ok, plural_form, rest} ->
         # The order of the :plural_form and :msgstr tokens is inverted since
         # the `acc` array of tokens will be reversed at the end.
         acc = [{:plural_form, line, plural_form}, {:msgstr, line} | acc]
-        tokenize_line(rest, line, acc)
+        tokenize_line(rest, line, strip_meta?, acc)
 
       {:error, reason} ->
         {:error, line, reason}
     end
   end
 
-  defp tokenize_line("msgstr" <> <<char, rest::binary>>, line, acc)
+  defp tokenize_line("msgstr" <> <<char, rest::binary>>, line, strip_meta?, acc)
        when char in @whitespace do
     acc = [{:msgstr, line} | acc]
-    tokenize_line(rest, line, acc)
+    tokenize_line(rest, line, strip_meta?, acc)
   end
 
-  defp tokenize_line("msgstr" <> _rest, line, _acc) do
+  defp tokenize_line("msgstr" <> _rest, line, _strip_meta?, _acc) do
     {:error, line, "no space after 'msgstr'"}
   end
 
   # String.
-  defp tokenize_line(<<?", rest::binary>>, line, acc) do
+  defp tokenize_line(<<?", rest::binary>>, line, strip_meta?, acc) do
     case tokenize_string(rest, "") do
       {:ok, string, rest} ->
-        tokenize_line(rest, line, add_str_lines(line, string, acc))
+        tokenize_line(rest, line, strip_meta?, add_str_lines(line, string, acc))
 
       {:error, reason} ->
         {:error, line, reason}
@@ -170,7 +178,7 @@
   # a letter (we don't take care of unicode or fancy stuff, just ASCII letters),
   # we assume there's an unknown keyword. We parse it with a regex
   # so that the error message is informative.
-  defp tokenize_line(<<letter, _rest::binary>> = binary, line, _acc)
+  defp tokenize_line(<<letter, _rest::binary>> = binary, line, _strip_meta?, _acc)
       when letter in ?a..?z or letter in ?A..?Z do
    next_word = List.first(Regex.run(~r/\w+/u, binary))
    {:error, line, "unknown keyword '#{next_word}'"}
@@ -180,13 +188,18 @@
  # Last resort: this is just a plain unexpected token. We take the first
  # Unicode char of the given binary and build an informative error message
  # (with the codepoint of the char).
-  defp tokenize_line(binary, line, _acc) when is_binary(binary) do
+  defp tokenize_line(binary, line, _strip_meta?, _acc) when is_binary(binary) do
    # To get the first Unicode char, we convert to char list first.
    [char | _] = String.to_charlist(binary)
    msg = :io_lib.format(~c"unexpected token: \"~ts\" (codepoint U+~4.16.0B)", [[char], char])
    {:error, line, :unicode.characters_to_binary(msg)}
  end
 
+  defp discard_until_nl(content)
+  defp discard_until_nl(<<?\n, _rest::binary>> = content), do: content
+  defp discard_until_nl(<<>>), do: <<>>
+  defp discard_until_nl(<<_char, rest::binary>>), do: discard_until_nl(rest)
+
   @obsolete_keywords ~w(msgid msgid_plural msgctxt msgstr)a
 
   # Collapse the string into the previous str_lines token if there is one *on the same line*.
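
Two details of the tokenizer change are worth noting. First, the strip-meta clause matches any line starting with `#` and is defined before the `#~` (obsolete) and `#|` (previous) clauses, so those entries are discarded as well when `strip_meta: true`. Second, `discard_until_nl/1` stops at the newline rather than consuming it, so the line counter in later error messages stays accurate. A hedged sketch against `tokenize/2` (`Expo.PO.Tokenizer` is an internal module; this only illustrates the behavior, and asserts nothing about token shapes other than `:comment`):

    # No {:comment, line, text} token survives when strip_meta is on.
    {:ok, tokens} = Expo.PO.Tokenizer.tokenize("# note\nmsgid \"a\"\nmsgstr \"b\"\n", strip_meta: true)
    false = Enum.any?(tokens, &match?({:comment, _line, _text}, &1))
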
32 changes: 30 additions & 2 deletions test/expo/parser_test.exs
@@ -454,8 +454,36 @@
     end
   end
 
-  defp parse(string) do
-    case PO.parse_string(string) do
+  describe "strip meta" do
+    test "does not include extra information" do
+      assert [
+               %Message.Plural{
+                 msgid: ["foo"],
+                 msgid_plural: ["foos"],
+                 msgstr: %{0 => ["bar"], 1 => ["bars"]},
+                 comments: [],
+                 extracted_comments: [],
+                 references: []
+               }
+             ] =
+               parse(
+                 """
+                 # This is a message
+                 #: lib/foo.ex:32
+                 # Ah, another comment!
+                 #. An extracted comment
+                 msgid "foo"
+                 msgid_plural "foos"
+                 msgstr[0] "bar"
+                 msgstr[1] "bars"
+                 """,
+                 strip_meta: true
+               )
+    end
+  end
+
+  defp parse(string, options \\ []) do
+    case PO.parse_string(string, options) do
       {:ok, %Messages{messages: messages}} ->
         messages
 
