Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for parsing attributes as maps #467

Merged
merged 4 commits into from
Jun 14, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions lib/floki.ex
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,14 @@ defmodule Floki do
inside a list.
"""

@type html_declaration :: {:pi, String.t(), [html_attribute()]}
@type html_attribute :: {String.t(), String.t()}
@type html_attributes :: [html_attribute()] | html_attributes_map()
@type html_attributes_map :: %{String.t() => String.t()}
@type html_declaration :: {:pi, String.t(), html_attributes()}
@type html_comment :: {:comment, String.t()}
@type html_doctype :: {:doctype, String.t(), String.t(), String.t()}
@type html_attribute :: {String.t(), String.t()}
@type html_text :: String.t()
@type html_tag :: {String.t(), [html_attribute()], [html_node()]}
@type html_tag :: {String.t(), html_attributes(), [html_node()]}
@type html_node ::
html_tag() | html_comment() | html_doctype() | html_declaration() | html_text()
@type html_tree :: [html_node()]
Expand Down Expand Up @@ -102,13 +104,19 @@ defmodule Floki do

## Options

* `:attributes_as_maps` - Change the behaviour of the parser to return the attributes
as maps, instead of a list of `{"key", "value"}` tuples. Defaults to `false`.

* `:html_parser` - The module of the backend that is responsible for parsing
the HTML string. By default it is set to the built-in parser, and the module
name is equal to `Floki.HTMLParser.Mochiweb`, or from the value of the
application env of the same name.

See https://github.com/philss/floki#alternative-html-parsers for more details.

* `:parser_args` - A list of options to the parser. This can be used to pass options
that are specific for a given parser. Defaults to an empty list.

## Examples

iex> Floki.parse_document("<html><head></head><body>hello</body></html>")
Expand All @@ -117,6 +125,13 @@ defmodule Floki do
iex> Floki.parse_document("<html><head></head><body>hello</body></html>", html_parser: Floki.HTMLParser.Mochiweb)
{:ok, [{"html", [], [{"head", [], []}, {"body", [], ["hello"]}]}]}

iex> Floki.parse_document(
...> "<html><head></head><body class=main>hello</body></html>",
...> attributes_as_maps: true,
...> html_parser: Floki.HTMLParser.Mochiweb
...>)
{:ok, [{"html", %{}, [{"head", %{}, []}, {"body", %{"class" => "main"}, ["hello"]}]}]}

"""

@spec parse_document(binary(), Keyword.t()) :: {:ok, html_tree()} | {:error, String.t()}
Expand Down Expand Up @@ -152,13 +167,20 @@ defmodule Floki do

## Options

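* `:attributes_as_maps` - Change the behaviour of the parser to return the attributes
as maps, instead of a list of `{"key", "value"}` tuples. Remember that maps do not
guarantee any key order (the undocumented order of small maps changed in OTP 26), so
do not rely on the order of the attributes. Defaults to `false`.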

* `:html_parser` - The module of the backend that is responsible for parsing
the HTML string. By default it is set to the built-in parser, and the module
name is equal to `Floki.HTMLParser.Mochiweb`, or from the value of the
application env of the same name.

See https://github.com/philss/floki#alternative-html-parsers for more details.

* `:parser_args` - A list of options to the parser. This can be used to pass options
that are specific for a given parser. Defaults to an empty list.
philss marked this conversation as resolved.
Show resolved Hide resolved

"""

@spec parse_fragment(binary(), Keyword.t()) :: {:ok, html_tree()} | {:error, String.t()}
Expand Down Expand Up @@ -355,7 +377,7 @@ defmodule Floki do
@spec find_and_update(
html_tree(),
css_selector(),
({String.t(), [html_attribute()]} -> {String.t(), [html_attribute()]} | :delete)
({String.t(), html_attributes()} -> {String.t(), html_attributes()} | :delete)
) :: html_tree()
def find_and_update(html_tree, selector, fun) do
{tree, results} = Finder.find(html_tree, selector)
Expand Down
39 changes: 31 additions & 8 deletions lib/floki/html_parser.ex
Original file line number Diff line number Diff line change
Expand Up @@ -16,24 +16,47 @@ defmodule Floki.HTMLParser do
The default parser is Mochiweb, which comes with Floki.
You can also choose between Html5ever or FastHtml.

And it's possible to pass down options to the parsers using
the `parser_args` option.

This module is also a behaviour that those parsers must implement.
"""

@default_parser Floki.HTMLParser.Mochiweb

@callback parse_document(binary(), list()) :: {:ok, Floki.html_tree()} | {:error, String.t()}
@callback parse_fragment(binary(), list()) :: {:ok, Floki.html_tree()} | {:error, String.t()}
@typep result(success) :: {:ok, success} | {:error, String.t()}
@typep html :: binary()

@callback parse_document(html(), Keyword.t()) :: result(Floki.html_tree())
@callback parse_fragment(html(), Keyword.t()) :: result(Floki.html_tree())

@callback parse_document_with_attributes_as_maps(html(), Keyword.t()) ::
result(Floki.html_tree())
@callback parse_fragment_with_attributes_as_maps(html(), Keyword.t()) ::
result(Floki.html_tree())

def parse_document(html, opts \\ []) do
parser_args = opts[:parser_args] || []
def parse_document(html, opts \\ []) when is_binary(html) do
{parser_args, opts} = Keyword.pop(opts, :parser_args, [])

parser(opts).parse_document(html, parser_args)
parser = parser(opts)

if opts[:attributes_as_maps] do
parser.parse_document_with_attributes_as_maps(html, parser_args)
else
parser.parse_document(html, parser_args)
end
end

def parse_fragment(html, opts \\ []) do
parser_args = opts[:parser_args] || []
def parse_fragment(html, opts \\ []) when is_binary(html) do
{parser_args, opts} = Keyword.pop(opts, :parser_args, [])

parser = parser(opts)

parser(opts).parse_fragment(html, parser_args)
if opts[:attributes_as_maps] do
parser.parse_fragment_with_attributes_as_maps(html, parser_args)
else
parser.parse_fragment(html, parser_args)
end
end

defp parser(opts) do
Expand Down
10 changes: 10 additions & 0 deletions lib/floki/html_parser/fast_html.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,16 @@ defmodule Floki.HTMLParser.FastHtml do
execute_with_module(fn module -> module.decode_fragment(html, args) end)
end

# Attribute maps are not available with the FastHTML backend: always raises.
@impl true
def parse_document_with_attributes_as_maps(_html, _args),
  do: raise(RuntimeError, "parsing with attributes as maps is not supported yet for FastHTML")

# Attribute maps are not available with the FastHTML backend: always raises.
@impl true
def parse_fragment_with_attributes_as_maps(_html, _args),
  do: raise(RuntimeError, "parsing with attributes as maps is not supported yet for FastHTML")

defp execute_with_module(fun) do
case Code.ensure_loaded(:fast_html) do
{:module, module} ->
Expand Down
10 changes: 10 additions & 0 deletions lib/floki/html_parser/html5ever.ex
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,14 @@ defmodule Floki.HTMLParser.Html5ever do
# NOTE: html5ever does not implement parse_fragment yet.
@impl true
def parse_fragment(html, args), do: parse_document(html, args)

# Attribute maps are not available with the Html5ever backend: always raises.
@impl true
def parse_document_with_attributes_as_maps(_html, _args),
  do: raise(RuntimeError, "parsing with attributes as maps is not supported yet for Html5ever")

# Attribute maps are not available with the Html5ever backend: always raises.
@impl true
def parse_fragment_with_attributes_as_maps(_html, _args),
  do: raise(RuntimeError, "parsing with attributes as maps is not supported yet for Html5ever")
end
14 changes: 12 additions & 2 deletions lib/floki/html_parser/mochiweb.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,24 @@ defmodule Floki.HTMLParser.Mochiweb do
@root_node "floki"

@impl true
def parse_document(html, _args) do
def parse_document(html, args) do
html = "<#{@root_node}>#{html}</#{@root_node}>"
{@root_node, [], parsed} = :floki_mochi_html.parse(html)
{@root_node, _, parsed} = :floki_mochi_html.parse(html, args)

{:ok, parsed}
end

# NOTE: mochi_html cannot make a distinction of a fragment and document.
@impl true
def parse_fragment(html, args), do: parse_document(html, args)

# Delegates to parse_document/2 with `attributes_as_maps: true` forced into the
# parser arguments.
@impl true
def parse_document_with_attributes_as_maps(html, args) do
  args_with_maps = Keyword.put(args, :attributes_as_maps, true)
  parse_document(html, args_with_maps)
end

# Delegates to parse_document/2 with `attributes_as_maps: true` forced into the
# parser arguments.
@impl true
def parse_fragment_with_attributes_as_maps(html, args) do
  args_with_maps = Keyword.put(args, :attributes_as_maps, true)
  parse_document(html, args_with_maps)
end
end
73 changes: 45 additions & 28 deletions src/floki_mochi_html.erl
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
-module(floki_mochi_html).
-export([
tokens/1,
parse/1
parse/2
]).
-ifdef(TEST).
-export([destack/1, destack/2, is_singleton/1]).
Expand Down Expand Up @@ -96,17 +96,28 @@

%% External API.

%% @spec parse(string() | binary()) -> html_node()
%% @spec parse(string() | binary(), list()) -> html_node()
%% @doc tokenize and then transform the token stream into a HTML tree.
parse(Input) ->
parse_tokens(tokens(Input)).
%%
%% The following option is supported:
%%
%% <dl>
%% <dt>`attributes_as_maps`</dt>
%% <dd>
%% When `true`, it configures the parser to use maps for the attributes.
%% It is `false` by default, which means attributes are going to be represented
%% as a list of tuples.
%% </dd>
%% </dl>
parse(Input, Opts) ->
parse_tokens(tokens(Input), Opts).

%% @spec parse_tokens([html_token()]) -> html_node()
%% @doc Transform the output of tokens(Doc) into a HTML tree.
parse_tokens(Tokens) when is_list(Tokens) ->
parse_tokens(Tokens, Opts) when is_list(Tokens) andalso is_list(Opts) ->
%% Skip over doctype, processing instructions
[{start_tag, Tag, Attrs, false} | Rest] = find_document(Tokens, normal),
{Tree, _} = tree(Rest, [norm({Tag, Attrs})]),
{Tree, _} = tree(Rest, [norm({Tag, Attrs}, Opts)], Opts),
Tree.

find_document(Tokens = [{start_tag, _Tag, _Attrs, false} | _Rest], Mode) ->
Expand Down Expand Up @@ -215,38 +226,44 @@ tree_data([{data, Data, Whitespace} | Rest], AllWhitespace, Acc) ->
tree_data(Rest, AllWhitespace, Acc) ->
{iolist_to_binary(lists:reverse(Acc)), AllWhitespace, Rest}.

tree([], Stack) ->
tree([], Stack, _Opts) ->
{destack(Stack), []};
tree([{end_tag, Tag} | Rest], Stack) ->
case destack(norm(Tag), Stack) of
tree([{end_tag, Tag} | Rest], Stack, Opts) ->
case destack(norm(Tag, Opts), Stack) of
S when is_list(S) ->
tree(Rest, S);
tree(Rest, S, Opts);
Result ->
{Result, []}
end;
tree([{start_tag, Tag, Attrs, true} | Rest], S) ->
tree(Rest, append_stack_child(norm({Tag, Attrs}), S));
tree([{start_tag, Tag, Attrs, false} | Rest], S) ->
tree(Rest, stack(norm({Tag, Attrs}), S));
tree([T = {pi, _Tag, _Attrs} | Rest], S) ->
tree(Rest, append_stack_child(T, S));
tree([T = {comment, _Comment} | Rest], S) ->
tree(Rest, append_stack_child(T, S));
tree(L = [{data, _Data, _Whitespace} | _], S) ->
tree([{start_tag, Tag, Attrs, true} | Rest], S, Opts) ->
tree(Rest, append_stack_child(norm({Tag, Attrs}, Opts), S), Opts);
tree([{start_tag, Tag, Attrs, false} | Rest], S, Opts) ->
tree(Rest, stack(norm({Tag, Attrs}, Opts), S), Opts);
tree([T = {pi, _Tag, _Attrs} | Rest], S, Opts) ->
tree(Rest, append_stack_child(T, S), Opts);
tree([T = {comment, _Comment} | Rest], S, Opts) ->
tree(Rest, append_stack_child(T, S), Opts);
tree(L = [{data, _Data, _Whitespace} | _], S, Opts) ->
case tree_data(L, true, []) of
{_, true, Rest} ->
tree(Rest, S);
tree(Rest, S, Opts);
{Data, false, Rest} ->
tree(Rest, append_stack_child(Data, S))
tree(Rest, append_stack_child(Data, S), Opts)
end;
tree([{doctype, _} | Rest], Stack) ->
tree(Rest, Stack).

norm({Tag, Attrs}) ->
{norm(Tag), [{norm(K), iolist_to_binary(V)} || {K, V} <- Attrs], []};
norm(Tag) when is_binary(Tag) ->
tree([{doctype, _} | Rest], Stack, Opts) ->
tree(Rest, Stack, Opts).

%% Normalize a tag name, or turn a `{Tag, Attrs}' pair into a childless
%% `{Tag, Attrs, []}' node.
%%
%% Tag and attribute names that are not already binaries are lower-cased and
%% converted to binaries; attribute values are flattened with
%% `iolist_to_binary/1'. When `Opts' contains `{attributes_as_maps, true}' the
%% attributes are returned as a map, otherwise as a list of `{Key, Value}'
%% tuples.
norm({Tag, Attrs}, Opts) ->
    %% Use a fresh variable: `Attrs' is already bound, so `Attrs = [...]' would
    %% be a match (not an assignment) and fail with `badmatch' whenever the
    %% normalization changes any key or value.
    NormAttrs = [{norm(K, Opts), iolist_to_binary(V)} || {K, V} <- Attrs],
    case lists:keyfind(attributes_as_maps, 1, Opts) of
        {attributes_as_maps, true} ->
            %% The HTML spec treats a repeated attribute as a parse error and
            %% keeps the first occurrence. `maps:from_list/1' keeps the
            %% right-most value for a repeated key, so reverse the list to
            %% make the first occurrence win.
            {norm(Tag, Opts), maps:from_list(lists:reverse(NormAttrs)), []};
        _ ->
            {norm(Tag, Opts), NormAttrs, []}
    end;
norm(Tag, _Opts) when is_binary(Tag) ->
    Tag;
norm(Tag, _Opts) ->
    list_to_binary(string:to_lower(Tag)).

stack(T1 = {TN, _, _}, Stack = [{TN, _, _} | _Rest]) when
Expand Down
34 changes: 34 additions & 0 deletions test/floki_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -1792,6 +1792,40 @@ defmodule FlokiTest do
assert result == html
end

@tag only_parser: Mochiweb
test "parse document with attributes as map option enabled" do
  # Expected tree, built bottom-up: every node carries its attributes as a map.
  home_item = {"li", %{"class" => "link active"}, [{"a", %{"href" => "/"}, ["Home"]}]}
  about_item = {"li", %{"class" => "link"}, [{"a", %{"href" => "/about-us"}, ["About us"]}]}
  container = {"div", %{"class" => "container"}, [{"ul", %{}, [home_item, about_item]}]}
  expected = [{"html", %{}, [{"head", %{}, []}, {"body", %{}, [container]}]}]

  html =
    html_body("""
    <div class="container">
    <ul>
    <li class="link active"><a href="/">Home</a></li>
    <li class="link"><a href="/about-us">About us</a></li>
    </ul>
    </div>
    """)

  assert {:ok, html_tree} = Floki.parse_document(html, attributes_as_maps: true)
  assert html_tree == expected
end

# Wraps `body` in a minimal HTML document with an empty head.
defp html_body(body), do: "<html><head></head><body>#{body}</body></html>"
Expand Down