Skip to content

Commit

Permalink
Add support for parsing attributes as maps
Browse files Browse the repository at this point in the history
This is a feature request from #463
and is the base for implementing the same feature in more backends.

Right now only the built-in `mochiweb` parser implements the feature.
  • Loading branch information
philss committed Jun 9, 2023
1 parent 2d5c282 commit 706d5bf
Show file tree
Hide file tree
Showing 7 changed files with 164 additions and 42 deletions.
25 changes: 21 additions & 4 deletions lib/floki.ex
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,14 @@ defmodule Floki do
inside a list.
"""

@type html_declaration :: {:pi, String.t(), [html_attribute()]}
@type html_attribute :: {String.t(), String.t()}
@type html_attributes :: [html_attribute()] | html_attributes_map()
@type html_attributes_map :: %{String.t() => String.t()}
@type html_declaration :: {:pi, String.t(), html_attributes()}
@type html_comment :: {:comment, String.t()}
@type html_doctype :: {:doctype, String.t(), String.t(), String.t()}
@type html_attribute :: {String.t(), String.t()}
@type html_text :: String.t()
@type html_tag :: {String.t(), [html_attribute()], [html_node()]}
@type html_tag :: {String.t(), html_attributes(), [html_node()]}
@type html_node ::
html_tag() | html_comment() | html_doctype() | html_declaration() | html_text()
@type html_tree :: [html_node()]
Expand Down Expand Up @@ -102,13 +104,21 @@ defmodule Floki do
## Options
* `:attributes_as_maps` - Change the behaviour of the parser to return the attributes
as maps, instead of a list of `{"key", "value"}`. Remember that maps are no longer
ordered since OTP 26. Defaults to `false`.
* `:html_parser` - The module of the backend that is responsible for parsing
the HTML string. By default it is set to the built-in parser, and the module
name is equal to `Floki.HTMLParser.Mochiweb`, or from the value of the
application env of the same name.
See https://github.com/philss/floki#alternative-html-parsers for more details.
* `:parser_args` - A list of options to the parser. This can be used to pass options
that are specific to a given parser.
Defaults to an empty list.
## Examples
iex> Floki.parse_document("<html><head></head><body>hello</body></html>")
Expand All @@ -117,6 +127,13 @@ defmodule Floki do
iex> Floki.parse_document("<html><head></head><body>hello</body></html>", html_parser: Floki.HTMLParser.Mochiweb)
{:ok, [{"html", [], [{"head", [], []}, {"body", [], ["hello"]}]}]}
iex> Floki.parse_document(
...> "<html><head></head><body class=main>hello</body></html>",
...> attributes_as_maps: true,
...> html_parser: Floki.HTMLParser.Mochiweb
...>)
{:ok, [{"html", %{}, [{"head", %{}, []}, {"body", %{"class" => "main"}, ["hello"]}]}]}
"""

@spec parse_document(binary(), Keyword.t()) :: {:ok, html_tree()} | {:error, String.t()}
Expand Down Expand Up @@ -355,7 +372,7 @@ defmodule Floki do
@spec find_and_update(
html_tree(),
css_selector(),
({String.t(), [html_attribute()]} -> {String.t(), [html_attribute()]} | :delete)
({String.t(), html_attributes()} -> {String.t(), html_attributes()} | :delete)
) :: html_tree()
def find_and_update(html_tree, selector, fun) do
{tree, results} = Finder.find(html_tree, selector)
Expand Down
39 changes: 31 additions & 8 deletions lib/floki/html_parser.ex
Original file line number Diff line number Diff line change
Expand Up @@ -16,24 +16,47 @@ defmodule Floki.HTMLParser do
The default parser is Mochiweb, which comes with Floki.
You can also choose between Html5ever or FastHtml.
And it's possible to pass down options to the parsers using
the `parser_args` option.
This module is also a behaviour that those parsers must implement.
"""

@default_parser Floki.HTMLParser.Mochiweb

@callback parse_document(binary(), list()) :: {:ok, Floki.html_tree()} | {:error, String.t()}
@callback parse_fragment(binary(), list()) :: {:ok, Floki.html_tree()} | {:error, String.t()}
@typep result(success) :: {:ok, success} | {:error, String.t()}
@typep html :: binary()

@callback parse_document(html(), Keyword.t()) :: result(Floki.html_tree())
@callback parse_fragment(html(), Keyword.t()) :: result(Floki.html_tree())

@callback parse_document_with_attributes_as_maps(html(), Keyword.t()) ::
result(Floki.html_tree())
@callback parse_fragment_with_attributes_as_maps(html(), Keyword.t()) ::
result(Floki.html_tree())

def parse_document(html, opts \\ []) do
parser_args = opts[:parser_args] || []
def parse_document(html, opts \\ []) when is_binary(html) do
{parser_args, opts} = Keyword.pop(opts, :parser_args, [])

parser(opts).parse_document(html, parser_args)
parser = parser(opts)

if opts[:attributes_as_maps] do
parser.parse_document_with_attributes_as_maps(html, parser_args)
else
parser.parse_document(html, parser_args)
end
end

def parse_fragment(html, opts \\ []) do
parser_args = opts[:parser_args] || []
def parse_fragment(html, opts \\ []) when is_binary(html) do
{parser_args, opts} = Keyword.pop(opts, :parser_args, [])

parser = parser(opts)

parser(opts).parse_fragment(html, parser_args)
if opts[:attributes_as_maps] do
parser.parse_fragment_with_attributes_as_maps(html, parser_args)
else
parser.parse_fragment(html, parser_args)
end
end

defp parser(opts) do
Expand Down
10 changes: 10 additions & 0 deletions lib/floki/html_parser/fast_html.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,16 @@ defmodule Floki.HTMLParser.FastHtml do
execute_with_module(fn module -> module.decode_fragment(html, args) end)
end

# This backend has no attributes-as-maps document parsing yet, so the callback
# ignores its arguments and always raises a RuntimeError.
@impl true
def parse_document_with_attributes_as_maps(_html, _args),
  do: raise(RuntimeError, "parsing with attributes as maps is not supported yet for FastHTML")

# This backend has no attributes-as-maps fragment parsing yet, so the callback
# ignores its arguments and always raises a RuntimeError.
@impl true
def parse_fragment_with_attributes_as_maps(_html, _args),
  do: raise(RuntimeError, "parsing with attributes as maps is not supported yet for FastHTML")

defp execute_with_module(fun) do
case Code.ensure_loaded(:fast_html) do
{:module, module} ->
Expand Down
10 changes: 10 additions & 0 deletions lib/floki/html_parser/html5ever.ex
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,14 @@ defmodule Floki.HTMLParser.Html5ever do
# NOTE: html5ever does not implement parse_fragment yet, so fragments are
# handed to the document parser unchanged.
@impl true
def parse_fragment(html, args) do
  parse_document(html, args)
end

# This backend has no attributes-as-maps document parsing yet, so the callback
# ignores its arguments and always raises a RuntimeError.
@impl true
def parse_document_with_attributes_as_maps(_html, _args),
  do: raise(RuntimeError, "parsing with attributes as maps is not supported yet for Html5ever")

# This backend has no attributes-as-maps fragment parsing yet, so the callback
# ignores its arguments and always raises a RuntimeError.
@impl true
def parse_fragment_with_attributes_as_maps(_html, _args),
  do: raise(RuntimeError, "parsing with attributes as maps is not supported yet for Html5ever")
end
14 changes: 12 additions & 2 deletions lib/floki/html_parser/mochiweb.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,24 @@ defmodule Floki.HTMLParser.Mochiweb do
@root_node "floki"

@impl true
def parse_document(html, _args) do
def parse_document(html, args) do
html = "<#{@root_node}>#{html}</#{@root_node}>"
{@root_node, [], parsed} = :floki_mochi_html.parse(html)
{@root_node, _, parsed} = :floki_mochi_html.parse(html, args)

{:ok, parsed}
end

# NOTE: mochi_html cannot distinguish between a fragment and a document, so
# fragments are parsed exactly like documents.
@impl true
def parse_fragment(html, args) do
  parse_document(html, args)
end

# Maps variant of `parse_document/2`: sets `:attributes_as_maps` to `true` in
# the parser args and delegates to the regular document parser.
@impl true
def parse_document_with_attributes_as_maps(html, args) do
  args_with_maps = Keyword.put(args, :attributes_as_maps, true)

  parse_document(html, args_with_maps)
end

# Maps variant of `parse_fragment/2`: sets `:attributes_as_maps` to `true` in
# the parser args and delegates to `parse_fragment/2`.
#
# Going through `parse_fragment/2` (rather than `parse_document/2` directly)
# keeps both fragment entry points on the same code path, so they cannot drift
# apart if fragment parsing ever stops being an alias of document parsing.
@impl true
def parse_fragment_with_attributes_as_maps(html, args) do
  parse_fragment(html, Keyword.put(args, :attributes_as_maps, true))
end
end
74 changes: 46 additions & 28 deletions src/floki_mochi_html.erl
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
-module(floki_mochi_html).
-export([
tokens/1,
parse/1
parse/2
]).
-ifdef(TEST).
-export([destack/1, destack/2, is_singleton/1]).
Expand Down Expand Up @@ -96,17 +96,28 @@

%% External API.

%% @spec parse(string() | binary()) -> html_node()
%% @spec parse(string() | binary(), list()) -> html_node()
%% @doc tokenize and then transform the token stream into a HTML tree.
parse(Input) ->
parse_tokens(tokens(Input)).
%%
%% The following option is supported:
%%
%% <dl>
%% <dt>`attributes_as_maps`</dt>
%% <dd>
%% When `true`, it configures the parser to use maps for the attributes.
%% It is `false` by default, which means attributes are going to be represented
%% as a list of tuples.
%% </dd>
%% </dl>
parse(Input, Opts) ->
    %% Tokenize the input first, then build the tree with the caller's options.
    Tokens = tokens(Input),
    parse_tokens(Tokens, Opts).

%% @spec parse_tokens([html_token()], list()) -> html_node()
%% @doc Transform the output of tokens(Doc) into a HTML tree.
parse_tokens(Tokens) when is_list(Tokens) ->
parse_tokens(Tokens, Opts) when is_list(Tokens) andalso is_list(Opts) ->
%% Skip over doctype, processing instructions
[{start_tag, Tag, Attrs, false} | Rest] = find_document(Tokens, normal),
{Tree, _} = tree(Rest, [norm({Tag, Attrs})]),
{Tree, _} = tree(Rest, [norm({Tag, Attrs}, Opts)], Opts),
Tree.

find_document(Tokens = [{start_tag, _Tag, _Attrs, false} | _Rest], Mode) ->
Expand Down Expand Up @@ -215,38 +226,45 @@ tree_data([{data, Data, Whitespace} | Rest], AllWhitespace, Acc) ->
tree_data(Rest, AllWhitespace, Acc) ->
{iolist_to_binary(lists:reverse(Acc)), AllWhitespace, Rest}.

tree([], Stack) ->
tree([], Stack, _Opts) ->
{destack(Stack), []};
tree([{end_tag, Tag} | Rest], Stack) ->
case destack(norm(Tag), Stack) of
tree([{end_tag, Tag} | Rest], Stack, Opts) ->
case destack(norm(Tag, Opts), Stack) of
S when is_list(S) ->
tree(Rest, S);
tree(Rest, S, Opts);
Result ->
{Result, []}
end;
tree([{start_tag, Tag, Attrs, true} | Rest], S) ->
tree(Rest, append_stack_child(norm({Tag, Attrs}), S));
tree([{start_tag, Tag, Attrs, false} | Rest], S) ->
tree(Rest, stack(norm({Tag, Attrs}), S));
tree([T = {pi, _Tag, _Attrs} | Rest], S) ->
tree(Rest, append_stack_child(T, S));
tree([T = {comment, _Comment} | Rest], S) ->
tree(Rest, append_stack_child(T, S));
tree(L = [{data, _Data, _Whitespace} | _], S) ->
tree([{start_tag, Tag, Attrs, true} | Rest], S, Opts) ->
tree(Rest, append_stack_child(norm({Tag, Attrs}, Opts), S), Opts);
tree([{start_tag, Tag, Attrs, false} | Rest], S, Opts) ->
tree(Rest, stack(norm({Tag, Attrs}, Opts), S), Opts);
tree([T = {pi, _Tag, _Attrs} | Rest], S, Opts) ->
tree(Rest, append_stack_child(T, S), Opts);
tree([T = {comment, _Comment} | Rest], S, Opts) ->
tree(Rest, append_stack_child(T, S), Opts);
tree(L = [{data, _Data, _Whitespace} | _], S, Opts) ->
case tree_data(L, true, []) of
{_, true, Rest} ->
tree(Rest, S);
tree(Rest, S, Opts);
{Data, false, Rest} ->
tree(Rest, append_stack_child(Data, S))
tree(Rest, append_stack_child(Data, S), Opts)
end;
tree([{doctype, _} | Rest], Stack) ->
tree(Rest, Stack).

norm({Tag, Attrs}) ->
{norm(Tag), [{norm(K), iolist_to_binary(V)} || {K, V} <- Attrs], []};
norm(Tag) when is_binary(Tag) ->
tree([{doctype, _} | Rest], Stack, Opts) ->
tree(Rest, Stack, Opts).

%% Normalise a tag name, or a `{Tag, Attrs}` pair, for the tree builder.
%%
%% * `norm({Tag, Attrs}, Opts)` returns `{NormTag, NormAttrs, []}`. Attribute
%%   names are normalised like tag names and values are converted to binaries.
%%   When `Opts` contains `{attributes_as_maps, true}` the attributes are
%%   returned as a map, otherwise as a list of `{Name, Value}` tuples.
%% * `norm(Tag, Opts)` returns binaries untouched and converts any other
%%   input (a string) to a lower-cased binary.
norm({Tag, Attrs}, Opts) ->
    %% Bind the result to a fresh variable: `Attrs` is already bound, so
    %% re-using it would be a match and raise `badmatch` whenever the
    %% normalisation changes a key or a value.
    NormAttrs = [{norm(K, Opts), iolist_to_binary(V)} || {K, V} <- Attrs],
    case lists:keyfind(attributes_as_maps, 1, Opts) of
        {attributes_as_maps, true} ->
            %% maps:from_list/1 keeps the last value of a duplicated key.
            %% HTML keeps the first occurrence of a duplicated attribute, so
            %% the list is reversed before building the map.
            {norm(Tag, Opts), maps:from_list(lists:reverse(NormAttrs)), []};
        _ ->
            {norm(Tag, Opts), NormAttrs, []}
    end;
norm(Tag, _Opts) when is_binary(Tag) ->
    Tag;
norm(Tag, _Opts) ->
    list_to_binary(string:to_lower(Tag)).

stack(T1 = {TN, _, _}, Stack = [{TN, _, _} | _Rest]) when
Expand Down
34 changes: 34 additions & 0 deletions test/floki_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -1792,6 +1792,40 @@ defmodule FlokiTest do
assert result == html
end

@tag only_parser: Mochiweb
test "parse document with attributes as map option enabled" do
  markup =
    html_body("""
    <div class="container">
    <ul>
    <li class="link active"><a href="/">Home</a></li>
    <li class="link"><a href="/about-us">About us</a></li>
    </ul>
    </div>
    """)

  # Build the expected tree bottom-up; every attribute container is a map,
  # including the empty ones.
  home_item = {"li", %{"class" => "link active"}, [{"a", %{"href" => "/"}, ["Home"]}]}
  about_item = {"li", %{"class" => "link"}, [{"a", %{"href" => "/about-us"}, ["About us"]}]}
  menu = {"ul", %{}, [home_item, about_item]}
  container = {"div", %{"class" => "container"}, [menu]}

  expected = [{"html", %{}, [{"head", %{}, []}, {"body", %{}, [container]}]}]

  assert Floki.parse_document(markup, attributes_as_maps: true) == {:ok, expected}
end

# Wraps `body` in a minimal HTML document skeleton with an empty head.
defp html_body(body), do: "<html><head></head><body>#{body}</body></html>"
Expand Down

0 comments on commit 706d5bf

Please sign in to comment.