Add support for parsing attributes as maps
This is a feature request from #463
and is the base for implementing the same feature in more backends.

Right now only the built-in `mochiweb` parser implements the feature.
philss committed Jun 9, 2023
1 parent 2d5c282 commit 706d5bf
Showing 7 changed files with 164 additions and 42 deletions.
25 changes: 21 additions & 4 deletions lib/floki.ex
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,14 @@ defmodule Floki do
inside a list.

@type html_declaration :: {:pi, String.t(), [html_attribute()]}
@type html_attribute :: {String.t(), String.t()}
@type html_attributes :: [html_attribute()] | html_attributes_map()
@type html_attributes_map :: %{String.t() => String.t()}
@type html_declaration :: {:pi, String.t(), html_attributes()}
@type html_comment :: {:comment, String.t()}
@type html_doctype :: {:doctype, String.t(), String.t(), String.t()}
@type html_attribute :: {String.t(), String.t()}
@type html_text :: String.t()
@type html_tag :: {String.t(), [html_attribute()], [html_node()]}
@type html_tag :: {String.t(), html_attributes(), [html_node()]}
@type html_node ::
html_tag() | html_comment() | html_doctype() | html_declaration() | html_text()
@type html_tree :: [html_node()]
Expand Down Expand Up @@ -102,13 +104,21 @@ defmodule Floki do
## Options
* `:attributes_as_maps` - Change the behaviour of the parser to return the attributes
as maps, instead of a list of `{"key", "value"}`. Remember that maps are no longer
ordered since OTP 26. Defaults to `false`.
* `:html_parser` - The module of the backend that is responsible for parsing
the HTML string. By default it is set to the built-in parser, and the module
name is equal to `Floki.HTMLParser.Mochiweb`, or from the value of the
application env of the same name.
See the "Alternative HTML parsers" section of the README for more details.
* `:parser_args` - A list of options to the parser. This can be used to pass options
that are specific to a given parser.
Defaults to an empty list.
## Examples
iex> Floki.parse_document("<html><head></head><body>hello</body></html>")
Expand All @@ -117,6 +127,13 @@ defmodule Floki do
iex> Floki.parse_document("<html><head></head><body>hello</body></html>", html_parser: Floki.HTMLParser.Mochiweb)
{:ok, [{"html", [], [{"head", [], []}, {"body", [], ["hello"]}]}]}
iex> Floki.parse_document(
...> "<html><head></head><body class=main>hello</body></html>",
...> attributes_as_maps: true,
...> html_parser: Floki.HTMLParser.Mochiweb
{:ok, [{"html", %{}, [{"head", %{}, []}, {"body", %{"class" => "main"}, ["hello"]}]}]}

@spec parse_document(binary(), Keyword.t()) :: {:ok, html_tree()} | {:error, String.t()}
Expand Down Expand Up @@ -355,7 +372,7 @@ defmodule Floki do
@spec find_and_update(
({String.t(), [html_attribute()]} -> {String.t(), [html_attribute()]} | :delete)
({String.t(), html_attributes()} -> {String.t(), html_attributes()} | :delete)
) :: html_tree()
def find_and_update(html_tree, selector, fun) do
{tree, results} = Finder.find(html_tree, selector)
Expand Down
39 changes: 31 additions & 8 deletions lib/floki/html_parser.ex
Original file line number Diff line number Diff line change
Expand Up @@ -16,24 +16,47 @@ defmodule Floki.HTMLParser do
The default parser is Mochiweb, which comes with Floki.
You can also choose between Html5ever or FastHtml.
And it's possible to pass down options to the parsers using
the `parser_args` option.
This module is also a behaviour that those parsers must implement.

@default_parser Floki.HTMLParser.Mochiweb

@callback parse_document(binary(), list()) :: {:ok, Floki.html_tree()} | {:error, String.t()}
@callback parse_fragment(binary(), list()) :: {:ok, Floki.html_tree()} | {:error, String.t()}
@typep result(success) :: {:ok, success} | {:error, String.t()}
@typep html :: binary()

@callback parse_document(html(), Keyword.t()) :: result(Floki.html_tree())
@callback parse_fragment(html(), Keyword.t()) :: result(Floki.html_tree())

@callback parse_document_with_attributes_as_maps(html(), Keyword.t()) ::
@callback parse_fragment_with_attributes_as_maps(html(), Keyword.t()) ::

def parse_document(html, opts \\ []) do
parser_args = opts[:parser_args] || []
def parse_document(html, opts \\ []) when is_binary(html) do
{parser_args, opts} = Keyword.pop(opts, :parser_args, [])

parser(opts).parse_document(html, parser_args)
parser = parser(opts)

if opts[:attributes_as_maps] do
parser.parse_document_with_attributes_as_maps(html, parser_args)
parser.parse_document(html, parser_args)

def parse_fragment(html, opts \\ []) do
parser_args = opts[:parser_args] || []
def parse_fragment(html, opts \\ []) when is_binary(html) do
{parser_args, opts} = Keyword.pop(opts, :parser_args, [])

parser = parser(opts)

parser(opts).parse_fragment(html, parser_args)
if opts[:attributes_as_maps] do
parser.parse_fragment_with_attributes_as_maps(html, parser_args)
parser.parse_fragment(html, parser_args)

defp parser(opts) do
Expand Down
10 changes: 10 additions & 0 deletions lib/floki/html_parser/fast_html.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,16 @@ defmodule Floki.HTMLParser.FastHtml do
execute_with_module(fn module -> module.decode_fragment(html, args) end)

@impl true
def parse_document_with_attributes_as_maps(_html, _args) do
raise "parsing with attributes as maps is not supported yet for FastHTML"

@impl true
def parse_fragment_with_attributes_as_maps(_html, _args) do
raise "parsing with attributes as maps is not supported yet for FastHTML"

defp execute_with_module(fun) do
case Code.ensure_loaded(:fast_html) do
{:module, module} ->
Expand Down
10 changes: 10 additions & 0 deletions lib/floki/html_parser/html5ever.ex
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,14 @@ defmodule Floki.HTMLParser.Html5ever do
# NOTE: html5ever does not implement parse_fragment yet.
@impl true
def parse_fragment(html, args), do: parse_document(html, args)

@impl true
def parse_document_with_attributes_as_maps(_html, _args) do
raise "parsing with attributes as maps is not supported yet for Html5ever"

@impl true
def parse_fragment_with_attributes_as_maps(_html, _args) do
raise "parsing with attributes as maps is not supported yet for Html5ever"
14 changes: 12 additions & 2 deletions lib/floki/html_parser/mochiweb.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,24 @@ defmodule Floki.HTMLParser.Mochiweb do
@root_node "floki"

@impl true
def parse_document(html, _args) do
def parse_document(html, args) do
html = "<#{@root_node}>#{html}</#{@root_node}>"
{@root_node, [], parsed} = :floki_mochi_html.parse(html)
{@root_node, _, parsed} = :floki_mochi_html.parse(html, args)

{:ok, parsed}

# NOTE: mochi_html cannot distinguish between a fragment and a document.
@impl true
def parse_fragment(html, args), do: parse_document(html, args)

@impl true
def parse_document_with_attributes_as_maps(html, args) do
parse_document(html, Keyword.put(args, :attributes_as_maps, true))

@impl true
def parse_fragment_with_attributes_as_maps(html, args) do
parse_document(html, Keyword.put(args, :attributes_as_maps, true))
74 changes: 46 additions & 28 deletions src/floki_mochi_html.erl
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
-export([destack/1, destack/2, is_singleton/1]).
Expand Down Expand Up @@ -96,17 +96,28 @@

%% External API.

%% @spec parse(string() | binary()) -> html_node()
%% @spec parse(string() | binary(), list()) -> html_node()
%% @doc tokenize and then transform the token stream into a HTML tree.
parse(Input) ->
%% The following option is supported:
%% <dl>
%% <dt>`attributes_as_maps`</dt>
%% <dd>
%% When `true`, it configures the parser to use maps for the attributes.
%% It is `false` by default, which means attributes are going to be represented
%% as a list of tuples.
%% </dd>
%% </dl>
parse(Input, Opts) ->
parse_tokens(tokens(Input), Opts).

%% @spec parse_tokens([html_token()]) -> html_node()
%% @doc Transform the output of tokens(Doc) into a HTML tree.
parse_tokens(Tokens) when is_list(Tokens) ->
parse_tokens(Tokens, Opts) when is_list(Tokens) andalso is_list(Opts) ->
%% Skip over doctype, processing instructions
[{start_tag, Tag, Attrs, false} | Rest] = find_document(Tokens, normal),
{Tree, _} = tree(Rest, [norm({Tag, Attrs})]),
{Tree, _} = tree(Rest, [norm({Tag, Attrs}, Opts)], Opts),

find_document(Tokens = [{start_tag, _Tag, _Attrs, false} | _Rest], Mode) ->
Expand Down Expand Up @@ -215,38 +226,45 @@ tree_data([{data, Data, Whitespace} | Rest], AllWhitespace, Acc) ->
tree_data(Rest, AllWhitespace, Acc) ->
{iolist_to_binary(lists:reverse(Acc)), AllWhitespace, Rest}.

tree([], Stack) ->
tree([], Stack, _Opts) ->
{destack(Stack), []};
tree([{end_tag, Tag} | Rest], Stack) ->
case destack(norm(Tag), Stack) of
tree([{end_tag, Tag} | Rest], Stack, Opts) ->
case destack(norm(Tag, Opts), Stack) of
S when is_list(S) ->
tree(Rest, S);
tree(Rest, S, Opts);
Result ->
{Result, []}
tree([{start_tag, Tag, Attrs, true} | Rest], S) ->
tree(Rest, append_stack_child(norm({Tag, Attrs}), S));
tree([{start_tag, Tag, Attrs, false} | Rest], S) ->
tree(Rest, stack(norm({Tag, Attrs}), S));
tree([T = {pi, _Tag, _Attrs} | Rest], S) ->
tree(Rest, append_stack_child(T, S));
tree([T = {comment, _Comment} | Rest], S) ->
tree(Rest, append_stack_child(T, S));
tree(L = [{data, _Data, _Whitespace} | _], S) ->
tree([{start_tag, Tag, Attrs, true} | Rest], S, Opts) ->
tree(Rest, append_stack_child(norm({Tag, Attrs}, Opts), S), Opts);
tree([{start_tag, Tag, Attrs, false} | Rest], S, Opts) ->
tree(Rest, stack(norm({Tag, Attrs}, Opts), S), Opts);
tree([T = {pi, _Tag, _Attrs} | Rest], S, Opts) ->
tree(Rest, append_stack_child(T, S), Opts);
tree([T = {comment, _Comment} | Rest], S, Opts) ->
tree(Rest, append_stack_child(T, S), Opts);
tree(L = [{data, _Data, _Whitespace} | _], S, Opts) ->
case tree_data(L, true, []) of
{_, true, Rest} ->
tree(Rest, S);
tree(Rest, S, Opts);
{Data, false, Rest} ->
tree(Rest, append_stack_child(Data, S))
tree(Rest, append_stack_child(Data, S), Opts)
tree([{doctype, _} | Rest], Stack) ->
tree(Rest, Stack).

norm({Tag, Attrs}) ->
{norm(Tag), [{norm(K), iolist_to_binary(V)} || {K, V} <- Attrs], []};
norm(Tag) when is_binary(Tag) ->
tree([{doctype, _} | Rest], Stack, Opts) ->
tree(Rest, Stack, Opts).

% TODO: change in here - need
norm({Tag, Attrs}, Opts) ->
Attrs = [{norm(K, Opts), iolist_to_binary(V)} || {K, V} <- Attrs],
case lists:keyfind(attributes_as_maps, 1, Opts) of
{attributes_as_maps, true} ->
{norm(Tag, Opts), maps:from_list(Attrs), []};
_ ->
{norm(Tag, Opts), Attrs, []}
norm(Tag, _Opts) when is_binary(Tag) ->
norm(Tag) ->
norm(Tag, _Opts) ->

stack(T1 = {TN, _, _}, Stack = [{TN, _, _} | _Rest]) when
Expand Down
34 changes: 34 additions & 0 deletions test/floki_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -1792,6 +1792,40 @@ defmodule FlokiTest do
assert result == html

@tag only_parser: Mochiweb
test "parse document with attributes as map option enabled" do
html =
<div class="container">
<li class="link active"><a href="/">Home</a></li>
<li class="link"><a href="/about-us">About us</a></li>

assert {:ok, html_tree} = Floki.parse_document(html, attributes_as_maps: true)

assert html_tree == [
{"html", %{},
{"head", %{}, []},
{"body", %{},
{"div", %{"class" => "container"},
{"ul", %{},
{"li", %{"class" => "link active"}, [{"a", %{"href" => "/"}, ["Home"]}]},
{"li", %{"class" => "link"},
[{"a", %{"href" => "/about-us"}, ["About us"]}]}

defp html_body(body) do
Expand Down

