From 0812e611038cd8c0c84ff6d8f1d267911f5b4b86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Vin=C3=ADcius=20M=C3=BCller?= Date: Fri, 19 May 2023 19:12:47 -0300 Subject: [PATCH 1/5] Add "include_inputs?" option to Floki.text --- lib/floki.ex | 9 +++++--- lib/floki/deep_text.ex | 32 +++++++++++++++++--------- lib/floki/flat_text.ex | 35 ++++++++++++++++------------- lib/floki/text_extractor.ex | 42 +++++++++++++++++++++++++++++++++++ test/floki/deep_text_test.exs | 24 ++++++++++++++++++++ test/floki/flat_text_test.exs | 22 ++++++++++++++++++ 6 files changed, 135 insertions(+), 29 deletions(-) create mode 100644 lib/floki/text_extractor.ex diff --git a/lib/floki.ex b/lib/floki.ex index bbaaabf1..042982e4 100644 --- a/lib/floki.ex +++ b/lib/floki.ex @@ -482,6 +482,9 @@ defmodule Floki do iex> Floki.text({"div", [], [{"script", [], ["hello"]}, " world"]}) " world" + iex> Floki.text([{"input", [{"type", "date"}, {"value", "2017-06-01"}], []}], include_inputs?: true) + "2017-06-01" + iex> Floki.text({"div", [], [{"script", [], ["hello"]}, " world"]}, js: true) "hello world" @@ -504,7 +507,7 @@ defmodule Floki do @spec text(html_tree | html_node | binary) :: binary - def text(html, opts \\ [deep: true, js: false, style: true, sep: ""]) do + def text(html, opts \\ [deep: true, js: false, style: true, sep: "", include_inputs?: false]) do cleaned_html_tree = html |> parse_it() @@ -518,8 +521,8 @@ defmodule Floki do end case opts[:sep] do - nil -> search_strategy.get(cleaned_html_tree) - sep -> search_strategy.get(cleaned_html_tree, sep) + nil -> search_strategy.get(cleaned_html_tree, "", opts[:include_inputs?]) + sep -> search_strategy.get(cleaned_html_tree, sep, opts[:include_inputs]) end end diff --git a/lib/floki/deep_text.ex b/lib/floki/deep_text.ex index 0f8bbd8a..2dd4edd7 100644 --- a/lib/floki/deep_text.ex +++ b/lib/floki/deep_text.ex @@ -6,25 +6,35 @@ defmodule Floki.DeepText do @type html_tree :: tuple | list - @spec get(html_tree, binary) :: binary + @spec get(html_tree, binary, boolean) :: binary - def get(html_tree, sep \\ "") do - get_text(html_tree, "", sep) + def get(html_tree, sep \\ "", include_inputs? \\ false) + + def get(html_tree, sep, include_inputs?) do + get_text(html_tree, "", sep, include_inputs?) end - defp get_text(text, "", _sep) when is_binary(text), do: text - defp get_text(text, acc, sep) when is_binary(text), do: Enum.join([acc, text], sep) + defp get_text(text, "", _sep, _) when is_binary(text), do: text + defp get_text(text, acc, sep, _) when is_binary(text), do: Enum.join([acc, text], sep) - defp get_text(nodes, acc, sep) when is_list(nodes) do + defp get_text(nodes, acc, sep, include_inputs?) when is_list(nodes) do Enum.reduce(nodes, acc, fn child, istr -> - get_text(child, istr, sep) + get_text(child, istr, sep, include_inputs?) end) end - defp get_text({:comment, _}, acc, _), do: acc - defp get_text({"br", _, _}, acc, _), do: acc <> "\n" + defp get_text({:comment, _}, acc, _, _), do: acc + defp get_text({"br", _, _}, acc, _, _), do: acc <> "\n" + + defp get_text({"input", attrs, _}, acc, _, true) do + acc <> Floki.TextExtractor.extract_input_value(attrs) + end + + defp get_text({"textarea", attrs, _}, acc, _, true) do + acc <> Floki.TextExtractor.extract_input_value(attrs) + end - defp get_text({_, _, nodes}, acc, sep) do - get_text(nodes, acc, sep) + defp get_text({_, _, nodes}, acc, sep, include_inputs?) do + get_text(nodes, acc, sep, include_inputs?) end end diff --git a/lib/floki/flat_text.ex b/lib/floki/flat_text.ex index ff772944..7f0bb281 100644 --- a/lib/floki/flat_text.ex +++ b/lib/floki/flat_text.ex @@ -11,31 +11,36 @@ defmodule Floki.FlatText do @type html_tree :: tuple | list - @spec get(html_tree, binary) :: binary + @spec get(html_tree, binary, boolean) :: binary - def get(html_nodes, sep \\ "") + def get(html_nodes, sep \\ "", include_inputs? \\ false) - def get(html_nodes, sep) when is_list(html_nodes) do + def get(html_nodes, sep, include_inputs?) when is_list(html_nodes) do Enum.reduce(html_nodes, "", fn html_node, acc -> - text_from_node(html_node, acc, sep) + text_from_node(html_node, acc, 0, sep, include_inputs?) end) end - def get(html_node, sep) do - text_from_node(html_node, "", sep) + def get(html_node, sep, include_inputs?) do + text_from_node(html_node, "", 0, sep, include_inputs?) end - defp text_from_node({_tag, _attrs, html_nodes}, acc, sep) do + defp text_from_node({"input", attrs, []}, acc, _, _, true) do + acc <> Floki.TextExtractor.extract_input_value(attrs) + end + + defp text_from_node({"textarea", attrs, []}, acc, _, _, true) do + acc <> Floki.TextExtractor.extract_input_value(attrs) + end + + defp text_from_node({_tag, _attrs, html_nodes}, acc, depth, sep, include_inputs?) + when depth < 1 do Enum.reduce(html_nodes, acc, fn html_node, acc -> - capture_text(html_node, acc, sep) + text_from_node(html_node, acc, depth + 1, sep, include_inputs?) end) end - defp text_from_node(text, "", _sep) when is_binary(text), do: text - defp text_from_node(text, acc, sep) when is_binary(text), do: Enum.join([acc, text], sep) - defp text_from_node(_, acc, _), do: acc - - defp capture_text(text, "", _sep) when is_binary(text), do: text - defp capture_text(text, acc, sep) when is_binary(text), do: Enum.join([acc, text], sep) - defp capture_text(_html_node, acc, _), do: acc + defp text_from_node(text, "", _, _sep, _) when is_binary(text), do: text + defp text_from_node(text, acc, _, sep, _) when is_binary(text), do: Enum.join([acc, text], sep) + defp text_from_node(_, acc, _, _, _), do: acc end diff --git a/lib/floki/text_extractor.ex b/lib/floki/text_extractor.ex new file mode 100644 index 00000000..2ae41cb3 --- /dev/null +++ b/lib/floki/text_extractor.ex @@ -0,0 +1,42 @@ +defmodule Floki.TextExtractor do + @moduledoc false + + @allowed_input_types [ + nil, + "color", + "date", + "datetime-local", + "email", + "month", + "number", + "search", + "tel", + "text", + "time", + "url", + "week" + ] + + def extract_input_value(attrs) do + type = Enum.find(attrs, &match?({"type", _}, &1)) + + case type do + {"type", t} -> + if t in @allowed_input_types do + extract_value(attrs) + else + "" + end + + nil -> + extract_value(attrs) + end + end + + defp extract_value(attrs) do + Enum.find_value(attrs, "", fn + {"value", v} -> v + _ -> nil + end) + end +end diff --git a/test/floki/deep_text_test.exs b/test/floki/deep_text_test.exs index 30d808e6..abd78f48 100644 --- a/test/floki/deep_text_test.exs +++ b/test/floki/deep_text_test.exs @@ -15,6 +15,30 @@ defmodule Floki.DeepTextTest do assert Floki.DeepText.get(node, " ") == "Google" end + test "extracts text from text input" do + html = "" + {:ok, node} = Floki.parse_document(html) + + assert Floki.DeepText.get(node, " ", true) == "foo" + end + + test "extracts text from textarea" do + html = "