Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add include_inputs? option to Floki.text #459

Merged
merged 5 commits into from
May 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions lib/floki.ex
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,9 @@ defmodule Floki do
iex> Floki.text({"div", [], [{"script", [], ["hello"]}, " world"]})
" world"
iex> Floki.text([{"input", [{"type", "date"}, {"value", "2017-06-01"}], []}], include_inputs: true)
"2017-06-01"
iex> Floki.text({"div", [], [{"script", [], ["hello"]}, " world"]}, js: true)
"hello world"
Expand All @@ -504,7 +507,7 @@ defmodule Floki do

@spec text(html_tree | html_node | binary) :: binary

def text(html, opts \\ [deep: true, js: false, style: true, sep: ""]) do
def text(html, opts \\ [deep: true, js: false, style: true, sep: "", include_inputs: false]) do
cleaned_html_tree =
html
|> parse_it()
Expand All @@ -518,8 +521,8 @@ defmodule Floki do
end

case opts[:sep] do
nil -> search_strategy.get(cleaned_html_tree)
sep -> search_strategy.get(cleaned_html_tree, sep)
nil -> search_strategy.get(cleaned_html_tree, "", opts[:include_inputs])
sep -> search_strategy.get(cleaned_html_tree, sep, opts[:include_inputs])
end
end

Expand Down
32 changes: 21 additions & 11 deletions lib/floki/deep_text.ex
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,35 @@ defmodule Floki.DeepText do

@type html_tree :: tuple | list

@spec get(html_tree, binary) :: binary
@spec get(html_tree, binary, boolean) :: binary

def get(html_tree, sep \\ "") do
get_text(html_tree, "", sep)
def get(html_tree, sep \\ "", include_inputs? \\ false)
viniciusmuller marked this conversation as resolved.
Show resolved Hide resolved

def get(html_tree, sep, include_inputs?) do
get_text(html_tree, "", sep, include_inputs?)
end

defp get_text(text, "", _sep) when is_binary(text), do: text
defp get_text(text, acc, sep) when is_binary(text), do: Enum.join([acc, text], sep)
defp get_text(text, "", _sep, _) when is_binary(text), do: text
defp get_text(text, acc, sep, _) when is_binary(text), do: Enum.join([acc, text], sep)

defp get_text(nodes, acc, sep) when is_list(nodes) do
defp get_text(nodes, acc, sep, include_inputs?) when is_list(nodes) do
Enum.reduce(nodes, acc, fn child, istr ->
get_text(child, istr, sep)
get_text(child, istr, sep, include_inputs?)
end)
end

defp get_text({:comment, _}, acc, _), do: acc
defp get_text({"br", _, _}, acc, _), do: acc <> "\n"
defp get_text({:comment, _}, acc, _, _), do: acc
defp get_text({"br", _, _}, acc, _, _), do: acc <> "\n"

# Clause used only when the fourth argument (include_inputs?) is `true`:
# an "input" node appends the result of
# Floki.TextExtractor.extract_input_value/1 to the accumulator.
# The node's children and the separator are both ignored here.
defp get_text({"input", attrs, _}, acc, _, true) do
acc <> Floki.TextExtractor.extract_input_value(attrs)
end

# Same handling for "textarea" nodes when include_inputs? is `true`.
# NOTE(review): the text is taken from the attribute list and the node's
# children are discarded — confirm that textarea content stored as child
# text is intentionally not extracted by this clause.
defp get_text({"textarea", attrs, _}, acc, _, true) do
acc <> Floki.TextExtractor.extract_input_value(attrs)
end

defp get_text({_, _, nodes}, acc, sep) do
get_text(nodes, acc, sep)
defp get_text({_, _, nodes}, acc, sep, include_inputs?) do
get_text(nodes, acc, sep, include_inputs?)
end
end
35 changes: 20 additions & 15 deletions lib/floki/flat_text.ex
Original file line number Diff line number Diff line change
Expand Up @@ -11,31 +11,36 @@ defmodule Floki.FlatText do

@type html_tree :: tuple | list

@spec get(html_tree, binary) :: binary
@spec get(html_tree, binary, boolean) :: binary

def get(html_nodes, sep \\ "")
def get(html_nodes, sep \\ "", include_inputs? \\ false)

def get(html_nodes, sep) when is_list(html_nodes) do
def get(html_nodes, sep, include_inputs?) when is_list(html_nodes) do
Enum.reduce(html_nodes, "", fn html_node, acc ->
text_from_node(html_node, acc, sep)
text_from_node(html_node, acc, 0, sep, include_inputs?)
end)
end

def get(html_node, sep) do
text_from_node(html_node, "", sep)
def get(html_node, sep, include_inputs?) do
text_from_node(html_node, "", 0, sep, include_inputs?)
end

defp text_from_node({_tag, _attrs, html_nodes}, acc, sep) do
# Clause used only when the last argument (include_inputs?) is `true` and the
# "input" node has an empty children list: appends the result of
# Floki.TextExtractor.extract_input_value/1 to the accumulator.
# Depth and separator are ignored.
defp text_from_node({"input", attrs, []}, acc, _, _, true) do
acc <> Floki.TextExtractor.extract_input_value(attrs)
end

# Same handling for "textarea" nodes, again only when the children list is empty.
# NOTE(review): a textarea with child text does not match this clause and
# falls through to the later clauses — confirm that is the intended behaviour.
defp text_from_node({"textarea", attrs, []}, acc, _, _, true) do
acc <> Floki.TextExtractor.extract_input_value(attrs)
end

defp text_from_node({_tag, _attrs, html_nodes}, acc, depth, sep, include_inputs?)
when depth < 1 do
Enum.reduce(html_nodes, acc, fn html_node, acc ->
capture_text(html_node, acc, sep)
text_from_node(html_node, acc, depth + 1, sep, include_inputs?)
end)
end

defp text_from_node(text, "", _sep) when is_binary(text), do: text
defp text_from_node(text, acc, sep) when is_binary(text), do: Enum.join([acc, text], sep)
defp text_from_node(_, acc, _), do: acc

defp capture_text(text, "", _sep) when is_binary(text), do: text
defp capture_text(text, acc, sep) when is_binary(text), do: Enum.join([acc, text], sep)
defp capture_text(_html_node, acc, _), do: acc
defp text_from_node(text, "", _, _sep, _) when is_binary(text), do: text
defp text_from_node(text, acc, _, sep, _) when is_binary(text), do: Enum.join([acc, text], sep)
defp text_from_node(_, acc, _, _, _), do: acc
end
35 changes: 35 additions & 0 deletions lib/floki/text_extractor.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
defmodule Floki.TextExtractor do
  @moduledoc false

  # Input types whose `value` attribute is treated as extractable text.
  # Any type not listed here (for example "password", "hidden" or "checkbox")
  # yields an empty string.
  @allowed_input_types [
    "color",
    "date",
    "datetime-local",
    "email",
    "month",
    "number",
    "search",
    "tel",
    "text",
    "time",
    "url",
    "week"
  ]

  @doc """
  Returns the `value` attribute found in `attrs` when the `type` attribute
  names one of the allowed text-like input types, and `""` otherwise.

  `attrs` is a list of `{name, value}` tuples. A missing `type` attribute is
  treated as `"text"`. The type is compared ASCII case-insensitively, so
  `type="TEXT"` behaves like `type="text"`. A missing `value` attribute
  yields `""`.
  """
  @spec extract_input_value([{binary, binary}]) :: binary
  def extract_input_value(attrs) do
    # Only the first `type` attribute is considered; absence means "text".
    {"type", type} = Enum.find(attrs, {"type", "text"}, &match?({"type", _}, &1))

    if allowed_type?(type), do: extract_value(attrs), else: ""
  end

  # HTML enumerated attribute keywords are ASCII case-insensitive, so the type
  # is normalised before the lookup. Non-binary types are never allowed.
  defp allowed_type?(type) when is_binary(type) do
    String.downcase(type, :ascii) in @allowed_input_types
  end

  defp allowed_type?(_type), do: false

  # Returns the first truthy `value` attribute, or "" when there is none.
  defp extract_value(attrs) do
    Enum.find_value(attrs, "", fn
      {"value", value} -> value
      _other -> nil
    end)
  end
end
24 changes: 24 additions & 0 deletions test/floki/deep_text_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,30 @@ defmodule Floki.DeepTextTest do
assert Floki.DeepText.get(node, " ") == "Google"
end

# The markup carries no `type` attribute; passing `true` as the third
# argument of Floki.DeepText.get/3 turns on input extraction.
test "extracts text from text input" do
html = "<input value='foo' />"
{:ok, node} = Floki.parse_document(html)

assert Floki.DeepText.get(node, " ", true) == "foo"
end

# NOTE(review): the text is supplied through a `value` attribute rather than
# as element content — confirm this is the intended textarea contract.
test "extracts text from textarea" do
html = "<textarea value='bar' />"
{:ok, node} = Floki.parse_document(html)

assert Floki.DeepText.get(node, " ", true) == "bar"
end

# The input sits one level below the root "div" node, built by hand as a tuple tree.
test "extracts text from nested inputs" do
node =
{"div", [],
[
{"input", [{"value", "bar"}], []}
]}

assert Floki.DeepText.get(node, " ", true) == "bar"
end

test "text from a list of deep nodes" do
nodes = [
{
Expand Down
22 changes: 22 additions & 0 deletions test/floki/flat_text_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,28 @@ defmodule Floki.FlatTextTest do
assert Floki.FlatText.get(node, " ") == "Elixir lang"
end

# The node has no `type` attribute and an empty children list; passing `true`
# as the third argument of Floki.FlatText.get/3 turns on input extraction.
test "extracts text from text input" do
node = {"input", [{"value", "foo"}], []}

assert Floki.FlatText.get(node, " ", true) == "foo"
end

# NOTE(review): the text is supplied through a `value` attribute and the
# children list is empty — confirm this is the intended textarea contract.
test "extracts text from textarea" do
node = {"textarea", [{"value", "bar"}], []}

assert Floki.FlatText.get(node, " ", true) == "bar"
end

# The input is a direct child of the root "div" node.
test "extracts text from nested inputs" do
node =
{"div", [],
[
{"input", [{"value", "bar"}], []}
]}

assert Floki.FlatText.get(node, " ", true) == "bar"
end

test "a blank string when the node does not have text in the same level" do
node = {"div", [], [{"a", [], ["Something in a deeper node"]}]}

Expand Down