Skip to content

Commit

Permalink
Add option to escape excel formulas (#103)
Browse files Browse the repository at this point in the history
  • Loading branch information
maennchen committed Oct 6, 2021
1 parent 757c25c commit faef83e
Show file tree
Hide file tree
Showing 8 changed files with 126 additions and 42 deletions.
4 changes: 4 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@

## Unreleased
- Optional parameter `escape_formulas` to prevent CSV injection. [Fixes #103](https://github.com/beatrichartz/csv/issues/103) reported by [@maennchen](https://github.com/maennchen). Contributed by [@maennchen](https://github.com/maennchen) in [PR #104](https://github.com/beatrichartz/csv/pull/104).

## 2.4.1
- Fix unnecessary escaping of delimiters when encoding [Fixes #70](https://github.com/beatrichartz/csv/issues/70)
reported by [@karmajunkie](https://github.com/karmajunkie)
Expand Down
14 changes: 8 additions & 6 deletions lib/csv/decoding/decoder.ex
Original file line number Diff line number Diff line change
Expand Up @@ -22,23 +22,25 @@ defmodule CSV.Decoding.Decoder do
These are the options:
* `:separator` – The separator token to use, defaults to `?,`.
* `:separator` – The separator token to use, defaults to `?,`.
Must be a codepoint (syntax: ? + (your separator)).
* `:strip_fields` – When set to true, will strip whitespace from fields.
Defaults to false.
* `:num_workers` – The number of parallel operations to run when producing
* `:num_workers` – The number of parallel operations to run when producing
the stream.
* `:worker_work_ratio` – The available work per worker, defaults to 5.
Higher rates will mean more work sharing, but might also lead to work
fragmentation slowing down the queues.
* `:headers` – When set to `true`, will take the first row of the csv
* `:headers` – When set to `true`, will take the first row of the csv
and use it as header values.
When set to a list, will use the given list as header values.
When set to `false` (default), will use no header values.
When set to anything but `false`, the resulting rows in the matrix will
be maps instead of lists.
* `:replacement` – The replacement string to use where lines have bad
* `:replacement` – The replacement string to use where lines have bad
encoding. Defaults to `nil`, which disables replacement.
* `:escape_formulas` – Remove formula escaping inserted to prevent
[CSV Injection](https://owasp.org/www-community/attacks/CSV_Injection).
## Examples
Expand All @@ -50,7 +52,7 @@ defmodule CSV.Decoding.Decoder do
...> |> Enum.take(2)
[ok: [\"a\", \"b\"], ok: [\"c\", \"d\"]]
Map an existing stream of lines separated by a token to a stream of rows with
Map an existing stream of lines separated by a token to a stream of rows with
a header row:
iex> [\"a;b\",\"c;d\", \"e;f\"]
Expand All @@ -62,7 +64,7 @@ defmodule CSV.Decoding.Decoder do
ok: %{\"a\" => \"e\", \"b\" => \"f\"}
]
Map an existing stream of lines separated by a token to a stream of rows with
Map an existing stream of lines separated by a token to a stream of rows with
a header row with duplications:
iex> [\"a;b;b\",\"c;d;e\", \"f;g;h\"]
Expand Down
70 changes: 49 additions & 21 deletions lib/csv/decoding/lexer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -25,69 +25,97 @@ defmodule CSV.Decoding.Lexer do
def lex({line, index}, options \\ []) when is_list(options) do
separator = options |> Keyword.get(:separator, @separator)
replacement = options |> Keyword.get(:replacement, @replacement)
escape_formulas = options |> Keyword.get(:escape_formulas, @escape_formulas)

case String.valid?(line) do
false ->
if replacement do
replace_bad_encoding(line, replacement) |> lex(index, separator)
replace_bad_encoding(line, replacement) |> lex(index, separator, escape_formulas)
else
{:error, EncodingError, "Invalid encoding", index}
end

true ->
lex(line, index, separator)
lex(line, index, separator, escape_formulas)
end
end

defp lex(line, index, separator) do
case lex([], nil, line, separator) do
defp lex(line, index, separator, escape_formulas) do
case lex([], nil, line, separator, escape_formulas) do
{:ok, tokens} -> {:ok, tokens, index}
end
end

defp lex(tokens, {:delimiter, value}, <<@newline::utf8>> <> tail, separator) do
lex(tokens, {:delimiter, value <> <<@newline::utf8>>}, tail, separator)
defp lex(tokens, {:delimiter, value}, <<@newline::utf8>> <> tail, separator, escape_formulas) do
lex(tokens, {:delimiter, value <> <<@newline::utf8>>}, tail, separator, escape_formulas)
end

defp lex(tokens, current_token, <<@newline::utf8>> <> tail, separator) do
lex(tokens |> add_token(current_token), {:delimiter, <<@newline::utf8>>}, tail, separator)
defp lex(tokens, current_token, <<@newline::utf8>> <> tail, separator, escape_formulas) do
lex(
tokens |> add_token(current_token),
{:delimiter, <<@newline::utf8>>},
tail,
separator,
escape_formulas
)
end

defp lex(tokens, current_token, <<@carriage_return::utf8>> <> tail, separator) do
defp lex(tokens, current_token, <<@carriage_return::utf8>> <> tail, separator, escape_formulas) do
lex(
tokens |> add_token(current_token),
{:delimiter, <<@carriage_return::utf8>>},
tail,
separator
separator,
escape_formulas
)
end

defp lex(tokens, current_token, <<@double_quote::utf8>> <> tail, separator) do
defp lex(tokens, current_token, <<@double_quote::utf8>> <> tail, separator, escape_formulas) do
lex(
tokens |> add_token(current_token),
{:double_quote, <<@double_quote::utf8>>},
tail,
separator
separator,
escape_formulas
)
end

defp lex(tokens, current_token, <<head::utf8>> <> tail, separator) when head == separator do
lex(tokens |> add_token(current_token), {:separator, <<separator::utf8>>}, tail, separator)
defp lex(tokens, current_token, <<head::utf8>> <> tail, separator, escape_formulas)
when head == separator do
lex(
tokens |> add_token(current_token),
{:separator, <<separator::utf8>>},
tail,
separator,
escape_formulas
)
end

defp lex(tokens, {:content, value}, <<head::utf8>> <> tail, separator) do
lex(tokens, {:content, value <> <<head::utf8>>}, tail, separator)
for start <- @escape_formula_start do
defp lex(tokens, current_token, "'#{unquote(start)}" <> tail, separator, true) do
lex(tokens, current_token, unquote(start) <> tail, separator, true)
end
end

defp lex(tokens, {:content, value}, <<head::utf8>> <> tail, separator, escape_formulas) do
lex(tokens, {:content, value <> <<head::utf8>>}, tail, separator, escape_formulas)
end

defp lex(tokens, nil, <<head::utf8>> <> tail, separator) do
lex(tokens, {:content, <<head::utf8>>}, tail, separator)
defp lex(tokens, nil, <<head::utf8>> <> tail, separator, escape_formulas) do
lex(tokens, {:content, <<head::utf8>>}, tail, separator, escape_formulas)
end

defp lex(tokens, current_token, <<head::utf8>> <> tail, separator) do
lex(tokens |> add_token(current_token), {:content, <<head::utf8>>}, tail, separator)
defp lex(tokens, current_token, <<head::utf8>> <> tail, separator, escape_formulas) do
lex(
tokens |> add_token(current_token),
{:content, <<head::utf8>>},
tail,
separator,
escape_formulas
)
end

defp lex(tokens, current_token, "", _) do
defp lex(tokens, current_token, "", _, _) do
{:ok, tokens |> add_token(current_token)}
end

Expand Down
2 changes: 2 additions & 0 deletions lib/csv/defaults.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ defmodule CSV.Defaults do
@double_quote ?"
@escape_max_lines 1000
@replacement nil
@escape_formulas false
@escape_formula_start ["=", "-", "+", "@"]
end
end

Expand Down
31 changes: 24 additions & 7 deletions lib/csv/encoding/encode.ex
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,32 @@ defimpl CSV.Encode, for: BitString do
def encode(data, env \\ []) do
separator = env |> Keyword.get(:separator, @separator)
delimiter = env |> Keyword.get(:delimiter, @delimiter)
escape_formulas = env |> Keyword.get(:escape_formulas, @escape_formulas)

data =
if escape_formulas and String.starts_with?(data, @escape_formula_start) do
"'" <> data
else
data
end

patterns = [
<<separator::utf8>>,
delimiter,
<<@carriage_return::utf8>>,
<<@newline::utf8>>,
<<@double_quote::utf8>>
]

patterns =
if escape_formulas do
patterns ++ @escape_formula_start
else
patterns
end

cond do
String.contains?(data, [
<<separator::utf8>>,
delimiter,
<<@carriage_return::utf8>>,
<<@newline::utf8>>,
<<@double_quote::utf8>>
]) ->
String.contains?(data, patterns) ->
<<@double_quote::utf8>> <>
(data
|> String.replace(
Expand Down
14 changes: 8 additions & 6 deletions lib/csv/encoding/encoder.ex
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,18 @@ defmodule CSV.Encoding.Encoder do
These are the options:
* `:separator` – The separator token to use, defaults to `?,`.
* `:separator` – The separator token to use, defaults to `?,`.
Must be a codepoint (syntax: ? + your separator token).
* `:delimiter` – The delimiter token to use, defaults to `\"\\r\\n\"`.
* `:headers` – When set to `true`, uses the keys of the first map as
* `:delimiter` – The delimiter token to use, defaults to `\"\\r\\n\"`.
* `:headers` – When set to `true`, uses the keys of the first map as
the first element in the stream. All subsequent elements are the values
of the maps. When set to a list, will use the given list as the first
element in the stream and order all subsequent elements using that list.
When set to `false` (default), will use the raw inputs as elements.
When set to anything but `false`, all elements in the input stream are
assumed to be maps.
* `:escape_formulas` – Escape formulas to prevent
[CSV Injection](https://owasp.org/www-community/attacks/CSV_Injection).
## Examples
Expand Down Expand Up @@ -89,13 +91,13 @@ defmodule CSV.Encoding.Encoder do

encoded =
row
|> Enum.map(&encode_cell(&1, separator, delimiter))
|> Enum.map(&encode_cell(&1, options))
|> Enum.join(<<separator::utf8>>)

encoded <> delimiter
end

defp encode_cell(cell, separator, delimiter) do
CSV.Encode.encode(cell, separator: separator, delimiter: delimiter)
defp encode_cell(cell, options) do
CSV.Encode.encode(cell, options)
end
end
17 changes: 15 additions & 2 deletions test/decoding/baseline_exceptions_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@ defmodule DecodingTests.BaselineExceptionsTest do
]
end

def encode_decode_loop(l) do
l |> CSV.encode() |> Decoder.decode() |> Enum.to_list()
def encode_decode_loop(l, opts \\ []) do
l |> CSV.encode(opts) |> Decoder.decode(opts) |> Enum.to_list()
end

test "does not get corrupted after an error" do
Expand All @@ -101,4 +101,17 @@ defmodule DecodingTests.BaselineExceptionsTest do
assert result_b == [ok: ~w(b)]
assert result_c == [ok: ~w(b)]
end

test "removes escaping for formula" do
input = [["=1+1", ~S(=1+2";=1+2), ~S(=1+2'" ;,=1+2)], ["-10+7"], ["+10+7"], ["@A1:A10"]]

assert [
ok: [
"=1+1=1+2\";=1+2=1+2'\" ;,=1+2",
"-10+7",
"+10+7",
"@A1:A10"
]
] = encode_decode_loop([input], escape_formulas: true)
end
end
16 changes: 16 additions & 0 deletions test/encoding/escaped_fields_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,22 @@ defmodule EncodingTests.EscapedFieldsTest do
assert result == ["\"a,\",\"b\re\"\r\n", "\"c,f\"\"\",dg\r\n"]
end

test "encodes formulas and escapes them" do
result =
Encoder.encode(
[["=1+1", ~S(=1+2";=1+2), ~S(=1+2'" ;,=1+2)], ["-10+7"], ["+10+7"], ["@A1:A10"]],
escape_formulas: true
)
|> Enum.to_list()

assert result == [
~S("'=1+1","'=1+2"";=1+2","'=1+2'"" ;,=1+2") <> "\r\n",
~S("'-10+7") <> "\r\n",
~S("'+10+7") <> "\r\n",
~S("'@A1:A10") <> "\r\n"
]
end

test "encodes streams of various content to csv strings and escapes them" do
result = Encoder.encode([[:atom, 1], [["a", "b"], "dg"]]) |> Enum.to_list()
assert result == ["atom,1\r\n", "ab,dg\r\n"]
Expand Down

0 comments on commit faef83e

Please sign in to comment.