Skip to content

Commit

Permalink
Add option to escape excel formulas (#103)
Browse files Browse the repository at this point in the history
  • Loading branch information
maennchen committed Oct 6, 2021
1 parent 757c25c commit faef83e
Show file tree
Hide file tree
Showing 8 changed files with 126 additions and 42 deletions.
4 changes: 4 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@

## Unreleased
- Optional parameter `escape_formulas` to prevent CSV injection. [Fixes #103](https://github.com/beatrichartz/csv/issues/103) reported by [@maennchen](https://github.com/maennchen). Contributed by [@maennchen](https://github.com/maennchen) in [PR #104](https://github.com/beatrichartz/csv/pull/104).

## 2.4.1
- Fix unnecessary escaping of delimiters when encoding [Fixes #70](https://github.com/beatrichartz/csv/issues/70)
reported by [@karmajunkie](https://github.com/karmajunkie)
Expand Down
14 changes: 8 additions & 6 deletions lib/csv/decoding/decoder.ex
Original file line number Diff line number Diff line change
Expand Up @@ -22,23 +22,25 @@ defmodule CSV.Decoding.Decoder do
These are the options:
* `:separator` – The separator token to use, defaults to `?,`.
* `:separator` – The separator token to use, defaults to `?,`.
Must be a codepoint (syntax: ? + (your separator)).
* `:strip_fields` – When set to true, will strip whitespace from fields.
Defaults to false.
* `:num_workers` – The number of parallel operations to run when producing
* `:num_workers` – The number of parallel operations to run when producing
the stream.
* `:worker_work_ratio` – The available work per worker, defaults to 5.
Higher rates will mean more work sharing, but might also lead to work
fragmentation slowing down the queues.
* `:headers` – When set to `true`, will take the first row of the csv
* `:headers` – When set to `true`, will take the first row of the csv
and use it as header values.
When set to a list, will use the given list as header values.
When set to `false` (default), will use no header values.
When set to anything but `false`, the resulting rows in the matrix will
be maps instead of lists.
* `:replacement` – The replacement string to use where lines have bad
* `:replacement` – The replacement string to use where lines have bad
encoding. Defaults to `nil`, which disables replacement.
* `:escape_formulas` – Remove formula escaping inserted to prevent
[CSV Injection](https://owasp.org/www-community/attacks/CSV_Injection).
## Examples
Expand All @@ -50,7 +52,7 @@ defmodule CSV.Decoding.Decoder do
...> |> Enum.take(2)
[ok: [\"a\", \"b\"], ok: [\"c\", \"d\"]]
Map an existing stream of lines separated by a token to a stream of rows with
Map an existing stream of lines separated by a token to a stream of rows with
a header row:
iex> [\"a;b\",\"c;d\", \"e;f\"]
Expand All @@ -62,7 +64,7 @@ defmodule CSV.Decoding.Decoder do
ok: %{\"a\" => \"e\", \"b\" => \"f\"}
]
Map an existing stream of lines separated by a token to a stream of rows with
Map an existing stream of lines separated by a token to a stream of rows with
a header row with duplications:
iex> [\"a;b;b\",\"c;d;e\", \"f;g;h\"]
Expand Down
70 changes: 49 additions & 21 deletions lib/csv/decoding/lexer.ex
Original file line number Diff line number Diff line change
Expand Up @@ -25,69 +25,97 @@ defmodule CSV.Decoding.Lexer do
def lex({line, index}, options \\ []) when is_list(options) do
separator = options |> Keyword.get(:separator, @separator)
replacement = options |> Keyword.get(:replacement, @replacement)
escape_formulas = options |> Keyword.get(:escape_formulas, @escape_formulas)

case String.valid?(line) do
false ->
if replacement do
replace_bad_encoding(line, replacement) |> lex(index, separator)
replace_bad_encoding(line, replacement) |> lex(index, separator, escape_formulas)
else
{:error, EncodingError, "Invalid encoding", index}
end

true ->
lex(line, index, separator)
lex(line, index, separator, escape_formulas)
end
end

defp lex(line, index, separator) do
case lex([], nil, line, separator) do
defp lex(line, index, separator, escape_formulas) do
case lex([], nil, line, separator, escape_formulas) do
{:ok, tokens} -> {:ok, tokens, index}
end
end

defp lex(tokens, {:delimiter, value}, <<@newline::utf8>> <> tail, separator) do
lex(tokens, {:delimiter, value <> <<@newline::utf8>>}, tail, separator)
defp lex(tokens, {:delimiter, value}, <<@newline::utf8>> <> tail, separator, escape_formulas) do
lex(tokens, {:delimiter, value <> <<@newline::utf8>>}, tail, separator, escape_formulas)
end

defp lex(tokens, current_token, <<@newline::utf8>> <> tail, separator) do
lex(tokens |> add_token(current_token), {:delimiter, <<@newline::utf8>>}, tail, separator)
defp lex(tokens, current_token, <<@newline::utf8>> <> tail, separator, escape_formulas) do
lex(
tokens |> add_token(current_token),
{:delimiter, <<@newline::utf8>>},
tail,
separator,
escape_formulas
)
end

defp lex(tokens, current_token, <<@carriage_return::utf8>> <> tail, separator) do
defp lex(tokens, current_token, <<@carriage_return::utf8>> <> tail, separator, escape_formulas) do
lex(
tokens |> add_token(current_token),
{:delimiter, <<@carriage_return::utf8>>},
tail,
separator
separator,
escape_formulas
)
end

defp lex(tokens, current_token, <<@double_quote::utf8>> <> tail, separator) do
defp lex(tokens, current_token, <<@double_quote::utf8>> <> tail, separator, escape_formulas) do
lex(
tokens |> add_token(current_token),
{:double_quote, <<@double_quote::utf8>>},
tail,
separator
separator,
escape_formulas
)
end

defp lex(tokens, current_token, <<head::utf8>> <> tail, separator) when head == separator do
lex(tokens |> add_token(current_token), {:separator, <<separator::utf8>>}, tail, separator)
defp lex(tokens, current_token, <<head::utf8>> <> tail, separator, escape_formulas)
when head == separator do
lex(
tokens |> add_token(current_token),
{:separator, <<separator::utf8>>},
tail,
separator,
escape_formulas
)
end

defp lex(tokens, {:content, value}, <<head::utf8>> <> tail, separator) do
lex(tokens, {:content, value <> <<head::utf8>>}, tail, separator)
for start <- @escape_formula_start do
defp lex(tokens, current_token, "'#{unquote(start)}" <> tail, separator, true) do
lex(tokens, current_token, unquote(start) <> tail, separator, true)
end
end

defp lex(tokens, {:content, value}, <<head::utf8>> <> tail, separator, escape_formulas) do
lex(tokens, {:content, value <> <<head::utf8>>}, tail, separator, escape_formulas)
end

defp lex(tokens, nil, <<head::utf8>> <> tail, separator) do
lex(tokens, {:content, <<head::utf8>>}, tail, separator)
defp lex(tokens, nil, <<head::utf8>> <> tail, separator, escape_formulas) do
lex(tokens, {:content, <<head::utf8>>}, tail, separator, escape_formulas)
end

defp lex(tokens, current_token, <<head::utf8>> <> tail, separator) do
lex(tokens |> add_token(current_token), {:content, <<head::utf8>>}, tail, separator)
defp lex(tokens, current_token, <<head::utf8>> <> tail, separator, escape_formulas) do
lex(
tokens |> add_token(current_token),
{:content, <<head::utf8>>},
tail,
separator,
escape_formulas
)
end

defp lex(tokens, current_token, "", _) do
defp lex(tokens, current_token, "", _, _) do
{:ok, tokens |> add_token(current_token)}
end

Expand Down
2 changes: 2 additions & 0 deletions lib/csv/defaults.ex
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ defmodule CSV.Defaults do
@double_quote ?"
@escape_max_lines 1000
@replacement nil
@escape_formulas false
@escape_formula_start ["=", "-", "+", "@"]
end
end

Expand Down
31 changes: 24 additions & 7 deletions lib/csv/encoding/encode.ex
Original file line number Diff line number Diff line change
Expand Up @@ -33,15 +33,32 @@ defimpl CSV.Encode, for: BitString do
def encode(data, env \\ []) do
separator = env |> Keyword.get(:separator, @separator)
delimiter = env |> Keyword.get(:delimiter, @delimiter)
escape_formulas = env |> Keyword.get(:escape_formulas, @escape_formulas)

data =
if escape_formulas and String.starts_with?(data, @escape_formula_start) do
"'" <> data
else
data
end

patterns = [
<<separator::utf8>>,
delimiter,
<<@carriage_return::utf8>>,
<<@newline::utf8>>,
<<@double_quote::utf8>>
]

patterns =
if escape_formulas do
patterns ++ @escape_formula_start
else
patterns
end

cond do
String.contains?(data, [
<<separator::utf8>>,
delimiter,
<<@carriage_return::utf8>>,
<<@newline::utf8>>,
<<@double_quote::utf8>>
]) ->
String.contains?(data, patterns) ->
<<@double_quote::utf8>> <>
(data
|> String.replace(
Expand Down
14 changes: 8 additions & 6 deletions lib/csv/encoding/encoder.ex
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,18 @@ defmodule CSV.Encoding.Encoder do
These are the options:
* `:separator` – The separator token to use, defaults to `?,`.
* `:separator` – The separator token to use, defaults to `?,`.
Must be a codepoint (syntax: ? + your separator token).
* `:delimiter` – The delimiter token to use, defaults to `\"\\r\\n\"`.
* `:headers` – When set to `true`, uses the keys of the first map as
* `:delimiter` – The delimiter token to use, defaults to `\"\\r\\n\"`.
* `:headers` – When set to `true`, uses the keys of the first map as
the first element in the stream. All subsequent elements are the values
of the maps. When set to a list, will use the given list as the first
element in the stream and order all subsequent elements using that list.
When set to `false` (default), will use the raw inputs as elements.
When set to anything but `false`, all elements in the input stream are
assumed to be maps.
* `:escape_formulas` – Escape formulas to prevent
[CSV Injection](https://owasp.org/www-community/attacks/CSV_Injection).
## Examples
Expand Down Expand Up @@ -89,13 +91,13 @@ defmodule CSV.Encoding.Encoder do

encoded =
row
|> Enum.map(&encode_cell(&1, separator, delimiter))
|> Enum.map(&encode_cell(&1, options))
|> Enum.join(<<separator::utf8>>)

encoded <> delimiter
end

defp encode_cell(cell, separator, delimiter) do
CSV.Encode.encode(cell, separator: separator, delimiter: delimiter)
defp encode_cell(cell, options) do
CSV.Encode.encode(cell, options)
end
end
17 changes: 15 additions & 2 deletions test/decoding/baseline_exceptions_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@ defmodule DecodingTests.BaselineExceptionsTest do
]
end

def encode_decode_loop(l) do
l |> CSV.encode() |> Decoder.decode() |> Enum.to_list()
def encode_decode_loop(l, opts \\ []) do
l |> CSV.encode(opts) |> Decoder.decode(opts) |> Enum.to_list()
end

test "does not get corrupted after an error" do
Expand All @@ -101,4 +101,17 @@ defmodule DecodingTests.BaselineExceptionsTest do
assert result_b == [ok: ~w(b)]
assert result_c == [ok: ~w(b)]
end

test "removes escaping for formula" do
input = [["=1+1", ~S(=1+2";=1+2), ~S(=1+2'" ;,=1+2)], ["-10+7"], ["+10+7"], ["@A1:A10"]]

assert [
ok: [
"=1+1=1+2\";=1+2=1+2'\" ;,=1+2",
"-10+7",
"+10+7",
"@A1:A10"
]
] = encode_decode_loop([input], escape_formulas: true)
end
end
16 changes: 16 additions & 0 deletions test/encoding/escaped_fields_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,22 @@ defmodule EncodingTests.EscapedFieldsTest do
assert result == ["\"a,\",\"b\re\"\r\n", "\"c,f\"\"\",dg\r\n"]
end

test "encodes formulas and escapes them" do
result =
Encoder.encode(
[["=1+1", ~S(=1+2";=1+2), ~S(=1+2'" ;,=1+2)], ["-10+7"], ["+10+7"], ["@A1:A10"]],
escape_formulas: true
)
|> Enum.to_list()

assert result == [
~S("'=1+1","'=1+2"";=1+2","'=1+2'"" ;,=1+2") <> "\r\n",
~S("'-10+7") <> "\r\n",
~S("'+10+7") <> "\r\n",
~S("'@A1:A10") <> "\r\n"
]
end

test "encodes streams of various content to csv strings and escapes them" do
result = Encoder.encode([[:atom, 1], [["a", "b"], "dg"]]) |> Enum.to_list()
assert result == ["atom,1\r\n", "ab,dg\r\n"]
Expand Down

0 comments on commit faef83e

Please sign in to comment.