Skip to content

Commit

Permalink
Implement Property Tests for DataFrame.new (#1012)
Browse files Browse the repository at this point in the history
* Implement Property Tests for DataFrame.new

Co-authored-by: Billy Lanchantin <william.lanchantin@cargosense.com>

* First pass at changes

* Replace old generators

The existing property test used a special case
of the new generator logic. This replaces that
special case with the new generator.

* Add (skipped) serialization property tests

We should be able to serialize any DataFrame
or document the cases where we can't. None of
these are working at the moment.

* Drop max_runs to 1_000

* Fix bad copy/paste

* Add notes

* Switch to the `test_type:property` tag

Before we weren't actually skipping property
tests. We were excluding the `property` tag, but
that tag wasn't actually being set.

* Whoops! Revert that last change

Turns out we _were_ setting the `property` tag.

---------

Co-authored-by: Billy Lanchantin <william.lanchantin@cargosense.com>
  • Loading branch information
maennchen and billylanchantin authored Nov 14, 2024
1 parent 6e40319 commit 7c5a087
Show file tree
Hide file tree
Showing 3 changed files with 556 additions and 52 deletions.
118 changes: 118 additions & 0 deletions test/explorer/data_frame_test.exs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
defmodule Explorer.DataFrameTest do
use ExUnit.Case, async: true
use ExUnitProperties

# Tests for most IO operations are in the data_frame folder
# Tests for summarise, group, ungroup are available in grouped_test.exs
Expand Down Expand Up @@ -4709,4 +4710,121 @@ defmodule Explorer.DataFrameTest do
}
end
end

# These property tests are a work in progress. They currently aim to cover
# creation and serialization (including printing). Serialization in particular
# is causing lots of panics. The plan is to keep the properties that don't
# pass but with `@tag :skip` until we can fix them.
#
# Notes:
#
#   * `max_runs: 1_000` is being used for all properties. This is an
#     essentially arbitrary choice to ensure relatively quick runs. Future
#     devs should feel free to change individual search parameters as needed.
#   * `@describetag timeout: :infinity` is insurance against timeouts from
#     extra long searches. Future devs should feel free to remove this if
#     it's no longer deemed necessary.
#   * For local development, remember to include property tests with the tag
#     `--include property`.
describe "properties" do
  # A single `@tag` placed *before* `describe` would only be picked up by
  # the first test defined inside it; `@describetag` applies the tag to
  # every test in this block, which is what the note above intends.
  @describetag timeout: :infinity

  property "should be able to create a DataFrame from valid rows" do
    check all(
            # TODO: remove `exclude: :decimal` once we fix whatever bug(s)
            # this is finding.
            dtypes <- Explorer.Generator.dtypes(exclude: :decimal),
            rows <- Explorer.Generator.rows(dtypes),
            max_runs: 1_000
          ) do
      assert %DF{} = DF.new(rows, dtypes: dtypes)
    end
  end

  property "should be able to create a DataFrame from valid columns" do
    check all(
            # TODO: remove `exclude: :decimal` once we fix whatever bug(s)
            # this is finding.
            dtypes <- Explorer.Generator.dtypes(exclude: :decimal),
            cols <- Explorer.Generator.columns(dtypes),
            max_runs: 1_000
          ) do
      assert %DF{} = DF.new(cols, dtypes: dtypes)
    end
  end

  @tag :skip
  property "should be able to print any DataFrame" do
    check all(
            dtypes <- Explorer.Generator.dtypes(),
            rows <- Explorer.Generator.rows(dtypes),
            max_runs: 1_000
          ) do
      df = DF.new(rows, dtypes: dtypes)

      # Only non-empty frames exercise the row-rendering path under test.
      if DF.n_rows(df) > 0 do
        DF.print(df)
      end
    end
  end

  @tag :skip
  property "can dump any DataFrame (without duration) to CSV" do
    check all(
            # TODO: remove `:decimal` from the exclusions once we fix
            # whatever bug(s) this is finding. (`:duration` is excluded by
            # design — see the property name.)
            dtypes <- Explorer.Generator.dtypes(exclude: [:decimal, :duration]),
            rows <- Explorer.Generator.rows(dtypes),
            max_runs: 1_000
          ) do
      rows
      |> DF.new(dtypes: dtypes)
      |> DF.dump_csv!()
    end
  end

  @tag :skip
  property "can dump any DataFrame to IPC" do
    check all(
            # TODO: remove `exclude: :decimal` once we fix whatever bug(s)
            # this is finding.
            dtypes <- Explorer.Generator.dtypes(exclude: :decimal),
            rows <- Explorer.Generator.rows(dtypes),
            max_runs: 1_000
          ) do
      rows
      |> DF.new(dtypes: dtypes)
      |> DF.dump_ipc!()
    end
  end

  @tag :skip
  property "can dump any DataFrame to NDJSON" do
    check all(
            # TODO: remove `exclude: :decimal` once we fix whatever bug(s)
            # this is finding.
            dtypes <- Explorer.Generator.dtypes(exclude: :decimal),
            rows <- Explorer.Generator.rows(dtypes),
            max_runs: 1_000
          ) do
      rows
      |> DF.new(dtypes: dtypes)
      |> DF.dump_ndjson!()
    end
  end

  @tag :skip
  property "can dump any DataFrame to PARQUET" do
    check all(
            # TODO: remove `exclude: :decimal` once we fix whatever bug(s)
            # this is finding.
            dtypes <- Explorer.Generator.dtypes(exclude: :decimal),
            rows <- Explorer.Generator.rows(dtypes),
            max_runs: 1_000
          ) do
      rows
      |> DF.new(dtypes: dtypes)
      |> DF.dump_parquet!()
    end
  end
end
end
55 changes: 3 additions & 52 deletions test/explorer/series/inferred_dtype_property_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2,74 +2,25 @@ defmodule Explorer.Series.InferredDtypePropertyTest do
@moduledoc """
Property tests for checking the inferred dtype logic when the dtype isn't
specified in `Explorer.Series.from_list/1`.
## Notes
* A maximum of 3 is used quite a bit. This is intentional. Usually issues stem
from empty lists, not really long lists. By keeping lists small, we can
iterate much quicker through the input space.
"""
use ExUnit.Case, async: true
use ExUnitProperties

import StreamData

alias Explorer.Series

@moduletag timeout: :infinity

property "inferred dtype should always be a sub-dtype" do
  # The scalar leaf dtype is pinned to `{:s, 64}` so that generated value
  # trees only vary in their *container* structure (lists/structs), which is
  # where the inference logic under test lives.
  #
  # NOTE(review): the source span mixed the superseded generator clauses
  # (`dtype_generator/0`, `series_of_dtype_generator/1`) with their
  # replacements, rebinding `dtype` and duplicating the assertion; only the
  # replacement version is kept here.
  check all(
          dtype <- Explorer.Generator.dtype(scalar: constant({:s, 64})),
          list_of_dtype <- Explorer.Generator.column(dtype, as: :list),
          max_run_time: 60_000,
          max_runs: 10_000
        ) do
    assert list_of_dtype |> Series.from_list() |> Series.dtype() |> sub_dtype_of?(dtype)
  end
end

# Generates arbitrary nested dtypes: `{:s, 64}` leaves wrapped in arbitrary
# combinations of `{:list, _}` and `{:struct, _}` containers via
# `StreamData.tree/2`.
#
# Fix: the original bound the `tree/2` result to a local
# (`dtype_generator = ...`) only to return it on the next line; the
# expression is now returned directly.
defp dtype_generator do
  scalar_dtype_generator = constant({:s, 64})

  # We don't need complicated keys: single letter strings should suffice.
  key_generator = string(?a..?z, min_length: 1, max_length: 1)

  tree(scalar_dtype_generator, fn generator ->
    # Building the keyword list from a map ensures unique keys.
    keyword_generator =
      map(nonempty(map_of(key_generator, generator, max_length: 3)), &Enum.to_list/1)

    one_of([
      tuple({constant(:list), generator}),
      tuple({constant(:struct), keyword_generator})
    ])
  end)
end

# Generates an `Explorer.Series` whose values conform to `dtype`.
#
# Idiom: `map/2` is the direct way to transform generated values;
# `bind/2` + `constant/1` (as in the original) is only needed when the
# continuation must *choose a new generator* based on the value.
defp series_of_dtype_generator(dtype) do
  dtype
  |> build_series_value_generator()
  |> list_of(max_length: 3)
  |> map(&Explorer.Series.from_list/1)
end

# Builds a StreamData value generator for a single dtype. Containers recurse
# into their element dtypes; list lengths are capped at 3 to keep the search
# space small (see moduledoc notes).
defp build_series_value_generator({:s, 64}) do
  integer()
end

defp build_series_value_generator({:list, inner_dtype}) do
  inner_dtype
  |> build_series_value_generator()
  |> list_of(max_length: 3)
end

defp build_series_value_generator({:struct, keyword_of_dtypes}) do
  keyword_of_dtypes
  |> Enum.into(%{}, fn {key, inner_dtype} ->
    {key, build_series_value_generator(inner_dtype)}
  end)
  |> fixed_map()
end

# The idea behind a "sub" dtype is that in the dtype tree, you can replace
# any subtree with `:null` and it's still valid. This is to deal with empty
# lists where we can't reasonably infer the dtype of a list with no elements.
Expand Down
Loading

0 comments on commit 7c5a087

Please sign in to comment.