Skip to content

Commit

Permalink
Implement Property Tests for DataFrame.new (#1012)
Browse files Browse the repository at this point in the history
* Implement Property Tests for DataFrame.new

Co-authored-by: Billy Lanchantin <william.lanchantin@cargosense.com>

* First pass at changes

* Replace old generators

The existing property test used a special case
of the new generator logic. This replaces that
special case with the new generator.

* Add (skipped) serialization property tests

We should be able to serialize any DataFrame
or document the cases where we can't. None of
these are working at the moment.

* Drop max_runs to 1_000

* Fix bad copy/paste

* Add notes

* Switch to the `test_type:property` tag

Before we weren't actually skipping property
tests. We were excluding the `property` tag, but
that tag wasn't actually being set.

* Whoops! Revert that last change

Turns out we _were_ setting the `property` tag.

---------

Co-authored-by: Billy Lanchantin <william.lanchantin@cargosense.com>
  • Loading branch information
maennchen and billylanchantin authored Nov 14, 2024
1 parent 6e40319 commit 7c5a087
Show file tree
Hide file tree
Showing 3 changed files with 556 additions and 52 deletions.
118 changes: 118 additions & 0 deletions test/explorer/data_frame_test.exs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
defmodule Explorer.DataFrameTest do
use ExUnit.Case, async: true
use ExUnitProperties

# Tests for most IO operations are in the data_frame folder
# Tests for summarise, group, ungroup are available in grouped_test.exs
Expand Down Expand Up @@ -4709,4 +4710,121 @@ defmodule Explorer.DataFrameTest do
}
end
end

# These property tests are a work in progress. They currently aim to cover
# creation and serialization (including printing). Serialization in particular
# is causing lots of panics. The plan is to keep the properties that don't
# pass but with `@tag :skip` until we can fix them.
#
# Notes:
#
#   * `max_runs: 1_000` is being used for all properties. This is an
#     essentially arbitrary choice to ensure relatively quick runs. Future
#     devs should feel free to change individual search parameters as needed.
#   * `@describetag timeout: :infinity` is insurance against timeouts from
#     extra long searches. Future devs should feel free to remove this if
#     it's no longer deemed necessary.
#   * For local development, remember to include property tests with the tag
#     `--include property`.
describe "properties" do
  # A single `@tag` placed *before* `describe` would only be picked up by
  # the first test defined inside it; `@describetag` applies the tag to
  # every test in this block, which is what the note above intends.
  @describetag timeout: :infinity

  property "should be able to create a DataFrame from valid rows" do
    check all(
            # TODO: remove `exclude: :decimal` once we fix whatever bug(s)
            # this is finding.
            dtypes <- Explorer.Generator.dtypes(exclude: :decimal),
            rows <- Explorer.Generator.rows(dtypes),
            max_runs: 1_000
          ) do
      assert %DF{} = DF.new(rows, dtypes: dtypes)
    end
  end

  property "should be able to create a DataFrame from valid columns" do
    check all(
            # TODO: remove `exclude: :decimal` once we fix whatever bug(s)
            # this is finding.
            dtypes <- Explorer.Generator.dtypes(exclude: :decimal),
            cols <- Explorer.Generator.columns(dtypes),
            max_runs: 1_000
          ) do
      assert %DF{} = DF.new(cols, dtypes: dtypes)
    end
  end

  @tag :skip
  property "should be able to print any DataFrame" do
    check all(
            dtypes <- Explorer.Generator.dtypes(),
            rows <- Explorer.Generator.rows(dtypes),
            max_runs: 1_000
          ) do
      df = DF.new(rows, dtypes: dtypes)

      # Only non-empty frames exercise the row-rendering path under test.
      if DF.n_rows(df) > 0 do
        DF.print(df)
      end
    end
  end

  @tag :skip
  property "can dump any DataFrame (without duration) to CSV" do
    check all(
            # TODO: remove `:decimal` from the exclusions once we fix
            # whatever bug(s) this is finding. (`:duration` is excluded by
            # design — see the property name.)
            dtypes <- Explorer.Generator.dtypes(exclude: [:decimal, :duration]),
            rows <- Explorer.Generator.rows(dtypes),
            max_runs: 1_000
          ) do
      rows
      |> DF.new(dtypes: dtypes)
      |> DF.dump_csv!()
    end
  end

  @tag :skip
  property "can dump any DataFrame to IPC" do
    check all(
            # TODO: remove `exclude: :decimal` once we fix whatever bug(s)
            # this is finding.
            dtypes <- Explorer.Generator.dtypes(exclude: :decimal),
            rows <- Explorer.Generator.rows(dtypes),
            max_runs: 1_000
          ) do
      rows
      |> DF.new(dtypes: dtypes)
      |> DF.dump_ipc!()
    end
  end

  @tag :skip
  property "can dump any DataFrame to NDJSON" do
    check all(
            # TODO: remove `exclude: :decimal` once we fix whatever bug(s)
            # this is finding.
            dtypes <- Explorer.Generator.dtypes(exclude: :decimal),
            rows <- Explorer.Generator.rows(dtypes),
            max_runs: 1_000
          ) do
      rows
      |> DF.new(dtypes: dtypes)
      |> DF.dump_ndjson!()
    end
  end

  @tag :skip
  property "can dump any DataFrame to PARQUET" do
    check all(
            # TODO: remove `exclude: :decimal` once we fix whatever bug(s)
            # this is finding.
            dtypes <- Explorer.Generator.dtypes(exclude: :decimal),
            rows <- Explorer.Generator.rows(dtypes),
            max_runs: 1_000
          ) do
      rows
      |> DF.new(dtypes: dtypes)
      |> DF.dump_parquet!()
    end
  end
end
end
55 changes: 3 additions & 52 deletions test/explorer/series/inferred_dtype_property_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2,74 +2,25 @@ defmodule Explorer.Series.InferredDtypePropertyTest do
@moduledoc """
Property tests for checking the inferred dtype logic when the dtype isn't
specified in `Explorer.Series.from_list/1`.
## Notes
* A maximum of 3 is used quite a bit. This is intentional. Usually issues stem
from empty lists, not really long lists. By keeping lists small, we can
iterate much quicker through the input space.
"""
use ExUnit.Case, async: true
use ExUnitProperties

import StreamData

alias Explorer.Series

@moduletag timeout: :infinity

property "inferred dtype should always be a sub-dtype" do
  # The scalar leaf dtype is pinned to `{:s, 64}` so that generated value
  # trees only vary in their *container* structure (lists/structs), which is
  # where the inference logic under test lives.
  #
  # NOTE(review): the source span mixed the superseded generator clauses
  # (`dtype_generator/0`, `series_of_dtype_generator/1`) with their
  # replacements, rebinding `dtype` and duplicating the assertion; only the
  # replacement version is kept here.
  check all(
          dtype <- Explorer.Generator.dtype(scalar: constant({:s, 64})),
          list_of_dtype <- Explorer.Generator.column(dtype, as: :list),
          max_run_time: 60_000,
          max_runs: 10_000
        ) do
    assert list_of_dtype |> Series.from_list() |> Series.dtype() |> sub_dtype_of?(dtype)
  end
end

# Generates arbitrary nested dtypes: `{:s, 64}` leaves wrapped in arbitrary
# combinations of `{:list, _}` and `{:struct, _}` containers via
# `StreamData.tree/2`.
#
# Fix: the original bound the `tree/2` result to a local
# (`dtype_generator = ...`) only to return it on the next line; the
# expression is now returned directly.
defp dtype_generator do
  scalar_dtype_generator = constant({:s, 64})

  # We don't need complicated keys: single letter strings should suffice.
  key_generator = string(?a..?z, min_length: 1, max_length: 1)

  tree(scalar_dtype_generator, fn generator ->
    # Building the keyword list from a map ensures unique keys.
    keyword_generator =
      map(nonempty(map_of(key_generator, generator, max_length: 3)), &Enum.to_list/1)

    one_of([
      tuple({constant(:list), generator}),
      tuple({constant(:struct), keyword_generator})
    ])
  end)
end

# Generates an `Explorer.Series` whose values conform to `dtype`.
#
# Idiom: `map/2` is the direct way to transform generated values;
# `bind/2` + `constant/1` (as in the original) is only needed when the
# continuation must *choose a new generator* based on the value.
defp series_of_dtype_generator(dtype) do
  dtype
  |> build_series_value_generator()
  |> list_of(max_length: 3)
  |> map(&Explorer.Series.from_list/1)
end

# Builds a StreamData value generator for a single dtype. Containers recurse
# into their element dtypes; list lengths are capped at 3 to keep the search
# space small (see moduledoc notes).
defp build_series_value_generator({:s, 64}) do
  integer()
end

defp build_series_value_generator({:list, inner_dtype}) do
  inner_dtype
  |> build_series_value_generator()
  |> list_of(max_length: 3)
end

defp build_series_value_generator({:struct, keyword_of_dtypes}) do
  keyword_of_dtypes
  |> Enum.into(%{}, fn {key, inner_dtype} ->
    {key, build_series_value_generator(inner_dtype)}
  end)
  |> fixed_map()
end

# The idea behind a "sub" dtype is that in the dtype tree, you can replace
# any subtree with `:null` and it's still valid. This is to deal with empty
# lists where we can't reasonably infer the dtype of a list with no elements.
Expand Down
Loading

0 comments on commit 7c5a087

Please sign in to comment.