From f622c56f10d28410b0fd323417ee2800b37cb6e2 Mon Sep 17 00:00:00 2001 From: Primoz Godec Date: Thu, 4 Nov 2021 08:47:34 +0100 Subject: [PATCH] pandas_compat: do not parse column of numbers (object dtype) to datetime --- Orange/data/pandas_compat.py | 10 ++++++++++ Orange/data/tests/test_pandas.py | 19 +++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/Orange/data/pandas_compat.py b/Orange/data/pandas_compat.py index 95e9e92cc7b..2d6aac4b545 100644 --- a/Orange/data/pandas_compat.py +++ b/Orange/data/pandas_compat.py @@ -157,6 +157,16 @@ def _is_datetime(s): return True try: if is_object_dtype(s): + # pd.to_datetime would successfully parse a column of numbers to datetime + # but for a column of object dtype with numbers we want it to be either + # discrete or string - the following code tries to parse the column to numeric; + # if conversion to numeric is successful, return False + try: + pd.to_numeric(s) + return False + except (ValueError, TypeError): + pass + # utc=True - to allow different timezones in a series object pd.to_datetime(s, infer_datetime_format=True, utc=True) return True diff --git a/Orange/data/tests/test_pandas.py b/Orange/data/tests/test_pandas.py index f8649f11310..2d30ed3639a 100644 --- a/Orange/data/tests/test_pandas.py +++ b/Orange/data/tests/test_pandas.py @@ -383,6 +383,25 @@ def test_table_from_frame_timezones(self): ], ) + def test_table_from_frame_no_datetim(self): + """ + In case when dtype of column is object and column contains numbers only, + column could be recognized as a TimeVariable since pd.to_datetime can parse + numbers as datetime. That column must result in either a StringVariable + or a DiscreteVariable since its dtype is object.
+ """ + from Orange.data.pandas_compat import table_from_frame + + df = pd.DataFrame([[1], [2], [3]], dtype="object") + table = table_from_frame(df) + # check that the column becomes a StringVariable (in metas), not a TimeVariable + self.assertIsInstance(table.domain.metas[0], StringVariable) + + df = pd.DataFrame([[1], [2], [2]], dtype="object") + table = table_from_frame(df) + # check that the column becomes a DiscreteVariable (in attributes), not a TimeVariable + self.assertIsInstance(table.domain.attributes[0], DiscreteVariable) + def test_time_variable_compatible(self): from Orange.data.pandas_compat import table_from_frame