From f622c56f10d28410b0fd323417ee2800b37cb6e2 Mon Sep 17 00:00:00 2001
From: Primoz Godec
Date: Thu, 4 Nov 2021 08:47:34 +0100
Subject: [PATCH] pandas_compat: do not parse column of numbers (object dtype)
to datetime
---
Orange/data/pandas_compat.py | 10 ++++++++++
Orange/data/tests/test_pandas.py | 19 +++++++++++++++++++
2 files changed, 29 insertions(+)
diff --git a/Orange/data/pandas_compat.py b/Orange/data/pandas_compat.py
index 95e9e92cc7b..2d6aac4b545 100644
--- a/Orange/data/pandas_compat.py
+++ b/Orange/data/pandas_compat.py
@@ -157,6 +157,16 @@ def _is_datetime(s):
return True
try:
if is_object_dtype(s):
+ # pd.to_datetime would successfully parse a column of numbers to datetime,
+ # but a column of object dtype holding numbers should become either
+ # discrete or string - the following code tries to parse the column as
+ # numeric and returns False if the conversion to numeric succeeds
+ try:
+ pd.to_numeric(s)
+ return False
+ except (ValueError, TypeError):
+ pass
+
# utc=True - to allow different timezones in a series object
pd.to_datetime(s, infer_datetime_format=True, utc=True)
return True
diff --git a/Orange/data/tests/test_pandas.py b/Orange/data/tests/test_pandas.py
index f8649f11310..2d30ed3639a 100644
--- a/Orange/data/tests/test_pandas.py
+++ b/Orange/data/tests/test_pandas.py
@@ -383,6 +383,25 @@ def test_table_from_frame_timezones(self):
],
)
+ def test_table_from_frame_no_datetim(self):
+ """
+ When the dtype of a column is object and the column contains only numbers,
+ the column could be recognized as a TimeVariable since pd.to_datetime can
+ parse numbers as datetime. Such a column must result in either a
+ StringVariable or a DiscreteVariable since its dtype is object.
+ """
+ from Orange.data.pandas_compat import table_from_frame
+
+ df = pd.DataFrame([[1], [2], [3]], dtype="object")
+ table = table_from_frame(df)
+ # column of distinct numbers must become a StringVariable (meta), not a TimeVariable
+ self.assertIsInstance(table.domain.metas[0], StringVariable)
+
+ df = pd.DataFrame([[1], [2], [2]], dtype="object")
+ table = table_from_frame(df)
+ # column with repeated numbers must become a DiscreteVariable, not a TimeVariable
+ self.assertIsInstance(table.domain.attributes[0], DiscreteVariable)
+
def test_time_variable_compatible(self):
from Orange.data.pandas_compat import table_from_frame