fix(pandas): don't silently ignore result column name mismatches

ibis-project · Sep 6, 2024 · 48be246 · 48be246
1 parent 85e1dcc
commit 48be246
Show file tree

Hide file tree

Showing 4 changed files with 23 additions and 11 deletions.
diff --git a/ibis/backends/impala/__init__.py b/ibis/backends/impala/__init__.py
@@ -266,7 +266,7 @@ def raw_sql(self, query: str):
     def _fetch_from_cursor(self, cursor, schema):
         from ibis.formats.pandas import PandasData
 
-        results = fetchall(cursor)
+        results = fetchall(cursor, schema.names)
         return PandasData.convert_table(results, schema)
 
     @contextlib.contextmanager
@@ -1260,9 +1260,10 @@ def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
                     cur.execute(insert_stmt, row)
 
 
-def fetchall(cur):
+def fetchall(cur, names=None):
     batches = cur.fetchcolumnar()
-    names = list(map(operator.itemgetter(0), cur.description))
+    if names is None:
+        names = list(map(operator.itemgetter(0), cur.description))
     df = _column_batches_to_dataframe(names, batches)
     return df
 

diff --git a/ibis/backends/polars/__init__.py b/ibis/backends/polars/__init__.py
@@ -466,12 +466,19 @@ def _to_dataframe(
         streaming: bool = False,
         **kwargs: Any,
     ) -> pl.DataFrame:
-        lf = self.compile(expr, params=params, **kwargs)
+        table_expr = expr.as_table()
+        lf = self.compile(table_expr, params=params, **kwargs)
         if limit == "default":
             limit = ibis.options.sql.default_limit
         if limit is not None:
             lf = lf.limit(limit)
-        return lf.collect(streaming=streaming)
+        df = lf.collect(streaming=streaming)
+        # XXX: Polars sometimes returns data with the incorrect column names.
+        # For now we catch this case and rename them here if needed.
+        expected_cols = tuple(table_expr.columns)
+        if tuple(df.columns) != expected_cols:
+            df = df.rename(dict(zip(df.columns, expected_cols)))
+        return df
 
     def execute(
         self,

diff --git a/ibis/formats/pandas.py b/ibis/formats/pandas.py
@@ -109,14 +109,11 @@ def infer_table(cls, df):
 
     @classmethod
     def convert_table(cls, df, schema):
-        if len(schema) != len(df.columns):
-            raise ValueError(
-                "schema column count does not match input data column count"
-            )
+        if schema.names != tuple(df.columns):
+            raise ValueError("schema names don't match input data columns")
 
         columns = {
-            name: cls.convert_column(series, dtype)
-            for (name, dtype), (_, series) in zip(schema.items(), df.items())
+            name: cls.convert_column(df[name], dtype) for name, dtype in schema.items()
         }
         df = pd.DataFrame(columns)
 

diff --git a/ibis/formats/tests/test_pandas.py b/ibis/formats/tests/test_pandas.py
@@ -433,3 +433,10 @@ def test_convert_dataframe_with_timezone():
     desired_schema = ibis.schema(dict(time='timestamp("EST")'))
     result = PandasData.convert_table(df.copy(), desired_schema)
     tm.assert_frame_equal(expected, result)
+
+
+def test_schema_doesnt_match_input_columns():
+    df = pd.DataFrame({"x": [1], "y": [2]})
+    schema = sch.Schema({"a": "int64", "b": "int64"})
+    with pytest.raises(ValueError, match="schema names don't match"):
+        PandasData.convert_table(df, schema)