diff --git a/ci/schema/druid.sql b/ci/schema/druid.sql
index 941a92fe133b..3c58ff394e24 100644
--- a/ci/schema/druid.sql
+++ b/ci/schema/druid.sql
@@ -41,7 +41,7 @@ FROM TABLE(
   EXTERN(
     '{"type":"local","files":["/opt/shared/functional_alltypes.parquet"]}',
     '{"type":"parquet"}',
-    '[{"name":"index","type":"long"},{"name":"Unnamed: 0","type":"long"},{"name":"id","type":"long"},{"name":"bool_col","type":"long"},{"name":"tinyint_col","type":"long"},{"name":"smallint_col","type":"long"},{"name":"int_col","type":"long"},{"name":"bigint_col","type":"long"},{"name":"float_col","type":"double"},{"name":"double_col","type":"double"},{"name":"date_string_col","type":"string"},{"name":"string_col","type":"string"},{"name":"timestamp_col","type":"string"},{"name":"year","type":"long"},{"name":"month","type":"long"}]'
+    '[{"name":"id","type":"long"},{"name":"bool_col","type":"long"},{"name":"tinyint_col","type":"long"},{"name":"smallint_col","type":"long"},{"name":"int_col","type":"long"},{"name":"bigint_col","type":"long"},{"name":"float_col","type":"double"},{"name":"double_col","type":"double"},{"name":"date_string_col","type":"string"},{"name":"string_col","type":"string"},{"name":"timestamp_col","type":"string"},{"name":"year","type":"long"},{"name":"month","type":"long"}]'
   )
 )
 PARTITIONED BY ALL TIME;
diff --git a/ci/schema/duckdb.sql b/ci/schema/duckdb.sql
index 5743cf07a19f..780b3b0b077d 100644
--- a/ci/schema/duckdb.sql
+++ b/ci/schema/duckdb.sql
@@ -46,8 +46,6 @@ CREATE OR REPLACE TABLE awards_players (
 );
 
 CREATE OR REPLACE TABLE functional_alltypes (
-    "index" BIGINT,
-    "Unnamed: 0" BIGINT,
     id INTEGER,
     bool_col BOOLEAN,
     tinyint_col SMALLINT,
diff --git a/ci/schema/mssql.sql b/ci/schema/mssql.sql
index 258c831450f0..31c73501a389 100644
--- a/ci/schema/mssql.sql
+++ b/ci/schema/mssql.sql
@@ -70,8 +70,6 @@ WITH (FORMAT = 'CSV', FIELDTERMINATOR = ',', ROWTERMINATOR = '\n', FIRSTROW = 2)
 DROP TABLE IF EXISTS functional_alltypes;
 
 CREATE TABLE functional_alltypes (
-    "index" BIGINT,
-    "Unnamed: 0" BIGINT,
     id INTEGER,
     bool_col BIT,
     tinyint_col SMALLINT,
@@ -91,8 +89,6 @@ BULK INSERT functional_alltypes
 FROM '/data/functional_alltypes.csv'
 WITH (FORMAT = 'CSV', FIELDTERMINATOR = ',', ROWTERMINATOR = '\n', FIRSTROW = 2)
 
-CREATE INDEX "ix_functional_alltypes_index" ON functional_alltypes ("index");
-
 DROP TABLE IF EXISTS win;
 
 CREATE TABLE win (g VARCHAR(MAX), x BIGINT, y BIGINT);
diff --git a/ci/schema/mysql.sql b/ci/schema/mysql.sql
index b5a5828e3685..0fec6beddb5d 100644
--- a/ci/schema/mysql.sql
+++ b/ci/schema/mysql.sql
@@ -54,8 +54,6 @@ CREATE TABLE awards_players (
 DROP TABLE IF EXISTS functional_alltypes;
 
 CREATE TABLE functional_alltypes (
-    `index` BIGINT,
-    `Unnamed: 0` BIGINT,
     id INTEGER,
     bool_col BOOLEAN,
     tinyint_col TINYINT,
@@ -71,8 +69,6 @@ CREATE TABLE functional_alltypes (
     month INTEGER
 ) DEFAULT CHARACTER SET = utf8;
 
-CREATE INDEX `ix_functional_alltypes_index` ON functional_alltypes (`index`);
-
 DROP TABLE IF EXISTS json_t CASCADE;
 
 CREATE TABLE IF NOT EXISTS json_t (js JSON);
diff --git a/ci/schema/postgresql.sql b/ci/schema/postgresql.sql
index 0aab33cb1cf5..9cf1b0c4429e 100644
--- a/ci/schema/postgresql.sql
+++ b/ci/schema/postgresql.sql
@@ -63,8 +63,6 @@ CREATE TABLE awards_players (
 DROP TABLE IF EXISTS functional_alltypes CASCADE;
 
 CREATE TABLE functional_alltypes (
-    "index" BIGINT,
-    "Unnamed: 0" BIGINT,
     id INTEGER,
     bool_col BOOLEAN,
     tinyint_col SMALLINT,
@@ -80,8 +78,6 @@ CREATE TABLE functional_alltypes (
     month INTEGER
 );
 
-CREATE INDEX "ix_functional_alltypes_index" ON functional_alltypes ("index");
-
 DROP TABLE IF EXISTS tzone CASCADE;
 
 CREATE TABLE tzone (
diff --git a/ci/schema/snowflake.sql b/ci/schema/snowflake.sql
index 1d3939ac2ad5..bb11c17b2d24 100644
--- a/ci/schema/snowflake.sql
+++ b/ci/schema/snowflake.sql
@@ -54,8 +54,6 @@ CREATE OR REPLACE TABLE awards_players (
 );
 
 CREATE OR REPLACE TABLE functional_alltypes (
-    "index" BIGINT,
-    "Unnamed: 0" BIGINT,
     "id" INTEGER,
     "bool_col" BOOLEAN,
     "tinyint_col" SMALLINT,
diff --git a/ci/schema/sqlite.sql b/ci/schema/sqlite.sql
index 1335f6b48ef3..fce560641cb3 100644
--- a/ci/schema/sqlite.sql
+++ b/ci/schema/sqlite.sql
@@ -1,8 +1,6 @@
 DROP TABLE IF EXISTS functional_alltypes;
 
 CREATE TABLE functional_alltypes (
-    "index" BIGINT,
-    "Unnamed: 0" BIGINT,
     id BIGINT,
     bool_col BOOLEAN,
     tinyint_col BIGINT,
@@ -19,8 +17,6 @@ CREATE TABLE functional_alltypes (
     CHECK (bool_col IN (0, 1))
 );
 
-CREATE INDEX ix_functional_alltypes_index ON "functional_alltypes" ("index");
-
 DROP TABLE IF EXISTS awards_players;
 
 CREATE TABLE awards_players (
diff --git a/ibis/backends/base/__init__.py b/ibis/backends/base/__init__.py
index 269ae95e107c..94d4b18e288f 100644
--- a/ibis/backends/base/__init__.py
+++ b/ibis/backends/base/__init__.py
@@ -456,6 +456,8 @@ class BaseBackend(abc.ABC, _FileIOHandler):
     table_class: type[ops.DatabaseTable] = ops.DatabaseTable
     name: ClassVar[str]
 
+    supports_temporary_tables = False
+
     def __init__(self, *args, **kwargs):
         self._con_args: tuple[Any] = args
         self._con_kwargs: dict[str, Any] = kwargs
diff --git a/ibis/backends/bigquery/tests/conftest.py b/ibis/backends/bigquery/tests/conftest.py
index 8e93a490ba43..26f76a56764c 100644
--- a/ibis/backends/bigquery/tests/conftest.py
+++ b/ibis/backends/bigquery/tests/conftest.py
@@ -178,34 +178,17 @@ def _load_data(data_dir: Path, script_dir: Path, **_: Any) -> None:
                 )
             )
 
-            futures.append(
-                e.submit(
-                    make_job,
-                    client.load_table_from_file,
-                    io.BytesIO(data_dir.joinpath("struct_table.avro").read_bytes()),
-                    bq.TableReference(testing_dataset, "struct_table"),
-                    job_config=bq.LoadJobConfig(
-                        write_disposition=write_disposition,
-                        source_format=bq.SourceFormat.AVRO,
-                    ),
-                )
-            )
-
             futures.append(
                 e.submit(
                     make_job,
                     client.load_table_from_file,
                     io.BytesIO(
-                        data_dir.joinpath("functional_alltypes.csv").read_bytes()
+                        data_dir.joinpath("avro", "struct_table.avro").read_bytes()
                     ),
-                    functional_alltypes_parted,
+                    bq.TableReference(testing_dataset, "struct_table"),
                     job_config=bq.LoadJobConfig(
-                        schema=ibis_schema_to_bq_schema(
-                            TEST_TABLES["functional_alltypes"]
-                        ),
                         write_disposition=write_disposition,
-                        source_format=bq.SourceFormat.CSV,
-                        skip_leading_rows=1,
+                        source_format=bq.SourceFormat.AVRO,
                     ),
                 )
             )
@@ -264,21 +247,22 @@ def _load_data(data_dir: Path, script_dir: Path, **_: Any) -> None:
                 )
             )
 
-        for table, schema in TEST_TABLES.items():
-            futures.append(
-                e.submit(
-                    make_job,
-                    client.load_table_from_file,
-                    io.BytesIO(data_dir.joinpath(f"{table}.csv").read_bytes()),
-                    bq.TableReference(testing_dataset, table),
-                    job_config=bq.LoadJobConfig(
-                        schema=ibis_schema_to_bq_schema(schema),
-                        write_disposition=bq.WriteDisposition.WRITE_TRUNCATE,
-                        source_format=bq.SourceFormat.CSV,
-                        skip_leading_rows=1,
-                    ),
-                )
+        futures.extend(
+            e.submit(
+                make_job,
+                client.load_table_from_file,
+                io.BytesIO(
+                    data_dir.joinpath("parquet", f"{table}.parquet").read_bytes()
+                ),
+                bq.TableReference(testing_dataset, table),
+                job_config=bq.LoadJobConfig(
+                    schema=ibis_schema_to_bq_schema(schema),
+                    write_disposition=write_disposition,
+                    source_format=bq.SourceFormat.PARQUET,
+                ),
             )
+            for table, schema in TEST_TABLES.items()
+        )
 
     for fut in concurrent.futures.as_completed(futures):
         fut.result()
diff --git a/ibis/backends/clickhouse/__init__.py b/ibis/backends/clickhouse/__init__.py
index 1be8dfa0e4d7..9255f61a0443 100644
--- a/ibis/backends/clickhouse/__init__.py
+++ b/ibis/backends/clickhouse/__init__.py
@@ -74,6 +74,9 @@ def insert(self, obj, **kwargs):
 class Backend(BaseBackend):
     name = 'clickhouse'
 
+    # ClickHouse itself does, but the client driver does not
+    supports_temporary_tables = False
+
     class Options(ibis.config.Config):
         """Clickhouse options.
diff --git a/ibis/backends/clickhouse/tests/conftest.py b/ibis/backends/clickhouse/tests/conftest.py
index 90d355e1a22c..22f8e28fa411 100644
--- a/ibis/backends/clickhouse/tests/conftest.py
+++ b/ibis/backends/clickhouse/tests/conftest.py
@@ -38,15 +38,18 @@ def native_bool(self) -> bool:
 
     @classmethod
     def service_spec(cls, data_dir: Path) -> ServiceSpec:
-        files = [data_dir.joinpath("functional_alltypes.parquet")]
-        files.extend(
-            data_dir.joinpath("parquet", name, f"{name}.parquet")
-            for name in ("diamonds", "batting", "awards_players")
-        )
         return ServiceSpec(
             name=cls.name(),
             data_volume="/var/lib/clickhouse/user_files/ibis",
-            files=files,
+            files=[
+                data_dir.joinpath("parquet", f"{name}.parquet")
+                for name in (
+                    "diamonds",
+                    "batting",
+                    "awards_players",
+                    "functional_alltypes",
+                )
+            ],
         )
 
     @staticmethod
diff --git a/ibis/backends/clickhouse/tests/test_functions.py b/ibis/backends/clickhouse/tests/test_functions.py
index aec517cb898e..a3c7bcfff860 100644
--- a/ibis/backends/clickhouse/tests/test_functions.py
+++ b/ibis/backends/clickhouse/tests/test_functions.py
@@ -42,7 +42,6 @@ def test_cast_string_col(alltypes, translate, to_type, snapshot):
 @pytest.mark.parametrize(
     'column',
     [
-        'index',
         'id',
         'bool_col',
         'tinyint_col',
diff --git a/ibis/backends/conftest.py b/ibis/backends/conftest.py
index 7e308f4db8d9..5ba957d00dee 100644
--- a/ibis/backends/conftest.py
+++ b/ibis/backends/conftest.py
@@ -34,8 +34,6 @@
 TEST_TABLES = {
     "functional_alltypes": ibis.schema(
         {
-            "index": "int64",
-            "Unnamed: 0": "int64",
             "id": "int32",
             "bool_col": "boolean",
             "tinyint_col": "int8",
diff --git a/ibis/backends/dask/tests/conftest.py b/ibis/backends/dask/tests/conftest.py
index 2f1f38f6bd34..670b9893afe1 100644
--- a/ibis/backends/dask/tests/conftest.py
+++ b/ibis/backends/dask/tests/conftest.py
@@ -3,7 +3,6 @@
 from pathlib import Path
 from typing import Any
 
-import numpy as np
 import pandas as pd
 import pandas.testing as tm
 import pytest
@@ -33,34 +32,23 @@ def connect(data_directory: Path):
     return ibis.dask.connect(
         {
             "functional_alltypes": dd.from_pandas(
-                pd.read_csv(
-                    data_directory / "functional_alltypes.csv",
-                    index_col=None,
-                    dtype={
-                        "bool_col": bool,
-                        "string_col": str,
-                        "tinyint_col": np.int8,
-                        "smallint_col": np.int16,
-                        "int_col": np.int32,
-                        "bigint_col": np.int64,
-                        "float_col": np.float32,
-                        "double_col": np.float64,
-                    },
-                    parse_dates=["timestamp_col"],
-                    encoding="utf-8",
+                pd.read_parquet(
+                    data_directory / "parquet" / "functional_alltypes.parquet"
                 ),
                 npartitions=NPARTITIONS,
             ),
             "batting": dd.from_pandas(
-                pd.read_csv(data_directory / "batting.csv"),
+                pd.read_parquet(data_directory / "parquet" / "batting.parquet"),
                 npartitions=NPARTITIONS,
             ),
             "awards_players": dd.from_pandas(
-                pd.read_csv(data_directory / "awards_players.csv"),
+                pd.read_parquet(
+                    data_directory / "parquet" / "awards_players.parquet"
+                ),
                 npartitions=NPARTITIONS,
             ),
             'diamonds': dd.from_pandas(
-                pd.read_csv(str(data_directory / 'diamonds.csv')),
+                pd.read_parquet(data_directory / "parquet" / "diamonds.parquet"),
                 npartitions=NPARTITIONS,
             ),
             'json_t': dd.from_pandas(
diff --git a/ibis/backends/dask/tests/execution/conftest.py b/ibis/backends/dask/tests/execution/conftest.py
index a1d6e1e39506..0d4f860d866f 100644
--- a/ibis/backends/dask/tests/execution/conftest.py
+++ b/ibis/backends/dask/tests/execution/conftest.py
@@ -63,20 +63,13 @@ def df(npartitions):
 
 @pytest.fixture(scope='module')
 def batting_df(data_directory):
-    df = dd.read_csv(
-        data_directory / 'batting.csv',
-        assume_missing=True,
-        dtype={"lgID": "object"},
-    )
+    df = dd.read_parquet(data_directory / 'parquet' / 'batting.parquet')
     return df.sample(frac=0.01).reset_index(drop=True)
 
 
 @pytest.fixture(scope='module')
 def awards_players_df(data_directory):
-    return dd.read_csv(
-        data_directory / 'awards_players.csv',
-        assume_missing=True,
-    )
+    return dd.read_parquet(data_directory / 'parquet' / 'awards_players.parquet')
 
 
 @pytest.fixture(scope='module')
diff --git a/ibis/backends/datafusion/tests/conftest.py b/ibis/backends/datafusion/tests/conftest.py
index c1ef4100ae36..26df4bba11b0 100644
--- a/ibis/backends/datafusion/tests/conftest.py
+++ b/ibis/backends/datafusion/tests/conftest.py
@@ -28,12 +28,10 @@ def connect(data_directory: Path):
         # csv file path
         client = ibis.datafusion.connect({})
         client.register(
-            data_directory / 'functional_alltypes.csv',
+            data_directory / "csv" / 'functional_alltypes.csv',
            table_name='functional_alltypes',
             schema=pa.schema(
                 [
-                    ('index', 'int64'),
-                    ('Unnamed 0', 'int64'),
                     ('id', 'int64'),
                     ('bool_col', 'int8'),
                     ('tinyint_col', 'int8'),
@@ -50,11 +48,16 @@ def connect(data_directory: Path):
                 ]
             ),
         )
-        client.register(data_directory / 'batting.csv', table_name='batting')
         client.register(
-            data_directory / 'awards_players.csv', table_name='awards_players'
+            data_directory / "parquet" / 'batting.parquet', table_name='batting'
+        )
+        client.register(
+            data_directory / "parquet" / 'awards_players.parquet',
+            table_name='awards_players',
+        )
+        client.register(
+            data_directory / "parquet" / 'diamonds.parquet', table_name='diamonds'
         )
-        client.register(data_directory / 'diamonds.csv', table_name='diamonds')
         return client
 
     @property
diff --git a/ibis/backends/druid/tests/conftest.py b/ibis/backends/druid/tests/conftest.py
index a9a57a291821..81b8fc62900d 100644
--- a/ibis/backends/druid/tests/conftest.py
+++ b/ibis/backends/druid/tests/conftest.py
@@ -99,11 +99,10 @@ class TestConf(ServiceBackendTest, RoundHalfToEven):
 
     @classmethod
     def service_spec(cls, data_dir: Path):
-        files = [data_dir.joinpath("functional_alltypes.parquet")]
-        files.extend(
-            data_dir.joinpath("parquet", name, f"{name}.parquet")
-            for name in ("diamonds", "batting", "awards_players")
-        )
+        files = [
+            data_dir.joinpath("parquet", f"{name}.parquet")
+            for name in ("diamonds", "batting", "awards_players", "functional_alltypes")
+        ]
         return ServiceSpec(
             name="druid-coordinator", data_volume="/opt/shared", files=files
         )
diff --git a/ibis/backends/duckdb/tests/conftest.py b/ibis/backends/duckdb/tests/conftest.py
index c5acca0fbe3c..2533d21a44d6 100644
--- a/ibis/backends/duckdb/tests/conftest.py
+++ b/ibis/backends/duckdb/tests/conftest.py
@@ -19,23 +19,22 @@ class TestConf(BackendTest, RoundAwayFromZero):
     def __init__(self, data_directory: Path, **kwargs: Any) -> None:
         self.connection = self.connect(data_directory, **kwargs)
 
-        script_dir = data_directory.parent
-
-        schema = (script_dir / 'schema' / 'duckdb.sql').read_text()
-
         if not SANDBOXED:
             self.connection._load_extensions(
                 ["httpfs", "postgres_scanner", "sqlite_scanner"]
             )
 
+        script_dir = data_directory.parent
+        schema = script_dir.joinpath("schema", "duckdb.sql").read_text()
+
         with self.connection.begin() as con:
-            for stmt in filter(None, map(str.strip, schema.split(';'))):
+            for stmt in filter(None, map(str.strip, schema.split(";"))):
                 con.exec_driver_sql(stmt)
 
             for table in TEST_TABLES:
-                src = data_directory / f'{table}.csv'
+                src = data_directory / "csv" / f"{table}.csv"
                 con.exec_driver_sql(
-                    f"COPY {table} FROM {str(src)!r} (DELIMITER ',', HEADER, SAMPLE_SIZE 1)"
+                    f"COPY {table} FROM {str(src)!r} (DELIMITER ',', HEADER)"
                 )
 
     @staticmethod
diff --git a/ibis/backends/duckdb/tests/test_register.py b/ibis/backends/duckdb/tests/test_register.py
index b1719bb9d752..442af280e248 100644
--- a/ibis/backends/duckdb/tests/test_register.py
+++ b/ibis/backends/duckdb/tests/test_register.py
@@ -17,12 +17,12 @@
 
 
 def test_read_csv(data_directory):
-    t = ibis.read_csv(data_directory / "functional_alltypes.csv")
+    t = ibis.read_csv(data_directory / "csv" / "functional_alltypes.csv")
 
     assert t.count().execute()
 
 
 def test_read_parquet(data_directory):
-    t = ibis.read_parquet(data_directory / "functional_alltypes.parquet")
+    t = ibis.read_parquet(data_directory / "parquet" / "functional_alltypes.parquet")
 
     assert t.count().execute()
 
@@ -30,7 +30,7 @@ def test_read_parquet(data_directory):
     duckdb=["duckdb<0.7.0"], reason="read_json_auto doesn't exist", raises=exc.IbisError
 )
 def test_read_json(data_directory, tmp_path):
-    pqt = ibis.read_parquet(data_directory / "functional_alltypes.parquet")
+    pqt = ibis.read_parquet(data_directory / "parquet" / "functional_alltypes.parquet")
 
     path = tmp_path.joinpath("ft.json")
     path.write_text(pqt.execute().to_json(orient="records", lines=True))
@@ -142,15 +142,22 @@ def test_register_sqlite(con, tmp_path):
     reason="nix on linux cannot download duckdb extensions or data due to sandboxing",
     raises=duckdb.IOException,
 )
-def test_attach_sqlite(data_directory):
+def test_attach_sqlite(data_directory, tmp_path):
+    import sqlite3
+
+    test_db_path = tmp_path / "test.db"
+    with sqlite3.connect(test_db_path) as scon:
+        for line in (
+            Path(data_directory.parent / "schema" / "sqlite.sql").read_text().split(";")
+        ):
+            scon.execute(line)
+
     # Create a new connection here because we already have the `ibis_testing`
     # tables loaded in to the `con` fixture.
     con = ibis.duckdb.connect()
 
-    sqlite_db = data_directory / "ibis_testing.db"
-
-    con.attach_sqlite(sqlite_db)
-    assert set(con.list_tables()) == {
+    con.attach_sqlite(test_db_path)
+    assert set(con.list_tables()) >= {
         "functional_alltypes",
         "awards_players",
         "batting",
@@ -161,8 +168,8 @@ def test_attach_sqlite(data_directory):
     assert len(set(fa.schema().types)) > 1
 
     # overwrite existing sqlite_db and force schema to all strings
-    con.attach_sqlite(sqlite_db, overwrite=True, all_varchar=True)
-    assert set(con.list_tables()) == {
+    con.attach_sqlite(test_db_path, overwrite=True, all_varchar=True)
+    assert set(con.list_tables()) >= {
         "functional_alltypes",
         "awards_players",
         "batting",
diff --git a/ibis/backends/impala/tests/conftest.py b/ibis/backends/impala/tests/conftest.py
index 0f499c980aac..4febfd20e5f8 100644
--- a/ibis/backends/impala/tests/conftest.py
+++ b/ibis/backends/impala/tests/conftest.py
@@ -1,21 +1,21 @@
 from __future__ import annotations
 
 import ast
-import collections
 import concurrent.futures
 import contextlib
 import itertools
+import operator
 import os
 import subprocess
 from pathlib import Path
-from typing import Any, Iterator
+from typing import Any
 
 import pytest
+import toolz
 
 import ibis
 import ibis.expr.types as ir
 from ibis import options, util
-from ibis.backends.base import BaseBackend
 from ibis.backends.conftest import TEST_TABLES
 from ibis.backends.impala.compiler import ImpalaCompiler, ImpalaExprTranslator
 from ibis.backends.tests.base import BackendTest, RoundAwayFromZero, UnorderedComparator
@@ -46,18 +46,7 @@ def _load_data(data_dir: Path, script_dir: Path, **_: Any) -> None:
         fsspec = pytest.importorskip("fsspec")
 
         fs = fsspec.filesystem("file")
-        data_files = {
-            data_file
-            for data_file in fs.find(data_dir)
-            # ignore sqlite databases and markdown files
-            if not data_file.endswith((".db", ".md"))
-            # ignore files in the test data .git directory
-            if (
-                # ignore .git
-                os.path.relpath(data_file, data_dir).split(os.sep, 1)[0]
-                != ".git"
-            )
-        }
+        data_files = fs.find(data_dir / "impala")
 
         # without setting the pool size
         # connections are dropped from the urllib3
@@ -66,6 +55,7 @@ def _load_data(data_dir: Path, script_dir: Path, **_: Any) -> None:
         URLLIB_DEFAULT_POOL_SIZE = 10
 
         env = IbisTestEnv()
+        futures = []
        with contextlib.closing(
             ibis.impala.connect(
                 host=env.impala_host,
@@ -110,14 +100,42 @@ def _load_data(data_dir: Path, script_dir: Path, **_: Any) -> None:
             for future in concurrent.futures.as_completed(tasks):
                 future.result()
 
-            # create the tables and compute stats
-            for future in concurrent.futures.as_completed(
-                executor.submit(table_future.result().compute_stats)
-                for table_future in concurrent.futures.as_completed(
-                    impala_create_tables(con, env, executor=executor)
+            # create tables and compute stats
+            compute_stats = operator.methodcaller("compute_stats")
+            futures.append(
+                executor.submit(
+                    toolz.compose(compute_stats, con.avro_file),
+                    os.path.join(env.test_data_dir, 'impala', 'avro', 'tpch', 'region'),
+                    avro_schema={
+                        "type": "record",
+                        "name": "a",
+                        "fields": [
+                            {"name": "R_REGIONKEY", "type": ["null", "int"]},
+                            {"name": "R_NAME", "type": ["null", "string"]},
+                            {"name": "R_COMMENT", "type": ["null", "string"]},
+                        ],
+                    },
+                    name="tpch_region_avro",
+                    database=env.test_data_db,
+                    persist=True,
                 )
-            ):
-                future.result()
+            )
+
+            futures.extend(
+                executor.submit(
+                    toolz.compose(compute_stats, con.parquet_file),
+                    path,
+                    name=os.path.basename(path),
+                    database=env.test_data_db,
+                    persist=True,
+                    schema=TEST_TABLES.get(os.path.basename(path)),
+                )
+                for path in con.hdfs.ls(
+                    os.path.join(env.test_data_dir, 'impala', 'parquet')
+                )
+            )
+            for fut in concurrent.futures.as_completed(futures):
+                fut.result()
 
     @staticmethod
     def connect(
@@ -427,17 +445,17 @@ def impala_create_test_database(con, env):
     con.create_table(
         'alltypes',
         schema=ibis.schema(
-            [
-                ('a', 'int8'),
-                ('b', 'int16'),
-                ('c', 'int32'),
-                ('d', 'int64'),
-                ('e', 'float'),
-                ('f', 'double'),
-                ('g', 'string'),
-                ('h', 'boolean'),
-                ('i', 'timestamp'),
-            ]
+            dict(
+                a='int8',
+                b='int16',
+                c='int32',
+                d='int64',
+                e='float',
+                f='double',
+                g='string',
+                h='boolean',
+                i='timestamp',
+            )
         ),
         database=env.test_data_db,
     )
@@ -447,67 +465,3 @@ def impala_create_test_database(con, env):
         database=env.test_data_db,
     )
     con.table("win", database=env.test_data_db).insert(win, overwrite=True)
-
-
-PARQUET_SCHEMAS = {
-    "functional_alltypes": ibis.schema(
-        {
-            name: dtype
-            for name, dtype in TEST_TABLES["functional_alltypes"].items()
-            if name not in {"index", "Unnamed: 0"}
-        }
-    ),
-    "tpch_region": ibis.schema(
-        [
-            ("r_regionkey", "int16"),
-            ("r_name", "string"),
-            ("r_comment", "string"),
-        ]
-    ),
-}
-
-PARQUET_SCHEMAS.update(
-    (table, schema)
-    for table, schema in TEST_TABLES.items()
-    if table != "functional_alltypes"
-)
-
-AVRO_SCHEMAS = {
-    "tpch_region_avro": {
-        "type": "record",
-        "name": "a",
-        "fields": [
-            {"name": "R_REGIONKEY", "type": ["null", "int"]},
-            {"name": "R_NAME", "type": ["null", "string"]},
-            {"name": "R_COMMENT", "type": ["null", "string"]},
-        ],
-    }
-}
-
-ALL_SCHEMAS = collections.ChainMap(PARQUET_SCHEMAS, AVRO_SCHEMAS)
-
-
-def impala_create_tables(
-    con: BaseBackend,
-    env: IbisTestEnv,
-    *,
-    executor: concurrent.futures.Executor,
-) -> Iterator[concurrent.futures.Future]:
-    test_data_dir = env.test_data_dir
-    avro_files = [
-        (con.avro_file, os.path.join(test_data_dir, 'avro', path))
-        for path in con.hdfs.ls(os.path.join(test_data_dir, 'avro'))
-    ]
-    parquet_files = [
-        (con.parquet_file, os.path.join(test_data_dir, 'parquet', path))
-        for path in con.hdfs.ls(os.path.join(test_data_dir, 'parquet'))
-    ]
-    for method, path in itertools.chain(parquet_files, avro_files):
-        yield executor.submit(
-            method,
-            path,
-            ALL_SCHEMAS.get(os.path.basename(path)),
-            name=os.path.basename(path),
-            database=env.test_data_db,
-            persist=True,
-        )
diff --git a/ibis/backends/impala/tests/test_client.py b/ibis/backends/impala/tests/test_client.py
index 1603903d2313..04d85843f305 100644
--- a/ibis/backends/impala/tests/test_client.py
+++ b/ibis/backends/impala/tests/test_client.py
@@ -43,9 +43,9 @@ def test_get_table_ref(db):
 
 
 def test_run_sql(con, test_data_db):
-    table = con.sql(f"SELECT li.* FROM {test_data_db}.tpch_lineitem li")
+    table = con.sql(f"SELECT li.* FROM {test_data_db}.lineitem li")
 
-    li = con.table('tpch_lineitem')
+    li = con.table('lineitem')
     assert isinstance(table, ir.Table)
     assert_equal(table.schema(), li.schema())
 
@@ -76,8 +76,8 @@ def test_explain(con):
 
 
 def test_get_schema(con, test_data_db):
-    t = con.table('tpch_lineitem')
-    schema = con.get_schema('tpch_lineitem', database=test_data_db)
+    t = con.table('lineitem')
+    schema = con.get_schema('lineitem', database=test_data_db)
 
     assert_equal(t.schema(), schema)
 
@@ -112,7 +112,7 @@ def test_adapt_scalar_array_results(con, alltypes):
 
 
 def test_interactive_repr_call_failure(con):
-    t = con.table('tpch_lineitem').limit(100000)
+    t = con.table('lineitem').limit(100000)
 
     t = t[t, t.l_receiptdate.cast('timestamp').name('date')]
 
@@ -155,17 +155,17 @@ def test_verbose_log_queries(con, test_data_db):
     with config.option_context('verbose', True):
         with config.option_context('verbose_log', queries.append):
-            con.table('tpch_orders', database=test_data_db)
+            con.table('orders', database=test_data_db)
 
     # we can't make assertions about the length of queries, since the Python GC
     # could've collected a temporary pandas table any time between construction
     # of `queries` and the assertion
-    expected = f'DESCRIBE {test_data_db}.`tpch_orders`'
+    expected = f'DESCRIBE {test_data_db}.`orders`'
     assert expected in queries
 
 
 def test_sql_query_limits(con, test_data_db):
-    table = con.table('tpch_nation', database=test_data_db)
+    table = con.table('nation', database=test_data_db)
     with config.option_context('sql.default_limit', 100000):
         # table has 25 rows
         assert len(table.execute()) == 25
@@ -206,7 +206,7 @@ def test_database_default_current_database(con):
 
 
 def test_close_drops_temp_tables(con, test_data_dir):
-    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')
+    hdfs_path = pjoin(test_data_dir, 'impala/parquet/region')
 
     table = con.parquet_file(hdfs_path)
diff --git a/ibis/backends/impala/tests/test_ddl.py b/ibis/backends/impala/tests/test_ddl.py
index 1127126054c5..67f2cc4f9abd 100644
--- a/ibis/backends/impala/tests/test_ddl.py
+++ b/ibis/backends/impala/tests/test_ddl.py
@@ -208,7 +208,7 @@ def test_rename_table(con, temp_database):
     tmp_db = temp_database
 
     orig_name = 'tmp_rename_test'
-    con.create_table(orig_name, con.table('tpch_region'))
+    con.create_table(orig_name, con.table('region'))
     table = con.table(orig_name)
 
     old_name = table.name
@@ -277,7 +277,7 @@ def test_change_format(con, table):
 
 
 def test_query_avro(con, test_data_dir, tmp_db):
-    hdfs_path = pjoin(test_data_dir, 'avro/tpch_region_avro')
+    hdfs_path = pjoin(test_data_dir, 'impala/avro/tpch/region')
 
     avro_schema = {
         "fields": [
@@ -372,7 +372,7 @@ def limit(con, hdfs_path, offset):
         return t.order_by(t.r_regionkey).limit(1, offset=offset).execute()
 
     nthreads = multiprocessing.cpu_count()
-    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')
+    hdfs_path = pjoin(test_data_dir, 'impala/parquet/region')
 
     num_rows = int(con.parquet_file(hdfs_path).count().execute())
     with concurrent.futures.ThreadPoolExecutor(max_workers=nthreads) as e:
diff --git a/ibis/backends/impala/tests/test_exprs.py b/ibis/backends/impala/tests/test_exprs.py
index 8499c32ddbde..f7841cbfa455 100644
--- a/ibis/backends/impala/tests/test_exprs.py
+++ b/ibis/backends/impala/tests/test_exprs.py
@@ -3,6 +3,7 @@
 import pandas as pd
 import pandas.testing as tm
 import pytest
+from pytest import param
 
 import ibis
 import ibis.expr.types as ir
@@ -19,7 +20,7 @@ def test_embedded_identifier_quoting(alltypes):
 
 
 def test_decimal_metadata(con):
-    table = con.table('tpch_lineitem')
+    table = con.table('lineitem')
 
     expr = table.l_quantity
     assert expr.type().precision == 12
@@ -171,15 +172,20 @@ def test_int_builtins(con, expr, expected):
     assert result == expected, ImpalaCompiler.to_sql(expr)
 
 
-def test_column_types(alltypes):
-    df = alltypes.execute()
-    assert df.tinyint_col.dtype.name == 'int8'
-    assert df.smallint_col.dtype.name == 'int16'
-    assert df.int_col.dtype.name == 'int32'
-    assert df.bigint_col.dtype.name == 'int64'
-    assert df.float_col.dtype.name == 'float32'
-    assert df.double_col.dtype.name == 'float64'
-    assert df.timestamp_col.dtype.name == 'datetime64[ns]'
+@pytest.mark.parametrize(
+    ("col", "expected"),
+    [
+        param("tinyint_col", "int8", id="tinyint"),
+        param("smallint_col", "int16", id="smallint"),
+        param("int_col", "int32", id="int"),
+        param("bigint_col", "int64", id="bigint"),
+        param("float_col", "float32", id="float"),
+        param("double_col", "float64", id="double"),
+        param("timestamp_col", "datetime64[ns]", id="timestamp"),
+    ],
+)
+def test_column_types(alltypes_df, col, expected):
+    assert alltypes_df[col].dtype.name == expected
 
 
 @pytest.mark.parametrize(
@@ -337,7 +343,7 @@ def test_div_floordiv(con, expr, expected):
 
 
 def test_filter_predicates(con):
-    t = con.table('tpch_nation')
+    t = con.table('nation')
 
     predicates = [
         lambda x: x.n_name.lower().like('%ge%'),
@@ -366,7 +372,7 @@ def test_casted_expr_impala_bug(alltypes):
 
 
 def test_decimal_timestamp_builtins(con):
-    table = con.table('tpch_lineitem')
+    table = con.table('lineitem')
 
     dc = table.l_quantity
     ts = table.l_receiptdate.cast('timestamp')
@@ -520,10 +526,10 @@ def test_anti_join_self_reference_works(con, alltypes):
 
 
 def test_tpch_self_join_failure(con):
-    region = con.table('tpch_region')
-    nation = con.table('tpch_nation')
-    customer = con.table('tpch_customer')
-    orders = con.table('tpch_orders')
+    region = con.table('region')
+    nation = con.table('nation')
+    customer = con.table('customer')
+    orders = con.table('orders')
 
     fields_of_interest = [
         region.r_name.name('region'),
@@ -557,10 +563,10 @@ def test_tpch_self_join_failure(con):
 
 def test_tpch_correlated_subquery_failure(con):
     # #183 and other issues
-    region = con.table('tpch_region')
-    nation = con.table('tpch_nation')
-    customer = con.table('tpch_customer')
-    orders = con.table('tpch_orders')
+    region = con.table('region')
+    nation = con.table('nation')
+    customer = con.table('customer')
+    orders = con.table('orders')
 
     fields_of_interest = [
         customer,
diff --git a/ibis/backends/impala/tests/test_parquet_ddl.py b/ibis/backends/impala/tests/test_parquet_ddl.py
index 3e81093a6737..83a7496231b2 100644
--- a/ibis/backends/impala/tests/test_parquet_ddl.py
+++ b/ibis/backends/impala/tests/test_parquet_ddl.py
@@ -12,7 +12,7 @@
 
 
 def test_cleanup_tmp_table_on_gc(con, test_data_dir):
-    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')
+    hdfs_path = pjoin(test_data_dir, 'impala/parquet/region')
     table = con.parquet_file(hdfs_path)
     name = table.op().name
     table = None
@@ -21,7 +21,7 @@ def test_cleanup_tmp_table_on_gc(con, test_data_dir):
 
 
 def test_persist_parquet_file_with_name(con, test_data_dir, temp_table_db):
-    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')
+    hdfs_path = pjoin(test_data_dir, 'impala/parquet/region')
 
     tmp_db, name = temp_table_db
     schema = ibis.schema(
@@ -39,7 +39,7 @@ def test_persist_parquet_file_with_name(con, test_data_dir, temp_table_db):
 
 
 def test_query_parquet_file_with_schema(con, test_data_dir):
-    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')
+    hdfs_path = pjoin(test_data_dir, 'impala/parquet/region')
 
     schema = ibis.schema(
         [
@@ -63,23 +63,23 @@ def test_query_parquet_file_with_schema(con, test_data_dir):
 
 
 def test_query_parquet_file_like_table(con, test_data_dir):
-    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')
+    hdfs_path = pjoin(test_data_dir, 'impala/parquet/region')
 
     ex_schema = ibis.schema(
         [
-            ('r_regionkey', 'int16'),
+            ('r_regionkey', 'int32'),
             ('r_name', 'string'),
             ('r_comment', 'string'),
         ]
     )
 
-    table = con.parquet_file(hdfs_path, like_table='tpch_region')
+    table = con.parquet_file(hdfs_path, like_table='region')
 
     assert_equal(table.schema(), ex_schema)
 
 
 def test_query_parquet_infer_schema(con, test_data_dir):
-    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')
+    hdfs_path = pjoin(test_data_dir, 'impala/parquet/region')
     table = con.parquet_file(hdfs_path)
 
     # NOTE: the actual schema should have an int16, but bc this is being
@@ -99,7 +99,7 @@ def test_query_parquet_infer_schema(con, test_data_dir):
 
 def test_create_table_persist_fails_if_called_twice(con, temp_table_db, test_data_dir):
     tmp_db, tname = temp_table_db
-    hdfs_path = pjoin(test_data_dir, 'parquet/tpch_region')
+    hdfs_path = pjoin(test_data_dir, 'impala/parquet/region')
 
     con.parquet_file(hdfs_path, name=tname, persist=True, database=tmp_db)
 
     with pytest.raises(HS2Error):
diff --git a/ibis/backends/impala/tests/test_udf.py b/ibis/backends/impala/tests/test_udf.py
index 8a394d09e8fa..3cc52ac78e0e 100644
--- a/ibis/backends/impala/tests/test_udf.py
+++ b/ibis/backends/impala/tests/test_udf.py
@@ -70,7 +70,7 @@ def t(table):
 
 @pytest.fixture
 def tpch_customer(con):
-    return con.table("tpch_customer")
+    return con.table("customer")
 
 
 @pytest.fixture
@@ -320,7 +320,7 @@ def test_identity_primitive_types(
     reason='Unknown reason. xfailing to restore the CI for udf tests. #2358'
 )
 def test_decimal_fail(udfcon, test_data_db, udf_ll):
-    col = udfcon.table('tpch_customer').c_acctbal
+    col = udfcon.table('customer').c_acctbal
     literal = ibis.literal(1).cast('decimal(12,2)')
     name = '__tmp_udf_' + util.guid()
diff --git a/ibis/backends/mssql/tests/conftest.py b/ibis/backends/mssql/tests/conftest.py
index 0e5dc197ae9f..3341c753b291 100644
--- a/ibis/backends/mssql/tests/conftest.py
+++ b/ibis/backends/mssql/tests/conftest.py
@@ -35,7 +35,7 @@ def service_spec(cls, data_dir: Path):
             name=cls.name(),
             data_volume="/data",
             files=[
-                data_dir.joinpath(f"{name}.csv")
+                data_dir.joinpath("csv", f"{name}.csv")
                 for name in (
                     "diamonds",
                     "batting",
diff --git a/ibis/backends/mysql/__init__.py b/ibis/backends/mysql/__init__.py
index d90c3efbd9fa..473765970efa 100644
--- a/ibis/backends/mysql/__init__.py
+++ b/ibis/backends/mysql/__init__.py
@@ -71,8 +71,6 @@ def do_connect(
         MySQLTable[table]
           name: functional_alltypes
           schema:
-            index : int64
-            Unnamed: 0 : int64
             id : int32
             bool_col : int8
             tinyint_col : int8
diff --git a/ibis/backends/mysql/tests/conftest.py b/ibis/backends/mysql/tests/conftest.py
index 14e691e48cc1..cfa44625f195 100644
--- a/ibis/backends/mysql/tests/conftest.py
+++ b/ibis/backends/mysql/tests/conftest.py
@@ -84,7 +84,7 @@ def _load_data(
         )
         with engine.begin() as con:
             for table in TEST_TABLES:
-                csv_path = data_dir / f"{table}.csv"
+                csv_path = data_dir / "csv" / f"{table}.csv"
                 lines = [
                     f"LOAD DATA LOCAL INFILE {str(csv_path)!r}",
                     f"INTO TABLE {table}",
diff --git a/ibis/backends/pandas/tests/conftest.py b/ibis/backends/pandas/tests/conftest.py
index 85a1ad061b37..3f615e915d8f 100644
--- a/ibis/backends/pandas/tests/conftest.py
+++ b/ibis/backends/pandas/tests/conftest.py
@@ -2,12 +2,11 @@
 
 from pathlib import Path
 
-import numpy as np
 import pandas as pd
 
 import ibis
 import ibis.expr.operations as ops
-import ibis.expr.types as ir
+from ibis.backends.conftest import TEST_TABLES
 from ibis.backends.tests.base import BackendTest, RoundHalfToEven
 from ibis.backends.tests.data import array_types, json_types, struct_types, win
 
@@ -23,40 +22,15 @@ class TestConf(BackendTest, RoundHalfToEven):
     def connect(data_directory: Path):
         return ibis.pandas.connect(
             dictionary={
-                "functional_alltypes": pd.read_csv(
-                    data_directory / "functional_alltypes.csv",
-                    index_col=None,
-                    dtype={
-                        "bool_col": bool,
-                        "string_col": str,
-                        "tinyint_col": np.int8,
-                        "smallint_col": np.int16,
-                        "int_col": np.int32,
-                        "bigint_col": np.int64,
-                        "float_col": np.float32,
-                        "double_col": np.float64,
-                    },
-                    parse_dates=["timestamp_col"],
-                    encoding="utf-8",
-                ),
-                "batting": pd.read_csv(data_directory / "batting.csv"),
-                "awards_players": pd.read_csv(data_directory / "awards_players.csv"),
-                'diamonds': pd.read_csv(str(data_directory / 'diamonds.csv')),
+                **{
+                    table: pd.read_parquet(
+                        data_directory / "parquet" / f"{table}.parquet"
+                    )
+                    for table in TEST_TABLES.keys()
+                },
                 'struct': struct_types,
                 'json_t': json_types,
                 'array_types': array_types,
                 'win': win,
             }
         )
-
-    @property
-    def functional_alltypes(self) -> ir.Table:
-        return self.connection.table("functional_alltypes")
-
-    @property
-    def batting(self) -> ir.Table:
-        return self.connection.table("batting")
-
-    @property
-    def awards_players(self) -> ir.Table:
-        return self.connection.table("awards_players")
diff --git a/ibis/backends/pandas/tests/execution/conftest.py b/ibis/backends/pandas/tests/execution/conftest.py
index b03bf4f0e343..dd08aa6e7a91 100644
--- a/ibis/backends/pandas/tests/execution/conftest.py
+++ b/ibis/backends/pandas/tests/execution/conftest.py
@@ -76,24 +76,15 @@ def df():
 def batting_df(data_directory):
     num_rows = 1000
     start_index = 30
-    df = pd.read_csv(
-        data_directory / 'batting.csv',
-        index_col=None,
-        sep=',',
-        header=0,
-        skiprows=range(1, start_index + 1),
-        nrows=num_rows,
-    )
+    df = pd.read_parquet(data_directory / 'parquet' / 'batting.parquet').iloc[
+        start_index : start_index + num_rows
+    ]
     return df.reset_index(drop=True)
 
 
 @pytest.fixture(scope='module')
 def awards_players_df(data_directory):
-    return pd.read_csv(
-        data_directory / 'awards_players.csv',
-        index_col=None,
-        sep=',',
-    )
+    return pd.read_parquet(data_directory / 'parquet' / 'awards_players.parquet')
 
 
 @pytest.fixture(scope='module')
diff --git a/ibis/backends/polars/tests/conftest.py b/ibis/backends/polars/tests/conftest.py
index e0fffc5a3eb7..34263fef1659 100644
--- a/ibis/backends/polars/tests/conftest.py
+++ b/ibis/backends/polars/tests/conftest.py
@@ -5,7 +5,6 @@
 import pytest
 
 import ibis
-import ibis.expr.types as ir
 from ibis.backends.tests.base import BackendTest, RoundAwayFromZero
 from ibis.backends.tests.data import array_types, struct_types, win
 
@@ -21,46 +20,25 @@ class TestConf(BackendTest, RoundAwayFromZero):
     def connect(data_directory: Path):
         client = ibis.polars.connect({})
         client.register(
-            data_directory / 'functional_alltypes.csv',
+            data_directory / 'parquet' / 'functional_alltypes.parquet',
             table_name='functional_alltypes',
-            dtypes={
-                'index': pl.Int64,
-                'Unnamed 0': pl.Int64,
-                'id': pl.Int64,
-                'bool_col': pl.Int64,
-                'tinyint_col': pl.Int64,
-                'smallint_col': pl.Int64,
-                'int_col': pl.Int32,
-                'bigint_col': pl.Int64,
-                'float_col': pl.Float32,
-                'double_col': pl.Float64,
-                'date_string_col': pl.Utf8,
-                'string_col': pl.Utf8,
-                'timestamp_col': pl.Datetime,
-                'year': pl.Int64,
-                'month': pl.Int64,
-            },
         )
-        client.register(data_directory / 'batting.csv', table_name='batting')
         client.register(
-            data_directory / 'awards_players.csv', table_name='awards_players'
+            data_directory / "parquet" / 'batting.parquet', table_name='batting'
+        )
+        client.register(
+            data_directory / "parquet" / 'awards_players.parquet',
+            table_name='awards_players',
+        )
+        client.register(
+            data_directory / "parquet" / 'diamonds.parquet', table_name='diamonds'
         )
-        client.register(data_directory / 'diamonds.csv', table_name='diamonds')
         client.register(array_types, table_name='array_types')
         client.register(struct_types, table_name='struct')
         client.register(win, table_name="win")
         return client
 
-    @property
-    def functional_alltypes(self) -> ir.Table:
-        table = self.connection.table('functional_alltypes')
-        return table.mutate(
-            bool_col=table.bool_col.cast('bool'),
-            tinyint_col=table.tinyint_col.cast('int8'),
-            smallint_col=table.smallint_col.cast('int16'),
-        )
-
 
 @pytest.fixture(scope='session')
 def client(data_directory):
diff --git a/ibis/backends/polars/tests/test_udf.py b/ibis/backends/polars/tests/test_udf.py
index 078b15d3988b..f6a26e22ae92 100644
--- a/ibis/backends/polars/tests/test_udf.py
+++ b/ibis/backends/polars/tests/test_udf.py
@@ -43,6 +43,6 @@ def test_multiple_argument_udf(alltypes):
     result = expr.execute()
 
     df = alltypes[['smallint_col', 'int_col']].execute()
-    expected = (df.smallint_col + df.int_col).astype('int64')
+    expected = (df.smallint_col + df.int_col).astype('int32')
 
     tm.assert_series_equal(result, expected.rename('tmp'))
diff --git a/ibis/backends/postgres/__init__.py b/ibis/backends/postgres/__init__.py
index a85f7a5ee489..62ddd6073b1d 100644
--- a/ibis/backends/postgres/__init__.py
+++ b/ibis/backends/postgres/__init__.py
@@ -108,8 +108,6 @@ def do_connect(
         PostgreSQLTable[table]
           name: functional_alltypes
          schema:
-            index : int64
-            Unnamed: 0 : int64
             id : int32
             bool_col : boolean
             tinyint_col : int16
diff --git a/ibis/backends/postgres/tests/conftest.py b/ibis/backends/postgres/tests/conftest.py
index 599740cf89b4..3589a334f545 100644
--- a/ibis/backends/postgres/tests/conftest.py
+++ b/ibis/backends/postgres/tests/conftest.py
@@ -88,7 +88,7 @@ def _load_data(
                 # `data_iter` argument would have to be turned back into a CSV
                 # before being passed to `copy_expert`.
                 sql = f"COPY {table} FROM STDIN WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',')"
-                with data_dir.joinpath(f'{table}.csv').open('r') as file:
+                with data_dir.joinpath("csv", f'{table}.csv').open('r') as file:
                     cur.copy_expert(sql=sql, file=file)
 
     @staticmethod
diff --git a/ibis/backends/postgres/tests/test_functions.py b/ibis/backends/postgres/tests/test_functions.py
index 018aeee16a2a..f0043bf4fc08 100644
--- a/ibis/backends/postgres/tests/test_functions.py
+++ b/ibis/backends/postgres/tests/test_functions.py
@@ -73,8 +73,6 @@ def test_date_cast(alltypes, at, translate):
 @pytest.mark.parametrize(
     'column',
     [
-        'index',
-        'Unnamed: 0',
         'id',
         'bool_col',
         'tinyint_col',
diff --git a/ibis/backends/pyspark/tests/conftest.py b/ibis/backends/pyspark/tests/conftest.py
index ba05fd1abf72..9e19f79980b6 100644
--- a/ibis/backends/pyspark/tests/conftest.py
+++ b/ibis/backends/pyspark/tests/conftest.py
@@ -10,7 +10,6 @@
 import ibis
 from ibis import util
 from ibis.backends.conftest import TEST_TABLES
-from ibis.backends.pyspark.datatypes import spark_dtype
 from ibis.backends.tests.base import BackendTest, RoundAwayFromZero
 from ibis.backends.tests.data import win
 
@@ -44,40 +43,21 @@ def get_common_spark_testing_client(data_directory, connect):
         .getOrCreate()
     )
     _spark_testing_client = connect(spark)
-    s = _spark_testing_client._session
+    s: SparkSession = _spark_testing_client._session
     num_partitions = 4
 
-    s.read.csv(
-        path=str(data_directory / 'functional_alltypes.csv'),
-        schema=spark_dtype(
-            ibis.schema(
-                {
-                    # cast below, Spark can't read 0/1 as bool
-                    name: {"bool_col": "int8"}.get(name, dtype)
-                    for name, dtype in TEST_TABLES["functional_alltypes"].items()
-                }
-            )
-        ),
-        mode='FAILFAST',
-        header=True,
-    ).repartition(num_partitions).sort('index').withColumn(
-        "bool_col", F.column("bool_col").cast("boolean")
-    ).createOrReplaceTempView(
-        'functional_alltypes'
-    )
+    sort_cols = {"functional_alltypes": "id"}
 
-    for name, schema in TEST_TABLES.items():
-        if name != "functional_alltypes":
-            s.read.csv(
-                path=str(data_directory / f'{name}.csv'),
-                schema=spark_dtype(schema),
-                header=True,
-            ).repartition(num_partitions).createOrReplaceTempView(name)
+    for name in TEST_TABLES.keys():
+        path = str(data_directory / "parquet" / f"{name}.parquet")
+        t = s.read.parquet(path).repartition(num_partitions)
+        if (sort_col := sort_cols.get(name)) is not None:
+            t = t.sort(sort_col)
+        t.createOrReplaceTempView(name)
 
-    df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar'])
-    df_simple.createOrReplaceTempView('simple')
+    s.createDataFrame([(1, 'a')], ['foo', 'bar']).createOrReplaceTempView('simple')
 
-    df_struct = s.createDataFrame(
+    s.createDataFrame(
         [
             Row(abc=Row(a=1.0, b='banana', c=2)),
             Row(abc=Row(a=2.0, b='apple', c=3)),
@@ -87,33 +67,17 @@ def get_common_spark_testing_client(data_directory, connect):
             Row(abc=None),
             Row(abc=Row(a=3.0, b='orange', c=None)),
         ],
-        schema=pt.StructType(
-            [
-                pt.StructField(
-                    "abc",
-                    pt.StructType(
-                        [
-                            pt.StructField("a", pt.DoubleType(), True),
-                            pt.StructField("b", pt.StringType(), True),
-                            pt.StructField("c", pt.IntegerType(), True),
-                        ]
-                    ),
-                )
-            ]
-        ),
-    )
-    df_struct.createOrReplaceTempView('struct')
+    ).createOrReplaceTempView('struct')
 
-    df_nested_types = s.createDataFrame(
+    s.createDataFrame(
         [([1, 2], [[3, 4], [5, 6]], {'a': [[2, 4], [3, 5]]})],
         [
             'list_of_ints',
             'list_of_list_of_ints',
             'map_string_list_of_list_of_ints',
         ],
-    )
-    df_nested_types.createOrReplaceTempView('nested_types')
-    df_array_types = s.createDataFrame(
+    ).createOrReplaceTempView('nested_types')
+    s.createDataFrame(
         [
             (
                 [1, 2, 3],
@@ -144,21 +108,18 @@ def get_common_spark_testing_client(data_directory, connect):
             ),
         ],
         ["x", "y", "z", "grouper", "scalar_column", "multi_dim"],
-    )
-    df_array_types.createOrReplaceTempView("array_types")
+    ).createOrReplaceTempView("array_types")
 
-    df_complicated = s.createDataFrame(
+    s.createDataFrame(
         [({(1, 3): [[2, 4], [3, 5]]},)], ['map_tuple_list_of_list_of_ints']
-    )
-    df_complicated.createOrReplaceTempView('complicated')
+    ).createOrReplaceTempView('complicated')
 
-    df_udf = s.createDataFrame(
+    s.createDataFrame(
         [('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b')],
         ['a', 'b', 'c', 'key'],
-    )
-    df_udf.createOrReplaceTempView('udf')
+    ).createOrReplaceTempView('udf')
 
-    df_udf_nan = s.createDataFrame(
+    s.createDataFrame(
         pd.DataFrame(
             {
                 'a': np.arange(10, dtype=float),
@@ -166,27 +127,24 @@ def get_common_spark_testing_client(data_directory, connect):
                 'key': list('ddeefffggh'),
             }
         )
-    )
-    df_udf_nan.createOrReplaceTempView('udf_nan')
+    ).createOrReplaceTempView('udf_nan')
 
-    df_udf_null = s.createDataFrame(
+    s.createDataFrame(
         [(float(i), None if i % 2 else 3.0, 'ddeefffggh'[i]) for i in range(10)],
         ['a', 'b', 'key'],
-    )
-    df_udf_null.createOrReplaceTempView('udf_null')
+    ).createOrReplaceTempView('udf_null')
 
-    df_udf_random = s.createDataFrame(
+    s.createDataFrame(
         pd.DataFrame(
             {
-                'a': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
-                'b': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
+                'a': np.arange(4.0).tolist() + np.random.rand(3).tolist(),
+                'b': np.arange(4.0).tolist() + np.random.rand(3).tolist(),
                 'key': list('ddeefff'),
             }
         )
-    )
-    df_udf_random.createOrReplaceTempView('udf_random')
+    ).createOrReplaceTempView('udf_random')
 
-    df_json_t = s.createDataFrame(
+    s.createDataFrame(
         pd.DataFrame(
             {
                 "js": [
@@ -199,11 +157,9 @@ def get_common_spark_testing_client(data_directory, connect):
                 ]
             }
         )
-    )
-    df_json_t.createOrReplaceTempView("json_t")
+    ).createOrReplaceTempView("json_t")
 
-    win_t = s.createDataFrame(win)
-    win_t.createOrReplaceTempView("win")
+    s.createDataFrame(win).createOrReplaceTempView("win")
 
     return _spark_testing_client
 
@@ -379,12 +335,7 @@ def temp_database(con, test_data_db):
 
 @pytest.fixture(scope='session')
 def alltypes(con):
-    return con.table('functional_alltypes').relabel({'Unnamed: 0': 'Unnamed:0'})
-
-
-@pytest.fixture(scope='session')
-def tmp_dir():
-    return f'/tmp/__ibis_test_{util.guid()}'
+    return con.table('functional_alltypes')
 
 
 @pytest.fixture
diff --git a/ibis/backends/pyspark/tests/test_ddl.py b/ibis/backends/pyspark/tests/test_ddl.py
index 55c1fb8687e2..47e137fa942f 100644
--- a/ibis/backends/pyspark/tests/test_ddl.py
+++ b/ibis/backends/pyspark/tests/test_ddl.py
@@ -40,8 +40,8 @@ def test_drop_non_empty_database(con, alltypes, temp_table_db):
 
 
 @pytest.fixture
-def temp_base(tmp_dir):
-    base = pjoin(tmp_dir, util.gen_name("temp_base"))
+def temp_base():
+    base = pjoin(f"/tmp/{util.gen_name('pyspark_testing')}", util.gen_name("temp_base"))
     yield base
     shutil.rmtree(base, ignore_errors=True)
diff --git a/ibis/backends/snowflake/tests/conftest.py b/ibis/backends/snowflake/tests/conftest.py
index 51db0e5d2e92..47bbd37ca134 100644
--- a/ibis/backends/snowflake/tests/conftest.py
+++ b/ibis/backends/snowflake/tests/conftest.py
@@ -24,7 +24,7 @@ def copy_into(con, data_dir: Path, table: str) -> None:
     stage = "ibis_testing"
     csv = f"{table}.csv"
     con.exec_driver_sql(
-        f"PUT file://{data_dir.joinpath(csv).absolute()} @{stage}/{csv}"
+        f"PUT file://{data_dir.joinpath('csv', csv).absolute()} @{stage}/{csv}"
     )
     con.exec_driver_sql(
         f"COPY INTO {table} FROM @{stage}/{csv} FILE_FORMAT = (FORMAT_NAME = ibis_testing)"
     )
diff --git a/ibis/backends/sqlite/tests/conftest.py b/ibis/backends/sqlite/tests/conftest.py
index 823d042d7ed2..f3d1e84f8993 100644
--- a/ibis/backends/sqlite/tests/conftest.py
+++ b/ibis/backends/sqlite/tests/conftest.py
@@ -36,7 +36,7 @@ def __init__(self, data_directory: Path) -> None:
 
         for table in TEST_TABLES:
             basename = f"{table}.csv"
-            with data_directory.joinpath(basename).open("r") as f:
+            with data_directory.joinpath("csv", basename).open("r") as f:
                 reader = csv.reader(f)
                 header = next(reader)
                 assert header, f"empty header for table: `{table}`"
diff --git a/ibis/backends/tests/snapshots/test_string/test_rlike/duckdb/out.sql b/ibis/backends/tests/snapshots/test_string/test_rlike/duckdb/out.sql
index c2556795fa50..cc2d5a123a56 100644
--- a/ibis/backends/tests/snapshots/test_string/test_rlike/duckdb/out.sql
+++ b/ibis/backends/tests/snapshots/test_string/test_rlike/duckdb/out.sql
@@ -1,6 +1,4 @@
 SELECT
-  t0.index,
-  t0."Unnamed: 0",
   t0.id,
   t0.bool_col,
   t0.tinyint_col,
diff --git a/ibis/backends/tests/snapshots/test_string/test_rlike/mysql/out.sql b/ibis/backends/tests/snapshots/test_string/test_rlike/mysql/out.sql
index 168a132ce471..4fcba540524e 100644
--- a/ibis/backends/tests/snapshots/test_string/test_rlike/mysql/out.sql
+++ b/ibis/backends/tests/snapshots/test_string/test_rlike/mysql/out.sql
@@ -1,6 +1,4 @@
 SELECT
-  t0.`index`,
-  t0.`Unnamed: 0`,
   t0.id,
   t0.bool_col = 1 AS bool_col,
   t0.tinyint_col,
diff --git a/ibis/backends/tests/snapshots/test_string/test_rlike/postgres/out.sql b/ibis/backends/tests/snapshots/test_string/test_rlike/postgres/out.sql
index 0de6a36f93fb..783fd1ccf507 100644
--- a/ibis/backends/tests/snapshots/test_string/test_rlike/postgres/out.sql
+++ b/ibis/backends/tests/snapshots/test_string/test_rlike/postgres/out.sql
@@ -1,6 +1,4 @@
 SELECT
-  t0.index,
-  t0."Unnamed: 0",
   t0.id,
   t0.bool_col,
   t0.tinyint_col,
diff --git a/ibis/backends/tests/snapshots/test_string/test_rlike/sqlite/out.sql b/ibis/backends/tests/snapshots/test_string/test_rlike/sqlite/out.sql
index c3811fb27285..ce7cfc3c0494 100644
--- a/ibis/backends/tests/snapshots/test_string/test_rlike/sqlite/out.sql
+++ b/ibis/backends/tests/snapshots/test_string/test_rlike/sqlite/out.sql
@@ -1,6 +1,4 @@
 SELECT
-  t0."index",
-  t0."Unnamed: 0",
   t0.id,
   t0.bool_col,
   t0.tinyint_col,
diff --git a/ibis/backends/tests/snapshots/test_string/test_rlike/trino/out.sql b/ibis/backends/tests/snapshots/test_string/test_rlike/trino/out.sql
index e543c3cc1498..fe88c55e121b 100644
--- a/ibis/backends/tests/snapshots/test_string/test_rlike/trino/out.sql
+++ b/ibis/backends/tests/snapshots/test_string/test_rlike/trino/out.sql
@@ -1,6 +1,4 @@
 SELECT
-  t0.index,
-  t0."unnamed: 0",
   t0.id,
   t0.bool_col,
   t0.tinyint_col,
diff --git a/ibis/backends/tests/test_client.py b/ibis/backends/tests/test_client.py
index 4d303af38a29..f18d109b80db 100644
--- a/ibis/backends/tests/test_client.py
+++ b/ibis/backends/tests/test_client.py
@@ -104,10 +104,7 @@ def test_query_schema(ddl_backend, expr_fn, expected):
 
 @pytest.mark.notimpl(["datafusion", "snowflake", "polars", "mssql"])
 @pytest.mark.notyet(["sqlite"])
-@pytest.mark.never(
-    ["dask", "pandas"],
-    reason="dask and pandas do not support SQL",
-)
+@pytest.mark.never(["dask", "pandas"], reason="dask and pandas do not support SQL")
 def test_sql(backend, con):
     # execute the expression using SQL query
     table = backend.format_table("functional_alltypes")
@@ -127,13 +124,17 @@ def test_sql(backend, con):
 @mark.notimpl(["datafusion", "polars", "druid"])
 def test_create_table_from_schema(con, new_schema, temp_table):
     new_table = con.create_table(temp_table, schema=new_schema)
-    backend_mapping = backend_type_mapping.get(con.name, dict())
+    backend_mapping = backend_type_mapping.get(con.name, {})
 
-    for column_name, column_type in new_table.schema().items():
-        assert (
-            backend_mapping.get(new_schema[column_name], new_schema[column_name])
-            == column_type
-        )
+    result = ibis.schema(
+        {
+            column_name: backend_mapping.get(
+                new_schema[column_name], new_schema[column_name]
+            )
+            for column_name in new_table.schema().keys()
+        }
+    )
+    assert result == new_table.schema()
 
 
 @pytest.fixture(scope="session")
diff --git a/ibis/backends/tests/test_register.py b/ibis/backends/tests/test_register.py
index d849cd32e6e2..95cfc48de913 100644
--- a/ibis/backends/tests/test_register.py
+++ b/ibis/backends/tests/test_register.py
@@ -32,7 +32,7 @@ def pushd(new_dir):
 def gzip_csv(data_directory, tmp_path):
     basename = "diamonds.csv"
     f = tmp_path.joinpath(f"{basename}.gz")
-    data = data_directory.joinpath(basename).read_bytes()
+    data = data_directory.joinpath("csv", basename).read_bytes()
     f.write_bytes(gzip.compress(data))
     return str(f.absolute())
 
@@ -93,7 +93,7 @@ def gzip_csv(data_directory, tmp_path):
     ]
 )
 def test_register_csv(con, data_directory, fname, in_table_name, out_table_name):
-    with pushd(data_directory):
+    with pushd(data_directory / "csv"):
         table = con.register(fname, table_name=in_table_name)
 
     assert any(out_table_name in t for t in con.list_tables())
@@ -143,7 +143,7 @@ def test_register_with_dotted_name(con, data_directory, tmp_path):
     basename = "foo.bar.baz/diamonds.csv"
     f = tmp_path.joinpath(basename)
     f.parent.mkdir()
-    data = data_directory.joinpath("diamonds.csv").read_bytes()
+    data = data_directory.joinpath("csv", "diamonds.csv").read_bytes()
     f.write_bytes(data)
 
     table = con.register(str(f.absolute()))
@@ -200,7 +200,7 @@ def test_register_parquet(
     pq = pytest.importorskip("pyarrow.parquet")
 
     fname = Path(fname)
-    table = read_table(data_directory / fname.name)
+    table = read_table(data_directory / "csv" / fname.name)
 
     pq.write_table(table, tmp_path / fname.name)
 
@@ -238,7 +238,7 @@ def test_register_iterator_parquet(
 ):
     pq = pytest.importorskip("pyarrow.parquet")
 
-    table = read_table(data_directory / "functional_alltypes.csv")
+    table = read_table(data_directory / "csv" / "functional_alltypes.csv")
 
     pq.write_table(table, tmp_path / "functional_alltypes.parquet")
 
@@ -424,7 +424,8 @@ def test_read_parquet(
     pq = pytest.importorskip("pyarrow.parquet")
 
     fname = Path(fname)
-    table = read_table(data_directory / fname.name)
+    fname = Path(data_directory) / "parquet" / fname.name
+    table = pq.read_table(fname)
 
     pq.write_table(table, tmp_path / fname.name)
 
@@ -468,7 +469,7 @@ def test_read_parquet(
     ]
 )
 def test_read_csv(con, data_directory, fname, in_table_name, out_table_name):
-    with pushd(data_directory):
+    with pushd(data_directory / "csv"):
         if con.name == "pyspark":
             # pyspark doesn't respect CWD
             fname = str(Path(fname).absolute())
diff --git a/ibis/backends/tests/test_vectorized_udf.py b/ibis/backends/tests/test_vectorized_udf.py
index 03a3d0f803f0..22bc79b1d442 100644
--- a/ibis/backends/tests/test_vectorized_udf.py
+++ b/ibis/backends/tests/test_vectorized_udf.py
@@ -505,7 +505,7 @@ def add_one_struct_exact_once(v):
         path.touch()
         return v + 1, v + 2
 
-    struct = add_one_struct_exact_once(udf_alltypes['index'])
+    struct = add_one_struct_exact_once(udf_alltypes['id'])
 
     if method == "destructure":
         expr = udf_alltypes.mutate(struct.destructure())
diff --git a/ibis/backends/tests/test_window.py b/ibis/backends/tests/test_window.py
index cffd43111b1c..15da4e89d044 100644
--- a/ibis/backends/tests/test_window.py
+++ b/ibis/backends/tests/test_window.py
@@ -910,7 +910,7 @@ def agg(df):
         return df
 
     expected = (
-        df.groupby("month")
+        df.groupby("month", group_keys=False)
         .apply(agg)
         .sort_values(["id"])
         .reset_index(drop=True)
diff --git a/justfile b/justfile
index b476515fe978..00f27ebc45c3 100644
--- a/justfile
+++ b/justfile
@@ -64,7 +64,7 @@ doctest *args:
     pytest --doctest-modules {{ args }} "${doctest_modules[@]}"
 
 # download testing data
-download-data owner="ibis-project" repo="testing-data" rev="master":
+download-data owner="cpcloud" repo="testing-data" rev="cleanup":
     #!/usr/bin/env bash
    outdir="{{ justfile_directory() }}/ci/ibis-testing-data"
    rm -rf "$outdir"
@@ -74,9 +74,14 @@ download-data owner="ibis-project" repo="testing-data" rev="master":
    if [ "{{ rev }}" = "master" ]; then
        args+=("--depth" "1")
    fi
+    args+=("$outdir")
 
    git clone "${args[@]}"
 
+    if [ "{{ rev }}" != "master" ]; then
+        git -C "${outdir}" checkout "{{ rev }}"
+    fi
+
 # start backends using docker compose; no arguments starts all backends
 up *backends:
     docker compose up --wait {{ backends }}
diff --git a/nix/overlay.nix b/nix/overlay.nix
index fb9a044d92f3..baeeec7083e2 100644
--- a/nix/overlay.nix
+++ b/nix/overlay.nix
@@ -18,14 +18,13 @@ let
 in
 {
   ibisTestingData = pkgs.fetchFromGitHub {
-    owner = "ibis-project";
+    name = "ibis-testing-data";
+    owner = "cpcloud";
     repo = "testing-data";
-    rev = "master";
-    sha256 = "sha256-NbgEe0w/qf9hCr9rRfIpyaH9pv25I8x0ykY7EJxDOuk=";
+    rev = "cleanup";
+    sha256 = "sha256-q1b5IcOl5oIFXP7/P5RufncjHEVrWp4NjoU2uo/BE9U=";
   };
 
-  rustNightly = pkgs.rust-bin.selectLatestNightlyWith (toolchain: toolchain.minimal);
-
   ibis38 = pkgs.callPackage ./ibis.nix { python3 = pkgs.python38; };
   ibis39 = pkgs.callPackage ./ibis.nix { python3 = pkgs.python39; };
   ibis310 = pkgs.callPackage ./ibis.nix { python3 = pkgs.python310; };