Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(mssql): support case-sensitive collations #9700

Merged
merged 9 commits into from
Jul 29, 2024
42 changes: 21 additions & 21 deletions ci/schema/mssql.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
DROP TABLE IF EXISTS diamonds;
DROP TABLE IF EXISTS ibis_testing.dbo.diamonds;

CREATE TABLE diamonds (
CREATE TABLE ibis_testing.dbo.diamonds (
carat FLOAT,
cut VARCHAR(MAX),
color VARCHAR(MAX),
Expand All @@ -17,13 +17,13 @@ CREATE TABLE diamonds (
-- /data is a volume mount to the ibis testing data
-- used for snappy test data loading
-- DataFrame.to_sql is unusably slow for loading CSVs
BULK INSERT diamonds
BULK INSERT ibis_testing.dbo.diamonds
FROM '/data/diamonds.csv'
WITH (FORMAT = 'CSV', FIELDTERMINATOR = ',', ROWTERMINATOR = '\n', FIRSTROW = 2)

DROP TABLE IF EXISTS astronauts;
DROP TABLE IF EXISTS ibis_testing.dbo.astronauts;

CREATE TABLE astronauts (
CREATE TABLE ibis_testing.dbo.astronauts (
"id" BIGINT,
"number" BIGINT,
"nationwide_number" BIGINT,
Expand All @@ -50,13 +50,13 @@ CREATE TABLE astronauts (
"total_eva_hrs" DOUBLE PRECISION
);

BULK INSERT astronauts
BULK INSERT ibis_testing.dbo.astronauts
FROM '/data/astronauts.csv'
WITH (FORMAT = 'CSV', FIELDTERMINATOR = ',', ROWTERMINATOR = '\n', FIRSTROW = 2)

DROP TABLE IF EXISTS batting;
DROP TABLE IF EXISTS ibis_testing.dbo.batting;

CREATE TABLE batting (
CREATE TABLE ibis_testing.dbo.batting (
"playerID" VARCHAR(MAX),
"yearID" BIGINT,
stint BIGINT,
Expand All @@ -81,13 +81,13 @@ CREATE TABLE batting (
"GIDP" BIGINT
);

BULK INSERT batting
BULK INSERT ibis_testing.dbo.batting
FROM '/data/batting.csv'
WITH (FORMAT = 'CSV', FIELDTERMINATOR = ',', ROWTERMINATOR = '\n', FIRSTROW = 2)

DROP TABLE IF EXISTS awards_players;
DROP TABLE IF EXISTS ibis_testing.dbo.awards_players;

CREATE TABLE awards_players (
CREATE TABLE ibis_testing.dbo.awards_players (
"playerID" VARCHAR(MAX),
"awardID" VARCHAR(MAX),
"yearID" BIGINT,
Expand All @@ -96,13 +96,13 @@ CREATE TABLE awards_players (
notes VARCHAR(MAX)
);

BULK INSERT awards_players
BULK INSERT ibis_testing.dbo.awards_players
FROM '/data/awards_players.csv'
WITH (FORMAT = 'CSV', FIELDTERMINATOR = ',', ROWTERMINATOR = '\n', FIRSTROW = 2)

DROP TABLE IF EXISTS functional_alltypes;
DROP TABLE IF EXISTS ibis_testing.dbo.functional_alltypes;

CREATE TABLE functional_alltypes (
CREATE TABLE ibis_testing.dbo.functional_alltypes (
id INTEGER,
bool_col BIT,
tinyint_col SMALLINT,
Expand All @@ -118,21 +118,21 @@ CREATE TABLE functional_alltypes (
month INTEGER
);

BULK INSERT functional_alltypes
BULK INSERT ibis_testing.dbo.functional_alltypes
FROM '/data/functional_alltypes.csv'
WITH (FORMAT = 'CSV', FIELDTERMINATOR = ',', ROWTERMINATOR = '\n', FIRSTROW = 2)

DROP TABLE IF EXISTS win;
DROP TABLE IF EXISTS ibis_testing.dbo.win;

CREATE TABLE win (g VARCHAR(MAX), x BIGINT NOT NULL, y BIGINT);
INSERT INTO win VALUES
CREATE TABLE ibis_testing.dbo.win (g VARCHAR(MAX), x BIGINT NOT NULL, y BIGINT);
INSERT INTO ibis_testing.dbo.win VALUES
('a', 0, 3),
('a', 1, 2),
('a', 2, 0),
('a', 3, 1),
('a', 4, 1);

DROP TABLE IF EXISTS topk;
DROP TABLE IF EXISTS ibis_testing.dbo.topk;

CREATE TABLE topk (x BIGINT);
INSERT INTO topk VALUES (1), (1), (NULL);
CREATE TABLE ibis_testing.dbo.topk (x BIGINT);
INSERT INTO ibis_testing.dbo.topk VALUES (1), (1), (NULL);
7 changes: 6 additions & 1 deletion compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,17 @@ services:
environment:
MSSQL_SA_PASSWORD: 1bis_Testing!
ACCEPT_EULA: "Y"
# The default collation in MSSQL isS QL_Latin1_General_CP1_CI_AS
# where the CI stands for Case Insensitive.
# We use a case-sensitive collation for testing so that we don't
# break users with case-sensitive collations.
MSSQL_COLLATION: Latin1_General_100_BIN2_UTF8
healthcheck:
interval: 1s
retries: 20
test:
- CMD-SHELL
- $(find / -name sqlcmd -type f -executable) -C -S localhost -U sa -P "$$MSSQL_SA_PASSWORD" -Q "IF DB_ID('ibis_testing') IS NULL BEGIN CREATE DATABASE [ibis_testing] END"
- $(find /opt -name sqlcmd -type f -executable) -C -S localhost -U sa -P "$$MSSQL_SA_PASSWORD" -Q "SELECT 1"
ports:
- 1433:1433
volumes:
Expand Down
59 changes: 46 additions & 13 deletions ibis/backends/mssql/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,25 @@ def datetimeoffset_to_datetime(value):
)


# For testing we use the collation "Latin1_General_100_BIN2_UTF8"
# which is case-sensitive and supports UTF8.
# This allows us to (hopefully) support both case-sensitive and case-insensitive
# collations.
# It DOES mean, though, that we need to be correct in our usage of case when
# referring to system tables and views.
# So, the correct casing for the tables and views we use often (and the
# corresponding columns):
#
#
# Info schema tables:
# - INFORMATION_SCHEMA.COLUMNS
# - INFORMATION_SCHEMA.SCHEMATA
# - INFORMATION_SCHEMA.TABLES
# Temp table location: tempdb.dbo
# Catalogs: sys.databases
# Databases: sys.schemas


class Backend(SQLBackend, CanCreateCatalog, CanCreateDatabase, CanCreateSchema, NoUrl):
name = "mssql"
compiler = MSSQLCompiler()
Expand Down Expand Up @@ -112,11 +131,15 @@ def do_connect(
if user is None and password is None:
kwargs.setdefault("Trusted_Connection", "yes")

if database is not None:
# passing database=None tries to interpolate "None" into the
# connection string and use it as a database
kwargs["database"] = database

self.con = pyodbc.connect(
user=user,
server=f"{host},{port}",
password=password,
database=database,
driver=driver,
**kwargs,
)
Expand Down Expand Up @@ -171,8 +194,8 @@ def get_schema(
)
.from_(
sg.table(
"columns",
db="information_schema",
"COLUMNS",
db="INFORMATION_SCHEMA",
catalog=catalog or self.current_catalog,
)
)
Expand Down Expand Up @@ -212,23 +235,33 @@ def get_schema(
return sch.Schema(mapping)

def _get_schema_using_query(self, query: str) -> sch.Schema:
# Docs describing usage of dm_exec_describe_first_result_set
# https://learn.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-exec-describe-first-result-set-transact-sql?view=sql-server-ver16
tsql = sge.convert(str(query)).sql(self.dialect)
query = f"EXEC sp_describe_first_result_set @tsql = N{tsql}"

# For some reason when using "Latin1_General_100_BIN2_UTF8"
# the stored procedure `sp_describe_first_result_set` starts throwing errors about DLL loading.
# This "dynamic management function" uses the same algorithm and allows
# us to pre-filter the columns we want back.
# The syntax is:
# `sys.dm_exec_describe_first_result_set(@tsql, @params, @include_browse_information)`
query = f"""SELECT name,
is_nullable AS nullable,
system_type_name,
precision,
scale
FROM
sys.dm_exec_describe_first_result_set({tsql}, NULL, 0)"""
with self._safe_raw_sql(query) as cur:
rows = cur.fetchall()

schema = {}
for (
_,
_,
name,
nullable,
_,
system_type_name,
_,
precision,
scale,
*_,
) in sorted(rows, key=itemgetter(1)):
newtyp = self.compiler.type_mapper.from_string(
system_type_name, nullable=nullable
Expand Down Expand Up @@ -463,8 +496,8 @@ def list_tables(
sg.select("table_name")
.from_(
sg.table(
"tables",
db="information_schema",
"TABLES",
db="INFORMATION_SCHEMA",
catalog=catalog if catalog is not None else self.current_catalog,
)
)
Expand All @@ -486,8 +519,8 @@ def list_databases(
) -> list[str]:
query = sg.select(C.schema_name).from_(
sg.table(
"schemata",
db="information_schema",
"SCHEMATA",
db="INFORMATION_SCHEMA",
catalog=catalog or self.current_catalog,
)
)
Expand Down
14 changes: 12 additions & 2 deletions ibis/backends/mssql/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import os
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any

import pytest

Expand Down Expand Up @@ -35,13 +35,23 @@ class TestConf(ServiceBackendTest):
def test_files(self) -> Iterable[Path]:
return self.data_dir.joinpath("csv").glob("*.csv")

def postload(self, **kw: Any):
self.connection = self.connect(database=IBIS_TEST_MSSQL_DB, **kw)

def _load_data(self, *, database: str = IBIS_TEST_MSSQL_DB, **_):
with self.connection._safe_raw_sql(
"IF DB_ID('ibis_testing') is NULL BEGIN CREATE DATABASE [ibis_testing] END"
):
pass
Comment on lines +41 to +45
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice! I was getting cryptic user errors when I tried this, but I must've made a syntax error somewhere

Copy link
Member

@cpcloud cpcloud Jul 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably was the other thing I had to change (if the issue was connecting), database=None doesn't do what you might expect when connecting.


super()._load_data(database=database, **_)

@staticmethod
def connect(*, tmpdir, worker_id, **kw):
return ibis.mssql.connect(
host=MSSQL_HOST,
user=MSSQL_USER,
password=MSSQL_PASS,
database=IBIS_TEST_MSSQL_DB,
port=MSSQL_PORT,
driver=MSSQL_PYODBC_DRIVER,
autocommit=True,
Expand Down
17 changes: 15 additions & 2 deletions ibis/backends/mssql/tests/test_client.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import pytest
from pytest import param

import ibis
import ibis.expr.datatypes as dt
Expand Down Expand Up @@ -35,11 +36,23 @@
("DATETIME", dt.Timestamp(scale=3)),
# Characters strings
("CHAR", dt.string),
("TEXT", dt.string),
param(
"TEXT",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good to know that the TEXT type doesn't support modern text.

dt.string,
marks=pytest.mark.notyet(
["mssql"], reason="Not supported by UTF-8 aware collations"
),
),
("VARCHAR", dt.string),
# Unicode character strings
("NCHAR", dt.string),
("NTEXT", dt.string),
param(
"NTEXT",
dt.string,
marks=pytest.mark.notyet(
["mssql"], reason="Not supported by UTF-8 aware collations"
),
),
("NVARCHAR", dt.string),
# Binary strings
("BINARY", dt.binary),
Expand Down
2 changes: 1 addition & 1 deletion ibis/backends/tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,7 +644,7 @@ def test_order_by_nulls(con, op, nulls_first, expected):

@pytest.mark.notimpl(["druid"])
@pytest.mark.never(
["exasol", "mssql", "mysql"],
["exasol", "mysql"],
raises=AssertionError,
reason="someone decided a long time ago that 'A' = 'a' is true in these systems",
)
Expand Down