Skip to content

Commit

Permalink
feat(mssql): support case-sensitive collations (#9700)
Browse files Browse the repository at this point in the history
Co-authored-by: Phillip Cloud <417981+cpcloud@users.noreply.github.com>
  • Loading branch information
gforsyth and cpcloud authored Jul 29, 2024
1 parent 75b31c2 commit 9382a0e
Show file tree
Hide file tree
Showing 6 changed files with 101 additions and 40 deletions.
42 changes: 21 additions & 21 deletions ci/schema/mssql.sql
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
DROP TABLE IF EXISTS diamonds;
DROP TABLE IF EXISTS ibis_testing.dbo.diamonds;

CREATE TABLE diamonds (
CREATE TABLE ibis_testing.dbo.diamonds (
carat FLOAT,
cut VARCHAR(MAX),
color VARCHAR(MAX),
Expand All @@ -17,13 +17,13 @@ CREATE TABLE diamonds (
-- /data is a volume mount to the ibis testing data
-- used for snappy test data loading
-- DataFrame.to_sql is unusably slow for loading CSVs
BULK INSERT diamonds
BULK INSERT ibis_testing.dbo.diamonds
FROM '/data/diamonds.csv'
WITH (FORMAT = 'CSV', FIELDTERMINATOR = ',', ROWTERMINATOR = '\n', FIRSTROW = 2)

DROP TABLE IF EXISTS astronauts;
DROP TABLE IF EXISTS ibis_testing.dbo.astronauts;

CREATE TABLE astronauts (
CREATE TABLE ibis_testing.dbo.astronauts (
"id" BIGINT,
"number" BIGINT,
"nationwide_number" BIGINT,
Expand All @@ -50,13 +50,13 @@ CREATE TABLE astronauts (
"total_eva_hrs" DOUBLE PRECISION
);

BULK INSERT astronauts
BULK INSERT ibis_testing.dbo.astronauts
FROM '/data/astronauts.csv'
WITH (FORMAT = 'CSV', FIELDTERMINATOR = ',', ROWTERMINATOR = '\n', FIRSTROW = 2)

DROP TABLE IF EXISTS batting;
DROP TABLE IF EXISTS ibis_testing.dbo.batting;

CREATE TABLE batting (
CREATE TABLE ibis_testing.dbo.batting (
"playerID" VARCHAR(MAX),
"yearID" BIGINT,
stint BIGINT,
Expand All @@ -81,13 +81,13 @@ CREATE TABLE batting (
"GIDP" BIGINT
);

BULK INSERT batting
BULK INSERT ibis_testing.dbo.batting
FROM '/data/batting.csv'
WITH (FORMAT = 'CSV', FIELDTERMINATOR = ',', ROWTERMINATOR = '\n', FIRSTROW = 2)

DROP TABLE IF EXISTS awards_players;
DROP TABLE IF EXISTS ibis_testing.dbo.awards_players;

CREATE TABLE awards_players (
CREATE TABLE ibis_testing.dbo.awards_players (
"playerID" VARCHAR(MAX),
"awardID" VARCHAR(MAX),
"yearID" BIGINT,
Expand All @@ -96,13 +96,13 @@ CREATE TABLE awards_players (
notes VARCHAR(MAX)
);

BULK INSERT awards_players
BULK INSERT ibis_testing.dbo.awards_players
FROM '/data/awards_players.csv'
WITH (FORMAT = 'CSV', FIELDTERMINATOR = ',', ROWTERMINATOR = '\n', FIRSTROW = 2)

DROP TABLE IF EXISTS functional_alltypes;
DROP TABLE IF EXISTS ibis_testing.dbo.functional_alltypes;

CREATE TABLE functional_alltypes (
CREATE TABLE ibis_testing.dbo.functional_alltypes (
id INTEGER,
bool_col BIT,
tinyint_col SMALLINT,
Expand All @@ -118,21 +118,21 @@ CREATE TABLE functional_alltypes (
month INTEGER
);

BULK INSERT functional_alltypes
BULK INSERT ibis_testing.dbo.functional_alltypes
FROM '/data/functional_alltypes.csv'
WITH (FORMAT = 'CSV', FIELDTERMINATOR = ',', ROWTERMINATOR = '\n', FIRSTROW = 2)

DROP TABLE IF EXISTS win;
DROP TABLE IF EXISTS ibis_testing.dbo.win;

CREATE TABLE win (g VARCHAR(MAX), x BIGINT NOT NULL, y BIGINT);
INSERT INTO win VALUES
CREATE TABLE ibis_testing.dbo.win (g VARCHAR(MAX), x BIGINT NOT NULL, y BIGINT);
INSERT INTO ibis_testing.dbo.win VALUES
('a', 0, 3),
('a', 1, 2),
('a', 2, 0),
('a', 3, 1),
('a', 4, 1);

DROP TABLE IF EXISTS topk;
DROP TABLE IF EXISTS ibis_testing.dbo.topk;

CREATE TABLE topk (x BIGINT);
INSERT INTO topk VALUES (1), (1), (NULL);
CREATE TABLE ibis_testing.dbo.topk (x BIGINT);
INSERT INTO ibis_testing.dbo.topk VALUES (1), (1), (NULL);
7 changes: 6 additions & 1 deletion compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,17 @@ services:
environment:
MSSQL_SA_PASSWORD: 1bis_Testing!
ACCEPT_EULA: "Y"
# The default collation in MSSQL isS QL_Latin1_General_CP1_CI_AS
# where the CI stands for Case Insensitive.
# We use a case-sensitive collation for testing so that we don't
# break users with case-sensitive collations.
MSSQL_COLLATION: Latin1_General_100_BIN2_UTF8
healthcheck:
interval: 1s
retries: 20
test:
- CMD-SHELL
- $(find / -name sqlcmd -type f -executable) -C -S localhost -U sa -P "$$MSSQL_SA_PASSWORD" -Q "IF DB_ID('ibis_testing') IS NULL BEGIN CREATE DATABASE [ibis_testing] END"
- $(find /opt -name sqlcmd -type f -executable) -C -S localhost -U sa -P "$$MSSQL_SA_PASSWORD" -Q "SELECT 1"
ports:
- 1433:1433
volumes:
Expand Down
59 changes: 46 additions & 13 deletions ibis/backends/mssql/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,25 @@ def datetimeoffset_to_datetime(value):
)


# For testing we use the collation "Latin1_General_100_BIN2_UTF8"
# which is case-sensitive and supports UTF8.
# This allows us to (hopefully) support both case-sensitive and case-insensitive
# collations.
# It DOES mean, though, that we need to be correct in our usage of case when
# referring to system tables and views.
# So, the correct casing for the tables and views we use often (and the
# corresponding columns):
#
#
# Info schema tables:
# - INFORMATION_SCHEMA.COLUMNS
# - INFORMATION_SCHEMA.SCHEMATA
# - INFORMATION_SCHEMA.TABLES
# Temp table location: tempdb.dbo
# Catalogs: sys.databases
# Databases: sys.schemas


class Backend(SQLBackend, CanCreateCatalog, CanCreateDatabase, CanCreateSchema, NoUrl):
name = "mssql"
compiler = MSSQLCompiler()
Expand Down Expand Up @@ -112,11 +131,15 @@ def do_connect(
if user is None and password is None:
kwargs.setdefault("Trusted_Connection", "yes")

if database is not None:
# passing database=None tries to interpolate "None" into the
# connection string and use it as a database
kwargs["database"] = database

self.con = pyodbc.connect(
user=user,
server=f"{host},{port}",
password=password,
database=database,
driver=driver,
**kwargs,
)
Expand Down Expand Up @@ -171,8 +194,8 @@ def get_schema(
)
.from_(
sg.table(
"columns",
db="information_schema",
"COLUMNS",
db="INFORMATION_SCHEMA",
catalog=catalog or self.current_catalog,
)
)
Expand Down Expand Up @@ -212,23 +235,33 @@ def get_schema(
return sch.Schema(mapping)

def _get_schema_using_query(self, query: str) -> sch.Schema:
# Docs describing usage of dm_exec_describe_first_result_set
# https://learn.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-exec-describe-first-result-set-transact-sql?view=sql-server-ver16
tsql = sge.convert(str(query)).sql(self.dialect)
query = f"EXEC sp_describe_first_result_set @tsql = N{tsql}"

# For some reason when using "Latin1_General_100_BIN2_UTF8"
# the stored procedure `sp_describe_first_result_set` starts throwing errors about DLL loading.
# This "dynamic management function" uses the same algorithm and allows
# us to pre-filter the columns we want back.
# The syntax is:
# `sys.dm_exec_describe_first_result_set(@tsql, @params, @include_browse_information)`
query = f"""SELECT name,
is_nullable AS nullable,
system_type_name,
precision,
scale
FROM
sys.dm_exec_describe_first_result_set({tsql}, NULL, 0)"""
with self._safe_raw_sql(query) as cur:
rows = cur.fetchall()

schema = {}
for (
_,
_,
name,
nullable,
_,
system_type_name,
_,
precision,
scale,
*_,
) in sorted(rows, key=itemgetter(1)):
newtyp = self.compiler.type_mapper.from_string(
system_type_name, nullable=nullable
Expand Down Expand Up @@ -463,8 +496,8 @@ def list_tables(
sg.select("table_name")
.from_(
sg.table(
"tables",
db="information_schema",
"TABLES",
db="INFORMATION_SCHEMA",
catalog=catalog if catalog is not None else self.current_catalog,
)
)
Expand All @@ -486,8 +519,8 @@ def list_databases(
) -> list[str]:
query = sg.select(C.schema_name).from_(
sg.table(
"schemata",
db="information_schema",
"SCHEMATA",
db="INFORMATION_SCHEMA",
catalog=catalog or self.current_catalog,
)
)
Expand Down
14 changes: 12 additions & 2 deletions ibis/backends/mssql/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

import os
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any

import pytest

Expand Down Expand Up @@ -35,13 +35,23 @@ class TestConf(ServiceBackendTest):
def test_files(self) -> Iterable[Path]:
return self.data_dir.joinpath("csv").glob("*.csv")

def postload(self, **kw: Any):
self.connection = self.connect(database=IBIS_TEST_MSSQL_DB, **kw)

def _load_data(self, *, database: str = IBIS_TEST_MSSQL_DB, **_):
with self.connection._safe_raw_sql(
"IF DB_ID('ibis_testing') is NULL BEGIN CREATE DATABASE [ibis_testing] END"
):
pass

super()._load_data(database=database, **_)

@staticmethod
def connect(*, tmpdir, worker_id, **kw):
return ibis.mssql.connect(
host=MSSQL_HOST,
user=MSSQL_USER,
password=MSSQL_PASS,
database=IBIS_TEST_MSSQL_DB,
port=MSSQL_PORT,
driver=MSSQL_PYODBC_DRIVER,
autocommit=True,
Expand Down
17 changes: 15 additions & 2 deletions ibis/backends/mssql/tests/test_client.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import pytest
from pytest import param

import ibis
import ibis.expr.datatypes as dt
Expand Down Expand Up @@ -35,11 +36,23 @@
("DATETIME", dt.Timestamp(scale=3)),
# Characters strings
("CHAR", dt.string),
("TEXT", dt.string),
param(
"TEXT",
dt.string,
marks=pytest.mark.notyet(
["mssql"], reason="Not supported by UTF-8 aware collations"
),
),
("VARCHAR", dt.string),
# Unicode character strings
("NCHAR", dt.string),
("NTEXT", dt.string),
param(
"NTEXT",
dt.string,
marks=pytest.mark.notyet(
["mssql"], reason="Not supported by UTF-8 aware collations"
),
),
("NVARCHAR", dt.string),
# Binary strings
("BINARY", dt.binary),
Expand Down
2 changes: 1 addition & 1 deletion ibis/backends/tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,7 +644,7 @@ def test_order_by_nulls(con, op, nulls_first, expected):

@pytest.mark.notimpl(["druid"])
@pytest.mark.never(
["exasol", "mssql", "mysql"],
["exasol", "mysql"],
raises=AssertionError,
reason="someone decided a long time ago that 'A' = 'a' is true in these systems",
)
Expand Down

0 comments on commit 9382a0e

Please sign in to comment.