Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(mssql): support case-sensitive collations #9700

Merged
merged 9 commits into from
Jul 29, 2024
7 changes: 6 additions & 1 deletion compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,17 @@ services:
environment:
MSSQL_SA_PASSWORD: 1bis_Testing!
ACCEPT_EULA: "Y"
# The default collation in MSSQL isS QL_Latin1_General_CP1_CI_AS
# where the CI stands for Case Insensitive.
# We use a case-sensitive collation for testing so that we don't
# break users with case-sensitive collations.
MSSQL_COLLATION: Latin1_General_100_BIN2_UTF8
healthcheck:
interval: 1s
retries: 20
test:
- CMD-SHELL
- $(find / -name sqlcmd -type f -executable) -C -S localhost -U sa -P "$$MSSQL_SA_PASSWORD" -Q "IF DB_ID('ibis_testing') IS NULL BEGIN CREATE DATABASE [ibis_testing] END"
- sleep 10 && $(find / -name sqlcmd -type f -executable) -C -S localhost -U sa -P "$$MSSQL_SA_PASSWORD" -Q "IF DB_ID('ibis_testing') is NULL BEGIN CREATE DATABASE [ibis_testing] END"
cpcloud marked this conversation as resolved.
Show resolved Hide resolved
ports:
- 1433:1433
volumes:
Expand Down
53 changes: 41 additions & 12 deletions ibis/backends/mssql/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,25 @@ def datetimeoffset_to_datetime(value):
)


# For testing we use the collation "Latin1_General_100_BIN2_UTF8"
# which is case-sensitive and supports UTF8.
# This allows us to (hopefully) support both case-sensitive and case-insensitive
# collations.
# It DOES mean, though, that we need to be correct in our usage of case when
# referring to system tables and views.
# So, the correct casing for the tables and views we use often (and the
# corresponding columns):
#
#
# Info schema tables:
# - INFORMATION_SCHEMA.COLUMNS
# - INFORMATION_SCHEMA.SCHEMATA
# - INFORMATION_SCHEMA.TABLES
# Temp table location: tempdb.dbo
# Catalogs: sys.databases
# Databases: sys.schemas


class Backend(SQLBackend, CanCreateCatalog, CanCreateDatabase, CanCreateSchema, NoUrl):
name = "mssql"
compiler = MSSQLCompiler()
Expand Down Expand Up @@ -171,8 +190,8 @@ def get_schema(
)
.from_(
sg.table(
"columns",
db="information_schema",
"COLUMNS",
db="INFORMATION_SCHEMA",
catalog=catalog or self.current_catalog,
)
)
Expand Down Expand Up @@ -212,23 +231,33 @@ def get_schema(
return sch.Schema(mapping)

def _get_schema_using_query(self, query: str) -> sch.Schema:
# Docs describing usage of dm_exec_describe_first_result_set
# https://learn.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-exec-describe-first-result-set-transact-sql?view=sql-server-ver16
tsql = sge.convert(str(query)).sql(self.dialect)
query = f"EXEC sp_describe_first_result_set @tsql = N{tsql}"

# For some reason when using "Latin1_General_100_BIN2_UTF8"
# the stored procedure `sp_describe_first_result_set` starts throwing errors about DLL loading.
# This "dynamic management function" uses the same algorithm and allows
# us to pre-filter the columns we want back.
# The syntax is:
# `sys.dm_exec_describe_first_result_set(@tsql, @params, @include_browse_information)`
query = f"""SELECT name,
is_nullable AS nullable,
system_type_name,
precision,
scale
FROM
sys.dm_exec_describe_first_result_set({tsql}, NULL, 0)"""
with self._safe_raw_sql(query) as cur:
rows = cur.fetchall()

schema = {}
for (
_,
_,
name,
nullable,
_,
system_type_name,
_,
precision,
scale,
*_,
) in sorted(rows, key=itemgetter(1)):
newtyp = self.compiler.type_mapper.from_string(
system_type_name, nullable=nullable
Expand Down Expand Up @@ -463,8 +492,8 @@ def list_tables(
sg.select("table_name")
.from_(
sg.table(
"tables",
db="information_schema",
"TABLES",
db="INFORMATION_SCHEMA",
catalog=catalog if catalog is not None else self.current_catalog,
)
)
Expand All @@ -486,8 +515,8 @@ def list_databases(
) -> list[str]:
query = sg.select(C.schema_name).from_(
sg.table(
"schemata",
db="information_schema",
"SCHEMATA",
db="INFORMATION_SCHEMA",
catalog=catalog or self.current_catalog,
)
)
Expand Down
17 changes: 15 additions & 2 deletions ibis/backends/mssql/tests/test_client.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import pytest
from pytest import param

import ibis
import ibis.expr.datatypes as dt
Expand Down Expand Up @@ -35,11 +36,23 @@
("DATETIME", dt.Timestamp(scale=3)),
# Characters strings
("CHAR", dt.string),
("TEXT", dt.string),
param(
"TEXT",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good to know that the TEXT type doesn't support modern text.

dt.string,
marks=pytest.mark.notyet(
["mssql"], reason="Not supported by UTF-8 aware collations"
),
),
("VARCHAR", dt.string),
# Unicode character strings
("NCHAR", dt.string),
("NTEXT", dt.string),
param(
"NTEXT",
dt.string,
marks=pytest.mark.notyet(
["mssql"], reason="Not supported by UTF-8 aware collations"
),
),
("NVARCHAR", dt.string),
# Binary strings
("BINARY", dt.binary),
Expand Down
2 changes: 1 addition & 1 deletion ibis/backends/tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,7 +644,7 @@ def test_order_by_nulls(con, op, nulls_first, expected):

@pytest.mark.notimpl(["druid"])
@pytest.mark.never(
["exasol", "mssql", "mysql"],
["exasol", "mysql"],
raises=AssertionError,
reason="someone decided a long time ago that 'A' = 'a' is true in these systems",
)
Expand Down