Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: improve schema security #23385

Merged
merged 1 commit into from
Mar 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 63 additions & 1 deletion superset/db_engine_specs/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,7 +372,7 @@ class BaseEngineSpec: # pylint: disable=too-many-public-methods

# Is the DB engine spec able to change the default schema? This requires implementing
# a custom `adjust_database_uri` method.
dynamic_schema = False
supports_dynamic_schema = False

@classmethod
def supports_url(cls, url: URL) -> bool:
Expand Down Expand Up @@ -426,6 +426,68 @@ def supports_backend(cls, backend: str, driver: Optional[str] = None) -> bool:

return driver in cls.drivers

@classmethod
def get_default_schema(cls, database: Database) -> Optional[str]:
"""
Return the default schema in a given database.
"""
with database.get_inspector_with_context() as inspector:
return inspector.default_schema_name

@classmethod
def get_schema_from_engine_params( # pylint: disable=unused-argument
cls,
sqlalchemy_uri: URL,
connect_args: Dict[str, Any],
) -> Optional[str]:
"""
Return the schema configured in a SQLALchemy URI and connection argments, if any.
"""
return None

@classmethod
def get_default_schema_for_query(
cls,
database: Database,
query: Query,
) -> Optional[str]:
"""
Return the default schema for a given query.

This is used to determine the schema of tables that aren't fully qualified, eg:

SELECT * FROM foo;

In the example above, the schema where the `foo` table lives depends on a few
factors:

1. For DB engine specs that allow dynamically changing the schema based on the
query we should use the query schema.
2. For DB engine specs that don't support dynamically changing the schema and
have the schema hardcoded in the SQLAlchemy URI we should use the schema
from the URI.
3. For DB engine specs that don't connect to a specific schema and can't
change it dynamically we need to probe the database for the default schema.

Determining the correct schema is crucial for managing access to data, so please
make sure you understand this logic when working on a new DB engine spec.
"""
# default schema varies on a per-query basis
if cls.supports_dynamic_schema:
return query.schema
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@betodealmeida judging from the description, the dropdown from sqllab has already been assigned as query.schema at this point?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right!


# check if the schema is stored in the SQLAlchemy URI or connection arguments
try:
connect_args = database.get_extra()["engine_params"]["connect_args"]
except KeyError:
connect_args = {}
sqlalchemy_uri = make_url_safe(database.sqlalchemy_uri)
if schema := cls.get_schema_from_engine_params(sqlalchemy_uri, connect_args):
return schema

# return the default schema of the database
return cls.get_default_schema(database)

@classmethod
def get_dbapi_exception_mapping(cls) -> Dict[Type[Exception], Type[Exception]]:
"""
Expand Down
17 changes: 15 additions & 2 deletions superset/db_engine_specs/drill.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ class DrillEngineSpec(BaseEngineSpec):
engine_name = "Apache Drill"
default_driver = "sadrill"

dynamic_schema = True
supports_dynamic_schema = True

_time_grain_expressions = {
None: "{col}",
Expand Down Expand Up @@ -73,10 +73,23 @@ def convert_dttm(
@classmethod
def adjust_database_uri(cls, uri: URL, selected_schema: Optional[str]) -> URL:
if selected_schema:
uri = uri.set(database=parse.quote(selected_schema, safe=""))
uri = uri.set(
database=parse.quote(selected_schema.replace(".", "/"), safe="")
)

return uri

@classmethod
def get_schema_from_engine_params(
cls,
sqlalchemy_uri: URL,
connect_args: Dict[str, Any],
) -> Optional[str]:
"""
Return the configured schema.
"""
return parse.unquote(sqlalchemy_uri.database).replace("/", ".")

@classmethod
def get_url_for_impersonation(
cls, url: URL, impersonate_user: bool, username: Optional[str]
Expand Down
13 changes: 12 additions & 1 deletion superset/db_engine_specs/hive.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ class HiveEngineSpec(PrestoEngineSpec):
allows_alias_to_source_column = True
allows_hidden_orderby_agg = False

dynamic_schema = True
supports_dynamic_schema = True

# When running `SHOW FUNCTIONS`, what is the name of the column with the
# function names?
Expand Down Expand Up @@ -268,6 +268,17 @@ def adjust_database_uri(

return uri

@classmethod
def get_schema_from_engine_params(
cls,
sqlalchemy_uri: URL,
connect_args: Dict[str, Any],
) -> Optional[str]:
"""
Return the configured schema.
"""
return parse.unquote(sqlalchemy_uri.database)

@classmethod
def _extract_error_message(cls, ex: Exception) -> str:
msg = str(ex)
Expand Down
19 changes: 17 additions & 2 deletions superset/db_engine_specs/mysql.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class MySQLEngineSpec(BaseEngineSpec, BasicParametersMixin):
)
encryption_parameters = {"ssl": "1"}

dynamic_schema = True
supports_dynamic_schema = True

column_type_mappings = (
(
Expand Down Expand Up @@ -192,13 +192,28 @@ def convert_dttm(

@classmethod
def adjust_database_uri(
cls, uri: URL, selected_schema: Optional[str] = None
cls,
uri: URL,
selected_schema: Optional[str] = None,
) -> URL:
if selected_schema:
uri = uri.set(database=parse.quote(selected_schema, safe=""))

return uri

@classmethod
def get_schema_from_engine_params(
cls,
sqlalchemy_uri: URL,
connect_args: Dict[str, Any],
) -> Optional[str]:
"""
Return the configured schema.

A MySQL database is a SQLAlchemy schema.
"""
return parse.unquote(sqlalchemy_uri.database)

@classmethod
def get_datatype(cls, type_code: Any) -> Optional[str]:
if not cls.type_code_map:
Expand Down
36 changes: 36 additions & 0 deletions superset/db_engine_specs/postgres.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from flask_babel import gettext as __
from sqlalchemy.dialects.postgresql import DOUBLE_PRECISION, ENUM, JSON
from sqlalchemy.dialects.postgresql.base import PGInspector
from sqlalchemy.engine.url import URL
from sqlalchemy.types import Date, DateTime, String

from superset.db_engine_specs.base import BaseEngineSpec, BasicParametersMixin
Expand Down Expand Up @@ -146,6 +147,41 @@ class PostgresBaseEngineSpec(BaseEngineSpec):
),
}

@classmethod
def get_schema_from_engine_params(
cls,
sqlalchemy_uri: URL,
connect_args: Dict[str, Any],
) -> Optional[str]:
"""
Return the configured schema.

While Postgres doesn't support connecting directly to a given schema, it allows
users to specify a "search path" that is used to resolve non-qualified table
names; this can be specified in the database ``connect_args``.

One important detail is that the search path can be a comma separated list of
schemas. While this is supported by the SQLAlchemy dialect, it shouldn't be used
in Superset because it breaks schema-level permissions, since it's impossible
to determine the schema for a non-qualified table in a query. In cases like
that we raise an exception.
"""
options = re.split(r"-c\s?", connect_args.get("options", ""))
for option in options:
if "=" not in option:
continue
key, value = option.strip().split("=", 1)
if key.strip() == "search_path":
if "," in value:
raise Exception(
"Multiple schemas are configured in the search path, which means "
"Superset is unable to determine the schema of unqualified table "
"names and enforce permissions."
)
return value.strip()

return None

@classmethod
def fetch_data(
cls, cursor: Any, limit: Optional[int] = None
Expand Down
23 changes: 22 additions & 1 deletion superset/db_engine_specs/presto.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ class PrestoBaseEngineSpec(BaseEngineSpec, metaclass=ABCMeta):
A base class that share common functions between Presto and Trino
"""

dynamic_schema = True
supports_dynamic_schema = True

column_type_mappings = (
(
Expand Down Expand Up @@ -315,6 +315,27 @@ def adjust_database_uri(

return uri

@classmethod
def get_schema_from_engine_params(
cls,
sqlalchemy_uri: URL,
connect_args: Dict[str, Any],
) -> Optional[str]:
"""
Return the configured schema.

For Presto the SQLAlchemy URI looks like this:

presto://localhost:8080/hive[/default]

"""
database = sqlalchemy_uri.database.strip("/")

if "/" not in database:
return None

return parse.unquote(database.split("/")[1])

@classmethod
def estimate_statement_cost(cls, statement: str, cursor: Any) -> Dict[str, Any]:
"""
Expand Down
18 changes: 17 additions & 1 deletion superset/db_engine_specs/snowflake.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class SnowflakeEngineSpec(PostgresBaseEngineSpec):
default_driver = "snowflake"
sqlalchemy_uri_placeholder = "snowflake://"

dynamic_schema = True
supports_dynamic_schema = True

_time_grain_expressions = {
None: "{col}",
Expand Down Expand Up @@ -147,6 +147,22 @@ def adjust_database_uri(

return uri

@classmethod
def get_schema_from_engine_params(
cls,
sqlalchemy_uri: URL,
connect_args: Dict[str, Any],
) -> Optional[str]:
"""
Return the configured schema.
"""
database = sqlalchemy_uri.database.strip("/")

if "/" not in database:
return None

return parse.unquote(database.split("/")[1])

@classmethod
def epoch_to_dttm(cls) -> str:
return "DATEADD(S, {col}, '1970-01-01')"
Expand Down
17 changes: 17 additions & 0 deletions superset/models/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@

if TYPE_CHECKING:
from superset.databases.ssh_tunnel.models import SSHTunnel
from superset.models.sql_lab import Query

DB_CONNECTION_MUTATOR = config["DB_CONNECTION_MUTATOR"]

Expand Down Expand Up @@ -483,6 +484,22 @@ def get_raw_connection(
with closing(engine.raw_connection()) as conn:
yield conn

def get_default_schema_for_query(self, query: "Query") -> Optional[str]:
"""
Return the default schema for a given query.

This is used to determine if the user has access to a query that reads from table
names without a specific schema, eg:

SELECT * FROM `foo`

The schema of the `foo` table depends on the DB engine spec. Some DB engine specs
can change the default schema on a per-query basis; in other DB engine specs the
default schema is defined in the SQLAlchemy URI; and in others the default schema
might be determined by the database itself (like `public` for Postgres).
"""
return self.db_engine_spec.get_default_schema_for_query(self, query)

@property
def quote_identifier(self) -> Callable[[str], str]:
"""Add quotes to potential identifiter expressions if needed"""
Expand Down
15 changes: 2 additions & 13 deletions superset/security/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -1787,7 +1787,7 @@ def get_exclude_users_from_lists() -> List[str]:
return []

def raise_for_access(
# pylint: disable=too-many-arguments, too-many-locals, too-many-branches
# pylint: disable=too-many-arguments, too-many-locals
self,
database: Optional["Database"] = None,
datasource: Optional["BaseDatasource"] = None,
Expand Down Expand Up @@ -1823,18 +1823,7 @@ def raise_for_access(
return

if query:
# Some databases can change the default schema in which the query wil run,
# respecting the selection in SQL Lab. If that's the case, the query
# schema becomes the default one.
if database.db_engine_spec.dynamic_schema:
default_schema = query.schema
# For other databases, the selected schema in SQL Lab is used only for
# table discovery and autocomplete. In this case we need to use the
# database default schema for tables that don't have an explicit schema.
else:
with database.get_inspector_with_context() as inspector:
default_schema = inspector.default_schema_name

default_schema = database.get_default_schema_for_query(query)
tables = {
Table(table_.table, table_.schema or default_schema)
for table_ in sql_parse.ParsedQuery(query.sql).tables
Expand Down
16 changes: 16 additions & 0 deletions tests/unit_tests/db_engine_specs/test_drill.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from typing import Optional

import pytest
from sqlalchemy.engine.url import make_url

from tests.unit_tests.db_engine_specs.utils import assert_convert_dttm
from tests.unit_tests.fixtures.common import dttm
Expand Down Expand Up @@ -106,3 +107,18 @@ def test_convert_dttm(
from superset.db_engine_specs.drill import DrillEngineSpec as spec

assert_convert_dttm(spec, target_type, expected_result, dttm)


def test_get_schema_from_engine_params() -> None:
"""
Test ``get_schema_from_engine_params``.
"""
from superset.db_engine_specs.drill import DrillEngineSpec

assert (
DrillEngineSpec.get_schema_from_engine_params(
make_url("drill+sadrill://localhost:8047/dfs/test?use_ssl=False"),
{},
)
== "dfs.test"
)
Loading