From b2c4cbe245969ab5e7c47d93e44fa364cf488408 Mon Sep 17 00:00:00 2001 From: Jesse Yang Date: Wed, 19 Jun 2019 23:22:16 -0700 Subject: [PATCH] Use system.jdbc for presto schema and add table filter In older versions of Presto, the old information_schema query sometimes throws an "outputFormat should not be accessed from a null StorageFormat" error (see prestodb/presto/issues/6972). A catalog can also contain a very large number of tables, so it would be nice to filter for only certain sources. We might also add access control based on the table filter. --- redash/query_runner/presto.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/redash/query_runner/presto.py b/redash/query_runner/presto.py index 2966d1ccf9..f55202d037 100644 --- a/redash/query_runner/presto.py +++ b/redash/query_runner/presto.py @@ -47,9 +47,13 @@ def configuration_schema(cls): 'port': { 'type': 'number' }, - 'schema': { + 'default_schema': { 'type': 'string' }, + 'table_filter': { + 'type': 'string', + 'default': 'RegExp to filter schema.tables' + }, 'catalog': { 'type': 'string' }, @@ -60,7 +64,8 @@ def configuration_schema(cls): 'type': 'string' }, }, - 'order': ['host', 'protocol', 'port', 'username', 'password', 'schema', 'catalog'], + 'order': ['host', 'protocol', 'port', 'username', 'password', + 'default_schema', 'table_filter', 'catalog'], 'required': ['host'] } @@ -75,10 +80,15 @@ def type(cls): def get_schema(self, get_stats=False): schema = {} query = """ - SELECT table_schema, table_name, column_name - FROM information_schema.columns - WHERE table_schema NOT IN ('pg_catalog', 'information_schema') - """ + SELECT + table_schem, table_name, column_name + FROM system.jdbc.columns + WHERE table_cat = '{catalog}' + AND regexp_like(concat(table_schem, '.', table_name), '{table_filter}') + """.format( + catalog=self.configuration.get('catalog', 'hive'), + table_filter=self.configuration.get('table_filter', ''), + ) results, error = self.run_query(query, None) @@ -88,7 +98,7 @@ def 
get_schema(self, get_stats=False): results = json_loads(results) for row in results['rows']: - table_name = '{}.{}'.format(row['table_schema'], row['table_name']) + table_name = '{}.{}'.format(row['table_schem'], row['table_name']) if table_name not in schema: schema[table_name] = {'name': table_name, 'columns': []}