Skip to content

Commit

Permalink
feat: Add RedashDashboardExtractor for extracting dashboards from red…
Browse files Browse the repository at this point in the history
…ash.io (#300)

* Add RedashDashboardExtractor for extracting dashboards from redash.io

* Fixed broken tests in python2

* Added supported Redash version and required Redash API endpoints to README
  • Loading branch information
jonhehir authored Jul 7, 2020
1 parent ad5765a commit f1b0dfa
Show file tree
Hide file tree
Showing 10 changed files with 774 additions and 1 deletion.
41 changes: 41 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,47 @@ Note that this provides accumulated view count which does [not effectively show

If you are fine with `accumulated usage`, you could use TemplateVariableSubstitutionTransformer to transform Dict payload from [ModeDashboardUsageExtractor](./databuilder/extractor/dashboard/mode_analytics/mode_dashboard_usage_extractor.py) to fit [DashboardUsage](./docs/models.md#dashboardusage) and transform Dict to [DashboardUsage](./docs/models.md#dashboardusage) by [TemplateVariableSubstitutionTransformer](./databuilder/transformer/template_variable_substitution_transformer.py), and [DictToModel](./databuilder/transformer/dict_to_model.py) transformers. ([Example](./databuilder/extractor/dashboard/mode_analytics/mode_dashboard_queries_extractor.py#L36) on how to combine these two transformers)

### [RedashDashboardExtractor](./databuilder/extractor/dashboard/redash/redash_dashboard_extractor.py)

The included `RedashDashboardExtractor` provides support for extracting basic metadata for Redash dashboards (dashboard name, owner, URL, created/updated timestamps, and a generated description) and their associated queries (query name, URL, and raw query). It can be extended with a configurable table parser function to also support extraction of `DashboardTable` metadata. (See below for example usage.)

Note: `DashboardUsage` and `DashboardExecution` metadata are not supported in this extractor, as these concepts are not supported by the Redash API.

The `RedashDashboardExtractor` depends on the following Redash API endpoints: `GET /api/dashboards`, `GET /api/dashboards/<dashboard-slug>`. It has been tested against Redash 8 and is also expected to work with Redash 9.

```python
extractor = RedashDashboardExtractor()
task = DefaultTask(extractor=extractor, loader=FsNeo4jCSVLoader())

job_config = ConfigFactory.from_dict({
'extractor.redash_dashboard.redash_base_url': redash_base_url, # ex: https://redash.example.org
'extractor.redash_dashboard.api_base_url': api_base_url, # ex: https://redash.example.org/api
'extractor.redash_dashboard.api_key': api_key, # ex: abc1234
'extractor.redash_dashboard.table_parser': table_parser # ex: my_library.module.parse_tables
})

job = DefaultJob(conf=job_config,
task=task,
publisher=Neo4jCsvPublisher())
job.launch()
```

#### RedashDashboardExtractor: table_parser

The `RedashDashboardExtractor` extracts raw queries from each dashboard. You may optionally use these queries to parse out relations to tables in Amundsen. A table parser can be provided in the configuration for the `RedashDashboardExtractor`, as seen above. This function should have type signature `(RedashVisualizationWidget) -> Iterator[TableRelationData]`. For example:

```python
def parse_tables(viz_widget):
# type: (RedashVisualizationWidget) -> Iterator[TableRelationData]
# Each viz_widget corresponds to one query.
# viz_widget.data_source_id is the ID of the target DB in Redash.
# viz_widget.raw_query is the raw query (e.g., SQL).
if viz_widget.data_source_id == 123:
table_names = some_sql_parser(viz_widget.raw_query)
return [TableRelationData('some_db', 'prod', 'some_schema', tbl) for tbl in table_names]
return []
```


## List of transformers
#### [ChainedTransformer](https://github.com/lyft/amundsendatabuilder/blob/master/databuilder/transformer/base_transformer.py#L41 "ChainedTransformer")
Expand Down
Empty file.
243 changes: 243 additions & 0 deletions databuilder/extractor/dashboard/redash/redash_dashboard_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
import importlib
from pyhocon import ConfigFactory
from databuilder.models.dashboard.dashboard_metadata import DashboardMetadata
from databuilder.models.dashboard.dashboard_last_modified import DashboardLastModifiedTimestamp
from databuilder.models.dashboard.dashboard_owner import DashboardOwner
from databuilder.models.dashboard.dashboard_query import DashboardQuery
from databuilder.models.dashboard.dashboard_table import DashboardTable
from databuilder.models.table_metadata import TableMetadata
from databuilder.extractor.base_extractor import Extractor
from databuilder.rest_api.rest_api_query import RestApiQuery
from databuilder.rest_api.base_rest_api_query import EmptyRestApiQuerySeed
from databuilder.extractor.restapi.rest_api_extractor import RestAPIExtractor, REST_API_QUERY
from databuilder.extractor.dashboard.redash.redash_dashboard_utils import \
get_auth_headers, get_text_widgets, get_visualization_widgets, sort_widgets, \
generate_dashboard_description, RedashPaginatedRestApiQuery
from databuilder.transformer.base_transformer import ChainedTransformer
from databuilder.transformer.timestamp_string_to_epoch import TimestampStringToEpoch, FIELD_NAME as TS_FIELD_NAME


class TableRelationData(object):
    """
    This is sort of like a stripped down version of `TableMetadata`.
    It is used as the type returned by the (optional) table parser.

    Only the four identifying components of a table (database, cluster,
    schema, and table name) are stored; they are combined into a table key
    via `TableMetadata.TABLE_KEY_FORMAT`.
    """

    def __init__(self, database, cluster, schema, name):
        # type: (str, str, str, str) -> None
        """
        :param database: database name (stored under the `db` format field)
        :param cluster: cluster name
        :param schema: schema name
        :param name: table name (stored under the `tbl` format field)
        """

        # Keys match the placeholders passed to TableMetadata.TABLE_KEY_FORMAT in `key`.
        self._data = {'db': database, 'cluster': cluster, 'schema': schema, 'tbl': name}

    @property
    def key(self):
        # type: () -> str
        """
        The table key, built by formatting `TableMetadata.TABLE_KEY_FORMAT`
        with the stored database, cluster, schema, and table name.
        """

        return TableMetadata.TABLE_KEY_FORMAT.format(**self._data)

    def __repr__(self):
        # type: () -> str

        return (
            'TableRelationData(database={db!r}, cluster={cluster!r}, '
            'schema={schema!r}, name={tbl!r})'
        ).format(**self._data)


class RedashDashboardExtractor(Extractor):
    """
    An extractor for retrieving dashboards and associated queries
    (and possibly tables) from Redash.

    There are five configuration values:

    - `redash_base_url`: (e.g., `https://redash.example.com`) Base URL for the user-facing
    Redash application
    - `api_base_url`: (e.g., `https://redash.example.com/api`) Base URL for the API
    - `api_key`: Redash API key
    - (optional) `cluster`: A cluster name for this Redash instance (defaults to `prod`)
    - (optional) `table_parser`: Dotted import path (e.g., `my_library.module.parse_tables`)
    of a function `(RedashVisualizationWidget) -> Iterable[TableRelationData]`.
    Given a `RedashVisualizationWidget`, this should return an iterable of potentially related
    tables in Amundsen. Any table returned that exists in Amundsen will be linked to the
    dashboard. Any table that does not exist will be ignored.
    """

    REDASH_BASE_URL_KEY = 'redash_base_url'
    API_BASE_URL_KEY = 'api_base_url'
    API_KEY_KEY = 'api_key'
    CLUSTER_KEY = 'cluster'  # optional config
    TABLE_PARSER_KEY = 'table_parser'  # optional config

    DEFAULT_CLUSTER = 'prod'

    PRODUCT = 'redash'
    DASHBOARD_GROUP_ID = 'redash'
    DASHBOARD_GROUP_NAME = 'Redash'

    def init(self, conf):
        # type: (ConfigTree) -> None
        """
        Read the configuration, resolve the optional table parser, and build
        the underlying REST API extractor and timestamp transformer.

        :param conf: configuration scoped to this extractor (see `get_scope`)
        """

        # required configuration
        self._redash_base_url = conf.get_string(RedashDashboardExtractor.REDASH_BASE_URL_KEY)
        self._api_base_url = conf.get_string(RedashDashboardExtractor.API_BASE_URL_KEY)
        self._api_key = conf.get_string(RedashDashboardExtractor.API_KEY_KEY)

        # optional configuration
        self._cluster = conf.get_string(
            RedashDashboardExtractor.CLUSTER_KEY, RedashDashboardExtractor.DEFAULT_CLUSTER
        )
        self._parse_tables = None
        # A default must be supplied here: without one, a missing `table_parser`
        # key raises instead of being treated as "no parser configured".
        tbl_parser_path = conf.get_string(RedashDashboardExtractor.TABLE_PARSER_KEY, '')
        if tbl_parser_path:
            # The parser is given as a dotted path: everything before the last
            # dot is the module, the final component is the function name.
            module_name, fn_name = tbl_parser_path.rsplit('.', 1)
            mod = importlib.import_module(module_name)
            self._parse_tables = getattr(mod, fn_name)

        self._extractor = self._build_extractor()
        self._transformer = self._build_transformer()
        self._extract_iter = None

    def _is_published_dashboard(self, record):
        # type: (Dict[str, Any]) -> bool
        """
        Return True when the dashboard record is neither archived nor a draft.
        """

        return not (record['is_archived'] or record['is_draft'])

    def _get_extract_iter(self):
        # type: () -> Iterator[Any]
        """
        Generate model objects for every published dashboard.

        For each dashboard this yields, in order: a `DashboardMetadata`, a
        `DashboardLastModifiedTimestamp`, a `DashboardOwner`, one
        `DashboardQuery` per visualization widget, and finally a
        `DashboardTable` if the table parser produced any table keys.
        """

        while True:
            record = self._extractor.extract()
            if not record:
                break  # the end.

            record = self._transformer.transform(record=record)

            if not self._is_published_dashboard(record):
                continue  # filter this one out

            # Fields shared by every model emitted for this dashboard.
            identity_data = {
                'cluster': self._cluster,
                'product': RedashDashboardExtractor.PRODUCT,
                'dashboard_group_id': RedashDashboardExtractor.DASHBOARD_GROUP_ID,
                'dashboard_id': record['dashboard_id']
            }

            dash_data = {
                'dashboard_group':
                    RedashDashboardExtractor.DASHBOARD_GROUP_NAME,
                'dashboard_group_url':
                    self._redash_base_url,
                'dashboard_name':
                    record['dashboard_name'],
                'dashboard_url':
                    '{redash}/dashboard/{slug}'
                    .format(redash=self._redash_base_url, slug=record['slug']),
                'created_timestamp':
                    record['created_timestamp']
            }
            dash_data.update(identity_data)

            widgets = sort_widgets(record['widgets'])
            text_widgets = get_text_widgets(widgets)
            viz_widgets = get_visualization_widgets(widgets)

            # generate a description for this dashboard, since Redash does not have descriptions
            dash_data['description'] = generate_dashboard_description(text_widgets, viz_widgets)

            yield DashboardMetadata(**dash_data)

            last_mod_data = {'last_modified_timestamp': record['last_modified_timestamp']}
            last_mod_data.update(identity_data)

            yield DashboardLastModifiedTimestamp(**last_mod_data)

            owner_data = {'email': record['user']['email']}
            owner_data.update(identity_data)

            yield DashboardOwner(**owner_data)

            # A set, so a table referenced by several queries is only linked once.
            table_keys = set()

            for viz in viz_widgets:
                query_data = {
                    'query_id': viz.query_id,
                    'query_name': viz.query_name,
                    'url': self._redash_base_url + viz.query_relative_url,
                    'query_text': viz.raw_query
                }

                query_data.update(identity_data)
                yield DashboardQuery(**query_data)

                # if a table parser is provided, retrieve tables from this viz
                if self._parse_tables:
                    for tbl in self._parse_tables(viz):
                        table_keys.add(tbl.key)

            if table_keys:
                yield DashboardTable(table_ids=list(table_keys), **identity_data)

    def extract(self):
        # type: () -> Any
        """
        Return the next extracted model object, or None once all dashboards
        have been processed. The underlying generator is created lazily on
        the first call.
        """

        if self._extract_iter is None:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def _build_restapi_query(self):
        # type: () -> RestApiQuery
        """
        Build the two-stage REST query: a paginated listing of dashboards,
        joined with a per-dashboard request that retrieves its widgets.
        """

        dashes_query = RedashPaginatedRestApiQuery(
            query_to_join=EmptyRestApiQuerySeed(),
            url='{redash_api}/dashboards'.format(redash_api=self._api_base_url),
            params=self._get_default_api_query_params(),
            json_path='results[*].[id,name,slug,created_at,updated_at,is_archived,is_draft,user]',
            field_names=[
                'dashboard_id', 'dashboard_name', 'slug', 'created_timestamp',
                'last_modified_timestamp', 'is_archived', 'is_draft', 'user'
            ],
            skip_no_result=True
        )

        # The doubled braces leave a literal `{slug}` placeholder in the URL after
        # formatting; presumably RestApiQuery fills it from each joined dashboard
        # record -- confirm against RestApiQuery.
        return RestApiQuery(
            query_to_join=dashes_query,
            url='{redash_api}/dashboards/{{slug}}'.format(redash_api=self._api_base_url),
            params=self._get_default_api_query_params(),
            json_path='widgets',
            field_names=['widgets'],
            skip_no_result=True
        )

    def _get_default_api_query_params(self):
        # type: () -> Dict[str, Any]
        """
        Return the request parameters shared by all API calls: the
        authentication headers derived from the configured API key.
        """

        return {'headers': get_auth_headers(self._api_key)}

    def _build_extractor(self):
        # type: () -> RestAPIExtractor
        """
        Create and initialize a `RestAPIExtractor` driven by the query from
        `_build_restapi_query`.
        """

        extractor = RestAPIExtractor()
        rest_api_extractor_conf = ConfigFactory.from_dict({
            REST_API_QUERY: self._build_restapi_query()
        })
        extractor.init(rest_api_extractor_conf)
        return extractor

    def _build_transformer(self):
        # type: () -> ChainedTransformer
        """
        Build a chained transformer that converts the `created_timestamp` and
        `last_modified_timestamp` fields with `TimestampStringToEpoch`.
        """

        transformers = []

        # transform timestamps from ISO to unix epoch
        ts_transformer_1 = TimestampStringToEpoch()
        ts_transformer_1.init(ConfigFactory.from_dict({
            TS_FIELD_NAME: 'created_timestamp',
        }))
        transformers.append(ts_transformer_1)

        ts_transformer_2 = TimestampStringToEpoch()
        ts_transformer_2.init(ConfigFactory.from_dict({
            TS_FIELD_NAME: 'last_modified_timestamp',
        }))
        transformers.append(ts_transformer_2)

        return ChainedTransformer(transformers=transformers)

    def get_scope(self):
        # type: () -> str
        """
        Return the configuration scope for this extractor.
        """

        return 'extractor.redash_dashboard'
Loading

0 comments on commit f1b0dfa

Please sign in to comment.