diff --git a/.gitignore b/.gitignore index d748e3b..68f9a1e 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,11 @@ # Python egg metadata, regenerated from source files by setuptools. /*.egg-info /*.egg + +# Images created should be checked in manually +*.png + +.coverage + +# PyCharm or IntelliJ +.idea/ diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..a58a195 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,46 @@ +language: python +python: + - 2.7 + +sudo: false + +addons: + apt_packages: + - graphviz +# command to install dependencies +install: + - pip install coveralls + - pip install -r requirements.txt + +# Setup config file +before_script: + - mkdir ~/.dataduct + - |+ + echo " + etl: + ROLE: DataPipelineDefaultRole + RESOURCE_ROLE: DataPipelineDefaultResourceRole + S3_ETL_BUCKET: FILL_ME_IN + + ec2: + CORE_INSTANCE_TYPE: m1.large + + emr: + CLUSTER_AMI: 2.4.7 + + redshift: + DATABASE_NAME: FILL_ME_IN + CLUSTER_ID: FILL_ME_IN + USERNAME: FILL_ME_IN + PASSWORD: FILL_ME_IN + + mysql: + DATABASE_KEY: + HOST: FILL_ME_IN + USERNAME: FILL_ME_IN + PASSWORD: FILL_ME_IN" > ~/.dataduct/dataduct.cfg + +# Run tests +script: nosetests --with-coverage --cover-package=. --cover-erase +after_success: + coveralls diff --git a/CHANGES.md b/CHANGES.md index 174235a..f71fc65 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,4 +1,34 @@ -# Changes in ETL_Lib +# Changes in dataduct + +### 0.2.0 +- Travis integration for continuous builds +- QA steps and logging to S3 +- Visualizing pipeline +- Dataduct CLI updated as a single entry point +- RDS connections for scripts +- Bootstrap step for pipelines +- Backfill or delay activation +- Output path and input path options +- Script directory for transform step +- SQL sanitization for DBA actions +- SQL parser for select and create table statements +- Logging across the library +- Support for custom steps +- Pipeline dependency step +- Reduce verbosity of imports +- Step parsing is isolated in steps +- More examples for steps +- Sync config with S3 +- Config overrides with modes +- Rename keywords and safe config failure handling +- EMR Streaming support with Hadoop 2 +- Exceptions cleanup +- Read the Docs support +- Creating tables automatically for various steps +- History table support +- EC2 and EMR config control from YAML +- Slack integration +- Support for Regions in DP ### 0.1.0 - Initial version of the dataduct library released diff --git a/MANIFEST.in b/MANIFEST.in index adff763..8c35769 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,5 @@ include *.txt include *.md include *.rst -include *.sh include *.py recursive-include bin * -recursive-include scripts * diff --git a/README.rst b/README.rst index 9582f9b..9a5fcb8 100644 --- a/README.rst +++ b/README.rst @@ -1,5 +1,5 @@ -Dataduct ----------- +Dataduct |build-status| |coverage-status| +----------------------------------------- Dataduct is a wrapper built on top of AWS Datapipeline which makes it easy to create ETL jobs. All jobs can be specified as a series of steps in a YAML file and would automatically be translated into datapipeline with appropriate @@ -7,7 +7,7 @@ pipeline objects. **Documentation and Details** -Documentation and more details can be found at http://pythonhosted.org/dataduct/ +Documentation and more details can be found at http://dataduct.readthedocs.org/en/latest/ **License** @@ -24,3 +24,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License. + +.. |build-status| + image:: https://travis-ci.org/coursera/dataduct.svg?branch=develop + :target: https://travis-ci.org/coursera/dataduct + +.. |coverage-status| + image:: https://coveralls.io/repos/coursera/dataduct/badge.svg?branch=develop + :target: https://coveralls.io/r/coursera/dataduct?branch=develop diff --git a/bin/dataduct b/bin/dataduct index aef6933..b2bf29b 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -1,56 +1,302 @@ #!/usr/bin/env python +# PYTHON_ARGCOMPLETE_OK +"""Script that helps create and validate pipelines from command line """ -Script that helps create and validate pipelines from command line -""" +from argparse import ArgumentParser +from pytimeparse import parse +from datetime import timedelta + +from dataduct.utils.cli import * # noqa + +import logging +logger = logging.getLogger(__name__) + +PIPELINE = 'pipeline' +CREATE = 'create' +VALIDATE = 'validate' +ACTIVATE = 'activate' +VISUALIZE = 'visualize' + +CONFIG = 'config' +CONFIG_TO_S3 = 'sync_to_s3' +CONFIG_FROM_S3 = 'sync_from_s3' + +DATABASE = 'database' +DROP = 'drop' +GRANT = 'grant' +RECREATE = 'recreate' + + +def initialize_etl_objects(pipeline_definitions, time_delta=None, + frequency_override=None, backfill=False): + """Generate etl objects from yaml files + """ + from dataduct.etl import create_pipeline + from dataduct.etl import read_pipeline_definition + + # Convert the time_delta if it exists + if time_delta is not None: + time_delta = timedelta(seconds=parse(time_delta)) + if backfill: + time_delta *= -1 + + etls = [] + for pipeline_definition in pipeline_definitions: + definition = read_pipeline_definition(pipeline_definition) + if time_delta is not None: + definition.update({'time_delta': time_delta}) + if frequency_override is not None: + definition.update({'frequency': frequency_override}) + etls.append(create_pipeline(definition)) + return etls + + +def config_actions(action, filename=None, **kwargs): + """Config related actions are executed in this block + """ + from dataduct.config.config_actions import sync_to_s3 + from dataduct.config.config_actions import sync_from_s3 -import argparse -from dataduct.definition_parser import read_pipeline_definition -from dataduct.definition_parser import create_pipeline -from dataduct.definition_parser import validate_pipeline -from dataduct.definition_parser import activate_pipeline + if action == CONFIG_TO_S3: + return sync_to_s3() + return sync_from_s3(filename) -CREATE_STR = 'create' -VALIDATE_STR = 'validate' -ACTIVATE_STR = 'activate' +def pipeline_actions(action, pipeline_definitions, force=None, time_delta=None, + frequency_override=None, activities_only=None, + filename=None, backfill=False, **kwargs): + """Pipeline related actions are executed in this block + """ + from dataduct.etl import activate_pipeline + from dataduct.etl import validate_pipeline + from dataduct.etl import visualize_pipeline + + for etl in initialize_etl_objects(pipeline_definitions, time_delta, + frequency_override, backfill): + if action in [VALIDATE, ACTIVATE]: + validate_pipeline(etl, force) + if action == ACTIVATE: + activate_pipeline(etl) + if action == VISUALIZE: + visualize_pipeline(etl, activities_only, filename) + + +def database_actions(action, table_definitions, filename=None, **kwargs): + """Database related actions are executed in this block + """ + from dataduct.database import Database + + script = None + database = Database(files=table_definitions) + if 
action == CREATE: + script = database.create_relations_script() + elif action == DROP: + script = database.drop_relations_script() + elif action == GRANT: + script = database.grant_relations_script() + elif action == RECREATE: + script = database.recreate_relations_script() + elif action == VISUALIZE: + database.visualize(filename) + + # TODO: Build execution options + if script: + print script + def main(): - """Main function""" - parser = argparse.ArgumentParser(description='Run Dataduct commands') - parser.add_argument( - '-a', - '--action', - type=str, - choices={ - CREATE_STR: 'Create a pipeline locally', - VALIDATE_STR: 'Validate a pipeline with AWS without activating', - ACTIVATE_STR: 'create a pipeline and activate it on AWS', - }, - default=CREATE_STR, - ) - parser.add_argument( - 'load_definitions', - nargs='*', - help='Enter the paths of the load definitions.', - ) - parser.add_argument( - '-f', - '--force_overwrite', + """Main function that parses the command line arguments + """ + parser = ArgumentParser( + description='Run Dataduct commands', + add_help=False, + parents=[help_parser], + formatter_class=formatter_class, + ) + subparsers = parser.add_subparsers( + dest='command', + help='Actions for various features', + ) + + # Pipeline parser + pipeline_parser = subparsers.add_parser( + PIPELINE, + formatter_class=formatter_class, + add_help=False, + parents=[help_parser] + ) + pipeline_subparsers = pipeline_parser.add_subparsers( + dest='action', + help='Pipeline actions', + ) + + # Pipeline subparsers_action + pipeline_subparsers.add_parser( + CREATE, + formatter_class=formatter_class, + parents=[ + mode_parser, + pipeline_run_options, + pipeline_definition_parser, + ], + help='Create a pipeline locally', + ) + pipeline_subparsers.add_parser( + VALIDATE, + formatter_class=formatter_class, + parents=[ + mode_parser, + pipeline_run_options, + pipeline_definition_parser, + ], + help='Validate a pipeline with AWS without activating', + ) + pipeline_subparsers.add_parser( + ACTIVATE, + formatter_class=formatter_class, + parents=[ + mode_parser, + pipeline_run_options, + pipeline_definition_parser, + ], + help='Activate the pipeline on AWS', + ) + visualize_pipeline_parser = pipeline_subparsers.add_parser( + VISUALIZE, + formatter_class=formatter_class, + parents=[ + mode_parser, + file_parser, + pipeline_definition_parser, + ], + help='Visualize the pipeline', + ) + visualize_pipeline_parser.add_argument( + '--activities_only', action='store_true', default=False, - help='Indicates that if this pipeline exists, it will be destroyed' - ' first.', + help='Visualize only activities', + ) + + # Config parser + config_parser = subparsers.add_parser( + CONFIG, + formatter_class=formatter_class, + add_help=False, + parents=[help_parser] + ) + config_subparsers = config_parser.add_subparsers( + dest='action', + help='config actions', + ) + + # config subparsers_action + config_subparsers.add_parser( + CONFIG_TO_S3, + formatter_class=formatter_class, + parents=[ + mode_parser, + ], + help='sync config file from local to s3', + ) + config_subparsers.add_parser( + CONFIG_FROM_S3, + formatter_class=formatter_class, + parents=[ + mode_parser, + file_parser, + ], + help='sync config file from s3 to local file', + ) + + # Database parser + database_parser = subparsers.add_parser( + DATABASE, + formatter_class=formatter_class, + add_help=False, + parents=[help_parser] + ) + database_subparsers = database_parser.add_subparsers( + dest='action', + help='database actions', + ) + + # database 
subparsers_action + database_subparsers.add_parser( + CREATE, + formatter_class=formatter_class, + parents=[ + mode_parser, + table_definition_parser, + ], + help='Create tables', + ) + database_subparsers.add_parser( + DROP, + formatter_class=formatter_class, + parents=[ + mode_parser, + table_definition_parser, + ], + help='Drop views and tables', + ) + database_subparsers.add_parser( + GRANT, + formatter_class=formatter_class, + parents=[ + mode_parser, + table_definition_parser, + ], + help='Grant permissions to neccessary groups', ) + database_subparsers.add_parser( + RECREATE, + formatter_class=formatter_class, + parents=[ + mode_parser, + table_definition_parser, + ], + help='Recreate tables, load new data, drop old tables', + ) + database_subparsers.add_parser( + VISUALIZE, + formatter_class=formatter_class, + parents=[ + mode_parser, + file_parser, + table_definition_parser, + ], + help='Visualize the database er-diagram', + ) + + # Check if autocomplete is possible + try: + import argcomplete + argcomplete.autocomplete(parser) + except ImportError: + pass args = parser.parse_args() + config = config_singleton_setup(args) - for load_definition in args.load_definitions: - definition = read_pipeline_definition(load_definition) - etl = create_pipeline(definition) - if args.action in [VALIDATE_STR, ACTIVATE_STR]: - validate_pipeline(etl, args.force_overwrite) - if args.action == ACTIVATE_STR: - activate_pipeline(etl) + # Frequency override + if hasattr(args, 'frequency') and args.frequency is not None: + frequency_override = args.frequency + else: + # Certain modes in the config can override frequency of a pipeline + frequency_override = config.etl.get('FREQUENCY_OVERRIDE', None) + + arg_vars = vars(args) + + # Action parse + if args.command == CONFIG: + config_actions(**arg_vars) + elif args.command == PIPELINE: + pipeline_actions(frequency_override=frequency_override, **arg_vars) + elif args.command == DATABASE: + database_actions(**arg_vars) + else: + raise ValueError('Unknown argument provided, use dataduct -h') if __name__ == '__main__': diff --git a/dataduct/__init__.py b/dataduct/__init__.py index 8bce1d9..dd420db 100644 --- a/dataduct/__init__.py +++ b/dataduct/__init__.py @@ -1,4 +1,4 @@ """Welcome to DataDuct """ -__version__ = '0.1.0' +__version__ = '0.2.0' __import__('pkg_resources').declare_namespace(__name__) diff --git a/dataduct/config/__init__.py b/dataduct/config/__init__.py index 34e05a5..548381f 100644 --- a/dataduct/config/__init__.py +++ b/dataduct/config/__init__.py @@ -1 +1,3 @@ -from config import Config +from .config import Config +from .logger_config import logger_configuration +from .credentials import get_aws_credentials diff --git a/dataduct/config/config.py b/dataduct/config/config.py index 95dc965..865bea3 100644 --- a/dataduct/config/config.py +++ b/dataduct/config/config.py @@ -1,17 +1,34 @@ -import os +"""Module that maintains the config singleton object used across the package +""" +from os.path import expanduser +from os.path import join +from os import environ import yaml -# We look at (in order of precedence): -# /etc/dataduct.cfg and ~/.dataduct for configuration constants +from .constants import CFG_FILE +from .constants import CONFIG_DIR -DataductConfigPath = '/etc/.dataduct' -DataductUserConfigPath = os.path.join(os.path.expanduser('~/.dataduct')) -DataductConfigFiles = [DataductConfigPath, DataductUserConfigPath] -# Check DATADUCT_PATH env variable for other configuration locations -if 'DATADUCT_PATH' in os.environ: - for path in 
os.environ['DATADUCT_PATH'].split(":"): - DataductConfigFiles.append(os.path.expanduser(path)) +def get_config_files(): + """Get the config file for dataduct + + Note: + The order of precedence is: + 1. /etc/dataduct.cfg + 2. ~/.dataduct + 3. DATADUCT_CONFIG_PATH environment variable + """ + dataduct_config_path = join('/etc', CFG_FILE) + dataduct_user_config_path = join(expanduser('~'), CONFIG_DIR, + CFG_FILE) + config_files = [dataduct_config_path, dataduct_user_config_path] + + # Check DATADUCT_CONFIG_PATH env variable for other configuration locations + if 'DATADUCT_CONFIG_PATH' in environ: + for path in environ['DATADUCT_CONFIG_PATH'].split(":"): + config_files.append(expanduser(path)) + + return config_files def load_yaml(configFiles): @@ -28,7 +45,49 @@ def load_yaml(configFiles): class Config(object): """Config singleton to manage changes config variables across the package """ - _shared_config = load_yaml(DataductConfigFiles) + _root_config = load_yaml(get_config_files()) + _isInstantiated = False + _root_mode = None + + def __new__(cls, mode=None): + """Runs once during class instantiation from the cli file + """ + if not cls._isInstantiated: + if mode is not None: + if mode not in cls._root_config: + raise ValueError('Specified mode not found in config') + + # Override the select fields specified based on mode + for key in cls._root_config[mode]: + if isinstance(cls._root_config[key], dict): + cls._root_config[key].update( + cls._root_config[mode][key]) + else: + cls._root_config[key] = cls._root_config[mode][key] + + cls._isInstantiated = True + cls._root_mode = mode + + obj = super(Config, cls).__new__(cls) + return obj + + def __init__(self, mode=None): + """Constructor for the config class + """ + self.__dict__ = self._root_config + + def __str__(self): + """String output for the config object + """ + return yaml.dump(self._root_config, default_flow_style=False, indent=4) + + def raw_config(self): + """String formatted config file + """ + return self.__str__() - def __init__(self): - self.__dict__ = self._shared_config + @property + def mode(self): + """Mode which the config was created in + """ + return self._root_mode diff --git a/dataduct/config/config_actions.py b/dataduct/config/config_actions.py new file mode 100644 index 0000000..c9fcee3 --- /dev/null +++ b/dataduct/config/config_actions.py @@ -0,0 +1,38 @@ +""" +Script that has action functions for config +""" +from .config import Config +from ..s3 import S3Path +from ..s3 import S3File + +from .constants import CONFIG_STR +from .constants import CFG_FILE + + +config = Config() + +def s3_config_path(): + """S3 uri for the config files + """ + key = [config.etl.get('S3_BASE_PATH', ''), CONFIG_STR, CFG_FILE] + return S3Path(bucket=config.etl['S3_ETL_BUCKET'], key=key) + + +def sync_to_s3(): + """Upload the config file to an S3 location + """ + s3_file = S3File(text=config.raw_config(), s3_path=s3_config_path()) + s3_file.upload_to_s3() + + +def sync_from_s3(filename): + """Read the config file from S3 + """ + s3_file = S3File(s3_path=s3_config_path()) + text = s3_file.text + + if filename is None: + raise ValueError('Filename for config sync must be provided') + else: + with open(filename, 'w') as op_file: + op_file.write(text) diff --git a/dataduct/config/constants.py b/dataduct/config/constants.py new file mode 100644 index 0000000..7e44208 --- /dev/null +++ b/dataduct/config/constants.py @@ -0,0 +1,7 @@ +"""Constants shared across the config package +""" + +CONFIG_STR = 'config' +CONFIG_DIR = '.dataduct' +CFG_FILE = 
'dataduct.cfg' +LOG_FILE = 'dataduct.log' diff --git a/dataduct/config/credentials.py b/dataduct/config/credentials.py new file mode 100644 index 0000000..cea9a62 --- /dev/null +++ b/dataduct/config/credentials.py @@ -0,0 +1,76 @@ +"""Credentials utility functions for connecting to various services +""" +import os +import requests +import sys +from ConfigParser import SafeConfigParser + + +def get_aws_credentials_from_iam(): + """Get aws credentials using the IAM api + Note: this script only runs on an EC2 instance with the appropriate + resource roles. For more information, see the following: + http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/\ + AESDG-chapter-instancedata.html + + Returns: + access_key(str): AWS access key + secret_key(str): AWS secret key + token(str): Connection token + """ + url = 'http://169.254.169.254/latest/meta-data/iam/security-credentials/' + + # Get role name + r = requests.get(url) + + if not r.ok: + raise Exception('Request failed for url %s.' % url) + + # Add role name to url + url += r.content + + # Get access keys + r = requests.get(url) + if not r.ok: + raise Exception('Request failed for url %s.' % url) + + json_result = r.json() + return (json_result['AccessKeyId'], + json_result['SecretAccessKey'], + json_result['Token']) + + +def get_aws_credentials_from_file(filename=None): + """Get the aws from credential files + """ + config = SafeConfigParser() + cred_file = None + if filename is not None and os.path.isfile(filename): + cred_file = filename + elif os.path.isfile('/etc/boto.cfg'): + cred_file = '/etc/boto.cfg' + elif os.path.isfile(os.path.expanduser('~/.boto')): + cred_file = os.path.expanduser('~/.boto') + elif os.path.isfile(os.path.expanduser('~/.aws/credentials')): + cred_file = os.path.expanduser('~/.aws/credentials') + else: + raise Exception("Cannot find a credentials file") + + config.read(cred_file) + aws_access_key_id = config.get('Credentials', + 'aws_access_key_id') + aws_secret_access_key = config.get('Credentials', + 'aws_secret_access_key') + return (aws_access_key_id, aws_secret_access_key, None) + + +def get_aws_credentials(filename=None): + """Get the aws credentials from IAM or files + """ + try: + aws_key, aws_secret, token = get_aws_credentials_from_iam() + except Exception, error: + sys.stderr.write('Failed to get creds from IAM: %s \n' % error.message) + aws_key, aws_secret, token = get_aws_credentials_from_file(filename) + + return aws_key, aws_secret, token diff --git a/dataduct/config/example_config b/dataduct/config/example_config index d56a811..5028212 100644 --- a/dataduct/config/example_config +++ b/dataduct/config/example_config @@ -1,47 +1,17 @@ # Constants that are used across the dataduct library ec2: - DEFAULT_ROLE: FILL_ME_IN - DEFAULT_RESOURCE_ROLE: FILL_ME_IN - DEFAULT_EC2_INSTANCE_TYPE: m1.large - ETL_AMI: ami-05355a6c # Default AMI used by data pipeline - KEY_PAIR: FILL_ME_IN - SECURITY_GROUP: FILL_ME_IN + INSTANCE_TYPE: m1.large + ETL_AMI: ami-05355a6c # Default AMI used by data pipeline + SECURITY_GROUP: FILL_ME_IN emr: - DEFAULT_NUM_CORE_INSTANCES: 3 - DEFAULT_CORE_INSTANCE_TYPE: m1.large - DEFAULT_TASK_INSTANCE_BID_PRICE: null # null if we want it to be None - DEFAULT_TASK_INSTANCE_TYPE: m1.large - DEFAULT_MASTER_INSTANCE_TYPE: m1.large - DEFAULT_CLUSTER_TIMEOUT: 6 Hours - DEFAULT_HADOOP_VERSION: null - DEFAULT_HIVE_VERSION: null - DEFAULT_PIG_VERSION: null - DEFAULT_CLUSTER_AMI: 2.4.7 - -redshift: - REDSHIFT_DATABASE_NAME: FILL_ME_IN - REDSHIFT_CLUSTER_ID: FILL_ME_IN - REDSHIFT_USERNAME: FILL_ME_IN - 
REDSHIFT_PASSWORD: FILL_ME_IN - -mysql: - DATABASE_KEY: - HOST: FILL_ME_IN, - USERNAME: FILL_ME_IN, - PASSWORD: FILL_ME_IN + MASTER_INSTANCE_TYPE: m1.large + NUM_CORE_INSTANCES: 1 + CORE_INSTANCE_TYPE: m1.large + CLUSTER_AMI: 3.1.0 etl: - RETRY_DELAY: 10 Minutes - DEFAULT_MAX_RETRIES: 0 - ETL_BUCKET: FILL_ME_IN - DATA_PIPELINE_TOPIC_ARN: FILL_ME_IN - DAILY_LOAD_TIME: 1 # run at 1AM UTC - -bootstrap: - - step_type: transform - input_node: [] - command: whoami >> ${OUTPUT1_STAGING_DIR}/output.txt - resource: FILL_ME_IN - name: bootstrap_transform + S3_ETL_BUCKET: FILL_ME_IN + ROLE: FILL_ME_IN + RESOURCE_ROLE: FILL_ME_IN diff --git a/dataduct/config/logger_config.py b/dataduct/config/logger_config.py new file mode 100644 index 0000000..af739fc --- /dev/null +++ b/dataduct/config/logger_config.py @@ -0,0 +1,50 @@ +"""Script that has the base logger configurations +""" +import os +import logging +from logging.handlers import RotatingFileHandler + +from .config import Config +from .constants import CONFIG_DIR +from .constants import LOG_FILE + +FILE_FORMAT_STR = '%(asctime)s [%(levelname)s]: %(message)s ' + \ + '[in %(name)s:%(lineno)d in %(funcName)s]' +CONSOLE_FORMAT_STR = '[%(levelname)s]: %(message)s' + + +def logger_configuration(): + """Set the logger configurations for dataduct + """ + config = Config() + + if hasattr(config, 'logging'): + log_directory = os.path.expanduser(config.logging.get( + 'LOG_DIR', '~' + CONFIG_DIR)) + file_name = config.logging.get( + 'LOG_FILE', LOG_FILE) + + console_level = config.logging.get( + 'CONSOLE_DEBUG_LEVEL', logging.INFO) + file_level = config.logging.get( + 'FILE_DEBUG_LEVEL', logging.DEBUG) + + if not os.path.exists(log_directory): + os.mkdir(log_directory) + + logger = logging.getLogger() + logger.setLevel(logging.DEBUG) + + file_handler = RotatingFileHandler(os.path.join(log_directory, file_name), + maxBytes=200000, + backupCount=10) + file_handler.setLevel(file_level) + file_handler.setFormatter(logging.Formatter(FILE_FORMAT_STR, + datefmt='%Y-%m-%d %H:%M')) + + console_handler = logging.StreamHandler() + console_handler.setLevel(console_level) + console_handler.setFormatter(logging.Formatter(CONSOLE_FORMAT_STR)) + + logger.addHandler(console_handler) + logger.addHandler(file_handler) diff --git a/dataduct/config/tests/__init__.py b/dataduct/config/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dataduct/config/tests/test_credentials.py b/dataduct/config/tests/test_credentials.py new file mode 100644 index 0000000..4a96926 --- /dev/null +++ b/dataduct/config/tests/test_credentials.py @@ -0,0 +1,47 @@ +"""Tests for credentials file +""" +from mock import patch +from nose.tools import eq_ +import json + +from ..credentials import get_aws_credentials_from_iam + +@patch('requests.get') +def test_get_aws_credentials_from_iam(patched_requests_get): + """Test for get credentials from IAM + """ + class MockedReturn: + """Mock request response + """ + def __init__(self, content): + self.content = content + self.ok = True + + def json(self): + """Returns a json for the content + """ + return json.loads(self.content) + + def server_response(url): + """Mocked server responses + """ + if url == 'http://169.254.169.254/latest/meta-data/iam/security-credentials/': # NOQA + return MockedReturn("role") + if url == 'http://169.254.169.254/latest/meta-data/iam/security-credentials/role': # NOQA + return MockedReturn(""" + { + "Code" : "Success", + "LastUpdated" : "2012-04-26T16:39:16Z", + "Type" : "AWS-HMAC", + "AccessKeyId" : 
"access_id", + "SecretAccessKey" : "secret_key", + "Token" : "token", + "Expiration" : "2012-04-27T22:39:16Z" + } + """) + + patched_requests_get.side_effect = server_response + access_id, secret_key, token = get_aws_credentials_from_iam() + eq_(access_id, 'access_id') + eq_(secret_key, 'secret_key') + eq_(token, 'token') diff --git a/dataduct/data_access/__init__.py b/dataduct/data_access/__init__.py new file mode 100644 index 0000000..731c805 --- /dev/null +++ b/dataduct/data_access/__init__.py @@ -0,0 +1,4 @@ +from .connection import get_sql_config +from .connection import rds_connection +from .connection import get_redshift_config +from .connection import redshift_connection diff --git a/dataduct/data_access/connection.py b/dataduct/data_access/connection.py new file mode 100644 index 0000000..224d26b --- /dev/null +++ b/dataduct/data_access/connection.py @@ -0,0 +1,79 @@ +""" +Connections to various databases such as RDS and Redshift +""" +import psycopg2 +import MySQLdb +import MySQLdb.cursors + +from ..config import Config +from ..utils.helpers import retry +from ..utils.helpers import exactly_one +from ..utils.exceptions import ETLConfigError + +config = Config() +CONNECTION_RETRIES = config.etl.get('CONNECTION_RETRIES', 2) + + +def get_redshift_config(): + """Get redshift config from config file and return the dictionary + """ + if not hasattr(config, 'redshift'): + raise ETLConfigError('Redshift config not found') + return config.redshift + + +@retry(CONNECTION_RETRIES, 60) +def redshift_connection(redshift_creds=None, **kwargs): + """Fetch a psql connection object to redshift + """ + if redshift_creds is None: + redshift_creds = get_redshift_config() + + connection = psycopg2.connect( + host=redshift_creds['HOST'], + user=redshift_creds['USERNAME'], + password=redshift_creds['PASSWORD'], + port=redshift_creds['PORT'], + database=redshift_creds['DATABASE_NAME'], + connect_timeout=10, + **kwargs) + return connection + + +def get_sql_config(database_name): + """Get SQL config from config file and return the dictionary + """ + if not hasattr(config, 'mysql'): + raise ETLConfigError('mysql not found in dataduct configs') + + if database_name not in config.mysql: + raise ETLConfigError( + 'Config for hostname: %s not found' % database_name) + + sql_creds = config.mysql[database_name] + sql_creds['DATABASE'] = database_name + + return sql_creds + + +@retry(CONNECTION_RETRIES, 60) +def rds_connection(database_name=None, sql_creds=None, + cursorclass=MySQLdb.cursors.SSCursor, **kwargs): + """Fetch a mysql connection object to rds databases + """ + + assert exactly_one(database_name, sql_creds), \ + 'Either database or params needed' + + if sql_creds is None: + sql_creds = get_sql_config(database_name) + + connection = MySQLdb.connect( + host=sql_creds['HOST'], + user=sql_creds['USERNAME'], + passwd=sql_creds['PASSWORD'], + db=sql_creds['DATABASE'], + charset='utf8', # Necessary for foreign chars + cursorclass=cursorclass, + **kwargs) + return connection diff --git a/dataduct/database/__init__.py b/dataduct/database/__init__.py new file mode 100644 index 0000000..3a2db43 --- /dev/null +++ b/dataduct/database/__init__.py @@ -0,0 +1,7 @@ +from .database import Database +from .select_statement import SelectStatement +from .sql import SqlScript +from .sql import SqlStatement +from .table import Table +from .view import View +from .history_table import HistoryTable diff --git a/dataduct/database/column.py b/dataduct/database/column.py new file mode 100644 index 0000000..948c099 --- /dev/null 
+++ b/dataduct/database/column.py @@ -0,0 +1,61 @@ +"""Script containing the column class object +""" + +class Column(object): + """Class representing columns in a table + """ + def __init__(self, column_name, column_type, encoding=None, + fk_reference=None, fk_table=None, is_distkey=False, + is_sortkey=False, is_primarykey=False, is_null=False, + is_not_null=False, position=None): + """Constructor for Column class + """ + + self.column_name = column_name + self.column_type = column_type + self.encoding = encoding + self.fk_reference = fk_reference + self.fk_table = fk_table + self.is_distkey = is_distkey + self.is_sortkey = is_sortkey + self.is_primarykey = is_primarykey + self.is_null = is_null + self.is_not_null = is_not_null + self.position = position + + if is_null and is_not_null: + raise ValueError('Column cannot be both NULL and NOT NULL together') + + if self.is_primarykey: + self.is_not_null = True + self.is_null = False + + def __str__(self): + """String output for the columns + """ + if self.column_type is not None: + return '%s %s' % (self.column_name, self.column_type) + return self.column_name + + @property + def primary(self): + """Property for the column being part of primary key + """ + return self.is_primarykey + + @primary.setter + def primary(self, value=True): + """Set the primary flag for the column + """ + self.is_primarykey = value + + # Force not null for primary key columns + if self.is_primarykey: + self.is_not_null = True + self.is_null = False + + @property + def name(self): + """Get the name of the column + """ + return self.column_name diff --git a/dataduct/database/database.py b/dataduct/database/database.py new file mode 100644 index 0000000..0589093 --- /dev/null +++ b/dataduct/database/database.py @@ -0,0 +1,248 @@ +"""Script containing the database class object +""" +from copy import deepcopy + +from .relation import Relation +from .view import View +from .table import Table +from .sql import SqlScript + +from ..utils.helpers import atmost_one +from ..utils.exceptions import DatabaseInputError + +import logging +logger = logging.getLogger(__name__) + + +class Database(object): + """Class representing a database + """ + + def __init__(self, relations=None, files=None): + """Constructor for the database class + """ + self._relations = {} + + if not atmost_one(relations, files): + raise ValueError('Only one of relations and files should be given') + + if files: + relations = self._initialize_relations(files) + + if relations: + for relation in relations: + self.add_relation(relation) + + def copy(self): + """Create a copy of the database object + """ + return deepcopy(self) + + @staticmethod + def _initialize_relations(files): + """Read the files and create relations from the files + """ + relations = [] + for filename in files: + with open(filename) as f: + script = SqlScript(f.read()) + if script.creates_table(): + relations.append(Table(script)) + elif script.creates_view(): + relations.append(View(script)) + else: + raise ValueError('File does not create a relation') + return relations + + def add_relation(self, relation): + """Add a relation, only if its name is not already used. 
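As a quick illustration of the Column class defined above (a sketch, not part of this patch; the column name and type are invented):

from dataduct.database.column import Column

col = Column('customer_id', 'INTEGER', is_distkey=True)
col.primary = True   # the setter also forces NOT NULL on primary key columns
print col            # -> customer_id INTEGER
print col.name, col.is_not_null, col.is_null   # -> customer_id True False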
+ """ + assert isinstance(relation, Relation), 'Input should be a relation' + if relation.full_name in self._relations: + raise ValueError( + 'Relation %s already added to database' % relation.full_name) + + self._relations[relation.full_name] = relation + + def relations(self): + """Unsorted list of relations of the database + """ + return self._relations.values() + + def relation(self, relation_name): + """Get the relation with the given name + """ + return self._relations.get(relation_name, None) + + @property + def num_views(self): + """The number of views in the database + """ + return len([a for a in self.relations() if isinstance(a, View)]) + + @property + def num_tables(self): + """The number of tables in the database + """ + return len([a for a in self.relations() if isinstance(a, Table)]) + + def has_cycles(self, relation=None, visited=None): + """Check if the database has no circular dependencies + """ + if visited is None: + visited = list() + + if relation: + # Don't include table as own dependency, ignore references not in DB + relations_to_check = [ + self.relation(x) for x in relation.dependencies + if x != relation and self.relation(x) is not None] + else: + relations_to_check = self._relations.values() + + for relation in relations_to_check: + if relation.full_name in visited: + return True + # Make a copy for immutability + visited_copy = deepcopy(visited) + visited_copy.append(relation.full_name) + if self.has_cycles(relation, visited_copy): + return True + return False + + def sorted_relations(self): + """Topological sort of the relations for dependency management + """ + if self.has_cycles(): + logger.warning('Database has cycles') + + sorted_relations = [] + graph = dict((x.full_name, x.dependencies) for x in self.relations()) + + # Run until the unsorted graph is empty + while graph: + acyclic = False + for relation_name, dependencies in graph.items(): + for dependency in dependencies: + if dependency in graph: + break + else: + acyclic = True + graph.pop(relation_name) + sorted_relations.append(self.relation(relation_name)) + + if not acyclic: + raise RuntimeError("A cyclic dependency occurred") + return sorted_relations + + def relations_script(self, function_name, **kwargs): + """SQL Script for all the relations of the database + """ + result = SqlScript() + for relation in self.sorted_relations(): + func = getattr(relation, function_name) + result.append(func(**kwargs)) + return result + + def grant_relations_script(self): + """SQL Script for granting permissions all the relations of the database + """ + return self.relations_script('grant_script') + + def create_relations_script(self, grant_permissions=True): + """SQL Script for creating all the relations of the database + """ + return self.relations_script( + 'create_script', grant_permissions=grant_permissions) + + def drop_relations_script(self): + """SQL Script for dropping all the relations for the database + """ + return self.relations_script('drop_script') + + def recreate_relations_script(self, grant_permissions=True): + """SQL Script for recreating all the relations of the database + """ + return self.relations_script( + 'recreate_script', grant_permissions=grant_permissions) + + def recreate_table_dependencies(self, table_name, grant_permissions=True): + """Recreate the dependencies for a particular table from the database + """ + result = SqlScript() + for relation in self.relations(): + if relation.full_name == table_name: + # Continue as cannnot be dependecy of self + continue + + if 
isinstance(relation, Table): + # Recreate foreign key relations + for column_names, ref_name, ref_columns in \ + relation.foreign_key_references(): + if ref_name == table_name: + result.append( + relation.foreign_key_reference_script( + source_columns=column_names, + reference_name=ref_name, + reference_columns=ref_columns)) + + if isinstance(relation, View): + # Recreate view if pointing to table + if table_name in relation.dependencies: + result.append(relation.recreate_script( + grant_permissions=grant_permissions)) + return result + + @staticmethod + def _make_node_label(relation): + """Create the table layout for graph nodes + """ + columns = list() + row = '{col_name}{pk}' + for column in sorted(relation.columns(), key=lambda x: x.position): + columns.append(row.format(col_name=column.name, + pk=' (PK)' if column.primary else '')) + + layout = ('<\n' + '\n' + '{columns}
{table_name}
>').format(table_name=relation.full_name, + columns='\n'.join(columns)) + return layout + + def visualize(self, filename=None): + """Visualize databases and create an er-diagram + + Args: + filename(str): filepath for saving the er-diagram + """ + # Import pygraphviz for plotting the graphs + try: + import pygraphviz + except ImportError: + logger.error('Install pygraphviz for visualizing databases') + raise + + if filename is None: + raise DatabaseInputError( + 'Filename must be provided for visualization') + + logger.info('Creating a visualization of the database') + graph = pygraphviz.AGraph(name='Database', label='Database') + + tables = [r for r in self.relations() if isinstance(r, Table)] + + # Add nodes + for table in tables: + graph.add_node(table.full_name, shape='none', + label=self._make_node_label(table)) + + # Add edges + for table in tables: + for cols, ref_table, ref_cols in table.foreign_key_references(): + graph.add_edge(ref_table, table.full_name, tailport=ref_cols[0], + headport=cols[0], dir='both', arrowhead='crow', + arrowtail='dot') + + # Plotting the graph with dot layout + graph.layout(prog='dot') + graph.draw(filename) diff --git a/dataduct/database/history_table.py b/dataduct/database/history_table.py new file mode 100644 index 0000000..dc0cccd --- /dev/null +++ b/dataduct/database/history_table.py @@ -0,0 +1,202 @@ +"""Script containing the history table class object +Child of the table class object +""" + +from .table import Table +from .sql import SqlScript +from .select_statement import SelectStatement + +HIST_EFFECTIVE_COLUMN = 'effective_ts' +HIST_EXPIRATION_COLUMN = 'expiration_ts' +HIST_EXPIRATION_MAX = '9999-12-31 23:59:59.999999' + + +class HistoryTable(Table): + """A history table is a table specifically designed to represent + Slowly Changing Dimensions + (http://en.wikipedia.org/wiki/Slowly_changing_dimension). + + Its first two columns must be an effective timestamp and an expiration + timestamp, but otherwise it looks just like a regular table. + """ + + def __init__(self, sql): + """Constructor for the HistoryTable class + """ + super(HistoryTable, self).__init__(sql) + # Check that first column is the effective timestamp + # And the second column is the expiration timestamp + if self.column(HIST_EFFECTIVE_COLUMN) is None or\ + self.column(HIST_EXPIRATION_COLUMN) is None: + raise ValueError('History table must have effective and expiration' + ' timestamps') + + def _select_current_script(self): + """SQL script to select current view of table + """ + + # Get all columns except for the two timestamps + selected_columns = [c.name for c in self.columns() + if c.name != HIST_EFFECTIVE_COLUMN and + c.name != HIST_EXPIRATION_COLUMN] + + return SelectStatement(""" + SELECT {selected_columns} + FROM {history_name} + WHERE {expiration_column} = '{expiration_max}' + """.format(selected_columns=', '.join(selected_columns), + history_name=self.full_name, + expiration_column=HIST_EXPIRATION_COLUMN, + expiration_max=HIST_EXPIRATION_MAX)) + + def _expire_history_script(self, source): + """SQL script to expire outdated records + + Args: + source (Table): The source from which to update history + + Returns: + SqlScript: a SQL statement that removes outdated records + + A history row will be expired if: + It is currently unexpired (expiration timestamp is at max); and + either: + It's corresponding row in the source table has been changed; or + It's corresponding row in the source table has been deleted. 
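To make the expire-and-update flow described above concrete, a hedged sketch (not part of this patch; the table definitions are invented, and update_history_script is the public wrapper defined further below):

from dataduct.database import HistoryTable
from dataduct.database import SqlScript
from dataduct.database import Table

source = Table(SqlScript(
    'CREATE TABLE customers ('
    'customer_id INTEGER PRIMARY KEY,'
    'customer_name VARCHAR(200))'))

history = HistoryTable(SqlScript(
    'CREATE TABLE customers_history ('
    'effective_ts TIMESTAMP,'
    'expiration_ts TIMESTAMP,'
    'customer_id INTEGER,'
    'customer_name VARCHAR(200))'))

# Expires changed or deleted rows in the history table, then inserts the new versions
print history.update_history_script(source)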
+ """ + + if not isinstance(source, Table): + raise ValueError('Source must be a table') + + # Get the secondary columns of the table + secondary_columns = [column for column in source.columns() + if not column.primary] + + # There must be at least one primary and secondary key + if len(source.primary_keys) == 0: + raise ValueError('Source table must have a primary key') + if len(secondary_columns) == 0: + raise ValueError('Source table must have a non-primary column') + + # Expire if corresponding row in the source table has been changed + # First, match primary key info to determine corresponding rows + same_statement =\ + '{history_name}.{column_name} = {source_name}.{column_name}' + matching_primary_keys_condition = ' AND '.join( + [same_statement.format(history_name=self.full_name, + source_name=source.full_name, + column_name=column.name) + for column in source.primary_keys] + ) + # Then, filter to get only the records that have changed + # A record has been changed if one of it's non-primary columns + # are different + different_statement = """ + {history_name}.{column_name} != {source_name}.{column_name} + OR ( + {history_name}.{column_name} IS NULL + AND {source_name}.{column_name} IS NOT NULL + ) + OR ( + {history_name}.{column_name} IS NOT NULL + AND {source_name}.{column_name} IS NULL + ) + """ + record_changed_condition = '(' + ' OR '.join( + [different_statement.format(history_name=self.full_name, + source_name=source.full_name, + column_name=column.name) + for column in secondary_columns] + ) + ')' + # Lastly, filter to get only the non-expired columns + # This statement will be reused for the removal check + not_expired_condition =\ + '{expiration_column} = \'{expiration_max}\''.format( + expiration_column=HIST_EXPIRATION_COLUMN, + expiration_max=HIST_EXPIRATION_MAX, + ) + # Expire changed columns + script = SqlScript(""" + UPDATE {history_name} + SET {expiration_column} = SYSDATE - INTERVAL '0.000001 seconds' + FROM {source_name} + WHERE {matching_primary_keys} + AND {record_changed} + AND {not_expired}; + """.format(history_name=self.full_name, + expiration_column=HIST_EXPIRATION_COLUMN, + source_name=source.full_name, + matching_primary_keys=matching_primary_keys_condition, + record_changed=record_changed_condition, + not_expired=not_expired_condition)) + + # Expire if corresponding row in the source table has been deleted + # Filter to get the history rows which have primary keys + # that are no longer in the source table + primary_keys = ",".join([name for name in source.primary_key_names]) + missing_primary_keys_condition = """ + ( + {primary_keys} + ) + NOT IN ( + SELECT {primary_keys} + FROM {source_name} + ) + """.format(primary_keys=primary_keys, + source_name=source.full_name) + + script.append(""" + UPDATE {history_name} + SET {expiration_column} = SYSDATE - INTERVAL '0.000001 seconds' + WHERE {missing_primary_keys} + AND {not_expired}; + """.format(history_name=self.full_name, + expiration_column=HIST_EXPIRATION_COLUMN, + missing_primary_keys=missing_primary_keys_condition, + not_expired=not_expired_condition)) + return script + + def update_history_script(self, source): + """SQL script to update the history table + + Args: + source (Table): The source from which to update history + + Returns: + SqlScript: a SQL statement that updates history + + Raises: + ValueError: If source is not a Table object + """ + + if not isinstance(source, Table): + raise ValueError('Source must be a table') + + # Create a temporary copy of the source relation as another table + 
temp_table = Table(source.temporary_clone_script()) + result = temp_table.create_script(grant_permissions=False) + + # Insert the values of the original table into the temp table + result.append(temp_table.insert_script(source)) + + # Expire outdated records + result.append(self._expire_history_script(source)) + + # Delete records from the temp table that have not changed + result.append( + temp_table.delete_matching_rows_script( + self._select_current_script())) + + # Insert the remaining rows into destination + select_statement = SelectStatement(""" + SELECT SYSDATE, '{expiration_max}'::TIMESTAMP, {columns} + FROM {temp_table_name} + """.format(expiration_max=HIST_EXPIRATION_MAX, + columns=', '.join( + [c.name for c in temp_table.columns()]), + temp_table_name=temp_table.full_name)) + result.append(self.insert_script(select_statement)) + + # Drop the temp table, in case the temporary flag isn't enough + result.append(temp_table.drop_script()) + return result diff --git a/dataduct/database/parsers/__init__.py b/dataduct/database/parsers/__init__.py new file mode 100644 index 0000000..61d5813 --- /dev/null +++ b/dataduct/database/parsers/__init__.py @@ -0,0 +1,13 @@ +from .transform import remove_comments +from .transform import remove_empty_statements +from .transform import remove_transactional +from .transform import split_statements +from .transform import remove_newlines + +from .select_query import parse_select_dependencies +from .select_query import parse_select_columns +from .select_query import parse_column_name + +from .create_table import parse_create_table +from .create_table import create_exists_clone +from .create_view import parse_create_view diff --git a/dataduct/database/parsers/create_table.py b/dataduct/database/parsers/create_table.py new file mode 100644 index 0000000..f2372c6 --- /dev/null +++ b/dataduct/database/parsers/create_table.py @@ -0,0 +1,177 @@ +"""Create SQL parser +""" +from pyparsing import restOfLine +from pyparsing import ParseException +from pyparsing import ZeroOrMore + +from .utils import _all +from .utils import _create +from .utils import _db_name +from .utils import _distkey +from .utils import _diststyle +from .utils import _encode +from .utils import _even +from .utils import _foreign_key +from .utils import _key +from .utils import _not_null +from .utils import _null +from .utils import _references +from .utils import _sortkey +from .utils import _table +from .utils import column_types +from .utils import field_parser +from .utils import pk_check + +from .helpers import existance_check +from .helpers import exists +from .helpers import paranthesis_list +from .helpers import temporary_check +from .helpers import to_dict + +import logging +logger = logging.getLogger(__name__) + +FK_REFERENCE = 'fk_reference' + + +def fk_reference(): + """Get Parser for foreign key references + """ + fk_reference_columns = paranthesis_list(FK_REFERENCE) + fk_table = _db_name.setResultsName('fk_table') + return _references + fk_table + fk_reference_columns + + +def get_definition_start(): + """Get a pyparsing parse for start of the create table statement + + Returns: + table_definition(pyparsing): Parser for create table statements + """ + temp_check = temporary_check.setResultsName('temporary') + exists_check = existance_check.setResultsName('exists_checks') + + table_name = _db_name.setResultsName('full_name') + + # Initial portions of the table definition + def_start = _create + temp_check + _table + exists_check + table_name + return def_start + + +def 
get_base_parser(): + """Get a pyparsing parser for a create table statement + + Returns: + table_definition(pyparsing): Parser for create table statements + """ + table_def = get_definition_start() + \ + paranthesis_list('raw_fields', field_parser) + \ + get_attributes_parser() + + return table_def + + +def get_column_parser(): + """Get a pyparsing parser for a create table column field statement + + Returns: + column_definition(pyparsing): Parser for column definitions + """ + column_name = _db_name.setResultsName('column_name') + column_type = column_types.setResultsName('column_type') + + constraints = exists(_not_null, 'is_not_null') + constraints |= exists(_null, 'is_null') + constraints |= exists(pk_check, 'is_primarykey') + constraints |= exists(_distkey, 'is_distkey') + constraints |= exists(_sortkey, 'is_sortkey') + constraints |= fk_reference() + constraints |= _encode + _db_name.setResultsName('encoding') + + column_def = column_name + column_type + ZeroOrMore(constraints) + return column_def + + +def get_constraints_parser(): + """Get a pyparsing parser for a create table constraints field statement + + Returns: + constraints_definition(pyparsing): Parser for constraints definitions + """ + # Primary Key Constraints + def_pk = pk_check + paranthesis_list('pk_columns') + + # Foreign Key Constraints + def_fk = _foreign_key + paranthesis_list('fk_columns') + fk_reference() + + return def_pk | def_fk + + +def get_attributes_parser(): + """Get a pyparsing parser for a create table attributes + + Returns: + attribute_parser(pyparsing): Parser for attribute definitions + """ + diststyle_def = _diststyle + (_all | _even | _key).setResultsName( + 'diststyle') + + distkey_def = _distkey + paranthesis_list('distkey') + sortkey_def = _sortkey + paranthesis_list('sortkey') + + return ZeroOrMore(diststyle_def | sortkey_def | distkey_def) + + +def parse_create_table(string): + """Parse the create table sql query and return metadata + + Args: + string(sql): SQL string from a SQL Statement + + Returns: + table_data(dict): table_data dictionary for instantiating a table object + """ + # Parse the base table definitions + table_data = to_dict(get_base_parser().parseString(string)) + + # Parse the columns and append to the list + table_data['columns'] = list() + table_data['constraints'] = list() + + column_position = 0 + for field in table_data['raw_fields']: + try: + column = to_dict(get_column_parser().parseString(field)) + + # Add position of the column + column['position'] = column_position + column_position += 1 + + # Change fk_reference_column to string from list + if FK_REFERENCE in column: + column[FK_REFERENCE] = column[FK_REFERENCE][0] + + table_data['columns'].append(column) + + except ParseException: + try: + constraint = to_dict( + get_constraints_parser().parseString(field)) + table_data['constraints'].append(constraint) + except ParseException: + logger.error(field) + raise + + return table_data + + +def create_exists_clone(string): + """Create a clone of the table statement which has the exists check + """ + parser = get_definition_start() + restOfLine.setResultsName('definition') + result = to_dict(parser.parseString(string)) + template = 'CREATE {temp} TABLE IF NOT EXISTS {table_name} {definition}' + return template.format(temp='TEMP' if result['temporary'] else '', + table_name=result['full_name'], + definition=result['definition']) + diff --git a/dataduct/database/parsers/create_view.py b/dataduct/database/parsers/create_view.py new file mode 100644 index 0000000..10dd63a --- 
/dev/null +++ b/dataduct/database/parsers/create_view.py @@ -0,0 +1,48 @@ +"""Create SQL parser +""" +from pyparsing import Group +from pyparsing import printables +from pyparsing import StringEnd +from pyparsing import Word +from pyparsing import ZeroOrMore + +from .utils import _create +from .utils import _view +from .utils import _db_name +from .utils import _as + +from .helpers import to_dict +from .helpers import replace_check + + +merge = lambda x: ' '.join(x[0]) + + +def rreplace(s, old, new): + li = s.rsplit(old, 1) + return new.join(li) + +def parse_create_view(string): + """Parse the create view sql query and return metadata + + Args: + string(str): Input sql string that should be parsed + + Returns: + view_data(dict): view_data dictionary for instantiating a view object + """ + + string = rreplace(string, ')', ' )') + + end = ')' + StringEnd() + select = Group(ZeroOrMore(~end + Word(printables))) + + parser = _create + replace_check.setResultsName('replace') + _view + parser += _db_name.setResultsName('view_name') + _as + '(' + parser += select.setParseAction(merge).setResultsName('select_statement') + parser += end + + # Parse the base table definitions + view_data = to_dict(parser.parseString(string)) + + return view_data diff --git a/dataduct/database/parsers/helpers.py b/dataduct/database/parsers/helpers.py new file mode 100644 index 0000000..74068c5 --- /dev/null +++ b/dataduct/database/parsers/helpers.py @@ -0,0 +1,45 @@ +"""SQL parser helpers +""" +from pyparsing import delimitedList +from pyparsing import Optional +from pyparsing import ParseResults + +from .utils import _db_name +from .utils import _temp +from .utils import _temporary +from .utils import _if_not_exists +from .utils import _or_replace + +# Functions +isNotEmpty = lambda x: len(x) > 0 + +temporary_check = Optional(_temp | _temporary).setParseAction(isNotEmpty) + +replace_check = Optional(_or_replace).setParseAction(isNotEmpty) + +existance_check = Optional(_if_not_exists).setParseAction(isNotEmpty) + + +def paranthesis_list(output_name, input_var=_db_name): + """Parser for a delimiedList enclosed in paranthesis + """ + return '(' + delimitedList(input_var).setResultsName(output_name) + ')' + + +def exists(parser, output_name): + """Get a parser that returns boolean on existance + """ + return parser.setParseAction(isNotEmpty).setResultsName(output_name) + + +def to_dict(input): + """Purge the ParseResults from output dictionary + """ + output = dict() + for key, value in input.asDict().iteritems(): + if isinstance(value, ParseResults): + output[key] = value.asList() + else: + output[key] = value + + return output diff --git a/dataduct/database/parsers/select_query.py b/dataduct/database/parsers/select_query.py new file mode 100644 index 0000000..e4623c2 --- /dev/null +++ b/dataduct/database/parsers/select_query.py @@ -0,0 +1,123 @@ +"""Select SQL parser +""" +from pyparsing import delimitedList +from pyparsing import MatchFirst +from pyparsing import printables +from pyparsing import restOfLine +from pyparsing import Word +from pyparsing import WordStart +from pyparsing import ParseException +from pyparsing import Optional + +from .utils import _as +from .utils import _db_name +from .utils import _from +from .utils import _join +from .utils import _select +from .utils import _with +from .utils import subquery +from .utils import field_parser + + +def deduplicate_with_order(seq): + """Deduplicate a sequence while preserving the order + """ + seen = set() + seen_add = seen.add + return [x for x in seq if 
not (x in seen or seen_add(x))] + + +def parse_select_base(string): + """Parse a select query and return the dependencies + + Args: + string(str): Input string to be parsed + + Returns: + result(list of str): List of dependent tables + """ + + if string == '': + return + + base_parser = _select + restOfLine + + # Sanity check that query starts with select + base_parser.parseString(string) + + +def parse_select_dependencies(string): + """Parse a select query and return the dependencies + + Args: + string(str): Input string to be parsed + + Returns: + result(list of str): List of dependent tables + """ + + if string == '': + return list() + + # Find all dependent tables + dep_parse = WordStart() + (_from | _join) + _db_name.setResultsName('table') + output = dep_parse.setParseAction(lambda x: x.table).searchString(string) + + # Flatten the list before returning + flattened_output = [item for sublist in output for item in sublist] + + # Deduplicated the list + unique_output = deduplicate_with_order(flattened_output) + + if len(unique_output) == 0: + raise ParseException('No dependent table in select query') + return unique_output + + +def parse_select_columns(string): + """Parse a select query and return the columns + + Args: + string(str): Input string to be parsed + + Returns: + result(list of str): List of columns + """ + + if string == '': + return list() + + if string.upper().startswith('WITH'): + suppressor = _with + delimitedList(_db_name + _as + subquery) + string = suppressor.suppress().transformString(string) + + # Supress everything after the first from + suppressor = MatchFirst(_from) + restOfLine + string = suppressor.suppress().transformString(string) + + parser = _select + delimitedList(field_parser).setResultsName('columns') + output = parser.parseString(string).columns.asList() + + # Strip extra whitespace from the string + return [column.strip() for column in output] + + +def parse_column_name(string): + """Parse column name from select query + + Note: + This assumes that every column has a name and is the last word of str + + Args: + string(str): Input string to be parsed + + Returns: + result(str): column name + """ + # Find all words in the string + words = Word(printables.replace('\n\r', '')).searchString(string) + + # Get the last word matched + # TODO: Make it more complicated + name = words.pop().asList().pop() + return name diff --git a/dataduct/database/parsers/tests/__init__.py b/dataduct/database/parsers/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dataduct/database/parsers/tests/test_create_table.py b/dataduct/database/parsers/tests/test_create_table.py new file mode 100644 index 0000000..bb489f2 --- /dev/null +++ b/dataduct/database/parsers/tests/test_create_table.py @@ -0,0 +1,62 @@ +"""Tests for create table parser +""" + +from unittest import TestCase +from nose.tools import eq_ +from nose.tools import raises +from pyparsing import ParseException + +from ..create_table import parse_create_table +from ..create_table import create_exists_clone + + +class TestCreateTableStatement(TestCase): + """Tests for create table + """ + @staticmethod + def test_basic(): + """Basic test for create table + """ + query = ('CREATE TABLE orders (' + 'customer_id INTEGER DISTKEY PRIMARY KEY,' + 'customer_name VARCHAR(200))') + + output = parse_create_table(query) + + eq_(output['full_name'], 'orders') + eq_(output['temporary'], False) + eq_(output['exists_checks'], False) + eq_(len(output['constraints']), 0) + eq_(len(output['columns']), 2) + + 
@staticmethod + def test_exists_clone(): + """Basic test for create table clone with exists condition + """ + query = ('CREATE TABLE orders (' + 'customer_id INTEGER DISTKEY PRIMARY KEY,' + 'customer_name VARCHAR(200))') + + exists_clone = create_exists_clone(query) + output = parse_create_table(exists_clone) + eq_(output['full_name'], 'orders') + eq_(output['temporary'], False) + eq_(output['exists_checks'], True) + + @staticmethod + @raises(ParseException) + def test_bad_input(): + """Feeding malformed input into create table + """ + query = 'CREATE TABLE orders (' +\ + 'customer_id INTEGER DISTKEY PRIMARY KEY' + parse_create_table(query) + + @staticmethod + @raises(ParseException) + def test_bad_input_in_columns(): + """Feeding malformed input into create table + """ + query = 'CREATE TABLE orders (' +\ + 'customer_id NEGATIVE DISTKEY PRIMARY KEY)' + parse_create_table(query) diff --git a/dataduct/database/parsers/tests/test_create_view.py b/dataduct/database/parsers/tests/test_create_view.py new file mode 100644 index 0000000..cf519b4 --- /dev/null +++ b/dataduct/database/parsers/tests/test_create_view.py @@ -0,0 +1,26 @@ +"""Tests for create view parser +""" + +from unittest import TestCase +from nose.tools import eq_ +from ..create_view import parse_create_view + + +class TestCreateViewStatement(TestCase): + """Tests for create view + """ + @staticmethod + def test_basic(): + """Basic test for create view + """ + query = 'CREATE VIEW orders AS (' + \ + 'SELECT x, y, z from xyz_table)' + + full_name = 'orders' + replace = False + + output = parse_create_view(query) + + eq_(output['view_name'], full_name) + eq_(output['replace'], replace) + eq_(output['select_statement'], 'SELECT x, y, z from xyz_table') diff --git a/dataduct/database/parsers/tests/test_select_query.py b/dataduct/database/parsers/tests/test_select_query.py new file mode 100644 index 0000000..ebf2171 --- /dev/null +++ b/dataduct/database/parsers/tests/test_select_query.py @@ -0,0 +1,70 @@ +"""Tests for select statement parser +""" + +from unittest import TestCase +from nose.tools import eq_ +from nose.tools import raises +from pyparsing import ParseException + +from ..select_query import parse_select_dependencies +from ..select_query import parse_select_columns +from ..select_query import parse_column_name + + +class TestCreateTableStatement(TestCase): + """Tests for create table + """ + @staticmethod + def test_basic(): + """Basic test for select statement + """ + query = ('SELECT x, y, z AS t FROM abc JOIN pqr USING(y) WHERE x=1') + + dependencies = parse_select_dependencies(query) + eq_(dependencies, ['abc', 'pqr']) + + columns = parse_select_columns(query) + eq_(columns, ['x', 'y', 'z AS t']) + + column_name = parse_column_name(columns[0]) + eq_(column_name, 'x') + + column_name = parse_column_name(columns[2]) + eq_(column_name, 't') + + @staticmethod + @raises(ParseException) + def test_bad_input(): + """Feeding malformed input into create table + """ + query = 'SELECT x, y, z' + parse_select_dependencies(query) + + @staticmethod + def test_columns(): + """Basic test for select statement + """ + query = ('SELECT x' + ',CASE WHEN y=10 THEN 5 ELSE z' + ',CASE WHEN x THEN COUNT(MIN(x,y)) ELSE MIN(x) END' + ',COUNT(1) AS c ' + 'FROM abc') + + result = [ + 'x', + 'CASE WHEN y=10 THEN 5 ELSE z', + 'CASE WHEN x THEN COUNT(MIN(x,y)) ELSE MIN(x) END', + 'COUNT(1) AS c', + ] + + columns = parse_select_columns(query) + eq_(columns, result) + + @staticmethod + def test_with_query(): + """Basic test for select statement with 
the with query + """ + query = ('WITH data AS (SELECT x, y FROM xy) SELECT x,y FROM data') + + columns = parse_select_columns(query) + eq_(columns, ['x', 'y']) diff --git a/dataduct/database/parsers/tests/test_transfrom.py b/dataduct/database/parsers/tests/test_transfrom.py new file mode 100644 index 0000000..28a0094 --- /dev/null +++ b/dataduct/database/parsers/tests/test_transfrom.py @@ -0,0 +1,220 @@ +"""Tests for the transformation steps +""" +from unittest import TestCase +from nose.tools import eq_ + +from ..transform import split_statements +from ..transform import remove_comments +from ..transform import remove_empty_statements +from ..transform import remove_transactional +from ..transform import remove_newlines + + +class TestRemoveEmptyStatements(TestCase): + """Tests for remove_empty_statements function + """ + @staticmethod + def test_basic(): + """Basic test for single location of seperator + """ + data = 'a;;;' + result = 'a;' + + eq_(remove_empty_statements(data), result) + + @staticmethod + def test_multiple_statements_single_duplication(): + """Test for multiple locations of seperator with single duplication + """ + data = 'a; b;; c;' + result = 'a; b; c;' + + eq_(remove_empty_statements(data), result) + + @staticmethod + def test_multiple_statements_multiple_duplication(): + """Test for multiple locations of seperator with multiple duplication + """ + data = 'a;;; b;; c;;;' + result = 'a; b; c;' + + eq_(remove_empty_statements(data), result) + + @staticmethod + def test_start_empty(): + """Test for removing an empty statement at start + """ + data = '; a; ; ;;; b; c;;;' + result = ' a; b; c;' + + eq_(remove_empty_statements(data), result) + + +class TestRemoveNewLines(TestCase): + """Tests for remove_empty_statements function + """ + @staticmethod + def test_basic(): + """Basic test for single location of seperator + """ + data = 'a\n \n;' + result = 'a ;' + + eq_(remove_newlines(data), result) + + @staticmethod + def test_advanced(): + """Basic test for single location of seperator + """ + data = 'a,\nb,\nc\n\rfrom \r\n xyz' + result = 'a, b, c from xyz' + + eq_(remove_newlines(data), result) + + @staticmethod + def test_quoted_newlines(): + """Basic test for single location of seperator + """ + data = "a,\nb,\nc\n\rfrom \r\n xyz where b='a\nc'" + result = "a, b, c from xyz where b='a\nc'" + + eq_(remove_newlines(data), result) + + +class TestRemoveComments(TestCase): + """Tests for remove_comments function + """ + @staticmethod + def test_multiline_comment(): + """Basic test for removing multiline comments + """ + data = 'a; /* This is \n \n a multiline comment */ b;' + result = 'a; b;' + + eq_(remove_comments(data), result) + + @staticmethod + def test_singleline_comment_basic(): + """Basic test for removing singleline comments + """ + data = 'a; b; --Comment' + result = 'a; b; ' + + eq_(remove_comments(data), result) + + @staticmethod + def test_singleline_comment_advanced(): + """Advanced test for removing singleline comments + """ + data = '-- Comment \n a; b;' + result = '\n a; b;' + + eq_(remove_comments(data), result) + + @staticmethod + def test_singleline_multiline_comment(): + """Advanced test for removing singleline comments + """ + data = 'a; /* This is \n \n a multiline comment */ b;-- Comment ' + result = 'a; b;' + + eq_(remove_comments(data), result) + + +class TestRemoveTransactional(TestCase): + """Tests for remove_transactional function + """ + @staticmethod + def test_remove_none(): + """Basic test for removing nothing + """ + data = 'a; b;' + result 
= 'a; b;' + + eq_(remove_transactional(data), result) + + @staticmethod + def test_remove_begin(): + """Basic test for removing begin + """ + data = 'begin; a; b;' + result = ' a; b;' + + eq_(remove_empty_statements(remove_transactional(data)), result) + + @staticmethod + def test_remove_commit(): + """Basic test for removing commit + """ + data = 'a; b; commit;' + result = 'a; b;' + + eq_(remove_empty_statements(remove_transactional(data)), result) + + @staticmethod + def test_remove_begin_commit(): + """Basic test for removing begin & commit + """ + data = 'begin; a; b; commit;' + result = ' a; b;' + + eq_(remove_empty_statements(remove_transactional(data)), result) + + @staticmethod + def test_just_begin_commit(): + """Basic test for removing begin & commit + """ + data = 'begin; commit;' + result = '' + + eq_(remove_empty_statements(remove_transactional(data)), result) + +class TestSplitOmitQuoted(TestCase): + """Tests for split_statements function + """ + @staticmethod + def test_basic(): + """Basic test for spliting a string based on the seperator + """ + data = 'a; b \n t; c; d ; ' + result = ['a', 'b \n t', 'c', 'd'] + + eq_(split_statements(data), result) + + @staticmethod + def test_newline_sql(): + """Split SQL statement with newlines + """ + data = 'a; b \n e; c; \n \n d ; ' + result = ['a', 'b \n e', 'c', 'd'] + + eq_(split_statements(data), result) + + @staticmethod + def test_paran_sql(): + """Split SQL statement with paranthesis + """ + data = 'a; b (x\n,y,z) d; c; \n \n d ; ' + result = ['a', 'b (x\n,y,z) d', 'c', 'd'] + + eq_(split_statements(data), result) + + @staticmethod + def test_multiple_sql(): + """Advanced test with removing comments and empty sql statements + """ + data = """a; /* This is \n + a multiline comment */ b;; \n ; -- Comment \n c; d; """ + + result = ['a', 'b', 'c', 'd'] + + eq_(split_statements(remove_empty_statements( + remove_comments(data))), result) + + @staticmethod + def test_split_escaped_sql(): + """Split SQL statement with strings that have semicolon + """ + data = "a; xyz='0;0'; c;" + result = ['a', "xyz='0;0'", 'c'] + eq_(split_statements(data), result) diff --git a/dataduct/database/parsers/transform.py b/dataduct/database/parsers/transform.py new file mode 100644 index 0000000..c675bb4 --- /dev/null +++ b/dataduct/database/parsers/transform.py @@ -0,0 +1,111 @@ +"""Module containing basic transform functions on strings +""" + +import re + +from pyparsing import CaselessKeyword +from pyparsing import CharsNotIn +from pyparsing import Literal +from pyparsing import nestedExpr +from pyparsing import OneOrMore +from pyparsing import replaceWith +from pyparsing import WordStart +from pyparsing import ZeroOrMore + + +def remove_empty_statements(string, seperator=';'): + """Remove empty statements from the string + + Args: + string(str): String to be processed + seperator(str): Seperater to be checked for duplicates + + Returns: + result(str): String with empty statements trimmed + """ + if string == '': + return string + + empty_statement = seperator + OneOrMore(seperator) + empty_statement.setParseAction(replaceWith(seperator)) + string = empty_statement.transformString(string) + + return string.lstrip(seperator) + + +def remove_comments(string): + """Remove comments from the statements + + Args: + string(str): String to be processed + + Returns: + result(str): String with comments trimmed + """ + + if string == '': + return string + + # Remove multiline comments + multiline_comment = nestedExpr('/*', '*/').suppress() + string = 
multiline_comment.transformString(string) + + # Remove single line comments + singleline_comment = Literal('--') + ZeroOrMore(CharsNotIn('\n')) + string = singleline_comment.suppress().transformString(string) + + return string + + +def remove_transactional(string): + """Remove begin or commit from the statement + + Args: + string(str): String to be processed + + Returns: + result(str): String with begin and commit trimmed + """ + transaction = WordStart() + ( + CaselessKeyword('BEGIN')| CaselessKeyword('COMMIT')) + + return transaction.suppress().transformString(string) + + +def split_statements(string, seperator=';', quote_char="'"): + """Seperate the string based on the seperator + + Args: + string(str): String to be processed + seperator(str): Seperater to split the statements + + Returns: + result(list of str): Statements split based on the seperator + """ + if string == '': + return [] + + # We can not directly split a sql statement as we want to skip on + # semicolons inside a string in the sql query. + stack = 0 + result = [] + statement = '' + for char in string: + if char == seperator and not stack % 2: + result.append(statement.strip()) + statement = '' + else: + statement += char + if char == quote_char: + stack += 1 + if statement.strip(): + result.append(statement.strip()) + return result + + +def remove_newlines(string): + """Remove new lines from a string unless in single quotes + """ + # In general the aim is to avoid regex as they are hard to maintain + regex = r"(?:[^\s\n\r']|'(?:\\.|[^'])*')+" + return ' '.join(re.findall(regex, string)) diff --git a/dataduct/database/parsers/utils.py b/dataduct/database/parsers/utils.py new file mode 100644 index 0000000..67d2044 --- /dev/null +++ b/dataduct/database/parsers/utils.py @@ -0,0 +1,77 @@ +"""SQL parser utils and constants +""" + +from pyparsing import alphanums +from pyparsing import CaselessKeyword +from pyparsing import Combine +from pyparsing import Forward +from pyparsing import nums +from pyparsing import OneOrMore +from pyparsing import Word + + +# Data types +_smallint = CaselessKeyword('SMALLINT') +_int = CaselessKeyword('INT') +_integer = CaselessKeyword('INTEGER') +_bigint = CaselessKeyword('BIGINT') +_decimal = Combine(CaselessKeyword('DECIMAL') + '(' + Word(nums + ',') + ')') +_real = (CaselessKeyword('REAL') | CaselessKeyword('FLOAT')) +_double = CaselessKeyword('DOUBLE') +_boolean = CaselessKeyword('BOOLEAN') +_char = CaselessKeyword('CHAR') +_varchar = Combine(CaselessKeyword('VARCHAR') + '(' + Word(alphanums) + ')') +_date = CaselessKeyword('DATE') +_timestamp = CaselessKeyword('TIMESTAMP') + +# Create SQL keywords +_create = CaselessKeyword('CREATE') +_table = CaselessKeyword('TABLE') +_view = CaselessKeyword('VIEW') +_temp = CaselessKeyword('TEMP') +_temporary = CaselessKeyword('TEMPORARY') +_if_not_exists = CaselessKeyword('IF NOT EXISTS') +_or_replace = CaselessKeyword('OR REPLACE') +_primary_key = CaselessKeyword('PRIMARY KEY') +_foreign_key = CaselessKeyword('FOREIGN KEY') +_references = CaselessKeyword('REFERENCES') +_unique = CaselessKeyword('UNIQUE') +_null = CaselessKeyword('NULL') +_not_null = CaselessKeyword('NOT NULL') +_distkey = CaselessKeyword('DISTKEY') +_diststyle = CaselessKeyword('DISTSTYLE') +_sortkey = CaselessKeyword('SORTKEY') +_encode = CaselessKeyword('ENCODE') +_all = CaselessKeyword('ALL') +_even = CaselessKeyword('EVEN') +_key = CaselessKeyword('KEY') + +# Select SQL Keywords +_select = CaselessKeyword('SELECT') +_with = CaselessKeyword('WITH') +_from = CaselessKeyword('FROM') 
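The string transforms in transform.py above are meant to be chained; a minimal sketch of that chain follows (illustrative only, with made-up input SQL), in the same order that sanitize_sql() applies further down in this diff.

```python
# Illustrative sketch (not part of the patch): chaining the string
# transforms from dataduct/database/parsers/transform.py in the same
# order that sanitize_sql() uses later in this diff.
from dataduct.database.parsers.transform import (
    remove_comments,
    remove_empty_statements,
    remove_newlines,
    remove_transactional,
    split_statements,
)

raw = ("BEGIN; -- load one row\n"
       "INSERT INTO a VALUES (1);;\n"
       "/* quoted semicolons survive */\n"
       "UPDATE a SET note = 'x;y' WHERE id = 1;\n"
       "COMMIT;")

clean = remove_comments(raw)            # strip -- and /* */ comments
clean = remove_transactional(clean)     # drop BEGIN / COMMIT keywords
clean = remove_newlines(clean)          # collapse newlines outside quotes
clean = remove_empty_statements(clean)  # collapse ';;' runs, trim leading ';'

# Split on ';' while ignoring semicolons inside single-quoted strings
statements = split_statements(clean)
# -> ["INSERT INTO a VALUES (1)", "UPDATE a SET note = 'x;y' WHERE id = 1"]
```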
+_as = CaselessKeyword('AS') +_join = CaselessKeyword('JOIN') + +# Parsers +_db_name = Word(alphanums+"_-.") +pk_check = (_primary_key | _unique) + +# Column types +column_types = _smallint | _integer | _bigint | _decimal | _real | _double +column_types |= _boolean | _char | _varchar | _date | _timestamp | _int + +# Define a field parser for create table fields or select query fields +field_parser = Forward() +subquery = Forward() + +# List of characters allowed in the query statements +special_character = "_-. *`> 1: + raise ValueError('SQL Statement can not contain more than 1 query') + elif len(raw_statements) == 1: + return raw_statements[0] + else: + return '' + + def _validate_parser(self, func): + """Check if a parser satisfies the sql statement + """ + try: + func(self.sql()) + except Exception: + return False + return True + + def creates_table(self): + """SQL statement creates a table. + """ + return self._validate_parser(parse_create_table) + + def creates_view(self): + """SQL statement creates a view. + """ + return self._validate_parser(parse_create_view) diff --git a/dataduct/database/sql/tests/__init__.py b/dataduct/database/sql/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dataduct/database/sql/tests/test_sql_script.py b/dataduct/database/sql/tests/test_sql_script.py new file mode 100644 index 0000000..dd6121f --- /dev/null +++ b/dataduct/database/sql/tests/test_sql_script.py @@ -0,0 +1,170 @@ +"""Tests for the SqlScript class +""" +from unittest import TestCase +from nose.tools import eq_ +from nose.tools import assert_not_equal + +from ..sql_statement import SqlStatement +from ..sql_script import SqlScript + + +class TestSqlScript(TestCase): + """Tests for sql Script function + """ + @staticmethod + def test_basic(): + """Basic test for Script declaration + """ + query = 'SELECT \n 1;' + result = 'SELECT 1;' + + eq_(SqlScript(query).sql(), result) + + @staticmethod + def test_sanatization(): + """Sanatization of comments + """ + query = 'SELECT 1 -- test connect \n;' + result = 'SELECT 1;' + + eq_(SqlScript(query).sql(), result) + + @staticmethod + def test_multiple_queries(): + """Raise error if multiple queries are passed + """ + query = 'SELECT 1; SELECT 2;' + result = 'SELECT 1;\nSELECT 2;' + eq_(SqlScript(query).sql(), result) + + @staticmethod + def test_empty_declaration(): + """Empty if no sql query is passed + """ + eq_(SqlScript().sql(), ';') + + @staticmethod + def test_length(): + """Length of sql script + """ + query = 'SELECT 1; SELECT 2;' + result = 2 + eq_(len(SqlScript(query)), result) + + @staticmethod + def test_append_statement(): + """Appending a statement to sql script + """ + script = SqlScript() + script.append(SqlStatement('SELECT 1')) + eq_(script.sql(), 'SELECT 1;') + + @staticmethod + def test_append_script(): + """Appending a script to sql script + """ + script = SqlScript('SELECT 1;') + script_new = SqlScript('SELECT 2;') + script.append(script_new) + eq_(script.sql(), 'SELECT 1;\nSELECT 2;') + + @staticmethod + def test_append_string(): + """Appending a string to sql script + """ + script = SqlScript('SELECT 1;') + script.append('SELECT 2;') + eq_(script.sql(), 'SELECT 1;\nSELECT 2;') + + @staticmethod + def test_copy(): + """Copy a sql script + """ + script = SqlScript('SELECT 1;') + script_new = script.copy() + eq_(script.sql(), script_new.sql()) + + # Check if it was a copy or the same object + assert_not_equal(id(script), id(script_new)) + + @staticmethod + def test_wrap_transaction(): + """Wrap the sql 
script in a transaction + """ + script = SqlScript('SELECT 1;').wrap_transaction() + result = 'BEGIN;\nSELECT 1;\nCOMMIT;' + eq_(script.sql(), result) + + @staticmethod + def test_paranthesis(): + """Test sql with paranthesis is sanatized correctly + """ + script = SqlScript('CREATE TABLE test_begin (session_id INTEGER);') + result = 'CREATE TABLE test_begin (session_id INTEGER);' + eq_(script.sql(), result) + + @staticmethod + def test_creates_table_success(): + """Correctly recognizes that the sql creates a table + """ + script = SqlScript('CREATE TABLE test_begin (session_id INTEGER);') + eq_(script.creates_table(), True) + + @staticmethod + def test_creates_table_failure(): + """Correctly recognizes that the sql does not create a table + """ + script = SqlScript('SELECT * FROM test_begin;') + eq_(script.creates_table(), False) + + @staticmethod + def test_creates_table_failure_not_first_statement(): + """Correctly recognizes that the first sql statement does not create + a table + """ + script = SqlScript(""" + SELECT * FROM test_begin; + CREATE TABLE test_begin (session_id INTEGER); + """) + eq_(script.creates_table(), False) + + @staticmethod + def test_creates_table_failure_bad_syntax(): + """Correctly recognizes bad syntax when creating a view + """ + script = SqlScript( + 'CREATE TABLE test_begin AS (SELECT * FROM test_table);') + eq_(script.creates_table(), False) + + @staticmethod + def test_creates_view_success(): + """Correctly recognizes that the sql creates a view + """ + script = SqlScript( + 'CREATE VIEW test_begin AS (SELECT * FROM test_table);') + eq_(script.creates_view(), True) + + @staticmethod + def test_creates_view_failure(): + """Correctly recognizes that the sql does not create a view + """ + script = SqlScript('SELECT * FROM test_begin;') + eq_(script.creates_table(), False) + + @staticmethod + def test_creates_view_failure_not_first_statement(): + """Correctly recognizes that the first sql statment does not create + a view + """ + script = SqlScript(""" + SELECT * FROM test_begin; + CREATE VIEW test_begin AS (SELECT * FROM test_table); + """) + eq_(script.creates_view(), False) + + @staticmethod + def test_creates_view_failure_bad_syntax(): + """Correctly recognizes bad syntax when creating a view + """ + script = SqlScript('CREATE VIEW test_begin (session_id INTEGER);') + eq_(script.creates_view(), False) diff --git a/dataduct/database/sql/tests/test_sql_statement.py b/dataduct/database/sql/tests/test_sql_statement.py new file mode 100644 index 0000000..9067bb8 --- /dev/null +++ b/dataduct/database/sql/tests/test_sql_statement.py @@ -0,0 +1,43 @@ +"""Tests for the SqlStatement class +""" +from unittest import TestCase +from nose.tools import eq_ +from nose.tools import raises + +from ..sql_statement import SqlStatement + + +class TestSqlStatement(TestCase): + """Tests for sql statement function + """ + @staticmethod + def test_basic(): + """Basic test for statement declaration + """ + query = 'select \n 1;' + result = 'select 1' + + eq_(SqlStatement(query).sql(), result) + + @staticmethod + def test_sanatization(): + """Sanatization of comments + """ + query = 'select 1 -- test connect \n;' + result = 'select 1' + + eq_(SqlStatement(query).sql(), result) + + @staticmethod + @raises(ValueError) + def test_error(): + """Raise error if multiple queries are passed + """ + query = 'select 1; select 2;' + SqlStatement(query) + + @staticmethod + def test_empty_declaration(): + """Empty if no sql query is passed + """ + eq_(SqlStatement().sql(), '') diff --git 
a/dataduct/database/sql/tests/test_sql_utils.py b/dataduct/database/sql/tests/test_sql_utils.py new file mode 100644 index 0000000..7222feb --- /dev/null +++ b/dataduct/database/sql/tests/test_sql_utils.py @@ -0,0 +1,32 @@ +"""Tests the utils functions +""" +from unittest import TestCase +from nose.tools import eq_ + +from ..utils import balanced_parenthesis +from ..utils import sanitize_sql + + +class TestSqlUtils(TestCase): + """Tests for sql utils function + """ + @staticmethod + def test_balanced_paranthesis(): + """Test for balanced_parenthesis + """ + eq_(balanced_parenthesis('SELECT 1;'), True) + eq_(balanced_parenthesis('SELECT 1(;'), False) + eq_(balanced_parenthesis('SELECT 1();'), True) + eq_(balanced_parenthesis('SELECT 1(abcd);'), True) + eq_(balanced_parenthesis('SELECT 1(ab[cd);'), True) + eq_(balanced_parenthesis('SELECT 1(ab[cd));'), False) + eq_(balanced_parenthesis('SELECT 1);'), False) + eq_(balanced_parenthesis('SELECT 1(ab)(ab);'), True) + eq_(balanced_parenthesis('SELECT 1(a(ab)b);'), True) + + @staticmethod + def test_sanitize_sql(): + """Test for sanitize_sql + """ + sql = "SELECT 1 if x='x;y'; SELECT 1 ;" + eq_(sanitize_sql(sql), ["SELECT 1 if x='x;y'", 'SELECT 1']) diff --git a/dataduct/database/sql/transaction.py b/dataduct/database/sql/transaction.py new file mode 100644 index 0000000..e0cacc7 --- /dev/null +++ b/dataduct/database/sql/transaction.py @@ -0,0 +1,22 @@ +"""SQL Statements used in transactions +""" + +from .sql_statement import SqlStatement + + +class BeginStatement(SqlStatement): + """Class representing begin sql statement + """ + def __init__(self): + """Constructor for begin class + """ + super(BeginStatement, self).__init__('BEGIN', True) + + +class CommitStatement(SqlStatement): + """Class representing Commit sql statement + """ + def __init__(self): + """Constructor for Commit class + """ + super(CommitStatement, self).__init__('COMMIT', True) diff --git a/dataduct/database/sql/utils.py b/dataduct/database/sql/utils.py new file mode 100644 index 0000000..55af3f1 --- /dev/null +++ b/dataduct/database/sql/utils.py @@ -0,0 +1,42 @@ +""" +Shared utility functions +""" +from ..parsers import remove_comments +from ..parsers import remove_empty_statements +from ..parsers import split_statements +from ..parsers import remove_transactional +from ..parsers import remove_newlines + + +def balanced_parenthesis(statement): + """Check if the SQL statement is balanced + """ + counter = 0 + for character in statement: + if character == '(': + counter += 1 + if character == ')': + counter -= 1 + if counter < 0: + return False + return counter == 0 + + +def sanitize_sql(sql, keep_transaction=False): + """Sanatize the sql string + """ + # remove comments + string = remove_comments(sql) + + # remove transactionals + if not keep_transaction: + string = remove_transactional(string) + + # remove new lines + string = remove_newlines(string) + + # remove empty statements + string = remove_empty_statements(string) + + # split into multiple statements + return split_statements(string) diff --git a/dataduct/database/table.py b/dataduct/database/table.py new file mode 100644 index 0000000..34184ae --- /dev/null +++ b/dataduct/database/table.py @@ -0,0 +1,314 @@ +"""Script containing the table class object +""" +from .parsers import parse_create_table +from .parsers import create_exists_clone +from .sql import SqlScript +from .select_statement import SelectStatement +from .column import Column +from .relation import Relation + + +def comma_seperated(elements): + """Create 
a comma separated string from the iterator + """ + return ','.join(elements) + + +class Table(Relation): + """Class representing tables in the database + """ + def __init__(self, sql): + """Constructor for Table class + """ + super(Table, self).__init__() + + if isinstance(sql, SqlScript): + # Take the first statement and ignore the rest + sql = sql.statements[0] + + parameters = parse_create_table(sql.sql()) + + self.sql_statement = sql + self.parameters = parameters + + self.full_name = parameters.get('full_name') + self.temporary = parameters.get('temporary') + self.exists_check = parameters.get('exists_check', False) + + self.sort_keys = parameters.get('sortkey', list()) + self.dist_keys = parameters.get('distkey', list()) + self.diststyle = parameters.get('diststyle', 'EVEN') + + self._constraints = parameters.get('constraints', list()) + + self._columns = dict() + for column_params in parameters.get('columns', list()): + column_name = column_params['column_name'] + self._columns[column_name] = Column(**column_params) + + self.schema_name, self.table_name = self.initialize_name() + self.update_attributes_from_columns() + self.update_columns_with_constrains() + + def update_attributes_from_columns(self): + """ Update attributes sortkey and distkey based on columns + """ + distkeys = self.dist_keys + sortkeys = self.sort_keys + for column in self._columns.values(): + # Update the table attributes based on columns + if column.is_distkey: + distkeys.append(column.name) + if column.is_sortkey: + sortkeys.append(column.name) + + self.dist_keys = list(set(distkeys)) + self.sort_keys = list(set(sortkeys)) + + def update_columns_with_constrains(self): + """ Update columns with primary and foreign key constraints + """ + for constraint in self._constraints: + for col_name in constraint.get('pk_columns', list()): + self._columns[col_name].primary = True + + def columns(self): + """Unsorted list of columns in the table + """ + return sorted(self._columns.values(), key=lambda x: x.position) + + def column(self, column_name): + """Get the column with the given name + """ + return self._columns.get(column_name, None) + + @property + def primary_keys(self): + """Primary keys of the table + """ + return [c for c in self.columns() if c.primary] + + @property + def primary_key_names(self): + """Primary keys of the table + """ + return [c.name for c in self.columns() if c.primary] + + def foreign_key_references(self): + """Get a list of all foreign key references from the table + """ + result = list() + for column in self.columns(): + if column.fk_table is not None: + result.append(( + [column.name], column.fk_table, [column.fk_reference])) + + for constraint in self._constraints: + if 'fk_table' in constraint: + result.append((constraint.get('fk_columns'), + constraint.get('fk_table'), + constraint.get('fk_reference'))) + return result + + @property + def dependencies(self): + """List of tables which this table references. 
+ """ + return [table_name for _, table_name, _ + in self.foreign_key_references()] + + def temporary_clone_script(self): + """Sql script to create a temporary clone table + + Note: + The temporary table only copies the schema and not any data + """ + + # We don't need to use schema for temp tables + table_name = self.table_name + '_temp' + + # Create a list of column definitions + columns = comma_seperated( + ['%s %s' % (c.column_name, c.column_type) for c in self.columns()]) + + sql = """CREATE TEMPORARY TABLE {table_name} ( + {columns}, + PRIMARY KEY( {primary_keys} ) + )""".format(table_name=table_name, + columns=columns, + primary_keys=comma_seperated(self.primary_key_names)) + + return SqlScript(sql) + + def exists_clone_script(self): + """Sql script to create a exists clone table + """ + return SqlScript(create_exists_clone(self.sql_statement.sql())) + + def drop_script(self): + """Sql script to drop the table + """ + return SqlScript('DROP TABLE IF EXISTS %s CASCADE' % self.full_name) + + def analyze_script(self): + """Sql script to analyze the table + """ + return SqlScript('ANALYZE %s' % self.full_name) + + def rename_script(self, new_name): + """Sql script to rename the table + """ + return SqlScript( + 'ALTER TABLE %s RENAME TO %s' %(self.full_name, new_name)) + + def delete_script(self, where_condition=''): + """Sql script to delete from table based on where condition + """ + return SqlScript('DELETE FROM %s %s' %(self.full_name, where_condition)) + + def foreign_key_reference_script(self, source_columns, reference_name, + reference_columns): + """Sql Script to create a FK reference from table x to y + """ + sql = """ + ALTER TABLE {source_name} + ADD FOREIGN KEY ({source_columns}) + REFERENCES {reference_name} ({reference_columns}) + """.format(source_name=self.full_name, + source_columns=comma_seperated(source_columns), + reference_name=reference_name, + reference_columns=comma_seperated(reference_columns)) + + return SqlScript(sql) + + def select_duplicates_script(self): + """Sql Script to select duplicate primary keys from the table + """ + pk_columns = comma_seperated(self.primary_key_names) + sql = """ + SELECT {pk_columns} + ,COUNT(1) duplicate_count + FROM {table_name} + GROUP BY {pk_columns} + HAVING COUNT(1) > 1 + """.format(table_name=self.full_name, + pk_columns=pk_columns) + + return SqlScript(sql) + + def _source_sql(self, source_relation): + """Get the source sql based on the type of the source specified + """ + if not (isinstance(source_relation, Relation) or + isinstance(source_relation, SelectStatement)): + raise ValueError('Source Relation must be a relation or select') + + if len(self.columns()) < len(source_relation.columns()): + raise ValueError('Source has more columns than destination') + + if isinstance(source_relation, SelectStatement): + source_sql = '(' + source_relation.sql() + ')' + else: + source_sql = source_relation.full_name + + return source_sql + + def insert_script(self, source_relation): + """Sql Script to insert into the table while avoiding PK violations + """ + sql = 'INSERT INTO %s (SELECT * FROM %s)' % ( + self.full_name, self._source_sql(source_relation)) + return SqlScript(sql) + + def delete_matching_rows_script(self, source_relation): + """Sql Script to delete matching rows between table and source + """ + if len(self.primary_keys) == 0: + raise RuntimeError( + 'Cannot delete matching rows from table with no primary keys') + + source_col_names, pk_names = [], [] + for column in self.columns(): + if column.primary: + 
pk_names.append(column.name) + source_col_names.append(column.name) + + where_condition = 'WHERE (%s) IN (SELECT DISTINCT %s FROM %s)' % ( + comma_seperated(pk_names), comma_seperated(source_col_names), + self._source_sql(source_relation)) + + return self.delete_script(where_condition) + + def de_duplication_script(self): + """De-duplicate the table to enforce primary keys + """ + if len(self.primary_keys) == 0: + raise RuntimeError( + 'Cannot de-duplicate table with no primary keys') + + script = self.temporary_clone_script() + column_names = [c.name for c in self.columns()] + + # Create a temporary clone from the script + temp_table = self.__class__(script) + script.append(temp_table.insert_script(self)) + script.append(self.delete_script()) + + # Pick a random value on multiple primary keys + sql = """ + INSERT INTO {table_name} ( + SELECT {column_names} + FROM ( + SELECT *, + COUNT(1) OVER ( + PARTITION BY {pk_names} + ORDER BY 1 ROWS UNBOUNDED PRECEDING) rnk + FROM {temp_table}) + WHERE rnk = 1) + """.format(table_name=self.full_name, + column_names=comma_seperated(column_names), + pk_names=comma_seperated(self.primary_key_names), + temp_table=temp_table.full_name) + + script.append(SqlScript(sql)) + return script + + def upsert_script(self, source_relation, enforce_primary_key=True, + delete_existing=False): + """Sql script to upsert into a table + + The script first copies all the source data into a temporary table. + Then if the enforce_primary_key flag is set we de-duplicate the temp + table. After which if the delete existing flag is set we delete all + the data from the destination table otherwise only the rows that match + the temporary table. After which we copy the temporary table into the + destination table. + """ + script = self.temporary_clone_script() + + # Create a temporary clone from the script + temp_table = self.__class__(script) + script.append(temp_table.insert_script(source_relation)) + if enforce_primary_key: + script.append(temp_table.de_duplication_script()) + + if delete_existing: + script.append(self.delete_script()) + else: + script.append(self.delete_matching_rows_script(temp_table)) + + script.append(self.insert_script(temp_table)) + script.append(temp_table.drop_script()) + return script + + def check_not_exists_script(self): + """Sql script to create statement if the table exists or not + """ + return SqlScript(""" + SELECT NOT EXISTS( + SELECT 1 + FROM information_schema.tables + WHERE table_schema = '%s' + AND table_name = '%s' + ) + """ % (self.schema_name, self.table_name)) diff --git a/dataduct/database/tests/__init__.py b/dataduct/database/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dataduct/database/tests/test_database.py b/dataduct/database/tests/test_database.py new file mode 100644 index 0000000..32160c1 --- /dev/null +++ b/dataduct/database/tests/test_database.py @@ -0,0 +1,240 @@ +"""Tests for Database +""" +import os + +from unittest import TestCase +from testfixtures import TempDirectory +from nose.tools import assert_not_equal +from nose.tools import eq_ +from nose.tools import raises + +from ..database import Database +from ..table import Table +from ..view import View +from ..sql import SqlScript + + +class TestDatabase(TestCase): + """Tests for Database + """ + + @staticmethod + def _create_table(sql): + """Creates a table object from a SQL string + """ + return Table(SqlScript(sql)) + + @staticmethod + def _create_view(sql): + """Creates a view object from a SQL string + """ + return View(SqlScript(sql)) 
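As a rough usage sketch of the Table class introduced above (illustrative only: the table names and schemas are made up, and the import paths simply follow the file layout added in this patch), building two tables and generating the upsert script that loads one from the other:

```python
# Illustrative sketch (not part of the patch): generating an upsert
# script with the Table class from dataduct/database/table.py.
from dataduct.database.sql.sql_script import SqlScript
from dataduct.database.table import Table

target = Table(SqlScript(
    'CREATE TABLE orders (id INTEGER PRIMARY KEY, total INTEGER);'))
staging = Table(SqlScript(
    'CREATE TABLE orders_staging (id INTEGER PRIMARY KEY, total INTEGER);'))

# upsert_script composes: temporary clone -> insert from source ->
# optional de-duplication -> delete matching rows -> insert -> drop clone
script = target.upsert_script(staging, enforce_primary_key=True)
print(script.sql())
```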
+ + def setUp(self): + """Setup test fixtures for the database tests + """ + # A basic table and view + self.basic_table = self._create_table( + 'CREATE TABLE test_table (id INTEGER);') + self.basic_view = self._create_view( + 'CREATE VIEW test_view AS (SELECT * FROM test_table);') + + # Create tables with dependencies between them + self.first_table = self._create_table( + """CREATE TABLE first_table ( + id1 INTEGER, + id2 INTEGER + );""") + self.first_table_dependent = self._create_table( + """CREATE TABLE first_table ( + id1 INTEGER, + id2 INTEGER REFERENCES second_table(id2) + );""") + self.second_table = self._create_table( + """CREATE TABLE second_table ( + id1 INTEGER, + id2 INTEGER + );""") + self.second_table_dependent = self._create_table( + """CREATE TABLE second_table ( + id1 INTEGER REFERENCES first_table(id1), + id2 INTEGER + );""") + + # Create a template database to test script generation + table = self._create_table('CREATE TABLE test_table ( id INTEGER );') + view = self._create_view("""CREATE VIEW test_view AS ( + SELECT id FROM test_table + );""") + self.script_database = Database(relations=[table, view]) + + def test_create(self): + """Tests database initialization + """ + database = Database(relations=[self.basic_table]) + + # Verify that the database is constructed properly + eq_(database.num_tables, 1) + eq_(database.num_views, 0) + assert_not_equal(database.relation(self.basic_table.full_name), None) + + def test_create_from_file(self): + """Tests database initialization from file + """ + with TempDirectory() as d: + # Create files in the temp directory + d.write(self.basic_table.full_name, + self.basic_table.sql_statement.sql()) + d.write(self.basic_view.full_name, + self.basic_view.sql_statement.sql()) + database = Database( + files=[os.path.join(d.path, self.basic_table.full_name), + os.path.join(d.path, self.basic_view.full_name)]) + + # Verify that the database is constructed properly + eq_(database.num_tables, 1) + eq_(database.num_views, 1) + assert_not_equal( + database.relation(self.basic_table.full_name), None) + assert_not_equal( + database.relation(self.basic_view.full_name), None) + + @staticmethod + @raises(ValueError) + def test_create_from_file_no_relation(): + """Database initialization with a file that does not create a + relation + """ + with TempDirectory() as d: + # Create a file in the temp directory + d.write('test.sql', + 'SELECT * FROM test_table;') + Database(files=[os.path.join(d.path, 'test.sql')]) + + @staticmethod + @raises(ValueError) + def test_create_two_arguments(): + """Must create database with less than two arguments + """ + Database(relations=['test_rel'], files=['test_file']) + + @raises(ValueError) + def test_create_duplicate_relations(self): + """Database initialization with duplicate relations + """ + Database(relations=[self.basic_table, self.basic_table]) + + def test_database_copy(self): + """Copying a database is a deepcopy + """ + database = Database(relations=[self.basic_table]) + database_copy = database.copy() + + # Check that the copied database contains the relation + assert_not_equal( + database_copy.relation(self.basic_table.full_name), None) + + # Delete the relation in the copy + database_copy._relations = {} + + # Check that the original database still contains the relation + assert_not_equal( + database.relation(self.basic_table.full_name), None) + + def test_database_has_cycles(self): + """Check if a database has cycles + """ + database = Database(relations=[self.first_table_dependent, + 
self.second_table_dependent]) + eq_(database.has_cycles(), True) + + def test_database_has_no_cycles(self): + """Check if a database has no cycles + """ + database = Database(relations=[self.first_table_dependent, + self.second_table]) + eq_(database.has_cycles(), False) + + def test_database_has_no_cycles_2(self): + """Check if a database has no cycles + """ + database = Database(relations=[self.first_table, + self.second_table_dependent]) + eq_(database.has_cycles(), False) + + def test_database_sorted_relations(self): + """Get the topological sort of the database + """ + database = Database(relations=[self.first_table_dependent, + self.second_table]) + relations = database.sorted_relations() + + # Verify that the relations are sorted correctly + eq_(len(relations), 2) + eq_(relations[0].table_name, self.second_table.full_name) + eq_(relations[1].table_name, self.first_table_dependent.full_name) + + @raises(RuntimeError) + def test_database_sorted_relations_cyclic(self): + """Get the topological sort of the database with cycles + """ + database = Database(relations=[self.first_table_dependent, + self.second_table_dependent]) + database.sorted_relations() + + @staticmethod + def _compare_scripts(actual_script, expected_script): + """Validates a SqlScript chain + """ + assert len(actual_script) == len(expected_script) + for actual, expected in zip(actual_script, expected_script): + eq_(actual.sql(), expected) + + def test_database_create_relations_script(self): + """Creating relations in the database + """ + result = ['CREATE TABLE test_table ( id INTEGER )', + 'CREATE VIEW test_view AS ( SELECT id FROM test_table )'] + self._compare_scripts( + self.script_database.create_relations_script(False), + result) + + def test_database_drop_relations_script(self): + """Dropping relations in the database + """ + result = ['DROP TABLE IF EXISTS test_table CASCADE', + 'DROP VIEW IF EXISTS test_view CASCADE'] + self._compare_scripts( + self.script_database.drop_relations_script(), + result) + + def test_database_recreate_relations_script(self): + """Recreating relations in the database + """ + result = ['DROP TABLE IF EXISTS test_table CASCADE', + 'CREATE TABLE test_table ( id INTEGER )', + 'DROP VIEW IF EXISTS test_view CASCADE', + 'CREATE VIEW test_view AS ( SELECT id FROM test_table )'] + self._compare_scripts( + self.script_database.recreate_relations_script(False), + result) + + def test_database_recreate_table_dependencies(self): + """Recreating table dependencies + """ + view = self._create_view( + """CREATE VIEW view AS ( + SELECT id1 FROM second_table + );""") + database = Database(relations=[self.first_table_dependent, + self.second_table, view]) + + result = ['ALTER TABLE first_table ADD FOREIGN KEY (id2) ' + 'REFERENCES second_table (id2)', + 'DROP VIEW IF EXISTS view CASCADE', + 'CREATE VIEW view AS ( SELECT id1 FROM second_table )'] + self._compare_scripts( + database.recreate_table_dependencies('second_table', False), + result) + eq_(database.recreate_table_dependencies('first_table', False).sql(), + ';') diff --git a/dataduct/database/tests/test_history_table.py b/dataduct/database/tests/test_history_table.py new file mode 100644 index 0000000..7318370 --- /dev/null +++ b/dataduct/database/tests/test_history_table.py @@ -0,0 +1,113 @@ +"""Tests for the HistoryTable class +""" +from unittest import TestCase +from nose.tools import raises +from nose.tools import eq_ + +from ..sql.sql_script import SqlScript +from ..table import Table +from ..history_table import HistoryTable + + +class 
TestHistoryTable(TestCase): + """Tests for the HistoryTable class + """ + + @staticmethod + def _create_history_table(sql): + """Helper function""" + return HistoryTable(SqlScript(sql)) + + @staticmethod + def _create_table(sql): + """Helper function""" + return Table(SqlScript(sql)) + + def setUp(self): + """Setup test fixtures + """ + self.basic_table = self._create_table( + """CREATE TABLE test_table ( + id INTEGER PRIMARY KEY, + value VARCHAR(25) + );""") + self.basic_history_table = self._create_history_table( + """CREATE TABLE test_history_table ( + effective_ts TIMESTAMP, + expiration_ts TIMESTAMP, + id INTEGER, + value VARCHAR(25) + );""") + + @raises(ValueError) + def test_create_history_table_no_timestamps(self): + """Tests if creating a history table with no timestamps + returns an error + """ + self._create_history_table('CREATE TABLE test_table ( id INTEGER );') + + def test_history_script(self): + """Diff comparison of generated SQL script + """ + expected_script = [ + # Create temp table + 'CREATE TEMPORARY TABLE test_table_temp ( ' + 'id INTEGER,' + 'value VARCHAR(25), ' + 'PRIMARY KEY( id ) ' + ')', + # Update temp table with source table's entries + 'INSERT INTO test_table_temp (SELECT * FROM test_table)', + # Expire updated rows + 'UPDATE test_history_table ' + 'SET expiration_ts = SYSDATE - INTERVAL \'0.000001 seconds\' ' + 'FROM test_table ' + 'WHERE test_history_table.id = test_table.id ' + 'AND ( ' + 'test_history_table.value != test_table.value ' + 'OR ( ' + 'test_history_table.value IS NULL ' + 'AND test_table.value IS NOT NULL ' + ') ' + 'OR ( ' + 'test_history_table.value IS NOT NULL ' + 'AND test_table.value IS NULL ' + ') ' + ') ' + 'AND expiration_ts = \'9999-12-31 23:59:59.999999\'', + # Expire deleted rows + 'UPDATE test_history_table ' + 'SET expiration_ts = SYSDATE - INTERVAL \'0.000001 seconds\' ' + 'WHERE ( id ) NOT IN ( ' + 'SELECT id ' + 'FROM test_table ' + ') ' + 'AND expiration_ts = \'9999-12-31 23:59:59.999999\'', + # Delete updated rows from temp table + 'DELETE FROM test_table_temp ' + 'WHERE (id) IN (' + 'SELECT DISTINCT id ' + 'FROM (' + 'SELECT id, value ' + 'FROM test_history_table ' + 'WHERE expiration_ts = \'9999-12-31 23:59:59.999999\'' + ')' + ')', + # Copy temp table rows into source table + 'INSERT INTO test_history_table (' + 'SELECT * FROM (' + 'SELECT SYSDATE, ' + '\'9999-12-31 23:59:59.999999\'::TIMESTAMP, ' + 'id, ' + 'value ' + 'FROM test_table_temp' + ')' + ')', + # Drop temp table + 'DROP TABLE IF EXISTS test_table_temp CASCADE'] + + actual_script = self.basic_history_table.update_history_script( + self.basic_table) + eq_(len(actual_script), len(expected_script)) + for actual, expected in zip(actual_script, expected_script): + eq_(actual.sql(), expected) diff --git a/dataduct/database/view.py b/dataduct/database/view.py new file mode 100644 index 0000000..435fe72 --- /dev/null +++ b/dataduct/database/view.py @@ -0,0 +1,60 @@ +"""Script containing the view class object +""" +from .parsers import parse_create_view +from .sql import SqlScript +from .select_statement import SelectStatement +from .relation import Relation + + +class View(Relation): + """Class representing view in the database + """ + def __init__(self, sql): + """Constructor for view class + """ + super(View, self).__init__() + + if isinstance(sql, SqlScript): + # Take the first statement and ignore the rest + sql = sql.statements[0] + + parameters = parse_create_view(sql.sql()) + + self.sql_statement = sql + self.parameters = parameters + + self.full_name = 
parameters.get('view_name') + self.replace_flag = parameters.get('replace', False) + + self.select_statement = SelectStatement(parameters.get('select_statement')) + + self.schema_name, self.view_name = self.initialize_name() + + @property + def dependencies(self): + """List of relations which this view references. + """ + return self.select_statement.dependencies + + @property + def columns(self): + """List of columns in the view's select statement + """ + return self.select_statement.columns + + def drop_script(self): + """Sql script to drop the view + """ + return SqlScript('DROP VIEW IF EXISTS %s CASCADE' % self.full_name) + + def check_not_exists_script(self): + """Sql script to create statement if the table exists or not + """ + return SqlScript(""" + SELECT NOT EXISTS( + SELECT 1 + FROM information_schema.views + WHERE table_schema = '%s' + AND table_name = '%s' + ) + """ % (self.schema_name, self.view_name)) diff --git a/dataduct/definition_parser.py b/dataduct/definition_parser.py deleted file mode 100644 index 2bb55d3..0000000 --- a/dataduct/definition_parser.py +++ /dev/null @@ -1,72 +0,0 @@ -""" -Script that parses the pipeline definition from the yaml schema -""" -import yaml - -from .etl_pipeline import ETLPipeline -from .utils.exceptions import ETLInputError - - -def read_pipeline_definition(file_path): - """Function reads the yaml pipeline definitions. - - Function reads the yaml pipeline definitions. We also remove the variables - key as that was only used for yaml placeholders. - - Args: - file_path (str): Path to the pipeline definition. - - Returns: - dict: parsed yaml definition as dictionary. - - Raises: - ETLInputError: If `file_path` extention is not yaml - """ - extention = file_path.split('.').pop() - if extention != 'yaml': - raise ETLInputError('Pipeline definition should have a yaml extention') - with open(file_path) as f: - definition = yaml.load(f.read()) - - # remove the variables key from the pipeline definition - # http://stackoverflow.com/questions/4150782/using-yaml-with-variables - definition.pop('variables', None) - definition.pop('description', None) - - return definition - -def create_pipeline(definition): - """Creates the pipeline and add the steps specified to the pipeline - - Args: - definition(dict): YAML definition parsed from the datapipeline - """ - steps = definition.pop('steps') - etl = ETLPipeline(**definition) - - # Add the steps to the pipeline object - etl.create_steps(steps) - print 'Created pipeline. Name: %s' % etl.name - - return etl - -def validate_pipeline(etl, force_overwrite=False): - """Validates the pipeline that was created - - Args: - etl(EtlPipeline): pipeline object that needs to be validated - force_overwrite(bool): delete if a pipeline of same name exists - """ - if force_overwrite: - etl.delete_if_exists() - etl.validate() - print 'Validated pipeline. Id: %s' % etl.pipeline.id - -def activate_pipeline(etl): - """Activate the pipeline that was created - - Args: - etl(EtlPipeline): pipeline object that needs to be activated - """ - etl.activate() - print 'Activated pipeline. 
Id: %s' % etl.pipeline.id diff --git a/dataduct/etl/__init__.py b/dataduct/etl/__init__.py new file mode 100644 index 0000000..a2707bc --- /dev/null +++ b/dataduct/etl/__init__.py @@ -0,0 +1,5 @@ +from .etl_actions import activate_pipeline +from .etl_actions import create_pipeline +from .etl_actions import read_pipeline_definition +from .etl_actions import validate_pipeline +from .etl_actions import visualize_pipeline diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py new file mode 100644 index 0000000..47c6148 --- /dev/null +++ b/dataduct/etl/etl_actions.py @@ -0,0 +1,163 @@ +"""Script that parses the pipeline definition and has action functions +""" +import yaml + +from .etl_pipeline import ETLPipeline +from ..pipeline import Activity +from ..pipeline import MysqlNode +from ..pipeline import RedshiftNode +from ..pipeline import S3Node +from ..config import Config +from ..utils.exceptions import ETLInputError +from ..utils.slack_hook import post_message + +import logging +logger = logging.getLogger(__name__) + + +config = Config() +REGION = config.etl.get('REGION', None) +URL_TEMPLATE = 'https://console.aws.amazon.com/datapipeline/?%s#ExecutionDetailsPlace:pipelineId={ID}&show=latest' # noqa +URL_TEMPLATE %= 'region=%s' % REGION if REGION is not None else '' + +def read_pipeline_definition(file_path): + """Function reads the yaml pipeline definitions. + + Function reads the yaml pipeline definitions. We also remove the variables + key as that was only used for yaml placeholders. + + Args: + file_path (str): Path to the pipeline definition. + + Returns: + dict: parsed yaml definition as dictionary. + + Raises: + ETLInputError: If `file_path` extention is not yaml + """ + extension = file_path.split('.').pop() + if extension != 'yaml': + raise ETLInputError('Pipeline definition should have a yaml extention') + with open(file_path) as f: + definition = yaml.load(f.read()) + + # remove the variables key from the pipeline definition + # http://stackoverflow.com/questions/4150782/using-yaml-with-variables + definition.pop('variables', None) + return definition + + +def create_pipeline(definition): + """Creates the pipeline and add the steps specified to the pipeline + + Args: + definition(dict): YAML definition parsed from the datapipeline + """ + steps = definition.pop('steps') + etl = ETLPipeline(**definition) + + # Add the steps to the pipeline object + etl.create_steps(steps) + logger.info('Created pipeline. Name: %s', etl.name) + return etl + + +def validate_pipeline(etl, force=False): + """Validates the pipeline that was created + + Args: + etl(EtlPipeline): pipeline object that needs to be validated + force(bool): delete if a pipeline of same name exists + """ + if force: + etl.delete_if_exists() + etl.validate() + logger.debug(yaml.dump(etl.pipeline.aws_format)) + logger.info('Validated pipeline. Id: %s', etl.pipeline.id) + + +def activate_pipeline(etl): + """Activate the pipeline that was created + + Args: + etl(EtlPipeline): pipeline object that needs to be activated + """ + etl.activate() + logger.info('Activated pipeline. 
Id: %s', etl.pipeline.id) + logger.info('Monitor pipeline here: %s', + URL_TEMPLATE.format(ID=etl.pipeline.id)) + # Post a slack message if slack is setup + post_message('{user} started pipeline: `%s`' % etl.name) + + +def visualize_pipeline(etl, activities_only=False, filename=None): + """Visualize the pipeline that was created + + Args: + etl(EtlPipeline): pipeline object that needs to be visualized + filename(str): filepath for saving the graph + """ + # Import pygraphviz for plotting the graphs + try: + import pygraphviz + except ImportError: + logger.error('Install pygraphviz for visualizing pipelines') + raise + + if filename is None: + raise ETLInputError('Filename must be provided for visualization') + + logger.info('Creating a visualization of %s', etl.name) + graph = pygraphviz.AGraph(name=etl.name, directed=True, label=etl.name) + pipeline_objects = etl.pipeline_objects() + + # Add nodes for all activities + for p_object in pipeline_objects: + if isinstance(p_object, Activity): + graph.add_node(p_object.id, shape='rect', color='turquoise', + style='filled') + if not activities_only: + if isinstance(p_object, MysqlNode): + graph.add_node(p_object.id, shape='oval', color='beige', + style='filled') + if isinstance(p_object, RedshiftNode): + graph.add_node(p_object.id, shape='oval', color='goldenrod', + style='filled') + if isinstance(p_object, S3Node): + graph.add_node(p_object.id, shape='folder', color='grey', + style='filled') + + # Add data dependencies + if not activities_only: + for p_object in pipeline_objects: + if isinstance(p_object, Activity): + if p_object.input: + if isinstance(p_object.input, list): + for ip in p_object.input: + graph.add_edge(ip.id, p_object.id) + else: + graph.add_edge(p_object.input.id, p_object.id) + if p_object.output: + graph.add_edge(p_object.id, p_object.output.id) + + # Add depends_on dependencies + for p_object in pipeline_objects: + if isinstance(p_object, Activity): + if isinstance(p_object.depends_on, list): + dependencies = p_object.depends_on + elif isinstance(p_object.depends_on, Activity): + dependencies = [p_object.depends_on] + else: + continue + + for dependency in dependencies: + graph.add_edge(dependency.id, p_object.id, color='blue') + + if not activities_only and isinstance(p_object, S3Node): + for dependency in p_object.dependency_nodes: + graph.add_edge(dependency.id, p_object.id, color='grey') + + # Plotting the graph with dot layout + graph.tred() + graph.layout(prog='dot') + graph.draw(filename) diff --git a/dataduct/etl_pipeline.py b/dataduct/etl/etl_pipeline.py similarity index 55% rename from dataduct/etl_pipeline.py rename to dataduct/etl/etl_pipeline.py index 0b7e930..466d47f 100644 --- a/dataduct/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -2,45 +2,46 @@ Class definition for DataPipeline """ from datetime import datetime +from datetime import timedelta +import csv +import os +from StringIO import StringIO import yaml -from .config import Config - -from .pipeline.default_object import DefaultObject -from .pipeline.data_pipeline import DataPipeline -from .pipeline.ec2_resource import Ec2Resource -from .pipeline.emr_resource import EmrResource -from .pipeline.redshift_database import RedshiftDatabase -from .pipeline.s3_node import S3Node -from .pipeline.schedule import Schedule -from .pipeline.sns_alarm import SNSAlarm -from .pipeline.utils import list_pipelines - -from .steps.emr_streaming import EMRStreamingStep -from .steps.extract_local import ExtractLocalStep -from .steps.extract_rds import ExtractRdsStep 
-from .steps.extract_redshift import ExtractRedshiftStep -from .steps.extract_s3 import ExtractS3Step -from .steps.load_redshift import LoadRedshiftStep -from .steps.sql_command import SqlCommandStep -from .steps.transform import TransformStep - -from .s3.s3_file import S3File -from .s3.s3_path import S3Path -from .s3.s3_log_path import S3LogPath - -from .utils.exceptions import ETLInputError +from .utils import process_steps +from ..config import Config -config = Config() -DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] -ETL_BUCKET = config.etl['ETL_BUCKET'] -BOOTSTRAP_STEPS_DEFINITION = config.bootstrap +from ..pipeline import DefaultObject +from ..pipeline import DataPipeline +from ..pipeline import Ec2Resource +from ..pipeline import EmrResource +from ..pipeline import RedshiftDatabase +from ..pipeline import S3Node +from ..pipeline import Schedule +from ..pipeline import SNSAlarm +from ..pipeline.utils import list_pipelines +from ..pipeline.utils import list_formatted_instance_details + +from ..s3 import S3File +from ..s3 import S3Path +from ..s3 import S3LogPath + +from ..utils.exceptions import ETLInputError +from ..utils.helpers import get_s3_base_path +from ..utils import constants as const -EC2_RESOURCE_STR = 'ec2' -EMR_CLUSTER_STR = 'emr' -LOG_STR = 'logs' -DATA_STR = 'data' -SRC_STR = 'src' +import logging +logger = logging.getLogger(__name__) + +config = Config() +S3_ETL_BUCKET = config.etl['S3_ETL_BUCKET'] +MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) +S3_BASE_PATH = config.etl.get('S3_BASE_PATH', const.EMPTY_STR) +SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', const.NONE) +NAME_PREFIX = config.etl.get('NAME_PREFIX', const.EMPTY_STR) +QA_LOG_PATH = config.etl.get('QA_LOG_PATH', const.QA_STR) +DP_INSTANCE_LOG_PATH = config.etl.get('DP_INSTANCE_LOG_PATH', const.NONE) +DP_PIPELINE_LOG_PATH = config.etl.get('DP_PIPELINE_LOG_PATH', const.NONE) class ETLPipeline(object): @@ -50,47 +51,59 @@ class ETLPipeline(object): and has functionality to add steps to the pipeline """ - def __init__(self, name, frequency='one-time', - ec2_resource_terminate_after='6 Hours', - delay=None, emr_cluster_config=None, load_time=None, - max_retries=DEFAULT_MAX_RETRIES): - """Example of docstring on the __init__ method. - - The __init__ method may be documented in either the class level - docstring, or as a docstring on the __init__ method itself. - - Either form is acceptable, but the two should not be mixed. Choose one - convention to document the __init__ method and be consistent with it. - - Note: - Do not include the `self` parameter in the ``Args`` section. + def __init__(self, name, frequency='one-time', ec2_resource_config=None, + time_delta=None, emr_cluster_config=None, load_time=None, + topic_arn=None, max_retries=MAX_RETRIES, + bootstrap=None, description=None): + """Constructor for the pipeline class Args: name (str): Name of the pipeline should be globally unique. frequency (enum): Frequency of the pipeline. Can be - attr2 (list of str): Description of `attr2`. - attr3 (int): Description of `attr3`. 
- + time_delta(timedelta): Duration to change the start time by + emr_cluster_config(dict): Dictionary for emr config + topic_arn(str): sns alert to be used by the pipeline + max_retries(int): number of retries for pipeline activities + bootstrap(list of steps): bootstrap step definitions for resources """ - if load_time: + + if load_time and isinstance(load_time, str): load_hour, load_min = [int(x) for x in load_time.split(':')] + elif load_time and isinstance(load_time, int): + load_hour, load_min = (load_time / 60, load_time % 60) else: load_hour, load_min = [None, None] + if time_delta is None: + time_delta = timedelta(seconds=0) + # Input variables - self._name = name + self._name = name if not NAME_PREFIX else NAME_PREFIX + '_' + name self.frequency = frequency - self.ec2_resource_terminate_after = ec2_resource_terminate_after - self.delay = delay self.load_hour = load_hour self.load_min = load_min + self.time_delta = time_delta + self.description = description self.max_retries = max_retries + self.topic_arn = topic_arn + + if bootstrap is not None: + self.bootstrap_definitions = bootstrap + elif getattr(config, 'bootstrap', None): + self.bootstrap_definitions = config.bootstrap + else: + self.bootstrap_definitions = dict() if emr_cluster_config: self.emr_cluster_config = emr_cluster_config else: self.emr_cluster_config = dict() + if ec2_resource_config: + self.ec2_resource_config = ec2_resource_config + else: + self.ec2_resource_config = dict() + # Pipeline versions self.version_ts = datetime.utcnow() self.version_name = "version_" + \ @@ -99,7 +112,7 @@ def __init__(self, name, frequency='one-time', self.errors = None self._base_objects = dict() - self._intermediate_nodes = dict() + self.intermediate_nodes = dict() self._steps = dict() self._bootstrap_steps = list() @@ -155,18 +168,22 @@ def create_base_objects(self): self.schedule = self.create_pipeline_object( object_class=Schedule, frequency=self.frequency, - delay=self.delay, + time_delta=self.time_delta, load_hour=self.load_hour, load_min=self.load_min, ) - # self.sns = None -> Used for testing - self.sns = self.create_pipeline_object( - object_class=SNSAlarm, - pipeline_name=self.name - ) + if self.topic_arn is None and SNS_TOPIC_ARN_FAILURE is None: + self.sns = None + else: + self.sns = self.create_pipeline_object( + object_class=SNSAlarm, + topic_arn=self.topic_arn, + pipeline_name=self.name, + ) self.default = self.create_pipeline_object( object_class=DefaultObject, sns=self.sns, + pipeline_log_uri=self.s3_log_dir, ) @property @@ -187,6 +204,15 @@ def name(self): """ return self._name + @property + def steps(self): + """Get the steps of the pipeline + + Returns: + result: steps of the pipeline + """ + return self._steps + def _s3_uri(self, data_type): """Get the S3 location for various data associated with the pipeline @@ -196,20 +222,20 @@ def _s3_uri(self, data_type): Returns: s3_path(S3Path): S3 location of directory of the given data type """ - if data_type not in [SRC_STR, LOG_STR, DATA_STR]: + if data_type not in [const.SRC_STR, const.LOG_STR, const.DATA_STR]: raise ETLInputError('Unknown data type found') # Versioning prevents using data from older versions - key = [data_type, self.name, self.version_name] + key = [S3_BASE_PATH, data_type, self.name, self.version_name] - if self.frequency == 'daily' and data_type in [LOG_STR, DATA_STR]: + if self.frequency == 'daily' and data_type == const.DATA_STR: # For repeated loads, include load date - key.append("#{format(@scheduledStartTime, 'YYYYMMdd')}") + 
key.append("#{format(@scheduledStartTime, 'YYYYMMdd-hh-mm-ss')}") - if data_type == LOG_STR: - return S3LogPath(key, bucket=ETL_BUCKET, is_directory=True) + if data_type == const.LOG_STR: + return S3LogPath(key, bucket=S3_ETL_BUCKET, is_directory=True) else: - return S3Path(key, bucket=ETL_BUCKET, is_directory=True) + return S3Path(key, bucket=S3_ETL_BUCKET, is_directory=True) @property def s3_log_dir(self): @@ -218,7 +244,7 @@ def s3_log_dir(self): Returns: s3_dir(S3Directory): Directory where s3 log will be stored. """ - return self._s3_uri(LOG_STR) + return self._s3_uri(const.LOG_STR) @property def s3_data_dir(self): @@ -227,7 +253,7 @@ def s3_data_dir(self): Returns: s3_dir(S3Directory): Directory where s3 data will be stored. """ - return self._s3_uri(DATA_STR) + return self._s3_uri(const.DATA_STR) @property def s3_source_dir(self): @@ -236,7 +262,7 @@ def s3_source_dir(self): Returns: s3_dir(S3Directory): Directory where s3 src will be stored. """ - return self._s3_uri(SRC_STR) + return self._s3_uri(const.SRC_STR) @property def ec2_resource(self): @@ -253,10 +279,9 @@ def ec2_resource(self): object_class=Ec2Resource, s3_log_dir=self.s3_log_dir, schedule=self.schedule, - terminate_after=self.ec2_resource_terminate_after, + **self.ec2_resource_config ) - - self.create_bootstrap_steps(EC2_RESOURCE_STR) + self.create_bootstrap_steps(const.EC2_RESOURCE_STR) return self._ec2_resource @property @@ -273,17 +298,11 @@ def emr_cluster(self): # Process the boostrap input bootstrap = self.emr_cluster_config.get('bootstrap', None) if bootstrap: - if isinstance(bootstrap, dict): - # If bootstrap script is not a path to local file - param_type = bootstrap['type'] - bootstrap = bootstrap['value'] - else: - # Default the type to path of a local file - param_type = 'path' - - if param_type == 'path': - bootstrap = S3File(path=bootstrap) + if 'string' in bootstrap: + bootstrap = bootstrap['string'] + elif 'script' in bootstrap: # Set the S3 Path for the bootstrap script + bootstrap = S3File(path=bootstrap) bootstrap.s3_path = self.s3_source_dir self.emr_cluster_config['bootstrap'] = bootstrap @@ -294,7 +313,7 @@ def emr_cluster(self): **self.emr_cluster_config ) - self.create_bootstrap_steps(EMR_CLUSTER_STR) + self.create_bootstrap_steps(const.EMR_CLUSTER_STR) return self._emr_cluster @property @@ -325,60 +344,6 @@ def step(self, step_id): """ return self._steps.get(step_id, None) - def determine_step_class(self, step_type, step_args): - """Determine step class from input to correct ETL step types - - Args: - step_type(str): string specifing step_type of the objects - step_args(dict): dictionary of step arguments - - Returns: - step_class(ETLStep): Class object for the specific step_type - step_args(dict): dictionary of step arguments - """ - if step_type == 'transform': - step_class = TransformStep - if step_args.get('resource', None) == 'emr-cluster': - step_args['resource'] = self.emr_cluster - - elif step_type == 'extract-s3': - step_class = ExtractS3Step - step_args.pop('resource') - - elif step_type == 'extract-local': - step_class = ExtractLocalStep - step_args.pop('resource') - if self.frequency != 'one-time': - raise ETLInputError( - 'Extract Local can be used for one-time pipelines only') - - elif step_type == 'extract-rds': - step_class = ExtractRdsStep - step_args.pop('input_node', None) - - elif step_type == 'extract-redshift': - step_class = ExtractRedshiftStep - step_args['redshift_database'] = self.redshift_database - step_args.pop('input_node', None) - - elif step_type == 
'sql-command': - step_class = SqlCommandStep - step_args['redshift_database'] = self.redshift_database - step_args.pop('input_node', None) - - elif step_type == 'emr-streaming': - step_class = EMRStreamingStep - step_args['resource'] = self.emr_cluster - - elif step_type == 'load-redshift': - step_class = LoadRedshiftStep - step_args['redshift_database'] = self.redshift_database - - else: - raise ETLInputError('Step type %s not recogonized' % step_type) - - return step_class, step_args - def translate_input_nodes(self, input_node): """Translate names from YAML to input_nodes @@ -411,97 +376,11 @@ def translate_input_nodes(self, input_node): """ output = dict() for key, value in input_node.iteritems(): - if key not in self._intermediate_nodes: + if key not in self.intermediate_nodes: raise ETLInputError('Input reference does not exist') - output[value] = self._intermediate_nodes[key] + output[value] = self.intermediate_nodes[key] return output - def parse_step_args(self, step_type, **kwargs): - """Parse step arguments from input to correct ETL step types - - Args: - step_type(str): string specifing step_type of the objects - **kwargs: Keyword arguments read from YAML - - Returns: - step_class(ETLStep): Class object for the specific type - step_args(dict): dictionary of step arguments - """ - - if not isinstance(step_type, str): - raise ETLInputError('Step type must be a string') - - # Base dictionary for every step - step_args = { - 'resource': None, - 'schedule': self.schedule, - 'max_retries': self.max_retries, - 'required_steps': list() - } - step_args.update(kwargs) - - # Description is optional and should not be passed - step_args.pop('description', None) - - # Add dependencies - depends_on = step_args.pop('depends_on', None) - if depends_on: - for step_id in list(depends_on): - if step_id not in self._steps: - raise ETLInputError('Step depends on non-existent step') - step_args['required_steps'].append(self._steps[step_id]) - - step_class, step_args = self.determine_step_class(step_type, step_args) - - # Set input node and required_steps - input_node = step_args.get('input_node', None) - if input_node: - if isinstance(input_node, dict): - input_node = self.translate_input_nodes(input_node) - elif isinstance(input_node, str): - input_node = self._intermediate_nodes[input_node] - step_args['input_node'] = input_node - - # Add dependencies from steps that create input nodes - if isinstance(input_node, dict): - required_nodes = input_node.values() - else: - required_nodes = [input_node] - - for required_node in required_nodes: - for step in self._steps.values(): - if step not in step_args['required_steps'] and \ - required_node in step.pipeline_objects: - step_args['required_steps'].append(step) - - # Set resource for the step if not specified or removed - if 'resource' in step_args and step_args['resource'] is None: - step_args['resource'] = self.ec2_resource - - # Set the name if name not provided - if 'name' in step_args: - name = step_args.pop('name') - else: - # If the name of the step is not provided, one is assigned as: - # [step_class][index] - name = step_class.__name__ + str(sum( - [1 for a in self._steps.values() if isinstance(a, step_class)] - )) - - # Each step is given it's own directory so that there is no clashing - # of file names. 
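A small sketch of the translate_input_nodes contract shown above, assuming a configured dataduct.cfg so that ETLPipeline can be instantiated; the pipeline name, step id, and alias are hypothetical, and a plain object stands in for the S3Node an earlier step would have registered.

from dataduct.etl.etl_pipeline import ETLPipeline

etl = ETLPipeline('example_pipeline')
# Pretend an earlier step registered its output node under the id 'extract_data'
etl.intermediate_nodes['extract_data'] = object()  # stand-in for an S3Node
# The YAML dict maps the upstream step id to a local alias; unknown ids raise ETLInputError
mapped = etl.translate_input_nodes({'extract_data': 'raw_input'})
assert 'raw_input' in mapped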
- step_args.update({ - 'id': name, - 's3_log_dir': S3LogPath(name, parent_dir=self.s3_log_dir, - is_directory=True), - 's3_data_dir': S3Path(name, parent_dir=self.s3_data_dir, - is_directory=True), - 's3_source_dir': S3Path(name, parent_dir=self.s3_source_dir, - is_directory=True), - }) - - return step_class, step_args - def add_step(self, step, is_bootstrap=False): """Add a step to the pipeline @@ -518,9 +397,9 @@ def add_step(self, step, is_bootstrap=False): # Update intermediate_nodes dict if isinstance(step.output, dict): - self._intermediate_nodes.update(step.output) + self.intermediate_nodes.update(step.output) elif step.output and step.id: - self._intermediate_nodes[step.id] = step.output + self.intermediate_nodes[step.id] = step.output def create_steps(self, steps_params, is_bootstrap=False): """Create pipeline steps and add appropriate dependencies @@ -538,22 +417,27 @@ def create_steps(self, steps_params, is_bootstrap=False): """ input_node = None steps = [] + steps_params = process_steps(steps_params) for step_param in steps_params: # Assume that the preceding step is the input if not specified if isinstance(input_node, S3Node) and \ - 'input_node' not in step_param: + 'input_node' not in step_param and \ + 'input_path' not in step_param: step_param['input_node'] = input_node - step_class, step_args = self.parse_step_args(**step_param) + try: + step_class = step_param.pop('step_class') + step_args = step_class.arguments_processor(self, step_param) + except Exception: + logger.error('Error creating step with params : %s', step_param) + raise try: step = step_class(**step_args) except Exception: - print "Error creating step of class %s, step_param %s." % ( - str(step_class.__name__), - str(step_args) - ) + logger.error('Error creating step of class %s, step_param %s', + str(step_class.__name__), str(step_args)) raise # Add the step to the pipeline @@ -562,16 +446,6 @@ def create_steps(self, steps_params, is_bootstrap=False): steps.append(step) return steps - def allocate_resource(self, resource_type): - """Allocate the resource object based on the resource type specified - """ - if resource_type == EMR_CLUSTER_STR: - return self.emr_cluster - elif resource_type == EC2_RESOURCE_STR: - return self.ec2_resource - else: - raise ETLInputError('Unknown resource type found') - def create_bootstrap_steps(self, resource_type): """Create the boostrap steps for installation on all machines @@ -579,22 +453,8 @@ def create_bootstrap_steps(self, resource_type): resource_type(enum of str): type of resource we're bootstraping can be ec2 / emr """ - step_params = BOOTSTRAP_STEPS_DEFINITION - selected_steps = list() - for step in step_params: - step['name'] += '_' + resource_type # Append type for unique names - - # If resource type is specified and doesn't match we skip - if 'resource_type' in step: - if step['resource_type'] != resource_type: - continue - else: - step.pop('resource_type') - - step['resource'] = self.allocate_resource(resource_type) - selected_steps.append(step) - - steps = self.create_steps(selected_steps, True) + step_params = self.bootstrap_definitions.get(resource_type, list()) + steps = self.create_steps(step_params, True) self._bootstrap_steps.extend(steps) return steps @@ -611,6 +471,58 @@ def pipeline_objects(self): result.extend(step.pipeline_objects) return result + @staticmethod + def log_uploader(uri, filename, string): + """Utility function to upload log files to S3 + """ + dp_dir = S3Path(uri=uri, is_directory=True) + dp_path = S3Path( + key=filename + '.tsv', + 
parent_dir=dp_dir, + ) + dp_file = S3File( + text=string, + s3_path=dp_path, + ) + dp_file.upload_to_s3() + + def log_s3_dp_instance_data(self, pipeline): + """Uploads instance info for dp_instances to S3 + """ + dp_instance_entries = list_formatted_instance_details(pipeline) + if len(dp_instance_entries) > 0: + + output_string = StringIO() + writer = csv.writer(output_string, delimiter='\t') + writer.writerows(dp_instance_entries) + + # S3 Path computation + uri = os.path.join(get_s3_base_path(), QA_LOG_PATH, + DP_INSTANCE_LOG_PATH, + datetime.utcnow().strftime('%Y%m%d')) + + self.log_uploader(uri, pipeline.id, output_string.getvalue()) + output_string.close() + + def log_s3_dp_pipeline_data(self): + """Uploads instance info for dp_pipeline to S3 + """ + output_string = StringIO() + writer = csv.writer(output_string, delimiter='\t') + writer.writerow([ + self.pipeline.id, + self.name, + self.version_ts + ]) + + # S3 Path computation + uri = os.path.join(get_s3_base_path(), QA_LOG_PATH, + DP_PIPELINE_LOG_PATH, + datetime.utcnow().strftime('%Y%m%d')) + + self.log_uploader(uri, self.pipeline.id, output_string.getvalue()) + output_string.close() + def delete_if_exists(self): """Delete the pipelines with the same name as current pipeline """ @@ -619,6 +531,9 @@ def delete_if_exists(self): for p_iter in list_pipelines(): if p_iter['name'] == self.name: pipeline_instance = DataPipeline(pipeline_id=p_iter['id']) + + if DP_INSTANCE_LOG_PATH: + self.log_s3_dp_instance_data(pipeline_instance) pipeline_instance.delete() def s3_files(self): @@ -632,22 +547,44 @@ def s3_files(self): result.extend(pipeline_object.s3_files) return result + def get_tags(self): + """Get all the pipeline tags that are specified in the config + """ + tag_config = config.etl.get('TAGS', None) + if tag_config is None: + return None + + tags = [] + for key, value in tag_config.iteritems(): + if 'string' in value and 'variable' in value: + raise ETLInputError( + 'Tag config can not have both string and variable') + elif 'string' in value: + tags.append({'key': key, 'value': value['string']}) + elif 'variable' in value: + variable = getattr(self, value['variable']) + tags.append({'key': key, 'value': variable}) + return tags + def validate(self): """Validate the given pipeline definition by creating a pipeline Returns: errors(list): list of errors in the pipeline, empty if no errors """ - # Create AwsPipeline and add objects to it - self.pipeline = DataPipeline(self.name) + self.pipeline = DataPipeline(unique_id=self.name, + description=self.description, + tags=self.get_tags()) + for pipeline_object in self.pipeline_objects(): self.pipeline.add_object(pipeline_object) # Check for errors self.errors = self.pipeline.validate_pipeline_definition() if len(self.errors) > 0: - print '\nThere are errors with your pipeline:\n', self.errors + logger.error('There are errors with your pipeline:\n %s', + self.errors) # Update pipeline definition self.pipeline.update_pipeline_definition() @@ -680,5 +617,9 @@ def activate(self): ) pipeline_definition.upload_to_s3() + # Upload pipeline instance metadata to S3 + if DP_PIPELINE_LOG_PATH: + self.log_s3_dp_pipeline_data() + # Activate the pipeline with AWS self.pipeline.activate() diff --git a/dataduct/etl/tests/__init__.py b/dataduct/etl/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dataduct/etl/tests/test_etl_actions.py b/dataduct/etl/tests/test_etl_actions.py new file mode 100644 index 0000000..8b108eb --- /dev/null +++ b/dataduct/etl/tests/test_etl_actions.py @@ 
-0,0 +1,85 @@ +"""Tests for the ETL actions +""" +import os + +import unittest +from testfixtures import TempDirectory +from nose.tools import raises +from nose.tools import eq_ + +from ..etl_actions import read_pipeline_definition +from ..etl_actions import create_pipeline +from ...utils.exceptions import ETLInputError + + +class EtlActionsTests(unittest.TestCase): + """Tests for the ETL actions + """ + + def setUp(self): + """Setup text fixtures + """ + self.load_hour = '01' + self.load_min = '23' + load_time = self.load_hour + ':' + self.load_min + self.test_yaml = '\n'.join([ + 'name: example_load_redshift', + 'frequency: one-time', + 'load_time: ' + load_time, + 'max_retries: 5', + 'description: Example for the load_redshift step', + 'steps:', + '- step_type: extract-local', + ' path: data/test_table1.tsv', + '- step_type: load-redshift', + ' schema: dev', + ' table: test_table', + ]) + # Definition has no description field + self.test_definition = { + 'name': 'example_load_redshift', + 'frequency': 'one-time', + 'description': 'Example for the load_redshift step', + 'load_time': load_time, + 'max_retries': 5, + 'steps': [{ + 'step_type': 'extract-local', + 'path': 'data/test_table1.tsv', + }, { + 'step_type': 'load-redshift', + 'schema': 'dev', + 'table': 'test_table', + }], + } + + @staticmethod + @raises(ETLInputError) + def test_yaml_extension(): + """Test if the yaml extension check works correctly + for read_pipeline_definition + """ + read_pipeline_definition("name.txt") + + def test_read_pipeline_definition(self): + """Test if the pipeline definition is parsed correctly + """ + with TempDirectory() as directory: + directory.write('test_definition.yaml', self.test_yaml) + result = read_pipeline_definition( + os.path.join(directory.path, 'test_definition.yaml')) + eq_(result, self.test_definition) + + def test_create_pipeline(self): + """Test if simple pipeline creation is correct + """ + result = create_pipeline(self.test_definition) + # Check that pipeline properties are accurate + assert result.name.endswith(self.test_definition['name']) + eq_(result.frequency, self.test_definition['frequency']) + eq_(result.load_hour, int(self.load_hour)) + eq_(result.load_min, int(self.load_min)) + eq_(result.max_retries, self.test_definition['max_retries']) + # Check that vital steps are created + steps = result.steps + assert 'ExtractLocalStep0' in steps + assert 'LoadRedshiftStep0' in steps diff --git a/dataduct/etl/tests/test_etl_pipeline.py b/dataduct/etl/tests/test_etl_pipeline.py new file mode 100644 index 0000000..a40c16c --- /dev/null +++ b/dataduct/etl/tests/test_etl_pipeline.py @@ -0,0 +1,61 @@ +"""Tests for the ETL Pipeline object +""" +import unittest +from nose.tools import raises +from nose.tools import eq_ + +from datetime import timedelta +from ..etl_pipeline import ETLPipeline +from ...utils.exceptions import ETLInputError + + +class EtlPipelineTests(unittest.TestCase): + """Tests for the ETL Pipeline object + """ + + def setUp(self): + """Setup text fixtures + """ + self.default_pipeline = ETLPipeline('test_pipeline') + + @staticmethod + def test_construct_etl_pipeline(): + """Test if the constructor for EtlPipeline is correct + """ + result = ETLPipeline( + 'test_pipeline', + frequency='one-time', + ec2_resource_config={'terminate_after':'2 Hours'}, + time_delta=timedelta(seconds=3600), + emr_cluster_config={'cfg1': 'value'}, + load_time='12:34', + topic_arn='sns:topic-arn:test-case', + max_retries=5, + bootstrap={'cfg1': 'value'}, + ) + assert 
result.name.endswith('test_pipeline') + eq_(result.frequency, 'one-time') + eq_(result.ec2_resource_config, {'terminate_after':'2 Hours'}) + eq_(result.load_hour, 12) + eq_(result.load_min, 34) + eq_(result.time_delta, timedelta(seconds=3600)) + eq_(result.max_retries, 5) + eq_(result.topic_arn, 'sns:topic-arn:test-case') + eq_(result.bootstrap_definitions, {'cfg1': 'value'}) + eq_(result.emr_cluster_config, {'cfg1': 'value'}) + + @staticmethod + def test_no_load_time_default_none(): + """Test if the load_hour and load_min get set to None + if load_time is None + """ + result = ETLPipeline('no_load_time_pipeline', load_time=None) + eq_(result.load_hour, None) + eq_(result.load_min, None) + + @raises(ETLInputError) + def test_bad_data_type_throws(self): + """Test that exception is thrown if the data_type parameter for + _s3_uri is bad + """ + self.default_pipeline._s3_uri('TEST_DATA_TYPE') diff --git a/dataduct/etl/utils.py b/dataduct/etl/utils.py new file mode 100644 index 0000000..8cdeefc --- /dev/null +++ b/dataduct/etl/utils.py @@ -0,0 +1,69 @@ +"""Utility functions for processing etl steps +""" +import imp +from ..config import Config +from ..steps import * # noqa +from ..utils.helpers import parse_path +from ..utils.exceptions import ETLInputError + +STEP_CLASSES = { + 'column-check': ColumnCheckStep, + 'count-check': CountCheckStep, + 'create-load-redshift': CreateAndLoadStep, + 'create-update-sql': CreateUpdateSqlStep, + 'emr-step': EMRJobStep, + 'emr-streaming': EMRStreamingStep, + 'extract-local': ExtractLocalStep, + 'extract-rds': ExtractRdsStep, + 'extract-redshift': ExtractRedshiftStep, + 'extract-s3': ExtractS3Step, + 'load-redshift': LoadRedshiftStep, + 'pipeline-dependencies': PipelineDependenciesStep, + 'primary-key-check': PrimaryKeyCheckStep, + 'qa-transform': QATransformStep, + 'reload': ReloadStep, + 'sql-command': SqlCommandStep, + 'transform': TransformStep, + 'upsert': UpsertStep, +} + + +def get_custom_steps(): + """Fetch the custom steps specified in config + """ + config = Config() + custom_steps = dict() + + for step_def in getattr(config, 'custom_steps', list()): + step_type = step_def['step_type'] + path = parse_path(step_def['file_path'], 'CUSTOM_STEPS_PATH') + + # Load source from the file path provided + step_mod = imp.load_source(step_type, path) + + # Get the step class based on class_name provided + step_class = getattr(step_mod, step_def['class_name']) + + # Check if step_class is of type ETLStep + if not issubclass(step_class, ETLStep): + raise ETLInputError('Step type %s is not of type ETLStep' % + step_class.__name__) + + custom_steps[step_type] = step_class + return custom_steps + + +STEP_CONFIG = STEP_CLASSES.copy() +STEP_CONFIG.update(get_custom_steps()) + + +def process_steps(steps_params): + """Format the step parameters by changing step type to step class + """ + steps = [] + for step_param in steps_params: + params = step_param.copy() + step_type = params.pop('step_type') + params['step_class'] = STEP_CONFIG[step_type] + steps.append(params) + return steps diff --git a/dataduct/pipeline/__init__.py b/dataduct/pipeline/__init__.py index e69de29..630a29c 100644 --- a/dataduct/pipeline/__init__.py +++ b/dataduct/pipeline/__init__.py @@ -0,0 +1,18 @@ +from .activity import Activity +from .copy_activity import CopyActivity +from .data_pipeline import DataPipeline +from .default_object import DefaultObject +from .ec2_resource import Ec2Resource +from .emr_resource import EmrResource +from .emr_activity import EmrActivity +from .mysql_node import 
MysqlNode +from .pipeline_object import PipelineObject +from .precondition import Precondition +from .redshift_copy_activity import RedshiftCopyActivity +from .redshift_node import RedshiftNode +from .redshift_database import RedshiftDatabase +from .s3_node import S3Node +from .schedule import Schedule +from .shell_command_activity import ShellCommandActivity +from .sns_alarm import SNSAlarm +from .sql_activity import SqlActivity diff --git a/dataduct/pipeline/copy_activity.py b/dataduct/pipeline/copy_activity.py index b7cb604..4fc865d 100644 --- a/dataduct/pipeline/copy_activity.py +++ b/dataduct/pipeline/copy_activity.py @@ -6,11 +6,12 @@ from .schedule import Schedule from ..config import Config +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] -RETRY_DELAY = config.etl['RETRY_DELAY'] +MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) +RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) class CopyActivity(Activity): @@ -48,7 +49,7 @@ def __init__(self, if depends_on is None: depends_on = [] if max_retries is None: - max_retries = DEFAULT_MAX_RETRIES + max_retries = MAX_RETRIES super(CopyActivity, self).__init__( id=id, diff --git a/dataduct/pipeline/data_pipeline.py b/dataduct/pipeline/data_pipeline.py index 775b575..acc5488 100644 --- a/dataduct/pipeline/data_pipeline.py +++ b/dataduct/pipeline/data_pipeline.py @@ -1,12 +1,12 @@ """ Base class for data pipeline instance """ +import json from collections import defaultdict -from boto.datapipeline.layer1 import DataPipelineConnection - from .pipeline_object import PipelineObject from .utils import list_pipeline_instances +from .utils import get_datapipeline_connection from ..utils.exceptions import ETLInputError @@ -18,7 +18,8 @@ class DataPipeline(object): executing it. """ - def __init__(self, unique_id=None, name=None, pipeline_id=None): + def __init__(self, unique_id=None, name=None, pipeline_id=None, + tags=None, description=None): """Constructor for the datapipeline object Args: @@ -29,7 +30,7 @@ def __init__(self, unique_id=None, name=None, pipeline_id=None): Note: If pipelineId is provided we don't need name or unique_id """ - self.conn = DataPipelineConnection() + self.conn = get_datapipeline_connection() self.objects = [] if pipeline_id: @@ -44,7 +45,8 @@ def __init__(self, unique_id=None, name=None, pipeline_id=None): if not name: name = unique_id - response = self.conn.create_pipeline(name, unique_id) + response = self.custom_create_pipeline( + name, unique_id, description, tags) self.pipeline_id = response['pipelineId'] @property @@ -114,3 +116,21 @@ def instance_details(self): for instance in instances: result[instance['@scheduledStartTime']].append(instance) return result + + def custom_create_pipeline(self, name, unique_id, description=None, + tags=None): + """ + Creates a new empty pipeline. 
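As context for the tags parameter documented below, a hedged sketch of how a TAGS block in the config (the keys shown are hypothetical) is turned into this list by ETLPipeline.get_tags() before it reaches DataPipeline.

# ~/.dataduct/dataduct.cfg
#   etl:
#     TAGS:
#       env:
#         string: production   # literal tag value
#       pipeline:
#         variable: name       # resolved from the ETLPipeline attribute of that name
#
# get_tags() would then produce something like:
tags = [{'key': 'env', 'value': 'production'},
        {'key': 'pipeline', 'value': 'example_load_redshift'}]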
Adds tags feature not yet available in + boto + + Args: + tags(list(dict)): a list of tags in the format + [{key: foo, value: bar}] + """ + params = {'name': name, 'uniqueId': unique_id, } + if description is not None: + params['description'] = description + if tags is not None: + params['tags'] = tags + return self.conn.make_request(action='CreatePipeline', + body=json.dumps(params)) diff --git a/dataduct/pipeline/default_object.py b/dataduct/pipeline/default_object.py index 98bbbec..53d10e8 100644 --- a/dataduct/pipeline/default_object.py +++ b/dataduct/pipeline/default_object.py @@ -4,22 +4,20 @@ from .pipeline_object import PipelineObject from ..config import Config +from ..utils import constants as const config = Config() -DEFAULT_ROLE = config.ec2['DEFAULT_ROLE'] -DEFAULT_RESOURCE_ROLE = config.ec2['DEFAULT_RESOURCE_ROLE'] +ROLE = config.etl['ROLE'] +RESOURCE_ROLE = config.etl['RESOURCE_ROLE'] +MAX_ACTIVE_INSTANCES = config.etl.get('MAX_ACTIVE_INSTANCES', const.ONE) class DefaultObject(PipelineObject): """Default object added to all pipelines """ - def __init__(self, - id='Default', - sns=None, - scheduleType='cron', - failureAndRerunMode='CASCADE', - **kwargs): + def __init__(self, id, pipeline_log_uri, sns=None, scheduleType='cron', + failureAndRerunMode='CASCADE', **kwargs): """Constructor for the DefaultObject class Args: @@ -34,10 +32,12 @@ def __init__(self, """ super(DefaultObject, self).__init__( - id=id, + id='Default', # This should always have the default id scheduleType=scheduleType, failureAndRerunMode=failureAndRerunMode, - role=DEFAULT_ROLE, - resourceRole=DEFAULT_RESOURCE_ROLE, + role=ROLE, + resourceRole=RESOURCE_ROLE, + maxActiveInstances=MAX_ACTIVE_INSTANCES, + pipelineLogUri=pipeline_log_uri, onFail=sns ) diff --git a/dataduct/pipeline/ec2_resource.py b/dataduct/pipeline/ec2_resource.py index 0231002..b648ef8 100644 --- a/dataduct/pipeline/ec2_resource.py +++ b/dataduct/pipeline/ec2_resource.py @@ -4,18 +4,20 @@ from ..config import Config from .pipeline_object import PipelineObject -from ..s3.s3_log_path import S3LogPath +from ..s3 import S3LogPath from .schedule import Schedule +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_RESOURCE_ROLE = config.ec2['DEFAULT_RESOURCE_ROLE'] -DEFAULT_EC2_INSTANCE_TYPE = config.ec2['DEFAULT_EC2_INSTANCE_TYPE'] -ETL_AMI = config.ec2['ETL_AMI'] -KEY_PAIR = config.ec2['KEY_PAIR'] -DEFAULT_ROLE = config.ec2['DEFAULT_ROLE'] -SECURITY_GROUP = config.ec2['SECURITY_GROUP'] -RETRY_DELAY = config.etl['RETRY_DELAY'] +ROLE = config.etl['ROLE'] +RESOURCE_ROLE = config.etl['RESOURCE_ROLE'] + +INSTANCE_TYPE = config.ec2.get('INSTANCE_TYPE', const.M1_LARGE) +ETL_AMI = config.ec2.get('ETL_AMI', const.NONE) +SECURITY_GROUP = config.ec2.get('SECURITY_GROUP', const.NONE) +KEY_PAIR = config.etl.get('KEY_PAIR', const.NONE) +RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) class Ec2Resource(PipelineObject): @@ -27,7 +29,7 @@ def __init__(self, s3_log_dir=None, schedule=None, terminate_after='6 Hours', - instance_type=DEFAULT_EC2_INSTANCE_TYPE, + instance_type=INSTANCE_TYPE, ami=ETL_AMI, security_group=SECURITY_GROUP, **kwargs): @@ -60,8 +62,8 @@ def __init__(self, schedule=schedule, imageId=ami, instanceType=instance_type, - role=DEFAULT_ROLE, - resourceRole=DEFAULT_RESOURCE_ROLE, + role=ROLE, + resourceRole=RESOURCE_ROLE, keyPair=KEY_PAIR, retryDelay=RETRY_DELAY, securityGroups=security_group diff --git a/dataduct/pipeline/emr_activity.py 
b/dataduct/pipeline/emr_activity.py index 79c1257..4351d60 100644 --- a/dataduct/pipeline/emr_activity.py +++ b/dataduct/pipeline/emr_activity.py @@ -5,10 +5,11 @@ from .activity import Activity from ..config import Config from .schedule import Schedule +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] +MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) class EmrActivity(Activity): @@ -19,6 +20,7 @@ def __init__(self, id, resource, schedule, + input_node, emr_step_string, output_node=None, additional_files=None, @@ -46,7 +48,7 @@ def __init__(self, if depends_on is None: depends_on = [] if max_retries is None: - max_retries = DEFAULT_MAX_RETRIES + max_retries = MAX_RETRIES super(EmrActivity, self).__init__( id=id, @@ -57,6 +59,7 @@ def __init__(self, schedule=schedule, step=emr_step_string, output=output_node, + input=input_node, ) self.add_additional_files(additional_files) diff --git a/dataduct/pipeline/emr_resource.py b/dataduct/pipeline/emr_resource.py index 9dc1b9d..bf5118d 100644 --- a/dataduct/pipeline/emr_resource.py +++ b/dataduct/pipeline/emr_resource.py @@ -4,22 +4,26 @@ from ..config import Config from .pipeline_object import PipelineObject -from ..s3.s3_log_path import S3LogPath +from ..s3 import S3LogPath from .schedule import Schedule +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_NUM_CORE_INSTANCES = config.emr['DEFAULT_NUM_CORE_INSTANCES'] -DEFAULT_CORE_INSTANCE_TYPE = config.emr['DEFAULT_CORE_INSTANCE_TYPE'] -DEFAULT_TASK_INSTANCE_BID_PRICE = config.emr['DEFAULT_TASK_INSTANCE_BID_PRICE'] -DEFAULT_TASK_INSTANCE_TYPE = config.emr['DEFAULT_TASK_INSTANCE_TYPE'] -DEFAULT_MASTER_INSTANCE_TYPE = config.emr['DEFAULT_MASTER_INSTANCE_TYPE'] -DEFAULT_CLUSTER_TIMEOUT = config.emr['DEFAULT_CLUSTER_TIMEOUT'] -DEFAULT_HADOOP_VERSION = config.emr['DEFAULT_HADOOP_VERSION'] -DEFAULT_HIVE_VERSION = config.emr['DEFAULT_HIVE_VERSION'] -DEFAULT_PIG_VERSION = config.emr['DEFAULT_PIG_VERSION'] -DEFAULT_CLUSTER_AMI = config.emr['DEFAULT_CLUSTER_AMI'] -KEY_PAIR = config.ec2['KEY_PAIR'] +NUM_CORE_INSTANCES = config.emr.get('NUM_CORE_INSTANCES', const.NONE) +CORE_INSTANCE_TYPE = config.emr.get('CORE_INSTANCE_TYPE', const.M1_LARGE) +TASK_INSTANCE_BID_PRICE = config.emr.get('TASK_INSTANCE_BID_PRICE', const.NONE) +TASK_INSTANCE_TYPE = config.emr.get('TASK_INSTANCE_TYPE', const.M1_LARGE) +MASTER_INSTANCE_TYPE = config.emr.get('MASTER_INSTANCE_TYPE', const.M1_LARGE) +CLUSTER_TIMEOUT = config.emr.get('CLUSTER_TIMEOUT', const.DEFAULT_TIMEOUT) +HADOOP_VERSION = config.emr.get('HADOOP_VERSION', const.NONE) +HIVE_VERSION = config.emr.get('HIVE_VERSION', const.NONE) +PIG_VERSION = config.emr.get('PIG_VERSION', const.NONE) +CLUSTER_AMI = config.emr.get('CLUSTER_AMI', '2.4.7') +KEY_PAIR = config.etl.get('KEY_PAIR', const.NONE) + +import logging +logger = logging.getLogger(__name__) class EmrResource(PipelineObject): @@ -30,18 +34,18 @@ def __init__(self, id, s3_log_dir, schedule, - num_instances=DEFAULT_NUM_CORE_INSTANCES, - instance_size=DEFAULT_CORE_INSTANCE_TYPE, + num_instances=NUM_CORE_INSTANCES, + instance_size=CORE_INSTANCE_TYPE, bootstrap=None, num_task_instances=None, - task_bid_price=DEFAULT_TASK_INSTANCE_BID_PRICE, - task_instance_type=DEFAULT_TASK_INSTANCE_TYPE, - master_instance_size=DEFAULT_MASTER_INSTANCE_TYPE, - terminate_after=DEFAULT_CLUSTER_TIMEOUT, - hadoop_version=DEFAULT_HADOOP_VERSION, - 
install_hive=DEFAULT_HIVE_VERSION, - install_pig=DEFAULT_PIG_VERSION, - ami_version=DEFAULT_CLUSTER_AMI): + task_bid_price=TASK_INSTANCE_BID_PRICE, + task_instance_type=TASK_INSTANCE_TYPE, + master_instance_size=MASTER_INSTANCE_TYPE, + terminate_after=CLUSTER_TIMEOUT, + hadoop_version=HADOOP_VERSION, + install_hive=HIVE_VERSION, + install_pig=PIG_VERSION, + ami_version=CLUSTER_AMI): """Constructor for the Ec2Resource class Args: @@ -99,7 +103,8 @@ def __init__(self, if self['taskInstanceType'].find('xlarge') >= 0: if num_task_instances > 10: - print 'Using taskInstanceType: (%s)' % \ - self['taskInstanceType'] - print 'WARNING!!! Are you sure you need', \ - '%s task instances?' % num_task_instances + logger.info('Using taskInstanceType: (%s)', + self['taskInstanceType']) + logger.warning( + 'Are you sure you need %s task instances?', + num_task_instances) diff --git a/dataduct/pipeline/pipeline_object.py b/dataduct/pipeline/pipeline_object.py index ed09a8e..42af751 100644 --- a/dataduct/pipeline/pipeline_object.py +++ b/dataduct/pipeline/pipeline_object.py @@ -3,9 +3,9 @@ """ from collections import defaultdict -from ..s3.s3_path import S3Path -from ..s3.s3_file import S3File -from ..s3.s3_directory import S3Directory +from ..s3 import S3Path +from ..s3 import S3File +from ..s3 import S3Directory from ..utils.exceptions import ETLInputError @@ -100,15 +100,12 @@ def __setitem__(self, key, value): key(str): Key of the item to be fetched value: Value of the item to be fetched """ - # Do not add none values - if value is None: - return - # Store value as a list if there is only one if not isinstance(value, list): value = [value] - self.fields[key].extend(value) + # Do not add none values + self.fields[key].extend([x for x in value if x is not None]) if key == 'dependsOn': self.fields[key] = list(set(self.fields[key])) @@ -118,6 +115,9 @@ def add_additional_files(self, new_files): Args: new_files(S3File): list of new S3 files for the activity """ + if new_files is None: + return + for new_file in new_files: if not isinstance(new_file, S3File): raise ETLInputError('File must be an S3 File object') diff --git a/dataduct/pipeline/redshift_copy_activity.py b/dataduct/pipeline/redshift_copy_activity.py index 0c91a57..0346735 100644 --- a/dataduct/pipeline/redshift_copy_activity.py +++ b/dataduct/pipeline/redshift_copy_activity.py @@ -6,11 +6,12 @@ from ..config import Config from .redshift_node import RedshiftNode from .schedule import Schedule +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] -RETRY_DELAY = config.etl['RETRY_DELAY'] +MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) +RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) class RedshiftCopyActivity(Activity): @@ -49,7 +50,7 @@ def __init__(self, if depends_on is None: depends_on = [] if max_retries is None: - max_retries = DEFAULT_MAX_RETRIES + max_retries = MAX_RETRIES kwargs = { 'id': id, diff --git a/dataduct/pipeline/redshift_database.py b/dataduct/pipeline/redshift_database.py index 4302a33..dfd3649 100644 --- a/dataduct/pipeline/redshift_database.py +++ b/dataduct/pipeline/redshift_database.py @@ -4,12 +4,17 @@ from ..config import Config from .pipeline_object import PipelineObject +from ..utils.exceptions import ETLConfigError config = Config() -REDSHIFT_DATABASE_NAME = config.redshift['REDSHIFT_DATABASE_NAME'] -REDSHIFT_CLUSTER_ID = config.redshift['REDSHIFT_CLUSTER_ID'] -REDSHIFT_USERNAME = 
config.redshift['REDSHIFT_USERNAME'] -REDSHIFT_PASSWORD = config.redshift['REDSHIFT_PASSWORD'] + +if not hasattr(config, 'redshift'): + raise ETLConfigError('Redshift credentials missing from config') + +DATABASE_NAME = config.redshift['DATABASE_NAME'] +CLUSTER_ID = config.redshift['CLUSTER_ID'] +USERNAME = config.redshift['USERNAME'] +PASSWORD = config.redshift['PASSWORD'] class RedshiftDatabase(PipelineObject): @@ -18,10 +23,10 @@ class RedshiftDatabase(PipelineObject): def __init__(self, id, - database_name=REDSHIFT_DATABASE_NAME, - cluster_id=REDSHIFT_CLUSTER_ID, - username=REDSHIFT_USERNAME, - password=REDSHIFT_PASSWORD): + database_name=DATABASE_NAME, + cluster_id=CLUSTER_ID, + username=USERNAME, + password=PASSWORD): """Constructor for the RedshiftDatabase class Args: diff --git a/dataduct/pipeline/s3_node.py b/dataduct/pipeline/s3_node.py index f9c8413..cbcad98 100644 --- a/dataduct/pipeline/s3_node.py +++ b/dataduct/pipeline/s3_node.py @@ -7,13 +7,14 @@ from .precondition import Precondition from .schedule import Schedule -from ..s3.s3_path import S3Path -from ..s3.s3_file import S3File -from ..s3.s3_directory import S3Directory +from ..s3 import S3Path +from ..s3 import S3File +from ..s3 import S3Directory +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -RETRY_DELAY = config.etl['RETRY_DELAY'] +RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) class S3Node(PipelineObject): @@ -23,7 +24,7 @@ class S3Node(PipelineObject): def __init__(self, id, schedule, - s3_path, + s3_object, precondition=None, format=None, **kwargs): @@ -32,7 +33,7 @@ def __init__(self, Args: id(str): id of the object schedule(Schedule): pipeline schedule - s3_path(S3Path / S3File / S3Directory): s3 location + s3_object(S3Path / S3File / S3Directory): s3 location precondition(Precondition): precondition to the data node **kwargs(optional): Keyword arguments directly passed to base class """ @@ -46,19 +47,23 @@ def __init__(self, raise ETLInputError( 'Input precondition must be of the type Precondition') - if not(isinstance(s3_path, S3Path) or - isinstance(s3_path, S3File) or - isinstance(s3_path, S3Directory)): + if not(isinstance(s3_object, S3Path) or + isinstance(s3_object, S3File) or + isinstance(s3_object, S3Directory)): raise ETLInputError('Mismatched type for S3 path') additional_args = {} - if isinstance(s3_path, S3Path) and s3_path.is_directory: - additional_args['directoryPath'] = s3_path + if (isinstance(s3_object, S3Path) and s3_object.is_directory) or \ + (isinstance(s3_object, S3Directory)): + additional_args['directoryPath'] = s3_object else: - additional_args['filePath'] = s3_path + additional_args['filePath'] = s3_object - # Save the s3_path variable - self._s3_path = s3_path + # Save the s3_object variable + self._s3_object = s3_object + + # Save the dependent nodes from the S3 Node + self._dependency_nodes = list() super(S3Node, self).__init__( id=id, @@ -72,12 +77,23 @@ def __init__(self, def path(self): - """Get the s3_path associated with the S3 data node + """Get the s3_object associated with the S3 data node Returns: - s3_path(S3Path): The s3 path of the node can a directory or file + s3_object(S3Path): The s3 path of the node can a directory or file """ - if isinstance(self._s3_path, S3File): - return self._s3_path.s3_path + if isinstance(self._s3_object, S3File): + return self._s3_object.s3_path else: - return self._s3_path + return self._s3_object + + @property + def dependency_nodes(self): + """Fetch the dependent 
nodes for the S3 node + """ + return self._dependency_nodes + + def add_dependency_node(self, input_node): + """Add nodes to the list of dependencies among S3 Nodes + """ + self._dependency_nodes.append(input_node) diff --git a/dataduct/pipeline/schedule.py b/dataduct/pipeline/schedule.py index ff2f029..6fdd0c7 100644 --- a/dataduct/pipeline/schedule.py +++ b/dataduct/pipeline/schedule.py @@ -3,16 +3,19 @@ """ from datetime import datetime from datetime import timedelta +from pytimeparse import parse from ..config import Config from .pipeline_object import PipelineObject +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -DAILY_LOAD_TIME = config.etl['DAILY_LOAD_TIME'] +DAILY_LOAD_TIME = config.etl.get('DAILY_LOAD_TIME', const.ONE) FEQUENCY_PERIOD_CONVERTION = { + 'weekly': ('1 week', None), 'daily': ('1 day', None), 'hourly': ('1 hour', None), 'one-time': ('15 minutes', 1), @@ -26,7 +29,7 @@ class Schedule(PipelineObject): def __init__(self, id, frequency='one-time', - delay=None, + time_delta=None, load_hour=None, load_minutes=None, **kwargs): @@ -36,7 +39,7 @@ def __init__(self, id(str): id of the Schedule object frequency(enum): rate at which pipeline should be run \ can be daily, hourly and one-time - delay(timedelta): Additional offset provided to the schedule + time_delta(timedelta): Additional offset provided to the schedule load_hour(int): Hour at which the pipeline should start load_minutes(int): Minutes at which the pipeline should be run **kwargs(optional): Keyword arguments directly passed to base class @@ -50,10 +53,12 @@ def __init__(self, if load_hour is None: load_hour = DAILY_LOAD_TIME - if delay is None: - delay = timedelta(0) - elif not isinstance(delay, timedelta): - raise ETLInputError('Delay must be an instance of timedelta') + if time_delta is None: + time_delta = timedelta(seconds=0) + elif isinstance(time_delta, int): + time_delta = timedelta(days=time_delta) + elif not isinstance(time_delta, timedelta): + raise ETLInputError('time_delta must be an instance of timedelta or int') if frequency in FEQUENCY_PERIOD_CONVERTION: period, occurrences = FEQUENCY_PERIOD_CONVERTION[frequency] @@ -67,9 +72,12 @@ def __init__(self, start_time = start_time.replace(hour=load_hour) if current_time.hour < load_hour: - delay += timedelta(days=-1) + if frequency == 'one-time': + time_delta -= timedelta(days=1) + else: + time_delta -= timedelta(seconds=parse(period)) - start_time += delay + start_time += time_delta super(Schedule, self).__init__( id=id, diff --git a/dataduct/pipeline/shell_command_activity.py b/dataduct/pipeline/shell_command_activity.py index d22bdfa..69f311c 100644 --- a/dataduct/pipeline/shell_command_activity.py +++ b/dataduct/pipeline/shell_command_activity.py @@ -5,12 +5,12 @@ from .activity import Activity from ..config import Config from .schedule import Schedule - +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] -RETRY_DELAY = config.etl['RETRY_DELAY'] +MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) +RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) class ShellCommandActivity(Activity): @@ -57,7 +57,7 @@ def __init__(self, if depends_on is None: depends_on = [] if max_retries is None: - max_retries = DEFAULT_MAX_RETRIES + max_retries = MAX_RETRIES super(ShellCommandActivity, self).__init__( id=id, @@ -76,5 +76,4 @@ def __init__(self, ) # Add the additional s3 files - if 
additional_s3_files is not None: - self.add_additional_files(additional_s3_files) + self.add_additional_files(additional_s3_files) diff --git a/dataduct/pipeline/sns_alarm.py b/dataduct/pipeline/sns_alarm.py index e0b1cfa..b9f30cd 100644 --- a/dataduct/pipeline/sns_alarm.py +++ b/dataduct/pipeline/sns_alarm.py @@ -4,10 +4,11 @@ from ..config import Config from .pipeline_object import PipelineObject +from ..utils import constants as const config = Config() -DATA_PIPELINE_TOPIC_ARN = config.etl['DATA_PIPELINE_TOPIC_ARN'] -DEFAULT_ROLE = config.ec2['DEFAULT_ROLE'] +SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', const.NONE) +ROLE = config.etl['ROLE'] class SNSAlarm(PipelineObject): @@ -18,6 +19,7 @@ def __init__(self, id, pipeline_name=None, failure_message=None, + topic_arn=None, **kwargs): """Constructor for the SNSAlarm class @@ -40,11 +42,14 @@ def __init__(self, 'Error Stack Trace: #{node.errorStackTrace}' ]) + if topic_arn is None: + topic_arn = SNS_TOPIC_ARN_FAILURE + super(SNSAlarm, self).__init__( id=id, type='SnsAlarm', - topicArn=DATA_PIPELINE_TOPIC_ARN, - role=DEFAULT_ROLE, + topicArn=topic_arn, + role=ROLE, subject='Data Pipeline Failure', message=failure_message, ) diff --git a/dataduct/pipeline/sql_activity.py b/dataduct/pipeline/sql_activity.py index 7d6d315..2eb9767 100644 --- a/dataduct/pipeline/sql_activity.py +++ b/dataduct/pipeline/sql_activity.py @@ -5,13 +5,13 @@ from .activity import Activity from ..config import Config from .schedule import Schedule -from ..s3.s3_file import S3File - +from ..s3 import S3File +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] -RETRY_DELAY = config.etl['RETRY_DELAY'] +MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) +RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) class SqlActivity(Activity): @@ -54,7 +54,7 @@ def __init__(self, if depends_on is None: depends_on = [] if max_retries is None: - max_retries = DEFAULT_MAX_RETRIES + max_retries = MAX_RETRIES super(SqlActivity, self).__init__( id=id, diff --git a/dataduct/pipeline/utils.py b/dataduct/pipeline/utils.py index c5d8db4..1817119 100644 --- a/dataduct/pipeline/utils.py +++ b/dataduct/pipeline/utils.py @@ -1,8 +1,20 @@ """ Shared utility functions """ +from boto.datapipeline import regions from boto.datapipeline.layer1 import DataPipelineConnection from time import sleep +import dateutil.parser + +from dataduct.config import Config + +config = Config() +REGION = config.etl.get('REGION', None) + +DP_ACTUAL_END_TIME = '@actualEndTime' +DP_ATTEMPT_COUNT_KEY = '@attemptCount' +DP_INSTANCE_ID_KEY = 'id' +DP_INSTANCE_STATUS_KEY = '@status' def _update_sleep_time(last_time=None): @@ -35,16 +47,16 @@ def get_response_from_boto(fn, *args, **kwargs): Args: func(function): Function to call - *args(optional): arguments - **kwargs(optional): keyword arguments + args(optional): arguments + kwargs(optional): keyword arguments Returns: response(json): request response. 
Input: func(function): Function to call - *args(optional): arguments - **kwargs(optional): keyword arguments + args(optional): arguments + kwargs(optional): keyword arguments """ response = None @@ -102,7 +114,7 @@ def list_pipeline_instances(pipeline_id, conn=None, increment=25): instances(list): list of pipeline instances """ if conn is None: - conn = DataPipelineConnection() + conn = get_datapipeline_connection() # Get all instances instance_ids = sorted(get_list_from_boto(conn.query_objects, @@ -136,6 +148,18 @@ def list_pipeline_instances(pipeline_id, conn=None, increment=25): return instances + +def get_datapipeline_connection(): + """Get boto connection of AWS data pipeline + + Returns: + DataPipelineConnection: boto connection + """ + region = next((x for x in regions() if x.name == str(REGION).lower()), None) + conn = DataPipelineConnection(region=region) + return conn + + def list_pipelines(conn=None): """Fetch a list of all pipelines with boto @@ -146,9 +170,41 @@ def list_pipelines(conn=None): pipelines(list): list of pipelines fetched with boto """ if conn is None: - conn = DataPipelineConnection() + conn = get_datapipeline_connection() return get_list_from_boto( conn.list_pipelines, 'pipelineIdList', ) + + +def date_string(date): + """Normalizes a date string to YYYY-mm-dd HH:MM:SS + """ + if date is None: + return 'NULL' + return str(dateutil.parser.parse(date)) + + +def list_formatted_instance_details(pipeline): + """List of instance rows formatted to match + """ + etl_runs = pipeline.instance_details() + entries = [] + for etl_run_dt in sorted(etl_runs.keys()): + + # Look through instances + for instance in sorted( + etl_runs[etl_run_dt], + key=lambda x: x.get(DP_ACTUAL_END_TIME, None)): + entries.append( + [ + instance[DP_INSTANCE_ID_KEY], + pipeline.id, + date_string(etl_run_dt), + date_string(instance.get(DP_ACTUAL_END_TIME)), + instance[DP_INSTANCE_STATUS_KEY], + instance.get(DP_ATTEMPT_COUNT_KEY, 'NULL'), + ] + ) + return entries diff --git a/dataduct/qa/__init__.py b/dataduct/qa/__init__.py new file mode 100644 index 0000000..c660cf5 --- /dev/null +++ b/dataduct/qa/__init__.py @@ -0,0 +1,4 @@ +from .check import Check +from .count_check import CountCheck +from .column_check import ColumnCheck +from .primary_key_check import PrimaryKeyCheck diff --git a/dataduct/qa/check.py b/dataduct/qa/check.py new file mode 100644 index 0000000..04fbb84 --- /dev/null +++ b/dataduct/qa/check.py @@ -0,0 +1,160 @@ +"""Base class for QA steps that provides template function for publishing +""" +from boto.sns import SNSConnection +from datetime import datetime +import os + +from .utils import render_output +from ..config import Config +from ..database import SelectStatement +from ..s3 import S3Path +from ..s3 import S3File +from ..utils import constants as const +from ..utils.helpers import exactly_one +from ..utils.helpers import get_s3_base_path + +QA_TEST_ROW_LENGTH = 8 + + +class Check(object): + """Base class for QA steps that provides template function for publishing + """ + def __init__(self, name, tolerance=0, sns_topic_arn=None): + """Constructor for Check class + + Args: + name(str): Name of the QA test + tolerance(float): Error tolerance levels for the ETL + sns_topic_arn(str): sns topic arn for QA test + """ + self.name = name + self.tolerance = tolerance + if sns_topic_arn is None: + config = Config() + sns_topic_arn = config.etl.get('SNS_TOPIC_ARN_WARNING', None) + self.sns_topic_arn = sns_topic_arn + self.alert_func = self.get_sns_alert_function() + + def 
get_sns_alert_function(self): + """Get a lamdda function for SNS alert publishing + """ + if self.sns_topic_arn is None: + return None + return lambda message, subject: \ + SNSConnection().publish(self.sns_topic_arn, message, subject) + + @property + def success(self): + """True if error rate is below the tolerance levels + """ + return self.error_rate is not None and \ + self.error_rate <= self.tolerance + + @property + def summary(self): + """Summary information about this test. This text must not + contain any PII or otherwise sensitive data that cannot + be published via email. + """ + return render_output( + [ + 'Test Name: %s' % self.name, + 'Success: %s' % self.success + ] + ) + + @property + def results(self): + """The results of this test. This may contain PII, as it + should only be sent to S3 or Redshift. The default results are empty. + Subclasses may override this. + """ + # The default can just be the summary text as risk isn't increasing + return self.summary + + @property + def error_rate(self): + """The error rate for the QA test + """ + return None + + @property + def export_output(self): + """List of data associated with this check for analytics + """ + return [ + self.name, + 1 if self.success else 0, + self.tolerance, + self.error_rate, + ] + + @property + def alert_subject(self): + """String for alerts in case of calling the alert_func + """ + return "Failure on %s" % self.name + + def publish(self, log_to_s3=False, dest_sql=None, table=None, + path_suffix=None): + """Publish the results of the QA test + + Note: + Prints result summary, Exports check data, Call the alert function + if specified + """ + + # Print results for logs + print self.results + print self.summary + + if log_to_s3: + self.log_output_to_s3(dest_sql, table, path_suffix) + + if not self.success: + if self.alert_func is not None: + # Send summary to alert func for further publishing + self.alert_func(self.summary, self.alert_subject) + else: + raise Exception(self.alert_subject) + + def log_output_to_s3(self, destination_sql=None, table=None, + path_suffix=None): + """Log the results of the QA test in S3 + """ + if not exactly_one(destination_sql, table): + raise Exception('Needs table or destination_sql') + + if destination_sql is not None: + full_table_name = SelectStatement(destination_sql).dependencies[0] + else: + full_table_name = table + + config = Config() + + schema_name, table_name = full_table_name.split('.', 1) + pipeline_name, _ = self.name.split(".", 1) + timestamp = datetime.utcnow() + + row = [schema_name, table_name, pipeline_name, timestamp] + row.extend(self.export_output) + if len(row) < QA_TEST_ROW_LENGTH: + row.extend(['NULL'] * (QA_TEST_ROW_LENGTH - len(row))) + + # Convert to TSV + string = '\t'.join(map(str, row)) + + # S3 Path computation + qa_test_dir_uri = os.path.join( + get_s3_base_path(), config.etl.get('QA_LOG_PATH', const.QA_STR), + config.etl.get('DP_QA_TESTS_LOG_PATH', 'dba_table_qa_tests'), + path_suffix if path_suffix else '') + + parent_dir = S3Path(uri=qa_test_dir_uri, is_directory=True) + + key = '_'.join(map(str, row)).replace('.', '_').replace(' ', '_') + key += '.tsv' + + qa_tests_path = S3Path(key=key, parent_dir=parent_dir) + qa_tests_file = S3File(text=string, s3_path=qa_tests_path) + qa_tests_file.upload_to_s3() diff --git a/dataduct/qa/column_check.py b/dataduct/qa/column_check.py new file mode 100644 index 0000000..cc5e172 --- /dev/null +++ b/dataduct/qa/column_check.py @@ -0,0 +1,83 @@ +"""QA test for comparing columns in the source system with the 
Warehouse +""" +from .check import Check +from .utils import render_output + + +class ColumnCheck(Check): + """QA test for comparing columns across the ETL + """ + def __init__(self, source_data, destination_data, **kwargs): + """Constructor for the Count based QA + + Args: + source_data(DataFrame): Sample of source data + destination_data(DataFrame): Sample of destination data + """ + super(ColumnCheck, self).__init__(**kwargs) + self.source_data = source_data + self.destination_data = destination_data + self.errors = [] + self.observed = 0 + + # Identify errors + for key in source_data.index: + if key not in destination_data.index: + continue + + source_value = ColumnCheck.column_value(self.source_data, key) + dest_value = ColumnCheck.column_value(self.destination_data, key) + + if source_value != dest_value: + self.errors.append((key, source_value, dest_value)) + self.observed += 1 + + @property + def error_rate(self): + """The error rate for the column comparisons + + Note: + The error is only calculated for keys that exist in both dataframes. + Thus, we presume that issues dealing with row counts are addressed + in a separate QA test. + """ + if self.observed == 0: + return None + + return float(len(self.errors) * 100) / self.observed + + @staticmethod + def column_value(data, key): + """Fetch the value for a key in the dataframe + + Args: + data(DataFrame): Single column dataframe + key(str): Key to lookup in the dataframe + + Returns: + value(str): Value for the key, unicode values are encoded as utf-8 + """ + value = data.loc[key].values[0] + if isinstance(value, unicode): + return value.encode('utf-8') + return value + + @property + def summary(self): + """Summary of the test results for the SNS message + """ + return render_output( + [ + 'Test Name: %s' % self.name, + 'Success: %s' % self.success, + 'Tolerance: %0.4f%%' % self.tolerance, + 'Error Rate: %0.4f%%' % self.error_rate, + 'Observed: %d' % self.observed, + ] + ) + + @property + def results(self): + """Results from the the comparison of the errors + """ + return render_output([str(a) for a in self.errors]) diff --git a/dataduct/qa/count_check.py b/dataduct/qa/count_check.py new file mode 100644 index 0000000..c0bd116 --- /dev/null +++ b/dataduct/qa/count_check.py @@ -0,0 +1,57 @@ +"""QA test for comparing number of rows in the source system with the Warehouse +""" + +from .check import Check +from .utils import render_output + + +class CountCheck(Check): + """QA test for comparing number of rows across the ETL + """ + def __init__(self, source_count, destination_count, **kwargs): + """Constructor for the Count based QA + + Args: + source_count(int): Count of objects in the source system + destination_count(int): Count of objects in the warehouse + """ + super(CountCheck, self).__init__(**kwargs) + self.source_count = source_count + self.destination_count = destination_count + + @property + def error_rate(self): + """The error rate. + If there are no values in the source or destination, the error is 0. 
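A worked example of the count-check arithmetic implemented below, assuming a configured dataduct.cfg (the check name and counts are made up):

from dataduct.qa import CountCheck

check = CountCheck(source_count=1000, destination_count=990,
                   name='example_pipeline.orders', tolerance=1.5)
print check.error_rate  # abs(1000 - 990) * 100 / 1000.0 -> 1.0
print check.success     # error rate 1.0 <= tolerance 1.5 -> True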
+        """ +        return self.calculate_error_rate(self.source_count, +                                         self.destination_count) + +    @staticmethod +    def calculate_error_rate(source_count, destination_count): +        """Calculate the error rate based on the source and destination counts +        """ +        if source_count > 0: +            error_difference = float(source_count - destination_count) +            return abs(error_difference * 100) / source_count +        elif destination_count == 0: +            return 0 +        else: +            return None + +    @property +    def summary(self): +        """Summary of the test results for the SNS message +        """ +        return render_output( +            [ +                'Test Name: %s' % self.name, +                'Success: %s' % self.success, +                'Tolerance: %0.4f%%' % self.tolerance, +                'Error Rate: %0.4f%%' % self.error_rate, +                'Source Count: %d' % self.source_count, +                'Destination Count: %d' % self.destination_count, +            ] +        ) diff --git a/dataduct/qa/primary_key_check.py b/dataduct/qa/primary_key_check.py new file mode 100644 index 0000000..86cf061 --- /dev/null +++ b/dataduct/qa/primary_key_check.py @@ -0,0 +1,37 @@ +"""QA test for checking duplicate primary keys inside redshift +""" + +from .check import Check +from .utils import render_output + + +class PrimaryKeyCheck(Check): +    """QA test for checking duplicate primary keys inside redshift +    """ +    def __init__(self, duplicate_count=0, **kwargs): +        """Constructor for Primary Key Check + +        Args: +            duplicate_count(int): Number of duplicates +        """ +        super(PrimaryKeyCheck, self).__init__(**kwargs) +        self.duplicate_count = duplicate_count + +    @property +    def error_rate(self): +        """The error rate for the QA test +        """ +        return self.duplicate_count + +    @property +    def summary(self): +        """Summary of the test results for the SNS message +        """ +        return render_output( +            [ +                'Test Name: %s' % self.name, +                'Success: %s' % self.success, +                'Tolerance: %d' % self.tolerance, +                'Error Rate: %d' % self.error_rate, +            ] +        ) diff --git a/dataduct/qa/utils.py b/dataduct/qa/utils.py new file mode 100644 index 0000000..c910bbb --- /dev/null +++ b/dataduct/qa/utils.py @@ -0,0 +1,10 @@ +""" +Shared utility functions +""" + +def render_output(data): +    """Return the formatted output for the list +    """ +    output = ['[Dataduct]: '] +    output.extend(data) +    return '\n'.join(output) diff --git a/dataduct/s3/__init__.py b/dataduct/s3/__init__.py index e69de29..0b2e46e 100644 --- a/dataduct/s3/__init__.py +++ b/dataduct/s3/__init__.py @@ -0,0 +1,4 @@ +from .s3_file import S3File +from .s3_path import S3Path +from .s3_directory import S3Directory +from .s3_log_path import S3LogPath diff --git a/dataduct/s3/s3_directory.py b/dataduct/s3/s3_directory.py index 10eee88..1eb1f50 100644 --- a/dataduct/s3/s3_directory.py +++ b/dataduct/s3/s3_directory.py @@ -3,6 +3,8 @@ """ from .s3_path import S3Path from .utils import upload_dir_to_s3 +from ..utils.helpers import parse_path +from ..utils.exceptions import ETLInputError class S3Directory(object): @@ -21,7 +23,7 @@ def __init__(self, path=None, s3_path=None): s3_path (S3Path, optional): s3_path of the file """ -        self.path = path +        self.path = parse_path(path) self._s3_path = s3_path @property @@ -37,8 +39,11 @@ def s3_path(self, value): Args: value(S3Path): s3path of the directory """ -        assert isinstance(value, S3Path), 'input path must be of type S3Path' -        assert value.is_directory, 'input path must be a directory' +        if not isinstance(value, S3Path): +            raise ETLInputError('Input path should be of type S3Path') + +        if not value.is_directory: +            raise ETLInputError('S3
path must be directory') self._s3_path = value def upload_to_s3(self): diff --git a/dataduct/s3/s3_file.py b/dataduct/s3/s3_file.py index 0530b7e..4654a7c 100644 --- a/dataduct/s3/s3_file.py +++ b/dataduct/s3/s3_file.py @@ -4,9 +4,9 @@ from .s3_path import S3Path from .utils import upload_to_s3 from .utils import read_from_s3 +from ..utils.helpers import parse_path from ..utils.exceptions import ETLInputError - DEFAULT_FILE_NAME = 'file' @@ -32,7 +32,7 @@ def __init__(self, path=None, text=None, s3_path=None): 'Cannot specify both path and text for s3 file.' # Initialize all the values - self._path = path + self._path = parse_path(path) self._text = text self._s3_path = s3_path @@ -73,7 +73,7 @@ def file_name(self): file_name(str): The file_name of this file """ if self._path: - return self._path.split("/").pop() + return self._path.split('/').pop() else: return DEFAULT_FILE_NAME @@ -83,7 +83,6 @@ def s3_path(self): """ return self._s3_path - @s3_path.setter def s3_path(self, s3_path): """Set the S3 path for the file @@ -94,7 +93,8 @@ def s3_path(self, s3_path): If there is no path, the name "file" will be applied. """ - assert isinstance(s3_path, S3Path), 'input path must be of type S3Path' + if not isinstance(s3_path, S3Path): + raise ETLInputError('Input path should be of type S3Path') # Copy the object as we would change it for the file self._s3_path = S3Path( diff --git a/dataduct/s3/s3_log_path.py b/dataduct/s3/s3_log_path.py index f8b38fc..eabfa56 100644 --- a/dataduct/s3/s3_log_path.py +++ b/dataduct/s3/s3_log_path.py @@ -14,8 +14,8 @@ class S3LogPath(S3Path): unless there is a backslash: :: - s3:://coursera-datapipeline/dev - s3:://coursera-datapipeline/dev_log_dir + s3:://coursera-bucket/dev + s3:://coursera-bucket/dev_log_dir However, if one adds a backslash to the log s3 URI, Data Pipeline will add another backslash before adding subdirectories. 
These diff --git a/dataduct/s3/s3_path.py b/dataduct/s3/s3_path.py index 4b4e36f..fd154dd 100644 --- a/dataduct/s3/s3_path.py +++ b/dataduct/s3/s3_path.py @@ -76,7 +76,7 @@ def append(self, new_key, is_directory=False): new_key = join(*new_key) # Remove duplicate, leading, and trailing '/' - new_key = [a for a in new_key.split("/") if a != ""] + new_key = [a for a in new_key.split("/") if a != ''] # AWS prevents us from using periods in paths # Substitute them with '_' diff --git a/dataduct/s3/utils.py b/dataduct/s3/utils.py index 712cffc..33937d4 100644 --- a/dataduct/s3/utils.py +++ b/dataduct/s3/utils.py @@ -31,7 +31,8 @@ def read_from_s3(s3_path): Returns: results(str): Contents of the file as a string """ - assert isinstance(s3_path, S3Path), 'input path should be of type S3Path' + if not isinstance(s3_path, S3Path): + raise ETLInputError('Input path should be of type S3Path') bucket = get_s3_bucket(s3_path.bucket) key = bucket.get_key(s3_path.key) @@ -47,8 +48,11 @@ def upload_to_s3(s3_path, file_name=None, file_text=None): file_name(str): Name of the file to be uploaded to s3 file_text(str): Contents of the file to be uploaded """ - assert isinstance(s3_path, S3Path), 'input path should be of type S3Path' - assert any([file_name, file_text]), 'file_name or text should be given' + if not isinstance(s3_path, S3Path): + raise ETLInputError('Input path should be of type S3Path') + + if not any([file_name, file_text]): + raise ETLInputError('File_name or text should be given') bucket = get_s3_bucket(s3_path.bucket) if s3_path.is_directory: @@ -78,8 +82,7 @@ def copy_within_s3(s3_old_path, s3_new_path, raise_when_no_exist=True): key = bucket.get_key(s3_old_path.key) if key: key.copy(s3_new_path.bucket, s3_new_path.key) - - if raise_when_no_exist and not key: + elif raise_when_no_exist: raise ETLInputError('The key does not exist: %s' % s3_old_path.uri) @@ -91,9 +94,14 @@ def upload_dir_to_s3(s3_path, local_path, filter_function=None): local_path(file_path): Input path of the file to be uploaded filter_function(function): Function to filter out directories """ - assert isinstance(s3_path, S3Path), 'input path should be of type S3Path' - assert s3_path.is_directory, 'S3 path must be directory' - assert os.path.isdir(local_path), 'Local path must be a directory' + if not isinstance(s3_path, S3Path): + raise ETLInputError('Input path should be of type S3Path') + + if not s3_path.is_directory: + raise ETLInputError('S3 path must be directory') + + if not os.path.isdir(local_path): + raise ETLInputError('Local path must be a directory') bucket = get_s3_bucket(s3_path.bucket) @@ -119,8 +127,11 @@ def download_dir_from_s3(s3_path, local_path): s3_path(S3Path): Input path of the file to be downloaded local_path(file_path): Output path of the file to be downloaded """ - assert isinstance(s3_path, S3Path), 'input path should be of type S3Path' - assert s3_path.is_directory, 'S3 path must be directory' + if not isinstance(s3_path, S3Path): + raise ETLInputError('Input path should be of type S3Path') + + if not s3_path.is_directory: + raise ETLInputError('S3 path must be directory') bucket = get_s3_bucket(s3_path.bucket) keys = bucket.get_all_keys(prefix=s3_path.key + '/') @@ -146,15 +157,56 @@ def delete_dir_from_s3(s3_path): Args: s3_path(S3Path): Path of the directory to be deleted """ - assert isinstance(s3_path, S3Path), 'input path should be of type S3Path' - assert s3_path.is_directory, 'S3 path must be directory' + if not isinstance(s3_path, S3Path): + raise ETLInputError('Input path should be 
of type S3Path') + +    if not s3_path.is_directory: +        raise ETLInputError('S3 path must be directory') bucket = get_s3_bucket(s3_path.bucket) prefix = s3_path.key # Enforce this to be a folder's prefix -    if not prefix.endswith('/'): -        prefix += '/' +    prefix += '/' if not prefix.endswith('/') else '' + keys = bucket.get_all_keys(prefix=s3_path.key) for key in keys: key.delete() + + +def copy_dir_with_s3(s3_old_path, s3_new_path, raise_when_no_exist=True): +    """Copies files from one S3 Path to another + +    Args: +        s3_old_path(S3Path): Source S3 directory to copy the files from +        s3_new_path(S3Path): Destination S3 directory to copy the files to +        raise_when_no_exist(bool, optional): Raise error if file not found + +    Raises: +        ETLInputError: If s3_old_path does not exist +    """ +    if not isinstance(s3_old_path, S3Path): +        raise ETLInputError('S3 old path should be of type S3Path') + +    if not s3_old_path.is_directory: +        raise ETLInputError('S3 old path must be directory') + +    if not isinstance(s3_new_path, S3Path): +        raise ETLInputError('S3 new path should be of type S3Path') + +    if not s3_new_path.is_directory: +        raise ETLInputError('S3 new path must be directory') + +    bucket = get_s3_bucket(s3_old_path.bucket) +    prefix = s3_old_path.key + +    # Enforce this to be a folder's prefix +    prefix += '/' if not prefix.endswith('/') else '' + +    keys = bucket.get_all_keys(prefix=s3_old_path.key) +    for key in keys: +        if key: +            key.copy(s3_new_path.bucket, +                     os.path.join(s3_new_path.key, os.path.basename(key.key))) +        elif raise_when_no_exist: +            raise ETLInputError('The key does not exist: %s' % s3_old_path.uri) diff --git a/dataduct/steps/__init__.py b/dataduct/steps/__init__.py index e69de29..282d020 100644 --- a/dataduct/steps/__init__.py +++ b/dataduct/steps/__init__.py @@ -0,0 +1,19 @@ +from .column_check import ColumnCheckStep +from .count_check import CountCheckStep +from .create_load_redshift import CreateAndLoadStep +from .create_update_sql import CreateUpdateSqlStep +from .emr_job import EMRJobStep +from .emr_streaming import EMRStreamingStep +from .etl_step import ETLStep +from .extract_local import ExtractLocalStep +from .extract_rds import ExtractRdsStep +from .extract_redshift import ExtractRedshiftStep +from .extract_s3 import ExtractS3Step +from .load_redshift import LoadRedshiftStep +from .pipeline_dependencies import PipelineDependenciesStep +from .primary_key_check import PrimaryKeyCheckStep +from .qa_transform import QATransformStep +from .reload import ReloadStep +from .sql_command import SqlCommandStep +from .transform import TransformStep +from .upsert import UpsertStep diff --git a/dataduct/steps/column_check.py b/dataduct/steps/column_check.py new file mode 100644 index 0000000..3b0ddaa --- /dev/null +++ b/dataduct/steps/column_check.py @@ -0,0 +1,142 @@ +"""ETL step wrapper for column check step can be executed on Ec2 resource +""" +import os + +from .qa_transform import QATransformStep +from ..config import Config +from ..database import SqlScript +from ..database import Table +from ..database import SelectStatement +from ..utils import constants as const +from ..utils.helpers import parse_path +from ..utils.helpers import exactly_one +from ..utils.exceptions import ETLInputError + +config = Config() +COLUMN_TEMPLATE = "COALESCE(CONCAT({column_name}, ''), '')" + + +class ColumnCheckStep(QATransformStep): +    """ColumnCheckStep class that checks if the rows of a table have been +    populated with the correct values +    """ + +    def __init__(self, id, source_sql, source_host,
destination_table_definition=None, script=None, + destination_sql=None, sql_tail_for_source=None, + sample_size=100, tolerance=1.0, script_arguments=None, + log_to_s3=False, **kwargs): + """Constructor for the ColumnCheckStep class + + Args: + destination_table_definition(file): + table definition for the destination table + **kwargs(optional): Keyword arguments directly passed to base class + """ + + if not exactly_one(destination_table_definition, destination_sql): + raise ETLInputError('One of dest table or dest sql needed') + + if script_arguments is None: + script_arguments = list() + + if sql_tail_for_source is None: + sql_tail_for_source = '' + + # Get the EDW column SQL + dest_sql, primary_key_index = self.convert_destination_to_column_sql( + destination_table_definition, destination_sql) + + src_sql = self.convert_source_to_column_sql(source_sql, + primary_key_index, + sql_tail_for_source) + + script_arguments.extend([ + '--sample_size=%s' % str(sample_size), + '--tolerance=%s' % str(tolerance), + '--destination_sql=%s' % dest_sql, + '--source_sql=%s' % src_sql, + '--source_host=%s' % source_host + ]) + + if log_to_s3: + script_arguments.append('--log_to_s3') + + if script is None: + steps_path = os.path.abspath(os.path.dirname(__file__)) + script = os.path.join(steps_path, const.COLUMN_CHECK_SCRIPT_PATH) + + super(ColumnCheckStep, self).__init__( + id=id, script=script, script_arguments=script_arguments, **kwargs) + + @staticmethod + def convert_destination_to_column_sql(destination_table_definition=None, + destination_sql=None): + """Convert the destination query into generic structure to compare + """ + if destination_table_definition is not None: + with open(parse_path(destination_table_definition)) as f: + destination_table_string = f.read() + + destination_table = Table(SqlScript(destination_table_string)) + destination_columns = destination_table.columns() + primary_key_index, primary_keys = zip(*[ + (idx, col.name) + for idx, col in enumerate(destination_columns) + if col.primary]) + + if len(destination_columns) == len(primary_key_index): + raise ValueError('Cannot check table without non-pk columns') + + column_string = '||'.join( + [COLUMN_TEMPLATE.format(column_name=c.name) + for c in destination_columns if not c.primary]) + concatenated_column = '( {columns} )'.format(columns=column_string) + + destination_sql = '''SELECT {primary_keys}, {concat_column} + FROM {table_name} + WHERE ({primary_keys}) IN PRIMARY_KEY_SET + '''.format(primary_keys=','.join(primary_keys), + concat_column=concatenated_column, + table_name=destination_table.full_name) + + elif destination_sql is not None: + select_stmnt = SelectStatement(destination_sql) + primary_key_index = range(len(select_stmnt.columns()))[:-1] + + return SqlScript(destination_sql).sql(), primary_key_index + + @staticmethod + def convert_source_to_column_sql(source_sql, primary_key_index, + sql_tail_for_source): + """Convert the source query into generic structure to compare + """ + origin_sql = SelectStatement(SqlScript(source_sql).statements[0].sql()) + + # Remove column name references to tables as t.session_id should be + # session_id as we wrap the whole query. 
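+        # (Editorial example, not part of the original change: a source column +        # referenced as 't.session_id' becomes 'session_id' here, because the +        # source query is wrapped as a subquery aliased 'origin' below.)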
+ column_names = [x.name.split('.')[-1] for x in origin_sql.columns()] + + non_primary_key_index = [idx for idx in range(len(column_names)) + if idx not in primary_key_index] + + primary_key_str = ','.join( + [column_names[idx] for idx in primary_key_index]) + + if len(column_names) == len(primary_key_index): + raise ValueError('Cannot check column on table with no pk columns') + + column_string = ','.join( + [COLUMN_TEMPLATE.format(column_name=column_names[idx]) + for idx in non_primary_key_index]) + concatenated_column = ('CONCAT(%s)' % column_string) + + template = '''SELECT {primary_keys}, {concat_column} AS merged_string + FROM ({origin_sql}) AS origin {sql_tail}''' + + query = template.format(primary_keys=primary_key_str, + concat_column=concatenated_column, + origin_sql=origin_sql.sql(), + sql_tail=sql_tail_for_source) + + return SqlScript(query).sql() diff --git a/dataduct/steps/count_check.py b/dataduct/steps/count_check.py new file mode 100644 index 0000000..1977a05 --- /dev/null +++ b/dataduct/steps/count_check.py @@ -0,0 +1,104 @@ +"""ETL step wrapper for count check step can be executed on the Ec2 resource +""" +import os + +from .qa_transform import QATransformStep +from ..config import Config +from ..database import SqlScript +from ..database import SqlStatement +from ..database import Table +from ..utils import constants as const +from ..utils.helpers import exactly_one +from ..utils.helpers import parse_path +from ..utils.exceptions import ETLInputError + +config = Config() + + +class CountCheckStep(QATransformStep): + """CountCheckStep class that compares the number of rows in the source + select script with the number of rows in the destination table + """ + + def __init__(self, id, source_host, source_sql=None, source_table_name=None, + destination_table_name=None, destination_table_definition=None, + destination_sql=None, tolerance=1.0, script_arguments=None, + log_to_s3=False, script=None, source_count_sql=None, **kwargs): + """Constructor for the CountCheckStep class + + Args: + source_sql(str): SQL select script from the source table + destination_table_name(str): table name for the destination table + **kwargs(optional): Keyword arguments directly passed to base class + """ + + if not exactly_one(destination_table_name, destination_sql, + destination_table_definition): + raise ETLInputError( + 'One of dest table name/schema or dest sql needed') + + if not exactly_one(source_sql, source_table_name, source_count_sql): + raise ETLInputError( + 'One of source table name or source sql or source count needed') + + if script_arguments is None: + script_arguments = list() + + if destination_table_definition is not None: + with open(parse_path(destination_table_definition)) as f: + destination_table_string = f.read() + destination_table = Table(SqlScript(destination_table_string)) + destination_table_name = destination_table.full_name + + # Get the EDW column SQL + dest_sql = self.convert_destination_to_count_sql( + destination_table_name, destination_sql) + + src_sql = self.convert_source_to_count_sql( + source_table_name, source_sql, source_count_sql) + + script_arguments.extend([ + '--tolerance=%s' % str(tolerance), + '--destination_sql=%s' % dest_sql, + '--source_sql=%s' % src_sql, + '--source_host=%s' % source_host + ]) + + if log_to_s3: + script_arguments.append('--log_to_s3') + + if script is None: + steps_path = os.path.abspath(os.path.dirname(__file__)) + script = os.path.join(steps_path, const.COUNT_CHECK_SCRIPT_PATH) + + super(CountCheckStep, self).__init__( + 
id=id, script=script, script_arguments=script_arguments, **kwargs) + + @staticmethod + def convert_destination_to_count_sql(destination_table_name=None, + destination_sql=None): + """Convert the destination query into generic structure to compare + """ + if destination_table_name is not None: + destination_sql = "SELECT COUNT(1) FROM %s" % destination_table_name + else: + dest_sql = SqlStatement(destination_sql) + destination_sql = "SELECT COUNT(1) FROM (%s)a" % dest_sql.sql() + + return SqlScript(destination_sql).sql() + + @staticmethod + def convert_source_to_count_sql(source_table_name=None, + source_sql=None, + source_count_sql=None): + """Convert the source query into generic structure to compare + """ + if source_table_name is not None: + source_sql = "SELECT COUNT(1) FROM %s" % source_table_name + elif source_count_sql is not None: + source_sql = source_count_sql + else: + origin_sql = SqlStatement(source_sql) + source_sql = "SELECT COUNT(1) FROM (%s)a" % origin_sql.sql() + + return SqlScript(source_sql).sql() diff --git a/dataduct/steps/create_load_redshift.py b/dataduct/steps/create_load_redshift.py new file mode 100644 index 0000000..21023a6 --- /dev/null +++ b/dataduct/steps/create_load_redshift.py @@ -0,0 +1,63 @@ +"""ETL step wrapper for loading into redshift with the COPY command +""" +import os + +from .transform import TransformStep +from ..database import Table +from ..database import SqlStatement +from ..config import Config +from ..utils import constants as const +from ..utils.helpers import parse_path + +config = Config() + + +class CreateAndLoadStep(TransformStep): + """CreateAndLoad Step class that creates table if needed and loads data + """ + + def __init__(self, id, table_definition, input_node=None, + script_arguments=None, **kwargs): + """Constructor for the CreateAndLoadStep class + + Args: + table_definition(filepath): schema file for the table to be loaded + script_arguments(list of str): list of arguments to the script + **kwargs(optional): Keyword arguments directly passed to base class + """ + with open(parse_path(table_definition)) as f: + table_def_string = f.read() + + table = Table(SqlStatement(table_def_string)) + + if isinstance(input_node, dict): + input_paths = [i.path().uri for i in input_node.values()] + else: + input_paths = [input_node.path().uri] + + + if script_arguments is None: + script_arguments = list() + + script_arguments.extend([ + '--table_definition=%s' % table.sql().sql(), + '--s3_input_paths'] + input_paths) + + steps_path = os.path.abspath(os.path.dirname(__file__)) + script = os.path.join(steps_path, const.CREATE_LOAD_SCRIPT_PATH) + + super(CreateAndLoadStep, self).__init__( + id=id, script=script, input_node=input_node, + script_arguments=script_arguments, **kwargs) + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + step_args = cls.base_arguments_processor(etl, input_args) + step_args['resource'] = etl.ec2_resource + return step_args diff --git a/dataduct/steps/create_update_sql.py b/dataduct/steps/create_update_sql.py new file mode 100644 index 0000000..b196ea9 --- /dev/null +++ b/dataduct/steps/create_update_sql.py @@ -0,0 +1,77 @@ +"""ETL step wrapper for sql command for inserting into tables +""" +import os +from .transform import TransformStep +from ..database import SqlScript +from ..database import 
Table +from ..utils import constants as const +from ..utils.helpers import exactly_one +from ..utils.helpers import parse_path +from ..utils.exceptions import ETLInputError + + +class CreateUpdateSqlStep(TransformStep): + """Create and Insert step that creates a table and then uses the query to + update the table data with any sql query provided + """ + + def __init__(self, + table_definition, + script=None, + command=None, + analyze_table=True, + script_arguments=None, + non_transactional=False, + **kwargs): + """Constructor for the CreateUpdateStep class + + Args: + **kwargs(optional): Keyword arguments directly passed to base class + """ + if not exactly_one(command, script): + raise ETLInputError('Both command and script found') + + # Create S3File with script / command provided + if script: + update_script = SqlScript(filename=parse_path(script)) + else: + update_script = SqlScript(command) + + dest = Table(SqlScript(filename=parse_path(table_definition))) + + steps_path = os.path.abspath(os.path.dirname(__file__)) + runner_script = os.path.join(steps_path, const.SQL_RUNNER_SCRIPT_PATH) + + arguments = [ + '--table_definition=%s' % dest.sql().sql(), + '--sql=%s' % update_script.sql() + ] + + if analyze_table: + arguments.append('--analyze') + + if non_transactional: + arguments.append('--non_transactional') + + if script_arguments is not None: + if not isinstance(script_arguments, list): + raise ETLInputError( + 'Script arguments for SQL steps should be a list') + arguments.extend(script_arguments) + + super(CreateUpdateSqlStep, self).__init__( + script=runner_script, script_arguments=arguments, + no_output=True, **kwargs) + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + step_args = cls.base_arguments_processor(etl, input_args) + cls.pop_inputs(step_args) + step_args['resource'] = etl.ec2_resource + return step_args diff --git a/dataduct/steps/emr_job.py b/dataduct/steps/emr_job.py new file mode 100644 index 0000000..4158ec8 --- /dev/null +++ b/dataduct/steps/emr_job.py @@ -0,0 +1,49 @@ +""" +ETL step wrapper for EmrActivity can be executed on EMR Cluster +""" +from .etl_step import ETLStep +from ..pipeline import EmrActivity + + +class EMRJobStep(ETLStep): + """EMR Step class that helps run a step on the emr cluster + """ + + def __init__(self, + step_string, + **kwargs): + """Constructor for the EMRJobStep class + + Args: + step_string(str): Step string for the emr job to be executed + **kwargs(optional): Keyword arguments directly passed to base class + + Note: + In the step_string all comma within arguments should be escaped + using 4 backslashes + """ + super(EMRJobStep, self).__init__(**kwargs) + + self.activity = self.create_pipeline_object( + object_class=EmrActivity, + resource=self.resource, + input_node=self.input, + schedule=self.schedule, + emr_step_string=step_string, + output_node=self.output, + depends_on=self.depends_on, + max_retries=self.max_retries + ) + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + step_args = cls.base_arguments_processor(etl, input_args) + step_args['resource'] = etl.emr_cluster + + return 
step_args diff --git a/dataduct/steps/emr_streaming.py b/dataduct/steps/emr_streaming.py index 04e821b..66f2156 100644 --- a/dataduct/steps/emr_streaming.py +++ b/dataduct/steps/emr_streaming.py @@ -1,12 +1,9 @@ """ -ETL step wrapper for EmrActivity can be executed on Ec2 +ETL step wrapper for EmrStreamingActivity can be executed on EMR Cluster """ from .etl_step import ETLStep -from ..pipeline.emr_activity import EmrActivity -from ..s3.s3_file import S3File -from ..s3.s3_path import S3Path -from ..utils.exceptions import ETLInputError - +from ..pipeline import EmrActivity +from ..s3 import S3File HADOOP_1_SERIES = ['1', '2'] @@ -48,7 +45,7 @@ def create_command_hadoop_2(mapper, reducer, command, command_options): return ','.join(command) -def create_command(mapper, reducer, ami_version, input_uri, output, +def create_command(mapper, reducer, ami_version, input, output, hadoop_params): """Create the command step string given the input to streaming step """ @@ -66,13 +63,7 @@ def create_command(mapper, reducer, ami_version, input_uri, output, command_options.extend(['-output', output.path().uri]) # Add input uri - if isinstance(input_uri, list): - for i in input_uri: - assert isinstance(i, S3Path) - command_options.extend(['-input', i.uri]) - else: - assert isinstance(input_uri, S3Path), type(input_uri) - command_options.extend(['-input', input_uri.uri]) + command_options.extend(['-input', input.path().uri]) if ami_family in HADOOP_1_SERIES: return create_command_hadoop_1(mapper, reducer, command, @@ -89,9 +80,8 @@ class EMRStreamingStep(ETLStep): def __init__(self, mapper, reducer=None, - input=None, hadoop_params=None, - depends_on=None, + output_path=None, **kwargs): """Constructor for the EMRStreamingStep class @@ -102,23 +92,10 @@ def __init__(self, hadoop_params(list of str): arguments to the hadoop command **kwargs(optional): Keyword arguments directly passed to base class """ - - # As EMR streaming allows inputs as both input_node and input - # We remove the default input_node if input is given - if input is not None: - input_node = kwargs.pop('input_node', None) - else: - input_node = kwargs.get('input_node', None) - - if input is not None and 'input_node' in kwargs: - raise ETLInputError('Both input and input_node specified') - super(EMRStreamingStep, self).__init__(**kwargs) - if depends_on is not None: - self._depends_on = depends_on - - self._output = self.create_s3_data_node() + self._output = self.create_s3_data_node( + self.get_output_s3_path(output_path)) # Create S3File with script / command provided mapper = self.create_script(S3File(path=mapper)) @@ -128,43 +105,30 @@ def __init__(self, reducer = self.create_script(S3File(path=reducer)) additional_files.append(reducer) - if input is not None: - if isinstance(input, list): - input = [S3Path(uri=i) for i in input] - else: - input = S3Path(uri=input) - else: - if isinstance(input_node, list): - input = [i.path() for i in input_node] - elif isinstance(input_node, dict): - input = [i.path() for i in input_node.values()] - else: - input = input_node.path() - step_string = create_command(mapper, reducer, self.resource.ami_version, - input, self._output, hadoop_params) + self.input, self.output, hadoop_params) self.activity = self.create_pipeline_object( object_class=EmrActivity, resource=self.resource, + input_node=self.input, schedule=self.schedule, emr_step_string=step_string, - output_node=self._output, + output_node=self.output, additional_files=additional_files, depends_on=self.depends_on, max_retries=self.max_retries ) - 
def merge_s3_nodes(self, input_nodes): - """Override the merge S3Node case for EMR Streaming Step + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline Args: - input_nodes(dict): Map of the form {'node_name': node} - - Returns: - output_node(list of S3Node): list of input nodes - depends_on(list): Empty list + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class """ - depends_on = [] - output_node = input_nodes.values() - return output_node, depends_on + step_args = cls.base_arguments_processor(etl, input_args) + step_args['resource'] = etl.emr_cluster + + return step_args diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index 09f6092..f14ce29 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -1,17 +1,18 @@ """ Base class for an etl step """ - from ..config import Config -from ..pipeline.activity import Activity -from ..pipeline.copy_activity import CopyActivity -from ..pipeline.s3_node import S3Node -from ..s3.s3_path import S3Path -from ..s3.s3_file import S3File +from ..pipeline import Activity +from ..pipeline import CopyActivity +from ..pipeline import S3Node +from ..s3 import S3Path +from ..s3 import S3File +from ..s3 import S3LogPath +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] +MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) class ETLStep(object): @@ -31,8 +32,8 @@ class ETLStep(object): def __init__(self, id, s3_data_dir=None, s3_log_dir=None, s3_source_dir=None, schedule=None, resource=None, - input_node=None, required_steps=None, - max_retries=DEFAULT_MAX_RETRIES): + input_node=None, input_path=None, required_steps=None, + max_retries=MAX_RETRIES): """Constructor for the ETLStep object Args: @@ -53,14 +54,18 @@ def __init__(self, id, s3_data_dir=None, s3_log_dir=None, self.resource = resource self.max_retries = max_retries self._depends_on = list() - self._input = input_node self._output = None self._objects = dict() self._required_steps = list() - - self._activities = list() + self._required_activities = list() self._input_node = input_node + if input_path is not None and input_node is not None: + raise ETLInputError('Both input_path and input_node specified') + + if input_path is not None: + self._input_node = self.create_s3_data_node(S3Path(uri=input_path)) + if isinstance(input_node, list): if len(input_node) == 0: input_node = None @@ -91,14 +96,12 @@ def add_required_steps(self, required_steps): """ self._required_steps.extend(required_steps) - # Find all activities which need to be completed. 
- required_activities = [] - for step in self._required_steps: - required_activities.extend(step.activities) + for step in required_steps: + self._required_activities.extend(step.activities) # Set required_acitivites as depend_on variable of all activities for activity in self.activities: - activity['dependsOn'] = required_activities + activity['dependsOn'] = self._required_activities def create_pipeline_object(self, object_class, **kwargs): """Create the pipeline objects associated with the step @@ -119,6 +122,10 @@ def create_pipeline_object(self, object_class, **kwargs): str(instance_count) new_object = object_class(object_id, **kwargs) + + if isinstance(new_object, Activity): + new_object['dependsOn'] = self._required_activities + self._objects[object_id] = new_object return new_object @@ -148,9 +155,9 @@ def create_s3_data_node(self, s3_object=None, **kwargs): s3_object = s3_dir s3_node = self.create_pipeline_object( - S3Node, + object_class=S3Node, schedule=self.schedule, - s3_path=s3_object, + s3_object=s3_object, **kwargs ) @@ -174,12 +181,16 @@ def create_output_nodes(self, output_node, sub_dirs): Returns: s3_output_nodes(dict of s3Node): Output nodes keyed with sub dirs """ - return dict( - ( - sub_dir, - self.create_s3_data_node(S3Path(sub_dir, is_directory=True, - parent_dir=output_node.path())) - ) for sub_dir in sub_dirs) + output_map = dict() + for sub_dir in sub_dirs: + new_node = self.create_s3_data_node( + S3Path(sub_dir, is_directory=True, + parent_dir=output_node.path())) + new_node.add_dependency_node(output_node) + + output_map[sub_dir] = new_node + + return output_map def create_script(self, s3_object): """Set the s3 path for s3 objects with the s3_source_dir @@ -211,19 +222,18 @@ def copy_s3(self, input_node, dest_uri): if not(isinstance(input_node, S3Node) and isinstance(dest_uri, S3Path)): raise ETLInputError('input_node and uri have type mismatch') - # Copy the input node. 
We need to use directories for copying if we - # are going to omit the data format - if input_node.path().is_directory: - uri = input_node.path().uri - else: - uri = '/'.join(input_node.path().uri.split('/')[:-1]) - - new_input_node = self.create_s3_data_node( - s3_object=S3Path(uri=uri, is_directory=True)) - # create s3 node for output output_node = self.create_s3_data_node(dest_uri) + # Create new input node if file and not directory + if input_node.path().is_directory: + new_input_node = input_node + else: + uri = "/".join(input_node.path().uri.split("/")[:-1]) + new_input_node = self.create_s3_data_node( + S3Path(uri=uri, is_directory=True)) + new_input_node.add_dependency_node(input_node) + # create copy activity activity = self.create_pipeline_object( CopyActivity, @@ -250,12 +260,15 @@ def merge_s3_nodes(self, input_nodes): """ depends_on = list() combined_node = self.create_s3_data_node() - for input_node in input_nodes: - dest_uri = S3Path(key=input_node, is_directory=True, + + for string_key, input_node in input_nodes.iteritems(): + dest_uri = S3Path(key=string_key, is_directory=True, parent_dir=combined_node.path()) - copy_activity = self.copy_s3(input_node=input_nodes[input_node], + copy_activity = self.copy_s3(input_node=input_node, dest_uri=dest_uri) depends_on.append(copy_activity) + combined_node.add_dependency_node(copy_activity.output) + return combined_node, depends_on @property @@ -268,7 +281,7 @@ def input(self): Note: Input is represented as None, a single node or dict of nodes """ - return self._input + return self._input_node @property def output(self): @@ -341,3 +354,109 @@ def activities(self): result: All aws activites that are created for this step """ return [x for x in self._objects.values() if isinstance(x, Activity)] + + @classmethod + def base_arguments_processor(cls, etl, input_args): + """Process the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + input_args(dict): Dictionary of the step arguments from the YAML + """ + # Base dictionary for every step + step_args = { + 'resource': None, + 'schedule': etl.schedule, + 'max_retries': etl.max_retries, + 'required_steps': list() + } + step_args.update(input_args) + + # Description is optional and should not be passed + step_args.pop('description', None) + + # Add dependencies + depends_on = step_args.pop('depends_on', None) + if isinstance(depends_on, str): + depends_on = [depends_on] + + if depends_on: + for step_id in list(depends_on): + if step_id not in etl.steps: + raise ETLInputError('Step depends on non-existent step') + step_args['required_steps'].append(etl.steps[step_id]) + + # Set input node and required_steps + input_node = step_args.get('input_node', None) + if input_node: + if isinstance(input_node, dict): + input_node = etl.translate_input_nodes(input_node) + elif isinstance(input_node, str): + input_node = etl.intermediate_nodes[input_node] + step_args['input_node'] = input_node + + # Add dependencies from steps that create input nodes + if isinstance(input_node, dict): + required_nodes = input_node.values() + else: + required_nodes = [input_node] + + for required_node in required_nodes: + for step in etl.steps.values(): + if step not in step_args['required_steps'] and \ + required_node in step.pipeline_objects: + step_args['required_steps'].append(step) + + # Set the name if name not provided + if 'name' in step_args: + name = step_args.pop('name') + else: + # If the name of the step is not provided, one is assigned 
as: + # [step_class][index] + name = cls.__name__ + str(sum( + [1 for a in etl.steps.values() if isinstance(a, cls)] + )) + + # Each step is given it's own directory so that there is no clashing + # of file names. + step_args.update({ + 'id': name, + 's3_log_dir': S3LogPath(name, parent_dir=etl.s3_log_dir, + is_directory=True), + 's3_data_dir': S3Path(name, parent_dir=etl.s3_data_dir, + is_directory=True), + 's3_source_dir': S3Path(name, parent_dir=etl.s3_source_dir, + is_directory=True), + }) + + return step_args + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + step_args = cls.base_arguments_processor(etl, input_args) + return step_args + + @staticmethod + def pop_inputs(input_args): + """Remove the input nodes from the arguments dictionary + """ + input_args.pop('input_node', None) + input_args.pop('input_path', None) + + return input_args + + @staticmethod + def get_output_s3_path(output_path, is_directory=True): + """Create an S3 Path variable based on the output path + """ + if output_path: + s3_path = S3Path(uri=output_path, is_directory=is_directory) + else: + s3_path = None + return s3_path diff --git a/dataduct/steps/extract_local.py b/dataduct/steps/extract_local.py index 34c6132..1275358 100644 --- a/dataduct/steps/extract_local.py +++ b/dataduct/steps/extract_local.py @@ -2,14 +2,15 @@ ETL step wrapper for creating an S3 node for input from local files """ from .etl_step import ETLStep -from ..s3.s3_file import S3File +from ..s3 import S3File +from ..utils.exceptions import ETLInputError class ExtractLocalStep(ETLStep): """ExtractLocal Step class that helps get data from a local file """ - def __init__(self, path, **kwargs): + def __init__(self, path, output_path=None, **kwargs): """Constructor for the ExtractLocalStep class Args: @@ -17,4 +18,23 @@ def __init__(self, path, **kwargs): **kwargs(optional): Keyword arguments directly passed to base class """ super(ExtractLocalStep, self).__init__(**kwargs) - self._output = self.create_s3_data_node(s3_object=S3File(path=path)) + self._output = self.create_s3_data_node( + S3File(path=path, s3_path=self.get_output_s3_path(output_path))) + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + input_args = cls.pop_inputs(input_args) + step_args = cls.base_arguments_processor(etl, input_args) + + step_args.pop('resource') + if etl.frequency != 'one-time': + raise ETLInputError( + 'Extract Local can be used for one-time pipelines only') + + return step_args diff --git a/dataduct/steps/extract_rds.py b/dataduct/steps/extract_rds.py index b363e5d..87b4497 100644 --- a/dataduct/steps/extract_rds.py +++ b/dataduct/steps/extract_rds.py @@ -1,32 +1,21 @@ """ ETL step wrapper to extract data from RDS to S3 """ -from re import findall - from ..config import Config from .etl_step import ETLStep -from ..pipeline.copy_activity import CopyActivity -from ..pipeline.mysql_node import MysqlNode -from ..pipeline.pipeline_object import PipelineObject -from ..pipeline.shell_command_activity import ShellCommandActivity +from ..pipeline import CopyActivity +from ..pipeline import MysqlNode +from ..pipeline import 
PipelineObject +from ..pipeline import ShellCommandActivity from ..utils.helpers import exactly_one from ..utils.exceptions import ETLInputError +from ..database import SelectStatement config = Config() -MYSQL_CONFIG = config.mysql - - -def guess_input_tables(sql): - """Guess input tables from the sql query +if not hasattr(config, 'mysql'): + raise ETLInputError('MySQL config not specified in ETL') - Returns: - results(list of str): tables which are used in the sql statement - """ - results = findall(r'from ([A-Za-z0-9._]+)', sql) - results.extend(findall(r'FROM ([A-Za-z0-9._]+)', sql)) - results.extend(findall(r'join ([A-Za-z0-9._]+)', sql)) - results.extend(findall(r'JOIN ([A-Za-z0-9._]+)', sql)) - return list(set(results)) +MYSQL_CONFIG = config.mysql class ExtractRdsStep(ETLStep): @@ -38,7 +27,7 @@ def __init__(self, sql=None, host_name=None, database=None, - depends_on=None, + output_path=None, **kwargs): """Constructor for the ExtractRdsStep class @@ -54,13 +43,10 @@ def __init__(self, super(ExtractRdsStep, self).__init__(**kwargs) - if depends_on is not None: - self._depends_on = depends_on - if table: - sql = 'select * from %s;' % table + sql = 'SELECT * FROM %s;' % table elif sql: - table = guess_input_tables(sql) + table = SelectStatement(sql).dependencies[0] else: raise ETLInputError('Provide a sql statement or a table name') @@ -96,10 +82,11 @@ def __init__(self, max_retries=self.max_retries, ) + self._output = self.create_s3_data_node( + self.get_output_s3_path(output_path)) + # This shouldn't be necessary but - # AWS uses \\n as null, so we need to remove it - self._output = self.create_s3_data_node() - command = ' '.join(["cat", "${INPUT1_STAGING_DIR}/*", "| sed 's/\\\\\\\\n/NULL/g'", # replace \\n @@ -110,9 +97,23 @@ def __init__(self, self.create_pipeline_object( object_class=ShellCommandActivity, input_node=intermediate_node, - output_node=self._output, + output_node=self.output, command=command, max_retries=self.max_retries, resource=self.resource, schedule=self.schedule, ) + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + input_args = cls.pop_inputs(input_args) + step_args = cls.base_arguments_processor(etl, input_args) + step_args['resource'] = etl.ec2_resource + + return step_args diff --git a/dataduct/steps/extract_redshift.py b/dataduct/steps/extract_redshift.py index a94a6f9..e524668 100644 --- a/dataduct/steps/extract_redshift.py +++ b/dataduct/steps/extract_redshift.py @@ -2,8 +2,8 @@ ETL step wrapper for RedshiftCopyActivity to extract data to S3 """ from .etl_step import ETLStep -from ..pipeline.redshift_node import RedshiftNode -from ..pipeline.redshift_copy_activity import RedshiftCopyActivity +from ..pipeline import RedshiftNode +from ..pipeline import RedshiftCopyActivity class ExtractRedshiftStep(ETLStep): @@ -15,7 +15,7 @@ def __init__(self, table, redshift_database, insert_mode="TRUNCATE", - depends_on=None, + output_path=None, **kwargs): """Constructor for the ExtractRedshiftStep class @@ -28,9 +28,6 @@ def __init__(self, """ super(ExtractRedshiftStep, self).__init__(**kwargs) - if depends_on is not None: - self._depends_on = depends_on - # Create input node self._input_node = self.create_pipeline_object( object_class=RedshiftNode, @@ -40,16 +37,32 @@ def __init__(self, table_name=table, ) - self._output = 
self.create_s3_data_node() + self._output = self.create_s3_data_node( + self.get_output_s3_path(output_path)) self.create_pipeline_object( object_class=RedshiftCopyActivity, max_retries=self.max_retries, - input_node=self._input_node, - output_node=self._output, + input_node=self.input, + output_node=self.output, insert_mode=insert_mode, resource=self.resource, schedule=self.schedule, depends_on=self.depends_on, command_options=["DELIMITER '\t' ESCAPE"], ) + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + input_args = cls.pop_inputs(input_args) + step_args = cls.base_arguments_processor(etl, input_args) + step_args['redshift_database'] = etl.redshift_database + step_args['resource'] = etl.ec2_resource + + return step_args diff --git a/dataduct/steps/extract_s3.py b/dataduct/steps/extract_s3.py index a970cea..dbb9477 100644 --- a/dataduct/steps/extract_s3.py +++ b/dataduct/steps/extract_s3.py @@ -2,19 +2,47 @@ ETL step wrapper for creating an S3 node for input """ from .etl_step import ETLStep -from ..s3.s3_path import S3Path +from ..s3 import S3Path +from ..utils.helpers import exactly_one +from ..utils.exceptions import ETLInputError +from ..utils.helpers import get_modified_s3_path class ExtractS3Step(ETLStep): """ExtractS3 Step class that helps get data from S3 """ - def __init__(self, uri, **kwargs): + def __init__(self, directory_uri=None, file_uri=None, **kwargs): """Constructor for the ExtractS3Step class Args: - uri(str): s3 path for s3 data + directory_uri(str): s3 path for s3 data directory + file_uri(str): s3 path for s3 data file **kwargs(optional): Keyword arguments directly passed to base class """ + if not exactly_one(directory_uri, file_uri): + raise ETLInputError('One of file_uri or directory_uri needed') + super(ExtractS3Step, self).__init__(**kwargs) - self._output = self.create_s3_data_node(s3_object=S3Path(uri=uri)) + + if directory_uri: + directory_uri = get_modified_s3_path(directory_uri) + s3_path = S3Path(uri=directory_uri, is_directory=True) + else: + file_uri = get_modified_s3_path(file_uri) + s3_path = S3Path(uri=file_uri) + self._output = self.create_s3_data_node(s3_path) + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + input_args = cls.pop_inputs(input_args) + step_args = cls.base_arguments_processor(etl, input_args) + step_args.pop('resource') + + return step_args diff --git a/dataduct/steps/load_redshift.py b/dataduct/steps/load_redshift.py index 40b9b59..f76c8f5 100644 --- a/dataduct/steps/load_redshift.py +++ b/dataduct/steps/load_redshift.py @@ -2,8 +2,8 @@ ETL step wrapper for RedshiftCopyActivity to load data into Redshift """ from .etl_step import ETLStep -from ..pipeline.redshift_node import RedshiftNode -from ..pipeline.redshift_copy_activity import RedshiftCopyActivity +from ..pipeline import RedshiftNode +from ..pipeline import RedshiftCopyActivity class LoadRedshiftStep(ETLStep): @@ -17,7 +17,6 @@ def __init__(self, insert_mode="TRUNCATE", max_errors=None, replace_invalid_char=None, - depends_on=None, **kwargs): """Constructor for the LoadRedshiftStep class @@ -32,9 +31,6 @@ def __init__(self, """ 
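# Editorial note, not part of the original change: the remaining keyword # arguments (schedule, resource, input_node, max_retries, etc.) are passed # through unchanged to the ETLStep base class below.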
super(LoadRedshiftStep, self).__init__(**kwargs) -        if depends_on is not None: -            self._depends_on = depends_on - # Create output node self._output = self.create_pipeline_object( object_class=RedshiftNode, @@ -55,11 +51,25 @@ def __init__(self, self.create_pipeline_object( object_class=RedshiftCopyActivity, max_retries=self.max_retries, -            input_node=self._input_node, -            output_node=self._output, +            input_node=self.input, +            output_node=self.output, insert_mode=insert_mode, resource=self.resource, schedule=self.schedule, depends_on=self.depends_on, command_options=command_options, ) + +    @classmethod +    def arguments_processor(cls, etl, input_args): +        """Parse the step arguments according to the ETL pipeline + +        Args: +            etl(ETLPipeline): Pipeline object containing resources and steps +            step_args(dict): Dictionary of the step arguments for the class +        """ +        step_args = cls.base_arguments_processor(etl, input_args) +        step_args['redshift_database'] = etl.redshift_database +        step_args['resource'] = etl.ec2_resource + +        return step_args diff --git a/dataduct/steps/pipeline_dependencies.py b/dataduct/steps/pipeline_dependencies.py new file mode 100644 index 0000000..1f6303c --- /dev/null +++ b/dataduct/steps/pipeline_dependencies.py @@ -0,0 +1,87 @@ +""" +ETL step for pipeline dependencies using transform step +""" +import os + +from .transform import TransformStep +from ..utils import constants as const +from ..config import Config + +config = Config() +NAME_PREFIX = config.etl.get('NAME_PREFIX', '') +DEPENDENCY_OVERRIDE = config.etl.get('DEPENDENCY_OVERRIDE', False) + + +class PipelineDependenciesStep(TransformStep): +    """PipelineDependencies Step class that helps wait for other pipelines +    to finish +    """ + +    def __init__(self, +                 id, +                 dependent_pipelines=None, +                 refresh_rate=300, +                 start_date=None, +                 script_arguments=None, +                 **kwargs): +        """Constructor for the PipelineDependenciesStep class + +        Args: +            dependent_pipelines(list of str): list of pipeline names to wait for +            refresh_rate(int): time to wait between dependency checks +            script_arguments(list of str): list of arguments to the script +            **kwargs(optional): Keyword arguments directly passed to base class +        """ + +        if script_arguments is None: +            script_arguments = list() + +        if dependent_pipelines is None: +            raise ValueError('Must have some dependencies for dependency step') + +        if DEPENDENCY_OVERRIDE: +            command = 'ls' +            script = None +            script_arguments = None +        else: +            command = None +            if start_date is None: +                start_date = "#{format(@scheduledStartTime,'YYYY-MM-dd')}" + +            script_arguments.extend( +                [ +                    '--start_date=%s' % start_date, +                    '--refresh_rate=%s' % str(refresh_rate), +                    '--dependencies', +                ] +            ) +            script_arguments.extend([ +                pipeline if not NAME_PREFIX else NAME_PREFIX + '_' + pipeline +                for pipeline in dependent_pipelines +            ]) + +            steps_path = os.path.abspath(os.path.dirname(__file__)) +            script = os.path.join(steps_path, const.DEPENDENCY_SCRIPT_PATH) + +        super(PipelineDependenciesStep, self).__init__( +            id=id, +            script=script, +            command=command, +            script_arguments=script_arguments, +            no_output=True, +            **kwargs) + +        self._output = None + +    @classmethod +    def arguments_processor(cls, etl, input_args): +        """Parse the step arguments according to the ETL pipeline + +        Args: +            etl(ETLPipeline): Pipeline object containing resources and steps +            step_args(dict): Dictionary of the step arguments for the class +        """ +        input_args = cls.pop_inputs(input_args) +        step_args = cls.base_arguments_processor(etl, input_args) +        step_args['resource'] = etl.ec2_resource + +        return step_args
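+ +    # Editorial example, not part of the original change: with NAME_PREFIX set +    # to 'prod' in the etl config, a dependency listed as 'user_metrics' is +    # passed to the dependency script as 'prod_user_metrics'; without a +    # NAME_PREFIX the name is passed through unchanged.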
diff --git a/dataduct/steps/primary_key_check.py b/dataduct/steps/primary_key_check.py new file mode 100644 index 0000000..c0ce7c1 --- /dev/null +++ b/dataduct/steps/primary_key_check.py @@ -0,0 +1,45 @@ +""" +ETL step wrapper for PK check step can be executed on Ec2 resource +""" +import os + +from .qa_transform import QATransformStep +from ..database import Table +from ..database import SqlStatement +from ..config import Config +from ..utils import constants as const +from ..utils.helpers import parse_path + +config = Config() + + +class PrimaryKeyCheckStep(QATransformStep): +    """PrimaryKeyCheckStep class that checks a table for PK violations +    """ + +    def __init__(self, id, table_definition, script_arguments=None, +                 log_to_s3=False, **kwargs): +        """Constructor for the PrimaryKeyCheckStep class + +        Args: +            table_definition(file): table definition for the table to check +            **kwargs(optional): Keyword arguments directly passed to base class +        """ +        with open(parse_path(table_definition)) as f: +            table_def_string = f.read() + +        if script_arguments is None: +            script_arguments = list() + +        # We initialize the table object to check valid strings +        script_arguments.append( +            '--table=%s' % Table(SqlStatement(table_def_string)).sql()) + +        if log_to_s3: +            script_arguments.append('--log_to_s3') + +        steps_path = os.path.abspath(os.path.dirname(__file__)) +        script = os.path.join(steps_path, const.PK_CHECK_SCRIPT_PATH) + +        super(PrimaryKeyCheckStep, self).__init__( +            id=id, script=script, script_arguments=script_arguments, **kwargs) diff --git a/dataduct/steps/qa_transform.py b/dataduct/steps/qa_transform.py new file mode 100644 index 0000000..c9f7a52 --- /dev/null +++ b/dataduct/steps/qa_transform.py @@ -0,0 +1,57 @@ +""" +ETL step wrapper for QA step can be executed on Ec2 resource +""" +from .transform import TransformStep +from ..config import Config + +config = Config() + + +class QATransformStep(TransformStep): +    """QATransform Step class that helps run scripts on resources for QA checks +    """ + +    def __init__(self, +                 id, +                 pipeline_name, +                 script_arguments=None, +                 sns_topic_arn=None, +                 **kwargs): +        """Constructor for the QATransformStep class + +        Args: +            pipeline_name(str): name of the pipeline this step belongs to +            sns_topic_arn(str): sns topic arn for QA steps +            script_arguments(list of str): list of arguments to the script +            **kwargs(optional): Keyword arguments directly passed to base class +        """ + +        if sns_topic_arn is None: +            sns_topic_arn = config.etl.get('SNS_TOPIC_ARN_WARNING', None) + +        if script_arguments is None: +            script_arguments = list() + +        script_arguments.append('--test_name=%s' % (pipeline_name + "." + id))
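+        # Editorial example, not part of the original change: for a pipeline +        # named 'user_pipeline' and a step id 'check_users', the script above is +        # invoked with --test_name=user_pipeline.check_users.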
+        if sns_topic_arn: +            script_arguments.append('--sns_topic_arn=%s' % sns_topic_arn) + +        super(QATransformStep, self).__init__( +            id=id, +            script_arguments=script_arguments, +            no_output=True, +            **kwargs) + +    @classmethod +    def arguments_processor(cls, etl, input_args): +        """Parse the step arguments according to the ETL pipeline + +        Args: +            etl(ETLPipeline): Pipeline object containing resources and steps +            step_args(dict): Dictionary of the step arguments for the class +        """ +        input_args = cls.pop_inputs(input_args) +        step_args = cls.base_arguments_processor(etl, input_args) +        step_args['pipeline_name'] = etl.name +        step_args['resource'] = etl.ec2_resource + +        return step_args diff --git a/dataduct/steps/reload.py b/dataduct/steps/reload.py new file mode 100644 index 0000000..39281ee --- /dev/null +++ b/dataduct/steps/reload.py @@ -0,0 +1,27 @@ +"""ETL step wrapper for Reload SQL script +""" +from .upsert import UpsertStep + + +class ReloadStep(UpsertStep): +    """Reload Step class that reloads a table by deleting the existing data +    before the upsert +    """ + +    def __init__(self, **kwargs): +        """Constructor for the ReloadStep class + +        Args: +            **kwargs(optional): Keyword arguments directly passed to base class +        """ +        super(ReloadStep, self).__init__(**kwargs) + +    @classmethod +    def arguments_processor(cls, etl, input_args): +        """Parse the step arguments according to the ETL pipeline + +        Args: +            etl(ETLPipeline): Pipeline object containing resources and steps +            step_args(dict): Dictionary of the step arguments for the class +        """ +        input_args['delete_existing'] = True +        return super(ReloadStep, cls).arguments_processor(etl, input_args) diff --git a/dataduct/steps/scripts/column_check_test.py b/dataduct/steps/scripts/column_check_test.py new file mode 100644 index 0000000..3ea5264 --- /dev/null +++ b/dataduct/steps/scripts/column_check_test.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python + +"""Script that checks if the rows of the destination table have been populated +with the correct values +""" + +import argparse +import collections +import re +import pandas +import pandas.io.sql as pdsql +from dataduct.data_access import redshift_connection +from dataduct.data_access import rds_connection +from dataduct.qa import ColumnCheck + +pandas.options.display.max_colwidth = 1000 +pandas.options.display.max_rows = 1000 + + +def _get_source_data(sql, hostname, sample_size): +    """Gets the DataFrame containing all the rows of the table +    The DataFrame will be indexed by the table's primary key(s) + +    Args: +        sql(str): The SQL query to run against the source table +        hostname(str): Host name of the source database +        sample_size(int): Number of rows to sample (substituted for LIMIT_PLACEHOLDER) + +    Returns: +        DataFrame: The rows of the table +    """ +    connection = rds_connection(hostname) +    query = re.sub( +        r'(?i)LIMIT_PLACEHOLDER', +        str(sample_size), +        sql, +    ) + +    data = pdsql.read_sql(query, connection) +    connection.close() +    # All columns apart from last are PK columns +    return data.set_index(list(data.columns[:-1])) + + +def _get_destination_data(sql, primary_keys): +    """Gets the DataFrame containing all the rows of the table +    The DataFrame will be indexed by the table's primary key(s) + +    Args: +        sql(str): The SQL query to run against the destination table +        primary_keys(list): Primary key values used to filter the destination rows + +    Returns: +        DataFrame: The rows of the table +    """ +    connection = redshift_connection() + +    # Make primary_keys always a list of tuples +    if isinstance(primary_keys[0], basestring): +        primary_keys = [(pk) for pk in primary_keys] + +    # Check whether it is not iterable +    if not isinstance(primary_keys, collections.Iterable): +        primary_keys
= [tuple([pk]) for pk in primary_keys] + + # Format primary key string + primary_key_string = re.sub( + r",\)", + ")", + str(tuple(primary_keys)) + ) + + # If a key is Timestamp, the output string needs to be fixed. + # e.g., from Timestamp('2014-06-09 05:13:11') to '2014-06-09 05:13:11' + primary_key_string = re.sub(r"Timestamp\(([^,]*)[^)]*\)", r"\1", + primary_key_string) + + query = re.sub( + r'(?i)PRIMARY_KEY_SET', + primary_key_string, + sql, + ) + + print query + + data = pdsql.read_sql(query, connection) + connection.close() + # All columns apart from last are PK columns + return data.set_index(list(data.columns[:-1])) + + +def main(): + """Main function + + Args (taken in through argparse): + source_sql: SQL script of the source data + destination_sql: SQL script of the destination data + """ + parser = argparse.ArgumentParser() + + parser.add_argument('--source_sql', dest='source_sql', required=True) + parser.add_argument('--source_host', dest='source_host', required=True) + parser.add_argument('--destination_sql', dest='destination_sql', + required=True) + parser.add_argument('--sample_size', dest='sample_size', required=True) + parser.add_argument('--tolerance', type=float, dest='tolerance', + default=1.0) + parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) + parser.add_argument('--test_name', dest='test_name', + default='Check Column') + parser.add_argument('--log_to_s3', action='store_true', default=False) + parser.add_argument('--path_suffix', dest='path_suffix', default=None) + + args = parser.parse_args() + + # Open up a connection and read the source and destination tables + source_data = _get_source_data(args.source_sql, args.source_host, + args.sample_size) + print source_data.to_string().encode('utf-8') + + destination_data = _get_destination_data(args.destination_sql, + list(source_data.index)) + print destination_data.to_string().encode('utf-8') + + check = ColumnCheck(source_data, destination_data, + name=args.test_name, + sns_topic_arn=args.sns_topic_arn, + tolerance=args.tolerance) + + check.publish(args.log_to_s3, dest_sql=args.destination_sql, + path_suffix=args.path_suffix) + + +if __name__ == '__main__': + main() diff --git a/dataduct/steps/scripts/count_check_test.py b/dataduct/steps/scripts/count_check_test.py new file mode 100644 index 0000000..0d45409 --- /dev/null +++ b/dataduct/steps/scripts/count_check_test.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python + +"""Script that compares the number of rows in the source select script with the +number of rows in the destination table +""" + +import argparse +import pandas.io.sql as pdsql +from dataduct.data_access import redshift_connection +from dataduct.data_access import rds_connection +from dataduct.qa import CountCheck + + +def _get_source_data(sql, hostname): + """Gets the DataFrame containing all the rows of the table + The DataFrame will be indexed by the table's primary key(s) + + Args: + sql(str): The table definition representing the table to query + connection(Connection): A connection to the database + + Returns: + DataFrame: The rows of the table + """ + connection = rds_connection(hostname) + data = pdsql.read_sql(sql, connection) + connection.close() + return data.iloc[0][0] + + +def _get_destination_data(sql): + """Gets the DataFrame containing all the rows of the table + The DataFrame will be indexed by the table's primary key(s) + + Args: + sql(str): The table definition representing the table to query + connection(Connection): A connection to the database + + Returns: + 
DataFrame: The rows of the table + """ + connection = redshift_connection() + data = pdsql.read_sql(sql, connection) + connection.close() + # All columns apart from last are PK columns + return data.iloc[0][0] + + +def main(): + """Main function + + Args (taken in through argparse): + source_sql: SQL script of the source data + destination_sql: SQL script of the destination data + """ + parser = argparse.ArgumentParser() + + parser.add_argument('--source_sql', dest='source_sql', required=True) + parser.add_argument('--source_host', dest='source_host', required=True) + parser.add_argument('--destination_sql', dest='destination_sql', + required=True) + parser.add_argument('--tolerance', type=float, dest='tolerance', + default=1.0) + parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) + parser.add_argument('--test_name', dest='test_name', + default='Check Count') + parser.add_argument('--log_to_s3', action='store_true', default=False) + parser.add_argument('--path_suffix', dest='path_suffix', default=None) + + args = parser.parse_args() + + source_count = _get_source_data(args.source_sql, args.source_host) + destination_count = _get_destination_data(args.destination_sql) + + check = CountCheck(source_count, destination_count, + name=args.test_name, + sns_topic_arn=args.sns_topic_arn, + tolerance=args.tolerance) + + check.publish(args.log_to_s3, dest_sql=args.destination_sql, + path_suffix=args.path_suffix) + + +if __name__ == '__main__': + main() diff --git a/dataduct/steps/scripts/create_load_redshift_runner.py b/dataduct/steps/scripts/create_load_redshift_runner.py new file mode 100644 index 0000000..7c1ab1a --- /dev/null +++ b/dataduct/steps/scripts/create_load_redshift_runner.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python + +"""Replacement for the load step to use the redshift COPY command instead +""" + +import argparse +import pandas.io.sql as pdsql +from dataduct.config import get_aws_credentials +from dataduct.data_access import redshift_connection +from dataduct.database import SqlStatement +from dataduct.database import Table + + +def load_redshift(table, input_paths, max_error=0, + replace_invalid_char=None, no_escape=False, gzip=False, + command_options=None): + """Load redshift table with the data in the input s3 paths + """ + table_name = table.full_name + print 'Loading data into %s' % table_name + + # Credentials string + aws_key, aws_secret, token = get_aws_credentials() + creds = 'aws_access_key_id=%s;aws_secret_access_key=%s' % ( + aws_key, aws_secret) + if token: + creds += ';token=%s' % token + + delete_statement = 'DELETE FROM %s;' % table_name + error_string = 'MAXERROR %d' % max_error if max_error > 0 else '' + if replace_invalid_char is not None: + invalid_char_str = "ACCEPTINVCHARS AS %s" % replace_invalid_char + else: + invalid_char_str = '' + + query = [delete_statement] + + template = \ + "COPY {table} FROM '{path}' WITH CREDENTIALS AS '{creds}' {options};" + + for input_path in input_paths: + if not command_options: + command_options = ( + "DELIMITER '\t' {escape} {gzip} NULL AS 'NULL' TRUNCATECOLUMNS " + "{max_error} {invalid_char_str};" + ).format(escape='ESCAPE' if not no_escape else '', + gzip='GZIP' if gzip else '', + max_error=error_string, + invalid_char_str=invalid_char_str) + + statement = template.format(table=table_name, + path=input_path, + creds=creds, + options=command_options) + query.append(statement) + + return ' '.join(query) + + +def main(): + """Main Function + """ + parser = argparse.ArgumentParser() + 
parser.add_argument('--table_definition', dest='table_definition', + required=True) + parser.add_argument('--max_error', dest='max_error', default=0, type=int) + parser.add_argument('--replace_invalid_char', dest='replace_invalid_char', + default=None) + parser.add_argument('--no_escape', action='store_true', default=False) + parser.add_argument('--gzip', action='store_true', default=False) + parser.add_argument('--command_options', dest='command_options', default=None) + parser.add_argument('--s3_input_paths', dest='input_paths', nargs='+') + args = parser.parse_args() + print args + + table = Table(SqlStatement(args.table_definition)) + connection = redshift_connection() + table_not_exists = pdsql.read_sql(table.check_not_exists_script().sql(), + connection).loc[0][0] + + cursor = connection.cursor() + # Create table in redshift, this is safe due to the if exists condition + if table_not_exists: + cursor.execute(table.create_script().sql()) + + # Load data into redshift + load_query = load_redshift(table, args.input_paths, args.max_error, + args.replace_invalid_char, args.no_escape, + args.gzip, args.command_options) + cursor.execute(load_query) + cursor.execute('COMMIT') + cursor.close() + connection.close() + + +if __name__ == '__main__': + main() diff --git a/dataduct/steps/scripts/pipeline_dependency_check.py b/dataduct/steps/scripts/pipeline_dependency_check.py new file mode 100644 index 0000000..704adc8 --- /dev/null +++ b/dataduct/steps/scripts/pipeline_dependency_check.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +""" +Allows pipeline to have dependencies with other pipelines + +Expected behaviour of dependency step: + +1) If pipeline X does not depend on anything (dependency list is empty ""), + then the transform step should exit safely (sys.exit) + +2) Assume pipeline X depends on Y. If Y does not exist, then throw an + exception saying "Pipeline Y not found". + +3) Assume pipeline X depends on Y. If pipeline Y just sleeps for 10 minutes, + then pipeline X should not finish until after Y finishes in 10 minutes. + +4) Assume pipeline X depends on Y. Pipeline Y exists but no instances of Y ran + today. Pipeline X should throw an exception saying "Y does not exist today". + +5) Assume pipeline X depends on Y. Pipeline Y was "CANCELED"/"CANCELLED" today. + Pipeline X should throw exception saying "Bad status" + +6) Assume pipeline X depends on Y. Pipeline Y was "TIMEDOUT" today. Pipeline X + should throw exception saying "Bad status" + +7) Assume pipeline X depends on Y. Pipeline Y was "FAILED" today. Pipeline X + should throw exception saying "Bad status" + +8) Assume pipeline X depends on Y. Pipeline Y was "CASCADE_FAILED" today. 
+ Pipeline X should throw exception saying "Bad status" +""" + +import argparse +import sys +import time +from datetime import datetime + +from dataduct.pipeline.utils import list_pipelines +from dataduct.pipeline.utils import list_pipeline_instances + + +# Docs and API spelling of "CANCELED" don't match +FAILED_STATUSES = set(['CANCELED', 'CANCELLED', 'TIMEDOUT', 'FAILED', + 'CASCADE_FAILED']) + +# Pipeline attributes +STATUS = '@status' +START_TIME = '@scheduledStartTime' +FINISHED = 'FINISHED' + + +def check_dependencies_ready(dependencies, start_date): + """Checks if every dependent pipeline has completed + + Args: + dependencies(list of str): list of pipeline name that it depends on + start_date(str): string representing the start date of the pipeline + """ + + print 'Checking dependency at ', str(datetime.now()) + + dependency_ready = True + + # Convert date string to datetime object + start_date = datetime.strptime(start_date, '%Y-%m-%d') + + for pipeline in dependencies: + # Get instances of each pipeline + instances = list_pipeline_instances(pipeline) + + # Collect all pipeline instances that are scheduled for today + instances_today = [] + for instance in instances: + date = datetime.strptime(instance[START_TIME], '%Y-%m-%dT%H:%M:%S') + if date.date() == start_date.date(): + instances_today.append(instance) + + # Dependency pipeline has not started from today + if not instances_today: + dependency_ready = False + + for instance in instances_today: + # One of the dependency failed/cancelled + if instance[STATUS] in FAILED_STATUSES: + raise Exception( + 'Pipeline %s has bad status: %s' + % (pipeline, instance[STATUS]) + ) + # Dependency is still running + elif instance[STATUS] != FINISHED: + dependency_ready = False + + # All dependencies are done + return dependency_ready + + +def main(): + """ + Main Function + """ + parser = argparse.ArgumentParser() + parser.add_argument( + '--dependencies', type=str, nargs='+', default=None) + parser.add_argument('--refresh_rate', dest='refresh_rate', default='900') + parser.add_argument('--start_date', dest='start_date') + + args = parser.parse_args() + + # Exit if there are no dependencies + if not args.dependencies: + sys.exit() + + # Create mapping from pipeline name to id + pipeline_name_to_id = dict( + (pipeline['name'], pipeline['id']) for pipeline in list_pipelines() + ) + + # Remove whitespace from dependency list + dependencies = map(str.strip, args.dependencies) + + # Check if all dependencies are valid pipelines + for dependency in dependencies: + if dependency not in pipeline_name_to_id: + raise Exception('Pipeline not found: %s.' % dependency) + + # Map from pipeline object to pipeline ID + dependencies = [pipeline_name_to_id[dependency] + for dependency in dependencies] + + print 'Start checking for dependencies' + start_time = datetime.now() + + # Loop until all dependent pipelines have finished + while not check_dependencies_ready(dependencies, args.start_date): + print 'checking' + time.sleep(float(args.refresh_rate)) + + print 'Finished checking for dependencies. 
Total time spent: ', + print (datetime.now() - start_time).total_seconds(), ' seconds' + + +if __name__ == '__main__': + main() diff --git a/dataduct/steps/scripts/primary_key_test.py b/dataduct/steps/scripts/primary_key_test.py new file mode 100644 index 0000000..58f362a --- /dev/null +++ b/dataduct/steps/scripts/primary_key_test.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python + +"""Script that checks for primary key violations on the input table +""" + +import argparse +import pandas.io.sql as pdsql +from dataduct.data_access import redshift_connection +from dataduct.database import SqlScript +from dataduct.database import Table +from dataduct.qa import PrimaryKeyCheck + + +def main(): + """Main function + """ + parser = argparse.ArgumentParser() + + parser.add_argument('--table', dest='table', required=True) + parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) + parser.add_argument('--test_name', dest='test_name', + default="Check Primary Key") + parser.add_argument('--log_to_s3', action='store_true', default=False) + parser.add_argument('--path_suffix', dest='path_suffix', default=None) + + args = parser.parse_args() + + connection = redshift_connection() + table = Table(SqlScript(args.table)) + result = pdsql.read_sql(table.select_duplicates_script().sql(), connection) + check = PrimaryKeyCheck(len(result), name=args.test_name, + sns_topic_arn=args.sns_topic_arn) + check.publish(args.log_to_s3, table=table.full_name, + path_suffix=args.path_suffix) + connection.close() + + +if __name__ == '__main__': + main() diff --git a/dataduct/steps/scripts/script_runner.py b/dataduct/steps/scripts/script_runner.py new file mode 100644 index 0000000..d87864f --- /dev/null +++ b/dataduct/steps/scripts/script_runner.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python +""" +This script initiates the different calls needed when running +a transform step with the script_directory argument +""" + +# imports +import argparse +import os +import subprocess + + +def run_command(arguments): + """ + Args: + arguments(list of str): Arguments to be executed as a command. 
Arguments + are passed as if calling subprocess.call() directly + """ + return subprocess.call(arguments) + + +def main(): + """ + Parses the command line arguments and runs the suitable functions + """ + parser = argparse.ArgumentParser() + # Environment variable for the source directory + parser.add_argument('--INPUT_SRC_ENV_VAR', dest='input_src_env_var') + + # Argument for script name + parser.add_argument('--SCRIPT_NAME', dest='script_name') + args, ext_script_args = parser.parse_known_args() + + # Check if the source directory exists + input_src_dir = os.getenv(args.input_src_env_var) + if not os.path.exists(input_src_dir): + raise Exception(input_src_dir + " does not exist") + + run_command(['ls', '-l', input_src_dir]) + run_command(['chmod', '-R', '+x', input_src_dir]) + run_command(['ls', '-l', input_src_dir]) + + input_file = os.path.join(input_src_dir, args.script_name) + result = run_command([input_file] + ext_script_args) + if result != 0: + raise Exception("Script failed.") + + +if __name__ == '__main__': + main() diff --git a/dataduct/steps/scripts/sql_runner.py b/dataduct/steps/scripts/sql_runner.py new file mode 100644 index 0000000..ec9f749 --- /dev/null +++ b/dataduct/steps/scripts/sql_runner.py @@ -0,0 +1,59 @@ +#!/usr/bin/env python +"""Runner for the upsert SQL step +""" +import argparse +import pandas.io.sql as pdsql +from dataduct.data_access import redshift_connection +from dataduct.database import SqlStatement +from dataduct.database import Table + + +def main(): + """Main Function + """ + parser = argparse.ArgumentParser() + parser.add_argument('--table_definition', dest='table_definition', + required=True) + parser.add_argument('--sql', dest='sql', required=True) + parser.add_argument('--analyze', action='store_true', default=False) + parser.add_argument('--non_transactional', action='store_true', + default=False) + + args, sql_arguments = parser.parse_known_args() + print args, sql_arguments + + table = Table(SqlStatement(args.table_definition)) + connection = redshift_connection() + # Enable autocommit for non transactional sql execution + if args.non_transactional: + connection.autocommit = True + + table_not_exists = pdsql.read_sql(table.check_not_exists_script().sql(), + connection).loc[0][0] + + cursor = connection.cursor() + # Create table in redshift, this is safe due to the if exists condition + if table_not_exists: + cursor.execute(table.create_script().sql()) + + # Load data into redshift with upsert query + # If there are sql_arguments, place them along with the query + # Otherwise, don't include them to avoid having to use %% everytime + if len(sql_arguments) >= 1: + print cursor.mogrify(args.sql, tuple(sql_arguments)) + cursor.execute(args.sql, tuple(sql_arguments)) + else: + print args.sql + cursor.execute(args.sql) + cursor.execute('COMMIT') + + # Analyze the table + if args.analyze: + cursor.execute(table.analyze_script().sql()) + + cursor.close() + connection.close() + + +if __name__ == '__main__': + main() diff --git a/dataduct/steps/sql_command.py b/dataduct/steps/sql_command.py index 11169d4..7e8d918 100644 --- a/dataduct/steps/sql_command.py +++ b/dataduct/steps/sql_command.py @@ -2,11 +2,16 @@ ETL step wrapper for SqlActivity can be executed on Ec2 """ from .etl_step import ETLStep -from ..pipeline.sql_activity import SqlActivity -from ..s3.s3_file import S3File +from ..pipeline import SqlActivity +from ..database import SqlScript +from ..s3 import S3File from ..utils.helpers import exactly_one +from ..utils.helpers import parse_path from 
..utils.exceptions import ETLInputError +import logging +logger = logging.getLogger(__name__) + class SqlCommandStep(ETLStep): """SQL Command Step class that helps run scripts on resouces @@ -17,8 +22,9 @@ def __init__(self, script=None, script_arguments=None, queue=None, + sql_script=None, command=None, - depends_on=None, + wrap_transaction=True, **kwargs): """Constructor for the SqlCommandStep class @@ -30,19 +36,27 @@ def __init__(self, redshift_database(RedshiftDatabase): database to excute the query **kwargs(optional): Keyword arguments directly passed to base class """ - if not exactly_one(command, script): - raise ETLInputError('Both command or script found') + if not exactly_one(command, script, sql_script): + raise ETLInputError('Both command and script found') - super(SqlCommandStep, self).__init__(**kwargs) + if sql_script is not None and not isinstance(sql_script, SqlScript): + raise ETLInputError('sql_script should be of the type SqlScript') - if depends_on is not None: - self._depends_on = depends_on + super(SqlCommandStep, self).__init__(**kwargs) # Create S3File with script / command provided if script: - script = self.create_script(S3File(path=script)) - else: - script = self.create_script(S3File(text=command)) + sql_script = SqlScript(filename=parse_path(script)) + elif command: + sql_script = SqlScript(command) + + if wrap_transaction: + sql_script = sql_script.wrap_transaction() + + script = self.create_script(S3File(text=sql_script.sql())) + + logger.debug('Sql Query:') + logger.debug(sql_script) self.create_pipeline_object( object_class=SqlActivity, @@ -55,3 +69,17 @@ def __init__(self, script=script, queue=queue, ) + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + input_args = cls.pop_inputs(input_args) + step_args = cls.base_arguments_processor(etl, input_args) + step_args['redshift_database'] = etl.redshift_database + step_args['resource'] = etl.ec2_resource + return step_args diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index 6d01c54..c8e7c18 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -1,11 +1,23 @@ """ ETL step wrapper for shell command activity can be executed on Ec2 / EMR """ +import os + from .etl_step import ETLStep -from ..pipeline.shell_command_activity import ShellCommandActivity -from ..s3.s3_file import S3File +from ..pipeline import ShellCommandActivity +from ..pipeline import S3Node +from ..s3 import S3File +from ..s3 import S3Directory from ..utils.helpers import exactly_one +from ..utils.helpers import get_modified_s3_path from ..utils.exceptions import ETLInputError +from ..utils import constants as const + +import logging +logger = logging.getLogger(__name__) + +SCRIPT_ARGUMENT_TYPE_STRING = 'string' +SCRIPT_ARGUMENT_TYPE_SQL = 'sql' class TransformStep(ETLStep): @@ -15,43 +27,89 @@ class TransformStep(ETLStep): def __init__(self, command=None, script=None, - output=None, + script_directory=None, + script_name=None, + output_node=None, script_arguments=None, additional_s3_files=None, - depends_on=None, + output_path=None, + no_output=False, **kwargs): """Constructor for the TransformStep class Args: command(str): command to be executed directly script(path): local path to the script that should executed - output(dict): output data nodes from the transform + 
script_directory(path): local path to the script directory + script_name(str): script to be executed in the directory + output_node(dict): output data nodes from the transform script_arguments(list of str): list of arguments to the script additional_s3_files(list of S3File): additional files used **kwargs(optional): Keyword arguments directly passed to base class """ - if not exactly_one(command, script): - raise ETLInputError('Both command or script found') - super(TransformStep, self).__init__(**kwargs) - if depends_on is not None: - self._depends_on = depends_on + if not exactly_one(command, script, script_directory): + raise ETLInputError( + 'Only one of script, command and directory allowed') + + base_output_node = None + if not no_output: + # Create output_node based on output_path + base_output_node = self.create_s3_data_node( + self.get_output_s3_path(get_modified_s3_path(output_path))) - # Create output_node if not provided - if self._output is None: - output_node = self.create_s3_data_node() + script_arguments = self.translate_arguments(script_arguments) + + if self.input: + input_nodes = [self.input] else: - output_node = self._output + input_nodes = list() + + if script_directory: + # The script to be run with the directory + if script_name is None: + raise ETLInputError('script_name required with directory') + + script_directory = self.create_script( + S3Directory(path=script_directory)) + + # Input node for the source code in the directory + input_nodes.append(self.create_pipeline_object( + object_class=S3Node, + schedule=self.schedule, + s3_object=script_directory + )) + + # We need to create an additional script that later calls the main + # script as we need to change permissions of the input directory + ip_src_env = 'INPUT%d_STAGING_DIR' % (1 if not self.input else 2) + additional_args = ['--INPUT_SRC_ENV_VAR=%s' % ip_src_env, + '--SCRIPT_NAME=%s' % script_name] + + script_arguments = additional_args + script_arguments + + steps_path = os.path.abspath(os.path.dirname(__file__)) + script = os.path.join(steps_path, const.SCRIPT_RUNNER_PATH) # Create S3File if script path provided if script: script = self.create_script(S3File(path=script)) + # Translate output nodes if output map provided + if output_node: + self._output = self.create_output_nodes( + base_output_node, output_node) + else: + self._output = base_output_node + + logger.debug('Script Arguments:') + logger.debug(script_arguments) + self.create_pipeline_object( object_class=ShellCommandActivity, - input_node=self._input_node, - output_node=output_node, + input_node=input_nodes, + output_node=base_output_node, resource=self.resource, schedule=self.schedule, script_uri=script, @@ -62,9 +120,57 @@ def __init__(self, additional_s3_files=additional_s3_files, ) - # Translate output nodes if output map provided - if self._output is None: - if output: - self._output = self.create_output_nodes(output_node, output) - else: - self._output = output_node + def translate_arguments(self, script_arguments): + """Translate script argument to lists + + Args: + script_arguments(list of str/dict): arguments to the script + + Note: + Dict: (k -> v) is turned into an argument "--k=v" + List: Either pure strings or dictionaries with name, type and value + """ + if script_arguments is None: + return script_arguments + + elif isinstance(script_arguments, list): + result = list() + for argument in script_arguments: + if isinstance(argument, dict): + result.extend([self.input_format(key, value) + for key, value in argument.iteritems()]) + 
else: + result.append(str(argument)) + return result + + elif isinstance(script_arguments, dict): + return [self.input_format(key, value) + for key, value in script_arguments.iteritems()] + + elif isinstance(script_arguments, str): + return [script_arguments] + + else: + raise ETLInputError('Script Arguments for unrecognized type') + + @staticmethod + def input_format(key, value): + """Format the key and value to command line arguments + """ + return ''.join('--', key, '=', value) + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + step_args = cls.base_arguments_processor(etl, input_args) + if step_args.pop('resource_type', None) == const.EMR_CLUSTER_STR: + step_args['resource'] = etl.emr_cluster + else: + step_args['resource'] = etl.ec2_resource + + return step_args diff --git a/dataduct/steps/upsert.py b/dataduct/steps/upsert.py new file mode 100644 index 0000000..902bc5a --- /dev/null +++ b/dataduct/steps/upsert.py @@ -0,0 +1,46 @@ +"""ETL step wrapper for Upsert SQL script +""" +from .create_update_sql import CreateUpdateSqlStep +from ..database import Table +from ..database import SqlScript +from ..database import SelectStatement +from ..database import HistoryTable +from ..utils.helpers import parse_path +from ..utils.helpers import exactly_one + + +class UpsertStep(CreateUpdateSqlStep): + """Upsert Step class that helps run a step on the emr cluster + """ + + def __init__(self, destination, sql=None, script=None, source=None, + enforce_primary_key=True, delete_existing=False, history=None, + analyze_table=True, **kwargs): + """Constructor for the UpsertStep class + + Args: + **kwargs(optional): Keyword arguments directly passed to base class + """ + assert exactly_one(sql, source, script), 'One of sql/source/script' + + # Input formatting + dest = Table(SqlScript(filename=parse_path(destination))) + + if source is not None: + source_relation = Table(SqlScript(filename=parse_path(source))) + else: + source_relation = SelectStatement( + SqlScript(sql=sql, filename=parse_path(script)).sql()) + + # Create the destination table if doesn't exist + sql_script = dest.upsert_script(source_relation, enforce_primary_key, + delete_existing) + + if history: + hist = HistoryTable(SqlScript( + filename=parse_path(history))) + sql_script.append(hist.update_history_script(dest)) + + super(UpsertStep, self).__init__( + table_definition=destination, command=sql_script.sql(), + analyze_table=analyze_table, **kwargs) diff --git a/dataduct/tests/__init__.py b/dataduct/tests/__init__.py index e69de29..8b13789 100644 --- a/dataduct/tests/__init__.py +++ b/dataduct/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/dataduct/tests/test_definition_parser.py b/dataduct/tests/test_definition_parser.py deleted file mode 100644 index c9ec414..0000000 --- a/dataduct/tests/test_definition_parser.py +++ /dev/null @@ -1,19 +0,0 @@ -#!/usr/bin/env python -""" -Tests for the definition parser functions -""" -import unittest - -class DefitionParserTests(unittest.TestCase): - """Tests for the definition parser. 
- """ - - def setUp(self): - """Fixtures for the definition test - """ - pass - - def test_yaml_extention(self): - """Test if the pipeline extention is yaml - """ - pass diff --git a/dataduct/tests/test_import.py b/dataduct/tests/test_import.py new file mode 100644 index 0000000..c355164 --- /dev/null +++ b/dataduct/tests/test_import.py @@ -0,0 +1,85 @@ +"""Tests for dependencies +""" +from unittest import TestCase + + +class TestImports(TestCase): + """Tests for dependencies + """ + @staticmethod + def test_boto(): + """Testing boto + """ + print 'Trying to import boto' + import boto + + @staticmethod + def test_mysqldb(): + """Testing MySQLdb + """ + print 'Trying to import MySQLdb' + import MySQLdb + + @staticmethod + def test_pandas(): + """Testing pandas + """ + print 'Trying to import pandas' + import pandas + print pandas.io.sql + + @staticmethod + def test_psycopg2(): + """Testing psycopg2 + """ + print 'Trying to import psycopg2' + import psycopg2 + + @staticmethod + def test_pygraphviz(): + """Testing pygraphviz + """ + print 'Trying to import pygraphviz' + import pygraphviz + + @staticmethod + def test_pyparsing(): + """Testing pyparsing + """ + print 'Trying to import pyparsing' + import pyparsing + + @staticmethod + def test_pyyaml(): + """Testing PyYAML + """ + print 'Trying to import pyyaml' + import yaml + + @staticmethod + def test_setuptools(): + """Testing setuptools + """ + print 'Trying to import setuptools' + import setuptools + + @staticmethod + def test_sphinx_rtd_theme(): + """Testing sphinx_rtd_theme + """ + print 'Trying to import sphinx_rtd_theme' + import sphinx_rtd_theme + + @staticmethod + def test_testfixtures(): + """Testing testfixtures + """ + print 'Trying to import testfixtures' + import testfixtures + + @staticmethod + def test_pytimeparse(): + """Testing pytimeparse + """ + print 'Trying to import pytimeparse' + import pytimeparse diff --git a/dataduct/utils/cli.py b/dataduct/utils/cli.py new file mode 100644 index 0000000..f542fb2 --- /dev/null +++ b/dataduct/utils/cli.py @@ -0,0 +1,152 @@ +"""Helper function for CLI scripts +""" +from argparse import ArgumentParser +from argparse import RawTextHelpFormatter +import argparse + + +def config_singleton_setup(args): + """Setup the config singleton based on the mode in args + + Note: + To instantiate the singleton object with the correct state as this is + the single entry point to the library. We can use the __new__ function + to set the debug_level + + We import inside the function as the singleton declaration should be + done here and at no other entry point. The same pattern is followed + at all the entry point scripts. 
+ """ + mode = args.mode if hasattr(args, 'mode') else None + + import logging + logger = logging.getLogger(__name__) + + from dataduct.config import Config + from dataduct.config import logger_configuration + + config = Config(mode=mode) + + # Setup up logging for package + logger_configuration() + + if mode is not None: + logger.warning('Running in %s mode', config.mode) + return config + + +class DataductHelpAction(argparse._HelpAction): + """HelpAction class used to render a custom help message + """ + def __call__(self, parser, namespace, values, option_string=None): + parser.print_help() + print '' + + # Retrieve subparsers from parser + subparsers_actions = [ + action for action in parser._actions + if isinstance(action, argparse._SubParsersAction)] + + for subparsers_action in subparsers_actions: + # get all subparsers and print help + for choice, subparser in subparsers_action.choices.items(): + print "Command '{}'".format(choice) + print subparser.format_usage() + parser.exit() + + +# Change the width of the output format +formatter_class = lambda prog: RawTextHelpFormatter(prog, max_help_position=50) + + +# Help parser for parsing subparsers in help +help_parser = ArgumentParser( + description='Run Dataduct commands', + add_help=False, + formatter_class=formatter_class, +) +help_parser.add_argument( + '-h', + '--help', + action=DataductHelpAction, + help='Help message', +) + +# Mode parser shared across all pipeline subparsers +mode_help = 'Mode or config overrides to use for the commands' +mode_parser = ArgumentParser( + description=mode_help, + add_help=False, +) +mode_parser.add_argument( + '-m', + '--mode', + default=None, + help=mode_help +) + +# Options parser shared actions all pipeline run options +pipeline_run_options = ArgumentParser( + description='Specify actions related to running pipelines', + add_help=False +) +pipeline_run_options.add_argument( + '-f', + '--force', + action='store_true', + default=False, + help='Destroy previous versions of this pipeline, if they exist', +) +pipeline_run_options.add_argument( + '-t', + '--time_delta', + default='0h', + help='Timedelta the pipeline by x time difference', +) +pipeline_run_options.add_argument( + '-b', + '--backfill', + action='store_true', + default=False, + help='Indicates that the timedelta supplied is for a backfill', +) +pipeline_run_options.add_argument( + '--frequency', + default=None, + help='Frequency override for the pipeline', +) + +# Pipeline definitions parser +pipeline_definition_help = 'Paths of the pipeline definitions' +pipeline_definition_parser = ArgumentParser( + description=pipeline_definition_help, + add_help=False, +) +pipeline_definition_parser.add_argument( + 'pipeline_definitions', + nargs='+', + help=pipeline_definition_help, +) + +# Table definitions parser +table_definition_help = 'Paths of the table definitions' +table_definition_parser = ArgumentParser( + description=table_definition_help, + add_help=False, +) +table_definition_parser.add_argument( + 'table_definitions', + nargs='+', + help=table_definition_help, +) + +# Filepath input parser +filepath_help = 'Filepath input for storing output of actions' +file_parser = ArgumentParser( + description=filepath_help, + add_help=False, +) +file_parser.add_argument( + dest='filename', + help='Filename to store output of commands', +) diff --git a/dataduct/utils/constants.py b/dataduct/utils/constants.py new file mode 100644 index 0000000..e19bd95 --- /dev/null +++ b/dataduct/utils/constants.py @@ -0,0 +1,39 @@ +"""Constants shared across 
dataduct +""" +import os + +# Constants +ZERO = 0 +ONE = 1 +NONE = None +EMPTY_STR = '' +NULL_STR = 'NULL' +DEFAULT_DELAY = '10 Minutes' +DEFAULT_TIMEOUT = '6 Hours' + +# ETL Constants +EMR_CLUSTER_STR = 'emr' +EC2_RESOURCE_STR = 'ec2' +M1_LARGE = 'm1.large' + +LOG_STR = 'logs' +DATA_STR = 'data' +SRC_STR = 'src' +QA_STR = 'qa' + +# Step paths +SCRIPTS_DIRECTORY = 'scripts' +SCRIPT_RUNNER_PATH = os.path.join( + SCRIPTS_DIRECTORY, 'script_runner.py') +DEPENDENCY_SCRIPT_PATH = os.path.join( + SCRIPTS_DIRECTORY, 'pipeline_dependency_check.py') +PK_CHECK_SCRIPT_PATH = os.path.join( + SCRIPTS_DIRECTORY, 'primary_key_test.py') +COUNT_CHECK_SCRIPT_PATH = os.path.join( + SCRIPTS_DIRECTORY, 'count_check_test.py') +COLUMN_CHECK_SCRIPT_PATH = os.path.join( + SCRIPTS_DIRECTORY, 'column_check_test.py') +CREATE_LOAD_SCRIPT_PATH = os.path.join( + SCRIPTS_DIRECTORY, 'create_load_redshift_runner.py') +SQL_RUNNER_SCRIPT_PATH = os.path.join( + SCRIPTS_DIRECTORY, 'sql_runner.py') diff --git a/dataduct/utils/exceptions.py b/dataduct/utils/exceptions.py index 9c59ebd..1bb122c 100644 --- a/dataduct/utils/exceptions.py +++ b/dataduct/utils/exceptions.py @@ -1,20 +1,8 @@ +"""Exceptions for dataduct """ -Exceptions for etl_lib -""" - - -class ETLInputError(Exception): - """Error raised when function input is incorrect. - Args: - msg (str): Human readable string describing the exception. - code (int, optional): Error code, defaults to 2. +class ETLInputError(Exception): pass - Attributes: - msg (str): Human readable string describing the exception. - code (int): Exception error code. +class ETLConfigError(Exception): pass - """ - def __init__(self, msg, code=2): - self.msg = msg - self.code = code +class DatabaseInputError(Exception): pass diff --git a/dataduct/utils/helpers.py b/dataduct/utils/helpers.py index 9e418c1..c128ca6 100644 --- a/dataduct/utils/helpers.py +++ b/dataduct/utils/helpers.py @@ -1,6 +1,34 @@ """ Shared utility functions """ +import time +import math +import os +from sys import stderr + +from ..config import Config + +RESOURCE_BASE_PATH = 'RESOURCE_BASE_PATH' +CUSTOM_STEPS_PATH = 'CUSTOM_STEPS_PATH' + + +def atmost_one(*args): + """Asserts one of the arguments is not None + + Returns: + result(bool): True if exactly one of the arguments is not None + """ + return sum([1 for a in args if a is not None]) <= 1 + + +def atleast_one(*args): + """Asserts one of the arguments is not None + + Returns: + result(bool): True if atleast one of the arguments is not None + """ + return sum([1 for a in args if a is not None]) >= 1 + def exactly_one(*args): """Asserts one of the arguments is not None @@ -9,3 +37,121 @@ def exactly_one(*args): result(bool): True if exactly one of the arguments is not None """ return sum([1 for a in args if a is not None]) == 1 + + +def retry(tries, delay=3, backoff=2): + """Retries a function or method until it succedes + + Note: + This assume the function succeded if no exception was thrown + + Args: + tries(int): Number of attempts of the function. 
Must be >= 0 + delay(int): Initial delay in seconds, should be > 0 + backoff(int): Factor by which delay should increase between attempts + """ + + if backoff <= 1: + raise ValueError('backoff must be greater than 1') + + tries = math.floor(tries) + if tries < 0: + raise ValueError('tries must be 0 or greater') + + if delay <= 0: + raise ValueError('delay must be greater than 0') + + def deco_retry(f): + """Decorator for retries""" + + def function_attempt(f, *args, **kwargs): + """ + Single attempt of the function + """ + template = 'Attempt failed with Exception: \n{0}: {1}\n' + try: + r_value = f(*args, **kwargs) # first attempt + r_status = True + except Exception as exp: + stderr.write(template.format(type(exp).__name__, exp)) + r_value = exp + r_status = False + + return r_value, r_status + + def f_retry(*args, **kwargs): + """True decorator""" + m_tries, m_delay = tries, delay # make mutable + + r_value, r_status = function_attempt(f, *args, **kwargs) + + while m_tries > 0: + + # Done on success + if r_status is True: + return r_value + + m_tries -= 1 # consume an attempt + time.sleep(m_delay) # wait... + m_delay *= backoff # make future wait longer + + # Try again + r_value, r_status = function_attempt(f, *args, **kwargs) + + if r_status is True: + return r_value + else: + raise r_value + + # true decorator -> decorated function + return f_retry + + # @retry(arg[, ...]) -> true decorator + return deco_retry + + +def parse_path(path, path_type=RESOURCE_BASE_PATH): + """Change the resource paths for files and directory based on params + + If the path is None, the function returns None. + Else if the path is an absolute path then return the path as is. + Else if the path is a relative path and resource_base_path is declared then + assume the path is relative to the resource_base_path + Else return the path as is. + + Args: + path(str): path specified in the YAML file + """ + # If path is None or absolute + if path is None or os.path.isabs(path): + return path + + # Try relative path to specified config + config = Config() + if path_type == RESOURCE_BASE_PATH: + if RESOURCE_BASE_PATH in config.etl: + return os.path.join( + os.path.expanduser(config.etl[RESOURCE_BASE_PATH]), path) + else: + if CUSTOM_STEPS_PATH in config.etl: + return os.path.join( + os.path.expanduser(config.etl[CUSTOM_STEPS_PATH]), path) + + # Return the path as is. + return path + + +def get_s3_base_path(): + """Get the root S3 path from config + """ + config = Config() + return os.path.join('s3://', config.etl.get('S3_ETL_BUCKET', ''), + config.etl.get('S3_BASE_PATH', '')) + +def get_modified_s3_path(path): + """Modify the s3 path to replace S3_BASE_PATH with config parameter + """ + config = Config() + if path is None: + return None + return path.replace('{S3_BASE_PATH}', config.etl.get('S3_BASE_PATH')) diff --git a/dataduct/utils/slack_hook.py b/dataduct/utils/slack_hook.py new file mode 100644 index 0000000..9f969a9 --- /dev/null +++ b/dataduct/utils/slack_hook.py @@ -0,0 +1,47 @@ +"""Action hook for posting a message on slack +""" + +from ..config import Config + +import logging +logger = logging.getLogger(__name__) + + +def post_message(message): + """Post a message on a specified slack channel. + Will silently skip if there is no etl.slack configuration. + Will print a help message if etl.slack is misconfigured. 
+ + Args: + message(str): The message to post with templating + {user}: The username as specified in the config file + """ + + # If there is no slack configuration, silently skip because the user + # doesn't know about slack integration or doesn't care + config = Config() + slack_config = config.etl.get('slack', None) + if slack_config is None: + return + + try: + import slack + import slack.chat + slack.api_token = slack_config['api_token'] + user = slack_config.get('username', 'Unknown User') + slack.chat.post_message(slack_config['channel_name'], + message.format(user=user), + username=slack_config.get('bot_username', + 'Dataduct')) + except Exception: + message = ['If you want to post a slack message when you activate a pipeline', # noqa + '1) Run: pip install pyslack', + '2) Visit https://api.slack/com/web to generate a token', + '3) Add ([] denotes optional field):', + ' api_token:', + ' channel_name:', + ' [username:]', + ' [bot_username:]', + ' to the etl section of your config file'] + for line in message: + logger.info(line) diff --git a/docs/conf.py b/docs/conf.py index db9615f..46a24fb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -260,7 +260,7 @@ # dir menu entry, description, category) texinfo_documents = [ ('index', 'dataduct', u'dataduct Documentation', - u'Coursera', 'dataduct', 'One line description of project.', + u'Coursera', 'dataduct', 'DataPipeline for Humans.', 'Miscellaneous'), ] diff --git a/docs/config.rst b/docs/config.rst new file mode 100644 index 0000000..e0d01d6 --- /dev/null +++ b/docs/config.rst @@ -0,0 +1,288 @@ +Config +====== + +All the dataduct setting are controlled from a single config file that +stores the credentials as well as different settings. + +The config file is read from the following places in the specified order +of priority. + +1. ``/etc/dataduct.cfg`` +2. ``~/.dataduct`` +3. ``DATADUCT_CONFIG_PATH`` environment variable + +Minimum example config: + +.. code:: YAML + + ec2: + INSTANCE_TYPE: m1.large + ETL_AMI: ami-05355a6c # Default AMI used by data pipeline - Python 2.6 + SECURITY_GROUP: FILL_ME_IN + + emr: + MASTER_INSTANCE_TYPE: m1.large + NUM_CORE_INSTANCES: 1 + CORE_INSTANCE_TYPE: m1.large + CLUSTER_AMI: 3.1.0 + + etl: + S3_ETL_BUCKET: FILL_ME_IN + ROLE: FILL_ME_IN + RESOURCE_ROLE: FILL_ME_IN + +Config Parameters +----------------- + +Bootstrap +~~~~~~~~~ + +.. code:: YAML + + bootstrap: + ec2: + - step_type: transform + command: echo "Welcome to dataduct" + no_output: true + emr: + - step_type: transform + command: echo "Welcome to dataduct" + no_output: true + +Bootstrap steps are a chain of steps that should be executed before any +other step in the datapipeline. This can be used to copy files from S3 +or install libraries on the resource. At Coursera we use this to +download some binaries from S3 that are required for some of the +transformations. + +Note that the EMR bootstrap is only executed on the master node. If you +want to install something on the task nodes then you should use the +bootstrap parameter in the ``emr_cluster_config`` in your datapipeline. + +Custom Steps +~~~~~~~~~~~~ + +:: + + custom_steps: + - class_name: CustomExtractLocalStep + file_path: custom_extract_local.py + step_type: custom-extract-local + +Custom steps are steps that are not part of dataduct but are created to +augment the functionality provided by dataduct. At Coursera these are +often Steps that Inherit from the current object but abstract out some +of the functionality so that multiple pipelines don't have to write the +same thing twice. 
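+
+As a minimal illustration (this file is not part of dataduct itself), a
+custom step module such as ``custom_extract_local.py`` might look like the
+sketch below. The base class and its import path are assumptions made for
+the example; subclass whichever built-in step you are extending.
+
+.. code:: python
+
+    """Hypothetical custom_extract_local.py"""
+    # Assumption: ExtractLocalStep is exposed at this import path
+    from dataduct.steps.extract_local import ExtractLocalStep
+
+
+    class CustomExtractLocalStep(ExtractLocalStep):
+        """Step matched by step_type: custom-extract-local"""
+
+        def __init__(self, **kwargs):
+            # Hypothetical shared default, so individual pipeline
+            # definitions don't have to repeat it
+            kwargs.setdefault('path', 'data/word_data.txt')
+            super(CustomExtractLocalStep, self).__init__(**kwargs)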
+ +The file\_path can be an absolute path or a relative path with respect +to the ``CUSTOM_STEPS_PATH`` path defined in the ETL parameter field. +The Step classes are dynamically imported based on the config and +``step-type`` field is the one that is matched when parsing the pipeline +definition. + +Database +~~~~~~~~ + +:: + + database: + permissions: + - user: admin + permission: all + - group: consumer_group + permission: select + +Some steps such as ``upsert`` or ``create-load-redshift`` create tables +and grant them appropriate permissions so that one does not have to +create tables prior to running the ETL. The permission is the +``permission`` being granted on the table or view to the ``user`` or +``group``. If both are specified then both the grant statements are +executed. + +EC2 +~~~ + +:: + + ec2: + INSTANCE_TYPE: m1.small + ETL_AMI: ami-05355a6c # Default AMI used by data pipeline - Python 2.6 + SECURITY_GROUP: FILL_ME_IN + +The ec2 config controls the configuration for the ec2-resource started +by the datapipeline. You can override these with ``ec2_resouce_config`` +in your pipeline definition for specific pipelines. + +EMR +~~~ + +:: + + emr: + CLUSTER_AMI: 3.1.0 + CLUSTER_TIMEOUT: 6 Hours + CORE_INSTANCE_TYPE: m1.large + NUM_CORE_INSTANCES: 1 + HADOOP_VERSION: 2.4.0 + HIVE_VERSION: null + MASTER_INSTANCE_TYPE: m3.xlarge + PIG_VERSION: null + TASK_INSTANCE_BID_PRICE: null + TASK_INSTANCE_TYPE: m1.large + +The emr config controls the configuration for the emr-resource started +by the datapipeline. + +ETL +~~~ + +:: + + etl: + CONNECTION_RETRIES: 2 + CUSTOM_STEPS_PATH: ~/dataduct/examples/steps + DAILY_LOAD_TIME: 1 + KEY_PAIR: FILL_ME_IN + MAX_RETRIES: 2 + NAME_PREFIX: dev + QA_LOG_PATH: qa + DP_INSTANCE_LOG_PATH: dp_instances + DP_PIPELINE_LOG_PATH: dp_pipelines + DP_QA_TESTS_LOG_PATH: dba_table_qa_tests + RESOURCE_BASE_PATH: ~/dataduct/examples/resources + RESOURCE_ROLE: FILL_ME_IN + RETRY_DELAY: 10 Minutes + REGION: us-east-1 + ROLE: FILL_ME_IN + S3_BASE_PATH: dev + S3_ETL_BUCKET: FILL_ME_IN + SNS_TOPIC_ARN_FAILURE: null + SNS_TOPIC_ARN_WARNING: null + FREQUENCY_OVERRIDE: one-time + DEPENDENCY_OVERRIDE: false + slack: + api_token: FILL_ME_IN + channel_name: "#dataduct" + username: FILL_ME_IN + bot_username: Dataduct Bot + TAGS: + env: + string: dev + Name: + variable: name + +This is the core parameter object which controls the ETL at the high +level. The parameters are explained below: + +- ``CONNECTION_RETRIES``: Number of retries for the database + connections. This is used to eliminate some of the transient errors + that might occur. +- ``CUSTOM_STEPS_PATH``: Path to the directory to be used for custom + steps that are specified using a relative path. +- ``DAILY_LOAD_TIME``: Default time to be used for running pipelines +- ``KEY_PAIR``: SSH key pair to be used in both the ec2 and the emr + resource. 
+- ``MAX_RETRIES``: Number of retries for the pipeline activities +- ``NAME_PREFIX``: Prefix all the pipeline names with this string +- ``QA_LOG_PATH``: Path prefix for all the QA steps when logging output + to S3 +- ``DP_INSTANCE_LOG_PATH``: Path prefix for DP instances to be logged + before destroying +- ``DP_PIPELINE_LOG_PATH``: Path prefix for DP pipelines to be logged +- ``DP_QA_TESTS_LOG_PATH``: Path prefix for QA tests to be logged +- ``RESOURCE_BASE_PATH``: Path to the directory used to relative + resource paths +- ``RESOURCE_ROLE``: Resource role needed for DP +- ``RETRY_DELAY``: Delay between each of activity retires +- ``REGION``: Region to run the datapipeline from +- ``ROLE``: Role needed for DP +- ``S3_BASE_PATH``: Prefix to be used for all S3 paths that are created + anywhere. This is used for splitting logs across multiple developer + or across production and dev +- ``S3_ETL_BUCKET``: S3 bucket to use for DP data, logs, source code + etc. +- ``SNS_TOPIC_ARN_FAILURE``: SNS to trigger for failed steps or + pipelines +- ``SNS_TOPIC_ARN_WARNING``: SNS to trigger for failed QA checks +- ``FREQUENCY_OVERRIDE``: Override every frequency given in a pipeline + with this unless overridden by CLI +- ``DEPENDENCY_OVERRIDE``: Will ignore the dependency step if set to + true. +- ``slack``: Configuration for posting messages on slack whenever a + pipeline is run +- ``Tags``: Tags to be added to the pipeline. The first key is the Tag + to be used, the second key is the type. If the type is string the + value is passed directly. If the type is variable then it looks up + the pipeline object for that variable. + +Logging +~~~~~~~ + +:: + + logging: + CONSOLE_DEBUG_LEVEL: INFO + FILE_DEBUG_LEVEL: DEBUG + LOG_DIR: ~/.dataduct + LOG_FILE: dataduct.log + +Settings for specifying where the logs should be outputted and debug +levels that should be used in the library code execution. + +MySQL +~~~~~ + +:: + + mysql: + host_alias_1: + HOST: FILL_ME_IN + PASSWORD: FILL_ME_IN + USERNAME: FILL_ME_IN + host_alias_2: + HOST: FILL_ME_IN + PASSWORD: FILL_ME_IN + USERNAME: FILL_ME_IN + +Rds (MySQL) database connections are stored in this parameter. The +pipeline definitions can refer to the host with the host\_alias. +``HOST`` refers to the full db hostname inside AWS. + +Redshift +~~~~~~~~ + +:: + + redshift: + CLUSTER_ID: FILL_ME_IN + DATABASE_NAME: FILL_ME_IN + HOST: FILL_ME_IN + PASSWORD: FILL_ME_IN + USERNAME: FILL_ME_IN + PORT: FILL_ME_IN + +Redshift database credentials that are used in all the steps that +interact with a warehouse. ``CLUSTER_ID`` is the first word of the +``HOST`` as this is used by ``RedshiftNode`` at a few places to identify +the cluster. + +Modes +~~~~~ + +:: + + production: + etl: + S3_BASE_PATH: prod + +Modes define override settings for running a pipeline. As config is a +singleton we can declare the overrides once and that should update the +config settings across all use cases. + +In the example we have a mode called ``production`` in which the +``S3_BASE_PATH`` is overridden to ``prod`` instead of whatever value was +specified in the defaults. + +At coursera one of the uses for modes is to change between the dev +redshift cluster to the production one when we deploy a new ETL. 
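+
+Because the config object is a singleton, a mode can also be applied
+programmatically, which is what the CLI's ``-m/--mode`` flag does through
+``config_singleton_setup``. A short sketch, assuming the ``production`` mode
+from the example above is defined in your config file:
+
+.. code:: python
+
+    from dataduct.config import Config
+
+    # Instantiating the singleton with a mode layers that mode's overrides
+    # on top of the defaults, e.g. S3_BASE_PATH would become 'prod' here
+    config = Config(mode='production')
+    print config.etl.get('S3_BASE_PATH')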
diff --git a/docs/creating_an_etl.rst b/docs/creating_an_etl.rst index 3ef5ece..745ba58 100644 --- a/docs/creating_an_etl.rst +++ b/docs/creating_an_etl.rst @@ -35,19 +35,20 @@ Example: # PIPELINE STEPS steps: - step_type: extract-local - path: examples/resources/word_data.txt + path: data/word_data.txt - step_type: emr-streaming - mapper: examples/scripts/word_mapper.py - reducer: examples/scripts/word_reducer.py + mapper: scripts/word_mapper.py + reducer: scripts/word_reducer.py - step_type: transform - script: examples/scripts/s3_profiler.py + script: scripts/s3_profiler.py script_arguments: - --input=INPUT1_STAGING_DIR - --output=OUTPUT1_STAGING_DIR - -f + Header Information ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -90,135 +91,3 @@ Description The description allows the creator of the YAML file to clearly explain the purpose of the pipeline. - -Pipeline Steps -~~~~~~~~~~~~~~ - -The pipeline steps are very verbose and easy to understand, as they map -directly into Data Pipeline steps. Each step must have a type associated -with it (transform step / emr-streaming step) and should be named for -clarification purposes. The following lists every step type: - -emr-streaming -^^^^^^^^^^^^^ - -The *emr-streaming* step runs on a EMR instance configured from the -header. You can specify the bootstrap, mapper, and reducer files. - -.. code:: yaml - - - step_type: emr-streaming - mapper: examples/scripts/word_mapper.py - reducer: examples/scripts/word_reducer.py - -extract-local -^^^^^^^^^^^^^ - -The *extract-local* step will extract a local file (for example, a TSV -file) and write it to the output node. From there, the data can be -loaded into redshift or apply further transformations. - -.. code:: yaml - - - name: extract_local_step - step_type: extract-local - path: examples/resources/word_data.txt - -extract-rds -^^^^^^^^^^^ - -The *extract-rds* step extracts data from MySQL databases to S3. You can -also specify the SQL statement that you would like to execute. This -extraction will look for tables based on the host name and the database -name which needs to be pre-configured in ~/.dataduct - -.. code:: yaml - - - step_type: extract-rds - host_name: maestro - database: maestro - sql: | - SELECT * - FROM networks_network; - -extract-redshift -^^^^^^^^^^^^^^^^ - -The *extract-redshift* step extracts data from AWS Redshift (the host -and AWS details must be preconfigured in the ~/.dataduct file) into S3. - -.. code:: yaml - - - step_type: extract-redshift - schema: dev - table: categories - -extract-s3 -^^^^^^^^^^ - -The *extract-s3* step extracts files from a given S3 URI into the output -S3 node. - -.. code:: yaml - - - step_type: extract-s3 - uri: s3://elasticmapreduce/samples/wordcount/wordSplitter.py - -load-redshift -^^^^^^^^^^^^^ - -The *load-redshift* step loads data from the input nodes to the -specified Redshift table. Before specifying the Redshift table and -schema, the host and AWS details must be preconfigured in the -~/.dataduct file. For example, the following steps will upload a local -file into dev.test\_table - -.. code:: yaml - - - step_type: extract-local - path: examples/resources/test_table1.tsv - - - step_type: load-redshift - schema: dev - table: test_table - -sql-command -^^^^^^^^^^^ - -The *sql-command* step will execute a query in Redshift (the host and -AWS details must be preconfigured in the ~/.dataduct file). - -.. 
code:: yaml - - - step_type: sql-command - command: INSERT INTO dev.test_table VALUES (1, 'hello_etl'); - -transform -^^^^^^^^^ - -The *transform* step allows you to specify the input node, apply -transformations, and write to a specified output node. The -transformation can be in the form of a script or a UNIX command. - -.. code:: yaml - - # Unix Example - - step_type: transform - command: cp -r $INPUT1_STAGING_DIR/* $OUTPUT1_STAGING_DIR - input_node: - step1_a: step2_a - step1_b: step2_b - output: - - "step2_a" - - "step2_b" - - # Script Example - - step_type: transform - script: examples/scripts/s3_profiler.py - input_node: - step2_a: output1 - script_arguments: - - "-i=${INPUT1_STAGING_DIR}" - - "-o=${OUTPUT1_STAGING_DIR}" - - -f - diff --git a/docs/dataduct.config.rst b/docs/dataduct.config.rst new file mode 100644 index 0000000..ac2cf66 --- /dev/null +++ b/docs/dataduct.config.rst @@ -0,0 +1,61 @@ +dataduct.config package +======================= + +Subpackages +----------- + +.. toctree:: + + dataduct.config.tests + +Submodules +---------- + +dataduct.config.config module +----------------------------- + +.. automodule:: dataduct.config.config + :members: + :undoc-members: + :show-inheritance: + +dataduct.config.config_actions module +------------------------------------- + +.. automodule:: dataduct.config.config_actions + :members: + :undoc-members: + :show-inheritance: + +dataduct.config.constants module +-------------------------------- + +.. automodule:: dataduct.config.constants + :members: + :undoc-members: + :show-inheritance: + +dataduct.config.credentials module +---------------------------------- + +.. automodule:: dataduct.config.credentials + :members: + :undoc-members: + :show-inheritance: + +dataduct.config.logger_config module +------------------------------------ + +.. automodule:: dataduct.config.logger_config + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.config + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.config.tests.rst b/docs/dataduct.config.tests.rst new file mode 100644 index 0000000..792d9fc --- /dev/null +++ b/docs/dataduct.config.tests.rst @@ -0,0 +1,22 @@ +dataduct.config.tests package +============================= + +Submodules +---------- + +dataduct.config.tests.test_credentials module +--------------------------------------------- + +.. automodule:: dataduct.config.tests.test_credentials + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.config.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.data_access.rst b/docs/dataduct.data_access.rst new file mode 100644 index 0000000..00c1eec --- /dev/null +++ b/docs/dataduct.data_access.rst @@ -0,0 +1,22 @@ +dataduct.data_access package +============================ + +Submodules +---------- + +dataduct.data_access.connection module +-------------------------------------- + +.. automodule:: dataduct.data_access.connection + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.data_access + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.database.parsers.rst b/docs/dataduct.database.parsers.rst new file mode 100644 index 0000000..3d2a44c --- /dev/null +++ b/docs/dataduct.database.parsers.rst @@ -0,0 +1,69 @@ +dataduct.database.parsers package +================================= + +Subpackages +----------- + +.. 
toctree:: + + dataduct.database.parsers.tests + +Submodules +---------- + +dataduct.database.parsers.create_table module +--------------------------------------------- + +.. automodule:: dataduct.database.parsers.create_table + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.create_view module +-------------------------------------------- + +.. automodule:: dataduct.database.parsers.create_view + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.helpers module +---------------------------------------- + +.. automodule:: dataduct.database.parsers.helpers + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.select_query module +--------------------------------------------- + +.. automodule:: dataduct.database.parsers.select_query + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.transform module +------------------------------------------ + +.. automodule:: dataduct.database.parsers.transform + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.utils module +-------------------------------------- + +.. automodule:: dataduct.database.parsers.utils + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.database.parsers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.database.parsers.tests.rst b/docs/dataduct.database.parsers.tests.rst new file mode 100644 index 0000000..7c181ed --- /dev/null +++ b/docs/dataduct.database.parsers.tests.rst @@ -0,0 +1,46 @@ +dataduct.database.parsers.tests package +======================================= + +Submodules +---------- + +dataduct.database.parsers.tests.test_create_table module +-------------------------------------------------------- + +.. automodule:: dataduct.database.parsers.tests.test_create_table + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.tests.test_create_view module +------------------------------------------------------- + +.. automodule:: dataduct.database.parsers.tests.test_create_view + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.tests.test_select_query module +-------------------------------------------------------- + +.. automodule:: dataduct.database.parsers.tests.test_select_query + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.tests.test_transfrom module +----------------------------------------------------- + +.. automodule:: dataduct.database.parsers.tests.test_transfrom + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.database.parsers.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.database.rst b/docs/dataduct.database.rst new file mode 100644 index 0000000..4046783 --- /dev/null +++ b/docs/dataduct.database.rst @@ -0,0 +1,79 @@ +dataduct.database package +========================= + +Subpackages +----------- + +.. toctree:: + + dataduct.database.parsers + dataduct.database.sql + dataduct.database.tests + +Submodules +---------- + +dataduct.database.column module +------------------------------- + +.. automodule:: dataduct.database.column + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.database module +--------------------------------- + +.. 
automodule:: dataduct.database.database + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.history_table module +-------------------------------------- + +.. automodule:: dataduct.database.history_table + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.relation module +--------------------------------- + +.. automodule:: dataduct.database.relation + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.select_statement module +----------------------------------------- + +.. automodule:: dataduct.database.select_statement + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.table module +------------------------------ + +.. automodule:: dataduct.database.table + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.view module +----------------------------- + +.. automodule:: dataduct.database.view + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.database + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.database.sql.rst b/docs/dataduct.database.sql.rst new file mode 100644 index 0000000..e438344 --- /dev/null +++ b/docs/dataduct.database.sql.rst @@ -0,0 +1,53 @@ +dataduct.database.sql package +============================= + +Subpackages +----------- + +.. toctree:: + + dataduct.database.sql.tests + +Submodules +---------- + +dataduct.database.sql.sql_script module +--------------------------------------- + +.. automodule:: dataduct.database.sql.sql_script + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.sql.sql_statement module +------------------------------------------ + +.. automodule:: dataduct.database.sql.sql_statement + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.sql.transaction module +---------------------------------------- + +.. automodule:: dataduct.database.sql.transaction + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.sql.utils module +---------------------------------- + +.. automodule:: dataduct.database.sql.utils + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.database.sql + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.database.sql.tests.rst b/docs/dataduct.database.sql.tests.rst new file mode 100644 index 0000000..59d7ced --- /dev/null +++ b/docs/dataduct.database.sql.tests.rst @@ -0,0 +1,38 @@ +dataduct.database.sql.tests package +=================================== + +Submodules +---------- + +dataduct.database.sql.tests.test_sql_script module +-------------------------------------------------- + +.. automodule:: dataduct.database.sql.tests.test_sql_script + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.sql.tests.test_sql_statement module +----------------------------------------------------- + +.. automodule:: dataduct.database.sql.tests.test_sql_statement + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.sql.tests.test_sql_utils module +------------------------------------------------- + +.. automodule:: dataduct.database.sql.tests.test_sql_utils + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. 
automodule:: dataduct.database.sql.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.database.tests.rst b/docs/dataduct.database.tests.rst new file mode 100644 index 0000000..86cc9c8 --- /dev/null +++ b/docs/dataduct.database.tests.rst @@ -0,0 +1,30 @@ +dataduct.database.tests package +=============================== + +Submodules +---------- + +dataduct.database.tests.test_database module +-------------------------------------------- + +.. automodule:: dataduct.database.tests.test_database + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.tests.test_history_table module +------------------------------------------------- + +.. automodule:: dataduct.database.tests.test_history_table + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.database.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.etl.rst b/docs/dataduct.etl.rst new file mode 100644 index 0000000..abde039 --- /dev/null +++ b/docs/dataduct.etl.rst @@ -0,0 +1,45 @@ +dataduct.etl package +==================== + +Subpackages +----------- + +.. toctree:: + + dataduct.etl.tests + +Submodules +---------- + +dataduct.etl.etl_actions module +------------------------------- + +.. automodule:: dataduct.etl.etl_actions + :members: + :undoc-members: + :show-inheritance: + +dataduct.etl.etl_pipeline module +-------------------------------- + +.. automodule:: dataduct.etl.etl_pipeline + :members: + :undoc-members: + :show-inheritance: + +dataduct.etl.utils module +------------------------- + +.. automodule:: dataduct.etl.utils + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.etl + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.etl.tests.rst b/docs/dataduct.etl.tests.rst new file mode 100644 index 0000000..41a5eb0 --- /dev/null +++ b/docs/dataduct.etl.tests.rst @@ -0,0 +1,30 @@ +dataduct.etl.tests package +========================== + +Submodules +---------- + +dataduct.etl.tests.test_etl_actions module +------------------------------------------ + +.. automodule:: dataduct.etl.tests.test_etl_actions + :members: + :undoc-members: + :show-inheritance: + +dataduct.etl.tests.test_etl_pipeline module +------------------------------------------- + +.. automodule:: dataduct.etl.tests.test_etl_pipeline + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.etl.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.pipeline.rst b/docs/dataduct.pipeline.rst index 8c4fbb5..b122eae 100644 --- a/docs/dataduct.pipeline.rst +++ b/docs/dataduct.pipeline.rst @@ -1,6 +1,9 @@ dataduct.pipeline package ========================= +Submodules +---------- + dataduct.pipeline.activity module --------------------------------- diff --git a/docs/dataduct.qa.rst b/docs/dataduct.qa.rst new file mode 100644 index 0000000..5cba802 --- /dev/null +++ b/docs/dataduct.qa.rst @@ -0,0 +1,54 @@ +dataduct.qa package +=================== + +Submodules +---------- + +dataduct.qa.check module +------------------------ + +.. automodule:: dataduct.qa.check + :members: + :undoc-members: + :show-inheritance: + +dataduct.qa.column_check module +------------------------------- + +.. automodule:: dataduct.qa.column_check + :members: + :undoc-members: + :show-inheritance: + +dataduct.qa.count_check module +------------------------------ + +.. 
automodule:: dataduct.qa.count_check + :members: + :undoc-members: + :show-inheritance: + +dataduct.qa.primary_key_check module +------------------------------------ + +.. automodule:: dataduct.qa.primary_key_check + :members: + :undoc-members: + :show-inheritance: + +dataduct.qa.utils module +------------------------ + +.. automodule:: dataduct.qa.utils + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.qa + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.rst b/docs/dataduct.rst index ceb30fe..6f37f8e 100644 --- a/docs/dataduct.rst +++ b/docs/dataduct.rst @@ -1,29 +1,16 @@ -Code References -================ - -.. automodule:: dataduct - :members: - :undoc-members: - :show-inheritance: - -Subpackages and Modules ------------------------ +Code documentation +================== .. toctree:: :maxdepth: 1 - etl_pipeline + dataduct.config + dataduct.data_access + dataduct.database + dataduct.etl dataduct.pipeline + dataduct.qa dataduct.s3 dataduct.steps dataduct.tests dataduct.utils - - -Definition Parser ------------------ - -.. automodule:: dataduct.definition_parser - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/dataduct.steps.rst b/docs/dataduct.steps.rst index b73f54d..e625b5e 100644 --- a/docs/dataduct.steps.rst +++ b/docs/dataduct.steps.rst @@ -4,6 +4,46 @@ dataduct.steps package Submodules ---------- +dataduct.steps.column_check module +---------------------------------- + +.. automodule:: dataduct.steps.column_check + :members: + :undoc-members: + :show-inheritance: + +dataduct.steps.count_check module +--------------------------------- + +.. automodule:: dataduct.steps.count_check + :members: + :undoc-members: + :show-inheritance: + +dataduct.steps.create_load_redshift module +------------------------------------------ + +.. automodule:: dataduct.steps.create_load_redshift + :members: + :undoc-members: + :show-inheritance: + +dataduct.steps.create_update_sql module +--------------------------------------- + +.. automodule:: dataduct.steps.create_update_sql + :members: + :undoc-members: + :show-inheritance: + +dataduct.steps.emr_job module +----------------------------- + +.. automodule:: dataduct.steps.emr_job + :members: + :undoc-members: + :show-inheritance: + dataduct.steps.emr_streaming module ----------------------------------- @@ -60,6 +100,38 @@ dataduct.steps.load_redshift module :undoc-members: :show-inheritance: +dataduct.steps.pipeline_dependencies module +------------------------------------------- + +.. automodule:: dataduct.steps.pipeline_dependencies + :members: + :undoc-members: + :show-inheritance: + +dataduct.steps.primary_key_check module +--------------------------------------- + +.. automodule:: dataduct.steps.primary_key_check + :members: + :undoc-members: + :show-inheritance: + +dataduct.steps.qa_transform module +---------------------------------- + +.. automodule:: dataduct.steps.qa_transform + :members: + :undoc-members: + :show-inheritance: + +dataduct.steps.reload module +---------------------------- + +.. automodule:: dataduct.steps.reload + :members: + :undoc-members: + :show-inheritance: + dataduct.steps.sql_command module --------------------------------- @@ -76,6 +148,14 @@ dataduct.steps.transform module :undoc-members: :show-inheritance: +dataduct.steps.upsert module +---------------------------- + +.. 
automodule:: dataduct.steps.upsert + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- diff --git a/docs/dataduct.tests.rst b/docs/dataduct.tests.rst index 7a1df98..215751d 100644 --- a/docs/dataduct.tests.rst +++ b/docs/dataduct.tests.rst @@ -4,10 +4,10 @@ dataduct.tests package Submodules ---------- -dataduct.tests.test_definition_parser module --------------------------------------------- +dataduct.tests.test_import module +--------------------------------- -.. automodule:: dataduct.tests.test_definition_parser +.. automodule:: dataduct.tests.test_import :members: :undoc-members: :show-inheritance: diff --git a/docs/dataduct.utils.rst b/docs/dataduct.utils.rst index a25391f..cfe860e 100644 --- a/docs/dataduct.utils.rst +++ b/docs/dataduct.utils.rst @@ -4,6 +4,22 @@ dataduct.utils package Submodules ---------- +dataduct.utils.cli module +------------------------- + +.. automodule:: dataduct.utils.cli + :members: + :undoc-members: + :show-inheritance: + +dataduct.utils.constants module +------------------------------- + +.. automodule:: dataduct.utils.constants + :members: + :undoc-members: + :show-inheritance: + dataduct.utils.exceptions module -------------------------------- @@ -20,6 +36,14 @@ dataduct.utils.helpers module :undoc-members: :show-inheritance: +dataduct.utils.slack_hook module +-------------------------------- + +.. automodule:: dataduct.utils.slack_hook + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- diff --git a/docs/etl_pipeline.rst b/docs/etl_pipeline.rst deleted file mode 100644 index 2f40774..0000000 --- a/docs/etl_pipeline.rst +++ /dev/null @@ -1,7 +0,0 @@ -ETLPipeline -=========== - -.. automodule:: dataduct.etl_pipeline - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/index.rst b/docs/index.rst index d70b6b6..e615a80 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,23 +3,43 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Dataduct - DataPipeline for humans -==================================== +Dataduct +======== -Dataduct is a wrapper built on top of AWS Datapipeline which makes it easy to -create ETL jobs. All jobs can be specified as a series of steps in a YAML file -and would automatically be translated into datapipeline with appropriate -pipeline objects. + Dataduct - DataPipeline for humans + +`Dataduct `__ is a wrapper built +on top of `AWS +Datapipeline `__ +which makes it easy to create ETL jobs. All jobs can be specified as a +series of steps in a YAML file and would automatically be translated +into datapipeline with appropriate pipeline objects. + +Features include: + +- Visualizing pipeline activities +- Extracting data from different sources such as RDS, S3, local files +- Transforming data using EC2 and EMR +- Loading data into redshift +- Transforming data inside redshift +- QA data between the source system and warehouse + +It is easy to create custom steps to augment the DSL as per the +requirements. As well as running a backfill with the command line +interface. -Running an ETL is as simple as ``$ dataduct -a create pipeline.yaml`` Contents: .. 
toctree:: - :maxdepth: 1 + :maxdepth: 2 + introduction installation + config creating_an_etl + steps + input_output dataduct Indices and tables diff --git a/docs/input_output.rst b/docs/input_output.rst new file mode 100644 index 0000000..832ac20 --- /dev/null +++ b/docs/input_output.rst @@ -0,0 +1,180 @@ +Input and Output Nodes +======================= + +In dataduct, data is shared between two activities using S3. After a +step is finished, it saves its output to a file in S3 for successive +steps to read. Input and output nodes abstract this process, they +represent the S3 directories in which the data is stored. A step's input +node determines which S3 file it will read as input, and its output node +determines where it will store its output. In most cases, this +input-output node chain is taken care of by dataduct, but there are +situations where you may want finer control over this process. + +Input Nodes +~~~~~~~~~~~ + +The default behaviour of steps (except Extract- and Check-type steps) is +to link its input node with the preceding step's output node. For +example, in this pipeline snippet + +:: + + - step_type: extract-local + path: data/test_table1.tsv + + - step_type: create-load-redshift + table_definition: tables/dev.test_table.sql + +the output of the ``extract-local`` step is fed into the +``create-load-redshift`` step, so the pipeline will load the data found +inside ``data/test_table1.tsv`` into ``dev.test_table.sql``. This +behaviour can be made explicit through the ``name`` and ``input_node`` +properties. + +:: + + # This pipeline has the same behaviour as the previous pipeline. + - step_type: extract-local + name: extract_data + path: data/test_table1.tsv + + - step_type: create-load-redshift + input_node: extract_data + table_definition: tables/dev.test_table.sql + +When an input -> output node link is created, implicitly or explicitly, +dependencies are created automatically between the two steps. This +behaviour can be made explicit through the ``depends_on`` property. + +:: + + # This pipeline has the same behaviour as the previous pipeline. + - step_type: extract-local + name: extract_data + path: data/test_table1.tsv + + - step_type: create-load-redshift + input_node: extract_data + depends_on: extract_data + table_definition: tables/dev.test_table.sql + +You can use input nodes to communicate between steps that are not next +to each other. + +:: + + - step_type: extract-local + name: extract_data + path: data/test_table1.tsv + + - step_type: extract-local + path: data/test_table2.tsv + + # This step will use the output of the first extract-local step (test_table1.tsv) + - step_type: create-load-redshift + input_node: extract_data + table_definition: tables/dev.test_table.sql + +Without the use of ``input_node``, the ``create-load-redshift`` step +would have used the data from ``test_table2.tsv`` instead. + +You can also use input nodes to reuse the output of a step. + +:: + + - step_type: extract-local + name: extract_data + path: data/test_table1.tsv + + - step_type: create-load-redshift + input_node: extract_data + table_definition: tables/dev.test_table1.sql + + - step_type: create-load-redshift + input_node: extract_data + table_definition: tables/dev.test_table2.sql + +Sometimes, you may not want a step to have any input nodes. You can +specify this by writing ``input_node: []``. 
+ +:: + + - step_type: extract-local + name: extract_data + path: data/test_table1.tsv + + # This step will not receive any input data + - step_type: transform + input_node: [] + script: scripts/example_script.py + +If you are running your own script (e.g. through the Transform step), +the input node's data can be found in the directory specified by +``INPUT1_STAGING_DIR``. + +:: + + - step_type: extract-local + name: extract_data + path: data/test_table1.tsv + + # manipulate_data.py takes in the input directory as a script argument + - step_type: transform + script: scripts/manipulate_data.py + script_arguments: + - --input=INPUT1_STAGING_DIR + +Output Nodes +~~~~~~~~~~~~ + +Dataduct usually handles a step's output nodes automatically, saving the +file into a default path in S3. You can set the default path through +your dataduct configuration file. However, some steps also have an +optional ``output_path`` property, allowing you to choose an S3 +directory to store the step's output. + +Transform Step and Output Nodes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Transform steps allow you to run your own scripts. If you want to save +the results of your script, you can store data into the output node by +writing to the directory specified by ``OUTPUT1_STAGING_DIR``. + +:: + + # generate_data.py takes in the output directory as a script argument + - step_type: transform + script: scripts/generate_data.py + script_arguments: + - --output=OUTPUT1_STAGING_DIR + + - step_type: create-load-redshift + table_definition: tables/dev.test_table.sql + +You may wish to output more than one set of data for multiple proceeding +steps to use. You can do this through the ``output_node`` property. + +:: + + - step_type: transform + script: scripts/generate_data.py + script_arguments: + - --output=OUTPUT1_STAGING_DIR + output_node: + - foo_data + - bar_data + + - step_type: create-load-redshift + input_node: foo_data + table_definition: tables/dev.test_table1.sql + + - step_type: create-load-redshift + input_node: bar_data + table_definition: tables/dev.test_table2.sql + +In this case, the script must save data to subdirectories with names +matching the output nodes. In the above example, ``generate_data.py`` +must save data in ``OUTPUT1_STAGING_DIR/foo_data`` and +``OUTPUT1_STAGING_DIR/bar_data`` directories. If the subdirectory and +output node names are mismatched, the output nodes will not be generated +correctly. diff --git a/docs/installation.rst b/docs/installation.rst index 7fdb39c..178e02d 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -1,78 +1,85 @@ Installation -~~~~~~~~~~~~ +============ -Install the dataduct package using pip +Installation using pip +---------------------- + +Dataduct can easily be installed using pip with the following commands. :: pip install dataduct -**Dependencies** +The major dependencies of dataduct are: + +- ``boto`` greater than version 2.34, older versions are missing some + of the functionality provided by EMR +- ``PyYAML`` +- ``pandas`` +- ``psycopg2`` +- ``pytimeparse`` +- ``MySQL-python`` +- ``pyparsing`` +- ``testfixtures`` + +The visualizations are created using: + +- ``graphviz`` +- ``pygraphviz`` + +Autocomplete for the CLI is supported using: + +- ``argcomplete`` + +The documentation is created using: + +- ``sphinx`` +- ``sphinx-napolean`` +- ``sphinx_rtd_theme`` + +Installing in the developer environment +--------------------------------------- + +1. Clone the Repo +^^^^^^^^^^^^^^^^^ + +:: + + git clone https://github.com/coursera/dataduct.git + +2. 
Update PATH and PYTHONPATH +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Add these lines into your ``.bash_profile`` or ``.zshrc`` etc based on +your shell type. + +:: -dataduct currently has the following dependencies: - boto >= 2.32.0 - -yaml + export PYTHONPATH=~/dataduct:$PYTHONPATH + export PATH=~/dataduct/bin:$PATH -We have tried some older versions of boto with the problem being support -some functionality around EMR that will be used in the later versions of -dataduct. +3. Config +^^^^^^^^^ -**Setup Configuration** +Create a config file. Instructions for this are provided in the config +section. + +Setup Autocomplete +------------------ + +Install argcomplete with ``pip install argcomplete``. + +If you're using ``bash`` then add the following to your +``.bash_profile``: + +:: -Setup the configuration file to set the credentials and defaul values -for various parameters passed to datapipeline. Copy the config template -from https://github.com/coursera/dataduct/../example\_config and write -it to ``~/.dataduct`` or ``/etc/.dataduct``. You can also set an -environment variable pointing to the config file location by setting the -``DATADUCT_PATH`` variable. + eval "$(register-python-argcomplete dataduct)" -*Config file template:* +if you're using ``zsh`` then add the following line to your ``.zshrc``: :: - # Constants that are used across the dataduct library - - ec2: - DEFAULT_ROLE: FILL_ME_IN - DEFAULT_RESOURCE_ROLE: FILL_ME_IN - DEFAULT_EC2_INSTANCE_TYPE: m1.large - ETL_AMI: ami-05355a6c # Default AMI used by data pipeline - KEY_PAIR: FILL_ME_IN - SECURITY_GROUP: FILL_ME_IN - - emr: - DEFAULT_NUM_CORE_INSTANCES: 3 - DEFAULT_CORE_INSTANCE_TYPE: m1.large - DEFAULT_TASK_INSTANCE_BID_PRICE: null # null if we want it to be None - DEFAULT_TASK_INSTANCE_TYPE: m1.large - DEFAULT_MASTER_INSTANCE_TYPE: m1.large - DEFAULT_CLUSTER_TIMEOUT: 6 Hours - DEFAULT_HADOOP_VERSION: null - DEFAULT_HIVE_VERSION: null - DEFAULT_PIG_VERSION: null - DEFAULT_CLUSTER_AMI: 2.4.7 - - redshift: - REDSHIFT_DATABASE_NAME: FILL_ME_IN - REDSHIFT_CLUSTER_ID: FILL_ME_IN - REDSHIFT_USERNAME: FILL_ME_IN - REDSHIFT_PASSWORD: FILL_ME_IN - - mysql: - DATABASE_KEY: - HOST: FILL_ME_IN, - USERNAME: FILL_ME_IN, - PASSWORD: FILL_ME_IN - - etl: - RETRY_DELAY: 10 Minutes - DEFAULT_MAX_RETRIES: 0 - ETL_BUCKET: FILL_ME_IN - DATA_PIPELINE_TOPIC_ARN: FILL_ME_IN - DAILY_LOAD_TIME: 1 # run at 1AM UTC - - bootstrap: - - step_type: transform - input_node: [] - command: whoami >> ${OUTPUT1_STAGING_DIR}/output.txt - resource: FILL_ME_IN - name: bootstrap_transform + autoload bashcompinit + bashcompinit + eval "$(register-python-argcomplete dataduct)" diff --git a/docs/introduction.rst b/docs/introduction.rst new file mode 100644 index 0000000..cdab355 --- /dev/null +++ b/docs/introduction.rst @@ -0,0 +1,50 @@ +Introduction +============= + +`Dataduct `__ is a wrapper built +on top of `AWS +Datapipeline `__ +which makes it easy to create ETL jobs. All jobs can be specified as a +series of steps in a YAML file and would automatically be translated +into datapipeline with appropriate pipeline objects. + +Features include: + +- Visualizing pipeline activities +- Extracting data from different sources such as RDS, S3, local files +- Transforming data using EC2 and EMR +- Loading data into redshift +- Transforming data inside redshift +- QA data between the source system and warehouse +It is easy to create custom steps to augment the DSL as per the +requirements. As well as running a backfill with the command line +interface. 
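
As a concrete illustration of the custom-step hook mentioned above, here is a minimal sketch that mirrors the ``examples/steps/custom_extract_local.py`` module added later in this change: a custom step is just a subclass of an existing dataduct step that passes its keyword arguments through, and the example pipeline ``example_custom_extract_local.yaml`` refers to it with ``step_type: custom-extract-local``. The class body below is a pared-down sketch, not the authoritative implementation.

.. code:: python

    """Minimal custom step sketch: reuse an existing dataduct step."""
    from dataduct.steps import ExtractLocalStep

    import logging
    logger = logging.getLogger(__name__)


    class CustomExtractLocalStep(ExtractLocalStep):
        """Extract-local step that only adds a log line before delegating."""

        def __init__(self, **kwargs):
            # Everything (path, name, input_node, ...) is handled by the base step
            logger.info('Using the custom extract-local step')
            super(CustomExtractLocalStep, self).__init__(**kwargs)
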
+ +An example ETL from RDS would look like: + +.. code:: YAML + + name: example_upsert + frequency: daily + load_time: 01:00 # Hour:Min in UTC + + steps: + - step_type: extract-rds + host_name: test_host + database: test_database + sql: | + SELECT * + FROM test_table; + + - step_type: create-load-redshift + table_definition: tables/dev.test_table.sql + + - step_type: upsert + source: tables/dev.test_table.sql + destination: tables/dev.test_table_2.sql + +This would first perform an extraction from the RDS database with the +``extract-rds`` step using the ``COPY ACTIVITY``. Then load the data +into the ``dev.test_table`` in redshift with the +``create-load-redshift``. Then perform an ``upsert`` with the data into +the ``test_table_2``. diff --git a/docs/modules.rst b/docs/modules.rst new file mode 100644 index 0000000..e7b9c81 --- /dev/null +++ b/docs/modules.rst @@ -0,0 +1,7 @@ +dataduct +======== + +.. toctree:: + :maxdepth: 4 + + dataduct diff --git a/docs/steps.rst b/docs/steps.rst new file mode 100644 index 0000000..832ca1b --- /dev/null +++ b/docs/steps.rst @@ -0,0 +1,520 @@ +Steps and Pipeline Objects +========================== + +Pipeline objects are classes that directly translate one-one from the +dataduct classes to `DP +objects `__. +A step is an abstraction layer that can translate into one or more +pipeline objects based on the action type. For example a ``sql-command`` +step translates into a ``sql-activity`` or a ``transform`` step +translates into ``shell command activity`` and creates an output +``s3 node``. + +Definition of a Step +-------------------- + +A step is defined as a series of properties in yaml. For example, + +:: + + - step_type: extract-s3 + name: get_file + file_uri: s3://elasticmapreduce/samples/wordcount/wordSplitter.py + +defines an ``extract-s3`` step with properties ``name`` and +``file_uri``. + +Common +------ + +These are the properties that all steps possess. + +- ``step_type``: The step type. Must be either a pre-defined step or a + custom step. (Required) +- ``name``: The user-defined name of the step. Will show up as part of + the component name in DataPipeline. +- ``input_node``: See input and output nodes. +- ``depends_on``: This step will not run until the step(s) specified + have finished. + +Extract S3 +---------- + +Extracts the contents from the specified file or directory in S3. May +used as input to other steps. + +Properties +^^^^^^^^^^ + +One of: (Required) + +- ``file_uri``: The location of a single file in S3. +- ``directory_uri``: The location of a directory in S3. + +Example +^^^^^^^ + +:: + + - step_type: extract-s3 + file_uri: s3://elasticmapreduce/samples/wordcount/wordSplitter.py + +Extract Local +------------- + +Extracts the contents from the specified file locally. May be used as +input to other steps. May only be used with one-time pipelines. + +Properties +^^^^^^^^^^ + +- ``path``: The location of a single file. (Required) + +Example +^^^^^^^ + +:: + + - step_type: extract-local + path: data/example_file.tsv + +Extract RDS +----------- + +Extracts the contents of a table from an RDS instance. May be used as +input to other steps. Data is stored in TSV format. + +Properties +^^^^^^^^^^ + +- ``host_name``: The host name to lookup in the ``mysql`` section of + the configuration file. (Required) +- ``database``: The database in the RDS instance in which the table + resides. (Required) +- ``output_path``: Output the extracted data to the specified S3 path. 
+ +One of: (Required) + +- ``sql``: The SQL query to execute to extract data. +- ``table``: The table to extract. Equivalent to a sql query of + ``SELECT * FROM table``. + +Example +^^^^^^^ + +:: + + - step_type: extract-rds + host_name: maestro + database: maestro + sql: | + SELECT * + FROM example_rds_table; + +Extract Redshift +------------------------- + +Extracts the contents of a table from a Redshift instance. May be used +as input to other steps. Data is stored in TSV format. + +Properties +^^^^^^^^^^ + +- ``schema``: The schema of the table. (Required) +- ``table``: The name of the table. (Required) +- ``output_path``: Output the extracted data to the specified S3 path. + Optional. + +Example +^^^^^^^ + +:: + + - step_type: extract-redshift + schema: prod + table: example_redshift_table + +Transform +------------------------- + +Runs a specified script on an resource. + +Properties +^^^^^^^^^^ + +- ``output_node``: See input and output nodes. +- ``script_arguments``: Arguments passed to the script. +- ``script_name``: Required if ``script_directory`` is specified. + Script to be executed in the directory. +- ``additional_s3_files``: Additional files to include from S3. +- ``output_path``: Save the script's output to the specified S3 path. +- ``no_output``: If ``true``, step will produce no extractable output. + Default: ``false`` + +One of: (Required) + +- ``command``: A command to be executed directly. +- ``script``: Local path to the script that should be executed. +- ``script_directory``: Local path to a directory of scripts to be + uploaded to the resource. + +Example +^^^^^^^ + +:: + + - step_type: transform + script: scripts/example_script.py + script_arguments: + - "--foo=bar" + +SQL Command +------------------------- + +Executes a SQL statement in a Redshift instance. + +Properties +^^^^^^^^^^ + +- ``script_arguments``: Arguments passed to the SQL command. +- ``queue``: Query queue that should be used. +- ``wrap_transaction``: If ``true``, SQL command will be wrapped inside + a transaction. Default: ``true`` + +One of: (Required) + +- ``command``: Command to be executed directly. +- ``script``: Local path to the script that should be executed. + +Example +^^^^^^^ + +:: + + - step_type: sql-command + command: SELECT * FROM dev.test_table; + +EMR Streaming +------------------------- + +Executes a map and an optional reduce script using Amazon Elastic +MapReduce. + +Properties +^^^^^^^^^^ + +- ``mapper``: Local path to the mapper script (Required) +- ``reducer``: Local path to the reducer script +- ``hadoop_params``: List of arguments to the hadoop command +- ``output_path``: Save the script's output to the specified S3 path + +Example +^^^^^^^ + +:: + + - step_type: emr-streaming + mapper: scripts/word_mapper.py + reducer: scripts/word_reducer.py + +Load Redshift +------------------------- + +Loads the data from its input node into a Redshift instance. + +Properties +^^^^^^^^^^ + +- ``schema``: The schema of the table. (Required) +- ``table``: The name of the table. (Required) +- ``insert_mode``: See Amazon's RedshiftCopyActivity documentation. + Default: TRUNCATE +- ``max_errors``: The maximum number of errors to be ignored during the + load +- ``replace_invalid_char``: Character to replace non-utf8 characters + with + +Example +^^^^^^^ + +:: + + - step_type: load-redshift + schema: dev + table: example_table + +Pipeline Dependencies +------------------------- + +Keeps running until another pipeline has finished. Use with +``depends_on`` properties to stall the pipeline. 
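
Conceptually, the pipeline-dependencies step is a polling loop driven by the ``refresh_rate`` and ``dependent_pipelines`` properties listed below. The sketch that follows only illustrates that idea; the ``is_finished`` callable is hypothetical (the real step queries AWS Data Pipeline for the dependent pipelines' status).

.. code:: python

    """Illustration only: the polling behaviour behind pipeline-dependencies."""
    import time


    def wait_for_pipelines(pipeline_names, refresh_rate=300, is_finished=None):
        """Block until every named pipeline reports as finished.

        ``is_finished`` is a hypothetical callable mapping a pipeline name to a
        bool; dataduct's actual step obtains this from AWS Data Pipeline.
        """
        pending = set(pipeline_names)
        while pending:
            # Drop pipelines that have completed since the last poll
            pending = {name for name in pending if not is_finished(name)}
            if pending:
                # refresh_rate mirrors the step property of the same name (seconds)
                time.sleep(refresh_rate)


    # Usage sketch:
    # wait_for_pipelines(['example_transform'], refresh_rate=60, is_finished=my_check)
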
+ +Properties +^^^^^^^^^^ + +- ``dependent_pipelines``: List of pipelines to wait for. (Required) +- ``refresh_rate``: Time, in seconds, to wait between polls. Default: + 300 +- ``start_date``: Date on which the pipelines started at. Default: + Current day + +Example +^^^^^^^ + +:: + + - step_type: pipeline-dependencies + refresh_rate: 60 + dependent_pipelines: + - example_transform + +Create Load Redshift +------------------------- + +Special transform step that loads the data from its input node into a +Redshift instance. If the table it's loading into does not exist, the +table will be created. + +Properties +^^^^^^^^^^ + +- ``table_definition``: Schema file for the table to be loaded. + (Required) +- ``script_arguments``: Arguments for the runner. + + - ``--max_error``: The maximum number of errors to be ignored during + the load. Usage: ``--max_error=5`` + - ``--replace_invalid_char``: Character the replace non-utf8 + characters with. Usage: ``--replace_invalid_char='?'`` + - ``--no_escape``: If passed, does not escape special characters. + Usage: ``--no_escape`` + - ``--gzip``: If passed, compresses the output with gzip. Usage: + ``--gzip`` + - ``--command_options``: A custom SQL string as the options for the + copy command. Usage: ``--command_options="DELIMITER '\t'"`` + + - Note: If ``--command_options`` is passed, script arguments + ``--max_error``, ``--replace_invalid_char``, ``--no_escape``, + and ``--gzip`` have no effect. + +Example +^^^^^^^ + +:: + + - step_type: create-load-redshift + table_definition: tables/dev.example_table.sql + +Upsert +------------------------- + +Extracts data from a Redshift instance and upserts the data into a +table. Upsert = Update + Insert. If a row already exists (by matching +primary keys), the row will be updated. If the row does not already +exist, insert the row. If the table it's upserting into does not exist, +the table will be created. + +Properties +^^^^^^^^^^ + +- ``destination``: Schema file for the table to upsert into. (Required) +- ``enforce_primary_key``: If true, de-duplicates data by matching + primary keys. Default: true +- ``history``: Schema file for the history table to record the changes + in the destination table. +- ``analyze_table``: If true, runs ``ANALYZE`` on the table afterwards. + Default: true + +One of: (Required) + +- ``sql``: The SQL query to run to extract data. +- ``script``: Local path to a SQL query to run. +- ``source``: The table to extract. Equivalent to a sql query of + ``SELECT * FROM source``. + +Example +^^^^^^^ + +:: + + - step_type: upsert + source: tables/dev.example_table.sql + destination: tables/dev.example_table_2.sql + +Reload +------------------------- + +Extracts data from a Redshift instance and reloads a table with the +data. If the table it's reloading does not exist, the table will be +created. + +Properties +^^^^^^^^^^ + +- ``destination``: Schema file for the table to reload. (Required) +- ``enforce_primary_key``: If true, de-duplicates data by matching + primary keys. Default: true +- ``history``: Schema file for the history table to record the changes + in the destination table. +- ``analyze_table``: If true, runs ``ANALYZE`` on the table afterwards. + Default: true + +One of: (Required) + +- ``sql``: The SQL query to run to extract data. +- ``script``: Local path to a SQL query to run. +- ``source``: The table to extract. Equivalent to a sql query of + ``SELECT * FROM source``. 
+ +Example +^^^^^^^ + +:: + + - step_type: reload + source: tables/dev.example_table.sql + destination: tables/dev.example_table_2.sql + +Create Update SQL +------------------------- + +Creates a table if it exists and then runs a SQL command. + +Properties +^^^^^^^^^^ + +- ``table_definition``: Schema file for the table to create. (Required) +- ``script_arguments``: Arguments for the SQL script. +- ``non_transactional``: If true, does not wrap the command in a + transaction. Default: false +- ``analyze_table``: If true, runs ``ANALYZE`` on the table afterwards. + Default: true + +One of: (Required) + +- ``command``: SQL command to execute directly. +- ``script``: Local path to a SQL command to run. + +Example +^^^^^^^ + +:: + + - step_type: create-update-sql + command: | + DELETE FROM dev.test_table WHERE id < 0; + INSERT INTO dev.test_table + SELECT * FROM dev.test_table_2 + WHERE id < %s; + table_definition: tables/dev.test_table.sql + script_arguments: + - 4 + +Primary Key Check +------------------------- + +Checks for primary key violations on a specific table. + +Properties +^^^^^^^^^^ + +- ``table_definition``: Schema file for the table to check. (Required) +- ``script_arguments``: Arguments for the runner script. +- ``log_to_s3``: If true, logs the output to a file in S3. Default: + false + +Example +^^^^^^^ + +:: + + - step_type: primary-key-check + table_definition: tables/dev.test_table.sql + +Count Check +------------------------- + +Compares the number of rows in the source and destination tables/SQL +scripts. + +Properties +^^^^^^^^^^ + +- ``source_host``: The source host name to lookup in the ``mysql`` + section of the configuration file. (Required) +- ``tolerance``: Tolerance threshold, in %, for the difference in count + between source and destination. Default: 1 +- ``log_to_s3``: If true, logs the output to a file in S3. Default: + false +- ``script``: Replace the default count script. +- ``script_arguments``: Arguments for the script. + +One of: (Required) + +- ``source_sql``: SQL query to select rows to count for the source. +- ``source_count_sql``: SQL query that returns a count for the source. +- ``source_table_name``: Name of source table to count. Equivalent to a + source\_count\_sql of ``SELECT COUNT(1) from source_table_name``. + +One of: (Required) + +- ``destination_sql``: SQL query to select rows to count for the + destination. +- ``destination_table_name``: Name of the destination table to count. +- ``destination_table_definition``: Schema file for the destination + table to count. + +Example +^^^^^^^ + +:: + + - step_type: count-check + source_sql: "SELECT id, name FROM networks_network;" + source_host: maestro + destination_sql: "SELECT network_id, network_name FROM prod.networks" + tolerance: 2.0 + log_to_s3: true + +Column Check +------------------------- + +Compares a sample of rows from the source and destination tables/SQL +scripts to see if they match + +Properties +^^^^^^^^^^ + +- ``source_host``: The source host name to lookup in the ``mysql`` + section of the configuration file. (Required) +- ``source_sql``: SQL query to select rows to check for the source. + (Required) +- ``sql_tail_for_source``: Statement to append at the end of the SQL + query for the source +- ``sample_size``: Number of samples to check. Default: 100 +- ``tolerance``: Tolerance threshold, in %, for mismatched rows. + Default: 1 +- ``log_to_s3``: If true, logs the output to a file in S3. Default: + false +- ``script``: Replace the default column check script. 
+- ``script_arguments``: Arguments for the script. + +One of: (Required) + +- ``destination_sql``: SQL query to select rows to check for the + destination. +- ``destination_table_definition``: Schema file for the destination + table to check. + +Example +^^^^^^^ + +:: + + - step_type: column-check + source_sql: "SELECT id, name FROM networks_network;" + source_host: maestro + destination_sql: "SELECT network_id, network_name FROM prod.networks" + sql_tail_for_source: "ORDER BY RAND() LIMIT LIMIT_PLACEHOLDER" + sample_size: 10 + log_to_s3: true diff --git a/examples/README.md b/examples/README.md index 269fe2c..b1ca096 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1 +1 @@ -#### THIS IS THE EXAMPLES FOLDER \ No newline at end of file +#### Examples diff --git a/examples/emr_streaming.yaml b/examples/emr_streaming.yaml deleted file mode 100644 index 0bf29e9..0000000 --- a/examples/emr_streaming.yaml +++ /dev/null @@ -1,24 +0,0 @@ -name : example_emr_streaming -frequency : one-time -load_time: 01:00 # Hour:Min in UTC -emr_cluster_config: - num_instances: 1 - instance_size: m1.xlarge - ami_version: 3.3.1 - -description : Example for the emr_streaming step - -steps: -- step_type: extract-local - path: examples/resources/word_data.txt - -- step_type: emr-streaming - mapper: examples/scripts/word_mapper.py - reducer: examples/scripts/word_reducer.py - -- step_type: transform - script: examples/scripts/s3_profiler.py - script_arguments: - - --input=INPUT1_STAGING_DIR - - --output=OUTPUT1_STAGING_DIR - - -f diff --git a/examples/example_bootstrap.yaml b/examples/example_bootstrap.yaml new file mode 100644 index 0000000..b55fa72 --- /dev/null +++ b/examples/example_bootstrap.yaml @@ -0,0 +1,17 @@ +name: example_bootstrap +frequency: one-time +load_time: 01:00 # Hour:Min in UTC + +description: Example for the transform step + +bootstrap: + ec2: + - step_type: transform + input_node: [] + command: pip install git+https://github.com/coursera/dataduct.git >> ${OUTPUT1_STAGING_DIR}/output.txt + name: bootstrap_override + +steps: +- step_type: transform + input_node: [] + command: python -c "import dataduct" >> ${OUTPUT1_STAGING_DIR}/output.txt diff --git a/examples/example_column_check.yaml b/examples/example_column_check.yaml new file mode 100644 index 0000000..201e7b7 --- /dev/null +++ b/examples/example_column_check.yaml @@ -0,0 +1,14 @@ +name: example_column_check +frequency: one-time +load_time: 01:00 + +description: Example for the column-check step + +steps: +- step_type: column-check + source_sql: "SELECT id, name FROM networks_network;" + source_host: maestro + destination_sql: "SELECT network_id, network_name FROM prod.networks" + sql_tail_for_source: "ORDER BY RAND() LIMIT LIMIT_PLACEHOLDER" + sample_size: 10 + log_to_s3: true diff --git a/examples/example_count_check.yaml b/examples/example_count_check.yaml new file mode 100644 index 0000000..f2504cd --- /dev/null +++ b/examples/example_count_check.yaml @@ -0,0 +1,13 @@ +name: example_count_check +frequency: one-time +load_time: 01:00 + +description: Example for the count-check step + +steps: +- step_type: count-check + source_sql: "SELECT id, name FROM networks_network;" + source_host: maestro + destination_sql: "SELECT network_id, network_name FROM prod.networks" + tolerance: 2.0 + log_to_s3: true diff --git a/examples/example_create_and_load_redshift.yaml b/examples/example_create_and_load_redshift.yaml new file mode 100644 index 0000000..57c2f9d --- /dev/null +++ b/examples/example_create_and_load_redshift.yaml @@ -0,0 +1,12 
@@ +name: example_create_and_load_redshift +frequency: one-time +load_time: 01:00 # Hour:Min in UTC + +description: Example for the load_redshift step + +steps: +- step_type: extract-local + path: data/test_table1.tsv + +- step_type: create-load-redshift + table_definition: tables/dev.test_table.sql diff --git a/examples/example_create_update_sql.yaml b/examples/example_create_update_sql.yaml new file mode 100644 index 0000000..7169ecf --- /dev/null +++ b/examples/example_create_update_sql.yaml @@ -0,0 +1,16 @@ +name: example_create_update_sql +frequency: one-time +load_time: 01:00 # Hour:Min in UTC + +description: Example for the create-update-sql step + +steps: +- step_type: create-update-sql + command: | + DELETE FROM dev.test_table WHERE id < 0; + INSERT INTO dev.test_table + SELECT * FROM dev.test_table_2 + WHERE id < %s; + table_definition: tables/dev.test_table.sql + script_arguments: + - 4 diff --git a/examples/example_custom_extract_local.yaml b/examples/example_custom_extract_local.yaml new file mode 100644 index 0000000..fa14c4c --- /dev/null +++ b/examples/example_custom_extract_local.yaml @@ -0,0 +1,10 @@ +name: example_custom_extract_local +frequency: one-time +load_time: 01:00 # Hour:Min in UTC + +description: | + This example uploads a local file to S3 with the extract-local step. + +steps: +- step_type: custom-extract-local + path: data/test_table1.tsv diff --git a/examples/double_input.yaml b/examples/example_double_input.yaml similarity index 58% rename from examples/double_input.yaml rename to examples/example_double_input.yaml index e4c8913..da73c19 100644 --- a/examples/double_input.yaml +++ b/examples/example_double_input.yaml @@ -1,20 +1,20 @@ -name : example_double_input -frequency : one-time +name: example_double_input +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : Example for the transform step with multiple inputs +description: Example for the transform step with multiple inputs steps: - step_type: extract-local name: step1 - path: examples/resources/test_table1.tsv + path: data/test_table1.tsv - step_type: extract-local name: step2 - path: examples/resources/test_table2.tsv + path: data/test_table2.tsv - step_type: transform - script: examples/scripts/s3_profiler.py + script: scripts/s3_profiler.py input_node: step1: script step2: directory diff --git a/examples/double_output.yaml b/examples/example_double_output.yaml similarity index 60% rename from examples/double_output.yaml rename to examples/example_double_output.yaml index 968486a..0104ea7 100644 --- a/examples/double_output.yaml +++ b/examples/example_double_output.yaml @@ -1,40 +1,40 @@ -name : example_double_output -frequency : one-time +name: example_double_output +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : Example for the transform step with multiple outputs +description: Example for the transform step with multiple outputs steps: - step_type: extract-local name: step1_a - path: examples/resources/test_table1.tsv + path: data/test_table1.tsv - step_type: extract-local name: step1_b - path: examples/resources/test_table2.tsv + path: data/test_table2.tsv - step_type: transform command: cp -r $INPUT1_STAGING_DIR/* $OUTPUT1_STAGING_DIR input_node: step1_a: step2_a step1_b: step2_b - output: + output_node: - step2_a - step2_b - step_type: transform - script: examples/scripts/s3_profiler.py - input_node: - step2_a: output1 + name: profiler_1 + script: scripts/s3_profiler.py + input_node: step2_a script_arguments: - --input=INPUT1_STAGING_DIR - 
--output=OUTPUT1_STAGING_DIR - -f - step_type: transform - script: examples/scripts/s3_profiler.py - input_node: - step2_b : output1 + name: profiler_2 + script: scripts/s3_profiler.py + input_node: step2_b script_arguments: - --input=INPUT1_STAGING_DIR - --output=OUTPUT1_STAGING_DIR diff --git a/examples/example_emr_streaming.yaml b/examples/example_emr_streaming.yaml new file mode 100644 index 0000000..05fa2a6 --- /dev/null +++ b/examples/example_emr_streaming.yaml @@ -0,0 +1,24 @@ +name: example_emr_streaming +frequency: one-time +load_time: 01:00 # Hour:Min in UTC +emr_cluster_config: + num_instances: 1 + instance_size: m1.large + ami_version: 3.3.1 + +description: Example for the emr_streaming step + +steps: +- step_type: extract-local + path: data/word_data.txt + +- step_type: emr-streaming + mapper: scripts/word_mapper.py + reducer: scripts/word_reducer.py + +- step_type: transform + script: scripts/s3_profiler.py + script_arguments: + - --input=INPUT1_STAGING_DIR + - --output=OUTPUT1_STAGING_DIR + - -f diff --git a/examples/extract_local.yaml b/examples/example_extract_local.yaml similarity index 56% rename from examples/extract_local.yaml rename to examples/example_extract_local.yaml index 005de12..5ab1a5d 100644 --- a/examples/extract_local.yaml +++ b/examples/example_extract_local.yaml @@ -1,10 +1,10 @@ -name : example_extract_local -frequency : one-time +name: example_extract_local +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : | +description: | This example uploads a local file to S3 with the extract-local step. steps: - step_type: extract-local - path: examples/resources/test_table1.tsv + path: data/test_table1.tsv diff --git a/examples/extract_rds.yaml b/examples/example_extract_rds.yaml similarity index 85% rename from examples/extract_rds.yaml rename to examples/example_extract_rds.yaml index a6b002f..c8a382b 100644 --- a/examples/extract_rds.yaml +++ b/examples/example_extract_rds.yaml @@ -1,8 +1,8 @@ -name : example_extract_rds -frequency : one-time +name: example_extract_rds +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : | +description: | This example extracts data from mysql to S3 with the extract-rds step. 
steps:
diff --git a/examples/example_extract_redshift.yaml b/examples/example_extract_redshift.yaml
new file mode 100644
index 0000000..6e18f62
--- /dev/null
+++ b/examples/example_extract_redshift.yaml
@@ -0,0 +1,10 @@
+name: example_extract_redshift
+frequency: one-time
+load_time: 01:00 # Hour:Min in UTC
+
+description: This example extracts data out of redshift
+
+steps:
+- step_type: extract-redshift
+  schema: dev
+  table: categories
diff --git a/examples/example_extract_s3.yaml b/examples/example_extract_s3.yaml
new file mode 100644
index 0000000..cfbbd2b
--- /dev/null
+++ b/examples/example_extract_s3.yaml
@@ -0,0 +1,9 @@
+name: example_extract_s3
+frequency: one-time
+load_time: 01:00 # Hour:Min in UTC
+
+description: This example creates an S3Node given a S3 Uri
+
+steps:
+- step_type: extract-s3
+  file_uri: s3://elasticmapreduce/samples/wordcount/wordSplitter.py
diff --git a/examples/example_load_redshift.yaml b/examples/example_load_redshift.yaml
new file mode 100644
index 0000000..06bebf3
--- /dev/null
+++ b/examples/example_load_redshift.yaml
@@ -0,0 +1,13 @@
+name: example_load_redshift
+frequency: one-time
+load_time: 01:00 # Hour:Min in UTC
+
+description: Example for the load_redshift step
+
+steps:
+- step_type: extract-local
+  path: data/test_table1.tsv
+
+- step_type: load-redshift
+  schema: dev
+  table: test_table
diff --git a/examples/example_pipeline_dependency.yaml b/examples/example_pipeline_dependency.yaml
new file mode 100644
index 0000000..70a2b2e
--- /dev/null
+++ b/examples/example_pipeline_dependency.yaml
@@ -0,0 +1,14 @@
+name: example_pipeline_dependency
+frequency: one-time
+load_time: 01:00 # Hour:Min in UTC
+
+steps:
+- step_type: pipeline-dependencies
+  name: dependency_step
+  refresh_rate: 60
+  dependent_pipelines:
+  - example_transform
+
+- step_type: transform
+  depends_on: dependency_step
+  command: whoami >> $OUTPUT1_STAGING_DIR/output.txt
diff --git a/examples/example_primary_key_check.yaml b/examples/example_primary_key_check.yaml
new file mode 100644
index 0000000..d3a8b14
--- /dev/null
+++ b/examples/example_primary_key_check.yaml
@@ -0,0 +1,12 @@
+name: example_primary_key_check
+frequency: one-time
+load_time: 01:00 # Hour:Min in UTC
+
+description: Example for the primary-key-check step
+
+steps:
+- step_type: primary-key-check
+  table_definition: tables/dev.test_table.sql
+  log_to_s3: true
+  script_arguments:
+  - "--path_suffix=#{format(@scheduledStartTime, 'YYYY-MM-dd')}"
diff --git a/examples/example_reload.yaml b/examples/example_reload.yaml
new file mode 100644
index 0000000..073b16c
--- /dev/null
+++ b/examples/example_reload.yaml
@@ -0,0 +1,16 @@
+name: example_reload
+frequency: one-time
+load_time: 01:00 # Hour:Min in UTC
+
+description: Example for the reload step
+
+steps:
+- step_type: extract-local
+  path: data/test_table1.tsv
+
+- step_type: create-load-redshift
+  table_definition: tables/dev.test_table.sql
+
+- step_type: reload
+  source: tables/dev.test_table.sql
+  destination: tables/dev.test_table_2.sql
diff --git a/examples/example_sql_command.yaml b/examples/example_sql_command.yaml
new file mode 100644
index 0000000..5c13c7c
--- /dev/null
+++ b/examples/example_sql_command.yaml
@@ -0,0 +1,9 @@
+name: example_sql_command
+frequency: one-time
+load_time: 01:00 # Hour:Min in UTC
+
+description: Example for the sql_command step
+
+steps:
+- step_type: sql-command
+  command: SELECT * FROM dev.test_table;
diff --git a/examples/example_transform.yaml b/examples/example_transform.yaml
new file mode 100644
index 0000000..c7b5ccd
--- /dev/null
+++ b/examples/example_transform.yaml
@@ -0,0 +1,29 @@
+name: example_transform
+frequency: one-time
+load_time: 01:00 # Hour:Min in UTC
+ec2_resource_config:
+  instance_type: m1.small
+
+description: |
+  Example for the transform step, uses an m1.small instance instead of
+  the default
+
+steps:
+- step_type: extract-local
+  name: extract-node
+  path: data/test_table1.tsv
+
+- step_type: transform
+  input_node: extract-node
+  script: scripts/s3_profiler.py
+  script_arguments:
+  - --input=INPUT1_STAGING_DIR
+  - --output=OUTPUT1_STAGING_DIR
+
+- step_type: transform
+  input_node: extract-node
+  script_directory: scripts/
+  script_name: s3_profiler.py
+  script_arguments:
+  - --input=INPUT1_STAGING_DIR
+  - --output=OUTPUT1_STAGING_DIR
diff --git a/examples/example_upsert.yaml b/examples/example_upsert.yaml
new file mode 100644
index 0000000..e0f54a2
--- /dev/null
+++ b/examples/example_upsert.yaml
@@ -0,0 +1,16 @@
+name: example_upsert
+frequency: one-time
+load_time: 01:00 # Hour:Min in UTC
+
+description: Example for the upsert step
+
+steps:
+- step_type: extract-local
+  path: data/test_table1.tsv
+
+- step_type: create-load-redshift
+  table_definition: tables/dev.test_table.sql
+
+- step_type: upsert
+  source: tables/dev.test_table.sql
+  destination: tables/dev.test_table_2.sql
diff --git a/examples/extract_redshift.yaml b/examples/extract_redshift.yaml
deleted file mode 100644
index 699673d..0000000
--- a/examples/extract_redshift.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-name : example_extract_redshift
-frequency : one-time
-load_time: 01:00 # Hour:Min in UTC
-
-description : |
-  This example extracts data out of redshift
-
-steps:
-- step_type: extract-redshift
-  schema: dev
-  table: categories
diff --git a/examples/extract_s3.yaml b/examples/extract_s3.yaml
deleted file mode 100644
index f683976..0000000
--- a/examples/extract_s3.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-name : example_extract_s3
-frequency : one-time
-load_time: 01:00 # Hour:Min in UTC
-
-description : |
-  This example creates an S3Node given a S3 Uri
-
-steps:
-- step_type: extract-s3
-  uri: s3://elasticmapreduce/samples/wordcount/wordSplitter.py
diff --git a/examples/load_redshift.yaml b/examples/load_redshift.yaml
deleted file mode 100644
index 735a386..0000000
--- a/examples/load_redshift.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-name : example_load_redshift
-frequency : one-time
-load_time: 01:00 # Hour:Min in UTC
-
-description : Example for the load_redshift step
-
-steps:
-- step_type: extract-local
-  path: examples/resources/test_table1.tsv
-
-- step_type: load-redshift
-  schema: dev
-  table: test_table
diff --git a/examples/resources/test_table1.tsv b/examples/resources/data/test_table1.tsv
similarity index 100%
rename from examples/resources/test_table1.tsv
rename to examples/resources/data/test_table1.tsv
diff --git a/examples/resources/test_table2.tsv b/examples/resources/data/test_table2.tsv
similarity index 100%
rename from examples/resources/test_table2.tsv
rename to examples/resources/data/test_table2.tsv
diff --git a/examples/resources/word_data.txt b/examples/resources/data/word_data.txt
similarity index 100%
rename from examples/resources/word_data.txt
rename to examples/resources/data/word_data.txt
diff --git a/examples/scripts/s3_profiler.py b/examples/resources/scripts/s3_profiler.py
similarity index 97%
rename from examples/scripts/s3_profiler.py
rename to examples/resources/scripts/s3_profiler.py
index a4f8bad..767e4a7 100755
--- a/examples/scripts/s3_profiler.py
+++ b/examples/resources/scripts/s3_profiler.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python
-"""
-Walk over files in S3 output node and provide basic information about them
+"""Walk over files in S3 output node and provide basic information about them
 """
 
 import argparse
@@ -14,8 +13,10 @@ def run_command(command):
     """Execute a shell command
+
     Args:
         command(list of str): list of command arguments
+
     Returns:
         output(str): stdout of command
     """
@@ -24,8 +25,10 @@ def recurse_directory(directory_path):
     """Recursively walk directories and output basic stats on files
+
     Args:
         directory_path(str): Path to the directory which is read
+
     Returns:
         result(list of tuples): (filename, count of lines in file, size of file)
     """
@@ -43,6 +46,7 @@ def paths_exist(input_directory, paths):
     """Check if a path exists or not
+
     Args:
         input_directory(str): input directory to be checked
         paths(list of str): paths for which one should check the existence.
@@ -58,6 +62,7 @@ def profile_input(input_directory, output_directory,
                   output_file_name, fail_if_empty):
     """Lists statistics for all files located in input directrory. Output
     is written to a file in the output directory.
+
     Args:
         input_directory(path): path to the input directory
         output_directory(path): path to the output directory
diff --git a/examples/scripts/word_mapper.py b/examples/resources/scripts/word_mapper.py
similarity index 100%
rename from examples/scripts/word_mapper.py
rename to examples/resources/scripts/word_mapper.py
diff --git a/examples/scripts/word_reducer.py b/examples/resources/scripts/word_reducer.py
similarity index 100%
rename from examples/scripts/word_reducer.py
rename to examples/resources/scripts/word_reducer.py
diff --git a/examples/resources/tables/categories.sql b/examples/resources/tables/categories.sql
new file mode 100644
index 0000000..54848f0
--- /dev/null
+++ b/examples/resources/tables/categories.sql
@@ -0,0 +1,5 @@
+CREATE TABLE categories (
+    category_id INTEGER DISTKEY PRIMARY KEY
+    ,category_name VARCHAR(100)
+    ,description VARCHAR(2000)
+) SORTKEY(category_id);
diff --git a/examples/resources/tables/customers.sql b/examples/resources/tables/customers.sql
new file mode 100644
index 0000000..bd55bb0
--- /dev/null
+++ b/examples/resources/tables/customers.sql
@@ -0,0 +1,9 @@
+CREATE TABLE customers (
+    customer_id INTEGER DISTKEY PRIMARY KEY
+    ,customer_name VARCHAR(200)
+    ,contact_name VARCHAR(200)
+    ,address VARCHAR(200)
+    ,city VARCHAR(100)
+    ,postal_code VARCHAR(10)
+    ,country VARCHAR(100)
+) SORTKEY(customer_id);
diff --git a/examples/resources/tables/dev.test_table.sql b/examples/resources/tables/dev.test_table.sql
new file mode 100644
index 0000000..238486f
--- /dev/null
+++ b/examples/resources/tables/dev.test_table.sql
@@ -0,0 +1,4 @@
+CREATE TABLE dev.test_table(
+    id INTEGER PRIMARY KEY,
+    description VARCHAR(255)
+);
diff --git a/examples/resources/tables/dev.test_table_2.sql b/examples/resources/tables/dev.test_table_2.sql
new file mode 100644
index 0000000..81eb90d
--- /dev/null
+++ b/examples/resources/tables/dev.test_table_2.sql
@@ -0,0 +1,4 @@
+CREATE TABLE dev.test_table_2(
+    id INTEGER PRIMARY KEY,
+    description VARCHAR(255)
+);
diff --git a/examples/resources/tables/employees.sql b/examples/resources/tables/employees.sql
new file mode 100644
index 0000000..fbbcf9a
--- /dev/null
+++ b/examples/resources/tables/employees.sql
@@ -0,0 +1,7 @@
+CREATE TABLE employees (
+    employee_id INTEGER DISTKEY PRIMARY KEY
+    ,last_name VARCHAR(100)
+    ,first_name VARCHAR(100)
+    ,birth_date DATE
+    ,notes VARCHAR(2000)
+) SORTKEY(employee_id);
diff --git a/examples/resources/tables/order_details.sql b/examples/resources/tables/order_details.sql
new file mode 100644
index 0000000..e0f2f75
--- /dev/null
+++ b/examples/resources/tables/order_details.sql
@@ -0,0 +1,6 @@
+CREATE TABLE order_details (
+    order_detail_id INTEGER DISTKEY PRIMARY KEY
+    ,order_id INTEGER REFERENCES orders(order_id)
+    ,product_id INTEGER REFERENCES products(product_id)
+    ,quantity INTEGER
+) SORTKEY(order_detail_id);
diff --git a/examples/resources/tables/orders.sql b/examples/resources/tables/orders.sql
new file mode 100644
index 0000000..c332965
--- /dev/null
+++ b/examples/resources/tables/orders.sql
@@ -0,0 +1,7 @@
+CREATE TABLE orders (
+    order_id INTEGER DISTKEY PRIMARY KEY
+    ,customer_id INTEGER REFERENCES customers(customer_id)
+    ,employee_id INTEGER REFERENCES employees(employee_id)
+    ,order_date DATE
+    ,shipper_id INTEGER REFERENCES shippers(shipper_id)
+) SORTKEY(order_id);
diff --git a/examples/resources/tables/products.sql b/examples/resources/tables/products.sql
new file mode 100644
index 0000000..356198d
--- /dev/null
+++ b/examples/resources/tables/products.sql
@@ -0,0 +1,8 @@
+CREATE TABLE products (
+    product_id INTEGER DISTKEY PRIMARY KEY
+    ,product_name VARCHAR(200)
+    ,supplier_id INTEGER REFERENCES suppliers(supplier_id)
+    ,category_id INTEGER REFERENCES categories(category_id)
+    ,unit VARCHAR(200)
+    ,price REAL
+) SORTKEY(product_id);
diff --git a/examples/resources/tables/shippers.sql b/examples/resources/tables/shippers.sql
new file mode 100644
index 0000000..bed1454
--- /dev/null
+++ b/examples/resources/tables/shippers.sql
@@ -0,0 +1,5 @@
+CREATE TABLE shippers (
+    shipper_id INTEGER DISTKEY PRIMARY KEY
+    ,shipper_name VARCHAR(200)
+    ,phone VARCHAR(20)
+) SORTKEY(shipper_id);
diff --git a/examples/resources/tables/suppliers.sql b/examples/resources/tables/suppliers.sql
new file mode 100644
index 0000000..d70e7f1
--- /dev/null
+++ b/examples/resources/tables/suppliers.sql
@@ -0,0 +1,10 @@
+CREATE TABLE suppliers (
+    supplier_id INTEGER DISTKEY PRIMARY KEY
+    ,supplier_name VARCHAR(200)
+    ,contact_name VARCHAR(200)
+    ,address VARCHAR(200)
+    ,city VARCHAR(100)
+    ,postal_code VARCHAR(10)
+    ,county VARCHAR(100)
+    ,phone VARCHAR(20)
+) SORTKEY(supplier_id);
diff --git a/examples/sql_command.yaml b/examples/sql_command.yaml
deleted file mode 100644
index e80aa0e..0000000
--- a/examples/sql_command.yaml
+++ /dev/null
@@ -1,9 +0,0 @@
-name : example_sql_command
-frequency : one-time
-load_time: 01:00 # Hour:Min in UTC
-
-description : Example for the sql_command step
-
-steps:
-- step_type: sql-command
-  command: INSERT INTO dev.test_table VALUES (1, 'hello_etl');
diff --git a/examples/steps/custom_extract_local.py b/examples/steps/custom_extract_local.py
new file mode 100644
index 0000000..6614af6
--- /dev/null
+++ b/examples/steps/custom_extract_local.py
@@ -0,0 +1,20 @@
+"""
+ETL step wrapper for creating an S3 node for input from local files
+"""
+from dataduct.steps import ExtractLocalStep
+import logging
+logger = logging.getLogger(__name__)
+
+
+class CustomExtractLocalStep(ExtractLocalStep):
+    """CustomExtractLocal Step class that helps get data from a local file
+    """
+
+    def __init__(self, **kwargs):
+        """Constructor for the CustomExtractLocal class
+
+        Args:
+            **kwargs(optional): Keyword arguments directly passed to base class
+        """
+        logger.info('Using the Custom Extract Local Step')
+        super(CustomExtractLocalStep, self).__init__(**kwargs)
diff --git a/examples/transform.yaml b/examples/transform.yaml
deleted file mode 100644
index 4ecda05..0000000
--- a/examples/transform.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-name : example_transform
-frequency : one-time
-load_time: 01:00 # Hour:Min in UTC
-
-description : Example for the transform step
-
-steps:
-- step_type: extract-local
-  path: examples/resources/test_table1.tsv
-
-- step_type: transform
-  script: examples/scripts/s3_profiler.py
-  script_arguments:
-  - --input=INPUT1_STAGING_DIR
-  - --output=OUTPUT1_STAGING_DIR
diff --git a/requirements.txt b/requirements.txt
index 1007ddc..6021e82 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,14 @@
-Sphinx==1.2.3
-boto==2.34.0
-sphinx-rtd-theme==0.1.6
-sphinxcontrib-napoleon==0.2.8
+boto>=2.34.0
+Sphinx>=1.2.3
+sphinx-rtd-theme>=0.1.6
+sphinxcontrib-napoleon>=0.2.8
+pandas>=0.14.1
+psycopg2
+MySQL-python
+PyYAML
+coverage
+pyparsing>=2
+pygraphviz
+testfixtures>=4.1.1
+mock
+pytimeparse
diff --git a/setup.py b/setup.py
index e4bbd7b..02bd8d1 100644
--- a/setup.py
+++ b/setup.py
@@ -1,30 +1,32 @@
 """
-Setup file for installation of the etllib code
+Setup file for installation of the dataduct code
 """
 from setuptools import setup
+from setuptools import find_packages
 
 setup(
     name='dataduct',
-    version='0.1.0',
+    version='0.2.0',
     author='Coursera Inc.',
-    packages=[
-        'dataduct',
-        'dataduct.config',
-        'dataduct.pipeline',
-        'dataduct.s3',
-        'dataduct.steps',
-        'dataduct.utils',
-    ],
+    packages=find_packages(
+        exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
     namespace_packages=['dataduct'],
     include_package_data=True,
     url='https://github.com/coursera/dataduct',
     long_description=open('README.rst').read(),
     author_email='data-infra@coursera.org',
     license='Apache License 2.0',
-    description='DataPipeline for Humans.',
+    description='DataPipeline for Humans',
     install_requires=[
-        'boto>=2.32',
-        'pyyaml'
+        'boto>=2.34',
+        'PyYAML',
+        'pandas',
+        'psycopg2',
+        'pytimeparse',
+        'MySQL-python',
+        'pyparsing',
+        'testfixtures',
+        'sphinx_rtd_theme'
     ],
     scripts=['bin/dataduct'],
     classifiers=[